├── .gitignore
├── Licence.md
├── MD_simulation_on_alanine_dipeptide
│   └── current_work
│       ├── .gitignore
│       ├── README.md
│       ├── resources
│       │   └── .gitignore
│       ├── snapshot.sh
│       ├── src
│       │   ├── .gitignore
│       │   ├── ANN_simulation.py
│       │   ├── autoencoders.py
│       │   ├── biased_simulation.py
│       │   ├── biased_simulation_general.py
│       │   ├── config.py
│       │   ├── coordinates_data_files_list.py
│       │   ├── generate_coordinates.py
│       │   ├── helper_func.py
│       │   ├── kernel_tica.py
│       │   ├── main_work.py
│       │   ├── molecule_spec_sutils.py
│       │   ├── remove_water_mol.py
│       │   ├── structural_alignment.py
│       │   ├── tf_load.py
│       │   ├── train_network_and_save_for_iter.py
│       │   └── workqueue.py
│       ├── target
│       │   └── .gitignore
│       └── tests
│           ├── .gitignore
│           ├── ANN_simulation_test.py
│           └── Makefile
├── README.md
├── archive
│   └── plumed_adp.zip
└── figures
    ├── autoencoder_1.png
    ├── autoencoder_2.png
    ├── diagram_autoencoder.png
    └── hierarchical_autoencoder.png

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
**/current_work/.idea/*
*.sge
*.sge.*
*.mat
**/.ipynb_checkpoints/

--------------------------------------------------------------------------------
/Licence.md:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Wei Chen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/.gitignore:
--------------------------------------------------------------------------------
previous_runs/**
.spyderworkspace
.vscode/**

--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiHelloWorld/accelerated_sampling_with_autoencoder/fe2b98bc81fc0b30db42ca8a83e23adb775a487d/MD_simulation_on_alanine_dipeptide/current_work/README.md

--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/resources/.gitignore:
--------------------------------------------------------------------------------
*.pkl
*.txt
*.pdb
*.png
charmm36.xml

--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/snapshot.sh:
--------------------------------------------------------------------------------
#!/bin/bash

prefix=$1

current_time=$(date -u +%Y%m%d%H%M%S)
dir_name=${prefix}"/ss_"${current_time}

mkdir -p ${dir_name}

for item in README.md resources src target; do
    echo "copying "${item}
    rsync -ar --exclude='.*' ${item} ${dir_name}
done

--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/src/.gitignore:
--------------------------------------------------------------------------------
*.png
nohup.out
temp.py
*.ipynb
ANN.py
ANN.pyc
ANN_wrap.cxx
*.pkl
temp_*.py
temp_*.sh
*.pdb
*_coordinates.txt
temp*.txt
temp/**
*.hdf5
*.npy
*.pdf
temp_model.dot
HDE_paper_notebooks/**

--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/src/ANN_simulation.py:
--------------------------------------------------------------------------------
from config import *  # configuration file
from cluster_management import *
from autoencoders import *
from helper_func import *

"""note that all configurations for a class should be in function __init__(), and take configuration parameters
from config.py
"""

class plotting(object):
    """this class implements different plotting functionality
    """

    def __init__(self, network=None):
        self._network = network

    @staticmethod
    def plot_fve_L_method(fve, CV_min, CV_max, fig, ax):
        temp_fve = np.array(fve).flatten()
        temp_fve = temp_fve.reshape(CV_max - CV_min, temp_fve.shape[0] // (CV_max - CV_min))  # integer division: reshape() requires ints in Python 3
        evaluation_values = np.mean(temp_fve, axis=-1)
        optimal_num, x_data, y_data_left, y_data_right = Sutils.L_method(evaluation_values, list(range(CV_min, CV_max + 1)))
        x_data = [_ - CV_min for _ in x_data]
        ax.plot(x_data, y_data_left)
        ax.plot(x_data, y_data_right)
        ax.scatter(list(range(CV_max - CV_min + 1)), evaluation_values)
        df = pd.DataFrame(temp_fve.T)
        sns.boxplot(df, ax=ax)
        ax.set_xticklabels(list(range(CV_min, CV_max + 1)))
        ax.set_ylim([evaluation_values.min() - 0.1, evaluation_values.max() + 0.1])
        ax.set_xlabel('num of CVs')
        ax.set_ylabel('FVE')
        return fig, ax
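    # A note on the L-method used above (Sutils.L_method is defined elsewhere in this
    # repo and is not shown in this dump): to pick the optimal number of CVs, fit one
    # straight line to the left part of the FVE-vs-num-of-CVs curve and another to the
    # right part, and take the split point that minimizes the total fitting error; that
    # knee is where adding more CVs stops paying off. For instance (hypothetical
    # numbers), FVE = [0.30, 0.55, 0.80, 0.83, 0.85, 0.86] has its knee at the third
    # value: the first three points rise steeply along one line, while the rest lie
    # almost flat along another.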

    def plotting_with_coloring_option(self, plotting_space,
                                      fig_object,
                                      axis_object,
                                      network=None,
                                      input_data_for_plotting=None,  # input could be cossin or Cartesian
                                      color_option='other',
                                      other_coloring=None,
                                      contain_title=True,
                                      title=None,
                                      axis_ranges=None,
                                      contain_colorbar=True,
                                      colorbar_label=None,
                                      smoothing_using_RNR=False,  # smooth the coloring values for data points using RadiusNeighborsRegressor()
                                      variance_using_RNR=False,   # get variance of coloring values over space using RNR
                                      smoothing_radius=0.1,
                                      enable_mousing_clicking_event=False,
                                      related_coor_list_obj=None,
                                      saving_snapshot_mode='single_point'
                                      ):
        """
        by default, we are using training data, and we also allow external data input
        :param related_coor_list_obj: this must be specified when enable_mousing_clicking_event == True
        """
        if enable_mousing_clicking_event and related_coor_list_obj is None:
            raise Exception('related_coor_list_obj not defined!')

        if network is None: network = self._network
        if title is None: title = "plotting in %s, coloring with %s" % (plotting_space, color_option)  # default title
        if input_data_for_plotting is None:
            input_data = self._network._data_set
        else:
            input_data = input_data_for_plotting

        if plotting_space == "PC":
            PCs_to_plot = network.get_PCs(input_data=input_data)
            (x, y) = ([item[0] for item in PCs_to_plot], [item[1] for item in PCs_to_plot])
            labels = ["PC1", "PC2"]
        else:
            raise Exception('plotting_space not defined!')

        # coloring
        if color_option == 'step':
            coloring = list(range(len(x)))
        elif color_option == 'other':
            assert (len(other_coloring) == len(x)), (len(other_coloring), len(x))
            coloring = other_coloring
            if smoothing_using_RNR:  # smooth coloring using RNR
                r_neigh = RadiusNeighborsRegressor(radius=smoothing_radius, weights='uniform')
                temp_coors = [list(item) for item in zip(x, y)]
                r_neigh.fit(temp_coors, coloring)
                coloring = r_neigh.predict(temp_coors)
            elif variance_using_RNR:  # get variance of the coloring values over space, using RNR
                r_neigh = RadiusNeighborsRegressor(radius=smoothing_radius, weights='uniform')
                temp_coors = [list(item) for item in zip(x, y)]
                r_neigh.fit(temp_coors, coloring)
                coloring_mean = r_neigh.predict(temp_coors)
                r_neigh.fit(temp_coors, np.multiply(np.array(coloring), np.array(coloring)))
                coloring_square_mean = r_neigh.predict(temp_coors)
                coloring = coloring_square_mean - np.multiply(coloring_mean, coloring_mean)
        else:
            raise Exception('color_option not defined!')

        im = axis_object.scatter(x, y, s=4, c=coloring, cmap='gist_rainbow', picker=True)
        axis_object.set_xlabel(labels[0])
        axis_object.set_ylabel(labels[1])
        if contain_title:
            axis_object.set_title(title)

        if axis_ranges is not None:
            axis_object.set_xlim(axis_ranges[0])
            axis_object.set_ylim(axis_ranges[1])

        if contain_colorbar:
            temp_colorbar = fig_object.colorbar(im, ax=axis_object)
            if colorbar_label is not None:
                temp_colorbar.set_label(str(colorbar_label))
        # mouse clicking event
        if enable_mousing_clicking_event:
            folder_to_store_these_frames = 'temp_pdb'
            if not os.path.exists(folder_to_store_these_frames):
                subprocess.check_output(['mkdir', folder_to_store_these_frames])

            # should calculate step_interval
            total_num_of_lines_in_coor_files = sum(related_coor_list_obj.get_list_of_line_num_of_coor_data_file())
            step_interval = int(total_num_of_lines_in_coor_files / len(input_data))

            if saving_snapshot_mode == 'multiple_points':
                axis_object.text(-1.2, -1.2, 'save_frames', picker=True, fontsize=12)  # TODO: find better coordinates

                global temp_list_of_coor_index  # TODO: use a better way instead of a global variable
                temp_list_of_coor_index = []
                def onclick(event):
                    global temp_list_of_coor_index
                    if isinstance(event.artist, matplotlib.text.Text):
                        if event.artist.get_text() == 'save_frames':
                            print(temp_list_of_coor_index)
                            related_coor_list_obj.write_pdb_frames_into_file_with_list_of_coor_index(temp_list_of_coor_index,
                                                folder_to_store_these_frames + '/temp_frames.pdb')  # TODO: better naming

                            temp_list_of_coor_index = []  # output pdb file and clean up
                            print('done saving frames!')
                    elif isinstance(event.artist, matplotlib.collections.PathCollection):
                        ind_list = list(event.ind)
                        print('onclick:')
                        temp_list_of_coor_index += [item * step_interval for item in ind_list]  # should include step_interval

                        for item in ind_list:
                            print(item, x[item], y[item])
                    return

            elif saving_snapshot_mode == 'single_point':
                global temp_global_index_click
                temp_global_index_click = 0
                def onclick(event):
                    global temp_global_index_click
                    if isinstance(event.artist, matplotlib.collections.PathCollection):
                        ind_list = list(event.ind)
                        print('onclick:')
                        for item in ind_list:
                            print(item, x[item], y[item])

                        temp_ind_list = [item * step_interval for item in ind_list]  # should include step_interval
                        average_x = sum([x[item] for item in ind_list]) / len(ind_list)
                        average_y = sum([y[item] for item in ind_list]) / len(ind_list)
                        # notation on the graph
                        axis_object.scatter([average_x], [average_y], s=50, marker='s')
                        axis_object.text(average_x, average_y, '%d' % temp_global_index_click, picker=False, fontsize=15)
                        out_file_name = folder_to_store_these_frames + '/%02d_temp_frames_[%f,%f].pdb' % \
                                        (temp_global_index_click, average_x, average_y)

                        temp_global_index_click += 1
                        related_coor_list_obj.write_pdb_frames_into_file_with_list_of_coor_index(temp_ind_list,
                                            out_file_name=out_file_name)
                        # need to verify that PCs generated from this output pdb file are consistent with those in the selected list
                        molecule_type.generate_coordinates_from_pdb_files(path_for_pdb=out_file_name)
                        if CONFIG_48 == "cossin":
                            temp_input_data = molecule_type.get_many_cossin_from_coordinates_in_list_of_files(
                                list_of_files=[out_file_name.replace('.pdb', '_coordinates.npy')])
                        elif CONFIG_48 in ("Cartesian", "pairwise_distance"):  # membership test (the previous `== "Cartesian" or 'pairwise_distance'` was always true)
                            scaling_factor = CONFIG_49
                            temp_input_data = np.load(out_file_name.replace('.pdb', '_coordinates.npy')) / scaling_factor
                            temp_input_data = Sutils.remove_translation(temp_input_data)
                        else:
                            raise Exception("input data type error")

                        PCs_of_points_selected = network.get_PCs(input_data=temp_input_data)
                        assert_almost_equal(PCs_of_points_selected, np.array([[x[item], y[item]] for item in ind_list]), decimal=4)

                    return
            else:
                raise Exception('saving_snapshot_mode error')

            fig_object.canvas.mpl_connect('pick_event', onclick)

        return fig_object, axis_object, im
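    # How the pick events above work, briefly: scatter(..., picker=True) makes the
    # returned PathCollection pickable, so a mouse click near a point fires a single
    # 'pick_event' whose event.ind lists the indices of all points within the pick
    # radius, and whose event.artist identifies the clicked object (a Text for the
    # 'save_frames' label, a PathCollection for data points). mpl_connect() registers
    # onclick() as the handler; note that pick events require an interactive matplotlib
    # backend, not the 'agg' backend selected in config.py.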

    def density_plotting(self, fig_object, axis_object,
                         network=None,
                         data_for_plotting=None,
                         n_levels=40
                         ):
        if network is None: network = self._network
        temp_data = self._network._data_set if data_for_plotting is None else data_for_plotting

        x = [item[0] for item in network.get_PCs(temp_data)]
        y = [item[1] for item in network.get_PCs(temp_data)]

        df = pd.DataFrame({'x': x, 'y': y})
        sns.kdeplot(df.x, df.y, ax=axis_object, n_levels=n_levels)

        return fig_object, axis_object

    @staticmethod
    def plotting_potential_centers(fig_object, axis_object,
                                   list_of_coor_data_files, marker='x'):
        potential_centers = [single_biased_simulation_data(None, item)._potential_center for item in list_of_coor_data_files]
        [x, y] = list(zip(*potential_centers))

        axis_object.scatter(x, y, marker=marker)
        return fig_object, axis_object

    def equilibration_check(self, coor_file_folder,
                            scaling_factor, num_of_splits, save_fig=True,
                            starting_index_of_last_few_frames=0
                            ):
        """this function checks equilibration by plotting each individual run in PC space, colored with 'step';
        note: inputs should be Cartesian coordinates, the case of cossin input is not implemented
        """
        import scipy.stats  # import the submodule explicitly ("import scipy" alone does not guarantee scipy.stats is available)
        ks_stats_list = []
        temp_arrow_list = []
        potential_centers_list = []
        temp_arrow_start_list = []
        _1 = coordinates_data_files_list([coor_file_folder])
        for item in _1.get_list_of_coor_data_files():
            data = np.load(item)[starting_index_of_last_few_frames:] / scaling_factor
            data = Sutils.remove_translation(data)
            potential_centers_list.append([float(item_1) for item_1 in item.split('_pc_[')[1].split(']')[0].split(',')])
            # do analysis using K-S test
            PCs = self._network.get_PCs(data)
            dim_of_PCs = PCs.shape[1]
            PCs = PCs[:int(PCs.shape[0]) // num_of_splits * num_of_splits]  # in case PCs cannot be split evenly
            samples_for_KS_testing = np.split(PCs, num_of_splits)
            ks_stats = max([
                sum(
                    [scipy.stats.ks_2samp(samples_for_KS_testing[xx][:, subindex], samples_for_KS_testing[yy][:, subindex])[0]
                     for subindex in range(dim_of_PCs)
                     ]) / float(dim_of_PCs)
                for xx in range(num_of_splits) for yy in range(xx + 1, num_of_splits)]
            )
            ks_stats_list.append(ks_stats)
            # plot arrow from center of first split to last split
            temp_arrow_start = np.average(samples_for_KS_testing[0], axis=0)
            temp_arrow_end = np.average(samples_for_KS_testing[-1], axis=0)
            temp_arrow = (temp_arrow_end - temp_arrow_start)
            assert (temp_arrow.shape[0] == 2), temp_arrow.shape[0]
            temp_arrow_list.append(temp_arrow)
            temp_arrow_start_list.append(temp_arrow_start)

            fig, ax = plt.subplots()
            self.plotting_with_coloring_option("PC", fig, ax, input_data_for_plotting=data, color_option='step',
                                               title=item.strip().split('/')[-1])
            ax.quiver([temp_arrow_start[0]], [temp_arrow_start[1]], [temp_arrow[0]], [temp_arrow[1]],
                      units="xy", scale=1)
            if save_fig:
                fig.savefig(ax.get_title() + '.png')

        # plotting K-S stats
        potential_centers_list = np.array(potential_centers_list)
        temp_arrow_list = np.array(temp_arrow_list)
        temp_arrow_start_list = np.array(temp_arrow_start_list)
        fig, ax = plt.subplots()
        im = ax.scatter(potential_centers_list[:, 0], potential_centers_list[:, 1], c=ks_stats_list, cmap="Blues")
        col_bar = fig.colorbar(im, ax=ax)
        col_bar.set_label("KS value")
        for pc, arr_start in zip(potential_centers_list, temp_arrow_start_list):
            # connect potential center to starting point of arrow with dashed line
            ax.plot([pc[0], arr_start[0]], [pc[1], arr_start[1]], linestyle='dotted')

        ax.quiver(temp_arrow_start_list[:, 0], temp_arrow_start_list[:, 1],
                  temp_arrow_list[:, 0], temp_arrow_list[:, 1],
                  units='xy', scale=1)
        ax.set_xlabel("PC1")
        ax.set_ylabel("PC2")
        fig.set_size_inches((10, 10))
        fig.savefig("temp_harmonic_centers_and_stats.png")

        return
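
# For reference, a minimal self-contained sketch (hypothetical data, no repo dependencies)
# of the split-and-compare idea behind equilibration_check() above: pieces of a
# well-equilibrated trajectory are samples from the same distribution, so the two-sample
# Kolmogorov-Smirnov statistic between them stays small, while a drifting trajectory
# gives a large one.
def _sketch_ks_equilibration_idea():
    import numpy as np
    import scipy.stats
    rng = np.random.default_rng(0)
    equilibrated = rng.normal(0.0, 1.0, size=2000)  # stationary data
    drifting = np.concatenate([rng.normal(0.0, 1.0, 1000), rng.normal(2.0, 1.0, 1000)])  # mean drifts halfway through
    for name, traj in [('equilibrated', equilibrated), ('drifting', drifting)]:
        first_half, last_half = np.split(traj, 2)
        ks_stat = scipy.stats.ks_2samp(first_half, last_half)[0]
        print(name, ks_stat)  # the drifting trajectory gives a much larger KS statistic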

class machine_independent_run(object):
    def __init__(self):
        return

    @staticmethod
    def run_commands(machine_to_run_simulations, commands, cuda, max_num_failed_jobs):
        if machine_to_run_simulations == 'cluster':
            cluster_management.create_sge_files_for_commands(list_of_commands_to_run=commands,
                                                             run_on_gpu=cuda)
            cluster_management.monitor_status_and_submit_periodically(num=CONFIG_14,
                                                                      monitor_mode='normal',
                                                                      num_of_running_jobs_when_allowed_to_stop=500)  # should not loop forever
        elif machine_to_run_simulations == 'local':
            total_num_failed_jobs = Helper_func.run_multiple_jobs_on_local_machine(commands=commands)
            assert (total_num_failed_jobs < max_num_failed_jobs)
        else:
            raise Exception('machine type error')
        return
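
# Helper_func.run_multiple_jobs_on_local_machine used above lives in helper_func.py,
# which is not shown in this dump; a minimal sketch of what such a runner could look
# like, returning the failure count the way run_commands() expects:
def _sketch_local_job_runner(commands):
    import subprocess
    num_failed = 0
    for cmd in commands:
        # the stored commands are full shell strings, hence shell=True
        if subprocess.call(cmd, shell=True) != 0:
            num_failed += 1
    return num_failed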

class iteration(object):
    def __init__(self, index,
                 network=None  # if you want to start with an existing network, assign a value to "network"
                 ):
        self._index = index
        self._network = network

    @staticmethod
    def preprocessing(machine_to_run_simulations=CONFIG_24, target_folder=None):
        """
        1. align structures
        2. generate coordinate files
        """
        reference_suffix_list = CONFIG_63
        reference_configs = CONFIG_62
        atom_selection_list = CONFIG_64
        assert (len(reference_configs) == len(reference_suffix_list)), (
            len(reference_configs), len(reference_suffix_list))
        num_of_reference_configs = len(reference_configs)
        if target_folder is not None:
            temp_target_folder = target_folder
        else:
            if isinstance(molecule_type, Trp_cage):
                temp_target_folder = '../target/Trp_cage'
            elif isinstance(molecule_type, Alanine_dipeptide):
                temp_target_folder = '../target/Alanine_dipeptide'
            else:
                raise Exception("molecule type error")

        if CONFIG_48 == 'Cartesian':
            for _1 in range(num_of_reference_configs):
                temp_command_list = ['python', 'structural_alignment.py', temp_target_folder,
                                     '--ref', reference_configs[_1], '--suffix', reference_suffix_list[_1],
                                     '--atom_selection', atom_selection_list[_1]
                                     ]
                if machine_to_run_simulations == 'local':
                    subprocess.check_output(temp_command_list)
                elif machine_to_run_simulations == 'cluster':
                    temp_command = ' '.join(['"%s"' % item for item in temp_command_list]) + ' 2> /dev/null '  # TODO: does it work by adding quotation marks to everything?
                    cluster_management.run_a_command_and_wait_on_cluster(command=temp_command)
                else:
                    raise Exception('machine type error')

        molecule_type.generate_coordinates_from_pdb_files(path_for_pdb=temp_target_folder)
        return

    def train_network_and_save(self, machine_to_run_simulations=CONFIG_24,
                               training_interval=1, num_of_trainings=CONFIG_13):
        """num_of_trainings is the number of trainings that we are going to run; we then
        pick the one that has the largest fraction of variance explained (FVE),
        which helps avoid using a network of very poor quality
        """
        command = 'python ../src/train_network_and_save_for_iter.py %d --training_interval %d --num_of_trainings %d' % \
                  (self._index, training_interval, num_of_trainings)
        if machine_to_run_simulations == 'local':
            print(command)
            temp_output = subprocess.check_output(command.strip().split(' ')).decode("utf-8")
        elif machine_to_run_simulations == 'cluster':
            command = 'OMP_NUM_THREADS=6 ' + command
            job_id = cluster_management.run_a_command_and_wait_on_cluster(command=command, ppn=10)
            output_file, _ = cluster_management.get_output_and_err_with_job_id(job_id=job_id)
            temp_output = subprocess.check_output(['cat', output_file]).decode("utf-8")
        else:
            raise Exception('machine type error')
        autoencoder_filename = temp_output.strip().split(
            'excited! this is the name of best network: ')[1].strip().split('\n')[0]  # locate filename in output

        print(temp_output)
        return autoencoder.load_from_pkl_file(autoencoder_filename)

    def run_simulation(self, machine_to_run_simulations=CONFIG_24, commands=None, cuda=None):
        if cuda is None: cuda = (CONFIG_23 == 'CUDA')
        if commands is None:
            commands = self._network.get_commands_for_further_biased_simulations()
        machine_independent_run.run_commands(machine_to_run_simulations, commands, cuda,
                                             CONFIG_31)  # we do not allow more than CONFIG_31 failed simulations in each iteration

        # the next step should run only when the jobs are done, check this
        if CONFIG_29:
            molecule_type.remove_water_mol_and_Cl_from_pdb_file(preserve_original_file=CONFIG_50)
        return
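
# For reference, the alignment step in iteration.preprocessing() above shells out to
# structural_alignment.py once per reference configuration; with hypothetical values
# standing in for the CONFIG_62/CONFIG_63/CONFIG_64 entries, the assembled command
# would look like:
#
#   python structural_alignment.py ../target/Trp_cage --ref ../resources/1l2y.pdb \
#       --suffix 1l2y --atom_selection "backbone"
#
# (structural_alignment.py is part of this repo but its contents are not shown in this dump)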

class simulation_with_ANN_main(object):
    def __init__(self, num_of_iterations=1,
                 initial_iteration=None,  # this is where we start with
                 training_interval=None,
                 ):
        self._num_of_iterations = num_of_iterations
        self._initial_iteration = initial_iteration
        self._training_interval = training_interval
        print("running iterations for system: %s" % CONFIG_30)
        return

    def run_one_iteration(self, one_iteration):
        if one_iteration is None:  # the None check must come before one_iteration is used
            one_iteration = iteration(1, network=None)
        one_iteration.preprocessing()
        if one_iteration._network is None:
            one_iteration._network = one_iteration.train_network_and_save(
                training_interval=self._training_interval)  # train it if it is empty
        one_iteration._network.write_coefficients_of_connections_into_file()
        print('running this iteration #index = %d' % one_iteration._index)
        one_iteration.run_simulation()
        return

    def run_mult_iterations(self, num=None):
        if num is None: num = self._num_of_iterations

        current_iter = self._initial_iteration
        if current_iter is None:
            current_iter = iteration(1, network=None)  # otherwise current_iter._index below would fail
        for _ in range(num):
            self.run_one_iteration(current_iter)
            next_index = current_iter._index + 1
            current_iter = iteration(next_index, None)

        return
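
# The control flow of simulation_with_ANN_main above, reduced to a self-contained
# skeleton (the names here are hypothetical placeholders, not repo functions): each
# iteration preprocesses the accumulated data, trains an autoencoder to obtain CVs if
# none is given, then launches biased simulations along those CVs; the next iteration
# retrains from scratch on the enlarged data set.
def _sketch_accelerated_sampling_loop(num_iterations, preprocess, train, sample):
    network = None
    for index in range(1, num_iterations + 1):
        preprocess()
        if network is None:
            network = train(index)
        sample(network)
        network = None  # force retraining next iteration, as run_mult_iterations does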

class single_biased_simulation_data(object):
    def __init__(self, my_network, file_for_single_biased_simulation_coor):
        """my_network is the corresponding network for this biased simulation"""
        self._file_for_single_biased_simulation_coor = file_for_single_biased_simulation_coor
        self._my_network = my_network
        temp_potential_center_string = file_for_single_biased_simulation_coor.split('_pc_[')[1].split(']')[0]
        self._potential_center = [float(item) for item in temp_potential_center_string.split(',')]
        self._force_constant = float(file_for_single_biased_simulation_coor.split('output_fc_')[1].split('_pc_')[0])
        self._number_of_data = float(subprocess.check_output(['wc', '-l', file_for_single_biased_simulation_coor]).decode("utf-8").split()[0])

        if self._my_network is not None:
            if self._my_network._hidden_layers_type[1] == "Circular":
                self._dimension_of_PCs = self._my_network._node_num[2] // 2
            else:
                self._dimension_of_PCs = self._my_network._node_num[2]

        return

    def get_center_of_data_cloud_in_this_biased_simulation(self, input_data_type):
        if input_data_type == 'cossin':
            PCs = self._my_network.get_PCs(molecule_type.get_many_cossin_from_coordinates_in_list_of_files(
                [self._file_for_single_biased_simulation_coor]))
        elif input_data_type == 'Cartesian':
            scaling_factor = CONFIG_49
            temp_data = np.load(self._file_for_single_biased_simulation_coor) / scaling_factor
            temp_data = Sutils.remove_translation(temp_data)
            PCs = self._my_network.get_PCs(temp_data)
        else:
            raise Exception('error input_data_type')

        assert (len(PCs[0]) == self._dimension_of_PCs)
        assert (len(PCs) == self._number_of_data)
        PCs_transpose = list(zip(*PCs))
        center_of_data_cloud = [sum(x) / len(x) for x in PCs_transpose]
        return center_of_data_cloud

    def get_offset_between_potential_center_and_data_cloud_center(self, input_data_type):
        """see if the push in this biased simulation actually works; a large offset means it
        does not work well
        """
        PCs_average = self.get_center_of_data_cloud_in_this_biased_simulation(input_data_type)
        offset = [PCs_average[item] - self._potential_center[item] for item in range(self._dimension_of_PCs)]
        return offset
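
# The file-name convention single_biased_simulation_data relies on, demonstrated on a
# hypothetical file name (the splits mirror the ones in __init__ above):
def _sketch_parse_biased_output_filename():
    fname = 'out/output_fc_500.000000_pc_[-0.3,0.7].npy'  # hypothetical example
    force_constant = float(fname.split('output_fc_')[1].split('_pc_')[0])
    potential_center = [float(v) for v in fname.split('_pc_[')[1].split(']')[0].split(',')]
    print(force_constant, potential_center)  # 500.0 [-0.3, 0.7]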

--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/src/biased_simulation.py:
--------------------------------------------------------------------------------
"""
This file runs biased simulation for alanine dipeptide only; it serves as the test case
for the more general biased_simulation_general.py, which can easily be extended to other
new systems.
"""

from ANN_simulation import *
from simtk.openmm.app import *
from simtk.openmm import *
from simtk.unit import *
from sys import stdout
import ast, argparse

import os
import datetime

from config import *

parser = argparse.ArgumentParser()
parser.add_argument("record_interval", type=int, help="interval to take snapshots")
parser.add_argument("total_num_of_steps", type=int, help="total number of simulation steps")
parser.add_argument("force_constant", type=float, help="force constant")
parser.add_argument("folder_to_store_output_files", type=str, help="folder to store the output pdb and report files")
parser.add_argument("autoencoder_info_file", type=str, help="file to store autoencoder information (coefficients)")
parser.add_argument("pc_potential_center", type=str, help="potential center (should include 'pc_' as prefix)")
parser.add_argument("--out_traj", type=str, default=None, help="output trajectory file")
parser.add_argument("--layer_types", type=str, default=str(CONFIG_27), help='layer types')
parser.add_argument("--num_of_nodes", type=str, default=str(CONFIG_3[:3]), help='number of nodes in each layer')
parser.add_argument("--temperature", type=int, default=CONFIG_21, help='simulation temperature')
parser.add_argument("--data_type_in_input_layer", type=int, default=1, help='data_type_in_input_layer, 0 = cos/sin, 1 = Cartesian coordinates')
parser.add_argument("--platform", type=str, default=CONFIG_23, help='platform on which the simulation is run')
parser.add_argument("--scaling_factor", type=float, default=float(CONFIG_49), help='scaling_factor for ANN_Force')
parser.add_argument("--starting_pdb_file", type=str, default='../resources/alanine_dipeptide.pdb', help='the input pdb file to start simulation')
parser.add_argument("--starting_frame", type=int, default=0, help="index of starting frame in the starting pdb file")
parser.add_argument("--minimize_energy", type=int, default=1, help='whether to minimize energy (1 = yes, 0 = no)')
parser.add_argument("--equilibration_steps", type=int, default=1000, help="number of steps for the equilibration process")
# next few options are for metadynamics
parser.add_argument("--bias_method", type=str, default='US', help="biasing method for enhanced sampling, US = umbrella sampling, MTD = metadynamics")
parser.add_argument("--MTD_pace", type=int, default=CONFIG_66, help="pace of metadynamics")
parser.add_argument("--MTD_height", type=float, default=CONFIG_67, help="height of metadynamics")
parser.add_argument("--MTD_sigma", type=float, default=CONFIG_68, help="sigma of metadynamics")
parser.add_argument("--MTD_WT", type=int, default=CONFIG_69, help="whether to use the well-tempered version")
parser.add_argument("--MTD_biasfactor", type=float, default=CONFIG_70, help="biasfactor of well-tempered metadynamics")
# following is for plumed script
parser.add_argument("--plumed_file", type=str, default=None, help="plumed script for biasing force, used only when bias_method == plumed_other")
parser.add_argument("--plumed_add_string", type=str, default="", help="additional string to be attached to the end of the plumed script in args.plumed_file")
# note on "force_constant_adjustable" mode:
# the simulation will stop if either:
#   the force constant is greater than or equal to max_force_constant,
#   or the distance between the center of the data cloud and the potential center is smaller than distance_tolerance
parser.add_argument("--fc_adjustable", help="set the force constant to be adjustable", action="store_true")
parser.add_argument("--max_fc", type=float, default=CONFIG_32, help="max force constant (for force_constant_adjustable mode)")
parser.add_argument("--fc_step", type=float, default=CONFIG_34, help="the value by which the force constant is increased each time (for force_constant_adjustable mode)")
parser.add_argument("--distance_tolerance", type=float, default=CONFIG_35, help="max distance allowed between center of data cloud and potential center (for force_constant_adjustable mode)")
parser.add_argument("--autoencoder_file", type=str, help="pkl file that stores the autoencoder (for force_constant_adjustable mode)")
parser.add_argument("--remove_previous", help="remove previous outputs while adjusting force constants", action="store_true")
args = parser.parse_args()
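
# Example invocation (hypothetical paths and values; the six positional arguments are
# the ones declared above, and the potential center must carry the "pc_" prefix):
#
#   python biased_simulation.py 50 50000 500 ../target/ANN \
#       ../resources/autoencoder_info_1.npy pc_-0.3,0.7
#
# i.e. take a snapshot every 50 steps, run 50000 steps in total, and restrain the system
# with force constant 500 around the point (-0.3, 0.7) in CV space.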

record_interval = args.record_interval
total_number_of_steps = args.total_num_of_steps
input_data_type = ['cossin', 'Cartesian', 'pairwise'][args.data_type_in_input_layer]
force_constant = args.force_constant
scaling_factor = args.scaling_factor
layer_types = re.sub(r"\[|\]|\"|\'| ", '', args.layer_types).split(',')
num_of_nodes = re.sub(r"\[|\]|\"|\'| ", '', args.num_of_nodes).split(',')
num_of_nodes = [int(item) for item in num_of_nodes]
out_format = '.dcd' if args.out_traj is None else os.path.splitext(args.out_traj)[1]

if float(force_constant) != 0:
    from ANN import *

folder_to_store_output_files = args.folder_to_store_output_files  # this is used to separate outputs for different networks into different folders
autoencoder_info_file = args.autoencoder_info_file

potential_center = list([float(x) for x in args.pc_potential_center.replace('"', '')
                         .replace('pc_', '').split(',')])  # this API is the generalization for higher-dimensional cases

if not os.path.exists(folder_to_store_output_files):
    try:
        os.makedirs(folder_to_store_output_files)
    except OSError:  # the folder may have been created by a concurrent job in the meantime
        pass
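
# What the potential-center parsing above produces for a typical argument (hypothetical
# value, quoted the way it may arrive from a shell command line):
def _sketch_parse_potential_center():
    pc_arg = '"pc_-0.3,0.7"'
    parsed = [float(x) for x in pc_arg.replace('"', '').replace('pc_', '').split(',')]
    print(parsed)  # [-0.3, 0.7]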

def run_simulation(force_constant):
    assert (os.path.exists(folder_to_store_output_files))
    input_pdb_file_of_molecule = args.starting_pdb_file
    force_field_file = 'amber99sb.xml'
    water_field_file = 'tip3p.xml'
    pdb_reporter_file = '%s/output_fc_%f_pc_%s.pdb' % (folder_to_store_output_files, force_constant, str(potential_center).replace(' ', ''))

    if args.out_traj is not None:
        pdb_reporter_file = args.out_traj

    state_data_reporter_file = pdb_reporter_file.replace('output_fc', 'report_fc').replace('.pdb', '.txt')

    # check if the files already exist
    for item_filename in [pdb_reporter_file, state_data_reporter_file]:
        Helper_func.backup_rename_file_if_exists(item_filename)

    index_of_backbone_atoms = CONFIG_57[0]
    flag_random_seed = 0  # whether we need to fix this random seed

    simulation_temperature = args.temperature
    time_step = CONFIG_22  # simulation time step, in ps

    pdb = PDBFile(input_pdb_file_of_molecule)
    modeller = Modeller(pdb.topology, pdb.getPositions(frame=args.starting_frame))
    solvent_opt = 'no_water'
    if solvent_opt == 'explicit':
        forcefield = ForceField(force_field_file, water_field_file)
        modeller.addSolvent(forcefield, model=water_field_file.split('.xml')[0], boxSize=Vec3(3, 3, 3) * nanometers,
                            ionicStrength=0 * molar)
        system = forcefield.createSystem(modeller.topology, nonbondedMethod=PME, nonbondedCutoff=1.0 * nanometers,
                                         constraints=AllBonds, ewaldErrorTolerance=0.0005)
    else:
        forcefield = ForceField(force_field_file)
        system = forcefield.createSystem(modeller.topology, nonbondedMethod=NoCutoff, constraints=AllBonds)

    if args.bias_method == "US":
        if float(force_constant) != 0:
            force = ANN_Force()
            force.set_layer_types(layer_types)
            force.set_data_type_in_input_layer(args.data_type_in_input_layer)
            force.set_list_of_index_of_atoms_forming_dihedrals_from_index_of_backbone_atoms(index_of_backbone_atoms)
            force.set_index_of_backbone_atoms(index_of_backbone_atoms)
            if args.data_type_in_input_layer == 2:
                force.set_list_of_pair_index_for_distances(CONFIG_80)

            force.set_num_of_nodes(num_of_nodes)
            force.set_potential_center(potential_center)
            force.set_force_constant(float(force_constant))
            unit_scaling = 1.0  # TODO: check unit scaling
            force.set_scaling_factor(float(scaling_factor) / unit_scaling)  # since the default unit is nm in OpenMM

            # TODO: need to fix following for multi-hidden-layer cases
            temp_coeffs, temp_bias = np.load(autoencoder_info_file)
            for item_layer_index in [0, 1]:
                assert (len(temp_coeffs[item_layer_index]) ==
                        num_of_nodes[item_layer_index] * num_of_nodes[item_layer_index + 1]), (len(temp_coeffs[item_layer_index]),
                        (num_of_nodes[item_layer_index], num_of_nodes[item_layer_index + 1]))
                assert (len(temp_bias[item_layer_index]) == num_of_nodes[item_layer_index + 1]), (len(temp_bias[item_layer_index]), num_of_nodes[item_layer_index + 1])

            # need tolist() since the C++ plugin only accepts Python lists
            force.set_coeffients_of_connections([item_w.tolist() for item_w in temp_coeffs])
            force.set_values_of_biased_nodes([item_w.tolist() for item_w in temp_bias])

            system.addForce(force)
    elif args.bias_method == "US_on_phipsi":
        from openmmplumed import PlumedForce
        kappa_string = ','.join([str(force_constant) for _ in potential_center])
        plumed_force_string = """
phi: TORSION ATOMS=5,7,9,15
psi: TORSION ATOMS=7,9,15,17
restraint: RESTRAINT ARG=phi,psi AT=%f,%f KAPPA=%s
PRINT STRIDE=10 ARG=* FILE=COLVAR
""" % (potential_center[0], potential_center[1], kappa_string)
        system.addForce(PlumedForce(plumed_force_string))
    elif args.bias_method == "MTD":
        from openmmplumed import PlumedForce
        plumed_force_string = Alanine_dipeptide.get_expression_script_for_plumed()
        with open(autoencoder_info_file, 'r') as f_in:
            plumed_force_string += f_in.read()

        # note that the dimensionality of MTD is determined by the potential_center string
        plumed_script_ANN_mode = 'ANN'
        if plumed_script_ANN_mode == 'native':
            mtd_output_layer_string = ['l_2_out_%d' % item for item in range(len(potential_center))]
        elif plumed_script_ANN_mode == 'ANN':
            mtd_output_layer_string = ['ann_force.%d' % item for item in range(len(potential_center))]
        else: raise Exception('mode error')

        mtd_output_layer_string = ','.join(mtd_output_layer_string)
        mtd_sigma_string = ','.join([str(args.MTD_sigma) for _ in range(len(potential_center))])
        if args.MTD_WT:
            mtd_well_tempered_string = 'TEMP=%d BIASFACTOR=%f' % (args.temperature, args.MTD_biasfactor)
        else:
            mtd_well_tempered_string = ""
        plumed_force_string += """
metad: METAD ARG=%s PACE=%d HEIGHT=%f SIGMA=%s FILE=temp_MTD_hills.txt %s
PRINT STRIDE=%d ARG=%s,metad.bias FILE=temp_MTD_out.txt
""" % (mtd_output_layer_string, args.MTD_pace, args.MTD_height, mtd_sigma_string, mtd_well_tempered_string,
       record_interval, mtd_output_layer_string)
        # print(plumed_force_string)
        system.addForce(PlumedForce(plumed_force_string))
    elif args.bias_method == "SMD":
        # TODO: this is a temporary version
        from openmmplumed import PlumedForce
        kappa_string = '1000,1000'
        plumed_force_string = """
phi: TORSION ATOMS=5,7,9,15
psi: TORSION ATOMS=7,9,15,17
restraint: MOVINGRESTRAINT ARG=phi,psi AT0=-1.5,1.0 STEP0=0 KAPPA0=%s AT1=1.0,-1.0 STEP1=%d KAPPA1=%s
PRINT STRIDE=10 ARG=* FILE=COLVAR
""" % (kappa_string, total_number_of_steps, kappa_string)
        system.addForce(PlumedForce(plumed_force_string))
    elif args.bias_method == "TMD":  # targeted MD
        # TODO: this is a temporary version
        from openmmplumed import PlumedForce
        kappa_string = '10000'
        plumed_force_string = """
phi: TORSION ATOMS=5,7,9,15
psi: TORSION ATOMS=7,9,15,17
rmsd: RMSD REFERENCE=../resources/alanine_ref_1_TMD.pdb TYPE=OPTIMAL
restraint: MOVINGRESTRAINT ARG=rmsd AT0=0 STEP0=0 KAPPA0=0 AT1=0 STEP1=%d KAPPA1=%s
PRINT STRIDE=10 ARG=* FILE=COLVAR
""" % (total_number_of_steps, kappa_string)
        system.addForce(PlumedForce(plumed_force_string))
    elif args.bias_method == "plumed_other":
        from openmmplumed import PlumedForce
        with open(args.plumed_file, 'r') as f_in:
            plumed_force_string = f_in.read().strip() + args.plumed_add_string
        system.addForce(PlumedForce(plumed_force_string))
    else:
        raise Exception('bias method error')
    # end of biased force
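    # For reference: PLUMED's RESTRAINT used in the US_on_phipsi branch above applies a
    # harmonic bias U(x) = sum_i KAPPA_i / 2 * (x_i - AT_i)**2 (plus an optional linear
    # term, zero by default), so kappa_string supplies one spring constant per CV
    # dimension; see the PLUMED manual entry for RESTRAINT.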

    integrator = LangevinIntegrator(simulation_temperature*kelvin, 1/picosecond, time_step*picoseconds)
    if flag_random_seed:
        integrator.setRandomNumberSeed(1)  # set random seed

    platform = Platform.getPlatformByName(args.platform)
    platform.loadPluginsFromDirectory(CONFIG_25)  # load the plugin from a specific directory

    simulation = Simulation(modeller.topology, system, integrator, platform)
    simulation.context.setPositions(modeller.positions)
    if args.minimize_energy:
        print('begin Minimizing energy...')
        print(datetime.datetime.now())
        simulation.minimizeEnergy()
        print('Done minimizing energy.')
        print(datetime.datetime.now())
    else:
        print('energy minimization not required')

    simulation.step(args.equilibration_steps)  # equilibration runs before any reporter is attached
    if out_format == '.pdb':
        simulation.reporters.append(PDBReporter(pdb_reporter_file, record_interval))
    elif out_format == '.dcd':
        simulation.reporters.append(DCDReporter(pdb_reporter_file.replace('.pdb', '.dcd'), record_interval))
    simulation.reporters.append(StateDataReporter(state_data_reporter_file, record_interval,
                                                  step=True, potentialEnergy=True, kineticEnergy=True, speed=True,
                                                  temperature=True, progress=True, remainingTime=True,
                                                  totalSteps=total_number_of_steps + args.equilibration_steps,
                                                  ))
    simulation.step(total_number_of_steps)

    print('Done biased simulation!')
    return pdb_reporter_file

def get_distance_between_data_cloud_center_and_potential_center(pdb_file):
    coor_file = Alanine_dipeptide().generate_coordinates_from_pdb_files(pdb_file)[0]
    temp_network = autoencoder.load_from_pkl_file(args.autoencoder_file)
    this_simulation_data = single_biased_simulation_data(temp_network, coor_file)
    offset = this_simulation_data.get_offset_between_potential_center_and_data_cloud_center(input_data_type)
    if layer_types[1] == "Circular":
        offset = [min(abs(item), abs(item + 2 * np.pi), abs(item - 2 * np.pi)) for item in offset]
        print("circular offset")
    print('offset = %s' % str(offset))
    distance = sqrt(sum([item * item for item in offset]))
    return distance
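
# Quick check of the circular-offset rule above: for CVs living on a 2*pi-periodic
# circle, a raw offset must be wrapped back into [-pi, pi] before measuring distance.
def _sketch_circular_offset_wrapping():
    import numpy as np
    for raw_offset in [0.3, 3.5, -5.9]:
        wrapped = min(abs(raw_offset), abs(raw_offset + 2 * np.pi), abs(raw_offset - 2 * np.pi))
        print(raw_offset, '->', wrapped)  # 3.5 -> ~2.78, -5.9 -> ~0.38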

def run_simulation_ssages(force_constant):
    ssages_output_file = '%s/output_fc_%f_pc_%s.json' % (
        folder_to_store_output_files, force_constant, str(potential_center).replace(' ', ''))
    subprocess.check_output('python ../src/temp_create_json_ssages.py %s %s %s %s %s' % (
        ssages_output_file, str(potential_center).replace(' ', ''), autoencoder_info_file.replace('.npy', '.txt'),
        ssages_output_file.replace('.json', '.trr'), force_constant), shell=True)
    command = "ssages " + ssages_output_file
    subprocess.check_output(command, shell=True)
    pdb_reporter_file = ssages_output_file.replace('.json', '.pdb')
    subprocess.check_output('mdconvert -o %s %s -t ../resources/alanine_dipeptide.pdb' % (
        pdb_reporter_file, pdb_reporter_file.replace('.pdb', '.trr')), shell=True)
    return pdb_reporter_file


if __name__ == '__main__':
    if not args.fc_adjustable:
        run_simulation(args.force_constant)
    else:
        force_constant = args.force_constant
        distance_of_data_cloud_center = float("inf")
        while force_constant < args.max_fc and distance_of_data_cloud_center > args.distance_tolerance:
            if args.remove_previous:
                try:
                    command = 'rm %s/*%s*' % (folder_to_store_output_files, str(potential_center).replace(' ', ''))
                    command = command.replace('[', '').replace(']', '')
                    subprocess.check_output(command, shell=True)
                    print("removing previous results...")
                except:
                    pass
            pdb_file = run_simulation(force_constant)
            distance_of_data_cloud_center = get_distance_between_data_cloud_center_and_potential_center(pdb_file)
            force_constant += args.fc_step
            print("distance_between_data_cloud_center_and_potential_center = %f" % distance_of_data_cloud_center)

--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/src/biased_simulation_general.py:
--------------------------------------------------------------------------------
from ANN_simulation import *
import datetime, os, argparse
from simtk.openmm.app import *
from simtk.openmm import *
from simtk.unit import *
from sys import stdout
import ast
from config import *

############################ PARAMETERS BEGIN ###############################################################

parser = argparse.ArgumentParser()
parser.add_argument("molecule", type=str, help="type of molecule for the simulation")
parser.add_argument("record_interval", type=int, help="interval to take snapshots")
parser.add_argument("total_num_of_steps", type=int, help="total number of simulation steps")
parser.add_argument("force_constant", type=float, help="force constant")
parser.add_argument("folder_to_store_output_files", type=str, help="folder to store the output pdb and report files")
parser.add_argument("autoencoder_info_file", type=str, help="file to store autoencoder information (coefficients)")
parser.add_argument("pc_potential_center", type=str, help="potential center (should include 'pc_' as prefix)")
parser.add_argument("whether_to_add_water_mol_opt", type=str, help='whether to add water (options: explicit, implicit, water_already_included, no_water)')
parser.add_argument("ensemble_type", type=str, help='simulation ensemble type, either NVT or NPT')
parser.add_argument("--out_traj", type=str, default=None, help="output trajectory file")
parser.add_argument("--layer_types", type=str, default=str(CONFIG_27), help='layer types')
parser.add_argument("--num_of_nodes", type=str, default=str(CONFIG_3[:3]), help='number of nodes in each layer')
parser.add_argument("--scaling_factor", type=float, default=CONFIG_49, help='scaling_factor for ANN_Force')
parser.add_argument("--temperature", type=int, default=300, help='simulation temperature')
parser.add_argument("--starting_pdb_file", type=str, default='auto', help='the input pdb file to start simulation')
parser.add_argument("--starting_frame", type=int, default=0, help="index of starting frame in the starting pdb file")
parser.add_argument("--minimize_energy", type=int, default=1, help='whether to minimize energy (1 = yes, 0 = no)')
parser.add_argument("--data_type_in_input_layer", type=int, default=1, help='data_type_in_input_layer, 0 = cos/sin, 1 = Cartesian coordinates')
parser.add_argument("--platform", type=str, default=CONFIG_23, help='platform on which the simulation is run')
parser.add_argument("--device", type=str, default='none', help='device index to run simulation on')
parser.add_argument("--checkpoint", type=int, default=1, help="whether to save a checkpoint at the end of the simulation")
parser.add_argument("--starting_checkpoint", type=str, default="auto", help='starting checkpoint file used to resume a simulation ("none" means no checkpoint file is provided, "auto" means the checkpoint in the output folder is picked up automatically if it exists)')
parser.add_argument("--equilibration_steps", type=int, default=1000, help="number of steps for the equilibration process") 36 | parser.add_argument("--fast_equilibration", type=int, default=0, help="do fast equilibration by running biased simulation with larger force constant") 37 | parser.add_argument("--remove_eq_file", type=int, default=1, help="remove equilibration pdb files associated with fast equilibration") 38 | parser.add_argument("--auto_equilibration", help="enable auto equilibration so that it will run enough equilibration steps", action="store_true") 39 | # next few options are for metadynamics 40 | parser.add_argument("--bias_method", type=str, default='US', help="biasing method for enhanced sampling, US = umbrella sampling, MTD = metadynamics") 41 | parser.add_argument("--MTD_pace", type=int, default=CONFIG_66, help="pace of metadynamics") 42 | parser.add_argument("--MTD_height", type=float, default=CONFIG_67, help="height of metadynamics") 43 | parser.add_argument("--MTD_sigma", type=float, default=CONFIG_68, help="sigma of metadynamics") 44 | parser.add_argument("--MTD_WT", type=int, default=CONFIG_69, help="whether to use well-tempered version") 45 | parser.add_argument("--MTD_biasfactor", type=float, default=CONFIG_70, help="biasfactor of well-tempered metadynamics") 46 | # following is for plumed script 47 | parser.add_argument("--plumed_file", type=str, default=None, help="plumed script for biasing force, used only when the bias_method == plumed_other") 48 | parser.add_argument("--plumed_add_string", type=str, default="", help="additional string to be attached to the end of plumed script in args.plumed_file") 49 | # note on "force_constant_adjustable" mode: 50 | # the simulation will stop if either: 51 | # force constant is greater or equal to max_force_constant 52 | # or distance between center of data cloud and potential center is smaller than distance_tolerance 53 | parser.add_argument("--fc_adjustable", help="set the force constant to be adjustable", action="store_true") 54 | parser.add_argument("--max_fc", type=float, default=CONFIG_32, help="max force constant (for force_constant_adjustable mode)") 55 | parser.add_argument("--fc_step", type=float, default=CONFIG_34, help="the value by which the force constant is increased each time (for force_constant_adjustable mode)") 56 | parser.add_argument("--distance_tolerance", type=float, default=CONFIG_35, help="max distance allowed between center of data cloud and potential center (for force_constant_adjustable mode)") 57 | parser.add_argument("--autoencoder_file", type=str, help="pkl file that stores autoencoder (for force_constant_adjustable mode)") 58 | parser.add_argument("--remove_previous", help="remove previous outputs while adjusting force constants", action="store_true") 59 | args = parser.parse_args() 60 | 61 | print("start simulation at %s" % datetime.datetime.now()) # to calculate compile time 62 | 63 | record_interval = args.record_interval 64 | total_number_of_steps = args.total_num_of_steps 65 | force_constant = args.force_constant 66 | scaling_factor = args.scaling_factor 67 | num_of_nodes = re.sub("\[|\]|\"|\'| ",'', args.num_of_nodes).split(',') 68 | num_of_nodes = [int(item) for item in num_of_nodes] 69 | out_format = '.dcd' if args.out_traj is None else os.path.splitext(args.out_traj)[1] 70 | 71 | platform = Platform.getPlatformByName(args.platform) 72 | temperature = args.temperature 73 | input_data_type = ['cossin', 'Cartesian', 'pairwise'][args.data_type_in_input_layer] 74 | 75 | if 

if float(force_constant) != 0:
    from ANN import *
    platform.loadPluginsFromDirectory(CONFIG_25)  # load the plugin from a specific directory

folder_to_store_output_files = args.folder_to_store_output_files  # this is used to separate outputs for different networks into different folders
autoencoder_info_file = args.autoencoder_info_file

potential_center = list([float(x) for x in args.pc_potential_center.replace('"', '')
                         .replace('pc_', '').split(',')])  # this API is the generalization for higher-dimensional cases

def run_simulation(force_constant, number_of_simulation_steps):
    if not os.path.exists(folder_to_store_output_files):
        try:
            os.makedirs(folder_to_store_output_files)
        except OSError:
            pass

    assert (os.path.exists(folder_to_store_output_files))

    force_field_file = {'Trp_cage': 'amber03.xml', '2src': 'amber03.xml', '1y57': 'amber03.xml',
                        'BetaHairpin': 'amber03.xml', 'C24': 'charmm36.xml', 'BPTI': 'amber03.xml'
                        }[args.molecule]
    water_field_file = {'Trp_cage': 'tip4pew.xml', '2src': 'tip3p.xml', '1y57': 'tip3p.xml',
                        'BetaHairpin': 'tip3p.xml', 'C24': 'charmm36/spce.xml', 'BPTI': 'tip4pew.xml'}[args.molecule]
    water_model = water_field_file.replace('.xml', '').replace('charmm36/', '')
    ionic_strength = {'Trp_cage': 0 * molar, '2src': 0.5 * .15 * molar, '1y57': 0.5 * .15 * molar,
                      'BetaHairpin': 0 * molar, 'C24': 0 * molar, 'BPTI': 0 * molar}[args.molecule]
    implicit_solvent_force_field = 'amber03_obc.xml'

    pdb_reporter_file = '%s/output_fc_%s_pc_%s_T_%d_%s_%s.pdb' % (folder_to_store_output_files, force_constant,
                                                                  str(potential_center).replace(' ', ''), temperature,
                                                                  args.whether_to_add_water_mol_opt, args.ensemble_type)

    if args.starting_pdb_file == 'auto':
        input_pdb_file_of_molecule = {'Trp_cage': '../resources/1l2y.pdb',
                                      '2src': '../resources/2src.pdb',
                                      '1y57': '../resources/1y57.pdb',
                                      'BetaHairpin': '../resources/BetaHairpin.pdb',
                                      'C24': '../resources/C24.pdb', 'BPTI': '../resources/bpti.pdb'}[args.molecule]
    else:
        input_pdb_file_of_molecule = args.starting_pdb_file
        pdb_reporter_file = pdb_reporter_file.split('.pdb')[0] + '_sf_%s.pdb' % \
            args.starting_pdb_file.split('_sf_')[0].split('.pdb')[0].split('/')[-1]  # 'sf' means 'starting_from'

    print("start_pdb = %s" % input_pdb_file_of_molecule)
    if args.starting_frame != 0:
        pdb_reporter_file = pdb_reporter_file.split('.pdb')[0] + '_ff_%d.pdb' % args.starting_frame  # 'ff' means 'from_frame'

    if args.out_traj is not None:
        pdb_reporter_file = args.out_traj

    state_data_reporter_file = pdb_reporter_file.replace('output_fc', 'report_fc').replace('.pdb', '.txt')
    checkpoint_file = pdb_reporter_file.replace('output_fc', 'checkpoint_fc').replace('.pdb', '.chk')
    if args.fast_equilibration:
        checkpoint_file = checkpoint_file.replace(str(force_constant), str(args.force_constant))

    # check existence
    for item_filename in [pdb_reporter_file, state_data_reporter_file]:
        Helper_func.backup_rename_file_if_exists(item_filename)

    flag_random_seed = 0  # whether we need to fix this random seed
    box_size = {'Trp_cage': 4.5, '2src': 8.0, '1y57': 8.0,
                'BetaHairpin': 5, 'C24': 5, 'BPTI': 5.1263}[args.molecule]
    time_step = CONFIG_22  # simulation time step, in ps

    index_of_backbone_atoms = {'Trp_cage': CONFIG_57[1],
                               '2src': CONFIG_57[2], '1y57': CONFIG_57[2],
                               'BetaHairpin': CONFIG_57[3],
                               'C24': CONFIG_57[4], 'BPTI': None}[args.molecule]

    layer_types = CONFIG_27
    simulation_constraints = HBonds

    pdb = PDBFile(input_pdb_file_of_molecule)
    modeller = Modeller(pdb.topology, pdb.getPositions(frame=args.starting_frame))

    if args.whether_to_add_water_mol_opt == 'explicit':
        forcefield = ForceField(force_field_file, water_field_file)
        modeller.addHydrogens(forcefield)
        modeller.addSolvent(forcefield, model=water_model, boxSize=Vec3(box_size, box_size, box_size)*nanometers,
                            ionicStrength=ionic_strength)
        if not water_model == 'spce': modeller.addExtraParticles(forcefield)
        system = forcefield.createSystem(modeller.topology, nonbondedMethod=PME, nonbondedCutoff=1.0 * nanometers,
                                         constraints=simulation_constraints, ewaldErrorTolerance=0.0005)
    elif args.whether_to_add_water_mol_opt == 'implicit':
        forcefield = ForceField(force_field_file, implicit_solvent_force_field)
        modeller.addHydrogens(forcefield)
        modeller.addExtraParticles(forcefield)
        # use modeller.topology (not pdb.topology): hydrogens and extra particles were just
        # added to the modeller, and the Simulation below is built from modeller.topology
        system = forcefield.createSystem(modeller.topology, nonbondedMethod=CutoffNonPeriodic, nonbondedCutoff=5 * nanometers,
                                         constraints=simulation_constraints, rigidWater=True, removeCMMotion=True)

    elif args.whether_to_add_water_mol_opt == 'no_water' or args.whether_to_add_water_mol_opt == 'water_already_included':
        forcefield = ForceField(force_field_file, water_field_file)
        modeller.addExtraParticles(forcefield)
        modeller.addHydrogens(forcefield)
        system = forcefield.createSystem(modeller.topology, nonbondedMethod=NoCutoff, nonbondedCutoff=1.0 * nanometers,
                                         constraints=simulation_constraints)
    else:
        raise Exception("parameter error")

    # print(modeller.topology.getPeriodicBoxVectors())

    system.addForce(AndersenThermostat(temperature*kelvin, 1/picosecond))
    if args.ensemble_type == "NPT" and args.whether_to_add_water_mol_opt == 'explicit':
        system.addForce(MonteCarloBarostat(1*atmospheres, temperature*kelvin, 25))

    # add custom force (only for biased simulation)
    if args.bias_method == "US":
        if float(force_constant) != 0:
            force = ANN_Force()
            force.set_layer_types(layer_types)
            force.set_data_type_in_input_layer(args.data_type_in_input_layer)
            force.set_list_of_index_of_atoms_forming_dihedrals_from_index_of_backbone_atoms(index_of_backbone_atoms)
            force.set_index_of_backbone_atoms(index_of_backbone_atoms)
            if args.data_type_in_input_layer == 2:
                force.set_list_of_pair_index_for_distances(CONFIG_80)
            force.set_num_of_nodes(num_of_nodes)
            force.set_potential_center(potential_center)
            force.set_force_constant(float(force_constant))
            unit_scaling = 1.0  # TODO: check unit scaling
            force.set_scaling_factor(float(scaling_factor) / unit_scaling)  # since the default unit is nm in OpenMM

            with open(autoencoder_info_file, 'r') as f_in:
                content = f_in.readlines()

            # TODO: need to fix following for multi-hidden-layer cases
            temp_coeffs = [ast.literal_eval(content[0].strip())[0], ast.literal_eval(content[1].strip())[0]]
            temp_bias = [ast.literal_eval(content[2].strip())[0], ast.literal_eval(content[3].strip())[0]]
            for item_layer_index in [0, 1]:
                assert (len(temp_coeffs[item_layer_index]) ==
                        num_of_nodes[item_layer_index] * num_of_nodes[item_layer_index + 1]), \
                    (len(temp_coeffs[item_layer_index]), num_of_nodes[item_layer_index], num_of_nodes[item_layer_index + 1])
                assert (len(temp_bias[item_layer_index]) == num_of_nodes[item_layer_index + 1]), (len(temp_bias[item_layer_index]), num_of_nodes[item_layer_index + 1])

            force.set_coeffients_of_connections(temp_coeffs)
            force.set_values_of_biased_nodes(temp_bias)

            system.addForce(force)
    elif args.bias_method == "MTD":
        from openmmplumed import PlumedForce
        molecule_type = {'Trp_cage': Trp_cage, '2src': Src_kinase, '1y57': Src_kinase, 'BetaHairpin': BetaHairpin}[args.molecule]
        plumed_force_string = molecule_type.get_expression_script_for_plumed()
        with open(autoencoder_info_file, 'r') as f_in:
            plumed_force_string += f_in.read()

        # note that the dimensionality of MTD is determined by the potential_center string
        mtd_output_layer_string = ['l_2_out_%d' % item for item in range(len(potential_center))]
        mtd_output_layer_string = ','.join(mtd_output_layer_string)
        mtd_sigma_string = ','.join([str(args.MTD_sigma) for _ in range(len(potential_center))])
        if args.MTD_WT:
            mtd_well_tempered_string = 'TEMP=%d BIASFACTOR=%f' % (args.temperature, args.MTD_biasfactor)
        else:
            mtd_well_tempered_string = ""
        plumed_force_string += """
metad: METAD ARG=%s PACE=%d HEIGHT=%f SIGMA=%s FILE=temp_MTD_hills.txt %s
PRINT STRIDE=%d ARG=%s,metad.bias FILE=temp_MTD_out.txt
""" % (mtd_output_layer_string, args.MTD_pace, args.MTD_height, mtd_sigma_string, mtd_well_tempered_string,
       record_interval, mtd_output_layer_string)
        system.addForce(PlumedForce(plumed_force_string))
    elif args.bias_method == "TMD":  # targeted MD
        # TODO: this is a temporary version
        from openmmplumed import PlumedForce
        kappa_string = str(args.force_constant)
        plumed_force_string = """
rmsd: RMSD REFERENCE=../resources/1y57_TMD.pdb TYPE=OPTIMAL
restraint: MOVINGRESTRAINT ARG=rmsd AT0=0.4 STEP0=0 KAPPA0=%s AT1=0 STEP1=%d KAPPA1=%s
PRINT STRIDE=500 ARG=* FILE=COLVAR
""" % (kappa_string, total_number_of_steps, kappa_string)
        system.addForce(PlumedForce(plumed_force_string))
    elif args.bias_method == "US_on_ANN_plumed":
        # in this case, all ANN-related parts (including scripts for inputs) have been stored in
        # args.plumed_file; we only need to add the biasing plumed script for umbrella sampling
        from openmmplumed import PlumedForce
        with open(args.plumed_file, 'r') as f_in:
            plumed_force_string = f_in.read()
        arg_string = ','.join(['ann_force.%d' % _2 for _2 in range(len(potential_center))])
        pc_string = ','.join([str(_2) for _2 in potential_center])
        kappa_string = ','.join([str(force_constant) for _ in potential_center])
        plumed_force_string += """\nmypotential: RESTRAINT ARG=%s AT=%s KAPPA=%s""" % (
            arg_string, pc_string, kappa_string,
        )
        system.addForce(PlumedForce(plumed_force_string))
    elif args.bias_method == "plumed_other":
        from openmmplumed import PlumedForce
        with open(args.plumed_file, 'r') as f_in:
            plumed_force_string = f_in.read().strip() + args.plumed_add_string
        system.addForce(PlumedForce(plumed_force_string))
    else:
        raise Exception('bias method error')
    # end add custom force

    integrator = VerletIntegrator(time_step*picoseconds)

    if flag_random_seed:
        integrator.setRandomNumberSeed(1)  # set random seed
== "CUDA" and args.device != 'none': 273 | properties = {'CudaDeviceIndex': args.device} 274 | simulation = Simulation(modeller.topology, system, integrator, platform, properties) 275 | else: 276 | simulation = Simulation(modeller.topology, system, integrator, platform) 277 | # print "positions = " 278 | # print (modeller.positions) 279 | simulation.context.setPositions(modeller.positions) 280 | print(datetime.datetime.now()) 281 | 282 | if args.starting_checkpoint != 'none': 283 | if args.starting_checkpoint == "auto": # restart from checkpoint if it exists 284 | if os.path.isfile(checkpoint_file): 285 | print("resume simulation from %s" % checkpoint_file) 286 | simulation.loadCheckpoint(checkpoint_file) 287 | else: 288 | print("resume simulation from %s" % args.starting_checkpoint) 289 | simulation.loadCheckpoint(args.starting_checkpoint) # the topology is already set by pdb file, and the positions in the pdb file will be overwritten by those in the starting_checkpoing file 290 | 291 | if args.minimize_energy: 292 | print('begin Minimizing energy...') 293 | print(datetime.datetime.now()) 294 | simulation.minimizeEnergy() 295 | print('Done minimizing energy.') 296 | print(datetime.datetime.now()) 297 | else: 298 | print('energy minimization not required') 299 | 300 | print("begin equilibrating...") 301 | print(datetime.datetime.now()) 302 | simulation.step(args.equilibration_steps) 303 | previous_distance_to_potential_center = 100 304 | current_distance_to_potential_center = 90 305 | if args.auto_equilibration: 306 | distance_change_tolerance = 0.05 307 | while abs(previous_distance_to_potential_center - current_distance_to_potential_center) > distance_change_tolerance: 308 | temp_pdb_reporter_file_for_auto_equilibration = pdb_reporter_file.replace('.pdb', '_temp.pdb') 309 | simulation.reporters.append(PDBReporter(temp_pdb_reporter_file_for_auto_equilibration, record_interval)) 310 | simulation.step(args.equilibration_steps) 311 | previous_distance_to_potential_center = current_distance_to_potential_center 312 | current_distance_to_potential_center = get_distance_between_data_cloud_center_and_potential_center( 313 | temp_pdb_reporter_file_for_auto_equilibration) 314 | subprocess.check_output(['rm', temp_pdb_reporter_file_for_auto_equilibration]) 315 | print("previous_distance_to_potential_center = %f\ncurrent_distance_to_potential_center = %f" % ( 316 | previous_distance_to_potential_center, current_distance_to_potential_center 317 | )) 318 | 319 | print("Done equilibration") 320 | print(datetime.datetime.now()) 321 | 322 | if out_format == '.pdb': 323 | simulation.reporters.append(PDBReporter(pdb_reporter_file, record_interval)) 324 | elif out_format == '.dcd': 325 | simulation.reporters.append(DCDReporter(pdb_reporter_file.replace('.pdb', '.dcd'), record_interval)) 326 | simulation.reporters.append(StateDataReporter(state_data_reporter_file, record_interval, time=True, 327 | step=True, potentialEnergy=True, kineticEnergy=True, speed=True, 328 | temperature=True, progress=True, remainingTime=True, volume = True,density=True, 329 | totalSteps=number_of_simulation_steps + args.equilibration_steps, 330 | )) 331 | simulation.step(number_of_simulation_steps) 332 | 333 | if args.checkpoint: 334 | Helper_func.backup_rename_file_if_exists(checkpoint_file) 335 | simulation.saveCheckpoint(checkpoint_file) 336 | 337 | print('Done!') 338 | print(datetime.datetime.now()) 339 | return pdb_reporter_file 340 | 341 | def get_distance_between_data_cloud_center_and_potential_center(pdb_file): 342 | 
coor_file = Trp_cage().generate_coordinates_from_pdb_files(pdb_file)[0] 343 | temp_network = autoencoder.load_from_pkl_file(args.autoencoder_file) 344 | print(coor_file) 345 | this_simulation_data = single_biased_simulation_data(temp_network, coor_file) 346 | offset = this_simulation_data.get_offset_between_potential_center_and_data_cloud_center(input_data_type) 347 | if CONFIG_17[1] == "Circular": 348 | offset = [min(abs(item), abs(item + 2 * np.pi), abs(item - 2 * np.pi)) for item in offset] 349 | print("circular offset") 350 | print('offset = %s' % str(offset)) 351 | distance = sqrt(sum([item * item for item in offset])) 352 | return distance 353 | 354 | if __name__ == '__main__': 355 | if not args.fc_adjustable: 356 | if args.fast_equilibration: 357 | temp_eq_force_constants = [args.force_constant * item for item in [5, 3, 2, 1.5, 1.2]] 358 | temp_eq_num_steps = [int(total_number_of_steps * item) for item in [0.02, 0.05, 0.05, 0.1, 0.1]] 359 | for item_1, item_2 in zip(temp_eq_force_constants, temp_eq_num_steps): 360 | temp_eq_pdb = run_simulation(item_1, item_2) 361 | if args.remove_eq_file: 362 | subprocess.check_output(['rm', temp_eq_pdb]) 363 | 364 | run_simulation(args.force_constant, total_number_of_steps) 365 | 366 | else: 367 | force_constant = args.force_constant 368 | distance_of_data_cloud_center = float("inf") 369 | while force_constant < args.max_fc and distance_of_data_cloud_center > args.distance_tolerance: 370 | if args.remove_previous: 371 | try: 372 | command = 'rm %s/*%s*' % (folder_to_store_output_files, str(potential_center).replace(' ','')) 373 | command = command.replace('[','').replace(']','') 374 | subprocess.check_output(command, shell=True) 375 | print("removing previous results...") 376 | except: 377 | pass 378 | pdb_file = run_simulation(force_constant, total_number_of_steps) 379 | distance_of_data_cloud_center = get_distance_between_data_cloud_center_and_potential_center(pdb_file) 380 | force_constant += args.fc_step 381 | print("distance_between_data_cloud_center_and_potential_center = %f" % distance_of_data_cloud_center) 382 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import copy, pickle, re, os, time, subprocess, datetime, itertools, sys, abc, argparse, matplotlib, glob 3 | matplotlib.use('agg') 4 | sys.path.append('/home/kengyangyao/Dropbox/temp_Linux/temp_research_proj/cluster_management/cm/src') 5 | sys.path.append('/home/kengyangyao/Dropbox/temp_Linux/temp_research_proj/plumed_helper') 6 | from plumed_helper import Plumed_helper 7 | from scipy import io as sciio 8 | import numpy as np, pandas as pd, seaborn as sns 9 | from numpy.testing import assert_almost_equal 10 | from math import * 11 | import matplotlib.pyplot as plt 12 | from sklearn.neighbors import RadiusNeighborsRegressor 13 | import matplotlib 14 | from Bio import PDB 15 | from sklearn.metrics import mean_squared_error 16 | from sklearn import linear_model 17 | from MDAnalysis import Universe 18 | from MDAnalysis.analysis.align import * 19 | from MDAnalysis.analysis.rms import rmsd 20 | from MDAnalysis.analysis.distances import distance_array 21 | 22 | '''This is the configuration file for all Python code in this directory, 23 | it configures all default values/global parameters for constructors/functions 24 | ''' 25 | 26 | 
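# A minimal usage sketch (illustrative only; the variable names on the left are examples,
# not definitions made in this file): modules in src/ consume these settings via a star import,
#
#     from config import *
#     ndim = CONFIG_36     # dimensionality of the CV space
#     fc = CONFIG_9        # force constant, already selected for CONFIG_30 by get_mol_param()
#
# where get_mol_param() (defined below) picks the list entry corresponding to the molecule
# named by CONFIG_30.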
####################################################################### 27 | ############ some global variables and helper functions ############ 28 | ####################################################################### 29 | 30 | CONFIG_30 = "Alanine_dipeptide" # the type of molecule we are studying 31 | WARNING_INFO = "Comment out this line to continue." 32 | 33 | def get_mol_param(parameter_list, molecule_name=CONFIG_30): # get molecule specific parameter using a parameter list 34 | molecule_name_to_index = {"Alanine_dipeptide": 0, "Trp_cage": 1, "Src_kinase": 2, 35 | "BetaHairpin": 3, "C24": 4} 36 | try: result = parameter_list[molecule_name_to_index[molecule_name]] 37 | except: result = None 38 | return result 39 | 40 | def get_index_list_with_selection_statement(pdb_file, atom_selection_statement): 41 | pdb_file_1 = os.path.join(os.path.dirname(__file__), pdb_file) 42 | return (Universe(pdb_file_1).select_atoms(atom_selection_statement).indices + 1).tolist() 43 | 44 | ####################################################################### 45 | ################## configurations ################################## 46 | ####################################################################### 47 | 48 | CONFIG_45 = 'keras' # training backend: "keras" 49 | CONFIG_48 = 'Cartesian' # input data type 50 | CONFIG_76 = 'Cartesian' # output data type 51 | CONFIG_75 = get_mol_param([None, None, None, None, None]) # weights for the expected output (equivalent to modifying error functions) 52 | CONFIG_52 = 64 # number of copies we generate for data augmentation 53 | CONFIG_58 = True # use representative points for training (generated by clustering) 54 | CONFIG_59 = 1000 # number of representative points 55 | 56 | # CONFIG_49 = get_mol_param([5.0, 20.0, 40.0, 20.0, 20.0]) # scaling factor for output for Cartesian coordinates 57 | CONFIG_49 = get_mol_param([0.5, 2.0, 4.0, 2.0, 2.0]) # scaling factor for Cartesian coordinates, be careful about units 58 | CONFIG_1 = ['../target/' + CONFIG_30] # list of directories that contains all coordinates files 59 | 60 | CONFIG_57 = [ 61 | get_index_list_with_selection_statement('../resources/alanine_dipeptide.pdb', 'name C or name CH3 or name CA or name N'), 62 | # get_index_list_with_selection_statement('../resources/alanine_dipeptide.pdb', 'not name H*'), 63 | get_index_list_with_selection_statement('../resources/1l2y.pdb', 'backbone and not name O'), 64 | # get_index_list_with_selection_statement('../resources/2src.pdb', 'backbone and not name O'), 65 | get_index_list_with_selection_statement('../resources/2src.pdb', 66 | '(resid 144:170 or resid 44:58) and not name H*'), 67 | get_index_list_with_selection_statement('../resources/BetaHairpin.pdb', 'backbone and not name O'), 68 | get_index_list_with_selection_statement('../resources/C24.pdb', 'name C*') 69 | ] # index list of atoms for training and biased simulations 70 | 71 | CONFIG_73 = get_mol_param(['name C or name CH3 or name CA or name N', 'name CA', 72 | '(resid 144:170 or resid 44:58) and name CA', 'name CA', 'name C*' 73 | ]) # atom selection for calculating pairwise distances, used only when it is in 'pairwise_distance' mode 74 | temp_CONFIG_80 = get_index_list_with_selection_statement( 75 | get_mol_param(['../resources/alanine_dipeptide.pdb', '../resources/1l2y.pdb', 76 | '../resources/2src.pdb', '../resources/BetaHairpin.pdb', '../resources/C24.pdb']), CONFIG_73 77 | ) 78 | CONFIG_80 = [[temp_CONFIG_80[item_xx], temp_CONFIG_80[item_yy]] 79 | for item_xx in range(len(temp_CONFIG_80)) 80 | for 
item_yy in range(item_xx + 1, len(temp_CONFIG_80))] # pair index list for pairwise distances as input 81 | 82 | CONFIG_17 = ['Tanh', 'Tanh', 'Tanh'] # types of hidden layers 83 | CONFIG_78 = "Linear" # output layer type 84 | CONFIG_79 = True # determine dimensionality of input/output of autoencoder automatically 85 | if CONFIG_76 == 'cossin': 86 | CONFIG_4 = get_mol_param([ 87 | [.5,.4,0, True, [0.001, 0.001, 0.001, 0.001]] if CONFIG_17[1] == "Circular" else [0.3, 0.9, 0, True, [0.00, 0.1, 0.00, 0.00]] 88 | ]) 89 | elif CONFIG_76 == 'Cartesian' or CONFIG_76 == 'combined': 90 | CONFIG_4 = get_mol_param([ 91 | [.5, 0.5, 0, True, 0.0], 92 | [0.3, 0.9, 0, True, 0.0], 93 | [0.3, 0.9, 0, True, 0.0], 94 | [0.3, 0.9, 0, True, 0.0], 95 | [0.3, 0.9, 0, True, 0.0], 96 | ]) # [learning rates, momentum, learning rate decay, nesterov, regularization coeff] 97 | elif CONFIG_76 == 'pairwise_distance': 98 | CONFIG_4 = get_mol_param([ 99 | [0.3, 0.9, 0, True, 0.0], 100 | [1.5, 0.9, 0, True, 0.0], 101 | [1.5, 0.9, 0, True, 0.0], 102 | [0.7, 0.8, 0, True, 0.0] 103 | ]) 104 | else: raise Exception('error') 105 | 106 | CONFIG_5 = 50 # max number of training epochs 107 | CONFIG_6 = None # filename to save this network 108 | CONFIG_36 = 2 # dimensionality 109 | CONFIG_37 = 2 * CONFIG_36 if CONFIG_17[1] == "Circular" else CONFIG_36 # number of nodes in bottleneck layer 110 | 111 | 112 | CONFIG_71 = False # use mixed error function (for Trp_cage only) 113 | CONFIG_62 = get_mol_param([ 114 | ['../resources/alanine_dipeptide.pdb', '../resources/alanine_ref_1.pdb'], 115 | ['../resources/1l2y.pdb', '../resources/Trp_cage_ref_1.pdb'] if not CONFIG_71 else ['../resources/1l2y.pdb', '../resources/1l2y.pdb'], # mixed_err 116 | # ['../resources/2src.pdb', '../resources/2src.pdb'] 117 | ['../resources/2src.pdb'], 118 | ['../resources/BetaHairpin.pdb'], None 119 | ]) # list of reference file 120 | CONFIG_63 = get_mol_param([ 121 | ['', '_1'], 122 | ['', '_1'], 123 | [''], [''], [''] 124 | ] 125 | ) # suffix for each reference configuration 126 | CONFIG_61 = ['_aligned%s_coordinates.npy' % item 127 | for item in CONFIG_63] # alignment_coor_file_suffix_list (we use different suffix for aligned files with respect to different references) 128 | CONFIG_64 = get_mol_param([ 129 | ['backbone', 'backbone'], 130 | ['backbone', 'backbone'] if not CONFIG_71 else ['backbone and resid 2:8', 'backbone'], # mixed_err 131 | # ['backbone and resid 144:170', 'backbone and resid 44:58'] 132 | ['backbone'], 133 | ['backbone'] 134 | ]) # atom selection statement list for structural alignment 135 | CONFIG_55 = len(CONFIG_61) # number of reference configurations used in training 136 | 137 | CONFIG_3 = get_mol_param([ # the structure of ANN: number of nodes in each layer (input/output dim typically determined automatically) 138 | [21, 40, CONFIG_37, 40, 0], 139 | [0, 50, CONFIG_37, 50, 0], 140 | [861, 100, CONFIG_37, 100, 0], 141 | [0, 100, CONFIG_37, 100, 0], 142 | [0, 100, CONFIG_37, 100, 0], 143 | ]) 144 | 145 | if CONFIG_3[-1] == 0: CONFIG_3[-1] = CONFIG_3[0] 146 | 147 | CONFIG_74 = False # whether we start each biased simulation with nearest configuration or a fixed configuration 148 | CONFIG_40 = 'explicit' # whether to include water molecules, option: explicit, implicit, water_already_included, no_water 149 | CONFIG_51 = 'NVT' # simulation ensemble type 150 | CONFIG_42 = False # whether to enable force constant adjustable mode 151 | CONFIG_44 = False # whether to use hierarchical autoencoder 152 | CONFIG_77 = 2 # hierarchical autoencoder 
variant index 153 | CONFIG_13 = 3 # num of trainings to run, and pick best one 154 | CONFIG_31 = 10 # maximum number of failed simulations allowed in each iteration 155 | 156 | CONFIG_56 = get_mol_param([20, 8, 6, 6]) # number of biased simulations running in parallel 157 | CONFIG_14 = 50 # max number of jobs submitted each time 158 | CONFIG_29 = True if CONFIG_40 == 'explicit' else False # whether we need to remove the water molecules from pdb files 159 | CONFIG_50 = False # whether we need to preserve original file if water molecules are removed 160 | 161 | CONFIG_10 = 15 # num of bins for get_boundary_points() 162 | CONFIG_11 = 15 # num of boundary points 163 | 164 | CONFIG_39 = False # set the range of histogram automatically based on min,max values in each dimension 165 | CONFIG_41 = False # whether we reverse the order of sorting of diff_with_neighbors values in get_boundary algorithm 166 | 167 | if CONFIG_17[1] == "Circular": 168 | CONFIG_18 = True # whether we limit the boundary points to be between [-pi, pi], typically works for circularLayer 169 | CONFIG_26 = [[-np.pi, np.pi] for item in range(CONFIG_36)] # range of PCs, for circular case, it is typically [[-np.pi, np.pi],[-np.pi, np.pi]] 170 | elif CONFIG_17[1] == "Tanh": 171 | CONFIG_18 = False 172 | CONFIG_26 = [[-1, 1] for item in range(CONFIG_36)] 173 | else: 174 | raise Exception('Layer not defined') 175 | 176 | CONFIG_33 = CONFIG_3[0] # length of list of cos/sin values, equal to the number of nodes in input layer 177 | CONFIG_12 = '../target/' + CONFIG_30 # folder that contains all pdb files 178 | 179 | CONFIG_65 = "US" # default biasing method 180 | CONFIG_16 = get_mol_param([500, 5000, 2000, 2000]) # record interval (the frequency of writing system state into the file) 181 | CONFIG_8 = get_mol_param([50000, 500000, 200000, 200000]) # num of simulation steps 182 | CONFIG_72 = 0 # enable fast equilibration 183 | # following: for umbrella sampling 184 | CONFIG_9 = get_mol_param([5000, 2000, 3000, 3000]) # force constant for biased simulations 185 | CONFIG_53 = 'fixed' # use fixed/flexible force constants for biased simulation for each iteration 186 | CONFIG_54 = 2.50 * get_mol_param([30.0, 20.0, 15.0, 20.0, 20]) # max external potential energy allowed (in k_BT) 187 | # following: for metadynamics 188 | CONFIG_66 = 500 # pace of metadynamics 189 | CONFIG_67 = 2 # height of metadynamics 190 | CONFIG_68 = 0.1 # sigma of metadynamics 191 | CONFIG_69 = 0 # whether to use well-tempered version 192 | CONFIG_70 = 15 # biasfactor for well-tempered metadynamics 193 | 194 | CONFIG_21 = 300 # simulation temperature 195 | CONFIG_22 = 0.002 # simulation time step, in ps 196 | 197 | CONFIG_23 = get_mol_param(['CPU', 'CUDA', 'CUDA', 'CUDA', 'CUDA']) # simulation platform 198 | 199 | temp_home_directory = str(subprocess.check_output('echo $HOME', shell=True).strip().decode("utf-8")) 200 | if temp_home_directory == "/home/kengyangyao": 201 | CONFIG_24 = 'local' # machine to run the simulations 202 | CONFIG_25 = temp_home_directory + '/.anaconda2/lib/plugins' # this is the directory where the plugin is installed 203 | elif temp_home_directory == "/home/weichen9": 204 | CONFIG_24 = 'cluster' # machine to run the simulations 205 | CONFIG_25 = temp_home_directory + '/.my_softwares/openmm7/lib/plugins' 206 | elif temp_home_directory == "/u/sciteam/chen21": 207 | CONFIG_24 = 'cluster' 208 | CONFIG_25 = temp_home_directory + '/.openmm/lib/plugins' 209 | else: 210 | print('unknown user directory: %s' % temp_home_directory) 211 | 212 | CONFIG_27 = 
CONFIG_17[:2] # layer_types for ANN_Force, it should be consistent with autoencoder 213 | 214 | CONFIG_32 = 5000 # maximum force constant allowed (for force constant adjustable mode) 215 | CONFIG_34 = 500 # force constant step, the value by which the force constant is increased each time (for force constant adjustable mode) 216 | CONFIG_35 = 0.1 # distance tolerance, max distance allowed between center of data cloud and potential center (for force_constant_adjustable mode) 217 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/coordinates_data_files_list.py: -------------------------------------------------------------------------------- 1 | from config import * 2 | from helper_func import * 3 | 4 | class coordinates_data_files_list(object): 5 | def __init__(self, 6 | list_of_dir_of_coor_data_files = CONFIG_1, # list of directories that hold coordinates data files 7 | ): 8 | assert (isinstance(list_of_dir_of_coor_data_files, list)) # to avoid passing a plain string to the constructor 9 | self._list_of_dir_of_coor_data_files = list_of_dir_of_coor_data_files 10 | self._list_of_coor_data_files = [] 11 | 12 | for item in self._list_of_dir_of_coor_data_files: 13 | self._list_of_coor_data_files += subprocess.check_output('''find %s -name "*coordinates.npy"''' % item, shell=True).decode("utf-8").strip().split('\n') 14 | 15 | self._list_of_coor_data_files = list(set(self._list_of_coor_data_files)) # remove duplicates 16 | self._list_of_coor_data_files = [x for x in self._list_of_coor_data_files if os.stat(x).st_size > 0] # remove empty files 17 | self._list_of_coor_data_files.sort() # to be consistent 18 | self._list_num_frames = [np.load(_1).shape[0] for _1 in self._list_of_coor_data_files] 19 | 20 | return 21 | 22 | def create_sub_coor_data_files_list_using_filter_conditional(self, filter_conditional): 23 | """ 24 | :param filter_conditional: a lambda conditional expression on file names 25 | :return: a coordinates_data_files_list object 26 | """ 27 | temp_coor_files = list(filter(filter_conditional, self._list_of_coor_data_files)) 28 | return coordinates_data_files_list(temp_coor_files) 29 | 30 | def get_list_of_coor_data_files(self): 31 | return self._list_of_coor_data_files 32 | 33 | def get_coor_data(self, scaling_factor, format='npy'): 34 | result = np.concatenate([ 35 | Helper_func.load_npy(item, format=format) for item in self._list_of_coor_data_files], axis=0) / scaling_factor 36 | assert (sum(self._list_num_frames) == result.shape[0]) 37 | return result 38 | 39 | def get_list_of_corresponding_pdb_dcd(self): 40 | list_of_corresponding_pdb_files = [x.strip().replace('_coordinates.npy', '.pdb') for x in self.get_list_of_coor_data_files()] 41 | for item in range(len(list_of_corresponding_pdb_files)): 42 | if not os.path.exists(list_of_corresponding_pdb_files[item]): 43 | list_of_corresponding_pdb_files[item] = list_of_corresponding_pdb_files[item].replace('.pdb', '.dcd') 44 | try: 45 | assert os.path.exists(list_of_corresponding_pdb_files[item]) 46 | except: 47 | raise Exception('%s does not exist!'
% list_of_corresponding_pdb_files[item]) 48 | 49 | return list_of_corresponding_pdb_files 50 | 51 | def write_pdb_frames_into_file_with_list_of_coor_index(self, list_of_coor_index, out_file_name, verbose=True): 52 | """ 53 | This function picks several frames from pdb files and writes a new pdb file as output. 54 | We can use this together with the mouse-clicking callback implemented in the scatter plot: 55 | first we select a few points interactively in the scatter plot and get the corresponding indices in the data point 56 | list, then we find the corresponding pdb frames with those indices 57 | """ 58 | Helper_func.backup_rename_file_if_exists(out_file_name) 59 | list_of_coor_index.sort() 60 | pdb_files = self.get_list_of_corresponding_pdb_dcd() 61 | accum_sum = np.cumsum(np.array(self._list_num_frames)) # use cumulative sum to find corresponding pdb files 62 | for item in range(len(accum_sum)): 63 | if item == 0: 64 | temp_index_related_to_this_pdb_file = [x for x in list_of_coor_index if x < accum_sum[item]] 65 | else: 66 | temp_index_related_to_this_pdb_file = [x for x in list_of_coor_index if accum_sum[item - 1] <= x < accum_sum[item]] 67 | temp_index_related_to_this_pdb_file = [x - accum_sum[item - 1] for x in temp_index_related_to_this_pdb_file] 68 | temp_index_related_to_this_pdb_file.sort() 69 | 70 | if len(temp_index_related_to_this_pdb_file) != 0: 71 | if verbose: print(pdb_files[item]) 72 | with open(pdb_files[item], 'r') as in_file: 73 | content = in_file.read().split('MODEL')[1:] # remove header 74 | frames_to_use = [content[ii] for ii in temp_index_related_to_this_pdb_file] 75 | with open(out_file_name, 'a') as out_file: 76 | for frame in frames_to_use: 77 | out_file.write("MODEL" + frame) 78 | 79 | return 80 | 81 | def get_pdb_name_and_corresponding_frame_index_with_global_coor_index(self, coor_index): 82 | for item, temp_pdb in zip(self._list_num_frames, self.get_list_of_corresponding_pdb_dcd()): 83 | if coor_index < item: break 84 | else: coor_index -= item 85 | return temp_pdb, coor_index 86 | 87 | def concat_all_pdb_files(self, out_pdb_file): 88 | """ 89 | Why don't I use 'cat' in terminal? Because I want to keep the order consistent with the Python sort() function 90 | """ 91 | with open(out_pdb_file, 'w') as outfile: 92 | for fname in self.get_list_of_corresponding_pdb_dcd(): 93 | with open(fname) as infile: 94 | outfile.write(infile.read()) 95 | return 96 | 97 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/generate_coordinates.py: -------------------------------------------------------------------------------- 1 | from ANN_simulation import * 2 | import argparse, subprocess, os 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("mol_type", type=str, help="molecule type of the pdb files") 6 | parser.add_argument("--path", type=str, default="../target", help="specify the directory/file containing the pdb files") 7 | args = parser.parse_args() 8 | 9 | molecule_type = Sutils.create_subclass_instance_using_name(args.mol_type) 10 | temp_path = args.path 11 | 12 | if os.path.exists(temp_path): 13 | molecule_type.generate_coordinates_from_pdb_files(path_for_pdb=temp_path) 14 | else: 15 | print("%s does not exist!"
% temp_path) 16 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/helper_func.py: -------------------------------------------------------------------------------- 1 | from config import * 2 | from scipy.special import erf 3 | 4 | class Helper_func(object): 5 | def __init__(self): 6 | return 7 | 8 | @staticmethod 9 | def get_mutual_info_of_two_continuous_vars(temp_var_0, temp_var_1, bins=10, normalization=True): 10 | temp_hist_0, _ = np.histogramdd(temp_var_0, bins=bins) 11 | temp_hist_1, _ = np.histogramdd(temp_var_1, bins=bins) 12 | temp_hist_2, _ = np.histogramdd(np.array([temp_var_0, temp_var_1]).T, bins=bins) 13 | temp_hist_0 /= temp_hist_0.sum() 14 | temp_hist_1 /= temp_hist_1.sum() 15 | temp_hist_2 /= temp_hist_2.sum() 16 | result = np.sum([temp_hist_2[item_x, item_y] * np.log( 17 | temp_hist_2[item_x, item_y] / temp_hist_0[item_x] / temp_hist_1[item_y]) 18 | for item_x in range(bins) for item_y in range(bins) if temp_hist_2[item_x, item_y] != 0]) 19 | if normalization: 20 | entropy_0 = - np.sum(temp_hist_0 * np.log(temp_hist_0)) 21 | entropy_1 = - np.sum(temp_hist_1 * np.log(temp_hist_1)) 22 | result /= (0.5 * (entropy_0 + entropy_1)) 23 | return result 24 | 25 | @staticmethod 26 | def generate_alkane_residue_code_in_openmm_xml(num, name): 27 | print(''' 28 | 29 | 30 | 31 | ''' % name) 32 | for item in range(num - 2): 33 | print(''' 34 | 35 | ''' % (item + 2, item + 2, item + 2)) 36 | print(""" 37 | 38 | 39 | 40 | 41 | 42 | """ % (num, num, num, num)) 43 | for item in range(num - 1): 44 | print(""" 45 | 46 | """ % (item + 1, item + 2, item + 2, item + 2, item + 2, item + 2)) 47 | print(""" 48 | 49 | 50 | """ % (num, num)) 51 | return 52 | 53 | @staticmethod 54 | def check_center_of_mass_is_at_origin(result): 55 | coords_of_center_of_mass_after = [[np.average(result[item, ::3]), np.average(result[item, 1::3]), 56 | np.average(result[item, 2::3])] 57 | for item in range(result.shape[0])] 58 | return np.all(np.abs(np.array(coords_of_center_of_mass_after).flatten()) < 1e-5) 59 | 60 | @staticmethod 61 | def remove_translation(coords): # remove the translational degree of freedom 62 | if len(coords.shape) == 1: # convert 1D array (when there is only one coord) to 2D array 63 | coords = coords.reshape((1, coords.shape[0])) 64 | number_of_atoms = coords.shape[1] // 3 65 | coords_of_center_of_mass = [[np.average(coords[item, ::3]), np.average(coords[item, 1::3]), 66 | np.average(coords[item, 2::3])] * number_of_atoms 67 | for item in range(coords.shape[0])] 68 | result = coords - np.array(coords_of_center_of_mass) 69 | assert Helper_func.check_center_of_mass_is_at_origin(result) 70 | return result 71 | 72 | @staticmethod 73 | def get_gyration_tensor_and_principal_moments(coords): 74 | coords = Helper_func.remove_translation(coords) 75 | temp_coords = coords.reshape(coords.shape[0], coords.shape[1] // 3, 3) 76 | gyration = np.zeros((coords.shape[0], 3, 3)) 77 | for xx in range(3): 78 | for yy in range(3): 79 | gyration[:, xx, yy] = (temp_coords[:, :, xx] * temp_coords[:, :, yy]).mean(axis=-1) 80 | moments_gyration = np.linalg.eig(gyration)[0] 81 | moments_gyration.sort(axis=-1) 82 | return gyration, moments_gyration[:, ::-1] 83 | 84 | @staticmethod 85 | def get_norm_factor(rcut, sig): 86 | rcut2 = rcut*rcut 87 | sig2 = 2.0*sig*sig 88 | normconst = np.sqrt( np.pi * sig2 ) * erf( rcut / (sqrt(2.0)*sig) ) - 2*rcut* np.exp( - rcut2 / sig2 ) 89 | preerf = np.sqrt( 0.5 * np.pi * sig * sig ) / normconst 90 | 
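# Derivation note: the coarse-grained count below uses a Gaussian kernel truncated at rcut
# and shifted down so that it vanishes there,
#     k(r) = exp(-r^2 / (2*sig^2)) - exp(-rcut^2 / (2*sig^2)),   for |r| <= rcut,
# and integrating k(r) over [-rcut, rcut] gives exactly the normconst computed above:
#     sqrt(2*pi*sig^2) * erf(rcut / (sqrt(2)*sig)) - 2*rcut*exp(-rcut^2 / (2*sig^2)).
# preerf and prelinear are the prefactors of the erf and linear terms in the antiderivative
# of k(r)/normconst, which get_cg_count_in_sphere() uses to smooth the neighbor count near
# the cutoff boundary.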
prelinear = np.exp( - rcut2 / sig2 ) / normconst 91 | return normconst, preerf, prelinear 92 | 93 | @staticmethod 94 | def get_cg_count_in_sphere(dis, r_hi, rcut, sig): # get coarse grained counts 95 | # TODO: test if this function is correct 96 | normconst, preerf, prelinear = Helper_func.get_norm_factor(rcut, sig) 97 | hiMinus = r_hi - rcut 98 | hiPlus = r_hi + rcut 99 | count = np.float64((dis <= hiPlus).sum(axis=-1)) 100 | temp_in_boundary_region = ((dis > hiMinus) & (dis <= hiPlus)) 101 | temp_correction = ( 0.5 + preerf * erf( np.sqrt(0.5) * (dis - r_hi)/sig ) \ 102 | - prelinear * (dis - r_hi)) 103 | # print count.shape, temp_in_boundary_region.shape, temp_correction.shape 104 | count -= (temp_in_boundary_region * temp_correction).sum(axis=-1) 105 | actual_count = (dis < r_hi).sum(axis=-1) 106 | return count, actual_count 107 | 108 | @staticmethod 109 | def get_cg_count_in_shell(dis, r_low, r_hi, rcut, sig): 110 | cg_1, actual_1 = Helper_func.get_cg_count_in_sphere(dis, r_hi, rcut, sig) 111 | cg_2, actual_2 = Helper_func.get_cg_count_in_sphere(dis, r_low, rcut, sig) 112 | return cg_1 - cg_2, actual_1 - actual_2 113 | 114 | @staticmethod 115 | def get_cg_count_slice_representation(dis, r_shell_low, r_shell_high, num, rcut, sig): 116 | temp_r = np.linspace(r_shell_low, r_shell_high, num) 117 | r_low_list = temp_r[:-1] 118 | r_high_list = temp_r[1:] 119 | result = [Helper_func.get_cg_count_in_shell(dis, r_low, r_high, rcut, sig)[0] 120 | for (r_low, r_high) in zip(r_low_list, r_high_list)] 121 | return np.concatenate(result, axis=1), temp_r 122 | 123 | @staticmethod 124 | def get_box_length_list_fom_reporter_file(reporter_file, unit): # require unit explicitly 125 | reporter_file_content = np.loadtxt(reporter_file, delimiter=',', usecols=(6,)) # column 6 is volume of box 126 | if unit == 'nm': scaling_factor = 1 127 | elif unit == 'A': scaling_factor = 10 128 | return scaling_factor * np.cbrt(reporter_file_content) 129 | 130 | @staticmethod 131 | def compute_distances_min_image_convention(atoms_pos_1, atoms_pos_2, box_length_list): 132 | # note: box_length may be different for different frames when using NPT, typically is read from reporter file 133 | # shape of atoms_pos_{1,2}: (num of frames, num of atoms * 3) 134 | # output: distance matrix 135 | # why don't we use mdtraj? Because it requires large memory for loading large pdb files 136 | # why don't we use MDAnalysis? Because it is not fast enough (looping over trajectory would take long time) 137 | # this function is especially useful when both atoms_pos_1, atoms_pos_2 are not super long, while the number of frames is large, 138 | # since it vectorizes computation over frames 139 | temp_dis_2 = np.zeros((atoms_pos_1.shape[0], atoms_pos_1.shape[1] // 3, atoms_pos_2.shape[1] // 3)) 140 | for index_1 in range(atoms_pos_1.shape[1] // 3): 141 | # print index_1 142 | for index_2 in range(atoms_pos_2.shape[1] // 3): 143 | temp_diff = atoms_pos_1[:, 3 * index_1: 3 * index_1 + 3] - atoms_pos_2[:, 3 * index_2: 3 * index_2 + 3] 144 | temp_vec = np.array([(item + box_length_list / 2.0) % box_length_list - box_length_list / 2.0 for item in temp_diff.T]) 145 | temp_dis_2[:, index_1, index_2] = np.linalg.norm(temp_vec, axis=0) 146 | return temp_dis_2 147 | 148 | @staticmethod 149 | def get_index_list_of_O_atom_in_water(pdb_file, ignore_TER_line): 150 | """this is used for solvent analysis, e.g. 
biased simulation with PLUMED""" 151 | temp_u = Universe(pdb_file) 152 | atom_sel = temp_u.select_atoms('resname HOH and name O') 153 | if ignore_TER_line: return atom_sel.indices + 1 154 | else: raise Exception('double check your pdb') 155 | 156 | @staticmethod 157 | def get_distances_with_water_for_atom_list(pdb_file, atom_selection, box_length_list): 158 | # box_length information is stored in reporter_file 159 | temp_u = Universe(pdb_file) 160 | water_pos, atoms_pos = [], [] 161 | water_sel = temp_u.select_atoms('resname HOH and name O') 162 | atoms_sel = temp_u.select_atoms(atom_selection) 163 | for _ in temp_u.trajectory: 164 | water_pos.append(water_sel.positions.flatten()) 165 | atoms_pos.append(atoms_sel.positions.flatten()) 166 | atoms_pos = np.array(atoms_pos) 167 | water_pos = np.array(water_pos) 168 | distances = Helper_func.compute_distances_min_image_convention(atoms_pos_1=atoms_pos, atoms_pos_2=water_pos, 169 | box_length_list=box_length_list) 170 | return distances 171 | 172 | @staticmethod 173 | def get_list_of_cg_count_for_atom_list(pdb_file, atom_selection, box_length_list, r_low, r_hi, rcut, sig): 174 | """ cg = coarse grained, atom list is specified by atom_selection """ 175 | distances = Helper_func.get_distances_with_water_for_atom_list(pdb_file, atom_selection, box_length_list) 176 | return Helper_func.get_cg_count_in_shell(distances, r_low, r_hi, rcut, sig) 177 | 178 | @staticmethod 179 | def get_radial_distribution(distances, num, nbins, dr, length): 180 | hist = np.zeros(nbins, ) 181 | for item in distances: 182 | temp_target_index = int(item / dr) 183 | if temp_target_index < nbins: 184 | hist[temp_target_index] += 1.0 / (4 / 3.0 * np.pi) / ( 185 | ((temp_target_index + 1) * dr) ** 3 - ((temp_target_index + 0) * dr) ** 3) 186 | return hist / (num / length ** 3) 187 | 188 | @staticmethod 189 | def backup_rename_file_if_exists(filename): 190 | extension = '.' 
+ filename.split('.')[-1] 191 | if os.path.isfile(filename): # backup file if previous one exists 192 | new_filename = filename + ".bak_" + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + extension 193 | os.rename(filename, new_filename) 194 | else: new_filename = None 195 | return new_filename 196 | 197 | @staticmethod 198 | def attempt_to_save_npy(npy_file, npy_array): 199 | """when trying to save a npy array to a file, if it exists and contains a different value, 200 | then save to another file""" 201 | if npy_file.strip()[-4:] != '.npy': npy_file += '.npy' 202 | original_npy_file = npy_file 203 | index = 0 204 | while True: 205 | if os.path.isfile(npy_file): 206 | content = np.load(npy_file) 207 | if np.all(npy_array == content): 208 | break 209 | else: 210 | npy_file = original_npy_file.replace('.npy', '_%d.npy' % index) 211 | index += 1 212 | else: 213 | np.save(npy_file, npy_array) 214 | break 215 | return npy_file 216 | 217 | @staticmethod 218 | def run_multiple_jobs_on_local_machine(commands, num_of_jobs_in_parallel=CONFIG_56): 219 | total_num_failed_jobs = 0 220 | for item in range(int(len(commands) / num_of_jobs_in_parallel) + 1): 221 | temp_commands_parallel = commands[item * num_of_jobs_in_parallel: (item + 1) * num_of_jobs_in_parallel] 222 | print("running: \t" + '\n'.join(temp_commands_parallel)) 223 | procs_to_run_commands = [subprocess.Popen(_1.strip(), shell=True) for _1 in temp_commands_parallel] 224 | exit_codes = [p.wait() for p in procs_to_run_commands] 225 | total_num_failed_jobs += sum(exit_codes) 226 | return total_num_failed_jobs 227 | 228 | @staticmethod 229 | def shuffle_multiple_arrays(list_of_arrays): 230 | """can be used for shuffle training and validation set to improve sampling""" 231 | indices = np.arange(list_of_arrays[0].shape[0]) 232 | np.random.shuffle(indices) 233 | return [item[indices] for item in list_of_arrays] 234 | 235 | @staticmethod 236 | def find_indices_of_points_in_array_near_each_point_in_ref_list(point_list, ref_list, threshold_r): 237 | """used to find points near a specific point (in the reference list), useful for sampling structures 238 | in a pdb file that are near a specific point in CV space (result is the indices of pdb snapshots) 239 | """ 240 | return [np.where(np.linalg.norm(point_list - item, axis=1) < threshold_r)[0] 241 | for item in ref_list] 242 | 243 | @staticmethod 244 | def tica_inverse_transform(tica, data_list): 245 | from msmbuilder.decomposition import tICA 246 | assert (isinstance(tica, tICA)) 247 | result_list = [] 248 | for data in data_list: 249 | result = np.dot(tica.covariance_.T, np.dot(tica.components_.T, data.T)).T + tica.means_ 250 | assert_almost_equal(tica.transform([result])[0], data) 251 | result_list.append(result) 252 | return result_list 253 | 254 | @staticmethod 255 | def get_autocorr(x_list, lag_time): 256 | return np.corrcoef(np.array([x_list[0:len(x_list) - lag_time], x_list[lag_time:len(x_list)]]))[0, 1] 257 | 258 | @staticmethod 259 | def generate_sequence_with_constant_autocorrelation(constant_autocorrelation, length): 260 | traj_list = [np.random.normal()] 261 | for _ in range(length - 1): 262 | temp_value = np.random.normal(constant_autocorrelation * traj_list[-1], scale=1) 263 | traj_list.append(temp_value) 264 | return traj_list 265 | 266 | @staticmethod 267 | def load_object_from_pkl_file(file_path): 268 | try: 269 | result = pickle.load(open(file_path, 'rb')) 270 | except: # solve encoding issue for python2 -> python3 271 | with open(file_path, 'rb') as ff: 272 | result = 
pickle.load(ff, encoding='latin1') 273 | return result 274 | 275 | @staticmethod 276 | def load_npy(file, format): 277 | if format == 'txt': return np.loadtxt(file) 278 | elif format == 'npy': return np.load(file) 279 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/kernel_tica.py: -------------------------------------------------------------------------------- 1 | import numpy as np, pyemma as py 2 | # from msmbuilder.decomposition.tica import tICA 3 | from sklearn.kernel_approximation import Nystroem 4 | 5 | """modified from https://github.com/msmbuilder/msmbuilder/blob/master/msmbuilder/decomposition/ktica.py""" 6 | """reference: [1] Schwantes, Christian R., and Vijay S. Pande. J. Chem Theory Comput. 11.2 (2015): 600--608.""" 7 | 8 | class Kernel_tica(object): 9 | def __init__(self, n_components, lag_time, 10 | gamma, # gamma value for rbf kernel 11 | n_components_nystroem=100, # number of components for Nystroem kernel approximation 12 | landmarks = None, 13 | shrinkage = None, 14 | weights='empirical' # if 'koopman', use Koopman reweighting for tICA (see Wu, Hao, et al. "Variational Koopman models: slow collective variables and molecular kinetics from short off-equilibrium simulations." The Journal of Chemical Physics 146.15 (2017): 154104.) 15 | ): 16 | self._n_components = n_components 17 | self._lag_time = lag_time 18 | self._n_components_nystroem = n_components_nystroem 19 | self._landmarks = landmarks 20 | self._gamma = gamma 21 | self._nystroem = Nystroem(gamma=gamma, n_components=n_components_nystroem) 22 | self._weights = weights 23 | # self._tica = tICA(n_components=n_components, lag_time=lag_time, shrinkage=shrinkage) 24 | self._shrinkage = shrinkage 25 | return 26 | 27 | def fit(self, sequence_list): 28 | if self._landmarks is None: 29 | self._nystroem.fit(np.concatenate(sequence_list)) 30 | else: 31 | print("using landmarks") 32 | self._nystroem.fit(self._landmarks) 33 | sequence_transformed = [self._nystroem.transform(item) for item in sequence_list] 34 | # define tica object at fit() with sequence_list supplied for initialization, as it is required by 35 | # Koopman reweighting 36 | self._tica = py.coordinates.tica(sequence_transformed, lag=self._lag_time, 37 | dim=self._n_components, kinetic_map=True, 38 | weights=self._weights) 39 | return 40 | 41 | def transform(self, sequence_list): 42 | return self._tica.transform( 43 | [self._nystroem.transform(item) for item in sequence_list]) 44 | 45 | def fit_transform(self, sequence_list): 46 | self.fit(sequence_list) 47 | return self.transform(sequence_list) 48 | 49 | def score(self, sequence_list): 50 | model = self.__class__(n_components = self._n_components, lag_time=self._lag_time, gamma=self._gamma, 51 | n_components_nystroem=self._n_components_nystroem, landmarks=self._landmarks, 52 | shrinkage=self._shrinkage) 53 | model.fit(sequence_list) 54 | return np.sum(model._tica.eigenvalues) 55 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/main_work.py: -------------------------------------------------------------------------------- 1 | from ANN_simulation import * 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--starting_index", type=int, default=1, help="index of starting iteration") 6 | parser.add_argument("--num_of_iterations", type=int, default=10, help="number of iterations to run") 7 | 
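# Example invocation (a sketch; the .pkl filename below is illustrative, not a file shipped with the repo):
#     python main_work.py --starting_index 1 --num_of_iterations 10 \
#         --starting_network_file a_previously_saved_autoencoder.pkl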
parser.add_argument("--starting_network_file", type=str, default=None, help="the network to start with") 8 | parser.add_argument("--training_interval", type=int, default=1, help="training interval") 9 | args = parser.parse_args() 10 | 11 | if args.starting_network_file is None: 12 | starting_network = None 13 | else: 14 | starting_network = autoencoder.load_from_pkl_file(args.starting_network_file) 15 | 16 | init_iter = iteration(index = args.starting_index, network = starting_network) 17 | 18 | a = simulation_with_ANN_main(num_of_iterations = args.num_of_iterations, initial_iteration = init_iter, training_interval=args.training_interval) 19 | a.run_mult_iterations() 20 | 21 | print("Done main work!") 22 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/molecule_spec_sutils.py: -------------------------------------------------------------------------------- 1 | """Sutils: simulation unilities, some of them are molecule-specific (implemented as methods in subclasses) 2 | """ 3 | 4 | from config import * 5 | import random, mdtraj as md 6 | from coordinates_data_files_list import * 7 | from sklearn.cluster import KMeans 8 | from helper_func import * 9 | from functools import reduce 10 | 11 | class Sutils(object): 12 | def __init__(self): 13 | return 14 | 15 | @staticmethod 16 | def get_num_of_non_overlapping_hyperspheres_that_filled_explored_phase_space( 17 | pdb_file_list, atom_selection, radius, step_interval=1, shuffle_list=True, 18 | distance_metric='RMSD'): 19 | """ 20 | This functions is used to count how many non-overlapping hyperspheres are needed to fill the explored phase 21 | space, to estimate volumn of explored region 22 | :param atom_selection: atom selection statement for MDAnalysis 23 | :param radius: radius of hyperspheres 24 | :param distance_metric: distance metric of two frames 25 | :return: number of hyperspheres 26 | """ 27 | if shuffle_list: random.shuffle(pdb_file_list) 28 | index = 0 29 | positions_list = [] 30 | for sample_file in pdb_file_list: 31 | sample = Universe(sample_file) 32 | sample_atom_selection = sample.select_atoms(atom_selection) 33 | frame_index_list = list(range(sample.trajectory.n_frames)) 34 | if shuffle_list: random.shuffle(frame_index_list) 35 | for item_index in frame_index_list: 36 | sample.trajectory[item_index] 37 | if index % step_interval == 0: 38 | current_positions = sample_atom_selection.positions 39 | distances_to_previous_frames = np.array( 40 | [Sutils.get_RMSD_after_alignment(item, current_positions) 41 | for item in positions_list]) 42 | if len(distances_to_previous_frames) == 0 or np.all(distances_to_previous_frames > radius): 43 | # need to include a new hypershere 44 | positions_list.append(current_positions) 45 | 46 | index += 1 47 | 48 | return len(positions_list), np.array(positions_list) 49 | 50 | @staticmethod 51 | def mark_and_modify_pdb_for_calculating_RMSD_for_plumed(pdb_file, out_pdb, 52 | atom_index_list, start_idx, item_positions=None): 53 | """ 54 | :param pdb_file: input pdb 55 | :param out_pdb: output reference pdb 56 | :param atom_index_list: index list used to calculate RMSD 57 | :param item_positions: reference positions of selected atoms, set it None if we do not want to modify positions 58 | """ 59 | indices = np.array(atom_index_list) - start_idx # explicitly specify start_idx, to avoid confusion 60 | temp_sample = Universe(pdb_file) 61 | temp_atoms = temp_sample.select_atoms('all') 62 | if not item_positions is None: 63 | 
item_positions = item_positions.reshape((item_positions.shape[0] // 3, 3)) 64 | temp_positions = temp_atoms.positions 65 | temp_positions[indices] = item_positions 66 | temp_atoms.positions = temp_positions 67 | 68 | temp_bfactors = np.zeros(len(temp_atoms)) 69 | temp_bfactors[indices] = 1 70 | temp_atoms.tempfactors = temp_bfactors 71 | temp_atoms.occupancies = temp_bfactors 72 | temp_atoms.write(out_pdb) 73 | return out_pdb 74 | 75 | @staticmethod 76 | def get_plumed_script_that_generate_a_segment_connecting_two_configs( 77 | pdb_1, pdb_2, atom_selection_statement, num_steps, force_constant): 78 | """ 79 | This function uses targeted MD to generate a segment connecting two configurations 80 | :param pdb_1, pdb_2: two ends of the segment 81 | :param atom_selection_statement: atoms for calculating RMSD in targeted MD 82 | """ 83 | atom_list = get_index_list_with_selection_statement(pdb_1, atom_selection_statement) 84 | ref_pdb = pdb_2.replace('.pdb', '_ref.pdb') 85 | Sutils.mark_and_modify_pdb_for_calculating_RMSD_for_plumed(pdb_2, ref_pdb, atom_list, 1) # start_idx = 1, since atom_list from get_index_list_with_selection_statement is 1-based 86 | rmsd_diff = Sutils.metric_RMSD_of_atoms([pdb_1], ref_file=ref_pdb, 
atom_selection_statement=atom_selection_statement, step_interval=100)[0] # TODO: check units 87 | plumed_script = """rmsd: RMSD REFERENCE=%s TYPE=OPTIMAL 88 | restraint: MOVINGRESTRAINT ARG=rmsd AT0=%f STEP0=0 KAPPA0=%f AT1=0 STEP1=%d KAPPA1=%f 89 | PRINT STRIDE=500 ARG=* FILE=COLVAR 90 | """ % (ref_pdb, rmsd_diff, force_constant, num_steps, force_constant) 91 | return plumed_script 92 | 93 | @staticmethod 94 | def prepare_output_Cartesian_coor_with_multiple_ref_structures( 95 | folder_list, 96 | alignment_coor_file_suffix_list, 97 | scaling_factor 98 | ): 99 | my_coor_data_obj = coordinates_data_files_list(list_of_dir_of_coor_data_files=folder_list) 100 | coor_data_obj_input = my_coor_data_obj.create_sub_coor_data_files_list_using_filter_conditional( 101 | lambda x: not 'aligned' in x) 102 | assert (len(alignment_coor_file_suffix_list) == CONFIG_55) 103 | coor_data_obj_output_list = [my_coor_data_obj.create_sub_coor_data_files_list_using_filter_conditional( 104 | lambda x: item in x) for item in alignment_coor_file_suffix_list] 105 | 106 | for item in range(len(alignment_coor_file_suffix_list)): 107 | for _1, _2 in zip(coor_data_obj_input.get_list_of_coor_data_files(), 108 | coor_data_obj_output_list[item].get_list_of_coor_data_files()): 109 | assert (_2 == _1.replace('_coordinates.npy', alignment_coor_file_suffix_list[item])), (_2, _1) 110 | 111 | output_data_set = np.concatenate([Sutils.remove_translation(item.get_coor_data(scaling_factor)) 112 | for item in coor_data_obj_output_list], axis=1) 113 | return output_data_set 114 | 115 | @staticmethod 116 | def select_representative_points(data_set, output_data_set): 117 | # clustering, pick representative points for training, two purposes: 118 | # 1. avoid training results that are too good for densely-sampled regions, but bad for others. 119 | # 2. 
reduce computation cost 121 | print ("selecting representative points...") 122 | kmeans = KMeans(init='k-means++', n_clusters=min(CONFIG_59, output_data_set.shape[0]), n_init=10) 123 | kmeans.fit(output_data_set) 124 | indices_of_representative_points = np.array([np.where(kmeans.labels_ == ii)[0][0] 125 | for ii in range(kmeans.n_clusters)]) 126 | return data_set[indices_of_representative_points], output_data_set[indices_of_representative_points] 127 | 128 | @staticmethod 129 | def create_subclass_instance_using_name(name): 130 | return {'Alanine_dipeptide': Alanine_dipeptide(), 'Trp_cage': Trp_cage()}[name] 131 | 132 | @staticmethod 133 | def load_object_from_pkl_file(file_path): 134 | return Helper_func.load_object_from_pkl_file(file_path) 135 | 136 | @staticmethod 137 | def write_some_frames_into_a_new_file_based_on_index_list_for_pdb_file_list(list_of_files, index_list, new_pdb_file_name): 138 | print("note that order may not be preserved!") 139 | remaining_index_list = index_list 140 | for _1 in list_of_files: 141 | remaining_index_list = Sutils.write_some_frames_into_a_new_file_based_on_index_list(_1, remaining_index_list, new_pdb_file_name) 142 | 143 | # check number of frames to be correct 144 | with open(new_pdb_file_name, 'r') as f_in: 145 | content = f_in.read().strip().split('MODEL')[1:] 146 | assert (len(content) == len(index_list)), (len(content), len(index_list)) 147 | 148 | return 149 | 150 | @staticmethod 151 | def write_some_frames_into_a_new_file_based_on_index_list(pdb_file_name, index_list, new_pdb_file_name=None, 152 | overwrite=False): 153 | if os.stat(pdb_file_name).st_size > 1000000000: raise Exception('file may be too large, try to use other tools') 154 | 155 | if new_pdb_file_name is None: 156 | new_pdb_file_name = pdb_file_name.strip().split('.pdb')[0] + '_someframes.pdb' 157 | 158 | with open(pdb_file_name, 'r') as f_in: 159 | content = [item for item in f_in.readlines() if (not 'REMARK' in item) and (not 'END\n' in item)] 160 | content = ''.join(content) 161 | content = content.split('MODEL')[1:] # remove header 162 | num_of_frames_in_current_file = len(content) 163 | index_for_this_file = [_2 for _2 in index_list if _2 < num_of_frames_in_current_file] 164 | remaining_index_list = [_2 - num_of_frames_in_current_file for _2 in index_list if 165 | _2 >= num_of_frames_in_current_file] 166 | content_to_write = [content[_2] for _2 in index_for_this_file] 167 | 168 | write_flag = 'w' if overwrite else 'a' 169 | with open(new_pdb_file_name, write_flag) as f_out: 170 | for item in content_to_write: 171 | f_out.write("MODEL") 172 | f_out.write(item) 173 | 174 | return remaining_index_list 175 | 176 | @staticmethod 177 | def concat_first_frame_in_all_pdb_files(list_of_pdb_files, new_pdb_file_name): 178 | for item in list_of_pdb_files: 179 | Sutils.write_some_frames_into_a_new_file_based_on_index_list(item, [0], new_pdb_file_name) 180 | return 181 | 182 | @staticmethod 183 | def write_some_frames_into_a_new_file(pdb_file_name, start_index, end_index, step_interval = 1, # start_index included, end_index not included 184 | new_pdb_file_name=None, method=1): 185 | print('writing frames of %s: [%d:%d:%d]...' 
% (pdb_file_name, start_index, end_index, step_interval)) 186 | if new_pdb_file_name is None: 187 | new_pdb_file_name = pdb_file_name.strip().split('.pdb')[0] + '_frame_%d_%d_%d.pdb' % (start_index, end_index, step_interval) 188 | 189 | if method == 0: 190 | if os.stat(pdb_file_name).st_size > 1000000000: raise Exception('file may be too large, try to use other tools') 191 | with open(pdb_file_name, 'r') as f_in: 192 | content = [item for item in f_in.readlines() if (not 'REMARK' in item) and (not 'END\n' in item)] 193 | content = ''.join(content) 194 | content = content.split('MODEL')[1:] # remove header 195 | if end_index == 0: 196 | content_to_write = content[start_index::step_interval] # for selecting last few frames 197 | else: 198 | content_to_write = content[start_index:end_index:step_interval] 199 | 200 | with open(new_pdb_file_name, 'w') as f_out: 201 | for item in content_to_write: 202 | f_out.write("MODEL") 203 | f_out.write(item) 204 | elif method == 1: 205 | index = -1 206 | with open(pdb_file_name, 'r') as f_in, open(new_pdb_file_name, 'w') as f_out: 207 | for item in f_in: 208 | if 'MODEL' in item: index += 1 209 | if (not 'REMARK' in item) and (not 'END\n' in item) and (index % step_interval == 0) \ 210 | and ( 211 | (end_index != 0 and (start_index <= index < end_index)) 212 | or (end_index == 0 and index >= start_index)): 213 | f_out.write(item) 214 | return 215 | 216 | @staticmethod 217 | def data_augmentation(data_set, output_data_set, num_of_copies, is_output_reconstructed_Cartesian=True): 218 | """ 219 | assume that center of mass motion of data_set and output_data_set should be removed. 220 | """ 221 | assert (Sutils.check_center_of_mass_is_at_origin(data_set)) 222 | if is_output_reconstructed_Cartesian: 223 | assert (Sutils.check_center_of_mass_is_at_origin(output_data_set)) 224 | 225 | num_of_data = data_set.shape[0] 226 | output_data_set = np.array(output_data_set.tolist() * num_of_copies) 227 | num_atoms = len(data_set[0]) // 3 228 | data_set = data_set.reshape((num_of_data, num_atoms, 3)) 229 | temp_data_set = [] 230 | for _ in range(num_of_copies): 231 | temp_data_set.append([Sutils.rotating_randomly_around_center_of_mass(x) for x in data_set]) 232 | 233 | data_set = np.concatenate(temp_data_set, axis=0) 234 | data_set = data_set.reshape((num_of_copies * num_of_data, num_atoms * 3)) 235 | return data_set, output_data_set 236 | 237 | @staticmethod 238 | def check_center_of_mass_is_at_origin(result): 239 | return Helper_func.check_center_of_mass_is_at_origin(result=result) 240 | 241 | @staticmethod 242 | def remove_translation(coords): # remove the translational degree of freedom 243 | return Helper_func.remove_translation(coords=coords) 244 | 245 | @staticmethod 246 | def rotating_randomly_around_center_of_mass(coords): 247 | axis_vector = np.random.uniform(0, 1, 3) 248 | angle = np.random.uniform(0, 2 * np.pi) 249 | return Sutils.rotating_around_center_of_mass(coords, axis_vector, angle) 250 | 251 | @staticmethod 252 | def rotating_around_center_of_mass(coords, axis_vector, angle): 253 | center_of_mass = coords.mean(axis=0) 254 | return Sutils.rotating_coordinates(coords, center_of_mass, axis_vector, angle) 255 | 256 | @staticmethod 257 | def rotating_coordinates(coords, fixed_coord, axis_vector, angle): 258 | indices_atoms = list(range(len(coords))) 259 | return Sutils.rotating_group_of_atoms(coords, indices_atoms, fixed_coord, axis_vector, angle) 260 | 261 | @staticmethod 262 | def rotating_group_of_atoms(coords, indices_atoms, fixed_coord, axis_vector, 
angle): 263 | """ 264 | :param coords: coordinates of all atoms 265 | :param indices_atoms: indices of atoms to rotate 266 | :param fixed_coord: coordinates of fixed point 267 | :param axis_vector: rotation axis 268 | :param angle: rotation angle 269 | :return: coordinates of all atoms after rotation 270 | """ 271 | result = copy.deepcopy(coords) # avoid modifying original input 272 | temp_coords = coords[indices_atoms] - fixed_coord # coordinates for rotation 273 | temp_coords = np.array(temp_coords) 274 | cos_value = np.cos(angle); sin_value = np.sin(angle) 275 | axis_vector_length = np.sqrt(np.sum(np.array(axis_vector) ** 2)) 276 | ux = axis_vector[0] / axis_vector_length; uy = axis_vector[1] / axis_vector_length; uz = axis_vector[2] / axis_vector_length 277 | rotation_matrix = np.array([[cos_value + ux ** 2 * (1 - cos_value), 278 | ux * uy * (1 - cos_value) - uz * sin_value, 279 | ux * uz * (1 - cos_value) + uy * sin_value], 280 | [ux * uy * (1 - cos_value) + uz * sin_value, 281 | cos_value + uy ** 2 * (1 - cos_value), 282 | uy * uz * (1 - cos_value) - ux * sin_value], 283 | [ux * uz * (1 - cos_value) - uy * sin_value, 284 | uy * uz * (1 - cos_value) + ux * sin_value, 285 | cos_value + uz ** 2 * (1 - cos_value)]]) 286 | result[indices_atoms] = np.dot(temp_coords, rotation_matrix) + fixed_coord 287 | return result 288 | 289 | @staticmethod 290 | def _generate_coordinates_from_pdb_files(atom_index, file_path=CONFIG_12, format='npy'): 291 | atom_index = [int(_1) for _1 in atom_index] 292 | atom_index = np.array(atom_index) - 1 # note that atom index starts from 1 293 | filenames = subprocess.check_output([ 294 | 'find', file_path, '-name', '*.pdb', '-o', '-name', '*.dcd']).decode("utf-8").strip().split('\n') 295 | output_file_list = [] 296 | 297 | for input_file in filenames: 298 | output_file = input_file[:-4] + '_coordinates.' 
+ format 299 | 300 | output_file_list += [output_file] 301 | if os.path.exists(output_file) and os.path.getmtime(input_file) < os.path.getmtime(output_file): # check modified time 302 | print("coordinate file already exists: %s (remove previous one if needed)" % output_file) 303 | else: 304 | print('generating coordinates of ' + input_file) 305 | mdxyz = md.load(input_file, top=CONFIG_62[0]).xyz 306 | mdxyz = mdxyz[:, atom_index, :].reshape(mdxyz.shape[0], len(atom_index) * 3) 307 | if format == 'txt': np.savetxt(output_file, mdxyz) 308 | elif format == 'npy': np.save(output_file, mdxyz) 309 | 310 | print("Done generating coordinates files\n") 311 | return output_file_list 312 | 313 | @staticmethod 314 | def _get_plumed_script_with_pairwise_dis_as_input(index_atoms, scaling_factor): 315 | return Plumed_helper.get_pairwise_dis(index_atoms, scaling_factor=scaling_factor, 316 | unit_scaling=1.0, out_var_prefix='l_0_out_') 317 | 318 | @staticmethod 319 | def remove_water_mol_and_Cl_from_pdb_file(folder_for_pdb = CONFIG_12, preserve_original_file=True): 320 | """ 321 | This is used to remove water molecules from pdb files, purposes: 322 | - save storage space 323 | - reduce processing time of pdb files 324 | """ 325 | filenames = subprocess.check_output(['find', folder_for_pdb, '-name', '*.pdb']).decode("utf-8").split('\n')[:-1] 326 | for item in filenames: 327 | print('removing water molecules from pdb file: ' + item) 328 | output_file = item[:-4] + '_rm_tmp.pdb' 329 | is_line_removed_flag = False 330 | with open(item, 'r') as f_in, open(output_file, 'w') as f_out: 331 | for line in f_in: 332 | if not 'HOH' in line and not 'CL' in line and not "NA" in line and not 'SPC' in line and not 'pseu' in line: 333 | f_out.write(line) 334 | else: is_line_removed_flag = True 335 | 336 | if not preserve_original_file: 337 | if is_line_removed_flag: 338 | subprocess.check_output(['mv', output_file, item]) 339 | else: 340 | subprocess.check_output(['rm', output_file]) 341 | 342 | print('Done removing water molecules from all pdb files!') 343 | return 344 | 345 | @staticmethod 346 | def get_boundary_points(list_of_points, 347 | range_of_PCs = CONFIG_26, 348 | num_of_bins = CONFIG_10, 349 | num_of_boundary_points = CONFIG_11, 350 | is_circular_boundary = CONFIG_18, 351 | preprocessing = True, 352 | auto_range_for_histogram = CONFIG_39, # set the range of histogram based on min,max values in each dimension 353 | reverse_sorting_mode = CONFIG_41 # whether we reverse the order of sorting of diff_with_neighbors values 354 | ): 355 | """ 356 | :param preprocessing: if True, apply the nonlinear transform -exp(-count) to the histogram (see below), which gives sparsely-sampled bins more weight; empirically this works better than using raw counts 357 | """ 358 | dimensionality = len(list_of_points[0]) 359 | list_of_points = list(zip(*list_of_points)) 360 | assert (len(list_of_points) == dimensionality) 361 | 362 | if is_circular_boundary or not auto_range_for_histogram: 363 | hist_matrix, edges = np.histogramdd(list_of_points, bins= num_of_bins * np.ones(dimensionality), range = range_of_PCs) 364 | else: 365 | temp_hist_range = [[min(item) - (max(item) - min(item)) / (num_of_bins - 2), max(item) + (max(item) - min(item)) / (num_of_bins - 2)]\ 366 | for item in list_of_points] 367 | hist_matrix, edges = np.histogramdd(list_of_points, bins=num_of_bins * np.ones(dimensionality), range=temp_hist_range) 368 | 369 | # following is the main algorithm to find boundary and holes 370 | # simply find the points that are lower than the average of their neighbors (two per dimension) 371 | 372 | if preprocessing: 373 | hist_matrix = np.array([[- np.exp(-
374 | 
375 |         if is_circular_boundary:  # typically works for circular autoencoder
376 |             diff_with_neighbors = hist_matrix - 1.0 / (2 * dimensionality) \
377 |                                   * sum(
378 |                                       [np.roll(hist_matrix, 1, axis=x) + np.roll(hist_matrix, -1, axis=x) for x in list(range(dimensionality))]
379 |                                   )
380 |         else:
381 |             # TODO: code not concise and general enough, fix this later
382 |             diff_with_neighbors = np.zeros(hist_matrix.shape)
383 |             temp_1 = [list(range(item)) for item in hist_matrix.shape]
384 |             for grid_index in itertools.product(*temp_1):
385 |                 neighbor_index_list = [(np.array(grid_index) + temp_2).astype(int) for temp_2 in np.eye(dimensionality)]
386 |                 neighbor_index_list += [(np.array(grid_index) - temp_2).astype(int) for temp_2 in np.eye(dimensionality)]
387 |                 neighbor_index_list = [x for x in neighbor_index_list if np.all(x >= 0) and np.all(x < num_of_bins)]
388 |                 # print "grid_index = %s" % str(grid_index)
389 |                 # print "neighbor_index_list = %s" % str(neighbor_index_list)
390 |                 diff_with_neighbors[tuple(grid_index)] = hist_matrix[tuple(grid_index)] - np.average(
391 |                     [hist_matrix[tuple(temp_2)] for temp_2 in neighbor_index_list]
392 |                 )
393 | 
394 |         # get grid centers
395 |         edge_centers = [0.5 * (np.array(x[1:]) + np.array(x[:-1])) for x in edges]
396 |         grid_centers = np.array(list(itertools.product(*edge_centers)))  # "itertools.product" gives Cartesian/direct product of several lists
397 |         grid_centers = np.reshape(grid_centers, np.append(num_of_bins * np.ones(dimensionality), dimensionality).astype(int))
398 |         # print grid_centers
399 | 
400 |         potential_centers = []
401 | 
402 |         # now sort the under-sampled grid cells (those whose value is lower than the
403 |         # average of their neighbors), ordered by how much lower they are
404 | 
405 |         temp_separate_index = []
406 | 
407 |         for _ in range(dimensionality):
408 |             temp_separate_index.append(list(range(num_of_bins)))
409 | 
410 |         index_of_grids = list(itertools.product(
411 |             *temp_separate_index
412 |         ))
413 | 
414 |         index_of_grids = [x for x in index_of_grids if diff_with_neighbors[x] < 0]   # only apply to grids with diff_with_neighbors value < 0
415 |         sorted_index_of_grids = sorted(index_of_grids, key = lambda x: diff_with_neighbors[x])   # sort based on diff_with_neighbors, return index values
416 |         if reverse_sorting_mode:
417 |             sorted_index_of_grids.reverse()
418 | 
419 |         for index in sorted_index_of_grids[:num_of_boundary_points]:  # note index can be of dimension >= 2
420 |             temp_potential_center = [round(x, 2) for x in grid_centers[index]]
421 |             potential_centers.append(temp_potential_center)
422 | 
423 |         return potential_centers
424 | 
425 |     @staticmethod
426 |     def L_method(evaluation_values, num):
427 |         evaluation_values = np.array(evaluation_values)
428 |         num = np.array(num)
429 |         assert (evaluation_values.shape == num.shape)
430 |         min_weighted_err = float('inf')
431 |         optimal_num = 0
432 |         best_regr = None
433 |         for item in range(1, len(num) - 1):
434 |             y_left = evaluation_values[:item]
435 |             x_left = num[:item].reshape(item, 1)
436 |             y_right = evaluation_values[item - 1:]
437 |             x_right = num[item - 1:].reshape(len(num) - item + 1, 1)
438 |             regr_left = linear_model.LinearRegression()
439 |             regr_left.fit(x_left, y_left)
440 |             y_left_pred = regr_left.predict(x_left)
441 |             regr_right = linear_model.LinearRegression()
442 |             regr_right.fit(x_right, y_right)
443 |             y_right_pred = regr_right.predict(x_right)
444 | 
445 |             err_left = mean_squared_error(y_left, y_left_pred)
446 |             err_right = mean_squared_error(y_right, y_right_pred)
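            # the two fits above implement the L-method: for every candidate split point,
            # fit one line to the left segment and one to the right segment, then (below)
            # keep the split with the smallest size-weighted total fitting error -- the
            # location of the "knee" of the evaluation curve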
weighted_err = (err_left * item + err_right * (len(num) - item + 1)) / (len(num) + 1) 448 | if weighted_err < min_weighted_err: 449 | optimal_num = num[item] 450 | min_weighted_err = weighted_err 451 | best_regr = [regr_left, regr_right] 452 | 453 | x_data = np.linspace(min(num), max(num), 100).reshape(100, 1) 454 | y_data_left = best_regr[0].predict(x_data) 455 | y_data_right = best_regr[1].predict(x_data) 456 | 457 | return optimal_num, x_data, y_data_left, y_data_right 458 | 459 | @staticmethod 460 | def get_RMSD_after_alignment(position_1, position_2): 461 | return rmsd(position_1, position_2, center=True, superposition=True) 462 | 463 | @staticmethod 464 | def metric_RMSD_of_atoms(list_of_files, ref_file='../resources/1l2y.pdb', ref_index=0, 465 | atom_selection_statement="name CA", step_interval=1): 466 | """ 467 | :param atom_selection_statement: could be either 468 | - "name CA" for alpha-carbon atoms only 469 | - "protein" for all atoms 470 | - "backbone" for backbone atoms 471 | - others: see more information here: https://pythonhosted.org/MDAnalysis/documentation_pages/selections.html 472 | """ 473 | ref = Universe(ref_file) 474 | ref_atom_selection = ref.select_atoms(atom_selection_statement) 475 | ref.trajectory[ref_index] 476 | ref_positions = ref_atom_selection.positions 477 | result_rmsd_of_atoms = [] 478 | index = 0 479 | 480 | for sample_file in list_of_files: 481 | sample = Universe(ref_file, sample_file) 482 | sample_atom_selection = sample.select_atoms(atom_selection_statement) 483 | 484 | for _ in sample.trajectory: 485 | if index % step_interval == 0: 486 | result_rmsd_of_atoms.append(Sutils.get_RMSD_after_alignment(ref_positions, 487 | sample_atom_selection.positions)) 488 | 489 | index += 1 490 | return np.array(result_rmsd_of_atoms) 491 | 492 | @staticmethod 493 | def get_positions_from_list_of_pdb(pdb_file_list, atom_selection_statement='name CA'): 494 | positions = [] 495 | for sample_file in pdb_file_list: 496 | sample = Universe(sample_file) 497 | sample_atom_selection = sample.select_atoms(atom_selection_statement) 498 | for _ in sample.trajectory: 499 | positions.append(sample_atom_selection.positions) 500 | return positions 501 | 502 | @staticmethod 503 | def get_RMSD_of_a_point_wrt_neighbors_in_PC_space_with_list_of_pdb(PCs, pdb_file_list, radius=0.1): 504 | """This function calculates RMSD of a configuration with respect to its neighbors in PC space, 505 | the purpose is to see if similar structures (small RMSD) are projected to points close to each other 506 | in PC space. 
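        A hypothetical usage sketch (file names and values are illustrative only):

            PCs = np.array([[0.1, 0.2], [0.12, 0.21], [0.9, 0.8]])   # one row per frame
            pdb_list = ['out_0.pdb', 'out_1.pdb']    # must contain the same frames, in the same order
            avg = Sutils.get_RMSD_of_a_point_wrt_neighbors_in_PC_space_with_list_of_pdb(PCs, pdb_list, radius=0.1)

        Small values indicate that neighbors in PC space are also structurally similar.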
507 | wrt = with respect to 508 | """ 509 | from sklearn.metrics.pairwise import euclidean_distances 510 | positions = Sutils.get_positions_from_list_of_pdb(pdb_file_list) 511 | pairwise_dis_in_PC = euclidean_distances(PCs) 512 | neighbor_matrix = pairwise_dis_in_PC < radius 513 | RMSD_diff_of_neighbors = np.zeros(neighbor_matrix.shape) 514 | for ii in range(len(PCs)): 515 | for jj in range(ii + 1, len(PCs)): 516 | if neighbor_matrix[ii][jj]: 517 | RMSD_diff_of_neighbors[ii, jj] = RMSD_diff_of_neighbors[jj, ii] \ 518 | = Sutils.get_RMSD_after_alignment(positions[ii], positions[jj]) 519 | average_RMSD_wrt_neighbors = [np.average([x for x in RMSD_diff_of_neighbors[ii] if x]) 520 | for ii in range(len(PCs))] 521 | return average_RMSD_wrt_neighbors 522 | 523 | @staticmethod 524 | def get_pairwise_distance_matrices_of_selected_atoms(list_of_files, step_interval=1, atom_selection='name CA'): 525 | distances_list = [] 526 | index = 0 527 | for sample_file in list_of_files: 528 | sample = Universe(sample_file) 529 | sample_atom_selection = sample.select_atoms(atom_selection) 530 | for _ in sample.trajectory: 531 | if index % step_interval == 0: 532 | distances_list.append( 533 | distance_array(sample_atom_selection.positions, sample_atom_selection.positions)) 534 | 535 | index += 1 536 | 537 | return np.array(distances_list) 538 | 539 | @staticmethod 540 | def get_non_repeated_pairwise_distance(list_of_files, step_interval=1, atom_selection='name CA'): 541 | """each element in this result is a list, not a matrix""" 542 | dis_matrix_list = Sutils.get_pairwise_distance_matrices_of_selected_atoms(list_of_files, step_interval, 543 | atom_selection) 544 | num_atoms = dis_matrix_list[0].shape[0] 545 | result = [] 546 | for mat in dis_matrix_list: 547 | p_distances = [] 548 | for item_1 in range(num_atoms): 549 | for item_2 in range(item_1 + 1, num_atoms): 550 | p_distances += [mat[item_1][item_2]] 551 | assert (len(p_distances) == num_atoms * (num_atoms - 1) // 2) 552 | result += [p_distances] 553 | 554 | return np.array(result) 555 | 556 | @staticmethod 557 | def get_non_repeated_pairwise_distance_from_pos_npy(pos_npy): 558 | from sklearn.metrics.pairwise import pairwise_distances 559 | num_atoms = pos_npy.shape[1] // 3 560 | temp_pos_npy = pos_npy.reshape(pos_npy.shape[0], num_atoms, 3) 561 | pairwise_dis = np.array([pairwise_distances(item, item) for item in temp_pos_npy]) 562 | temp_result = np.array( 563 | [[item[_1][_2] for _1 in range(num_atoms) for _2 in range(_1 + 1, num_atoms)] for item in pairwise_dis]) 564 | return temp_result 565 | 566 | @staticmethod 567 | def get_residue_relative_position_list(sample_file): 568 | sample = Universe(sample_file) 569 | temp_heavy_atoms = sample.select_atoms('not name H*') 570 | temp_CA_atoms = sample.select_atoms('name CA') 571 | residue_relative_position_list = [] 572 | 573 | for _ in sample.trajectory: 574 | temp_residue_relative_position_list = [] 575 | for temp_residue_index in sample.residues.resnums: 576 | temp_residue_relative_position_list.append( 577 | temp_heavy_atoms[temp_heavy_atoms.resnums == temp_residue_index].positions \ 578 | - temp_CA_atoms[temp_CA_atoms.resnums == temp_residue_index].positions) 579 | residue_relative_position_list.append(temp_residue_relative_position_list) 580 | return residue_relative_position_list 581 | 582 | 583 | class Alanine_dipeptide(Sutils): 584 | """docstring for Alanine_dipeptide""" 585 | def __init__(self): 586 | super(Alanine_dipeptide, self).__init__() 587 | return 588 | 589 | @staticmethod 590 | def 
get_cossin_from_a_coordinate(a_coordinate): 591 | num_of_coordinates = len(list(a_coordinate)) // 3 592 | a_coordinate = np.array(a_coordinate).reshape(num_of_coordinates, 3) 593 | diff_coordinates = a_coordinate[1:num_of_coordinates, :] - a_coordinate[0:num_of_coordinates - 1,:] # bond vectors 594 | diff_coordinates_1=diff_coordinates[0:num_of_coordinates-2,:];diff_coordinates_2=diff_coordinates[1:num_of_coordinates-1,:] 595 | normal_vectors = np.cross(diff_coordinates_1, diff_coordinates_2) 596 | normal_vectors_normalized = np.array([x / sqrt(np.dot(x,x)) for x in normal_vectors]) 597 | normal_vectors_normalized_1 = normal_vectors_normalized[0:num_of_coordinates-3, :]; normal_vectors_normalized_2 = normal_vectors_normalized[1:num_of_coordinates-2,:] 598 | diff_coordinates_mid = diff_coordinates[1:num_of_coordinates-2] # these are bond vectors in the middle (remove the first and last one), they should be perpendicular to adjacent normal vectors 599 | 600 | cos_of_angles = list(range(len(normal_vectors_normalized_1))) 601 | sin_of_angles_vec = list(range(len(normal_vectors_normalized_1))) 602 | sin_of_angles = list(range(len(normal_vectors_normalized_1))) # initialization 603 | result = [] 604 | 605 | for index in range(len(normal_vectors_normalized_1)): 606 | cos_of_angles[index] = np.dot(normal_vectors_normalized_1[index], normal_vectors_normalized_2[index]) 607 | sin_of_angles_vec[index] = np.cross(normal_vectors_normalized_1[index], normal_vectors_normalized_2[index]) 608 | sin_of_angles[index] = sqrt(np.dot(sin_of_angles_vec[index], sin_of_angles_vec[index])) * np.sign(sum(sin_of_angles_vec[index]) * sum(diff_coordinates_mid[index])) 609 | result += [cos_of_angles[index], sin_of_angles[index]] 610 | 611 | return result 612 | 613 | @staticmethod 614 | def get_many_cossin_from_coordinates(coordinates): 615 | return list(map(Alanine_dipeptide.get_cossin_from_a_coordinate, coordinates)) 616 | 617 | @staticmethod 618 | def get_many_cossin_from_coordinates_in_list_of_files(list_of_files, step_interval=1, format='npy'): 619 | coordinates = [] 620 | for item in list_of_files: 621 | temp_coordinates = Helper_func.load_npy(item, format=format) 622 | # the result could be 1D or 2D numpy array, need further checking 623 | if temp_coordinates.shape[0] != 0: # remove info from empty files 624 | if len(temp_coordinates.shape) == 1: # if 1D numpy array, convert it to 2D array for consistency 625 | temp_coordinates = temp_coordinates[:, None].T 626 | 627 | coordinates += list(temp_coordinates) 628 | 629 | coordinates = coordinates[::step_interval] 630 | result = Alanine_dipeptide.get_many_cossin_from_coordinates(coordinates) 631 | 632 | return result 633 | 634 | @staticmethod 635 | def get_many_dihedrals_from_coordinates_in_file (list_of_files): 636 | # why we need to get dihedrals from a list of coordinate files? 
637 |         # because we will probably need to plot other files outside self._list_of_coor_data_files
638 |         temp = Alanine_dipeptide.get_many_cossin_from_coordinates_in_list_of_files(list_of_files)
639 |         return Alanine_dipeptide.get_many_dihedrals_from_cossin(temp)
640 | 
641 |     @staticmethod
642 |     def get_many_dihedrals_from_cossin(cossin):
643 |         result = []
644 |         for item in cossin:
645 |             assert (len(item) == 8)
646 |             temp_angle = []
647 |             for ii in range(4):
648 |                 temp_angle += [np.arctan2(item[2 * ii + 1], item[2 * ii])]
649 | 
650 |             result += [list(temp_angle)]
651 |         return result
652 | 
653 |     @staticmethod
654 |     def generate_coordinates_from_pdb_files(path_for_pdb=CONFIG_12):
655 |         index_of_backbone_atoms = [str(item) for item in CONFIG_57[0]]
656 |         output_file_list = Sutils._generate_coordinates_from_pdb_files(index_of_backbone_atoms, file_path=path_for_pdb)
657 |         return output_file_list
658 | 
659 |     @staticmethod
660 |     def get_expression_script_for_plumed(scaling_factor=CONFIG_49):
661 |         index_of_backbone_atoms = CONFIG_57[0]
662 |         return Plumed_helper.get_atom_positions(index_of_backbone_atoms, scaling_factor, unit_scaling=1.0)
663 | 
664 | 
665 | class Trp_cage(Sutils):
666 |     """docstring for Trp_cage"""
667 |     def __init__(self):
668 |         super(Trp_cage, self).__init__()
669 |         return
670 | 
671 |     @staticmethod
672 |     def get_cossin_of_a_dihedral_from_four_atoms(coord_1, coord_2, coord_3, coord_4):
673 |         """each parameter is the 3D Cartesian coordinate of an atom"""
674 |         coords_of_four = np.array([coord_1, coord_2, coord_3, coord_4])
675 |         num_of_coordinates = 4
676 |         diff_coordinates = coords_of_four[1:num_of_coordinates, :] - coords_of_four[0:num_of_coordinates - 1,:]  # bond vectors
677 |         diff_coordinates_1=diff_coordinates[0:num_of_coordinates-2,:];diff_coordinates_2=diff_coordinates[1:num_of_coordinates-1,:]
678 |         normal_vectors = np.cross(diff_coordinates_1, diff_coordinates_2)
679 |         normal_vectors_normalized = np.array([x / sqrt(np.dot(x,x)) for x in normal_vectors])
680 |         normal_vectors_normalized_1 = normal_vectors_normalized[0:num_of_coordinates-3, :]; normal_vectors_normalized_2 = normal_vectors_normalized[1:num_of_coordinates-2,:]
681 |         diff_coordinates_mid = diff_coordinates[1:num_of_coordinates-2]   # these are bond vectors in the middle (remove the first and last one), they should be perpendicular to adjacent normal vectors
682 | 
683 |         index = 0
684 |         cos_of_angle = np.dot(normal_vectors_normalized_1[index], normal_vectors_normalized_2[index])
685 |         sin_of_angle_vec = np.cross(normal_vectors_normalized_1[index], normal_vectors_normalized_2[index])
686 |         if sin_of_angle_vec[0] != 0 and diff_coordinates_mid[index][0] != 0:
687 |             component_index = 0
688 |         elif sin_of_angle_vec[1] != 0 and diff_coordinates_mid[index][1] != 0:
689 |             component_index = 1
690 |         else:
691 |             component_index = 2
692 | 
693 |         sin_of_angle = sqrt(np.dot(sin_of_angle_vec, sin_of_angle_vec)) * np.sign(sin_of_angle_vec[component_index] * diff_coordinates_mid[index][component_index])
694 |         try:
695 |             assert (abs(cos_of_angle ** 2 + sin_of_angle ** 2 - 1) < 0.0001)
696 |         except AssertionError:
697 |             print("error: cos^2 x + sin^2 x != 1, it is %f" % (cos_of_angle ** 2 + sin_of_angle ** 2))
698 |             # print ("coordinates of four atoms are:")
699 |             # print (coords_of_four)
700 | 
701 |         return [cos_of_angle, sin_of_angle]
702 | 
703 |     @staticmethod
704 |     def get_coordinates_of_atom_with_index(a_coordinate, index):
705 |         """:param a_coordinate: flattened coordinates of all 60 backbone atoms (20 residues)"""
706 |         return [a_coordinate[3 * index], a_coordinate[3 * index + 1], a_coordinate[3
* index + 2]] 707 | 708 | @staticmethod 709 | def get_cossin_from_a_coordinate(a_coordinate): 710 | total_num_of_residues = 20 711 | list_of_idx_four_atoms = [[[3 * x - 1, 3 * x, 3 * x + 1, 3 * x + 2], 712 | [3 * x, 3 * x + 1, 3 * x + 2, 3 * x + 3]] for x in list(range(total_num_of_residues))] 713 | list_of_idx_four_atoms = reduce(lambda x, y: x + y, list_of_idx_four_atoms) 714 | list_of_idx_four_atoms = [x for x in list_of_idx_four_atoms if x[0] >= 0 and x[3] < 3 * total_num_of_residues] 715 | 716 | assert (len(list_of_idx_four_atoms) == 38) 717 | 718 | result = [] 719 | 720 | for item in list_of_idx_four_atoms: 721 | parameter_list = [Trp_cage.get_coordinates_of_atom_with_index(a_coordinate, x) for x in item] 722 | [cos_value, sin_value] = Trp_cage.get_cossin_of_a_dihedral_from_four_atoms(*parameter_list) 723 | # print(item) 724 | # print(cos_value, sin_value) 725 | result += [cos_value, sin_value] 726 | 727 | return result 728 | 729 | @staticmethod 730 | def get_many_cossin_from_coordinates(coordinates): 731 | return list(map(Trp_cage.get_cossin_from_a_coordinate, coordinates)) 732 | 733 | @staticmethod 734 | def get_many_cossin_from_coordinates_in_list_of_files(list_of_files, step_interval=1, format='npy'): 735 | coordinates = [] 736 | for item in list_of_files: 737 | temp_coordinates = Helper_func.load_npy(item, format=format) # the result could be 1D or 2D numpy array, need further checking 738 | if temp_coordinates.shape[0] != 0: # remove info from empty files 739 | if len(temp_coordinates.shape) == 1: # if 1D numpy array, convert it to 2D array for consistency 740 | temp_coordinates = temp_coordinates[:, None].T 741 | 742 | coordinates += list(temp_coordinates) 743 | 744 | coordinates = coordinates[::step_interval] 745 | result = Trp_cage.get_many_cossin_from_coordinates(coordinates) 746 | 747 | return result 748 | 749 | @staticmethod 750 | def get_many_dihedrals_from_coordinates_in_file (list_of_files, step_interval=1): 751 | # why we need to get dihedrals from a list of coordinate files? 
752 | # because we will probably need to plot other files outside self._list_of_coor_data_files 753 | temp = Trp_cage.get_many_cossin_from_coordinates_in_list_of_files(list_of_files, step_interval) 754 | return Trp_cage.get_many_dihedrals_from_cossin(temp) 755 | 756 | @staticmethod 757 | def get_many_dihedrals_from_cossin(cossin): 758 | result = [] 759 | for item in cossin: 760 | temp_angle = [] 761 | len_of_cos_sin = 76 762 | assert (len(item) == len_of_cos_sin), (len(item), len_of_cos_sin) 763 | for idx_of_angle in range(len_of_cos_sin // 2): 764 | temp_angle += [np.arctan2(item[2 * idx_of_angle + 1], item[2 * idx_of_angle])] 765 | 766 | assert (len(temp_angle) == len_of_cos_sin // 2) 767 | 768 | result += [temp_angle] 769 | 770 | assert (len(result) == len(cossin)) 771 | 772 | return result 773 | 774 | @staticmethod 775 | def generate_coordinates_from_pdb_files(path_for_pdb = CONFIG_12): 776 | index_of_backbone_atoms = [str(item) for item in CONFIG_57[1]] 777 | assert (len(index_of_backbone_atoms) % 3 == 0) 778 | 779 | output_file_list = Sutils._generate_coordinates_from_pdb_files(index_of_backbone_atoms, file_path=path_for_pdb) 780 | 781 | return output_file_list 782 | 783 | @staticmethod 784 | def metric_get_diff_pairwise_distance_matrices_of_alpha_carbon(list_of_files, ref_file ='../resources/1l2y.pdb', step_interval = 1): 785 | ref = Trp_cage.get_pairwise_distance_matrices_of_selected_atoms([ref_file]) 786 | sample = Trp_cage.get_pairwise_distance_matrices_of_selected_atoms(list_of_files, step_interval) 787 | diff = [np.linalg.norm(ref[0] - x) for x in sample] 788 | return diff 789 | 790 | @staticmethod 791 | def metric_get_residue_9_16_salt_bridge_distance(list_of_files, step_interval = 1): 792 | distances_list = [] 793 | index = 0 794 | for sample_file in list_of_files: 795 | sample = Universe(sample_file) 796 | sample_atom_selection_1 = sample.select_atoms("name OD2 and resid 9") 797 | sample_atom_selection_2 = sample.select_atoms("name NH2 and resid 16") 798 | for _ in sample.trajectory: 799 | if index % step_interval == 0: 800 | distances_list.append( 801 | distance_array(sample_atom_selection_1.positions, sample_atom_selection_2.positions)) 802 | 803 | index += 1 804 | 805 | return np.array(distances_list).flatten() 806 | 807 | @staticmethod 808 | def metric_chirality(list_of_files, step_interval=1): 809 | result = [] 810 | index = 0 811 | for temp_file in list_of_files: 812 | temp_universe = Universe(temp_file) 813 | for _ in temp_universe.trajectory: 814 | if index % step_interval == 0: 815 | atom_list = [temp_universe.select_atoms('name CA and resid %d' % item).positions[0] 816 | for item in [1, 9, 14, 20]] 817 | result.append(Trp_cage.get_cossin_of_a_dihedral_from_four_atoms( 818 | atom_list[0], atom_list[1], atom_list[2], atom_list[3])[1]) 819 | index += 1 820 | return np.array(result) 821 | 822 | @staticmethod 823 | def metric_vertical_shift(list_of_files, step_interval=1): 824 | result = [] 825 | index = 0 826 | for temp_file in list_of_files: 827 | temp_universe = Universe(temp_file) 828 | for _ in temp_universe.trajectory: 829 | if index % step_interval == 0: 830 | atom_list = [temp_universe.select_atoms('name CA and resid %d' % item).positions[0] 831 | for item in [1, 11, 20]] 832 | result.append(np.linalg.norm(atom_list[0] - atom_list[1]) - np.linalg.norm(atom_list[2] - atom_list[1])) 833 | index += 1 834 | return np.array(result) 835 | 836 | @staticmethod 837 | def metric_get_number_of_native_contacts(list_of_files, ref_file ='../resources/1l2y.pdb', threshold = 8, 
step_interval = 1): 838 | ref = Trp_cage.get_pairwise_distance_matrices_of_selected_atoms([ref_file]) 839 | sample = Trp_cage.get_pairwise_distance_matrices_of_selected_atoms(list_of_files, step_interval) 840 | 841 | result = [sum(sum(((x < threshold) & (ref[0] < threshold)).astype(int))) for x in sample] 842 | return result 843 | 844 | @staticmethod 845 | def metric_radius_of_gyration(list_of_files, step_interval = 1, atom_selection_statement = "name CA"): 846 | result = [] 847 | index = 0 848 | for item_file in list_of_files: 849 | temp_sample = Universe(item_file) 850 | temp_atoms = temp_sample.select_atoms(atom_selection_statement) 851 | for _ in temp_sample.trajectory: 852 | if index % step_interval == 0: 853 | result.append(temp_atoms.radius_of_gyration()) 854 | index += 1 855 | 856 | return result 857 | 858 | @staticmethod 859 | def get_pairwise_RMSD_after_alignment_for_a_file(sample_file, atom_selection_statement = 'name CA'): 860 | sample_1 = Universe(sample_file); sample_2 = Universe(sample_file) # should use two variables here, otherwise it will be 0, might be related to iterator issue? 861 | sel_1 = sample_1.select_atoms(atom_selection_statement); sel_2 = sample_2.select_atoms(atom_selection_statement) 862 | 863 | return [[rmsd(sel_1.positions, sel_2.positions, center=True, superposition=True) for _2 in sample_2.trajectory] for _1 in sample_1.trajectory] 864 | 865 | @staticmethod 866 | def structure_clustering_in_a_file(sample_file, atom_selection_statement = 'name CA', 867 | write_most_common_class_into_file = False, 868 | output_file_name = None, 869 | eps=0.5, 870 | min_num_of_neighboring_samples = 2 871 | ): 872 | pairwise_RMSD = Trp_cage.get_pairwise_RMSD_after_alignment_for_a_file(sample_file, atom_selection_statement=atom_selection_statement) 873 | from sklearn.cluster import DBSCAN 874 | 875 | dbscan_obj = DBSCAN(metric='precomputed', eps=eps, min_samples=min_num_of_neighboring_samples).fit(pairwise_RMSD) 876 | class_labels = dbscan_obj.labels_ 877 | max_class_label = max(class_labels) 878 | num_in_each_class = {label: np.where(class_labels == label)[0].shape[0] for label in range(-1, max_class_label + 1)} 879 | most_common_class_labels = sorted(list(num_in_each_class.keys()), key=lambda x: num_in_each_class[x], reverse=True) 880 | with open(sample_file, 'r') as in_file: 881 | content = [item for item in in_file.readlines() if not 'REMARK' in item] 882 | content = ''.join(content) 883 | content = content.split('MODEL')[1:] # remove header 884 | assert (len(content) == len(class_labels)) 885 | 886 | if most_common_class_labels[0] == -1: 887 | raise Exception("too many outliers, check if there is actually a cluster, or adjust parameters") 888 | else: 889 | index_of_most_common_class = np.where(class_labels == most_common_class_labels[0])[0] 890 | if write_most_common_class_into_file: 891 | if output_file_name is None: 892 | output_file_name = sample_file.replace('.pdb', '_most_common.pdb') 893 | 894 | frames_to_use = [content[ii] for ii in index_of_most_common_class] 895 | with open(output_file_name, 'w') as out_file: 896 | for frame in frames_to_use: 897 | out_file.write("MODEL" + frame) 898 | 899 | return num_in_each_class, index_of_most_common_class, most_common_class_labels[0] 900 | 901 | @staticmethod 902 | def rotating_dihedral_angles_and_save_to_pdb(input_pdb, target_dihedrals, output_pdb): 903 | pdb_parser = PDB.PDBParser(QUIET=True) 904 | temp_structure = pdb_parser.get_structure('temp', input_pdb) 905 | coor_file = 
Trp_cage.generate_coordinates_from_pdb_files(input_pdb)[0] 906 | current_dihedrals = Trp_cage.get_many_dihedrals_from_coordinates_in_file([coor_file]) 907 | rotation_angles = np.array(target_dihedrals) - np.array(current_dihedrals) 908 | 909 | atom_indices_in_each_residue = [[]] * 20 910 | temp_model = list(temp_structure.get_models())[0] 911 | for _1, item in list(enumerate(temp_model.get_residues())): 912 | atom_indices_in_each_residue[_1] = [int(_2.get_serial_number()) - 1 for _2 in item.get_atoms()] 913 | 914 | for temp_model in temp_structure.get_models(): 915 | atoms_in_this_frame = list(temp_model.get_atoms()) 916 | temp_coords = np.array([_1.get_coord() for _1 in atoms_in_this_frame]) 917 | 918 | for item in range(19): # 19 * 2 = 38 dihedrals in total 919 | C_atom_in_this_residue = list(filter(lambda x: x.get_name() == "C", atoms_in_this_frame))[item] 920 | CA_atom_in_this_residue = list(filter(lambda x: x.get_name() == "CA", atoms_in_this_frame))[item] 921 | CA_atom_in_next_residue = list(filter(lambda x: x.get_name() == "CA", atoms_in_this_frame))[item + 1] 922 | N_atom_in_next_residue = list(filter(lambda x: x.get_name() == "N", atoms_in_this_frame))[item + 1] 923 | 924 | axis_vector_0 = C_atom_in_this_residue.get_coord() - CA_atom_in_this_residue.get_coord() 925 | axis_vector_1 = CA_atom_in_next_residue.get_coord() - N_atom_in_next_residue.get_coord() 926 | 927 | fixed_coord_0 = temp_coords[int(C_atom_in_this_residue.get_serial_number()) - 1] 928 | fixed_coord_1 = temp_coords[int(N_atom_in_next_residue.get_serial_number()) - 1] 929 | 930 | indices_atom_to_rotate = reduce(lambda x, y: x + y, atom_indices_in_each_residue[:item + 1]) 931 | 932 | temp_coords = Sutils.rotating_group_of_atoms(temp_coords, indices_atom_to_rotate, fixed_coord_0, 933 | axis_vector_0, rotation_angles[temp_model.get_id()][2 * item]) 934 | temp_coords = Sutils.rotating_group_of_atoms(temp_coords, indices_atom_to_rotate, fixed_coord_1, 935 | axis_vector_1, rotation_angles[temp_model.get_id()][2 * item + 1]) 936 | 937 | # save coordinates into structure 938 | for _1, item in enumerate(temp_model.get_atoms()): 939 | item.set_coord(temp_coords[_1]) 940 | 941 | io = PDB.PDBIO() 942 | io.set_structure(temp_structure) 943 | io.save(output_pdb) 944 | return 945 | 946 | @staticmethod 947 | def get_expression_script_for_plumed(scaling_factor=CONFIG_49): 948 | index_of_backbone_atoms = CONFIG_57[1] 949 | return Plumed_helper.get_atom_positions(index_of_backbone_atoms, scaling_factor, unit_scaling=1.0) 950 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/remove_water_mol.py: -------------------------------------------------------------------------------- 1 | from ANN_simulation import * 2 | import argparse, subprocess 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--path", type=str, default="", help="specify the directory/file containing the pdb files") 6 | parser.add_argument("--remove_original", help="remove original pdb files", action="store_true") 7 | args = parser.parse_args() 8 | 9 | if args.remove_original: 10 | Sutils.remove_water_mol_and_Cl_from_pdb_file(folder_for_pdb = args.path, preserve_original_file=False) 11 | else: 12 | Sutils.remove_water_mol_and_Cl_from_pdb_file(folder_for_pdb = args.path, preserve_original_file=True) 13 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/structural_alignment.py: 
--------------------------------------------------------------------------------
 1 | """
 2 | modified from the code: https://gist.github.com/andersx/6354971
 3 | """
 4 | 
 5 | import Bio.PDB, argparse, subprocess, os
 6 | from MDAnalysis import *
 7 | from MDAnalysis.analysis.align import *
 8 | from MDAnalysis.analysis.rms import rmsd
 9 | 
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument("sample_path", type=str, help="path (file or folder) of pdb file(s) to be aligned")
12 | parser.add_argument("--ignore_aligned_file", type=int, default=1)
13 | parser.add_argument("--ref", type=str, help="reference pdb file")
14 | parser.add_argument("--name", type=str, default=None, help='name of the aligned pdb file')
15 | parser.add_argument('--remove_original', help='remove original pdb file after doing structural alignment', action="store_true")
16 | parser.add_argument('--suffix', type=str, default="", help="string that appends at the end of filename")
17 | parser.add_argument('--atom_selection', type=str, default='backbone', help='atom_selection_statement for alignment')
18 | args = parser.parse_args()
19 | 
20 | ref_pdb = args.ref
21 | 
22 | traj_files = subprocess.check_output([
23 |     'find', args.sample_path, '-name', "*.pdb", '-o', '-name', '*.dcd']).decode("utf-8").strip().split('\n')
24 | if args.ignore_aligned_file:
25 |     traj_files = [x for x in traj_files if not '_aligned' in x]
26 | 
27 | for sample_traj in traj_files:
28 |     print("doing structural alignment for %s" % sample_traj)
29 | 
30 |     if args.name is None:
31 |         output_pdb_file = sample_traj[:-4] + '_aligned%s.pdb' % (args.suffix)
32 |     else:
33 |         output_pdb_file = args.name
34 | 
35 |     if os.path.exists(output_pdb_file) and os.path.getmtime(sample_traj) < os.path.getmtime(output_pdb_file):
36 |         print("aligned file already exists: %s (remove previous one if needed)" % output_pdb_file)
37 |     else:
38 |         ref = Universe(ref_pdb)
39 |         m_traj = Universe(ref_pdb, sample_traj)
40 |         AlignTraj(m_traj, reference=ref, filename=output_pdb_file, select=args.atom_selection).run()
41 |         print("done structural alignment for %s" % sample_traj)
42 | 
43 |     if args.remove_original:
44 |         subprocess.check_output(['rm', sample_traj])
45 |         print("%s removed!" % sample_traj)
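# usage sketch (hypothetical paths): align every *.pdb / *.dcd found under a folder to a
# reference structure, writing <name>_aligned<suffix>.pdb next to each input, e.g.
#   python structural_alignment.py ../target/Trp_cage --ref ../resources/1l2y.pdb --atom_selection 'name CA'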
46 | 
--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/src/tf_load.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'   # disable tensorflow warning messages (https://stackoverflow.com/questions/35911252/disable-tensorflow-debugging-information)
3 | import tensorflow as tf, keras.backend as K
4 | from keras.models import load_model
5 | config = tf.ConfigProto()
6 | config.gpu_options.allow_growth = True   # avoid tensorflow using all GPU memory
7 | K.tensorflow_backend.set_session(tf.Session(config=config))
--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/src/train_network_and_save_for_iter.py:
--------------------------------------------------------------------------------
 1 | """train autoencoder and save into file
 2 | this file is typically used for running training in an iteration
 3 | """
 4 | 
 5 | from ANN_simulation import *
 6 | import argparse
 7 | 
 8 | parser = argparse.ArgumentParser()
 9 | parser.add_argument("index", type=int, help="index of autoencoder")
10 | parser.add_argument("--training_interval", type=int, default=1, help="training interval")
11 | parser.add_argument("--num_of_trainings", type=int, default=CONFIG_13, help="total number of trainings (and pick the best one to save)")
12 | parser.add_argument("--num_of_copies", type=int, default=CONFIG_52, help="num of copies for data augmentation")
13 | parser.add_argument("--lr_m", type=str, default=None, help="learning rate and momentum")
14 | parser.add_argument("--output_file", type=str, default=None, help="file name to save autoencoder")
15 | parser.add_argument('--data_folder', type=str, default=None, help="folder containing training data")
16 | parser.add_argument('--in_data', type=str, default=None, help="npy file containing pre-computed input data")
17 | parser.add_argument('--out_data', type=str, default=None, help="npy file containing pre-computed output data, if in_data is not None while out_data is None, then out_data is set to be in_data")
18 | parser.add_argument('--node_num', type=str, default=None, help="node number")
19 | parser.add_argument('--batch_size', type=int, default=None, help='batch size')
20 | parser.add_argument('--auto_dim', type=int, default=CONFIG_79, help="automatically determine input/output dim based on data")
21 | parser.add_argument('--auto_scale', type=int, default=0, help="automatically scale inputs and outputs")
22 | parser.add_argument('--save_to_data_files', type=str, default=None, help="save training data to external files if it is not None, example: 'temp_in.npy,temp_out.npy' ")
23 | parser.add_argument('--lag_time', type=int, default=0, help='lag time for time lagged autoencoder')
24 | parser.add_argument('--rec_loss_type', type=int, default=1, help='0: standard rec loss, 1: lagged rec loss, 2: no rec loss (pytorch only)')
25 | parser.add_argument('--rec_weight', type=float, default=1.0, help='weight of reconstruction loss (pytorch only)')
26 | parser.add_argument('--autocorr_weight', type=float, default=1.0, help='weight of autocorrelation loss in the loss function (pytorch only)')
27 | parser.add_argument('--pearson_weight', type=float, default=None, help='weight of pearson loss (pytorch only)')
28 | parser.add_argument('--sf', type=str, default=None, help='model to start with (pytorch only)')
29 | args = 
parser.parse_args() 30 | 31 | def get_data_from_folder(temp_folder, input_type, output_type): 32 | my_coor_data_obj = coordinates_data_files_list( 33 | list_of_dir_of_coor_data_files=[temp_folder]) 34 | coor_data_obj_input = my_coor_data_obj.create_sub_coor_data_files_list_using_filter_conditional( 35 | lambda x: not 'aligned' in x) 36 | if input_type == 'cossin': 37 | data_set = np.array(molecule_type.get_many_cossin_from_coordinates_in_list_of_files( 38 | coor_data_obj_input.get_list_of_coor_data_files(), step_interval=args.training_interval)) 39 | elif input_type == 'Cartesian': 40 | scaling_factor = CONFIG_49 41 | data_set = coor_data_obj_input.get_coor_data(scaling_factor) 42 | data_set = data_set[::args.training_interval] 43 | data_set = Sutils.remove_translation(data_set) 44 | assert (Sutils.check_center_of_mass_is_at_origin(data_set)) 45 | elif input_type == 'pairwise_distance': 46 | data_set = np.array(Sutils.get_non_repeated_pairwise_distance( 47 | coor_data_obj_input.get_list_of_corresponding_pdb_dcd(), step_interval=args.training_interval, 48 | atom_selection=CONFIG_73)) / CONFIG_49 49 | else: 50 | raise Exception('error input type') 51 | 52 | if output_type == 'cossin': 53 | output_data_set = np.array(molecule_type.get_many_cossin_from_coordinates_in_list_of_files( 54 | coor_data_obj_input.get_list_of_coor_data_files(), step_interval=args.training_interval)) 55 | elif output_type == 'Cartesian': 56 | scaling_factor = CONFIG_49 57 | alignment_coor_file_suffix_list = CONFIG_61 58 | output_data_set = Sutils.prepare_output_Cartesian_coor_with_multiple_ref_structures( 59 | [temp_folder], alignment_coor_file_suffix_list, scaling_factor) 60 | output_data_set = output_data_set[::args.training_interval] 61 | mixed_error_function = CONFIG_71 # TODO: refactor this part later 62 | if mixed_error_function: 63 | if CONFIG_30 == "Trp_cage": 64 | output_data_set_1 = Sutils.remove_translation( 65 | output_data_set[:, list(range(9 * 1, 9 * 8))]) # mixed_err 66 | output_data_set_2 = Sutils.remove_translation(output_data_set[:, list(range(180, 360))]) 67 | output_data_set = np.concatenate([4.0 * output_data_set_1, output_data_set_2], 68 | axis=1) # TODO: may modify this relative weight later 69 | elif CONFIG_30 == "Src_kinase": 70 | output_data_set_1 = Sutils.remove_translation( 71 | output_data_set[:, list(range(9 * 143, 9 * 170))]) # mixed_err 72 | output_data_set_2 = Sutils.remove_translation( 73 | output_data_set[:, list(range(2358 + 9 * 43, 2358 + 9 * 58))]) 74 | output_data_set = np.concatenate([output_data_set_1, output_data_set_2], axis=1) 75 | assert (Sutils.check_center_of_mass_is_at_origin(output_data_set)) 76 | elif output_type == 'pairwise_distance': 77 | output_data_set = np.array(Sutils.get_non_repeated_pairwise_distance( 78 | coor_data_obj_input.get_list_of_corresponding_pdb_dcd(), step_interval=args.training_interval, 79 | atom_selection=CONFIG_73)) / CONFIG_49 80 | elif output_type == 'combined': 81 | scaling_factor = CONFIG_49 82 | alignment_coor_file_suffix_list = CONFIG_61 83 | output_data_set = Sutils.prepare_output_Cartesian_coor_with_multiple_ref_structures( 84 | [temp_folder], alignment_coor_file_suffix_list, scaling_factor) 85 | output_data_set = output_data_set[::args.training_interval] 86 | mixed_error_function = CONFIG_71 # TODO: refactor this part later 87 | assert mixed_error_function # mixed error is required 88 | if CONFIG_30 == "Trp_cage": 89 | output_data_set_1 = Sutils.remove_translation(output_data_set[:, list(range(9 * 1, 9 * 8))]) # mixed_err 90 | 
output_data_set_2 = Sutils.remove_translation(output_data_set[:, list(range(180, 360))]) 91 | output_data_set = np.concatenate([4.0 * output_data_set_1, output_data_set_2], 92 | axis=1) # TODO: may modify this relative weight later 93 | else: 94 | raise Exception('not defined') 95 | temp_output_data_set = np.array(Sutils.get_non_repeated_pairwise_distance( 96 | coor_data_obj_input.get_list_of_corresponding_pdb_dcd(), step_interval=args.training_interval, 97 | atom_selection=CONFIG_73)) / CONFIG_49 98 | output_data_set = np.concatenate([output_data_set, temp_output_data_set], axis=1) 99 | else: 100 | raise Exception('error output data type') 101 | return data_set, output_data_set 102 | 103 | # used to process additional arguments 104 | additional_argument_list = {} 105 | if not args.output_file is None: 106 | additional_argument_list['filename_to_save_network'] = args.output_file 107 | if not args.lr_m is None: 108 | temp_lr = float(args.lr_m.strip().split(',')[0]) 109 | temp_momentum = float(args.lr_m.strip().split(',')[1]) 110 | additional_argument_list['network_parameters'] = [temp_lr, temp_momentum, 0, True, CONFIG_4[4]] 111 | if not args.batch_size is None: 112 | additional_argument_list['batch_size'] = args.batch_size 113 | 114 | if args.data_folder is None: 115 | args.data_folder = '../target/' + CONFIG_30 116 | 117 | fraction_of_data_to_be_saved = 1 # save all training data by default 118 | input_data_type, output_data_type = CONFIG_48, CONFIG_76 119 | 120 | # getting training data 121 | if not args.in_data is None: 122 | data_set = np.load(args.in_data) 123 | if args.out_data is None: 124 | output_data_set = data_set 125 | else: 126 | output_data_set = np.load(args.out_data) 127 | else: 128 | data_set, output_data_set = get_data_from_folder(args.data_folder, input_data_type, output_data_type) 129 | 130 | assert (len(data_set) == len(output_data_set)) 131 | use_representative_points_for_training = CONFIG_58 132 | if use_representative_points_for_training: 133 | data_set, output_data_set = Sutils.select_representative_points(data_set, output_data_set) 134 | 135 | if input_data_type == 'Cartesian' and args.in_data is None: 136 | print('applying data augmentation...') 137 | data_set, output_data_set = Sutils.data_augmentation(data_set, output_data_set, args.num_of_copies, 138 | is_output_reconstructed_Cartesian=(output_data_type == 'Cartesian')) 139 | fraction_of_data_to_be_saved = 1.0 / args.num_of_copies 140 | else: 141 | print("data augmentation not applied") 142 | 143 | scaling_factor_for_expected_output = CONFIG_75 # TODO: is this useful? 
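# note (added explanation): if CONFIG_75 is a weight vector w, the block below computes
# output_data_set @ diag(w), i.e. it scales column j of the expected output by w[j], so
# that selected output dimensions contribute more to the reconstruction error, e.g.
#     scaling_factor_for_expected_output = [1.0, 1.0, 4.0]   # hypothetical: up-weight the third dimension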
144 | if not scaling_factor_for_expected_output is None: 145 | print("expected output is weighted by %s" % str(scaling_factor_for_expected_output)) 146 | output_data_set = np.dot(output_data_set, np.diag(scaling_factor_for_expected_output)) 147 | 148 | if args.node_num is None: 149 | temp_node_num = CONFIG_3[:] # deep copy list 150 | else: 151 | temp_node_num = [int(item) for item in args.node_num.split(',')] 152 | 153 | if args.auto_dim: temp_node_num[0], temp_node_num[-1] = data_set.shape[1], output_data_set.shape[1] 154 | additional_argument_list['node_num'] = temp_node_num 155 | 156 | if args.auto_scale: 157 | auto_scaling_factor = np.max(np.abs(data_set)).astype(np.float) 158 | print("auto_scaling_factor = %f" % auto_scaling_factor) 159 | data_set /= auto_scaling_factor 160 | output_data_set /= (np.max(np.abs(output_data_set)).astype(np.float)) 161 | assert np.max(np.abs(data_set)) == 1.0 and np.max(np.abs(output_data_set)) == 1.0 162 | 163 | print("min/max of output = %f, %f, min/max of input = %f, %f" % (np.min(output_data_set), np.max(output_data_set), 164 | np.min(data_set), np.max(data_set))) 165 | 166 | if not args.save_to_data_files is None: 167 | args.save_to_data_files = args.save_to_data_files.split(',') 168 | 169 | if CONFIG_45 == 'keras': 170 | temp_network_list = [autoencoder_Keras(index=args.index, 171 | data_set_for_training=data_set, 172 | output_data_set=output_data_set, 173 | data_files=args.save_to_data_files, 174 | **additional_argument_list 175 | ) for _ in range(args.num_of_trainings)] 176 | elif CONFIG_45 == 'pytorch': 177 | additional_argument_list['rec_loss_type'] = args.rec_loss_type 178 | additional_argument_list['start_from'] = args.sf 179 | additional_argument_list['rec_weight'] = args.rec_weight 180 | additional_argument_list['autocorr_weight'] = args.autocorr_weight 181 | additional_argument_list['pearson_weight'] = args.pearson_weight 182 | temp_network_list = [autoencoder_torch(index=args.index, 183 | data_set_for_training=data_set, 184 | output_data_set=output_data_set, 185 | data_files=args.save_to_data_files, 186 | **additional_argument_list 187 | ) for _ in range(args.num_of_trainings)] 188 | else: 189 | raise Exception ('this training backend not implemented') 190 | 191 | for item in temp_network_list: item.train(lag_time=args.lag_time) 192 | 193 | if len(temp_network_list) == 1: 194 | best_network = temp_network_list[0] 195 | # if np.all(np.isnan(best_network.get_PCs())): 196 | # best_network = None 197 | else: 198 | temp_FVE_list = [item.get_fraction_of_variance_explained() for item in temp_network_list] 199 | max_FVE = np.max(temp_FVE_list) 200 | print('temp_FVE_list = %s, max_FVE = %f' % (str(temp_FVE_list), max_FVE)) 201 | best_network = temp_network_list[temp_FVE_list.index(max_FVE)] 202 | assert (isinstance(best_network, autoencoder)) 203 | assert (best_network.get_fraction_of_variance_explained() == max_FVE) 204 | 205 | best_network.save_into_file(fraction_of_data_to_be_saved=fraction_of_data_to_be_saved) 206 | print("excited! 
this is the name of best network: %s" % best_network._filename_to_save_network)  # this line is used to locate file name of neural network
--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/src/workqueue.py:
--------------------------------------------------------------------------------
 1 | """
 2 | this program takes a file containing all Python programs to run as input,
 3 | puts these programs into a work queue, and makes sure that at any time
 4 | at most n Python programs are running
 5 | 
 6 | ===========================
 7 | input:
 8 | 
 9 | - file containing Python programs to run
10 | - number of programs allowed to run concurrently
11 | - time interval of checking the number of running programs
12 | """
13 | 
14 | import argparse, subprocess, time
15 | 
16 | def main():
17 |     parser = argparse.ArgumentParser()
18 |     parser.add_argument("cmdfile", type=str, help="file containing Python programs to run")
19 |     parser.add_argument("file_finished", type=str, help="file containing the programs finished")
20 |     parser.add_argument("--num", type=int, default=20, help="number of programs allowed to run concurrently")
21 |     parser.add_argument("--interval", type=int, default=10, help="time interval of checking the number of running programs")
22 | 
23 |     args = parser.parse_args()
24 | 
25 |     command_file = args.cmdfile
26 |     num_of_programs_allowed = args.num
27 |     interval = args.interval
28 | 
29 |     with open(command_file, 'r') as cmdf:
30 |         command_list = cmdf.read().strip().split('\n')
31 | 
32 |     command_list = [x for x in command_list if x.strip() != "" and x.strip()[0] != "#"]   # remove empty lines and comment lines
33 | 
34 |     total_num_jobs = len(command_list)
35 |     next_job_index = 0
36 | 
37 |     previous_running_python_jobs = []
38 | 
39 |     while next_job_index < total_num_jobs:
40 |         time.sleep(interval)
41 |         current_running_python_jobs = [x for x in subprocess.check_output(['ps', 'aux']).decode("utf-8").split('\n') if ' python ' in x and not 'python workqueue.py' in x]
42 |         current_running_python_jobs = [' '.join(x.split()[10:]) for x in current_running_python_jobs]   # the 11th column is the command
43 |         # print "current_running_jobs = %s" % str(current_running_python_jobs)
44 | 
45 |         # save finished programs into this file
46 |         with open(args.file_finished, 'a') as file_containing_programs_finished:
47 |             for item in previous_running_python_jobs:
48 |                 if not item in current_running_python_jobs:
49 |                     file_containing_programs_finished.write(item)
50 |                     file_containing_programs_finished.write('\n')
51 | 
52 |         previous_running_python_jobs = current_running_python_jobs
53 | 
54 |         num_of_running_jobs = len(current_running_python_jobs)
55 |         if num_of_running_jobs < num_of_programs_allowed:
56 |             if num_of_programs_allowed - num_of_running_jobs > total_num_jobs - next_job_index:
57 |                 run_programs(command_list, next_job_index, total_num_jobs)
58 |                 next_job_index = total_num_jobs
59 |             else:
60 |                 run_programs(command_list, next_job_index, next_job_index + num_of_programs_allowed - num_of_running_jobs)
61 |                 next_job_index += num_of_programs_allowed - num_of_running_jobs
62 | 
63 |     print("Done all programs in " + args.cmdfile)
64 |     return
65 | 
66 | 
67 | def run_programs(command_list, start_index, end_index, shell=True):
68 |     """
69 |     run programs with index [start_index, end_index - 1]
70 |     """
71 |     for item in range(start_index, end_index):
72 |         command_arg = command_list[item].strip()
73 |         if command_arg != "":
74 |             if command_arg[-1] == "&":
75 |                 command_arg = 
command_arg[:-1] 76 | 77 | print("running command: " + command_arg) 78 | if not shell: command_arg = command_arg.split() 79 | subprocess.Popen(command_arg, shell=shell) 80 | 81 | return 82 | 83 | 84 | if __name__ == '__main__': 85 | main() 86 | 87 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/target/.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | *.pdb 3 | *.png 4 | *.npy 5 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/tests/.gitignore: -------------------------------------------------------------------------------- 1 | .coverage 2 | *.png 3 | *.pdf 4 | *.jpg 5 | *.pkl 6 | dependency/** 7 | *.txt 8 | *.pdb 9 | *.hdf5 10 | *.chk 11 | temp_model.dot 12 | 13 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/tests/ANN_simulation_test.py: -------------------------------------------------------------------------------- 1 | '''This is a test for functionality of ANN_simulation.py 2 | ''' 3 | 4 | import sys, os, math, subprocess, matplotlib 5 | from functools import reduce 6 | matplotlib.use('agg') 7 | 8 | sys.path.append('../src/') # add the source file folder 9 | 10 | from ANN_simulation import * 11 | from numpy.testing import assert_almost_equal, assert_equal 12 | 13 | 14 | class test_Sutils(object): 15 | @staticmethod 16 | def test_mark_and_modify_pdb_for_calculating_RMSD_for_plumed(): 17 | temp_out = 'temp_out.pdb' 18 | Sutils.mark_and_modify_pdb_for_calculating_RMSD_for_plumed('../resources/1l2y.pdb', temp_out, 19 | get_index_list_with_selection_statement('../resources/1l2y.pdb', 'name CA')) 20 | a = Universe(temp_out) 21 | b = a.select_atoms('name CA') 22 | assert np.all(b.tempfactors) and np.all(b.occupancies) 23 | b = a.select_atoms('not name CA') 24 | assert not (np.any(b.tempfactors) or np.any(b.occupancies)) 25 | subprocess.check_output(['rm', temp_out]) 26 | return 27 | 28 | @staticmethod 29 | def test_write_some_frames_into_a_new_file(): 30 | input_pdb = '../tests/dependency/temp_output_0.pdb' 31 | output_pdb = "../tests/dependency/temp_output_0_interval_3.pdb" 32 | output_coor = output_pdb.replace('.pdb', '_coordinates.npy') 33 | actual_output_coor = '../tests/dependency/temp_output_0_coor.npy' 34 | for interval in range(3, 10): 35 | Sutils.write_some_frames_into_a_new_file(input_pdb, 0, 0, interval, output_pdb) 36 | if os.path.exists(output_coor): 37 | subprocess.check_output(['rm', output_coor]) 38 | Alanine_dipeptide.generate_coordinates_from_pdb_files(output_pdb) 39 | assert_almost_equal(np.load(output_coor), np.load(actual_output_coor)[::interval]) 40 | subprocess.check_output(['rm', output_coor, output_pdb]) 41 | return 42 | 43 | @staticmethod 44 | def test_get_boundary_points(): 45 | """generate plotting for tests""" 46 | cov = [[0.1, 0], [0, 0.1]] # diagonal covariance 47 | get_points = lambda mean: np.random.multivariate_normal(mean, cov, 50) 48 | points = reduce(lambda x, y: np.concatenate((x, y)), list(map(get_points, [[0, 1], [0, -1]]))) 49 | boundary_points = Sutils.get_boundary_points(points, preprocessing=True) 50 | x, y = list(zip(*points)) 51 | x1, y1 = list(zip(*boundary_points)) 52 | fig, ax = plt.subplots() 53 | ax.scatter(x, y, c='b') 54 | ax.scatter(x1, y1, c='r') 55 | fig.savefig('test_get_boundary_points_noncircular.png') 56 | 57 | points = 
reduce(lambda x, y: np.concatenate((x, y)), list(map(get_points, [[-.8, -.8]]))) 58 | boundary_points = Sutils.get_boundary_points(points, preprocessing=True, is_circular_boundary=True, 59 | range_of_PCs=[[-1, 1], [-1, 1]]) 60 | x, y = list(zip(*points)) 61 | x1, y1 = list(zip(*boundary_points)) 62 | fig, ax = plt.subplots() 63 | ax.scatter(x, y, c='b') 64 | ax.scatter(x1, y1, c='r') 65 | fig.savefig('test_get_boundary_points_circular.png') 66 | return 67 | 68 | @staticmethod 69 | def test_get_boundary_points_2_diagram(): 70 | """diagram for the find_boundary algorithm""" 71 | dimensionality = 2 72 | fig, axes = plt.subplots(2, 2) 73 | fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=0.3) 74 | fig.set_size_inches(15, 15) 75 | # hist_matrix = np.random.randint(1, 10, size=(size_of_grid, size_of_grid)) 76 | hist_matrix = [ 77 | [0, 0, 0, 0, 0, 0, 0, 0], 78 | [0, 0, 0, 0, 0, 1, 0, 0], 79 | [0, 0, 3, 5, 3, 2, 1, 0], 80 | [0, 0, 2, 9, 6, 2, 0, 0], 81 | [0, 0, 5, 1, 7, 2, 0, 0], 82 | [0, 1, 2, 9, 8, 1, 0, 0], 83 | [0, 0, 0, 1, 4, 0, 0, 0], 84 | [0, 0, 0, 0, 0, 0, 0, 0], 85 | ] 86 | hist_matrix = np.array(hist_matrix) 87 | hist_matrix_processed = [[- np.exp(- y) for y in x] for x in hist_matrix] # preprocessing process 88 | 89 | diff_with_neighbors = hist_matrix_processed - 1.0 / (2 * dimensionality) * sum( 90 | [np.roll(hist_matrix_processed, 1, axis=x) 91 | + np.roll(hist_matrix_processed, -1, axis=x) for x in range(dimensionality)] 92 | ) 93 | temp_fontsize = 25 94 | sns.heatmap(hist_matrix, ax=axes[0][0], annot=True, cbar=False) 95 | sns.heatmap(hist_matrix_processed, ax=axes[0][1], annot=True, cbar=False) 96 | sns.heatmap(diff_with_neighbors, ax=axes[1][0], annot=True, cbar=False) 97 | sns.heatmap(diff_with_neighbors < 0, ax=axes[1][1], annot=False, cbar=False) 98 | axes[0][0].set_title(r'number of data points $n_i$', fontsize=temp_fontsize) 99 | axes[0][1].set_title(r'$p_i = -\exp{(-n_i)}$', fontsize=temp_fontsize) 100 | axes[1][0].text(2, 8.5, r'$v_i = p_i-\frac{1}{| K_i |}\sum_{j \in K_i} p_j$', fontsize=temp_fontsize) 101 | axes[1][1].set_title('locations of selected cells', fontsize=temp_fontsize) 102 | temp_annotation = ['(a)', '(b)', '(c)', '(d)'] 103 | index = 0 104 | for _1 in axes: 105 | for ax in _1: 106 | ax.set_xlabel('$\\xi_1$', fontsize=temp_fontsize) 107 | ax.set_ylabel('$\\xi_2$', fontsize=temp_fontsize) 108 | ax.text(-0.5, 8.4, temp_annotation[index], fontsize=temp_fontsize - 5) 109 | index += 1 110 | # fig.tight_layout() 111 | fig.savefig('diagram_of_finding_boundary.pdf', format='pdf', bbox_inches='tight') 112 | return 113 | 114 | @staticmethod 115 | def test_L_method(): 116 | evaluation_values = [0, 0.1, 0.5, 0.85, 0.9, 0.93] 117 | nums = list(range(len(evaluation_values))) 118 | opt_num, x_data, y_data_left, y_data_right = Sutils.L_method(evaluation_values, nums) 119 | fig, ax = plt.subplots() 120 | ax.plot(x_data, y_data_left) 121 | ax.plot(x_data, y_data_right) 122 | ax.scatter(nums, evaluation_values) 123 | fig.savefig("L_method.png") 124 | assert (opt_num == 4), opt_num 125 | return 126 | 127 | @staticmethod 128 | def test_rotating_coordinates(): 129 | data = np.loadtxt('../tests/dependency/temp_Trp_cage_data/1l2y_coordinates.txt').reshape((38, 60, 3))[0] 130 | actual = Sutils.rotating_coordinates(data, [0,0,0], [0,0,1], np.pi / 2) 131 | expected = np.array([data[:, 1], - data[:,0], data[:,2]]).T 132 | assert_almost_equal(expected, actual) 133 | return 134 | 135 | @staticmethod 136 | def test__get_expression_script_for_plumed(): 
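        # this test regenerates the PLUMED input script for the Trp-cage system and compares
        # it against a stored reference file; scaling_factor=2.0 below is assumed to match
        # the value used when the reference script was generated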
137 | with open('dependency/expected_plumed_Trp_script.txt', 'r') as my_f: 138 | expected = my_f.read().strip() 139 | actual = Trp_cage.get_expression_script_for_plumed(scaling_factor=2.0).strip() 140 | assert (expected == actual), actual 141 | return 142 | 143 | 144 | # class test_Alanine_dipeptide(object): 145 | # @staticmethod 146 | # def test_get_many_cossin_from_coordiantes_in_list_of_files(): 147 | # list_of_files = ['../tests/dependency/biased_output_fc_1000_x1_0.7_x2_-1.07_coordinates.txt'] 148 | # actual = Alanine_dipeptide().get_many_cossin_from_coordinates_in_list_of_files(list_of_files) 149 | # assert_equal(100, len(actual)) 150 | # assert_equal(8, len(actual[0])) 151 | # expected = np.loadtxt('../tests/dependency/output_cossin.txt') 152 | # assert_almost_equal(expected, actual) 153 | # return 154 | # 155 | # @staticmethod 156 | # def test_get_many_dihedrals_from_cossin(): 157 | # angle = [.4, -.7, math.pi, -.45] 158 | # cossin = [[1, 0, -1, 0, 1, 0, -1, 0], [0, 1, 0, -1, 0, 1, 0, -1], 159 | # reduce(lambda x, y: x + y, [[cos(x), sin(x)] for x in angle]) 160 | # ] 161 | # actual = Alanine_dipeptide().get_many_dihedrals_from_cossin(cossin) 162 | # expected = [[0, 0, 0, 0], [math.pi / 2, -math.pi / 2, math.pi / 2, -math.pi / 2], angle] 163 | # for item in range(len(actual)): 164 | # for index in range(4): 165 | # assert_almost_equal(actual[item][index], expected[item][index], 4) 166 | # return 167 | # 168 | # @staticmethod 169 | # def test_get_many_dihedrals_from_coordinates_in_file(): 170 | # list_of_files = ['../tests/dependency/biased_output_fc_1000_x1_0.7_x2_-1.07_coordinates.txt'] 171 | # actual = Alanine_dipeptide.get_many_dihedrals_from_coordinates_in_file(list_of_files) 172 | # expected = np.loadtxt('../tests/dependency/output_dihedrals.txt') 173 | # assert_almost_equal(actual, expected) 174 | # return 175 | 176 | # @staticmethod 177 | # def test_generate_coordinates_from_pdb_files(): 178 | # pdb_file_name = '../tests/dependency/temp_output_0.pdb' 179 | # actual_output_file = pdb_file_name.replace('.pdb', '_coordinates.txt') 180 | # expected_output_files = '../tests/dependency/temp_output_0_coor.txt' 181 | # for interval in range(1, 10): 182 | # if interval != 1: 183 | # actual_output_file = pdb_file_name.replace('.pdb', '_int_%d_coordinates.txt' % interval) 184 | # if os.path.exists(actual_output_file): 185 | # subprocess.check_output(['rm', actual_output_file]) 186 | # Alanine_dipeptide.generate_coordinates_from_pdb_files(pdb_file_name, step_interval=interval) 187 | # assert_equal(np.loadtxt(actual_output_file), np.loadtxt(expected_output_files)[::interval]) 188 | # subprocess.check_output(['rm', actual_output_file]) 189 | # return 190 | 191 | 192 | class test_Trp_cage(object): 193 | @staticmethod 194 | def test_get_non_repeated_pairwise_distance_as_list_of_alpha_carbon(): 195 | pdb_file_list = ['../tests/dependency/temp_Trp_cage_data/1l2y.pdb'] 196 | a = Trp_cage.get_pairwise_distance_matrices_of_selected_atoms(pdb_file_list) 197 | a = [item.reshape(400, 1) for item in a] 198 | b = Trp_cage.get_non_repeated_pairwise_distance(pdb_file_list) 199 | assert (len(a) == len(b)) 200 | for _1 in range(len(b)): 201 | for _2 in b[_1]: 202 | assert (_2 in a[_1]) 203 | return 204 | 205 | @staticmethod 206 | def test_get_pairwise_distance_matrices_of_alpha_carbon(): 207 | actual = Trp_cage.get_pairwise_distance_matrices_of_selected_atoms(['../tests/dependency/temp_Trp_cage_data/1l2y.pdb'])[0] 208 | expected = 
np.loadtxt("../tests/dependency/test_get_pairwise_distance_matrices_of_alpha_carbon.txt") 209 | assert_almost_equal(actual, expected) 210 | return 211 | 212 | @staticmethod 213 | def test_rotating_dihedral_angles_and_save_to_pdb(): 214 | pdb_file = '../tests/dependency/temp_Trp_cage_data/1l2y.pdb' 215 | output = 'temp_rotating_out.pdb' 216 | target_dihedrals_list = [np.ones((38, 38)), np.zeros((38, 38))] 217 | for target_dihedrals in target_dihedrals_list: 218 | Trp_cage.rotating_dihedral_angles_and_save_to_pdb(pdb_file, target_dihedrals, output) 219 | out_coor_file_list = Trp_cage.generate_coordinates_from_pdb_files(output) 220 | actual_dihedrals = Trp_cage.get_many_dihedrals_from_coordinates_in_file(out_coor_file_list) 221 | print(out_coor_file_list) 222 | # print np.max(np.abs(actual_dihedrals - target_dihedrals)) 223 | assert_almost_equal(actual_dihedrals, target_dihedrals, decimal=2) 224 | 225 | return 226 | 227 | 228 | class test_coordinates_data_files_list(object): 229 | @staticmethod 230 | def test__init__(): 231 | folder = '../tests/dependency/temp_data' 232 | num_of_coor_files = len(subprocess.check_output(['find', folder, '-name', "*_coordinates.npy"]).strip().split()) 233 | a = coordinates_data_files_list([folder]) 234 | assert len(a.get_list_of_coor_data_files()) == num_of_coor_files 235 | assert a._list_num_frames == [100 for _ in range(num_of_coor_files)] 236 | assert sorted(a.get_list_of_coor_data_files()) == a.get_list_of_coor_data_files() 237 | assert len(a.get_list_of_corresponding_pdb_dcd()) == num_of_coor_files 238 | assert sorted(a.get_list_of_corresponding_pdb_dcd()) == a.get_list_of_corresponding_pdb_dcd() 239 | 240 | @staticmethod 241 | def test_create_sub_coor_data_files_list_using_filter_conditional(): 242 | folder = '../tests/dependency/temp_data' 243 | a = coordinates_data_files_list([folder]) 244 | a_sub = a.create_sub_coor_data_files_list_using_filter_conditional(lambda x: '0.7' in x) 245 | for item in a_sub.get_list_of_coor_data_files(): 246 | assert ('0.7' in item) 247 | return 248 | 249 | @staticmethod 250 | def test_get_pdb_name_and_corresponding_frame_index_with_global_coor_index(): 251 | _1 = coordinates_data_files_list(['../tests/dependency/temp_data/']) 252 | pdb_files = _1.get_list_of_corresponding_pdb_dcd() 253 | for item in range(1, 602, 100): 254 | assert (_1.get_pdb_name_and_corresponding_frame_index_with_global_coor_index(item) == (pdb_files[item // 100], 1)) 255 | return 256 | 257 | 258 | class test_autoencoder_Keras(object): 259 | def __init__(self): 260 | my_file_list = coordinates_data_files_list(['../tests/dependency/noncircular_alanine_exploration_data/']) 261 | self._data = np.array(Alanine_dipeptide.get_many_cossin_from_coordinates_in_list_of_files( 262 | my_file_list.get_list_of_coor_data_files())) 263 | self._dihedrals = Alanine_dipeptide.get_many_dihedrals_from_cossin(self._data) 264 | 265 | def test_train(self): 266 | data, dihedrals = self._data, self._dihedrals 267 | hidden_layers_list = [["Tanh", "Tanh", "Tanh", "Tanh", "Tanh", "Tanh", "Tanh"], 268 | ["Sigmoid", "Sigmoid", "Sigmoid", "Sigmoid", "Tanh", "Sigmoid", "Tanh"]] 269 | model_type_list = [autoencoder_Keras, autoencoder_torch] 270 | reg_list = [0.001, 0] 271 | for item_activation in range(2): 272 | for is_hi, hier_var in [(0, 0), (1,1), (1,2)]: # do not test variant 0 for now 273 | for type_index, model_type in enumerate(model_type_list): 274 | model = model_type(1447, data, 275 | data_files=['/tmp/train_in.npy', '/tmp/train_out.npy'], 276 | node_num=[8, 8, 15, 8, 2, 15, 8, 
8, 8], 277 | hidden_layers_types=hidden_layers_list[item_activation], 278 | network_parameters = [0.02, 0.9,0, True, [reg_list[item_activation]]* 8], 279 | batch_size=100, hierarchical=is_hi, hi_variant=hier_var, 280 | epochs=50 281 | ) 282 | model.train() 283 | PCs = model.get_PCs() 284 | [x, y] = list(zip(*PCs)) 285 | psi = [item[2] for item in dihedrals] 286 | fig, ax = plt.subplots() 287 | ax.scatter(x, y, c=psi, cmap='gist_rainbow') 288 | ax.set_title("FVE = %f" % model.get_fraction_of_variance_explained()) 289 | file_name = 'try_model_type_%d_hierarchical_%d_%d_act_%d.pkl' % ( 290 | type_index, is_hi, hier_var, item_activation) 291 | model.save_into_file(file_name) 292 | fig.savefig(file_name.replace('.pkl', '.png')) 293 | return 294 | 295 | def test_train_with_different_mse_weights(self): 296 | data, dihedrals = self._data, self._dihedrals 297 | for _1, weights in enumerate([None, np.array([1,1,0,0,0,0,1,1]), 298 | np.array([0,0,1,1,1,1,0,0]), np.array([1,1,1,1,0,0,0,0])]): 299 | model = autoencoder_Keras(1447, data, 300 | node_num=[8, 8, 15, 8, 2, 15, 8, 8, 8], 301 | hidden_layers_types=["Tanh", "Tanh", "Tanh", "Tanh", "Tanh", "Tanh", "Tanh"], 302 | network_parameters=[0.02, 0.9, 0, True, [0.001] * 8], 303 | batch_size=100, hierarchical=0, 304 | mse_weights=weights 305 | ) 306 | _, history = model.train() 307 | PCs = model.get_PCs() 308 | [x, y] = list(zip(*PCs)) 309 | psi = [item[2] for item in dihedrals] 310 | fig, ax = plt.subplots() 311 | ax.scatter(x, y, c=psi, cmap='gist_rainbow') 312 | model.save_into_file('try_diff_weights_%02d.pkl' % _1) 313 | fig.savefig('try_diff_weights_%02d.png' % _1) 314 | return 315 | 316 | def test_train_2(self): 317 | data, dihedrals = self._data, self._dihedrals 318 | model = autoencoder_Keras(1447, data, 319 | node_num=[8, 15, 4, 15, 8], 320 | hidden_layers_types=["Tanh", "Circular", "Tanh"], 321 | network_parameters = [0.1, 0.4,0, True, [0.001]* 4], 322 | hierarchical=False 323 | ) 324 | model.train() 325 | PCs = model.get_PCs() 326 | [x, y] = list(zip(*PCs)) 327 | psi = [item[2] for item in dihedrals] 328 | fig, ax = plt.subplots() 329 | ax.scatter(x, y, c=psi, cmap='gist_rainbow') 330 | 331 | fig.savefig('try_keras_circular.png') 332 | return 333 | 334 | def test_save_into_file_and_load(self): 335 | data = self._data 336 | model = autoencoder_Keras(1447, data, 337 | node_num=[8, 15, 2, 15, 8], 338 | hidden_layers_types=["Tanh", "Tanh", "Tanh"], 339 | network_parameters=[0.02, 0.9,0, True, [0.001]* 4], 340 | batch_size=50, 341 | data_files=['test_save_into_file.npy', 'test_save_into_file.npy'] 342 | ) 343 | model.train() 344 | model.save_into_file('test_save_into_file.pkl') 345 | model.save_into_file('test_save_into_file_fraction.pkl', fraction_of_data_to_be_saved=0.5) 346 | model.save_into_file('temp_save/complicated/path/temp.pkl') 347 | _ = autoencoder.load_from_pkl_file('test_save_into_file.pkl') 348 | return 349 | 350 | @staticmethod 351 | def check_two_plumed_strings_containing_floats(string_1, string_2): 352 | """due to precision issue, string literals may not be exactly the same for two plumed strings, so we 353 | need to explicitly compare the float values""" 354 | def is_float(s): 355 | try: 356 | float(s) 357 | return True 358 | except ValueError: 359 | return False 360 | split_1 = re.split(' |\n|=|,', string_1) 361 | split_2 = re.split(' |\n|=|,', string_2) 362 | assert (len(split_1) == len(split_2)), (len(split_1), len(split_2)) 363 | for _1, _2 in zip(split_1, split_2): 364 | if is_float(_1): 365 | assert_almost_equal(float(_1), 
float(_2), decimal=4) 366 | else: 367 | assert (_1 == _2), (_1, _2) 368 | return 369 | 370 | def test_get_plumed_script_for_biased_simulation_with_solute_pairwise_dis_and_solvent_cg_input_and_ANN(self): 371 | scaling_factor_v = 26.9704478916 372 | scaling_factor_u = 29.1703348377 373 | r_high = 5.5 374 | atom_indices = list(range(1, 25)) 375 | water_index_string = '75-11421:3' 376 | 377 | ae = autoencoder.load_from_pkl_file('../tests/dependency/solute_plus_solvent_AE/temp_alpha_0.5.pkl') 378 | with open('../tests/dependency/solute_plus_solvent_AE/temp_plumed.txt', 'r') as my_f: 379 | expected_plumed = my_f.read().strip() 380 | plumed_string = ae.get_plumed_script_for_biased_simulation_with_solute_pairwise_dis_and_solvent_cg_input_and_ANN( 381 | list(range(1, 25)), scaling_factor_u, water_index_string, atom_indices, -5, r_high, scaling_factor_v) 382 | self.check_two_plumed_strings_containing_floats(plumed_string, expected_plumed) 383 | 384 | AE = autoencoder.load_from_pkl_file('../tests/dependency/solvent_AE/solvent_test.pkl') 385 | with open('../tests/dependency/solvent_AE/temp_plumed.txt', 'r') as my_f: 386 | expected_plumed = my_f.read().strip() 387 | plumed_string = AE.get_plumed_script_for_biased_simulation_with_INDUS_cg_input_and_ANN( 388 | water_index_string, atom_indices, -5, r_high, scaling_factor_v).strip() 389 | self.check_two_plumed_strings_containing_floats(plumed_string, expected_plumed) 390 | return 391 | 392 | class test_autoencoder_torch(object): 393 | @staticmethod 394 | def test_general_train_save_and_load(): 395 | data = np.random.rand(1000, 21) 396 | a = autoencoder_torch(1447, data, 397 | output_data_set=data, 398 | hierarchical=True, hi_variant=2, 399 | batch_size=500, 400 | node_num=[21, 100, 2, 100, 21], 401 | hidden_layers_types=['tanh', 'tanh', 'tanh'], epochs=10) 402 | a.train(lag_time=10) 403 | a.save_into_file('/tmp/temp_save.pkl') 404 | torch.save(a._ae, '/tmp/temp.df') 405 | model_1 = torch.load('/tmp/temp.df') 406 | torch.save(a._ae.state_dict(), '/tmp/temp_2.df') 407 | model_2 = AE_net([21, 100, 2], [2, 100, 21], activations=a._hidden_layers_type + ['linear'], 408 | hi_variant=2, hierarchical=True).cuda() 409 | model_2.load_state_dict(torch.load('/tmp/temp_2.df')) 410 | data_in = torch.rand(1000, 21).cuda() 411 | assert_almost_equal(model_1(data_in)[0].cpu().data.numpy(), a._ae(data_in)[0].cpu().data.numpy()) 412 | assert_almost_equal(model_2(data_in)[0].cpu().data.numpy(), a._ae(data_in)[0].cpu().data.numpy()) 413 | _ = autoencoder_torch.load_from_pkl_file('/tmp/temp_save.pkl') 414 | return 415 | 416 | @staticmethod 417 | def test_time_lagged_AE_stored_data_saving(): 418 | data = np.random.rand(1000, 21) 419 | a = autoencoder_torch(1447, data, 420 | output_data_set=data, 421 | hierarchical=True, hi_variant=2, 422 | rec_loss_type=1, 423 | batch_size=500, 424 | node_num=[21, 100, 2, 100, 21], 425 | hidden_layers_types=['tanh', 'tanh', 'tanh'], epochs=10) 426 | a.train(lag_time=10) 427 | assert_almost_equal(a._data_set, data[:-10]) 428 | assert_almost_equal(a._output_data_set, data[10:]) 429 | return 430 | 431 | @staticmethod 432 | def test_save_and_load_data(): 433 | data = np.random.rand(1000, 21) 434 | a = autoencoder_torch(1447, data, 435 | output_data_set=data, 436 | hierarchical=True, 437 | batch_size=500, 438 | node_num=[21, 100, 2, 100, 21], epochs=10, 439 | data_files=['data.npy', 'data.npy']) 440 | a.train(lag_time=0) 441 | a.save_into_file('temp_save_pytorch/temp_save.pkl') 442 | assert (os.path.isfile('temp_save_pytorch/data.npy')) 443 | b = 
autoencoder_torch.load_from_pkl_file('temp_save_pytorch/temp_save.pkl') 444 | assert_almost_equal(a._data_set, b._data_set) 445 | assert_almost_equal(a._output_data_set, b._output_data_set) 446 | return 447 | 448 | 449 | class test_biased_simulation(object): 450 | @staticmethod 451 | def helper_biased_simulation_alanine_dipeptide(potential_center): 452 | autoencoder_coeff_file = 'autoencoder_info_9.npy' 453 | autoencoder_pkl_file = '../tests/dependency/test_biased_simulation/network_5.pkl' 454 | my_network = autoencoder.load_from_pkl_file(autoencoder_pkl_file) 455 | assert (isinstance(my_network, autoencoder)) 456 | my_network.write_coefficients_of_connections_into_file(autoencoder_coeff_file) 457 | output_folder = 'temp_output_test_biased_simulation' 458 | 459 | if os.path.exists(output_folder): 460 | subprocess.check_output(['rm', '-rf', output_folder]) 461 | 462 | subprocess.check_output( 463 | 'python ../src/biased_simulation.py 50 5000 5000 %s %s pc_%s --num_of_nodes %s --layer_types %s --platform CPU --data_type_in_input_layer 1' 464 | % (output_folder, autoencoder_coeff_file, potential_center, "21,40,2", "Tanh,Tanh"), 465 | shell=True) 466 | 467 | Alanine_dipeptide.generate_coordinates_from_pdb_files(output_folder) 468 | fig, ax = plt.subplots() 469 | input_data = coordinates_data_files_list([output_folder]).get_coor_data(0.5) 470 | input_data = Sutils.remove_translation(input_data) 471 | PCs = my_network.get_PCs(input_data) 472 | x, y = list(zip(*PCs)) 473 | ax.scatter(x, y, c=list(range(len(x))), cmap='gist_rainbow', s=5) 474 | potential_center_num = [float(item_1) for item_1 in potential_center.split(',')] 475 | ax.scatter([potential_center_num[0]], [potential_center_num[1]], marker='X', s=30) 476 | fig.savefig('test_biased_simulation_%s.png' % potential_center) 477 | subprocess.check_output(['rm', '-rf', output_folder]) 478 | return 479 | 480 | @staticmethod 481 | def test_biased_simulation_alanine_dipeptide(): 482 | for item in ['-0.3,-0.7', '-0.3,-0.5', '-0.2,-0.4', '0,-0.4', '-0.1,-0.5']: 483 | test_biased_simulation.helper_biased_simulation_alanine_dipeptide(item.replace(' ','')) 484 | return 485 | 486 | @staticmethod 487 | def test_biased_simulation_alanine_dipeptide_with_metadynamics(use_well_tempered=0, biasfactor=-1): 488 | autoencoder_pkl_file = '../tests/dependency/test_biased_simulation/network_5.pkl' 489 | output_folder = 'temp_output_test_biased_simulation' 490 | a = autoencoder.load_from_pkl_file(autoencoder_pkl_file) 491 | a.write_expression_script_for_plumed('temp_info.txt', mode='ANN') 492 | subprocess.check_output( 493 | 'python ../src/biased_simulation.py 50 50000 0 %s temp_info.txt pc_0,0 --MTD_pace 100 --platform CPU --bias_method MTD --MTD_biasfactor %f --MTD_WT %d --equilibration_steps 0' 494 | % (output_folder, biasfactor, use_well_tempered), shell=True) 495 | subprocess.check_output(['python', '../src/generate_coordinates.py', 'Alanine_dipeptide', '--path', output_folder]) 496 | fig, axes = plt.subplots(1, 3) 497 | data = np.load( 498 | output_folder + '/output_fc_0.000000_pc_[0.0,0.0]_coordinates.npy') 499 | data /= 0.5 500 | data = Sutils.remove_translation(data) 501 | PCs = a.get_PCs(data) 502 | ax = axes[0] 503 | ax.set_xlabel('CV1') 504 | ax.set_ylabel('CV2') 505 | ax.set_title('CV data generated by autoencoder') 506 | im = ax.scatter(PCs[:, 0], PCs[:, 1], c=list(range(PCs.shape[0])), cmap='gist_rainbow', s=4) 507 | fig.colorbar(im, ax=ax) 508 | 509 | out_data = np.loadtxt('temp_MTD_out.txt') 510 | 511 | ax = axes[1] 512 | im = 
ax.scatter(out_data[:, 1], out_data[:, 2], c=list(range(out_data.shape[0])), cmap='gist_rainbow', s=4) 513 | ax.set_xlabel('CV1') 514 | ax.set_ylabel('CV2') 515 | ax.set_title('CV data generated by PLUMED') 516 | fig.colorbar(im, ax=ax) 517 | 518 | ax = axes[2] 519 | dihedrals = Alanine_dipeptide.get_many_dihedrals_from_cossin( 520 | Alanine_dipeptide.get_many_cossin_from_coordinates(data)) 521 | dihedrals = np.array(dihedrals) 522 | im = ax.scatter(dihedrals[:, 1], dihedrals[:, 2], c=list(range(len(dihedrals))), cmap="gist_rainbow", s=4) 523 | ax.set_xlabel('$\phi$') 524 | ax.set_ylabel('$\psi$') 525 | ax.set_title('data in phi-psi space') 526 | fig.colorbar(im, ax=ax) 527 | fig.set_size_inches((15, 5)) 528 | fig.savefig('metadynamics_biasfactor_%f.png' % biasfactor) 529 | subprocess.check_output(['rm', '-rf', output_folder]) 530 | return 531 | 532 | @staticmethod 533 | def test_biased_simulation_alanine_dipeptide_with_metadynamics_multiple(): 534 | test_biased_simulation.test_biased_simulation_alanine_dipeptide_with_metadynamics(0, -1) 535 | for item in [5, 20, 100]: 536 | test_biased_simulation.test_biased_simulation_alanine_dipeptide_with_metadynamics(1, item) 537 | return 538 | 539 | 540 | class test_Helper_func(object): 541 | @staticmethod 542 | def test_compute_distances_min_image_convention(): 543 | output_pdb = 'out_for_computing_distances.pdb' 544 | subprocess.check_output(['python', '../src/biased_simulation_general.py', 'Trp_cage', '50', '1000', '0', 'temp_out_12345', 545 | 'none', 'pc_0,0', 'explicit', 'NPT', '--platform', 'CUDA', '--device', '0', '--out_traj', output_pdb]) 546 | import mdtraj as md 547 | box_length = 4.5 # in nm 548 | temp_t = md.load(output_pdb) 549 | temp_t.unitcell_lengths, temp_t.unitcell_angles = box_length * np.ones((20, 3)), 90 * np.ones((20, 3)) 550 | temp_u = Universe(output_pdb) 551 | a_sel = temp_u.select_atoms('name N') 552 | b_sel = temp_u.select_atoms('name O and resname HOH') 553 | absolute_index = b_sel.atoms.indices[30] 554 | b_positions = np.array([b_sel.positions for _ in temp_u.trajectory]) 555 | b_positions = b_positions.reshape(20, b_positions.shape[1] * b_positions.shape[2]) 556 | a_positions = np.array([a_sel.positions for _ in temp_u.trajectory]) 557 | a_positions = a_positions.reshape(20, a_positions.shape[1] * a_positions.shape[2]) 558 | result = Helper_func.compute_distances_min_image_convention(a_positions, b_positions, 10 * box_length) 559 | assert_almost_equal(md.compute_distances(temp_t, [[0, absolute_index]]).flatten(), result[:, 0, 30] / 10, decimal=4) 560 | subprocess.check_output(['rm', '-rf', output_pdb, 'temp_out_12345']) 561 | return 562 | 563 | @staticmethod 564 | def test_shuffle_multiple_arrays(): 565 | a = np.random.rand(10, 2) 566 | b1, b2 =Helper_func.shuffle_multiple_arrays([a[:, 0], a[:, 1]]) 567 | for item in range(10): 568 | assert( [b1[item], b2[item]] in a) 569 | return 570 | 571 | @staticmethod 572 | def test_attempt_to_save_npy(): 573 | import shutil 574 | def get_num_files_in_folder(temp_folder): return len(os.listdir(temp_folder)) 575 | a = 2 * np.eye(3, 3) 576 | folder = 'temp_test_save_npy' 577 | if os.path.exists(folder): shutil.rmtree(folder) 578 | os.mkdir(folder) 579 | filename = folder + '/1.npy' 580 | Helper_func.attempt_to_save_npy(filename, a) 581 | assert (os.path.isfile(filename)) 582 | assert (get_num_files_in_folder(folder) == 1) 583 | Helper_func.attempt_to_save_npy(filename, a) 584 | assert (get_num_files_in_folder(folder) == 1) 585 | for item in range(10): 586 | 
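            # expected behavior (inferred from the assertions below): re-saving
            # identical content is a no-op, while modified content is written under
            # a fresh name, so the file count grows by one per distinct array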
            Helper_func.attempt_to_save_npy(filename, a+item)
587 |             assert (get_num_files_in_folder(folder) == item + 1)
588 |         shutil.rmtree(folder)
589 |         return
590 | 
591 | 
592 | class test_others(object):
593 |     @staticmethod
594 |     def test_SphSh_INDUS_PLUMED_plugin():
595 |         # test for the original version of the SphSh INDUS PLUMED plugin provided by Prof. Amish Patel
596 |         num_frames = 20
597 |         potential_center = np.random.random(size=3) * 3
598 |         with open('temp_plumed.txt', 'w') as my_f:
599 |             my_f.write('''
600 | SPHSH ATOMS=306-11390:4 XCEN=%f YCEN=%f ZCEN=%f RLOW=-0.5 RHIGH=0.311 SIGMA=0.01 CUTOFF=0.02 LABEL=sph
601 | SPHSH ATOMS=306-11390:4 XCEN=%f YCEN=%f ZCEN=%f RLOW=0.05 RHIGH=0.311 SIGMA=0.01 CUTOFF=0.02 LABEL=sph_2
602 | RESTRAINT ARG=sph.Ntw AT=5 KAPPA=5 SLOPE=0 LABEL=mypotential
603 | PRINT STRIDE=50 ARG=sph.N,sph.Ntw,sph_2.N,sph_2.Ntw FILE=NDATA''' \
604 |                        % (potential_center[0], potential_center[1], potential_center[2],
605 |                           potential_center[0], potential_center[1], potential_center[2]))  # a "TER" record separates solute and solvent in the pdb file, so the water atom index starts at 306, not 307
606 |         out_pdb = 'temp_plumed/output_fc_0.0_pc_[0.0,0.0]_T_300_explicit_NPT.pdb'
607 |         subprocess.check_output(['python', '../src/biased_simulation_general.py', 'Trp_cage', '50', '1000', '0',
608 |                                  'temp_plumed', 'none', 'pc_0,0', 'explicit', 'NPT', '--platform', 'CUDA',
609 |                                  '--bias_method', 'plumed_other', '--plumed_file', 'temp_plumed.txt',
610 |                                  '--out_traj', out_pdb])
611 |         temp_u = Universe(out_pdb)
612 |         reporter_file = out_pdb.replace('output', 'report').replace('.pdb', '.txt')
613 |         box_length_list = Helper_func.get_box_length_list_fom_reporter_file(reporter_file, unit='A')
614 |         O_sel = temp_u.select_atoms('name O and resname HOH')
615 |         O_coords = np.array([O_sel.positions for _ in temp_u.trajectory]).reshape(num_frames, 2772 * 3)
616 |         distances = Helper_func.compute_distances_min_image_convention(
617 |             10 * np.array([potential_center for _ in range(num_frames)]), O_coords, box_length_list)
618 |         coarse_count, actual_count = Helper_func.get_cg_count_in_sphere(distances, 3.11, 0.2, .1)  # coarse-grained (Ntw) and actual (N) water counts within the sphere
619 |         plumed_count = np.loadtxt('NDATA')
620 |         assert_almost_equal(plumed_count[-num_frames:, 1], actual_count.flatten())
621 |         assert_almost_equal(plumed_count[-num_frames:, 2], coarse_count.flatten(), decimal=2)
622 |         coarse_count_1, actual_count_1 = Helper_func.get_cg_count_in_shell(distances, 0.5, 3.11, 0.2, .1)
623 |         assert_almost_equal(plumed_count[-num_frames:, 3], actual_count_1.flatten())
624 |         assert_almost_equal(plumed_count[-num_frames:, 4], coarse_count_1.flatten(), decimal=2)
625 |         subprocess.check_output(['rm', '-rf', 'temp_plumed', 'NDATA', 'temp_plumed.txt'])
626 |         return
627 | 
628 |     @staticmethod
629 |     def test_SphShMod_INDUS_PLUMED_plugin():
630 |         num_frames = 20
631 |         with open('temp_plumed.txt', 'w') as my_f:
632 |             my_f.write('''
633 | SPHSHMOD ATOMS=306-11390:4 ATOMREF=1 RLOW=-0.5 RHIGH=0.311 SIGMA=0.01 CUTOFF=0.02 LABEL=sph
634 | RESTRAINT ARG=sph.Ntw AT=10 KAPPA=5 SLOPE=0 LABEL=mypotential
635 | PRINT STRIDE=50 ARG=sph.N,sph.Ntw FILE=NDATA''')
636 |         out_pdb = 'temp_plumed/output_fc_0.0_pc_[0.0,0.0]_T_300_explicit_NPT.pdb'
637 |         subprocess.check_output(['python', '../src/biased_simulation_general.py', 'Trp_cage', '50', '1000', '0',
638 |                                  'temp_plumed', 'none', 'pc_0,0', 'explicit', 'NPT', '--platform', 'CUDA',
639 |                                  '--bias_method', 'plumed_other', '--plumed_file', 'temp_plumed.txt',
640 |                                  '--out_traj', out_pdb])
641 |         temp_u = Universe(out_pdb)
642 |         reporter_file = out_pdb.replace('output', 'report').replace('.pdb', '.txt')
643 |         box_length_list = Helper_func.get_box_length_list_fom_reporter_file(reporter_file, unit='A')
644 |         print(box_length_list)
645 |         O_sel = temp_u.select_atoms('name O and resname HOH')
646 |         N_sel = temp_u.select_atoms('resnum 1 and name N')
647 |         O_coords = np.array([O_sel.positions for _ in temp_u.trajectory]).reshape(num_frames, 2772 * 3)
648 |         N_coords = np.array([N_sel.positions for _ in temp_u.trajectory]).reshape(num_frames, 3)
649 |         distances = Helper_func.compute_distances_min_image_convention(N_coords, O_coords, box_length_list)
650 |         coarse_count, actual_count = Helper_func.get_cg_count_in_sphere(distances, 3.11, 0.2, .1)
651 |         plumed_count = np.loadtxt('NDATA')
652 |         assert_almost_equal(plumed_count[-num_frames:, 1], actual_count.flatten())
653 |         assert_almost_equal(plumed_count[-num_frames:, 2], coarse_count.flatten(), decimal=2)
654 |         subprocess.check_output(['rm', '-rf', 'temp_plumed', 'NDATA', 'temp_plumed.txt'])
655 |         return
656 | 
--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/tests/Makefile:
--------------------------------------------------------------------------------
1 | SHELL=/bin/bash
2 | 
3 | test:
4 | 	nosetests --with-coverage --cover-package=../ ANN_simulation_test.py  # nosetests with coverage
5 | 
6 | clean:
7 | 	rm -rf *.pkl *.pyc *.png *.txt *.pdb *.pdf *.hdf5 *.chk bck.* NDATA temp_save *.npy temp_model.dot temp_save_pytorch .coverage *.pth
8 | 
9 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![plumID:19.065](https://www.plumed-nest.org/eggs/19/065/badge.svg)](https://www.plumed-nest.org/eggs/19/065/)
2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
3 | 
4 | # Accelerated sampling with data-augmented autoencoders
5 | 
6 | This is the framework for running accelerated sampling with data-augmented autoencoders.
7 | 
8 | ## Dependency
9 | 
10 | OpenMM simulation package: https://github.com/pandegroup/openmm
11 | 
12 | ANN_Force biasing force package: https://github.com/weiHelloWorld/ANN_Force
13 | 
14 | Keras: https://github.com/fchollet/keras
15 | 
16 | PyTorch: https://pytorch.org
17 | 
18 | MDAnalysis: https://github.com/MDAnalysis/mdanalysis
19 | 
20 | Nose testing framework: https://github.com/nose-devs/nose
21 | 
22 | PLUMED (ANN included): https://github.com/plumed/plumed2 + https://github.com/weiHelloWorld/plumed_additional
23 | 
24 | cluster management: https://github.com/weiHelloWorld/cluster_management
25 | 
26 | plumed helper: https://github.com/weiHelloWorld/plumed_helper
27 | 
28 | OpenMM-PLUMED force plugin: https://github.com/peastman/openmm-plumed
29 | 
30 | Bayes WHAM free energy calculation package: https://bitbucket.org/andrewlferguson/bayeswham_python
31 | 
32 | Some other Python scientific computing packages (e.g. seaborn, pandas, sklearn) are also needed; it is recommended to install them with Anaconda: https://www.continuum.io/downloads
33 | 
34 | 
35 | ## Installation and preparation
36 | 
37 | No installation is required. You may simply install all dependent packages and check out this repository.
38 | 
39 | It is **highly recommended** to run the tests before running any code, to make sure all packages are correctly installed.
40 | 
41 | ## Testing
42 | 
43 | This package uses the `nose` testing framework. To run the tests, run
44 | 
45 | ```bash
46 | root_dir=MD_simulation_on_alanine_dipeptide/current_work
47 | cd ${root_dir}/tests
48 | make test
49 | ```
50 | 
51 | Tests include numerical unit tests (for cases with clear expected results) and figure plots (for others, such as neural network training).
52 | 
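A single test class can also be selected on its own (a convenience sketch using nose's `file:Class` selector, run from `${root_dir}/tests`; `test_autoencoder_torch` is one of the classes defined in `ANN_simulation_test.py`):

```bash
# run only the PyTorch autoencoder tests
nosetests ANN_simulation_test.py:test_autoencoder_torch
```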
53 | ## 1-minute quick start
54 | 
55 | Go ahead and modify the configuration file `${root_dir}/src/config.py`, then run
56 | 
57 | ```bash
58 | python main_work.py
59 | ```
60 | 
61 | For more options, type
62 | 
63 | ```bash
64 | python main_work.py --help
65 | ```
66 | 
67 | ## Quick introduction to autoencoders
68 | 
69 | A typical autoencoder consists of an encoder ANN and a decoder ANN: the encoder maps inputs to a small number of collective variables (CVs) in the encoding layer, and the decoder tries to reconstruct the inputs (or some variant of the inputs) from the CVs:
70 | 
71 | ![](figures/diagram_autoencoder.png)
72 | 
73 | A typical 5-layer structure is given below:
74 | 
75 | ![](figures/autoencoder_2.png)
76 | 
77 | For traditional autoencoders, we minimize
78 | 
79 | $$E=|A(x)-x|^2 + R$$
80 | 
81 | where $A$ is the autoencoder mapping function and $R$ is a regularization term.
82 | 
83 | To remove external degrees of freedom, we use data-augmented autoencoders, which minimize
84 | 
85 | $$E=|A(x)-L(x)|^2 + R$$
86 | 
87 | where $L$ is the alignment function responsible for data augmentation. It can be written in "cat form" (cat = molecule configuration, little human = alignment function $L$):
88 | 
89 | ![](figures/autoencoder_1.png)
90 | 
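To make this concrete, below is a minimal Keras sketch of the data-augmented objective $E=|A(x)-L(x)|^2 + R$. It is illustrative only, not the implementation in `${root_dir}/src/autoencoders.py`; the 8-15-2-15-8 layout mirrors the alanine dipeptide networks used in the tests, and the two `.npy` file names are placeholder assumptions. The only difference from a traditional autoencoder is that the fitting targets are the aligned configurations $L(x)$ rather than the raw inputs $x$:

```python
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.regularizers import l2

x = np.load('input_coordinates.npy')            # raw configurations x (placeholder file name)
aligned_x = np.load('aligned_coordinates.npy')  # alignment output L(x), same shape as x (placeholder)

# encoder (8 -> 15 -> 2 CVs) followed by decoder (2 -> 15 -> 8); l2 provides the R term
model = Sequential([
    Dense(15, activation='tanh', input_dim=8, kernel_regularizer=l2(0.001)),
    Dense(2, activation='tanh', kernel_regularizer=l2(0.001)),   # encoding layer: the CVs
    Dense(15, activation='tanh', kernel_regularizer=l2(0.001)),
    Dense(8, activation='linear', kernel_regularizer=l2(0.001)),
])
model.compile(optimizer='adam', loss='mse')   # |A(x) - L(x)|^2
model.fit(x, aligned_x, epochs=50, batch_size=100)

# encoder half of the trained network (weights are shared): CV values for each frame
cvs = Sequential(model.layers[:2]).predict(x)
```

In this repository the same idea is wrapped by the `autoencoder_Keras` and `autoencoder_torch` classes (see the `node_num` and `hidden_layers_types` arguments in `${root_dir}/tests/ANN_simulation_test.py`), which also handle hierarchical variants and model saving/loading.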
91 | To reduce the dependence on any specific reference, we apply multiple references to data-augmented autoencoders; the corresponding error function is
92 | 
93 | $$E=\sum_j |A_j(x)-L_j(x)|^2 + R$$
94 | 
95 | where the $A_j$ are autoencoders that share all but the last layer, and $L_j$ is the alignment function with respect to reference $j$.
96 | 
97 | If we want to see the relative importance of these CVs, we construct multiple outputs, with each output taking contributions from a subset of the CVs in the encoding layer. Two possible types of network topology are given below:
98 | 
99 | ![](figures/hierarchical_autoencoder.png)
100 | 
101 | The corresponding error function is then
102 | 
103 | $$E=E_{1}+E_{1,2}+E_{1,2,3}+\dots$$
104 | 
105 | where $E_{1}$ is the reconstruction error when only the 1st CV is used to compute the output, $E_{1,2}$ is the reconstruction error when only the first two CVs are used, and so on.
106 | 
107 | See slides for more information: (TODO)
108 | 
109 | 
110 | ## Directory structure
111 | 
112 | Directories are arranged as follows:
113 | 
114 | ```
115 | ${root_dir}/src: source code
116 | ${root_dir}/target: output of simulation data (pdb files and coordinate files)
117 | ${root_dir}/resources: training results (autoencoders) and reference configuration files (pdb files)
118 | ${root_dir}/tests: test source code
119 | ```
120 | 
121 | 
122 | ## Extensions
123 | 
124 | #### 1. apply to new molecules
125 | 
126 | 1. Create a subclass of `Sutils` for the molecule and implement the corresponding methods in `${root_dir}/src/molecule_spec_sutils.py`.
127 | 
128 | 2. Include molecule-specific information in the configuration file `${root_dir}/src/config.py`, and modify the corresponding configuration settings.
129 | 
130 | 3. Modify the biased simulation file (`${root_dir}/src/biased_simulation_general.py`) for the new molecule.
131 | 
132 | 4. Add molecule-related statements to `${root_dir}/src/ANN_simulation.py` and `${root_dir}/src/autoencoders.py` wherever `Trp_cage` appears.
133 | 
134 | #### 2. use a new neural network architecture or switch to a new training backend
135 | 
136 | 1. Create a subclass of `autoencoder` for the new architecture/backend and implement it. Note that all abstract methods (`@abc.abstractmethod`) must be implemented.
137 | 
138 | 2. Include the new network information in the configuration file `${root_dir}/src/config.py`.
139 | 
140 | #### 3. apply a new potential center selection algorithm
141 | 
142 | Modify the method `Sutils.get_boundary_points()` in `${root_dir}/src/molecule_spec_sutils.py`.
143 | 
144 | #### 4. use a new simulation package
145 | 
146 | Modify `biased_simulation.py` or `biased_simulation_general.py`.
147 | 
148 | ## Citation
149 | 
150 | If you use this code in your work, please cite:
151 | 
152 | - Chen, Wei, and Andrew L. Ferguson. "Molecular enhanced sampling with autoencoders: On‐the‐fly collective variable discovery and accelerated free energy landscape exploration." Journal of Computational Chemistry 39.25 (2018): 2079-2102.
153 | 
154 | - Chen, Wei, Aik Rui Tan, and Andrew L. Ferguson. "Collective variable discovery and enhanced sampling using autoencoders: Innovations in network architecture and error function design." The Journal of Chemical Physics 149.7 (2018): 072312.
155 | 
156 | ## Contact
157 | 
158 | For any questions, feel free to contact weichen9@illinois.edu or open a GitHub issue.
159 | 
--------------------------------------------------------------------------------
/archive/plumed_adp.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiHelloWorld/accelerated_sampling_with_autoencoder/fe2b98bc81fc0b30db42ca8a83e23adb775a487d/archive/plumed_adp.zip
--------------------------------------------------------------------------------
/figures/autoencoder_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiHelloWorld/accelerated_sampling_with_autoencoder/fe2b98bc81fc0b30db42ca8a83e23adb775a487d/figures/autoencoder_1.png
--------------------------------------------------------------------------------
/figures/autoencoder_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiHelloWorld/accelerated_sampling_with_autoencoder/fe2b98bc81fc0b30db42ca8a83e23adb775a487d/figures/autoencoder_2.png
--------------------------------------------------------------------------------
/figures/diagram_autoencoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiHelloWorld/accelerated_sampling_with_autoencoder/fe2b98bc81fc0b30db42ca8a83e23adb775a487d/figures/diagram_autoencoder.png
--------------------------------------------------------------------------------
/figures/hierarchical_autoencoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiHelloWorld/accelerated_sampling_with_autoencoder/fe2b98bc81fc0b30db42ca8a83e23adb775a487d/figures/hierarchical_autoencoder.png
--------------------------------------------------------------------------------