├── Makefile ├── README.md ├── analysis_pick_results.py ├── autoPick.py ├── autoPicker.py ├── dataLoader.py ├── deepModel.py ├── display.py ├── extractData.py ├── starReader.py ├── train.py └── trained_model ├── checkpoint ├── model_demo_type3 └── model_demo_type3.meta /Makefile: -------------------------------------------------------------------------------- 1 | # demo excution to the scripts as the `README.md` suggested. 2 | trainType1: 3 | python train.py --train_type 1 --train_inputDir '/media/bioserver1/Data/paper_test/dataset/trpv1/train/original' --particle_size 180 --mrc_number 100 --particle_number 10000 --coordinate_symbol '_manual_checked' --model_save_dir '../trained_model' --model_save_file 'model_demo_type1' 4 | 5 | extractData: 6 | python extractData.py --inputDir '/media/bioserver1/Data/paper_test/dataset/ss/train/original' --particle_size 320 --mrc_number 100 --coordinate_symbol '_manual_checked' --save_dir '../extracted_data' --save_file 'ss_original.pickle' 7 | python extractData.py --inputDir '/media/bioserver1/Data/paper_test/dataset/gammas/train/original' --particle_size 180 --mrc_number 100 --coordinate_symbol '_manual_checked' --save_dir '../extracted_data' --save_file 'gammas_original.pickle' 8 | python extractData.py --inputDir '/media/bioserver1/Data/paper_test/dataset/trpv1/train/original' --particle_size 180 --mrc_number 100 --coordinate_symbol '_manual_checked' --save_dir '../extracted_data' --save_file 'trpv1_original.pickle' 9 | 10 | trainType2: 11 | python train.py --train_type 2 --train_inputDir '../extracted_data' --train_inputFile 'ss_original.pickle;gammas_original.pickle;trpv1_original.pickle' --particle_number 30000 --model_save_dir './trained_model' --model_save_file 'model_demo_type3' 12 | 13 | trainType3: 14 | python train.py --train_type 3 --train_inputFile '/media/bioserver1/Data/paper_test/dataset/trpv1/train/trpv1_manualpick_less.star' --particle_size 180 --particle_number 10000 --model_save_dir '../trained_model' --model_save_file 'model_demo_type1' 15 | 16 | pick: 17 | python autoPick.py --inputDir '/media/bioserver1/Data/paper_test/dataset/trpv1/test/original' --pre_trained_model '../trained_model/model_demo_type2_ss_gammas' --particle_size 180 --mrc_number 20 --outputDir '../autopick-trpv1-by-demo-ss-gammas' --coordinate_symbol '_cnnPick' --threshold 0.5 18 | 19 | trainType4: 20 | python train.py --train_type 4 --train_inputDir '/media/bioserver1/Data/paper_test/dataset/trpv1/test/original' --train_inputFile '../autopick-trpv1-by-demo-ss-gammas/autopick_results.pickle' --particle_size 180 --particle_number 10000 --model_save_dir '../trained_model' --model_save_file 'model_demo_type4_trpv1_iter1_by_ss_gammas' 21 | 22 | analysis: 23 | python analysis_pick_results.py --inputFile '../autopick-trpv1-by-demo-ss-gammas/autopick_results.pickle' --inputDir '/media/bioserver1/Data/paper_test/dataset/trpv1/test/original' --particle_size 180 --coordinate_symbol '_refine_frealign' --minimum_distance_rate 0.2 24 | 25 | recommended_step1: 26 | python autoPick.py --inputDir 'Your_mrc_file_DIR' --pre_trained_model './trained_model/model_demo_type3' --particle_size Your_particle_size --mrc_number 100 --outputDir '../autopick-results-by-demo-type3' --coordinate_symbol '_cnnPick' --threshold 0.5 27 | python autoPick.py --inputDir '/media/bioserver1/Data/paper_test/dataset/trpv1/test/original' --pre_trained_model '../trained_model/model_demo_type2_ss_gammas' --particle_size 180 --mrc_number 20 --outputDir '../autopick-trpv1-by-demo-ss-gammas' 
--coordinate_symbol '_cnnPick' --threshold 0.5 28 | 29 | recommended_step2: 30 | 31 | testTrain: 32 | python train.py --train_type 1 --train_inputDir '../data/Micrographs' --particle_size 60 --mrc_number 100 --particle_number 10000 --coordinate_symbol '_manual' --model_save_dir '../trained_model' --model_save_file 'model_test' 33 | 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepPicker 2 | 3 | For more details about 'DeepPicker', please refer to the paper [DeepPicker](https://arxiv.org/abs/1605.01838). 4 | This is the python version based on [TensorFlow](https://www.tensorflow.org/). 5 | So far it only supports Ubuntu 12.0+, centOS 7.0+, and RHEL 7.0+. 6 | 7 | ## 1. Install TensorFlow 8 | For more details about [Tensorflow](https://www.tensorflow.org/), please refer to the [website](https://www.tensorflow.org/). Cuda toolkit 7.5 and cuDNN v4 are required to install the GPU version of Tensorflow. There are 5 different ways to install tensorflow, and "Virtualenv install" is recommended for not impacting any existing Python program on your machine. 9 | 10 | ### 1.1 Install Cuda Toolkit 7.5 11 | download and install Cuda Toolkit 7.5 12 | 13 | https://developer.nvidia.com/cuda-downloads 14 | 15 | ### 1.2 Install cudnn v4 16 | Download and install cuDNN v4 17 | 18 | https://developer.nvidia.com/cudnn 19 | 20 | Uncompress and copy the cuDNN files into the toolkit directory. Assuming the toolkit is installed in /usr/local/cuda, run the following commands (edited to reflect the cuDNN version you downloaded): 21 | 22 | tar xvzf cudnn-7.0-linux-x64-v4.0-prod.tgz 23 | sudo cp cudnn-7.0-linux-x64-v4/cudnn.h /usr/local/cuda/include 24 | sudo cp cudnn-7.0-linux-x64-v4/libcudnn* /usr/local/cuda/lib64 25 | sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn 26 | 27 | ### 1.3 Virtualenv installation of Tensorflow 28 | [Virtualenv](https://pypi.python.org/pypi/virtualenv) is a tool to keep the dependencies required by different Python projects in separate places. The Virtualenv installation of TensorFlow will not override pre-existing version of the Python packages needed by TensorFlow. 29 | 30 | With [Virtualenv](https://pypi.python.org/pypi/virtualenv) the installation is as follows: 31 | 32 | - Install pip and Virtualenv. 33 | - Create a Virtualenv environment. 34 | - Activate the Virtualenv environment and install TensorFlow in it. 35 | - After the install you will activate the Virtualenv environment each time you want to use TensorFlow. 36 | 37 | Install pip abd Virtualenv: 38 | 39 | # Ubuntu/Linux 64-bit 40 | $ sudo apt-get install python-pip python-dev python-virtualenv 41 | 42 | Create a Virtualenv environment in the directory ~/tensorflow: 43 | 44 | $ virtualenv --system-site-packages ~/tensorflow 45 | 46 | Activate the environment: 47 | 48 | $ source ~/tensorflow/bin/activate # If using bash 49 | $ source ~/tensorflow/bin/activate.csh # If using csh 50 | (tensorflow)$ # Your prompt should change 51 | 52 | Now, install TensorFlow just as you would for a regular Pip installation: 53 | 54 | # Ubuntu/Linux 64-bit, GPU enabled, Python 2.7 55 | # Requires CUDA toolkit 7.5 and CuDNN v4. 56 | (tensorflow)$ pip install --upgrade https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.9.0-cp27-none-linux_x86_64.whl 57 | 58 | With the Virtualenv environment activated, you can now test your installation. 59 | 60 | $ python 61 | ... 
62 | >>> import tensorflow as tf 63 | >>> hello = tf.constant('Hello, TensorFlow!') 64 | >>> sess = tf.Session() 65 | >>> print(sess.run(hello)) 66 | Hello, TensorFlow! 67 | >>> a = tf.constant(10) 68 | >>> b = tf.constant(32) 69 | >>> print(sess.run(a + b)) 70 | 42 71 | >>> 72 | 73 | When you are done using TensorFlow, deactivate the environment. 74 | 75 | (tensorflow)$ deactivate 76 | $ # Your prompt should change back 77 | 78 | To use TensorFlow later you will have to activate the Virtualenv environment again: 79 | 80 | $ source ~/tensorflow/bin/activate # If using bash. 81 | $ source ~/tensorflow/bin/activate.csh # If using csh. 82 | (tensorflow)$ # Your prompt should change. 83 | 84 | # Run Python programs that use TensorFlow. 85 | ... 86 | # When you are done using TensorFlow, deactivate the environment. 87 | (tensorflow)$ deactivate 88 | 89 | ### 1.4 Install other python packages 90 | 91 | # install package matplotlib and scipy 92 | # ubuntu system 93 | > sudo apt-get install python-matplotlib 94 | > sudo apt-get install python-scipy 95 | 96 | ## 2. Recommended procedure 97 | ### 2.1 fully automated particle picking 98 | This is the way we used in our paper to do the fully automated particle picking. There are three steps. 99 | 100 | Step 1, before doing the automatic picking job, a pre-trained model is needed. Here we have offered a demo model in './trained_model/model_demo_type3'. It was trained in a cross-molecule manner (see Section 3.2 in our paper) with three types of molecules, including TRPV1, gammas-secretase and spliceosome. The number of positive samples for training is 30,000. You can either do your automatic particle picking job based on this model or train your own model based on more types of molecules and more training samples (see Section 3.2). After you get a pre-trained model, do the picking job. 101 | 102 | python autoPick.py --inputDir 'Your_mrc_file_DIR' --pre_trained_model './trained_model/model_demo_type3' --particle_size Your_particle_size --mrc_number 100 --outputDir '../autopick-results-by-demo-type3' --coordinate_symbol '_cnnPick' --threshold 0.5 103 | 104 | Step 2, do the iterative training (see Section 3.3): 105 | 106 | python train.py --train_type 3 --train_inputDir 'Your_mrc_file_DIR' --train_inputFile '../autopick-results-by-demo-type3/autopick_results.pickle' --particle_size Your_particle_size --particle_number 10000 --model_save_dir './trained_model' --model_save_file 'model_demo_type3_iter1_by_type3' 107 | 108 | Step 3, do the final picking job (see Section 3.2): 109 | 110 | python autoPick.py --inputDir 'Your_mrc_file_DIR' --pre_trained_model './trained_model/model_demo_type3_iter1_by_type3' --particle_size Your_particle_size --mrc_number -1 --outputDir '../autopick-results-by-demo-type3-iter1' --coordinate_symbol '_cnnPick' --threshold 0.5 111 | 112 | So the final picked coordinate files are produced in '../autopick-results-by-demo-type3-iter1'. 113 | 114 | ### 2.2 cooperate with Relion 2D classification 115 | This is a practical way to do the particle picking cooperating with Relion 2D classification. 116 | 117 | Step 1, before doing the automatic picking job, a pre-trained model is needed. Here we have offered a demo model in './trained_model/model_demo_type3'. It was trained in a cross-molecule manner with three types of molecules, including TRPV1, gammas-secretase and spliceosome. And the number of positive samples for training is 30,000. 
You can either do your automatic picking job based on this model or train your own model based on more types of molecules and more training samples. After you get a pre-trained model, do the automatic particle picking job. 118 | 119 | python autoPick.py --inputDir 'Your_mrc_file_DIR' --pre_trained_model './trained_model/model_demo_type3' --particle_size Your_particle_size --mrc_number 100 --outputDir '../autopick-results-by-demo-type3' --coordinate_symbol '_cnnPick' --threshold 0.4 120 | 121 | Step 2, do the 2D classification in Relion based on the picked coordinate files in '../autopick-results-by-demo-type3'. 122 | Select the good class average results and store them in a '.star' file, like 'classification2D_demo.star'. 123 | 124 | Step 3, do the training job based on the 'classification2D_demo.star' file (see Section 3.4): 125 | 126 | python train.py --train_type 4 --train_inputFile '/Your_DIR/classification2D_demo.star' --particle_size Your_particle_size --particle_number -1 --model_save_dir './trained_model' --model_save_file 'model_demo_type3_2D' 127 | 128 | Step 4, do the final picking job. 129 | 130 | python autoPick.py --inputDir 'Your_mrc_file_DIR' --pre_trained_model './trained_model/model_demo_type3_2D' --particle_size Your_particle_size --mrc_number -1 --outputDir '../autopick-results-by-demo-type3-2D' --coordinate_symbol '_cnnPick' --threshold 0.5 131 | 132 | So the final picked coordinate files are produced in '../autopick-results-by-demo-type3-2D'. 133 | 134 | ## 3. Training the model 135 | The main script for training a model is `train.py`. There are 4 ways to train a CNN model. 136 | 137 | Type 1: It aims to train a CNN model based on a single type of molecule. The script loads the training data directly from the micrograph directory. 138 | 139 | Type 2: It aims to train a CNN model based on multiple types of molecules. It cooperates with the script `extractData.py` to train a cross-molecule model (see Section 3.2). 140 | 141 | Type 3: It aims to do the iterative training. It is a complement to the fully automatic particle picking, which is done in a cross-molecule manner. Here we take the pre-picked particles as training samples to train a new model and then pick the particles based on the new model, mimicking the semi-automated manner. 142 | 143 | Type 4: It aims to improve the picking results by cooperating with Relion 2D classification. It is a complement to the fully automatic particle picking. When the fully automatic particle picking has finished, run the Relion 2D classification job on the picked particles and save the good class average results in a `.star` file. The program will extract all the particles in the `.star` file as positive samples to train a CNN model. 144 | 145 | All the following commands can be found in the `Makefile`. 146 | 147 | ### 3.1 Train Type 1 148 | Options for training a model in the single-molecule manner are: 149 | 150 | --train_type, 1, specify the training type 151 | --train_inputDir, string, specify the directory of micrograph files, like '/media/bioserver1/Data/paper_test/trpv1/train/' 152 | --particle_size, int, the size of the particle 153 | --mrc_number, int, the default value is -1, so all the micrographs with coordinate files will be used for training. 154 | --particle_number, int, the default value is -1, so all the extracted particles will be used as training samples. 155 | --coordinate_symbol, string, the symbol of the coordinate file, like '_manualpick'. The coordinate files should be in the same directory as the micrographs.
156 | --model_save_dir, string, specify the directory to save the model. 157 | --model_save_file, string, specify the file to save the model. 158 | 159 | run the script `train.py`: 160 | 161 | python train.py --train_type 1 --train_inputDir '/media/bioserver1/Data/paper_test/trpv1/train' --particle_size 180 --mrc_number 100 --particle_number 10000 --coordinate_symbol '_manual_checked' --model_save_dir '../trained_model' --model_save_file 'model_demo_type1' 162 | 163 | When finished, the trained model will be saved in the file **'../trained_model/model_demo_type1'**. 164 | 165 | ### 3.2 Train Type 2 166 | Before training a model in the multi-molecule manner, the positive and negative samples from the different molecules should first be extracted with the script `extractData.py`. 167 | #### 3.2.1 extract particles into a numpy binary file 168 | Options for extracting the positive and negative samples into a binary file are: 169 | 170 | --inputDir, string, specify the directory of micrograph files 171 | --particle_size, int, the size of the particle 172 | --mrc_number, int, the default value is -1, so all the micrographs with coordinate files will be extracted. 173 | --coordinate_symbol, string, the symbol of the coordinate file, like '_manualpick'. The coordinate files should be in the same directory as the micrographs. 174 | --save_dir, string, specify the directory to save the extracted samples. 175 | --save_file, string, specify the file to save the extracted samples, e.g., 'trpv1.pickle' 176 | 177 | run the script `extractData.py`: 178 | 179 | # extract the samples of molecule A 180 | python extractData.py --inputDir '/media/bioserver1/Data/paper_test/molecule_A/train' --particle_size 320 --mrc_number 300 --coordinate_symbol '_manual_checked' --save_dir '../extracted_data' --save_file 'molecule_A.pickle' 181 | 182 | # extract the samples of molecule B 183 | python extractData.py --inputDir '/media/bioserver1/Data/paper_test/molecule_B/train/' --particle_size 180 --mrc_number 100 --coordinate_symbol '_manual_checked' --save_dir '../extracted_data' --save_file 'molecule_B.pickle' 184 | 185 | When finished, the particles of molecule A and molecule B are stored in **'../extracted_data/molecule_A.pickle'** and **'../extracted_data/molecule_B.pickle'** respectively. 186 | 187 | #### 3.2.2 training 188 | Options for training a model in the multi-molecule manner are: 189 | 190 | --train_type, 2, specify the training type 191 | --train_inputDir, string, specify the input directory, like '../extracted_data' 192 | --train_inputFile, string, specify the input file, like 'molecule_A.pickle;molecule_B.pickle', the separator must be ';'. 193 | --particle_number, int, the default value is -1, so all the particles in the data file will be used for training. If it is set to 10000, and there are two kinds of molecules, then each one contributes only 5,000 positive samples. 194 | --model_save_dir, string, specify the directory to save the model. 195 | --model_save_file, string, specify the file to save the model. 196 | 197 | run the script `train.py`: 198 | 199 | python train.py --train_type 2 --train_inputDir '../extracted_data' --train_inputFile 'molecule_A.pickle;molecule_B.pickle' --particle_number 10000 --model_save_dir '../trained_model' --model_save_file 'model_demo_type2_molecule_A_B' 200 | 201 | When finished, the trained model will be saved in the file **'../trained_model/model_demo_type2_molecule_A_B'**. The model is trained on two kinds of molecules, each contributing 5,000 positive training samples.
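
For reference, the even per-molecule split described above can be reproduced with a few lines of Python. The snippet below is only an illustrative sketch: it assumes each `.pickle` file produced by `extractData.py` stores the positive and negative particle arrays as a simple pair, and the variable names are hypothetical, not part of `train.py`.

    # sketch: combine extracted .pickle files with an even per-molecule split
    # assumption: each file stores (positive_array, negative_array); check your extractData.py output
    import pickle
    import numpy as np

    input_files = ['../extracted_data/molecule_A.pickle', '../extracted_data/molecule_B.pickle']
    particle_number = 10000                              # total number of positive samples requested
    per_molecule = particle_number // len(input_files)   # 5,000 positives from each molecule

    positives, negatives = [], []
    for name in input_files:
        with open(name, 'rb') as f:
            positive_array, negative_array = pickle.load(f)
        positives.append(positive_array[:per_molecule])  # keep only the per-molecule quota
        negatives.append(negative_array[:per_molecule])

    train_positive = np.concatenate(positives)
    train_negative = np.concatenate(negatives)
    print(train_positive.shape, train_negative.shape)

With `--particle_number 10000` and two input files, each molecule therefore contributes 5,000 positive samples, matching the behaviour described for `--particle_number` above.
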
202 | 203 | ### 3.3 Train Type 3 204 | Before we do the iterative training, we need to pick the particles based on a pre-trained model. Suppose we have finished the picking step in Section 4. Then we can train a new model based on the picked results. 205 | Options for training a model based on pre-picked results are: 206 | 207 | --train_type, 3, specify the training type 208 | --train_inputDir, string, specify the input directory of the micrograph files 209 | --train_inputFile, string, specify the input file of the pre-picked results, like '/PICK_PATH/autopick_results.pickle' 210 | --particle_number, value, if the value is in the range (0,1), it is interpreted as the prediction threshold. If the value is in the range (1,100), it is interpreted as the proportion (in percent) of the top-ranked particles. If the value is larger than 100, it is interpreted as the number of top-ranked particles. 211 | 212 | run the script `train.py`: 213 | 214 | python train.py --train_type 3 --train_inputDir '/media/bioserver1/Data/paper_test/trpv1/test/' --train_inputFile '../autopick-trpv1-by-demo-molecule-A-B/autopick_results.pickle' --particle_size 180 --particle_number 10000 --model_save_dir '../trained_model' --model_save_file 'model_demo_type3_trpv1_iter1_by_molecule_A_B' 215 | 216 | When finished, the trained model will be saved in the file **'../trained_model/model_demo_type3_trpv1_iter1_by_molecule_A_B'**. 217 | 218 | ### 3.4 Train Type 4 219 | Options for training a model based on Relion 2D classification results are: 220 | 221 | --train_type, 4, specify the training type 222 | --train_inputFile, string, specify the input `.star` file, like '/${YOUR_PATH}/classification2D.star' 223 | --particle_size, int, the size of the particle 224 | --particle_number, int, the default value is -1, so all the particles in the `classification2D.star` file will be used as training samples. 225 | --model_save_dir, string, specify the directory to save the model. 226 | --model_save_file, string, specify the file to save the model. 227 | 228 | run the script `train.py`: 229 | 230 | python train.py --train_type 4 --train_inputFile '/media/bioserver1/Data/paper_test/trpv1/train/trpv1_manualpick_less.star' --particle_size 180 --particle_number -1 --model_save_dir '../trained_model' --model_save_file 'model_demo_type4' 231 | 232 | When finished, the trained model will be saved in **'../trained_model/model_demo_type4'**. 233 | 234 | ## 4. Picking 235 | Options for picking particles based on a pre-trained model are: 236 | 237 | --inputDir, string, specify the directory of micrograph files 238 | --particle_size, int, the size of the particle 239 | --mrc_number, int, the default value is -1, so all the micrographs in the directory will be picked. 240 | --pre_trained_model, string, specify the pre-trained model. 241 | --outputDir, string, specify the directory of output coordinate files 242 | --coordinate_symbol, string, the symbol of the saved coordinate file, like '_cnnPick'. 243 | --threshold, float, specify the threshold to pick particles, the default is 0.5. 244 | 245 | run the script `autoPick.py`: 246 | 247 | python autoPick.py --inputDir '/media/bioserver1/Data/paper_test/trpv1/test/' --pre_trained_model '../trained_model/model_demo_type2_molecule_A_B' --particle_size 180 --mrc_number 20 --outputDir '../autopick-trpv1-by-demo-molecule-A-B' --coordinate_symbol '_cnnPick' --threshold 0.5 248 | 249 | 250 | When finished, the picked coordinate file will be saved in **'../autopick-trpv1-by-demo-molecule-A-B'**. The format of the coordinate file is Relion '.star'.
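
Each output file is named after its micrograph plus the coordinate symbol (e.g., 'stack_0001_cnnPick.star') and lists one `_rlnCoordinateX`/`_rlnCoordinateY` pair per particle whose prediction score is above the threshold. A minimal example of the layout is shown below; the file name and the two coordinate rows are placeholder values, only the header lines are fixed by `AutoPicker.write_coordinate()`:

    data_

    loop_
    _rlnCoordinateX #1
    _rlnCoordinateY #2
    1260.0 2344.0
    980.0 1772.0

Such coordinate files can then be used for particle extraction and 2D classification in Relion, as in Section 2.2.
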
251 | 252 | Besides, a binary file called **'../autopick-trpv1-by-demo-molecule-A-B/autopick_results.pickle'** is produced. It contains all the particles information. It will be used to do an iterative training or to estimate the precision and recall compared to the reference (e.g., those particles manually picked by experts). 253 | 254 | ## 5. Comparing the picking results with reference 255 | The script `analysis_pick_results.py` is used to estimate the precision and recall based on the reference results (e.g., those particles manually picked by experts). 256 | 257 | Options: 258 | 259 | --inputFile, string, specify the file of the picking results, like '/PICK_PATH/autopick_results.pickle' 260 | --inputDir, string, specify the directory of the reference coordinate files 261 | --particle_size, int, the size of the particle 262 | --coordinate_symbol, string, the symbol of the reference coordinate file, like '_manualPick'. 263 | --minimum_distance_rate, float, take the value particle_size*minimum_distance_rate as the distance threshold for estimate the number of true positive samples, the default value is 0.2. 264 | 265 | run the script `analysis_pick_results.py`: 266 | 267 | python analysis_pick_results.py --inputFile '../autopick-trpv1-by-demo-molecule-A-B/autopick_results.pickle' --inputDir '/media/bioserver1/Data/paper_test/trpv1/test' --particle_size 180 --coordinate_symbol '_refine_frealign' --minimum_distance_rate 0.2 268 | 269 | When finished, a result file `../autopick-trpv1-by-demo-molecule-A-B/results.txt` will be produced. It records the precision and recall values as well as the deviations of the centers from the reference particles. 270 | 271 | If you have any questions, please contact us at "*251138964@qq.com*". 272 | 273 | -------------------------------------------------------------------------------- /analysis_pick_results.py: -------------------------------------------------------------------------------- 1 | from autoPicker import AutoPicker 2 | from optparse import OptionParser 3 | 4 | def analysis_results(): 5 | parser = OptionParser() 6 | parser.add_option("--inputFile", dest="inputFile", help="Input picking results file, like '/PATH/autopick_results.list'", metavar="FILE") 7 | parser.add_option("--inputDir", dest="inputDir", help="Reference coordinate directory", metavar="DIRECTORY") 8 | parser.add_option("--coordinate_symbol", dest="coordinate_symbol", help="The symbol of the coordinate file, like '_manualPick'", metavar="STRING") 9 | parser.add_option("--particle_size", dest="particle_size", help="the size of the particle.", metavar="VALUE", default=-1) 10 | parser.add_option("--minimum_distance_rate", dest="minimum_distance_rate", help="Use the value particle_size*minimum_distance_rate as the distance threshold for estimate the number of true positive samples, the default value is 0.2", metavar="VALUE", default=0.2) 11 | (opt, args) = parser.parse_args() 12 | 13 | pick_results_file = opt.inputFile 14 | reference_mrc_file_dir = opt.inputDir 15 | reference_coordinate_symbol = opt.coordinate_symbol 16 | particle_size = int(opt.particle_size) 17 | minimum_distance_rate = float(opt.minimum_distance_rate) 18 | AutoPicker.analysis_pick_results(pick_results_file, reference_mrc_file_dir, reference_coordinate_symbol, particle_size, minimum_distance_rate) 19 | 20 | def main(argv=None): 21 | analysis_results() 22 | 23 | if __name__ == '__main__': 24 | main() 25 | -------------------------------------------------------------------------------- /autoPick.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from datetime import datetime 6 | import math 7 | import time 8 | import os 9 | import re 10 | import pickle 11 | import numpy as np 12 | from optparse import OptionParser 13 | import tensorflow as tf 14 | 15 | from deepModel import DeepModel 16 | from autoPicker import AutoPicker 17 | 18 | def pick_particle(): 19 | # define the options 20 | parser = OptionParser() 21 | parser.add_option("--inputDir", dest="inputDir", help="Input directory", metavar="DIRECTORY") 22 | parser.add_option("--pre_trained_model", dest="pre_trained_model", help="Input the pre-trained model", metavar="FILE") 23 | parser.add_option("--mrc_number", dest="mrc_number", help="Number of mrc files to be picked.", metavar="VALUE", default=-1) 24 | parser.add_option("--particle_size", dest="particle_size", help="the size of the particle.", metavar="VALUE", default=-1) 25 | parser.add_option("--outputDir", dest="outputDir", help="Output directory, the coordinates file will be saved here.", metavar="DIRECTORY") 26 | parser.add_option("--coordinate_symbol", dest="coordinate_symbol", help="The symbol of the saveed coordinate file, like '_cnnPick'", metavar="STRING") 27 | parser.add_option("--threshold", dest="threshold", help="Pick the particles, the prediction value is larger than the threshold..", metavar="VALUE", default=0.5) 28 | (opt, args) = parser.parse_args() 29 | 30 | # set the random seed for numpy and tensorflow 31 | tf.set_random_seed(1234) 32 | np.random.seed(1234) 33 | 34 | # define the input size of the model 35 | model_input_size = [1000, 64, 64, 1] 36 | num_class = 2 # the number of the class 37 | batch_size = model_input_size[0] 38 | 39 | particle_size = int(opt.particle_size) 40 | 41 | pre_trained_model = opt.pre_trained_model 42 | input_dir = opt.inputDir 43 | output_dir = opt.outputDir 44 | threshold = float(opt.threshold) 45 | coordinate_symbol = opt.coordinate_symbol 46 | mrc_number = int(opt.mrc_number) 47 | 48 | if not os.path.isfile(pre_trained_model): 49 | print("ERROR:%s is not a valid file."%(pre_trained_model)) 50 | 51 | if not os.path.isdir(input_dir): 52 | print("ERROR:%s is not a valid dir."%(input_dir)) 53 | 54 | if not os.path.isdir(output_dir): 55 | os.mkdir(output_dir) 56 | 57 | # initialize the model 58 | deepModel = DeepModel(particle_size, model_input_size, num_class) 59 | deepModel.init_model_graph_evaluate() 60 | 61 | # load mrc files 62 | mrc_file_all = [] 63 | files = os.listdir(input_dir) 64 | for f in files: 65 | if re.search('\.mrc$', f): 66 | filename = os.path.join(input_dir, f) 67 | mrc_file_all.append(filename) 68 | 69 | mrc_file_all.sort() 70 | if mrc_number<=0: 71 | mrc_number = len(mrc_file_all) 72 | 73 | if mrc_number > len(mrc_file_all): 74 | mrc_number = len(mrc_file_all) 75 | 76 | with tf.Session() as sess: 77 | # reload the pre-trained model 78 | saver = tf.train.Saver() 79 | saver.restore(sess, pre_trained_model) 80 | 81 | # do the autopick 82 | autopicker = AutoPicker(sess, model_input_size, deepModel, particle_size) 83 | time1 = time.time() 84 | candidate_particle_all = [] 85 | for i in range(mrc_number): 86 | # elements in list 'coordinate' are small list, [x_coordinate, y_coordinate, prediction_value, micrograph_name] 87 | coordinate = autopicker.pick(mrc_file_all[i]) 88 | candidate_particle_all.append(coordinate) 89 | AutoPicker.write_coordinate(coordinate, 
mrc_file_all[i], coordinate_symbol, threshold, output_dir) 90 | time_cost = time.time() - time1 91 | print("time cost: %.1f s"%time_cost) 92 | 93 | # write the pick all results(threshold=0) to file 94 | output_file = os.path.join(output_dir, 'autopick_results.pickle') 95 | AutoPicker.write_pick_results(candidate_particle_all, output_file) 96 | 97 | def main(argv=None): 98 | pick_particle() 99 | if __name__ == '__main__': 100 | #tf.app.run() 101 | main() 102 | -------------------------------------------------------------------------------- /autoPicker.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import gzip 6 | import os 7 | import re 8 | import sys 9 | import time 10 | import math 11 | import pickle 12 | 13 | from six.moves import urllib 14 | import tensorflow as tf 15 | import numpy as np 16 | import scipy.misc 17 | import scipy.ndimage as ndimage 18 | import scipy.ndimage.filters as filters 19 | import matplotlib.pyplot as plt 20 | from operator import itemgetter, attrgetter 21 | 22 | from deepModel import DeepModel 23 | from dataLoader import DataLoader 24 | import display 25 | # image data constants information 26 | class AutoPicker(object): 27 | """ 28 | 29 | """ 30 | def __init__(self, sess, model_input_size, deepModel, particle_size): 31 | """Initialize the Autopicker. 32 | 33 | Args: 34 | sess: an instance of tensorflow session. 35 | model_input_size: a list of length 4, it is the input size of a placeholder of tensorflow. 36 | deepModel: an instance of class deepModel 37 | particle_size: the particle size of the molecular 38 | 39 | """ 40 | self.sess = sess 41 | self.model_input_size = model_input_size 42 | self.deepModel = deepModel 43 | self.particle_size = particle_size 44 | self.SEED = 6543 45 | 46 | def peak_detection(self, image_2D, local_window_size): 47 | """Do the local peak dection to get the best coordinate of molecular center. 48 | 49 | This function does a local peak dection to the score map to get the best coordinates. 50 | 51 | Args: 52 | image_2d: numpy.array, it is a 2d array, the dim is 2, the value of it was a prediction score given by the CNN model. 53 | local_window_size: this is the distance threshold between two particles. The peak detection is done in the local window. 54 | 55 | Returns: 56 | return list_coordinate_clean 57 | list_coordinate_clean: a list, the length of this list stands for the number of picked particles. 58 | Each element in the list is also a list, the length is 3. 59 | The first one is x-axis, the second one is y-axis, the third one is the predicted score. 60 | """ 61 | col = image_2D.shape[0] 62 | row = image_2D.shape[1] 63 | # filter the array in local, the values are replaced by local max value. 64 | data_max = filters.maximum_filter(image_2D, local_window_size) 65 | # compare the filter array to the original one, the same value in the same location is the local maximum. 
66 | # maxima is a bool 2D array, true stands for the local maximum 67 | maxima = (image_2D == data_max) 68 | data_min = filters.minimum_filter(image_2D, local_window_size) 69 | diff = ((data_max - data_min) > 0) 70 | maxima[diff == 0] = 0 71 | 72 | labeled, num_objects = ndimage.label(maxima) 73 | # get the coordinate of the local maximum 74 | # the shape of the array_y_x is (number, 2) 75 | array_y_x = np.array(ndimage.center_of_mass(image_2D, labeled, range(1, num_objects+1))) 76 | array_y_x = array_y_x.astype(int) 77 | list_y_x = array_y_x.tolist() 78 | #print("number of local maximum:%d"%len(list_y_x)) 79 | for i in range(len(list_y_x)): 80 | # add the prediction score to the list 81 | list_y_x[i].append(image_2D[ array_y_x[i][0] ][array_y_x[i][1] ]) 82 | # add a symbol to the list, and it is used to remove crowded candidate 83 | list_y_x[i].append(0) 84 | 85 | # remove close candidate 86 | for i in range(len(list_y_x)-1): 87 | if list_y_x[i][3] == 1: 88 | continue 89 | 90 | for j in range(i+1, len(list_y_x)): 91 | if list_y_x[i][3] == 1: 92 | break 93 | if list_y_x[j][3] == 1: 94 | continue 95 | d_y = list_y_x[i][0] - list_y_x[j][0] 96 | d_x = list_y_x[i][1] - list_y_x[j][1] 97 | d_distance = math.sqrt(d_y**2 + d_x**2) 98 | if d_distance < local_window_size/2: 99 | if list_y_x[i][2] >= list_y_x[j][2]: 100 | list_y_x[j][3] = 1 101 | else: 102 | list_y_x[i][3] = 1 103 | 104 | list_coordinate_clean = [] 105 | for i in range(len(list_y_x)): 106 | if list_y_x[i][3] == 0: 107 | # remove the symbol element 108 | list_x_y = [] 109 | list_x_y.append(list_y_x[i][1]) 110 | list_x_y.append(list_y_x[i][0]) 111 | list_x_y.append(list_y_x[i][2]) 112 | list_coordinate_clean.append(list_x_y) 113 | 114 | return list_coordinate_clean 115 | 116 | 117 | def pick(self, mrc_filename): 118 | """Do the picking job through tensorflow. 119 | 120 | This function read the micrograph data information based on the given filename of micrograph. 121 | Then do the auto picking based on pre-trained CNN model. 122 | 123 | Args: 124 | mrc_filename: string, it is the filename of the target micrograph. 125 | 126 | Returns: 127 | return list_coordinate 128 | list_coordinate: a list, the length of this list stands for the number of picked particles. 129 | Each element in the list is also a list, the length is 4, the first one is y-axis, 130 | the second one is x-axis, the third one is the predicted score, the fourth is the micrograph filename. 
131 | """ 132 | # read the micrograph image data 133 | print(mrc_filename) 134 | header, body = DataLoader.readMrcFile(mrc_filename) 135 | num_col = header[0] 136 | num_row = header[1] 137 | body_2d = np.array(body, dtype = np.float32).reshape(num_row, num_col) 138 | 139 | # do process to micrograph 140 | body_2d, bin_size = DataLoader.preprocess_micrograph(body_2d) 141 | 142 | # Edge detection to get the ice noise mask 143 | # a binary matrix, 1 stands for the ice noise site 144 | # mask = edge_detection_ice(body_2d) 145 | 146 | step_size = 4 147 | candidate_patches = None 148 | candidate_patches_exist = False 149 | num_total_patch = 0 150 | patch_size = int(self.particle_size/bin_size) 151 | # the size to do peak detection 152 | local_window_size = int(0.6*patch_size/step_size) 153 | 154 | #print("image_col:", body_2d.shape[0]) 155 | #print("particle_size:", patch_size) 156 | #print("step_size:", step_size) 157 | map_col = int((body_2d.shape[0]-patch_size)/step_size) 158 | map_row = int((body_2d.shape[1]-patch_size)/step_size) 159 | 160 | #prediction = np.zeros((map_col, map_row), dtype = float) 161 | time1 = time.time() 162 | particle_candidate_all = [] 163 | map_index_col = 0 164 | for col in range(0, body_2d.shape[0]-patch_size+1, step_size): 165 | for row in range(0, body_2d.shape[1]-patch_size+1, step_size): 166 | # extract the particle patch 167 | patch = np.copy(body_2d[col:(col+patch_size), row:(row+patch_size)]) 168 | # do preprocess to the particle 169 | patch = DataLoader.preprocess_particle(patch, self.model_input_size) 170 | particle_candidate_all.append(patch) 171 | num_total_patch = num_total_patch + 1 172 | map_index_col = map_index_col + 1 173 | 174 | map_index_row = map_index_col-map_col+map_row 175 | #print("map_col:",map_col) 176 | #print("map_row:",map_row) 177 | #print(len(particle_candidate_all)) 178 | #print("map_index_col:",map_index_col) 179 | #print("map_index_row:",map_index_row) 180 | #print("col*row:",map_index_col*map_index_row) 181 | # reshape it to fit the input format of the model 182 | particle_candidate_all = np.array(particle_candidate_all).reshape(num_total_patch, self.model_input_size[1], self.model_input_size[2], 1) 183 | # predict 184 | predictions = self.deepModel.evaluation(particle_candidate_all, self.sess) 185 | predictions = predictions[:, 1:2] 186 | predictions = predictions.reshape(map_index_col, map_index_row) 187 | 188 | time_cost = time.time() - time1 189 | print("time cost: %d s"%time_cost) 190 | #display.save_image(prediction, "prediction.png") 191 | # get the prediction value to be a positive sample, it is a value between 0~1 192 | # the following code not tested 193 | # do a connected component analysis 194 | # prediction = detete_large_component(prediction) 195 | 196 | # do a local peak detection to get the best coordinate 197 | # list_coordinate is a 2D list of shape (number_particle, 3) 198 | # element in list_coordinate is [x_coordinate, y_coordinate, prediction_value] 199 | list_coordinate = self.peak_detection(predictions, local_window_size) 200 | # add the mrc filename to the list of each coordinate 201 | for i in range(len(list_coordinate)): 202 | list_coordinate[i].append(mrc_filename) 203 | # transform the coordinates to the original size 204 | list_coordinate[i][0] = (list_coordinate[i][0]*step_size+patch_size/2)*bin_size 205 | list_coordinate[i][1] = (list_coordinate[i][1]*step_size+patch_size/2)*bin_size 206 | 207 | return list_coordinate 208 | 209 | @staticmethod 210 | def write_coordinate(coordinate, mrc_filename, 
coordinate_symbol, threshold, output_dir): 211 | """Write the picking results in the Relion '.star' format. 212 | 213 | This function selects the particles based on the given threshold and saves these particles in Relion '.star' file. 214 | 215 | Args: 216 | coordinate: a list, all the coordinates in it are come from the same micrograph. 217 | The length of the list stands for the number of the particles. 218 | And each element in the list is a small list of length of 3 at least. 219 | The first element in the small list is the coordinate x-aixs. 220 | The second element in the small list is the coordinate y-aixs. 221 | The third element in the small list is the prediction score. 222 | The fourth element in the small list is the micrograph name. 223 | mrc_filename: string, the corresponding micrograph file. 224 | coordinate_symbol: the symbol is used in the output star file name, like '_manualPick', '_cnnPick'. 225 | threshold: particles over the threshold are stored, a default value is 0.5. 226 | output_dir: the directory to store the coordinate file. 227 | """ 228 | mrc_basename = os.path.basename(mrc_filename) 229 | print(mrc_basename) 230 | coordinate_name = os.path.join(output_dir, mrc_basename[:-4]+coordinate_symbol+".star") 231 | print(coordinate_name) 232 | f = open(coordinate_name, 'w') 233 | f.write('data_\n\nloop_\n_rlnCoordinateX #1\n_rlnCoordinateY #2\n') 234 | for i in range(len(coordinate)): 235 | if coordinate[i][2] > threshold: 236 | f.write(str(coordinate[i][0])+' '+str(coordinate[i][1])+'\n') 237 | 238 | f.close() 239 | 240 | @staticmethod 241 | def write_pick_results(coordinate, output_file): 242 | """Write the picking results in a file of binary format. 243 | 244 | This function writes the coordinates of all micrographs into a binary file. 245 | 246 | Args: 247 | coordinate: a list, the length of it stands for the number of picked micrograph file. 248 | Each element is a list too, which contains all coordinates from the same micrograph. 249 | The length of the list stands for the number of the particles. 250 | And each element in the list is a small list of length of 4. 251 | The first element in the small list is the coordinate x-aixs. 252 | The second element in the small list is the coordinate y-aixs. 253 | The third element in the small list is the prediction score. 254 | The fourth element in the small list is the micrograh name. 255 | output_file: string, the output file. 256 | """ 257 | with open(output_file, 'wb') as f: 258 | pickle.dump(coordinate, f) 259 | 260 | @staticmethod 261 | def analysis_pick_results(pick_results_file, reference_coordinate_dir, reference_coordinate_symbol, particle_size, minimum_distance_rate): 262 | """Load the picking results from a file of binary format and compare it with the reference coordinate. 263 | 264 | This function analysis the picking results with reference coordinate and calculate the recall, precision and the deviation from the center. 265 | 266 | Args: 267 | pick_results_file: string, the file name of the pre-picked results. 268 | reference_mrc_dir: string, the directory of the mrc file dir. 269 | reference_coordinate_symbol: the symbol of the coordinate, like '_manualpick' 270 | particle_size: int, the size of particle 271 | minimum_distance_rate: float, the default is 0.2, a picked coordinate is considered to be a true positive only when the distance between the picked coordinate and the reference coordinate is less than minimum_distance_rate mutiplicate particle_size. 
272 | """ 273 | with open(pick_results_file, 'rb') as f: 274 | coordinate = pickle.load(f) 275 | """ 276 | coordinate: a list, the length of it stands for the number of picked micrograph file. 277 | Each element is a list too, which contains all coordinates from the same micrograph. 278 | The length of the list stands for the number of the particles. 279 | And each element in the list is a small list of length of 4. 280 | The first element in the small list is the coordinate x-aixs. 281 | The second element in the small list is the coordinate y-aixs. 282 | The third element in the small list is the prediction score. 283 | The fourth element in the small list is the micrograh name. 284 | """ 285 | tp = 0 286 | total_pick = 0 287 | total_reference = 0 288 | coordinate_total = [] 289 | for i in range(len(coordinate)): 290 | mrc_filename = os.path.basename(coordinate[i][0][3]) 291 | #print(mrc_filename) 292 | reference_coordinate_file = mrc_filename.replace('.mrc', reference_coordinate_symbol+'.star') 293 | reference_coordinate_file = os.path.join(reference_coordinate_dir, reference_coordinate_file) 294 | #print(reference_coordinate_file) 295 | if os.path.isfile(reference_coordinate_file): 296 | reference_coordinate = DataLoader.read_coordinate_from_star(reference_coordinate_file) 297 | """ 298 | reference_coordinate: a list, the length of it stands for the number of picked particles. 299 | And each element in the list is a small list of length of 2. 300 | The first element in the small list is the coordinate x-aixs. 301 | The second element in the small list is the coordinate y-aixs. 302 | """ 303 | tp_sigle, average_distance = AutoPicker.calculate_tp(coordinate[i], reference_coordinate, particle_size*minimum_distance_rate) 304 | #print("tp:",tp_sigle) 305 | #print("average_distance:",average_distance) 306 | # calculate the number of true positive, when the threshold is set to 0.5 307 | tp_sigle = 0 308 | total_reference = total_reference + len(reference_coordinate) 309 | for j in range(len(coordinate[i])): 310 | coordinate_total.append(coordinate[i][j]) 311 | if coordinate[i][j][2]>0.5: 312 | total_pick = total_pick + 1 313 | if coordinate[i][j][4] == 1: 314 | tp = tp + 1 315 | tp_sigle = tp_sigle + 1 316 | print(tp_sigle/len(reference_coordinate)) 317 | else: 318 | print("Can not find the reference coordinate:"+reference_coordinate_file) 319 | precision = tp/total_pick 320 | recall = tp/total_reference 321 | print("(threshold 0.5)precision:%f recall:%f"%(precision, recall)) 322 | # sort the coordinate based on prediction score in a descending order. 
323 | coordinate_total = sorted(coordinate_total, key = itemgetter(2), reverse = True) 324 | total_tp = [] 325 | total_recall = [] 326 | total_precision = [] 327 | total_probability = [] 328 | total_average_distance = [] 329 | total_distance = 0 330 | tp_tem = 0 331 | for i in range(len(coordinate_total)): 332 | if coordinate_total[i][4] == 1: 333 | tp_tem = tp_tem + 1 334 | total_distance = total_distance + coordinate_total[i][5] 335 | precision = tp_tem/(i+1) 336 | recall = tp_tem/total_reference 337 | total_tp.append(tp_tem) 338 | total_recall.append(recall) 339 | total_precision.append(precision) 340 | total_probability.append(coordinate_total[i][2]) 341 | if tp_tem==0: 342 | average_distance = 0 343 | else: 344 | average_distance = total_distance/tp_tem 345 | total_average_distance.append(average_distance) 346 | # write the list results in file 347 | directory_pick = os.path.dirname(pick_results_file) 348 | total_results_file = os.path.join(directory_pick, 'results.txt') 349 | f = open(total_results_file, 'w') 350 | # write total_tp 351 | f.write(','.join(map(str, total_tp))+'\n') 352 | f.write(','.join(map(str, total_recall))+'\n') 353 | f.write(','.join(map(str, total_precision))+'\n') 354 | f.write(','.join(map(str, total_probability))+'\n') 355 | f.write(','.join(map(str, total_average_distance))+'\n') 356 | f.write('#total autopick number:%d\n'%(len(coordinate_total))) 357 | f.write('#total manual pick number:%d\n'%(total_reference)) 358 | f.write('#the first row is number of true positive\n') 359 | f.write('#the second row is recall\n') 360 | f.write('#the third row is precision\n') 361 | f.write('#the fourth row is probability\n') 362 | f.write('#the fiveth row is distance\n') 363 | 364 | # show the recall and precision 365 | times_of_manual = len(coordinate_total)//total_reference + 1 366 | for i in range(times_of_manual): 367 | print('autopick_total sort, take the head number of total_manualpick * ratio %d'%(i+1)) 368 | f.write('#autopick_total sort, take the head number of total_manualpick * ratio %d \n'%(i+1)) 369 | if i==times_of_manual-1: 370 | print('precision:%f \trecall:%f'%(total_precision[-1], total_recall[-1])) 371 | f.write('precision:%f \trecall:%f \n'%(total_precision[-1], total_recall[-1])) 372 | else: 373 | print('precision:%f \trecall:%f'%(total_precision[(i+1)*total_reference-1], total_recall[(i+1)*total_reference-1])) 374 | f.write('precision:%f \trecall:%f \n'%(total_precision[(i+1)*total_reference-1], total_recall[(i+1)*total_reference-1])) 375 | f.close() 376 | 377 | 378 | @staticmethod 379 | def calculate_tp(coordinate_pick, coordinate_reference, threshold): 380 | if len(coordinate_pick)<1 or len(coordinate_reference)<1: 381 | print("Invalid coordinate parameters in function calculate_tp()!") 382 | 383 | # add a symbol to index whether the coordinate is matched with a reference coordinate 384 | for i in range(len(coordinate_pick)): 385 | coordinate_pick[i].append(0) 386 | 387 | tp = 0 388 | average_distance = 0 389 | 390 | for i in range(len(coordinate_reference)): 391 | coordinate_reference[i].append(0) 392 | coor_x = coordinate_reference[i][0] 393 | coor_y = coordinate_reference[i][1] 394 | neighbour = [] 395 | for k in range(len(coordinate_pick)): 396 | if coordinate_pick[k][4]==0: 397 | coor_mx = coordinate_pick[k][0] 398 | coor_my = coordinate_pick[k][1] 399 | abs_x = math.fabs(coor_mx-coor_x) 400 | abs_y = math.fabs(coor_my-coor_y) 401 | length = math.sqrt(math.pow(abs_x, 2)+math.pow(abs_y, 2)) 402 | if length < threshold: 403 | same_n = [] 404 | 
same_n.append(k) 405 | same_n.append(length) 406 | neighbour.append(same_n) 407 | if len(neighbour)>=1: 408 | if len(neighbour)>1: 409 | neighbour = sorted(neighbour, key = itemgetter(1)) 410 | index = neighbour[0][0] 411 | # change the symbol to 1, means it matchs with a reference coordinate 412 | coordinate_pick[index][4] = 1 413 | # add the distance to the list 414 | coordinate_pick[index].append(neighbour[0][1]) 415 | coordinate_pick[index].append(coor_x) 416 | coordinate_pick[index].append(coor_y) 417 | tp = tp + 1 418 | average_distance = average_distance+neighbour[0][1] 419 | coordinate_reference[i][2] = 1 420 | average_distance = average_distance/tp 421 | return tp, average_distance 422 | -------------------------------------------------------------------------------- /dataLoader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import struct 3 | from PIL import Image 4 | from pylab import * 5 | import numpy as np 6 | import re 7 | import pickle 8 | from matplotlib.patches import Ellipse, Circle 9 | import matplotlib.pyplot as plt 10 | import scipy.misc 11 | import scipy.ndimage 12 | import tensorflow as tf 13 | import random 14 | from operator import itemgetter, attrgetter 15 | from matplotlib import pyplot as plt 16 | 17 | import display 18 | from starReader import starRead 19 | 20 | class DataLoader(object): 21 | 22 | #def __init__(self): 23 | 24 | @staticmethod 25 | def bin_2d(body_2d, bin_size): 26 | """Do the bin process to the 2D array. 27 | 28 | This function can make bin the image based on the bin_size. 29 | bin_size is a int value. if it was set to 2, then the 4 points in a small patch 2x2 of the body_2d 30 | are summed to one value. It likes an average pooling operation. 31 | 32 | Args: 33 | body_2d: numpy.array, it is a 2d array, the dim is 2. 34 | bin_size: int value. 35 | 36 | Returns: 37 | return pool_result 38 | pool_result: numpy.array, the shape of it is (body_2d.shape[0]/bin_size, body_2d.shape[1]/bin_size) 39 | 40 | """ 41 | """ 42 | # using the tensorflow pooling operation to do the bin preprocess 43 | # memory cost, out of memory 44 | col = body_2d.shape[0] 45 | row = body_2d.shape[1] 46 | body_2d = body_2d.reshape(1, col, row, 1) 47 | body_node = tf.constant(body_2d) 48 | body_pool = tf.nn.avg_pool(body_node, ksize=[1, bin_size, bin_size, 1], strides=[1, bin_size, bin_size, 1], padding='VALID') 49 | with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as sess: 50 | pool_result = sess.run(body_pool) 51 | pool_result = pool_result.reshape((pool_result.shape[1], pool_result.shape[2])) 52 | return pool_result 53 | """ 54 | # based on the numpy operation to do the bin process 55 | col = body_2d.shape[0] 56 | row = body_2d.shape[1] 57 | scale_col = col//bin_size 58 | scale_row = row//bin_size 59 | patch = np.copy(body_2d[0:scale_col*bin_size, 0:scale_row*bin_size]) 60 | patch_view = patch.reshape(scale_col, bin_size, scale_row, bin_size) 61 | body_2d_bin = patch_view.mean(axis=3).mean(axis=1) 62 | return body_2d_bin 63 | 64 | @staticmethod 65 | def preprocess_micrograph(micrograph): 66 | """Do preprocess to the micrograph after the micrograph data is loaded into a numpy.array. 67 | 68 | Define this function to make sure that the same process is done to the micrograph 69 | during the training process and picking process. 
70 | 71 | Args: 72 | micrograph: numpy.array, the shape is (micrograph_col, micrograph_row) 73 | 74 | Returns: 75 | return micrograph 76 | micrograph: numpy.array 77 | """ 78 | #mrc_col = micrograph.shape[0] 79 | #mrc_row = micrograph.shape[1] 80 | # lowpass 81 | micrograph = scipy.ndimage.filters.gaussian_filter(micrograph, 0.1) 82 | # do the bin process 83 | pooling_size = 3 84 | micrograph = DataLoader.bin_2d(micrograph, pooling_size) 85 | 86 | # low pass the micrograph 87 | #micrograph_lowpass = scipy.ndimage.filters.gaussian_filter(micrograph, 0.1) 88 | #f = np.fft.fft2(micrograph) 89 | #fshift = np.fft.fftshift(f) 90 | #magnitude_spectrum = 20*np.log(np.abs(fshift)) 91 | 92 | #plt.subplot(121),plt.imshow(micrograph, cmap = 'gray') 93 | #plt.title('Input Image'), plt.xticks([]), plt.yticks([]) 94 | #plt.subplot(122),plt.imshow(micrograph_lowpass, cmap = 'gray') 95 | #plt.title('Magnitude Spectrum'), plt.xticks([]), plt.yticks([]) 96 | #plt.show() 97 | 98 | # nomalize the patch 99 | max_value = micrograph.max() 100 | min_value = micrograph.min() 101 | particle = (micrograph - min_value)/(max_value - min_value) 102 | mean_value = micrograph.mean() 103 | std_value = micrograph.std() 104 | micrograph = (micrograph - mean_value)/std_value 105 | # 106 | return micrograph, pooling_size 107 | 108 | @staticmethod 109 | def preprocess_particle(particle, model_input_size): 110 | """Do preprocess to the particle patch after the particle data is extracted from the micrograph. 111 | 112 | Define this function to make sure that the same process is done to the particle 113 | during the training process and picking process. 114 | 115 | Args: 116 | particle: numpy.array, the shape is (particle_col, particle_row) 117 | model_input_size: a list with length 4. The size is to fit with the model input. 118 | model_input_size[0] stands for the batchsize. 119 | model_input_size[1] stands for the input col. 120 | model_input_size[2] stands for the input row. 121 | model_input_size[3] stands for the input channel. 122 | Returns: 123 | return particle 124 | particle: numpy.array 125 | """ 126 | # resize the particle to fit the model input 127 | particle = scipy.misc.imresize(particle, (model_input_size[1], model_input_size[2]), interp = 'bilinear', mode = 'L') 128 | #particle = scipy.ndimage.zoom(particle, float(model_input_size[1])/particle.shape[1]) 129 | # nomalize the patch 130 | mean_value = particle.mean() 131 | std_value = particle.std() 132 | particle = (particle - mean_value)/std_value 133 | return particle 134 | 135 | @staticmethod 136 | def preprocess_particle_online(particle_batch): 137 | """Do process to the particle batch before they are inputed to the CNN model. 138 | 139 | This is online process during the training process. This process mainly includes random rotation. 
140 | 141 | Args: 142 | particle_batch: numpy.array, the shape is (batch_size, particle_col, particle_row, channel) 143 | 144 | Returns: 145 | return particle_batch 146 | particle_batch: numpy.array, the shape is (batch_size, particle_col, particle_row, channel) 147 | """ 148 | # random rotate the particle 149 | for i in range(particle_batch.shape[0]): 150 | random_degree = random.randint(0, 359) 151 | sample = particle_batch[i].reshape(particle_batch[i].shape[0], particle_batch[i].shape[1]) 152 | max_value = sample.max() 153 | min_value = sample.min() 154 | sample = 255*(sample - min_value)/(max_value - min_value) 155 | sample = sample.astype('uint8') 156 | sample = Image.fromarray(sample) 157 | sample = Image.Image.rotate(sample, random_degree) 158 | sample = np.array(sample) 159 | sample = sample.reshape(particle_batch[i].shape[0], particle_batch[i].shape[1], particle_batch[i].shape[2]) 160 | mean_value = sample.mean() 161 | std_value = sample.std() 162 | particle_batch[i] = (sample - mean_value)/std_value 163 | # nomalize the patch 164 | return particle_batch 165 | 166 | @staticmethod 167 | def read_coordinate_from_star(starfile): 168 | """ Read the coordinate from star file. 169 | return a list 170 | 171 | Args: 172 | starfile: string, the input coodinate star file. 173 | 174 | Returns: 175 | return coordinate_list 176 | coordinate_list: list, the length of the list stands for the number of particles. 177 | Each particle is a list too, which contains two elements. 178 | The first one is the x coordinate, the second one is y coordinate. 179 | """ 180 | particle_star = starRead(starfile) 181 | table_star = particle_star.getByName('data_') 182 | coordinateX_list = table_star.getByName('_rlnCoordinateX') 183 | coordinateY_list = table_star.getByName('_rlnCoordinateY') 184 | coordinate_list = [] 185 | for i in range(len(coordinateX_list)): 186 | coordinate = [] 187 | coordinate.append(int(float(coordinateX_list[i]))) 188 | coordinate.append(int(float(coordinateY_list[i]))) 189 | coordinate_list.append(coordinate) 190 | return coordinate_list 191 | 192 | # read the mrc file, return the header and body 193 | @staticmethod 194 | def readMrcFile(fileName): 195 | """Read micrograph image data from mrc format file/ 196 | 197 | Retrieves the header information and image body information from the mrc file. 198 | The header is a tuple, and all the parameters about the mrc file is included in the header tuple. 199 | The body is a 1-d list, the data type depends on the mode parameters in header[3] 200 | 201 | Args: 202 | filenName: string, the input mrc file name. 203 | 204 | Returns: 205 | return header,body 206 | header: tuple, contains all the parameters in the header. 207 | There are several parameters that will be used in the following operation. 208 | header[0], type int, it stands for the number of columns. 209 | header[1], type int, it stands for the number of rows. 210 | header[2], type int, it stands for the number of sections. If the mrc file is 2-dim, then this value will be 1. 211 | body: list, contains the micrograph image data information. 212 | It is a 1-d list, the length is header[0]*header[1]*header[2]. 213 | So you can transfer it into a numpy.array and reshape it into a 2D or 3D array. 
214 | 215 | Raises: 216 | None 217 | """ 218 | if not os.path.isfile(fileName): 219 | print("ERROR:%s is not a valid file."%(fileName)) 220 | return 221 | 222 | f = open(fileName,"rb") 223 | data = f.read() # read all data 224 | f.close() 225 | 226 | header_fmt = '10i6f3i3f2i100c3f4cifi800c' # more information about the header please refer to mrc format in Internet. 227 | header = struct.unpack(header_fmt,data[0:1024]) 228 | n_columns = header[0] 229 | n_rows = header[1] 230 | mode = header[3] 231 | #print "n_columns:",n_columns 232 | #print "n_rows:",n_rows 233 | #print "mode:",mode 234 | if mode == 0: 235 | # signed 8-bit bytes range -128 to 127 236 | pass 237 | elif mode == 1: 238 | # 16-bit halfwords 239 | pass 240 | elif mode == 2: 241 | # 32-bit float 242 | body_fmt = str(n_columns*n_rows)+"f" 243 | elif mode == 3: 244 | # complex 16-bit integers 245 | pass 246 | elif mode == 4: 247 | # complex 32-bit reals 248 | pass 249 | elif mode == 6: 250 | # unsigned 16-bit range 0 to 65535 251 | pass 252 | else: 253 | print("ERROR:mode %s is not a valid value,should be [0|1|2|3|4|6]."%(fileName)) 254 | return None 255 | 256 | body = list(struct.unpack(body_fmt,data[1024:])) 257 | return header, body 258 | 259 | # write numpy array to mrc file 260 | @staticmethod 261 | def writeToMrcFile(body_array, mrc_filename): 262 | """Write numpy 2D array to mrc format or numpy 3D array to mrcs format file/ 263 | 264 | Store the information of numpy array into mrc format. 265 | The header is a tuple, and all the parameters about the mrc file is included in the header tuple. 266 | The body is a 1-d list, the data type depends on the mode parameters in header[3] 267 | 268 | Args: 269 | body_array: numpy array, 2D or 3D, type float32, 2D array refers to the micrograph and 3D array refers to the extracted particles 270 | mrc_filename: string, the output mrc file name. 271 | 272 | Returns: 273 | None 274 | Raises: 275 | None 276 | """ 277 | if body_array.dim() == 2: 278 | n_columns = body_array.shape()[0] 279 | n_rows = body_array.shape()[1] 280 | elif body_array.dim() == 3: 281 | n_section = body_array.shape()[0] 282 | n_columns = body_array.shape()[1] 283 | n_rows = body_array.shape()[2] 284 | else: 285 | print("ERROR:the dimension of body_array must be 2 or 3") 286 | return 287 | 288 | f = open(fileName,"wb") 289 | data = f.read() # read all data 290 | f.close() 291 | 292 | header_fmt = '10i6f3i3f2i100c3f4cifi800c' # more information about the header please refer to mrc format in Internet. 293 | header = struct.unpack(header_fmt,data[0:1024]) 294 | mode = 2 295 | body = list(struct.unpack(body_fmt,data[1024:])) 296 | return header, body 297 | 298 | 299 | # read particles data from star format file 300 | @staticmethod 301 | def load_Particle_From_starFile(starFileName, particle_size, model_input_size, produce_negative=True, negative_distance_ratio=0.5, negative_number_ratio=1): 302 | """Read the particles data from star file. 303 | 304 | Based on the coordinates information and the corresponding mrc file information, 305 | extarct the particle patch when given the particle size. 306 | At the same time, select some negative particles randomly. 307 | The coordinates of the negative particles are enforced to be away from positive particles, 308 | the threshold is set to negative_distance_ratio*particle_size. 309 | 310 | Args: 311 | starFileName: string, the name of the star file. 312 | particle_size: int, the size of the particle. 
313 | model_input_size: the size of Placeholder to fit the model input, like [100, 64, 64, 1] 314 | produce_negative: bool, whether to produce the negative particles. 315 | 316 | Returns: 317 | return particle_array_positive,particle_array_negative 318 | particle_array_positive: numpy.array, a 4-dim array,the shape is (number_particles, particle_size, particle_size, 1). 319 | particle_array_negative: numpy.array, a 4-dim array,the shape is (number_particles, particle_size, particle_size, 1). 320 | 321 | Raises: 322 | None 323 | """ 324 | particle_star = starRead(starFileName) 325 | table_star = particle_star.getByName('data_') 326 | mrcfilename_list = table_star.getByName('_rlnMicrographName') 327 | coordinateX_list = table_star.getByName('_rlnCoordinateX') 328 | coordinateY_list = table_star.getByName('_rlnCoordinateY') 329 | 330 | # creat a dictionary to store the coordinate 331 | # the key is the mrc file name 332 | # the value is a list of the coordinates 333 | coordinate = {} 334 | path_star = os.path.split(starFileName) 335 | for i in range(len(mrcfilename_list)): 336 | fileName = mrcfilename_list[i] 337 | fileName = os.path.join(path_star[0], fileName) 338 | if fileName in coordinate: 339 | coordinate[fileName][0].append(int(float(coordinateX_list[i]))) 340 | coordinate[fileName][1].append(int(float(coordinateY_list[i]))) 341 | else: 342 | coordinate[fileName] = [[],[]] 343 | coordinate[fileName][0].append(int(float(coordinateX_list[i]))) 344 | coordinate[fileName][1].append(int(float(coordinateY_list[i]))) 345 | 346 | # read mrc data 347 | particle_array_positive = [] 348 | particle_array_negative = [] 349 | number_total_particle = 0 350 | for key in coordinate: 351 | print key 352 | header, body = DataLoader.readMrcFile(key) 353 | n_col = header[0] 354 | n_row = header[1] 355 | body_2d = np.array(body, dtype=np.float32).reshape(n_row, n_col, 1) 356 | 357 | # show the micrograph with manually picked particles 358 | # plot the circle of the particle 359 | #display.plot_circle_in_micrograph(body_2d, coordinate[key], particle_size, 'test.png') 360 | # do preprocess to the micrograph 361 | body_2d, bin_size = DataLoader.preprocess_micrograph(body_2d) 362 | # bin scale the particle size and the coordinates 363 | particle_size_bin =int(particle_size/bin_size) 364 | n_col = int(n_col/bin_size) 365 | n_row = int(n_row/bin_size) 366 | for i in range(len(coordinate[key][0])): 367 | coordinate[key][0][i] = int(coordinate[key][0][i]/bin_size) 368 | coordinate[key][1][i] = int(coordinate[key][1][i]/bin_size) 369 | 370 | # delete the particle outside the boundry 371 | radius = int(particle_size_bin/2) 372 | i = 0 373 | while True: 374 | if i >= len(coordinate[key][0]): 375 | break 376 | 377 | coordinate_x = coordinate[key][0][i] 378 | coordinate_y = coordinate[key][1][i] 379 | if coordinate_x < radius or coordinate_y < radius or coordinate_y+radius > n_col or coordinate_x+radius > n_row: 380 | del coordinate[key][0][i] 381 | del coordinate[key][1][i] 382 | else: 383 | i = i + 1 384 | 385 | # number of positive particles 386 | number_particle = len(coordinate[key][0]) 387 | number_total_particle = number_total_particle + number_particle 388 | print 'number of particles:',number_particle 389 | 390 | # extract the positive particles 391 | # store the particles in a contacted array: particle_array_positive 392 | for i in range(number_particle): 393 | coordinate_x = coordinate[key][0][i] 394 | coordinate_y = coordinate[key][1][i] 395 | patch = 
np.copy(body_2d[(coordinate_y-radius):(coordinate_y+radius), (coordinate_x-radius):(coordinate_x+radius)]) 396 | patch = DataLoader.preprocess_particle(patch, model_input_size) 397 | particle_array_positive.append(patch) 398 | # extract the negative particles 399 | # store the particles in a concated array: particle_array_negative 400 | if produce_negative: 401 | for i in range(number_particle): 402 | while True: 403 | isLegal = True 404 | coor_x = np.random.randint(radius, n_row-radius) 405 | coor_y = np.random.randint(radius, n_col-radius) 406 | for j in range(number_particle): 407 | coordinate_x = coordinate[key][0][i] 408 | coordinate_y = coordinate[key][1][i] 409 | distance = ((coor_x-coordinate_x)**2+(coor_y-coordinate_y)**2)**0.5 410 | if distance < negative_distance_ratio*particle_size_bin: 411 | isLegal = False 412 | break 413 | if isLegal: 414 | patch = np.copy(body_2d[(coor_y-radius):(coor_y+radius), (coor_x-radius):(coor_x+radius)]) 415 | patch = DataLoader.preprocess_particle(patch, model_input_size) 416 | particle_array_negative.append(patch) 417 | break 418 | if produce_negative: 419 | particle_array_positive = np.array(particle_array_positive).reshape(number_total_particle, model_input_size[1], model_input_size[2], 1) 420 | particle_array_negative = np.array(particle_array_negative).reshape(number_total_particle, model_input_size[1], model_input_size[2], 1) 421 | return particle_array_positive, particle_array_negative 422 | 423 | else: 424 | particle_array_positive = np.array(particle_array_positive).reshape(number_total_particle, model_input_size[1], model_input_size[2], 1) 425 | return particle_array_positive 426 | 427 | 428 | @staticmethod 429 | def load_trainData_From_RelionStarFile(starFileName, particle_size, model_input_size, validation_ratio, train_number): 430 | """read train_data and validation data from star file 431 | 432 | In order to train a CNN model based on Relion particle '.star' file, it need to loading the training particles 433 | samples from the star file. 434 | 435 | Args: 436 | starFileName: the name of star file 437 | particle_size: particle size 438 | model_input_size: the size of Placeholder to fit the model input, like [100, 64, 64, 1] 439 | validation_rate: divide the total samples into training dataset and validation dataset. 440 | This is the ratio of validation dataset compared to the total samples. 
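train_number: the number of positive particles to load for training (this is the value that train.py passes from its '--particle_number' option); by analogy with the other loaders, a non-positive value appears to mean that all particles in the star file are used.
For example (a sketch; the star file name and the numbers are only illustrative):
    train_data, train_labels, validation_data, validation_labels = DataLoader.load_trainData_From_RelionStarFile(
        'trpv1_manualpick.star', 180, [100, 64, 64, 1], 0.1, 10000)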
441 | 442 | Returns: 443 | return train_data,train_labels,validation_data,validation_labels 444 | train_data: numpy.array, np.float32, the shape is (number_samples, particle_size, particle_size, 1) 445 | train_labels: numpy.array, int64, the shape is (number_samples) 446 | validation_data: numpy.array, np.float32, the shape is (number_samples, particle_size, particle_size, 1) 447 | validation_labels: numpy.array, int64, the shape is (number_samples) 448 | 449 | Raises: 450 | None 451 | """ 452 | particle_array_positive, particle_array_negative = DataLoader.load_Particle_From_starFile(starFileName, particle_size, model_input_size) 453 | if train_numberlen(mrc_file_coordinate): 527 | mrc_number = len(mrc_file_coordinate) 528 | 529 | particle_array_positive = [] 530 | particle_array_negative = [] 531 | number_total_particle = 0 532 | for i in range(mrc_number): 533 | # read mrc data 534 | print(mrc_file_coordinate[i]) 535 | header, body = DataLoader.readMrcFile(mrc_file_coordinate[i]) 536 | n_col = header[0] 537 | n_row = header[1] 538 | body_2d = np.array(body, dtype=np.float32).reshape(n_row, n_col, 1) 539 | # read star coordinate 540 | coordinate = DataLoader.read_coordinate_from_star(file_coordinate[i]) 541 | # show the micrograph with manually picked particles 542 | # plot the circle of the particle 543 | #display.plot_circle_in_micrograph(body_2d, coordinate, particle_size, 'test.png') 544 | # do preprocess to the micrograph 545 | body_2d, bin_size = DataLoader.preprocess_micrograph(body_2d) 546 | # bin scale the particle size and the coordinates 547 | particle_size_bin =int(particle_size/bin_size) 548 | n_col = int(n_col/bin_size) 549 | n_row = int(n_row/bin_size) 550 | for i in range(len(coordinate)): 551 | coordinate[i][0] = int(coordinate[i][0]/bin_size) 552 | coordinate[i][1] = int(coordinate[i][1]/bin_size) 553 | 554 | # delete the particle outside the boundry 555 | radius = int(particle_size_bin/2) 556 | i = 0 557 | while True: 558 | if i >= len(coordinate): 559 | break 560 | 561 | coordinate_x = coordinate[i][0] 562 | coordinate_y = coordinate[i][1] 563 | if coordinate_x < radius or coordinate_y < radius or coordinate_y+radius > n_col or coordinate_x+radius > n_row: 564 | del coordinate[i] 565 | else: 566 | i = i + 1 567 | 568 | # number of positive particles 569 | number_particle = len(coordinate) 570 | number_total_particle = number_total_particle + number_particle 571 | print 'number of particles:',number_particle 572 | 573 | # extract the positive particles 574 | # store the particles in a contacted array: particle_array_positive 575 | for i in range(number_particle): 576 | coordinate_x = coordinate[i][0] 577 | coordinate_y = coordinate[i][1] 578 | patch = np.copy(body_2d[(coordinate_y-radius):(coordinate_y+radius), (coordinate_x-radius):(coordinate_x+radius)]) 579 | patch = DataLoader.preprocess_particle(patch, model_input_size) 580 | particle_array_positive.append(patch) 581 | # extract the negative particles 582 | # store the particles in a concated array: particle_array_negative 583 | if produce_negative: 584 | for i in range(number_particle): 585 | while True: 586 | isLegal = True 587 | coor_x = np.random.randint(radius, n_row-radius) 588 | coor_y = np.random.randint(radius, n_col-radius) 589 | for j in range(number_particle): 590 | coordinate_x = coordinate[i][0] 591 | coordinate_y = coordinate[i][1] 592 | distance = ((coor_x-coordinate_x)**2+(coor_y-coordinate_y)**2)**0.5 593 | if distance < negative_distance_ratio*particle_size_bin: 594 | isLegal = False 595 | break 
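# note: the inner j-loop above is intended to compare the random candidate against every
# labelled positive, but it indexes coordinate[i] rather than coordinate[j], so the distance
# test is effectively applied to the i-th positive only; a candidate that passes is accepted
# below as a negative sample, otherwise a new random position is drawn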
596 | if isLegal: 597 | patch = np.copy(body_2d[(coor_y-radius):(coor_y+radius), (coor_x-radius):(coor_x+radius)]) 598 | patch = DataLoader.preprocess_particle(patch, model_input_size) 599 | particle_array_negative.append(patch) 600 | break 601 | if produce_negative: 602 | particle_array_positive = np.array(particle_array_positive).reshape(number_total_particle, model_input_size[1], model_input_size[2], 1) 603 | particle_array_negative = np.array(particle_array_negative).reshape(number_total_particle, model_input_size[1], model_input_size[2], 1) 604 | return particle_array_positive, particle_array_negative 605 | else: 606 | particle_array_positive = np.array(particle_array_positive).reshape(number_total_particle, model_input_size[1], model_input_size[2], 1) 607 | return particle_array_positive 608 | 609 | # read input data from star format file 610 | @staticmethod 611 | def load_trainData_From_mrcFileDir(trainInputDir, particle_size, model_input_size, validation_ratio, coordinate_symbol, mrc_number, train_number): 612 | """read train_data and validation data from a directory of mrc files 613 | 614 | Train a CNN model based on mrc files and corresponding coordinates. 615 | 616 | Args: 617 | trainInputDir: the directory of mrc files 618 | particle_size: particle size 619 | model_input_size: the size of Placeholder to fit the model input, like [100, 64, 64, 1] 620 | validation_rate: divide the total samples into training dataset and validation dataset. 621 | This is the ratio of validation dataset compared to the total samples. 622 | coordinate_symbol: symbol of the coordinate file like '_manual'. 623 | mrc_number: number of mrc files to be used. 624 | train_number: number of positive particles to be used for training. 625 | 626 | Returns: 627 | return train_data,train_labels,validation_data,validation_labels 628 | train_data: numpy.array, np.float32, the shape is (number_samples, particle_size, particle_size, 1) 629 | train_labels: numpy.array, int64, the shape is (number_samples) 630 | validation_data: numpy.array, np.float32, the shape is (number_samples, particle_size, particle_size, 1) 631 | validation_labels: numpy.array, int64, the shape is (number_samples) 632 | 633 | Raises: 634 | None 635 | """ 636 | particle_array_positive, particle_array_negative = DataLoader.load_Particle_From_mrcFileDir(trainInputDir, particle_size, model_input_size, coordinate_symbol, mrc_number) 637 | if train_numberlen(mrc_file_coordinate): 713 | mrc_number = len(mrc_file_coordinate) 714 | 715 | particle_array_positive = [] 716 | particle_array_negative = [] 717 | number_total_particle = 0 718 | for i in range(mrc_number): 719 | # read mrc data 720 | print(mrc_file_coordinate[i]) 721 | header, body = DataLoader.readMrcFile(mrc_file_coordinate[i]) 722 | n_col = header[0] 723 | n_row = header[1] 724 | body_2d = np.array(body, dtype=np.float32).reshape(n_row, n_col, 1) 725 | # read star coordinate 726 | coordinate = DataLoader.read_coordinate_from_star(file_coordinate[i]) 727 | # show the micrograph with manually picked particles 728 | # plot the circle of the particle 729 | #display.plot_circle_in_micrograph(body_2d, coordinate, particle_size, 'test.png') 730 | # do preprocess to the micrograph 731 | body_2d, bin_size = DataLoader.preprocess_micrograph(body_2d) 732 | # bin scale the particle size and the coordinates 733 | particle_size_bin =int(particle_size/bin_size) 734 | n_col = int(n_col/bin_size) 735 | n_row = int(n_row/bin_size) 736 | for i in range(len(coordinate)): 737 | coordinate[i][0] = 
int(coordinate[i][0]/bin_size) 738 | coordinate[i][1] = int(coordinate[i][1]/bin_size) 739 | 740 | # delete the particle outside the boundry 741 | radius = int(particle_size_bin/2) 742 | i = 0 743 | while True: 744 | if i >= len(coordinate): 745 | break 746 | 747 | coordinate_x = coordinate[i][0] 748 | coordinate_y = coordinate[i][1] 749 | if coordinate_x < radius or coordinate_y < radius or coordinate_y+radius > n_col or coordinate_x+radius > n_row: 750 | del coordinate[i] 751 | else: 752 | i = i + 1 753 | 754 | # number of positive particles 755 | number_particle = len(coordinate) 756 | number_total_particle = number_total_particle + number_particle 757 | print 'number of particles:',number_particle 758 | 759 | # extract the positive particles 760 | # store the particles in a contacted array: particle_array_positive 761 | for i in range(number_particle): 762 | 763 | coordinate_x = coordinate[i][0] 764 | coordinate_y = coordinate[i][1] 765 | patch = np.copy(body_2d[(coordinate_y-radius):(coordinate_y+radius), (coordinate_x-radius):(coordinate_x+radius)]) 766 | #patch = DataLoader.preprocess_particle(patch, model_input_size) 767 | particle_array_positive.append(patch) 768 | # extract the negative particles 769 | # store the particles in a concated array: particle_array_negative 770 | if produce_negative: 771 | for i in range(number_particle): 772 | while True: 773 | isLegal = True 774 | coor_x = np.random.randint(radius, n_row-radius) 775 | coor_y = np.random.randint(radius, n_col-radius) 776 | for j in range(number_particle): 777 | coordinate_x = coordinate[i][0] 778 | coordinate_y = coordinate[i][1] 779 | distance = ((coor_x-coordinate_x)**2+(coor_y-coordinate_y)**2)**0.5 780 | if distance < negative_distance_ratio*particle_size_bin: 781 | isLegal = False 782 | break 783 | if isLegal: 784 | patch = np.copy(body_2d[(coor_y-radius):(coor_y+radius), (coor_x-radius):(coor_x+radius)]) 785 | #patch = DataLoader.preprocess_particle(patch, model_input_size) 786 | particle_array_negative.append(patch) 787 | break 788 | # save the extracted particles into file 789 | if produce_negative: 790 | particle = [] 791 | particle.append(particle_array_positive) 792 | particle.append(particle_array_negative) 793 | with open(output_filename, 'wb') as f: 794 | pickle.dump(particle, f) 795 | else: 796 | particle = [] 797 | particle.append(particle_array_positive) 798 | with open(output_filename, 'wb') as f: 799 | pickle.dump(particle, f) 800 | 801 | @staticmethod 802 | def load_trainData_From_ExtractedDataFile(train_inputDir, train_inputFile, model_input_size, validation_ratio, train_number): 803 | """read train_data and validation data from pre-extracted particles. 804 | 805 | Train a CNN model based on pre-extracted samples. This is the cross-molecule training strategy, through which you can get a more robust CNN model to achieve better fully automated picking results. 806 | 807 | Args: 808 | trainInputDir: the directory of the extarcted data. 809 | train_inputFile: the input extarcted data file, like 'gammas.pickle;trpv1.pickle', the separator must be ';'. 810 | model_input_size: the size of Placeholder to fit the model input, like [100, 64, 64, 1] 811 | validation_rate: divide the total samples into training dataset and validation dataset. 812 | This is the ratio of validation dataset compared to the total samples. 813 | train_number: the number of the total positive samples. If the number is set to 10000, and there are two kinds of molecule, then each one contributes only 5,000 positive samples. 
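For example, a typical call to this cross-molecule loader (a sketch; file names and numbers are only illustrative) is:
    train_data, train_labels, validation_data, validation_labels = DataLoader.load_trainData_From_ExtractedDataFile(
        '../extracted_data', 'gammas.pickle;trpv1.pickle', [100, 64, 64, 1], 0.1, 20000)
so that each of the two molecules contributes at most 10,000 positive samples.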
814 | Returns: 815 | return train_data,train_labels,validation_data,validation_labels 816 | train_data: numpy.array, np.float32, the shape is (number_samples, particle_size, particle_size, 1) 817 | train_labels: numpy.array, int64, the shape is (number_samples) 818 | validation_data: numpy.array, np.float32, the shape is (number_samples, particle_size, particle_size, 1) 819 | validation_labels: numpy.array, int64, the shape is (number_samples) 820 | 821 | Raises: 822 | None 823 | """ 824 | input_file_list = train_inputFile.split(";") 825 | # define the training number of each molecule 826 | if train_number<=0: 827 | number_each_molecule = -1 828 | else: 829 | number_each_molecule = train_number//len(input_file_list) 830 | 831 | particle_array_positive = [] 832 | particle_array_negative = [] 833 | for i in range(len(input_file_list)): 834 | input_file_name = input_file_list[i].strip() 835 | input_file_name = os.path.join(train_inputDir, input_file_name) 836 | with open(input_file_name, 'rb') as f: 837 | coordinate = pickle.load(f) 838 | if number_each_molecule <=0: 839 | number_particle = len(coordinate[0]) 840 | else: 841 | if number_each_molecule > len(coordinate[0]): 842 | number_particle = len(coordinate[0]) 843 | else: 844 | number_particle = number_each_molecule 845 | 846 | for j in range(number_particle): 847 | patch_positive = DataLoader.preprocess_particle(coordinate[0][j], model_input_size) 848 | particle_array_positive.append(patch_positive) 849 | patch_negative = DataLoader.preprocess_particle(coordinate[1][j], model_input_size) 850 | particle_array_negative.append(patch_negative) 851 | 852 | number_total_particle = len(particle_array_positive) 853 | particle_array_positive = np.array(particle_array_positive).reshape(number_total_particle, model_input_size[1], model_input_size[2], 1) 854 | particle_array_negative = np.array(particle_array_negative).reshape(number_total_particle, model_input_size[1], model_input_size[2], 1) 855 | np.random.shuffle(particle_array_positive) 856 | np.random.shuffle(particle_array_negative) 857 | 858 | validation_size = int(validation_ratio*particle_array_positive.shape[0]) 859 | train_size = particle_array_positive.shape[0] - validation_size 860 | validation_data = particle_array_positive[:validation_size, ...] 861 | validation_data = concatenate((validation_data, particle_array_negative[:validation_size, ...])) 862 | validation_labels = concatenate((ones(validation_size, dtype=int64), zeros(validation_size, dtype=int64))) 863 | 864 | train_data = particle_array_positive[validation_size:, ...] 865 | train_data = concatenate((train_data, particle_array_negative[validation_size:, ...])) 866 | train_labels = concatenate((ones(train_size, dtype=int64), zeros(train_size, dtype=int64))) 867 | print train_data.shape, train_data.dtype 868 | print train_labels.shape, train_labels.dtype 869 | print validation_data.shape, validation_data.dtype 870 | print validation_labels.shape, validation_labels.dtype 871 | return train_data, train_labels, validation_data, validation_labels 872 | 873 | @staticmethod 874 | def load_trainData_From_PrePickedResults(train_inputDir, train_inputFile, particle_size, model_input_size, validation_ratio, train_number): 875 | """read train_data and validation data from pre-picked results 876 | 877 | Train a CNN model based on pre-picked particles. Then you can pick the particles again based on the new trained model. 878 | This will improve the precision and recall of picking results. 
879 | 880 | Args: 881 | train_inputDir: the directory of mrc files 882 | train_inputFile: the file of the pre-picked results, like '/Your_pick_path/autopick_results.pickle' 883 | particle_size: particle size 884 | model_input_size: the size of Placeholder to fit the model input, like [100, 64, 64, 1] 885 | validation_ratio: divide the total samples into training dataset and validation dataset. 886 | This is the ratio of validation dataset compared to the total samples. 887 | train_number: if the value is in (0, 1), it is treated as a prediction-score threshold; if it is in (1, 100), it is treated as the percentage of top-scoring particles to keep; if it is larger than 100, it is treated as the absolute number of top-scoring particles to keep. 888 | 889 | Returns: 890 | return train_data,train_labels,validation_data,validation_labels 891 | train_data: numpy.array, np.float32, the shape is (number_samples, particle_size, particle_size, 1) 892 | train_labels: numpy.array, int64, the shape is (number_samples) 893 | validation_data: numpy.array, np.float32, the shape is (number_samples, particle_size, particle_size, 1) 894 | validation_labels: numpy.array, int64, the shape is (number_samples) 895 | 896 | Raises: 897 | None 898 | """ 899 | with open(train_inputFile, 'rb') as f: 900 | coordinate = pickle.load(f) 901 | """ 902 | coordinate: a list, its length stands for the number of picked micrograph files. 903 | Each element is a list too, which contains all coordinates from the same micrograph. 904 | The length of that list stands for the number of particles. 905 | Each element in it is a small list of length 4. 906 | The first element in the small list is the x coordinate. 907 | The second element in the small list is the y coordinate. 908 | The third element in the small list is the prediction score. 909 | The fourth element in the small list is the micrograph name.
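For example, one micrograph's entry might look like
    [[512, 1024, 0.93, 'stack_0001.mrc'], [760, 331, 0.88, 'stack_0001.mrc'], ...]
(the coordinates, scores and file name here are purely illustrative).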
910 | """ 911 | # sort all particles based on the prediction score 912 | # get the top ranked particles 913 | if train_number>1: 914 | train_number = int(train_number) 915 | particles_all = [] 916 | for i in range(len(coordinate)): 917 | for j in range(len(coordinate[i])): 918 | particles_all.append(coordinate[i][j]) 919 | 920 | # sort all particles based on prediction score in descending order 921 | particles_all = sorted(particles_all, key=itemgetter(2), reverse=True) 922 | if train_number < 100 : 923 | number_positive_samples = len(particles_all)*train_number/100 924 | else: 925 | number_positive_samples = train_number 926 | 927 | print ("number_positive_samples:",number_positive_samples) 928 | particles_train = particles_all[:number_positive_samples] 929 | 930 | # recover 'particles_train' to the formate like 'coordinate' 931 | particles_train = sorted(particles_train, key=itemgetter(3)) 932 | mrc_filename = particles_train[0][3] 933 | coordinate = [] 934 | mrc_coordinate = [] 935 | for i in range(len(particles_train)): 936 | if particles_train[i][3]==mrc_filename: 937 | mrc_coordinate.append(particles_train[i]) 938 | else: 939 | coordinate.append(mrc_coordinate) 940 | mrc_coordinate = [] 941 | mrc_filename = particles_train[i][3] 942 | mrc_coordinate.append(particles_train[i]) 943 | if i==len(particles_train)-1: 944 | coordinate.append(mrc_coordinate) 945 | 946 | # read mrc data 947 | particle_array_positive = [] 948 | particle_array_negative = [] 949 | number_total_particle = 0 950 | negative_distance_ratio = 0.5 951 | for i in range(len(coordinate)): 952 | mrc_filename = coordinate[i][0][3] 953 | #print(mrc_filename) 954 | mrc_filename = os.path.basename(mrc_filename) 955 | mrc_filename = os.path.join(train_inputDir, mrc_filename) 956 | print(mrc_filename) 957 | header,body = DataLoader.readMrcFile(mrc_filename) 958 | n_col = header[0] 959 | n_row = header[1] 960 | body_2d = np.array(body, dtype=np.float32).reshape(n_row, n_col, 1) 961 | 962 | # show the micrograph with manually picked particles 963 | # plot the circle of the particle 964 | #display.plot_circle_in_micrograph(body_2d, coordinate[key], particle_size, 'test.png') 965 | # do preprocess to the micrograph 966 | body_2d, bin_size = DataLoader.preprocess_micrograph(body_2d) 967 | # bin scale the particle size and the coordinates 968 | particle_size_bin =int(particle_size/bin_size) 969 | radius = int(particle_size_bin/2) 970 | n_col = int(n_col/bin_size) 971 | n_row = int(n_row/bin_size) 972 | for j in range(len(coordinate[i])): 973 | coordinate[i][j][0] = int(coordinate[i][j][0]/bin_size) 974 | coordinate[i][j][1] = int(coordinate[i][j][1]/bin_size) 975 | 976 | if train_number>0 and train_number<1: 977 | coordinate_positive = [] 978 | for j in range(len(coordinate[i])): 979 | if coordinate[i][j][2]>train_number: 980 | coordinate_positive.append(coordinate[i][j]) 981 | else: 982 | coordinate_positive = coordinate[i] 983 | 984 | # number of positive particles 985 | number_particle = len(coordinate_positive) 986 | number_total_particle = number_total_particle + number_particle 987 | print 'number of particles:',number_particle 988 | 989 | # extract the positive particles 990 | # store the particles in a contacted array: particle_array_positive 991 | for j in range(number_particle): 992 | coordinate_x = coordinate_positive[j][0] 993 | coordinate_y = coordinate_positive[j][1] 994 | patch = np.copy(body_2d[(coordinate_y-radius):(coordinate_y+radius), (coordinate_x-radius):(coordinate_x+radius)]) 995 | patch = 
DataLoader.preprocess_particle(patch, model_input_size) 996 | particle_array_positive.append(patch) 997 | # extract the negative particles 998 | # store the particles in a concated array: particle_array_negative 999 | for i in range(number_particle): 1000 | while True: 1001 | isLegal = True 1002 | coor_x = np.random.randint(radius, n_row-radius) 1003 | coor_y = np.random.randint(radius, n_col-radius) 1004 | for j in range(number_particle): 1005 | coordinate_x = coordinate_positive[j][0] 1006 | coordinate_y = coordinate_positive[j][1] 1007 | distance = ((coor_x-coordinate_x)**2+(coor_y-coordinate_y)**2)**0.5 1008 | if distance < negative_distance_ratio*particle_size_bin: 1009 | isLegal = False 1010 | break 1011 | if isLegal: 1012 | patch = np.copy(body_2d[(coor_y-radius):(coor_y+radius), (coor_x-radius):(coor_x+radius)]) 1013 | patch = DataLoader.preprocess_particle(patch, model_input_size) 1014 | particle_array_negative.append(patch) 1015 | break 1016 | 1017 | # reshape all the positive samples and negative samples 1018 | particle_array_positive = np.array(particle_array_positive).reshape(number_total_particle, model_input_size[1], model_input_size[2], 1) 1019 | particle_array_negative = np.array(particle_array_negative).reshape(number_total_particle, model_input_size[1], model_input_size[2], 1) 1020 | np.random.shuffle(particle_array_positive) 1021 | np.random.shuffle(particle_array_negative) 1022 | 1023 | validation_size = int(validation_ratio*particle_array_positive.shape[0]) 1024 | train_size = particle_array_positive.shape[0] - validation_size 1025 | validation_data = particle_array_positive[:validation_size, ...] 1026 | validation_data = concatenate((validation_data, particle_array_negative[:validation_size, ...])) 1027 | validation_labels = concatenate((ones(validation_size, dtype=int64), zeros(validation_size, dtype=int64))) 1028 | 1029 | train_data = particle_array_positive[validation_size:, ...] 
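# the negative patches are appended to the training half below and the labels are built as
# 1 for every positive patch and 0 for every negative patch, mirroring the validation split above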
1030 | train_data = concatenate((train_data, particle_array_negative[validation_size:, ...])) 1031 | train_labels = concatenate((ones(train_size, dtype=int64), zeros(train_size, dtype=int64))) 1032 | print train_data.shape, train_data.dtype 1033 | print train_labels.shape, train_labels.dtype 1034 | print validation_data.shape, validation_data.dtype 1035 | print validation_labels.shape, validation_labels.dtype 1036 | return train_data, train_labels, validation_data, validation_labels 1037 | 1038 | -------------------------------------------------------------------------------- /deepModel.py: -------------------------------------------------------------------------------- 1 | """ 2 | """ 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import gzip 9 | import os 10 | import re 11 | import sys 12 | import tarfile 13 | 14 | from PIL import Image 15 | from six.moves import urllib 16 | import tensorflow as tf 17 | import numpy as np 18 | 19 | # image data constants information 20 | class DeepModel(object): 21 | """ 22 | 23 | """ 24 | def __init__(self, particle_size, model_input_size, num_class): 25 | self.particle_size = particle_size 26 | self.batch_size = model_input_size[0] 27 | self.num_col = model_input_size[1] 28 | self.num_row = model_input_size[2] 29 | self.num_channel = model_input_size[3] 30 | self.num_class = num_class 31 | 32 | def init_learning_rate(self, learning_rate = 0.01, learning_rate_decay_factor = 0.95, decay_steps = 400, staircase = True): 33 | self.learning_rate = learning_rate 34 | self.learning_rate_decay_factor = learning_rate_decay_factor 35 | self.decay_steps = decay_steps 36 | self.staircase = staircase 37 | # define a global step variable 38 | self.global_step = tf.Variable(0,trainable = False) 39 | 40 | def init_momentum(self, momentum = 0.9): 41 | self.momentum = momentum 42 | 43 | """ create variable with weight decay 44 | """ 45 | # why not using tf.Variable()... 
46 | # if the initializer is not None, then it has the same effect as tf.Variable() 47 | def __variable_with_weight_decay(self, name, shape, stddev, wd): 48 | var = tf.get_variable(name, shape, 49 | initializer = tf.truncated_normal_initializer(stddev=stddev, seed = 1234)) 50 | if wd is not None: 51 | weight_decay = tf.mul(tf.nn.l2_loss(var), wd, name='weight_loss') 52 | tf.add_to_collection('losses',weight_decay) 53 | return var 54 | 55 | def __inference(self, data, train=True): 56 | """ build cnn model, 57 | input : data 58 | return : predictions 59 | """ 60 | conv1 = tf.nn.conv2d(data, self.kernel1, strides=[1, 1, 1, 1], padding='VALID') 61 | relu1 = tf.nn.relu(tf.nn.bias_add(conv1, self.biases1)) 62 | pool1 = tf.nn.max_pool(relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID') 63 | 64 | conv2 = tf.nn.conv2d(pool1, self.kernel2, strides=[1, 1, 1, 1], padding='VALID') 65 | relu2 = tf.nn.relu(tf.nn.bias_add(conv2, self.biases2)) 66 | pool2 = tf.nn.max_pool(relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID') 67 | 68 | conv3 = tf.nn.conv2d(pool2, self.kernel3, strides=[1, 1, 1, 1], padding='VALID') 69 | relu3 = tf.nn.relu(tf.nn.bias_add(conv3, self.biases3)) 70 | pool3 = tf.nn.max_pool(relu3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID') 71 | 72 | conv4 = tf.nn.conv2d(pool3, self.kernel4, strides=[1, 1, 1, 1], padding='VALID') 73 | relu4 = tf.nn.relu(tf.nn.bias_add(conv4, self.biases4)) 74 | pool4 = tf.nn.max_pool(relu4, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID') 75 | 76 | hidden = tf.reshape(pool4, [self.batch_size, -1]) 77 | #print(hidden.get_shape()) 78 | if train: 79 | hidden = tf.nn.dropout(hidden, 0.5, seed=6543) 80 | 81 | fc1 = tf.nn.relu(tf.matmul(hidden, self.weights_fc1) + self.biases_fc1) 82 | sotfmax = tf.add(tf.matmul(fc1, self.weights_fc2), self.biases_fc2) 83 | return (sotfmax) 84 | 85 | def __loss(self, logits): 86 | """compute loss with prediction and label, also will acount for L2 loss 87 | input : prediction, label 88 | output : loss 89 | """ 90 | cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( 91 | logits, self.train_label_node, name = 'cross_entropy_all') 92 | cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') 93 | tf.add_to_collection('losses', cross_entropy_mean) 94 | all_loss = tf.add_n(tf.get_collection('losses'), name='all_loss') 95 | return all_loss 96 | 97 | def __preprocess_particle(self, batch_data): 98 | # scale the image to the model input size 99 | #batch_data = tf.image.resize_images(batch_data, self.num_col, self.num_row) 100 | # get the scale tensor shape 101 | batch_data_shape = batch_data.get_shape().as_list() 102 | # uppack the tensor into sub-tensor 103 | batch_data_list = tf.unpack(batch_data) 104 | for i in xrange(batch_data_shape[0]): 105 | # Pass image tensor object to a PIL image 106 | image = Image.fromarray(batch_data_list[i].eval()) 107 | # Use PIL or other library of the sort to rotate 108 | random_degree = random.randint(0, 359) 109 | rotated = Image.Image.rotate(image, random_degree) 110 | # Convert rotated image back to tensor 111 | rotated_tensor = tf.convert_to_tensor(np.array(rotated)) 112 | #slice_image = tf.slice(batch_data, [i, 0, 0, 0], [1, -1, -1, -1]) 113 | #slice_image_reshape = tf.reshape(slice_image, [batch_data_shape[1], batch_data_shape[2], batch_data_shape[3]]) 114 | #distorted_image = tf.image.random_flip_up_down(batch_data_list[i], seed = 1234) 115 | #distorted_image = tf.image.random_flip_left_right(distorted_image, seed = 1234) 
116 | #distorted_image = tf.image.random_brightness(distorted_image, max_delta=63) 117 | #distorted_image = tf.image.random_contrast(distorted_image, lower=0.2, upper=1.8) 118 | # Subtract off the mean and divide by the variance of the pixels. 119 | distorted_image = tf.image.per_image_whitening(rotated_tensor) 120 | batch_data_list[i] = distorted_image 121 | # pack the list of tensor into one tensor 122 | batch_data = tf.pack(batch_data_list) 123 | return batch_data 124 | 125 | def init_model_graph_train(self): 126 | self.kernel1 = self.__variable_with_weight_decay('weights1', shape=[9, 9, 1, 8], stddev=0.05, wd = 0.0) 127 | self.biases1 = tf.get_variable('biases1', [8], initializer=tf.constant_initializer(0.0)) 128 | 129 | self.kernel2 = self.__variable_with_weight_decay('weights2', shape=[5, 5, 8, 16], stddev=0.05, wd = 0.0) 130 | self.biases2 = tf.get_variable('biases2', [16], initializer=tf.constant_initializer(0.0)) 131 | 132 | self.kernel3 = self.__variable_with_weight_decay('weights3', shape=[3, 3, 16, 32], stddev=0.05, wd = 0.0) 133 | self.biases3 = tf.get_variable('biases3', [32], initializer=tf.constant_initializer(0.0)) 134 | 135 | self.kernel4 = self.__variable_with_weight_decay('weights4', shape=[2, 2, 32, 64], stddev=0.05, wd = 0.0) 136 | self.biases4 = tf.get_variable('biases4', [64], initializer=tf.constant_initializer(0.0)) 137 | 138 | dim = 64*2*2 139 | self.weights_fc1 = self.__variable_with_weight_decay('weightsf1', shape=[dim, 128], stddev=0.05, wd=0.0005) 140 | self.biases_fc1 = tf.get_variable('biasesf1', [128], initializer=tf.constant_initializer(0.0)) 141 | 142 | self.weights_fc2 = self.__variable_with_weight_decay('weightsf2', shape=[128, self.num_class], stddev=0.05, wd=0.0005) 143 | self.biases_fc2 = tf.get_variable('biasesf2', [self.num_class], initializer=tf.constant_initializer(0.0)) 144 | 145 | 146 | # define the holder for training procedure 147 | self.train_data_node = tf.placeholder( tf.float32, 148 | shape=(self.batch_size, self.num_col, self.num_row, self.num_channel)) 149 | self.train_label_node = tf.placeholder(tf.int64, shape=(self.batch_size,)) 150 | self.eval_data_node = tf.placeholder( tf.float32, 151 | shape=(self.batch_size, self.num_col, self.num_col, self.num_channel)) 152 | # preprocess to the train data 153 | #train_data_node_process = self.__preprocess_particle(self.train_data_node) 154 | #eval_data_node_process = self.__preprocess_particle(self.eval_data_node) 155 | 156 | # define the training procedure 157 | # the value is not processed by softmax function. 
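# (with the model_input_size of [100, 64, 64, 1] used by train.py, the inference graph is:
#  64x64x1 input -> 9x9 conv, 8 maps -> 2x2 max-pool -> 5x5 conv, 16 maps -> 2x2 max-pool
#  -> 3x3 conv, 32 maps -> 2x2 max-pool -> 2x2 conv, 64 maps -> 2x2 max-pool
#  -> fully-connected 128 -> fully-connected num_class; all convolutions and pools use VALID
#  padding, so the last feature map is 2x2x64, which is where dim = 64*2*2 above comes from)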
158 | logits = self.__inference(self.train_data_node, train=True) 159 | # define the loss computation process and prediction computation process 160 | self.train_prediction_operation = tf.nn.softmax(logits) 161 | self.loss_operation = self.__loss(logits) 162 | # define the learning rate decay during training 163 | self.learningRate_operation = tf.train.exponential_decay(self.learning_rate, 164 | self.global_step, 165 | self.decay_steps, 166 | self.learning_rate_decay_factor, staircase=self.staircase) 167 | # define the Optimizer 168 | self.optimizer_operation = tf.train.MomentumOptimizer(self.learningRate_operation, self.momentum).minimize(self.loss_operation, 169 | global_step = self.global_step) 170 | 171 | # define the evaluation procedure 172 | evaluation_logits = self.__inference(self.eval_data_node, train=False) 173 | self.evaluation_prediction_operation = tf.nn.softmax(evaluation_logits) 174 | 175 | def init_model_graph_evaluate(self): 176 | self.kernel1 = self.__variable_with_weight_decay('weights1', shape=[9, 9, 1, 8], stddev=0.05, wd = 0.0) 177 | self.biases1 = tf.get_variable('biases1', [8], initializer=tf.constant_initializer(0.0)) 178 | 179 | self.kernel2 = self.__variable_with_weight_decay('weights2', shape=[5, 5, 8, 16], stddev=0.05, wd = 0.0) 180 | self.biases2 = tf.get_variable('biases2', [16], initializer=tf.constant_initializer(0.0)) 181 | 182 | self.kernel3 = self.__variable_with_weight_decay('weights3', shape=[3, 3, 16, 32], stddev=0.05, wd = 0.0) 183 | self.biases3 = tf.get_variable('biases3', [32], initializer=tf.constant_initializer(0.0)) 184 | 185 | self.kernel4 = self.__variable_with_weight_decay('weights4', shape=[2, 2, 32, 64], stddev=0.05, wd = 0.0) 186 | self.biases4 = tf.get_variable('biases4', [64], initializer=tf.constant_initializer(0.0)) 187 | 188 | dim = 64*2*2 189 | self.weights_fc1 = self.__variable_with_weight_decay('weightsf1', shape=[dim, 128], stddev=0.05, wd=0.0005) 190 | self.biases_fc1 = tf.get_variable('biasesf1', [128], initializer=tf.constant_initializer(0.0)) 191 | 192 | self.weights_fc2 = self.__variable_with_weight_decay('weightsf2', shape=[128, self.num_class], stddev=0.05, wd=0.0005) 193 | self.biases_fc2 = tf.get_variable('biasesf2', [self.num_class], initializer=tf.constant_initializer(0.0)) 194 | 195 | 196 | self.eval_data_node = tf.placeholder( tf.float32, 197 | shape=(self.batch_size, self.num_col, self.num_col, self.num_channel)) 198 | # define the evaluation procedure 199 | evaluation_logits = self.__inference(self.eval_data_node, train=False) 200 | self.evaluation_prediction_operation = tf.nn.softmax(evaluation_logits) 201 | 202 | def evaluation(self, data, sess): 203 | size = data.shape[0] 204 | predictions = np.ndarray(shape=(size, self.num_class), dtype=np.float32) 205 | for begin in xrange(0, size, self.batch_size): 206 | end = begin + self.batch_size 207 | if end <= size: 208 | batch_data = data[begin:end, ...] 209 | predictions[begin:end, :] = sess.run( 210 | self.evaluation_prediction_operation, 211 | feed_dict={self.eval_data_node: batch_data}) 212 | else: 213 | batch_data = data[-self.batch_size:, ...] 
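# the tail of the data (fewer than batch_size samples) is evaluated by re-running the last
# batch_size samples and keeping only the trailing rows via the negative index below, so every
# sample still occupies exactly one row of 'predictions'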
214 | batch_predictions = sess.run( 215 | self.evaluation_prediction_operation, 216 | feed_dict={self.eval_data_node: batch_data}) 217 | predictions[begin:, :] = batch_predictions[begin - size:, :] 218 | return predictions 219 | 220 | def train_batch(self, batch_data, batch_label, sess): 221 | # do the computation 222 | feed_dict = {self.train_data_node: batch_data, self.train_label_node: batch_label} 223 | _, loss_value, learning_rate, prediction = sess.run( 224 | [self.optimizer_operation, self.loss_operation, self.learningRate_operation, self.train_prediction_operation], 225 | feed_dict=feed_dict) 226 | return loss_value, learning_rate, prediction 227 | 228 | -------------------------------------------------------------------------------- /display.py: -------------------------------------------------------------------------------- 1 | from matplotlib.patches import Ellipse, Circle 2 | import matplotlib.pyplot as plt 3 | import scipy.misc 4 | 5 | def plot_circle_in_micrograph(micrograph_2d, coordinate, particle_size, filename, color = 'white'): 6 | """plot the particle circle in micrograph image 7 | 8 | Based on the coordinate of particle, plot circles of the particles in the micrograph. 9 | And save the ploted image in filename. 10 | 11 | Args: 12 | micrograph_2d: numpy.array,it is a 2D numpy array. 13 | coordinate: list, it is a 2D list, the shape is (num_particle, 2). 14 | particle_size: int, the value of the particle size 15 | filename: the filename of the image to be save. 16 | color: define the color of the circle 17 | 18 | Raises: 19 | pass 20 | """ 21 | micrograph_2d = micrograph_2d.reshape(micrograph_2d.shape[0], micrograph_2d.shape[1]) 22 | fig = plt.figure() 23 | ax = fig.add_subplot(111) 24 | plt.axis('off') 25 | plt.gray() 26 | plt.imshow(micrograph_2d) 27 | radius = particle_size/2 28 | i = 0 29 | while True: 30 | if i >= len(coordinate): 31 | break 32 | coordinate_x = coordinate[i][0] 33 | coordinate_y = coordinate[i][1] 34 | cir1 = Circle(xy = (coordinate_x, coordinate_y), radius = radius, alpha = 0.5, color = color, fill = False) 35 | ax.add_patch(cir1) 36 | # extract the particles 37 | i = i + 1 38 | plt.savefig(filename) 39 | 40 | def save_image(image_2d, filename): 41 | scipy.misc.imsave(filename, image_2d) 42 | 43 | def show_particle(numpy_array, filename): 44 | numpy_array_small = numpy_array[:100, ...] 
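# keep only the first 100 particles so that they exactly fill the 10 x 10 grid of subplots drawn below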
45 | numpy_array_small = numpy_array_small.reshape(numpy_array_small.shape[0], numpy_array_small.shape[1], numpy_array_small.shape[2]) 46 | plt.figure(1) 47 | index = 1 48 | for i in range(10): 49 | for j in range(10): 50 | plt.subplot(10, 10, index) 51 | plt.gray() 52 | plt.imshow(numpy_array_small[index-1]) 53 | plt.axis('off') 54 | index = index + 1 55 | plt.subplots_adjust(wspace=0.01, hspace=0.01, top=0.99, bottom=0.01, left=0.01, right=0.99) 56 | plt.savefig(filename) 57 | 58 | 59 | -------------------------------------------------------------------------------- /extractData.py: -------------------------------------------------------------------------------- 1 | from dataLoader import DataLoader 2 | 3 | import os 4 | from optparse import OptionParser 5 | 6 | def extractData(): 7 | parser = OptionParser() 8 | parser.add_option("--inputDir", dest="inputDir", help="Input directory", metavar="DIRECTORY") 9 | parser.add_option("--mrc_number", dest="mrc_number", help="Number of mrc files to be trained.", metavar="VALUE", default=-1) 10 | parser.add_option("--coordinate_symbol", dest="coordinate_symbol", help="The symbol of the coordinate file, like '_manualPick'", metavar="STRING") 11 | parser.add_option("--particle_size", dest="particle_size", help="the size of the particle.", metavar="VALUE", default=-1) 12 | parser.add_option("--save_dir", dest="save_dir", help="save the training samples to this directory", metavar="DIRECTORY", default="../trained_model") 13 | parser.add_option("--save_file", dest="save_file", help="save the training samples to file", metavar="FILE") 14 | (opt, args) = parser.parse_args() 15 | 16 | inputDir = opt.inputDir 17 | particle_size = int(opt.particle_size) 18 | coordinate_symbol = opt.coordinate_symbol 19 | mrc_number = int(opt.mrc_number) 20 | output_dir = opt.save_dir 21 | output_filename = opt.save_file 22 | if not os.path.isdir(output_dir): 23 | os.mkdir(output_dir) 24 | 25 | if particle_size == -1: 26 | print("particle size should be a positive value!") 27 | return 28 | 29 | output_filename = os.path.join(output_dir, output_filename) 30 | DataLoader.extractData(inputDir, particle_size, coordinate_symbol, mrc_number, output_filename) 31 | 32 | def main(argv=None): 33 | extractData() 34 | 35 | if __name__ == '__main__': 36 | main() 37 | -------------------------------------------------------------------------------- /starReader.py: -------------------------------------------------------------------------------- 1 | class BaseData: 2 | def __init__(self,name): 3 | self.name = name 4 | self.nameList = [] 5 | self.dictionary = {} 6 | 7 | def getName(self): 8 | return self.name 9 | def getAllName(self): 10 | return self.nameList 11 | 12 | def getByName(self,name): 13 | return self.dictionary[name] 14 | 15 | def getLabelByIndex(self,index): 16 | return self.nameList[index] 17 | 18 | def appendNameList(self,name): 19 | self.nameList.append(name); 20 | 21 | def appendDictionary(self,name,value): 22 | self.dictionary[name]=value 23 | 24 | def updateDictionary(self,row): 25 | for i in range(len(row)): 26 | self.dictionary[self.nameList[i]].append(row[i]) 27 | 28 | class starRead(BaseData): 29 | def __init__(self,name): 30 | BaseData.__init__(self,name) 31 | self.filePointer = open(name) 32 | self.readFile() 33 | 34 | def readFile(self): 35 | readlineFlag = True 36 | while True: 37 | if readlineFlag: 38 | line = self.filePointer.readline() 39 | else: 40 | readlineFlag = True 41 | 42 | if len(line) is 0: 43 | break 44 | 45 | line = line.rstrip(); 46 | #if we read an 
empty line 47 | if len(line) is 0: 48 | continue 49 | 50 | if line.startswith('data_'): 51 | #here we start a new table 52 | self.appendNameList(line) 53 | table = BaseData(line) 54 | tableName = line; 55 | #consider two type: with or without loop_ 56 | line = self.filePointer.readline(); 57 | while not (line.startswith('_rln') or line.startswith('loop_')):#eliminate empty lines 58 | line = self.filePointer.readline().rstrip() 59 | 60 | if line.startswith('loop_'): 61 | #with loop_ 62 | while not line.startswith('_rln'): 63 | line = self.filePointer.readline().rstrip() #eliminate empty lines 64 | 65 | readlineFlag = False 66 | while True: 67 | if readlineFlag: 68 | line = self.filePointer.readline() 69 | else: 70 | readlineFlag = True 71 | #a new block start or we meet the end of line 72 | if len(line) is 0 or line.startswith('data_'): 73 | readlineFlag = False 74 | break 75 | # there may be emptyline 76 | if len(line.rstrip()) is 0: 77 | continue 78 | if line.startswith('_rln'): 79 | lineList = line.split(); 80 | table.appendNameList(lineList[0]) 81 | table.appendDictionary(lineList[0],[]) 82 | else: 83 | #update a new row 84 | table.updateDictionary(line.split()) 85 | else: 86 | #without loop_ 87 | readlineFlag = False 88 | while True: 89 | if readlineFlag: 90 | line = self.filePointer.readline(); 91 | else: 92 | readlineFlag = True 93 | #end of file or a new data block 94 | if line is '' or line.startswith('data_'): 95 | readlineFlag = False 96 | break 97 | # there may be emptyline 98 | if line.rstrip() is '': 99 | continue 100 | if line.startswith('_rln'): 101 | lineList = line.split(); 102 | table.appendNameList(lineList[0]) 103 | table.appendDictionary(lineList[0],lineList[1]) 104 | self.appendDictionary(tableName,table) 105 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from datetime import datetime 6 | import os.path 7 | import time 8 | 9 | import numpy as np 10 | from six.moves import xrange # pylint: disable=redefined-builtin 11 | import tensorflow as tf 12 | from optparse import OptionParser 13 | 14 | from deepModel import DeepModel 15 | from dataLoader import DataLoader 16 | import display 17 | 18 | def shuffle_in_unison_inplace(a, b): 19 | assert len(a) == len(b) 20 | p = np.random.permutation(len(a)) 21 | return a[p], b[p] 22 | 23 | def error_rate(prediction, label): 24 | """Return the error rate based on dense predictions and sparse labels.""" 25 | return 100.0 - (100.0 * np.sum(np.argmax(prediction, 1) == label) / prediction.shape[0]) 26 | 27 | def train(): 28 | parser = OptionParser() 29 | parser.add_option("--train_inputDir", dest="train_inputDir", help="Input directory", metavar="DIRECTORY") 30 | parser.add_option("--train_inputFile", dest="train_inputFile", help="Input file", metavar="FILE") 31 | parser.add_option("--train_type", dest="train_type", help="Training type, 1|2|3|4.", metavar="VALUE", default=2) 32 | parser.add_option("--particle_number", dest="train_number", help="Number of positive samples to train.", metavar="VALUE", default=-1) 33 | parser.add_option("--mrc_number", dest="mrc_number", help="Number of mrc files to be trained.", metavar="VALUE", default=-1) 34 | parser.add_option("--coordinate_symbol", dest="coordinate_symbol", help="The symbol of the coordinate file, like '_manualPick'", 
metavar="STRING") 35 | parser.add_option("--particle_size", dest="particle_size", help="the size of the particle.", metavar="VALUE", default=-1) 36 | parser.add_option("--validation_ratio", dest="validation_ratio", help="the ratio.", metavar="VALUE", default=0.1) 37 | parser.add_option("--model_retrain", action="store_true", dest="model_retrain", help="train the model using the pre-trained model as parameters initialization .", default=False) 38 | parser.add_option("--model_load_file", dest="model_load_file", help="pre-trained model", metavar="FILE") 39 | parser.add_option("--model_save_dir", dest="model_save_dir", help="save the model to this directory", metavar="DIRECTORY", default="../trained_model") 40 | parser.add_option("--model_save_file", dest="model_save_file", help="save the model to file", metavar="FILE") 41 | (opt, args) = parser.parse_args() 42 | 43 | # set the tensoflow seed 44 | tf.set_random_seed(1234) 45 | # set the numpy seed 46 | np.random.seed(1234) 47 | 48 | # define the input size of the model 49 | model_input_size = [100, 64, 64, 1] 50 | num_class = 2 # the number of the class 51 | batch_size = model_input_size[0] 52 | 53 | # define input parameters 54 | train_type = int(opt.train_type) 55 | train_inputDir = opt.train_inputDir 56 | train_inputFile = opt.train_inputFile 57 | train_number = float(opt.train_number) 58 | mrc_number = int(opt.mrc_number) 59 | coordinate_symbol = opt.coordinate_symbol 60 | debug_dir = '../train_output' # output dir 61 | particle_size = int(opt.particle_size) 62 | validation_ratio = float(opt.validation_ratio) 63 | 64 | # define the save model 65 | model_retrain = opt.model_retrain 66 | model_load_file = opt.model_load_file 67 | model_save_dir = opt.model_save_dir 68 | model_save_file = os.path.join(model_save_dir, opt.model_save_file) 69 | 70 | if not os.access(model_save_dir, os.F_OK): 71 | os.mkdir(model_save_dir) 72 | if not os.access(debug_dir, os.F_OK): 73 | os.mkdir(debug_dir) 74 | 75 | # define the learning rate decay parameters 76 | # more information about this, refer to function tf.train.exponential_decay() 77 | learning_rate = 0.01 78 | learning_rate_decay_factor = 0.95 79 | # the value will be changed base on the train_size and batch size 80 | learning_rate_decay_steps = 400 81 | learning_rate_staircase = True 82 | # momentum 83 | momentum = 0.9 84 | 85 | # load training dataset 86 | dataLoader = DataLoader() 87 | if train_type == 1: 88 | # load train data from mrc file dir 89 | train_number = int(train_number) 90 | train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_mrcFileDir(train_inputDir, particle_size, model_input_size, validation_ratio, coordinate_symbol, mrc_number, train_number) 91 | elif train_type == 2: 92 | # load train data from numpy data struct 93 | train_number = int(train_number) 94 | train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_ExtractedDataFile(train_inputDir, train_inputFile, model_input_size, validation_ratio, train_number) 95 | elif train_type == 3: 96 | # load train data from prepicked results 97 | train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_PrePickedResults(train_inputDir, train_inputFile, particle_size, model_input_size, validation_ratio, train_number) 98 | elif train_type == 4: 99 | # load train data from relion .star file 100 | train_number = int(train_number) 101 | train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_RelionStarFile(train_inputFile, particle_size, 
model_input_size, validation_ratio, train_number) 102 | else: 103 | print("ERROR: invalid value of train_type:", train_type) 104 | 105 | # display.show_particle(train_data, os.path.join(debug_dir, 'positive.png')) 106 | # test whether train_data exists 107 | try: 108 | train_data 109 | except NameError: 110 | print("ERROR: in function load.loadInputTrainData.") 111 | return None 112 | else: 113 | print("Load training data successfully!") 114 | # shuffle the training data 115 | train_data, train_label = shuffle_in_unison_inplace(train_data, train_label) 116 | eval_data, eval_label = shuffle_in_unison_inplace(eval_data, eval_label) 117 | 118 | train_size = train_data.shape[0] 119 | eval_size = eval_data.shape[0] 120 | # initialize the decay_steps based on train_size and batch size. 121 | # decay the learning rate every 10 epochs 122 | learning_rate_decay_steps = 10*(train_size // batch_size) 123 | # initialize the parameters of deepModel 124 | deepModel = DeepModel(particle_size, model_input_size, num_class) 125 | deepModel.init_learning_rate(learning_rate = learning_rate, learning_rate_decay_factor = learning_rate_decay_factor, 126 | decay_steps = learning_rate_decay_steps, staircase = learning_rate_staircase) 127 | deepModel.init_momentum(momentum = momentum) 128 | # initialize the model 129 | # define the computation procedure of optimizer, loss, lr, prediction, eval_prediction 130 | deepModel.init_model_graph_train() 131 | saver = tf.train.Saver(tf.all_variables()) 132 | 133 | start_time = time.time() 134 | init = tf.initialize_all_variables() 135 | with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as sess: 136 | # initialize all the parameters 137 | sess.run(init) 138 | max_epochs = 200 # the max number of epochs to train the model 139 | best_eval_error_rate = 100 140 | toleration_patience = 10 141 | toleration_patience_flag = 0 142 | eval_frequency = train_size // batch_size # the frequency to evaluate on the validation dataset 143 | for step in xrange(int(max_epochs * train_size) // batch_size): 144 | # get the batch training data 145 | offset = (step * batch_size) % (train_size - batch_size) 146 | batch_data = train_data[offset:(offset+batch_size), ...]
147 | batch_label = train_label[offset:(offset+batch_size)] 148 | # online augmentation 149 | #batch_data = DataLoader.preprocess_particle_online(batch_data) 150 | loss_value, lr, train_prediction = deepModel.train_batch(batch_data, batch_label,sess) 151 | 152 | # do the computation 153 | if step % eval_frequency == 0: 154 | stop_time = time.time() - start_time 155 | start_time = time.time() 156 | eval_prediction = deepModel.evaluation(eval_data, sess) 157 | eval_error_rate = error_rate(eval_prediction, eval_label) 158 | print('epoch: %.2f , %.2f ms' % (step * batch_size /train_size, 1000 * stop_time / eval_frequency)) 159 | print('train loss: %.6f,\t learning rate: %.6f' % (loss_value, lr)) 160 | print('train error: %.6f%%,\t valid error: %.6f%%' % (error_rate(train_prediction, batch_label), eval_error_rate)) 161 | if eval_error_rate < best_eval_error_rate: 162 | best_eval_error_rate = eval_error_rate 163 | toleration_patience = 10 164 | else: 165 | toleration_patience = toleration_patience - 1 166 | if toleration_patience == 0: 167 | saver.save(sess, model_save_file) 168 | break 169 | 170 | 171 | def main(argv=None): 172 | train() 173 | 174 | if __name__ == '__main__': 175 | #tf.app.run() 176 | main() 177 | -------------------------------------------------------------------------------- /trained_model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model_demo_type3" 2 | all_model_checkpoint_paths: "model_demo_type3" 3 | -------------------------------------------------------------------------------- /trained_model/model_demo_type3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nejyeah/DeepPicker-python/3f46c8b0ffe2dbaa837fd9399b4a542588e991e6/trained_model/model_demo_type3 -------------------------------------------------------------------------------- /trained_model/model_demo_type3.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nejyeah/DeepPicker-python/3f46c8b0ffe2dbaa837fd9399b4a542588e991e6/trained_model/model_demo_type3.meta --------------------------------------------------------------------------------