├── .gitignore
├── CRISPRidentify.py
├── LICENSE
├── README.md
├── TestFolderMultiline
│   ├── MultilineFasta.fasta
│   └── MultilineFasta_1.fasta
├── TestInput
│   ├── NC_006513.fa
│   ├── NC_013216.fa
│   ├── NC_014152.fa
│   ├── NC_016625.fa
│   ├── NC_017040.1.fasta
│   ├── NC_018524.fa
│   └── NC_019693.fa
├── TestInputMultiline
│   └── MultilineFasta.fasta
├── components
│   ├── __init__.py
│   ├── components_detection.py
│   ├── components_detection_refinement.py
│   ├── components_eden.py
│   ├── components_evaluated_arrays_enhancement.py
│   ├── components_evaluation.py
│   ├── components_helpers.py
│   ├── components_ml.py
│   ├── components_non_array_computations.py
│   ├── components_output_maker.py
│   ├── module_detection.py
│   ├── module_detection_refinement.py
│   ├── module_evaluated_arrays_enhancement.py
│   ├── module_evaluation.py
│   ├── module_non_array_computations.py
│   ├── module_output_maker.py
│   └── pipeline.py
├── environment.yml
├── tools
│   ├── CRISPRcasIdentifier
│   │   └── README.txt
│   ├── blasting
│   │   ├── Verified_repeats_dataset1.fa
│   │   ├── Verified_repeats_dataset1.fa.nhr
│   │   ├── Verified_repeats_dataset1.fa.nin
│   │   ├── Verified_repeats_dataset1.fa.nog
│   │   ├── Verified_repeats_dataset1.fa.nsd
│   │   ├── Verified_repeats_dataset1.fa.nsi
│   │   ├── Verified_repeats_dataset1.fa.nsq
│   │   ├── Verified_repeats_dataset2.fa
│   │   ├── Verified_repeats_dataset2.fa.nhr
│   │   ├── Verified_repeats_dataset2.fa.nin
│   │   ├── Verified_repeats_dataset2.fa.nog
│   │   ├── Verified_repeats_dataset2.fa.nsd
│   │   ├── Verified_repeats_dataset2.fa.nsi
│   │   └── Verified_repeats_dataset2.fa.nsq
│   └── strand_prediction
│       └── CRISPRstrand
│           ├── CRISPRstrand.py
│           ├── CRISPRstrand.yml
│           ├── Example
│           │   ├── Input.fa
│           │   ├── Input.txt
│           │   ├── Input3.fa
│           │   ├── Input4.txt
│           │   └── Input5.fa
│           ├── Models
│           │   └── model_r.h5
│           ├── Results
│           │   └── CRISPRstrand_Summary.tsv
│           ├── cmd.txt
│           ├── convNets.py
│           ├── evaluate.py
│           ├── execute_strand.py
│           ├── preprocessing.py
│           └── utils.py
└── trained_models
    ├── eden
    │   ├── eden_ab_vs_n
    │   ├── eden_archaea
    │   ├── eden_bacteria
    │   ├── eden_merged
    │   └── eden_merged_with_neg
    └── extra_trees
        ├── extra_trees_ab_vs_n.pkl
        ├── extra_trees_archaea.pkl
        ├── extra_trees_bacteria.pkl
        ├── extra_trees_merged.pkl
        ├── extra_trees_merged_with_neg.pkl
        ├── extra_trees_subset.pkl
        ├── extra_trees_subset10features.pkl
        ├── extra_trees_subset8features.pkl
        └── extra_trees_subset9features.pkl
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .idea
3 | /tools/CRISPRcasIdentifier/CRISPRcasIdentifier/
4 |
--------------------------------------------------------------------------------
/CRISPRidentify.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import math
3 | import shutil
4 | import warnings
5 | import os
6 |
7 | from pathlib import Path
8 | from os import listdir
9 | from os.path import isfile, join
10 | from time import time
11 |
12 | from components.pipeline import Pipeline
13 | from components.components_ml import ClassifierWrapper
14 | from components.components_output_maker import CompleteFastaOutputMaker
15 | from components.components_output_maker import CompleteFolderSummaryMaker
16 | from components.components_output_maker import CompleteCasSummaryFolderMaker
17 | from components.components_output_maker import CompleteJsonOutputMaker
18 | from components.components_output_maker import CompleteSpacerCSVMaker
19 | from components.components_helpers import multiline_fasta_check, multiline_fasta_handle, multiline_fasta_handle_python
20 | from components.components_helpers import folder_of_multifasta_handle
21 |
22 | warnings.filterwarnings("ignore")
23 | warnings.simplefilter(action='ignore', category=FutureWarning)
24 |
25 | FLAG_DEVELOPER_MODE = False
26 |
27 | parser = argparse.ArgumentParser(description='Run Identifier')
28 | parser.add_argument('--input_folder', type=str, default=None,
29 | help='input folder (default: None)')
30 |
31 | parser.add_argument('--file', type=str, default=None,
32 | help='input file (default: None)')
33 |
34 | parser.add_argument('--input_folder_multifasta', type=str, default=None,
35 | help='input folder of multifasta (default: None)')
36 |
37 | parser.add_argument('--model', type=str, default="ALL",
38 | help='model_to_use (default: ALL)')
39 |
40 | parser.add_argument('--additional_model', type=str, default=None,
41 | help='model_to_use (default: None)')
42 |
43 | parser.add_argument('--result_folder', type=str, default="Results",
44 | help='folder with the result (default: Results)')
45 |
46 | parser.add_argument('--pickle_report', type=str, default='',
47 | help='pickled report file (default: None)')
48 |
49 | parser.add_argument('--json_report', type=str, default='',
50 | help='json report file (default: None)')
51 |
52 | parser.add_argument('--fasta_report', type=str, default=False,
53 | help='fasta report file (default: False)')
54 |
55 | parser.add_argument('--strand', type=str, default=True,
56 | help='CRISPR array orientation prediction (default: True)')
57 |
58 | parser.add_argument('--cas', type=str, default=False,
59 | help='cas genes computation (default: False)')
60 |
61 | parser.add_argument('--is_element', type=str, default=True,
62 | help='is element computation (default: True)')
63 |
64 | parser.add_argument('--parallel', type=str, default=True,
65 | help='parallel computations (default: True)')
66 |
67 | parser.add_argument('--cpu', type=str, default="ALL",
68 |                     help='number of CPUs to use (default: ALL)')
69 |
70 | parser.add_argument('--fast_run', type=str, default=False,
71 | help='fast run option (default: False)')
72 |
73 | parser.add_argument('--degenerated', type=str, default=True,
74 |                     help='degenerated_repeat_computation (default: True)')
75 |
76 | parser.add_argument('--min_len_rep', type=int, default=21,
77 | help='min avg. length of the repeats (default: 21)')
78 |
79 | parser.add_argument('--max_len_rep', type=int, default=55,
80 | help='max avg. length of the repeats (default: 55)')
81 |
82 | parser.add_argument('--min_len_spacer', type=int, default=18,
83 | help='min avg. length of spacers (default: 18)')
84 |
85 | parser.add_argument('--max_len_spacer', type=int, default=78,
86 | help='max avg. length of spacers (default: 78)')
87 |
88 | parser.add_argument('--min_repeats', type=int, default=3,
89 | help='min number of repeats (default: 3)')
90 |
91 | parser.add_argument('--enhancement_max_min', type=str, default=True,
92 |                     help='enhancement with filter (default: True)')
93 |
94 | parser.add_argument('--enhancement_start_end', type=str, default=True,
95 |                     help='enhancement with start end omitting (default: True)')
96 |
97 | parser.add_argument('--max_identical_spacers', type=int, default=4,
98 | help='maximum number of identical spacers in the array (default: 4)')
99 |
100 | parser.add_argument('--max_identical_cluster_spacers', type=int, default=3,
101 | help='maximum number of consecutive identical spacers in the array (default: 3)')
102 |
103 | parser.add_argument('--margin_degenerated', type=int, default=30,
104 | help='maximum length of the spacer margin for the degenerated search (default: 30)', )
105 |
106 | parser.add_argument('--max_edit_distance_enhanced', type=int, default=6,
107 | help='maximum edit distance for the evaluated array enhancement (default: 6)')
108 |
109 |
110 | script_absolute_path = os.path.dirname(os.path.abspath(__file__))
111 | work_directory = os.getcwd()
112 | pid = os.getpid()
113 |
114 | args = parser.parse_args()
115 |
116 | complete_path_folder = args.input_folder
117 | if complete_path_folder:
118 | complete_path_folder = Path(complete_path_folder).absolute()
119 |
120 | complete_path_file = args.file
121 | if complete_path_file:
122 | complete_path_file = Path(complete_path_file).absolute()
123 |
124 | complete_folder_multifasta = args.input_folder_multifasta
125 | if complete_folder_multifasta:
126 | complete_folder_multifasta = Path(complete_folder_multifasta).absolute()
127 |
128 | folder_result = args.result_folder
129 | if folder_result:
130 | folder_result = Path(folder_result).absolute()
131 |
132 | pickle_folder = args.pickle_report
133 | if pickle_folder:
134 | pickle_folder = Path(pickle_folder).absolute()
135 |
136 | json_folder = args.json_report
137 | if json_folder:
138 | json_folder = Path(json_folder).absolute()
139 |
140 | list_models = ["8", "9", "10"] if args.model == "ALL" else [args.model]
141 | flag_possible_differentiate_model = args.additional_model
142 | if flag_possible_differentiate_model not in ["possible", "all"]:
143 | flag_possible_differentiate_model = None
144 |
145 |
146 | flag_enhancement_max_min = False if (args.enhancement_max_min in ["False", False]) else True
147 | flag_enhancement_start_end = False if (args.enhancement_start_end in ["False", False]) else True
148 |
149 | flag_parallel = False if (args.parallel in ["False", False]) else True
150 | flag_cpu = args.cpu
151 | flag_fast_run = False if (args.fast_run in ["False", False]) else True
152 |
153 | strand_flag = False if (args.strand in ["False", False]) else True
154 | cas_flag = False if (args.cas in ["False", False]) else True
155 | is_flag = False if (args.is_element in ["False", False]) else True
156 | degenerated_flag = False if (args.degenerated in ["False", False]) else True
157 | fasta_report = False if (args.fasta_report in ["False", False]) else True
158 |
159 | flags = {"flag_parallel": flag_parallel,
160 | "flag_cpu": flag_cpu,
161 | "flag_fast_run": flag_fast_run,
162 | "flag_strand": strand_flag,
163 | "flag_cas": cas_flag,
164 | "flag_is": is_flag,
165 | "flag_fasta_report": fasta_report,
166 | "flag_degenerated": degenerated_flag,
167 | "flag_enhancement_min_max": flag_enhancement_max_min,
168 | "flag_enhancement_start_end": flag_enhancement_start_end
169 | }
170 |
171 | min_rep = args.min_len_rep
172 | max_rep = args.max_len_rep
173 | max_spacer = args.max_len_spacer
174 | min_spacer = args.min_len_spacer
175 | min_repeats = args.min_repeats
176 | max_identical_spacers = args.max_identical_spacers
177 | max_identical_cluster_spacers = args.max_identical_cluster_spacers
178 | margin_degenerated = args.margin_degenerated
179 | max_edit_distance_enhancement = args.max_edit_distance_enhanced
180 |
181 | parameters = {
182 | "param_min_avg_repeat_length": min_rep,
183 | "param_max_avg_repeat_length": max_rep,
184 | "param_max_avg_spacer_length": max_spacer,
185 | "param_min_avg_spacer_length": min_spacer,
186 | "param_min_repeats": min_repeats,
187 | "param_max_identical_spacers": max_identical_spacers,
188 | "param_max_identical_cluster_spacers": max_identical_cluster_spacers,
189 | "param_spacer_margin_degenerated_search": margin_degenerated,
190 | "param_max_edit_distance": max_edit_distance_enhancement
191 | }
192 |
193 |
194 | ALL_FEATURES = ['repeat_len', 'number_repeats', 'repeat_similarity',
195 | 'at_richness', 'avg_spacer_len', 'spacer_similarity',
196 | 'number_mismatches', 'spacer_evenness', 'mfe_score',
197 | 'orf_score', 'hmmr_score', 'blast_score_1', 'blast_score_2',
198 | 'eden_score']
199 |
200 | best_combinations = {
201 | "8": (2, 4, 5, 6, 7, 8, 9, 11),
202 | "9": (1, 2, 4, 5, 7, 8, 9, 10, 12),
203 | "10": (0, 2, 3, 4, 5, 6, 7, 10, 11, 12)
204 | }
205 |
206 |
207 | pid_work_directory = os.path.join(work_directory, 'Identify_Temp' + str(pid))
208 | if not os.path.exists(pid_work_directory):
209 | os.makedirs(pid_work_directory)
210 | os.chdir(pid_work_directory)
211 |
212 |
213 | feature_list = ['.'.join([ALL_FEATURES[i] for i in best_combinations[model]]) for model in list_models]
214 | list_ml_classifiers = [ClassifierWrapper(classifier_type=None,
215 | load_option=script_absolute_path + "/trained_models/extra_trees/extra_trees_subset{}features.pkl".
216 | format(model))
217 | for model in list_models]
218 |
219 |
220 | def run_over_folder_of_files(folder, result_folder, pickle_folder, json_folder, chunk_number=None, number_of_chunks=None):
221 | files = [f for f in listdir(folder) if isfile(join(folder, f))]
222 | files_name_fix = [f.replace("\r", "").replace("\t", "").replace("\n", "") for f in files]
223 | for old_name, new_name in zip(files, files_name_fix):
224 | old_path = join(folder, old_name)
225 | new_path = join(folder, new_name)
226 | if old_path != new_path:
227 |             os.rename(old_path, new_path)  # rename directly instead of shelling out to mv
228 | files = sorted(files_name_fix)
229 |
230 | if number_of_chunks:
231 | chunk_size = math.ceil(len(files) / number_of_chunks)
232 | chunk_start = (chunk_number - 1) * chunk_size
233 | chunk_end = chunk_number * chunk_size
234 | chunk = files[chunk_start:chunk_end]
235 | print(chunk_start)
236 | print(chunk_end)
237 | else:
238 | chunk = files
239 |
240 | for index, file in enumerate(chunk, 1):
241 | print("\n\n\n\t\t\t\tExecuting file {} out of {} ({})\n\n\n".format(index, len(chunk), file))
242 | pl = Pipeline(result_folder_path="{}/".format(result_folder),
243 | pickle_folder_path="{}".format(pickle_folder),
244 | json_folder_path="{}".format(json_folder),
245 | file_path=join(folder, file),
246 | list_ml_classifiers=list_ml_classifiers,
247 | list_features=feature_list,
248 | parameters=parameters,
249 | flags=flags,
250 | flag_dev_mode=FLAG_DEVELOPER_MODE,
251 | absolute_directory_path=script_absolute_path)
252 |
253 | cfsm = CompleteFolderSummaryMaker(folder_result=result_folder)
254 | ccfsm = CompleteCasSummaryFolderMaker(folder_result=result_folder)
255 | cfom = CompleteFastaOutputMaker(folder_result=result_folder)
256 | if cas_flag:
257 | cs_csv = CompleteSpacerCSVMaker(folder_result=result_folder)
258 | if json_folder:
259 | cjsm = CompleteJsonOutputMaker(folder_json_result=json_folder, folder_text_tesult=result_folder)
260 |
261 |
262 | def run_over_one_file(file, result_folder, pickle_folder, json_folder):
263 | print("\n\n\n\t\t\t\tExecuting file {}\n\n\n".format(file))
264 | pl = Pipeline(result_folder_path="{}/".format(result_folder),
265 | pickle_folder_path="{}".format(pickle_folder),
266 | json_folder_path="{}".format(json_folder),
267 | file_path=join(file),
268 | list_ml_classifiers=list_ml_classifiers,
269 | list_features=feature_list,
270 | parameters=parameters,
271 | flags=flags,
272 | flag_dev_mode=FLAG_DEVELOPER_MODE,
273 | absolute_directory_path=script_absolute_path)
274 |
275 | cfsm = CompleteFolderSummaryMaker(folder_result=result_folder)
276 | ccfsm = CompleteCasSummaryFolderMaker(folder_result=result_folder)
277 | cfom = CompleteFastaOutputMaker(folder_result=result_folder)
278 | if cas_flag:
279 | cs_csv = CompleteSpacerCSVMaker(folder_result=result_folder)
280 | if json_folder:
281 | cjsm = CompleteJsonOutputMaker(folder_json_result=json_folder, folder_text_tesult=result_folder)
282 |
283 |
284 |
285 | def main():
286 | start_time = time()
287 | if complete_path_file:
288 | folder_multifasta = multiline_fasta_handle_python(complete_path_file, flag_ncbi_formatting=True)
289 | print(folder_multifasta)
290 | run_over_folder_of_files(folder_multifasta, folder_result, pickle_folder, json_folder)
291 | shutil.rmtree(folder_multifasta)
292 | elif complete_path_folder:
293 | run_over_folder_of_files(complete_path_folder, folder_result, pickle_folder, json_folder)
294 | elif complete_folder_multifasta:
295 | print("Folder Multifasta")
296 | folder_multifasta = folder_of_multifasta_handle(complete_folder_multifasta)
297 | run_over_folder_of_files(folder_multifasta, folder_result, pickle_folder, json_folder)
298 | else:
299 | print("No input was provided")
300 |
301 | end_time = time()
302 | print("Elapsed time: ", end_time-start_time)
303 |
304 |
305 | if __name__ == "__main__":
306 | main()
307 | shutil.rmtree(pid_work_directory, ignore_errors=True)
308 |
309 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Bioinformatics Lab - Department of Computer Science - University Freiburg
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # CRISPRidentify: Identification of CRISPR arrays using machine learning approach
3 |
4 | CRISPRidentify is a tool to search for CRISPR arrays which utilises
5 | a machine learning approach for distinguishing false candidates from true CRISPR arrays.
6 | CRISPRidentify performs three steps: detection, feature extraction and
7 | classification based on manually curated sets of positive and negative examples of CRISPR arrays.
8 | The identified CRISPR arrays are then reported to the user accompanied by detailed annotation.
9 | We demonstrate that our approach identifies not only previously detected CRISPR arrays,
10 | but also CRISPR array candidates not detected by other tools. Compared to other methods,
11 | our tool has a drastically reduced false positive rate. In contrast to the existing tools,
12 | CRISPRidentify not only provides the user with the basic statistics on the identified CRISPR arrays
13 | but also produces a certainty score as an intuitive measure of the likelihood that a given
14 | genomic region is a CRISPR array.
15 |
16 | ## Getting Started
17 |
18 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.
19 |
20 | ### Prerequisites
21 |
22 | First you need to install Miniconda.
23 | Then create an environment and install the required libraries in it.
24 |
25 |
26 | ### Creating a Miniconda environment
27 |
28 | First we install Miniconda for Python 3.
29 | Miniconda can be downloaded from here:
30 |
31 | https://docs.conda.io/en/latest/miniconda.html
32 |
33 | Then Miniconda should be installed. On a Linux machine the command is similar to this one:
34 |
35 | ```
36 | bash Miniconda3-latest-Linux-x86_64.sh
37 | ```
38 |
39 | Then we create an environment. The necessary setup is provided in the "environment.yml" file.
40 |
41 | In order to install the corresponding environment, one can execute the following command:
42 |
43 | ```
44 | conda env create -f environment.yml
45 | ```
46 |
47 | We recommend installing the mamba package manager, which is a faster alternative to conda.
48 |
49 | ```
50 | conda install -c conda-forge mamba
51 | ```
52 |
53 | Then we can create the environment using mamba.
54 | ```
55 | mamba env create -f environment.yml
56 | ```
57 |
58 | We want to acknowledge Richard Stöckl @richardstoeckl for his contribution to the environment.yml file.
59 |
60 |
61 | ### Additional preparations
62 |
63 | CRISPRidentify utilizes CRISPRcasIdentifier for the detection of cas genes.
64 | If you are interested in the cas gene results, please install CRISPRcasIdentifier.
65 | 
66 | Please make sure that after you download CRISPRcasIdentifier its relative path is:
67 |
68 | ```
69 | tools/CRISPRcasIdentifier/CRISPRcasIdentifier/CRISPRcasIdentifier.py
70 | ```
71 |
72 | You can find the CRISPRcasIdentifier tool and its description [here](https://github.com/BackofenLab/CRISPRcasIdentifier)
73 |
74 | You need to perform two steps:
75 |
76 | Firstly, you need to download the CRISPRcasIdentifier tool:
77 | ```
78 | wget https://github.com/BackofenLab/CRISPRcasIdentifier/archive/v1.1.0.tar.gz
79 | tar -xzf v1.1.0.tar.gz
80 | ```
81 | Secondly, you need to download the models:
82 |
83 | Due to GitHub's file size constraints, the authors made their HMM and ML models available on Google Drive. You can download them [here](https://drive.google.com/file/d/1YbTxkn9KuJP2D7U1-6kL1Yimu_4RqSl1/view?usp=sharing) and [here](https://drive.google.com/file/d/1Nc5o6QVB6QxMxpQjmLQcbwQwkRLk-thM/view?usp=sharing). Save both tar.gz files inside CRISPRcasIdentifier's directory.
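
After downloading, extract both archives inside that directory (the archive names below are placeholders for the two files obtained from Google Drive):

```
tar -xzf <hmm-models>.tar.gz
tar -xzf <ml-models>.tar.gz
```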
84 |
85 |
86 | ### Activation of the environment
87 |
88 | Before running CRISPRidentify one needs to activate the corresponding environment.
89 |
90 | ```
91 | conda activate crispr_identify_env
92 | ```
93 |
94 | ## Running CRISPRidentify
95 |
96 | We prepared a test folder which can be used for a test run.
97 |
98 | Example of running CRISPRidentify over a folder of files:
99 |
100 | ```
101 | python CRISPRidentify.py --input_folder TestInput
102 | ```
103 |
104 | Example of running CRISPRidentify over a single multiline fasta input:
105 | ```
106 | python CRISPRidentify.py --file TestInputMultiline/MultilineFasta.fasta
107 | ```
108 |
109 | ### Flags
110 |
111 | You can see the help by using the `-h` option:
112 |
113 | ```
114 |
115 | python CRISPRidentify.py -h
116 |
117 | ```
118 |
119 | #### Mandatory flags
120 | The only mandatory parameter which has to be specified is the input.
121 | Our approach has several options to handle the input. The user has to specify either the path to a folder with input fasta files,
122 | the full path to a single fasta input file, or the path to a folder of multiline fasta files.
123 |
124 | ##### Input as a folder of fasta files
125 |
126 | * `--input_folder `
127 |
128 | Specifies the mode where a folder with fasta files is used as the input for CRISPRidentify. The CRISPR array search will
129 | then be conducted separately for each file in the corresponding input folder.
130 |
131 | ```
132 | python CRISPRidentify.py --input_folder TestInput
133 | ```
134 |
135 | ##### Input as a single file
136 |
137 | * `--file `
138 |
139 | Specifies the mode where a single file is used as the input for the algorithm. The file might contain a single entry or multiple entries.
140 | The CRISPR array search will be done for each entry independently.
141 |
142 | For example:
143 |
144 | ```
145 | python CRISPRidentify.py --file InputFile
146 | ```
147 | ##### Input as a folder of multiline fasta files
148 |
149 | * `--input_folder_multifasta `
150 |
151 | Specifies the mode where a folder with fasta files is used as the input for CRISPRidentify. The CRISPR array search will
152 | then be conducted separately for each file in the corresponding input folder. The difference between this mode and the previous one is that
153 | in this mode the input files can contain multiple entries.
154 |
155 | For example:
156 |
157 | ```
158 | python CRISPRidentify.py --input_folder_multifasta TestFolderMultiline
159 | ```
160 |
161 | #### Optional flags
162 |
163 | ##### Output
164 |
165 | * `--result_folder [path_to_the_result_folder]`
166 |
167 | Specifies the path and name of the folder with the output results. If not specified, the results will appear in the "Results" folder.
168 |
169 |
170 | For example:
171 |
172 | ```
173 | python CRISPRidentify.py --input_folder TestInput --result_folder Results
174 | ```
175 |
176 | * `--pickle_report [folder_to_put_pickle_results]`
177 |
178 | Specifies if the found CRISPR arrays should also be stored as pickled Python objects. Turned off by default.
179 |
180 |
181 | For example:
182 |
183 | ```
184 | python CRISPRidentify.py --input_folder TestInput --pickle_report PickleReportFolder
185 | ```
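
The pickled reports can later be loaded back in Python. A minimal sketch, assuming the folder passed via `--pickle_report` contains one `.pkl` file per input (the stored objects are instances of CRISPRidentify's internal classes, so the package must be importable when unpickling):

```
import pickle
from pathlib import Path

# Inspect every pickled report produced with --pickle_report.
# "PickleReportFolder" matches the example above.
for pkl_file in Path("PickleReportFolder").glob("*.pkl"):
    with open(pkl_file, "rb") as handle:
        report = pickle.load(handle)
    print(pkl_file.name, type(report))
```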
186 |
187 |
188 | ##### Classification parameters
189 |
190 | * `--model [8/9/10/ALL]`
191 |
192 |
193 | Takes values: 8, 9, 10, ALL and specifies the classification model. The default value is `ALL`.
194 | If the value `ALL` is picked, the certainty score will be calculated as the average over all three available models.
195 |
196 |
197 | For example:
198 |
199 | ```
200 | python CRISPRidentify.py --input_folder TestInput --model 8
201 | ```
202 |
203 |
204 | ```
205 | python CRISPRidentify.py --input_folder TestInput --model ALL
206 | ```
207 |
208 |
209 | ##### Performance speed
210 | * `--fast_run [True/False]`
211 |
212 | Specifies if the repeat set enhancement step should be skipped, which drastically speeds up the process but might decrease the recall quality.
213 | Only matching pairs found with Vmatch will be used as repeat candidates. Automatically turns off the filter approximation and the start/end approximation (see `--enhancement_max_min` and `--enhancement_start_end`).
214 | Turned off by default.
215 |
216 | For example:
217 |
218 | ```
219 | python CRISPRidentify.py --input_folder TestInput --fast_run True
220 | ```
221 |
222 | * `--enhancement_max_min [True/False]`
223 |
224 | Specifies if the filter approximation based on the max. and min. elements should be built.
225 | The default value is True.
226 |
227 | * `--enhancement_start_end [True/False]`
228 |
229 | Specifies if the start/end omitting of the repeat candidates should be done to enrich the candidate set.
230 | The default value is True.
231 |
232 |
233 | For example:
234 |
235 | ```
236 | python CRISPRidentify.py --input_folder TestInput --enhancement_max_min True --enhancement_start_end False
237 | ```
238 |
239 | ##### Candidate filtering criteria
240 |
241 |
242 | * `--min_len_rep [integer]`
243 |
244 | Specifies the minimum average length of the repeats in a CRISPR array. The default value: 21
245 |
246 | * `--max_len_rep [integer]`
247 |
248 | Specifies the maximum average length of the repeats in a CRISPR array. The default value: 55
249 |
250 | * `--min_len_spacer [integer]`
251 |
252 | Specifies the minimum average length of spacers in a CRISPR array. The default value: 18
253 |
254 | * `--max_len_spacer [integer]`
255 |
256 | Specifies the maximum average length of spacers in a CRISPR array. The default value: 78
257 |
258 | * `--min_repeats [integer]`
259 |
260 | Specifies the minimum number of repeats in a CRISPR array. The default value: 3
261 |
262 |
263 | For example:
264 |
265 | ```
266 | python CRISPRidentify.py --input_folder TestInput --min_len_rep 25 --max_len_rep 50 --min_repeats 2
267 | ```
268 |
269 | ##### Candidate Enhancement
270 |
271 | * `--degenerated [True/False]`
272 |
273 | Allows search for degenerated repeat candidates on both ends of the CRISPR array candidate. The default value: True
274 |
275 | * `--margin_degenerated [int]`
276 |
277 | Specifies the maximum length difference between a new spacer sequence (obtained with the search of degenerated repeats) and the average value of spacer length in the array. The default value: 30
278 |
279 | * `--max_edit_distance_enhanced [int]`
280 |
281 | Specifies the maximum number of edit operations allowed for the evaluated array enhancement. The default value: 6
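
For example:

```
python CRISPRidentify.py --input_folder TestInput --degenerated True --margin_degenerated 20
```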
282 |
283 |
284 | ##### Additional computations
285 |
286 | * `--strand [True/False]`
287 |
288 | Specifies if the array orientation should be predicted. The default value is True.
289 |
290 | * `--cas [True/False]`
291 |
292 | Specifies if cas genes should be predicted. The default value is False.
293 |
294 | * `--is_element [True/False]`
295 |
296 | Specifies if IS-Elements should be predicted. The default value is True.
297 |
298 |
299 | ```
300 | python CRISPRidentify.py --input_folder TestInput --cas True --is_element True
301 | ```
302 |
303 | ## Output files
304 |
305 | The output folder for each input entry consists of the following files:
306 |
307 | * Bona-Fide_Candidates. The file contains the representation of the found CRISPR arrays complemented with the support information.
308 | For each candidate, the output contains the values of the extracted features as well as the certainty score of the used classifier.
309 | On top of that, the support information includes the orientation of each array, the leader and downstream regions, cas genes and IS-elements (if the corresponding flags were selected).
310 |
311 | * Alternative_Candidates. In this file we demonstrate alternative representations of bona-fide arrays. These alternative representations also received a high score from the classifier, but the score was lower than the corresponding score of the bona-fide representation.
312 | An alternative representation of a CRISPR array usually corresponds to a slightly longer/shorter repeat sequence but represents the same genomic region.
313 |
314 | The candidates with certainty scores between 0.4 and 0.75 are stored in Possible_Candidates and Possible_Discarded_Candidates.
315 |
316 | * Possible_Candidates. In this file the algorithm stores the candidate with the highest certainty score.
317 |
318 | * Possible_Discarded. All the other representations are collected here.
319 |
320 |
321 | The algorithm also reports CRISPR-looking structures which obtained a certainty score lower than 0.4 from the classifier.
322 |
323 | * Low_score_candidates. The user can find these structures in this file.
324 |
325 |
326 | On top of that, the algorithm builds a CSV summary.
327 |
328 | * Summary.csv
329 |
330 | The following information can be found in the summary:
331 |
332 | 1. Array index
333 | 2. Array start
334 | 3. Array end
335 | 4. Array length
336 | 5. Consensus repeat
337 | 6. Repeat length
338 | 7. Average length of the spacers
339 | 8. Number of spacers
340 | 9. Array orientation
341 | 10. Array category
342 |
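The summary can be post-processed with a few lines of Python. A minimal sketch, assuming `Summary.csv` is comma-separated with the ten columns listed above and sits in the result folder (the exact header names and path may differ):

```
import csv

# Print every row of the per-entry summary as a dictionary.
# "Results/Summary.csv" is an assumed location inside the result folder.
with open("Results/Summary.csv") as handle:
    for row in csv.DictReader(handle):
        print(row)
```
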
343 | ## Metagenomic analysis
344 |
345 | CRISPRidentify is suitable for easy and powerful metagenomic analysis.
346 | When the `--file` or `--input_folder` flag is used, the pipeline will automatically generate two complete summaries:
347 | 
348 |
349 | 1. For all the identified arrays
350 | 2. For all labeled Cas genes
351 |
352 |
353 | On top of that, the user might use the flag:
354 |
355 | `--fasta_report True`
356 |
357 | This option will create three fasta files:
358 | 1. All the array sequences with their origins in the header
359 | 2. All the repeat sequences with their origins and locations in the arrays
360 | 3. All the spacer sequences with their origins and locations in the arrays
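
These fasta files can be processed with standard tools. A minimal sketch that counts the entries in one of them (the file name below is a placeholder; check the result folder for the actual names produced with `--fasta_report True`):

```
# Count the sequences in one of the generated fasta reports.
def count_fasta_entries(path):
    with open(path) as handle:
        return sum(1 for line in handle if line.startswith(">"))

print(count_fasta_entries("Results/spacers.fasta"))  # placeholder file name
```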
361 |
362 | ## Improving CRISPRidentify
363 |
364 | We are constantly working on improving CRISPRidentify. If you find a bug or an incorrect/missing CRISPR array representation, please submit it via the GitHub issue interface.
365 |
366 |
367 |
368 |
369 |
--------------------------------------------------------------------------------
/TestFolderMultiline/MultilineFasta_1.fasta:
--------------------------------------------------------------------------------
1 | >gi|56475432|ref|NC_006513.1 Damage1| Aromatoleum aromaticum EbN1 chromosome
2 | ATCACGCCCTCCCATCCCGCCGATCCACCGCCCGACGATCCTTCCGCCGCTGCTCGCTCCGCACGCGCCG
3 | GTCACCATCGCGACGTTCCACCGCGCGGCGGTCGTCATGGACGGGCTCGACATAGGTTCGCTCGGCCACA
4 | TCAACCGGAGTTGGCACGAAGGCGGGGAGCGTGGTTTCGCTTTCCACGTCGCCAGTCCCGCGCAGATACC
5 | CCCAATCCACGTCGGGGCGGAGTTCTTCGCAGCGGACGGCGCCGGCTGTGGCGCGCTCGATGGCCGGGCA
6 | GCGCTCGGCGGGGATCGGTCGAACGCCAGTAACCCACTGGCTGACCGCAGCAGGCGTCACGCCGAGAGCA
7 | CGGGCCAAGGTTGCTTGGCCGCCTACGGATTCGCACGCCAAAAGGATCGGGTTTCGGTTCATGCCACGAC
8 | TATAGCACCGCTACAGATTACATATCAAGCCATGCTATTCATTCCAATAGATAGCATTGCTTCATCATGC
9 | TGATATGGTCACCGCACGAAGAAGCAGAACGTCTTAAGGCCCGTTTTGGAGCGGTCCCCAACCGGGAGAA
10 | GTTCGCTCGAGAAATTGGACTTCACGGCGGCGGATCAATGATCTACCAGCACATAAAGGGGATTCGCCCG
11 | ATCAGCCGCGAAGCGGCGGTTGCGTATGCGAAAGGCTTCAATTGCAGGCTCGAAGAAATCAGCCCGCGAA
12 | TCGCCCTTGAGATACAGCAGGCCACTTCTGTCTTGTCGCCAACGCCAGACCGTCCGCCCGAGTCGCCGAA
13 | CATCTGCGCCGGACCGGACCGCAAAGGCAAGGTGCCGCTAATCTCGTGGGTGCGCGCAGGTGAGTTCGCT
14 | CATGCCGCTGATCTTTTGCCGGTCGGCGAGGCCTATGAATGGGTGGAGACCGGCGTGAACGTGCAGCCCC
15 | ACACTTTTGCGCTGCGCGTCCAGGGCGACTCGATGGAGCCGGAATTCGTCGCTGGCACGATCATCGTGAT
16 | CGAGCCGCACATGGTCGCTGAGCCCGGCGACTACGTCATCGCCCGCAACGGCGACAACGAGGCCACTTTC
17 | AAGCAGCTCGTGCGCGACGGGGCGGACCTGTACCTCAAACCGCTGAACCCCCGCTACCCGATCAAGCCGC
18 | TGGGCGCCACGGCGATCATCGGCGTGGTTCGAGAGGCCGTGAAGCGCTATCGGTGAGCGGGTGTTTTCCA
19 | GGCCATCACCCCTTGCCCGCAATCTGTAACAGCCTCCCAACAACAACAGACTCATGCTATTTTTCAAGGG
20 | CTTGAACGGCCCGAAATTTGTCAAGTCCATCGTCGAAGGGGTCGGATTGTGGCTCGGTATCGTCAGCGGC
21 | TTGGCGTGGCTGTGGTCCGAATTGGCGATCGTGAAGGTCGAACTGACATGGGCCGTGACAGTCACGACCG
22 | GATTTTTCGTGTTCTACGTCGGGGTCCTGCTTTGCTTCACGCGGCAGGGGGTTCTGCAAACGCGCATCGA
23 | CGAATGCGCCCAGGCCAAGAAAGCGCTGGAAGAAGAAGTGCTGCGCAAGCGGCTGTCGTCCAGGAAAAAA
24 | GGCCGGTGAGGCTGATGGGGAGAAGTGGAAAATGAGCGTGATTGCCATTCACGCGGCGATCGTCGGCGTG
25 | ACGGTCGTCGTCGCATACGTCCTGCACATGCGCACGATGCGCATGAAAGCCTGTTTCAACCTCTTCCGCG
26 | TGCGCGACCGCTTCGTCCTGCTCGTCGCCAAAGACATCCTGCCCGAAGACAGCAGGGTGTTCGTCCACTA
27 | CTACGGACGCATCAACAAGCTGCTTTGCGACGCCCCGAAAGTCGGCATCGACGACATGCTGGCCACGATC
28 | TTTCGTCACGTGCCCAATGGTGAGTTCGACCAGGCACTGGAGCGCGCCCGCTCCCAGTCGCAAAAAATGC
29 | TGGCCGATCCGCTCATGCAGAACGACGAAGTGCGAGCGGCCGTGGCCGATTACTACCGCGCCATCCGCGC
30 | GATGCTGCTGTCCCACAGCAGCATCCTGAAGGTCATCTACCTGCTGTCGCACCGCTTCGCCACGTCGCTC
31 | CACTCCGGCTGGATCGGCGGCGAAGTCAGCCGCGGGCTGAAGGCCGCCGACTACGCCGACGAAGAAGCTG
32 | CCCTGTTTAAACCCGCCTGAGGTTATGGCGCCGGGGTGACGCCACGGCAGCCGGTCCTCGACACGGCCGG
33 | CTATGTTGTGGTGGGTGGCGGCATAAAAAAAGACCGCCGAAGCGGTCTTTACGTGGAATATGCAGACTCT
34 | TGCCCTACTCTTATTCCGCTGCCGCCGCCCGGCAGGTCGACGGGCGATACTTCGCGTCCACAGATCCGGC
35 | GCACGCCCACACTAGCACGCCAGTTGCGTCGGCCTTAGGCGTCAGCGTGATCGTCTCTGCGGCAATCGCG
36 | TCGTTTGCATCTACGCCGCCGGTCGATGTAATGACGCCGCTTGGTCCGATTGCGACGCTGGCGGTGTACT
37 | TACCCACGGCACCACCGTAGCCCGCAGCGGCATCACTGGCAGGGAGGGCGCCAGTCGACTGAAAAGTTTC
38 | AGCAACGGCAACCTTGGCGCCATCTGTCAGGGACATCAGTTCTGAAACCTGAGCCCGGATCGTGTAATCC
39 | TGATACGCCGGCAGCGCGACCGCTGCCAAAATCCCGATGATCGCGACGACGATCATCAGTTCGATCAGCG
40 | TGAAACCTTGTTGGACCTTTTTCATTTTCAGCTCCCTTGCTTGTTTGACGGGACGTCCCGTGCACATCGT
41 | CACGCAACAAGCGTGCCAACCACCAAAGCGGCATAATCGCCCCGGAGTGCCACCCATGTCGCACTTATTA
42 | CTGTGACATTCGGGCGAATGTCGCAGCGTGGCACACGACACGCCGCCCCCTTCCGGGCTTGAGCGCAGCC
43 | CTCTAAACTCCGTGCCCTGGAGGCTCCCATGACCGCAATCACCGTTCCCACCGCCGCACTCATCCTCGAC
44 | CGGACCACGCGCACCATCTGGCGCCGCATCGCCGACGGATCGCTGCCGGCGATCACCGAAGACGACCGGC
45 | AGAAGATCCCGCTCGACGCCGTCATCCGCGAGGCGTGCATTCCGATCGACCCGGACGACTACGAGCTCGT
46 | CACCGGCACCGACGCCGGCGATGCCGAATCGCAGTGCGACCTCGCACTGCTGTTCCTGCTGCGCGACCGC
47 | CCGCACATCGCCATGCCGCTGCTCAACCTGGCCGCCAAGGACGACTACCCGGAGGCGCTCTACCAGATCG
48 | CCCGCTGCCACATCGCCGGCAAGGGCGTGCCGCGCGACGGCAACGCCGGCATCATGTGGCTCGCTCGGGC
49 | CGCCAGCCGCGGCCACTCCGTAGCCCAGGAGCAGATGCGCGTCGTGCGCGAGTCCGGCACCGGCACCGAC
50 | CTCGACGCCCTCGACGCGCTGCTCGAGCGCATCGAACAGCGAGTCGTGTTCGCTGCACTGGAAACCACCG
51 | CAACCCGCTAGACCCCCCGCGCTTCGCAATCTGCCCGCCGCTTGAGCGGGCTTTTTTACGTCCGTAGCTT
52 | AAAGCCATTTCGCTGATATATAGCTGCGCTATTGACATTAAATATAGCGTTGCTATTATTTCTCCAACGC
53 | CTCCCTCGAGGCACCGGAGACCGCGATGCCCCCCGCTGCACCCCATCCCGTCCCGCCCGAAAAAAAGGCC
54 | >gi|56475432|ref|NC_006513.1 Damage2| Aromatoleum aromaticum EbN1 chromosome
55 | ATCCCGATGATCGCGACGACGATCATCAGTTCGATCAGCG
56 | >gi|56475432|ref|NC_006513.1 Damage3| Aromatoleum aromaticum EbN1 chromosome
57 | TGAAACCTTGTTGGACCTTTTTCATTTTCAGCTCCCTTGCTTGTTTGACGGGACGTCCCGTGCACATCGT
58 | CACGCAACAAGCGTGCCAACCACCAAAGCGGCATAATCGCCCCGGAGTGCCACC
59 |
--------------------------------------------------------------------------------
/components/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/components/__init__.py
--------------------------------------------------------------------------------
/components/components_detection_refinement.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import json
3 | from functools import wraps
4 | from itertools import groupby
5 |
6 |
7 | class SameStartEndFilter:
8 | def __init__(self, dict_crispr_candidates):
9 | self.dict_crispr_candidates = dict_crispr_candidates
10 | self.dict_filtered_start_end_crispr_candidates = {}
11 |
12 | self._filter_fuzzy_searches_same_start_end()
13 |
14 | def _filter_fuzzy_searches_same_start_end(self):
15 | for cluster_seq, list_fuzzy_s in self.dict_crispr_candidates.items():
16 | list_start_end = [fuzzy_s.start_end for fuzzy_s in list_fuzzy_s]
17 | pattern_len = [len(fuzzy_s.repeat_candidate) for fuzzy_s in list_fuzzy_s]
18 | tuples_st_end_len = zip(list_start_end, pattern_len)
19 |
20 | list_categories = [[fuzzy_s for fuzzy_s in list_fuzzy_s if
21 | (fuzzy_s.start_end, len(fuzzy_s.repeat_candidate)) == tuple_info]
22 | for tuple_info in tuples_st_end_len]
23 |
24 | best_fuzzy_s = [sorted(category, key=lambda x: x.number_errors)[0]
25 | for category in list_categories]
26 |
27 | best_fuzzy_s_unique_repeat = []
28 | u_repeats = []
29 | for b_fuz in best_fuzzy_s:
30 | repeat = b_fuz.repeat_candidate
31 | if repeat not in u_repeats:
32 | u_repeats.append(repeat)
33 | best_fuzzy_s_unique_repeat.append(b_fuz)
34 |
35 | self.dict_filtered_start_end_crispr_candidates[cluster_seq] = best_fuzzy_s_unique_repeat
36 |
37 | def output(self):
38 | return self.dict_filtered_start_end_crispr_candidates
39 |
40 |
41 | # For filtering out non CRISPR cases
42 | #############################################################
43 | #############################################################
44 | DEBUG_MODE = False
45 |
46 |
47 | def exception_handler(function):
48 | @wraps(function)
49 | def wrapper(*args, **kwargs):
50 | try:
51 | result = function(*args, **kwargs)
52 | return result
53 | except Exception:
54 | return False
55 | return wrapper
56 |
57 |
58 | def printing_if_filtered(function):
59 | @wraps(function)
60 | def wrapper(*args, **kwargs):
61 | result = function(*args, **kwargs)
62 | if DEBUG_MODE:
63 | if not result:
64 | with open("filtered_results.txt", "a") as f:
65 | f.write("\n\n")
66 | f.write("\n".join([str(arg) for arg in args]))
67 | f.write("\n\n")
68 | f.write(function.__name__)
69 | f.write("\n\n")
70 |
71 | return result
72 | return wrapper
73 |
74 |
75 | class AdvancedFuzzySearchFilter:
76 | def __init__(self, min_column_dominance_repeat, min_avg_spacer_length,
77 | max_spacer_length, max_column_dominance_spacer, max_allowed_consecutive_spacers,
78 | max_allowed_same_spacers, max_inconsistent_columns, min_avg_repeat_length,
79 | max_avg_repeat_length, max_avg_spacer_length, min_repeats):
80 |
81 | self.column_dominance = min_column_dominance_repeat
82 | self.min_avg_spacer_length = min_avg_spacer_length
83 | self.max_spacer_length = max_spacer_length
84 | self.max_column_dominance_spacer = max_column_dominance_spacer
85 | self.max_allowed_consecutive_spacers = max_allowed_consecutive_spacers
86 | self.max_allowed_same_spacers = max_allowed_same_spacers
87 | self.max_inconsistent_columns = max_inconsistent_columns
88 | self.min_avg_repeat_length = min_avg_repeat_length
89 | self.max_avg_repeat_length = max_avg_repeat_length
90 | self.max_avg_spacer_length = max_avg_spacer_length
91 | self.min_number_repeats = min_repeats
92 |
93 | @printing_if_filtered
94 | @exception_handler
95 | def _filter_by_column(self, candidate):
96 | def find_first_three_columns():
97 | list_three_columns = []
98 | list_gaped_repeats = candidate.list_gaped_repeats
99 | for index in range(len(list_gaped_repeats[0])):
100 | column_vec = [repeat[index] for repeat in list_gaped_repeats]
101 | column_gaps = sum([1 for x in column_vec if (x == " ")])
102 | percentage_gaps = column_gaps / len(column_vec)
103 | if percentage_gaps < 0.5:
104 | list_three_columns.append(column_vec)
105 | if len(list_three_columns) == 3:
106 | return list_three_columns
107 |
108 | def find_last_three_columns():
109 | list_three_columns = []
110 | list_gaped_repeats = candidate.list_gaped_repeats
111 | for index in range(len(list_gaped_repeats[0])-1, 0, -1):
112 | column_vec = [repeat[index] for repeat in list_gaped_repeats]
113 | column_gaps = sum([1 for x in column_vec if (x == " ")])
114 | percentage_gaps = column_gaps/len(column_vec)
115 | if percentage_gaps < 0.5:
116 | list_three_columns.append(column_vec)
117 | if len(list_three_columns) == 3:
118 | return list_three_columns
119 |
120 | for column in find_first_three_columns():
121 | column_characters = [x for x in column if (x not in (" ", "-"))]
122 | if column_characters:
123 | most_freq_char = max(column_characters, key=column_characters.count)
124 | most_freq_char_freq = column_characters.count(most_freq_char)
125 | freq = most_freq_char_freq/len(column_characters)
126 | if len(column) <= 4:
127 | if freq < 0.49:
128 | return False
129 | else:
130 | if freq < self.column_dominance:
131 | return False
132 | else:
133 | return False
134 |
135 | for column in find_last_three_columns():
136 | column_characters = [x for x in column if (x not in (" ", "-"))]
137 | if column_characters:
138 | most_freq_char = max(column_characters, key=column_characters.count)
139 | most_freq_char_freq = column_characters.count(most_freq_char)
140 | freq = most_freq_char_freq/len(column_characters)
141 | if len(column) <= 4:
142 | if freq < 0.49:
143 | return False
144 | else:
145 | if freq < self.column_dominance:
146 | return False
147 | else:
148 | return False
149 | return True
150 |
151 | @printing_if_filtered
152 | @exception_handler
153 | def _filter_by_min_avg_spacer(self, candidate):
154 | list_spacers = candidate.list_spacers
155 | avg_len = sum(len(x) for x in list_spacers) / len(list_spacers)
156 | if avg_len > self.min_avg_spacer_length:
157 | return True
158 | return False
159 |
160 | @printing_if_filtered
161 | @exception_handler
162 | def _filter_by_max_spacer(self, candidate):
163 | list_spacers = candidate.list_spacers
164 | long_spacers = [spacer for spacer in list_spacers if len(spacer) > self.max_spacer_length]
165 | if len(long_spacers) / len(list_spacers) > 0.3:
166 | return False
167 | if len(long_spacers) > 3:
168 | return False
169 | return True
170 |
171 | @printing_if_filtered
172 | @exception_handler
173 | def _filter_by_spacer_begin_end_similarity(self, candidate):
174 | list_spacers = candidate.list_spacers
175 | if len(list_spacers) >= 2:
176 | column_begin = [spacer[0] for spacer in list_spacers if spacer]
177 | most_freq_char_begin = max(column_begin, key=column_begin.count)
178 | most_freq_char_freq_begin = column_begin.count(most_freq_char_begin)
179 |
180 | freq_begin = most_freq_char_freq_begin / len(column_begin)
181 | if freq_begin > self.max_column_dominance_spacer:
182 | return False
183 |
184 | column_end = [spacer[-1] for spacer in list_spacers if spacer]
185 | most_freq_char_end = max(column_end, key=column_end.count)
186 | most_freq_char_freq_end = column_end.count(most_freq_char_end)
187 |
188 | freq_end = most_freq_char_freq_end / len(column_end)
189 | if freq_end > self.max_column_dominance_spacer:
190 | return False
191 | return True
192 |
193 | @printing_if_filtered
194 | @exception_handler
195 | def _filter_by_the_same_spacer(self, candidate):
196 | list_spacers = candidate.list_spacers
197 | list_spacers = [s for s in list_spacers if s]
198 | groups = [len(list(group)) for key, group in groupby(list_spacers)]
199 | if self.max_allowed_consecutive_spacers:
200 | if max(groups) > self.max_allowed_consecutive_spacers:
201 | return False
202 |
203 | list_sorted_spacers = sorted(list_spacers)
204 | groups_sorted = [len(list(group)) for key, group in groupby(list_sorted_spacers)]
205 | if self.max_allowed_same_spacers:
206 | if max(groups_sorted) > self.max_allowed_same_spacers:
207 | return False
208 | return True
209 |
210 | @printing_if_filtered
211 | @exception_handler
212 | def _filter_by_overall_repeat_consistency(self, candidate):
213 | list_column_consistency = []
214 | list_repeats_gaped = candidate.list_gaped_repeats
215 | for index, _ in enumerate(list_repeats_gaped[0]):
216 | column = [repeat[index] for repeat in list_repeats_gaped]
217 | column_characters = [x for x in column if (x not in (" ", "-"))]
218 | try:
219 | most_freq_char = max(column_characters, key=column_characters.count)
220 | most_freq_char_freq = column_characters.count(most_freq_char)
221 | freq = most_freq_char_freq / len(column_characters)
222 | list_column_consistency.append(freq)
223 | except ValueError:
224 | pass
225 |
226 | number_inconsistent = sum(1 for x in list_column_consistency if x < 0.66)
227 | if number_inconsistent > self.max_inconsistent_columns:
228 | return False
229 | return True
230 |
231 | @printing_if_filtered
232 | @exception_handler
233 | def _filter_min_number_repeats(self, candidate):
234 | list_repeats = candidate.list_repeats
235 | if len(list_repeats) >= self.min_number_repeats:
236 | return True
237 | return False
238 |
239 | @printing_if_filtered
240 | @exception_handler
241 | def _filter_min_avg_repeat_length(self, candidate):
242 | list_repeats = candidate.list_repeats
243 | avg_len = sum(len(x) for x in list_repeats) / len(list_repeats)
244 | if avg_len >= self.min_avg_repeat_length:
245 | return True
246 | return False
247 |
248 | @printing_if_filtered
249 | @exception_handler
250 | def _filter_max_avg_repeat_length(self, candidate):
251 | list_repeats = candidate.list_repeats
252 | avg_len = sum(len(x) for x in list_repeats) / len(list_repeats)
253 | if avg_len <= self.max_avg_repeat_length:
254 | return True
255 | return False
256 |
257 | @printing_if_filtered
258 | @exception_handler
259 | def _filter_max_avg_spacer_length(self, candidate):
260 | list_spacers = candidate.list_spacers
261 | if len(list_spacers) > 4:
262 |             avg_len = sum(len(x) for x in list_spacers[1:-1]) / len(list_spacers[1:-1])  # ignore edge spacers
263 |             if avg_len <= self.max_avg_spacer_length:
264 |                 return True
265 |         else:
266 |             avg_len = sum(len(x) for x in list_spacers) / len(list_spacers)
267 |             if avg_len <= self.max_avg_spacer_length:
268 |                 return True
269 | return False
270 |
271 | @printing_if_filtered
272 | @exception_handler
273 | def _filter_min_repeat_length(self, candidate):
274 |         list_repeats = candidate.list_repeats
275 |         avg_len = sum(len(x) for x in list_repeats) / len(list_repeats)
276 | if avg_len >= self.min_avg_repeat_length:
277 | return True
278 | return False
279 |
280 | def __call__(self, candidate):
281 | if not self._filter_by_column(candidate):
282 | return
283 | if not self._filter_by_min_avg_spacer(candidate):
284 | return
285 | if not self._filter_by_max_spacer(candidate):
286 | return
287 | if not self._filter_by_spacer_begin_end_similarity(candidate):
288 | return
289 | if not self._filter_by_the_same_spacer(candidate):
290 | return
291 | if not self._filter_by_overall_repeat_consistency(candidate):
292 | return
293 | if not self._filter_max_avg_repeat_length(candidate):
294 | return
295 | if not self._filter_min_avg_repeat_length(candidate):
296 | return
297 | if not self._filter_max_avg_spacer_length(candidate):
298 | return
299 | if not self._filter_min_number_repeats(candidate):
300 | return
301 | return candidate
302 |
303 | # CRISPR Candidate
304 | #####################################################
305 | #####################################################
306 | class CrisprConsensus(object):
307 | def __init__(self, list_repeats_gaped):
308 | self.list_repeats_gaped = list_repeats_gaped
309 |
310 | self.num_different_repeat_length = None
311 | self.consensus = None
312 | self.consensus_no_gap = None
313 | self.len_consensus = None
314 | self.number_repeats = None
315 |
316 | self._check_repeat_length()
317 | self._compute_consensus()
318 |
319 | def _check_repeat_length(self):
320 | list_lengths = [len(repeat) for repeat in self.list_repeats_gaped]
321 | self.num_different_repeat_length = len(set(list_lengths))
322 |
323 | def _compute_consensus(self):
324 | if self.num_different_repeat_length == 0:
325 | print('Got repeats of 0 length')
326 | elif self.num_different_repeat_length != 1:
327 | print('Got a case with different repeat lengths')
328 | for rep_gapped in self.list_repeats_gaped:
329 | print(rep_gapped)
330 | else:
331 | self.consensus = ''
332 | for char_ind, _ in enumerate(self.list_repeats_gaped[0]):
333 | list_char_in_column = [repeat[char_ind] for repeat in self.list_repeats_gaped]
334 | counter = collections.Counter(list_char_in_column)
335 | freq = counter.most_common()
336 | most_common_char = freq[0][0] if freq[0][0] != '-' else freq[1][0]
337 | self.consensus += most_common_char
338 |
339 | self.consensus_no_gap = self.consensus.replace(' ', '').replace('+', '')
340 | self.len_consensus = len(self.consensus_no_gap)
341 |
342 | def output(self):
343 | return self.consensus_no_gap, self.consensus
344 |
345 |
346 | class CrisprCandidate(object):
347 | def __init__(self, list_repeats, list_repeats_gaped, list_spacers, list_repeat_starts):
348 | self.list_repeats = list_repeats
349 | self.list_repeats_gaped = list_repeats_gaped
350 | self.list_spacers = list_spacers
351 | self.list_repeat_starts = list_repeat_starts
352 |
353 | self.list_repeat_mismatches = []
354 | self.list_mismatches_indexes = []
355 |
356 | self.consensus = None
357 | self.consensus_gaped = None
358 | self.total_mismatches = None
359 |
360 | self._filter_redundant_insertion_deletions()
361 | self._compute_consensus()
362 | self._compute_mismatches()
363 |
364 | self.list_gaped_repeats = self.list_repeats_gaped
365 |
366 | def _filter_redundant_insertion_deletions(self):
367 | def _fix_repeats(list_repeats, list_bad_indexes_to_fix):
368 | list_repeats_new = []
369 | for repeat in list_repeats:
370 | list_repeats_new.append(_fix_repeat(repeat, list_bad_indexes_to_fix))
371 |
372 | return list_repeats_new
373 |
374 | def _fix_repeat(repeat, list_bad_indexes_to_fix):
375 | new_repeat = ''
376 | for index, char in enumerate(repeat):
377 | if index not in list_bad_indexes_to_fix:
378 | new_repeat += char
379 |
380 | return new_repeat
381 |
382 | list_bad_indexes = []
383 | for char_ind, _ in enumerate(self.list_repeats_gaped[0]):
384 | list_char_in_column = [repeat[char_ind] for repeat in self.list_repeats_gaped]
385 | chars = set(list_char_in_column)
386 |
387 | if chars == {' '} or chars == {'-'}:
388 | list_bad_indexes.append(char_ind)
389 |
390 | if list_bad_indexes:
391 | self.list_repeats_gaped = _fix_repeats(self.list_repeats_gaped, list_bad_indexes)
392 |
393 | def _compute_consensus(self):
394 | self.consensus, self.consensus_gaped = CrisprConsensus(self.list_repeats_gaped).output()
395 |
396 | def _compute_mismatches(self):
397 | def _compute_mismatches_repeat(gaped_repeat):
398 | substitutions = 0
399 | insertions = 0
400 | deletions = 0
401 | list_mismatches_indexes_one_repeat = []
402 | for index, char_repeat, char_con_repeat in zip(range(len(gaped_repeat)),
403 | gaped_repeat,
404 | self.consensus_gaped):
405 |
406 | if char_con_repeat == ' ':
407 | if char_repeat != ' ':
408 | insertions += 1
409 | list_mismatches_indexes_one_repeat.append(index)
410 | else:
411 | if char_repeat == char_con_repeat:
412 | pass
413 | else:
414 | if char_repeat == '-':
415 | deletions += 1
416 | list_mismatches_indexes_one_repeat.append(index)
417 | elif char_repeat == ' ':
418 | deletions += 1
419 | else:
420 | substitutions += 1
421 | list_mismatches_indexes_one_repeat.append(index)
422 |
423 | return substitutions, insertions, deletions, list_mismatches_indexes_one_repeat
424 |
425 | for gaped_repeat in self.list_repeats_gaped:
426 | s, i, d, list_mismatches_indexes_one_repeat = _compute_mismatches_repeat(gaped_repeat)
427 | total = s + i + d
428 | repeat_stats = [s, i, d, total]
429 | self.list_repeat_mismatches.append(repeat_stats)
430 | self.list_mismatches_indexes.append(list_mismatches_indexes_one_repeat)
431 |
432 | self.total_mismatches = sum([x[3] for x in self.list_repeat_mismatches])
433 |
434 | def dot_repeat(self, gaped_repeat):
435 | string = ''
436 | substitutions = 0
437 | insertions = 0
438 | deletions = 0
439 | for char_repeat, char_consensus in zip(gaped_repeat, self.consensus_gaped):
440 | if char_consensus == ' ':
441 | string += char_repeat
442 | if char_repeat != ' ':
443 | insertions += 1
444 | else:
445 | if char_repeat == char_consensus:
446 | string += '.'
447 | else:
448 | string += char_repeat
449 | if char_repeat == '-':
450 | deletions += 1
451 | elif char_repeat == ' ':
452 | deletions += 1
453 | else:
454 | substitutions += 1
455 | return string, substitutions, insertions, deletions
456 |
457 | def dot_repr(self):
458 | string = ''
459 | g_s, g_i, g_d = 0, 0, 0
460 | max_length_start_index = max(len(str(start)) for start in self.list_repeat_starts) + 3
461 | max_length_spacer = max(len(spacer) for spacer in self.list_spacers) + 3
462 |
463 | for index, gaped_repeat in enumerate(self.list_repeats_gaped):
464 | repeat_start_index = self.list_repeat_starts[index] + 1
465 | n_gaps_after_start = max_length_start_index - len(str(repeat_start_index))
466 |
467 | if index == len(self.list_spacers):
468 | spacer = ""
469 | else:
470 | spacer = self.list_spacers[index]
471 | n_gaps_after_spacer = max_length_spacer - len(spacer)
472 |
473 | dotted_repeats, s, i, d = self.dot_repeat(gaped_repeat)
474 | errors = " s:{} i:{} d:{}".format(s, i, d)
475 | g_s += s
476 | g_i += i
477 | g_d += d
478 |
479 | string += "{}{}{} {}{}{}\n".format(repeat_start_index,
480 | " " * n_gaps_after_start,
481 | dotted_repeats, spacer,
482 | " " * n_gaps_after_spacer,
483 | errors)
484 |
485 | string += "_" * 100 + "\n"
486 |
487 | string += " " * max_length_start_index + self.consensus_gaped
488 | string += " " * (max_length_spacer + 2) + " s:{} i:{} d:{}".format(g_s, g_i, g_d) + "\n"
489 |
490 | return string
491 |
492 | def dot_repr_web_server(self):
493 | string = ''
494 | g_s, g_i, g_d = 0, 0, 0
495 | max_length_start_index = max(len(str(start)) for start in self.list_repeat_starts) + 3
496 | max_length_spacer = max(len(spacer) for spacer in self.list_spacers) + 3
497 |
498 | for index, gaped_repeat in enumerate(self.list_repeats_gaped):
499 | repeat_start_index = self.list_repeat_starts[index] + 1
500 | n_gaps_after_start = max_length_start_index - len(str(repeat_start_index))
501 |
502 | if index == len(self.list_spacers):
503 | spacer = ""
504 | else:
505 | spacer = "$" + self.list_spacers[index] + "$"
506 | n_gaps_after_spacer = max_length_spacer - len(spacer)
507 |
508 | dotted_repeats, s, i, d = self.dot_repeat(gaped_repeat)
509 | errors = " s:{} i:{} d:{}".format(s, i, d)
510 | g_s += s
511 | g_i += i
512 | g_d += d
513 |
514 | string += "{}{}{} {}{}{}\n".format(repeat_start_index,
515 | " " * n_gaps_after_start,
516 | dotted_repeats, spacer,
517 | " " * n_gaps_after_spacer,
518 | errors)
519 |
520 | string += "_" * 100 + "\n"
521 |
522 | string += " " * max_length_start_index + self.consensus_gaped
523 | string += " " * (max_length_spacer + 2) + " s:{} i:{} d:{}".format(g_s, g_i, g_d) + "\n"
524 |
525 | string += "_" * 100 + "\n"
526 |
527 | string += "consensus: " + self.consensus + "\n"
528 |
529 | return string
530 |
531 | def write_file(self, file_name):
532 | with open(file_name, "w") as f:
533 | f.write(self.dot_repr())
534 |
535 | def write_as_json(self, filename):
536 | dict_to_write = {"repeat_begins": self.list_repeat_starts,
537 | "repeats": self.list_repeats,
538 | "repeats_gaped": self.list_repeats_gaped,
539 | "spacers": self.list_spacers}
540 |
541 | with open(filename, 'w') as outfile:
542 | json.dump(dict_to_write, outfile)
543 |
544 | def compute_stats(self):
545 | start = self.list_repeat_starts[0] + 1
546 | end = self.list_repeat_starts[-1] + len(self.list_repeats[-1])
547 | avg_repeat = len(self.consensus)
548 | avg_spacer = int(sum((len(spacer) for spacer in self.list_spacers)) / len(self.list_spacers))
549 | number_repeats = len(self.list_repeats)
550 | return {"start": start, "end": end, "avg_repeat": avg_repeat,
551 | "avg_spacer": avg_spacer, "number_repeats": number_repeats}
552 |
553 | @classmethod
554 | def init_from_json(cls, file_name):
555 | with open(file_name) as json_file:
556 | dict_data = json.load(json_file)
557 |
558 |         list_repeats = dict_data["repeats"]
559 | list_repeats_starts = dict_data["repeat_begins"]
560 | list_spacers = dict_data["spacers"]
561 | list_repeats_gaped = dict_data["repeats_gaped"]
562 |
563 |         return cls(list_repeats=list_repeats, list_spacers=list_spacers,
564 | list_repeats_gaped=list_repeats_gaped, list_repeat_starts=list_repeats_starts)
565 |
566 | def __repr__(self):
567 | return self.dot_repr()
568 |
569 | def __eq__(self, other):
570 | if self.list_repeats == other.list_repeats:
571 | if self.list_repeats_gaped == other.list_repeats_gaped:
572 | if self.list_spacers == other.list_spacers:
573 | return True
574 | return False
575 |
576 | def __ne__(self, other):
577 | return not self.__eq__(other)
578 |
--------------------------------------------------------------------------------
/components/components_helpers.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from os import listdir
4 | from os.path import isfile, join
5 |
6 |
7 | def process_string_from_header(input_string):
8 | # Define the function to replace based on the condition
9 | def replace_match(match):
10 | # If it's an integer, remove the dot and integer
11 | if match.group(2).isdigit():
12 | return match.group(1)
13 | # If it's not an integer, replace the dot with a hyphen
14 | return match.group(1) + "-" + match.group(2)
15 |
16 | # Use regex to find patterns with a dot followed by any characters
17 | result = re.sub(r'(\w+)\.(\w+)', replace_match, input_string)
18 | return result
19 |
20 | def multiline_fasta_check(file):
21 | with open(file, "r") as f:
22 | lines = f.readlines()
23 |         number_of_inputs = sum(1 for line in lines if line.startswith(">"))
24 | return number_of_inputs != 1
25 |
26 |
27 | def multiline_fasta_handle(file):
28 | base_name = str(os.path.basename(file).split(".")[0])
29 | try:
30 | os.mkdir(base_name)
31 | except OSError:
32 | pass
33 |
34 |     cmd = f"cat {file}"
35 |     cmd += " | awk '{ if (substr($0, 1, 1)==\">\") {"
36 |     cmd += "filename=(\"{}/\"".format(base_name)
37 |     cmd += "substr($0,2)\".fa\")} "
38 |     cmd += "print $0 > filename "
39 |     cmd += "}'"
40 |
41 | os.system(cmd)
42 |
43 | return base_name
44 |
45 |
46 | def multiline_fasta_handle_python(file, flag_ncbi_formatting=False):
47 | base_name = str(os.path.basename(file).split(".")[0])
48 | try:
49 | os.mkdir(base_name)
50 | except OSError:
51 | pass
52 |
53 | with open(file, "r") as f:
54 | lines = f.readlines()
55 |
56 | headers = []
57 | dna_sequences = []
58 |
59 | dna_sequence = ''
60 | for line in lines:
61 | if line:
62 |             if line.startswith(">"):
63 | if dna_sequence:
64 | dna_sequences.append(dna_sequence)
65 | dna_sequence = ''
66 | headers.append(line)
67 | else:
68 | dna_sequence += line.strip()
69 |
70 | if dna_sequence:
71 | dna_sequences.append(dna_sequence)
72 |
73 | if flag_ncbi_formatting:
74 |         for header, dna_sequence in zip(headers, dna_sequences):
75 |             new_header = header.strip().split(" ")[0]
76 |             new_header = process_string_from_header(new_header)
77 |             file_name = new_header.split(">")[1].replace(",", "-") \
78 |                 .replace(".", "-").replace(" ", "_").replace("|", "-") + ".fa"
79 |             with open(os.path.join(base_name, file_name), "w") as f:
80 |                 f.write(new_header)
81 |                 f.write("\n")
82 |                 f.write(dna_sequence)
83 | else:
84 | for header, dna_sequence in zip(headers, dna_sequences):
85 | file_name = header.strip().split(">")[1].replace(",", "_")\
86 | .replace(".", "_").replace(" ", "_").replace("|", "_") + ".fa"
87 | with open(os.path.join(base_name, file_name), "w") as f:
88 | f.write(header)
89 | f.write(dna_sequence)
90 |
91 | return base_name
92 |
93 |
94 | def folder_of_multifasta_handle(folder_multifasta):
95 | list_files = [f for f in listdir(folder_multifasta) if isfile(join(folder_multifasta, f))]
96 | all_lines_in_files = []
97 | for file in list_files:
98 | with open(os.path.join(folder_multifasta, file), "r") as f:
99 | lines = f.readlines()
100 | all_lines_in_files.append(lines)
101 | with open("multifasta_folder.fa", "w") as f:
102 | for lines in all_lines_in_files:
103 | for line in lines:
104 | f.write(line)
105 |
106 | multiline_fasta_handle_python("multifasta_folder.fa")
107 | return "multifasta_folder"
--------------------------------------------------------------------------------
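Note: a minimal usage sketch for the helpers above, assuming a multi-record FASTA
file at the illustrative path "input.fa":

    if multiline_fasta_check("input.fa"):
        # writes one <header>.fa file per record into a folder named after the
        # input file and returns that folder name
        folder = multiline_fasta_handle_python("input.fa", flag_ncbi_formatting=False)
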
/components/components_ml.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import joblib
3 | import sklearn.neighbors
4 | import sklearn.svm
5 | import sklearn.naive_bayes
6 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
7 | from sklearn.neural_network import MLPClassifier
8 |
9 |
7 | class ClassifierWrapper(object):
8 | def __init__(self, classifier_type, load_option=None, hyper_parameters=None):
9 | self.classifier_type = classifier_type
10 | self._hyper_parameters = hyper_parameters
11 | self._load_option = load_option
12 |
13 | self._init_classifier()
14 |
15 | def _init_classifier(self):
16 | if self._load_option:
17 | self._load_model()
18 | else:
19 | if self.classifier_type == 'k_near_neighbors':
20 |
21 | if not self._hyper_parameters:
22 | self.classifier = sklearn.neighbors.KNeighborsClassifier(n_neighbors=7)
23 | else:
24 | self.classifier = sklearn.neighbors.KNeighborsClassifier(**self._hyper_parameters)
25 |
26 | elif self.classifier_type == 'svm':
27 |
28 | if not self._hyper_parameters:
29 | self.classifier = sklearn.svm.SVC()
30 | else:
31 | self.classifier = sklearn.svm.SVC(**self._hyper_parameters)
32 |
33 | elif self.classifier_type == 'naive_bayes':
34 |
35 | if not self._hyper_parameters:
36 | self.classifier = sklearn.naive_bayes.GaussianNB()
37 | else:
38 | self.classifier = sklearn.naive_bayes.GaussianNB(**self._hyper_parameters)
39 |
40 | elif self.classifier_type == 'random_forest':
41 |
42 | if not self._hyper_parameters:
43 | self.classifier = RandomForestClassifier(max_depth=3, random_state=None)
44 | else:
45 | self.classifier = RandomForestClassifier(**self._hyper_parameters)
46 |
47 | elif self.classifier_type == 'neural_network':
48 |
49 | if not self._hyper_parameters:
50 | self.classifier = MLPClassifier(solver='lbfgs', alpha=1e-5,
51 | hidden_layer_sizes=(100, 100), random_state=None)
52 | else:
53 | self.classifier = MLPClassifier(**self._hyper_parameters)
54 |
55 | elif self.classifier_type == 'extra_trees':
56 |
57 | if not self._hyper_parameters:
58 | self.classifier = ExtraTreesClassifier(max_depth=4)
59 | else:
60 | self.classifier = ExtraTreesClassifier(**self._hyper_parameters)
61 |
62 | else:
63 | raise ValueError('Wrong classifier')
64 |
65 | def _load_model(self):
66 | self.classifier = joblib.load(self._load_option)
67 |
68 | def train_classifier(self, train_set_pos, train_set_neg):
69 | train_y_pos = np.ones(len(train_set_pos))
70 | train_y_neg = np.zeros(len(train_set_neg))
71 | train_y = np.concatenate([train_y_pos, train_y_neg])
72 | train_x = np.concatenate([train_set_pos, train_set_neg])
73 | self.classifier.fit(train_x, train_y)
74 |
75 | def test_classifier(self, test_set_pos, test_set_neg):
76 | if (test_set_pos is not None) and (test_set_neg is not None):
77 | test_set_y_pos = np.ones(len(test_set_pos))
78 | test_set_y_neg = np.zeros(len(test_set_neg))
79 | test_set_y = np.concatenate([test_set_y_pos, test_set_y_neg])
80 | test_set_x = np.concatenate([test_set_pos, test_set_neg])
81 |
82 | elif test_set_pos is not None:
83 | test_set_y = np.ones(len(test_set_pos))
84 | test_set_x = test_set_pos
85 |
86 | elif test_set_neg is not None:
87 | test_set_y = np.zeros(len(test_set_neg))
88 | test_set_x = test_set_neg
89 |
90 | else:
91 | raise ValueError
92 |
93 | predict = self.classifier.predict(test_set_x)
94 | dif = test_set_y - predict
95 | return 1 - np.count_nonzero(dif) / float(len(dif))
96 |
97 | def predict(self, dataset):
98 | return self.classifier.predict(dataset)
99 |
100 | def predict_proba(self, dataset):
101 | return self.classifier.predict_proba(dataset)
102 |
103 | def save_model(self, model_name_dot_pkl):
104 | joblib.dump(self.classifier, model_name_dot_pkl)
105 |
106 |
--------------------------------------------------------------------------------
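Note: a minimal sketch of loading a pre-trained model through ClassifierWrapper,
assuming one of the pickled models shipped under trained_models/extra_trees/ and
an already-built feature matrix X of shape (n_candidates, n_features); both the
path and X are illustrative:

    clf = ClassifierWrapper(classifier_type="extra_trees",
                            load_option="trained_models/extra_trees/extra_trees_subset8features.pkl")
    scores = clf.predict_proba(X)[:, 1]   # probability of the positive (CRISPR array) class
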
/components/module_detection.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | from multiprocessing import Pool
3 |
4 | from components.components_detection import VmatchRun
5 | from components.components_detection import ClusterMaker
6 | from components.components_detection import FilterApproximationClusters
7 | from components.components_detection import StartEndEnhancementClusters
8 | from components.components_detection import IntermediateEnhancementClusters
9 | from components.components_detection import ClusterSequence
10 | from components.components_detection import FuzzySearch
11 |
12 |
13 | class Detection:
14 | def __init__(self, file_path, flags, parameters, flag_dev_mode):
15 | self.file_path = file_path
16 | self.flags = flags
17 | self.parameters = parameters
18 | self.flag_parallel = flags["flag_parallel"]
19 | self.flag_cpu = flags["flag_cpu"]
20 | self.flag_fast_run = flags["flag_fast_run"]
21 | self.flag_enhancement_min_max = flags["flag_enhancement_min_max"]
22 | self.flag_enhancement_start_end = flags["flag_enhancement_start_end"]
24 | self.flag_dev_mode = flag_dev_mode
25 |
26 | self.clusters = []
27 | self.cluster_sequences = []
28 | self.dict_fuzzy_crisprs = {}
29 |
30 | self._get_complete_dna()
31 | self._run_cluster_detection()
32 | self._extract_cluster_sequences()
33 | self._run_array_detection()
34 |
35 | def _get_complete_dna(self):
36 | with open(self.file_path, 'r') as f:
37 | lines = f.readlines()
38 |
39 | self.input_header = lines[0]
40 | self.dna = ''.join([line.strip() for line in lines[1:]])
41 | self.dna_length = len(self.dna)
42 | self.dna = self.dna.upper()
43 |
44 | def _run_cluster_detection(self):
45 | vr = VmatchRun(self.file_path, self.flag_fast_run)
46 | list_repeats_from_vmatch = vr.output()
47 | #print("list vmatch repeats", list_repeats_from_vmatch)
48 |
49 | cm = ClusterMaker(list_repeats_from_vmatch, self.dna)
50 | self.clusters = cm.output()
51 |
52 | fa = FilterApproximationClusters(self.clusters)
53 | self.clusters = fa.output()
54 |
55 | st = StartEndEnhancementClusters(self.clusters)
56 | self.clusters = st.output()
57 |
58 | ie = IntermediateEnhancementClusters(self.clusters)
59 | self.clusters = ie.output()
60 |
61 | def _extract_cluster_sequences(self):
62 | for cluster in self.clusters:
63 | seq_start = max(0, cluster.begin - 100)
64 | seq_end = min(len(self.dna), cluster.end + 100)
65 | cluster_seq = self.dna[seq_start:seq_end]
66 | tup_cluster_dif_rep = tuple(cluster.list_clust_dif_rep_seq)
67 |
68 | self.cluster_sequences.append(ClusterSequence(cluster_seq, seq_start, seq_end, tup_cluster_dif_rep))
69 |
70 | @staticmethod
71 | def _parallel_run_fuzzy_run(input_tuple):
72 | repeat, sequence, start, weighted_error = input_tuple
73 |
74 | return FuzzySearch(sequence, start,
75 | repeat, weighted_error)
76 |
77 | def _run_array_detection(self):
78 | weighted_error = "{i<=3,d<=3,s<=3,i+d+s<=6}"
79 | parallel = self.flag_parallel
80 |
81 | if parallel:
82 | for cluster_sequence in self.cluster_sequences:
83 | nr = len(cluster_sequence.tuple_repeats)
84 | input_tuples = zip(cluster_sequence.tuple_repeats, [cluster_sequence.sequence] * nr,
85 | [cluster_sequence.start] * nr, [weighted_error] * nr)
86 |
87 |                 num_workers_suggested = multiprocessing.cpu_count() if self.flag_cpu == "ALL" else int(self.flag_cpu)
88 |                 max_possible = multiprocessing.cpu_count()
89 |                 num_workers = min(num_workers_suggested, max_possible)
90 | with Pool(num_workers) as p:
91 | fuzzy_results = p.map(self._parallel_run_fuzzy_run, input_tuples)
92 | fuzzy_results = [x for x in fuzzy_results if x.match_hit]
93 | fuzzy_results = [x for x in fuzzy_results if len(x.list_repeats) > 1]
94 |
95 | self.dict_fuzzy_crisprs[cluster_sequence] = fuzzy_results
96 | else:
97 | for cluster_sequence in self.cluster_sequences:
98 | list_fuzzy_results = []
99 | for repeat in cluster_sequence.tuple_repeats:
100 | fuzzy_s = FuzzySearch(cluster_sequence.sequence, cluster_sequence.start,
101 | repeat, weighted_error)
102 | if fuzzy_s.match_hit:
103 | if len(fuzzy_s.list_repeats) > 1:
104 | list_fuzzy_results.append(fuzzy_s)
105 |
106 | self.dict_fuzzy_crisprs[cluster_sequence] = list_fuzzy_results
107 |
108 | def output(self):
109 | return self.dict_fuzzy_crisprs
--------------------------------------------------------------------------------
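Note: the weighted_error string "{i<=3,d<=3,s<=3,i+d+s<=6}" in _run_array_detection
is written in the fuzzy-matching syntax of the third-party `regex` package (pinned
in environment.yml), which FuzzySearch presumably forwards to that engine; a
standalone sketch of the syntax:

    import regex

    # find occurrences of the repeat allowing at most one substitution
    hits = regex.findall(r"(?:GTTTCAGACG){s<=1}", "AAGTTTCAGACGTTTTGTTTCAGTCGAA")
    # -> ['GTTTCAGACG', 'GTTTCAGTCG']
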
/components/module_detection_refinement.py:
--------------------------------------------------------------------------------
1 | from components.components_detection_refinement import SameStartEndFilter
2 | from components.components_detection_refinement import AdvancedFuzzySearchFilter
3 | from components.components_detection_refinement import CrisprCandidate
4 |
5 |
6 | class DetectionRefinement:
7 | def __init__(self, dict_fuzzy_crisprs, parameters, flag_dev_mode):
8 | self.dict_fuzzy_crisprs = dict_fuzzy_crisprs
9 | self.parameters = parameters
10 | self.flag_dev_mode = flag_dev_mode
11 | self.dict_fuzzy_crisprs_refined_st_end = {}
12 | self.dict_fuzzy_crisprs_fully_refined = {}
13 |
14 | self._filter_out_same_start_end_cases()
15 | self._filter_out_non_crispr_cases()
16 | self._reformat_ac_crispr_candidates()
17 |
18 | def _filter_out_same_start_end_cases(self):
19 | ssef = SameStartEndFilter(self.dict_fuzzy_crisprs)
20 | self.dict_fuzzy_crisprs_refined_st_end = ssef.output()
21 |
22 | def _filter_out_non_crispr_cases(self):
23 | self.param_min_avg_repeat_length = self.parameters["param_min_avg_repeat_length"]
24 | self.param_max_avg_repeat_length = self.parameters["param_max_avg_repeat_length"]
25 | self.param_min_avg_spacer_length = self.parameters["param_min_avg_spacer_length"]
26 | self.param_max_avg_spacer_length = self.parameters["param_max_avg_spacer_length"]
27 | self.param_min_repeats = self.parameters["param_min_repeats"]
28 | self.param_max_identical_spacers = self.parameters["param_max_identical_spacers"]
29 | self.param_max_identical_cluster_spacers = self.parameters["param_max_identical_cluster_spacers"]
30 |
31 | afsf = AdvancedFuzzySearchFilter(min_column_dominance_repeat=0.6,
32 | max_spacer_length=140, max_column_dominance_spacer=0.8,
33 | max_allowed_consecutive_spacers=self.param_max_identical_cluster_spacers,
34 | max_allowed_same_spacers=self.param_max_identical_spacers,
35 | max_inconsistent_columns=5,
36 | min_avg_repeat_length=self.param_min_avg_repeat_length,
37 | max_avg_repeat_length=self.param_max_avg_repeat_length,
38 | min_avg_spacer_length=self.param_min_avg_spacer_length,
39 | max_avg_spacer_length=self.param_max_avg_spacer_length,
40 | min_repeats=self.param_min_repeats)
41 |
42 | for key, values in self.dict_fuzzy_crisprs_refined_st_end.items():
43 | list_filtered_advanced = [afsf(value) for value in values]
44 | list_filtered_advanced = [x for x in list_filtered_advanced if x]
45 | if not list_filtered_advanced:
46 | sorted_by_num_errors = sorted(list(values), key=lambda x: x.number_errors)
47 | if sorted_by_num_errors:
48 | candidate_fewer_mismatches = sorted_by_num_errors[0]
49 | self.dict_fuzzy_crisprs_fully_refined[key] = [candidate_fewer_mismatches]
50 | else:
51 | self.dict_fuzzy_crisprs_fully_refined[key] = list_filtered_advanced
52 |
53 | def _reformat_ac_crispr_candidates(self):
54 | self.dict_crispr_candidates = {}
55 | for key, list_fuzzy in self.dict_fuzzy_crisprs_fully_refined.items():
56 | new_key = (key.start, key.end)
57 | list_crispr_candidates = [CrisprCandidate(fuzzy.list_repeats, fuzzy.list_gaped_repeats,
58 | fuzzy.list_spacers, fuzzy.list_absolute_start)
59 | for fuzzy in list_fuzzy]
60 |
61 | self.dict_crispr_candidates[new_key] = list_crispr_candidates
62 |
63 | def output(self):
64 | return self.dict_crispr_candidates
65 |
--------------------------------------------------------------------------------
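Note: the AdvancedFuzzySearchFilter construction above is repeated verbatim in
module_evaluation.py and module_evaluated_arrays_enhancement.py; a hedged
refactoring sketch that would centralise the shared configuration (the helper
name is hypothetical):

    def build_default_afsf(parameters):
        # single place for the hard-coded thresholds and the user-facing parameters
        return AdvancedFuzzySearchFilter(
            min_column_dominance_repeat=0.6,
            max_spacer_length=140, max_column_dominance_spacer=0.8,
            max_allowed_consecutive_spacers=parameters["param_max_identical_cluster_spacers"],
            max_allowed_same_spacers=parameters["param_max_identical_spacers"],
            max_inconsistent_columns=5,
            min_avg_repeat_length=parameters["param_min_avg_repeat_length"],
            max_avg_repeat_length=parameters["param_max_avg_repeat_length"],
            min_avg_spacer_length=parameters["param_min_avg_spacer_length"],
            max_avg_spacer_length=parameters["param_max_avg_spacer_length"],
            min_repeats=parameters["param_min_repeats"])
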
/components/module_evaluated_arrays_enhancement.py:
--------------------------------------------------------------------------------
1 | from os.path import basename
2 | from components.components_evaluated_arrays_enhancement import IterativeDegeneratedSearch
3 | from components.components_evaluated_arrays_enhancement import create_boundaries_for_intervals
4 | from components.components_evaluated_arrays_enhancement import ArrayRefinerInsertionsDeletions
5 | from components.components_detection_refinement import AdvancedFuzzySearchFilter
6 |
7 |
8 | class EvaluatedArraysEnhancement:
9 | def __init__(self, file_path, categories, parameters, flag_dev_mode):
10 | self.file_path = file_path
11 | self.categories = categories
12 | self.parameters = parameters
13 | self.flag_dev_mode = flag_dev_mode
14 |
15 | self.bona_fide_arrays = categories[0]
16 | self.alternative_arrays = categories[1]
17 | self.possible_arrays = categories[2]
18 |
19 | self.dict_arrays_into_categories_enhanced = {}
20 |
21 | self._get_complete_dna()
22 | self._search_missed_or_degenerated_repeats()
23 | self._refine_nucleotides_repeat_spacer()
24 | self._filter_enhanced()
25 |
26 | def _get_complete_dna(self):
27 | with open(self.file_path, 'r') as f:
28 | lines = f.readlines()
29 |
30 | self.input_header = lines[0]
31 | self.dna = ''.join([line.strip() for line in lines[1:]])
32 | self.dna_length = len(self.dna)
33 | self.dna = self.dna.upper()
34 |
35 | def _search_missed_or_degenerated_repeats(self):
36 | for category in [self.bona_fide_arrays, self.alternative_arrays, self.possible_arrays]:
37 | intervals = []
38 | arrays_for_intervals = []
39 |
40 | for interval, list_data in category.items():
41 | intervals.append(interval)
42 | arrays_for_intervals.append([el[1] for el in list_data])
43 |
44 | boundaries = create_boundaries_for_intervals(intervals, 500)
45 |
46 | for interval, arrays_in_interval, boundary in zip(intervals, arrays_for_intervals, boundaries):
47 | for array_index, array in enumerate(arrays_in_interval):
48 | consensus = array.consensus
49 | list_repeats = array.list_repeats
50 | list_repeats_starts = array.list_repeat_starts
51 | list_spacers = array.list_spacers
52 |
54 | ids = IterativeDegeneratedSearch(full_dna=self.dna,
55 | repeat_seq_candidate=consensus,
56 | spacer_margin=self.parameters["param_spacer_margin_degenerated_search"],
57 | repeat_seq_candidate_gaped=None,
58 | list_repeats_starts=list_repeats_starts,
59 | list_repeats=list_repeats,
60 | list_spacers=list_spacers,
61 | start_flanking_region_left=boundary[0],
62 | end_flanking_region_right=boundary[1],
63 | allowed_max_editing_distance=self.parameters["param_max_edit_distance"],
64 | iterative_size_flanking_region=150,
65 | prevent_long_spacers=True,
66 | attempt_to_improve_initial_array=True)
67 |
68 | new_crispr_candidate = ids.output()
69 |
70 | if self.flag_dev_mode:
71 | if array != new_crispr_candidate:
72 | with open("log.txt", "a") as f:
73 | acc_num = basename(self.file_path).split(".")[0]
74 |                             f.write(f"Iterative degenerated search {acc_num}\n")
75 | f.write(array.dot_repr())
76 | f.write("\n\n")
77 | f.write(new_crispr_candidate.dot_repr())
78 | f.write("\n\n")
79 |
80 | """except Exception:
81 | new_crispr_candidate = array
82 |
83 | if self.flag_dev_mode:
84 | with open("log_error.txt", "a") as f:
85 | acc_num = basename(self.file_path).split(".")[0]
86 |                     f.write(f"Iterative degenerated search error {acc_num}\n")
87 | f.write(array.dot_repr())
88 | f.write("\n\n")"""
89 |
90 | category[interval][array_index][1] = new_crispr_candidate
91 |
92 | def _refine_nucleotides_repeat_spacer(self):
93 | for category in [self.bona_fide_arrays, self.alternative_arrays, self.possible_arrays]:
94 | for interval, list_data in category.items():
95 | arrays = [el[1] for el in list_data]
96 | for array_index, array in enumerate(arrays):
97 | try:
98 | arid = ArrayRefinerInsertionsDeletions(array)
99 | new_crispr_candidate = arid.output()
100 |
101 | if self.flag_dev_mode:
102 | if array != new_crispr_candidate:
103 | with open("log.txt", "a") as f:
104 | acc_num = basename(self.file_path).split(".")[0]
105 | f.write(f"Array refinement {acc_num}\n")
106 | f.write(array.dot_repr())
107 | f.write("\n\n")
108 | f.write(new_crispr_candidate.dot_repr())
109 | f.write("\n\n")
110 |
111 | except Exception:
112 | new_crispr_candidate = array
113 |
114 | if self.flag_dev_mode:
115 | with open("log_error.txt", "a") as f:
116 | acc_num = basename(self.file_path).split(".")[0]
117 | f.write(f"Array refinement error {acc_num}\n")
118 | f.write(array.dot_repr())
119 | f.write("\n\n")
120 |
121 | category[interval][array_index][1] = new_crispr_candidate
122 |
123 | def _filter_enhanced(self):
124 | self.param_min_avg_repeat_length = self.parameters["param_min_avg_repeat_length"]
125 | self.param_max_avg_repeat_length = self.parameters["param_max_avg_repeat_length"]
126 | self.param_min_avg_spacer_length = self.parameters["param_min_avg_spacer_length"]
127 | self.param_max_avg_spacer_length = self.parameters["param_max_avg_spacer_length"]
128 | self.param_min_repeats = self.parameters["param_min_repeats"]
129 | self.param_max_identical_spacers = self.parameters["param_max_identical_spacers"]
130 | self.param_max_identical_cluster_spacers = self.parameters["param_max_identical_cluster_spacers"]
131 |
132 | afsf = AdvancedFuzzySearchFilter(min_column_dominance_repeat=0.6,
133 | max_spacer_length=140, max_column_dominance_spacer=0.8,
134 | max_allowed_consecutive_spacers=self.param_max_identical_cluster_spacers,
135 | max_allowed_same_spacers=self.param_max_identical_spacers,
136 | max_inconsistent_columns=5,
137 | min_avg_repeat_length=self.param_min_avg_repeat_length,
138 | max_avg_repeat_length=self.param_max_avg_repeat_length,
139 | min_avg_spacer_length=self.param_min_avg_spacer_length,
140 | max_avg_spacer_length=self.param_max_avg_spacer_length,
141 | min_repeats=self.param_min_repeats)
142 |
143 | bona_fide_not_filtered = self.categories[0]
144 | alternative_not_filtered = self.categories[1]
145 | possible_not_filtered = self.categories[2]
146 | low_score = self.categories[4]
147 |
148 | bona_fide_filtered = {}
149 | alternative_filtered = {}
150 | possible_filtered = {}
151 |
152 | for not_filtered_category, filtered_category in zip([bona_fide_not_filtered, alternative_not_filtered, possible_not_filtered],
153 | [bona_fide_filtered, alternative_filtered, possible_filtered]):
154 | for key, value in not_filtered_category.items():
155 | for crispr_tuple in value:
156 | crispr = crispr_tuple[1]
157 | if not afsf(crispr):
158 | if key in low_score:
159 | low_score[key].append(crispr_tuple)
160 | else:
161 | low_score[key] = [crispr_tuple]
162 | else:
163 | if key not in filtered_category:
164 | filtered_category[key] = [crispr_tuple]
165 | else:
166 | filtered_category[key].append(crispr_tuple)
167 |
168 | self.categories[0] = bona_fide_filtered
169 | self.categories[1] = alternative_filtered
170 | self.categories[2] = possible_filtered
171 | self.categories[4] = low_score
172 |
173 | def output(self):
174 | return self.categories
175 |
--------------------------------------------------------------------------------
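Note: the `categories` list indexed above follows the order produced by
ArrayEvaluation.output() in module_evaluation.py:

    categories[0]  # bona fide arrays
    categories[1]  # alternative arrays
    categories[2]  # possible arrays
    categories[3]  # possible but discarded arrays (not re-filtered here)
    categories[4]  # low-score arrays (extended by _filter_enhanced)
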
/components/module_evaluation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from components.components_evaluation import BulkFeatureExtractor
4 | from components.components_evaluation import FeatureExtractor
5 | from components.components_evaluation import get_full_vector
6 | from components.components_detection_refinement import AdvancedFuzzySearchFilter
7 |
8 |
9 | class ArrayEvaluation:
10 | def __init__(self, dict_crispr_array_candidates, list_ml_classifiers, list_features, parameters, flag_dev_mode):
11 | self.dict_crispr_array_candidates = dict_crispr_array_candidates
12 | self.list_ml_classifiers = list_ml_classifiers
13 | self.list_features = list_features
14 | self.parameters = parameters
15 | self.flag_dev_mode = flag_dev_mode
16 |
17 | self.dict_scored_result = {}
18 | self.dict_scored_result_with_all_vectors = {}
19 |
20 | self.dict_bona_fide = {}
21 | self.dict_alternative = {}
22 | self.dict_possible = {}
23 | self.dict_possible_discarded = {}
24 | self.dict_low_score = {}
25 |
26 | self._load_filter()
27 | self._extract_features_and_evaluate()
28 | self._split_into_categories()
29 |
30 | def _load_filter(self):
31 | self.param_min_avg_repeat_length = self.parameters["param_min_avg_repeat_length"]
32 | self.param_max_avg_repeat_length = self.parameters["param_max_avg_repeat_length"]
33 | self.param_min_avg_spacer_length = self.parameters["param_min_avg_spacer_length"]
34 | self.param_max_avg_spacer_length = self.parameters["param_max_avg_spacer_length"]
35 | self.param_min_repeats = self.parameters["param_min_repeats"]
36 | self.param_max_identical_spacers = self.parameters["param_max_identical_spacers"]
37 | self.param_max_identical_cluster_spacers = self.parameters["param_max_identical_cluster_spacers"]
38 |         self.afsf = AdvancedFuzzySearchFilter(min_column_dominance_repeat=0.6,
39 | max_spacer_length=140, max_column_dominance_spacer=0.8,
40 | max_allowed_consecutive_spacers=self.param_max_identical_cluster_spacers,
41 | max_allowed_same_spacers=self.param_max_identical_spacers,
42 | max_inconsistent_columns=5,
43 | min_avg_repeat_length=self.param_min_avg_repeat_length,
44 | max_avg_repeat_length=self.param_max_avg_repeat_length,
45 | min_avg_spacer_length=self.param_min_avg_spacer_length,
46 | max_avg_spacer_length=self.param_max_avg_spacer_length,
47 | min_repeats=self.param_min_repeats)
48 |
49 | def _extract_features_and_evaluate(self):
50 | bfe = BulkFeatureExtractor(self.dict_crispr_array_candidates)
51 | results = bfe.output()
52 | blast_results, orf_results, hmm_results, mfe_results = results
53 | blast_scores_1, blast_scores_2 = blast_results
54 |
55 | list_features = ['repeat_len', 'number_repeats', 'repeat_similarity',
56 | 'at_richness', 'avg_spacer_len', 'spacer_similarity',
57 | 'number_mismatches', 'spacer_evenness']
58 |
59 | for key, list_crispr_candidates in self.dict_crispr_array_candidates.items():
60 | self.dict_scored_result[key] = []
61 | self.dict_scored_result_with_all_vectors[key] = []
62 | for index, crispr_candidate in enumerate(list_crispr_candidates):
63 | final_score = 0
64 |
65 | feature_vector = FeatureExtractor(0, crispr_candidate, list_features).extract()[0]
66 |
67 | mfe = mfe_results[key][index]
68 | orf = orf_results[key][index]
69 | hmmr = hmm_results[key][index]
70 | blast1 = blast_scores_1[key][index]
71 | blast2 = blast_scores_2[key][index]
72 |
73 | feature_vector_8_incomplete = feature_vector[np.array([2, 4, 5, 6, 7])]
74 | rest_8 = np.asarray([mfe, orf, blast1])
75 | feature_vector_8 = np.concatenate((feature_vector_8_incomplete, rest_8))
76 | feature_vector_8 = feature_vector_8.reshape(1, -1)
77 |
78 | feature_vector_9_incomplete = feature_vector[np.array([1, 2, 4, 5, 7])]
79 | rest_9 = np.asarray([mfe, orf, hmmr, blast2])
80 | feature_vector_9 = np.concatenate((feature_vector_9_incomplete, rest_9))
81 | feature_vector_9 = feature_vector_9.reshape(1, -1)
82 |
83 | feature_vector_10_incomplete = feature_vector[np.array([0, 2, 3, 4, 5, 6, 7])]
84 | rest_10 = np.asarray([hmmr, blast1, blast2])
85 | feature_vector_10 = np.concatenate((feature_vector_10_incomplete, rest_10))
86 | feature_vector_10 = feature_vector_10.reshape(1, -1)
87 |
88 | dict_feature_vectors = {8: feature_vector_8,
89 | 9: feature_vector_9,
90 | 10: feature_vector_10}
91 |
92 | feature_vectors = []
93 | for ml_classifier, feature_names in zip(self.list_ml_classifiers, self.list_features):
94 | len_features = len(feature_names)
95 | feature_vector = dict_feature_vectors[len_features]
96 | feature_vectors.append(feature_vector)
97 | final_score += ml_classifier.predict_proba(feature_vector)[0][1]
98 |
99 | final_score = final_score / len(self.list_ml_classifiers)
100 | score_crispr_candidate_feature_list = [final_score, crispr_candidate, feature_vectors]
101 | self.dict_scored_result[key].append(score_crispr_candidate_feature_list)
102 |
103 | all_feature_vectors = [feature_vector_8, feature_vector_9, feature_vector_10]
104 | score_crispr_candidate_all_feature_tuple = final_score, crispr_candidate, all_feature_vectors
105 | self.dict_scored_result_with_all_vectors[key].append(score_crispr_candidate_all_feature_tuple)
106 |
107 | def _split_into_categories(self):
108 | for key, data in self.dict_scored_result.items():
109 | data_pre_possible = [candidate for candidate in data if 0.75 > candidate[0] >= 0.5]
110 | data_alternative = [candidate for candidate in data if candidate[0] >= 0.75]
111 | data_alternative_filtered = []
112 | data_bad = [candidate for candidate in data if candidate[0] < 0.5]
113 |
114 | if data_alternative:
115 | for element in data_alternative:
116 | crispr = element[1]
117 | if self.afsf(crispr):
118 | data_alternative_filtered.append(element)
119 | else:
120 | data_bad.append(element)
121 |
122 | if data_alternative_filtered:
123 | data_alternative_filtered = sorted(data_alternative_filtered, key=lambda x: x[0], reverse=True)
124 | best_candidate = data_alternative_filtered[0]
125 | data_alternative_filtered = data_alternative_filtered[1:]
126 |
127 | self.dict_bona_fide[key] = [best_candidate]
128 | if data_alternative_filtered:
129 | self.dict_alternative[key] = data_alternative_filtered
130 |
131 | if data_pre_possible:
132 | if key in self.dict_bona_fide:
133 | data_show_in_alternative = [candidate for candidate in data_pre_possible if candidate[0] >= 0.6]
134 | if data_show_in_alternative:
135 | data_show_in_alternative_filtered = []
136 | for element in data_show_in_alternative:
137 | crispr = element[1]
138 | if self.afsf(crispr):
139 | data_show_in_alternative_filtered.append(element)
140 | else:
141 | data_bad.append(element)
142 |
143 | if key in self.dict_alternative:
144 | self.dict_alternative[key] += data_show_in_alternative_filtered
145 | else:
146 | self.dict_alternative[key] = data_show_in_alternative_filtered
147 |
148 | else:
149 | data_pre_possible = sorted(data_pre_possible, key=lambda x: x[0], reverse=True)
150 | best_possible_candidate = data_pre_possible[0]
151 | possible_discarded = data_pre_possible[1:]
152 |
153 | if self.afsf(best_possible_candidate[1]):
154 | self.dict_possible[key] = [best_possible_candidate]
155 | else:
156 | data_bad.append(best_possible_candidate)
157 |
158 | if possible_discarded:
159 | self.dict_possible_discarded[key] = possible_discarded
160 |
161 | if data_bad:
162 | self.dict_low_score[key] = data_bad
163 |
164 | def _split_into_categories_with_additional_classifier(self):
166 | for key, data in self.dict_scored_result_with_all_vectors.items():
167 | data_pre_possible = [candidate for candidate in data if 0.75 > candidate[0] >= 0.5]
168 | data_alternative = [candidate for candidate in data if candidate[0] >= 0.75]
169 | data_alternative_filtered = []
170 | data_bad = [candidate for candidate in data if candidate[0] < 0.5]
171 |
172 | if data_bad:
173 | self.dict_low_score[key] = data_bad
174 |
175 | if self.flag_possible_differential_model == "possible":
176 | if data_alternative:
177 | data_alternative = sorted(data_alternative, key=lambda x: x[0], reverse=True)
178 | best_candidate = data_alternative[0]
179 | data_alternative = data_alternative[1:]
180 |
181 | self.dict_bona_fide[key] = best_candidate
182 | if data_alternative:
183 | self.dict_alternative[key] = data_alternative
184 | else:
185 | if data_alternative:
186 | for element in data_alternative:
187 | crispr = element[1]
188 | if self.afsf(crispr):
189 | data_alternative_filtered.append(element)
190 | else:
191 | data_pre_possible.append(element)
192 |
193 | data_alternative_filtered = sorted(data_alternative_filtered, key=lambda x: x[0], reverse=True)
194 | best_candidate = data_alternative_filtered[0]
195 | data_alternative_filtered = data_alternative_filtered[1:]
196 |
197 | self.dict_bona_fide[key] = [best_candidate]
198 | if data_alternative_filtered:
199 | self.dict_alternative[key] = data_alternative_filtered
200 | data_alternative = sorted(data_alternative, key=lambda x: x[0], reverse=True)
201 | best_candidate_prev_model = data_alternative[0]
202 | data_alternative_prev_model = data_alternative[1:]
203 |
204 | vectors_alternative = [get_full_vector(data[2]) for data in data_alternative]
205 | scores_new_model = [self.possible_differentiate_model.predict_proba(v)[0][1] for v in
206 | vectors_alternative]
207 |
208 | scores_new_model, data_alternative_sorted = zip(*sorted(zip(scores_new_model, data_alternative),
209 | key=lambda x: x[0], reverse=True))
210 |
211 | best_candidate = data_alternative_sorted[0]
212 | best_score = scores_new_model[0]
213 | label = 1.0 if best_score >= 0.5 else 0.0
214 |
215 | if label == 1.0:
216 | self.dict_bona_fide[key] = [best_candidate]
217 | alternative = data_alternative_sorted[1:]
218 | if alternative:
219 | self.dict_alternative[key] = alternative
220 | else:
221 | self.dict_bona_fide[key] = [best_candidate_prev_model]
222 | if data_alternative_prev_model:
223 | self.dict_alternative[key] = data_alternative_prev_model
224 |
225 | if data_pre_possible:
226 | data_pre_possible = sorted(data_pre_possible, key=lambda x: x[0], reverse=True)
227 |
228 | vectors_pre_possible = [get_full_vector(data[2]) for data in data_pre_possible]
229 | scores_new_model = [self.possible_differentiate_model.predict_proba(v)[0][1] for v in vectors_pre_possible]
230 |
231 | scores_new_model, data_pre_possible_sorted = zip(*sorted(zip(scores_new_model, data_pre_possible),
232 | key=lambda x: x[0], reverse=True))
233 |
234 | best_possible_candidate = data_pre_possible_sorted[0]
235 | best_score = scores_new_model[0]
236 | label = 1.0 if best_score >= 0.5 else 0.0
237 |
238 | if label == 1.0:
239 | self.dict_possible[key] = [best_possible_candidate]
240 | possible_discarded = data_pre_possible_sorted[1:]
241 | self.dict_possible_discarded[key] = possible_discarded
242 | else:
243 | possible_discarded = data_pre_possible_sorted
244 | self.dict_possible_discarded[key] = possible_discarded
245 |
246 | def output(self):
247 | return [self.dict_bona_fide, self.dict_alternative, self.dict_possible,
248 | self.dict_possible_discarded, self.dict_low_score]
249 |
250 |
--------------------------------------------------------------------------------
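Note: a minimal sketch of the scoring rule in _extract_features_and_evaluate: the
final score is the mean positive-class probability over all loaded classifiers,
which _split_into_categories then bins at 0.75 and 0.5 (the names below are
illustrative):

    probs = [clf.predict_proba(vec)[0][1] for clf, vec in zip(classifiers, feature_vectors)]
    final_score = sum(probs) / len(probs)
    # score >= 0.75        -> bona fide (best per locus) or alternative (the rest)
    # 0.5 <= score < 0.75  -> possible / possible discarded
    # score < 0.5          -> low score

Also note that _split_into_categories_with_additional_classifier references
self.flag_possible_differential_model and self.possible_differentiate_model,
which are never set in __init__; that code path appears unused by Pipeline.
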
/components/module_non_array_computations.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | from components.components_non_array_computations import StrandComputation
4 | from components.components_non_array_computations import StrandComputationNew
5 | from components.components_non_array_computations import FullISElementSearch
6 | from components.components_non_array_computations import complete_info_with_cas_identifier
7 | from components.components_non_array_computations import FullLeaderSeqSearch
8 | from components.components_non_array_computations import RevComComputation
9 |
10 |
11 | class NonArrayComputations:
12 | def __init__(self, file_path, categories, flags_non_arrays_computations, flag_dev_mode, absolute_directory_path):
13 | self.file_path = file_path
14 | self.categories = categories
15 | self.flags_non_arrays_computations = flags_non_arrays_computations
16 |         self.flag_dev_mode = flag_dev_mode
17 | self.absolute_directory_path = absolute_directory_path
18 |
19 | self.list_of_crisprs_bona_fide = [self.categories[0][key][0][1] for key in sorted(self.categories[0].keys())]
20 | self.list_of_crisprs_alternative = [el[1] for key in self.categories[1].keys()
21 | for el in self.categories[1][key]]
22 | self.list_of_crisprs_possible = [el[1] for key in self.categories[2].keys()
23 | for el in self.categories[2][key]]
24 |
25 | self.hmm_model_is_elements = "tools/hmm_search/models_is_element.hmm"
26 |
27 | self.is_element_result = {}
28 | self.cas_results = {}
29 |         self.cassette_results = {}
30 | self.unstructured_cas_result_from_cas_identifier = {}
31 | self.strand_results = {}
32 | self.leader_results = {}
33 | self.downstream_results = {}
34 | self.data_with_all_computations = {}
35 |
36 | self._get_complete_dna()
37 | self._calculate_all_non_array_values()
38 |
39 | def _get_complete_dna(self):
40 | with open(self.file_path, 'r') as f:
41 | lines = f.readlines()
42 |
43 | self.input_header = lines[0]
44 | self.dna = ''.join([line.strip() for line in lines[1:]])
45 | self.dna_length = len(self.dna)
46 | self.dna = self.dna.upper()
47 |
48 | def _calculate_all_non_array_values(self):
49 | self._calculate_strand()
50 | self._calculate_leader()
51 |
52 | if self.flags_non_arrays_computations["flag_cas"]:
53 | self._calculate_cas_proteins()
54 | if self.flags_non_arrays_computations["flag_is"]:
55 | self._calculate_is_elements()
56 |
57 | self.data_with_all_computations = {"IS": self.is_element_result,
58 | "Cas": self.cas_results,
59 | "Strand": self.strand_results,
60 | "Leader": [self.leader_results_bona_fide, self.leader_results_alternative, self.leader_results_possible],
61 | "Downstream": [self.downstream_results_bona_fide, self.downstream_results_alternative, self.downstream_results_possible],
62 |                                            "Unstructured_Cas": self.unstructured_cas_result_from_cas_identifier,
63 |                                            "Cassettes": self.cassette_results}
64 |
65 | def _calculate_is_elements(self):
66 | fies = FullISElementSearch(full_dna=self.dna, list_of_crisprs=self.list_of_crisprs_bona_fide,
67 | hmm_model=self.hmm_model_is_elements, min_similarity=0.9, min_coverage=0.9)
68 |
69 | self.is_element_result = fies.output()
70 |
71 | def _calculate_cas_proteins(self):
72 | def _get_crispr_intervals():
73 | intervals = [(x.compute_stats()["start"], x.compute_stats()["end"]) for x in self.list_of_crisprs_bona_fide]
74 | return intervals
75 |
76 | def _filter_cas_genes(intervals, dict_cas_genes):
77 | dict_filtered_cas_intervals = {}
78 | for key, value in dict_cas_genes.items():
79 | for interval in intervals:
80 | if interval[0] <= key[0] < interval[1]:
81 | break
82 | if interval[0] <= key[1] < interval[1]:
83 | break
84 | else:
85 | dict_filtered_cas_intervals[key] = value
86 |
87 | return dict_filtered_cas_intervals
88 |
89 | def _cluster_cas_genes(dict_cas_genes):
90 | list_clusters = []
91 | cluster = []
92 | for key in sorted(dict_cas_genes.keys()):
93 | value = dict_cas_genes[key]
94 | new_candidate = key[0], key[1], value
95 | if not cluster:
96 | cluster.append(new_candidate)
97 | elif abs(cluster[-1][1] - new_candidate[0]) < 500:
98 | cluster.append(new_candidate)
99 | else:
100 | list_clusters.append(cluster)
101 | cluster = [new_candidate]
102 |
103 | if cluster:
104 | list_clusters.append(cluster)
105 |
106 | return list_clusters
107 |
108 | def _clusters_to_simple_representation(list_clusters):
109 | list_simple_clusters = []
110 | for cluster in list_clusters:
111 | cluster_start = cluster[0][0]
112 | cluster_end = cluster[-1][1]
113 | list_cas_gene_descriptions = [x[2] for x in cluster]
114 | list_simple_clusters.append((cluster_start, cluster_end, list_cas_gene_descriptions))
115 | return list_simple_clusters
116 |
117 |         def _compute_allowed_intervals(crispr_intervals):
118 |             allowed_intervals = []
119 |             if not crispr_intervals:
120 |                 return [(0, math.inf)]
121 |             else:
122 |                 allowed_intervals.append((0, crispr_intervals[0][0]))
123 |                 for index in range(len(crispr_intervals) - 1):
124 |                     allowed_intervals.append((crispr_intervals[index][1], crispr_intervals[index+1][0]))
125 |                 allowed_intervals.append((crispr_intervals[-1][1], math.inf))
126 |                 return allowed_intervals
127 |
128 | def _group_by_output(allowed_intervals, list_simple_clusters):
129 | dict_cas_gene_order = {}
130 | for cluster in list_simple_clusters:
131 | for index, allowed_interval in enumerate(allowed_intervals):
132 | if allowed_interval[0] <= cluster[0] < allowed_interval[1]:
133 | if index in dict_cas_gene_order:
134 | dict_cas_gene_order[index].append(cluster)
135 | else:
136 | dict_cas_gene_order[index] = [cluster]
137 | break
138 | return dict_cas_gene_order
139 |
140 | def _group_by_output_separated(allowed_intervals, regular_clusters):
141 | dict_cas_gene_order_for_separated = {}
142 | for cluster in regular_clusters:
143 | for index, allowed_interval in enumerate(allowed_intervals):
144 | if allowed_interval[0] <= cluster[0][0] < allowed_interval[1]:
145 | if index in dict_cas_gene_order_for_separated:
146 | dict_cas_gene_order_for_separated[index].append(cluster)
147 | else:
148 | dict_cas_gene_order_for_separated[index] = [cluster]
149 | break
150 | return dict_cas_gene_order_for_separated
151 |
152 |         dict_cas_genes, dict_cassette_labels = complete_info_with_cas_identifier(self.file_path,
153 |                                                                                  self.absolute_directory_path)
154 |
155 |         self.cassette_results = dict_cassette_labels
156 | self.unstructured_cas_result_from_cas_identifier = dict_cas_genes
157 |
158 | intervals = _get_crispr_intervals()
159 | allowed_intervals = _compute_allowed_intervals(intervals)
160 | dict_filtered_cas_genes = _filter_cas_genes(intervals, dict_cas_genes)
161 | clustered_cas_genes = _cluster_cas_genes(dict_filtered_cas_genes)
162 |
163 | simple_clusters = _clusters_to_simple_representation(clustered_cas_genes)
164 | dict_groups = _group_by_output(allowed_intervals, simple_clusters)
165 | #dict_groups_separated = _group_by_output_separated(allowed_intervals, clustered_cas_genes)
166 |
167 | self.cas_results = dict_groups
168 |
169 | def _calculate_strand(self):
170 | if self.flags_non_arrays_computations["flag_strand"]:
171 | st = StrandComputationNew(list_of_crisprs=self.list_of_crisprs_bona_fide,
172 | absolute_directory_path=self.absolute_directory_path)
173 | self.strand_results["Bona-fide"] = st.output()
174 | st = StrandComputationNew(list_of_crisprs=self.list_of_crisprs_alternative,
175 | absolute_directory_path=self.absolute_directory_path)
176 | self.strand_results["Alternative"] = st.output()
177 | st = StrandComputationNew(list_of_crisprs=self.list_of_crisprs_possible,
178 | absolute_directory_path=self.absolute_directory_path)
179 | self.strand_results["Possible"] = st.output()
180 |
181 |
182 | #except Exception:
183 | # st = StrandComputation(list_of_crisprs=self.list_of_crisprs_bona_fide,
184 | # absolute_directory_path=self.absolute_directory_path)
185 | # self.strand_results["Bona-fide"] = st.output()
186 | # st = StrandComputation(list_of_crisprs=self.list_of_crisprs_alternative,
187 | # absolute_directory_path=self.absolute_directory_path)
188 | # self.strand_results["Alternative"] = st.output()
189 | # st = StrandComputation(list_of_crisprs=self.list_of_crisprs_possible,
190 | # absolute_directory_path=self.absolute_directory_path)
191 | # self.strand_results["Possible"] = st.output()
192 | else:
193 | self.strand_results["Bona-fide"] = {index: "Forward (Orientation was not computed)"
194 | for index in range(len(self.list_of_crisprs_bona_fide))}
195 | self.strand_results["Alternative"] = {index: "Forward (Orientation was not computed)"
196 | for index in range(len(self.list_of_crisprs_alternative))}
197 | self.strand_results["Possible"] = {index: "Forward (Orientation was not computed)"
198 | for index in range(len(self.list_of_crisprs_possible))}
199 |
200 | def _calculate_leader(self):
201 | flss_bona_fide = FullLeaderSeqSearch(self.list_of_crisprs_bona_fide, self.strand_results["Bona-fide"], self.dna)
202 | self.leader_results_bona_fide, self.downstream_results_bona_fide = flss_bona_fide.output()
203 |
204 | flss_alternative = FullLeaderSeqSearch(self.list_of_crisprs_alternative, self.strand_results["Alternative"],
205 | self.dna)
206 | self.leader_results_alternative, self.downstream_results_alternative = flss_alternative.output()
207 |
208 | flss_possible = FullLeaderSeqSearch(self.list_of_crisprs_possible, self.strand_results["Possible"], self.dna)
209 | self.leader_results_possible, self.downstream_results_possible = flss_possible.output()
210 |
211 | def output(self):
212 | return self.data_with_all_computations
213 |
--------------------------------------------------------------------------------
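Note: a worked example for _compute_allowed_intervals above, with two CRISPR
intervals on one contig; Cas-gene clusters are then grouped into the gaps between
the arrays:

    _compute_allowed_intervals([(100, 200), (500, 600)])
    # -> [(0, 100), (200, 500), (600, inf)]
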
/components/module_output_maker.py:
--------------------------------------------------------------------------------
1 | from components.components_output_maker import SimpleOutputMaker
2 | from components.components_output_maker import SummaryOutputMaker
3 | from components.components_output_maker import SummaryMakerCSV
4 | from components.components_output_maker import PickleOutputMaker
5 | from components.components_output_maker import CasSummaryMaker
6 | from components.components_output_maker import FastaOutputArrayMaker
7 | from components.components_output_maker import JsonOutputMaker
8 |
9 | from components.components_output_maker import CompleteFastaOutputMaker
10 | from components.components_output_maker import CompleteFolderSummaryMaker
11 | from components.components_output_maker import CompleteCasSummaryFolderMaker
12 | from components.components_output_maker import SpacerSummaryMaker
13 | from components.components_output_maker import CompleteSpacerCSVMaker
14 |
15 |
16 | class OutputMaker:
17 | def __init__(self, file_path, parameters, flags, result_path, pickle_result_path,
18 | json_result_path, categories, non_array_data, list_features, header):
19 | self.file_path = file_path
20 | self.parameters = parameters
21 | self.flags = flags
22 | self.result_path = result_path
23 | self.pickle_result_path = pickle_result_path
24 | self.json_result_path = json_result_path
25 | self.categories = categories
26 | self.non_array_data = non_array_data
27 | self.list_features = list_features
28 | self.header = header
29 | self.global_result_folder = "/".join(self.result_path.split("/")[:-1])
30 |
31 | self._make_output()
32 |
33 | def _make_output(self):
34 | som = SimpleOutputMaker(categories=self.categories,
35 | result_path=self.result_path,
36 | non_array_data=self.non_array_data,
37 | list_features=self.list_features)
38 |
39 | suom = SummaryOutputMaker(result_path=self.result_path,
40 | categories=self.categories,
41 | non_array_data=self.non_array_data,
42 | header=self.header,
43 | list_feature_names=self.list_features)
44 |
45 | ssm = SpacerSummaryMaker(categories=self.categories,
46 | result_path=self.result_path)
47 |
48 | sm_csv = SummaryMakerCSV(result_path=self.result_path,
49 | categories=self.categories,
50 | non_array_data=self.non_array_data)
51 |
52 | if self.flags["flag_cas"] is True:
53 | sm_cas = CasSummaryMaker(result_path=self.result_path,
54 | non_array_data=self.non_array_data)
55 |
56 |
57 | #cfsm = CompleteFolderSummaryMaker(folder_result=self.global_result_folder)
58 | #ccfsm = CompleteCasSummaryFolderMaker(folder_result=self.global_result_folder)
59 |
60 | if self.flags["flag_fasta_report"] is True:
61 | foam = FastaOutputArrayMaker(folder_result=self.result_path,
62 | categories=self.categories,
63 | non_array_data=self.non_array_data)
64 |
65 | #cfom = CompleteFastaOutputMaker(folder_result=self.global_result_folder)
66 |
67 | if self.pickle_result_path:
68 | pom = PickleOutputMaker(file_path=self.file_path,
69 | pickle_result_folder=self.pickle_result_path,
70 | parameters=self.parameters,
71 | categories=self.categories,
72 | non_array_data=self.non_array_data,
73 | header=self.header,
74 | list_feature_names=self.list_features)
75 |
76 | if self.json_result_path:
77 | jom = JsonOutputMaker(file_path=self.file_path,
78 | json_result_folder=self.json_result_path,
79 | categories=self.categories,
80 | non_array_data=self.non_array_data,
81 |                                   list_feature_names=self.list_features)
82 |
--------------------------------------------------------------------------------
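Note: unlike the other modules, OutputMaker exposes no output() method; every
writer above runs for its side effects from _make_output, and the folder-level
summary makers remain commented out here.
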
/components/pipeline.py:
--------------------------------------------------------------------------------
1 | from components.module_detection import Detection
2 | from components.module_detection_refinement import DetectionRefinement
3 | from components.module_evaluation import ArrayEvaluation
4 | from components.module_evaluated_arrays_enhancement import EvaluatedArraysEnhancement
5 | from components.module_non_array_computations import NonArrayComputations
6 | from components.module_output_maker import OutputMaker
7 |
8 |
9 | class Pipeline:
10 | def __init__(self, result_folder_path, pickle_folder_path, json_folder_path, file_path,
11 | list_ml_classifiers, list_features, parameters, flags, flag_dev_mode, absolute_directory_path):
12 | self.result_folder_path = result_folder_path + "/" + file_path.split("/")[-1].split(".")[0]
13 | self.pickle_folder_path = pickle_folder_path
14 | self.json_folder_path = json_folder_path
15 | self.file_path = file_path
16 | self.list_ml_classifiers = list_ml_classifiers
17 | self.list_features = [features.strip().split(".") for features in list_features]
18 | self.flags = flags
19 | self.parameters = parameters
20 | self.flag_dev_mode = flag_dev_mode
21 | self.absolute_directory_path = absolute_directory_path
22 |
23 | self.header = None
24 | self.dict_fuzzy_crisprs = {}
25 | self.dict_crispr_candidates = {}
26 | self.categories = {}
27 | self.non_array_data = {}
28 |
29 | self._get_header()
30 | self._run_detection()
31 | self._run_detection_refinement()
32 | self._run_evaluation()
33 | self._results_enhancement()
34 | self._run_non_crispr_computation()
35 | self._write_output()
36 |
37 | def _get_header(self):
38 | with open(self.file_path) as f:
39 | self.header = f.readline()
40 |
41 | def _run_detection(self):
42 | print("1. Run initial array detection")
43 | detection = Detection(file_path=self.file_path,
44 | flags=self.flags,
45 | parameters=self.parameters,
46 | flag_dev_mode=self.flag_dev_mode)
47 | self.dict_fuzzy_crisprs = detection.output()
48 |
49 | def _run_detection_refinement(self):
50 | print("2. Refine detected arrays")
51 | det_ref = DetectionRefinement(dict_fuzzy_crisprs=self.dict_fuzzy_crisprs,
52 | parameters=self.parameters,
53 | flag_dev_mode=self.flag_dev_mode)
54 | self.dict_crispr_candidates = det_ref.output()
55 |
56 | def _run_evaluation(self):
57 | print("3. Evaluate candidates")
58 | ae = ArrayEvaluation(dict_crispr_array_candidates=self.dict_crispr_candidates,
59 | list_ml_classifiers=self.list_ml_classifiers,
60 | list_features=self.list_features,
61 | parameters=self.parameters,
62 | flag_dev_mode=self.flag_dev_mode)
63 | self.categories = ae.output()
64 |
65 | def _results_enhancement(self):
66 | print("4. Enhance evaluated arrays")
67 | a_enh = EvaluatedArraysEnhancement(file_path=self.file_path,
68 | categories=self.categories,
69 | parameters=self.parameters,
70 | flag_dev_mode=self.flag_dev_mode)
71 | self.categories = a_enh.output()
72 |
73 | def _run_non_crispr_computation(self):
74 | print("5. Complement arrays with additional info")
75 | nac = NonArrayComputations(file_path=self.file_path,
76 | categories=self.categories,
77 | flags_non_arrays_computations=self.flags,
78 | flag_dev_mode=self.flag_dev_mode,
79 | absolute_directory_path=self.absolute_directory_path)
80 | self.non_array_data = nac.output()
81 |
82 | def _write_output(self):
83 | print("6. Write down the results")
84 | om = OutputMaker(file_path=self.file_path,
85 | parameters=self.parameters,
86 | flags=self.flags,
87 | result_path=self.result_folder_path,
88 | pickle_result_path=self.pickle_folder_path,
89 | json_result_path=self.json_folder_path,
90 | categories=self.categories,
91 | non_array_data=self.non_array_data,
92 | list_features=self.list_features,
93 | header=self.header)
94 |
--------------------------------------------------------------------------------
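Note: Pipeline is normally instantiated once per input file by CRISPRidentify.py;
a hedged invocation sketch (every argument value below is illustrative, and each
list_features entry is a dot-separated string of feature names, matching the
`features.strip().split(".")` parsing above):

    Pipeline(result_folder_path="Results", pickle_folder_path=None, json_folder_path=None,
             file_path="TestInput/NC_014152.fa",
             list_ml_classifiers=[classifier_wrapper],
             list_features=["repeat_len.number_repeats.repeat_similarity"],
             parameters=parameters, flags=flags, flag_dev_mode=False,
             absolute_directory_path="/abs/path/to/CRISPRidentify")
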
/environment.yml:
--------------------------------------------------------------------------------
1 | name: crispr_identify_env
2 | channels:
3 | - conda-forge
4 | - bioconda
5 | - nodefaults
6 | - biobuilds
7 | - r
8 | - axfeh
9 | dependencies:
10 | - python==3.7.6
11 | - pip
12 | - python_abi=3.7
13 | - biopython=1.76
14 | - h5py=2.10.0
15 | - hdf5=1.10.6
16 | - hmmer=3.3
17 | - numpy==1.18.1
18 | - pandas=1.0.3
19 | - matplotlib=3.1.3
20 | - perl=5.26.2
21 | - perl-compress-bgzf=0.005
22 | - perl-threaded=5.26.0
23 | - perl-yaml=1.29
24 | - prodigal==2.6.3
25 | - dill=0.3.3
26 | - protobuf=3.13.0.1
27 | - regex=2019.03.09
28 | - pyasn1=0.4.8
29 | - pycparser=2.20
30 | - networkx=2.5
31 | - pyjwt=1.7.1
32 | - pyparsing=2.4.7
33 | - pyqt=5.9.2
34 | - pysocks=1.7.1
35 | - python-dateutil=2.8.1
36 | - pytz=2020.1
37 | - pyyaml=5.3.1
38 | - scikit-learn==0.22.1
39 | - scipy=1.4.1
40 | - anaconda::tensorflow==2.3.0
41 | - tensorboard==2.3.0
42 | - tensorboard-plugin-wit==1.6.0
43 | - viennarna==2.4.15
44 | - pyopenssl=22.0.0
45 | - certifi=2022.12.7
46 | - vmatch==2.3.0
47 | - clustalo==1.2.3
48 | - blast==2.5.0
49 | - keras==2.4.3
50 | - libffi=3.2.1
51 | - spacerplacer
52 | - pip:
53 | - python-Levenshtein
54 |
--------------------------------------------------------------------------------
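Note: the environment above can be created with "conda env create -f environment.yml"
and activated with "conda activate crispr_identify_env" (the name declared in the
first line).
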
/tools/CRISPRcasIdentifier/README.txt:
--------------------------------------------------------------------------------
1 | Link the CRISPRcasIdentifier folder here, so that the tool can be called via the following path:
2 |
3 | CRISPRidentify/tools/CRISPRcasIdentifier/CRISPRcasIdentifier/CRISPRcasIdentifier.py
4 |
--------------------------------------------------------------------------------
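Note: on Linux/macOS this is typically a symbolic link, e.g.
"ln -s /path/to/CRISPRcasIdentifier tools/CRISPRcasIdentifier/CRISPRcasIdentifier"
(the source path is illustrative).
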
/tools/blasting/Verified_repeats_dataset1.fa:
--------------------------------------------------------------------------------
1 | >db1_1
2 | GATAATCTCTTATAGAATTGAAAG
3 | >db1_2
4 | GTTTTTATCGTACCTATGAGGAATTGAAAC
5 | >db1_3
6 | GTTTCAGACGAACCCTTGTGGGATTGAAGC
7 | >db1_4
8 | GTTTCAGACGAACCCTTGTGGGGTTGAAGC
9 | >db1_5
10 | GTTTCAGACGAACCCTTGTGGGTTTGAAGC
11 | >db1_6
12 | GATTAATCCCAAAAGGAATTGAAAG
13 | >db1_7
14 | GTCGCGTCCTCACGGGCGCGTGGATTGAAAC
15 | >db1_8
16 | GAGTTCCCCGCGCCAGCGGGGATAAACCG
17 | >db1_9
18 | GTGTTCCCCGCGCCAGCGGGGATAAACCG
19 | >db1_10
20 | GTTCACTGCCGTGTAGGCAGCTAAGAAA
21 | >db1_11
22 | GTTCACTGCCGTACAGGCAGCTTAGAAA
23 | >db1_12
24 | GTTGAAGTGGTACTTCCAGTAAAACAAGGATTGAAAC
25 | >db1_13
26 | CTAAAAGAATAACTTGCAAAATAACAAGCATTGAAAC
27 | >db1_14
28 | CTTTCCTTCTACTAATCCCGGCGATCGGGACTGAAAC
29 | >db1_15
30 | GTTTTAGAGCTATGCTGTTTTGAATGGTCCCAAAAC
31 | >db1_16
32 | GTTGTAGCTCCCTTTCTCATTTCGCAGTGCTACAAT
33 | >db1_17
34 | GTTTTAGTCCCTTTTTAAATTTCTTTATGGTAAAAT
35 | >db1_18
36 | GTTCCAATAAGACTAAAATAGAATTGAAAG
37 | >db1_19
38 | GATCGATACCCACCCCGAAGAAAAGGGGACGAGAAC
39 | >db1_20
40 | GTTCAACACCCTCTTTTCCCCGTCAGGGGACTGAAAC
41 | >db1_21
42 | GTCTCCACTCGTAGGAGAAATTAATTGATTGGAAAC
43 | >db1_22
44 | GAACAACTCAAAAGAGAATTGCAAG
45 | >db1_23
46 | ATTAAAATCAGACCGTTTCGGAATGGAAAT
47 | >db1_24
48 | GTTTTATATTAACTAAGTGGTATGTAAAG
49 | >db1_25
50 | GAATCTCAAAAAGAGGATTGAAAG
51 | >db1_26
52 | GTGGAAATCAAAAGATAGTAGAAAC
53 | >db1_27
54 | GGTTTTAGTACTCTGTAATTTTAG
55 |
--------------------------------------------------------------------------------
/tools/blasting/Verified_repeats_dataset1.fa.nhr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/blasting/Verified_repeats_dataset1.fa.nhr
--------------------------------------------------------------------------------
/tools/blasting/Verified_repeats_dataset1.fa.nin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/blasting/Verified_repeats_dataset1.fa.nin
--------------------------------------------------------------------------------
/tools/blasting/Verified_repeats_dataset1.fa.nog:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/blasting/Verified_repeats_dataset1.fa.nog
--------------------------------------------------------------------------------
/tools/blasting/Verified_repeats_dataset1.fa.nsd:
--------------------------------------------------------------------------------
1 | db1_10
2 | db1_109
3 | db1_1110
4 | db1_1211
5 | db1_1312
6 | db1_1413
7 | db1_1514
8 | db1_1615
9 | db1_1716
10 | db1_1817
11 | db1_1918
12 | db1_21
13 | db1_2019
14 | db1_2120
15 | db1_2221
16 | db1_2322
17 | db1_2423
18 | db1_2524
19 | db1_2625
20 | db1_2726
21 | db1_32
22 | db1_43
23 | db1_54
24 | db1_65
25 | db1_76
26 | db1_87
27 | db1_98
28 | lcl|db1_10
29 | lcl|db1_109
30 | lcl|db1_1110
31 | lcl|db1_1211
32 | lcl|db1_1312
33 | lcl|db1_1413
34 | lcl|db1_1514
35 | lcl|db1_1615
36 | lcl|db1_1716
37 | lcl|db1_1817
38 | lcl|db1_1918
39 | lcl|db1_21
40 | lcl|db1_2019
41 | lcl|db1_2120
42 | lcl|db1_2221
43 | lcl|db1_2322
44 | lcl|db1_2423
45 | lcl|db1_2524
46 | lcl|db1_2625
47 | lcl|db1_2726
48 | lcl|db1_32
49 | lcl|db1_43
50 | lcl|db1_54
51 | lcl|db1_65
52 | lcl|db1_76
53 | lcl|db1_87
54 | lcl|db1_98
55 |
--------------------------------------------------------------------------------
/tools/blasting/Verified_repeats_dataset1.fa.nsi:
--------------------------------------------------------------------------------
1 | b 6 @ b 4