├── .gitignore
├── CRISPRidentify.py
├── LICENSE
├── README.md
├── TestFolderMultiline
│   ├── MultilineFasta.fasta
│   └── MultilineFasta_1.fasta
├── TestInput
│   ├── NC_006513.fa
│   ├── NC_013216.fa
│   ├── NC_014152.fa
│   ├── NC_016625.fa
│   ├── NC_017040.1.fasta
│   ├── NC_018524.fa
│   └── NC_019693.fa
├── TestInputMultiline
│   └── MultilineFasta.fasta
├── components
│   ├── __init__.py
│   ├── components_detection.py
│   ├── components_detection_refinement.py
│   ├── components_eden.py
│   ├── components_evaluated_arrays_enhancement.py
│   ├── components_evaluation.py
│   ├── components_helpers.py
│   ├── components_ml.py
│   ├── components_non_array_computations.py
│   ├── components_output_maker.py
│   ├── module_detection.py
│   ├── module_detection_refinement.py
│   ├── module_evaluated_arrays_enhancement.py
│   ├── module_evaluation.py
│   ├── module_non_array_computations.py
│   ├── module_output_maker.py
│   └── pipeline.py
├── environment.yml
├── tools
│   ├── CRISPRcasIdentifier
│   │   └── README.txt
│   ├── blasting
│   │   ├── Verified_repeats_dataset1.fa
│   │   ├── Verified_repeats_dataset1.fa.nhr
│   │   ├── Verified_repeats_dataset1.fa.nin
│   │   ├── Verified_repeats_dataset1.fa.nog
│   │   ├── Verified_repeats_dataset1.fa.nsd
│   │   ├── Verified_repeats_dataset1.fa.nsi
│   │   ├── Verified_repeats_dataset1.fa.nsq
│   │   ├── Verified_repeats_dataset2.fa
│   │   ├── Verified_repeats_dataset2.fa.nhr
│   │   ├── Verified_repeats_dataset2.fa.nin
│   │   ├── Verified_repeats_dataset2.fa.nog
│   │   ├── Verified_repeats_dataset2.fa.nsd
│   │   ├── Verified_repeats_dataset2.fa.nsi
│   │   └── Verified_repeats_dataset2.fa.nsq
│   └── strand_prediction
│       └── CRISPRstrand
│           ├── CRISPRstrand.py
│           ├── CRISPRstrand.yml
│           ├── Example
│           │   ├── Input.fa
│           │   ├── Input.txt
│           │   ├── Input3.fa
│           │   ├── Input4.txt
│           │   └── Input5.fa
│           ├── Models
│           │   └── model_r.h5
│           ├── Results
│           │   └── CRISPRstrand_Summary.tsv
│           ├── cmd.txt
│           ├── convNets.py
│           ├── evaluate.py
│           ├── execute_strand.py
│           ├── preprocessing.py
│           └── utils.py
└── trained_models
    ├── eden
    │   ├── eden_ab_vs_n
    │   ├── eden_archaea
    │   ├── eden_bacteria
    │   ├── eden_merged
    │   └── eden_merged_with_neg
    └── extra_trees
        ├── extra_trees_ab_vs_n.pkl
        ├── extra_trees_archaea.pkl
        ├── extra_trees_bacteria.pkl
        ├── extra_trees_merged.pkl
        ├── extra_trees_merged_with_neg.pkl
        ├── extra_trees_subset.pkl
        ├── extra_trees_subset10features.pkl
        ├── extra_trees_subset8features.pkl
        └── extra_trees_subset9features.pkl
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .idea
3 | /tools/CRISPRcasIdentifier/CRISPRcasIdentifier/
4 |
--------------------------------------------------------------------------------
/CRISPRidentify.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import math
3 | import shutil
4 | import warnings
5 | import os
6 |
7 | from pathlib import Path
8 | from os import listdir
9 | from os.path import isfile, join
10 | from time import time
11 |
12 | from components.pipeline import Pipeline
13 | from components.components_ml import ClassifierWrapper
14 | from components.components_output_maker import CompleteFastaOutputMaker
15 | from components.components_output_maker import CompleteFolderSummaryMaker
16 | from components.components_output_maker import CompleteCasSummaryFolderMaker
17 | from components.components_output_maker import CompleteJsonOutputMaker
18 | from components.components_output_maker import CompleteSpacerCSVMaker
19 | from components.components_helpers import multiline_fasta_check, multiline_fasta_handle, multiline_fasta_handle_python
20 | from components.components_helpers import folder_of_multifasta_handle
21 |
22 | warnings.filterwarnings("ignore")
23 | warnings.simplefilter(action='ignore', category=FutureWarning)
24 |
25 | FLAG_DEVELOPER_MODE = False
26 |
27 | parser = argparse.ArgumentParser(description='Run Identifier')
28 | parser.add_argument('--input_folder', type=str, default=None,
29 | help='input folder (default: None)')
30 |
31 | parser.add_argument('--file', type=str, default=None,
32 | help='input file (default: None)')
33 |
34 | parser.add_argument('--input_folder_multifasta', type=str, default=None,
35 | help='input folder of multifasta (default: None)')
36 |
37 | parser.add_argument('--model', type=str, default="ALL",
38 | help='model_to_use (default: ALL)')
39 |
40 | parser.add_argument('--additional_model', type=str, default=None,
41 | help='model_to_use (default: None)')
42 |
43 | parser.add_argument('--result_folder', type=str, default="Results",
44 | help='folder with the result (default: Results)')
45 |
46 | parser.add_argument('--pickle_report', type=str, default='',
47 | help='pickled report file (default: None)')
48 |
49 | parser.add_argument('--json_report', type=str, default='',
50 | help='json report file (default: None)')
51 |
52 | parser.add_argument('--fasta_report', type=str, default=False,
53 | help='fasta report file (default: False)')
54 |
55 | parser.add_argument('--strand', type=str, default=True,
56 | help='CRISPR array orientation prediction (default: True)')
57 |
58 | parser.add_argument('--cas', type=str, default=False,
59 | help='cas genes computation (default: False)')
60 |
61 | parser.add_argument('--is_element', type=str, default=True,
62 | help='is element computation (default: True)')
63 |
64 | parser.add_argument('--parallel', type=str, default=True,
65 | help='parallel computations (default: True)')
66 |
67 | parser.add_argument('--cpu', type=str, default="ALL",
68 |                     help='number of CPUs to use (default: ALL)')
69 |
70 | parser.add_argument('--fast_run', type=str, default=False,
71 | help='fast run option (default: False)')
72 |
73 | parser.add_argument('--degenerated', type=str, default=True,
74 |                     help='degenerated_repeat_computation (default: True)')
75 |
76 | parser.add_argument('--min_len_rep', type=int, default=21,
77 | help='min avg. length of the repeats (default: 21)')
78 |
79 | parser.add_argument('--max_len_rep', type=int, default=55,
80 | help='max avg. length of the repeats (default: 55)')
81 |
82 | parser.add_argument('--min_len_spacer', type=int, default=18,
83 | help='min avg. length of spacers (default: 18)')
84 |
85 | parser.add_argument('--max_len_spacer', type=int, default=78,
86 | help='max avg. length of spacers (default: 78)')
87 |
88 | parser.add_argument('--min_repeats', type=int, default=3,
89 | help='min number of repeats (default: 3)')
90 |
91 | parser.add_argument('--enhancement_max_min', type=str, default=True,
92 |                     help='enhancement with filter (default: True)')
93 |
94 | parser.add_argument('--enhancement_start_end', type=str, default=True,
95 |                     help='enhancement with start end omitting (default: True)')
96 |
97 | parser.add_argument('--max_identical_spacers', type=int, default=4,
98 | help='maximum number of identical spacers in the array (default: 4)')
99 |
100 | parser.add_argument('--max_identical_cluster_spacers', type=int, default=3,
101 | help='maximum number of consecutive identical spacers in the array (default: 3)')
102 |
103 | parser.add_argument('--margin_degenerated', type=int, default=30,
104 | help='maximum length of the spacer margin for the degenerated search (default: 30)', )
105 |
106 | parser.add_argument('--max_edit_distance_enhanced', type=int, default=6,
107 | help='maximum edit distance for the evaluated array enhancement (default: 6)')
108 |
109 |
110 | script_absolute_path = os.path.dirname(os.path.abspath(__file__))
111 | work_directory = os.getcwd()
112 | pid = os.getpid()
113 |
114 | args = parser.parse_args()
115 |
116 | complete_path_folder = args.input_folder
117 | if complete_path_folder:
118 | complete_path_folder = Path(complete_path_folder).absolute()
119 |
120 | complete_path_file = args.file
121 | if complete_path_file:
122 | complete_path_file = Path(complete_path_file).absolute()
123 |
124 | complete_folder_multifasta = args.input_folder_multifasta
125 | if complete_folder_multifasta:
126 | complete_folder_multifasta = Path(complete_folder_multifasta).absolute()
127 |
128 | folder_result = args.result_folder
129 | if folder_result:
130 | folder_result = Path(folder_result).absolute()
131 |
132 | pickle_folder = args.pickle_report
133 | if pickle_folder:
134 | pickle_folder = Path(pickle_folder).absolute()
135 |
136 | json_folder = args.json_report
137 | if json_folder:
138 | json_folder = Path(json_folder).absolute()
139 |
140 | list_models = ["8", "9", "10"] if args.model == "ALL" else [args.model]
141 | flag_possible_differentiate_model = args.additional_model
142 | if flag_possible_differentiate_model not in ["possible", "all"]:
143 | flag_possible_differentiate_model = None
144 |
145 |
146 | flag_enhancement_max_min = False if (args.enhancement_max_min in ["False", False]) else True
147 | flag_enhancement_start_end = False if (args.enhancement_start_end in ["False", False]) else True
148 |
149 | flag_parallel = False if (args.parallel in ["False", False]) else True
150 | flag_cpu = args.cpu
151 | flag_fast_run = False if (args.fast_run in ["False", False]) else True
152 |
153 | strand_flag = False if (args.strand in ["False", False]) else True
154 | cas_flag = False if (args.cas in ["False", False]) else True
155 | is_flag = False if (args.is_element in ["False", False]) else True
156 | degenerated_flag = False if (args.degenerated in ["False", False]) else True
157 | fasta_report = False if (args.fasta_report in ["False", False]) else True
158 |
159 | flags = {"flag_parallel": flag_parallel,
160 | "flag_cpu": flag_cpu,
161 | "flag_fast_run": flag_fast_run,
162 | "flag_strand": strand_flag,
163 | "flag_cas": cas_flag,
164 | "flag_is": is_flag,
165 | "flag_fasta_report": fasta_report,
166 | "flag_degenerated": degenerated_flag,
167 | "flag_enhancement_min_max": flag_enhancement_max_min,
168 | "flag_enhancement_start_end": flag_enhancement_start_end
169 | }
170 |
171 | min_rep = args.min_len_rep
172 | max_rep = args.max_len_rep
173 | max_spacer = args.max_len_spacer
174 | min_spacer = args.min_len_spacer
175 | min_repeats = args.min_repeats
176 | max_identical_spacers = args.max_identical_spacers
177 | max_identical_cluster_spacers = args.max_identical_cluster_spacers
178 | margin_degenerated = args.margin_degenerated
179 | max_edit_distance_enhancement = args.max_edit_distance_enhanced
180 |
181 | parameters = {
182 | "param_min_avg_repeat_length": min_rep,
183 | "param_max_avg_repeat_length": max_rep,
184 | "param_max_avg_spacer_length": max_spacer,
185 | "param_min_avg_spacer_length": min_spacer,
186 | "param_min_repeats": min_repeats,
187 | "param_max_identical_spacers": max_identical_spacers,
188 | "param_max_identical_cluster_spacers": max_identical_cluster_spacers,
189 | "param_spacer_margin_degenerated_search": margin_degenerated,
190 | "param_max_edit_distance": max_edit_distance_enhancement
191 | }
192 |
193 |
194 | ALL_FEATURES = ['repeat_len', 'number_repeats', 'repeat_similarity',
195 | 'at_richness', 'avg_spacer_len', 'spacer_similarity',
196 | 'number_mismatches', 'spacer_evenness', 'mfe_score',
197 | 'orf_score', 'hmmr_score', 'blast_score_1', 'blast_score_2',
198 | 'eden_score']
199 |
200 | best_combinations = {
201 | "8": (2, 4, 5, 6, 7, 8, 9, 11),
202 | "9": (1, 2, 4, 5, 7, 8, 9, 10, 12),
203 | "10": (0, 2, 3, 4, 5, 6, 7, 10, 11, 12)
204 | }
205 |
206 |
207 | pid_work_directory = os.path.join(work_directory, 'Identify_Temp' + str(pid))
208 | if not os.path.exists(pid_work_directory):
209 | os.makedirs(pid_work_directory)
210 | os.chdir(pid_work_directory)
211 |
212 |
213 | feature_list = ['.'.join([ALL_FEATURES[i] for i in best_combinations[model]]) for model in list_models]
214 | list_ml_classifiers = [ClassifierWrapper(classifier_type=None,
215 | load_option=script_absolute_path + "/trained_models/extra_trees/extra_trees_subset{}features.pkl".
216 | format(model))
217 | for model in list_models]
218 |
219 |
220 | def run_over_folder_of_files(folder, result_folder, pickle_folder, json_folder, chunk_number=None, number_of_chunks=None):
221 | files = [f for f in listdir(folder) if isfile(join(folder, f))]
222 | files_name_fix = [f.replace("\r", "").replace("\t", "").replace("\n", "") for f in files]
223 | for old_name, new_name in zip(files, files_name_fix):
224 | old_path = join(folder, old_name)
225 | new_path = join(folder, new_name)
226 | if old_path != new_path:
227 |             os.rename(old_path, new_path)  # rename directly instead of shelling out to mv
228 | files = sorted(files_name_fix)
229 |
230 | if number_of_chunks:
231 | chunk_size = math.ceil(len(files) / number_of_chunks)
232 | chunk_start = (chunk_number - 1) * chunk_size
233 | chunk_end = chunk_number * chunk_size
234 | chunk = files[chunk_start:chunk_end]
235 | print(chunk_start)
236 | print(chunk_end)
237 | else:
238 | chunk = files
239 |
240 | for index, file in enumerate(chunk, 1):
241 | print("\n\n\n\t\t\t\tExecuting file {} out of {} ({})\n\n\n".format(index, len(chunk), file))
242 | pl = Pipeline(result_folder_path="{}/".format(result_folder),
243 | pickle_folder_path="{}".format(pickle_folder),
244 | json_folder_path="{}".format(json_folder),
245 | file_path=join(folder, file),
246 | list_ml_classifiers=list_ml_classifiers,
247 | list_features=feature_list,
248 | parameters=parameters,
249 | flags=flags,
250 | flag_dev_mode=FLAG_DEVELOPER_MODE,
251 | absolute_directory_path=script_absolute_path)
252 |
253 | cfsm = CompleteFolderSummaryMaker(folder_result=result_folder)
254 | ccfsm = CompleteCasSummaryFolderMaker(folder_result=result_folder)
255 | cfom = CompleteFastaOutputMaker(folder_result=result_folder)
256 | if cas_flag:
257 | cs_csv = CompleteSpacerCSVMaker(folder_result=result_folder)
258 | if json_folder:
259 | cjsm = CompleteJsonOutputMaker(folder_json_result=json_folder, folder_text_tesult=result_folder)
260 |
261 |
262 | def run_over_one_file(file, result_folder, pickle_folder, json_folder):
263 | print("\n\n\n\t\t\t\tExecuting file {}\n\n\n".format(file))
264 | pl = Pipeline(result_folder_path="{}/".format(result_folder),
265 | pickle_folder_path="{}".format(pickle_folder),
266 | json_folder_path="{}".format(json_folder),
267 | file_path=join(file),
268 | list_ml_classifiers=list_ml_classifiers,
269 | list_features=feature_list,
270 | parameters=parameters,
271 | flags=flags,
272 | flag_dev_mode=FLAG_DEVELOPER_MODE,
273 | absolute_directory_path=script_absolute_path)
274 |
275 | cfsm = CompleteFolderSummaryMaker(folder_result=result_folder)
276 | ccfsm = CompleteCasSummaryFolderMaker(folder_result=result_folder)
277 | cfom = CompleteFastaOutputMaker(folder_result=result_folder)
278 | if cas_flag:
279 | cs_csv = CompleteSpacerCSVMaker(folder_result=result_folder)
280 | if json_folder:
281 | cjsm = CompleteJsonOutputMaker(folder_json_result=json_folder, folder_text_tesult=result_folder)
282 |
283 |
284 |
285 | def main():
286 | start_time = time()
287 | if complete_path_file:
288 | folder_multifasta = multiline_fasta_handle_python(complete_path_file, flag_ncbi_formatting=True)
289 | print(folder_multifasta)
290 | run_over_folder_of_files(folder_multifasta, folder_result, pickle_folder, json_folder)
291 | shutil.rmtree(folder_multifasta)
292 | elif complete_path_folder:
293 | run_over_folder_of_files(complete_path_folder, folder_result, pickle_folder, json_folder)
294 | elif complete_folder_multifasta:
295 | print("Folder Multifasta")
296 | folder_multifasta = folder_of_multifasta_handle(complete_folder_multifasta)
297 | run_over_folder_of_files(folder_multifasta, folder_result, pickle_folder, json_folder)
298 | else:
299 | print("No input was provided")
300 |
301 | end_time = time()
302 | print("Elapsed time: ", end_time-start_time)
303 |
304 |
305 | if __name__ == "__main__":
306 | main()
307 | shutil.rmtree(pid_work_directory, ignore_errors=True)
308 |
309 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Bioinformatics Lab - Department of Computer Science - University Freiburg
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # CRISPRidentify: Identification of CRISPR arrays using machine learning approach
3 |
4 | CRISPRidentify is a tool to search for CRISPR arrays which utilises
5 | a machine learning approach for distinguishing false candidates from true CRISPR arrays.
6 | CRISPRidentify performs three steps: detection, feature extraction and
7 | classification based on manually curated sets of positive and negative examples of CRISPR arrays.
8 | The identified CRISPR arrays are then reported to the user accompanied by detailed annotation.
9 | We demonstrate that our approach identifies not only previously detected CRISPR arrays,
10 | but also CRISPR array candidates not detected by other tools. Compared to other methods,
11 | our tool has a drastically reduced false positive rate. In contrast to the existing tools,
12 | CRISPRidentify not only provides the user with the basic statistics on the identified CRISPR arrays
13 | but also produces a certainty score as an intuitive measure of the likelihood that a given
14 | genomic region is a CRISPR array.
15 |
16 | ## Getting Started
17 |
18 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.
19 |
20 | ### Prerequisites
21 |
22 | First you need to install Miniconda.
23 | Then create an environment and install the required libraries in it.
24 |
25 |
26 | ### Creating a Miniconda environment
27 |
28 | First we install Miniconda for Python 3.
29 | Miniconda can be downloaded from here:
30 |
31 | https://docs.conda.io/en/latest/miniconda.html
32 |
33 | Then Miniconda should be installed. On a Linux machine the command is similar to this one:
34 |
35 | ```
36 | bash Miniconda3-latest-Linux-x86_64.sh
37 | ```
38 |
39 | Then we create an environment. The necessary setup is provided in the "environment.yml" file.
40 |
41 | In order to install the corresponding environment, one can execute the following command:
42 |
43 | ```
44 | conda env create -f environment.yml
45 | ```
46 |
47 | We recommend installing the mamba package manager, which is a faster alternative to conda.
48 |
49 | ```
50 | conda install -c conda-forge mamba
51 | ```
52 |
53 | Then we can create the environment using mamba.
54 | ```
55 | mamba env create -f environment.yml
56 | ```
57 |
58 | We want to acknowledge Richard Stöckl @richardstoeckl for his contribution to the environment.yml file.
59 |
60 |
61 | ### Additional preparations
62 |
63 | CRISPRidentify utilizes CRISPRcasIdentifier for the detection of cas genes.
64 | If you are interested in the cas gene results, please install CRISPRcasIdentifier.
65 | 
66 | Please make sure that after you download CRISPRcasIdentifier its relative path is:
67 |
68 | ```
69 | tools/CRISPRcasIdentifier/CRISPRcasIdentifier/CRISPRcasIdentifier.py
70 | ```
71 |
72 | You can find the CRISPRcasIdentifier tool and its description [here](https://github.com/BackofenLab/CRISPRcasIdentifier)
73 |
74 | You need to perform two steps:
75 |
76 | Firstly, you need to download the CRISPRcasIdentifier tool:
77 | ```
78 | wget https://github.com/BackofenLab/CRISPRcasIdentifier/archive/v1.1.0.tar.gz
79 | tar -xzf v1.1.0.tar.gz
80 | ```
81 | Secondly, you need to download the models:
82 |
83 | Due to GitHub's file size constraints, the authors made their HMM and ML models available on Google Drive. You can download them [here](https://drive.google.com/file/d/1YbTxkn9KuJP2D7U1-6kL1Yimu_4RqSl1/view?usp=sharing) and [here](https://drive.google.com/file/d/1Nc5o6QVB6QxMxpQjmLQcbwQwkRLk-thM/view?usp=sharing). Save both tar.gz files inside CRISPRcasIdentifier's directory.
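
After downloading, extract both archives inside that directory (the archive names below are placeholders for the two files obtained from Google Drive):

```
tar -xzf <hmm-models>.tar.gz
tar -xzf <ml-models>.tar.gz
```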
84 |
85 |
86 | ### Activation of the environment
87 |
88 | Before running CRISPRidentify one needs to activate the corresponding environment.
89 |
90 | ```
91 | conda activate crispr_identify_env
92 | ```
93 |
94 | ## Running CRISPRidentify
95 |
96 | We prepared a test folder which can be used for a test run.
97 |
98 | Example of running CRISPRidentify over a folder of files:
99 |
100 | ```
101 | python CRISPRidentify.py --input_folder TestInput
102 | ```
103 |
104 | Example of running CRISPRidentify over a single multiline fasta input:
105 | ```
106 | python CRISPRidentify.py --file TestInputMultiline/MultilineFasta.fasta
107 | ```
108 |
109 | ### Flags
110 |
111 | You can see the help by using the `-h` option:
112 |
113 | ```
114 |
115 | python CRISPRidentify.py -h
116 |
117 | ```
118 |
119 | #### Mandatory flags
120 | The only mandatory parameter which has to be specified is the input.
121 | Our approach has several options to handle the input. The user has to specify either the path to a folder with input fasta files,
122 | the full path to a single fasta input file, or the path to a folder of multiline fasta files.
123 |
124 | ##### Input as a folder of fasta files
125 |
126 | * `--input_folder `
127 |
128 | Specifies the mode where a folder with fasta files is used as the input for CRISPRidentify. The CRISPR array search will
129 | then be conducted separately for each file in the corresponding input folder.
130 |
131 | ```
132 | python CRISPRidentify.py --input_folder TestInput
133 | ```
134 |
135 | ##### Input as a single file
136 |
137 | * `--file `
138 |
139 | Specifies the mode where a single file is used as the input for the algorithm. The file might contain a single entry or multiple entries.
140 | The CRISPR array search will be done for each entry independently.
141 |
142 | For example:
143 |
144 | ```
145 | python CRISPRidentify.py --file InputFile
146 | ```
147 | ##### Input as a folder of multiline fasta files
148 |
149 | * `--input_folder_multifasta `
150 |
151 | Specifies the mode where a folder with fasta files is used as the input for CRISPRidentify. The CRISPR array search will
152 | then be conducted separately for each file in the corresponding input folder. The difference between this mode and the previous one is that
153 | in this mode the input files can contain multiple entries.
154 |
155 | For example:
156 |
157 | ```
158 | python CRISPRidentify.py --input_folder_multifasta TestFolderMultiline
159 | ```
160 |
161 | #### Optional flags
162 |
163 | ##### Output
164 |
165 | * `--result_folder [path_to_the_result_folder]`
166 |
167 | Specifies the path and name of the folder with the output results. If not specified, the results will appear in the "Results" folder.
168 |
169 |
170 | For example:
171 |
172 | ```
173 | python CRISPRidentify.py --input_folder TestInput --result_folder Results
174 | ```
175 |
176 | * `--pickle_report [folder_to_put_pickle_results]`
177 |
178 | Specifies if the found CRISPR arrays should also be stored as pickled Python objects. Turned off by default.
179 |
180 |
181 | For example:
182 |
183 | ```
184 | python CRISPRidentify.py --input_folder TestInput --pickle_report PickleReportFolder
185 | ```
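
The pickled reports can later be loaded back in Python. A minimal sketch, assuming the folder passed via `--pickle_report` contains one `.pkl` file per input (the stored objects are instances of CRISPRidentify's internal classes, so the package must be importable when unpickling):

```
import pickle
from pathlib import Path

# Inspect every pickled report produced with --pickle_report.
# "PickleReportFolder" matches the example above.
for pkl_file in Path("PickleReportFolder").glob("*.pkl"):
    with open(pkl_file, "rb") as handle:
        report = pickle.load(handle)
    print(pkl_file.name, type(report))
```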
186 |
187 |
188 | ##### Classification parameters
189 |
190 | * `--model [8/9/10/ALL]`
191 |
192 |
193 | Takes values: 8, 9, 10, ALL and specifies the classification model. The default value is `ALL`.
194 | If the value `ALL` is picked, the certainty score will be calculated as the average over all three available models.
195 |
196 |
197 | For example:
198 |
199 | ```
200 | python CRISPRidentify.py --input_folder TestInput --model 8
201 | ```
202 |
203 |
204 | ```
205 | python CRISPRidentify.py --input_folder TestInput --model ALL
206 | ```
207 |
208 |
209 | ##### Performance speed
210 | * `--fast_run [True/False]`
211 |
212 | Specifies if the repeat set enhancement step should be skipped, which drastically speeds up the process but might decrease the recall quality.
213 | Only matching pairs found with Vmatch will be used as repeat candidates. Automatically turns off the filter approximation and the start/end approximation (see `--enhancement_max_min` and `--enhancement_start_end`).
214 | Turned off by default.
215 |
216 | For example:
217 |
218 | ```
219 | python CRISPRidentify.py --input_folder TestInput --fast_run True
220 | ```
221 |
222 | * `--enhancement_max_min [True/False]`
223 |
224 | Specifies if the filter approximation based on the max. and min. elements should be built.
225 | The default value is True.
226 |
227 | * `--enhancement_start_end [True/False]`
228 |
229 | Specifies if the start/end omitting of the repeat candidates should be done to enrich the candidate set.
230 | The default value is True.
231 |
232 |
233 | For example:
234 |
235 | ```
236 | python CRISPRidentify.py --input_folder TestInput --enhancement_max_min True --enhancement_start_end False
237 | ```
238 |
239 | ##### Candidate filtering criteria
240 |
241 |
242 | * `--min_len_rep [integer]`
243 |
244 | Specifies the minimum average length of the repeats in a CRISPR array. The default value: 21
245 |
246 | * `--max_len_rep [integer]`
247 |
248 | Specifies the maximum average length of the repeats in a CRISPR array. The default value: 55
249 |
250 | * `--min_len_spacer [integer]`
251 |
252 | Specifies the minimum average length of spacers in a CRISPR array. The default value: 18
253 |
254 | * `--max_len_spacer [integer]`
255 |
256 | Specifies the maximum average length of spacers in a CRISPR array. The default value: 78
257 |
258 | * `--min_repeats [integer]`
259 |
260 | Specifies the minimum number of repeats in a CRISPR array. The default value: 3
261 |
262 |
263 | For example:
264 |
265 | ```
266 | python CRISPRidentify.py --input_folder TestInput --min_len_rep 25 --max_len_rep 50 --min_repeats 2
267 | ```
268 |
269 | ##### Candidate Enhancement
270 |
271 | * `--degenerated [True/False]`
272 |
273 | Allows search for degenerated repeat candidates on both ends of the CRISPR array candidate. The default value: True
274 |
275 | * `--margin_degenerated [int]`
276 |
277 | Specifies the maximum length difference between a new spacer sequence (obtained with the search of degenerated repeats) and the average value of spacer length in the array. The default value: 30
278 |
279 | * `--max_edit_distance_enhanced [int]`
280 |
281 | Specifies the maximum number of edit operations allowed for the evaluated array enhancement. The default value: 6
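
For example:

```
python CRISPRidentify.py --input_folder TestInput --degenerated True --margin_degenerated 20
```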
282 |
283 |
284 | ##### Additional computations
285 |
286 | * `--strand [True/False]`
287 |
288 | Specifies if the array orientation should be predicted. The default value is True.
289 |
290 | * `--cas [True/False]`
291 |
292 | Specifies if cas genes should be predicted. The default value is False.
293 |
294 | * `--is_element [True/False]`
295 |
296 | Specifies if IS-Elements should be predicted. The default value is True.
297 |
298 |
299 | ```
300 | python CRISPRidentify.py --input_folder TestInput --cas True --is_element True
301 | ```
302 |
303 | ## Output files
304 |
305 | The output folder for each input entry consists of the following files:
306 |
307 | * Bona-Fide_Candidates. The file contains the representation of the found CRISPR arrays complemented with the support information.
308 | For each candidate, the output contains the values of the extracted features as well as the certainty score of the used classifier.
309 | On top of that, the support information includes the orientation of each array, the leader and downstream regions, cas genes and IS-elements (if the corresponding flags were selected).
310 |
311 | * Alternative_Candidates. In this file we demonstrate alternative representations of bona-fide arrays. These alternative representations also received a high score from the classifier, but the score was lower than the corresponding score of the bona-fide representation.
312 | An alternative representation of a CRISPR array usually corresponds to a slightly longer/shorter repeat sequence but represents the same genomic region.
313 |
314 | The candidates with certainty scores between 0.4 and 0.75 are stored in Possible_Candidates and Possible_Discarded_Candidates.
315 |
316 | * Possible_Candidates. In this file the algorithm stores the candidate with the highest certainty score.
317 |
318 | * Possible_Discarded. All the other representations are collected here.
319 |
320 |
321 | The algorithm also reports CRISPR-looking structures which obtained a certainty score lower than 0.4 from the classifier.
322 |
323 | * Low_score_candidates. The user can find these structures in this file.
324 |
325 |
326 | On top of that, the algorithm builds a CSV summary.
327 |
328 | * Summary.csv
329 |
330 | The following information can be found in the summary:
331 |
332 | 1. Array index
333 | 2. Array start
334 | 3. Array end
335 | 4. Array length
336 | 5. Consensus repeat
337 | 6. Repeat length
338 | 7. Average length of the spacers
339 | 8. Number of spacers
340 | 9. Array orientation
341 | 10. Array category
342 |
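The summary can be post-processed with a few lines of Python. A minimal sketch, assuming `Summary.csv` is comma-separated with the ten columns listed above and sits in the result folder (the exact header names and path may differ):

```
import csv

# Print every row of the per-entry summary as a dictionary.
# "Results/Summary.csv" is an assumed location inside the result folder.
with open("Results/Summary.csv") as handle:
    for row in csv.DictReader(handle):
        print(row)
```
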
343 | ## Metagenomic analysis
344 |
345 | CRISPRidentify is suitable for easy and powerful metagenomic analysis.
346 | When the `--file` or `--input_folder` flag is used, the pipeline will automatically generate two complete summaries:
347 | 
348 |
349 | 1. For all the identified arrays
350 | 2. For all labeled Cas genes
351 |
352 |
353 | On top of that, the user might use the flag:
354 |
355 | `--fasta_report True`
356 |
357 | This option will create three fasta files:
358 | 1. All the array sequences with their origins in the header
359 | 2. All the repeat sequences with their origins and locations in the arrays
360 | 3. All the spacer sequences with their origins and locations in the arrays
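
These fasta files can be processed with standard tools. A minimal sketch that counts the entries in one of them (the file name below is a placeholder; check the result folder for the actual names produced with `--fasta_report True`):

```
# Count the sequences in one of the generated fasta reports.
def count_fasta_entries(path):
    with open(path) as handle:
        return sum(1 for line in handle if line.startswith(">"))

print(count_fasta_entries("Results/spacers.fasta"))  # placeholder file name
```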
361 |
362 | ## Improving CRISPRidentify
363 |
364 | We are constantly working on improving CRISPRidentify. If you find a bug or an incorrect/missing CRISPR array representation, please submit it via the GitHub issue interface.
365 |
366 |
367 |
368 |
369 |
--------------------------------------------------------------------------------
/TestFolderMultiline/MultilineFasta_1.fasta:
--------------------------------------------------------------------------------
1 | >gi|56475432|ref|NC_006513.1 Damage1| Aromatoleum aromaticum EbN1 chromosome
2 | ATCACGCCCTCCCATCCCGCCGATCCACCGCCCGACGATCCTTCCGCCGCTGCTCGCTCCGCACGCGCCG
3 | GTCACCATCGCGACGTTCCACCGCGCGGCGGTCGTCATGGACGGGCTCGACATAGGTTCGCTCGGCCACA
4 | TCAACCGGAGTTGGCACGAAGGCGGGGAGCGTGGTTTCGCTTTCCACGTCGCCAGTCCCGCGCAGATACC
5 | CCCAATCCACGTCGGGGCGGAGTTCTTCGCAGCGGACGGCGCCGGCTGTGGCGCGCTCGATGGCCGGGCA
6 | GCGCTCGGCGGGGATCGGTCGAACGCCAGTAACCCACTGGCTGACCGCAGCAGGCGTCACGCCGAGAGCA
7 | CGGGCCAAGGTTGCTTGGCCGCCTACGGATTCGCACGCCAAAAGGATCGGGTTTCGGTTCATGCCACGAC
8 | TATAGCACCGCTACAGATTACATATCAAGCCATGCTATTCATTCCAATAGATAGCATTGCTTCATCATGC
9 | TGATATGGTCACCGCACGAAGAAGCAGAACGTCTTAAGGCCCGTTTTGGAGCGGTCCCCAACCGGGAGAA
10 | GTTCGCTCGAGAAATTGGACTTCACGGCGGCGGATCAATGATCTACCAGCACATAAAGGGGATTCGCCCG
11 | ATCAGCCGCGAAGCGGCGGTTGCGTATGCGAAAGGCTTCAATTGCAGGCTCGAAGAAATCAGCCCGCGAA
12 | TCGCCCTTGAGATACAGCAGGCCACTTCTGTCTTGTCGCCAACGCCAGACCGTCCGCCCGAGTCGCCGAA
13 | CATCTGCGCCGGACCGGACCGCAAAGGCAAGGTGCCGCTAATCTCGTGGGTGCGCGCAGGTGAGTTCGCT
14 | CATGCCGCTGATCTTTTGCCGGTCGGCGAGGCCTATGAATGGGTGGAGACCGGCGTGAACGTGCAGCCCC
15 | ACACTTTTGCGCTGCGCGTCCAGGGCGACTCGATGGAGCCGGAATTCGTCGCTGGCACGATCATCGTGAT
16 | CGAGCCGCACATGGTCGCTGAGCCCGGCGACTACGTCATCGCCCGCAACGGCGACAACGAGGCCACTTTC
17 | AAGCAGCTCGTGCGCGACGGGGCGGACCTGTACCTCAAACCGCTGAACCCCCGCTACCCGATCAAGCCGC
18 | TGGGCGCCACGGCGATCATCGGCGTGGTTCGAGAGGCCGTGAAGCGCTATCGGTGAGCGGGTGTTTTCCA
19 | GGCCATCACCCCTTGCCCGCAATCTGTAACAGCCTCCCAACAACAACAGACTCATGCTATTTTTCAAGGG
20 | CTTGAACGGCCCGAAATTTGTCAAGTCCATCGTCGAAGGGGTCGGATTGTGGCTCGGTATCGTCAGCGGC
21 | TTGGCGTGGCTGTGGTCCGAATTGGCGATCGTGAAGGTCGAACTGACATGGGCCGTGACAGTCACGACCG
22 | GATTTTTCGTGTTCTACGTCGGGGTCCTGCTTTGCTTCACGCGGCAGGGGGTTCTGCAAACGCGCATCGA
23 | CGAATGCGCCCAGGCCAAGAAAGCGCTGGAAGAAGAAGTGCTGCGCAAGCGGCTGTCGTCCAGGAAAAAA
24 | GGCCGGTGAGGCTGATGGGGAGAAGTGGAAAATGAGCGTGATTGCCATTCACGCGGCGATCGTCGGCGTG
25 | ACGGTCGTCGTCGCATACGTCCTGCACATGCGCACGATGCGCATGAAAGCCTGTTTCAACCTCTTCCGCG
26 | TGCGCGACCGCTTCGTCCTGCTCGTCGCCAAAGACATCCTGCCCGAAGACAGCAGGGTGTTCGTCCACTA
27 | CTACGGACGCATCAACAAGCTGCTTTGCGACGCCCCGAAAGTCGGCATCGACGACATGCTGGCCACGATC
28 | TTTCGTCACGTGCCCAATGGTGAGTTCGACCAGGCACTGGAGCGCGCCCGCTCCCAGTCGCAAAAAATGC
29 | TGGCCGATCCGCTCATGCAGAACGACGAAGTGCGAGCGGCCGTGGCCGATTACTACCGCGCCATCCGCGC
30 | GATGCTGCTGTCCCACAGCAGCATCCTGAAGGTCATCTACCTGCTGTCGCACCGCTTCGCCACGTCGCTC
31 | CACTCCGGCTGGATCGGCGGCGAAGTCAGCCGCGGGCTGAAGGCCGCCGACTACGCCGACGAAGAAGCTG
32 | CCCTGTTTAAACCCGCCTGAGGTTATGGCGCCGGGGTGACGCCACGGCAGCCGGTCCTCGACACGGCCGG
33 | CTATGTTGTGGTGGGTGGCGGCATAAAAAAAGACCGCCGAAGCGGTCTTTACGTGGAATATGCAGACTCT
34 | TGCCCTACTCTTATTCCGCTGCCGCCGCCCGGCAGGTCGACGGGCGATACTTCGCGTCCACAGATCCGGC
35 | GCACGCCCACACTAGCACGCCAGTTGCGTCGGCCTTAGGCGTCAGCGTGATCGTCTCTGCGGCAATCGCG
36 | TCGTTTGCATCTACGCCGCCGGTCGATGTAATGACGCCGCTTGGTCCGATTGCGACGCTGGCGGTGTACT
37 | TACCCACGGCACCACCGTAGCCCGCAGCGGCATCACTGGCAGGGAGGGCGCCAGTCGACTGAAAAGTTTC
38 | AGCAACGGCAACCTTGGCGCCATCTGTCAGGGACATCAGTTCTGAAACCTGAGCCCGGATCGTGTAATCC
39 | TGATACGCCGGCAGCGCGACCGCTGCCAAAATCCCGATGATCGCGACGACGATCATCAGTTCGATCAGCG
40 | TGAAACCTTGTTGGACCTTTTTCATTTTCAGCTCCCTTGCTTGTTTGACGGGACGTCCCGTGCACATCGT
41 | CACGCAACAAGCGTGCCAACCACCAAAGCGGCATAATCGCCCCGGAGTGCCACCCATGTCGCACTTATTA
42 | CTGTGACATTCGGGCGAATGTCGCAGCGTGGCACACGACACGCCGCCCCCTTCCGGGCTTGAGCGCAGCC
43 | CTCTAAACTCCGTGCCCTGGAGGCTCCCATGACCGCAATCACCGTTCCCACCGCCGCACTCATCCTCGAC
44 | CGGACCACGCGCACCATCTGGCGCCGCATCGCCGACGGATCGCTGCCGGCGATCACCGAAGACGACCGGC
45 | AGAAGATCCCGCTCGACGCCGTCATCCGCGAGGCGTGCATTCCGATCGACCCGGACGACTACGAGCTCGT
46 | CACCGGCACCGACGCCGGCGATGCCGAATCGCAGTGCGACCTCGCACTGCTGTTCCTGCTGCGCGACCGC
47 | CCGCACATCGCCATGCCGCTGCTCAACCTGGCCGCCAAGGACGACTACCCGGAGGCGCTCTACCAGATCG
48 | CCCGCTGCCACATCGCCGGCAAGGGCGTGCCGCGCGACGGCAACGCCGGCATCATGTGGCTCGCTCGGGC
49 | CGCCAGCCGCGGCCACTCCGTAGCCCAGGAGCAGATGCGCGTCGTGCGCGAGTCCGGCACCGGCACCGAC
50 | CTCGACGCCCTCGACGCGCTGCTCGAGCGCATCGAACAGCGAGTCGTGTTCGCTGCACTGGAAACCACCG
51 | CAACCCGCTAGACCCCCCGCGCTTCGCAATCTGCCCGCCGCTTGAGCGGGCTTTTTTACGTCCGTAGCTT
52 | AAAGCCATTTCGCTGATATATAGCTGCGCTATTGACATTAAATATAGCGTTGCTATTATTTCTCCAACGC
53 | CTCCCTCGAGGCACCGGAGACCGCGATGCCCCCCGCTGCACCCCATCCCGTCCCGCCCGAAAAAAAGGCC
54 | >gi|56475432|ref|NC_006513.1 Damage2| Aromatoleum aromaticum EbN1 chromosome
55 | ATCCCGATGATCGCGACGACGATCATCAGTTCGATCAGCG
56 | >gi|56475432|ref|NC_006513.1 Damage3| Aromatoleum aromaticum EbN1 chromosome
57 | TGAAACCTTGTTGGACCTTTTTCATTTTCAGCTCCCTTGCTTGTTTGACGGGACGTCCCGTGCACATCGT
58 | CACGCAACAAGCGTGCCAACCACCAAAGCGGCATAATCGCCCCGGAGTGCCACC
59 |
--------------------------------------------------------------------------------
/components/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/components/__init__.py
--------------------------------------------------------------------------------
/components/components_detection_refinement.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import json
3 | from functools import wraps
4 | from itertools import groupby
5 |
6 |
7 | class SameStartEndFilter:
8 | def __init__(self, dict_crispr_candidates):
9 | self.dict_crispr_candidates = dict_crispr_candidates
10 | self.dict_filtered_start_end_crispr_candidates = {}
11 |
12 | self._filter_fuzzy_searches_same_start_end()
13 |
14 | def _filter_fuzzy_searches_same_start_end(self):
15 | for cluster_seq, list_fuzzy_s in self.dict_crispr_candidates.items():
16 | list_start_end = [fuzzy_s.start_end for fuzzy_s in list_fuzzy_s]
17 | pattern_len = [len(fuzzy_s.repeat_candidate) for fuzzy_s in list_fuzzy_s]
18 | tuples_st_end_len = zip(list_start_end, pattern_len)
19 |
20 | list_categories = [[fuzzy_s for fuzzy_s in list_fuzzy_s if
21 | (fuzzy_s.start_end, len(fuzzy_s.repeat_candidate)) == tuple_info]
22 | for tuple_info in tuples_st_end_len]
23 |
24 | best_fuzzy_s = [sorted(category, key=lambda x: x.number_errors)[0]
25 | for category in list_categories]
26 |
27 | best_fuzzy_s_unique_repeat = []
28 | u_repeats = []
29 | for b_fuz in best_fuzzy_s:
30 | repeat = b_fuz.repeat_candidate
31 | if repeat not in u_repeats:
32 | u_repeats.append(repeat)
33 | best_fuzzy_s_unique_repeat.append(b_fuz)
34 |
35 | self.dict_filtered_start_end_crispr_candidates[cluster_seq] = best_fuzzy_s_unique_repeat
36 |
37 | def output(self):
38 | return self.dict_filtered_start_end_crispr_candidates
39 |
40 |
41 | # For filtering out non CRISPR cases
42 | #############################################################
43 | #############################################################
44 | DEBUG_MODE = False
45 |
46 |
47 | def exception_handler(function):
48 | @wraps(function)
49 | def wrapper(*args, **kwargs):
50 | try:
51 | result = function(*args, **kwargs)
52 | return result
53 | except Exception:
54 | return False
55 | return wrapper
56 |
57 |
58 | def printing_if_filtered(function):
59 | @wraps(function)
60 | def wrapper(*args, **kwargs):
61 | result = function(*args, **kwargs)
62 | if DEBUG_MODE:
63 | if not result:
64 | with open("filtered_results.txt", "a") as f:
65 | f.write("\n\n")
66 | f.write("\n".join([str(arg) for arg in args]))
67 | f.write("\n\n")
68 | f.write(function.__name__)
69 | f.write("\n\n")
70 |
71 | return result
72 | return wrapper
73 |
74 |
75 | class AdvancedFuzzySearchFilter:
76 | def __init__(self, min_column_dominance_repeat, min_avg_spacer_length,
77 | max_spacer_length, max_column_dominance_spacer, max_allowed_consecutive_spacers,
78 | max_allowed_same_spacers, max_inconsistent_columns, min_avg_repeat_length,
79 | max_avg_repeat_length, max_avg_spacer_length, min_repeats):
80 |
81 | self.column_dominance = min_column_dominance_repeat
82 | self.min_avg_spacer_length = min_avg_spacer_length
83 | self.max_spacer_length = max_spacer_length
84 | self.max_column_dominance_spacer = max_column_dominance_spacer
85 | self.max_allowed_consecutive_spacers = max_allowed_consecutive_spacers
86 | self.max_allowed_same_spacers = max_allowed_same_spacers
87 | self.max_inconsistent_columns = max_inconsistent_columns
88 | self.min_avg_repeat_length = min_avg_repeat_length
89 | self.max_avg_repeat_length = max_avg_repeat_length
90 | self.max_avg_spacer_length = max_avg_spacer_length
91 | self.min_number_repeats = min_repeats
92 |
93 | @printing_if_filtered
94 | @exception_handler
95 | def _filter_by_column(self, candidate):
96 | def find_first_three_columns():
97 | list_three_columns = []
98 | list_gaped_repeats = candidate.list_gaped_repeats
99 | for index in range(len(list_gaped_repeats[0])):
100 | column_vec = [repeat[index] for repeat in list_gaped_repeats]
101 | column_gaps = sum([1 for x in column_vec if (x == " ")])
102 | percentage_gaps = column_gaps / len(column_vec)
103 | if percentage_gaps < 0.5:
104 | list_three_columns.append(column_vec)
105 | if len(list_three_columns) == 3:
106 | return list_three_columns
107 |
108 | def find_last_three_columns():
109 | list_three_columns = []
110 | list_gaped_repeats = candidate.list_gaped_repeats
111 | for index in range(len(list_gaped_repeats[0])-1, 0, -1):
112 | column_vec = [repeat[index] for repeat in list_gaped_repeats]
113 | column_gaps = sum([1 for x in column_vec if (x == " ")])
114 | percentage_gaps = column_gaps/len(column_vec)
115 | if percentage_gaps < 0.5:
116 | list_three_columns.append(column_vec)
117 | if len(list_three_columns) == 3:
118 | return list_three_columns
119 |
120 | for column in find_first_three_columns():
121 | column_characters = [x for x in column if (x not in (" ", "-"))]
122 | if column_characters:
123 | most_freq_char = max(column_characters, key=column_characters.count)
124 | most_freq_char_freq = column_characters.count(most_freq_char)
125 | freq = most_freq_char_freq/len(column_characters)
126 | if len(column) <= 4:
127 | if freq < 0.49:
128 | return False
129 | else:
130 | if freq < self.column_dominance:
131 | return False
132 | else:
133 | return False
134 |
135 | for column in find_last_three_columns():
136 | column_characters = [x for x in column if (x not in (" ", "-"))]
137 | if column_characters:
138 | most_freq_char = max(column_characters, key=column_characters.count)
139 | most_freq_char_freq = column_characters.count(most_freq_char)
140 | freq = most_freq_char_freq/len(column_characters)
141 | if len(column) <= 4:
142 | if freq < 0.49:
143 | return False
144 | else:
145 | if freq < self.column_dominance:
146 | return False
147 | else:
148 | return False
149 | return True
150 |
151 | @printing_if_filtered
152 | @exception_handler
153 | def _filter_by_min_avg_spacer(self, candidate):
154 | list_spacers = candidate.list_spacers
155 | avg_len = sum(len(x) for x in list_spacers) / len(list_spacers)
156 | if avg_len > self.min_avg_spacer_length:
157 | return True
158 | return False
159 |
160 | @printing_if_filtered
161 | @exception_handler
162 | def _filter_by_max_spacer(self, candidate):
163 | list_spacers = candidate.list_spacers
164 | long_spacers = [spacer for spacer in list_spacers if len(spacer) > self.max_spacer_length]
165 | if len(long_spacers) / len(list_spacers) > 0.3:
166 | return False
167 | if len(long_spacers) > 3:
168 | return False
169 | return True
170 |
171 | @printing_if_filtered
172 | @exception_handler
173 | def _filter_by_spacer_begin_end_similarity(self, candidate):
174 | list_spacers = candidate.list_spacers
175 | if len(list_spacers) >= 2:
176 | column_begin = [spacer[0] for spacer in list_spacers if spacer]
177 | most_freq_char_begin = max(column_begin, key=column_begin.count)
178 | most_freq_char_freq_begin = column_begin.count(most_freq_char_begin)
179 |
180 | freq_begin = most_freq_char_freq_begin / len(column_begin)
181 | if freq_begin > self.max_column_dominance_spacer:
182 | return False
183 |
184 | column_end = [spacer[-1] for spacer in list_spacers if spacer]
185 | most_freq_char_end = max(column_end, key=column_end.count)
186 | most_freq_char_freq_end = column_end.count(most_freq_char_end)
187 |
188 | freq_end = most_freq_char_freq_end / len(column_end)
189 | if freq_end > self.max_column_dominance_spacer:
190 | return False
191 | return True
192 |
193 | @printing_if_filtered
194 | @exception_handler
195 | def _filter_by_the_same_spacer(self, candidate):
196 | list_spacers = candidate.list_spacers
197 | list_spacers = [s for s in list_spacers if s]
198 | groups = [len(list(group)) for key, group in groupby(list_spacers)]
199 | if self.max_allowed_consecutive_spacers:
200 | if max(groups) > self.max_allowed_consecutive_spacers:
201 | return False
202 |
203 | list_sorted_spacers = sorted(list_spacers)
204 | groups_sorted = [len(list(group)) for key, group in groupby(list_sorted_spacers)]
205 | if self.max_allowed_same_spacers:
206 | if max(groups_sorted) > self.max_allowed_same_spacers:
207 | return False
208 | return True
209 |
210 | @printing_if_filtered
211 | @exception_handler
212 | def _filter_by_overall_repeat_consistency(self, candidate):
213 | list_column_consistency = []
214 | list_repeats_gaped = candidate.list_gaped_repeats
215 | for index, _ in enumerate(list_repeats_gaped[0]):
216 | column = [repeat[index] for repeat in list_repeats_gaped]
217 | column_characters = [x for x in column if (x not in (" ", "-"))]
218 | try:
219 | most_freq_char = max(column_characters, key=column_characters.count)
220 | most_freq_char_freq = column_characters.count(most_freq_char)
221 | freq = most_freq_char_freq / len(column_characters)
222 | list_column_consistency.append(freq)
223 | except ValueError:
224 | pass
225 |
226 | number_inconsistent = sum(1 for x in list_column_consistency if x < 0.66)
227 | if number_inconsistent > self.max_inconsistent_columns:
228 | return False
229 | return True
230 |
231 | @printing_if_filtered
232 | @exception_handler
233 | def _filter_min_number_repeats(self, candidate):
234 | list_repeats = candidate.list_repeats
235 | if len(list_repeats) >= self.min_number_repeats:
236 | return True
237 | return False
238 |
239 | @printing_if_filtered
240 | @exception_handler
241 | def _filter_min_avg_repeat_length(self, candidate):
242 | list_repeats = candidate.list_repeats
243 | avg_len = sum(len(x) for x in list_repeats) / len(list_repeats)
244 | if avg_len >= self.min_avg_repeat_length:
245 | return True
246 | return False
247 |
248 | @printing_if_filtered
249 | @exception_handler
250 | def _filter_max_avg_repeat_length(self, candidate):
251 | list_repeats = candidate.list_repeats
252 | avg_len = sum(len(x) for x in list_repeats) / len(list_repeats)
253 | if avg_len <= self.max_avg_repeat_length:
254 | return True
255 | return False
256 |
257 | @printing_if_filtered
258 | @exception_handler
259 | def _filter_max_avg_spacer_length(self, candidate):
260 | list_spacers = candidate.list_spacers
261 | if len(list_spacers) > 4:
262 |             avg_len = sum(len(x) for x in list_spacers[1:-1]) / len(list_spacers[1:-1])  # ignore edge spacers
263 |             if avg_len <= self.max_avg_spacer_length:
264 |                 return True
265 |         else:
266 |             avg_len = sum(len(x) for x in list_spacers) / len(list_spacers)
267 |             if avg_len <= self.max_avg_spacer_length:
268 |                 return True
269 | return False
270 |
271 | @printing_if_filtered
272 | @exception_handler
273 | def _filter_min_repeat_length(self, candidate):
274 |         list_repeats = candidate.list_repeats
275 |         avg_len = sum(len(x) for x in list_repeats) / len(list_repeats)
276 | if avg_len >= self.min_avg_repeat_length:
277 | return True
278 | return False
279 |
280 | def __call__(self, candidate):
281 | if not self._filter_by_column(candidate):
282 | return
283 | if not self._filter_by_min_avg_spacer(candidate):
284 | return
285 | if not self._filter_by_max_spacer(candidate):
286 | return
287 | if not self._filter_by_spacer_begin_end_similarity(candidate):
288 | return
289 | if not self._filter_by_the_same_spacer(candidate):
290 | return
291 | if not self._filter_by_overall_repeat_consistency(candidate):
292 | return
293 | if not self._filter_max_avg_repeat_length(candidate):
294 | return
295 | if not self._filter_min_avg_repeat_length(candidate):
296 | return
297 | if not self._filter_max_avg_spacer_length(candidate):
298 | return
299 | if not self._filter_min_number_repeats(candidate):
300 | return
301 | return candidate
302 |
303 | # CRISPR Candidate
304 | #####################################################
305 | #####################################################
306 | class CrisprConsensus(object):
307 | def __init__(self, list_repeats_gaped):
308 | self.list_repeats_gaped = list_repeats_gaped
309 |
310 | self.num_different_repeat_length = None
311 | self.consensus = None
312 | self.consensus_no_gap = None
313 | self.len_consensus = None
314 | self.number_repeats = None
315 |
316 | self._check_repeat_length()
317 | self._compute_consensus()
318 |
319 | def _check_repeat_length(self):
320 | list_lengths = [len(repeat) for repeat in self.list_repeats_gaped]
321 | self.num_different_repeat_length = len(set(list_lengths))
322 |
323 | def _compute_consensus(self):
324 | if self.num_different_repeat_length == 0:
325 | print('Got repeats of 0 length')
326 | elif self.num_different_repeat_length != 1:
327 | print('Got a case with different repeat lengths')
328 | for rep_gapped in self.list_repeats_gaped:
329 | print(rep_gapped)
330 | else:
331 | self.consensus = ''
332 | for char_ind, _ in enumerate(self.list_repeats_gaped[0]):
333 | list_char_in_column = [repeat[char_ind] for repeat in self.list_repeats_gaped]
334 | counter = collections.Counter(list_char_in_column)
335 | freq = counter.most_common()
336 | most_common_char = freq[0][0] if freq[0][0] != '-' else freq[1][0]
337 | self.consensus += most_common_char
338 |
339 | self.consensus_no_gap = self.consensus.replace(' ', '').replace('+', '')
340 | self.len_consensus = len(self.consensus_no_gap)
341 |
342 | def output(self):
343 | return self.consensus_no_gap, self.consensus
344 |
345 |
346 | class CrisprCandidate(object):
347 | def __init__(self, list_repeats, list_repeats_gaped, list_spacers, list_repeat_starts):
348 | self.list_repeats = list_repeats
349 | self.list_repeats_gaped = list_repeats_gaped
350 | self.list_spacers = list_spacers
351 | self.list_repeat_starts = list_repeat_starts
352 |
353 | self.list_repeat_mismatches = []
354 | self.list_mismatches_indexes = []
355 |
356 | self.consensus = None
357 | self.consensus_gaped = None
358 | self.total_mismatches = None
359 |
360 | self._filter_redundant_insertion_deletions()
361 | self._compute_consensus()
362 | self._compute_mismatches()
363 |
364 | self.list_gaped_repeats = self.list_repeats_gaped
365 |
366 | def _filter_redundant_insertion_deletions(self):
367 | def _fix_repeats(list_repeats, list_bad_indexes_to_fix):
368 | list_repeats_new = []
369 | for repeat in list_repeats:
370 | list_repeats_new.append(_fix_repeat(repeat, list_bad_indexes_to_fix))
371 |
372 | return list_repeats_new
373 |
374 | def _fix_repeat(repeat, list_bad_indexes_to_fix):
375 | new_repeat = ''
376 | for index, char in enumerate(repeat):
377 | if index not in list_bad_indexes_to_fix:
378 | new_repeat += char
379 |
380 | return new_repeat
381 |
382 | list_bad_indexes = []
383 | for char_ind, _ in enumerate(self.list_repeats_gaped[0]):
384 | list_char_in_column = [repeat[char_ind] for repeat in self.list_repeats_gaped]
385 | chars = set(list_char_in_column)
386 |
387 | if chars == {' '} or chars == {'-'}:
388 | list_bad_indexes.append(char_ind)
389 |
390 | if list_bad_indexes:
391 | self.list_repeats_gaped = _fix_repeats(self.list_repeats_gaped, list_bad_indexes)
392 |
393 | def _compute_consensus(self):
394 | self.consensus, self.consensus_gaped = CrisprConsensus(self.list_repeats_gaped).output()
395 |
396 | def _compute_mismatches(self):
397 | def _compute_mismatches_repeat(gaped_repeat):
398 | substitutions = 0
399 | insertions = 0
400 | deletions = 0
401 | list_mismatches_indexes_one_repeat = []
402 | for index, char_repeat, char_con_repeat in zip(range(len(gaped_repeat)),
403 | gaped_repeat,
404 | self.consensus_gaped):
405 |
406 | if char_con_repeat == ' ':
407 | if char_repeat != ' ':
408 | insertions += 1
409 | list_mismatches_indexes_one_repeat.append(index)
410 | else:
411 | if char_repeat == char_con_repeat:
412 | pass
413 | else:
414 | if char_repeat == '-':
415 | deletions += 1
416 | list_mismatches_indexes_one_repeat.append(index)
417 | elif char_repeat == ' ':
418 | deletions += 1
419 | else:
420 | substitutions += 1
421 | list_mismatches_indexes_one_repeat.append(index)
422 |
423 | return substitutions, insertions, deletions, list_mismatches_indexes_one_repeat
424 |
425 | for gaped_repeat in self.list_repeats_gaped:
426 | s, i, d, list_mismatches_indexes_one_repeat = _compute_mismatches_repeat(gaped_repeat)
427 | total = s + i + d
428 | repeat_stats = [s, i, d, total]
429 | self.list_repeat_mismatches.append(repeat_stats)
430 | self.list_mismatches_indexes.append(list_mismatches_indexes_one_repeat)
431 |
432 | self.total_mismatches = sum([x[3] for x in self.list_repeat_mismatches])
433 |
434 | def dot_repeat(self, gaped_repeat):
435 | string = ''
436 | substitutions = 0
437 | insertions = 0
438 | deletions = 0
439 | for char_repeat, char_consensus in zip(gaped_repeat, self.consensus_gaped):
440 | if char_consensus == ' ':
441 | string += char_repeat
442 | if char_repeat != ' ':
443 | insertions += 1
444 | else:
445 | if char_repeat == char_consensus:
446 | string += '.'
447 | else:
448 | string += char_repeat
449 | if char_repeat == '-':
450 | deletions += 1
451 | elif char_repeat == ' ':
452 | deletions += 1
453 | else:
454 | substitutions += 1
455 | return string, substitutions, insertions, deletions
456 |
457 | def dot_repr(self):
458 | string = ''
459 | g_s, g_i, g_d = 0, 0, 0
460 | max_length_start_index = max(len(str(start)) for start in self.list_repeat_starts) + 3
461 | max_length_spacer = max(len(spacer) for spacer in self.list_spacers) + 3
462 |
463 | for index, gaped_repeat in enumerate(self.list_repeats_gaped):
464 | repeat_start_index = self.list_repeat_starts[index] + 1
465 | n_gaps_after_start = max_length_start_index - len(str(repeat_start_index))
466 |
467 | if index == len(self.list_spacers):
468 | spacer = ""
469 | else:
470 | spacer = self.list_spacers[index]
471 | n_gaps_after_spacer = max_length_spacer - len(spacer)
472 |
473 | dotted_repeats, s, i, d = self.dot_repeat(gaped_repeat)
474 | errors = " s:{} i:{} d:{}".format(s, i, d)
475 | g_s += s
476 | g_i += i
477 | g_d += d
478 |
479 | string += "{}{}{} {}{}{}\n".format(repeat_start_index,
480 | " " * n_gaps_after_start,
481 | dotted_repeats, spacer,
482 | " " * n_gaps_after_spacer,
483 | errors)
484 |
485 | string += "_" * 100 + "\n"
486 |
487 | string += " " * max_length_start_index + self.consensus_gaped
488 | string += " " * (max_length_spacer + 2) + " s:{} i:{} d:{}".format(g_s, g_i, g_d) + "\n"
489 |
490 | return string
491 |
492 | def dot_repr_web_server(self):
493 | string = ''
494 | g_s, g_i, g_d = 0, 0, 0
495 | max_length_start_index = max(len(str(start)) for start in self.list_repeat_starts) + 3
496 | max_length_spacer = max(len(spacer) for spacer in self.list_spacers) + 3
497 |
498 | for index, gaped_repeat in enumerate(self.list_repeats_gaped):
499 | repeat_start_index = self.list_repeat_starts[index] + 1
500 | n_gaps_after_start = max_length_start_index - len(str(repeat_start_index))
501 |
502 | if index == len(self.list_spacers):
503 | spacer = ""
504 | else:
505 | spacer = "$" + self.list_spacers[index] + "$"
506 | n_gaps_after_spacer = max_length_spacer - len(spacer)
507 |
508 | dotted_repeats, s, i, d = self.dot_repeat(gaped_repeat)
509 | errors = " s:{} i:{} d:{}".format(s, i, d)
510 | g_s += s
511 | g_i += i
512 | g_d += d
513 |
514 | string += "{}{}{} {}{}{}\n".format(repeat_start_index,
515 | " " * n_gaps_after_start,
516 | dotted_repeats, spacer,
517 | " " * n_gaps_after_spacer,
518 | errors)
519 |
520 | string += "_" * 100 + "\n"
521 |
522 | string += " " * max_length_start_index + self.consensus_gaped
523 | string += " " * (max_length_spacer + 2) + " s:{} i:{} d:{}".format(g_s, g_i, g_d) + "\n"
524 |
525 | string += "_" * 100 + "\n"
526 |
527 | string += "consensus: " + self.consensus + "\n"
528 |
529 | return string
530 |
531 | def write_file(self, file_name):
532 | with open(file_name, "w") as f:
533 | f.write(self.dot_repr())
534 |
535 | def write_as_json(self, filename):
536 | dict_to_write = {"repeat_begins": self.list_repeat_starts,
537 | "repeats": self.list_repeats,
538 | "repeats_gaped": self.list_repeats_gaped,
539 | "spacers": self.list_spacers}
540 |
541 | with open(filename, 'w') as outfile:
542 | json.dump(dict_to_write, outfile)
543 |
544 | def compute_stats(self):
545 | start = self.list_repeat_starts[0] + 1
546 | end = self.list_repeat_starts[-1] + len(self.list_repeats[-1])
547 | avg_repeat = len(self.consensus)
548 | avg_spacer = int(sum((len(spacer) for spacer in self.list_spacers)) / len(self.list_spacers))
549 | number_repeats = len(self.list_repeats)
550 | return {"start": start, "end": end, "avg_repeat": avg_repeat,
551 | "avg_spacer": avg_spacer, "number_repeats": number_repeats}
552 |
553 | @classmethod
554 | def init_from_json(cls, file_name):
555 | with open(file_name) as json_file:
556 | dict_data = json.load(json_file)
557 |
558 |         list_repeats = dict_data["repeats"]
559 | list_repeats_starts = dict_data["repeat_begins"]
560 | list_spacers = dict_data["spacers"]
561 | list_repeats_gaped = dict_data["repeats_gaped"]
562 |
563 |         return cls(list_repeats=list_repeats, list_spacers=list_spacers,
564 | list_repeats_gaped=list_repeats_gaped, list_repeat_starts=list_repeats_starts)
565 |
566 | def __repr__(self):
567 | return self.dot_repr()
568 |
569 | def __eq__(self, other):
570 | if self.list_repeats == other.list_repeats:
571 | if self.list_repeats_gaped == other.list_repeats_gaped:
572 | if self.list_spacers == other.list_spacers:
573 | return True
574 | return False
575 |
576 | def __ne__(self, other):
577 | return not self.__eq__(other)
578 |
--------------------------------------------------------------------------------
/components/components_helpers.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from os import listdir
4 | from os.path import isfile, join
5 |
6 |
7 | def process_string_from_header(input_string):
8 | # Define the function to replace based on the condition
9 | def replace_match(match):
10 | # If it's an integer, remove the dot and integer
11 | if match.group(2).isdigit():
12 | return match.group(1)
13 | # If it's not an integer, replace the dot with a hyphen
14 | return match.group(1) + "-" + match.group(2)
15 |
16 | # Use regex to find patterns with a dot followed by any characters
17 | result = re.sub(r'(\w+)\.(\w+)', replace_match, input_string)
18 | return result
19 |
20 | def multiline_fasta_check(file):
21 | with open(file, "r") as f:
22 | lines = f.readlines()
23 |         number_of_inputs = sum(1 for line in lines if line.startswith(">"))
24 | return number_of_inputs != 1
25 |
26 |
27 | def multiline_fasta_handle(file):
28 | base_name = str(os.path.basename(file).split(".")[0])
29 | try:
30 | os.mkdir(base_name)
31 | except OSError:
32 | pass
33 |
34 |     cmd = f"cat {file}"
35 |     cmd += " | awk '{ if (substr($0, 1, 1)==\">\") {"
36 |     cmd += "filename=(\"{}/\"".format(base_name)
37 |     cmd += "substr($0,2)\".fa\")} "
38 |     cmd += "print $0 > filename "
39 |     cmd += "}'"
40 |
41 | os.system(cmd)
42 |
43 | return base_name
44 |
45 |
46 | def multiline_fasta_handle_python(file, flag_ncbi_formatting=False):
47 | base_name = str(os.path.basename(file).split(".")[0])
48 | try:
49 | os.mkdir(base_name)
50 | except OSError:
51 | pass
52 |
53 | with open(file, "r") as f:
54 | lines = f.readlines()
55 |
56 | headers = []
57 | dna_sequences = []
58 |
59 | dna_sequence = ''
60 | for line in lines:
61 | if line:
62 |             if line.startswith(">"):
63 | if dna_sequence:
64 | dna_sequences.append(dna_sequence)
65 | dna_sequence = ''
66 | headers.append(line)
67 | else:
68 | dna_sequence += line.strip()
69 |
70 | if dna_sequence:
71 | dna_sequences.append(dna_sequence)
72 |
73 | if flag_ncbi_formatting:
74 |         for header, dna_sequence in zip(headers, dna_sequences):
75 |             new_header = header.strip().split(" ")[0]
76 |             new_header = process_string_from_header(new_header)
77 |             file_name = new_header.split(">")[1].replace(",", "-") \
78 |                 .replace(".", "-").replace(" ", "_").replace("|", "-") + ".fa"
79 |             with open(os.path.join(base_name, file_name), "w") as f:
80 |                 f.write(new_header)
81 |                 f.write("\n")
82 |                 f.write(dna_sequence)
83 | else:
84 | for header, dna_sequence in zip(headers, dna_sequences):
85 | file_name = header.strip().split(">")[1].replace(",", "_")\
86 | .replace(".", "_").replace(" ", "_").replace("|", "_") + ".fa"
87 | with open(os.path.join(base_name, file_name), "w") as f:
88 | f.write(header)
89 | f.write(dna_sequence)
90 |
91 | return base_name
92 |
93 |
94 | def folder_of_multifasta_handle(folder_multifasta):
95 | list_files = [f for f in listdir(folder_multifasta) if isfile(join(folder_multifasta, f))]
96 | all_lines_in_files = []
97 | for file in list_files:
98 | with open(os.path.join(folder_multifasta, file), "r") as f:
99 | lines = f.readlines()
100 | all_lines_in_files.append(lines)
101 | with open("multifasta_folder.fa", "w") as f:
102 | for lines in all_lines_in_files:
103 | for line in lines:
104 | f.write(line)
105 |
106 | multiline_fasta_handle_python("multifasta_folder.fa")
107 | return "multifasta_folder"
--------------------------------------------------------------------------------
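Note: a minimal usage sketch for the helpers above, assuming a multi-record FASTA
file at the illustrative path "input.fa":

    if multiline_fasta_check("input.fa"):
        # writes one <header>.fa file per record into a folder named after the
        # input file and returns that folder name
        folder = multiline_fasta_handle_python("input.fa", flag_ncbi_formatting=False)
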
/components/components_ml.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import joblib
3 | import sklearn.neighbors
4 | import sklearn.svm
5 | import sklearn.naive_bayes
6 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
7 | from sklearn.neural_network import MLPClassifier
8 |
9 |
7 | class ClassifierWrapper(object):
8 | def __init__(self, classifier_type, load_option=None, hyper_parameters=None):
9 | self.classifier_type = classifier_type
10 | self._hyper_parameters = hyper_parameters
11 | self._load_option = load_option
12 |
13 | self._init_classifier()
14 |
15 | def _init_classifier(self):
16 | if self._load_option:
17 | self._load_model()
18 | else:
19 | if self.classifier_type == 'k_near_neighbors':
20 |
21 | if not self._hyper_parameters:
22 | self.classifier = sklearn.neighbors.KNeighborsClassifier(n_neighbors=7)
23 | else:
24 | self.classifier = sklearn.neighbors.KNeighborsClassifier(**self._hyper_parameters)
25 |
26 | elif self.classifier_type == 'svm':
27 |
28 | if not self._hyper_parameters:
29 | self.classifier = sklearn.svm.SVC()
30 | else:
31 | self.classifier = sklearn.svm.SVC(**self._hyper_parameters)
32 |
33 | elif self.classifier_type == 'naive_bayes':
34 |
35 | if not self._hyper_parameters:
36 | self.classifier = sklearn.naive_bayes.GaussianNB()
37 | else:
38 | self.classifier = sklearn.naive_bayes.GaussianNB(**self._hyper_parameters)
39 |
40 | elif self.classifier_type == 'random_forest':
41 |
42 | if not self._hyper_parameters:
43 | self.classifier = RandomForestClassifier(max_depth=3, random_state=None)
44 | else:
45 | self.classifier = RandomForestClassifier(**self._hyper_parameters)
46 |
47 | elif self.classifier_type == 'neural_network':
48 |
49 | if not self._hyper_parameters:
50 | self.classifier = MLPClassifier(solver='lbfgs', alpha=1e-5,
51 | hidden_layer_sizes=(100, 100), random_state=None)
52 | else:
53 | self.classifier = MLPClassifier(**self._hyper_parameters)
54 |
55 | elif self.classifier_type == 'extra_trees':
56 |
57 | if not self._hyper_parameters:
58 | self.classifier = ExtraTreesClassifier(max_depth=4)
59 | else:
60 | self.classifier = ExtraTreesClassifier(**self._hyper_parameters)
61 |
62 | else:
63 | raise ValueError('Wrong classifier')
64 |
65 | def _load_model(self):
66 | self.classifier = joblib.load(self._load_option)
67 |
68 | def train_classifier(self, train_set_pos, train_set_neg):
69 | train_y_pos = np.ones(len(train_set_pos))
70 | train_y_neg = np.zeros(len(train_set_neg))
71 | train_y = np.concatenate([train_y_pos, train_y_neg])
72 | train_x = np.concatenate([train_set_pos, train_set_neg])
73 | self.classifier.fit(train_x, train_y)
74 |
75 | def test_classifier(self, test_set_pos, test_set_neg):
76 | if (test_set_pos is not None) and (test_set_neg is not None):
77 | test_set_y_pos = np.ones(len(test_set_pos))
78 | test_set_y_neg = np.zeros(len(test_set_neg))
79 | test_set_y = np.concatenate([test_set_y_pos, test_set_y_neg])
80 | test_set_x = np.concatenate([test_set_pos, test_set_neg])
81 |
82 | elif test_set_pos is not None:
83 | test_set_y = np.ones(len(test_set_pos))
84 | test_set_x = test_set_pos
85 |
86 | elif test_set_neg is not None:
87 | test_set_y = np.zeros(len(test_set_neg))
88 | test_set_x = test_set_neg
89 |
90 | else:
91 | raise ValueError
92 |
93 | predict = self.classifier.predict(test_set_x)
94 | dif = test_set_y - predict
95 | return 1 - np.count_nonzero(dif) / float(len(dif))
96 |
97 | def predict(self, dataset):
98 | return self.classifier.predict(dataset)
99 |
100 | def predict_proba(self, dataset):
101 | return self.classifier.predict_proba(dataset)
102 |
103 | def save_model(self, model_name_dot_pkl):
104 | joblib.dump(self.classifier, model_name_dot_pkl)
105 |
106 |
--------------------------------------------------------------------------------
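Note: a minimal sketch of loading a pre-trained model through ClassifierWrapper,
assuming one of the pickled models shipped under trained_models/extra_trees/ and
an already-built feature matrix X of shape (n_candidates, n_features); both the
path and X are illustrative:

    clf = ClassifierWrapper(classifier_type="extra_trees",
                            load_option="trained_models/extra_trees/extra_trees_subset8features.pkl")
    scores = clf.predict_proba(X)[:, 1]   # probability of the positive (CRISPR array) class
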
/components/module_detection.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | from multiprocessing import Pool
3 |
4 | from components.components_detection import VmatchRun
5 | from components.components_detection import ClusterMaker
6 | from components.components_detection import FilterApproximationClusters
7 | from components.components_detection import StartEndEnhancementClusters
8 | from components.components_detection import IntermediateEnhancementClusters
9 | from components.components_detection import ClusterSequence
10 | from components.components_detection import FuzzySearch
11 |
12 |
13 | class Detection:
14 | def __init__(self, file_path, flags, parameters, flag_dev_mode):
15 | self.file_path = file_path
16 | self.flags = flags
17 | self.parameters = parameters
18 | self.flag_parallel = flags["flag_parallel"]
19 | self.flag_cpu = flags["flag_cpu"]
20 | self.flag_fast_run = flags["flag_fast_run"]
21 | self.flag_enhancement_min_max = flags["flag_enhancement_min_max"]
22 | self.flag_enhancement_start_end = flags["flag_enhancement_start_end"]
24 | self.flag_dev_mode = flag_dev_mode
25 |
26 | self.clusters = []
27 | self.cluster_sequences = []
28 | self.dict_fuzzy_crisprs = {}
29 |
30 | self._get_complete_dna()
31 | self._run_cluster_detection()
32 | self._extract_cluster_sequences()
33 | self._run_array_detection()
34 |
35 | def _get_complete_dna(self):
36 | with open(self.file_path, 'r') as f:
37 | lines = f.readlines()
38 |
39 | self.input_header = lines[0]
40 | self.dna = ''.join([line.strip() for line in lines[1:]])
41 | self.dna_length = len(self.dna)
42 | self.dna = self.dna.upper()
43 |
44 | def _run_cluster_detection(self):
45 | vr = VmatchRun(self.file_path, self.flag_fast_run)
46 | list_repeats_from_vmatch = vr.output()
47 | #print("list vmatch repeats", list_repeats_from_vmatch)
48 |
49 | cm = ClusterMaker(list_repeats_from_vmatch, self.dna)
50 | self.clusters = cm.output()
51 |
52 | fa = FilterApproximationClusters(self.clusters)
53 | self.clusters = fa.output()
54 |
55 | st = StartEndEnhancementClusters(self.clusters)
56 | self.clusters = st.output()
57 |
58 | ie = IntermediateEnhancementClusters(self.clusters)
59 | self.clusters = ie.output()
60 |
61 | def _extract_cluster_sequences(self):
62 | for cluster in self.clusters:
63 | seq_start = max(0, cluster.begin - 100)
64 | seq_end = min(len(self.dna), cluster.end + 100)
65 | cluster_seq = self.dna[seq_start:seq_end]
66 | tup_cluster_dif_rep = tuple(cluster.list_clust_dif_rep_seq)
67 |
68 | self.cluster_sequences.append(ClusterSequence(cluster_seq, seq_start, seq_end, tup_cluster_dif_rep))
69 |
70 | @staticmethod
71 | def _parallel_run_fuzzy_run(input_tuple):
72 | repeat, sequence, start, weighted_error = input_tuple
73 |
74 | return FuzzySearch(sequence, start,
75 | repeat, weighted_error)
76 |
77 | def _run_array_detection(self):
78 | weighted_error = "{i<=3,d<=3,s<=3,i+d+s<=6}"
79 | parallel = self.flag_parallel
80 |
81 | if parallel:
82 | for cluster_sequence in self.cluster_sequences:
83 | nr = len(cluster_sequence.tuple_repeats)
84 | input_tuples = zip(cluster_sequence.tuple_repeats, [cluster_sequence.sequence] * nr,
85 | [cluster_sequence.start] * nr, [weighted_error] * nr)
86 |
87 |                 num_workers_suggested = multiprocessing.cpu_count() if self.flag_cpu == "ALL" else int(self.flag_cpu)
88 |                 max_possible = multiprocessing.cpu_count()
89 |                 num_workers = min(num_workers_suggested, max_possible)
90 | with Pool(num_workers) as p:
91 | fuzzy_results = p.map(self._parallel_run_fuzzy_run, input_tuples)
92 | fuzzy_results = [x for x in fuzzy_results if x.match_hit]
93 | fuzzy_results = [x for x in fuzzy_results if len(x.list_repeats) > 1]
94 |
95 | self.dict_fuzzy_crisprs[cluster_sequence] = fuzzy_results
96 | else:
97 | for cluster_sequence in self.cluster_sequences:
98 | list_fuzzy_results = []
99 | for repeat in cluster_sequence.tuple_repeats:
100 | fuzzy_s = FuzzySearch(cluster_sequence.sequence, cluster_sequence.start,
101 | repeat, weighted_error)
102 | if fuzzy_s.match_hit:
103 | if len(fuzzy_s.list_repeats) > 1:
104 | list_fuzzy_results.append(fuzzy_s)
105 |
106 | self.dict_fuzzy_crisprs[cluster_sequence] = list_fuzzy_results
107 |
108 | def output(self):
109 | return self.dict_fuzzy_crisprs
--------------------------------------------------------------------------------
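Note: the weighted_error string "{i<=3,d<=3,s<=3,i+d+s<=6}" in _run_array_detection
is written in the fuzzy-matching syntax of the third-party `regex` package (pinned
in environment.yml), which FuzzySearch presumably forwards to that engine; a
standalone sketch of the syntax:

    import regex

    # find occurrences of the repeat allowing at most one substitution
    hits = regex.findall(r"(?:GTTTCAGACG){s<=1}", "AAGTTTCAGACGTTTTGTTTCAGTCGAA")
    # -> ['GTTTCAGACG', 'GTTTCAGTCG']
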
/components/module_detection_refinement.py:
--------------------------------------------------------------------------------
1 | from components.components_detection_refinement import SameStartEndFilter
2 | from components.components_detection_refinement import AdvancedFuzzySearchFilter
3 | from components.components_detection_refinement import CrisprCandidate
4 |
5 |
6 | class DetectionRefinement:
7 | def __init__(self, dict_fuzzy_crisprs, parameters, flag_dev_mode):
8 | self.dict_fuzzy_crisprs = dict_fuzzy_crisprs
9 | self.parameters = parameters
10 | self.flag_dev_mode = flag_dev_mode
11 | self.dict_fuzzy_crisprs_refined_st_end = {}
12 | self.dict_fuzzy_crisprs_fully_refined = {}
13 |
14 | self._filter_out_same_start_end_cases()
15 | self._filter_out_non_crispr_cases()
16 | self._reformat_ac_crispr_candidates()
17 |
18 | def _filter_out_same_start_end_cases(self):
19 | ssef = SameStartEndFilter(self.dict_fuzzy_crisprs)
20 | self.dict_fuzzy_crisprs_refined_st_end = ssef.output()
21 |
22 | def _filter_out_non_crispr_cases(self):
23 | self.param_min_avg_repeat_length = self.parameters["param_min_avg_repeat_length"]
24 | self.param_max_avg_repeat_length = self.parameters["param_max_avg_repeat_length"]
25 | self.param_min_avg_spacer_length = self.parameters["param_min_avg_spacer_length"]
26 | self.param_max_avg_spacer_length = self.parameters["param_max_avg_spacer_length"]
27 | self.param_min_repeats = self.parameters["param_min_repeats"]
28 | self.param_max_identical_spacers = self.parameters["param_max_identical_spacers"]
29 | self.param_max_identical_cluster_spacers = self.parameters["param_max_identical_cluster_spacers"]
30 |
31 | afsf = AdvancedFuzzySearchFilter(min_column_dominance_repeat=0.6,
32 | max_spacer_length=140, max_column_dominance_spacer=0.8,
33 | max_allowed_consecutive_spacers=self.param_max_identical_cluster_spacers,
34 | max_allowed_same_spacers=self.param_max_identical_spacers,
35 | max_inconsistent_columns=5,
36 | min_avg_repeat_length=self.param_min_avg_repeat_length,
37 | max_avg_repeat_length=self.param_max_avg_repeat_length,
38 | min_avg_spacer_length=self.param_min_avg_spacer_length,
39 | max_avg_spacer_length=self.param_max_avg_spacer_length,
40 | min_repeats=self.param_min_repeats)
41 |
42 | for key, values in self.dict_fuzzy_crisprs_refined_st_end.items():
43 | list_filtered_advanced = [afsf(value) for value in values]
44 | list_filtered_advanced = [x for x in list_filtered_advanced if x]
45 | if not list_filtered_advanced:
46 | sorted_by_num_errors = sorted(list(values), key=lambda x: x.number_errors)
47 | if sorted_by_num_errors:
48 | candidate_fewer_mismatches = sorted_by_num_errors[0]
49 | self.dict_fuzzy_crisprs_fully_refined[key] = [candidate_fewer_mismatches]
50 | else:
51 | self.dict_fuzzy_crisprs_fully_refined[key] = list_filtered_advanced
52 |
53 | def _reformat_ac_crispr_candidates(self):
54 | self.dict_crispr_candidates = {}
55 | for key, list_fuzzy in self.dict_fuzzy_crisprs_fully_refined.items():
56 | new_key = (key.start, key.end)
57 | list_crispr_candidates = [CrisprCandidate(fuzzy.list_repeats, fuzzy.list_gaped_repeats,
58 | fuzzy.list_spacers, fuzzy.list_absolute_start)
59 | for fuzzy in list_fuzzy]
60 |
61 | self.dict_crispr_candidates[new_key] = list_crispr_candidates
62 |
63 | def output(self):
64 | return self.dict_crispr_candidates
65 |
--------------------------------------------------------------------------------
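Note: the AdvancedFuzzySearchFilter construction above is repeated verbatim in
module_evaluation.py and module_evaluated_arrays_enhancement.py; a hedged
refactoring sketch that would centralise the shared configuration (the helper
name is hypothetical):

    def build_default_afsf(parameters):
        # single place for the hard-coded thresholds and the user-facing parameters
        return AdvancedFuzzySearchFilter(
            min_column_dominance_repeat=0.6,
            max_spacer_length=140, max_column_dominance_spacer=0.8,
            max_allowed_consecutive_spacers=parameters["param_max_identical_cluster_spacers"],
            max_allowed_same_spacers=parameters["param_max_identical_spacers"],
            max_inconsistent_columns=5,
            min_avg_repeat_length=parameters["param_min_avg_repeat_length"],
            max_avg_repeat_length=parameters["param_max_avg_repeat_length"],
            min_avg_spacer_length=parameters["param_min_avg_spacer_length"],
            max_avg_spacer_length=parameters["param_max_avg_spacer_length"],
            min_repeats=parameters["param_min_repeats"])
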
/components/module_evaluated_arrays_enhancement.py:
--------------------------------------------------------------------------------
1 | from os.path import basename
2 | from components.components_evaluated_arrays_enhancement import IterativeDegeneratedSearch
3 | from components.components_evaluated_arrays_enhancement import create_boundaries_for_intervals
4 | from components.components_evaluated_arrays_enhancement import ArrayRefinerInsertionsDeletions
5 | from components.components_detection_refinement import AdvancedFuzzySearchFilter
6 |
7 |
8 | class EvaluatedArraysEnhancement:
9 | def __init__(self, file_path, categories, parameters, flag_dev_mode):
10 | self.file_path = file_path
11 | self.categories = categories
12 | self.parameters = parameters
13 | self.flag_dev_mode = flag_dev_mode
14 |
15 | self.bona_fide_arrays = categories[0]
16 | self.alternative_arrays = categories[1]
17 | self.possible_arrays = categories[2]
18 |
19 | self.dict_arrays_into_categories_enhanced = {}
20 |
21 | self._get_complete_dna()
22 | self._search_missed_or_degenerated_repeats()
23 | self._refine_nucleotides_repeat_spacer()
24 | self._filter_enhanced()
25 |
26 | def _get_complete_dna(self):
27 | with open(self.file_path, 'r') as f:
28 | lines = f.readlines()
29 |
30 | self.input_header = lines[0]
31 | self.dna = ''.join([line.strip() for line in lines[1:]])
32 | self.dna_length = len(self.dna)
33 | self.dna = self.dna.upper()
34 |
35 | def _search_missed_or_degenerated_repeats(self):
36 | for category in [self.bona_fide_arrays, self.alternative_arrays, self.possible_arrays]:
37 | intervals = []
38 | arrays_for_intervals = []
39 |
40 | for interval, list_data in category.items():
41 | intervals.append(interval)
42 | arrays_for_intervals.append([el[1] for el in list_data])
43 |
44 | boundaries = create_boundaries_for_intervals(intervals, 500)
45 |
46 | for interval, arrays_in_interval, boundary in zip(intervals, arrays_for_intervals, boundaries):
47 | for array_index, array in enumerate(arrays_in_interval):
48 | consensus = array.consensus
49 | list_repeats = array.list_repeats
50 | list_repeats_starts = array.list_repeat_starts
51 | list_spacers = array.list_spacers
52 |
54 | ids = IterativeDegeneratedSearch(full_dna=self.dna,
55 | repeat_seq_candidate=consensus,
56 | spacer_margin=self.parameters["param_spacer_margin_degenerated_search"],
57 | repeat_seq_candidate_gaped=None,
58 | list_repeats_starts=list_repeats_starts,
59 | list_repeats=list_repeats,
60 | list_spacers=list_spacers,
61 | start_flanking_region_left=boundary[0],
62 | end_flanking_region_right=boundary[1],
63 | allowed_max_editing_distance=self.parameters["param_max_edit_distance"],
64 | iterative_size_flanking_region=150,
65 | prevent_long_spacers=True,
66 | attempt_to_improve_initial_array=True)
67 |
68 | new_crispr_candidate = ids.output()
69 |
70 | if self.flag_dev_mode:
71 | if array != new_crispr_candidate:
72 | with open("log.txt", "a") as f:
73 | acc_num = basename(self.file_path).split(".")[0]
74 |                             f.write(f"Iterative degenerated search {acc_num}\n")
75 | f.write(array.dot_repr())
76 | f.write("\n\n")
77 | f.write(new_crispr_candidate.dot_repr())
78 | f.write("\n\n")
79 |
80 | """except Exception:
81 | new_crispr_candidate = array
82 |
83 | if self.flag_dev_mode:
84 | with open("log_error.txt", "a") as f:
85 | acc_num = basename(self.file_path).split(".")[0]
86 |                     f.write(f"Iterative degenerated search error {acc_num}\n")
87 | f.write(array.dot_repr())
88 | f.write("\n\n")"""
89 |
90 | category[interval][array_index][1] = new_crispr_candidate
91 |
92 | def _refine_nucleotides_repeat_spacer(self):
93 | for category in [self.bona_fide_arrays, self.alternative_arrays, self.possible_arrays]:
94 | for interval, list_data in category.items():
95 | arrays = [el[1] for el in list_data]
96 | for array_index, array in enumerate(arrays):
97 | try:
98 | arid = ArrayRefinerInsertionsDeletions(array)
99 | new_crispr_candidate = arid.output()
100 |
101 | if self.flag_dev_mode:
102 | if array != new_crispr_candidate:
103 | with open("log.txt", "a") as f:
104 | acc_num = basename(self.file_path).split(".")[0]
105 | f.write(f"Array refinement {acc_num}\n")
106 | f.write(array.dot_repr())
107 | f.write("\n\n")
108 | f.write(new_crispr_candidate.dot_repr())
109 | f.write("\n\n")
110 |
111 | except Exception:
112 | new_crispr_candidate = array
113 |
114 | if self.flag_dev_mode:
115 | with open("log_error.txt", "a") as f:
116 | acc_num = basename(self.file_path).split(".")[0]
117 | f.write(f"Array refinement error {acc_num}\n")
118 | f.write(array.dot_repr())
119 | f.write("\n\n")
120 |
121 | category[interval][array_index][1] = new_crispr_candidate
122 |
123 | def _filter_enhanced(self):
124 | self.param_min_avg_repeat_length = self.parameters["param_min_avg_repeat_length"]
125 | self.param_max_avg_repeat_length = self.parameters["param_max_avg_repeat_length"]
126 | self.param_min_avg_spacer_length = self.parameters["param_min_avg_spacer_length"]
127 | self.param_max_avg_spacer_length = self.parameters["param_max_avg_spacer_length"]
128 | self.param_min_repeats = self.parameters["param_min_repeats"]
129 | self.param_max_identical_spacers = self.parameters["param_max_identical_spacers"]
130 | self.param_max_identical_cluster_spacers = self.parameters["param_max_identical_cluster_spacers"]
131 |
132 | afsf = AdvancedFuzzySearchFilter(min_column_dominance_repeat=0.6,
133 | max_spacer_length=140, max_column_dominance_spacer=0.8,
134 | max_allowed_consecutive_spacers=self.param_max_identical_cluster_spacers,
135 | max_allowed_same_spacers=self.param_max_identical_spacers,
136 | max_inconsistent_columns=5,
137 | min_avg_repeat_length=self.param_min_avg_repeat_length,
138 | max_avg_repeat_length=self.param_max_avg_repeat_length,
139 | min_avg_spacer_length=self.param_min_avg_spacer_length,
140 | max_avg_spacer_length=self.param_max_avg_spacer_length,
141 | min_repeats=self.param_min_repeats)
142 |
143 | bona_fide_not_filtered = self.categories[0]
144 | alternative_not_filtered = self.categories[1]
145 | possible_not_filtered = self.categories[2]
146 | low_score = self.categories[4]
147 |
148 | bona_fide_filtered = {}
149 | alternative_filtered = {}
150 | possible_filtered = {}
151 |
152 | for not_filtered_category, filtered_category in zip([bona_fide_not_filtered, alternative_not_filtered, possible_not_filtered],
153 | [bona_fide_filtered, alternative_filtered, possible_filtered]):
154 | for key, value in not_filtered_category.items():
155 | for crispr_tuple in value:
156 | crispr = crispr_tuple[1]
157 | if not afsf(crispr):
158 | if key in low_score:
159 | low_score[key].append(crispr_tuple)
160 | else:
161 | low_score[key] = [crispr_tuple]
162 | else:
163 | if key not in filtered_category:
164 | filtered_category[key] = [crispr_tuple]
165 | else:
166 | filtered_category[key].append(crispr_tuple)
167 |
168 | self.categories[0] = bona_fide_filtered
169 | self.categories[1] = alternative_filtered
170 | self.categories[2] = possible_filtered
171 | self.categories[4] = low_score
172 |
173 | def output(self):
174 | return self.categories
175 |
--------------------------------------------------------------------------------
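Note: the `categories` list indexed above follows the order produced by
ArrayEvaluation.output() in module_evaluation.py:

    categories[0]  # bona fide arrays
    categories[1]  # alternative arrays
    categories[2]  # possible arrays
    categories[3]  # possible but discarded arrays (not re-filtered here)
    categories[4]  # low-score arrays (extended by _filter_enhanced)
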
/components/module_evaluation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from components.components_evaluation import BulkFeatureExtractor
4 | from components.components_evaluation import FeatureExtractor
5 | from components.components_evaluation import get_full_vector
6 | from components.components_detection_refinement import AdvancedFuzzySearchFilter
7 |
8 |
9 | class ArrayEvaluation:
10 | def __init__(self, dict_crispr_array_candidates, list_ml_classifiers, list_features, parameters, flag_dev_mode):
11 | self.dict_crispr_array_candidates = dict_crispr_array_candidates
12 | self.list_ml_classifiers = list_ml_classifiers
13 | self.list_features = list_features
14 | self.parameters = parameters
15 | self.flag_dev_mode = flag_dev_mode
16 |
17 | self.dict_scored_result = {}
18 | self.dict_scored_result_with_all_vectors = {}
19 |
20 | self.dict_bona_fide = {}
21 | self.dict_alternative = {}
22 | self.dict_possible = {}
23 | self.dict_possible_discarded = {}
24 | self.dict_low_score = {}
25 |
26 | self._load_filter()
27 | self._extract_features_and_evaluate()
28 | self._split_into_categories()
29 |
30 | def _load_filter(self):
31 | self.param_min_avg_repeat_length = self.parameters["param_min_avg_repeat_length"]
32 | self.param_max_avg_repeat_length = self.parameters["param_max_avg_repeat_length"]
33 | self.param_min_avg_spacer_length = self.parameters["param_min_avg_spacer_length"]
34 | self.param_max_avg_spacer_length = self.parameters["param_max_avg_spacer_length"]
35 | self.param_min_repeats = self.parameters["param_min_repeats"]
36 | self.param_max_identical_spacers = self.parameters["param_max_identical_spacers"]
37 | self.param_max_identical_cluster_spacers = self.parameters["param_max_identical_cluster_spacers"]
38 |         self.afsf = AdvancedFuzzySearchFilter(min_column_dominance_repeat=0.6,
39 | max_spacer_length=140, max_column_dominance_spacer=0.8,
40 | max_allowed_consecutive_spacers=self.param_max_identical_cluster_spacers,
41 | max_allowed_same_spacers=self.param_max_identical_spacers,
42 | max_inconsistent_columns=5,
43 | min_avg_repeat_length=self.param_min_avg_repeat_length,
44 | max_avg_repeat_length=self.param_max_avg_repeat_length,
45 | min_avg_spacer_length=self.param_min_avg_spacer_length,
46 | max_avg_spacer_length=self.param_max_avg_spacer_length,
47 | min_repeats=self.param_min_repeats)
48 |
49 | def _extract_features_and_evaluate(self):
50 | bfe = BulkFeatureExtractor(self.dict_crispr_array_candidates)
51 | results = bfe.output()
52 | blast_results, orf_results, hmm_results, mfe_results = results
53 | blast_scores_1, blast_scores_2 = blast_results
54 |
55 | list_features = ['repeat_len', 'number_repeats', 'repeat_similarity',
56 | 'at_richness', 'avg_spacer_len', 'spacer_similarity',
57 | 'number_mismatches', 'spacer_evenness']
58 |
59 | for key, list_crispr_candidates in self.dict_crispr_array_candidates.items():
60 | self.dict_scored_result[key] = []
61 | self.dict_scored_result_with_all_vectors[key] = []
62 | for index, crispr_candidate in enumerate(list_crispr_candidates):
63 | final_score = 0
64 |
65 | feature_vector = FeatureExtractor(0, crispr_candidate, list_features).extract()[0]
66 |
67 | mfe = mfe_results[key][index]
68 | orf = orf_results[key][index]
69 | hmmr = hmm_results[key][index]
70 | blast1 = blast_scores_1[key][index]
71 | blast2 = blast_scores_2[key][index]
72 |
73 | feature_vector_8_incomplete = feature_vector[np.array([2, 4, 5, 6, 7])]
74 | rest_8 = np.asarray([mfe, orf, blast1])
75 | feature_vector_8 = np.concatenate((feature_vector_8_incomplete, rest_8))
76 | feature_vector_8 = feature_vector_8.reshape(1, -1)
77 |
78 | feature_vector_9_incomplete = feature_vector[np.array([1, 2, 4, 5, 7])]
79 | rest_9 = np.asarray([mfe, orf, hmmr, blast2])
80 | feature_vector_9 = np.concatenate((feature_vector_9_incomplete, rest_9))
81 | feature_vector_9 = feature_vector_9.reshape(1, -1)
82 |
83 | feature_vector_10_incomplete = feature_vector[np.array([0, 2, 3, 4, 5, 6, 7])]
84 | rest_10 = np.asarray([hmmr, blast1, blast2])
85 | feature_vector_10 = np.concatenate((feature_vector_10_incomplete, rest_10))
86 | feature_vector_10 = feature_vector_10.reshape(1, -1)
87 |
88 | dict_feature_vectors = {8: feature_vector_8,
89 | 9: feature_vector_9,
90 | 10: feature_vector_10}
91 |
92 | feature_vectors = []
93 | for ml_classifier, feature_names in zip(self.list_ml_classifiers, self.list_features):
94 | len_features = len(feature_names)
95 | feature_vector = dict_feature_vectors[len_features]
96 | feature_vectors.append(feature_vector)
97 | final_score += ml_classifier.predict_proba(feature_vector)[0][1]
98 |
99 | final_score = final_score / len(self.list_ml_classifiers)
100 | score_crispr_candidate_feature_list = [final_score, crispr_candidate, feature_vectors]
101 | self.dict_scored_result[key].append(score_crispr_candidate_feature_list)
102 |
103 | all_feature_vectors = [feature_vector_8, feature_vector_9, feature_vector_10]
104 | score_crispr_candidate_all_feature_tuple = final_score, crispr_candidate, all_feature_vectors
105 | self.dict_scored_result_with_all_vectors[key].append(score_crispr_candidate_all_feature_tuple)
106 |
107 | def _split_into_categories(self):
108 | for key, data in self.dict_scored_result.items():
109 | data_pre_possible = [candidate for candidate in data if 0.75 > candidate[0] >= 0.5]
110 | data_alternative = [candidate for candidate in data if candidate[0] >= 0.75]
111 | data_alternative_filtered = []
112 | data_bad = [candidate for candidate in data if candidate[0] < 0.5]
113 |
114 | if data_alternative:
115 | for element in data_alternative:
116 | crispr = element[1]
117 | if self.afsf(crispr):
118 | data_alternative_filtered.append(element)
119 | else:
120 | data_bad.append(element)
121 |
122 | if data_alternative_filtered:
123 | data_alternative_filtered = sorted(data_alternative_filtered, key=lambda x: x[0], reverse=True)
124 | best_candidate = data_alternative_filtered[0]
125 | data_alternative_filtered = data_alternative_filtered[1:]
126 |
127 | self.dict_bona_fide[key] = [best_candidate]
128 | if data_alternative_filtered:
129 | self.dict_alternative[key] = data_alternative_filtered
130 |
131 | if data_pre_possible:
132 | if key in self.dict_bona_fide:
133 | data_show_in_alternative = [candidate for candidate in data_pre_possible if candidate[0] >= 0.6]
134 | if data_show_in_alternative:
135 | data_show_in_alternative_filtered = []
136 | for element in data_show_in_alternative:
137 | crispr = element[1]
138 | if self.afsf(crispr):
139 | data_show_in_alternative_filtered.append(element)
140 | else:
141 | data_bad.append(element)
142 |
143 | if key in self.dict_alternative:
144 | self.dict_alternative[key] += data_show_in_alternative_filtered
145 | else:
146 | self.dict_alternative[key] = data_show_in_alternative_filtered
147 |
148 | else:
149 | data_pre_possible = sorted(data_pre_possible, key=lambda x: x[0], reverse=True)
150 | best_possible_candidate = data_pre_possible[0]
151 | possible_discarded = data_pre_possible[1:]
152 |
153 | if self.afsf(best_possible_candidate[1]):
154 | self.dict_possible[key] = [best_possible_candidate]
155 | else:
156 | data_bad.append(best_possible_candidate)
157 |
158 | if possible_discarded:
159 | self.dict_possible_discarded[key] = possible_discarded
160 |
161 | if data_bad:
162 | self.dict_low_score[key] = data_bad
163 |
164 | def _split_into_categories_with_additional_classifier(self):
166 | for key, data in self.dict_scored_result_with_all_vectors.items():
167 | data_pre_possible = [candidate for candidate in data if 0.75 > candidate[0] >= 0.5]
168 | data_alternative = [candidate for candidate in data if candidate[0] >= 0.75]
169 | data_alternative_filtered = []
170 | data_bad = [candidate for candidate in data if candidate[0] < 0.5]
171 |
172 | if data_bad:
173 | self.dict_low_score[key] = data_bad
174 |
175 | if self.flag_possible_differential_model == "possible":
176 | if data_alternative:
177 | data_alternative = sorted(data_alternative, key=lambda x: x[0], reverse=True)
178 | best_candidate = data_alternative[0]
179 | data_alternative = data_alternative[1:]
180 |
181 | self.dict_bona_fide[key] = best_candidate
182 | if data_alternative:
183 | self.dict_alternative[key] = data_alternative
184 | else:
185 | if data_alternative:
186 | for element in data_alternative:
187 | crispr = element[1]
188 | if self.afsf(crispr):
189 | data_alternative_filtered.append(element)
190 | else:
191 | data_pre_possible.append(element)
192 |
193 | data_alternative_filtered = sorted(data_alternative_filtered, key=lambda x: x[0], reverse=True)
194 | best_candidate = data_alternative_filtered[0]
195 | data_alternative_filtered = data_alternative_filtered[1:]
196 |
197 | self.dict_bona_fide[key] = [best_candidate]
198 | if data_alternative_filtered:
199 | self.dict_alternative[key] = data_alternative_filtered
200 | data_alternative = sorted(data_alternative, key=lambda x: x[0], reverse=True)
201 | best_candidate_prev_model = data_alternative[0]
202 | data_alternative_prev_model = data_alternative[1:]
203 |
204 | vectors_alternative = [get_full_vector(data[2]) for data in data_alternative]
205 | scores_new_model = [self.possible_differentiate_model.predict_proba(v)[0][1] for v in
206 | vectors_alternative]
207 |
208 | scores_new_model, data_alternative_sorted = zip(*sorted(zip(scores_new_model, data_alternative),
209 | key=lambda x: x[0], reverse=True))
210 |
211 | best_candidate = data_alternative_sorted[0]
212 | best_score = scores_new_model[0]
213 | label = 1.0 if best_score >= 0.5 else 0.0
214 |
215 | if label == 1.0:
216 | self.dict_bona_fide[key] = [best_candidate]
217 | alternative = data_alternative_sorted[1:]
218 | if alternative:
219 | self.dict_alternative[key] = alternative
220 | else:
221 | self.dict_bona_fide[key] = [best_candidate_prev_model]
222 | if data_alternative_prev_model:
223 | self.dict_alternative[key] = data_alternative_prev_model
224 |
225 | if data_pre_possible:
226 | data_pre_possible = sorted(data_pre_possible, key=lambda x: x[0], reverse=True)
227 |
228 | vectors_pre_possible = [get_full_vector(data[2]) for data in data_pre_possible]
229 | scores_new_model = [self.possible_differentiate_model.predict_proba(v)[0][1] for v in vectors_pre_possible]
230 |
231 | scores_new_model, data_pre_possible_sorted = zip(*sorted(zip(scores_new_model, data_pre_possible),
232 | key=lambda x: x[0], reverse=True))
233 |
234 | best_possible_candidate = data_pre_possible_sorted[0]
235 | best_score = scores_new_model[0]
236 | label = 1.0 if best_score >= 0.5 else 0.0
237 |
238 | if label == 1.0:
239 | self.dict_possible[key] = [best_possible_candidate]
240 | possible_discarded = data_pre_possible_sorted[1:]
241 | self.dict_possible_discarded[key] = possible_discarded
242 | else:
243 | possible_discarded = data_pre_possible_sorted
244 | self.dict_possible_discarded[key] = possible_discarded
245 |
246 | def output(self):
247 | return [self.dict_bona_fide, self.dict_alternative, self.dict_possible,
248 | self.dict_possible_discarded, self.dict_low_score]
249 |
250 |
--------------------------------------------------------------------------------
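Note: a minimal sketch of the scoring rule in _extract_features_and_evaluate: the
final score is the mean positive-class probability over all loaded classifiers,
which _split_into_categories then bins at 0.75 and 0.5 (the names below are
illustrative):

    probs = [clf.predict_proba(vec)[0][1] for clf, vec in zip(classifiers, feature_vectors)]
    final_score = sum(probs) / len(probs)
    # score >= 0.75        -> bona fide (best per locus) or alternative (the rest)
    # 0.5 <= score < 0.75  -> possible / possible discarded
    # score < 0.5          -> low score

Also note that _split_into_categories_with_additional_classifier references
self.flag_possible_differential_model and self.possible_differentiate_model,
which are never set in __init__; that code path appears unused by Pipeline.
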
/components/module_non_array_computations.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | from components.components_non_array_computations import StrandComputation
4 | from components.components_non_array_computations import StrandComputationNew
5 | from components.components_non_array_computations import FullISElementSearch
6 | from components.components_non_array_computations import complete_info_with_cas_identifier
7 | from components.components_non_array_computations import FullLeaderSeqSearch
8 | from components.components_non_array_computations import RevComComputation
9 |
10 |
11 | class NonArrayComputations:
12 | def __init__(self, file_path, categories, flags_non_arrays_computations, flag_dev_mode, absolute_directory_path):
13 | self.file_path = file_path
14 | self.categories = categories
15 | self.flags_non_arrays_computations = flags_non_arrays_computations
16 |         self.flag_dev_mode = flag_dev_mode
17 | self.absolute_directory_path = absolute_directory_path
18 |
19 | self.list_of_crisprs_bona_fide = [self.categories[0][key][0][1] for key in sorted(self.categories[0].keys())]
20 | self.list_of_crisprs_alternative = [el[1] for key in self.categories[1].keys()
21 | for el in self.categories[1][key]]
22 | self.list_of_crisprs_possible = [el[1] for key in self.categories[2].keys()
23 | for el in self.categories[2][key]]
24 |
25 | self.hmm_model_is_elements = "tools/hmm_search/models_is_element.hmm"
26 |
27 | self.is_element_result = {}
28 | self.cas_results = {}
29 |         self.cassette_results = {}
30 | self.unstructured_cas_result_from_cas_identifier = {}
31 | self.strand_results = {}
32 | self.leader_results = {}
33 | self.downstream_results = {}
34 | self.data_with_all_computations = {}
35 |
36 | self._get_complete_dna()
37 | self._calculate_all_non_array_values()
38 |
39 | def _get_complete_dna(self):
40 | with open(self.file_path, 'r') as f:
41 | lines = f.readlines()
42 |
43 | self.input_header = lines[0]
44 | self.dna = ''.join([line.strip() for line in lines[1:]])
45 | self.dna_length = len(self.dna)
46 | self.dna = self.dna.upper()
47 |
48 | def _calculate_all_non_array_values(self):
49 | self._calculate_strand()
50 | self._calculate_leader()
51 |
52 | if self.flags_non_arrays_computations["flag_cas"]:
53 | self._calculate_cas_proteins()
54 | if self.flags_non_arrays_computations["flag_is"]:
55 | self._calculate_is_elements()
56 |
57 | self.data_with_all_computations = {"IS": self.is_element_result,
58 | "Cas": self.cas_results,
59 | "Strand": self.strand_results,
60 | "Leader": [self.leader_results_bona_fide, self.leader_results_alternative, self.leader_results_possible],
61 | "Downstream": [self.downstream_results_bona_fide, self.downstream_results_alternative, self.downstream_results_possible],
62 |                                            "Unstructured_Cas": self.unstructured_cas_result_from_cas_identifier,
63 |                                            "Cassettes": self.cassette_results}
64 |
65 | def _calculate_is_elements(self):
66 | fies = FullISElementSearch(full_dna=self.dna, list_of_crisprs=self.list_of_crisprs_bona_fide,
67 | hmm_model=self.hmm_model_is_elements, min_similarity=0.9, min_coverage=0.9)
68 |
69 | self.is_element_result = fies.output()
70 |
71 | def _calculate_cas_proteins(self):
72 | def _get_crispr_intervals():
73 | intervals = [(x.compute_stats()["start"], x.compute_stats()["end"]) for x in self.list_of_crisprs_bona_fide]
74 | return intervals
75 |
76 | def _filter_cas_genes(intervals, dict_cas_genes):
77 | dict_filtered_cas_intervals = {}
78 | for key, value in dict_cas_genes.items():
79 | for interval in intervals:
80 | if interval[0] <= key[0] < interval[1]:
81 | break
82 | if interval[0] <= key[1] < interval[1]:
83 | break
84 | else:
85 | dict_filtered_cas_intervals[key] = value
86 |
87 | return dict_filtered_cas_intervals
88 |
89 | def _cluster_cas_genes(dict_cas_genes):
90 | list_clusters = []
91 | cluster = []
92 | for key in sorted(dict_cas_genes.keys()):
93 | value = dict_cas_genes[key]
94 | new_candidate = key[0], key[1], value
95 | if not cluster:
96 | cluster.append(new_candidate)
97 | elif abs(cluster[-1][1] - new_candidate[0]) < 500:
98 | cluster.append(new_candidate)
99 | else:
100 | list_clusters.append(cluster)
101 | cluster = [new_candidate]
102 |
103 | if cluster:
104 | list_clusters.append(cluster)
105 |
106 | return list_clusters
107 |
108 | def _clusters_to_simple_representation(list_clusters):
109 | list_simple_clusters = []
110 | for cluster in list_clusters:
111 | cluster_start = cluster[0][0]
112 | cluster_end = cluster[-1][1]
113 | list_cas_gene_descriptions = [x[2] for x in cluster]
114 | list_simple_clusters.append((cluster_start, cluster_end, list_cas_gene_descriptions))
115 | return list_simple_clusters
116 |
117 |         def _compute_allowed_intervals(crispr_intervals):
118 |             allowed_intervals = []
119 |             if not crispr_intervals:
120 |                 return [(0, math.inf)]
121 |             else:
122 |                 allowed_intervals.append((0, crispr_intervals[0][0]))
123 |                 for index in range(len(crispr_intervals) - 1):
124 |                     allowed_intervals.append((crispr_intervals[index][1], crispr_intervals[index+1][0]))
125 |                 allowed_intervals.append((crispr_intervals[-1][1], math.inf))
126 |                 return allowed_intervals
127 |
128 | def _group_by_output(allowed_intervals, list_simple_clusters):
129 | dict_cas_gene_order = {}
130 | for cluster in list_simple_clusters:
131 | for index, allowed_interval in enumerate(allowed_intervals):
132 | if allowed_interval[0] <= cluster[0] < allowed_interval[1]:
133 | if index in dict_cas_gene_order:
134 | dict_cas_gene_order[index].append(cluster)
135 | else:
136 | dict_cas_gene_order[index] = [cluster]
137 | break
138 | return dict_cas_gene_order
139 |
140 | def _group_by_output_separated(allowed_intervals, regular_clusters):
141 | dict_cas_gene_order_for_separated = {}
142 | for cluster in regular_clusters:
143 | for index, allowed_interval in enumerate(allowed_intervals):
144 | if allowed_interval[0] <= cluster[0][0] < allowed_interval[1]:
145 | if index in dict_cas_gene_order_for_separated:
146 | dict_cas_gene_order_for_separated[index].append(cluster)
147 | else:
148 | dict_cas_gene_order_for_separated[index] = [cluster]
149 | break
150 | return dict_cas_gene_order_for_separated
151 |
152 |         dict_cas_genes, dict_cassette_labels = complete_info_with_cas_identifier(self.file_path,
153 |                                                                                  self.absolute_directory_path)
154 |
155 |         self.cassette_results = dict_cassette_labels
156 | self.unstructured_cas_result_from_cas_identifier = dict_cas_genes
157 |
158 | intervals = _get_crispr_intervals()
159 | allowed_intervals = _compute_allowed_intervals(intervals)
160 | dict_filtered_cas_genes = _filter_cas_genes(intervals, dict_cas_genes)
161 | clustered_cas_genes = _cluster_cas_genes(dict_filtered_cas_genes)
162 |
163 | simple_clusters = _clusters_to_simple_representation(clustered_cas_genes)
164 | dict_groups = _group_by_output(allowed_intervals, simple_clusters)
165 | #dict_groups_separated = _group_by_output_separated(allowed_intervals, clustered_cas_genes)
166 |
167 | self.cas_results = dict_groups
168 |
169 | def _calculate_strand(self):
170 | if self.flags_non_arrays_computations["flag_strand"]:
171 | st = StrandComputationNew(list_of_crisprs=self.list_of_crisprs_bona_fide,
172 | absolute_directory_path=self.absolute_directory_path)
173 | self.strand_results["Bona-fide"] = st.output()
174 | st = StrandComputationNew(list_of_crisprs=self.list_of_crisprs_alternative,
175 | absolute_directory_path=self.absolute_directory_path)
176 | self.strand_results["Alternative"] = st.output()
177 | st = StrandComputationNew(list_of_crisprs=self.list_of_crisprs_possible,
178 | absolute_directory_path=self.absolute_directory_path)
179 | self.strand_results["Possible"] = st.output()
180 |
181 |
182 | #except Exception:
183 | # st = StrandComputation(list_of_crisprs=self.list_of_crisprs_bona_fide,
184 | # absolute_directory_path=self.absolute_directory_path)
185 | # self.strand_results["Bona-fide"] = st.output()
186 | # st = StrandComputation(list_of_crisprs=self.list_of_crisprs_alternative,
187 | # absolute_directory_path=self.absolute_directory_path)
188 | # self.strand_results["Alternative"] = st.output()
189 | # st = StrandComputation(list_of_crisprs=self.list_of_crisprs_possible,
190 | # absolute_directory_path=self.absolute_directory_path)
191 | # self.strand_results["Possible"] = st.output()
192 | else:
193 | self.strand_results["Bona-fide"] = {index: "Forward (Orientation was not computed)"
194 | for index in range(len(self.list_of_crisprs_bona_fide))}
195 | self.strand_results["Alternative"] = {index: "Forward (Orientation was not computed)"
196 | for index in range(len(self.list_of_crisprs_alternative))}
197 | self.strand_results["Possible"] = {index: "Forward (Orientation was not computed)"
198 | for index in range(len(self.list_of_crisprs_possible))}
199 |
200 | def _calculate_leader(self):
201 | flss_bona_fide = FullLeaderSeqSearch(self.list_of_crisprs_bona_fide, self.strand_results["Bona-fide"], self.dna)
202 | self.leader_results_bona_fide, self.downstream_results_bona_fide = flss_bona_fide.output()
203 |
204 | flss_alternative = FullLeaderSeqSearch(self.list_of_crisprs_alternative, self.strand_results["Alternative"],
205 | self.dna)
206 | self.leader_results_alternative, self.downstream_results_alternative = flss_alternative.output()
207 |
208 | flss_possible = FullLeaderSeqSearch(self.list_of_crisprs_possible, self.strand_results["Possible"], self.dna)
209 | self.leader_results_possible, self.downstream_results_possible = flss_possible.output()
210 |
211 | def output(self):
212 | return self.data_with_all_computations
213 |
--------------------------------------------------------------------------------
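Note: a worked example for _compute_allowed_intervals above, with two CRISPR
intervals on one contig; Cas-gene clusters are then grouped into the gaps between
the arrays:

    _compute_allowed_intervals([(100, 200), (500, 600)])
    # -> [(0, 100), (200, 500), (600, inf)]
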
/components/module_output_maker.py:
--------------------------------------------------------------------------------
1 | from components.components_output_maker import SimpleOutputMaker
2 | from components.components_output_maker import SummaryOutputMaker
3 | from components.components_output_maker import SummaryMakerCSV
4 | from components.components_output_maker import PickleOutputMaker
5 | from components.components_output_maker import CasSummaryMaker
6 | from components.components_output_maker import FastaOutputArrayMaker
7 | from components.components_output_maker import JsonOutputMaker
8 |
9 | from components.components_output_maker import CompleteFastaOutputMaker
10 | from components.components_output_maker import CompleteFolderSummaryMaker
11 | from components.components_output_maker import CompleteCasSummaryFolderMaker
12 | from components.components_output_maker import SpacerSummaryMaker
13 | from components.components_output_maker import CompleteSpacerCSVMaker
14 |
15 |
16 | class OutputMaker:
17 | def __init__(self, file_path, parameters, flags, result_path, pickle_result_path,
18 | json_result_path, categories, non_array_data, list_features, header):
19 | self.file_path = file_path
20 | self.parameters = parameters
21 | self.flags = flags
22 | self.result_path = result_path
23 | self.pickle_result_path = pickle_result_path
24 | self.json_result_path = json_result_path
25 | self.categories = categories
26 | self.non_array_data = non_array_data
27 | self.list_features = list_features
28 | self.header = header
29 | self.global_result_folder = "/".join(self.result_path.split("/")[:-1])
30 |
31 | self._make_output()
32 |
33 | def _make_output(self):
34 | som = SimpleOutputMaker(categories=self.categories,
35 | result_path=self.result_path,
36 | non_array_data=self.non_array_data,
37 | list_features=self.list_features)
38 |
39 | suom = SummaryOutputMaker(result_path=self.result_path,
40 | categories=self.categories,
41 | non_array_data=self.non_array_data,
42 | header=self.header,
43 | list_feature_names=self.list_features)
44 |
45 | ssm = SpacerSummaryMaker(categories=self.categories,
46 | result_path=self.result_path)
47 |
48 | sm_csv = SummaryMakerCSV(result_path=self.result_path,
49 | categories=self.categories,
50 | non_array_data=self.non_array_data)
51 |
52 | if self.flags["flag_cas"] is True:
53 | sm_cas = CasSummaryMaker(result_path=self.result_path,
54 | non_array_data=self.non_array_data)
55 |
56 |
57 | #cfsm = CompleteFolderSummaryMaker(folder_result=self.global_result_folder)
58 | #ccfsm = CompleteCasSummaryFolderMaker(folder_result=self.global_result_folder)
59 |
60 | if self.flags["flag_fasta_report"] is True:
61 | foam = FastaOutputArrayMaker(folder_result=self.result_path,
62 | categories=self.categories,
63 | non_array_data=self.non_array_data)
64 |
65 | #cfom = CompleteFastaOutputMaker(folder_result=self.global_result_folder)
66 |
67 | if self.pickle_result_path:
68 | pom = PickleOutputMaker(file_path=self.file_path,
69 | pickle_result_folder=self.pickle_result_path,
70 | parameters=self.parameters,
71 | categories=self.categories,
72 | non_array_data=self.non_array_data,
73 | header=self.header,
74 | list_feature_names=self.list_features)
75 |
76 | if self.json_result_path:
77 | jom = JsonOutputMaker(file_path=self.file_path,
78 | json_result_folder=self.json_result_path,
79 | categories=self.categories,
80 | non_array_data=self.non_array_data,
81 |                                   list_feature_names=self.list_features)
82 |
--------------------------------------------------------------------------------
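Note: unlike the other modules, OutputMaker exposes no output() method; every
writer above runs for its side effects from _make_output, and the folder-level
summary makers remain commented out here.
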
/components/pipeline.py:
--------------------------------------------------------------------------------
1 | from components.module_detection import Detection
2 | from components.module_detection_refinement import DetectionRefinement
3 | from components.module_evaluation import ArrayEvaluation
4 | from components.module_evaluated_arrays_enhancement import EvaluatedArraysEnhancement
5 | from components.module_non_array_computations import NonArrayComputations
6 | from components.module_output_maker import OutputMaker
7 |
8 |
9 | class Pipeline:
10 | def __init__(self, result_folder_path, pickle_folder_path, json_folder_path, file_path,
11 | list_ml_classifiers, list_features, parameters, flags, flag_dev_mode, absolute_directory_path):
12 | self.result_folder_path = result_folder_path + "/" + file_path.split("/")[-1].split(".")[0]
13 | self.pickle_folder_path = pickle_folder_path
14 | self.json_folder_path = json_folder_path
15 | self.file_path = file_path
16 | self.list_ml_classifiers = list_ml_classifiers
17 | self.list_features = [features.strip().split(".") for features in list_features]
18 | self.flags = flags
19 | self.parameters = parameters
20 | self.flag_dev_mode = flag_dev_mode
21 | self.absolute_directory_path = absolute_directory_path
22 |
23 | self.header = None
24 | self.dict_fuzzy_crisprs = {}
25 | self.dict_crispr_candidates = {}
26 | self.categories = {}
27 | self.non_array_data = {}
28 |
29 | self._get_header()
30 | self._run_detection()
31 | self._run_detection_refinement()
32 | self._run_evaluation()
33 | self._results_enhancement()
34 | self._run_non_crispr_computation()
35 | self._write_output()
36 |
37 | def _get_header(self):
38 | with open(self.file_path) as f:
39 | self.header = f.readline()
40 |
41 | def _run_detection(self):
42 | print("1. Run initial array detection")
43 | detection = Detection(file_path=self.file_path,
44 | flags=self.flags,
45 | parameters=self.parameters,
46 | flag_dev_mode=self.flag_dev_mode)
47 | self.dict_fuzzy_crisprs = detection.output()
48 |
49 | def _run_detection_refinement(self):
50 | print("2. Refine detected arrays")
51 | det_ref = DetectionRefinement(dict_fuzzy_crisprs=self.dict_fuzzy_crisprs,
52 | parameters=self.parameters,
53 | flag_dev_mode=self.flag_dev_mode)
54 | self.dict_crispr_candidates = det_ref.output()
55 |
56 | def _run_evaluation(self):
57 | print("3. Evaluate candidates")
58 | ae = ArrayEvaluation(dict_crispr_array_candidates=self.dict_crispr_candidates,
59 | list_ml_classifiers=self.list_ml_classifiers,
60 | list_features=self.list_features,
61 | parameters=self.parameters,
62 | flag_dev_mode=self.flag_dev_mode)
63 | self.categories = ae.output()
64 |
65 | def _results_enhancement(self):
66 | print("4. Enhance evaluated arrays")
67 | a_enh = EvaluatedArraysEnhancement(file_path=self.file_path,
68 | categories=self.categories,
69 | parameters=self.parameters,
70 | flag_dev_mode=self.flag_dev_mode)
71 | self.categories = a_enh.output()
72 |
73 | def _run_non_crispr_computation(self):
74 | print("5. Complement arrays with additional info")
75 | nac = NonArrayComputations(file_path=self.file_path,
76 | categories=self.categories,
77 | flags_non_arrays_computations=self.flags,
78 | flag_dev_mode=self.flag_dev_mode,
79 | absolute_directory_path=self.absolute_directory_path)
80 | self.non_array_data = nac.output()
81 |
82 | def _write_output(self):
83 | print("6. Write down the results")
84 | om = OutputMaker(file_path=self.file_path,
85 | parameters=self.parameters,
86 | flags=self.flags,
87 | result_path=self.result_folder_path,
88 | pickle_result_path=self.pickle_folder_path,
89 | json_result_path=self.json_folder_path,
90 | categories=self.categories,
91 | non_array_data=self.non_array_data,
92 | list_features=self.list_features,
93 | header=self.header)
94 |
--------------------------------------------------------------------------------
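Note: Pipeline is normally instantiated once per input file by CRISPRidentify.py;
a hedged invocation sketch (every argument value below is illustrative, and each
list_features entry is a dot-separated string of feature names, matching the
`features.strip().split(".")` parsing above):

    Pipeline(result_folder_path="Results", pickle_folder_path=None, json_folder_path=None,
             file_path="TestInput/NC_014152.fa",
             list_ml_classifiers=[classifier_wrapper],
             list_features=["repeat_len.number_repeats.repeat_similarity"],
             parameters=parameters, flags=flags, flag_dev_mode=False,
             absolute_directory_path="/abs/path/to/CRISPRidentify")
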
/environment.yml:
--------------------------------------------------------------------------------
1 | name: crispr_identify_env
2 | channels:
3 | - conda-forge
4 | - bioconda
5 | - nodefaults
6 | - biobuilds
7 | - r
8 | - axfeh
9 | dependencies:
10 | - python==3.7.6
11 | - pip
12 | - python_abi=3.7
13 | - biopython=1.76
14 | - h5py=2.10.0
15 | - hdf5=1.10.6
16 | - hmmer=3.3
17 | - numpy==1.18.1
18 | - pandas=1.0.3
19 | - matplotlib=3.1.3
20 | - perl=5.26.2
21 | - perl-compress-bgzf=0.005
22 | - perl-threaded=5.26.0
23 | - perl-yaml=1.29
24 | - prodigal==2.6.3
25 | - dill=0.3.3
26 | - protobuf=3.13.0.1
27 | - regex=2019.03.09
28 | - pyasn1=0.4.8
29 | - pycparser=2.20
30 | - networkx=2.5
31 | - pyjwt=1.7.1
32 | - pyparsing=2.4.7
33 | - pyqt=5.9.2
34 | - pysocks=1.7.1
35 | - python-dateutil=2.8.1
36 | - pytz=2020.1
37 | - pyyaml=5.3.1
38 | - scikit-learn==0.22.1
39 | - scipy=1.4.1
40 | - anaconda::tensorflow==2.3.0
41 | - tensorboard==2.3.0
42 | - tensorboard-plugin-wit==1.6.0
43 | - viennarna==2.4.15
44 | - pyopenssl=22.0.0
45 | - certifi=2022.12.7
46 | - vmatch==2.3.0
47 | - clustalo==1.2.3
48 | - blast==2.5.0
49 | - keras==2.4.3
50 | - libffi=3.2.1
51 | - spacerplacer
52 | - pip:
53 | - python-Levenshtein
54 |
--------------------------------------------------------------------------------
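Note: the environment above can be created with "conda env create -f environment.yml"
and activated with "conda activate crispr_identify_env" (the name declared in the
first line).
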
/tools/CRISPRcasIdentifier/README.txt:
--------------------------------------------------------------------------------
1 | Link the CRISPRcasIdentifier folder here, so that the tool can be called via the following path:
2 |
3 | CRISPRidentify/tools/CRISPRcasIdentifier/CRISPRcasIdentifier/CRISPRcasIdentifier.py
4 |
--------------------------------------------------------------------------------
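Note: on Linux/macOS this is typically a symbolic link, e.g.
"ln -s /path/to/CRISPRcasIdentifier tools/CRISPRcasIdentifier/CRISPRcasIdentifier"
(the source path is illustrative).
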
/tools/blasting/Verified_repeats_dataset1.fa:
--------------------------------------------------------------------------------
1 | >db1_1
2 | GATAATCTCTTATAGAATTGAAAG
3 | >db1_2
4 | GTTTTTATCGTACCTATGAGGAATTGAAAC
5 | >db1_3
6 | GTTTCAGACGAACCCTTGTGGGATTGAAGC
7 | >db1_4
8 | GTTTCAGACGAACCCTTGTGGGGTTGAAGC
9 | >db1_5
10 | GTTTCAGACGAACCCTTGTGGGTTTGAAGC
11 | >db1_6
12 | GATTAATCCCAAAAGGAATTGAAAG
13 | >db1_7
14 | GTCGCGTCCTCACGGGCGCGTGGATTGAAAC
15 | >db1_8
16 | GAGTTCCCCGCGCCAGCGGGGATAAACCG
17 | >db1_9
18 | GTGTTCCCCGCGCCAGCGGGGATAAACCG
19 | >db1_10
20 | GTTCACTGCCGTGTAGGCAGCTAAGAAA
21 | >db1_11
22 | GTTCACTGCCGTACAGGCAGCTTAGAAA
23 | >db1_12
24 | GTTGAAGTGGTACTTCCAGTAAAACAAGGATTGAAAC
25 | >db1_13
26 | CTAAAAGAATAACTTGCAAAATAACAAGCATTGAAAC
27 | >db1_14
28 | CTTTCCTTCTACTAATCCCGGCGATCGGGACTGAAAC
29 | >db1_15
30 | GTTTTAGAGCTATGCTGTTTTGAATGGTCCCAAAAC
31 | >db1_16
32 | GTTGTAGCTCCCTTTCTCATTTCGCAGTGCTACAAT
33 | >db1_17
34 | GTTTTAGTCCCTTTTTAAATTTCTTTATGGTAAAAT
35 | >db1_18
36 | GTTCCAATAAGACTAAAATAGAATTGAAAG
37 | >db1_19
38 | GATCGATACCCACCCCGAAGAAAAGGGGACGAGAAC
39 | >db1_20
40 | GTTCAACACCCTCTTTTCCCCGTCAGGGGACTGAAAC
41 | >db1_21
42 | GTCTCCACTCGTAGGAGAAATTAATTGATTGGAAAC
43 | >db1_22
44 | GAACAACTCAAAAGAGAATTGCAAG
45 | >db1_23
46 | ATTAAAATCAGACCGTTTCGGAATGGAAAT
47 | >db1_24
48 | GTTTTATATTAACTAAGTGGTATGTAAAG
49 | >db1_25
50 | GAATCTCAAAAAGAGGATTGAAAG
51 | >db1_26
52 | GTGGAAATCAAAAGATAGTAGAAAC
53 | >db1_27
54 | GGTTTTAGTACTCTGTAATTTTAG
55 |
--------------------------------------------------------------------------------
/tools/blasting/Verified_repeats_dataset1.fa.nhr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/blasting/Verified_repeats_dataset1.fa.nhr
--------------------------------------------------------------------------------
/tools/blasting/Verified_repeats_dataset1.fa.nin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/blasting/Verified_repeats_dataset1.fa.nin
--------------------------------------------------------------------------------
/tools/blasting/Verified_repeats_dataset1.fa.nog:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/blasting/Verified_repeats_dataset1.fa.nog
--------------------------------------------------------------------------------
/tools/blasting/Verified_repeats_dataset1.fa.nsd:
--------------------------------------------------------------------------------
1 | db1_10
2 | db1_109
3 | db1_1110
4 | db1_1211
5 | db1_1312
6 | db1_1413
7 | db1_1514
8 | db1_1615
9 | db1_1716
10 | db1_1817
11 | db1_1918
12 | db1_21
13 | db1_2019
14 | db1_2120
15 | db1_2221
16 | db1_2322
17 | db1_2423
18 | db1_2524
19 | db1_2625
20 | db1_2726
21 | db1_32
22 | db1_43
23 | db1_54
24 | db1_65
25 | db1_76
26 | db1_87
27 | db1_98
28 | lcl|db1_10
29 | lcl|db1_109
30 | lcl|db1_1110
31 | lcl|db1_1211
32 | lcl|db1_1312
33 | lcl|db1_1413
34 | lcl|db1_1514
35 | lcl|db1_1615
36 | lcl|db1_1716
37 | lcl|db1_1817
38 | lcl|db1_1918
39 | lcl|db1_21
40 | lcl|db1_2019
41 | lcl|db1_2120
42 | lcl|db1_2221
43 | lcl|db1_2322
44 | lcl|db1_2423
45 | lcl|db1_2524
46 | lcl|db1_2625
47 | lcl|db1_2726
48 | lcl|db1_32
49 | lcl|db1_43
50 | lcl|db1_54
51 | lcl|db1_65
52 | lcl|db1_76
53 | lcl|db1_87
54 | lcl|db1_98
55 |
--------------------------------------------------------------------------------
/tools/blasting/Verified_repeats_dataset1.fa.nsi:
--------------------------------------------------------------------------------
1 | b 6 @ b 4