├── .gitignore ├── CRISPRidentify.py ├── LICENSE ├── README.md ├── TestFolderMultiline ├── MultilineFasta.fasta └── MultilineFasta_1.fasta ├── TestInput ├── NC_006513.fa ├── NC_013216.fa ├── NC_014152.fa ├── NC_016625.fa ├── NC_017040.1.fasta ├── NC_018524.fa └── NC_019693.fa ├── TestInputMultiline └── MultilineFasta.fasta ├── components ├── __init__.py ├── components_detection.py ├── components_detection_refinement.py ├── components_eden.py ├── components_evaluated_arrays_enhancement.py ├── components_evaluation.py ├── components_helpers.py ├── components_ml.py ├── components_non_array_computations.py ├── components_output_maker.py ├── module_detection.py ├── module_detection_refinement.py ├── module_evaluated_arrays_enhancement.py ├── module_evaluation.py ├── module_non_array_computations.py ├── module_output_maker.py └── pipeline.py ├── environment.yml ├── tools ├── CRISPRcasIdentifier │ └── README.txt ├── blasting │ ├── Verified_repeats_dataset1.fa │ ├── Verified_repeats_dataset1.fa.nhr │ ├── Verified_repeats_dataset1.fa.nin │ ├── Verified_repeats_dataset1.fa.nog │ ├── Verified_repeats_dataset1.fa.nsd │ ├── Verified_repeats_dataset1.fa.nsi │ ├── Verified_repeats_dataset1.fa.nsq │ ├── Verified_repeats_dataset2.fa │ ├── Verified_repeats_dataset2.fa.nhr │ ├── Verified_repeats_dataset2.fa.nin │ ├── Verified_repeats_dataset2.fa.nog │ ├── Verified_repeats_dataset2.fa.nsd │ ├── Verified_repeats_dataset2.fa.nsi │ └── Verified_repeats_dataset2.fa.nsq └── strand_prediction │ └── CRISPRstrand │ ├── CRISPRstrand.py │ ├── CRISPRstrand.yml │ ├── Example │ ├── Input.fa │ ├── Input.txt │ ├── Input3.fa │ ├── Input4.txt │ └── Input5.fa │ ├── Models │ └── model_r.h5 │ ├── Results │ └── CRISPRstrand_Summary.tsv │ ├── cmd.txt │ ├── convNets.py │ ├── evaluate.py │ ├── execute_strand.py │ ├── preprocessing.py │ └── utils.py └── trained_models ├── eden ├── eden_ab_vs_n ├── eden_archaea ├── eden_bacteria ├── eden_merged └── eden_merged_with_neg └── extra_trees ├── extra_trees_ab_vs_n.pkl ├── extra_trees_archaea.pkl ├── extra_trees_bacteria.pkl ├── extra_trees_merged.pkl ├── extra_trees_merged_with_neg.pkl ├── extra_trees_subset.pkl ├── extra_trees_subset10features.pkl ├── extra_trees_subset8features.pkl └── extra_trees_subset9features.pkl /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .idea 3 | /tools/CRISPRcasIdentifier/CRISPRcasIdentifier/ 4 | -------------------------------------------------------------------------------- /CRISPRidentify.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import math 3 | import shutil 4 | import warnings 5 | import os 6 | 7 | from pathlib import Path 8 | from os import listdir 9 | from os.path import isfile, join 10 | from time import time 11 | 12 | from components.pipeline import Pipeline 13 | from components.components_ml import ClassifierWrapper 14 | from components.components_output_maker import CompleteFastaOutputMaker 15 | from components.components_output_maker import CompleteFolderSummaryMaker 16 | from components.components_output_maker import CompleteCasSummaryFolderMaker 17 | from components.components_output_maker import CompleteJsonOutputMaker 18 | from components.components_output_maker import CompleteSpacerCSVMaker 19 | from components.components_helpers import multiline_fasta_check, multiline_fasta_handle, multiline_fasta_handle_python 20 | from components.components_helpers import folder_of_multifasta_handle 21 | 22 | warnings.filterwarnings("ignore") 23 | warnings.simplefilter(action='ignore', category=FutureWarning) 24 | 25 | FLAG_DEVELOPER_MODE = False 26 | 27 | parser = argparse.ArgumentParser(description='Run Identifier') 28 | parser.add_argument('--input_folder', type=str, default=None, 29 | help='input folder (default: None)') 30 | 31 | parser.add_argument('--file', type=str, default=None, 32 | help='input file (default: None)') 33 | 34 | parser.add_argument('--input_folder_multifasta', type=str, default=None, 35 | help='input folder of multifasta (default: None)') 36 | 37 | parser.add_argument('--model', type=str, default="ALL", 38 | help='model_to_use (default: ALL)') 39 | 40 | parser.add_argument('--additional_model', type=str, default=None, 41 | help='model_to_use (default: None)') 42 | 43 | parser.add_argument('--result_folder', type=str, default="Results", 44 | help='folder with the result (default: Results)') 45 | 46 | parser.add_argument('--pickle_report', type=str, default='', 47 | help='pickled report file (default: None)') 48 | 49 | parser.add_argument('--json_report', type=str, default='', 50 | help='json report file (default: None)') 51 | 52 | parser.add_argument('--fasta_report', type=str, default=False, 53 | help='fasta report file (default: False)') 54 | 55 | parser.add_argument('--strand', type=str, default=True, 56 | help='CRISPR array orientation prediction (default: True)') 57 | 58 | parser.add_argument('--cas', type=str, default=False, 59 | help='cas genes computation (default: False)') 60 | 61 | parser.add_argument('--is_element', type=str, default=True, 62 | help='is element computation (default: True)') 63 | 64 | parser.add_argument('--parallel', type=str, default=True, 65 | help='parallel computations (default: True)') 66 | 67 | parser.add_argument('--cpu', type=str, default="ALL", 68 | help='parallel computations (default: ALL)') 69 | 70 | parser.add_argument('--fast_run', type=str, default=False, 71 | help='fast run option (default: False)') 72 | 73 | parser.add_argument('--degenerated', type=bool, default=True, 74 | help='degenerated_repeat_computation (default: True)') 75 | 76 | parser.add_argument('--min_len_rep', type=int, default=21, 77 | help='min avg. length of the repeats (default: 21)') 78 | 79 | parser.add_argument('--max_len_rep', type=int, default=55, 80 | help='max avg. length of the repeats (default: 55)') 81 | 82 | parser.add_argument('--min_len_spacer', type=int, default=18, 83 | help='min avg. length of spacers (default: 18)') 84 | 85 | parser.add_argument('--max_len_spacer', type=int, default=78, 86 | help='max avg. length of spacers (default: 78)') 87 | 88 | parser.add_argument('--min_repeats', type=int, default=3, 89 | help='min number of repeats (default: 3)') 90 | 91 | parser.add_argument('--enhancement_max_min', type=bool, default=True, 92 | help='enhancement with filter (default: True)') 93 | 94 | parser.add_argument('--enhancement_start_end', type=bool, default=True, 95 | help='enhancement with start end omitting (default: True)') 96 | 97 | parser.add_argument('--max_identical_spacers', type=int, default=4, 98 | help='maximum number of identical spacers in the array (default: 4)') 99 | 100 | parser.add_argument('--max_identical_cluster_spacers', type=int, default=3, 101 | help='maximum number of consecutive identical spacers in the array (default: 3)') 102 | 103 | parser.add_argument('--margin_degenerated', type=int, default=30, 104 | help='maximum length of the spacer margin for the degenerated search (default: 30)', ) 105 | 106 | parser.add_argument('--max_edit_distance_enhanced', type=int, default=6, 107 | help='maximum edit distance for the evaluated array enhancement (default: 6)') 108 | 109 | 110 | script_absolute_path = os.path.dirname(os.path.abspath(__file__)) 111 | work_directory = os.getcwd() 112 | pid = os.getpid() 113 | 114 | args = parser.parse_args() 115 | 116 | complete_path_folder = (args.input_folder) 117 | if complete_path_folder: 118 | complete_path_folder = Path(complete_path_folder).absolute() 119 | 120 | complete_path_file = args.file 121 | if complete_path_file: 122 | complete_path_file = Path(complete_path_file).absolute() 123 | 124 | complete_folder_multifasta = args.input_folder_multifasta 125 | if complete_folder_multifasta: 126 | complete_folder_multifasta = Path(complete_folder_multifasta).absolute() 127 | 128 | folder_result = args.result_folder 129 | if folder_result: 130 | folder_result = Path(folder_result).absolute() 131 | 132 | pickle_folder = args.pickle_report 133 | if pickle_folder: 134 | pickle_folder = Path(pickle_folder).absolute() 135 | 136 | json_folder = args.json_report 137 | if json_folder: 138 | json_folder = Path(json_folder).absolute() 139 | 140 | list_models = ["8", "9", "10"] if args.model == "ALL" else [args.model] 141 | flag_possible_differentiate_model = args.additional_model 142 | if flag_possible_differentiate_model not in ["possible", "all"]: 143 | flag_possible_differentiate_model = None 144 | 145 | 146 | flag_enhancement_max_min = args.enhancement_max_min 147 | flag_enhancement_start_end = args.enhancement_start_end 148 | 149 | flag_parallel = False if (args.parallel in ["False", False]) else True 150 | flag_cpu = args.cpu 151 | flag_fast_run = False if (args.fast_run in ["False", False]) else True 152 | 153 | strand_flag = False if (args.strand in ["False", False]) else True 154 | cas_flag = False if (args.cas in ["False", False]) else True 155 | is_flag = False if (args.is_element in ["False", False]) else True 156 | degenerated_flag = False if (args.degenerated in ["False", False]) else True 157 | fasta_report = False if (args.fasta_report in ["False", False]) else True 158 | 159 | flags = {"flag_parallel": flag_parallel, 160 | "flag_cpu": flag_cpu, 161 | "flag_fast_run": flag_fast_run, 162 | "flag_strand": strand_flag, 163 | "flag_cas": cas_flag, 164 | "flag_is": is_flag, 165 | "flag_fasta_report": fasta_report, 166 | "flag_degenerated": degenerated_flag, 167 | "flag_enhancement_min_max": flag_enhancement_max_min, 168 | "flag_enhancement_start_end": flag_enhancement_start_end 169 | } 170 | 171 | min_rep = args.min_len_rep 172 | max_rep = args.max_len_rep 173 | max_spacer = args.max_len_spacer 174 | min_spacer = args.min_len_spacer 175 | min_repeats = args.min_repeats 176 | max_identical_spacers = args.max_identical_spacers 177 | max_identical_cluster_spacers = args.max_identical_cluster_spacers 178 | margin_degenerated = args.margin_degenerated 179 | max_edit_distance_enhancement = args.max_edit_distance_enhanced 180 | 181 | parameters = { 182 | "param_min_avg_repeat_length": min_rep, 183 | "param_max_avg_repeat_length": max_rep, 184 | "param_max_avg_spacer_length": max_spacer, 185 | "param_min_avg_spacer_length": min_spacer, 186 | "param_min_repeats": min_repeats, 187 | "param_max_identical_spacers": max_identical_spacers, 188 | "param_max_identical_cluster_spacers": max_identical_cluster_spacers, 189 | "param_spacer_margin_degenerated_search": margin_degenerated, 190 | "param_max_edit_distance": max_edit_distance_enhancement 191 | } 192 | 193 | 194 | ALL_FEATURES = ['repeat_len', 'number_repeats', 'repeat_similarity', 195 | 'at_richness', 'avg_spacer_len', 'spacer_similarity', 196 | 'number_mismatches', 'spacer_evenness', 'mfe_score', 197 | 'orf_score', 'hmmr_score', 'blast_score_1', 'blast_score_2', 198 | 'eden_score'] 199 | 200 | best_combinations = { 201 | "8": (2, 4, 5, 6, 7, 8, 9, 11), 202 | "9": (1, 2, 4, 5, 7, 8, 9, 10, 12), 203 | "10": (0, 2, 3, 4, 5, 6, 7, 10, 11, 12) 204 | } 205 | 206 | 207 | pid_work_directory = os.path.join(work_directory, 'Identify_Temp' + str(pid)) 208 | if not os.path.exists(pid_work_directory): 209 | os.makedirs(pid_work_directory) 210 | os.chdir(pid_work_directory) 211 | 212 | 213 | feature_list = ['.'.join([ALL_FEATURES[i] for i in best_combinations[model]]) for model in list_models] 214 | list_ml_classifiers = [ClassifierWrapper(classifier_type=None, 215 | load_option=script_absolute_path + "/trained_models/extra_trees/extra_trees_subset{}features.pkl". 216 | format(model)) 217 | for model in list_models] 218 | 219 | 220 | def run_over_folder_of_files(folder, result_folder, pickle_folder, chunk_number=None, number_of_chunks=None): 221 | files = [f for f in listdir(folder) if isfile(join(folder, f))] 222 | files_name_fix = [f.replace("\r", "").replace("\t", "").replace("\n", "") for f in files] 223 | for old_name, new_name in zip(files, files_name_fix): 224 | old_path = join(folder, old_name) 225 | new_path = join(folder, new_name) 226 | if old_path != new_path: 227 | os.system(f"mv {old_path} {new_path}") 228 | files = sorted(files_name_fix) 229 | 230 | if number_of_chunks: 231 | chunk_size = math.ceil(len(files) / number_of_chunks) 232 | chunk_start = (chunk_number - 1) * chunk_size 233 | chunk_end = chunk_number * chunk_size 234 | chunk = files[chunk_start:chunk_end] 235 | print(chunk_start) 236 | print(chunk_end) 237 | else: 238 | chunk = files 239 | 240 | for index, file in enumerate(chunk, 1): 241 | print("\n\n\n\t\t\t\tExecuting file {} out of {} ({})\n\n\n".format(index, len(chunk), file)) 242 | pl = Pipeline(result_folder_path="{}/".format(result_folder), 243 | pickle_folder_path="{}".format(pickle_folder), 244 | json_folder_path="{}".format(json_folder), 245 | file_path=join(folder, file), 246 | list_ml_classifiers=list_ml_classifiers, 247 | list_features=feature_list, 248 | parameters=parameters, 249 | flags=flags, 250 | flag_dev_mode=FLAG_DEVELOPER_MODE, 251 | absolute_directory_path=script_absolute_path) 252 | 253 | cfsm = CompleteFolderSummaryMaker(folder_result=result_folder) 254 | ccfsm = CompleteCasSummaryFolderMaker(folder_result=result_folder) 255 | cfom = CompleteFastaOutputMaker(folder_result=result_folder) 256 | if cas_flag: 257 | cs_csv = CompleteSpacerCSVMaker(folder_result=result_folder) 258 | if json_folder: 259 | cjsm = CompleteJsonOutputMaker(folder_json_result=json_folder, folder_text_tesult=result_folder) 260 | 261 | 262 | def run_over_one_file(file, result_folder, pickle_folder, json_folder): 263 | print("\n\n\n\t\t\t\tExecuting file {}\n\n\n".format(file)) 264 | pl = Pipeline(result_folder_path="{}/".format(result_folder), 265 | pickle_folder_path="{}".format(pickle_folder), 266 | json_folder_path="{}".format(json_folder), 267 | file_path=join(file), 268 | list_ml_classifiers=list_ml_classifiers, 269 | list_features=feature_list, 270 | parameters=parameters, 271 | flags=flags, 272 | flag_dev_mode=FLAG_DEVELOPER_MODE, 273 | absolute_directory_path=script_absolute_path) 274 | 275 | cfsm = CompleteFolderSummaryMaker(folder_result=result_folder) 276 | ccfsm = CompleteCasSummaryFolderMaker(folder_result=result_folder) 277 | cfom = CompleteFastaOutputMaker(folder_result=result_folder) 278 | if cas_flag: 279 | cs_csv = CompleteSpacerCSVMaker(folder_result=result_folder) 280 | if json_folder: 281 | cjsm = CompleteJsonOutputMaker(folder_json_result=json_folder, folder_text_tesult=result_folder) 282 | 283 | 284 | 285 | def main(): 286 | start_time = time() 287 | if complete_path_file: 288 | folder_multifasta = multiline_fasta_handle_python(complete_path_file, flag_ncbi_formatting=True) 289 | print(folder_multifasta) 290 | run_over_folder_of_files(folder_multifasta, folder_result, pickle_folder, json_folder) 291 | shutil.rmtree(folder_multifasta) 292 | elif complete_path_folder: 293 | run_over_folder_of_files(complete_path_folder, folder_result, pickle_folder, json_folder) 294 | elif complete_folder_multifasta: 295 | print("Folder Multifasta") 296 | folder_multifasta = folder_of_multifasta_handle(complete_folder_multifasta) 297 | run_over_folder_of_files(folder_multifasta, folder_result, pickle_folder, json_folder) 298 | else: 299 | print("No input was provided") 300 | 301 | end_time = time() 302 | print("Elapsed time: ", end_time-start_time) 303 | 304 | 305 | if __name__ == "__main__": 306 | main() 307 | shutil.rmtree(pid_work_directory, ignore_errors=True) 308 | 309 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Bioinformatics Lab - Department of Computer Science - University Freiburg 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # CRISPRidentify: Identification of CRISPR arrays using machine learning approach 3 | 4 | CRISPRidentify is a tool to search for CRISPR arrays which utilises 5 | machine learning approach for distinguishing false candidates from true CRISPRS. 6 | CRISPRidentify, performs three steps: detection, feature extraction and 7 | classification based on manually curated sets of positive and negative examples of CRISPR arrays. 8 | The identified CRISPR arrays are then reported to the user accompanied by detailed annotation. 9 | We demonstrate that our approach identifies not only previously detected CRISPR arrays, 10 | but also CRISPR array candidates not detected by other tools. Compared to other methods, 11 | our tool has a drastically reduced false positive rate. In contrast to the existing tools, CRISPRidentify 12 | approach not only provides the user with the basic statistics on the identified CRISPR arrays 13 | but also produces a certainty score as an intuitive measure of the likelihood that a given 14 | genomic region is a CRISPR array. 15 | 16 | ## Getting Started 17 | 18 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. 19 | 20 | ### Prerequisites 21 | 22 | First you need to install Miniconda 23 | Then create an environment and install the required libraries in it 24 | 25 | 26 | ### Creating a Miniconda environment 27 | 28 | First we install Miniconda for python 3. 29 | Miniconda can be downloaded from here: 30 | 31 | https://docs.conda.io/en/latest/miniconda.html 32 | 33 | Then Miniconda should be installed. On a linux machine the command is similar to this one: 34 | 35 | ``` 36 | bash Miniconda3-latest-Linux-x86_64.sh 37 | ``` 38 | 39 | Then we create an environment. The necessary setup is provided in the "environment.yml" file. 40 | 41 | In order to install the corresponding environment one can execute the following command. 42 | 43 | ``` 44 | conda env create -f environment.yml 45 | ``` 46 | 47 | We recommend to install mamba package manager which is a faster alternative to conda. 48 | 49 | ``` 50 | conda install -c conda-forge mamba 51 | ``` 52 | 53 | Then we can create the environment using mamba. 54 | ``` 55 | mamba env create -f environment.yml 56 | ``` 57 | 58 | We want to acknowledge Richard Stöckl @richardstoeckl for his contribution to the environment.yml file. 59 | 60 | 61 | ### Additional preparations 62 | 63 | CRISPRidentify utilizes CRISPRcasIdentifier for the detection of the cas genes. 64 | If you are interested in cas gene result please install CRISPRcasIdentifier. 65 | 66 | Please make sure that after you downloaded CRISPRcasIdentifier its relative path is: 67 | 68 | ``` 69 | tools/CRISPRcasIdentifier/CRISPRcasIdentifier/CRISPRcasIdentifier.py 70 | ``` 71 | 72 | You can find the CRISPRcasIdentifier tool and its description [here](https://github.com/BackofenLab/CRISPRcasIdentifier) 73 | 74 | You need to make two steps: 75 | 76 | Firstly, you need to download the CRISPRcasIdentifier tool: 77 | ``` 78 | wget https://github.com/BackofenLab/CRISPRcasIdentifier/archive/v1.1.0.tar.gz 79 | tar -xzf v1.1.0.tar.gz 80 | ``` 81 | Secondly, you need to download the models: 82 | 83 | Due to GitHub's file size constraints, authors made their HMM and ML models available in Google Drive. You can download them [here](https://drive.google.com/file/d/1YbTxkn9KuJP2D7U1-6kL1Yimu_4RqSl1/view?usp=sharing) and [here](https://drive.google.com/file/d/1Nc5o6QVB6QxMxpQjmLQcbwQwkRLk-thM/view?usp=sharing). Save both tar.gz files inside CRISPRcasIdentifier's directory. 84 | 85 | 86 | ### Activation of the environment 87 | 88 | Before running CRISPRidentify one need to activate the corresponding environment. 89 | 90 | ``` 91 | conda activate crispr_identify_env 92 | ``` 93 | 94 | ## Running CRISPRidentify 95 | 96 | We prepared the test folder which can be used for the test run. 97 | 98 | Example of running CRISPRidentify over a folder of files: 99 | 100 | ``` 101 | python CRISPRidentify.py --input_folder TestInput 102 | ``` 103 | 104 | Example of running CRISPRidentify over a single multiline fasta input: 105 | ``` 106 | python CRISPRidentify.py --file TestInputMultiline/MultilineFasta.fasta 107 | ``` 108 | 109 | ### Flags 110 | 111 | You can see the help by using the `-h` option 112 | 113 | ``` 114 | 115 | python CRISPRidentify.py -h 116 | 117 | ``` 118 | 119 | #### Mandatory flags 120 | The only mandatory parameter which has to be specified is the input. 121 | Our approach has two options to handle the input. User has to specify either the path to the folder with the input fasta files 122 | or the full path to a single fasta input file. 123 | 124 | ##### Input as a folder of fasta files 125 | 126 | * `--input_folder ` 127 | 128 | Specifies the mode where a folder with fasta files which will be used as the input for CRISPRidentify. The CRISPR array search will be 129 | then conducted separately for each file in the corresponding input folder. 130 | 131 | ``` 132 | python CRISPRidentify.py --input_folder TestInput 133 | ``` 134 | 135 | ##### Input as a single file 136 | 137 | * `--file ` 138 | 139 | Specifies the mode where a singe file is used as the input for the algorithm. The file might contain a single entry or multiple entries. 140 | The CRISPR array search will be done for each entry independently. 141 | 142 | For example: 143 | 144 | ``` 145 | python CRISPRidentify.py --file InputFile 146 | ``` 147 | ##### Input as a folder of multiline fasta files 148 | 149 | * `-- input_folder_multifasta ` 150 | 151 | Specifies the mode where a folder with fasta files which will be used as the input for CRISPRidentify. The CRISPR array search will be 152 | then conducted separately for each file in the corresponding input folder. The difference between this mode and the previous one is that 153 | in this mode the input files can contain multiple entries. 154 | 155 | For example: 156 | 157 | ``` 158 | python CRISPRidentify.py --input_folder_multifasta TestFolderMultiline 159 | ``` 160 | 161 | #### Optional flags 162 | 163 | ##### Output 164 | 165 | * `--result_folder [paht_to_the_result_folder]` 166 | 167 | Specifies the path and name of the folder with the output results. If not specified the results will appear in "Results" folder 168 | 169 | 170 | For example: 171 | 172 | ``` 173 | python CRISPRidentify.py --input_folder TestInput --result_folder Results 174 | ``` 175 | 176 | * `--pickle_report [folder_to_put_pickle_results]` 177 | 178 | Specifies if found CRISPR arrays should be stored also as python objects. Turned off by default. 179 | 180 | 181 | For example: 182 | 183 | ``` 184 | python CRISPRidentify.py --input_folder TestInput --pickle_report PickleReportFolder 185 | ``` 186 | 187 | 188 | ##### Classification parameters 189 | 190 | * `--model [8/9/10/ALL]` 191 | 192 | 193 | Takes values: 8, 9, 10, ALL and specifies the classification model. The default value is `ALL`. 194 | If the value `ALL` is picked for the flag the certainty score will be calculated as average between all three available models. 195 | 196 | 197 | For example: 198 | 199 | ``` 200 | python CRISPRidentify.py --input_folder TestInput --model 8 201 | ``` 202 | 203 | 204 | ``` 205 | python CRISPRidentify.py --input_folder TestInput --model ALL 206 | ``` 207 | 208 | 209 | ##### Performance speed 210 | * `--fast_run [True/False]` 211 | 212 | Specifies if the repeat set enhancement step should be skipped which drastically speeds up the process but might decrease the recall quality. 213 | Only matching pairs found with Vmatch will be used as repeat candidates. Automatically turns off filter approximation and start_end approximation (see enhancement_max_min and enhancement_start_end) 214 | Turned off by default. 215 | 216 | For example: 217 | 218 | ``` 219 | python CRISPRidentify.py --input_folder TestInput --fast_run True 220 | ``` 221 | 222 | * `--enhancement_max_min [True/False]` 223 | 224 | Specifies if the filter approximation based on the max. and min. elements should be built 225 | The default value is True 226 | 227 | * `--enhancement_start_end [True/False]` 228 | 229 | Specifies if the start/end omitting of the repeat candidates should be done to enrich the candidate set. 230 | The default value is True 231 | 232 | 233 | For example: 234 | 235 | ``` 236 | python CRISPRidentify.py --input_folder TestInput --enhancement_max_min True --enhancement_start_end False 237 | ``` 238 | 239 | ##### Candidate filtering criteria 240 | 241 | 242 | * `--min_len_rep [integer]` 243 | 244 | Specifies the minimum length of repeats in a CRISPR array. The default value: 21 245 | 246 | * `--max_len_rep [integer]` 247 | 248 | Specifies the maximum length of repeats in a CRISPR array. The default value: 55 249 | 250 | * `--min_len_spacer [integer]` 251 | 252 | Specifies the minimum average length of spacers in a CRISPR array. The default value: 18 253 | 254 | * `--max_len_spacer [integer]` 255 | 256 | Specifies the maximum average length of spacers in a CRISPR array. The default value: 78 257 | 258 | * `--min_repeats [integer]` 259 | 260 | Specifies the minimum number of repeats in a CRISPR array. The default value: 3 261 | 262 | 263 | For example: 264 | 265 | ``` 266 | python CRISPRidentify.py --input_folder TestInput --min_len_rep 25 --max_len_rep 50 --min_repeats 2 267 | ``` 268 | 269 | #####Candidate Enhancement 270 | 271 | * `--degenerated' [True/False]` 272 | 273 | Allows search for degenerated repeat candidates on both ends of the CRISPR array candidate. The default value: True 274 | 275 | * `--margin_degenerated [int]` 276 | 277 | Specifies the maximum length difference between a new spacer sequence (obtained with the search of degenerated repeats) and the average value of spacer length in the array. The default value: 30 278 | 279 | * `--max_edit_distance_enhanced [int]` 280 | 281 | Specifies the number of editing operations for candidate enhancement. The default value: 6 282 | 283 | 284 | ##### Additional computations 285 | 286 | * `--strand[True/False]` 287 | 288 | Specifies if the array orientation should be predicted. The default value is True. 289 | 290 | * `--cas [True/False]` 291 | 292 | Specifies if cas genes should be predicted. The default value is False. 293 | 294 | * `--is_element [True/False]` 295 | 296 | Specifies if IS-Elements should be predicted. The default value is False. 297 | 298 | 299 | ``` 300 | python CRISPRidentify.py --input_folder TestInput --cas True --is_element True 301 | ``` 302 | 303 | ## Output files 304 | 305 | The output folder for each input entries consist of the following files: 306 | 307 | * Bona-Fide_Candidates. The file will contain the representation of the found CRISPR arrays complemented with the support information. 308 | For each candidate the output will contain the values for extracted features as well as the certainty score of the used classifier. 309 | On top of that in the support information you can find the orientation for each array, leader and downstream regions, cas genes and IS-elements (if the corresponding flags were selected). 310 | 311 | * Alternative_Candidates. In this file we demonstrate alternative representations of bona-fide arrays. These alternative representations also got a high score from the classifier but this score was lower than the corresponding score of the bona-fide representation. 312 | Alternative representation of a CRISPR array usually corresponds to a slightly longer/shorter repeat sequence but represents the same genomic region. 313 | 314 | The candidates with the certainty scores between 0.4 and 0.75 are stored in Possible_Candidates and Possible_Discarded_Candidates 315 | 316 | * Possible_Candidates. In this file the algorithm stores the candidate with the highest certainty score. 317 | 318 | * Possible_Discarded. Here are collected all the other representations 319 | 320 | 321 | The algorithm also demonstrates CRISPR-looking structures which obtained certainty score lower than 0.4 from the classifier. 322 | 323 | * Low_score_candidates. The user can find these structures in this file. 324 | 325 | 326 | On top of that the algorithm builds a csv summary. 327 | 328 | * Summary.csv 329 | 330 | Following information can be found in the summary: 331 | 332 | 1. Array index 333 | 2. Array start 334 | 3. Array end 335 | 4. Array length 336 | 5. Consensus repeat 337 | 6. Repeat length 338 | 7. Average length of the spacers 339 | 8. Number of spacers 340 | 9. Array orientation 341 | 10. Array category 342 | 343 | ## Metagenomic analysis 344 | 345 | CRISPRidentify is suitable for easy and powerful metagenomic analysis 346 | When `--file` or `--input_folder` flag is used the pipeline with automatically generate two complete summaries 347 | : 348 | 349 | 1. For all the identified arrays 350 | 2. For all labeled Cas genes 351 | 352 | 353 | On top of that the user might use the flag: 354 | 355 | `--fasta_report True` 356 | 357 | This option with create three fasta files: 358 | 1. All the array sequences with their origins in the header 359 | 2. All the repeat sequences with their origins and locations in the arrays 360 | 3. All the spacer sequences with their origins and locations in the arrays 361 | 362 | ## Improving CRISPRidentify 363 | 364 | We are constantly working on the improvements of CRISPRidentify. If you found a bug or incorrect/missing CRISPR array representation please submit via github issue interface. 365 | 366 | 367 | 368 | 369 | -------------------------------------------------------------------------------- /TestFolderMultiline/MultilineFasta_1.fasta: -------------------------------------------------------------------------------- 1 | >gi|56475432|ref|NC_006513.1 Damage1| Aromatoleum aromaticum EbN1 chromosome 2 | ATCACGCCCTCCCATCCCGCCGATCCACCGCCCGACGATCCTTCCGCCGCTGCTCGCTCCGCACGCGCCG 3 | GTCACCATCGCGACGTTCCACCGCGCGGCGGTCGTCATGGACGGGCTCGACATAGGTTCGCTCGGCCACA 4 | TCAACCGGAGTTGGCACGAAGGCGGGGAGCGTGGTTTCGCTTTCCACGTCGCCAGTCCCGCGCAGATACC 5 | CCCAATCCACGTCGGGGCGGAGTTCTTCGCAGCGGACGGCGCCGGCTGTGGCGCGCTCGATGGCCGGGCA 6 | GCGCTCGGCGGGGATCGGTCGAACGCCAGTAACCCACTGGCTGACCGCAGCAGGCGTCACGCCGAGAGCA 7 | CGGGCCAAGGTTGCTTGGCCGCCTACGGATTCGCACGCCAAAAGGATCGGGTTTCGGTTCATGCCACGAC 8 | TATAGCACCGCTACAGATTACATATCAAGCCATGCTATTCATTCCAATAGATAGCATTGCTTCATCATGC 9 | TGATATGGTCACCGCACGAAGAAGCAGAACGTCTTAAGGCCCGTTTTGGAGCGGTCCCCAACCGGGAGAA 10 | GTTCGCTCGAGAAATTGGACTTCACGGCGGCGGATCAATGATCTACCAGCACATAAAGGGGATTCGCCCG 11 | ATCAGCCGCGAAGCGGCGGTTGCGTATGCGAAAGGCTTCAATTGCAGGCTCGAAGAAATCAGCCCGCGAA 12 | TCGCCCTTGAGATACAGCAGGCCACTTCTGTCTTGTCGCCAACGCCAGACCGTCCGCCCGAGTCGCCGAA 13 | CATCTGCGCCGGACCGGACCGCAAAGGCAAGGTGCCGCTAATCTCGTGGGTGCGCGCAGGTGAGTTCGCT 14 | CATGCCGCTGATCTTTTGCCGGTCGGCGAGGCCTATGAATGGGTGGAGACCGGCGTGAACGTGCAGCCCC 15 | ACACTTTTGCGCTGCGCGTCCAGGGCGACTCGATGGAGCCGGAATTCGTCGCTGGCACGATCATCGTGAT 16 | CGAGCCGCACATGGTCGCTGAGCCCGGCGACTACGTCATCGCCCGCAACGGCGACAACGAGGCCACTTTC 17 | AAGCAGCTCGTGCGCGACGGGGCGGACCTGTACCTCAAACCGCTGAACCCCCGCTACCCGATCAAGCCGC 18 | TGGGCGCCACGGCGATCATCGGCGTGGTTCGAGAGGCCGTGAAGCGCTATCGGTGAGCGGGTGTTTTCCA 19 | GGCCATCACCCCTTGCCCGCAATCTGTAACAGCCTCCCAACAACAACAGACTCATGCTATTTTTCAAGGG 20 | CTTGAACGGCCCGAAATTTGTCAAGTCCATCGTCGAAGGGGTCGGATTGTGGCTCGGTATCGTCAGCGGC 21 | TTGGCGTGGCTGTGGTCCGAATTGGCGATCGTGAAGGTCGAACTGACATGGGCCGTGACAGTCACGACCG 22 | GATTTTTCGTGTTCTACGTCGGGGTCCTGCTTTGCTTCACGCGGCAGGGGGTTCTGCAAACGCGCATCGA 23 | CGAATGCGCCCAGGCCAAGAAAGCGCTGGAAGAAGAAGTGCTGCGCAAGCGGCTGTCGTCCAGGAAAAAA 24 | GGCCGGTGAGGCTGATGGGGAGAAGTGGAAAATGAGCGTGATTGCCATTCACGCGGCGATCGTCGGCGTG 25 | ACGGTCGTCGTCGCATACGTCCTGCACATGCGCACGATGCGCATGAAAGCCTGTTTCAACCTCTTCCGCG 26 | TGCGCGACCGCTTCGTCCTGCTCGTCGCCAAAGACATCCTGCCCGAAGACAGCAGGGTGTTCGTCCACTA 27 | CTACGGACGCATCAACAAGCTGCTTTGCGACGCCCCGAAAGTCGGCATCGACGACATGCTGGCCACGATC 28 | TTTCGTCACGTGCCCAATGGTGAGTTCGACCAGGCACTGGAGCGCGCCCGCTCCCAGTCGCAAAAAATGC 29 | TGGCCGATCCGCTCATGCAGAACGACGAAGTGCGAGCGGCCGTGGCCGATTACTACCGCGCCATCCGCGC 30 | GATGCTGCTGTCCCACAGCAGCATCCTGAAGGTCATCTACCTGCTGTCGCACCGCTTCGCCACGTCGCTC 31 | CACTCCGGCTGGATCGGCGGCGAAGTCAGCCGCGGGCTGAAGGCCGCCGACTACGCCGACGAAGAAGCTG 32 | CCCTGTTTAAACCCGCCTGAGGTTATGGCGCCGGGGTGACGCCACGGCAGCCGGTCCTCGACACGGCCGG 33 | CTATGTTGTGGTGGGTGGCGGCATAAAAAAAGACCGCCGAAGCGGTCTTTACGTGGAATATGCAGACTCT 34 | TGCCCTACTCTTATTCCGCTGCCGCCGCCCGGCAGGTCGACGGGCGATACTTCGCGTCCACAGATCCGGC 35 | GCACGCCCACACTAGCACGCCAGTTGCGTCGGCCTTAGGCGTCAGCGTGATCGTCTCTGCGGCAATCGCG 36 | TCGTTTGCATCTACGCCGCCGGTCGATGTAATGACGCCGCTTGGTCCGATTGCGACGCTGGCGGTGTACT 37 | TACCCACGGCACCACCGTAGCCCGCAGCGGCATCACTGGCAGGGAGGGCGCCAGTCGACTGAAAAGTTTC 38 | AGCAACGGCAACCTTGGCGCCATCTGTCAGGGACATCAGTTCTGAAACCTGAGCCCGGATCGTGTAATCC 39 | TGATACGCCGGCAGCGCGACCGCTGCCAAAATCCCGATGATCGCGACGACGATCATCAGTTCGATCAGCG 40 | TGAAACCTTGTTGGACCTTTTTCATTTTCAGCTCCCTTGCTTGTTTGACGGGACGTCCCGTGCACATCGT 41 | CACGCAACAAGCGTGCCAACCACCAAAGCGGCATAATCGCCCCGGAGTGCCACCCATGTCGCACTTATTA 42 | CTGTGACATTCGGGCGAATGTCGCAGCGTGGCACACGACACGCCGCCCCCTTCCGGGCTTGAGCGCAGCC 43 | CTCTAAACTCCGTGCCCTGGAGGCTCCCATGACCGCAATCACCGTTCCCACCGCCGCACTCATCCTCGAC 44 | CGGACCACGCGCACCATCTGGCGCCGCATCGCCGACGGATCGCTGCCGGCGATCACCGAAGACGACCGGC 45 | AGAAGATCCCGCTCGACGCCGTCATCCGCGAGGCGTGCATTCCGATCGACCCGGACGACTACGAGCTCGT 46 | CACCGGCACCGACGCCGGCGATGCCGAATCGCAGTGCGACCTCGCACTGCTGTTCCTGCTGCGCGACCGC 47 | CCGCACATCGCCATGCCGCTGCTCAACCTGGCCGCCAAGGACGACTACCCGGAGGCGCTCTACCAGATCG 48 | CCCGCTGCCACATCGCCGGCAAGGGCGTGCCGCGCGACGGCAACGCCGGCATCATGTGGCTCGCTCGGGC 49 | CGCCAGCCGCGGCCACTCCGTAGCCCAGGAGCAGATGCGCGTCGTGCGCGAGTCCGGCACCGGCACCGAC 50 | CTCGACGCCCTCGACGCGCTGCTCGAGCGCATCGAACAGCGAGTCGTGTTCGCTGCACTGGAAACCACCG 51 | CAACCCGCTAGACCCCCCGCGCTTCGCAATCTGCCCGCCGCTTGAGCGGGCTTTTTTACGTCCGTAGCTT 52 | AAAGCCATTTCGCTGATATATAGCTGCGCTATTGACATTAAATATAGCGTTGCTATTATTTCTCCAACGC 53 | CTCCCTCGAGGCACCGGAGACCGCGATGCCCCCCGCTGCACCCCATCCCGTCCCGCCCGAAAAAAAGGCC 54 | >gi|56475432|ref|NC_006513.1 Damage2| Aromatoleum aromaticum EbN1 chromosome 55 | ATCCCGATGATCGCGACGACGATCATCAGTTCGATCAGCG 56 | >gi|56475432|ref|NC_006513.1 Damage3| Aromatoleum aromaticum EbN1 chromosome 57 | TGAAACCTTGTTGGACCTTTTTCATTTTCAGCTCCCTTGCTTGTTTGACGGGACGTCCCGTGCACATCGT 58 | CACGCAACAAGCGTGCCAACCACCAAAGCGGCATAATCGCCCCGGAGTGCCACC 59 | -------------------------------------------------------------------------------- /components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/components/__init__.py -------------------------------------------------------------------------------- /components/components_detection_refinement.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | from functools import wraps 4 | from itertools import groupby 5 | 6 | 7 | class SameStartEndFilter: 8 | def __init__(self, dict_crispr_candidates): 9 | self.dict_crispr_candidates = dict_crispr_candidates 10 | self.dict_filtered_start_end_crispr_candidates = {} 11 | 12 | self._filter_fuzzy_searches_same_start_end() 13 | 14 | def _filter_fuzzy_searches_same_start_end(self): 15 | for cluster_seq, list_fuzzy_s in self.dict_crispr_candidates.items(): 16 | list_start_end = [fuzzy_s.start_end for fuzzy_s in list_fuzzy_s] 17 | pattern_len = [len(fuzzy_s.repeat_candidate) for fuzzy_s in list_fuzzy_s] 18 | tuples_st_end_len = zip(list_start_end, pattern_len) 19 | 20 | list_categories = [[fuzzy_s for fuzzy_s in list_fuzzy_s if 21 | (fuzzy_s.start_end, len(fuzzy_s.repeat_candidate)) == tuple_info] 22 | for tuple_info in tuples_st_end_len] 23 | 24 | best_fuzzy_s = [sorted(category, key=lambda x: x.number_errors)[0] 25 | for category in list_categories] 26 | 27 | best_fuzzy_s_unique_repeat = [] 28 | u_repeats = [] 29 | for b_fuz in best_fuzzy_s: 30 | repeat = b_fuz.repeat_candidate 31 | if repeat not in u_repeats: 32 | u_repeats.append(repeat) 33 | best_fuzzy_s_unique_repeat.append(b_fuz) 34 | 35 | self.dict_filtered_start_end_crispr_candidates[cluster_seq] = best_fuzzy_s_unique_repeat 36 | 37 | def output(self): 38 | return self.dict_filtered_start_end_crispr_candidates 39 | 40 | 41 | # For filtering out non CRISPR cases 42 | ############################################################# 43 | ############################################################# 44 | DEBUG_MODE = False 45 | 46 | 47 | def exception_handler(function): 48 | @wraps(function) 49 | def wrapper(*args, **kwargs): 50 | try: 51 | result = function(*args, **kwargs) 52 | return result 53 | except Exception: 54 | return False 55 | return wrapper 56 | 57 | 58 | def printing_if_filtered(function): 59 | @wraps(function) 60 | def wrapper(*args, **kwargs): 61 | result = function(*args, **kwargs) 62 | if DEBUG_MODE: 63 | if not result: 64 | with open("filtered_results.txt", "a") as f: 65 | f.write("\n\n") 66 | f.write("\n".join([str(arg) for arg in args])) 67 | f.write("\n\n") 68 | f.write(function.__name__) 69 | f.write("\n\n") 70 | 71 | return result 72 | return wrapper 73 | 74 | 75 | class AdvancedFuzzySearchFilter: 76 | def __init__(self, min_column_dominance_repeat, min_avg_spacer_length, 77 | max_spacer_length, max_column_dominance_spacer, max_allowed_consecutive_spacers, 78 | max_allowed_same_spacers, max_inconsistent_columns, min_avg_repeat_length, 79 | max_avg_repeat_length, max_avg_spacer_length, min_repeats): 80 | 81 | self.column_dominance = min_column_dominance_repeat 82 | self.min_avg_spacer_length = min_avg_spacer_length 83 | self.max_spacer_length = max_spacer_length 84 | self.max_column_dominance_spacer = max_column_dominance_spacer 85 | self.max_allowed_consecutive_spacers = max_allowed_consecutive_spacers 86 | self.max_allowed_same_spacers = max_allowed_same_spacers 87 | self.max_inconsistent_columns = max_inconsistent_columns 88 | self.min_avg_repeat_length = min_avg_repeat_length 89 | self.max_avg_repeat_length = max_avg_repeat_length 90 | self.max_avg_spacer_length = max_avg_spacer_length 91 | self.min_number_repeats = min_repeats 92 | 93 | @printing_if_filtered 94 | @exception_handler 95 | def _filter_by_column(self, candidate): 96 | def find_first_three_columns(): 97 | list_three_columns = [] 98 | list_gaped_repeats = candidate.list_gaped_repeats 99 | for index in range(len(list_gaped_repeats[0])): 100 | column_vec = [repeat[index] for repeat in list_gaped_repeats] 101 | column_gaps = sum([1 for x in column_vec if (x == " ")]) 102 | percentage_gaps = column_gaps / len(column_vec) 103 | if percentage_gaps < 0.5: 104 | list_three_columns.append(column_vec) 105 | if len(list_three_columns) == 3: 106 | return list_three_columns 107 | 108 | def find_last_three_columns(): 109 | list_three_columns = [] 110 | list_gaped_repeats = candidate.list_gaped_repeats 111 | for index in range(len(list_gaped_repeats[0])-1, 0, -1): 112 | column_vec = [repeat[index] for repeat in list_gaped_repeats] 113 | column_gaps = sum([1 for x in column_vec if (x == " ")]) 114 | percentage_gaps = column_gaps/len(column_vec) 115 | if percentage_gaps < 0.5: 116 | list_three_columns.append(column_vec) 117 | if len(list_three_columns) == 3: 118 | return list_three_columns 119 | 120 | for column in find_first_three_columns(): 121 | column_characters = [x for x in column if (x not in (" ", "-"))] 122 | if column_characters: 123 | most_freq_char = max(column_characters, key=column_characters.count) 124 | most_freq_char_freq = column_characters.count(most_freq_char) 125 | freq = most_freq_char_freq/len(column_characters) 126 | if len(column) <= 4: 127 | if freq < 0.49: 128 | return False 129 | else: 130 | if freq < self.column_dominance: 131 | return False 132 | else: 133 | return False 134 | 135 | for column in find_last_three_columns(): 136 | column_characters = [x for x in column if (x not in (" ", "-"))] 137 | if column_characters: 138 | most_freq_char = max(column_characters, key=column_characters.count) 139 | most_freq_char_freq = column_characters.count(most_freq_char) 140 | freq = most_freq_char_freq/len(column_characters) 141 | if len(column) <= 4: 142 | if freq < 0.49: 143 | return False 144 | else: 145 | if freq < self.column_dominance: 146 | return False 147 | else: 148 | return False 149 | return True 150 | 151 | @printing_if_filtered 152 | @exception_handler 153 | def _filter_by_min_avg_spacer(self, candidate): 154 | list_spacers = candidate.list_spacers 155 | avg_len = sum(len(x) for x in list_spacers) / len(list_spacers) 156 | if avg_len > self.min_avg_spacer_length: 157 | return True 158 | return False 159 | 160 | @printing_if_filtered 161 | @exception_handler 162 | def _filter_by_max_spacer(self, candidate): 163 | list_spacers = candidate.list_spacers 164 | long_spacers = [spacer for spacer in list_spacers if len(spacer) > self.max_spacer_length] 165 | if len(long_spacers) / len(list_spacers) > 0.3: 166 | return False 167 | if len(long_spacers) > 3: 168 | return False 169 | return True 170 | 171 | @printing_if_filtered 172 | @exception_handler 173 | def _filter_by_spacer_begin_end_similarity(self, candidate): 174 | list_spacers = candidate.list_spacers 175 | if len(list_spacers) >= 2: 176 | column_begin = [spacer[0] for spacer in list_spacers if spacer] 177 | most_freq_char_begin = max(column_begin, key=column_begin.count) 178 | most_freq_char_freq_begin = column_begin.count(most_freq_char_begin) 179 | 180 | freq_begin = most_freq_char_freq_begin / len(column_begin) 181 | if freq_begin > self.max_column_dominance_spacer: 182 | return False 183 | 184 | column_end = [spacer[-1] for spacer in list_spacers if spacer] 185 | most_freq_char_end = max(column_end, key=column_end.count) 186 | most_freq_char_freq_end = column_end.count(most_freq_char_end) 187 | 188 | freq_end = most_freq_char_freq_end / len(column_end) 189 | if freq_end > self.max_column_dominance_spacer: 190 | return False 191 | return True 192 | 193 | @printing_if_filtered 194 | @exception_handler 195 | def _filter_by_the_same_spacer(self, candidate): 196 | list_spacers = candidate.list_spacers 197 | list_spacers = [s for s in list_spacers if s] 198 | groups = [len(list(group)) for key, group in groupby(list_spacers)] 199 | if self.max_allowed_consecutive_spacers: 200 | if max(groups) > self.max_allowed_consecutive_spacers: 201 | return False 202 | 203 | list_sorted_spacers = sorted(list_spacers) 204 | groups_sorted = [len(list(group)) for key, group in groupby(list_sorted_spacers)] 205 | if self.max_allowed_same_spacers: 206 | if max(groups_sorted) > self.max_allowed_same_spacers: 207 | return False 208 | return True 209 | 210 | @printing_if_filtered 211 | @exception_handler 212 | def _filter_by_overall_repeat_consistency(self, candidate): 213 | list_column_consistency = [] 214 | list_repeats_gaped = candidate.list_gaped_repeats 215 | for index, _ in enumerate(list_repeats_gaped[0]): 216 | column = [repeat[index] for repeat in list_repeats_gaped] 217 | column_characters = [x for x in column if (x not in (" ", "-"))] 218 | try: 219 | most_freq_char = max(column_characters, key=column_characters.count) 220 | most_freq_char_freq = column_characters.count(most_freq_char) 221 | freq = most_freq_char_freq / len(column_characters) 222 | list_column_consistency.append(freq) 223 | except ValueError: 224 | pass 225 | 226 | number_inconsistent = sum(1 for x in list_column_consistency if x < 0.66) 227 | if number_inconsistent > self.max_inconsistent_columns: 228 | return False 229 | return True 230 | 231 | @printing_if_filtered 232 | @exception_handler 233 | def _filter_min_number_repeats(self, candidate): 234 | list_repeats = candidate.list_repeats 235 | if len(list_repeats) >= self.min_number_repeats: 236 | return True 237 | return False 238 | 239 | @printing_if_filtered 240 | @exception_handler 241 | def _filter_min_avg_repeat_length(self, candidate): 242 | list_repeats = candidate.list_repeats 243 | avg_len = sum(len(x) for x in list_repeats) / len(list_repeats) 244 | if avg_len >= self.min_avg_repeat_length: 245 | return True 246 | return False 247 | 248 | @printing_if_filtered 249 | @exception_handler 250 | def _filter_max_avg_repeat_length(self, candidate): 251 | list_repeats = candidate.list_repeats 252 | avg_len = sum(len(x) for x in list_repeats) / len(list_repeats) 253 | if avg_len <= self.max_avg_repeat_length: 254 | return True 255 | return False 256 | 257 | @printing_if_filtered 258 | @exception_handler 259 | def _filter_max_avg_spacer_length(self, candidate): 260 | list_spacers = candidate.list_spacers 261 | if len(list_spacers) > 4: 262 | avg_len = sum(len(x) for x in list_spacers[1:-1]) / len(list_spacers) 263 | if avg_len <= self.max_avg_repeat_length: 264 | return True 265 | else: 266 | avg_len = sum(len(x) for x in list_spacers) / len(list_spacers) 267 | if avg_len <= self.max_avg_repeat_length: 268 | return True 269 | return False 270 | 271 | @printing_if_filtered 272 | @exception_handler 273 | def _filter_min_repeat_length(self, candidate): 274 | list_spacers = candidate.list_spacers 275 | avg_len = sum(len(x) for x in list_spacers) / len(list_spacers) 276 | if avg_len >= self.min_avg_repeat_length: 277 | return True 278 | return False 279 | 280 | def __call__(self, candidate): 281 | if not self._filter_by_column(candidate): 282 | return 283 | if not self._filter_by_min_avg_spacer(candidate): 284 | return 285 | if not self._filter_by_max_spacer(candidate): 286 | return 287 | if not self._filter_by_spacer_begin_end_similarity(candidate): 288 | return 289 | if not self._filter_by_the_same_spacer(candidate): 290 | return 291 | if not self._filter_by_overall_repeat_consistency(candidate): 292 | return 293 | if not self._filter_max_avg_repeat_length(candidate): 294 | return 295 | if not self._filter_min_avg_repeat_length(candidate): 296 | return 297 | if not self._filter_max_avg_spacer_length(candidate): 298 | return 299 | if not self._filter_min_number_repeats(candidate): 300 | return 301 | return candidate 302 | 303 | # CRISPR Candidate 304 | ##################################################### 305 | ##################################################### 306 | class CrisprConsensus(object): 307 | def __init__(self, list_repeats_gaped): 308 | self.list_repeats_gaped = list_repeats_gaped 309 | 310 | self.num_different_repeat_length = None 311 | self.consensus = None 312 | self.consensus_no_gap = None 313 | self.len_consensus = None 314 | self.number_repeats = None 315 | 316 | self._check_repeat_length() 317 | self._compute_consensus() 318 | 319 | def _check_repeat_length(self): 320 | list_lengths = [len(repeat) for repeat in self.list_repeats_gaped] 321 | self.num_different_repeat_length = len(set(list_lengths)) 322 | 323 | def _compute_consensus(self): 324 | if self.num_different_repeat_length == 0: 325 | print('Got repeats of 0 length') 326 | elif self.num_different_repeat_length != 1: 327 | print('Got a case with different repeat lengths') 328 | for rep_gapped in self.list_repeats_gaped: 329 | print(rep_gapped) 330 | else: 331 | self.consensus = '' 332 | for char_ind, _ in enumerate(self.list_repeats_gaped[0]): 333 | list_char_in_column = [repeat[char_ind] for repeat in self.list_repeats_gaped] 334 | counter = collections.Counter(list_char_in_column) 335 | freq = counter.most_common() 336 | most_common_char = freq[0][0] if freq[0][0] != '-' else freq[1][0] 337 | self.consensus += most_common_char 338 | 339 | self.consensus_no_gap = self.consensus.replace(' ', '').replace('+', '') 340 | self.len_consensus = len(self.consensus_no_gap) 341 | 342 | def output(self): 343 | return self.consensus_no_gap, self.consensus 344 | 345 | 346 | class CrisprCandidate(object): 347 | def __init__(self, list_repeats, list_repeats_gaped, list_spacers, list_repeat_starts): 348 | self.list_repeats = list_repeats 349 | self.list_repeats_gaped = list_repeats_gaped 350 | self.list_spacers = list_spacers 351 | self.list_repeat_starts = list_repeat_starts 352 | 353 | self.list_repeat_mismatches = [] 354 | self.list_mismatches_indexes = [] 355 | 356 | self.consensus = None 357 | self.consensus_gaped = None 358 | self.total_mismatches = None 359 | 360 | self._filter_redundant_insertion_deletions() 361 | self._compute_consensus() 362 | self._compute_mismatches() 363 | 364 | self.list_gaped_repeats = self.list_repeats_gaped 365 | 366 | def _filter_redundant_insertion_deletions(self): 367 | def _fix_repeats(list_repeats, list_bad_indexes_to_fix): 368 | list_repeats_new = [] 369 | for repeat in list_repeats: 370 | list_repeats_new.append(_fix_repeat(repeat, list_bad_indexes_to_fix)) 371 | 372 | return list_repeats_new 373 | 374 | def _fix_repeat(repeat, list_bad_indexes_to_fix): 375 | new_repeat = '' 376 | for index, char in enumerate(repeat): 377 | if index not in list_bad_indexes_to_fix: 378 | new_repeat += char 379 | 380 | return new_repeat 381 | 382 | list_bad_indexes = [] 383 | for char_ind, _ in enumerate(self.list_repeats_gaped[0]): 384 | list_char_in_column = [repeat[char_ind] for repeat in self.list_repeats_gaped] 385 | chars = set(list_char_in_column) 386 | 387 | if chars == {' '} or chars == {'-'}: 388 | list_bad_indexes.append(char_ind) 389 | 390 | if list_bad_indexes: 391 | self.list_repeats_gaped = _fix_repeats(self.list_repeats_gaped, list_bad_indexes) 392 | 393 | def _compute_consensus(self): 394 | self.consensus, self.consensus_gaped = CrisprConsensus(self.list_repeats_gaped).output() 395 | 396 | def _compute_mismatches(self): 397 | def _compute_mismatches_repeat(gaped_repeat): 398 | substitutions = 0 399 | insertions = 0 400 | deletions = 0 401 | list_mismatches_indexes_one_repeat = [] 402 | for index, char_repeat, char_con_repeat in zip(range(len(gaped_repeat)), 403 | gaped_repeat, 404 | self.consensus_gaped): 405 | 406 | if char_con_repeat == ' ': 407 | if char_repeat != ' ': 408 | insertions += 1 409 | list_mismatches_indexes_one_repeat.append(index) 410 | else: 411 | if char_repeat == char_con_repeat: 412 | pass 413 | else: 414 | if char_repeat == '-': 415 | deletions += 1 416 | list_mismatches_indexes_one_repeat.append(index) 417 | elif char_repeat == ' ': 418 | deletions += 1 419 | else: 420 | substitutions += 1 421 | list_mismatches_indexes_one_repeat.append(index) 422 | 423 | return substitutions, insertions, deletions, list_mismatches_indexes_one_repeat 424 | 425 | for gaped_repeat in self.list_repeats_gaped: 426 | s, i, d, list_mismatches_indexes_one_repeat = _compute_mismatches_repeat(gaped_repeat) 427 | total = s + i + d 428 | repeat_stats = [s, i, d, total] 429 | self.list_repeat_mismatches.append(repeat_stats) 430 | self.list_mismatches_indexes.append(list_mismatches_indexes_one_repeat) 431 | 432 | self.total_mismatches = sum([x[3] for x in self.list_repeat_mismatches]) 433 | 434 | def dot_repeat(self, gaped_repeat): 435 | string = '' 436 | substitutions = 0 437 | insertions = 0 438 | deletions = 0 439 | for char_repeat, char_consensus in zip(gaped_repeat, self.consensus_gaped): 440 | if char_consensus == ' ': 441 | string += char_repeat 442 | if char_repeat != ' ': 443 | insertions += 1 444 | else: 445 | if char_repeat == char_consensus: 446 | string += '.' 447 | else: 448 | string += char_repeat 449 | if char_repeat == '-': 450 | deletions += 1 451 | elif char_repeat == ' ': 452 | deletions += 1 453 | else: 454 | substitutions += 1 455 | return string, substitutions, insertions, deletions 456 | 457 | def dot_repr(self): 458 | string = '' 459 | g_s, g_i, g_d = 0, 0, 0 460 | max_length_start_index = max(len(str(start)) for start in self.list_repeat_starts) + 3 461 | max_length_spacer = max(len(spacer) for spacer in self.list_spacers) + 3 462 | 463 | for index, gaped_repeat in enumerate(self.list_repeats_gaped): 464 | repeat_start_index = self.list_repeat_starts[index] + 1 465 | n_gaps_after_start = max_length_start_index - len(str(repeat_start_index)) 466 | 467 | if index == len(self.list_spacers): 468 | spacer = "" 469 | else: 470 | spacer = self.list_spacers[index] 471 | n_gaps_after_spacer = max_length_spacer - len(spacer) 472 | 473 | dotted_repeats, s, i, d = self.dot_repeat(gaped_repeat) 474 | errors = " s:{} i:{} d:{}".format(s, i, d) 475 | g_s += s 476 | g_i += i 477 | g_d += d 478 | 479 | string += "{}{}{} {}{}{}\n".format(repeat_start_index, 480 | " " * n_gaps_after_start, 481 | dotted_repeats, spacer, 482 | " " * n_gaps_after_spacer, 483 | errors) 484 | 485 | string += "_" * 100 + "\n" 486 | 487 | string += " " * max_length_start_index + self.consensus_gaped 488 | string += " " * (max_length_spacer + 2) + " s:{} i:{} d:{}".format(g_s, g_i, g_d) + "\n" 489 | 490 | return string 491 | 492 | def dot_repr_web_server(self): 493 | string = '' 494 | g_s, g_i, g_d = 0, 0, 0 495 | max_length_start_index = max(len(str(start)) for start in self.list_repeat_starts) + 3 496 | max_length_spacer = max(len(spacer) for spacer in self.list_spacers) + 3 497 | 498 | for index, gaped_repeat in enumerate(self.list_repeats_gaped): 499 | repeat_start_index = self.list_repeat_starts[index] + 1 500 | n_gaps_after_start = max_length_start_index - len(str(repeat_start_index)) 501 | 502 | if index == len(self.list_spacers): 503 | spacer = "" 504 | else: 505 | spacer = "$" + self.list_spacers[index] + "$" 506 | n_gaps_after_spacer = max_length_spacer - len(spacer) 507 | 508 | dotted_repeats, s, i, d = self.dot_repeat(gaped_repeat) 509 | errors = " s:{} i:{} d:{}".format(s, i, d) 510 | g_s += s 511 | g_i += i 512 | g_d += d 513 | 514 | string += "{}{}{} {}{}{}\n".format(repeat_start_index, 515 | " " * n_gaps_after_start, 516 | dotted_repeats, spacer, 517 | " " * n_gaps_after_spacer, 518 | errors) 519 | 520 | string += "_" * 100 + "\n" 521 | 522 | string += " " * max_length_start_index + self.consensus_gaped 523 | string += " " * (max_length_spacer + 2) + " s:{} i:{} d:{}".format(g_s, g_i, g_d) + "\n" 524 | 525 | string += "_" * 100 + "\n" 526 | 527 | string += "consensus: " + self.consensus + "\n" 528 | 529 | return string 530 | 531 | def write_file(self, file_name): 532 | with open(file_name, "w") as f: 533 | f.write(self.dot_repr()) 534 | 535 | def write_as_json(self, filename): 536 | dict_to_write = {"repeat_begins": self.list_repeat_starts, 537 | "repeats": self.list_repeats, 538 | "repeats_gaped": self.list_repeats_gaped, 539 | "spacers": self.list_spacers} 540 | 541 | with open(filename, 'w') as outfile: 542 | json.dump(dict_to_write, outfile) 543 | 544 | def compute_stats(self): 545 | start = self.list_repeat_starts[0] + 1 546 | end = self.list_repeat_starts[-1] + len(self.list_repeats[-1]) 547 | avg_repeat = len(self.consensus) 548 | avg_spacer = int(sum((len(spacer) for spacer in self.list_spacers)) / len(self.list_spacers)) 549 | number_repeats = len(self.list_repeats) 550 | return {"start": start, "end": end, "avg_repeat": avg_repeat, 551 | "avg_spacer": avg_spacer, "number_repeats": number_repeats} 552 | 553 | @classmethod 554 | def init_from_json(cls, file_name): 555 | with open(file_name) as json_file: 556 | dict_data = json.load(json_file) 557 | 558 | list_repeas = dict_data["repeats"] 559 | list_repeats_starts = dict_data["repeat_begins"] 560 | list_spacers = dict_data["spacers"] 561 | list_repeats_gaped = dict_data["repeats_gaped"] 562 | 563 | return cls(list_repeats=list_repeas, list_spacers=list_spacers, 564 | list_repeats_gaped=list_repeats_gaped, list_repeat_starts=list_repeats_starts) 565 | 566 | def __repr__(self): 567 | return self.dot_repr() 568 | 569 | def __eq__(self, other): 570 | if self.list_repeats == other.list_repeats: 571 | if self.list_repeats_gaped == other.list_repeats_gaped: 572 | if self.list_spacers == other.list_spacers: 573 | return True 574 | return False 575 | 576 | def __ne__(self, other): 577 | return not self.__eq__(other) 578 | -------------------------------------------------------------------------------- /components/components_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from os import listdir 4 | from os.path import isfile, join 5 | 6 | 7 | def process_string_from_header(input_string): 8 | # Define the function to replace based on the condition 9 | def replace_match(match): 10 | # If it's an integer, remove the dot and integer 11 | if match.group(2).isdigit(): 12 | return match.group(1) 13 | # If it's not an integer, replace the dot with a hyphen 14 | return match.group(1) + "-" + match.group(2) 15 | 16 | # Use regex to find patterns with a dot followed by any characters 17 | result = re.sub(r'(\w+)\.(\w+)', replace_match, input_string) 18 | return result 19 | 20 | def multiline_fasta_check(file): 21 | with open(file, "r") as f: 22 | lines = f.readlines() 23 | number_of_inputs = sum([1 for line in lines if ">" in line]) 24 | return number_of_inputs != 1 25 | 26 | 27 | def multiline_fasta_handle(file): 28 | base_name = str(os.path.basename(file).split(".")[0]) 29 | try: 30 | os.mkdir(base_name) 31 | except OSError: 32 | pass 33 | 34 | cmd = f"cat {file}" 35 | cmd += " | awk '{ if (substr($0, 1, 1)==\">\") {" 36 | cmd += "filename=(\"{}/\"".format(base_name) 37 | cmd += "substr($0,2)\".fa\")} " 38 | cmd += f"print $0 > filename " 39 | cmd += "}'" 40 | 41 | os.system(cmd) 42 | 43 | return base_name 44 | 45 | 46 | def multiline_fasta_handle_python(file, flag_ncbi_formatting=False): 47 | base_name = str(os.path.basename(file).split(".")[0]) 48 | try: 49 | os.mkdir(base_name) 50 | except OSError: 51 | pass 52 | 53 | with open(file, "r") as f: 54 | lines = f.readlines() 55 | 56 | headers = [] 57 | dna_sequences = [] 58 | 59 | dna_sequence = '' 60 | for line in lines: 61 | if line: 62 | if ">" in line: 63 | if dna_sequence: 64 | dna_sequences.append(dna_sequence) 65 | dna_sequence = '' 66 | headers.append(line) 67 | else: 68 | dna_sequence += line.strip() 69 | 70 | if dna_sequence: 71 | dna_sequences.append(dna_sequence) 72 | 73 | if flag_ncbi_formatting: 74 | for header, dna_sequence in zip(headers, dna_sequences): 75 | new_header = header.split(" ")[0] 76 | new_header = process_string_from_header(new_header) 77 | file_name = new_header.split(">")[1].replace(",", "-") \ 78 | .replace(".", "-").replace(" ", "_").replace("|", "-") + ".fa" 79 | with open(os.path.join(base_name, file_name), "w") as f: 80 | f.writelines(new_header) 81 | f.write("\n") 82 | f.writelines(dna_sequence) 83 | else: 84 | for header, dna_sequence in zip(headers, dna_sequences): 85 | file_name = header.strip().split(">")[1].replace(",", "_")\ 86 | .replace(".", "_").replace(" ", "_").replace("|", "_") + ".fa" 87 | with open(os.path.join(base_name, file_name), "w") as f: 88 | f.write(header) 89 | f.write(dna_sequence) 90 | 91 | return base_name 92 | 93 | 94 | def folder_of_multifasta_handle(folder_multifasta): 95 | list_files = [f for f in listdir(folder_multifasta) if isfile(join(folder_multifasta, f))] 96 | all_lines_in_files = [] 97 | for file in list_files: 98 | with open(os.path.join(folder_multifasta, file), "r") as f: 99 | lines = f.readlines() 100 | all_lines_in_files.append(lines) 101 | with open("multifasta_folder.fa", "w") as f: 102 | for lines in all_lines_in_files: 103 | for line in lines: 104 | f.write(line) 105 | 106 | multiline_fasta_handle_python("multifasta_folder.fa") 107 | return "multifasta_folder" -------------------------------------------------------------------------------- /components/components_ml.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import numpy as np 3 | import sklearn 4 | import joblib 5 | 6 | 7 | class ClassifierWrapper(object): 8 | def __init__(self, classifier_type, load_option=None, hyper_parameters=None): 9 | self.classifier_type = classifier_type 10 | self._hyper_parameters = hyper_parameters 11 | self._load_option = load_option 12 | 13 | self._init_classifier() 14 | 15 | def _init_classifier(self): 16 | if self._load_option: 17 | self._load_model() 18 | else: 19 | if self.classifier_type == 'k_near_neighbors': 20 | 21 | if not self._hyper_parameters: 22 | self.classifier = sklearn.neighbors.KNeighborsClassifier(n_neighbors=7) 23 | else: 24 | self.classifier = sklearn.neighbors.KNeighborsClassifier(**self._hyper_parameters) 25 | 26 | elif self.classifier_type == 'svm': 27 | 28 | if not self._hyper_parameters: 29 | self.classifier = sklearn.svm.SVC() 30 | else: 31 | self.classifier = sklearn.svm.SVC(**self._hyper_parameters) 32 | 33 | elif self.classifier_type == 'naive_bayes': 34 | 35 | if not self._hyper_parameters: 36 | self.classifier = sklearn.naive_bayes.GaussianNB() 37 | else: 38 | self.classifier = sklearn.naive_bayes.GaussianNB(**self._hyper_parameters) 39 | 40 | elif self.classifier_type == 'random_forest': 41 | 42 | if not self._hyper_parameters: 43 | self.classifier = RandomForestClassifier(max_depth=3, random_state=None) 44 | else: 45 | self.classifier = RandomForestClassifier(**self._hyper_parameters) 46 | 47 | elif self.classifier_type == 'neural_network': 48 | 49 | if not self._hyper_parameters: 50 | self.classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, 51 | hidden_layer_sizes=(100, 100), random_state=None) 52 | else: 53 | self.classifier = MLPClassifier(**self._hyper_parameters) 54 | 55 | elif self.classifier_type == 'extra_trees': 56 | 57 | if not self._hyper_parameters: 58 | self.classifier = ExtraTreesClassifier(max_depth=4) 59 | else: 60 | self.classifier = ExtraTreesClassifier(**self._hyper_parameters) 61 | 62 | else: 63 | raise ValueError('Wrong classifier') 64 | 65 | def _load_model(self): 66 | self.classifier = joblib.load(self._load_option) 67 | 68 | def train_classifier(self, train_set_pos, train_set_neg): 69 | train_y_pos = np.ones(len(train_set_pos)) 70 | train_y_neg = np.zeros(len(train_set_neg)) 71 | train_y = np.concatenate([train_y_pos, train_y_neg]) 72 | train_x = np.concatenate([train_set_pos, train_set_neg]) 73 | self.classifier.fit(train_x, train_y) 74 | 75 | def test_classifier(self, test_set_pos, test_set_neg): 76 | if (test_set_pos is not None) and (test_set_neg is not None): 77 | test_set_y_pos = np.ones(len(test_set_pos)) 78 | test_set_y_neg = np.zeros(len(test_set_neg)) 79 | test_set_y = np.concatenate([test_set_y_pos, test_set_y_neg]) 80 | test_set_x = np.concatenate([test_set_pos, test_set_neg]) 81 | 82 | elif test_set_pos is not None: 83 | test_set_y = np.ones(len(test_set_pos)) 84 | test_set_x = test_set_pos 85 | 86 | elif test_set_neg is not None: 87 | test_set_y = np.zeros(len(test_set_neg)) 88 | test_set_x = test_set_neg 89 | 90 | else: 91 | raise ValueError 92 | 93 | predict = self.classifier.predict(test_set_x) 94 | dif = test_set_y - predict 95 | return 1 - np.count_nonzero(dif) / float(len(dif)) 96 | 97 | def predict(self, dataset): 98 | return self.classifier.predict(dataset) 99 | 100 | def predict_proba(self, dataset): 101 | return self.classifier.predict_proba(dataset) 102 | 103 | def save_model(self, model_name_dot_pkl): 104 | joblib.dump(self.classifier, model_name_dot_pkl) 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /components/module_detection.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from multiprocessing import Pool 3 | 4 | from components.components_detection import VmatchRun 5 | from components.components_detection import ClusterMaker 6 | from components.components_detection import FilterApproximationClusters 7 | from components.components_detection import StartEndEnhancementClusters 8 | from components.components_detection import IntermediateEnhancementClusters 9 | from components.components_detection import ClusterSequence 10 | from components.components_detection import FuzzySearch 11 | 12 | 13 | class Detection: 14 | def __init__(self, file_path, flags, parameters, flag_dev_mode): 15 | self.file_path = file_path 16 | self.flags = flags 17 | self.parameters = parameters 18 | self.flag_parallel = flags["flag_parallel"] 19 | self.flag_cpu = flags["flag_cpu"] 20 | self.flag_fast_run = flags["flag_fast_run"] 21 | self.flag_enhancement_min_max = flags["flag_enhancement_min_max"] 22 | self.flag_enhancement_start_end = flags["flag_enhancement_start_end"] 23 | self.parameters = parameters 24 | self.flag_dev_mode = flag_dev_mode 25 | 26 | self.clusters = [] 27 | self.cluster_sequences = [] 28 | self.dict_fuzzy_crisprs = {} 29 | 30 | self._get_complete_dna() 31 | self._run_cluster_detection() 32 | self._extract_cluster_sequences() 33 | self._run_array_detection() 34 | 35 | def _get_complete_dna(self): 36 | with open(self.file_path, 'r') as f: 37 | lines = f.readlines() 38 | 39 | self.input_header = lines[0] 40 | self.dna = ''.join([line.strip() for line in lines[1:]]) 41 | self.dna_length = len(self.dna) 42 | self.dna = self.dna.upper() 43 | 44 | def _run_cluster_detection(self): 45 | vr = VmatchRun(self.file_path, self.flag_fast_run) 46 | list_repeats_from_vmatch = vr.output() 47 | #print("list vmatch repeats", list_repeats_from_vmatch) 48 | 49 | cm = ClusterMaker(list_repeats_from_vmatch, self.dna) 50 | self.clusters = cm.output() 51 | 52 | fa = FilterApproximationClusters(self.clusters) 53 | self.clusters = fa.output() 54 | 55 | st = StartEndEnhancementClusters(self.clusters) 56 | self.clusters = st.output() 57 | 58 | ie = IntermediateEnhancementClusters(self.clusters) 59 | self.clusters = ie.output() 60 | 61 | def _extract_cluster_sequences(self): 62 | for cluster in self.clusters: 63 | seq_start = max(0, cluster.begin - 100) 64 | seq_end = min(len(self.dna), cluster.end + 100) 65 | cluster_seq = self.dna[seq_start:seq_end] 66 | tup_cluster_dif_rep = tuple(cluster.list_clust_dif_rep_seq) 67 | 68 | self.cluster_sequences.append(ClusterSequence(cluster_seq, seq_start, seq_end, tup_cluster_dif_rep)) 69 | 70 | @staticmethod 71 | def _parallel_run_fuzzy_run(input_tuple): 72 | repeat, sequence, start, weighted_error = input_tuple 73 | 74 | return FuzzySearch(sequence, start, 75 | repeat, weighted_error) 76 | 77 | def _run_array_detection(self): 78 | weighted_error = "{i<=3,d<=3,s<=3,i+d+s<=6}" 79 | parallel = self.flag_parallel 80 | 81 | if parallel: 82 | for cluster_sequence in self.cluster_sequences: 83 | nr = len(cluster_sequence.tuple_repeats) 84 | input_tuples = zip(cluster_sequence.tuple_repeats, [cluster_sequence.sequence] * nr, 85 | [cluster_sequence.start] * nr, [weighted_error] * nr) 86 | 87 | num_workers_suggested = multiprocessing.cpu_count() if self.flag_cpu == "ALL" else int(self.flag_cpu) 88 | max_possible = multiprocessing.cpu_count() 89 | num_workers = num_workers_suggested if num_workers_suggested < max_possible else max_possible 90 | with Pool(num_workers) as p: 91 | fuzzy_results = p.map(self._parallel_run_fuzzy_run, input_tuples) 92 | fuzzy_results = [x for x in fuzzy_results if x.match_hit] 93 | fuzzy_results = [x for x in fuzzy_results if len(x.list_repeats) > 1] 94 | 95 | self.dict_fuzzy_crisprs[cluster_sequence] = fuzzy_results 96 | else: 97 | for cluster_sequence in self.cluster_sequences: 98 | list_fuzzy_results = [] 99 | for repeat in cluster_sequence.tuple_repeats: 100 | fuzzy_s = FuzzySearch(cluster_sequence.sequence, cluster_sequence.start, 101 | repeat, weighted_error) 102 | if fuzzy_s.match_hit: 103 | if len(fuzzy_s.list_repeats) > 1: 104 | list_fuzzy_results.append(fuzzy_s) 105 | 106 | self.dict_fuzzy_crisprs[cluster_sequence] = list_fuzzy_results 107 | 108 | def output(self): 109 | return self.dict_fuzzy_crisprs -------------------------------------------------------------------------------- /components/module_detection_refinement.py: -------------------------------------------------------------------------------- 1 | from components.components_detection_refinement import SameStartEndFilter 2 | from components.components_detection_refinement import AdvancedFuzzySearchFilter 3 | from components.components_detection_refinement import CrisprCandidate 4 | 5 | 6 | class DetectionRefinement: 7 | def __init__(self, dict_fuzzy_crisprs, parameters, flag_dev_mode): 8 | self.dict_fuzzy_crisprs = dict_fuzzy_crisprs 9 | self.parameters = parameters 10 | self.flag_dev_mode = flag_dev_mode 11 | self.dict_fuzzy_crisprs_refined_st_end = {} 12 | self.dict_fuzzy_crisprs_fully_refined = {} 13 | 14 | self._filter_out_same_start_end_cases() 15 | self._filter_out_non_crispr_cases() 16 | self._reformat_ac_crispr_candidates() 17 | 18 | def _filter_out_same_start_end_cases(self): 19 | ssef = SameStartEndFilter(self.dict_fuzzy_crisprs) 20 | self.dict_fuzzy_crisprs_refined_st_end = ssef.output() 21 | 22 | def _filter_out_non_crispr_cases(self): 23 | self.param_min_avg_repeat_length = self.parameters["param_min_avg_repeat_length"] 24 | self.param_max_avg_repeat_length = self.parameters["param_max_avg_repeat_length"] 25 | self.param_min_avg_spacer_length = self.parameters["param_min_avg_spacer_length"] 26 | self.param_max_avg_spacer_length = self.parameters["param_max_avg_spacer_length"] 27 | self.param_min_repeats = self.parameters["param_min_repeats"] 28 | self.param_max_identical_spacers = self.parameters["param_max_identical_spacers"] 29 | self.param_max_identical_cluster_spacers = self.parameters["param_max_identical_cluster_spacers"] 30 | 31 | afsf = AdvancedFuzzySearchFilter(min_column_dominance_repeat=0.6, 32 | max_spacer_length=140, max_column_dominance_spacer=0.8, 33 | max_allowed_consecutive_spacers=self.param_max_identical_cluster_spacers, 34 | max_allowed_same_spacers=self.param_max_identical_spacers, 35 | max_inconsistent_columns=5, 36 | min_avg_repeat_length=self.param_min_avg_repeat_length, 37 | max_avg_repeat_length=self.param_max_avg_repeat_length, 38 | min_avg_spacer_length=self.param_min_avg_spacer_length, 39 | max_avg_spacer_length=self.param_max_avg_spacer_length, 40 | min_repeats=self.param_min_repeats) 41 | 42 | for key, values in self.dict_fuzzy_crisprs_refined_st_end.items(): 43 | list_filtered_advanced = [afsf(value) for value in values] 44 | list_filtered_advanced = [x for x in list_filtered_advanced if x] 45 | if not list_filtered_advanced: 46 | sorted_by_num_errors = sorted(list(values), key=lambda x: x.number_errors) 47 | if sorted_by_num_errors: 48 | candidate_fewer_mismatches = sorted_by_num_errors[0] 49 | self.dict_fuzzy_crisprs_fully_refined[key] = [candidate_fewer_mismatches] 50 | else: 51 | self.dict_fuzzy_crisprs_fully_refined[key] = list_filtered_advanced 52 | 53 | def _reformat_ac_crispr_candidates(self): 54 | self.dict_crispr_candidates = {} 55 | for key, list_fuzzy in self.dict_fuzzy_crisprs_fully_refined.items(): 56 | new_key = (key.start, key.end) 57 | list_crispr_candidates = [CrisprCandidate(fuzzy.list_repeats, fuzzy.list_gaped_repeats, 58 | fuzzy.list_spacers, fuzzy.list_absolute_start) 59 | for fuzzy in list_fuzzy] 60 | 61 | self.dict_crispr_candidates[new_key] = list_crispr_candidates 62 | 63 | def output(self): 64 | return self.dict_crispr_candidates 65 | -------------------------------------------------------------------------------- /components/module_evaluated_arrays_enhancement.py: -------------------------------------------------------------------------------- 1 | from os.path import basename 2 | from components.components_evaluated_arrays_enhancement import IterativeDegeneratedSearch 3 | from components.components_evaluated_arrays_enhancement import create_boundaries_for_intervals 4 | from components.components_evaluated_arrays_enhancement import ArrayRefinerInsertionsDeletions 5 | from components.components_detection_refinement import AdvancedFuzzySearchFilter 6 | 7 | 8 | class EvaluatedArraysEnhancement: 9 | def __init__(self, file_path, categories, parameters, flag_dev_mode): 10 | self.file_path = file_path 11 | self.categories = categories 12 | self.parameters = parameters 13 | self.flag_dev_mode = flag_dev_mode 14 | 15 | self.bona_fide_arrays = categories[0] 16 | self.alternative_arrays = categories[1] 17 | self.possible_arrays = categories[2] 18 | 19 | self.dict_arrays_into_categories_enhanced = {} 20 | 21 | self._get_complete_dna() 22 | self._search_missed_or_degenerated_repeats() 23 | self._refine_nucleotides_repeat_spacer() 24 | self._filter_enhanced() 25 | 26 | def _get_complete_dna(self): 27 | with open(self.file_path, 'r') as f: 28 | lines = f.readlines() 29 | 30 | self.input_header = lines[0] 31 | self.dna = ''.join([line.strip() for line in lines[1:]]) 32 | self.dna_length = len(self.dna) 33 | self.dna = self.dna.upper() 34 | 35 | def _search_missed_or_degenerated_repeats(self): 36 | for category in [self.bona_fide_arrays, self.alternative_arrays, self.possible_arrays]: 37 | intervals = [] 38 | arrays_for_intervals = [] 39 | 40 | for interval, list_data in category.items(): 41 | intervals.append(interval) 42 | arrays_for_intervals.append([el[1] for el in list_data]) 43 | 44 | boundaries = create_boundaries_for_intervals(intervals, 500) 45 | 46 | for interval, arrays_in_interval, boundary in zip(intervals, arrays_for_intervals, boundaries): 47 | for array_index, array in enumerate(arrays_in_interval): 48 | consensus = array.consensus 49 | list_repeats = array.list_repeats 50 | list_repeats_starts = array.list_repeat_starts 51 | list_spacers = array.list_spacers 52 | 53 | 54 | ids = IterativeDegeneratedSearch(full_dna=self.dna, 55 | repeat_seq_candidate=consensus, 56 | spacer_margin=self.parameters["param_spacer_margin_degenerated_search"], 57 | repeat_seq_candidate_gaped=None, 58 | list_repeats_starts=list_repeats_starts, 59 | list_repeats=list_repeats, 60 | list_spacers=list_spacers, 61 | start_flanking_region_left=boundary[0], 62 | end_flanking_region_right=boundary[1], 63 | allowed_max_editing_distance=self.parameters["param_max_edit_distance"], 64 | iterative_size_flanking_region=150, 65 | prevent_long_spacers=True, 66 | attempt_to_improve_initial_array=True) 67 | 68 | new_crispr_candidate = ids.output() 69 | 70 | if self.flag_dev_mode: 71 | if array != new_crispr_candidate: 72 | with open("log.txt", "a") as f: 73 | acc_num = basename(self.file_path).split(".")[0] 74 | f.write(f"Iteractive degenerated search {acc_num}\n") 75 | f.write(array.dot_repr()) 76 | f.write("\n\n") 77 | f.write(new_crispr_candidate.dot_repr()) 78 | f.write("\n\n") 79 | 80 | """except Exception: 81 | new_crispr_candidate = array 82 | 83 | if self.flag_dev_mode: 84 | with open("log_error.txt", "a") as f: 85 | acc_num = basename(self.file_path).split(".")[0] 86 | f.write(f"Iteractive degenerated search error {acc_num}\n") 87 | f.write(array.dot_repr()) 88 | f.write("\n\n")""" 89 | 90 | category[interval][array_index][1] = new_crispr_candidate 91 | 92 | def _refine_nucleotides_repeat_spacer(self): 93 | for category in [self.bona_fide_arrays, self.alternative_arrays, self.possible_arrays]: 94 | for interval, list_data in category.items(): 95 | arrays = [el[1] for el in list_data] 96 | for array_index, array in enumerate(arrays): 97 | try: 98 | arid = ArrayRefinerInsertionsDeletions(array) 99 | new_crispr_candidate = arid.output() 100 | 101 | if self.flag_dev_mode: 102 | if array != new_crispr_candidate: 103 | with open("log.txt", "a") as f: 104 | acc_num = basename(self.file_path).split(".")[0] 105 | f.write(f"Array refinement {acc_num}\n") 106 | f.write(array.dot_repr()) 107 | f.write("\n\n") 108 | f.write(new_crispr_candidate.dot_repr()) 109 | f.write("\n\n") 110 | 111 | except Exception: 112 | new_crispr_candidate = array 113 | 114 | if self.flag_dev_mode: 115 | with open("log_error.txt", "a") as f: 116 | acc_num = basename(self.file_path).split(".")[0] 117 | f.write(f"Array refinement error {acc_num}\n") 118 | f.write(array.dot_repr()) 119 | f.write("\n\n") 120 | 121 | category[interval][array_index][1] = new_crispr_candidate 122 | 123 | def _filter_enhanced(self): 124 | self.param_min_avg_repeat_length = self.parameters["param_min_avg_repeat_length"] 125 | self.param_max_avg_repeat_length = self.parameters["param_max_avg_repeat_length"] 126 | self.param_min_avg_spacer_length = self.parameters["param_min_avg_spacer_length"] 127 | self.param_max_avg_spacer_length = self.parameters["param_max_avg_spacer_length"] 128 | self.param_min_repeats = self.parameters["param_min_repeats"] 129 | self.param_max_identical_spacers = self.parameters["param_max_identical_spacers"] 130 | self.param_max_identical_cluster_spacers = self.parameters["param_max_identical_cluster_spacers"] 131 | 132 | afsf = AdvancedFuzzySearchFilter(min_column_dominance_repeat=0.6, 133 | max_spacer_length=140, max_column_dominance_spacer=0.8, 134 | max_allowed_consecutive_spacers=self.param_max_identical_cluster_spacers, 135 | max_allowed_same_spacers=self.param_max_identical_spacers, 136 | max_inconsistent_columns=5, 137 | min_avg_repeat_length=self.param_min_avg_repeat_length, 138 | max_avg_repeat_length=self.param_max_avg_repeat_length, 139 | min_avg_spacer_length=self.param_min_avg_spacer_length, 140 | max_avg_spacer_length=self.param_max_avg_spacer_length, 141 | min_repeats=self.param_min_repeats) 142 | 143 | bona_fide_not_filtered = self.categories[0] 144 | alternative_not_filtered = self.categories[1] 145 | possible_not_filtered = self.categories[2] 146 | low_score = self.categories[4] 147 | 148 | bona_fide_filtered = {} 149 | alternative_filtered = {} 150 | possible_filtered = {} 151 | 152 | for not_filtered_category, filtered_category in zip([bona_fide_not_filtered, alternative_not_filtered, possible_not_filtered], 153 | [bona_fide_filtered, alternative_filtered, possible_filtered]): 154 | for key, value in not_filtered_category.items(): 155 | for crispr_tuple in value: 156 | crispr = crispr_tuple[1] 157 | if not afsf(crispr): 158 | if key in low_score: 159 | low_score[key].append(crispr_tuple) 160 | else: 161 | low_score[key] = [crispr_tuple] 162 | else: 163 | if key not in filtered_category: 164 | filtered_category[key] = [crispr_tuple] 165 | else: 166 | filtered_category[key].append(crispr_tuple) 167 | 168 | self.categories[0] = bona_fide_filtered 169 | self.categories[1] = alternative_filtered 170 | self.categories[2] = possible_filtered 171 | self.categories[4] = low_score 172 | 173 | def output(self): 174 | return self.categories 175 | -------------------------------------------------------------------------------- /components/module_evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from components.components_evaluation import BulkFeatureExtractor 4 | from components.components_evaluation import FeatureExtractor 5 | from components.components_evaluation import get_full_vector 6 | from components.components_detection_refinement import AdvancedFuzzySearchFilter 7 | 8 | 9 | class ArrayEvaluation: 10 | def __init__(self, dict_crispr_array_candidates, list_ml_classifiers, list_features, parameters, flag_dev_mode): 11 | self.dict_crispr_array_candidates = dict_crispr_array_candidates 12 | self.list_ml_classifiers = list_ml_classifiers 13 | self.list_features = list_features 14 | self.parameters = parameters 15 | self.flag_dev_mode = flag_dev_mode 16 | 17 | self.dict_scored_result = {} 18 | self.dict_scored_result_with_all_vectors = {} 19 | 20 | self.dict_bona_fide = {} 21 | self.dict_alternative = {} 22 | self.dict_possible = {} 23 | self.dict_possible_discarded = {} 24 | self.dict_low_score = {} 25 | 26 | self._load_filter() 27 | self._extract_features_and_evaluate() 28 | self._split_into_categories() 29 | 30 | def _load_filter(self): 31 | self.param_min_avg_repeat_length = self.parameters["param_min_avg_repeat_length"] 32 | self.param_max_avg_repeat_length = self.parameters["param_max_avg_repeat_length"] 33 | self.param_min_avg_spacer_length = self.parameters["param_min_avg_spacer_length"] 34 | self.param_max_avg_spacer_length = self.parameters["param_max_avg_spacer_length"] 35 | self.param_min_repeats = self.parameters["param_min_repeats"] 36 | self.param_max_identical_spacers = self.parameters["param_max_identical_spacers"] 37 | self.param_max_identical_cluster_spacers = self.parameters["param_max_identical_cluster_spacers"] 38 | self. afsf = AdvancedFuzzySearchFilter(min_column_dominance_repeat=0.6, 39 | max_spacer_length=140, max_column_dominance_spacer=0.8, 40 | max_allowed_consecutive_spacers=self.param_max_identical_cluster_spacers, 41 | max_allowed_same_spacers=self.param_max_identical_spacers, 42 | max_inconsistent_columns=5, 43 | min_avg_repeat_length=self.param_min_avg_repeat_length, 44 | max_avg_repeat_length=self.param_max_avg_repeat_length, 45 | min_avg_spacer_length=self.param_min_avg_spacer_length, 46 | max_avg_spacer_length=self.param_max_avg_spacer_length, 47 | min_repeats=self.param_min_repeats) 48 | 49 | def _extract_features_and_evaluate(self): 50 | bfe = BulkFeatureExtractor(self.dict_crispr_array_candidates) 51 | results = bfe.output() 52 | blast_results, orf_results, hmm_results, mfe_results = results 53 | blast_scores_1, blast_scores_2 = blast_results 54 | 55 | list_features = ['repeat_len', 'number_repeats', 'repeat_similarity', 56 | 'at_richness', 'avg_spacer_len', 'spacer_similarity', 57 | 'number_mismatches', 'spacer_evenness'] 58 | 59 | for key, list_crispr_candidates in self.dict_crispr_array_candidates.items(): 60 | self.dict_scored_result[key] = [] 61 | self.dict_scored_result_with_all_vectors[key] = [] 62 | for index, crispr_candidate in enumerate(list_crispr_candidates): 63 | final_score = 0 64 | 65 | feature_vector = FeatureExtractor(0, crispr_candidate, list_features).extract()[0] 66 | 67 | mfe = mfe_results[key][index] 68 | orf = orf_results[key][index] 69 | hmmr = hmm_results[key][index] 70 | blast1 = blast_scores_1[key][index] 71 | blast2 = blast_scores_2[key][index] 72 | 73 | feature_vector_8_incomplete = feature_vector[np.array([2, 4, 5, 6, 7])] 74 | rest_8 = np.asarray([mfe, orf, blast1]) 75 | feature_vector_8 = np.concatenate((feature_vector_8_incomplete, rest_8)) 76 | feature_vector_8 = feature_vector_8.reshape(1, -1) 77 | 78 | feature_vector_9_incomplete = feature_vector[np.array([1, 2, 4, 5, 7])] 79 | rest_9 = np.asarray([mfe, orf, hmmr, blast2]) 80 | feature_vector_9 = np.concatenate((feature_vector_9_incomplete, rest_9)) 81 | feature_vector_9 = feature_vector_9.reshape(1, -1) 82 | 83 | feature_vector_10_incomplete = feature_vector[np.array([0, 2, 3, 4, 5, 6, 7])] 84 | rest_10 = np.asarray([hmmr, blast1, blast2]) 85 | feature_vector_10 = np.concatenate((feature_vector_10_incomplete, rest_10)) 86 | feature_vector_10 = feature_vector_10.reshape(1, -1) 87 | 88 | dict_feature_vectors = {8: feature_vector_8, 89 | 9: feature_vector_9, 90 | 10: feature_vector_10} 91 | 92 | feature_vectors = [] 93 | for ml_classifier, feature_names in zip(self.list_ml_classifiers, self.list_features): 94 | len_features = len(feature_names) 95 | feature_vector = dict_feature_vectors[len_features] 96 | feature_vectors.append(feature_vector) 97 | final_score += ml_classifier.predict_proba(feature_vector)[0][1] 98 | 99 | final_score = final_score / len(self.list_ml_classifiers) 100 | score_crispr_candidate_feature_list = [final_score, crispr_candidate, feature_vectors] 101 | self.dict_scored_result[key].append(score_crispr_candidate_feature_list) 102 | 103 | all_feature_vectors = [feature_vector_8, feature_vector_9, feature_vector_10] 104 | score_crispr_candidate_all_feature_tuple = final_score, crispr_candidate, all_feature_vectors 105 | self.dict_scored_result_with_all_vectors[key].append(score_crispr_candidate_all_feature_tuple) 106 | 107 | def _split_into_categories(self): 108 | for key, data in self.dict_scored_result.items(): 109 | data_pre_possible = [candidate for candidate in data if 0.75 > candidate[0] >= 0.5] 110 | data_alternative = [candidate for candidate in data if candidate[0] >= 0.75] 111 | data_alternative_filtered = [] 112 | data_bad = [candidate for candidate in data if candidate[0] < 0.5] 113 | 114 | if data_alternative: 115 | for element in data_alternative: 116 | crispr = element[1] 117 | if self.afsf(crispr): 118 | data_alternative_filtered.append(element) 119 | else: 120 | data_bad.append(element) 121 | 122 | if data_alternative_filtered: 123 | data_alternative_filtered = sorted(data_alternative_filtered, key=lambda x: x[0], reverse=True) 124 | best_candidate = data_alternative_filtered[0] 125 | data_alternative_filtered = data_alternative_filtered[1:] 126 | 127 | self.dict_bona_fide[key] = [best_candidate] 128 | if data_alternative_filtered: 129 | self.dict_alternative[key] = data_alternative_filtered 130 | 131 | if data_pre_possible: 132 | if key in self.dict_bona_fide: 133 | data_show_in_alternative = [candidate for candidate in data_pre_possible if candidate[0] >= 0.6] 134 | if data_show_in_alternative: 135 | data_show_in_alternative_filtered = [] 136 | for element in data_show_in_alternative: 137 | crispr = element[1] 138 | if self.afsf(crispr): 139 | data_show_in_alternative_filtered.append(element) 140 | else: 141 | data_bad.append(element) 142 | 143 | if key in self.dict_alternative: 144 | self.dict_alternative[key] += data_show_in_alternative_filtered 145 | else: 146 | self.dict_alternative[key] = data_show_in_alternative_filtered 147 | 148 | else: 149 | data_pre_possible = sorted(data_pre_possible, key=lambda x: x[0], reverse=True) 150 | best_possible_candidate = data_pre_possible[0] 151 | possible_discarded = data_pre_possible[1:] 152 | 153 | if self.afsf(best_possible_candidate[1]): 154 | self.dict_possible[key] = [best_possible_candidate] 155 | else: 156 | data_bad.append(best_possible_candidate) 157 | 158 | if possible_discarded: 159 | self.dict_possible_discarded[key] = possible_discarded 160 | 161 | if data_bad: 162 | self.dict_low_score[key] = data_bad 163 | 164 | def _split_into_categories_with_additional_classifier(self): 165 | 166 | for key, data in self.dict_scored_result_with_all_vectors.items(): 167 | data_pre_possible = [candidate for candidate in data if 0.75 > candidate[0] >= 0.5] 168 | data_alternative = [candidate for candidate in data if candidate[0] >= 0.75] 169 | data_alternative_filtered = [] 170 | data_bad = [candidate for candidate in data if candidate[0] < 0.5] 171 | 172 | if data_bad: 173 | self.dict_low_score[key] = data_bad 174 | 175 | if self.flag_possible_differential_model == "possible": 176 | if data_alternative: 177 | data_alternative = sorted(data_alternative, key=lambda x: x[0], reverse=True) 178 | best_candidate = data_alternative[0] 179 | data_alternative = data_alternative[1:] 180 | 181 | self.dict_bona_fide[key] = best_candidate 182 | if data_alternative: 183 | self.dict_alternative[key] = data_alternative 184 | else: 185 | if data_alternative: 186 | for element in data_alternative: 187 | crispr = element[1] 188 | if self.afsf(crispr): 189 | data_alternative_filtered.append(element) 190 | else: 191 | data_pre_possible.append(element) 192 | 193 | data_alternative_filtered = sorted(data_alternative_filtered, key=lambda x: x[0], reverse=True) 194 | best_candidate = data_alternative_filtered[0] 195 | data_alternative_filtered = data_alternative_filtered[1:] 196 | 197 | self.dict_bona_fide[key] = [best_candidate] 198 | if data_alternative_filtered: 199 | self.dict_alternative[key] = data_alternative_filtered 200 | data_alternative = sorted(data_alternative, key=lambda x: x[0], reverse=True) 201 | best_candidate_prev_model = data_alternative[0] 202 | data_alternative_prev_model = data_alternative[1:] 203 | 204 | vectors_alternative = [get_full_vector(data[2]) for data in data_alternative] 205 | scores_new_model = [self.possible_differentiate_model.predict_proba(v)[0][1] for v in 206 | vectors_alternative] 207 | 208 | scores_new_model, data_alternative_sorted = zip(*sorted(zip(scores_new_model, data_alternative), 209 | key=lambda x: x[0], reverse=True)) 210 | 211 | best_candidate = data_alternative_sorted[0] 212 | best_score = scores_new_model[0] 213 | label = 1.0 if best_score >= 0.5 else 0.0 214 | 215 | if label == 1.0: 216 | self.dict_bona_fide[key] = [best_candidate] 217 | alternative = data_alternative_sorted[1:] 218 | if alternative: 219 | self.dict_alternative[key] = alternative 220 | else: 221 | self.dict_bona_fide[key] = [best_candidate_prev_model] 222 | if data_alternative_prev_model: 223 | self.dict_alternative[key] = data_alternative_prev_model 224 | 225 | if data_pre_possible: 226 | data_pre_possible = sorted(data_pre_possible, key=lambda x: x[0], reverse=True) 227 | 228 | vectors_pre_possible = [get_full_vector(data[2]) for data in data_pre_possible] 229 | scores_new_model = [self.possible_differentiate_model.predict_proba(v)[0][1] for v in vectors_pre_possible] 230 | 231 | scores_new_model, data_pre_possible_sorted = zip(*sorted(zip(scores_new_model, data_pre_possible), 232 | key=lambda x: x[0], reverse=True)) 233 | 234 | best_possible_candidate = data_pre_possible_sorted[0] 235 | best_score = scores_new_model[0] 236 | label = 1.0 if best_score >= 0.5 else 0.0 237 | 238 | if label == 1.0: 239 | self.dict_possible[key] = [best_possible_candidate] 240 | possible_discarded = data_pre_possible_sorted[1:] 241 | self.dict_possible_discarded[key] = possible_discarded 242 | else: 243 | possible_discarded = data_pre_possible_sorted 244 | self.dict_possible_discarded[key] = possible_discarded 245 | 246 | def output(self): 247 | return [self.dict_bona_fide, self.dict_alternative, self.dict_possible, 248 | self.dict_possible_discarded, self.dict_low_score] 249 | 250 | -------------------------------------------------------------------------------- /components/module_non_array_computations.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from components.components_non_array_computations import StrandComputation 4 | from components.components_non_array_computations import StrandComputationNew 5 | from components.components_non_array_computations import FullISElementSearch 6 | from components.components_non_array_computations import complete_info_with_cas_identifier 7 | from components.components_non_array_computations import FullLeaderSeqSearch 8 | from components.components_non_array_computations import RevComComputation 9 | 10 | 11 | class NonArrayComputations: 12 | def __init__(self, file_path, categories, flags_non_arrays_computations, flag_dev_mode, absolute_directory_path): 13 | self.file_path = file_path 14 | self.categories = categories 15 | self.flags_non_arrays_computations = flags_non_arrays_computations 16 | self.flag_dev_mode=flag_dev_mode 17 | self.absolute_directory_path = absolute_directory_path 18 | 19 | self.list_of_crisprs_bona_fide = [self.categories[0][key][0][1] for key in sorted(self.categories[0].keys())] 20 | self.list_of_crisprs_alternative = [el[1] for key in self.categories[1].keys() 21 | for el in self.categories[1][key]] 22 | self.list_of_crisprs_possible = [el[1] for key in self.categories[2].keys() 23 | for el in self.categories[2][key]] 24 | 25 | self.hmm_model_is_elements = "tools/hmm_search/models_is_element.hmm" 26 | 27 | self.is_element_result = {} 28 | self.cas_results = {} 29 | self.cassete_results = {} 30 | self.unstructured_cas_result_from_cas_identifier = {} 31 | self.strand_results = {} 32 | self.leader_results = {} 33 | self.downstream_results = {} 34 | self.data_with_all_computations = {} 35 | 36 | self._get_complete_dna() 37 | self._calculate_all_non_array_values() 38 | 39 | def _get_complete_dna(self): 40 | with open(self.file_path, 'r') as f: 41 | lines = f.readlines() 42 | 43 | self.input_header = lines[0] 44 | self.dna = ''.join([line.strip() for line in lines[1:]]) 45 | self.dna_length = len(self.dna) 46 | self.dna = self.dna.upper() 47 | 48 | def _calculate_all_non_array_values(self): 49 | self._calculate_strand() 50 | self._calculate_leader() 51 | 52 | if self.flags_non_arrays_computations["flag_cas"]: 53 | self._calculate_cas_proteins() 54 | if self.flags_non_arrays_computations["flag_is"]: 55 | self._calculate_is_elements() 56 | 57 | self.data_with_all_computations = {"IS": self.is_element_result, 58 | "Cas": self.cas_results, 59 | "Strand": self.strand_results, 60 | "Leader": [self.leader_results_bona_fide, self.leader_results_alternative, self.leader_results_possible], 61 | "Downstream": [self.downstream_results_bona_fide, self.downstream_results_alternative, self.downstream_results_possible], 62 | "Unstructured_Cas":self.unstructured_cas_result_from_cas_identifier, 63 | "Cassettes": self.cassete_results} 64 | 65 | def _calculate_is_elements(self): 66 | fies = FullISElementSearch(full_dna=self.dna, list_of_crisprs=self.list_of_crisprs_bona_fide, 67 | hmm_model=self.hmm_model_is_elements, min_similarity=0.9, min_coverage=0.9) 68 | 69 | self.is_element_result = fies.output() 70 | 71 | def _calculate_cas_proteins(self): 72 | def _get_crispr_intervals(): 73 | intervals = [(x.compute_stats()["start"], x.compute_stats()["end"]) for x in self.list_of_crisprs_bona_fide] 74 | return intervals 75 | 76 | def _filter_cas_genes(intervals, dict_cas_genes): 77 | dict_filtered_cas_intervals = {} 78 | for key, value in dict_cas_genes.items(): 79 | for interval in intervals: 80 | if interval[0] <= key[0] < interval[1]: 81 | break 82 | if interval[0] <= key[1] < interval[1]: 83 | break 84 | else: 85 | dict_filtered_cas_intervals[key] = value 86 | 87 | return dict_filtered_cas_intervals 88 | 89 | def _cluster_cas_genes(dict_cas_genes): 90 | list_clusters = [] 91 | cluster = [] 92 | for key in sorted(dict_cas_genes.keys()): 93 | value = dict_cas_genes[key] 94 | new_candidate = key[0], key[1], value 95 | if not cluster: 96 | cluster.append(new_candidate) 97 | elif abs(cluster[-1][1] - new_candidate[0]) < 500: 98 | cluster.append(new_candidate) 99 | else: 100 | list_clusters.append(cluster) 101 | cluster = [new_candidate] 102 | 103 | if cluster: 104 | list_clusters.append(cluster) 105 | 106 | return list_clusters 107 | 108 | def _clusters_to_simple_representation(list_clusters): 109 | list_simple_clusters = [] 110 | for cluster in list_clusters: 111 | cluster_start = cluster[0][0] 112 | cluster_end = cluster[-1][1] 113 | list_cas_gene_descriptions = [x[2] for x in cluster] 114 | list_simple_clusters.append((cluster_start, cluster_end, list_cas_gene_descriptions)) 115 | return list_simple_clusters 116 | 117 | def _compute_allowed_intervals(crispr_intervals): 118 | allowed_interwals = [] 119 | if not crispr_intervals: 120 | return [(0, math.inf)] 121 | else: 122 | allowed_interwals.append((0, crispr_intervals[0][0])) 123 | for index in range(len(crispr_intervals) - 1): 124 | allowed_interwals.append((crispr_intervals[index][1], crispr_intervals[index+1][0])) 125 | allowed_interwals.append((crispr_intervals[-1][1], math.inf)) 126 | return allowed_interwals 127 | 128 | def _group_by_output(allowed_intervals, list_simple_clusters): 129 | dict_cas_gene_order = {} 130 | for cluster in list_simple_clusters: 131 | for index, allowed_interval in enumerate(allowed_intervals): 132 | if allowed_interval[0] <= cluster[0] < allowed_interval[1]: 133 | if index in dict_cas_gene_order: 134 | dict_cas_gene_order[index].append(cluster) 135 | else: 136 | dict_cas_gene_order[index] = [cluster] 137 | break 138 | return dict_cas_gene_order 139 | 140 | def _group_by_output_separated(allowed_intervals, regular_clusters): 141 | dict_cas_gene_order_for_separated = {} 142 | for cluster in regular_clusters: 143 | for index, allowed_interval in enumerate(allowed_intervals): 144 | if allowed_interval[0] <= cluster[0][0] < allowed_interval[1]: 145 | if index in dict_cas_gene_order_for_separated: 146 | dict_cas_gene_order_for_separated[index].append(cluster) 147 | else: 148 | dict_cas_gene_order_for_separated[index] = [cluster] 149 | break 150 | return dict_cas_gene_order_for_separated 151 | 152 | dict_cas_genes, dict_cassete_labels = complete_info_with_cas_identifier(self.file_path, 153 | self.absolute_directory_path) 154 | 155 | self.cassete_results = dict_cassete_labels 156 | self.unstructured_cas_result_from_cas_identifier = dict_cas_genes 157 | 158 | intervals = _get_crispr_intervals() 159 | allowed_intervals = _compute_allowed_intervals(intervals) 160 | dict_filtered_cas_genes = _filter_cas_genes(intervals, dict_cas_genes) 161 | clustered_cas_genes = _cluster_cas_genes(dict_filtered_cas_genes) 162 | 163 | simple_clusters = _clusters_to_simple_representation(clustered_cas_genes) 164 | dict_groups = _group_by_output(allowed_intervals, simple_clusters) 165 | #dict_groups_separated = _group_by_output_separated(allowed_intervals, clustered_cas_genes) 166 | 167 | self.cas_results = dict_groups 168 | 169 | def _calculate_strand(self): 170 | if self.flags_non_arrays_computations["flag_strand"]: 171 | st = StrandComputationNew(list_of_crisprs=self.list_of_crisprs_bona_fide, 172 | absolute_directory_path=self.absolute_directory_path) 173 | self.strand_results["Bona-fide"] = st.output() 174 | st = StrandComputationNew(list_of_crisprs=self.list_of_crisprs_alternative, 175 | absolute_directory_path=self.absolute_directory_path) 176 | self.strand_results["Alternative"] = st.output() 177 | st = StrandComputationNew(list_of_crisprs=self.list_of_crisprs_possible, 178 | absolute_directory_path=self.absolute_directory_path) 179 | self.strand_results["Possible"] = st.output() 180 | 181 | 182 | #except Exception: 183 | # st = StrandComputation(list_of_crisprs=self.list_of_crisprs_bona_fide, 184 | # absolute_directory_path=self.absolute_directory_path) 185 | # self.strand_results["Bona-fide"] = st.output() 186 | # st = StrandComputation(list_of_crisprs=self.list_of_crisprs_alternative, 187 | # absolute_directory_path=self.absolute_directory_path) 188 | # self.strand_results["Alternative"] = st.output() 189 | # st = StrandComputation(list_of_crisprs=self.list_of_crisprs_possible, 190 | # absolute_directory_path=self.absolute_directory_path) 191 | # self.strand_results["Possible"] = st.output() 192 | else: 193 | self.strand_results["Bona-fide"] = {index: "Forward (Orientation was not computed)" 194 | for index in range(len(self.list_of_crisprs_bona_fide))} 195 | self.strand_results["Alternative"] = {index: "Forward (Orientation was not computed)" 196 | for index in range(len(self.list_of_crisprs_alternative))} 197 | self.strand_results["Possible"] = {index: "Forward (Orientation was not computed)" 198 | for index in range(len(self.list_of_crisprs_possible))} 199 | 200 | def _calculate_leader(self): 201 | flss_bona_fide = FullLeaderSeqSearch(self.list_of_crisprs_bona_fide, self.strand_results["Bona-fide"], self.dna) 202 | self.leader_results_bona_fide, self.downstream_results_bona_fide = flss_bona_fide.output() 203 | 204 | flss_alternative = FullLeaderSeqSearch(self.list_of_crisprs_alternative, self.strand_results["Alternative"], 205 | self.dna) 206 | self.leader_results_alternative, self.downstream_results_alternative = flss_alternative.output() 207 | 208 | flss_possible = FullLeaderSeqSearch(self.list_of_crisprs_possible, self.strand_results["Possible"], self.dna) 209 | self.leader_results_possible, self.downstream_results_possible = flss_possible.output() 210 | 211 | def output(self): 212 | return self.data_with_all_computations 213 | -------------------------------------------------------------------------------- /components/module_output_maker.py: -------------------------------------------------------------------------------- 1 | from components.components_output_maker import SimpleOutputMaker 2 | from components.components_output_maker import SummaryOutputMaker 3 | from components.components_output_maker import SummaryMakerCSV 4 | from components.components_output_maker import PickleOutputMaker 5 | from components.components_output_maker import CasSummaryMaker 6 | from components.components_output_maker import FastaOutputArrayMaker 7 | from components.components_output_maker import JsonOutputMaker 8 | 9 | from components.components_output_maker import CompleteFastaOutputMaker 10 | from components.components_output_maker import CompleteFolderSummaryMaker 11 | from components.components_output_maker import CompleteCasSummaryFolderMaker 12 | from components.components_output_maker import SpacerSummaryMaker 13 | from components.components_output_maker import CompleteSpacerCSVMaker 14 | 15 | 16 | class OutputMaker: 17 | def __init__(self, file_path, parameters, flags, result_path, pickle_result_path, 18 | json_result_path, categories, non_array_data, list_features, header): 19 | self.file_path = file_path 20 | self.parameters = parameters 21 | self.flags = flags 22 | self.result_path = result_path 23 | self.pickle_result_path = pickle_result_path 24 | self.json_result_path = json_result_path 25 | self.categories = categories 26 | self.non_array_data = non_array_data 27 | self.list_features = list_features 28 | self.header = header 29 | self.global_result_folder = "/".join(self.result_path.split("/")[:-1]) 30 | 31 | self._make_output() 32 | 33 | def _make_output(self): 34 | som = SimpleOutputMaker(categories=self.categories, 35 | result_path=self.result_path, 36 | non_array_data=self.non_array_data, 37 | list_features=self.list_features) 38 | 39 | suom = SummaryOutputMaker(result_path=self.result_path, 40 | categories=self.categories, 41 | non_array_data=self.non_array_data, 42 | header=self.header, 43 | list_feature_names=self.list_features) 44 | 45 | ssm = SpacerSummaryMaker(categories=self.categories, 46 | result_path=self.result_path) 47 | 48 | sm_csv = SummaryMakerCSV(result_path=self.result_path, 49 | categories=self.categories, 50 | non_array_data=self.non_array_data) 51 | 52 | if self.flags["flag_cas"] is True: 53 | sm_cas = CasSummaryMaker(result_path=self.result_path, 54 | non_array_data=self.non_array_data) 55 | 56 | 57 | #cfsm = CompleteFolderSummaryMaker(folder_result=self.global_result_folder) 58 | #ccfsm = CompleteCasSummaryFolderMaker(folder_result=self.global_result_folder) 59 | 60 | if self.flags["flag_fasta_report"] is True: 61 | foam = FastaOutputArrayMaker(folder_result=self.result_path, 62 | categories=self.categories, 63 | non_array_data=self.non_array_data) 64 | 65 | #cfom = CompleteFastaOutputMaker(folder_result=self.global_result_folder) 66 | 67 | if self.pickle_result_path: 68 | pom = PickleOutputMaker(file_path=self.file_path, 69 | pickle_result_folder=self.pickle_result_path, 70 | parameters=self.parameters, 71 | categories=self.categories, 72 | non_array_data=self.non_array_data, 73 | header=self.header, 74 | list_feature_names=self.list_features) 75 | 76 | if self.json_result_path: 77 | jom = JsonOutputMaker(file_path=self.file_path, 78 | json_result_folder=self.json_result_path, 79 | categories=self.categories, 80 | non_array_data=self.non_array_data, 81 | list_feature_names=self.non_array_data) 82 | -------------------------------------------------------------------------------- /components/pipeline.py: -------------------------------------------------------------------------------- 1 | from components.module_detection import Detection 2 | from components.module_detection_refinement import DetectionRefinement 3 | from components.module_evaluation import ArrayEvaluation 4 | from components.module_evaluated_arrays_enhancement import EvaluatedArraysEnhancement 5 | from components.module_non_array_computations import NonArrayComputations 6 | from components.module_output_maker import OutputMaker 7 | 8 | 9 | class Pipeline: 10 | def __init__(self, result_folder_path, pickle_folder_path, json_folder_path, file_path, 11 | list_ml_classifiers, list_features, parameters, flags, flag_dev_mode, absolute_directory_path): 12 | self.result_folder_path = result_folder_path + "/" + file_path.split("/")[-1].split(".")[0] 13 | self.pickle_folder_path = pickle_folder_path 14 | self.json_folder_path = json_folder_path 15 | self.file_path = file_path 16 | self.list_ml_classifiers = list_ml_classifiers 17 | self.list_features = [features.strip().split(".") for features in list_features] 18 | self.flags = flags 19 | self.parameters = parameters 20 | self.flag_dev_mode = flag_dev_mode 21 | self.absolute_directory_path = absolute_directory_path 22 | 23 | self.header = None 24 | self.dict_fuzzy_crisprs = {} 25 | self.dict_crispr_candidates = {} 26 | self.categories = {} 27 | self.non_array_data = {} 28 | 29 | self._get_header() 30 | self._run_detection() 31 | self._run_detection_refinement() 32 | self._run_evaluation() 33 | self._results_enhancement() 34 | self._run_non_crispr_computation() 35 | self._write_output() 36 | 37 | def _get_header(self): 38 | with open(self.file_path) as f: 39 | self.header = f.readline() 40 | 41 | def _run_detection(self): 42 | print("1. Run initial array detection") 43 | detection = Detection(file_path=self.file_path, 44 | flags=self.flags, 45 | parameters=self.parameters, 46 | flag_dev_mode=self.flag_dev_mode) 47 | self.dict_fuzzy_crisprs = detection.output() 48 | 49 | def _run_detection_refinement(self): 50 | print("2. Refine detected arrays") 51 | det_ref = DetectionRefinement(dict_fuzzy_crisprs=self.dict_fuzzy_crisprs, 52 | parameters=self.parameters, 53 | flag_dev_mode=self.flag_dev_mode) 54 | self.dict_crispr_candidates = det_ref.output() 55 | 56 | def _run_evaluation(self): 57 | print("3. Evaluate candidates") 58 | ae = ArrayEvaluation(dict_crispr_array_candidates=self.dict_crispr_candidates, 59 | list_ml_classifiers=self.list_ml_classifiers, 60 | list_features=self.list_features, 61 | parameters=self.parameters, 62 | flag_dev_mode=self.flag_dev_mode) 63 | self.categories = ae.output() 64 | 65 | def _results_enhancement(self): 66 | print("4. Enhance evaluated arrays") 67 | a_enh = EvaluatedArraysEnhancement(file_path=self.file_path, 68 | categories=self.categories, 69 | parameters=self.parameters, 70 | flag_dev_mode=self.flag_dev_mode) 71 | self.categories = a_enh.output() 72 | 73 | def _run_non_crispr_computation(self): 74 | print("5. Complement arrays with additional info") 75 | nac = NonArrayComputations(file_path=self.file_path, 76 | categories=self.categories, 77 | flags_non_arrays_computations=self.flags, 78 | flag_dev_mode=self.flag_dev_mode, 79 | absolute_directory_path=self.absolute_directory_path) 80 | self.non_array_data = nac.output() 81 | 82 | def _write_output(self): 83 | print("6. Write down the results") 84 | om = OutputMaker(file_path=self.file_path, 85 | parameters=self.parameters, 86 | flags=self.flags, 87 | result_path=self.result_folder_path, 88 | pickle_result_path=self.pickle_folder_path, 89 | json_result_path=self.json_folder_path, 90 | categories=self.categories, 91 | non_array_data=self.non_array_data, 92 | list_features=self.list_features, 93 | header=self.header) 94 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: crispr_identify_env 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - nodefaults 6 | - biobuilds 7 | - r 8 | - axfeh 9 | dependencies: 10 | - python==3.7.6 11 | - pip 12 | - python_abi=3.7 13 | - biopython=1.76 14 | - h5py=2.10.0 15 | - hdf5=1.10.6 16 | - hmmer=3.3 17 | - numpy==1.18.1 18 | - pandas=1.0.3 19 | - matplotlib=3.1.3 20 | - perl=5.26.2 21 | - perl-compress-bgzf=0.005 22 | - perl-threaded=5.26.0 23 | - perl-yaml=1.29 24 | - prodigal==2.6.3 25 | - dill=0.3.3 26 | - protobuf=3.13.0.1 27 | - regex=2019.03.09 28 | - pyasn1=0.4.8 29 | - pycparser=2.20 30 | - networkx=2.5 31 | - pyjwt=1.7.1 32 | - pyparsing=2.4.7 33 | - pyqt=5.9.2 34 | - pysocks=1.7.1 35 | - python-dateutil=2.8.1 36 | - pytz=2020.1 37 | - pyyaml=5.3.1 38 | - scikit-learn==0.22.1 39 | - scipy=1.4.1 40 | - anaconda::tensorflow==2.3.0 41 | - tensorboard==2.3.0 42 | - tensorboard-plugin-wit==1.6.0 43 | - viennarna==2.4.15 44 | - pyopenssl=22.0.0 45 | - certifi=2022.12.7 46 | - vmatch==2.3.0 47 | - clustalo==1.2.3 48 | - blast==2.5.0 49 | - keras==2.4.3 50 | - libffi=3.2.1 51 | - spacerplacer 52 | - pip: 53 | - python-Levenshtein 54 | -------------------------------------------------------------------------------- /tools/CRISPRcasIdentifier/README.txt: -------------------------------------------------------------------------------- 1 | link the CRISPRcasIdentifier folder here, such that you have a call like the following 2 | 3 | CRISPRidentify/tools/CRISPRcasIdentifier/CRISPRcasIdentifier/CRISPRcasIdentifier.py 4 | -------------------------------------------------------------------------------- /tools/blasting/Verified_repeats_dataset1.fa: -------------------------------------------------------------------------------- 1 | >db1_1 2 | GATAATCTCTTATAGAATTGAAAG 3 | >db1_2 4 | GTTTTTATCGTACCTATGAGGAATTGAAAC 5 | >db1_3 6 | GTTTCAGACGAACCCTTGTGGGATTGAAGC 7 | >db1_4 8 | GTTTCAGACGAACCCTTGTGGGGTTGAAGC 9 | >db1_5 10 | GTTTCAGACGAACCCTTGTGGGTTTGAAGC 11 | >db1_6 12 | GATTAATCCCAAAAGGAATTGAAAG 13 | >db1_7 14 | GTCGCGTCCTCACGGGCGCGTGGATTGAAAC 15 | >db1_8 16 | GAGTTCCCCGCGCCAGCGGGGATAAACCG 17 | >db1_9 18 | GTGTTCCCCGCGCCAGCGGGGATAAACCG 19 | >db1_10 20 | GTTCACTGCCGTGTAGGCAGCTAAGAAA 21 | >db1_11 22 | GTTCACTGCCGTACAGGCAGCTTAGAAA 23 | >db1_12 24 | GTTGAAGTGGTACTTCCAGTAAAACAAGGATTGAAAC 25 | >db1_13 26 | CTAAAAGAATAACTTGCAAAATAACAAGCATTGAAAC 27 | >db1_14 28 | CTTTCCTTCTACTAATCCCGGCGATCGGGACTGAAAC 29 | >db1_15 30 | GTTTTAGAGCTATGCTGTTTTGAATGGTCCCAAAAC 31 | >db1_16 32 | GTTGTAGCTCCCTTTCTCATTTCGCAGTGCTACAAT 33 | >db1_17 34 | GTTTTAGTCCCTTTTTAAATTTCTTTATGGTAAAAT 35 | >db1_18 36 | GTTCCAATAAGACTAAAATAGAATTGAAAG 37 | >db1_19 38 | GATCGATACCCACCCCGAAGAAAAGGGGACGAGAAC 39 | >db1_20 40 | GTTCAACACCCTCTTTTCCCCGTCAGGGGACTGAAAC 41 | >db1_21 42 | GTCTCCACTCGTAGGAGAAATTAATTGATTGGAAAC 43 | >db1_22 44 | GAACAACTCAAAAGAGAATTGCAAG 45 | >db1_23 46 | ATTAAAATCAGACCGTTTCGGAATGGAAAT 47 | >db1_24 48 | GTTTTATATTAACTAAGTGGTATGTAAAG 49 | >db1_25 50 | GAATCTCAAAAAGAGGATTGAAAG 51 | >db1_26 52 | GTGGAAATCAAAAGATAGTAGAAAC 53 | >db1_27 54 | GGTTTTAGTACTCTGTAATTTTAG 55 | -------------------------------------------------------------------------------- /tools/blasting/Verified_repeats_dataset1.fa.nhr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/blasting/Verified_repeats_dataset1.fa.nhr -------------------------------------------------------------------------------- /tools/blasting/Verified_repeats_dataset1.fa.nin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/blasting/Verified_repeats_dataset1.fa.nin -------------------------------------------------------------------------------- /tools/blasting/Verified_repeats_dataset1.fa.nog: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/blasting/Verified_repeats_dataset1.fa.nog -------------------------------------------------------------------------------- /tools/blasting/Verified_repeats_dataset1.fa.nsd: -------------------------------------------------------------------------------- 1 | db1_10 2 | db1_109 3 | db1_1110 4 | db1_1211 5 | db1_1312 6 | db1_1413 7 | db1_1514 8 | db1_1615 9 | db1_1716 10 | db1_1817 11 | db1_1918 12 | db1_21 13 | db1_2019 14 | db1_2120 15 | db1_2221 16 | db1_2322 17 | db1_2423 18 | db1_2524 19 | db1_2625 20 | db1_2726 21 | db1_32 22 | db1_43 23 | db1_54 24 | db1_65 25 | db1_76 26 | db1_87 27 | db1_98 28 | lcl|db1_10 29 | lcl|db1_109 30 | lcl|db1_1110 31 | lcl|db1_1211 32 | lcl|db1_1312 33 | lcl|db1_1413 34 | lcl|db1_1514 35 | lcl|db1_1615 36 | lcl|db1_1716 37 | lcl|db1_1817 38 | lcl|db1_1918 39 | lcl|db1_21 40 | lcl|db1_2019 41 | lcl|db1_2120 42 | lcl|db1_2221 43 | lcl|db1_2322 44 | lcl|db1_2423 45 | lcl|db1_2524 46 | lcl|db1_2625 47 | lcl|db1_2726 48 | lcl|db1_32 49 | lcl|db1_43 50 | lcl|db1_54 51 | lcl|db1_65 52 | lcl|db1_76 53 | lcl|db1_87 54 | lcl|db1_98 55 | -------------------------------------------------------------------------------- /tools/blasting/Verified_repeats_dataset1.fa.nsi: -------------------------------------------------------------------------------- 1 | b6@b4db2_1 2 | CTAAAAGAATAACTTGCAAAATAACAAGCATTGAAAC 3 | >db2_2 4 | GTTGAAGTGGTACTTCCAGTAAAACAAGGATTGAAAC 5 | >db2_3 6 | GTTTCAATCCACGCGCCCGTGAGGACGCGAC 7 | >db2_4 8 | GTTCCAATAAGACTAAAATAGAATTGAAAG 9 | >db2_5 10 | GTTTTTATCGTACCTATGAGGAATTGAAAC 11 | >db2_6 12 | GATTAATCCCAAAAGGAATTGAAAG 13 | >db2_7 14 | GATTAATCCTAAAAGGAATTGAAAG 15 | >db2_8 16 | CTTTCAATTCCTTTTGGGATTCATC 17 | >db2_9 18 | GAATCTCAAAAAGAGGATTGAAAG 19 | >db2_10 20 | CTTTCAATTCTATAAGAGATTATC 21 | >db2_11 22 | CTAAAAGAATAACTTGCAAAATAACAAGCATTGAAAC 23 | >db2_12 24 | GTTGAAGTGGTACTTCCAGTAAAACAAGGATTGAAAC 25 | >db2_13 26 | GTTGAAGAGGTACTTCCAGTAAAACAAGGATTGAAAC 27 | >db2_14 28 | GTTTCAATCCACGCGCCCGTGAGGACGCGAC 29 | >db2_15 30 | GTCGCGCCCTCACGGGCGCGTGGATTGAAAC 31 | >db2_16 32 | GTTTGTATCGTACCTATGAGGAATTGAAAC 33 | >db2_17 34 | CTTTCAATTCTCTTTTAGTCTTATTGGAAC 35 | >db2_18 36 | GTTACAATAAGACTAAAATAGAATTGAAAG 37 | >db2_19 38 | ATTTCAATTCCTCATAGGTACGATAAAAAC 39 | >db2_20 40 | GTTCCAATAAGACTAAAATAGAATTGAAAG 41 | >db2_21 42 | GTTTTTATCGTACCTATGAGGAATTGAAAC 43 | >db2_22 44 | GTTTTTATCTTACCTATGAGGAATTGAAAC 45 | >db2_23 46 | GTTTCAATTCCTCATAGGTACGATCAAAAC 47 | >db2_24 48 | GTTCCAATAAGACTAAAATAGAATTGAAA 49 | >db2_25 50 | GATTAATCCCAAAAGGAATTGAAAG 51 | >db2_26 52 | GATTAATCCTAAAAGGAATTGAAAG 53 | >db2_27 54 | CTTTCAATTCCTTTTGGGATTCATC 55 | >db2_28 56 | CTTTCAATCCCTTTTGGGATTCATC 57 | >db2_29 58 | GAATCTCAAAAAGAGGATTGAAAG 59 | >db2_30 60 | CTTTCAATTCTATAAGAGATTATC 61 | >db2_31 62 | CTTTCAATCCTCTCTTTGAGATTC 63 | >db2_32 64 | CTTTCAATCCTCTTCTTGAGATTC 65 | >db2_33 66 | GTATCTCAAAAAGAGGATTGAAAG 67 | >db2_34 68 | CTAAAAGAATAACTTGCAAAATAACAAGCATTGAAAC 69 | >db2_35 70 | GTTGAAGTGGTACTTCCAGTAAAACAAGGATTGAAAC 71 | >db2_36 72 | GTTGAAGAGGTACTTCCAGTAAAACAAGGATTGAAAC 73 | >db2_37 74 | GTTTCAATCCACGCGCCCGTGAGGACGCGAC 75 | >db2_38 76 | GTTTCAATTCCTCATAGGTACGATCAAAACG 77 | >db2_39 78 | GTCGCGCCCTCACGGGCGCGTGGATTGAAAC 79 | >db2_40 80 | CTTTCAATTCTTTTGTAGTCTTATTGGAAC 81 | >db2_41 82 | GTTTGTATTTTACCTATGAGGAATTGAAAC 83 | >db2_42 84 | GTTTGTATCGTACCTATGAGGAATTGAAAC 85 | >db2_43 86 | CTTTCAATTCTCTTTTAGTCTTATTGGAAC 87 | >db2_44 88 | GTTACAATAAGACTAAAATAGAATTGAAAG 89 | >db2_45 90 | GTTTTTAGACTACCTATGAGGAATTGAAAC 91 | >db2_46 92 | ATTTCAATTCCTCATAGGTACGATACAAAC 93 | >db2_47 94 | GTTTTTAGAGTGCCTATGAGGAATTGAAAC 95 | >db2_48 96 | GTTCCAATAAGACTCTAAGAGAATTGAAAG 97 | >db2_49 98 | ATTTCAATTCCTCATAGGTACGATAAAAAC 99 | >db2_50 100 | GTTCCAATAAGACTCCAAGAGAATTGAAAG 101 | >db2_51 102 | CTTTCAATTCTCTTTGAGTCTTATTGGAAC 103 | >db2_52 104 | GTTCCAATAAGACTAAAATAGAATTGAAAG 105 | >db2_53 106 | GTTTTGAGCCTACCTATGAGGAATTGAAAC 107 | >db2_54 108 | GTTCCAATAAGACTATAAGAGAATTGAAAG 109 | >db2_55 110 | GTTTTTATCGTACCTATGAGGAATTGAAAC 111 | >db2_56 112 | GTTTCAATTCCTCATAGGTACGCTGAAAAC 113 | >db2_57 114 | GTTTCAATTCCTGATAGGTAGGCTAAAAAC 115 | >db2_58 116 | GTTTGTATCGTACCTATGAGGGATTGAAAC 117 | >db2_59 118 | GTTTCAATTCCTCATAGGTAAGCTAACAAC 119 | >db2_60 120 | GTTTCAATTCCTCATAGGTACGCTGAGAAC 121 | >db2_61 122 | GTTTTTATCTTACCTATGAGGAATTGAAAC 123 | >db2_62 124 | GTTCCAATAAGACTTTAAAAGAATTGAAAG 125 | >db2_63 126 | CTTTCAATTCTATTTTGGTCTTATTGTAAC 127 | >db2_64 128 | GTTTGTATCTTAACTATGAGGAATTGAAAC 129 | >db2_65 130 | GTTTCAATTCCTCATAGGTACGATCAAAAC 131 | >db2_66 132 | GTTTTTAGCTTACCTATGAGGGATTGAAAC 133 | >db2_67 134 | GTTTGTATCTTACCTATGAGGAATTGAAAC 135 | >db2_68 136 | GTTCCAATAAGACTAAAATAGAATTGAAA 137 | >db2_69 138 | GTTTTTATCGTACCTATGAGGGATTGAAA 139 | >db2_70 140 | GTTTTTATCTTACCTATGAGGAATTGAAA 141 | >db2_71 142 | GATTAATCCCAAAAGGAATTGAAAG 143 | >db2_72 144 | GATTAATCCTAAAAGGAATTGAAAG 145 | >db2_73 146 | CTTTCAATTCCTTTTGGGATTCATC 147 | >db2_74 148 | CTTTCAATCCCTTTTGGGATTCATC 149 | >db2_75 150 | GATAATCTACTATAGAATTGAAAG 151 | >db2_76 152 | GAATCTCAAAAAGAGGATTGAAAG 153 | >db2_77 154 | CTTTCAATTCTATAAGAGATTATC 155 | >db2_78 156 | CTTTCAATCCTCTCTTTGAGATTC 157 | >db2_79 158 | CTTTCAATCCTCTTCTTGAGATTC 159 | >db2_80 160 | GTATCTCAAAAAGAGGATTGAAAG 161 | >db2_81 162 | CTAAAAGAATAACTTGCAAAATAACAAGCATTGAAAC 163 | >db2_82 164 | GTTGAAGTGGTACTTCCAGTAAAACAAGGATTGAAAC 165 | >db2_83 166 | GTTGAAGAGGTACTTCCAGTAAAACAAGGATTGAAAC 167 | >db2_84 168 | GTTTCAATCCACGCGCCCGTGAGGACGCGAC 169 | >db2_85 170 | GTTTCAATTCCTCATAGGTACGATCAAAACG 171 | >db2_86 172 | TCTTTCAATTCTATTTTGGTCTTATTGTAAC 173 | >db2_87 174 | GTCGCGCTCTCACGAGCGCGTGGATTGAAAC 175 | >db2_88 176 | GTTTCAATCCACGCCCCCGTGAGGGAGCGAC 177 | >db2_89 178 | GTCGCGCCCTCACGGGCGCGTGGATTGAAAC 179 | >db2_90 180 | CTTTCAATTCTTTTGTAGTCTTATTGGAAC 181 | >db2_91 182 | GTTTGTATTTTACCTATGAGGAATTGAAAC 183 | >db2_92 184 | GTTTGTATCGTACCTATGAGGAATTGAAAC 185 | >db2_93 186 | CTTTCAATTCTCTTTTAGTCTTATTGGAAC 187 | >db2_94 188 | GTTACAATAAGACTAAAATAGAATTGAAAG 189 | >db2_95 190 | GTTTTTAGACTACCTATGAGGAATTGAAAC 191 | >db2_96 192 | ATTTCAATTCCTCATAGGTACGATACAAAC 193 | >db2_97 194 | GTTTTTAGAGTGCCTATGAGGAATTGAAAC 195 | >db2_98 196 | GTTCCAATAAGACTCTAAGAGAATTGAAAG 197 | >db2_99 198 | ATTTCAATTCCTCATAGGTACGATAAAAAC 199 | >db2_100 200 | GTTCCAATAAGACTCCAAGAGAATTGAAAG 201 | >db2_101 202 | CTTTCAATTCTCTTTGAGTCTTATTGGAAC 203 | >db2_102 204 | GTTCCAATAAGACTAAAATAGAATTGAAAG 205 | >db2_103 206 | GTTTTGAGCCTACCTATGAGGAATTGAAAC 207 | >db2_104 208 | GTTTTTATCGTACCTATGAGGAATTGAAAC 209 | >db2_105 210 | GTTCCAATAAGACTATAAGAGAATTGAAAG 211 | >db2_106 212 | GTTTCAATTCCTCATAGGTACGCTGAAAAC 213 | >db2_107 214 | GTTTCAATTCCTGATAGGTAGGCTAAAAAC 215 | >db2_108 216 | GTTTGTATCGTACCTATGAGGGATTGAAAC 217 | >db2_109 218 | GTTTCAATTCCTCATAGGTAAGCTAACAAC 219 | >db2_110 220 | GTTTCAATTCCTCATAGGTACGCTGAGAAC 221 | >db2_111 222 | GTTTTTATCTTACCTATGAGGAATTGAAAC 223 | >db2_112 224 | GTTCCAATAAGACTTTAAAAGAATTGAAAG 225 | >db2_113 226 | GTTTGTATCTTAACTATGAGGAATTGAAAC 227 | >db2_114 228 | CTTTCAATTCTATTTTGGTCTTATTGTAAC 229 | >db2_115 230 | GTTTCAATTCCTCATAGGTACGATCAAAAC 231 | >db2_116 232 | GTTTTTAGCTTACCTATGAGGGATTGAAAC 233 | >db2_117 234 | GTTTGTATCTTACCTATGAGGAATTGAAAC 235 | >db2_118 236 | GTTCCAATAAGACTAAAATAGAATTGAAA 237 | >db2_119 238 | GTTTTTATCGTACCTATGAGGGATTGAAA 239 | >db2_120 240 | GTTTTTATCTTACCTATGAGGAATTGAAA 241 | >db2_121 242 | GATTAATCCCAAAAGGAATTGAAAG 243 | >db2_122 244 | GATTAATCCTAAAAGGAATTGAAAG 245 | >db2_123 246 | CTTTCAATTCCTTTTGGGATTCATC 247 | >db2_124 248 | CTTTCAATCCCTTTTGGGATTCATC 249 | >db2_125 250 | GAATCTCAAAAAGAGGATTGAAAG 251 | >db2_126 252 | GATAATCTACTATAGAATTGAAAG 253 | >db2_127 254 | GAATCTCAAGTTGAGGATTGAAAG 255 | >db2_128 256 | CTTTCAATTCTATAAGAGATTATC 257 | >db2_129 258 | CTTTCAATCCTCTCTTTGAGATTC 259 | >db2_130 260 | CTTTCAATCCTCTTCTTGAGATTC 261 | >db2_131 262 | GTATCTCAAAAAGAGGATTGAAAG 263 | >db2_132 264 | CTAAAAGAATAACTTGCAAAATAACAAGCATTGAAAC 265 | >db2_133 266 | GTTGAAGTGGTACTTCCAGTAAAACAAGGATTGAAAC 267 | >db2_134 268 | GTTGAAGAGGTACTTCCAGTAAAACAAGGATTGAAAC 269 | >db2_135 270 | GTTTCAATCCTTATTTTACTGGATGTTCTACTTCAAC 271 | >db2_136 272 | TGTTTCAATTCCTCATAGGTACGCTGAAAACC 273 | >db2_137 274 | GTTTCAATCCACGCGCCCGTGAGGACGCGAC 275 | >db2_138 276 | GTTTCAATTCCTCATAGGTACGATCAAAACG 277 | >db2_139 278 | TCTTTCAATTCTATTTTGGTCTTATTGTAAC 279 | >db2_140 280 | GTCGCGCTCTCACGAGCGCGTGGATTGAAAC 281 | >db2_141 282 | GTTTCAATCCACGCCCCCGTGAGGGAGCGAC 283 | >db2_142 284 | GTCGCGCCCTCACGGGCGCGTGGATTGAAAC 285 | >db2_143 286 | CTTTCAATTCTTTTGTAGTCTTATTGGAAC 287 | >db2_144 288 | GTTTGTATTTTACCTATGAGGAATTGAAAC 289 | >db2_145 290 | GTTTGTATCGTACCTATGAGGAATTGAAAC 291 | >db2_146 292 | CTTTCAATTCTCTTTTAGTCTTATTGGAAC 293 | >db2_147 294 | GTTACAATAAGACTAAAATAGAATTGAAAG 295 | >db2_148 296 | GTTTTTAGACTACCTATGAGGAATTGAAAC 297 | >db2_149 298 | ATTTCAATTCCTCATAGGTACGATACAAAC 299 | >db2_150 300 | GTTTCAATTCCTCTTAGGCACTCTAAAAAC 301 | >db2_151 302 | GTTTTTAGAGTGCCTATGAGGAATTGAAAC 303 | >db2_152 304 | GTTTATAGAATACCTATGAGGAATTGAAAC 305 | >db2_153 306 | GTTCCAATAAGACTCTAAGAGAATTGAAAG 307 | >db2_154 308 | ATTTCAATTCCTCATAGGTACGATAAAAAC 309 | >db2_155 310 | GTTTTTAGCTTACCTATAAGGGATTGAAAC 311 | >db2_156 312 | GTTGCAATAAGACTCTAAGAGAATTGAAAG 313 | >db2_157 314 | GTTTTTAGCCTACCTATGAGGGATTGAAAT 315 | >db2_158 316 | GTTCCAATAAGACTCCAAGAGAATTGAAAG 317 | >db2_159 318 | CTTTCAATTCTCTTTGAGTCTTATTGGAAC 319 | >db2_160 320 | GTTCCAATAAGACTAAAATAGAATTGAAAG 321 | >db2_161 322 | GTTTATAGCCTACCTATAAGGAATTGAAAC 323 | >db2_162 324 | GTTTGTAGCCTACCTATGAGGGATTGAAAC 325 | >db2_163 326 | GTTTTTAGAGTGCCTATAAGGAATTGAAAC 327 | >db2_164 328 | GTTTTGAGCCTACCTATGAGGAATTGAAAC 329 | >db2_165 330 | GTTCCAATAAGACTATAAGAGAATTGAAAG 331 | >db2_166 332 | GTTTTTATCGTACCTATGAGGAATTGAAAC 333 | >db2_167 334 | GTTTCAATTCCTCATAGGTACGCTGAAAAC 335 | >db2_168 336 | GTTTCAATTCCTGATAGGTAGGCTAAAAAC 337 | >db2_169 338 | GTTTGTATCGTACCTATGAGGGATTGAAAC 339 | >db2_170 340 | GTTTCAATTCCTCATAGGTAAGCTAACAAC 341 | >db2_171 342 | GTTTCAATAAGACTCTAAGAGAATTGAAAG 343 | >db2_172 344 | GTTTCAATCCCTCATAGGTAAGCTAACAAC 345 | >db2_173 346 | ATTTCAATTCCTCATAGATAGGCTAAAAAC 347 | >db2_174 348 | GTTTCAATTCCTCATAGGTACGCTGAGAAC 349 | >db2_175 350 | GTTCCAATAAGACTTTAAAAGAATTGAAAG 351 | >db2_176 352 | GTTTTTATCTTACCTATGAGGAATTGAAAC 353 | >db2_177 354 | CTTTCAATTCTATTTTGGTCTTATTGTAAC 355 | >db2_178 356 | GTTTGTATCTTAACTATGAGGAATTGAAAC 357 | >db2_179 358 | GTTTCAATTCCTCATAGGTACGATCAAAAC 359 | >db2_180 360 | GTTTTTAGCCTACCTATAAGGAATTGAAAT 361 | >db2_181 362 | GTTTCAATCCCTTATAGGTAGGCTAAAAAC 363 | >db2_182 364 | GTTTCAATCCCTAATAGGTATGCTAAAAAC 365 | >db2_183 366 | GTTTTTAGCTTACCTATGAGGGATTGAAAC 367 | >db2_184 368 | GTTTGTATCTTACCTATGAGGAATTGAAAC 369 | >db2_185 370 | GTTCCAATAAGACTAAAATAGAATTGAAA 371 | >db2_186 372 | GTTTTTATCGTACCTATGAGGGATTGAAA 373 | >db2_187 374 | GTTTTTATCTTACCTATGAGGAATTGAAA 375 | >db2_188 376 | GATTAATCCCAAAAGGAATTGAAAG 377 | >db2_189 378 | GATTAATCCTAAAAGGAATTGAAAG 379 | >db2_190 380 | CTTTCAATTCCTTTTGGGATTCATC 381 | >db2_191 382 | CTTTCAATCCCTTTTGGGATTCATC 383 | >db2_192 384 | GATAATCTACTATAGAATTGAAAG 385 | >db2_193 386 | GAATCTCAAAAAGAGGATTGAAAG 387 | >db2_194 388 | GAATCTCAAGTTGAGGATTGAAAG 389 | >db2_195 390 | CTTTCAATTCTATAAGAGATTATC 391 | >db2_196 392 | CTTTCAATCCTCTCTTTGAGATTC 393 | >db2_197 394 | CTTTCAATCCTCTTCTTGAGATTC 395 | >db2_198 396 | GTATCTCAAAAAGAGGATTGAAAG 397 | >db2_199 398 | CTAAAAGAATAACTTGCAAAATAACAAGCATTGAAAC 399 | >db2_200 400 | GTTGAAGTGGTACTTCCAGTAAAACAAGGATTGAAAC 401 | >db2_201 402 | GTTGAAGAGGTACTTCCAGTAAAACAAGGATTGAAAC 403 | >db2_202 404 | GTTTCAATCCTTATTTTACTGGATGTTCTACTTCAAC 405 | >db2_203 406 | TGTTTCAATTCCTCATAGGTACGCTGAAAACC 407 | >db2_204 408 | GTTTCAATCCACGCGCCCGTGAGGACGCGAC 409 | >db2_205 410 | GTTTCAATTCCTCATAGGTACGATCAAAACG 411 | >db2_206 412 | GTCGCACCCTTGCGGGTGCGTGGATTGAAAC 413 | >db2_207 414 | GTCGCGCTCTCACGAGCGCGTGGATTGAAAC 415 | >db2_208 416 | TCTTTCAATTCTATTTTGGTCTTATTGTAAC 417 | >db2_209 418 | GTTTCAATCCACGCCCCCGTGAGGGAGCGAC 419 | >db2_210 420 | GTTTCAATCCGCGCCCCCGTGAGAGGGCGAC 421 | >db2_211 422 | GTCGCGCCCTCACGGGCGCGTGGATTGAAAC 423 | >db2_212 424 | GTTTCAATCCCTTATAGGTAGGCTAAAAACC 425 | >db2_213 426 | GTTTCAATTCTCCTAGAGTCTTATTGCAAC 427 | >db2_214 428 | CTTTCAATTCTTTTGTAGTCTTATTGGAAC 429 | >db2_215 430 | GTTTGTATTTTACCTATGAGGAATTGAAAC 431 | >db2_216 432 | GTTTCCAGCCTACCTATGAGGGATTGAAAC 433 | >db2_217 434 | GTTTTGTTTGTACCTATAGGGGATTGAAAC 435 | >db2_218 436 | GTTTGTATCGTACCTATGAGGAATTGAAAC 437 | >db2_219 438 | GTTTTTAGCCTACCTAAAAGGGATTGAAAC 439 | >db2_220 440 | CTTTCAATTCTCTTTTAGTCTTATTGGAAC 441 | >db2_221 442 | GTTGAAATCAGACTAATGTAGGATTGAAAG 443 | >db2_222 444 | GTTACAATAAGACTAAAATAGAATTGAAAG 445 | >db2_223 446 | GTTGAAATCAGACCAAAATGGGATTGAAAG 447 | >db2_224 448 | CTTTCTACAGTACCTATAAGGAATTGAAAT 449 | >db2_225 450 | ATTTCAATTCCTCATAGGTACGATACAAAC 451 | >db2_226 452 | GTTTTTAGACTACCTATGAGGAATTGAAAC 453 | >db2_227 454 | GTTTCAATTCCTCTTAGGCACTCTAAAAAC 455 | >db2_228 456 | GTTTTTAGAGTGCCTATGAGGAATTGAAAC 457 | >db2_229 458 | GTTTGTAGCGTGCCTATAAGGGATTGAAAC 459 | >db2_230 460 | GTTTATAGAATACCTATGAGGAATTGAAAC 461 | >db2_231 462 | GTTTCAATCCCAGATTGGTTCGATTAAAAC 463 | >db2_232 464 | GTTCCAATAAGACTCTAAGAGAATTGAAAG 465 | >db2_233 466 | ATTTCAATTCCTCATAGGTACGATAAAAAC 467 | >db2_234 468 | GTTTGAAGTTTACCTATGAGGAATTGAAAC 469 | >db2_235 470 | ATTTCAATCCCAAAATGGTCTGATTTTAAC 471 | >db2_236 472 | GTTTTTAGCTTACCTATAAGGGATTGAAAC 473 | >db2_237 474 | GTTGCAATAAGACTCTAAGAGAATTGAAAG 475 | >db2_238 476 | GTTTTTAGCCTACCTATGAGGGATTGAAAT 477 | >db2_239 478 | GTTCCAATAAGACTCCAAGAGAATTGAAAG 479 | >db2_240 480 | GTTGCAATAAGACTCGAGGAGAATTGAAAG 481 | >db2_241 482 | CTTTCAATTCTCTTTGAGTCTTATTGGAAC 483 | >db2_242 484 | GTTCCAATAAGACTAAAATAGAATTGAAAG 485 | >db2_243 486 | GTTTGTAGCCTACCTATGAGGGATTGAAAC 487 | >db2_244 488 | GTTTATAGCCTACCTATAAGGAATTGAAAC 489 | >db2_245 490 | GTTTTTAGAGTGCCTATAAGGAATTGAAAC 491 | >db2_246 492 | GTTTTGAGCCTACCTATGAGGAATTGAAAC 493 | >db2_247 494 | GTTCCAATAAGACTATAAGAGAATTGAAAG 495 | >db2_248 496 | GTTTTTATCGTACCTATGAGGAATTGAAAC 497 | >db2_249 498 | GCTTTAATCGTACCTTTTTGGAATTGAAAC 499 | >db2_250 500 | GTTTCAATTCCTCATAGGTACGCTGAAAAC 501 | >db2_251 502 | GTTTCAATTCCTGATAGGTAGGCTAAAAAC 503 | >db2_252 504 | GTTTGTATCGTACCTATGAGGGATTGAAAC 505 | >db2_253 506 | GCTTTTAGCATACCTATTAGGGATTGAAAC 507 | >db2_254 508 | GTTTCAATAAGACTCTAAGAGAATTGAAAG 509 | >db2_255 510 | GTTTCAATTCCTCATAGGTAAGCTAACAAC 511 | >db2_256 512 | GTTTCAATCCCTCATAGGTAAGCTAACAAC 513 | >db2_257 514 | ATTTCAATTCCTCATAGATAGGCTAAAAAC 515 | >db2_258 516 | GTTGCAATAAGACTCTAGGAGAATTGAAAG 517 | >db2_259 518 | GTTTCAATTCCTCATAGGTACGCTGAGAAC 519 | >db2_260 520 | GTTCCAATAAGACTTTAAAAGAATTGAAAG 521 | >db2_261 522 | GTTTTTATCTTACCTATGAGGAATTGAAAC 523 | >db2_262 524 | GTTTCAATCCCTTATAGGTAGGCTCAAAAC 525 | >db2_263 526 | CTTTCAATTCTATTTTGGTCTTATTGTAAC 527 | >db2_264 528 | GTTTGTATCTTAACTATGAGGAATTGAAAC 529 | >db2_265 530 | GTTTTTAGCCTACCTATAAGGAATTGAAAT 531 | >db2_266 532 | GTTTCAATTCCTCATAGGTACGATCAAAAC 533 | >db2_267 534 | GTTTCAATCCCTTATAGGTAAGCTAACAAC 535 | >db2_268 536 | GTTTCAATCCCTTATAGGTAGGCTAAAAAC 537 | >db2_269 538 | GTTTTTAGCTTACCTATGAGGGATTGAAAC 539 | >db2_270 540 | GTTTCAATCCCTAATAGGTATGCTAAAAAC 541 | >db2_271 542 | GTTTGTATCTTACCTATGAGGAATTGAAAC 543 | >db2_272 544 | GTTTATAGCCTACCTATAAGGGATTGAAAC 545 | >db2_273 546 | GTTTCTACCTTACCTTGGAGGAATTGAAAC 547 | >db2_274 548 | GTTATCAGCCTACCTATAAGGAATTGAAAC 549 | >db2_275 550 | ATTTCAATTCCTCCAAGGTAAGGTAAAAAC 551 | >db2_276 552 | GTTCCAATAAGACTAAAATAGAATTGAAA 553 | >db2_277 554 | GTTTTTATCGTACCTATGAGGGATTGAAA 555 | >db2_278 556 | GTTTCAATCCCTTATAGGTAAGCTAACAA 557 | >db2_279 558 | GTTTTTATCTTACCTATGAGGAATTGAAA 559 | >db2_280 560 | ACTTTCAATCCCTTATGGGATTCTTC 561 | >db2_281 562 | GATTAATCCCAAAAGGAATTGAAAG 563 | >db2_282 564 | CTTTCAATCCCTTTTGGGATGCAAC 565 | >db2_283 566 | GATTAATCCTAAAAGGAATTGAAAG 567 | >db2_284 568 | CTTTCAATTCCTTTTGGGATTCATC 569 | >db2_285 570 | CTTTCAATTCCATTATGGATTAGC 571 | >db2_286 572 | CTTTCAATCCCTTTTGGGATTCATC 573 | >db2_287 574 | GAATCCTATAAATGGAATTGAAAG 575 | >db2_288 576 | GAATCTCAAAAAGAGGATTGAAAG 577 | >db2_289 578 | GATAATCTACTATAGAATTGAAAG 579 | >db2_290 580 | CTTTCAATTCTATAAGAGATTATC 581 | >db2_291 582 | GAATCCTACAAATGGAATTGAAAG 583 | >db2_292 584 | GAATCTCAAGTTGAGGATTGAAAG 585 | >db2_293 586 | CTTTCAATCCTCTCTTTGAGATTC 587 | >db2_294 588 | CTTTCAATCCTCTTCTTGAGATTC 589 | >db2_295 590 | GTATCTCAAAAAGAGGATTGAAAG 591 | >db2_296 592 | CTTTCAATTCTATCTAACAGATTC 593 | >db2_297 594 | GTCGAAGAGCGAGTTCCAGGAAAACAAGGATTGAAAC 595 | >db2_298 596 | CTAAAAGAATAACTTGCAAAATAACAAGCATTGAAAC 597 | >db2_299 598 | GTTGAAGTGGTACTTCCAGTAAAACAAGGATTGAAAC 599 | >db2_300 600 | GTTGAAGAGGTACTTCCAGTAAAACAAGGATTGAAAC 601 | >db2_301 602 | GTTTCAATCCTTATTTTACTGGATGTTCTACTTCAAC 603 | >db2_302 604 | TGTTTCAATTCCTCATAGGTACGCTGAAAACC 605 | >db2_303 606 | GTTTCAATCCACGCGCCCGTGAGGACGCGAC 607 | >db2_304 608 | GTCGCACCCTTGCGGGTGCGTGGATTGAAAC 609 | >db2_305 610 | GTTTCAATTCCTCATAGGTACGATCAAAACG 611 | >db2_306 612 | TCTTTCAATTCTATTTTGGTCTTATTGTAAC 613 | >db2_307 614 | GTCGCGCTCTCACGAGCGCGTGGATTGAAAC 615 | >db2_308 616 | GTTTCAATCCGCGCCCCCGTGAGAGGGCGAC 617 | >db2_309 618 | GTTTCAATCCACGCCCCCGTGAGGGAGCGAC 619 | >db2_310 620 | GTCGCGCCCTCACGGGCGCGTGGATTGAAAC 621 | >db2_311 622 | ATCGCCCCCTCGCGGGGGCGCGGATTGAAAC 623 | >db2_312 624 | GTCGCCCCCGCAAGGGGGCGTGGATTGAAAT 625 | >db2_313 626 | GTTTCAATTCTCCTAGAGTCTTATTGCAAC 627 | >db2_314 628 | CTTTCAATTCTTTTGTAGTCTTATTGGAAC 629 | >db2_315 630 | CGTTTCCAGCCTACCTATGAGGGATTGAAAC 631 | >db2_316 632 | GTTTCAATCCCTTATAGGTAGGCTAAAAACC 633 | >db2_317 634 | GTTTGTATTTTACCTATGAGGAATTGAAAC 635 | >db2_318 636 | GTTTCCAGCCTACCTATGAGGGATTGAAAC 637 | >db2_319 638 | GTTTTGTTTGTACCTATAGGGGATTGAAAC 639 | >db2_320 640 | GTTTGTATCGTACCTATGAGGAATTGAAAC 641 | >db2_321 642 | GTTTTTAGCCTACCTAAAAGGGATTGAAAC 643 | >db2_322 644 | CTTTCAATTCTCTTTTAGTCTTATTGGAAC 645 | >db2_323 646 | GTTGAAATCAGACTAATGTAGGATTGAAAG 647 | >db2_324 648 | GTTACAATAAGACTAAAATAGAATTGAAAG 649 | >db2_325 650 | GTTGAAATCAGACCAAAATGGGATTGAAAG 651 | >db2_326 652 | CTTTCTACAGTACCTATAAGGAATTGAAAT 653 | >db2_327 654 | ATTTCAATTCCTCATAGGTACGATACAAAC 655 | >db2_328 656 | GTTTTTAGACTACCTATGAGGAATTGAAAC 657 | >db2_329 658 | GTTTCAATTCCTCTTAGGCACTCTAAAAAC 659 | >db2_330 660 | GTTTTTAGAGTGCCTATGAGGAATTGAAAC 661 | >db2_331 662 | GTTTGTAGCGTGCCTATAAGGGATTGAAAC 663 | >db2_332 664 | GTTTATAGAATACCTATGAGGAATTGAAAC 665 | >db2_333 666 | GTTTCAATCCCAGATTGGTTCGATTAAAAC 667 | >db2_334 668 | GTTCCAATAAGACTCTAAGAGAATTGAAAG 669 | >db2_335 670 | GTTTGAAGTTTACCTATGAGGAATTGAAAC 671 | >db2_336 672 | ATTTCAATTCCTCATAGGTACGATAAAAAC 673 | >db2_337 674 | ATTTCAATCCCAAAATGGTCTGATTTTAAC 675 | >db2_338 676 | GTTTTTAGCTTACCTATAAGGGATTGAAAC 677 | >db2_339 678 | GTTGCAATAAGACTCTAAGAGAATTGAAAG 679 | >db2_340 680 | GTTTTTAGCCTACCTATGAGGGATTGAAAT 681 | >db2_341 682 | GTTCCAATAAGACTCCAAGAGAATTGAAAG 683 | >db2_342 684 | GTTGCAATAAGACTCGAGGAGAATTGAAAG 685 | >db2_343 686 | CTTTCAATTCTCTTTGAGTCTTATTGGAAC 687 | >db2_344 688 | GTTCCAATAAGACTAAAATAGAATTGAAAG 689 | >db2_345 690 | GTTTGTAGCCTACCTATGAGGGATTGAAAC 691 | >db2_346 692 | GTTTATAGCCTACCTATAAGGAATTGAAAC 693 | >db2_347 694 | GTTTTTAGAGTGCCTATAAGGAATTGAAAC 695 | >db2_348 696 | GTTTTGAGCCTACCTATGAGGAATTGAAAC 697 | >db2_349 698 | GTTCCAATAAGACTATAAGAGAATTGAAAG 699 | >db2_350 700 | GTTTTTATCGTACCTATGAGGAATTGAAAC 701 | >db2_351 702 | GCTTTAATCGTACCTTTTTGGAATTGAAAC 703 | >db2_352 704 | GTTTCAATTCCTCATAGGTACGCTGAAAAC 705 | >db2_353 706 | GTTTCAATTCCTGATAGGTAGGCTAAAAAC 707 | >db2_354 708 | GTTTGTATCGTACCTATGAGGGATTGAAAC 709 | >db2_355 710 | GCTTTTAGCATACCTATTAGGGATTGAAAC 711 | >db2_356 712 | GTTTCAATAAGACTCTAAGAGAATTGAAAG 713 | >db2_357 714 | GTTTCAATTCCTCATAGGTAAGCTAACAAC 715 | >db2_358 716 | GTTTCAATCCCTCATAGGTAAGCTAACAAC 717 | >db2_359 718 | ATTTCAATTCCTCATAGATAGGCTAAAAAC 719 | >db2_360 720 | GTTGCAATAAGACTCTAGGAGAATTGAAAG 721 | >db2_361 722 | GTTTCAATTCCTCATAGGTACGCTGAGAAC 723 | >db2_362 724 | GTTTTTATCTTACCTATGAGGAATTGAAAC 725 | >db2_363 726 | GTTCCAATAAGACTTTAAAAGAATTGAAAG 727 | >db2_364 728 | GTTTCAATCCCTTATAGGTAGGCTCAAAAC 729 | >db2_365 730 | CTTTCAATTCTATTTTGGTCTTATTGTAAC 731 | >db2_366 732 | GTTTGTATCTTAACTATGAGGAATTGAAAC 733 | >db2_367 734 | GTTTTTAGCCTACCTATAAGGAATTGAAAT 735 | >db2_368 736 | GTTTCAATTCCTCATAGGTACGATCAAAAC 737 | >db2_369 738 | GTTTCAATCCCTTATAGGTAAGCTAACAAC 739 | >db2_370 740 | GTTTCAATCCCTTATAGGTAGGCTAAAAAC 741 | >db2_371 742 | GTTTTTAGCTTACCTATGAGGGATTGAAAC 743 | >db2_372 744 | GTTTCAATCCCTAATAGGTATGCTAAAAAC 745 | >db2_373 746 | GTTTGTATCTTACCTATGAGGAATTGAAAC 747 | >db2_374 748 | GTTTATAGCCTACCTATAAGGGATTGAAAC 749 | >db2_375 750 | GTTTCTACCTTACCTTGGAGGAATTGAAAC 751 | >db2_376 752 | GTTATCAGCCTACCTATAAGGAATTGAAAC 753 | >db2_377 754 | ATTTCAATTCCTCCAAGGTAAGGTAAAAAC 755 | >db2_378 756 | GTTGCAATAAGACTCTGGGAGAATTGAAA 757 | >db2_379 758 | GTTGAACCGTACCTATGAGAGATTGAAAC 759 | >db2_380 760 | GTTTGAAGTTTACCTATAAGGAATTGAAA 761 | >db2_381 762 | GTTCCAATAAGACTAAAATAGAATTGAAA 763 | >db2_382 764 | GTTTTTATCGTACCTATGAGGGATTGAAA 765 | >db2_383 766 | GTTTCAATCCCTTATAGGTAAGCTAACAA 767 | >db2_384 768 | GTTTTTATCTTACCTATGAGGAATTGAAA 769 | >db2_385 770 | GTTTCAATCCGCGCCCCCGTGAGGGGGC 771 | >db2_386 772 | ACTTTCAATCCCTTATGGGATTCTTC 773 | >db2_387 774 | CTTTCAATCCCTTTTGGGATGCAAC 775 | >db2_388 776 | GATTAATCCCAAAAGGAATTGAAAG 777 | >db2_389 778 | GATTAATCCTAAAAGGAATTGAAAG 779 | >db2_390 780 | CTTTCAATTCCTTTTGGGATTCATC 781 | >db2_391 782 | GAAGATTTCAATAGAATATTGAAAG 783 | >db2_392 784 | GTTTCAATTCTTTTGTAGGTTCTTC 785 | >db2_393 786 | CTTTCAATCCCTTTTGGGATTCATC 787 | >db2_394 788 | GAATCTCAAAAAGAGGATTGAAAG 789 | >db2_395 790 | CTTTCAATTCCATTATGGATTAGC 791 | >db2_396 792 | GAATCCTATAAATGGAATTGAAAG 793 | >db2_397 794 | GAATCCTACAAATGGAATTGAAAG 795 | >db2_398 796 | GATAATCTACTATAGAATTGAAAG 797 | >db2_399 798 | GAATCTCAAGTTGAGGATTGAAAG 799 | >db2_400 800 | CTTTCAATTCTATAAGAGATTATC 801 | >db2_401 802 | CTTTCAATCCTCTCTTTGAGATTC 803 | >db2_402 804 | CTTTCAATCCTCTTCTTGAGATTC 805 | >db2_403 806 | GTATCTCAAAAAGAGGATTGAAAG 807 | >db2_404 808 | CTTTCAATTCTATCTAACAGATTC 809 | >db2_405 810 | GATAATCTCTTATAGAATTGAAAG 811 | >db2_406 812 | GTTTTTATCGTACCTATGAGGAATTGAAAC 813 | >db2_407 814 | GTTTCAGACGAACCCTTGTGGGATTGAAGC 815 | >db2_408 816 | GTTTCAGACGAACCCTTGTGGGGTTGAAGC 817 | >db2_409 818 | GTTTCAGACGAACCCTTGTGGGTTTGAAGC 819 | >db2_410 820 | GATTAATCCCAAAAGGAATTGAAAG 821 | >db2_411 822 | GTCGCGTCCTCACGGGCGCGTGGATTGAAAC 823 | >db2_412 824 | GAGTTCCCCGCGCCAGCGGGGATAAACCG 825 | >db2_413 826 | GTGTTCCCCGCGCCAGCGGGGATAAACCG 827 | >db2_414 828 | GTTCACTGCCGTGTAGGCAGCTAAGAAA 829 | >db2_415 830 | GTTCACTGCCGTACAGGCAGCTTAGAAA 831 | >db2_416 832 | GTTGAAGTGGTACTTCCAGTAAAACAAGGATTGAAAC 833 | >db2_417 834 | CTAAAAGAATAACTTGCAAAATAACAAGCATTGAAAC 835 | >db2_418 836 | CTTTCCTTCTACTAATCCCGGCGATCGGGACTGAAAC 837 | >db2_419 838 | GTTTTAGAGCTATGCTGTTTTGAATGGTCCCAAAAC 839 | >db2_420 840 | GTTGTAGCTCCCTTTCTCATTTCGCAGTGCTACAAT 841 | >db2_421 842 | GTTTTAGTCCCTTTTTAAATTTCTTTATGGTAAAAT 843 | >db2_422 844 | GTTCCAATAAGACTAAAATAGAATTGAAAG 845 | >db2_423 846 | GATCGATACCCACCCCGAAGAAAAGGGGACGAGAAC 847 | >db2_424 848 | GTTCAACACCCTCTTTTCCCCGTCAGGGGACTGAAAC 849 | >db2_425 850 | GTCTCCACTCGTAGGAGAAATTAATTGATTGGAAAC 851 | >db2_426 852 | GAACAACTCAAAAGAGAATTGCAAG 853 | >db2_427 854 | ATTAAAATCAGACCGTTTCGGAATGGAAAT 855 | >db2_428 856 | GTTTTATATTAACTAAGTGGTATGTAAAG 857 | >db2_429 858 | GAATCTCAAAAAGAGGATTGAAAG 859 | >db2_430 860 | GTGGAAATCAAAAGATAGTAGAAAC 861 | -------------------------------------------------------------------------------- /tools/blasting/Verified_repeats_dataset2.fa.nhr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/blasting/Verified_repeats_dataset2.fa.nhr -------------------------------------------------------------------------------- /tools/blasting/Verified_repeats_dataset2.fa.nin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/blasting/Verified_repeats_dataset2.fa.nin -------------------------------------------------------------------------------- /tools/blasting/Verified_repeats_dataset2.fa.nog: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/blasting/Verified_repeats_dataset2.fa.nog -------------------------------------------------------------------------------- /tools/blasting/Verified_repeats_dataset2.fa.nsd: -------------------------------------------------------------------------------- 1 | db2_10 2 | db2_109 3 | db2_10099 4 | db2_101100 5 | db2_102101 6 | db2_103102 7 | db2_104103 8 | db2_105104 9 | db2_106105 10 | db2_107106 11 | db2_108107 12 | db2_109108 13 | db2_1110 14 | db2_110109 15 | db2_111110 16 | db2_112111 17 | db2_113112 18 | db2_114113 19 | db2_115114 20 | db2_116115 21 | db2_117116 22 | db2_118117 23 | db2_119118 24 | db2_1211 25 | db2_120119 26 | db2_121120 27 | db2_122121 28 | db2_123122 29 | db2_124123 30 | db2_125124 31 | db2_126125 32 | db2_127126 33 | db2_128127 34 | db2_129128 35 | db2_1312 36 | db2_130129 37 | db2_131130 38 | db2_132131 39 | db2_133132 40 | db2_134133 41 | db2_135134 42 | db2_136135 43 | db2_137136 44 | db2_138137 45 | db2_139138 46 | db2_1413 47 | db2_140139 48 | db2_141140 49 | db2_142141 50 | db2_143142 51 | db2_144143 52 | db2_145144 53 | db2_146145 54 | db2_147146 55 | db2_148147 56 | db2_149148 57 | db2_1514 58 | db2_150149 59 | db2_151150 60 | db2_152151 61 | db2_153152 62 | db2_154153 63 | db2_155154 64 | db2_156155 65 | db2_157156 66 | db2_158157 67 | db2_159158 68 | db2_1615 69 | db2_160159 70 | db2_161160 71 | db2_162161 72 | db2_163162 73 | db2_164163 74 | db2_165164 75 | db2_166165 76 | db2_167166 77 | db2_168167 78 | db2_169168 79 | db2_1716 80 | db2_170169 81 | db2_171170 82 | db2_172171 83 | db2_173172 84 | db2_174173 85 | db2_175174 86 | db2_176175 87 | db2_177176 88 | db2_178177 89 | db2_179178 90 | db2_1817 91 | db2_180179 92 | db2_181180 93 | db2_182181 94 | db2_183182 95 | db2_184183 96 | db2_185184 97 | db2_186185 98 | db2_187186 99 | db2_188187 100 | db2_189188 101 | db2_1918 102 | db2_190189 103 | db2_191190 104 | db2_192191 105 | db2_193192 106 | db2_194193 107 | db2_195194 108 | db2_196195 109 | db2_197196 110 | db2_198197 111 | db2_199198 112 | db2_21 113 | db2_2019 114 | db2_200199 115 | db2_201200 116 | db2_202201 117 | db2_203202 118 | db2_204203 119 | db2_205204 120 | db2_206205 121 | db2_207206 122 | db2_208207 123 | db2_209208 124 | db2_2120 125 | db2_210209 126 | db2_211210 127 | db2_212211 128 | db2_213212 129 | db2_214213 130 | db2_215214 131 | db2_216215 132 | db2_217216 133 | db2_218217 134 | db2_219218 135 | db2_2221 136 | db2_220219 137 | db2_221220 138 | db2_222221 139 | db2_223222 140 | db2_224223 141 | db2_225224 142 | db2_226225 143 | db2_227226 144 | db2_228227 145 | db2_229228 146 | db2_2322 147 | db2_230229 148 | db2_231230 149 | db2_232231 150 | db2_233232 151 | db2_234233 152 | db2_235234 153 | db2_236235 154 | db2_237236 155 | db2_238237 156 | db2_239238 157 | db2_2423 158 | db2_240239 159 | db2_241240 160 | db2_242241 161 | db2_243242 162 | db2_244243 163 | db2_245244 164 | db2_246245 165 | db2_247246 166 | db2_248247 167 | db2_249248 168 | db2_2524 169 | db2_250249 170 | db2_251250 171 | db2_252251 172 | db2_253252 173 | db2_254253 174 | db2_255254 175 | db2_256255 176 | db2_257256 177 | db2_258257 178 | db2_259258 179 | db2_2625 180 | db2_260259 181 | db2_261260 182 | db2_262261 183 | db2_263262 184 | db2_264263 185 | db2_265264 186 | db2_266265 187 | db2_267266 188 | db2_268267 189 | db2_269268 190 | db2_2726 191 | db2_270269 192 | db2_271270 193 | db2_272271 194 | db2_273272 195 | db2_274273 196 | db2_275274 197 | db2_276275 198 | db2_277276 199 | db2_278277 200 | db2_279278 201 | db2_2827 202 | db2_280279 203 | db2_281280 204 | db2_282281 205 | db2_283282 206 | db2_284283 207 | db2_285284 208 | db2_286285 209 | db2_287286 210 | db2_288287 211 | db2_289288 212 | db2_2928 213 | db2_290289 214 | db2_291290 215 | db2_292291 216 | db2_293292 217 | db2_294293 218 | db2_295294 219 | db2_296295 220 | db2_297296 221 | db2_298297 222 | db2_299298 223 | db2_32 224 | db2_3029 225 | db2_300299 226 | db2_301300 227 | db2_302301 228 | db2_303302 229 | db2_304303 230 | db2_305304 231 | db2_306305 232 | db2_307306 233 | db2_308307 234 | db2_309308 235 | db2_3130 236 | db2_310309 237 | db2_311310 238 | db2_312311 239 | db2_313312 240 | db2_314313 241 | db2_315314 242 | db2_316315 243 | db2_317316 244 | db2_318317 245 | db2_319318 246 | db2_3231 247 | db2_320319 248 | db2_321320 249 | db2_322321 250 | db2_323322 251 | db2_324323 252 | db2_325324 253 | db2_326325 254 | db2_327326 255 | db2_328327 256 | db2_329328 257 | db2_3332 258 | db2_330329 259 | db2_331330 260 | db2_332331 261 | db2_333332 262 | db2_334333 263 | db2_335334 264 | db2_336335 265 | db2_337336 266 | db2_338337 267 | db2_339338 268 | db2_3433 269 | db2_340339 270 | db2_341340 271 | db2_342341 272 | db2_343342 273 | db2_344343 274 | db2_345344 275 | db2_346345 276 | db2_347346 277 | db2_348347 278 | db2_349348 279 | db2_3534 280 | db2_350349 281 | db2_351350 282 | db2_352351 283 | db2_353352 284 | db2_354353 285 | db2_355354 286 | db2_356355 287 | db2_357356 288 | db2_358357 289 | db2_359358 290 | db2_3635 291 | db2_360359 292 | db2_361360 293 | db2_362361 294 | db2_363362 295 | db2_364363 296 | db2_365364 297 | db2_366365 298 | db2_367366 299 | db2_368367 300 | db2_369368 301 | db2_3736 302 | db2_370369 303 | db2_371370 304 | db2_372371 305 | db2_373372 306 | db2_374373 307 | db2_375374 308 | db2_376375 309 | db2_377376 310 | db2_378377 311 | db2_379378 312 | db2_3837 313 | db2_380379 314 | db2_381380 315 | db2_382381 316 | db2_383382 317 | db2_384383 318 | db2_385384 319 | db2_386385 320 | db2_387386 321 | db2_388387 322 | db2_389388 323 | db2_3938 324 | db2_390389 325 | db2_391390 326 | db2_392391 327 | db2_393392 328 | db2_394393 329 | db2_395394 330 | db2_396395 331 | db2_397396 332 | db2_398397 333 | db2_399398 334 | db2_43 335 | db2_4039 336 | db2_400399 337 | db2_401400 338 | db2_402401 339 | db2_403402 340 | db2_404403 341 | db2_405404 342 | db2_406405 343 | db2_407406 344 | db2_408407 345 | db2_409408 346 | db2_4140 347 | db2_410409 348 | db2_411410 349 | db2_412411 350 | db2_413412 351 | db2_414413 352 | db2_415414 353 | db2_416415 354 | db2_417416 355 | db2_418417 356 | db2_419418 357 | db2_4241 358 | db2_420419 359 | db2_421420 360 | db2_422421 361 | db2_423422 362 | db2_424423 363 | db2_425424 364 | db2_426425 365 | db2_427426 366 | db2_428427 367 | db2_429428 368 | db2_4342 369 | db2_430429 370 | db2_4443 371 | db2_4544 372 | db2_4645 373 | db2_4746 374 | db2_4847 375 | db2_4948 376 | db2_54 377 | db2_5049 378 | db2_5150 379 | db2_5251 380 | db2_5352 381 | db2_5453 382 | db2_5554 383 | db2_5655 384 | db2_5756 385 | db2_5857 386 | db2_5958 387 | db2_65 388 | db2_6059 389 | db2_6160 390 | db2_6261 391 | db2_6362 392 | db2_6463 393 | db2_6564 394 | db2_6665 395 | db2_6766 396 | db2_6867 397 | db2_6968 398 | db2_76 399 | db2_7069 400 | db2_7170 401 | db2_7271 402 | db2_7372 403 | db2_7473 404 | db2_7574 405 | db2_7675 406 | db2_7776 407 | db2_7877 408 | db2_7978 409 | db2_87 410 | db2_8079 411 | db2_8180 412 | db2_8281 413 | db2_8382 414 | db2_8483 415 | db2_8584 416 | db2_8685 417 | db2_8786 418 | db2_8887 419 | db2_8988 420 | db2_98 421 | db2_9089 422 | db2_9190 423 | db2_9291 424 | db2_9392 425 | db2_9493 426 | db2_9594 427 | db2_9695 428 | db2_9796 429 | db2_9897 430 | db2_9998 431 | lcl|db2_10 432 | lcl|db2_109 433 | lcl|db2_10099 434 | lcl|db2_101100 435 | lcl|db2_102101 436 | lcl|db2_103102 437 | lcl|db2_104103 438 | lcl|db2_105104 439 | lcl|db2_106105 440 | lcl|db2_107106 441 | lcl|db2_108107 442 | lcl|db2_109108 443 | lcl|db2_1110 444 | lcl|db2_110109 445 | lcl|db2_111110 446 | lcl|db2_112111 447 | lcl|db2_113112 448 | lcl|db2_114113 449 | lcl|db2_115114 450 | lcl|db2_116115 451 | lcl|db2_117116 452 | lcl|db2_118117 453 | lcl|db2_119118 454 | lcl|db2_1211 455 | lcl|db2_120119 456 | lcl|db2_121120 457 | lcl|db2_122121 458 | lcl|db2_123122 459 | lcl|db2_124123 460 | lcl|db2_125124 461 | lcl|db2_126125 462 | lcl|db2_127126 463 | lcl|db2_128127 464 | lcl|db2_129128 465 | lcl|db2_1312 466 | lcl|db2_130129 467 | lcl|db2_131130 468 | lcl|db2_132131 469 | lcl|db2_133132 470 | lcl|db2_134133 471 | lcl|db2_135134 472 | lcl|db2_136135 473 | lcl|db2_137136 474 | lcl|db2_138137 475 | lcl|db2_139138 476 | lcl|db2_1413 477 | lcl|db2_140139 478 | lcl|db2_141140 479 | lcl|db2_142141 480 | lcl|db2_143142 481 | lcl|db2_144143 482 | lcl|db2_145144 483 | lcl|db2_146145 484 | lcl|db2_147146 485 | lcl|db2_148147 486 | lcl|db2_149148 487 | lcl|db2_1514 488 | lcl|db2_150149 489 | lcl|db2_151150 490 | lcl|db2_152151 491 | lcl|db2_153152 492 | lcl|db2_154153 493 | lcl|db2_155154 494 | lcl|db2_156155 495 | lcl|db2_157156 496 | lcl|db2_158157 497 | lcl|db2_159158 498 | lcl|db2_1615 499 | lcl|db2_160159 500 | lcl|db2_161160 501 | lcl|db2_162161 502 | lcl|db2_163162 503 | lcl|db2_164163 504 | lcl|db2_165164 505 | lcl|db2_166165 506 | lcl|db2_167166 507 | lcl|db2_168167 508 | lcl|db2_169168 509 | lcl|db2_1716 510 | lcl|db2_170169 511 | lcl|db2_171170 512 | lcl|db2_172171 513 | lcl|db2_173172 514 | lcl|db2_174173 515 | lcl|db2_175174 516 | lcl|db2_176175 517 | lcl|db2_177176 518 | lcl|db2_178177 519 | lcl|db2_179178 520 | lcl|db2_1817 521 | lcl|db2_180179 522 | lcl|db2_181180 523 | lcl|db2_182181 524 | lcl|db2_183182 525 | lcl|db2_184183 526 | lcl|db2_185184 527 | lcl|db2_186185 528 | lcl|db2_187186 529 | lcl|db2_188187 530 | lcl|db2_189188 531 | lcl|db2_1918 532 | lcl|db2_190189 533 | lcl|db2_191190 534 | lcl|db2_192191 535 | lcl|db2_193192 536 | lcl|db2_194193 537 | lcl|db2_195194 538 | lcl|db2_196195 539 | lcl|db2_197196 540 | lcl|db2_198197 541 | lcl|db2_199198 542 | lcl|db2_21 543 | lcl|db2_2019 544 | lcl|db2_200199 545 | lcl|db2_201200 546 | lcl|db2_202201 547 | lcl|db2_203202 548 | lcl|db2_204203 549 | lcl|db2_205204 550 | lcl|db2_206205 551 | lcl|db2_207206 552 | lcl|db2_208207 553 | lcl|db2_209208 554 | lcl|db2_2120 555 | lcl|db2_210209 556 | lcl|db2_211210 557 | lcl|db2_212211 558 | lcl|db2_213212 559 | lcl|db2_214213 560 | lcl|db2_215214 561 | lcl|db2_216215 562 | lcl|db2_217216 563 | lcl|db2_218217 564 | lcl|db2_219218 565 | lcl|db2_2221 566 | lcl|db2_220219 567 | lcl|db2_221220 568 | lcl|db2_222221 569 | lcl|db2_223222 570 | lcl|db2_224223 571 | lcl|db2_225224 572 | lcl|db2_226225 573 | lcl|db2_227226 574 | lcl|db2_228227 575 | lcl|db2_229228 576 | lcl|db2_2322 577 | lcl|db2_230229 578 | lcl|db2_231230 579 | lcl|db2_232231 580 | lcl|db2_233232 581 | lcl|db2_234233 582 | lcl|db2_235234 583 | lcl|db2_236235 584 | lcl|db2_237236 585 | lcl|db2_238237 586 | lcl|db2_239238 587 | lcl|db2_2423 588 | lcl|db2_240239 589 | lcl|db2_241240 590 | lcl|db2_242241 591 | lcl|db2_243242 592 | lcl|db2_244243 593 | lcl|db2_245244 594 | lcl|db2_246245 595 | lcl|db2_247246 596 | lcl|db2_248247 597 | lcl|db2_249248 598 | lcl|db2_2524 599 | lcl|db2_250249 600 | lcl|db2_251250 601 | lcl|db2_252251 602 | lcl|db2_253252 603 | lcl|db2_254253 604 | lcl|db2_255254 605 | lcl|db2_256255 606 | lcl|db2_257256 607 | lcl|db2_258257 608 | lcl|db2_259258 609 | lcl|db2_2625 610 | lcl|db2_260259 611 | lcl|db2_261260 612 | lcl|db2_262261 613 | lcl|db2_263262 614 | lcl|db2_264263 615 | lcl|db2_265264 616 | lcl|db2_266265 617 | lcl|db2_267266 618 | lcl|db2_268267 619 | lcl|db2_269268 620 | lcl|db2_2726 621 | lcl|db2_270269 622 | lcl|db2_271270 623 | lcl|db2_272271 624 | lcl|db2_273272 625 | lcl|db2_274273 626 | lcl|db2_275274 627 | lcl|db2_276275 628 | lcl|db2_277276 629 | lcl|db2_278277 630 | lcl|db2_279278 631 | lcl|db2_2827 632 | lcl|db2_280279 633 | lcl|db2_281280 634 | lcl|db2_282281 635 | lcl|db2_283282 636 | lcl|db2_284283 637 | lcl|db2_285284 638 | lcl|db2_286285 639 | lcl|db2_287286 640 | lcl|db2_288287 641 | lcl|db2_289288 642 | lcl|db2_2928 643 | lcl|db2_290289 644 | lcl|db2_291290 645 | lcl|db2_292291 646 | lcl|db2_293292 647 | lcl|db2_294293 648 | lcl|db2_295294 649 | lcl|db2_296295 650 | lcl|db2_297296 651 | lcl|db2_298297 652 | lcl|db2_299298 653 | lcl|db2_32 654 | lcl|db2_3029 655 | lcl|db2_300299 656 | lcl|db2_301300 657 | lcl|db2_302301 658 | lcl|db2_303302 659 | lcl|db2_304303 660 | lcl|db2_305304 661 | lcl|db2_306305 662 | lcl|db2_307306 663 | lcl|db2_308307 664 | lcl|db2_309308 665 | lcl|db2_3130 666 | lcl|db2_310309 667 | lcl|db2_311310 668 | lcl|db2_312311 669 | lcl|db2_313312 670 | lcl|db2_314313 671 | lcl|db2_315314 672 | lcl|db2_316315 673 | lcl|db2_317316 674 | lcl|db2_318317 675 | lcl|db2_319318 676 | lcl|db2_3231 677 | lcl|db2_320319 678 | lcl|db2_321320 679 | lcl|db2_322321 680 | lcl|db2_323322 681 | lcl|db2_324323 682 | lcl|db2_325324 683 | lcl|db2_326325 684 | lcl|db2_327326 685 | lcl|db2_328327 686 | lcl|db2_329328 687 | lcl|db2_3332 688 | lcl|db2_330329 689 | lcl|db2_331330 690 | lcl|db2_332331 691 | lcl|db2_333332 692 | lcl|db2_334333 693 | lcl|db2_335334 694 | lcl|db2_336335 695 | lcl|db2_337336 696 | lcl|db2_338337 697 | lcl|db2_339338 698 | lcl|db2_3433 699 | lcl|db2_340339 700 | lcl|db2_341340 701 | lcl|db2_342341 702 | lcl|db2_343342 703 | lcl|db2_344343 704 | lcl|db2_345344 705 | lcl|db2_346345 706 | lcl|db2_347346 707 | lcl|db2_348347 708 | lcl|db2_349348 709 | lcl|db2_3534 710 | lcl|db2_350349 711 | lcl|db2_351350 712 | lcl|db2_352351 713 | lcl|db2_353352 714 | lcl|db2_354353 715 | lcl|db2_355354 716 | lcl|db2_356355 717 | lcl|db2_357356 718 | lcl|db2_358357 719 | lcl|db2_359358 720 | lcl|db2_3635 721 | lcl|db2_360359 722 | lcl|db2_361360 723 | lcl|db2_362361 724 | lcl|db2_363362 725 | lcl|db2_364363 726 | lcl|db2_365364 727 | lcl|db2_366365 728 | lcl|db2_367366 729 | lcl|db2_368367 730 | lcl|db2_369368 731 | lcl|db2_3736 732 | lcl|db2_370369 733 | lcl|db2_371370 734 | lcl|db2_372371 735 | lcl|db2_373372 736 | lcl|db2_374373 737 | lcl|db2_375374 738 | lcl|db2_376375 739 | lcl|db2_377376 740 | lcl|db2_378377 741 | lcl|db2_379378 742 | lcl|db2_3837 743 | lcl|db2_380379 744 | lcl|db2_381380 745 | lcl|db2_382381 746 | lcl|db2_383382 747 | lcl|db2_384383 748 | lcl|db2_385384 749 | lcl|db2_386385 750 | lcl|db2_387386 751 | lcl|db2_388387 752 | lcl|db2_389388 753 | lcl|db2_3938 754 | lcl|db2_390389 755 | lcl|db2_391390 756 | lcl|db2_392391 757 | lcl|db2_393392 758 | lcl|db2_394393 759 | lcl|db2_395394 760 | lcl|db2_396395 761 | lcl|db2_397396 762 | lcl|db2_398397 763 | lcl|db2_399398 764 | lcl|db2_43 765 | lcl|db2_4039 766 | lcl|db2_400399 767 | lcl|db2_401400 768 | lcl|db2_402401 769 | lcl|db2_403402 770 | lcl|db2_404403 771 | lcl|db2_405404 772 | lcl|db2_406405 773 | lcl|db2_407406 774 | lcl|db2_408407 775 | lcl|db2_409408 776 | lcl|db2_4140 777 | lcl|db2_410409 778 | lcl|db2_411410 779 | lcl|db2_412411 780 | lcl|db2_413412 781 | lcl|db2_414413 782 | lcl|db2_415414 783 | lcl|db2_416415 784 | lcl|db2_417416 785 | lcl|db2_418417 786 | lcl|db2_419418 787 | lcl|db2_4241 788 | lcl|db2_420419 789 | lcl|db2_421420 790 | lcl|db2_422421 791 | lcl|db2_423422 792 | lcl|db2_424423 793 | lcl|db2_425424 794 | lcl|db2_426425 795 | lcl|db2_427426 796 | lcl|db2_428427 797 | lcl|db2_429428 798 | lcl|db2_4342 799 | lcl|db2_430429 800 | lcl|db2_4443 801 | lcl|db2_4544 802 | lcl|db2_4645 803 | lcl|db2_4746 804 | lcl|db2_4847 805 | lcl|db2_4948 806 | lcl|db2_54 807 | lcl|db2_5049 808 | lcl|db2_5150 809 | lcl|db2_5251 810 | lcl|db2_5352 811 | lcl|db2_5453 812 | lcl|db2_5554 813 | lcl|db2_5655 814 | lcl|db2_5756 815 | lcl|db2_5857 816 | lcl|db2_5958 817 | lcl|db2_65 818 | lcl|db2_6059 819 | lcl|db2_6160 820 | lcl|db2_6261 821 | lcl|db2_6362 822 | lcl|db2_6463 823 | lcl|db2_6564 824 | lcl|db2_6665 825 | lcl|db2_6766 826 | lcl|db2_6867 827 | lcl|db2_6968 828 | lcl|db2_76 829 | lcl|db2_7069 830 | lcl|db2_7170 831 | lcl|db2_7271 832 | lcl|db2_7372 833 | lcl|db2_7473 834 | lcl|db2_7574 835 | lcl|db2_7675 836 | lcl|db2_7776 837 | lcl|db2_7877 838 | lcl|db2_7978 839 | lcl|db2_87 840 | lcl|db2_8079 841 | lcl|db2_8180 842 | lcl|db2_8281 843 | lcl|db2_8382 844 | lcl|db2_8483 845 | lcl|db2_8584 846 | lcl|db2_8685 847 | lcl|db2_8786 848 | lcl|db2_8887 849 | lcl|db2_8988 850 | lcl|db2_98 851 | lcl|db2_9089 852 | lcl|db2_9190 853 | lcl|db2_9291 854 | lcl|db2_9392 855 | lcl|db2_9493 856 | lcl|db2_9594 857 | lcl|db2_9695 858 | lcl|db2_9796 859 | lcl|db2_9897 860 | lcl|db2_9998 861 | -------------------------------------------------------------------------------- /tools/blasting/Verified_repeats_dataset2.fa.nsi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/blasting/Verified_repeats_dataset2.fa.nsi -------------------------------------------------------------------------------- /tools/blasting/Verified_repeats_dataset2.fa.nsq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/blasting/Verified_repeats_dataset2.fa.nsq -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/CRISPRstrand.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | # disable tensorflow multi-threading 7 | import tensorflow as tf 8 | tf.config.threading.set_inter_op_parallelism_threads(1) 9 | tf.config.threading.set_intra_op_parallelism_threads(1) 10 | 11 | import preprocessing as pp 12 | import utils as u 13 | 14 | if __name__ == "__main__": 15 | cmdline_parser = argparse.ArgumentParser('CRISPRstrand_v2') 16 | 17 | cmdline_parser.add_argument('-cv', '--cross_validation', 18 | action='store_true', 19 | default=False, 20 | help='Cross validation (only applied for training)') 21 | 22 | cmdline_parser.add_argument('-tr', '--training', 23 | action='store_true', 24 | default=False, 25 | help='Whether to train a model') 26 | 27 | cmdline_parser.add_argument('-i', '--input_files', 28 | nargs='+', 29 | default= ['Example/Input3.fa'], 30 | help='Filenames of the input data.') 31 | 32 | cmdline_parser.add_argument('-cols', '--usecols', 33 | nargs='+', 34 | default= [0, 5], 35 | help='ID and consensus repeat fields to use. Must be specified for csv/tsv/xls...') 36 | 37 | cmdline_parser.add_argument('-m', '--model_path', 38 | default='Models/model_r.h5', 39 | help='Evaluation/prediction model path', 40 | type=str) 41 | 42 | cmdline_parser.add_argument('-type', '--repeat_type', 43 | action='store_true', 44 | default=False, 45 | help='Whether to train a model') 46 | 47 | cmdline_parser.add_argument('-out', '--output_folder', 48 | default='Results', 49 | help='Output save', 50 | type=str) 51 | 52 | 53 | args, unknowns = cmdline_parser.parse_known_args() 54 | print(args) 55 | 56 | tr = args.training 57 | cv = args.cross_validation 58 | inputs = args.input_files 59 | columns = None if len(args.usecols) == 0 else args.usecols 60 | model_path = args.model_path 61 | do_type = args.repeat_type 62 | output_folder = args.output_folder 63 | ################################### 64 | 65 | 66 | if tr: 67 | 68 | from keras.backend import clear_session 69 | import train.train as tr 70 | 71 | df = pp.prepare_input(inputs, usecols = columns) 72 | X, y, sequence_length = pp.process_dataset_for_training(df) 73 | 74 | run_dict = {'batch_size' : 128, 75 | 'num_epochs' : 100, 76 | 'num_repeats' : 1, 77 | 'k' : 5, # only will be used in CV 78 | 'classifier_build' : 'parallel', 79 | 'kernel_width' : [4, 5, 6, 8, 10], # put 6 80 | 'sequence_length' : sequence_length 81 | } 82 | 83 | clear_session() 84 | 85 | if cv: 86 | classifiers, histories, roc_auc_scores = te.cv_run(X, y, run_dict) 87 | else: 88 | classifiers, histories, roc_auc_scores = te.run(X, y, run_dict) 89 | 90 | else: 91 | 92 | import evaluate as ev 93 | 94 | classifier, sequence_length = u.load_model(model_path) 95 | df = pp.prepare_input(inputs, output_folder, False, usecols = columns, do_type = do_type) 96 | df, X, y = pp.process_dataset_for_test(df, sequence_length) 97 | df_out, roc_auc_scores = ev.test(classifier, df, X, y, output_folder) 98 | 99 | print(roc_auc_scores) 100 | -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/CRISPRstrand.yml: -------------------------------------------------------------------------------- 1 | name: crispr_strand_env 2 | channels: 3 | - defaults 4 | dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _tflow_select=2.3.0=eigen 7 | - absl-py=0.11.0=py37h06a4308_0 8 | - aiohttp=3.6.3=py37h7b6447c_0 9 | - astunparse=1.6.3=py_0 10 | - async-timeout=3.0.1=py37_0 11 | - attrs=20.3.0=pyhd3eb1b0_0 12 | - blas=1.0=mkl 13 | - blinker=1.4=py37_0 14 | - brotlipy=0.7.0=py37h27cfd23_1003 15 | - c-ares=1.17.1=h27cfd23_0 16 | - ca-certificates=2020.10.14=0 17 | - cachetools=4.1.1=py_0 18 | - certifi=2020.11.8=py37h06a4308_0 19 | - cffi=1.14.3=py37h261ae71_2 20 | - chardet=3.0.4=py37h06a4308_1003 21 | - click=7.1.2=py_0 22 | - cryptography=3.2.1=py37h3c74f83_1 23 | - cycler=0.10.0=py37_0 24 | - dbus=1.13.18=hb2f20db_0 25 | - expat=2.2.10=he6710b0_2 26 | - fontconfig=2.13.0=h9420a91_0 27 | - freetype=2.10.4=h5ab3b9f_0 28 | - gast=0.3.3=py_0 29 | - glib=2.66.1=h92f7085_0 30 | - google-auth=1.23.0=pyhd3eb1b0_0 31 | - google-auth-oauthlib=0.4.2=pyhd3eb1b0_2 32 | - google-pasta=0.2.0=py_0 33 | - grpcio=1.31.0=py37hf8bcb03_0 34 | - gst-plugins-base=1.14.0=hbbd80ab_1 35 | - gstreamer=1.14.0=hb31296c_0 36 | - h5py=2.10.0=py37hd6299e0_1 37 | - hdf5=1.10.6=hb1b8bf9_0 38 | - icu=58.2=he6710b0_3 39 | - idna=2.10=py_0 40 | - importlib-metadata=2.0.0=py_1 41 | - intel-openmp=2020.2=254 42 | - joblib=0.17.0=py_0 43 | - jpeg=9b=h024ee3a_2 44 | - keras=2.4.3=0 45 | - keras-base=2.4.3=py_0 46 | - keras-preprocessing=1.1.0=py_1 47 | - kiwisolver=1.3.0=py37h2531618_0 48 | - lcms2=2.11=h396b838_0 49 | - ld_impl_linux-64=2.33.1=h53a641e_7 50 | - libedit=3.1.20191231=h14c3975_1 51 | - libffi=3.3=he6710b0_2 52 | - libgcc-ng=9.1.0=hdf63c60_0 53 | - libgfortran-ng=7.3.0=hdf63c60_0 54 | - libpng=1.6.37=hbc83047_0 55 | - libprotobuf=3.13.0.1=hd408876_0 56 | - libstdcxx-ng=9.1.0=hdf63c60_0 57 | - libtiff=4.1.0=h2733197_1 58 | - libuuid=1.0.3=h1bed415_2 59 | - libxcb=1.14=h7b6447c_0 60 | - libxml2=2.9.10=hb55368b_3 61 | - lz4-c=1.9.2=heb0550a_3 62 | - markdown=3.3.3=py37h06a4308_0 63 | - matplotlib=3.3.2=0 64 | - matplotlib-base=3.3.2=py37h817c723_0 65 | - mkl=2020.2=256 66 | - mkl-service=2.3.0=py37he904b0f_0 67 | - mkl_fft=1.2.0=py37h23d657b_0 68 | - mkl_random=1.1.1=py37h0573a6f_0 69 | - multidict=4.7.6=py37h7b6447c_1 70 | - ncurses=6.2=he6710b0_1 71 | - numpy=1.19.2=py37h54aff64_0 72 | - numpy-base=1.19.2=py37hfa32c7d_0 73 | - oauthlib=3.1.0=py_0 74 | - olefile=0.46=py37_0 75 | - openssl=1.1.1h=h7b6447c_0 76 | - opt_einsum=3.1.0=py_0 77 | - pandas=1.1.3=py37he6710b0_0 78 | - pcre=8.44=he6710b0_0 79 | - pillow=8.0.1=py37he98fc37_0 80 | - pip=20.2.4=py37h06a4308_0 81 | - protobuf=3.13.0.1=py37he6710b0_1 82 | - pyasn1=0.4.8=py_0 83 | - pyasn1-modules=0.2.8=py_0 84 | - pycparser=2.20=py_2 85 | - pyjwt=1.7.1=py37_0 86 | - pyopenssl=19.1.0=pyhd3eb1b0_1 87 | - pyparsing=2.4.7=py_0 88 | - pyqt=5.9.2=py37h05f1152_2 89 | - pysocks=1.7.1=py37_1 90 | - python=3.7.9=h7579374_0 91 | - python-dateutil=2.8.1=py_0 92 | - pytz=2020.1=py_0 93 | - pyyaml=5.3.1=py37h7b6447c_1 94 | - qt=5.9.7=h5867ecd_1 95 | - readline=8.0=h7b6447c_0 96 | - requests=2.24.0=py_0 97 | - requests-oauthlib=1.3.0=py_0 98 | - rsa=4.6=py_0 99 | - scikit-learn=0.23.2=py37h0573a6f_0 100 | - scipy=1.5.2=py37h0b6359f_0 101 | - setuptools=50.3.1=py37h06a4308_1 102 | - sip=4.19.8=py37hf484d3e_0 103 | - six=1.15.0=py37h06a4308_0 104 | - sqlite=3.33.0=h62c20be_0 105 | - tensorboard=2.3.0=pyh4dce500_0 106 | - tensorboard-plugin-wit=1.6.0=py_0 107 | - tensorflow=2.3.0=eigen_py37h189e6a2_0 108 | - tensorflow-base=2.3.0=eigen_py37h3b305d7_0 109 | - tensorflow-estimator=2.3.0=pyheb71bc4_0 110 | - termcolor=1.1.0=py37_1 111 | - threadpoolctl=2.1.0=pyh5ca1d4c_0 112 | - tk=8.6.10=hbc83047_0 113 | - tornado=6.0.4=py37h7b6447c_1 114 | - urllib3=1.25.11=py_0 115 | - werkzeug=1.0.1=py_0 116 | - wheel=0.35.1=pyhd3eb1b0_0 117 | - wrapt=1.12.1=py37h7b6447c_1 118 | - xz=5.2.5=h7b6447c_0 119 | - yaml=0.2.5=h7b6447c_0 120 | - yarl=1.6.2=py37h7b6447c_0 121 | - zipp=3.4.0=pyhd3eb1b0_0 122 | - zlib=1.2.11=h7b6447c_3 123 | - zstd=1.4.5=h9ceee32_0 124 | - pip: 125 | - xgboost==1.2.1 126 | 127 | 128 | -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/Example/Input.fa: -------------------------------------------------------------------------------- 1 | >CRISPR_1 consensus 2 | GTTTCAGTCCCGATCGCCGGGATTAGTAGAAGGAAAG 3 | >CRISPR_2 consensus 4 | GTTTCAGTCCCCTGACGGGGAAAAGAGGGTGTTGAAC 5 | >CRISPR_3 consensus 6 | GTCTCCACTCGTAGGAGAAATTAATTGATTGGAAAC 7 | -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/Example/Input.txt: -------------------------------------------------------------------------------- 1 | GAGTTCCCCGCGCTAGCGGGGATAAACCG 2 | GAGTTCCCCGCGCCAGCGGGGATAAACCG 3 | GTGTTCCCCGCGCCAGCGGGGATAAACCG 4 | GTGTTCCCCGCGCCAGCGGGGATAAACCG 5 | GTGTTCCCCGCGCCAGCGGGGATAAACCG 6 | -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/Example/Input3.fa: -------------------------------------------------------------------------------- 1 | >CRISPR1 2 | CTTTCCTTCTACTAATCCCGGCGATCGGGACTGAAAC 3 | >CRISPR2 4 | GTTCAACACCCTCTTTTCCCCGTCAGGGGACTGAAAC 5 | >CRISPR3 6 | GTCTCCACTCGTAGGAGAAATTAATTGATTGGAAAC 7 | >CRISPR1Rev 8 | GTTTCAGTCCCGATCGCCGGGATTAGTAGAAGGAAAG 9 | >CRISPR2Rev 10 | GTTTCAGTCCCCTGACGGGGAAAAGAGGGTGTTGAAC 11 | >CRISPR3Rev 12 | GTTTCCAATCAATTAATTTCTCCTACGAGTGGAGAC 13 | -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/Example/Input4.txt: -------------------------------------------------------------------------------- 1 | CGGTTTATCCCCGCTGGCGCGGGGAACAC 2 | CGGTTTATCCCCGCTGGCGCGGGGAACAC 3 | -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/Example/Input5.fa: -------------------------------------------------------------------------------- 1 | >CRISPR_1 consensus 2 | CGGTTTATCCCCGCTGGCGCGGGGAACAC 3 | >CRISPR_2 consensus 4 | CGGTTTATCCCCGCTGGCGCGGGGAACAC 5 | >CRISPR_3 consensus 6 | CGGTTTATCCCCGCTGGCGCGGGGAACAC 7 | >CRISPR_4 consensus 8 | CGGTTTATCCCCGCTGGCGCGGGGAACAC 9 | 10 | -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/Models/model_r.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/tools/strand_prediction/CRISPRstrand/Models/model_r.h5 -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/Results/CRISPRstrand_Summary.tsv: -------------------------------------------------------------------------------- 1 | ID Input Sequence Predicted Sequence Strand Confidence 2 | CRISPR_1 consensus CGGTTTATCCCCGCTGGCGCGGGGAACAC GTGTTCCCCGCGCCAGCGGGGATAAACCG Reverse High 3 | CRISPR_2 consensus CGGTTTATCCCCGCTGGCGCGGGGAACAC GTGTTCCCCGCGCCAGCGGGGATAAACCG Reverse High 4 | CRISPR_3 consensus CGGTTTATCCCCGCTGGCGCGGGGAACAC GTGTTCCCCGCGCCAGCGGGGATAAACCG Reverse High 5 | CRISPR_4 consensus CGGTTTATCCCCGCTGGCGCGGGGAACAC GTGTTCCCCGCGCCAGCGGGGATAAACCG Reverse High 6 | -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/cmd.txt: -------------------------------------------------------------------------------- 1 | python CRISPRstrand.py -r -i Example/Input3.fa 2 | -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/convNets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Jul 11 10:42:42 2019 5 | 6 | @author: ekrem 7 | """ 8 | 9 | # disable tensorflow multi-threading 10 | import tensorflow as tf 11 | tf.config.threading.set_inter_op_parallelism_threads(1) 12 | tf.config.threading.set_intra_op_parallelism_threads(1) 13 | 14 | from keras.layers import Conv2D 15 | from keras.layers import MaxPooling2D 16 | from keras.layers import GlobalMaxPooling2D 17 | from keras.layers import Dense 18 | from keras.layers import Dropout 19 | from keras.layers import BatchNormalization 20 | from keras.layers import Activation 21 | from keras.layers import GaussianNoise 22 | from keras.optimizers import Adam, SGD 23 | from keras.layers import concatenate 24 | from keras import Input, Model 25 | from keras.regularizers import l2 26 | 27 | ''' 28 | When only the consensus repeat is available 29 | ''' 30 | def build_parallel_classifier_R(seq_length, kernel_width = [4, 6, 8, 12, 16]): 31 | 32 | seq_height = 4 33 | reg = 0.05 34 | #noise_stddev = 0.015 35 | num_feature_maps = 32 36 | 37 | main_input = Input(shape=(seq_height, seq_length, 1)) 38 | 39 | '''''' 40 | layer_1p = Conv2D(num_feature_maps, (seq_height, kernel_width[0]), padding = 'valid', use_bias = False, kernel_regularizer=l2(reg))(main_input) 41 | #layer_1p = GaussianNoise(noise_stddev)(layer_1p) 42 | layer_1p = Activation('relu')(layer_1p) 43 | layer_1p = BatchNormalization()(layer_1p) 44 | pool_1p = GlobalMaxPooling2D()(layer_1p) 45 | 46 | layer_2p = Conv2D(num_feature_maps, (seq_height, kernel_width[1]), padding = 'valid', use_bias = False, kernel_regularizer=l2(reg))(main_input) 47 | #layer_2p = GaussianNoise(noise_stddev)(layer_2p) 48 | layer_2p = Activation('relu')(layer_2p) 49 | layer_2p = BatchNormalization()(layer_2p) 50 | pool_2p = GlobalMaxPooling2D()(layer_2p) 51 | 52 | layer_3p = Conv2D(num_feature_maps, (seq_height, kernel_width[2]), padding = 'valid', use_bias = False, kernel_regularizer=l2(reg))(main_input) 53 | #layer_3p = GaussianNoise(noise_stddev)(layer_3p) 54 | layer_3p = Activation('relu')(layer_3p) 55 | layer_3p = BatchNormalization()(layer_3p) 56 | pool_3p = GlobalMaxPooling2D()(layer_3p) 57 | 58 | layer_4p = Conv2D(num_feature_maps, (seq_height, kernel_width[3]), padding = 'valid', use_bias = False, kernel_regularizer=l2(reg))(main_input) 59 | #layer_4p = GaussianNoise(noise_stddev)(layer_4p) 60 | layer_4p = Activation('relu')(layer_4p) 61 | layer_4p = BatchNormalization()(layer_4p) 62 | pool_4p = GlobalMaxPooling2D()(layer_4p) 63 | 64 | layer_5p = Conv2D(num_feature_maps, (seq_height, kernel_width[4]), padding = 'valid', use_bias = False, kernel_regularizer=l2(reg))(main_input) 65 | #layer_5p = GaussianNoise(noise_stddev)(layer_5p) 66 | layer_5p = Activation('relu')(layer_5p) 67 | layer_5p = BatchNormalization()(layer_5p) 68 | pool_5p = GlobalMaxPooling2D()(layer_5p) 69 | 70 | concatenated = concatenate([pool_1p, pool_2p, pool_3p, pool_4p, pool_5p], axis = 1, name = 'cutoff_layer') 71 | 72 | x = Dense(256, kernel_regularizer=l2(reg), bias_regularizer=l2(reg))(concatenated) 73 | x = Activation('relu')(x) 74 | x = Dropout(rate = 0.5)(x) 75 | 76 | x = Dense(32, kernel_regularizer=l2(reg), bias_regularizer=l2(reg))(x) 77 | x = Activation('relu')(x) 78 | x = Dropout(rate = 0.5)(x) 79 | 80 | xs = Dense(units = 1, kernel_regularizer=l2(reg), bias_regularizer=l2(reg))(x) 81 | out = Activation('sigmoid')(xs) 82 | 83 | classifier = Model(main_input, out) 84 | 85 | optim = SGD(decay=1e-4) 86 | classifier.compile(optimizer = optim, loss = 'binary_crossentropy', metrics = ['accuracy']) 87 | return classifier 88 | 89 | ''' 90 | When a sequence is available 91 | ''' 92 | def build_parallel_classifier_A(seq_length, kernel_width = [4, 6, 8, 12, 16]): 93 | 94 | seq_height = 5 95 | reg = 0.05 96 | #noise_stddev = 0.015 97 | num_feature_maps = 32 98 | 99 | main_input = Input(shape=(seq_height, seq_length, 1)) 100 | side_input = Input(shape=(7,)) 101 | 102 | '''''' 103 | layer_1p = Conv2D(num_feature_maps, (seq_height, kernel_width[0]), padding = 'valid', use_bias = False, kernel_regularizer=l2(reg))(main_input) 104 | #layer_1p = GaussianNoise(noise_stddev)(layer_1p) 105 | layer_1p = Activation('relu')(layer_1p) 106 | layer_1p = BatchNormalization()(layer_1p) 107 | pool_1p = GlobalMaxPooling2D()(layer_1p) 108 | 109 | layer_2p = Conv2D(num_feature_maps, (seq_height, kernel_width[1]), padding = 'valid', use_bias = False, kernel_regularizer=l2(reg))(main_input) 110 | #layer_2p = GaussianNoise(noise_stddev)(layer_2p) 111 | layer_2p = Activation('relu')(layer_2p) 112 | layer_2p = BatchNormalization()(layer_2p) 113 | pool_2p = GlobalMaxPooling2D()(layer_2p) 114 | 115 | layer_3p = Conv2D(num_feature_maps, (seq_height, kernel_width[2]), padding = 'valid', use_bias = False, kernel_regularizer=l2(reg))(main_input) 116 | #layer_3p = GaussianNoise(noise_stddev)(layer_3p) 117 | layer_3p = Activation('relu')(layer_3p) 118 | layer_3p = BatchNormalization()(layer_3p) 119 | pool_3p = GlobalMaxPooling2D()(layer_3p) 120 | 121 | layer_4p = Conv2D(num_feature_maps, (seq_height, kernel_width[3]), padding = 'valid', use_bias = False, kernel_regularizer=l2(reg))(main_input) 122 | #layer_4p = GaussianNoise(noise_stddev)(layer_4p) 123 | layer_4p = Activation('relu')(layer_4p) 124 | layer_4p = BatchNormalization()(layer_4p) 125 | pool_4p = GlobalMaxPooling2D()(layer_4p) 126 | 127 | layer_5p = Conv2D(num_feature_maps, (seq_height, kernel_width[4]), padding = 'valid', use_bias = False, kernel_regularizer=l2(reg))(main_input) 128 | #layer_5p = GaussianNoise(noise_stddev)(layer_5p) 129 | layer_5p = Activation('relu')(layer_5p) 130 | layer_5p = BatchNormalization()(layer_5p) 131 | pool_5p = GlobalMaxPooling2D()(layer_5p) 132 | 133 | concatenated = concatenate([pool_1p, pool_2p, pool_3p, pool_4p, pool_5p], axis = 1, name = 'cutoff_layer') 134 | 135 | x = Dense(256, kernel_regularizer=l2(reg), bias_regularizer=l2(reg))(concatenated) 136 | x = Activation('relu')(x) 137 | x = Dropout(rate = 0.5)(x) 138 | 139 | x = Dense(32, kernel_regularizer=l2(reg), bias_regularizer=l2(reg))(x) 140 | x = Activation('relu')(x) 141 | x = Dropout(rate = 0.5)(x) 142 | 143 | s = Dense(16, kernel_regularizer=l2(reg), bias_regularizer=l2(reg))(side_input) 144 | s = Activation('relu')(s) 145 | s = Dropout(rate = 0.5)(s) 146 | 147 | xs = concatenate([x, s], axis = 1) 148 | xs = Dense(units = 1, kernel_regularizer=l2(reg), bias_regularizer=l2(reg))(xs) 149 | out = Activation('sigmoid')(xs) 150 | 151 | classifier = Model([main_input, side_input], out) 152 | 153 | optim = SGD(decay=1e-4) 154 | classifier.compile(optimizer = optim, loss = 'binary_crossentropy', metrics = ['accuracy']) 155 | return classifier 156 | -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/evaluate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Jul 11 19:10:47 2019 5 | 6 | @author: ekrem 7 | """ 8 | import os 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | from sklearn.metrics import roc_auc_score 14 | 15 | def test(classifier, df, X, y, output_folder = 'Results', batch_size = 64): 16 | 17 | X = np.stack(X) 18 | probs = classifier.predict(X, batch_size = batch_size) 19 | df['Probs'] = probs 20 | 21 | orig_len = len(df)//2 22 | samples = [] 23 | for i in range(orig_len): 24 | pos_sample = df.iloc[i] 25 | neg_sample = df.iloc[orig_len+i] 26 | 27 | ID = pos_sample.ID 28 | assert ID == neg_sample.ID, 'Positive sample ID does not match the negative.' 29 | 30 | input_cons = pos_sample['Cons'] 31 | sample = pos_sample if pos_sample['Probs'] > neg_sample['Probs'] else neg_sample 32 | strand = 'Forward' if pos_sample['Probs'] > neg_sample['Probs'] else 'Reverse' 33 | predicted_cons = sample['Cons'] 34 | 35 | prob = sample['Probs'] 36 | if prob >= 0.7: 37 | confidence = 'High' 38 | elif 0.5 < prob < 0.7: 39 | confidence = 'Medium' 40 | else: 41 | confidence = 'Low' 42 | 43 | if 'Type' in df.columns.values: 44 | samples.append((ID, input_cons, predicted_cons, strand, confidence, sample.Type)) 45 | columns = ['ID', 'Input Sequence', 'Predicted Sequence', 'Strand', 'Confidence', 'Type'] 46 | else: 47 | samples.append((ID, input_cons, predicted_cons, strand, confidence)) 48 | columns = ['ID', 'Input Sequence', 'Predicted Sequence', 'Strand', 'Confidence'] 49 | 50 | df_out = pd.DataFrame.from_records(data=samples, columns = columns) 51 | 52 | if not os.path.exists(output_folder): 53 | os.makedirs(output_folder) 54 | 55 | df_out.to_csv(os.path.join(output_folder, 'CRISPRstrand_Summary.tsv'), sep='\t', index = False) 56 | 57 | return df_out, roc_auc_score(y, probs) 58 | 59 | 60 | -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/execute_strand.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import os 3 | 4 | conda_activation_cmd = "export PATH=~/miniconda3/bin:$PATH && conda activate crispr_strand_env && python CRISPRstrand.py -r -i Example/Input3.fa && conda deactivate" 5 | subprocess.run(conda_activation_cmd, shell=True) 6 | 7 | 8 | #os.system("source activate crispr_strand_env") -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/preprocessing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 3 20:44:47 2019 5 | 6 | @author: ekrem 7 | """ 8 | import re 9 | import os 10 | import numpy as np 11 | import pandas as pd 12 | from collections import namedtuple 13 | import sklearn.model_selection as skms 14 | 15 | #from Models.repeat import RepeatTyper 16 | #from Models.xgb import XGB 17 | 18 | # ============================================================================= 19 | # DICTIONARIES 20 | # ============================================================================= 21 | 22 | one_hot_encoding_dict = { 23 | 'A': np.array([0, 0, 0, 1]).reshape((-1,1)), # A 24 | 'T': np.array([0, 0, 1, 0]).reshape((-1,1)), # T 25 | 'G': np.array([0, 1, 0, 0]).reshape((-1,1)), # G 26 | 'C': np.array([1, 0, 0, 0]).reshape((-1,1)), # C 27 | 'N': np.array([0, 0, 0, 0]).reshape((-1,1)), # N 28 | '-': np.array([0, 0, 0, 0]).reshape((-1,1)), # N 29 | } 30 | complement_encoding_dict = {'A':'T', 'T':'A', 'G':'C', 'C':'G', 'N':'N', '-':'N'} 31 | 32 | # ============================================================================= 33 | # Decoders/Encoders/Helpers 34 | # ============================================================================= 35 | 36 | ''' 37 | Performs one-hot encoding on repeat sequences. 38 | Reshapes each encoded nucleotide to a column vector, then concatenates them. 39 | Returns representation with shape (4, sequence_length, 1). Last dimension is 40 | channels (always 1 in this case) to comply with 2D-CNN. 41 | ''' 42 | def one_hot_encode_sequence(repeat): 43 | return np.concatenate(list(map(one_hot_encoding_dict.get, repeat)), axis = 1).reshape(4, -1, 1) 44 | 45 | ''' 46 | Returns the reverse complement of given repeat sequence. 47 | ''' 48 | def reverse_complement(repeat): 49 | complement = list(map(complement_encoding_dict.get, repeat)) 50 | reverse_complement = ''.join(reversed(complement)) 51 | return reverse_complement 52 | 53 | ''' 54 | Returns the length of all repeat sequences in the dataframe. 55 | ''' 56 | def get_seq_len(repeat): 57 | return repeat.shape[1] 58 | 59 | ''' 60 | Pads all repeat sequences with zero from right-hand side to the length of 61 | maximum sequence. 62 | ''' 63 | def pad_seq(repeat, max_seq_len): 64 | seq_len = get_seq_len(repeat) 65 | if max_seq_len>=seq_len: 66 | return np.pad(repeat, ((0,0),(0, max_seq_len-seq_len), (0,0)), 'constant') 67 | else: 68 | return repeat[:,:max_seq_len] 69 | 70 | ''' 71 | Parses the ID and splits accession number, program and strand into different fields. Also implies start and end indices in case. 72 | ''' 73 | def parse_ID(raw_sample): 74 | ID, program, strand = raw_sample['ID'].split('-') 75 | program, start, end = program.split('_') 76 | raw_sample['Accession'] = ID 77 | raw_sample['Program'] = program 78 | raw_sample['Strand'] = strand 79 | raw_sample['Start'] = start 80 | raw_sample['End'] = end 81 | return raw_sample 82 | 83 | # ============================================================================= 84 | # FUNCTIONS TO GET NEGATIVE DATASET 85 | # ============================================================================= 86 | ''' 87 | Wrapper method. 88 | Forms negative dataset. 89 | ''' 90 | def seperate_to_pos_neg(df_pos, _all = False, keep_orig = False): 91 | 92 | if keep_orig: 93 | df_pos = df_pos.copy() 94 | # Form negative dataset 95 | df_neg = get_neg(df_pos, _all) 96 | 97 | df_pos['Label'] = 1 98 | df_neg['Label'] = 0 99 | 100 | return df_pos, df_neg 101 | 102 | ''' 103 | Applies reverse complement to repeat sequences and to strand. 104 | ''' 105 | def get_neg(df_pos, _all = False): 106 | 107 | df_neg = df_pos.copy() 108 | df_neg['Cons'] = df_neg['Cons'].apply(reverse_complement) 109 | if _all: 110 | df_neg['Conservation'] = df_neg['Conservation'].apply(np.flip) 111 | df_neg['Edge_Conservations'] = df_neg['Edge_Conservations'].apply(np.flip) 112 | df_neg['Up_Down_AT_content'] = df_neg['Up_Down_AT_content'].apply(np.flip) 113 | df_neg['Up_Down_AT_content'] = df_neg['Up_Down_AT_content'].apply(lambda x: 1-x) 114 | 115 | return df_neg 116 | 117 | ''' 118 | Reads and formats the data. 119 | ''' 120 | def read_xls(path = None): 121 | df = pd.read_excel(path) 122 | df.rename(columns={'Consensus repeat': 'Cons'}, inplace=True) 123 | df = df.dropna() 124 | df = df.apply(parse_ID, axis = 1) 125 | return df 126 | 127 | # ============================================================================= 128 | # 129 | # ============================================================================= 130 | 131 | ''' 132 | Finds conflicting samples in less trusted dataset 133 | ''' 134 | def compare_datasets(df_reliable_pos, df_corrupt_neg): 135 | conflict_dict = check_datasets(df_corrupt_neg, df_reliable_pos) 136 | keys = list(conflict_dict.keys()) 137 | return keys 138 | 139 | ''' 140 | Cleans conflicting samples 141 | ''' 142 | def clean_datasets(df_pos, df_neg, conflict_dict = None, _all = False): 143 | if conflict_dict == None: 144 | conflict_dict = check_datasets(df_pos, df_neg) 145 | keys = list(conflict_dict.keys()) 146 | print('Cleaning the dataset from conflicting samples...') 147 | df_pos = df_pos[~df_pos['ID'].isin(keys)] 148 | return seperate_to_pos_neg(df_pos, _all = _all, keep_orig = False) 149 | 150 | ''' 151 | Finds conflicting samples 152 | ''' 153 | def check_datasets(df_pos, df_neg): 154 | print('Checking the dataset for conflicting samples...') 155 | conflict_dict = dict() 156 | for idx, sample in df_pos.iterrows(): 157 | consensus = sample['Cons'] 158 | acc_id = sample['ID'] 159 | df_conf = df_neg[df_neg['Cons'] == consensus] 160 | if (len(df_conf)>0): 161 | conflict_dict[acc_id] = (df_conf, consensus) 162 | return conflict_dict 163 | 164 | ''' 165 | ''' 166 | def issue_conflicts(conflict_dict, path): 167 | f = open(path, 'w+') 168 | for conf_acc_id, conf_samples in conflict_dict.items(): 169 | df_conf, consensus = conf_samples 170 | f.write('Sample '+conf_acc_id+'\t'+consensus+' conflicts with the negative samples down below.\n') 171 | for _, sample in df_conf.iterrows(): 172 | f.write('\t'.join([str(s) for s in sample])) 173 | f.write('\n') 174 | f.write(45*'_'+'\n') 175 | print('Saved conflicting samples into issue file in path %s' % (path)) 176 | f.close() 177 | 178 | # ============================================================================= 179 | # FUNCTIONS BELOW PROCESSES DATAFRAME TO FORMAT INPUT 180 | # ============================================================================= 181 | ''' 182 | Performs operations below to prepare input for neural network. 183 | For input with only repeats. 184 | ''' 185 | def process_dataset_for_training(df): 186 | df = df.copy() 187 | df['Cons'] = df['Cons'].apply(one_hot_encode_sequence) 188 | 189 | # Pad repeats to equal size 190 | seq_lens = df['Cons'].apply(get_seq_len) 191 | max_seq_len = max(seq_lens) 192 | df['Cons'] = df['Cons'].apply(pad_seq, args = [max_seq_len]) 193 | 194 | # Split into features-labels 195 | df_X = df['Cons'] 196 | df_y = df['Label'] 197 | 198 | # Return train and also length of the sequence 199 | # (width of the input) 200 | return df_X.values, df_y.values, max_seq_len 201 | 202 | ''' 203 | ''' 204 | def process_dataset_for_test(df, max_seq_len, clean = False): 205 | 206 | # To remove conflicting samples 207 | if clean: 208 | df_pos, df_neg = seperate_to_pos_neg(df) 209 | print('Number of conflicting samples:', len(check_datasets(df_pos, df_neg))) 210 | print('Cleaning...') 211 | df_pos, df_neg = clean_datasets(df_pos, df_neg, con = False) 212 | print('SUCCESS' if len(check_datasets(df_pos, df_neg)) == 0 else 'Failure') 213 | 214 | df = pd.concat([df_pos, df_neg]).reset_index(drop = True) 215 | 216 | df_encoded = df.copy() 217 | # One-hot encode 218 | df_encoded['Cons'] = df_encoded['Cons'].apply(one_hot_encode_sequence) 219 | 220 | # Pad repeats to model input shape 221 | df_encoded['Cons'] = df_encoded['Cons'].apply(pad_seq, args = [max_seq_len]) 222 | 223 | # Split into features-labels 224 | df_X = df_encoded['Cons'] 225 | df_y = df_encoded['Label'] 226 | 227 | return df, df_X.values, df_y.values 228 | 229 | ''' 230 | ''' 231 | def fasta_to_df(fasta_file_path): 232 | f= open(fasta_file_path, "r") 233 | contents = f.read() 234 | contents = re.split(r'>', contents) # Split every id 235 | samples = [] 236 | for content in contents[1:]: # First row is an empty partition 237 | temp = re.split(r'\n', content) # Split from new line or tab 238 | samples.append(temp[:2]) 239 | df = pd.DataFrame.from_records(data=samples, columns = ['ID', 'Cons']) 240 | return df 241 | 242 | def txt_to_df(path): 243 | f = open(path, 'r') 244 | repeats = f.readlines() 245 | f.close() 246 | 247 | samples = [] 248 | for idx, repeat in enumerate(repeats): 249 | ID = 'SAMPLE_'+str(idx) 250 | samples.append((ID, repeat.strip())) 251 | 252 | return pd.DataFrame.from_records(data=samples, columns = ['ID', 'Cons']) 253 | ''' 254 | ''' 255 | 256 | def read_datasets_prepare_train_test(folder, filenames, test_size = 0.25, usecols = None): 257 | print('Reading the files...') 258 | df_list = [] 259 | for filename in filenames: 260 | 261 | if 'xls' in filename: 262 | df = read_xls(filename, usecols = usecols, header = 0, names = ['ID', 'Cons']) 263 | elif '.txt' in filename: 264 | df = txt_to_df(filename) 265 | elif 'tsv' or 'tab' in filename: 266 | df = pd.read_csv(filename, sep = '\t', usecols = usecols, header = 0, names = ['ID', 'Cons']) 267 | elif '.fasta' in filename: 268 | df = fasta_to_df(filename) 269 | 270 | df_train, df_test = skms.train_test_split(df, test_size = test_size) 271 | df_list.append((df_train, df_test)) 272 | 273 | def read_datasets(filenames, test_size = 0.25, usecols = None): 274 | print('Reading the files...') 275 | df_list = [] 276 | for filename in filenames: 277 | if '.xls' in filename: 278 | df = read_xls(filename, usecols = usecols, header = 0, names = ['ID', 'Cons']) 279 | elif '.txt' in filename: 280 | df = txt_to_df(filename) 281 | elif ('.tsv' in filename) or ('.tab' in filename): 282 | df = pd.read_csv(filename, sep = '\t', usecols = usecols, header = 0, names = ['ID', 'Cons']) 283 | elif ('.fasta' in filename) or ('.fa' in filename): 284 | df = fasta_to_df(filename) 285 | 286 | df_list.append(df) 287 | return df_list 288 | 289 | ''' 290 | ''' 291 | def concat_datasets(df_list): 292 | if len(df_list[0]) == 2: 293 | df_train =pd.concat([dfs[0] for dfs in df_list], ignore_index = True) 294 | df_test =pd.concat([dfs[1] for dfs in df_list], ignore_index = True) 295 | return df_train, df_test 296 | else: 297 | df = pd.concat(df_list, ignore_index = True) 298 | return df 299 | # ============================================================================= 300 | 301 | 302 | def prepare_input(filenames, output_folder, check_conflict = True, usecols = None, do_type = False): 303 | df_list = read_datasets(filenames, usecols = usecols) 304 | if len(df_list) == 1: 305 | df_pos = df_list[0] 306 | else: 307 | df_pos = concat_datasets(df_list) 308 | 309 | #if do_type: 310 | #df_pos = find_repeat_type(df_pos) 311 | 312 | df_pos, df_neg = seperate_to_pos_neg(df_pos) 313 | 314 | if check_conflict: 315 | conflict_dict = check_datasets(df_pos, df_neg) 316 | if not os.path.exists(output_folder): 317 | os.makedirs(output_folder) 318 | issue_conflicts(conflict_dict, os.path.join(output_folder, 'conflict_issues.txt')) 319 | df_pos, df_neg = clean_datasets(df_pos, df_neg, conflict_dict) 320 | 321 | df = concat_datasets((df_pos, df_neg)) 322 | return df 323 | 324 | if __name__ == '__main__': 325 | 326 | nov_2020_filenames = ['I-E_repeat_seqs.xls', 'VI-B_repeat_seqs.xls', 'VI-A_repeat_seqs.xls','I-F_repeat_seqs.xls', 'II-A_repeat_seqs.xls', 'II-B_repeat_seqs.xls', 'II-C_repeat_seqs.xls'] 327 | 328 | df_list = read_datasets(folder, ['clean_repeat_seqs.tsv'], True) 329 | df_pos_train, df_pos_test = concat_datasets(df_list) 330 | 331 | print(df_pos_train) 332 | print(df_pos_test) 333 | df_pos_train.to_csv(os.path.join(folder, 'train_repeat_seqs.tsv'), sep = '\t', index = False) 334 | df_pos_test.to_csv(os.path.join(folder, 'test_repeat_seqs.tsv'), sep = '\t', index = False) 335 | -------------------------------------------------------------------------------- /tools/strand_prediction/CRISPRstrand/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # disable tensorflow multi-threading 4 | import tensorflow as tf 5 | tf.config.threading.set_inter_op_parallelism_threads(1) 6 | tf.config.threading.set_intra_op_parallelism_threads(1) 7 | 8 | #import matplotlib.pyplot as plt 9 | from keras.models import load_model as lm 10 | 11 | # ============================================================================= 12 | # 13 | # ============================================================================= 14 | def load_model(model_path): 15 | classifier = lm(model_path) 16 | classifier.summary() 17 | sequence_length = classifier.layers[1].input_shape[2] 18 | 19 | return classifier, sequence_length 20 | 21 | def plot_loss_acc(histories): 22 | 23 | training_accuracies = [] 24 | validation_accuracies = [] 25 | training_losses = [] 26 | validation_losses = [] 27 | 28 | for history in histories: 29 | training_accuracies.append(history.history['acc']) 30 | validation_accuracies.append(history.history['val_acc']) 31 | training_losses.append(history.history['loss']) 32 | validation_losses.append(history.history['val_loss']) 33 | 34 | training_accuracies = np.stack(training_accuracies) 35 | validation_accuracies = np.stack(validation_accuracies) 36 | training_losses = np.stack(training_losses) 37 | validation_losses = np.stack(validation_losses) 38 | 39 | mean_training_accuracies = np.mean(training_accuracies, axis = 0) 40 | mean_validation_accuracies = np.mean(validation_accuracies, axis = 0) 41 | mean_training_losses = np.mean(training_losses, axis = 0) 42 | mean_validation_losses = np.mean(validation_losses, axis = 0) 43 | 44 | std_training_accuracies = np.sqrt(np.var(training_accuracies, axis = 0)) 45 | std_validation_accuracies = np.sqrt(np.var(validation_accuracies, axis = 0)) 46 | std_training_losses = np.sqrt(np.var(training_losses, axis = 0)) 47 | std_validation_losses = np.sqrt(np.var(validation_losses, axis = 0)) 48 | 49 | epochs = training_accuracies.shape[1] 50 | 51 | x = np.atleast_2d(np.linspace(1, epochs, epochs)).T 52 | plt.plot(x, mean_training_accuracies, label='train_acc') 53 | plt.plot(x, mean_validation_accuracies, label='val_acc') 54 | plt.fill(np.concatenate([x, x[::-1]]), np.concatenate([mean_training_accuracies-std_training_accuracies, (mean_training_accuracies+std_training_accuracies)[::-1]]), alpha=.3, label='train_acc_uncertainty') 55 | plt.fill(np.concatenate([x, x[::-1]]), np.concatenate([mean_validation_accuracies-std_validation_accuracies, (mean_validation_accuracies+std_validation_accuracies)[::-1]]), alpha=.3, label='val_acc_uncertainty') 56 | plt.legend(loc='lower right') 57 | plt.show() 58 | 59 | plt.plot(x, mean_training_losses, label='train_loss') 60 | plt.plot(x, mean_validation_losses, label='val_loss') 61 | plt.fill(np.concatenate([x, x[::-1]]), np.concatenate([mean_training_losses-std_training_losses, (mean_training_losses+std_training_losses)[::-1]]), alpha=.3, label='train_loss_uncertainty') 62 | plt.fill(np.concatenate([x, x[::-1]]), np.concatenate([mean_validation_losses-std_validation_losses, (mean_validation_losses+std_validation_losses)[::-1]]), alpha=.3, label='val_loss_uncertainty') 63 | plt.legend(loc='upper right') 64 | plt.show() 65 | -------------------------------------------------------------------------------- /trained_models/eden/eden_ab_vs_n: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/trained_models/eden/eden_ab_vs_n -------------------------------------------------------------------------------- /trained_models/eden/eden_archaea: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/trained_models/eden/eden_archaea -------------------------------------------------------------------------------- /trained_models/eden/eden_bacteria: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/trained_models/eden/eden_bacteria -------------------------------------------------------------------------------- /trained_models/eden/eden_merged: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/trained_models/eden/eden_merged -------------------------------------------------------------------------------- /trained_models/eden/eden_merged_with_neg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/trained_models/eden/eden_merged_with_neg -------------------------------------------------------------------------------- /trained_models/extra_trees/extra_trees_ab_vs_n.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/trained_models/extra_trees/extra_trees_ab_vs_n.pkl -------------------------------------------------------------------------------- /trained_models/extra_trees/extra_trees_archaea.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/trained_models/extra_trees/extra_trees_archaea.pkl -------------------------------------------------------------------------------- /trained_models/extra_trees/extra_trees_bacteria.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/trained_models/extra_trees/extra_trees_bacteria.pkl -------------------------------------------------------------------------------- /trained_models/extra_trees/extra_trees_merged.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/trained_models/extra_trees/extra_trees_merged.pkl -------------------------------------------------------------------------------- /trained_models/extra_trees/extra_trees_merged_with_neg.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/trained_models/extra_trees/extra_trees_merged_with_neg.pkl -------------------------------------------------------------------------------- /trained_models/extra_trees/extra_trees_subset.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/trained_models/extra_trees/extra_trees_subset.pkl -------------------------------------------------------------------------------- /trained_models/extra_trees/extra_trees_subset10features.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/trained_models/extra_trees/extra_trees_subset10features.pkl -------------------------------------------------------------------------------- /trained_models/extra_trees/extra_trees_subset8features.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/trained_models/extra_trees/extra_trees_subset8features.pkl -------------------------------------------------------------------------------- /trained_models/extra_trees/extra_trees_subset9features.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BackofenLab/CRISPRidentify/a2a82be79f8b1ed36fb3e2e90728fda44e588115/trained_models/extra_trees/extra_trees_subset9features.pkl --------------------------------------------------------------------------------