├── .gitignore ├── LICENSE ├── MANIFEST.in ├── PERF ├── __init__.py ├── all_repeats_1-6nt.txt ├── analyse.py ├── annotation.py ├── core.py ├── fastq_utils.py ├── lib │ ├── src │ │ ├── all.js │ │ ├── anno_charts.js │ │ ├── apexcharts.min.js │ │ ├── jquery-3.5.0.min.js │ │ ├── jquery.multi-select.min.js │ │ ├── lodash.min.js │ │ ├── main_fasta.js │ │ ├── main_fastq.js │ │ ├── semantic.min.js │ │ ├── tables_fasta.js │ │ └── tables_fastq.js │ ├── styles │ │ ├── apexcharts.min.css │ │ ├── main.css │ │ ├── multi-select.min.css │ │ └── semantic.min.css │ ├── template_fasta.html │ └── template_fastq.html ├── rep_utils.py └── utils.py ├── README.md ├── README.rst ├── pylint.rc ├── requirements.txt ├── setup.cfg ├── setup.py ├── test_data ├── repeat_options.txt ├── test.fastq_perf.html ├── test_input.fa ├── test_input.fastq.gz ├── test_input.fastq_perf.html ├── test_input.fastq_perf.tsv ├── test_input_perf.html ├── test_input_perf.tsv └── unit_options.txt └── utils └── repeat_generator.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | 3 | # Compiled python modules. 4 | *.pyc 5 | 6 | # Setuptools distribution folder. 7 | /dist/ 8 | 9 | # Python egg metadata, regenerated from source files by setuptools. 10 | /*.egg-info 11 | 12 | /build/ 13 | 14 | /test_data/million_100.fastq 15 | /test_data/tenK* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 4 | Divya Tej Sowpati & Akshay Kumar Avvaru, 5 | Lab of Dr. 
Rakesh Mishra 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include PERF/lib/src/*js 2 | include PERF/lib/template_fasta.html 3 | include PERF/lib/template_fastq.html 4 | include PERF/lib/styles/*css 5 | -------------------------------------------------------------------------------- /PERF/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | -------------------------------------------------------------------------------- /PERF/analyse.py: -------------------------------------------------------------------------------- 1 | #! 
def _read_asset(*relative_parts):
    """Return the text of a file that ships inside this package directory.

    ``relative_parts`` are path components relative to the directory that
    holds this module, e.g. ``('lib', 'src', 'all.js')``.  The handle is
    closed before returning.
    """
    package_dir = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(package_dir, *relative_parts), 'r') as handle:
        return handle.read()


def _build_repeat_options(repeat_classes):
    """Render the option markup for the report's repeat-class selector.

    Repeat classes are grouped by the label ``kmers[len(repeat_class)]``
    (``kmers`` is imported from the package's utils module) and each group
    becomes one ``<optgroup>`` holding one ``<option>`` per class.
    """
    kmer_classes = defaultdict(list)
    for rep_class in repeat_classes:
        kmer_classes[kmers[len(rep_class)]].append(rep_class)

    parts = []
    for kmer in kmer_classes:
        parts.append('<optgroup label="%s">' % (kmer))
        for rep_class in kmer_classes[kmer]:
            parts.append('<option value="%s">%s</option>' % (rep_class, rep_class))
        parts.append('</optgroup>')
    return ''.join(parts)


def _repeat_record(fields):
    """Map the first eight columns of a parsed repeat row to the report keys."""
    return {
        'seq': fields[0], 'start': fields[1], 'end': fields[2],
        'repClass': fields[3], 'repLength': fields[4], 'repOri': fields[5],
        'repUnit': fields[6], 'actualRep': fields[7],
    }


def writetoHTML(html_file, defaultInfo, repeat_options, input_format):
    """Fill the bundled HTML template and write the report to ``html_file``.

    Args:
        html_file: path of the report to create (overwritten if present).
        defaultInfo: JavaScript snippet substituted for ``analyse_data_js``.
        repeat_options: markup substituted for ``repeat_options``.
        input_format: ``'fasta'`` or ``'fastq'``; selects the template and
            the ``main_*.js`` / ``tables_*.js`` scripts.  ``anno_charts.js``
            is only embedded for ``'fasta'``.

    Every asset is read through ``_read_asset`` so no file handle is left
    open, and the output file is only created once all assets were read.
    """
    annocharts_js = ''
    if input_format == 'fasta':
        annocharts_js = _read_asset('lib', 'src', 'anno_charts.js')

    template = _read_asset('lib', 'template_%s.html' % (input_format))
    report = template.format(
        fontawesome_js=_read_asset('lib', 'src', 'all.js'),
        semantic_css=_read_asset('lib', 'styles', 'semantic.min.css'),
        multiselect_css=_read_asset('lib', 'styles', 'multi-select.min.css'),
        apexcharts_css=_read_asset('lib', 'styles', 'apexcharts.min.css'),
        main_css=_read_asset('lib', 'styles', 'main.css'),
        jquery_js=_read_asset('lib', 'src', 'jquery-3.5.0.min.js'),
        semantic_js=_read_asset('lib', 'src', 'semantic.min.js'),
        multiselect_js=_read_asset('lib', 'src', 'jquery.multi-select.min.js'),
        apexcharts_js=_read_asset('lib', 'src', 'apexcharts.min.js'),
        lodash_js=_read_asset('lib', 'src', 'lodash.min.js'),
        analyse_data_js=defaultInfo,
        main_js=_read_asset('lib', 'src', 'main_%s.js' % (input_format)),
        tables_js=_read_asset('lib', 'src', 'tables_%s.js' % (input_format)),
        annocharts_js=annocharts_js,
        repeat_options=repeat_options,
    )

    with open(html_file, 'w') as html_handle:
        print(report, file=html_handle)
    print("HTML report successfully saved to " + html_file)


def get_parameters(args):
    """Return the command line of the current run as ``'PERF <arguments>'``.

    ``args`` is accepted for interface compatibility but is not read; the
    arguments are taken from ``sys.argv`` (without the program path).
    """
    runCommand = 'PERF ' + ' '.join(sys.argv[1:])
    return runCommand


def analyse_fasta(args):
    """Summarise the repeats found in a fasta run and write the HTML report.

    Reads the repeats table written to ``args.output`` (or its
    ``_annotation.tsv`` sibling when ``args.annotate`` is set), collects
    length/unit statistics per repeat class and hands the result to
    ``writetoHTML`` as a ``const data = {...}`` JavaScript snippet.

    Columns read from each data row: 0 sequence, 1 start, 2 end, 3 repeat
    class, 4 repeat length, 5 orientation, 6 repeat units, 7 actual repeat;
    with annotation also 12 genic annotation, 13 promoter flag and the last
    column as distance to the TSS.  Lines starting with ``#`` are parsed as
    ``#key: value`` sequence information.
    """
    repeatsOutFile = args.output.name
    html_report = os.path.splitext(repeatsOutFile)[0] + '.html'
    print("\nGenerating HTML report. This may take a while..", end="\n\n")

    # Unique repeat classes in first-seen order, with their cyclical variations.
    all_repeat_classes = []
    cyclical_variations = dict()
    for r in args.repeats:
        r = r.split('\t')[1]
        if r not in cyclical_variations:
            all_repeat_classes.append(r)
            cyclical_variations[r] = build_cycVariations(r)

    inf = float('inf')
    defaultInfo = {'info': {'seqInfo': {}, 'repInfo': {}}}
    seqInfo = defaultInfo['info']['seqInfo']
    repInfo = defaultInfo['info']['repInfo']

    if args.annotate:
        # With annotation on, the data is taken from the annotated table.
        repeatsOutFile = os.path.splitext(repeatsOutFile)[0] + '_annotation.tsv'
        defaultInfo['info']['annoInfo'] = {'promUp': args.up_promoter, 'promDown': args.down_promoter}
        repAnno = {}
        TSS_dist = {}
        annoKeyDict = {}

    totalRepBases = 0
    totalRepFreq = 0
    # Top-100 tables, kept sorted in descending order; the placeholder rows
    # (length 0, units 0) are displaced as real repeats are seen.
    placeholder = ['seq', 'start', 'stop', 'repClass', 0, '+', 0, 'actualrep']
    longestLengths = [placeholder] * 100
    mostUnits = [placeholder] * 100
    minLength = inf
    minUnits = inf

    plot_data = dict()
    with open(repeatsOutFile, 'r') as repFile:
        for line in repFile:
            line = line.strip()
            if not line:
                continue
            if line.startswith('#'):
                fields = line[1:].split(': ')
                seqInfo[fields[0]] = fields[1]
                continue

            fields = line.split('\t')
            for column in (1, 2, 4, 6):
                fields[column] = int(fields[column])
            repClass = fields[3]
            repLength = fields[4]
            repUnit = fields[6]
            actualRepeat = fields[7]

            if args.annotate:
                if repClass not in repAnno:
                    repAnno[repClass] = {'EP': 0, 'GP': 0, 'GN': 0, 'IP': 0, 'DP': 0, 'EN': 0, 'IN': 0, 'DN': 0, 'UU': 0}
                    TSS_dist[repClass] = []
                genicKey = fields[12]
                promKey = fields[13]
                try:
                    tssD = int(fields[-1])
                except ValueError:
                    # Unannotated rows carry '-' instead of a distance.
                    tssD = None
                if tssD is not None and -5000 <= tssD <= 5000:
                    TSS_dist[repClass].append(tssD)
                if genicKey == 'Intergenic':
                    genicKey = 'Distal Intergenic'
                elif genicKey == '-':
                    genicKey = 'Unannotated'
                    promKey = 'Unannotated'
                # Two-letter key: first letter of genic and promoter labels.
                annoKey = genicKey[0] + promKey[0]
                annoKeyDict[annoKey] = genicKey + '+' + promKey
                repAnno[repClass][annoKey] += 1

            totalRepBases += repLength
            totalRepFreq += 1

            class_lengths = plot_data.setdefault(repClass, dict())
            if repLength not in class_lengths:
                class_lengths[repLength] = [0] * len(cyclical_variations[repClass])
            class_lengths[repLength][cyclical_variations[repClass].index(actualRepeat)] += 1

            if minUnits > repUnit:
                minUnits = repUnit
            if minLength > repLength:
                minLength = repLength

            if (longestLengths[-1][4] < repLength) or (longestLengths[-1][4] == repLength and repClass < longestLengths[-1][3]):
                longestLengths[-1] = fields
                longestLengths.sort(key=lambda x: x[4])
                longestLengths.reverse()
            # Tie-break against the last entry of mostUnits itself (the
            # original compared against longestLengths here).
            if (mostUnits[-1][6] < repUnit) or (mostUnits[-1][6] == repUnit and repClass < mostUnits[-1][3]):
                mostUnits[-1] = fields
                mostUnits.sort(key=lambda x: x[6])
                mostUnits.reverse()

    # Count the classes that actually occurred before padding plot_data with
    # the unobserved ones, otherwise the ratio below is always N/N.
    observedRepClasses = len(plot_data)
    for r in all_repeat_classes:
        if r not in plot_data:
            plot_data[r] = 0

    repeat_options = _build_repeat_options(all_repeat_classes)

    totalBases = int(seqInfo['Total_bases'])
    repInfo['lenFrequency'] = plot_data
    repInfo['numRepClasses'] = str(observedRepClasses) + '/' + str(len(all_repeat_classes))
    repInfo['totalRepBases'] = totalRepBases
    repInfo['totalRepFreq'] = totalRepFreq
    repInfo['percentGenomeCovered'] = str(round((totalRepBases/totalBases)*100, 2)) + "%"
    repInfo['repDensityByFreq'] = round((totalRepFreq/totalBases)*1000000, 2)
    repInfo['repDensityByBases'] = round((totalRepBases/totalBases)*1000000, 2)
    repInfo['minLength'] = minLength
    repInfo['minUnits'] = minUnits
    repInfo['longestRepeats'] = [_repeat_record(a) for a in longestLengths]
    repInfo['mostRepeatUnits'] = [_repeat_record(a) for a in mostUnits]
    repInfo['allRepClasses'] = all_repeat_classes

    if args.annotate:
        # The bin edges depend only on bins/range, so compute them once; this
        # also works when no repeat class was annotated at all.
        bin_edges = np.histogram([], bins=200, range=(-5000, 5000))[1]
        for r in TSS_dist:
            hist_values = np.histogram(TSS_dist[r], bins=200, range=(-5000, 5000))
            TSS_dist[r] = [int(x) for x in hist_values[0]]
        annoInfo = defaultInfo['info']['annoInfo']
        annoInfo['TSS_histBinEdges'] = [int(x) for x in bin_edges]
        annoInfo['repAnno'] = repAnno
        annoInfo['TSS_dist'] = TSS_dist
        annoInfo['annoKeyObj'] = annoKeyDict

    defaultInfo = 'const data =' + json.dumps(defaultInfo)
    writetoHTML(html_report, defaultInfo, repeat_options, 'fasta')


def analyse_fastq(args, fastq_out):
    """Add normalised statistics to ``fastq_out`` and write the HTML report.

    ``fastq_out`` is the nested dict produced by the fastq scan; this
    function reads and updates ``fastq_out['info']['seqInfo']`` and
    ``fastq_out['info']['repInfo']`` in place, then embeds the whole object
    in the report as ``const data = {...}``.
    """
    html_report = os.path.splitext(args.output.name)[0] + '.html'
    print("\nGenerating HTML report. This may take a while..", end="\n\n")

    seqInfo = fastq_out['info']['seqInfo']
    repInfo = fastq_out['info']['repInfo']

    seqInfo['File_name'] = args.input.split('/')[-1]
    n = seqInfo['Total_reads']
    b = seqInfo['Total_bases']
    total_repeats = repInfo['totalRepFreq']
    reads_with_repeats = repInfo['totalRepReads']
    total_repeat_bases = repInfo['totalRepBases']

    # Unique repeat classes in first-seen order.
    all_repeat_classes = []
    seen = set()
    for entry in args.repeats:
        rep_class = entry.split('\t')[1]
        if rep_class not in seen:
            seen.add(rep_class)
            all_repeat_classes.append(rep_class)

    repeat_options = _build_repeat_options(all_repeat_classes)

    repInfo['numRepClasses'] = str(repInfo['numRepClasses']) + '/' + str(len(all_repeat_classes))
    repInfo['allRepClasses'] = all_repeat_classes
    repInfo['totalRepFreqNorm'] = round((total_repeats/n)*1000000, 2)
    repInfo['totalRepReadsNorm'] = str(round((reads_with_repeats/n)*100, 2)) + '%'
    repInfo['percentRepBases'] = str(round((total_repeat_bases/b)*100, 2)) + '%'

    # Per-class values normalised per million reads / per million bases.
    rep_fastq_info = repInfo['repClassInfo']
    for rep in sorted(rep_fastq_info, key=lambda k: (len(k), k)):
        rep_fastq_info[rep]['reads_norm'] = round((rep_fastq_info[rep]['reads']/n)*1000000, 3)
        rep_fastq_info[rep]['instances_norm'] = round((rep_fastq_info[rep]['instances']/n)*1000000, 3)
        rep_fastq_info[rep]['bases_norm'] = round((rep_fastq_info[rep]['bases']/b)*1000000, 3)

    defaultInfo = 'const data =' + json.dumps(fastq_out)
    writetoHTML(html_report, defaultInfo, repeat_options, 'fastq')
def select_anno(List):
    """Return the highest-priority annotation present in ``List``.

    Priority is Exon, then Intron, then Genic, then Intergenic.  ``None``
    is returned when none of these labels is present.
    """
    for label in ('Exon', 'Intron', 'Genic', 'Intergenic'):
        if label in List:
            return label
    return None


def promoter(check):
    """Translate the promoter flag (``1`` means promoter) into its label."""
    return 'Promoter' if check == 1 else 'Non-Promoter'


def process_attrs(attribute, annotype):
    """Parse the attribute column of a GFF/GTF record into a dictionary.

    Args:
        attribute: raw text of the attribute column; entries are separated
            by ``;``.
        annotype: ``'GTF'`` for ``key "value"`` entries; any other value is
            treated as GFF ``key=value`` entries.

    Returns:
        dict mapping attribute names to string values.

    Empty entries (such as the one produced by a trailing ``;``) and
    entries without a value are skipped instead of raising ``IndexError``.
    Entries are stripped before splitting, so a space after ``;`` no longer
    produces an empty key, and only the first delimiter splits key from
    value, so values containing the delimiter are kept whole.  Surrounding
    double quotes are removed from GTF values.
    """
    attr_obj = {}
    is_gtf = annotype == 'GTF'
    subdelim = " " if is_gtf else "="
    for entry in attribute.split(";"):
        entry = entry.strip()
        if not entry:
            continue
        attrName, found, attrValue = entry.partition(subdelim)
        if not found:
            # A bare flag without a value carries nothing to store.
            continue
        attrValue = attrValue.strip()
        if is_gtf:
            attrValue = attrValue.strip('"')
        attr_obj[attrName.strip()] = attrValue
    return attr_obj
def process_annofile(annofile, annotype, gene_id):
    """Read a GFF/GTF annotation file and index its gene and exon features.

    Column order of a GFF or GTF record:
        seqname source feature start end score strand frame attribute

    Args:
        annofile: path of the annotation file; names ending in ``gz`` are
            opened with ``gzip``.
        annotype: format passed on to ``process_attrs``.
        gene_id: attribute key that names the gene of a feature.

    Returns:
        ``{'gene': gene_obj, 'subgene': subgene_obj}`` where
        - ``gene_obj`` maps a sequence name to a list of
          ``[gene_name, start, end, strand]`` sorted by start, and
        - ``subgene_obj`` maps a gene name to ``{'exon': [[start, end,
          strand], ...]}`` sorted by start.

    Exits with status 1 when a gene or exon record lacks ``gene_id``.
    Comment lines, blank lines and lines with fewer than nine tab separated
    columns are skipped, and the file handle is always closed.
    """
    gene_obj = defaultdict(list)
    subgene_obj = defaultdict()
    wanted_features = ('gene', 'exon')

    if annofile.endswith('gz'):
        annohandle = gzip.open(annofile, 'rt')
    else:
        annohandle = open(annofile, 'r')
    with annohandle:
        for line in annohandle:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            fields = line.split('\t')
            if len(fields) < 9:
                continue
            seqname, feature = fields[0], fields[2]
            if feature not in wanted_features:
                # Only gene and exon records are indexed; other features
                # do not need their attributes parsed.
                continue
            start = int(fields[3])
            end = int(fields[4])
            strand = fields[6]
            attr_obj = process_attrs(fields[8], annotype)

            try:
                gene_name = attr_obj[gene_id]
            except KeyError:
                print('\nGeneKeyError:')
                print('The attribute "%s" is not among the attributes for gene. Please select a different one.' %(gene_id))
                print('The available ones are ['+ (", ".join(list(attr_obj.keys()))) +']', end='\n\n')
                sys.exit(1)

            if feature == 'gene':
                gene_obj[seqname].append([gene_name, start, end, strand])
            else:
                try:
                    subgene_obj[gene_name][feature].append([start, end, strand])
                except KeyError:
                    subgene_obj[gene_name] = {feature: [[start, end, strand]]}

    for seq in gene_obj:
        gene_obj[seq] = sorted(gene_obj[seq], key=itemgetter(1))  # sort by feature start
    for gene in subgene_obj:
        for feature in subgene_obj[gene]:
            subgene_obj[gene][feature] = sorted(subgene_obj[gene][feature], key=itemgetter(0))  # sort by feature start

    return {'gene': gene_obj, 'subgene': subgene_obj}


def annotate(args):
    """Annotate every repeat of the PERF output with its closest genic feature.

    Reads the repeats table at ``args.output.name`` and writes
    ``<output stem>_annotation.tsv``.  Each repeat line is extended by seven
    columns: gene name, gene start, gene end, gene strand, annotation
    (Exon / Intron / Genic / Intergenic), promoter flag and distance.
    Repeats on sequences without any gene in the annotation file get ``-``
    in all seven columns.  Header lines (``#``) are copied unchanged.

    Assumes the repeats table is sorted by coordinate within each sequence;
    the scan over genes restarts from a remembered index for speed.
    """
    rep_file = args.output.name
    anno_file = args.annotate
    annotype = args.anno_format
    output_path = os.path.splitext(rep_file)[0] + '_annotation.tsv'
    gene_id = args.gene_key

    promUp = args.up_promoter
    promDown = args.down_promoter
    maxPromDist = max([promUp, promDown])

    gffObject = process_annofile(anno_file, annotype, gene_id)
    gene_obj = gffObject['gene']
    subgene_obj = gffObject['subgene']

    print('', end='\n')
    print('Generating annotations for identified repeats..')
    print('', end='\n')
    # Number of lines in the repeats table, for the progress bar.
    num_records = rawcharCount(rep_file, '\n')
    with open(rep_file) as bed, open(output_path, 'w') as output_file:
        prevSeqName = "Initialise"  # sentinel that never matches a sequence name
        minStartIndex = 0
        for line in tqdm(bed, total=num_records):
            line = line.strip()
            if not line:
                continue
            if line.startswith('#'):
                print(line, file=output_file)
                continue

            # Candidate output entries, collected per annotation type.
            Annotations = {'Genic': [], 'Exon': [], 'Intron': []}
            fields = line.split('\t')
            seqname = fields[0]
            # On a new sequence the scan starts from its first gene again.
            if seqname != prevSeqName:
                minStartIndex = 0
            prevSeqName = seqname
            S1 = int(fields[1])
            E1 = int(fields[2])

            # gene_obj is a defaultdict, so indexing never raises KeyError;
            # look the sequence up explicitly and report repeats on
            # sequences without genes as "annotation not available".
            genes = gene_obj.get(seqname)
            if not genes:
                print(line + '\t' + '\t'.join(['-']*7), file=output_file)
                continue

            least_dist = float('inf')
            breakCheck = 0
            promoterCheck = 0
            for i, a in enumerate(genes[minStartIndex:]):
                annotation = ''
                geneName = a[0]
                try:
                    subgeneElements = subgene_obj[geneName]
                except KeyError:
                    subgeneElements = {}
                S2 = a[1]
                E2 = a[2]
                Ori = a[3]
                # Transcription start site: gene end on the minus strand.
                TSS = S2
                if Ori == '-':
                    TSS = E2

                # Remember the closest upstream gene that ends more than a
                # promoter distance before the repeat: the next repeat can
                # start its scan there.  The scan stops one gene after the
                # first gene that starts more than a promoter distance
                # beyond the end of the repeat.
                if i == 0:
                    least_start = S1 - E2
                    minIndex = i
                else:
                    if S1 - E2 > maxPromDist:
                        if least_start > (S1 - E2):
                            least_start = S1 - E2
                            minIndex = i
                if breakCheck == 1:
                    break
                if (S2 - E1 > maxPromDist):  # point to stop the comparisons
                    breakCheck = 1

                # Promoter check; upstream/downstream swap with the strand.
                if Ori == '-' and (TSS-promDown <= S1 <= TSS+promUp or TSS-promDown <= E1 <= TSS+promUp):
                    promoterCheck = 1
                elif Ori == '+' and (TSS-promUp <= S1 <= TSS+promDown or TSS-promUp <= E1 <= TSS+promDown):
                    promoterCheck = 1

                # Overlap check: either repeat end lies inside the gene.
                if (E2 - S1 >=0 and S2 - S1 <=0) or (E2 - E1 >=0 and S2 - E1 <= 0):
                    annotation = 'Genic'
                    # A genic overlap supersedes any intergenic entry.
                    Annotations['Intergenic'] = []
                    diffSS = S2 - S1
                    diffSE = S2 - E1
                    if abs(diffSS) < abs(diffSE):
                        TSSdist = diffSS
                    else:
                        TSSdist = diffSE
                    distance = TSSdist
                    # Refine to Exon/Intron when exons are known for the gene.
                    if 'exon' in subgeneElements:
                        for site in subgeneElements['exon']:
                            S3 = site[0]
                            E3 = site[1]
                            if (E3 - S1 >=0 and S3 - S1 <=0) or (E3 - E1 >=0 and S3 - E1 <= 0):
                                annotation = "Exon"
                                break
                            else:
                                annotation = "Intron"
                elif len(Annotations['Exon']) == 0 and len(Annotations['Intron']) == 0 and len(Annotations['Genic']) == 0:
                    # No genic annotation so far: keep the closest gene as
                    # the intergenic candidate.
                    diffSS = S2 - S1
                    diffES = E2 - S1
                    diffSE = S2 - E1
                    diffEE = E2 - E1
                    if abs(diffSS) < abs(diffSE):
                        TSSdist = diffSS
                    else:
                        TSSdist = diffSE
                    minDistance = min([abs(diffSS), abs(diffEE), abs(diffSE), abs(diffES)])
                    annotation = 'Intergenic'
                    distance = TSSdist
                    if minDistance < least_dist:
                        least_dist = minDistance
                        Annotations[annotation] = [line + '\t' + '\t'.join(str(b) for b in a) + '\t' + annotation + '\t' + promoter(promoterCheck) + '\t' + str(distance)]

                if (annotation == "Exon" or annotation == "Intron" or annotation == "Genic"):
                    Annotations[annotation].append(line + '\t' + '\t'.join(str(b) for b in a) + '\t' + annotation + '\t' + promoter(promoterCheck) + '\t' + str(distance))

            minStartIndex += minIndex
            if minStartIndex > 0:
                # Step one gene back to be cautious about where to resume.
                minStartIndex = minStartIndex - 1

            for anno in list(Annotations.keys()):
                if len(Annotations[anno]) == 0:
                    del Annotations[anno]
            for anno in Annotations:
                feature_leastdist = float('inf')
                closest_entry = ""
                for entry in Annotations[anno]:
                    # NOTE(review): compares the signed distance, so negative
                    # distances always win; confirm whether abs() was meant.
                    feature_dist = int(entry.split('\t')[-1])
                    if feature_dist < feature_leastdist:
                        feature_leastdist = feature_dist
                        closest_entry = entry
                if closest_entry != "":
                    Annotations[anno] = closest_entry
            if len(Annotations) > 1:
                anno_selected = select_anno(list(Annotations.keys()))
                print(Annotations[anno_selected], file=output_file)
            else:
                for anno in Annotations:
                    print(Annotations[anno], file=output_file)
def getArgs():
    """Parse the command line arguments and return the resulting namespace.

    Post-processing applied to the parsed values:
    - ``-rep`` together with ``-m``/``-M`` is rejected;
    - without ``-rep`` the motif size range defaults to 1..6;
    - when ``-o`` is not given, the output is opened as
      ``<input without extension>_perf.tsv`` (as the help text documents).
    """
    __version__ = 'v0.4.6'
    parser = argparse.ArgumentParser()
    # Drop argparse's default "optional arguments" group; the custom groups
    # below replace it in the help output.
    parser._action_groups.pop()

    required = parser.add_argument_group('Required arguments')
    required.add_argument('-i', '--input', required=True, metavar='<FILE>', help='Input sequence file.')

    optional = parser.add_argument_group('Optional arguments')

    # Basic options
    optional.add_argument('-o', '--output', type=argparse.FileType('w'), metavar='<FILE>', default=sys.stdout, help='Output file name. Default: Input file name + _perf.tsv')
    optional.add_argument('--format', metavar='<STR>', default='fasta', choices=['fasta', 'fastq'], help='Input file format. Default: fasta, Permissible: fasta, fastq')
    optional.add_argument('--version', action='version', version='PERF ' + __version__)

    # Selection options based on motif size and sequence lengths
    optional.add_argument('-rep', '--repeats', type=argparse.FileType('r'), metavar='<FILE>', help='File with list of repeats (Not allowed with -m and/or -M)')
    optional.add_argument('-m', '--min-motif-size', type=int, metavar='<INT>', help='Minimum size of a repeat motif in bp (Not allowed with -rep)')
    optional.add_argument('-M', '--max-motif-size', type=int, metavar='<INT>', help='Maximum size of a repeat motif in bp (Not allowed with -rep)')
    optional.add_argument('-s', '--min-seq-length', type=int, metavar='<INT>', default=0, help='Minimum size of sequence length for consideration (in bp)')
    optional.add_argument('-S', '--max-seq-length', type=float, metavar='<FLOAT>', default=float('inf'), help='Maximum size of sequence length for consideration (in bp)')
    optional.add_argument('--include-atomic', action='store_true', default=False, help='An option to include factor atomic repeats for minimum motif sizes greater than 1.')

    # Cutoff options (min_length or min_units)
    cutoff_group = optional.add_mutually_exclusive_group()
    cutoff_group.add_argument('-l', '--min-length', type=int, metavar='<INT>', help='Minimum length cutoff of repeat')
    cutoff_group.add_argument('-u', '--min-units', metavar='INT or FILE', help="Minimum number of repeating units to be considered. Can be an integer or a file specifying cutoffs for different motif sizes.")

    # Analysis options
    optional.add_argument('-a', '--analyse', action='store_true', default=False, help='Generate a summary HTML report.')
    optional.add_argument('--info', action='store_true', default=False, help='Sequence file info recorded in the output.')

    # Annotation options
    annotation = parser.add_argument_group('Annotation arguments')
    annotation.add_argument('-g', '--annotate', metavar='<FILE>', help='Genic annotation input file for annotation, Both GFF and GTF can be processed. Use --anno-format to specify format.')
    annotation.add_argument('--anno-format', metavar='<STR>', default='GFF', type=str, help='Format of genic annotation file. Valid inputs: GFF, GTF. Default: GFF')
    annotation.add_argument('--gene-key', metavar='<STR>', default='gene', type=str, help='Attribute key for geneId. The default identifier is "gene". Please check the annotation file and pick a robust gene identifier from the attribute column.')
    annotation.add_argument('--up-promoter', metavar='<INT>', type=int, default=1000, help='Upstream distance(bp) from TSS to be considered as promoter region. Default 1000')
    annotation.add_argument('--down-promoter', metavar='<INT>', type=int, default=1000, help='Downstream distance(bp) from TSS to be considered as promoter region. Default 1000')

    # Filter based on sequence ids
    seqid_group = optional.add_mutually_exclusive_group()
    seqid_group.add_argument('-f', '--filter-seq-ids', metavar='<FILE>', help='List of sequence ids in fasta file which will be ignored.')
    seqid_group.add_argument('-F', '--target-seq-ids', metavar='<FILE>', help='List of sequence ids in fasta file which will be used.')

    # Multiprocessing threads
    optional.add_argument('-t', '--threads', type=int, metavar='<INT>', default=1, help='Number of threads to run the process on. Default is 1.')

    args = parser.parse_args()

    if args.repeats and (args.min_motif_size or args.max_motif_size):
        parser.error("-rep is not allowed with -m/-M")
    if args.repeats is None:
        if args.min_motif_size is None:
            args.min_motif_size = 1
        if args.max_motif_size is None:
            args.max_motif_size = 6

    # No -o given: the default object is still sys.stdout itself.  Compare
    # by identity; the stream's name is '<stdout>', never the empty string.
    if args.output is sys.stdout:
        args.output = open(splitext(args.input)[0] + '_perf.tsv', 'w')

    return args


def ssr_native(args, length_cutoff=False, unit_cutoff=False):
    """Identify microsatellites using native string matching.

    Exactly one of ``length_cutoff`` / ``unit_cutoff`` is expected to be
    truthy.  With ``length_cutoff`` the value of ``args.min_length`` is
    used; with ``unit_cutoff`` the given mapping is passed on as is.  The
    repeat set built by ``build_rep_set`` is handed to ``fasta_ssrs`` or
    ``fastq_ssrs`` depending on ``args.format``.

    Raises:
        ValueError: when neither cutoff is usable.
    """
    repeat_file = args.repeats
    if length_cutoff:
        length_cutoff = args.min_length
        repeats_info = build_rep_set(repeat_file, length_cutoff=length_cutoff)
        print('Using length cutoff of %d' % (length_cutoff), file=sys.stderr)
    elif unit_cutoff:
        repeats_info = build_rep_set(repeat_file, unit_cutoff=unit_cutoff)
        print('Using unit cutoff of ', unit_cutoff, file=sys.stderr)
    else:
        raise ValueError('ssr_native needs a non-empty length_cutoff or unit_cutoff')

    if args.format == 'fasta':
        fasta_ssrs(args, repeats_info)
    elif args.format == 'fastq':
        fastq_ssrs(args, repeats_info)


def _read_unit_cutoffs(units_file):
    """Read ``<motif size> <minimum units>`` pairs from a units file.

    Returns a dict mapping motif size to minimum units.  Blank lines are
    ignored; any other malformed line ends the program with a message.
    """
    unit_cutoff = dict()
    with open(units_file, 'r') as input_units:
        for line in input_units:
            L = line.strip().split()
            if not L:
                continue
            try:
                motif_size = int(L[0])
                min_units = int(L[1])
            except (ValueError, IndexError):
                sys.exit('Invalid file format given for minimum units. Refer to help for more details')
            if min_units == 1:
                print('Warning: Repeat unit of 1 used for size %d.' % (motif_size), file=sys.stderr)
            unit_cutoff[motif_size] = min_units
    return unit_cutoff


def main():
    """Entry point: parse the arguments and run the repeat identification."""
    args = getArgs()

    # User specifies a motif size range instead of giving a repeats file.
    if args.repeats is None:
        sizes = list(range(args.min_motif_size, args.max_motif_size+1))
        args.repeats = generate_repeats(sizes, args.include_atomic)

    # User specifies minimum length.
    if args.min_length:
        ssr_native(args, length_cutoff=args.min_length)

    # User specifies minimum number of units (integer or file).
    elif args.min_units:
        unit_cutoff = dict()
        try:
            args.min_units = int(args.min_units)
        except ValueError:
            # Not an integer: treat the value as the path of a units file.
            try:
                unit_cutoff = _read_unit_cutoffs(args.min_units)
            except IOError:
                # IOError also exists on Python 2 and covers a missing file
                # on both major versions.
                sys.exit('Units file specified is not found. Please provide a valid file')
            args.repeats = generate_repeats(list(unit_cutoff.keys()), args.include_atomic)
        else:
            if args.min_motif_size is None or args.max_motif_size is None:
                # The size range is only set when no repeats file is given.
                # NOTE(review): supporting this combination would need the
                # motif sizes of the repeats file; confirm desired behaviour.
                sys.exit('An integer -u/--min-units needs a motif size range (-m/-M) and cannot be combined with -rep.')
            for m in range(args.min_motif_size, args.max_motif_size+1):
                unit_cutoff[m] = args.min_units
        ssr_native(args, unit_cutoff=unit_cutoff)

    # Default settings.
    elif args.min_length is None and args.min_units is None:
        args.min_length = 12
        ssr_native(args, length_cutoff=args.min_length)


if __name__ == '__main__':
    main()
def get_ssrs_fastq(seq_record, repeats_info):
    """Native function to identify repeats in fastq files

    seq_record   -- object whose `seq` attribute holds the read sequence.
    repeats_info -- dict with a 'cutoff' entry (iterable of window lengths)
                    and one entry per seed substring, each a dict providing
                    'motif_length' and 'class'.

    Returns a dict with:
      'read'        -- 1 if at least one repeat was found in the read, else 0
      'num_repeats' -- number of repeats found
      'repeats'     -- {repeat class: ['<motif>-<length>', ...]}
    """

    repeats = defaultdict(list)
    num_repeats = 0

    input_seq = str(seq_record.seq).upper()
    input_seq_length = len(input_seq)

    for length_cutoff in repeats_info['cutoff']:
        # After a repeat the scan resumes so that the next window overlaps
        # the repeat by all but one base.
        fallback = length_cutoff - 1
        sub_start = 0  # substring start
        # Only complete windows are examined. A window truncated by the end
        # of the read could otherwise match the seed of a shorter cutoff and
        # be reported a second time with a length running past the read.
        while sub_start + length_cutoff <= input_seq_length:
            sub_stop = sub_start + length_cutoff  # substring stop
            sub_seq = input_seq[sub_start:sub_stop]
            if sub_seq not in repeats_info:
                sub_start += 1
                continue

            num_repeats += 1
            repeat_data = repeats_info[sub_seq]
            motif_length = repeat_data['motif_length']
            rep_class = repeat_data['class']
            # Motif rotation expected to continue right after the seed window.
            offset = length_cutoff % motif_length
            repeat_seq = input_seq[sub_start+offset:sub_start+offset+motif_length]

            # Extend the repeat base by base while it keeps following the motif.
            i = 0
            while sub_stop < input_seq_length and input_seq[sub_stop] == repeat_seq[i]:
                sub_stop += 1
                i += 1
                if i >= motif_length:
                    i = 0

            match_length = sub_stop - sub_start
            repeats[rep_class].append('%s-%d'%(sub_seq[:motif_length], match_length))
            sub_start = sub_stop - fallback

    read = 1 if num_repeats else 0
    return {'read': read, 'num_repeats': num_repeats, 'repeats': repeats}
def _flush_fastq_repeats(fastq_repeats, repeat_class_info, lenFreq_data):
    """Fold the buffered per-class instance counts into the summaries.

    Every buffer in `fastq_repeats` is reset to an empty Counter afterwards.
    Returns the number of repeat bases that were added.
    """
    flushed_bases = 0
    for rep in fastq_repeats:
        cycles = build_cycVariations(rep)
        class_info = repeat_class_info[rep]
        for instance, f in fastq_repeats[rep].items():
            # Instances are stored as '<motif>-<length>'.
            m, l = instance.rsplit('-', 1)
            l = int(l)
            if l not in lenFreq_data[rep]:
                lenFreq_data[rep][l] = [0]*len(cycles)
            lenFreq_data[rep][l][cycles.index(m)] += f
            class_info['lengths'].update([l]*f)
            class_info['motifs'].update([m]*f)
            class_info['instances'] += f
            class_info['bases'] += l*f
            flushed_bases += l*f
        fastq_repeats[rep] = Counter()
    return flushed_bases


def _report_fastq_progress(n, start_time):
    """Print a single-line progress report for `n` processed reads."""
    time_diff = datetime.now() - start_time
    # total_seconds() keeps the rate correct for runs longer than a day.
    elapsed = int(time_diff.total_seconds())
    print('Processed reads: %d | Time elapsed: %s | Rate: %d iters/s\r' %(n, time_diff, n/(elapsed+1)), end = '')
    sys.stdout.flush()


def process_fastq(args, repeats_info):
    """Processes fastq files for identification of repeats.

    args         -- parsed arguments; `input` (path, gzip if it ends with
                    'gz'), `min_seq_length` and `max_seq_length` are read.
    repeats_info -- repeat lookup passed on to get_ssrs_fastq().

    Returns {'info': {'seqInfo': {...}, 'repInfo': {...}}} holding the read
    statistics and the per repeat class summaries.
    """

    print('\nProcessing fastq file...')

    if args.input.endswith('gz'):
        handle = gzip.open(args.input, 'rt')
    else:
        handle = open(args.input, 'r')

    n, b = 0, 0  # total reads, total bases
    readlen_freq = Counter()
    start_time = datetime.now()

    total_repeats = 0
    reads_with_repeats = 0
    total_repeat_bases = 0
    fastq_repeats = dict()  # per class buffer of instances, flushed periodically

    fastq_repeat_info = dict()
    repeat_class_info = dict()
    lenFreq_data = dict()

    try:
        while True:
            lines_gen = list(islice(handle, 4))
            # A record needs at least the header and the sequence line; an
            # empty or dangling chunk marks the end of the input.
            if len(lines_gen) < 2:
                break

            read_id = lines_gen[0].strip()
            read_seq = lines_gen[1].strip()
            read_len = len(read_seq)
            n += 1
            b += read_len
            readlen_freq[read_len] += 1

            if n%50000 == 0:
                total_repeat_bases += _flush_fastq_repeats(fastq_repeats, repeat_class_info, lenFreq_data)
                _report_fastq_progress(n, start_time)

            # Only reads within the requested length range are scanned.
            if args.min_seq_length <= read_len <= args.max_seq_length:
                record = dotDict({'id': read_id, 'seq': read_seq})
                rep_identified = get_ssrs_fastq(record, repeats_info)
                for rep, instances in rep_identified['repeats'].items():
                    if rep in fastq_repeats:
                        fastq_repeats[rep].update(instances)
                        repeat_class_info[rep]['reads'] += 1
                    else:
                        fastq_repeats[rep] = Counter(instances)
                        repeat_class_info[rep] = {'lengths': Counter(), 'motifs': Counter(), 'instances': 0, 'reads': 1, 'bases': 0}
                        lenFreq_data[rep] = {}
                total_repeats += rep_identified['num_repeats']
                reads_with_repeats += rep_identified['read']
    finally:
        handle.close()

    # Fold in whatever is still buffered since the last periodic flush.
    total_repeat_bases += _flush_fastq_repeats(fastq_repeats, repeat_class_info, lenFreq_data)
    _report_fastq_progress(n, start_time)

    fastq_repeat_info['totalRepReads'] = reads_with_repeats
    fastq_repeat_info['totalRepFreq'] = total_repeats
    fastq_repeat_info['totalRepBases'] = total_repeat_bases
    fastq_repeat_info['numRepClasses'] = len(lenFreq_data.keys())
    fastq_repeat_info['lenFrequency'] = lenFreq_data
    fastq_repeat_info['repClassInfo'] = repeat_class_info

    print('') #A line for proper printing of the output
    if readlen_freq:
        min_readlen = min(readlen_freq.keys())
        max_readlen = max(readlen_freq.keys())
    else:
        # Empty input: there are no read lengths to summarise.
        min_readlen = max_readlen = 0
    if min_readlen == max_readlen:
        readlen_range = '%d bp' %(min_readlen)
    else:
        readlen_range = '%d-%d (bp)' %(min_readlen, max_readlen)
    return {'info': { 'seqInfo': {'Total_reads': n, 'Total_bases': b, 'Readlen_freq': readlen_freq,
            'Readlen_range': readlen_range}, 'repInfo': fastq_repeat_info}}
def _per_million(count, total):
    """Return `count` per million of `total`, rounded to 3 places (0.0 when total is 0)."""
    if not total:
        return 0.0
    return round((count / float(total)) * 1000000, 3)


def ssr_fastq_output(fastq_out, out_file):
    """PERF OUTPUT for fastq files.

    fastq_out -- result of process_fastq(); its 'info' entry provides the
                 'seqInfo' and 'repInfo' summaries written here.
    out_file  -- writable text handle receiving the report.

    Writes '#' prefixed summary lines, a tab separated column header and one
    row per repeat class ordered by class length and then alphabetically.
    """

    seq_info = fastq_out['info']['seqInfo']
    n = seq_info['Total_reads']
    b = seq_info['Total_bases']
    readlen_freq = seq_info['Readlen_freq']

    fastq_repeat_info = fastq_out['info']['repInfo']
    reads_with_repeats = fastq_repeat_info['totalRepReads']
    total_repeats = fastq_repeat_info['totalRepFreq']
    repeat_class_info = fastq_repeat_info['repClassInfo']

    print('#Total_reads: %d'%(n), file=out_file)
    print('#Total_bases: %d' %(b), file=out_file)
    print('#Total_repeat_instances: %d' %(total_repeats), file=out_file)
    print('#Total_reads_with_repeats: %d' %(reads_with_repeats), file=out_file)
    # _per_million avoids a ZeroDivisionError for an input without reads.
    print('#Total_repeats_per_million_reads: %f' %(_per_million(total_repeats, n)), file=out_file)
    print('#Read_length_distribution: ', readlen_freq.most_common(), file=out_file)

    print('repeatClass', 'reads', 'instances', 'bases', 'reads_per_million', 'instances_per_million',
          'bases_norm', 'length_distribution', 'motif_distribution', sep='\t', file=out_file)

    for rep in sorted(repeat_class_info, key= lambda k: (len(k), k)):
        rep_info = repeat_class_info[rep]
        print(
            rep, int(rep_info['reads']), int(rep_info['instances']), int(rep_info['bases']),
            _per_million(rep_info['reads'], n),
            _per_million(rep_info['instances'], n),
            _per_million(rep_info['bases'], b),
            # Distributions are written as 'key-count' pairs joined by ';'.
            ';'.join(['-'.join([str(y) for y in x]) for x in sorted(rep_info['lengths'].items())]),
            ';'.join(['-'.join([str(y) for y in x]) for x in sorted(rep_info['motifs'].items())]),
            sep='\t', file=out_file
        )


def fastq_ssrs(args, repeats_info):
    """Identify repeats in the fastq input, write the report and optionally analyse.

    The output handle in `args.output` is closed even when processing fails.
    """
    try:
        fastq_out = process_fastq(args, repeats_info)
        ssr_fastq_output(fastq_out, args.output)
        if args.analyse:
            analyse_fastq(args, fastq_out)
    finally:
        args.output.close()
else { 49 | 50 | for (let rep of repeats) { 51 | const repindex = Object.keys(data).indexOf(rep); 52 | if (repindex > -1) { 53 | let obj = {}; 54 | annoKeys.forEach(a => { 55 | obj[a] = _.sum(annoKeyObj[a].map(d => { return data[rep][d]; })); 56 | }); 57 | promKeys.forEach(p => { 58 | obj[p] = _.sum(promKeyObj[p].map(d => { return data[rep][d]; })); 59 | }); 60 | annodata[rep] = obj; 61 | } 62 | } 63 | for (let rep of repeats) { 64 | const repindex = Object.keys(data).indexOf(rep); 65 | if (repindex > -1) { 66 | const annototal = _.sum(annoKeys.map(a => {return annodata[rep][a] })); 67 | const promtotal = _.sum(promKeys.map(p => {return annodata[rep][p] })); 68 | if (percent) { 69 | for (let a of annoKeys) { annodata[rep][a] = ((annodata[rep][a]*100)/annototal).toFixed(2); } 70 | for (let a of promKeys) { annodata[rep][a] = ((annodata[rep][a]*100)/promtotal).toFixed(2); } 71 | } 72 | } 73 | } 74 | } 75 | 76 | 77 | return annodata; 78 | } 79 | 80 | 81 | const kmerAnnoDist = function(plotdata, freqdata, stacktype="annotation"){ 82 | const annoKeys = ['Exon', 'Intron', 'Intergenic', 'Genic', 'Promoter', 'Non-Promoter']; 83 | const repeatsObj = {} 84 | const kmerFreq = {} 85 | const kmers = []; 86 | repeats.forEach(d => { 87 | repeatsObj[d.kmer] = _.map(d.repeats, 'class'); 88 | kmerFreq[d.kmer] 89 | kmers.push(d.kmer); 90 | }) 91 | const annoKmerData = {}; 92 | for (const anno of annoKeys) { 93 | annoKmerData[anno] = {'Monomer': 0, 'Dimer': 0, 'Trimer': 0, 'Tetramer': 0, 'Pentamer': 0, 'Hexamer': 0}; 94 | } 95 | 96 | for (let kmer in repeatsObj) { 97 | const classes = repeatsObj[kmer]; 98 | const kmerData = repeatAnnoDist(plotdata, classes, false) 99 | const repFreqData = repeatFrequency(freqdata, classes, 'kmer', 'freq'); 100 | kmerFreq[kmer] = _.sumBy(repFreqData, 'value'); 101 | for (let rep in kmerData) { 102 | for (let anno in kmerData[rep]){ 103 | annoKmerData[anno][kmer] += kmerData[rep][anno] 104 | } 105 | } 106 | } 107 | 108 | const outdata = {data: {} }; 109 
| if (stacktype === 'kmer') { 110 | outdata.keys = kmers; 111 | for (const anno in annoKmerData) { 112 | outdata.data[anno] = []; 113 | for (const kmer of kmers) { 114 | // const total = kmerFreq[kmer]; 115 | const annototal = annoKmerData['Exon'][kmer] + annoKmerData['Intron'][kmer] + annoKmerData['Intergenic'][kmer] + annoKmerData['Genic'][kmer] 116 | if (annototal > 0) { 117 | outdata.data[anno].push(((annoKmerData[anno][kmer]*100)/annototal).toFixed(2)) 118 | } 119 | } 120 | } 121 | } 122 | else { 123 | for (const kmer of kmers) { 124 | outdata.data[kmer] = [] 125 | outdata.keys = [] 126 | for (const anno in annoKmerData) { 127 | const total = _.sum(Object.values(annoKmerData[anno])); 128 | if (total > 0) { 129 | outdata.keys.push(anno); 130 | outdata.data[kmer].push(((annoKmerData[anno][kmer]*100)/total).toFixed(2)); 131 | } 132 | } 133 | } 134 | } 135 | return outdata; 136 | } 137 | 138 | const tssHistData = function(data, bins, repeats, bin, datatype) { 139 | const values = _.range(-4975, 4976, 50); 140 | 141 | const stepSize = parseInt(bin/50); 142 | const steps = parseInt((values.length)/stepSize); 143 | 144 | const start = -5000 + parseInt(bin/2); 145 | const end = 5001 - parseInt(bin/2); 146 | const binCenters = _.range(start, end, bin); 147 | 148 | const y = {}; 149 | repeats.forEach(r => { 150 | y[r] = Array(binCenters.length).fill(0); 151 | const repindex = Object.keys(data).indexOf(r); 152 | if (repindex > -1) { 153 | for (let i=0; i { 167 | let val = (d)/total; 168 | return [binCenters[i], val.toFixed(3)]; 169 | }); 170 | } 171 | else { 172 | a = _.map(y[rep], (d, i) => { 173 | return [binCenters[i], 0]; 174 | }); 175 | } 176 | y[rep] = a; 177 | } 178 | else { 179 | let a = _.map(y[rep], (d, i) => { 180 | let val = d; return [binCenters[i], val]; 181 | }) 182 | y[rep] = a; 183 | } 184 | y[rep].push([5000, y[rep][y[rep].length - 1][1]]); 185 | y[rep].unshift([-5000, y[rep][0][1]]); 186 | } 187 | 188 | return { data: y }; 189 | } 190 | 191 | if 
(data.info.annoInfo) { 192 | 193 | const annostackbar_activeSelected = ['A', 'C']; //allRepClasses; 194 | let stack_group = false; 195 | 196 | $("#anno-stackbar-repeat-select").multiSelect({ 197 | selectableOptgroup: true, 198 | afterSelect: function(d){ d.forEach(function(e){ if (annostackbar_activeSelected.indexOf(e) == -1) { annostackbar_activeSelected.push(e) } })}, 199 | afterDeselect: function(d){ d.forEach(element => { annostackbar_activeSelected.splice(annostackbar_activeSelected.indexOf(element), 1); }); } 200 | }); 201 | 202 | const annostackbar_data = repeatAnnoDist(data.info.annoInfo.repAnno, allRepClasses, false); 203 | var annostackbar_options = { 204 | series: [], 205 | chart: { type: 'bar', stacked: true, stackType: '100%' }, 206 | plotOptions: {}, 207 | stroke: { width: 1, colors: ['#fff'] }, 208 | title: { text: 'Repeat Genomic distribution' }, 209 | xaxis: { }, 210 | tooltip: { y: { formatter: function (val) { return val } } }, 211 | fill: { opacity: 1 }, 212 | legend: { position: 'top', horizontalAlign: 'left', offsetX: 40 } 213 | }; 214 | var annostackbar_chart = new ApexCharts(document.querySelector("#anno-stackbar-plot-area"), annostackbar_options); 215 | annostackbar_chart.render(); 216 | 217 | const annoLabels = ['Exon', 'Intron', 'Genic', 'Intergenic'] 218 | const plot_annostackbar = function(){ 219 | const annostackbar_series = [] 220 | if (!(stack_group)) { 221 | annoLabels.forEach(function(a){ 222 | const d = _.map(annostackbar_activeSelected, o => { 223 | return parseFloat(annostackbar_data[o][a]) 224 | }) 225 | annostackbar_series.push({name: a, data: d}); 226 | }) 227 | annostackbar_chart.updateOptions({series: annostackbar_series, xaxis: {categories: annostackbar_activeSelected}}); 228 | } 229 | 230 | else { 231 | annoLabels.forEach(function(a){ 232 | const d = _.map(annostackbar_activeSelected, o => { 233 | return parseFloat(annostackbar_data[o][a]) 234 | }) 235 | annostackbar_series.push({name: a, data: [_.sum(d)]}); 236 | }) 237 
| annostackbar_chart.updateOptions({series: annostackbar_series, xaxis: {categories: ['All selected repeats']}}); 238 | } 239 | } 240 | 241 | $('.ui.checkbox.anno-stackbar').checkbox({ onChange: function(){ 242 | stack_group = !(stack_group); 243 | plot_annostackbar(); 244 | }}); 245 | 246 | $("#anno-stackbar-plot-button").click(function(){ plot_annostackbar(); }); 247 | plot_annostackbar(); 248 | 249 | 250 | const annoarea_activeSelected = ['A', 'C']; //allRepClasses; 251 | let binSize = 500; 252 | $("#anno-area-repeat-select").multiSelect({ 253 | selectableOptgroup: true, 254 | afterSelect: function(d){ d.forEach(function(e){ if (annoarea_activeSelected.indexOf(e) == -1) { annoarea_activeSelected.push(e) } })}, 255 | afterDeselect: function(d){ d.forEach(element => { annoarea_activeSelected.splice(annoarea_activeSelected.indexOf(element), 1); }); } 256 | }); 257 | 258 | $('.ui .dropdown.bin-size').dropdown({ 259 | values: [ 260 | {name: 100, value: 100}, 261 | {name: 200, value: 200}, 262 | {name: 500, value: 500, selected:true}, 263 | {name: 1000, value: 1000} 264 | ], 265 | onChange: function(value) { binSize = value; } 266 | }); 267 | 268 | var annoarea_options = { 269 | series: [], 270 | chart: { type: 'area'}, 271 | plotOptions: {}, 272 | stroke: { width: 1 }, 273 | title: { text: 'Average repeat distribution around TSS' }, 274 | xaxis: { type: 'numeric', min: -5000, max: 5000, tickAmount: 10000/binSize, axisTicks: { height: 8 }}, 275 | tooltip: { 276 | y: { formatter: function (val) { return val } }, 277 | x: { formatter: function (val) { return `${val-parseInt(binSize/2)}bp - ${val+parseInt(binSize/2)}bp` } } 278 | }, 279 | fill: { opacity: 1 }, 280 | legend: { position: 'top', horizontalAlign: 'left', offsetX: 40 } 281 | }; 282 | var annoarea_chart = new ApexCharts(document.querySelector("#anno-area-plot-area"), annoarea_options); 283 | annoarea_chart.render(); 284 | 285 | const plot_annoarea = function() { 286 | const annoarea_data = 
tssHistData(data.info.annoInfo.TSS_dist, data.info.annoInfo.TSS_histBinEdges, annoarea_activeSelected, binSize, 'density')['data']; 287 | const series = [] 288 | for (let rep of Object.keys(annoarea_data)) { series.push({ name: rep, data: _.map(annoarea_data[rep], o => { return [o[0], parseFloat(o[1])];})}) } 289 | annoarea_chart.updateOptions({ series: series, xaxis: { type: 'numeric', min: -5000, max: 5000, tickAmount: 20, axisTicks: { height: 8 }} }); 290 | } 291 | 292 | $("#anno-area-plot-button").click(function(){ plot_annoarea(); }); 293 | plot_annoarea(); 294 | 295 | } 296 | 297 | else { document.getElementById('anno-charts-main').style.display = 'none'; } -------------------------------------------------------------------------------- /PERF/lib/src/jquery.multi-select.min.js: -------------------------------------------------------------------------------- 1 | !function(e){"use strict";var t=function(t,s){this.options=s,this.$element=e(t),this.$container=e("
",{class:"ms-container"}),this.$selectableContainer=e("
",{class:"ms-selectable"}),this.$selectionContainer=e("
",{class:"ms-selection"}),this.$selectableUl=e("
    ",{class:"ms-list",tabindex:"-1"}),this.$selectionUl=e("
      ",{class:"ms-list",tabindex:"-1"}),this.scrollTo=0,this.elemsSelector="li:visible:not(.ms-optgroup-label,.ms-optgroup-container,."+s.disabledClass+")"};t.prototype={constructor:t,init:function(){var t=this,s=this.$element;if(0===s.next(".ms-container").length){s.css({position:"absolute",left:"-9999px"}),s.attr("id",s.attr("id")?s.attr("id"):Math.ceil(1e3*Math.random())+"multiselect"),this.$container.attr("id","ms-"+s.attr("id")),this.$container.addClass(t.options.cssClass),s.find("option").each(function(){t.generateLisFromOption(this)}),this.$selectionUl.find(".ms-optgroup-label").hide(),t.options.selectableHeader&&t.$selectableContainer.append(t.options.selectableHeader),t.$selectableContainer.append(t.$selectableUl),t.options.selectableFooter&&t.$selectableContainer.append(t.options.selectableFooter),t.options.selectionHeader&&t.$selectionContainer.append(t.options.selectionHeader),t.$selectionContainer.append(t.$selectionUl),t.options.selectionFooter&&t.$selectionContainer.append(t.options.selectionFooter),t.$container.append(t.$selectableContainer),t.$container.append(t.$selectionContainer),s.after(t.$container),t.activeMouse(t.$selectableUl),t.activeKeyboard(t.$selectableUl);var l=t.options.dblClick?"dblclick":"click";t.$selectableUl.on(l,".ms-elem-selectable",function(){t.select(e(this).data("ms-value"))}),t.$selectionUl.on(l,".ms-elem-selection",function(){t.deselect(e(this).data("ms-value"))}),t.activeMouse(t.$selectionUl),t.activeKeyboard(t.$selectionUl),s.on("focus",function(){t.$selectableUl.focus()})}var i=s.find("option:selected").map(function(){return e(this).val()}).get();t.select(i,"init"),"function"==typeof t.options.afterInit&&t.options.afterInit.call(this,this.$container)},generateLisFromOption:function(t,s,l){for(var 
i=this,n=i.$element,o="",a=e(t),r=0;r"+i.escapeHTML(a.text())+""),h=d.clone(),p=a.val(),f=i.sanitize(p);d.data("ms-value",p).addClass("ms-elem-selectable").attr("id",f+"-selectable"),h.data("ms-value",p).addClass("ms-elem-selection").attr("id",f+"-selection").hide(),(a.prop("disabled")||n.prop("disabled"))&&(h.addClass(i.options.disabledClass),d.addClass(i.options.disabledClass));var u=a.parent("optgroup");if(u.length>0){var m=u.attr("label"),v=i.sanitize(m),b=i.$selectableUl.find("#optgroup-selectable-"+v),g=i.$selectionUl.find("#optgroup-selection-"+v);if(0===b.length){var $='
      • '+m+"
      ";b=e('
    • '),g=e('
    • '),b.attr("id","optgroup-selectable-"+v),g.attr("id","optgroup-selection-"+v),b.append(e($)),g.append(e($)),i.options.selectableOptgroup&&(b.find(".ms-optgroup-label").on("click",function(){var t=u.children(":not(:selected, :disabled)").map(function(){return e(this).val()}).get();i.select(t)}),g.find(".ms-optgroup-label").on("click",function(){var t=u.children(":selected:not(:disabled)").map(function(){return e(this).val()}).get();i.deselect(t)})),i.$selectableUl.append(b),i.$selectionUl.append(g)}s=void 0===s?b.find("ul").children().length:s+1,d.insertAt(s,b.children()),h.insertAt(s,g.children())}else s=void 0===s?i.$selectableUl.children().length:s,d.insertAt(s,i.$selectableUl),h.insertAt(s,i.$selectionUl)},addOption:function(t){var s=this;void 0!==t.value&&null!==t.value&&(t=[t]),e.each(t,function(t,l){if(void 0!==l.value&&null!==l.value&&0===s.$element.find("option[value='"+l.value+"']").length){var i=e('"),n=void 0===l.nested?s.$element:e("optgroup[label='"+l.nested+"']");t=parseInt(void 0===l.index?n.children().length:l.index);l.optionClass&&i.addClass(l.optionClass),l.disabled&&i.prop("disabled",!0),i.insertAt(t,n),s.generateLisFromOption(i.get(0),t,l.nested)}})},escapeHTML:function(t){return e("
      ").text(t).html()},activeKeyboard:function(t){var s=this;t.on("focus",function(){e(this).addClass("ms-focus")}).on("blur",function(){e(this).removeClass("ms-focus")}).on("keydown",function(l){switch(l.which){case 40:case 38:return l.preventDefault(),l.stopPropagation(),void s.moveHighlight(e(this),38===l.which?-1:1);case 37:case 39:return l.preventDefault(),l.stopPropagation(),void s.switchList(t);case 9:if(s.$element.is("[tabindex]")){l.preventDefault();var i=parseInt(s.$element.attr("tabindex"),10);return i=l.shiftKey?i-1:i+1,void e('[tabindex="'+i+'"]').focus()}l.shiftKey&&s.$element.trigger("focus")}if(e.inArray(l.which,s.options.keySelect)>-1)return l.preventDefault(),l.stopPropagation(),void s.selectHighlighted(t)})},moveHighlight:function(e,t){var s=e.find(this.elemsSelector),l=s.filter(".ms-hover"),i=null,n=s.first().outerHeight(),o=e.height();this.$container.prop("id");if(s.removeClass("ms-hover"),1===t){if(0===(i=l.nextAll(this.elemsSelector).first()).length)if((r=l.parent()).hasClass("ms-optgroup")){var a=r.parent().next(":visible");i=a.length>0?a.find(this.elemsSelector).first():s.first()}else i=s.first()}else if(-1===t){var r;if(0===(i=l.prevAll(this.elemsSelector).first()).length)if((r=l.parent()).hasClass("ms-optgroup")){var c=r.parent().prev(":visible");i=c.length>0?c.find(this.elemsSelector).last():s.last()}else i=s.last()}if(i.length>0){i.addClass("ms-hover");var d=e.scrollTop()+i.position().top-o/2+n/2;e.scrollTop(d)}},selectHighlighted:function(e){var t=e.find(this.elemsSelector),s=t.filter(".ms-hover").first();s.length>0&&(e.parent().hasClass("ms-selectable")?this.select(s.data("ms-value")):this.deselect(s.data("ms-value")),t.removeClass("ms-hover"))},switchList:function(e){e.blur(),this.$container.find(this.elemsSelector).removeClass("ms-hover"),e.parent().hasClass("ms-selectable")?this.$selectionUl.focus():this.$selectableUl.focus()},activeMouse:function(t){var 
s=this;this.$container.on("mouseenter",s.elemsSelector,function(){e(this).parents(".ms-container").find(s.elemsSelector).removeClass("ms-hover"),e(this).addClass("ms-hover")}),this.$container.on("mouseleave",s.elemsSelector,function(){e(this).parents(".ms-container").find(s.elemsSelector).removeClass("ms-hover")})},refresh:function(){this.destroy(),this.$element.multiSelect(this.options)},destroy:function(){e("#ms-"+this.$element.attr("id")).remove(),this.$element.off("focus"),this.$element.css("position","").css("left",""),this.$element.removeData("multiselect")},select:function(t,s){"string"==typeof t&&(t=[t]);var l=this,i=this.$element,n=e.map(t,function(e){return l.sanitize(e)}),o=this.$selectableUl.find("#"+n.join("-selectable, #")+"-selectable").filter(":not(."+l.options.disabledClass+")"),a=this.$selectionUl.find("#"+n.join("-selection, #")+"-selection").filter(":not(."+l.options.disabledClass+")"),r=i.find("option:not(:disabled)").filter(function(){return e.inArray(this.value,t)>-1});if("init"===s&&(o=this.$selectableUl.find("#"+n.join("-selectable, #")+"-selectable"),a=this.$selectionUl.find("#"+n.join("-selection, #")+"-selection")),o.length>0){o.addClass("ms-selected").hide(),a.addClass("ms-selected").show(),r.prop("selected",!0),l.$container.find(l.elemsSelector).removeClass("ms-hover");var c=l.$selectableUl.children(".ms-optgroup-container");if(c.length>0)c.each(function(){var t=e(this).find(".ms-elem-selectable");t.length===t.filter(".ms-selected").length&&e(this).find(".ms-optgroup-label").hide()}),l.$selectionUl.children(".ms-optgroup-container").each(function(){e(this).find(".ms-elem-selection").filter(".ms-selected").length>0&&e(this).find(".ms-optgroup-label").show()});else if(l.options.keepOrder&&"init"!==s){var d=l.$selectionUl.find(".ms-selected");d.length>1&&d.last().get(0)!=a.get(0)&&a.insertAfter(d.last())}"init"!==s&&(i.trigger("change"),"function"==typeof 
l.options.afterSelect&&l.options.afterSelect.call(this,t))}},deselect:function(t){"string"==typeof t&&(t=[t]);var s=this,l=this.$element,i=e.map(t,function(e){return s.sanitize(e)}),n=this.$selectableUl.find("#"+i.join("-selectable, #")+"-selectable"),o=this.$selectionUl.find("#"+i.join("-selection, #")+"-selection").filter(".ms-selected").filter(":not(."+s.options.disabledClass+")"),a=l.find("option").filter(function(){return e.inArray(this.value,t)>-1});if(o.length>0){n.removeClass("ms-selected").show(),o.removeClass("ms-selected").hide(),a.prop("selected",!1),s.$container.find(s.elemsSelector).removeClass("ms-hover");var r=s.$selectableUl.children(".ms-optgroup-container");if(r.length>0)r.each(function(){e(this).find(".ms-elem-selectable").filter(":not(.ms-selected)").length>0&&e(this).find(".ms-optgroup-label").show()}),s.$selectionUl.children(".ms-optgroup-container").each(function(){0===e(this).find(".ms-elem-selection").filter(".ms-selected").length&&e(this).find(".ms-optgroup-label").hide()});l.trigger("change"),"function"==typeof s.options.afterDeselect&&s.options.afterDeselect.call(this,t)}},select_all:function(){var t=this.$element,s=t.val();if(t.find('option:not(":disabled")').prop("selected",!0),this.$selectableUl.find(".ms-elem-selectable").filter(":not(."+this.options.disabledClass+")").addClass("ms-selected").hide(),this.$selectionUl.find(".ms-optgroup-label").show(),this.$selectableUl.find(".ms-optgroup-label").hide(),this.$selectionUl.find(".ms-elem-selection").filter(":not(."+this.options.disabledClass+")").addClass("ms-selected").show(),this.$selectionUl.focus(),t.trigger("change"),"function"==typeof this.options.afterSelect){var l=e.grep(t.val(),function(t){return e.inArray(t,s)<0});this.options.afterSelect.call(this,l)}},deselect_all:function(){var 
e=this.$element,t=e.val();e.find("option").prop("selected",!1),this.$selectableUl.find(".ms-elem-selectable").removeClass("ms-selected").show(),this.$selectionUl.find(".ms-optgroup-label").hide(),this.$selectableUl.find(".ms-optgroup-label").show(),this.$selectionUl.find(".ms-elem-selection").removeClass("ms-selected").hide(),this.$selectableUl.focus(),e.trigger("change"),"function"==typeof this.options.afterDeselect&&this.options.afterDeselect.call(this,t)},sanitize:function(e){var t,s=0;if(0==e.length)return s;var l;for(t=0,l=e.length;t { return parseInt(d); }); 67 | let frequency = 0; let bases = 0; 68 | for (let l in lengths) { 69 | l = lengths[l]; frequency += _.sum(plotData[e][l]); bases += _.sum(plotData[e][l]) * l; 70 | } 71 | barData[e] = [bases, frequency]; 72 | } else { barData[e] = [0, 0]; } 73 | }) 74 | let bar_dataType = 1; 75 | let bar_sortSelected = []; 76 | let bar_sortOrder = 1; 77 | let bar_numReps = 10; 78 | let bar_repSelected = []; 79 | let sorted_barKeys = _.sortBy(Object.keys(barData), k => { return barData[k][bar_dataType]; }); 80 | let bar_activeSelected = []; 81 | 82 | const bar_options = { 83 | chart: { type: 'bar' }, 84 | plotOptions: { bar: { horizontal: false, columnwidth: '55%' } }, 85 | series: [{ data: [] }], 86 | dataLabels: { enabled: false }, 87 | yaxis: { 'title': { 'text': 'Frequency', 'style': { 'fontSize': '16px', 'font-weight': 'bold' } }}, 88 | xaxis: { categories: [], 'title': { 'text': 'Repeat Class', 'style': { 'fontSize': '16px', 'font-weight': 'bold' } }}, 89 | title: { text: 'Repeat Frequency', align: 'left' } 90 | } 91 | const bar_chart = new ApexCharts(document.querySelector('#bar-plot-area'), bar_options); 92 | bar_chart.render(); 93 | const plotBar = function(keys){ 94 | const values = []; 95 | keys.forEach(function(e){ values.push(barData[e][bar_dataType]); }); 96 | const name = (bar_dataType == 1) ? 
'Frequency' : 'Bases'; 97 | bar_chart.updateOptions({series: [{'name': name, data: values}], yaxis: { title: { 'text': name, 'style': { 'fontSize': '16px', 'font-weight': 'bold' } } }, xaxis: {categories: keys}, animate: true}) 98 | } 99 | 100 | $('#bar-numRep').change(function(){ bar_numReps = this.value; }); 101 | $('.ui .dropdown.sort-order').dropdown({ 102 | values: [{name: 'top', value: 1, selected:true}, {name: 'bottom', value: 0}], 103 | onChange: function(value) { bar_sortOrder = value; } 104 | }); 105 | $('#bar-sortPlot-button').click(function(){ 106 | if (bar_sortOrder == 1) { bar_sortSelected = sorted_barKeys.slice(sorted_barKeys.length - bar_numReps); bar_sortSelected.reverse();} 107 | else { bar_sortSelected = sorted_barKeys.slice(0, bar_numReps); } 108 | bar_activeSelected = bar_sortSelected; plotBar(bar_activeSelected); 109 | }) 110 | $('#bar-sortPlot-button').trigger("click") 111 | 112 | $("#bar-repeat-select").multiSelect({ 113 | selectableOptgroup: true, 114 | afterSelect: function(d){ d.forEach(function(e){ if (bar_repSelected.indexOf(e) == -1) { bar_repSelected.push(e) } })}, 115 | afterDeselect: function(d){ d.forEach(element => { bar_repSelected.splice(bar_repSelected.indexOf(element), 1); }); } 116 | }); 117 | $('#bar-repPlot-button').click(function(){ 118 | bar_repSelected = _.sortBy(bar_repSelected, o => {return allRepClasses.indexOf(o)}); 119 | bar_activeSelected = bar_repSelected; plotBar(bar_activeSelected); 120 | }) 121 | 122 | // Once the data type is selected the global variable bar_dataType changes 123 | // also the sorted_barKeys is updated based on the datatype. 
124 | $('.ui.checkbox.bar').checkbox({ onChange: function(val){ 125 | bar_dataType = this.value; 126 | sorted_barKeys = _.sortBy(Object.keys(barData), k => { return barData[k][bar_dataType]; }); 127 | plotBar(bar_activeSelected) 128 | }}); 129 | 130 | $('#asort-alpha').click(function(){ bar_activeSelected = bar_activeSelected.sort(function(a, b){ return allRepClasses.indexOf(a) - allRepClasses.indexOf(b) }); plotBar(bar_activeSelected); }) 131 | $('#dsort-alpha').click(function(){ bar_activeSelected = bar_activeSelected.sort(function(a, b){ return allRepClasses.indexOf(a) - allRepClasses.indexOf(b) }); bar_activeSelected.reverse(); plotBar(bar_activeSelected); }) 132 | $('#asort-num').click(function(){ bar_activeSelected = _.sortBy( bar_activeSelected, k => { return barData[k][bar_dataType];}); plotBar(bar_activeSelected); }) 133 | $('#dsort-num').click(function(){ bar_activeSelected = _.sortBy( bar_activeSelected, k => { return barData[k][bar_dataType];}); bar_activeSelected.reverse(); plotBar(bar_activeSelected); }) 134 | 135 | 136 | /* 137 | Pie graph 138 | - For pie chart as we deal with frequency and bases data of repeat classes we continue using barData for data retrieval. 139 | 140 | - pie_activeSelected is the list of Repeat classes which are considered to be plotted. 141 | - As there is only way to select repeats there will be no updating the repeats. 142 | 143 | The dataflow for the barChart is as follows. 144 | - There are two plot buttons dedicated individually to either plot with data based sorted keys or desired set of keys. 145 | - Based on which plot button is pressed that repeat set gets placed in the bar_activeSelected variable. 146 | - And then subsequently the data for the bar_activeSelected keys is plotted. 147 | - The data type and the sort customisation of the plots only deal with the bar_activeSelected keys. 
148 | 149 | */ 150 | 151 | let pie_activeSelected = allRepClasses; 152 | let pie_dataType = 1; 153 | let pie_group = true; 154 | $("#pie-kmer-toggle").checkbox({ 155 | onChecked: function(){ pie_group = true; plotPie(pie_activeSelected) }, 156 | onUnchecked: function(){ pie_group= false; plotPie(pie_activeSelected) } 157 | }); 158 | $("#pie-repeat-select").multiSelect({ 159 | selectableOptgroup: true, 160 | afterSelect: function(d){ d.forEach(function(e){ if (pie_activeSelected.indexOf(e) == -1) { pie_activeSelected.push(e) } })}, 161 | afterDeselect: function(d){ d.forEach(element => { pie_activeSelected.splice(pie_activeSelected.indexOf(element), 1); }); } 162 | }); 163 | $(".ui.checkbox.pie.radio.pie-data-type").checkbox({ onChange: function(){ pie_dataType = this.value; plotPie(pie_activeSelected) }}); 164 | const pie_options = { 165 | chart: { type: 'pie' }, 166 | labels: ['Monomers', 'Dimers', 'Trimers', 'Tetramers', 'Pentamers', 'Hexamers'], 167 | series: [10, 10, 10, 10, 10, 10], 168 | responsive: [{ breakpoint: 480 }], 169 | colors: ["#3366cc", "#dc3912", "#ff9900", "#109618", "#990099", "#0099c6", "#dd4477", "#66aa00", "#b82e2e", "#316395", "#994499", "#22aa99", "#aaaa11", "#6633cc", "#e67300", "#8b0707", "#651067", "#329262", "#5574a6", "#3b3eac"], 170 | // theme: { monochrome: { enabled: true }} 171 | } 172 | let pie_chart = new ApexCharts( document.querySelector("#pie-plot-area"), pie_options ); 173 | pie_chart.render(); 174 | 175 | const plotPie = function(keys){ 176 | let values = []; 177 | keys = keys.sort(function(a, b){ return allRepClasses.indexOf(a) - allRepClasses.indexOf(b) }) 178 | keys.forEach(function(e){ values.push(barData[e][pie_dataType]); }); 179 | if (pie_group == true) { 180 | values = []; 181 | let group_keys = []; 182 | const kmer_lengths = _.uniq(_.map(keys, e => { return e.length; })).sort(); 183 | kmer_lengths.forEach( e => { group_keys.push(numPrefixObj[e - 1]); values.push(0); }); 184 | keys.forEach(e => { 
values[kmer_lengths.indexOf(e.length)] += barData[e][pie_dataType]; }) 185 | keys = group_keys; 186 | } 187 | pie_chart.updateOptions({labels:keys, series: values, animate: true}) 188 | } 189 | 190 | $("#pie-plot-button").click(function(){ 191 | pie_activeSelected = _.sortBy(pie_activeSelected, o => {return allRepClasses.indexOf(o)}); 192 | plotPie(pie_activeSelected); 193 | }); 194 | 195 | // Initialising Pie Chart 196 | plotPie(pie_activeSelected); 197 | 198 | /* 199 | Line graph 200 | - We retrieve the data for line plots from plotData. 201 | 202 | - Line plot has options to 203 | - Select repeats which will be saved in line_activeSelected variable 204 | - Length range in which it has to plot the data 205 | 206 | - Required data 207 | - The minimum length/units criteria is retrieved from the repInfo. 208 | 209 | The dataflow for the lineChart is as follows. 210 | - The data flow is pretty simple from the options selected the relevant data is retrieved and plotted 211 | 212 | */ 213 | 214 | let minLength = data.info.repInfo.minLength; 215 | let minUnits = data.info.repInfo.minUnits; 216 | let minRange = 12; 217 | let maxRange = 50; 218 | let line_dataType = 1; 219 | let line_activeSelected = ['A', 'C']; 220 | 221 | 222 | $('.ui .dropdown.units').dropdown({ 223 | values: [{name: 'length', value: 1, selected:true}, {name: 'units', value: 0}], 224 | onChange: function(value) { line_dataType = value;} 225 | }); 226 | $("#line-repeat-select").multiSelect({ 227 | selectableOptgroup: true, 228 | afterSelect: function(d){ d.forEach(function(e){ if (line_activeSelected.indexOf(e) == -1) { line_activeSelected.push(e) } }) }, 229 | afterDeselect: function(d){ d.forEach(element => { line_activeSelected.splice(line_activeSelected.indexOf(element), 1); }); } 230 | }); 231 | $('#line-min-len').change(function(){ minRange = parseInt(this.value); plotLine(line_activeSelected); }); 232 | $('#line-max-len').change(function(){ maxRange = parseInt(this.value); 
plotLine(line_activeSelected); }); 233 | const line_options = { 234 | chart: { type: 'line', zoom: { enabled: false }}, 235 | dataLabels: { enabled: false }, 236 | stroke: { curve: 'straight', width: 2 }, 237 | series: [], 238 | title: { text: 'Repeat sequence length(bp) vs Abundance', align: 'left' }, 239 | grid: { row: { colors: ['#f3f3f3', 'transparent'], opacity: 0.5 } }, 240 | tooltip: { x: { 241 | formatter: function(val) { return `Length: ${val}bp` } 242 | }}, 243 | markers: {size: 0}, 244 | yaxis: { title: { text: 'Frequency', 245 | style: { 'fontSize': '16px', 'font-weight': 'bold' } }}, 246 | xaxis: { title: { text: 'Length (bp)', 247 | style: { 'fontSize': '16px', 'font-weight': 'bold' } }, 248 | // labels: { format: '%d' }, 249 | tickAmount: parseInt((maxRange - minRange)/2) }, 250 | legend: { position: 'top' } 251 | } 252 | const line_chart = new ApexCharts( document.querySelector("#line-plot-area"), line_options ); 253 | line_chart.render(); 254 | 255 | const plotLine = function(keys) { 256 | const xValues = _.range(minRange, maxRange + 1); 257 | const series = []; 258 | keys.forEach(function(key){ 259 | const data = []; 260 | if (line_dataType == 0) { 261 | for (let i = minRange; i <= maxRange; i++) { 262 | let val = 0; 263 | for ( let j = 0; j < key.length; j++) { const repLen = (i*key.length) + j; const v = (plotData[key][repLen]) ? _.sum(plotData[key][repLen]) : 0; val += v; } 264 | data.push(val); 265 | } 266 | } 267 | else { for (let i = minRange; i <= maxRange; i++){ const val = (plotData[key][i]) ? 
_.sum(plotData[key][i]) : 0; data.push(val); } } 268 | series.push({'name': key, 'data': data}); 269 | }) 270 | line_chart.updateOptions({series: series, xaxis: {categories: xValues}}) 271 | } 272 | $('#line-plot-button').click(function(){ plotLine(line_activeSelected); }) 273 | 274 | // Initialising line chart 275 | plotLine(line_activeSelected); 276 | -------------------------------------------------------------------------------- /PERF/lib/src/main_fastq.js: -------------------------------------------------------------------------------- 1 | /* 2 | The "main.js" contains core JS component supporting the PERF-Analysis module. 3 | 4 | This web application is developed with Semantic-ui framework. 5 | The charts are built using Apex-Chart js charting library. 6 | 7 | All the data for the report is derived from analyse_data.js 8 | data = { info: {seqInfo: {}, repInfo: {}, plotInfo: {}} } 9 | 10 | plotInfo is a dictionary with key as the repeat class and value as a dictionary 11 | plotInfo: { REPEAT_CLASS: { LENGTH: FREQUENCY } } 12 | 13 | */ 14 | 15 | // Updating report data 16 | for (const key in data.info.seqInfo){$(`.value.${key}`).html(data.info.seqInfo[key])}; 17 | for (const key in data.info.repInfo){if (key != 'lenFrequency') { $(`.value.${key}`).html(data.info.repInfo[key]); }}; 18 | 19 | const menuLayout = function(){ 20 | const w = window.innerWidth; 21 | if (w < 800) { 22 | const navmenu = document.getElementById('navmenu'); 23 | navmenu.classList.remove('vertical'); 24 | navmenu.parentElement.style.width = '100%'; 25 | document.getElementById('content-display').style.width = '100%'; 26 | } 27 | 28 | else { 29 | const navmenu = document.getElementById('navmenu') 30 | navmenu.classList.add('vertical'); 31 | navmenu.parentElement.style.width = '5%'; 32 | document.getElementById('content-display').style.width = '95%'; 33 | } 34 | } 35 | window.onresize = function(){ menuLayout(); } 36 | 37 | const numPrefixObj = 
["Monomer","Dimer","Trimer","Tetramer","Pentamer","Hexamer","Heptamer","Octamer","Nonamer","Decamer","Undecamer","Dodecamer","Tridecamer","Tetradecamer","Pentadecamer","Hexadecamer","Heptadecamer","Octadecamer","Nonadecamer","Icosamer","Uncosamer","Docosamer","Tricosamer","Tetracosamer","Pentacosamer","Hexacosamer","Heptacosamer","Octacosamer","Nonacosamer","Triacontamer","Untriacontamer","Dotriacontamer","Tritriacontamer","Tetratriacontamer","Pentatriacontamer","Hexatriacontamer","Heptatriacontamer","Octatriacontamer","Nonatriacontamer","Tetracontamer","Untetracontamer","Dotetracontamer","Tritetracontamer","Tetratetracontamer","Pentatetracontamer","Hexatetracontamer","Heptatetracontamer","Octatetracontamer","Nonatetracontamer","Pentacontamer"] 38 | const plotData = data.info.repInfo.lenFrequency; 39 | const allRepClasses = data.info.repInfo.allRepClasses; 40 | 41 | $('.ui.dropdown').dropdown(); 42 | $('.chart .item').tab(); 43 | $('.anno-chart .item').tab(); 44 | $('.ui .units').dropdown({values: [{name: 'length', value: 1, selected:true}, {name: 'units', value: 0}]}); 45 | 46 | 47 | // const read_bar_options = { 48 | // chart: { type: 'bar' }, 49 | // plotOptions: { bar: { horizontal: false, columnwidth: '55%' } }, 50 | // series: [{ data: [] }], 51 | // dataLabels: { enabled: false }, 52 | // yaxis: { 'title': { 'text': 'Number of reads', 'style': { 'fontSize': '16px', 'font-weight': 'bold' } }}, 53 | // xaxis: { categories: [], 'title': { 'text': 'Read length(bp)', 'style': { 'fontSize': '16px', 'font-weight': 'bold' } }}, 54 | // title: { text: 'Read length distribution', align: 'left' } 55 | // } 56 | // const read_bar_chart = new ApexCharts(document.querySelector('#read-len-plot'), read_bar_options); 57 | // read_chart.render(); 58 | 59 | 60 | /* 61 | Bar graph 62 | - For bar graph we curate the data in barData with Repeat class as the key and [bases, frequency] as the value. 
63 | 64 | - bar_activeSelected is the list of Repeat classes which are considered to be plotted. 65 | - can be selected by sort selection which is bar_sortSelected. 66 | - or by repeat selection dropdown stored in bar_repSelected. 67 | - sorted_barKeys stores the all repClasses sorted based on the datatype selected. 68 | 69 | The dataflow for the barChart is as follows. 70 | - There are two plot buttons dedicated individually to either plot with data based sorted keys or desired set of keys. 71 | - Based on which plot button is pressed that repeat set gets placed in the bar_activeSelected variable. 72 | - And then subsequently the data for the bar_activeSelected keys is plotted. 73 | - The data type and the sort customisation of the plots only deal with the bar_activeSelected keys. 74 | 75 | */ 76 | 77 | 78 | const barData = {}; 79 | allRepClasses.forEach(function(e){ 80 | if ((Object.keys(plotData).indexOf(e) != -1) && (plotData[e] != 0)) { 81 | const lengths = _.map(Object.keys(plotData[e]), d => { return parseInt(d); }); 82 | let frequency = 0; let bases = 0; let reads = 0; 83 | for (let l in lengths) { 84 | l = lengths[l]; frequency += _.sum(plotData[e][l]); bases += _.sum(plotData[e][l]) * l; 85 | } 86 | reads = parseInt(data.info.repInfo.repClassInfo[e]['reads']); 87 | barData[e] = [bases, frequency, reads]; 88 | } else { barData[e] = [0, 0, 0]; } 89 | }) 90 | let bar_dataType = 1; 91 | const bar_dataTypes = ['Bases', 'Frequency', 'Reads']; 92 | let bar_sortSelected = []; 93 | let bar_sortOrder = 1; 94 | let bar_numReps = 10; 95 | let bar_repSelected = []; 96 | let sorted_barKeys = _.sortBy(Object.keys(barData), k => { return barData[k][bar_dataType]; }); 97 | let bar_activeSelected = []; 98 | 99 | const bar_options = { 100 | chart: { type: 'bar' }, 101 | plotOptions: { bar: { horizontal: false, columnwidth: '55%' } }, 102 | series: [{ data: [] }], 103 | dataLabels: { enabled: false }, 104 | yaxis: { 'title': { 'text': 'Frequency', 'style': { 'fontSize': 
'16px', 'font-weight': 'bold' } }}, 105 | xaxis: { categories: [], 'title': { 'text': 'Repeat Class', 'style': { 'fontSize': '16px', 'font-weight': 'bold' } }}, 106 | title: { text: 'Repeat Frequency', align: 'left' } 107 | } 108 | const bar_chart = new ApexCharts(document.querySelector('#bar-plot-area'), bar_options); 109 | bar_chart.render(); 110 | const plotBar = function(keys){ 111 | const values = []; 112 | keys.forEach(function(e){ values.push(barData[e][bar_dataType]); }); 113 | const name = bar_dataTypes[bar_dataType]; 114 | bar_chart.updateOptions({series: [{'name': name, data: values}], yaxis: { title: { 'text': name, 'style': { 'fontSize': '16px', 'font-weight': 'bold' } } }, xaxis: {categories: keys}, animate: true}) 115 | } 116 | 117 | $('#bar-numRep').change(function(){ bar_numReps = this.value; }); 118 | $('.ui .dropdown.sort-order').dropdown({ 119 | values: [{name: 'top', value: 1, selected:true}, {name: 'bottom', value: 0}], 120 | onChange: function(value) { bar_sortOrder = value; } 121 | }); 122 | $('#bar-sortPlot-button').click(function(){ 123 | if (bar_sortOrder == 1) { bar_sortSelected = sorted_barKeys.slice(sorted_barKeys.length - bar_numReps); bar_sortSelected.reverse();} 124 | else { bar_sortSelected = sorted_barKeys.slice(0, bar_numReps); } 125 | bar_activeSelected = bar_sortSelected; plotBar(bar_activeSelected); 126 | }) 127 | $('#bar-sortPlot-button').trigger("click") 128 | 129 | $("#bar-repeat-select").multiSelect({ 130 | selectableOptgroup: true, 131 | afterSelect: function(d){ d.forEach(function(e){ if (bar_repSelected.indexOf(e) == -1) { bar_repSelected.push(e) } })}, 132 | afterDeselect: function(d){ d.forEach(element => { bar_repSelected.splice(bar_repSelected.indexOf(element), 1); }); } 133 | }); 134 | $('#bar-repPlot-button').click(function(){ 135 | bar_repSelected = _.sortBy(bar_repSelected, o => {return allRepClasses.indexOf(o)}); 136 | bar_activeSelected = bar_repSelected; plotBar(bar_activeSelected); 137 | }) 138 | 139 | // 
Once the data type is selected the global variable bar_dataType changes 140 | // also the sorted_barKeys is updated based on the datatype. 141 | $('.ui.checkbox.bar').checkbox({ onChange: function(val){ 142 | bar_dataType = this.value; 143 | sorted_barKeys = _.sortBy(Object.keys(barData), k => { return barData[k][bar_dataType]; }); 144 | plotBar(bar_activeSelected) 145 | }}); 146 | 147 | $('#asort-alpha').click(function(){ bar_activeSelected = bar_activeSelected.sort(function(a, b){ return allRepClasses.indexOf(a) - allRepClasses.indexOf(b) }); plotBar(bar_activeSelected); }) 148 | $('#dsort-alpha').click(function(){ bar_activeSelected = bar_activeSelected.sort(function(a, b){ return allRepClasses.indexOf(a) - allRepClasses.indexOf(b) }); bar_activeSelected.reverse(); plotBar(bar_activeSelected); }) 149 | $('#asort-num').click(function(){ bar_activeSelected = _.sortBy( bar_activeSelected, k => { return barData[k][bar_dataType];}); plotBar(bar_activeSelected); }) 150 | $('#dsort-num').click(function(){ bar_activeSelected = _.sortBy( bar_activeSelected, k => { return barData[k][bar_dataType];}); bar_activeSelected.reverse(); plotBar(bar_activeSelected); }) 151 | 152 | 153 | /* 154 | Pie graph 155 | - For pie chart as we deal with frequency and bases data of repeat classes we continue using barData for data retrieval. 156 | 157 | - pie_activeSelected is the list of Repeat classes which are considered to be plotted. 158 | - As there is only way to select repeats there will be no updating the repeats. 159 | 160 | The dataflow for the barChart is as follows. 161 | - There are two plot buttons dedicated individually to either plot with data based sorted keys or desired set of keys. 162 | - Based on which plot button is pressed that repeat set gets placed in the bar_activeSelected variable. 163 | - And then subsequently the data for the bar_activeSelected keys is plotted. 
164 | - The data type and the sort customisation of the plots only deal with the bar_activeSelected keys. 165 | 166 | */ 167 | 168 | let pie_activeSelected = allRepClasses; 169 | let pie_dataType = 1; 170 | let pie_group = true; 171 | $("#pie-kmer-toggle").checkbox({ 172 | onChecked: function(){ pie_group = true; plotPie(pie_activeSelected) }, 173 | onUnchecked: function(){ pie_group= false; plotPie(pie_activeSelected) } 174 | }); 175 | $("#pie-repeat-select").multiSelect({ 176 | selectableOptgroup: true, 177 | afterSelect: function(d){ d.forEach(function(e){ if (pie_activeSelected.indexOf(e) == -1) { pie_activeSelected.push(e) } })}, 178 | afterDeselect: function(d){ d.forEach(element => { pie_activeSelected.splice(pie_activeSelected.indexOf(element), 1); }); } 179 | }); 180 | $(".ui.checkbox.pie.radio.pie-data-type").checkbox({ onChange: function(){ pie_dataType = this.value; console.log(pie_dataType); plotPie(pie_activeSelected) }}); 181 | const pie_options = { 182 | chart: { type: 'pie' }, 183 | labels: ['Monomers', 'Dimers', 'Trimers', 'Tetramers', 'Pentamers', 'Hexamers'], 184 | series: [10, 10, 10, 10, 10, 10], 185 | responsive: [{ breakpoint: 480 }], 186 | colors: ["#3366cc", "#dc3912", "#ff9900", "#109618", "#990099", "#0099c6", "#dd4477", "#66aa00", "#b82e2e", "#316395", "#994499", "#22aa99", "#aaaa11", "#6633cc", "#e67300", "#8b0707", "#651067", "#329262", "#5574a6", "#3b3eac"], 187 | // theme: { monochrome: { enabled: true }} 188 | } 189 | let pie_chart = new ApexCharts( document.querySelector("#pie-plot-area"), pie_options ); 190 | pie_chart.render(); 191 | 192 | const plotPie = function(keys){ 193 | let values = []; 194 | keys = keys.sort(function(a, b){ return allRepClasses.indexOf(a) - allRepClasses.indexOf(b) }) 195 | keys.forEach(function(e){ values.push(barData[e][pie_dataType]); }); 196 | if (pie_group == true) { 197 | values = []; 198 | let group_keys = []; 199 | const kmer_lengths = _.uniq(_.map(keys, e => { return e.length; })).sort(); 200 | 
kmer_lengths.forEach( e => { group_keys.push(numPrefixObj[e - 1]); values.push(0); }); 201 | keys.forEach(e => { values[kmer_lengths.indexOf(e.length)] += barData[e][pie_dataType]; }) 202 | keys = group_keys; 203 | } 204 | pie_chart.updateOptions({labels:keys, series: values, animate: true}) 205 | } 206 | 207 | $("#pie-plot-button").click(function(){ 208 | pie_activeSelected = _.sortBy(pie_activeSelected, o => {return allRepClasses.indexOf(o)}); 209 | plotPie(pie_activeSelected); 210 | }); 211 | 212 | // Initialising Pie Chart 213 | plotPie(pie_activeSelected); 214 | 215 | /* 216 | Line graph 217 | - We retrieve the data for line plots from plotData. 218 | 219 | - Line plot has options to 220 | - Select repeats which will be saved in line_activeSelected variable 221 | - Length range in which it has to plot the data 222 | 223 | - Required data 224 | - The minimum length/units criteria is retrieved from the repInfo. 225 | 226 | The dataflow for the lineChart is as follows. 227 | - The data flow is pretty simple from the options selected the relevant data is retrieved and plotted 228 | 229 | */ 230 | 231 | let minLength = data.info.repInfo.minLength; 232 | let minUnits = data.info.repInfo.minUnits; 233 | let minRange = 12; 234 | let maxRange = 50; 235 | let line_dataType = 1; 236 | let line_activeSelected = ['A', 'C']; 237 | 238 | 239 | $('.ui .dropdown.units').dropdown({ 240 | values: [{name: 'length', value: 1, selected:true}, {name: 'units', value: 0}], 241 | onChange: function(value) { line_dataType = value;} 242 | }); 243 | $("#line-repeat-select").multiSelect({ 244 | selectableOptgroup: true, 245 | afterSelect: function(d){ d.forEach(function(e){ if (line_activeSelected.indexOf(e) == -1) { line_activeSelected.push(e) } }) }, 246 | afterDeselect: function(d){ d.forEach(element => { line_activeSelected.splice(line_activeSelected.indexOf(element), 1); }); } 247 | }); 248 | $('#line-min-len').change(function(){ minRange = parseInt(this.value); 
plotLine(line_activeSelected); }); 249 | $('#line-max-len').change(function(){ maxRange = parseInt(this.value); plotLine(line_activeSelected); }); 250 | const line_options = { 251 | chart: { type: 'line', zoom: { enabled: false }}, 252 | dataLabels: { enabled: false }, 253 | stroke: { curve: 'straight', width: 2 }, 254 | series: [], 255 | title: { text: 'Repeat sequence length(bp) vs Abundance', align: 'left' }, 256 | grid: { row: { colors: ['#f3f3f3', 'transparent'], opacity: 0.5 } }, 257 | tooltip: { x: { 258 | formatter: function(val) { return `Length: ${val}bp` } 259 | }}, 260 | markers: {size: 0}, 261 | yaxis: { title: { text: 'Frequency', 262 | style: { 'fontSize': '16px', 'font-weight': 'bold' } }}, 263 | xaxis: { title: { text: 'Length (bp)', 264 | style: { 'fontSize': '16px', 'font-weight': 'bold' } }, 265 | // labels: { format: '%d' }, 266 | tickAmount: parseInt((maxRange - minRange)/2) }, 267 | legend: { position: 'top' } 268 | } 269 | const line_chart = new ApexCharts( document.querySelector("#line-plot-area"), line_options ); 270 | line_chart.render(); 271 | 272 | const plotLine = function(keys) { 273 | const xValues = _.range(minRange, maxRange + 1); 274 | const series = []; 275 | keys.forEach(function(key){ 276 | const data = []; 277 | if (line_dataType == 0) { 278 | for (let i = minRange; i <= maxRange; i++) { 279 | let val = 0; 280 | for ( let j = 0; j < key.length; j++) { const repLen = (i*key.length) + j; const v = (plotData[key][repLen]) ? _.sum(plotData[key][repLen]) : 0; val += v; } 281 | data.push(val); 282 | } 283 | } 284 | else { for (let i = minRange; i <= maxRange; i++){ const val = (plotData[key][i]) ? 
_.sum(plotData[key][i]) : 0; data.push(val); } } 285 | series.push({'name': key, 'data': data}); 286 | }) 287 | line_chart.updateOptions({series: series, xaxis: {categories: xValues}}) 288 | } 289 | $('#line-plot-button').click(function(){ plotLine(line_activeSelected); }) 290 | 291 | // Initialising line chart 292 | plotLine(line_activeSelected); 293 | -------------------------------------------------------------------------------- /PERF/lib/src/tables_fasta.js: -------------------------------------------------------------------------------- 1 | /* 2 | The "tables.js" contains JS code which updates the data for 3 | tables present in the PERF-analysis module. 4 | 5 | This web application is developed with Semantic-ui framework. 6 | The charts are built using Apex-Chart js charting library. 7 | 8 | All the data for the report is derived from analyse_data.js 9 | data = {info: {genomeInfo: {}, repInfo: {}, plotInfo: {}}} 10 | 11 | */ 12 | 13 | 14 | 15 | const updateSummaryTableData = function(tableId, tableData) { 16 | const tableDOM = document.getElementById(tableId); 17 | 18 | const table = document.createElement('table'); 19 | table.className = "ui sortable celled table"; 20 | const tableHead = document.createElement('thead') 21 | const tableHeadRow = document.createElement('tr'); 22 | const header = ['Repeat Class', 'Frequency', '% Frequency', 'Bases', '% Bases'] 23 | header.forEach(function(e){ const headCell = document.createElement('th'); headCell.innerHTML = e; tableHeadRow.appendChild(headCell); }) 24 | tableHead.appendChild(tableHeadRow); 25 | 26 | const tableBody = document.createElement('tbody'); 27 | const totalRepBases = _.sum(_.map(Object.keys(tableData), o => { return tableData[o][0]; })); 28 | const totalRepFreq = _.sum(_.map(Object.keys(tableData), o => { return tableData[o][1]; })); 29 | const totals = [totalRepBases, totalRepFreq] 30 | for (let rep in allRepClasses) { 31 | rep = allRepClasses[rep]; 32 | const row = document.createElement('tr'); 33 
| const rep_cell = document.createElement('td'); 34 | rep_cell.innerHTML = rep; row.appendChild(rep_cell); 35 | 36 | const rowData = []; 37 | tableData[rep].forEach(function(d, i){ rowData.push(d); rowData.push(((d/totals[i])*100).toFixed(3)); }); 38 | rowData.forEach(function(e){ const cell = document.createElement('td'); cell.innerHTML = e; row.appendChild(cell); }) 39 | 40 | tableBody.appendChild(row); 41 | } 42 | 43 | table.appendChild(tableHead); 44 | table.appendChild(tableBody); 45 | tableDOM.appendChild(table); 46 | } 47 | 48 | updateSummaryTableData('rep-summary-table', barData); 49 | 50 | const updateLongestRepeatsTableData = function(tableId, tableData) { 51 | const tableDOM = document.getElementById(tableId); 52 | const table = document.createElement('table'); 53 | table.className = "ui sortable celled table"; 54 | const tableHead = document.createElement('thead') 55 | const tableHeadRow = document.createElement('tr'); 56 | const header = ['Sequence id', 'Start', 'Stop', 'Repeat Class', 'Repeat length', 'Strand', 'Units', 'Actual Repeat']; 57 | header.forEach(function(e){ const headCell = document.createElement('th'); headCell.innerHTML = e; tableHeadRow.appendChild(headCell); }) 58 | tableHead.appendChild(tableHeadRow); 59 | 60 | const tableBody = document.createElement('tbody'); 61 | for (let d in tableData) { 62 | d = tableData[d]; 63 | const row = document.createElement('tr'); 64 | const rowData = ["seq", "start", "end", "repClass", "repLength", "repOri", "repUnit", "actualRep"]; 65 | rowData.forEach(function(e){ const cell = document.createElement('td'); cell.innerHTML = d[e]; row.appendChild(cell); }) 66 | tableBody.appendChild(row); 67 | } 68 | 69 | table.appendChild(tableHead); 70 | table.appendChild(tableBody); 71 | tableDOM.appendChild(table); 72 | } 73 | 74 | updateLongestRepeatsTableData('longest-repeats-table', data.info.repInfo.longestRepeats); 75 | updateLongestRepeatsTableData('mostunits-repeats-table', 
data.info.repInfo.mostRepeatUnits); -------------------------------------------------------------------------------- /PERF/lib/src/tables_fastq.js: -------------------------------------------------------------------------------- 1 | /* 2 | The "tables.js" contains JS code which updates the data for 3 | tables present in the PERF-analysis module. 4 | 5 | This web application is developed with Semantic-ui framework. 6 | The charts are built using Apex-Chart js charting library. 7 | 8 | All the data for the report is derived from analyse_data.js 9 | data = {info: {genomeInfo: {}, repInfo: {}, plotInfo: {}}} 10 | 11 | */ 12 | 13 | 14 | 15 | const updateSummaryTableData = function(tableId, tableData) { 16 | const tableDOM = document.getElementById(tableId); 17 | 18 | const table = document.createElement('table'); 19 | table.className = "ui sortable celled table"; 20 | const tableHead = document.createElement('thead') 21 | const tableHeadRow = document.createElement('tr'); 22 | const header = ['Repeat Class', 'Frequency', 'Frequency per million', '% Frequency', 'Reads', 'Reads per million', '% Reads', 'Bases'] 23 | header.forEach(function(e){ const headCell = document.createElement('th'); headCell.innerHTML = e; tableHeadRow.appendChild(headCell); }) 24 | tableHead.appendChild(tableHeadRow); 25 | 26 | const tableBody = document.createElement('tbody'); 27 | 28 | // const totalRepBases = _.sum(_.map(Object.keys(tableData), o => { return tableData[o][0]; })); 29 | const totalRepFreq = tableData.totalRepFreq; 30 | const totalRepReads = tableData.totalRepReads; 31 | const totals = [totalRepReads, totalRepFreq] 32 | const cell_keys = ['instances'] 33 | for (let rep in allRepClasses) { 34 | rep = allRepClasses[rep]; 35 | const row = document.createElement('tr'); 36 | const rep_cell = document.createElement('td'); 37 | rep_cell.innerHTML = rep; row.appendChild(rep_cell); 38 | 39 | let rowData = []; 40 | 41 | if (rep in tableData.repClassInfo) { 42 | const repInfo = 
tableData.repClassInfo[rep]; 43 | rowData.push(repInfo['instances']); 44 | rowData.push(repInfo['instances_norm']); 45 | rowData.push(((repInfo['instances']/totalRepFreq)*100).toFixed(3)); 46 | 47 | rowData.push(repInfo['reads']); 48 | rowData.push(repInfo['reads_norm']); 49 | rowData.push(((repInfo['reads']/totalRepReads)*100).toFixed(3)); 50 | 51 | rowData.push(repInfo['bases']); 52 | } 53 | else { rowData = Array(7).fill(0); } 54 | rowData.forEach(function(e){ const cell = document.createElement('td'); cell.innerHTML = e; row.appendChild(cell); }) 55 | 56 | tableBody.appendChild(row); 57 | } 58 | 59 | table.appendChild(tableHead); 60 | table.appendChild(tableBody); 61 | tableDOM.appendChild(table); 62 | } 63 | 64 | updateSummaryTableData('rep-summary-table', data.info.repInfo); -------------------------------------------------------------------------------- /PERF/lib/styles/apexcharts.min.css: -------------------------------------------------------------------------------- 1 | .apexcharts-canvas{position:relative;user-select:none}.apexcharts-canvas ::-webkit-scrollbar{-webkit-appearance:none;width:6px}.apexcharts-canvas ::-webkit-scrollbar-thumb{border-radius:4px;background-color:rgba(0,0,0,.5);box-shadow:0 0 1px rgba(255,255,255,.5);-webkit-box-shadow:0 0 1px rgba(255,255,255,.5)}.apexcharts-inner{position:relative}.legend-mouseover-inactive{transition:.15s ease all;opacity:.2}.apexcharts-series-collapsed{opacity:0}.apexcharts-gridline,.apexcharts-text{pointer-events:none}.apexcharts-tooltip{border-radius:5px;box-shadow:2px 2px 6px -4px #999;cursor:default;font-size:14px;left:62px;opacity:0;pointer-events:none;position:absolute;top:20px;overflow:hidden;white-space:nowrap;z-index:12;transition:.15s ease all}.apexcharts-tooltip.light{border:1px solid #e3e3e3;background:rgba(255,255,255,.96)}.apexcharts-tooltip.dark{color:#fff;background:rgba(30,30,30,.8)}.apexcharts-tooltip *{font-family:inherit}.apexcharts-area-series 
.apexcharts-area,.apexcharts-line,.apexcharts-tooltip .apexcharts-marker{pointer-events:none}.apexcharts-tooltip.active{opacity:1;transition:.15s ease all}.apexcharts-tooltip-title{padding:6px;font-size:15px;margin-bottom:4px}.apexcharts-tooltip.light .apexcharts-tooltip-title{background:#eceff1;border-bottom:1px solid #ddd}.apexcharts-tooltip.dark .apexcharts-tooltip-title{background:rgba(0,0,0,.7);border-bottom:1px solid #222}.apexcharts-tooltip-text-value,.apexcharts-tooltip-text-z-value{display:inline-block;font-weight:600;margin-left:5px}.apexcharts-tooltip-text-z-label:empty,.apexcharts-tooltip-text-z-value:empty{display:none}.apexcharts-tooltip-text-value,.apexcharts-tooltip-text-z-value{font-weight:600}.apexcharts-tooltip-marker{width:12px;height:12px;position:relative;top:0;margin-right:10px;border-radius:50%}.apexcharts-tooltip-series-group{padding:0 10px;display:none;text-align:left;justify-content:left;align-items:center}.apexcharts-tooltip-series-group.active .apexcharts-tooltip-marker{opacity:1}.apexcharts-tooltip-series-group.active,.apexcharts-tooltip-series-group:last-child{padding-bottom:4px}.apexcharts-tooltip-y-group{padding:6px 0 5px}.apexcharts-tooltip-candlestick{padding:4px 8px}.apexcharts-tooltip-candlestick>div{margin:4px 0}.apexcharts-tooltip-candlestick span.value{font-weight:700}.apexcharts-xaxistooltip{opacity:0;padding:9px 10px;pointer-events:none;color:#373d3f;font-size:13px;text-align:center;border-radius:2px;position:absolute;z-index:10;background:#eceff1;border:1px solid #90a4ae;transition:.15s ease all}.apexcharts-xaxistooltip:after,.apexcharts-xaxistooltip:before{left:50%;border:solid transparent;content:" 
";height:0;width:0;position:absolute;pointer-events:none}.apexcharts-xaxistooltip:after{border-color:rgba(236,239,241,0);border-width:6px;margin-left:-6px}.apexcharts-xaxistooltip:before{border-color:rgba(144,164,174,0);border-width:7px;margin-left:-7px}.apexcharts-xaxistooltip-bottom:after,.apexcharts-xaxistooltip-bottom:before{bottom:100%}.apexcharts-xaxistooltip-bottom:after{border-bottom-color:#eceff1}.apexcharts-xaxistooltip-bottom:before{border-bottom-color:#90a4ae}.apexcharts-xaxistooltip-top:after,.apexcharts-xaxistooltip-top:before{top:100%}.apexcharts-xaxistooltip-top:after{border-top-color:#eceff1}.apexcharts-xaxistooltip-top:before{border-top-color:#90a4ae}.apexcharts-xaxistooltip.active{opacity:1;transition:.15s ease all}.apexcharts-yaxistooltip{opacity:0;padding:4px 10px;pointer-events:none;color:#373d3f;font-size:13px;text-align:center;border-radius:2px;position:absolute;z-index:10;background:#eceff1;border:1px solid #90a4ae}.apexcharts-yaxistooltip:after,.apexcharts-yaxistooltip:before{top:50%;border:solid transparent;content:" ";height:0;width:0;position:absolute;pointer-events:none}.apexcharts-yaxistooltip:after{border-color:rgba(236,239,241,0);border-width:6px;margin-top:-6px}.apexcharts-yaxistooltip:before{border-color:rgba(144,164,174,0);border-width:7px;margin-top:-7px}.apexcharts-yaxistooltip-left:after,.apexcharts-yaxistooltip-left:before{left:100%}.apexcharts-yaxistooltip-left:after{border-left-color:#eceff1}.apexcharts-yaxistooltip-left:before{border-left-color:#90a4ae}.apexcharts-yaxistooltip-right:after,.apexcharts-yaxistooltip-right:before{right:100%}.apexcharts-yaxistooltip-right:after{border-right-color:#eceff1}.apexcharts-yaxistooltip-right:before{border-right-color:#90a4ae}.apexcharts-yaxistooltip.active{opacity:1}.apexcharts-xcrosshairs,.apexcharts-ycrosshairs{pointer-events:none;opacity:0;transition:.15s ease all}.apexcharts-xcrosshairs.active,.apexcharts-ycrosshairs.active{opacity:1;transition:.15s ease 
all}.apexcharts-ycrosshairs-hidden{opacity:0}.apexcharts-zoom-rect{pointer-events:none}.apexcharts-selection-rect{cursor:move}.svg_select_points,.svg_select_points_rot{opacity:0;visibility:hidden}.svg_select_points_l,.svg_select_points_r{cursor:ew-resize;opacity:1;visibility:visible;fill:#888}.apexcharts-canvas.zoomable .hovering-zoom{cursor:crosshair}.apexcharts-canvas.zoomable .hovering-pan{cursor:move}.apexcharts-xaxis,.apexcharts-yaxis{pointer-events:none}.apexcharts-menu-icon,.apexcharts-pan-icon,.apexcharts-reset-zoom-icon,.apexcharts-selection-icon,.apexcharts-zoom-icon,.apexcharts-zoom-in-icon,.apexcharts-zoom-out-icon{cursor:pointer;width:20px;height:20px;text-align:center}.apexcharts-menu-icon svg,.apexcharts-reset-zoom-icon svg,.apexcharts-zoom-icon svg,.apexcharts-zoom-in-icon svg,.apexcharts-zoom-out-icon svg{fill:#6e8192}.apexcharts-selection-icon svg{fill:#444;transform:scale(.76)}.apexcharts-reset-zoom-icon.selected svg,.apexcharts-selection-icon.selected svg,.apexcharts-zoom-icon.selected svg{fill:#008ffb}.apexcharts-menu-icon:hover svg,.apexcharts-reset-zoom-icon:hover svg,.apexcharts-selection-icon:not(.selected):hover svg,.apexcharts-zoom-icon:not(.selected):hover svg,.apexcharts-zoom-in-icon:hover svg,.apexcharts-zoom-out-icon:hover svg{fill:#333}.apexcharts-menu-icon,.apexcharts-selection-icon{position:relative}.apexcharts-reset-zoom-icon{margin-left:5px}.apexcharts-menu-icon,.apexcharts-reset-zoom-icon,.apexcharts-zoom-icon{transform:scale(.85)}.apexcharts-zoom-in-icon,.apexcharts-zoom-out-icon{transform:scale(.7)}.apexcharts-zoom-out-icon{margin-right:3px}.apexcharts-pan-icon{transform:scale(.62);position:relative;left:1px;top:0}.apexcharts-pan-icon svg{fill:#fff;stroke:#6e8192;stroke-width:2}.apexcharts-pan-icon.selected svg{stroke:#008ffb}.apexcharts-pan-icon:not(.selected):hover svg{stroke:#333}.apexcharts-toolbar{position:absolute;z-index:11;top:0;right:3px;max-width:176px;text-align:right;border-radius:3px;padding:0 6px 2px 
6px;display:flex;justify-content:space-between;align-items:center}.apexcharts-toolbar svg{pointer-events:none}.apexcharts-menu{background:#fff;position:absolute;top:100%;border:1px solid #ddd;border-radius:3px;padding:3px;right:10px;opacity:0;min-width:110px;transition:.15s ease all;pointer-events:none}.apexcharts-menu.open{opacity:1;pointer-events:all;transition:.15s ease all}.apexcharts-menu-item{padding:6px 7px;font-size:12px;cursor:pointer}.apexcharts-menu-item:hover{background:#eee}@media screen and (min-width:768px){.apexcharts-canvas:hover .apexcharts-toolbar{opacity:1}}.apexcharts-datalabel.hidden{opacity:0}.apexcharts-datalabel,.apexcharts-datalabel-label,.apexcharts-datalabel-value,.apexcharts-pie-label{cursor:default;pointer-events:none}.apexcharts-pie-label-delay{opacity:0;animation-name:opaque;animation-duration:.3s;animation-fill-mode:forwards;animation-timing-function:ease}.apexcharts-canvas .hidden{opacity:0}.apexcharts-hide .apexcharts-series-points{opacity:0}.apexcharts-area-series .apexcharts-series-markers .apexcharts-marker.no-pointer-events,.apexcharts-line-series .apexcharts-series-markers .apexcharts-marker.no-pointer-events,.apexcharts-radar-series path,.apexcharts-radar-series polygon{pointer-events:none}.apexcharts-marker{transition:.15s ease all}@keyframes opaque{0%{opacity:0}100%{opacity:1}} -------------------------------------------------------------------------------- /PERF/lib/styles/main.css: -------------------------------------------------------------------------------- 1 | #navbar { 2 | position: fixed; 3 | width: 50px; 4 | height: 50px; 5 | top: 100px; right: 0px; bottom: 0px; left: 20px; 6 | z-index: 100; 7 | opacity: 0.5 8 | } 9 | 10 | #navbar:hover { 11 | opacity: 1; 12 | } 13 | 14 | #content { 15 | min-height: 100px; 16 | } 17 | 18 | .ui.grid{ 19 | padding: 0 !important; 20 | } 21 | 22 | .pushable.segment{ 23 | margin: 0 !important; 24 | } 25 | 26 | .sumstat-segment { 27 | box-shadow: none !important; 28 | padding: 8px 
!important; 29 | margin: 0px !important; 30 | border: white !important; 31 | } 32 | 33 | 34 | 35 | .ms-selectable { 36 | float: none !important; 37 | width: 100% !important; 38 | } 39 | 40 | .ms-container { 41 | width: 100% !important; 42 | } 43 | 44 | .ms-selection { 45 | margin-top: 10px; 46 | float: none !important; 47 | width: 100% !important; 48 | } 49 | 50 | .button { 51 | margin-top: 5px !important 52 | } 53 | 54 | .plot-options { 55 | min-width: 200px !important; 56 | } -------------------------------------------------------------------------------- /PERF/lib/styles/multi-select.min.css: -------------------------------------------------------------------------------- 1 | .ms-container{background:transparent url(../img/switch.png) no-repeat 50% 50%;width:370px}.ms-container:after{content:".";display:block;height:0;line-height:0;font-size:0;clear:both;min-height:0;visibility:hidden}.ms-container .ms-selectable,.ms-container .ms-selection{background:#fff;color:#555;float:left;width:45%}.ms-container .ms-selection{float:right}.ms-container .ms-list{-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,.075);-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,.075);box-shadow:inset 0 1px 1px rgba(0,0,0,.075);-webkit-transition:border linear .2s,box-shadow linear .2s;-moz-transition:border linear .2s,box-shadow linear .2s;-ms-transition:border linear .2s,box-shadow linear .2s;-o-transition:border linear .2s,box-shadow linear .2s;transition:border linear .2s,box-shadow linear .2s;border:1px solid #ccc;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px;position:relative;height:200px;padding:0;overflow-y:auto}.ms-container .ms-list.ms-focus{border-color:rgba(82,168,236,.8);-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,.075),0 0 8px rgba(82,168,236,.6);-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,.075),0 0 8px rgba(82,168,236,.6);box-shadow:inset 0 1px 1px rgba(0,0,0,.075),0 0 8px rgba(82,168,236,.6);outline:0}.ms-container 
ul{margin:0;list-style-type:none;padding:0}.ms-container .ms-optgroup-container{width:100%}.ms-container .ms-optgroup-label{margin:0;padding:5px 0 0 5px;cursor:pointer;color:#999}.ms-container .ms-selectable li.ms-elem-selectable,.ms-container .ms-selection li.ms-elem-selection{border-bottom:1px #eee solid;padding:2px 10px;color:#555;font-size:14px}.ms-container .ms-selectable li.ms-hover,.ms-container .ms-selection li.ms-hover{cursor:pointer;color:#fff;text-decoration:none;background-color:#08c}.ms-container .ms-selectable li.disabled,.ms-container .ms-selection li.disabled{background-color:#eee;color:#aaa;cursor:text} -------------------------------------------------------------------------------- /PERF/lib/template_fastq.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | PERF analysis report 7 | 8 | 11 | 14 | 17 | 20 | 23 | 24 | 25 | 26 | 27 |
      28 |
      29 |
      30 |

      PERF analysis report 31 | 32 | 33 | 34 | PERF 35 | 36 | 37 |

      38 |
      39 |
      40 |
      41 | 42 | 43 | 44 |
      45 | 52 |
      53 |
      54 | 55 | 56 |
      57 |
      Summary
      58 | 59 | 60 |
      61 |
      62 |
      63 |
      64 |
      Sequence Info
      65 |
      66 |
      67 |
      68 |
      Homo sapiens.fna
      69 |
      File Name
      70 |
      71 |
      72 |
      73 |
      74 |
      3.24Gb
      75 |
      Total reads
      76 |
      77 |
      78 |
      79 |
      80 |
      45.4
      81 |
      Read length range
      82 | 83 |
      84 |
      85 |
      86 |
      87 |
      545
      88 |
      Total bases
      89 |
      90 |
      91 |
      92 |
      93 |
      94 | 95 |
      96 |
      97 |
      Repeat Info
      98 |
      99 |
      100 |
      101 |
      102 |
      103 |
      4,631,324
      104 |
      Total repeats
      105 |
      106 |
      107 |
      108 |
      109 |
      69.09 Mb
      110 |
      Reads with repeats
      111 |
      112 |
      113 |
      114 |
      115 |
      2.15 %
      116 |
      Percent reads with repeats
      117 |
      118 |
      119 |
      120 |
      121 |
      122 |
      123 |
      501/501
      124 |
      Repeat classes
      125 |
      126 |
      127 |
      128 |
      129 |
      1428
      130 |
      Repeats per million reads
      131 |
      132 |
      133 |
      134 |
      135 |
      21467
      136 |
      Percent repeat bases
      137 |
      138 |
      139 |
      140 |
      141 |
      142 |
      143 |
      144 |
      145 |
      146 |
      147 | 148 | 155 | 156 | 157 |
      158 |
      Summary Table
      159 | 160 |
      161 |
      162 |
      163 |
      164 | 165 | 166 | 167 |
      168 | 169 |
      170 | 173 | 178 |
      179 | 180 | 181 | 182 |
      183 |
      184 |
      185 |
      186 |
      187 |
      Options
      188 | 189 | 190 |
      191 | Show 192 | 198 |
      199 |
      200 | 201 | 202 | 203 |
      204 | 207 |
      208 | 209 | 210 | 211 |
      212 |
      213 | 214 |
      215 |
      216 | 217 | 218 |
      219 |
      220 |
      221 |
      222 | 223 | 224 |
      225 |
      226 |
      227 |
      228 | 229 | 230 |
      231 |
      232 |
      233 |
      234 | 235 | 236 | 237 | 238 | 239 | 240 |
      241 |
      242 | 243 |
      244 |
      245 |
      246 |
      247 |
      248 | 249 |
      250 |
      251 |
      252 |
      253 |
      Options
      254 | 255 |
      256 |
      257 | 258 | 259 |
      260 |
      261 | 262 | 263 | 264 |
      Select Repeats:
      265 |
      266 | 269 |
      270 | 271 | 274 | 275 | 276 | 277 |
      278 |
      279 | 280 |
      281 |
      282 | 283 | 284 |
      285 |
      286 |
      287 |
      288 | 289 | 290 |
      291 |
      292 |
      293 |
      294 | 295 | 296 |
      297 |
      298 |
      299 |
      300 | 301 |
      302 |
      303 |
      304 |
      305 | 306 |
      307 |
      308 |
      309 |
      310 | 311 |
      312 |
      313 |
      314 |
      315 |
      Options
      316 |
      317 |
      318 | to 319 |
      320 | 326 | range 327 |
      328 | 329 |
      330 | 333 |
      334 | 337 | 338 |
      339 |
      340 |
      341 |
      342 | 343 |
      344 |
      345 |
      346 |
      347 |
      348 |
      349 | 350 | 351 |
      352 | 353 |
      354 | 357 | 361 |
      362 | 363 | 364 | 365 |
      366 |
      367 |
      368 |
      369 |
      370 |
      Options
      371 | 372 |
      373 |
      374 | 375 | 376 |
      377 |
      378 | 379 |
      380 | 383 |
      384 | 385 | 386 |
      387 |
      388 |
      389 |
      390 | 391 |
      392 |
      393 |
      394 |
      395 | 396 |
      397 |
      398 |
      399 |
      400 |
      Options
      401 | 402 |
      403 | Bin size: 404 | 410 |
      411 | 412 |
      413 | 416 |
      417 | 418 |
      419 |
      420 | 421 |
      422 |
      423 |
      424 |
      425 |
      426 | 427 |
      428 | 429 |
      430 |
      431 |
      432 |
      433 | 434 | 437 | 440 | 443 | 446 | 449 | 452 | 455 | 458 | -------------------------------------------------------------------------------- /PERF/rep_utils.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # pylint: disable=C0111, C0301 3 | 4 | from __future__ import print_function, division 5 | from itertools import product 6 | from Bio import SeqIO 7 | from tqdm import tqdm 8 | import sys, gzip, os 9 | from os import remove as del_file 10 | import multiprocessing as multi 11 | 12 | if sys.version_info.major == 2: 13 | from utils import rev_comp, rawcharCount, getGC, get_targetids 14 | from analyse import analyse_fasta 15 | from annotation import annotate 16 | elif sys.version_info.major == 3: 17 | from .utils import rev_comp, rawcharCount, getGC, get_targetids 18 | from .analyse import analyse_fasta 19 | from .annotation import annotate 20 | 21 | def num_factors(num): 22 | factors = [] 23 | for i in range(1,num): 24 | if num%i == 0: factors.append(i) 25 | return factors 26 | 27 | def expand_repeat(string, size): 28 | """Expands a motif to highest motif size, used for checking duplicates""" 29 | return_string = '' 30 | i = 0 31 | while len(return_string) < size: 32 | return_string += string[i] 33 | i += 1 34 | if i >= len(string): 35 | i = 0 36 | return return_string 37 | 38 | 39 | def get_cycles(string): 40 | cycles = [] 41 | for i in range(len(string)): 42 | cycles.append(string[i:] + string[:i]) 43 | return cycles 44 | 45 | 46 | def generate_repeats(sizes, atomic): 47 | """Generates all possible motifs for repeats in a given length range""" 48 | generated_repeats = [] 49 | alphabet = ['A', 'C', 'G', 'T'] 50 | expanded_set = set() 51 | repeat_set = set() 52 | sizes.sort() 53 | min_size = sizes[0] 54 | max_size = sizes[-1] 55 | non_atomic_repeats = dict() 56 | for s in range(1, max_size): 57 | if s not in sizes: 58 | non_atomic_repeats[s] = set() 59 | if atomic: 60 | for combination in 
product(alphabet, repeat=s): 61 | repeat = ''.join(combination) 62 | expanded = expand_repeat(repeat, max_size) 63 | non_atomic_repeats[s].add(expanded) 64 | for i in sizes: 65 | factors = num_factors(i) 66 | for combination in product(alphabet, repeat=i): 67 | repeat = ''.join(combination) 68 | repeat_revcomp = rev_comp(repeat) 69 | expanded = expand_repeat(repeat, max_size) 70 | atomic_check = False 71 | if atomic: 72 | for factor in factors: 73 | if factor not in sizes and expanded in non_atomic_repeats[factor]: 74 | atomic_check = True 75 | if expanded in expanded_set: 76 | continue 77 | elif atomic and atomic_check: 78 | continue 79 | else: 80 | repeat_cycles = get_cycles(repeat) 81 | for cycle in repeat_cycles: 82 | strand = '+' 83 | string = expand_repeat(cycle, max_size) 84 | expanded_set.add(string) 85 | if cycle not in repeat_set: 86 | repeat_set.add(cycle) 87 | if len(cycle) >= min_size: 88 | generated_repeats.append('\t'.join([cycle, repeat, str(len(cycle)), strand])) 89 | if repeat_revcomp == repeat: 90 | continue 91 | repeat_cycles = get_cycles(repeat_revcomp) 92 | for cycle in repeat_cycles: 93 | strand = '-' 94 | string = expand_repeat(cycle, max_size) 95 | expanded_set.add(string) 96 | if cycle not in repeat_set: 97 | repeat_set.add(cycle) 98 | if len(cycle) >= min_size: 99 | generated_repeats.append('\t'.join([cycle, repeat, str(len(cycle)), strand])) 100 | return generated_repeats 101 | 102 | 103 | def build_rep_set(repeat_file, length_cutoff=None, unit_cutoff=None): 104 | """ 105 | Outputs the repeats info dictionary used by the get_ssrs function. 106 | Takes list of repeat motifs from repeats file(output by generate_repeats function) as input. 107 | Creates a dictionary with expanded repeat as the key and (class, motif_length, strand) as values. 108 | Works either by "length_cutoff=" or by "unit_cutoff=" arguments. 
109 | """ 110 | repeats_out = dict() 111 | motif_fallback = dict() 112 | motif_cutoff = dict() 113 | repeat_lengths = set() 114 | if length_cutoff is not None: 115 | for line in repeat_file: 116 | motif_dict = dict() 117 | L = line.strip().split('\t') 118 | motif = L[0] 119 | motif_length = int(L[2]) 120 | motif = expand_repeat(motif, length_cutoff) 121 | motif_dict['class'] = L[1] 122 | motif_dict['motif_length'] = motif_length 123 | motif_dict['strand'] = L[3] 124 | repeats_out[motif] = motif_dict 125 | repeats_out['cutoff'] = [length_cutoff] 126 | 127 | elif unit_cutoff is not None: 128 | cutoffs = set() 129 | for line in repeat_file: 130 | motif_dict = dict() 131 | L = line.strip().split('\t') 132 | motif = L[0] 133 | motif_length = int(L[2]) 134 | motif = motif*unit_cutoff[motif_length] 135 | cutoffs.add(len(motif)) 136 | motif_dict['class'] = L[1] 137 | motif_dict['motif_length'] = motif_length 138 | motif_dict['strand'] = L[3] 139 | repeats_out[motif] = motif_dict 140 | repeats_out['cutoff'] = sorted(list(cutoffs)) 141 | 142 | return repeats_out 143 | 144 | 145 | 146 | def get_ssrs(seq_record, repeats_info, out): 147 | """Native function that identifies repeats in fasta files.""" 148 | if type(out) == str: 149 | out_file = open(out, 'w') 150 | else: 151 | out_file = out 152 | length_cutoffs = repeats_info['cutoff'] 153 | input_seq = str(seq_record.seq).upper() 154 | input_seq_length = len(input_seq) 155 | for length_cutoff in length_cutoffs: 156 | fallback = length_cutoff - 1 157 | sub_start = 0 # substring start 158 | sub_stop = sub_start + length_cutoff # substring stop 159 | while sub_stop <= input_seq_length: 160 | sub_stop = sub_start + length_cutoff 161 | sub_seq = input_seq[sub_start:sub_stop] 162 | if sub_seq in repeats_info: 163 | match = True 164 | repeat_data = repeats_info[sub_seq] 165 | motif_length = repeat_data['motif_length'] 166 | rep_class = repeat_data['class'] 167 | strand = repeat_data['strand'] 168 | offset = length_cutoff % 
motif_length 169 | repeat_seq = input_seq[sub_start+offset:sub_start+offset+motif_length] 170 | i = 0 171 | while match: 172 | j = sub_stop 173 | if sub_stop >= input_seq_length: 174 | match = False 175 | match_length = sub_stop - sub_start 176 | num_units = int(match_length/motif_length) 177 | print(seq_record.id, sub_start, sub_stop, rep_class, match_length, strand, num_units, sub_seq[:motif_length], sep="\t", file=out_file) 178 | sub_start = sub_stop - fallback 179 | elif input_seq[j] == repeat_seq[i]: 180 | sub_stop += 1 181 | i += 1 182 | if i >= motif_length: 183 | i = 0 184 | else: 185 | match = False 186 | match_length = sub_stop - sub_start 187 | num_units = int(match_length/motif_length) 188 | print(seq_record.id, sub_start, sub_stop, rep_class, match_length, strand, num_units, sub_seq[:motif_length], sep="\t", file=out_file) 189 | sub_start = sub_stop - fallback 190 | else: 191 | sub_start += 1 192 | if type(out) == str: 193 | out_file.close() 194 | 195 | 196 | def fasta_ssrs(args, repeats_info): 197 | 198 | if args.input.endswith('gz'): 199 | handle = gzip.open(args.input, 'rt') 200 | else: 201 | handle = open(args.input, 'r') 202 | 203 | seq_nucleotide_info = dict() 204 | num_records = rawcharCount(args.input, '>') 205 | records = SeqIO.parse(handle, 'fasta') 206 | target_ids = get_targetids(args.filter_seq_ids, args.target_seq_ids) 207 | 208 | if args.threads > 1: 209 | i = 0 210 | pool = multi.Pool(processes=args.threads) 211 | for record in records: 212 | out_name = './temp_%s.tsv' %(i) 213 | i += 1 214 | if (args.info or args.analyse)==True: 215 | for a in record.seq.upper(): 216 | try: seq_nucleotide_info[a] += 1 217 | except KeyError: seq_nucleotide_info[a] = 1 218 | if args.min_seq_length <= len(record.seq) <= args.max_seq_length and record.id in target_ids: 219 | pool.apply_async(get_ssrs, (record, repeats_info, out_name,)) 220 | 221 | pool.close() 222 | pool.join() 223 | 224 | # Concat all the output files into one. 
225 | temp_outs = tqdm(range(num_records), total=num_records) 226 | for o in temp_outs: 227 | name = './temp_%s.tsv' %(o) 228 | temp_outs.set_description("Concatenating file: %d " %(o)) 229 | with open(name, 'r') as fh: 230 | for line in fh: 231 | print(line.strip(), file=args.output) 232 | del_file(name) 233 | 234 | elif args.threads == 1: 235 | records = tqdm(records, total=num_records) 236 | for record in records: 237 | records.set_description("Processing %s" %(record.id)) 238 | if (args.info or args.analyse)==True: 239 | for a in record.seq.upper(): 240 | try: seq_nucleotide_info[a] += 1 241 | except KeyError: seq_nucleotide_info[a] = 1 242 | if args.min_seq_length <= len(record.seq) <= args.max_seq_length and record.id in target_ids: 243 | get_ssrs(record, repeats_info, args.output) 244 | 245 | if (args.info or args.analyse)==True: 246 | line = "#File_name: %s\n#Total_sequences: %d\n#Total_bases: %d\n#GC: %f"\ 247 | %(os.path.basename(args.input), num_records, sum(seq_nucleotide_info.values()),\ 248 | round(getGC(seq_nucleotide_info), 2)) 249 | print(line, file=args.output) 250 | args.output.close() 251 | 252 | if args.annotate is not None: 253 | annotate(args) 254 | 255 | # Specifies to generate a HTML report 256 | if args.analyse: 257 | analyse_fasta(args) 258 | -------------------------------------------------------------------------------- /PERF/utils.py: -------------------------------------------------------------------------------- 1 | #! 
#! /usr/bin/env python
# pylint: disable=C0111, C0301

from __future__ import print_function, division
import sys
import gzip
from itertools import takewhile, repeat, islice
from collections import Counter, defaultdict

try:
    from tqdm import tqdm
except ImportError:
    # Nothing in this module uses tqdm; the name is only kept importable from
    # here, so its absence must not prevent the helpers below from loading.
    tqdm = None


# Display name of a repeat motif, keyed by motif length in bases.
kmers = {
    1: 'Monomer', 2: 'Dimer', 3: 'Trimer', 4: 'Tetramer', 5: 'Pentamer',
    6: 'Hexamer', 7: 'Heptamer', 8: 'Octamer', 9: 'Nonamer', 10: 'Decamer',
    11: 'Undecamer', 12: 'Dodecamer', 13: 'Tridecamer', 14: 'Tetradecamer', 15: 'Pentadecamer',
    16: 'Hexadecamer', 17: 'Heptadecamer', 18: 'Octadecamer', 19: 'Nonadecamer', 20: 'Icosamer',
    21: 'Uncosamer', 22: 'Docosamer', 23: 'Tricosamer', 24: 'Tetracosamer', 25: 'Pentacosamer',
    26: 'Hexacosamer', 27: 'Heptacosamer', 28: 'Octacosamer', 29: 'Nonacosamer', 30: 'Triacontamer',
    31: 'Untriacontamer', 32: 'Dotriacontamer', 33: 'Tritriacontamer', 34: 'Tetratriacontamer', 35: 'Pentatriacontamer',
    36: 'Hexatriacontamer', 37: 'Heptatriacontamer', 38: 'Octatriacontamer', 39: 'Nonatriacontamer', 40: 'Tetracontamer',
    41: 'Untetracontamer', 42: 'Dotetracontamer', 43: 'Tritetracontamer', 44: 'Tetratetracontamer', 45: 'Pentatetracontamer',
    46: 'Hexatetracontamer', 47: 'Heptatetracontamer', 48: 'Octatetracontamer', 49: 'Nonatetracontamer', 50: 'Pentacontamer',
}


def get_cycles(string):
    """Return the distinct rotations of `string` as a sorted list."""
    return sorted({string[i:] + string[:i] for i in range(len(string))})


def build_cycVariations(string):
    """Return the distinct rotations of `string`, followed by the rotations of
    its reverse complement that are not already present."""
    cycles = get_cycles(string)
    for rev_cycle in get_cycles(rev_comp(string)):
        if rev_cycle not in cycles:
            cycles.append(rev_cycle)
    return cycles


def getGC(basesCounter):
    """Return the GC percentage described by a base -> count mapping.

    'N' counts are excluded from the denominator. Bases missing from the
    mapping count as zero, and 0.0 is returned when no non-N base is present
    (previously these inputs raised KeyError / ZeroDivisionError).
    """
    totalBases = sum(basesCounter.values())
    gc_bases = basesCounter.get('G', 0) + basesCounter.get('C', 0)
    called_bases = totalBases - basesCounter.get('N', 0)
    if called_bases <= 0:
        return 0.0
    return (float(gc_bases)/called_bases)*100


def rev_comp(string):
    """Outputs reverse complement of a nucleotide sequence

    Only upper-case A, C, G and T are complemented; any other character is
    kept unchanged (the order is still reversed).
    """
    if sys.version_info.major == 2:
        import string as st
        complement = string.translate(st.maketrans('ACGT', 'TGCA'))
    else:
        complement = string.translate(str.maketrans('ACGT', 'TGCA'))
    return complement[::-1]


def rawcharCount(filename, char):
    """Count the occurrences of `char` in the bytes of a file.

    The file is read through gzip when `filename` ends with 'gz', in chunks of
    1 MiB, and is closed before returning.
    """
    opener = gzip.open if filename.endswith('gz') else open
    needle = char.encode('ASCII')
    with opener(filename, 'rb') as handle:
        # iter(callable, sentinel) stops at the first empty read (end of file).
        return sum(buf.count(needle) for buf in iter(lambda: handle.read(1024*1024), b''))


def _read_seq_ids(path):
    """Read one sequence id per line: surrounding whitespace and leading '>'
    are removed and only the text before the first space is kept."""
    ids = []
    with open(path) as fh:
        for line in fh:
            ids.append(line.strip().lstrip('>').split(' ')[0])
    return ids


def get_targetids(filter_seq_ids, target_seq_ids):
    """
    The function returns the set of desired sequence ids
    across which repeats will be identified.

    Args:
        filter_seq_ids: path of a file listing ids to exclude, or a false value.
        target_seq_ids: path of a file listing ids to include, or a false
            value; ignored when `filter_seq_ids` is given.

    Returns:
        A `univset` containing everything except the filtered ids, a plain
        `set` of the target ids, or an unrestricted `univset` when neither
        file is given. All three support the `in` operator.
    """
    if filter_seq_ids:
        return univset() - set(_read_seq_ids(filter_seq_ids))
    if target_seq_ids:
        return set(_read_seq_ids(target_seq_ids))
    return univset()


class univset(object):
    """Set containing *everything* except an explicit collection of elements.

    Only the excluded elements are stored (in `_diff`), so the object stands
    for the complement of a finite set. Operations accept plain sets and
    other `univset` instances.
    """
    def __init__(self):
        self._diff = set()  # the elements that are NOT members

    def __sub__(self, other):
        if isinstance(other, univset):
            # (ANY - A) - (ANY - B) leaves only what B excludes and A does
            # not: a finite, ordinary set.
            return other._diff - self._diff
        S = univset()
        S._diff = self._diff | set(other)
        return S

    def __rsub__(self, other):
        # other - self: only the elements self excludes survive.
        return other & self._diff

    def __contains__(self, obj):
        return not obj in self._diff

    def __and__(self, other):
        return other - self._diff

    def __rand__(self, other):
        return other - self._diff

    def __repr__(self):
        if self._diff == set():
            return "ANY"
        else:
            return "ANY - %s"%self._diff

    def __or__(self, other):
        S = univset()
        if isinstance(other, univset):
            # Still excluded only if both operands exclude it.
            S._diff = self._diff & other._diff
        else:
            S._diff = self._diff.difference(other)
        return S

    def __xor__(self, other):
        return (self - other) | (other - self)

    def add(self, elem):
        """Make `elem` a member (i.e. stop excluding it)."""
        self._diff.discard(elem)

    def update(self, elem):
        """In-place union with the set (or univset) `elem`."""
        self._diff = self.__or__(elem)._diff

    def __ror__(self, other):
        return self.__or__(other)

    def union(self, other):
        return self.__or__(other)

    def difference(self, other):
        return self.__sub__(other)

    def intersection(self, other):
        return self.__and__(other)

    def symmetric_difference(self, other):
        return self.__xor__(other)

    def issubset(self, other):
        """True if every member of self is in `other`; a finite set can never
        contain a univset, so this only holds for another univset."""
        if isinstance(other, univset):
            return other._diff <= self._diff
        return False

    def issuperset(self, other):
        """True if every member of `other` is also a member of self."""
        if isinstance(other, univset):
            return self._diff <= other._diff
        return self._diff.isdisjoint(other)

    def __lt__(self, other):
        return self.issubset(other) and not self.__eq__(other)

    def __eq__(self, other):
        # A univset never equals a finite set; two univsets are equal when
        # they exclude the same elements.
        return isinstance(other, univset) and self._diff == other._diff

    def __ne__(self, other):
        return not self.__eq__(other)

    def __le__(self, other):
        return self.issubset(other)

    def __gt__(self, other):
        return self.issuperset(other) and not self.__eq__(other)

    def __ge__(self, other):
        return self.issuperset(other)


class dotDict(dict):
    """
    Dictionary whose items can also be read, written and deleted as attributes.
    Reading an attribute that is not a key returns None.

    Example:
    m = dotDict({'first_name': 'Eduardo'}, last_name='Pool', age=24, sports=['Soccer'])
    """
    def __init__(self, *args, **kwargs):
        super(dotDict, self).__init__(*args, **kwargs)
        # Go through __setitem__ so that the instance __dict__ is filled too.
        for arg in args:
            if isinstance(arg, dict):
                for k, v in arg.items():
                    self[k] = v

        if kwargs:
            for k, v in kwargs.items():
                self[k] = v

    def __getattr__(self, attr):
        # Only called when normal attribute lookup fails.
        return self.get(attr)

    def __setattr__(self, key, value):
        self.__setitem__(key, value)

    def __setitem__(self, key, value):
        super(dotDict, self).__setitem__(key, value)
        self.__dict__.update({key: value})

    def __delattr__(self, item):
        self.__delitem__(item)

    def __delitem__(self, key):
        super(dotDict, self).__delitem__(key)
        del self.__dict__[key]
201 | return self.get(attr) 202 | 203 | def __setattr__(self, key, value): 204 | self.__setitem__(key, value) 205 | 206 | def __setitem__(self, key, value): 207 | super(dotDict, self).__setitem__(key, value) 208 | self.__dict__.update({key: value}) 209 | 210 | def __delattr__(self, item): 211 | self.__delitem__(item) 212 | 213 | def __delitem__(self, key): 214 | super(dotDict, self).__delitem__(key) 215 | del self.__dict__[key] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PERF 2 | [![Build](https://img.shields.io/badge/Build-passing-brightgreen.svg)]() 3 | [![PyPI](https://img.shields.io/badge/PyPI-v0.4.6-blue.svg)]() 4 | [![License](https://img.shields.io/badge/Licence-MIT-blue.svg)]() 5 | ## Introduction 6 | PERF is a Python package developed for fast and accurate identification of microsatellites from DNA sequences. Microsatellites or Simple Sequence Repeats (SSRs) are short tandem repeats of 1-6nt motifs. They are present in all genomes, and have a wide range of uses and functional roles. The existing tools for SSR identification have one or more caveats in terms of speed, comprehensiveness, accuracy, ease-of-use, flexibility and memory usage. PERF was designed to address all these problems. 7 | 8 | PERF is a recursive acronym that stands for "PERF is an Exhaustive Repeat Finder". It is compatible with both Python 2 (tested on Python 2.7) and 3 (tested on Python 3.5). Its key features are: 9 | - Fast run time. As an example, identification of all SSRs from the entire human genome takes less than 7 minutes. 
The speed can be further improved ~3 to 4 fold using [PyPy](https://pypy.org/) (human genome finishes in less than 2 minutes using PyPy v5.8.0) 10 | - Linear time and space complexity (O(n)) 11 | - Identifies perfect SSRs 12 | - 100% accurate and comprehensive - Does not miss any repeats and does not pick any incorrect ones 13 | - Easy to use - The only required argument is the input DNA sequence in FASTA format 14 | - Flexible - Most of the parameters are customizable by the user at runtime 15 | - Repeat cutoffs can be specified either in terms of the total repeat length or in terms of number of repeating units 16 | - TSV output and HTML report. The default output is an easily parseable and exportable tab-separated format. Optionally, PERF also generates an interactive HTML report that depicts trends in repeat data as concise charts and tables 17 | 18 | ## Change log 19 | 20 | ## [0.4.6] - 2021-04-22 21 | ### Fixes 22 | - Fixed usage of unit options file input for fastq input. 23 | - Fixed usage of repeats input file. 24 | 25 | ## [0.4.5] - 2020-05-07 26 | ### Added 27 | - Annotation of repeats with respect to genomic context using a GFF or GTF file (option -g). 28 | - Multi-threading. Parallel identification of repeats in different sequences. 29 | - Identification of perfect repeats in fastq files. 30 | - Analysis report for repeats in fastq files. 31 | - Option to identify atomic repeats. 32 | 33 | ### Changed 34 | - Analysis report rebuilt with Semantic UI and ApexCharts. 35 | - Visualisation of repeat annotation data in analysis report. 36 | 37 | ### Fixes 38 | - Python2 compatibility fixed. 39 | - Bug fixes for PyPI compatibility. 40 | - Import error issues. 41 | 42 | ## Installation 43 | PERF can be directly installed using pip with the package name `perf_ssr`. 44 | ```bash 45 | $ pip install perf_ssr 46 | ``` 47 | 48 | This name was chosen for the package so as not to clash with the existing `perf` package. 
49 | 50 | Alternatively, it can be installed from the source code: 51 | ```bash 52 | # Download the git repo 53 | $ git clone https://github.com/RKMlab/perf.git 54 | 55 | # Install 56 | $ cd perf 57 | $ python setup.py install 58 | ``` 59 | Both of the methods add a console command `PERF`, which can be executed from any directory. It can also be used without installation by running the `core.py` file in the `PERF` subfolder: 60 | 61 | ```bash 62 | $ git clone https://github.com/RKMlab/perf.git 63 | $ cd perf/PERF 64 | $ python core.py -h # Print the help message of PERF (see below) 65 | ``` 66 | 67 | ## Usage 68 | The help message and available options can be accessed using 69 | ```bash 70 | $ PERF -h # Short option 71 | $ PERF --help # Long option 72 | ``` 73 | which gives the following output 74 | ``` 75 | usage: core.py [-h] -i [-o ] [--format ] [--version] 76 | [-rep ] [-m ] [-M ] [-s ] [-S ] 77 | [--include-atomic] [-l | -u INT or FILE] [-a] [--info] 78 | [-g ] [--anno-format ] [--gene-key ] 79 | [--up-promoter ] [--down-promoter ] 80 | [-f | -F ] [-t ] 81 | 82 | Required arguments: 83 | -i , --input 84 | Input sequence file. 85 | 86 | Optional arguments: 87 | -o , --output 88 | Output file name. Default: Input file name + _perf.tsv 89 | --format Input file format. Default: fasta, Permissible: fasta, 90 | fastq 91 | --version show program's version number and exit 92 | -rep , --repeats 93 | File with list of repeats (Not allowed with -m and/or 94 | -M) 95 | -m , --min-motif-size 96 | Minimum size of a repeat motif in bp (Not allowed with 97 | -rep) 98 | -M , --max-motif-size 99 | Maximum size of a repeat motif in bp (Not allowed with 100 | -rep) 101 | -s , --min-seq-length 102 | Minimum size of sequence length for consideration (in 103 | bp) 104 | -S , --max-seq-length 105 | Maximum size of sequence length for consideration (in 106 | bp) 107 | --include-atomic An option to include factor atomic repeats for minimum 108 | motif sizes greater than 1. 
109 | -l , --min-length 110 | Minimum length cutoff of repeat 111 | -u INT or FILE, --min-units INT or FILE 112 | Minimum number of repeating units to be considered. 113 | Can be an integer or a file specifying cutoffs for 114 | different motif sizes. 115 | -a, --analyse Generate a summary HTML report. 116 | --info Sequence file info recorded in the output. 117 | -f , --filter-seq-ids 118 | List of sequence ids in fasta file which will be 119 | ignored. 120 | -F , --target-seq-ids 121 | List of sequence ids in fasta file which will be used. 122 | -t , --threads 123 | Number of threads to run the process on. Default is 1. 124 | 125 | Annotation arguments: 126 | -g , --annotate 127 | Genic annotation input file for annotation, Both GFF 128 | and GTF can be processed. Use --anno-format to specify 129 | format. 130 | --anno-format Format of genic annotation file. Valid inputs: GFF, 131 | GTF. Default: GFF 132 | --gene-key Attribute key for geneId. The default identifier is 133 | "gene". Please check the annotation file and pick a 134 | robust gene identifier from the attribute column. 135 | --up-promoter Upstream distance(bp) from TSS to be considered as 136 | promoter region. Default 1000 137 | --down-promoter 138 | Downstream distance(bp) from TSS to be considered as 139 | promoter region. Default 1000 140 | ``` 141 | The details of each option are given below: 142 | 143 | ### `-i or --input` 144 | **Expects:** *FILE*
      145 | **Default:** *None*
      146 | This is the only required argument for the program. The input file must be a valid FASTA/FASTQ file. PERF uses [Biopython's](http://biopython.org/wiki/SeqIO) FASTA parser to read the input fasta files. It accepts both single-line and multi-line sequences. Files with multiple sequences are also valid. To see more details about the FASTA format, see [this page](http://bioperl.org/formats/sequence_formats/FASTA_sequence_format). 147 | 148 | ### `-o or --output` 149 | **Expects:** *STRING (to be used as filename)*
      150 | **Default:** *Input Filename + _perf.tsv (see below)*
      151 | If this option is not provided, the default output filename will be the same as the input filename, with its extension replaced with '_perf.tsv'. For example, if the input filename is `my_seq.fa`, the default output filename will be `my_seq_perf.tsv`. If the input filename does not have any extension, `_perf.tsv` will be appended to the filename. Please note that even in the case of no identified SSRs, the output file is still created (therefore overwriting any previous file of the same name) but with no content in the file. 152 | #### Output for fasta 153 | The output is a tab-delimited file, with one SSR record per line. 154 | The output columns follow the [BED](https://genome.ucsc.edu/FAQ/FAQformat.html) format. The details of the columns are given below: 155 | 156 | | S.No | Column | Description | 157 | |:----:| ------ | ----------- | 158 | | 1 | Chromosome | Chromosome or Sequence Name as specified by the first word in the FASTA header | 159 | | 2 | Repeat Start | 0-based start position of SSR in the Chromosome | 160 | | 3 | Repeat Stop | End position of SSR in the Chromosome | 161 | | 4 | Repeat Class | Class of repeat as grouped by their cyclical variations | 162 | | 5 | Repeat Length | Total length of identified repeat in nt | 163 | | 6 | Repeat Strand | Strand of SSR based on their cyclical variation | 164 | | 7 | Motif Number | Number of times the base motif is repeated | 165 | | 8 | Actual Repeat | Starting sequence of the SSR irrespective of Repeat class and strand| 166 | 167 | An example output showing some of the largest repeats from *Drosophila melanogaster* is given below 168 | ``` 169 | X 22012826 22014795 ACTGGG 1969 - 328 TCCCAG 170 | 2RHet 591337 591966 AATACT 629 - 104 ATTAGT 171 | 4 1042143 1042690 AAATAT 547 + 91 AAATAT 172 | 2RHet 598244 598789 AATACT 545 - 90 AGTATT 173 | XHet 122 663 AGAT 541 + 135 GATA 174 | X 22422335 22422827 AGAT 492 + 123 GATA 175 | 3R 975265 975710 AAAT 445 - 111 TTAT 176 | X 15442288 15442724 ACAGAT 436 
+ 72 ACAGAT 177 | 2L 22086818 22087152 AATACT 334 - 55 TATTAG 178 | YHet 137144 137466 AAGAC 322 - 64 CTTGT 179 | ``` 180 | 181 | #### Output for fastq 182 | The output is a tab-delimited file, with data on each repeat class per line. 183 | | S.No | Column | Description | 184 | |:----:| ------ | ----------- | 185 | | 1 | Repeat Class | Class of repeat as grouped by their cyclical variations | 186 | | 2 | Number of reads | Number of reads having an instance of the repeat | 187 | | 3 | Frequency | Total number of instances of the repeat | 188 | | 4 | Bases | Total number of bases covered by the repeat | 189 | | 5 | Repeat reads per million reads | Number of reads having an instance of the repeat, per million reads | 190 | | 6 | Instances per million reads | Number of instances of the repeat, per million reads | 191 | | 7 | Repeat Bases per MB of sequence | Number of bases covered by the repeat, per Mb of sequence | 192 | | 8 | Length distribution | Distribution of repeat lengths observed for the repeat class | 193 | | 9 | Motif distribution | Distribution of the actual repeat motifs observed for the repeat class | 194 | 195 | 196 | ### `--format` 197 | **Expects:** *STRING (specifying format of the file)*
      198 | **Default:** *fasta*
      199 | PERF was originally developed to identify repeats in FASTA files. Since version 0.4.5, PERF can identify repeats in FASTQ sequence files as well. The default format the program expects is fasta. Specify input format as 'fasta' for FASTA files and 'fastq' for FASTQ files. 200 | 201 | ### `-a or --analyse` 202 | **Expects:** *None*
      203 | **Default:** *False*
      204 | In addition to the default tab-separated output, PERF can also generate a fully interactive HTML report for easy downstream analysis of the repeat data. The filename will be the same prefix as that of the main output. For example, if the input filename was `my_seq.fa`, the analysis report will be `my_seq_perf.html`. An example HTML report, generated from the repeat data of *Homo sapiens* (build hg19), can be accessed [here](https://raw.githubusercontent.com/RKMlab/perf/html-report/test_data/Human_hg19_perf.html) (Right click -> Save As). 205 | 206 | ### `-l or --min-length` 207 | **Expects:** *INTEGER*
      208 | **Default:** *12*
      209 | Minimum length cut-off to be considered when finding an SSR. The same cut-off will apply for SSRs of all motif lengths, even if the motif length is not a divisor of this value. In such cases, SSRs that end with a partial motif are also picked if they pass the length cut-off. 210 | 211 | ### `-u or --min-units` 212 | **Expects:** *INTEGER* OR *FILE*
      213 | **Default:** *None*
      214 | This option finds SSRs with a minimum number of repeating motifs. The argument accepts either an integer or a file. If an integer is specified, the same value is used for all motif lengths. Alternatively, a specific value can be set for each motif length using a two-column tab-separated file, as demonstrated below: 215 | 216 | ```bash 217 | $ cat repeat_units.txt 218 | 1 10 219 | 2 6 220 | 3 4 221 | 4 3 222 | 5 2 223 | 6 2 224 | ``` 225 | 226 | The first column specifies the motif length, and the second column specifies the minimum number of times the motif should be repeated to be considered an SSR. This file can be used to identify repeats with different numbers of repeating motifs: monomers repeated at least 10 times, dimers repeated at least 6 times etc., using the following command 227 | ``` bash 228 | $ PERF -i my_seq.fa -m 1 -M 6 -u repeat_units.txt 229 | ``` 230 | 231 | ### `-rep or --repeats` 232 | **Expects:** *FILE*
      233 | **Default:** *None*
      234 | PERF provides an option to limit the search to specific repeat motifs. The repeats of interest should be specified via a file containing 4 tab-separated columns, as shown below: 235 | 236 | ```bash 237 | $ cat my_repeats.txt 238 | A A 1 + 239 | T A 1 - 240 | AG AG 2 + 241 | CT AG 2 - 242 | GA AG 2 + 243 | TC AG 2 - 244 | $ PERF -i my_seq.fa -rep my_repeats.txt # Find all A and AG repeats from my_seq.fa 245 | ``` 246 | 247 | **Note:** This option is not allowed when `-m` or `-M` options are used. 248 | ### `-m or --min-motif-size` 249 | **Expects:** *INTEGER*
      250 | **Default:** *1*
      251 | Minimum length of motifs to be considered. By default, PERF ignores redundant motifs. For example, a stretch of 12 A's is considered a monomer repeat of 12 A's rather than a dimer repeat of 6 AA's. However, this is only true if `-m` is set to 1. If for example, `-m` is set to 2, then stretches of 12 A's are reported as dimer AA repeats. If this behavior isn't desired, we suggest using the `-rep` option (see above) to specify the motifs that should/shouldn't be included. 252 | 253 | ### `-M or --max-motif-size` 254 | **Expects:** *INTEGER*
      255 | **Default:** *6*
      256 | Maximum length of motifs to be considered. Setting a large value of `-M` has a non-trivial effect on both the runtime and memory usage of PERF. This is noticeable with `-M` values above 10. 257 | 258 | ### `--include-atomic` 259 | **Expects:** *None*
      260 | **Default:** *False*
      261 | Searches for atomic repeats when set to *True*. For example, when minimum motif size is set to 2bp, PERF ignores monomer repeats. When include atomic repeats is set to *True*, PERF identifies AA, CC, GG and TT as dimer repeats. 262 | 263 | ### `-s or --min-seq-length` 264 | **Expects:** *INTEGER*
      265 | **Default:** *0*
      266 | Minimum length of the input sequence to be searched for SSRs in bp. All sequences in the input file that are smaller than this length will be ignored. 267 | 268 | ### `-S or --max-seq-length` 269 | **Expects:** *INTEGER*
      270 | **Default:** *Infinity*
      271 | Maximum length of the input sequence to be searched for SSRs in bp. All sequences in the input file that are larger than this length will be ignored. 272 | 273 | ### `-f or --filter-seq-ids` 274 | **Expects:** *FILE*
      275 | **Default:** *None*
      276 | This option accepts a file with a list of sequence IDs in the input file that should be ignored. Useful for ignoring contigs, scaffold, or other poor quality sequences. The IDs can be FASTA headers (starting with '>' symbol) or just the names without the '>' symbol. 277 | 278 | ### `-F or --target-seq-ids` 279 | **Expects:** *FILE*
      280 | **Default:** *None*
      281 | This option accepts a file with a list of sequence IDs in the input file that should be analyzed. All other sequences will be ignored. Useful for analyzing specific chromosomes from a large input file. The IDs can be FASTA headers (starting with '>' symbol) or just the names without the '>' symbol. 282 | 283 | ### `--info` 284 | **Expects:** *None*
      285 | **Default:** *False*
      286 | This option when set to *True*, includes information about the input sequence files and repeat summary data in the output file. 287 | 288 | ```bash 289 | $ tail -5 test_input_perf.tsv 290 | gi|514486271|gb|KE346361.1| 2667759 2667775 ATC 16 + 5 CAT 291 | #File_name: test_input.fa 292 | #Total_sequences: 2 293 | #Total_bases: 6462134 294 | #GC: 53.970000 295 | ``` 296 | 297 | ### `-g or --annotate` 298 | **Expects:** *FILE*
      299 | **Default:** *None*
      300 | Input a genomic feature file to annotate the repeats in the genomic context. PERF accepts both GFF and GTF format genomic feature files. Each repeat is annotated w.r.t. the closest gene and classified as Genic, Exonic, Intronic or Intergenic according to the position of the repeat. Besides this, PERF also checks whether the repeat falls in the promoter region of the gene. Annotation adds 7 columns to the default PERF output, which already consists of 8 columns. 301 | 302 | | S.No | Column | Description | 303 | |:----:| ------ | ----------- | 304 | | 9 | Gene name | Name of the closest gene | 305 | | 10 | Gene Start | Start position of gene in the Chromosome | 306 | | 11 | Gene Stop | End position of gene in the Chromosome | 307 | | 12 | Strand | The strand orientation of the gene | 308 | | 13 | Genomic annotation | Annotation of the repeat w.r.t. the gene. Possible annotations are {Genic, Exonic, Intronic, Intergenic} | 309 | | 14 | Promoter annotation | Whether the repeat falls in the promoter region of the closest gene. The default promoter region is 1Kb upstream and downstream of TSS. | 310 | | 15 | Distance from TSS | Distance of the repeat from the TSS of the gene. | 311 | 312 | ### `--anno-format` 313 | **Expects:** *STRING*
      314 | **Default:** *GFF*
      315 | Option to specify the format of the input genomic feature file. Accepted inputs are GFF or GTF. More details about the GFF and GTF formats can be found [here](https://asia.ensembl.org/info/website/upload/gff.html). 316 | 317 | ### `--gene-key` 318 | **Expects:** *STRING*
      319 | **Default:** *gene*
      320 | The attribute key used for the name of the gene in the GFF/GTF file. In the below example GFF file, we have the location of a gene and its mRNA and exon locations. The last column of the file specifies attributes associated with each feature, like ID, Parent, gene etc. PERF uses one of the attributes to identify the gene and also its exons. In the below example the key "gene" can be used to identify the gene and the exons of the gene as they have the same gene name. Please check your GFF/GTF file for a robust attribute key which can identify all genes and their corresponding exons. We are actively working on better annotation where we can identify genes and their exons based on the ID and Parent. 321 | 322 | ``` 323 | # Sample GFF 324 | NC_004354.4 RefSeq gene 124370 126714 . - . ID=gene1;Name=CG17636;gbkey=Gene;gene=CG17636;gene_biotype=protein_coding;gene_synonym=DmelCG17636,EG:23E12.1; 325 | NC_004354.4 RefSeq mRNA 124370 126714 . - . ID=rna1;Parent=gene1;Name=NM_001103384.3;gbkey=mRNA;gene=CG17636;transcript_id=NM_001103384.3 326 | NC_004354.4 RefSeq exon 126626 126714 . - . ID=id13;Parent=rna1;gbkey=mRNA;gene=CG17636;transcript_id=NM_001103384.3 327 | NC_004354.4 RefSeq exon 125495 126259 . - . ID=id14;Parent=rna1;gbkey=mRNA;gene=CG17636;transcript_id=NM_001103384.3 328 | ``` 329 | 330 | ### `--up-promoter` 331 | **Expects:** *INT*
      332 | **Default:** *1000*
      333 | Upstream distance(bp) from the TSS of the gene to be considered as promoter region. Default 1000. 334 | 335 | ### `--down-promoter` 336 | **Expects:** *INT*
      337 | **Default:** *1000*
      338 | Downstream distance(bp) from the TSS of the gene to be considered as promoter region. Default 1000. 339 | 340 | ### `--version` 341 | Prints the version info of PERF. 342 | 343 | ## Examples 344 | 345 | The following examples assume that the file with input sequence in FASTA format is named `my_seq.fa`. 346 | 347 | #### Basic Usage 348 | ``` bash 349 | # Find all monomer to hexamer repeats of >=12nt length 350 | $ PERF -i my_seq.fa 351 | # Specify output filename 352 | $ PERF -i my_seq.fa -o PERF_output.tsv 353 | # Specify fastq format 354 | $ PERF -i my_seq.fastq --format fastq 355 | ``` 356 | 357 | #### Generate Analysis Report 358 | ``` bash 359 | # Find all monomer to hexamer repeats of >=12nt length and generate an HTML report 360 | $ PERF -i my_seq.fa -a 361 | # Specify output filename 362 | $ PERF -i my_seq.fa -o PERF_out.tsv -a # HTML file is called PERF_out.html 363 | ``` 364 | 365 | #### Annotate Repeats 366 | ``` bash 367 | # Find all monomer to hexamer repeats of >=12nt length and annotate them 368 | $ PERF -i my_seq.fa -g my_seq.gff 369 | # Specify feature file format and set downstream promoter region to 500bp 370 | $ PERF -i my_seq.fa -g my_seq.gtf --anno-format gtf --down-promoter 500 # HTML file is called PERF_out.html 371 | ``` 372 | 373 | #### Set Cut-off Criteria 374 | ```bash 375 | # Find all monomer to hexamer repeats of >=15nt length 376 | $ PERF -i my_seq.fa -l 15 377 | # Find SSRs with at least 6 repeating motifs (for all motif lengths) 378 | $ PERF -i my_seq.fa -u 6 379 | ``` 380 | 381 | #### Identify Specific Repeats 382 | ``` bash 383 | $ cat my_repeats.txt 384 | AG AG 2 + 385 | CT AG 2 - 386 | GA AG 2 + 387 | TC AG 2 - 388 | # Find all AG repeats and generate an HTML report 389 | $ PERF -i my_seq.fa -rep my_repeats.txt -a 390 | ``` 391 | 392 | #### Change Motif Length Cut-offs 393 | ```bash 394 | # Ignore monomer and dimer repeats, and repeats with <4 repeating units 395 | $ PERF -i my_seq.fa -m 3 -u 4 396 | # Report only 
tetramer repeats of >=16nt length, and generate HTML report 397 | $ PERF -i my_seq.fa -m 4 -M 4 -l 16 -a 398 | 399 | ``` 400 | 401 | In all the above examples, the output of PERF is saved to `my_seq_perf.tsv` and the HTML report is saved to `my_seq_perf.html` unless `-o` is specified. 402 | 403 | 404 | ## Citation 405 | 406 | If you find PERF useful for your research, please cite it as follows: 407 | 408 | PERF: an exhaustive algorithm for ultra-fast and efficient identification of microsatellites from large DNA sequences
      409 | *Akshay Kumar Avvaru, Divya Tej Sowpati, Rakesh Kumar Mishra*
      410 | Bioinformatics, , btx721
      411 | [doi](https://doi.org/10.1093/bioinformatics/btx721): 10.1093/bioinformatics/btx721 412 | 413 | ## Contact 414 | For queries or suggestions, please contact: 415 | 416 | Divya Tej Sowpati -
      417 | Akshay Kumar Avvaru - 418 | 419 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | PERF 2 | ==== 3 | 4 | Introduction 5 | ------------ 6 | 7 | PERF is a Python package developed for fast and accurate identification of microsatellites from DNA sequences. Microsatellites or Simple Sequence Repeats (SSRs) are short tandem repeats of 1-6nt motifs. They are present in all genomes, and have a wide range of uses and functional roles. The existing tools for SSR identification have one or more caveats in terms of speed, comprehensiveness, 8 | accuracy, ease-of-use, flexibility and memory usage. PERF was designed to address all these problems. 9 | 10 | PERF is a recursive acronym that stands for “PERF is an Exhaustive Repeat Finder”. It is compatible with both Python 2 (tested on Python 2.7) and 3 (tested on Python 3.5). Its key features are: 11 | 12 | - Fast run time, despite being a single-threaded application. As an example, identification of all SSRs from the entire human genome takes less than 7 minutes. The speed can be further improved ~3 to 4 fold using `PyPy <https://pypy.org/>`_ (human genome finishes in less than 2 minutes using PyPy v5.8.0) 13 | - Linear time and space complexity (O(n)) 14 | - Identifies perfect SSRs 15 | - 100% accurate and comprehensive 16 | - Does not miss any repeats or does not pick any incorrect ones 17 | - Easy to use 18 | - The only required argument is the input DNA sequence in FASTA format 19 | - Flexible 20 | - Most of the parameters are customizable by the user at runtime 21 | - Repeat cutoffs can be specified either in terms of the total repeat length or in terms of number of repeating units 22 | - TSV output and HTML report 23 | 24 | The default output is an easily parseable and exportable tab-separated format. 
Optionally, PERF also generates an interactive HTML report that depicts trends in repeat data as concise charts and tables. 25 | 26 | Installation 27 | ------------ 28 | 29 | PERF can be directly installed using pip with the package name 30 | ``perf_ssr``. 31 | 32 | .. code:: bash 33 | 34 | $ pip install perf_ssr 35 | 36 | This name was chosen for the package so as not to clash with the existing ``perf`` package. 37 | 38 | Alternatively, it can also be installed from the source code: 39 | 40 | .. code:: bash 41 | 42 | # Download the git repo 43 | $ git clone https://github.com/RKMlab/perf.git 44 | 45 | # Install 46 | $ cd perf 47 | $ python setup.py install 48 | 49 | Both of the methods add a console command ``PERF``, which can be executed from any directory. It can also be used without installation by running the ``core.py`` file in the ``PERF`` subfolder: 50 | 51 | .. code:: bash 52 | 53 | $ git clone https://github.com/RKMlab/perf.git 54 | $ cd perf/PERF 55 | $ python core.py -h # Print the help message of PERF (see below) 56 | 57 | Usage 58 | ----- 59 | 60 | The help message and available options can be accessed using 61 | 62 | .. code:: bash 63 | 64 | $ PERF -h # Short option 65 | $ PERF --help # Long option 66 | 67 | which gives the following output 68 | 69 | :: 70 | 71 | usage: PERF [-h] -i [-o ] [-a] [-l | -u INT or FILE] 72 | [-rep ] [-m ] [-M ] [--version] 73 | 74 | Required arguments: 75 | -i , --input 76 | Input file in FASTA format 77 | 78 | Optional arguments: 79 | -o , --output 80 | Output file name. Default: Input file name + _perf.tsv 81 | -a, --analyse Generate a summary HTML report. 82 | -l , --min-length 83 | Minimum length cutoff of repeat 84 | -u INT or FILE, --min-units INT or FILE 85 | Minimum number of repeating units to be considered. 86 | Can be an integer or a file specifying cutoffs for 87 | different motif sizes. 
88 | -rep , --repeats 89 | File with list of repeats (Not allowed with -m and/or 90 | -M) 91 | -m , --min-motif-size 92 | Minimum size of a repeat motif in bp (Not allowed with 93 | -rep) 94 | -M , --max-motif-size 95 | Maximum size of a repeat motif in bp (Not allowed with 96 | -rep) 97 | --version show program's version number and exit -------------------------------------------------------------------------------- /pylint.rc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Add files or directories to the blacklist. They should be base names, not 11 | # paths. 12 | ignore=CVS 13 | 14 | # Add files or directories matching the regex patterns to the blacklist. The 15 | # regex matches against base names, not paths. 16 | ignore-patterns= 17 | 18 | # Pickle collected data for later comparisons. 19 | persistent=yes 20 | 21 | # List of plugins (as comma separated values of python modules names) to load, 22 | # usually to register additional checkers. 23 | load-plugins= 24 | 25 | # Use multiple processes to speed up Pylint. 26 | jobs=1 27 | 28 | # Allow loading of arbitrary C extensions. Extensions are imported into the 29 | # active Python interpreter and may run arbitrary code. 30 | unsafe-load-any-extension=no 31 | 32 | # A comma-separated list of package or module names from where C extensions may 33 | # be loaded. Extensions are loading into the active Python interpreter and may 34 | # run arbitrary code 35 | extension-pkg-whitelist= 36 | 37 | # Allow optimization of some AST trees. This will activate a peephole AST 38 | # optimizer, which will apply various small optimizations. For instance, it can 39 | # be used to obtain the result of joining multiple strings with the addition 40 | # operator. 
Joining a lot of strings can lead to a maximum recursion error in 41 | # Pylint and this flag can prevent that. It has one side effect, the resulting 42 | # AST will be different than the one from reality. This option is deprecated 43 | # and it will be removed in Pylint 2.0. 44 | optimize-ast=no 45 | 46 | 47 | [MESSAGES CONTROL] 48 | 49 | # Only show warnings with the listed confidence levels. Leave empty to show 50 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED 51 | confidence= 52 | 53 | # Enable the message, report, category or checker with the given id(s). You can 54 | # either give multiple identifier separated by comma (,) or put this option 55 | # multiple time (only on the command line, not in the configuration file where 56 | # it should appear only once). See also the "--disable" option for examples. 57 | #enable= 58 | 59 | # Disable the message, report, category or checker with the given id(s). You 60 | # can either give multiple identifiers separated by comma (,) or put this 61 | # option multiple times (only on the command line, not in the configuration 62 | # file where it should appear only once).You can also use "--disable=all" to 63 | # disable everything first and then reenable specific checks. For example, if 64 | # you want to run only the similarities checker, you can use "--disable=all 65 | # --enable=similarities". 
If you want to run only the classes checker, but have 66 | # no Warning level messages displayed, use"--disable=all --enable=classes 67 | # --disable=W" 68 | disable=zip-builtin-not-iterating,oct-method,indexing-exception,file-builtin,parameter-unpacking,coerce-builtin,map-builtin-not-iterating,range-builtin-not-iterating,useless-suppression,raising-string,old-ne-operator,reload-builtin,long-suffix,filter-builtin-not-iterating,using-cmp-argument,old-octal-literal,print-statement,xrange-builtin,buffer-builtin,raw_input-builtin,coerce-method,reduce-builtin,cmp-builtin,getslice-method,unpacking-in-except,import-star-module-level,delslice-method,suppressed-message,cmp-method,unichr-builtin,basestring-builtin,old-division,unicode-builtin,nonzero-method,metaclass-assignment,old-raise-syntax,input-builtin,long-builtin,dict-view-method,apply-builtin,round-builtin,setslice-method,next-method-called,intern-builtin,hex-method,dict-iter-method,execfile-builtin,backtick,no-absolute-import,standarderror-builtin,locally-disabled 69 | 70 | 71 | [REPORTS] 72 | 73 | # Set the output format. Available formats are text, parseable, colorized, msvs 74 | # (visual studio) and html. You can also give a reporter class, eg 75 | # mypackage.mymodule.MyReporterClass. 76 | output-format=text 77 | 78 | # Put messages in a separate file for each module / package specified on the 79 | # command line instead of printing them on stdout. Reports (if any) will be 80 | # written in a file name "pylint_global.[txt|html]". This option is deprecated 81 | # and it will be removed in Pylint 2.0. 82 | files-output=no 83 | 84 | # Tells whether to display a full report or only the messages 85 | reports=yes 86 | 87 | # Python expression which should return a note less than 10 (10 is the highest 88 | # note). You have access to the variables errors warning, statement which 89 | # respectively contain the number of errors / warnings messages and the total 90 | # number of statements analyzed. 
This is used by the global evaluation report 91 | # (RP0004). 92 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 93 | 94 | # Template used to display messages. This is a python new-style format string 95 | # used to format the message information. See doc for all details 96 | #msg-template= 97 | 98 | 99 | [SPELLING] 100 | 101 | # Spelling dictionary name. Available dictionaries: none. To make it working 102 | # install python-enchant package. 103 | spelling-dict= 104 | 105 | # List of comma separated words that should not be checked. 106 | spelling-ignore-words= 107 | 108 | # A path to a file that contains private dictionary; one word per line. 109 | spelling-private-dict-file= 110 | 111 | # Tells whether to store unknown words to indicated private dictionary in 112 | # --spelling-private-dict-file option instead of raising a message. 113 | spelling-store-unknown-words=no 114 | 115 | 116 | [TYPECHECK] 117 | 118 | # Tells whether missing members accessed in mixin class should be ignored. A 119 | # mixin class is detected if its name ends with "mixin" (case insensitive). 120 | ignore-mixin-members=yes 121 | 122 | # List of module names for which member attributes should not be checked 123 | # (useful for modules/projects where namespaces are manipulated during runtime 124 | # and thus existing member attributes cannot be deduced by static analysis. It 125 | # supports qualified module names, as well as Unix pattern matching. 126 | ignored-modules= 127 | 128 | # List of class names for which member attributes should not be checked (useful 129 | # for classes with dynamically set attributes). This supports the use of 130 | # qualified names. 131 | ignored-classes=optparse.Values,thread._local,_thread._local 132 | 133 | # List of members which are set dynamically and missed by pylint inference 134 | # system, and so shouldn't trigger E1101 when accessed. Python regular 135 | # expressions are accepted. 
136 | generated-members= 137 | 138 | # List of decorators that produce context managers, such as 139 | # contextlib.contextmanager. Add to this list to register other decorators that 140 | # produce valid context managers. 141 | contextmanager-decorators=contextlib.contextmanager 142 | 143 | 144 | [SIMILARITIES] 145 | 146 | # Minimum lines number of a similarity. 147 | min-similarity-lines=4 148 | 149 | # Ignore comments when computing similarities. 150 | ignore-comments=yes 151 | 152 | # Ignore docstrings when computing similarities. 153 | ignore-docstrings=yes 154 | 155 | # Ignore imports when computing similarities. 156 | ignore-imports=no 157 | 158 | 159 | [BASIC] 160 | 161 | # Good variable names which should always be accepted, separated by a comma 162 | good-names=i,j,k,ex,Run,_ 163 | 164 | # Bad variable names which should always be refused, separated by a comma 165 | bad-names=foo,bar,baz,toto,tutu,tata 166 | 167 | # Colon-delimited sets of names that determine each other's naming style when 168 | # the name regexes allow several styles. 169 | name-group= 170 | 171 | # Include a hint for the correct naming format with invalid-name 172 | include-naming-hint=no 173 | 174 | # List of decorators that produce properties, such as abc.abstractproperty. Add 175 | # to this list to register other decorators that produce valid properties. 
176 | property-classes=abc.abstractproperty 177 | 178 | # Regular expression matching correct class attribute names 179 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 180 | 181 | # Naming hint for class attribute names 182 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 183 | 184 | # Regular expression matching correct method names 185 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 186 | 187 | # Naming hint for method names 188 | method-name-hint=[a-z_][a-z0-9_]{2,30}$ 189 | 190 | # Regular expression matching correct constant names 191 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 192 | 193 | # Naming hint for constant names 194 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 195 | 196 | # Regular expression matching correct argument names 197 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 198 | 199 | # Naming hint for argument names 200 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$ 201 | 202 | # Regular expression matching correct variable names 203 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 204 | 205 | # Naming hint for variable names 206 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$ 207 | 208 | # Regular expression matching correct module names 209 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 210 | 211 | # Naming hint for module names 212 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 213 | 214 | # Regular expression matching correct class names 215 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 216 | 217 | # Naming hint for class names 218 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 219 | 220 | # Regular expression matching correct function names 221 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 222 | 223 | # Naming hint for function names 224 | function-name-hint=[a-z_][a-z0-9_]{2,30}$ 225 | 226 | # Regular expression matching correct attribute names 227 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 228 | 229 | # Naming hint for attribute names 230 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$ 231 | 232 | # Regular expression matching correct inline iteration names 233 | 
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 234 | 235 | # Naming hint for inline iteration names 236 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 237 | 238 | # Regular expression which should only match function or class names that do 239 | # not require a docstring. 240 | no-docstring-rgx=^_ 241 | 242 | # Minimum line length for functions/classes that require docstrings, shorter 243 | # ones are exempt. 244 | docstring-min-length=-1 245 | 246 | 247 | [ELIF] 248 | 249 | # Maximum number of nested blocks for function / method body 250 | max-nested-blocks=5 251 | 252 | 253 | [VARIABLES] 254 | 255 | # Tells whether we should check for unused import in __init__ files. 256 | init-import=no 257 | 258 | # A regular expression matching the name of dummy variables (i.e. expectedly 259 | # not used). 260 | dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy 261 | 262 | # List of additional names supposed to be defined in builtins. Remember that 263 | # you should avoid to define new builtins when possible. 264 | additional-builtins= 265 | 266 | # List of strings which can identify a callback function by name. A callback 267 | # name must start or end with one of those strings. 268 | callbacks=cb_,_cb 269 | 270 | # List of qualified module names which can have objects that can redefine 271 | # builtins. 272 | redefining-builtins-modules=six.moves,future.builtins 273 | 274 | 275 | [LOGGING] 276 | 277 | # Logging modules to check that the string format arguments are in logging 278 | # function parameter format 279 | logging-modules=logging 280 | 281 | 282 | [FORMAT] 283 | 284 | # Maximum number of characters on a single line. 285 | max-line-length=100 286 | 287 | # Regexp for a line that is allowed to be longer than the limit. 288 | ignore-long-lines=^\s*(# )??$ 289 | 290 | # Allow the body of an if to be on the same line as the test if there is no 291 | # else. 292 | single-line-if-stmt=no 293 | 294 | # List of optional constructs for which whitespace checking is disabled. 
`dict- 295 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 296 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 297 | # `empty-line` allows space-only lines. 298 | no-space-check=trailing-comma,dict-separator 299 | 300 | # Maximum number of lines in a module 301 | max-module-lines=1000 302 | 303 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 304 | # tab). 305 | indent-string=' ' 306 | 307 | # Number of spaces of indent required inside a hanging or continued line. 308 | indent-after-paren=4 309 | 310 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 311 | expected-line-ending-format= 312 | 313 | 314 | [MISCELLANEOUS] 315 | 316 | # List of note tags to take in consideration, separated by a comma. 317 | notes=FIXME,XXX,TODO 318 | 319 | 320 | [CLASSES] 321 | 322 | # List of method names used to declare (i.e. assign) instance attributes. 323 | defining-attr-methods=__init__,__new__,setUp 324 | 325 | # List of valid names for the first argument in a class method. 326 | valid-classmethod-first-arg=cls 327 | 328 | # List of valid names for the first argument in a metaclass class method. 329 | valid-metaclass-classmethod-first-arg=mcs 330 | 331 | # List of member names, which should be excluded from the protected access 332 | # warning. 333 | exclude-protected=_asdict,_fields,_replace,_source,_make 334 | 335 | 336 | [DESIGN] 337 | 338 | # Maximum number of arguments for function / method 339 | max-args=5 340 | 341 | # Argument names that match this expression will be ignored. 
Default to name 342 | # with leading underscore 343 | ignored-argument-names=_.* 344 | 345 | # Maximum number of locals for function / method body 346 | max-locals=15 347 | 348 | # Maximum number of return / yield for function / method body 349 | max-returns=6 350 | 351 | # Maximum number of branches for function / method body 352 | max-branches=12 353 | 354 | # Maximum number of statements in function / method body 355 | max-statements=50 356 | 357 | # Maximum number of parents for a class (see R0901). 358 | max-parents=7 359 | 360 | # Maximum number of attributes for a class (see R0902). 361 | max-attributes=7 362 | 363 | # Minimum number of public methods for a class (see R0903). 364 | min-public-methods=2 365 | 366 | # Maximum number of public methods for a class (see R0904). 367 | max-public-methods=20 368 | 369 | # Maximum number of boolean expressions in an if statement 370 | max-bool-expr=5 371 | 372 | 373 | [IMPORTS] 374 | 375 | # Deprecated modules which should not be used, separated by a comma 376 | deprecated-modules=optparse 377 | 378 | # Create a graph of all (i.e. internal and external) dependencies in the 379 | # given file (report RP0402 must not be disabled) 380 | import-graph= 381 | 382 | # Create a graph of external dependencies in the given file (report RP0402 must 383 | # not be disabled) 384 | ext-import-graph= 385 | 386 | # Create a graph of internal dependencies in the given file (report RP0402 must 387 | # not be disabled) 388 | int-import-graph= 389 | 390 | # Force import order to recognize a module as part of the standard 391 | # compatibility libraries. 392 | known-standard-library= 393 | 394 | # Force import order to recognize a module as part of a third party library. 395 | known-third-party=enchant 396 | 397 | # Analyse import fallback blocks.
This can be used to support both Python 2 and 398 | # 3 compatible code, which means that the block might have code that exists 399 | # only in one or another interpreter, leading to false positives when analysed. 400 | analyse-fallback-blocks=no 401 | 402 | 403 | [EXCEPTIONS] 404 | 405 | # Exceptions that will emit a warning when being caught. Defaults to 406 | # "Exception" 407 | overgeneral-exceptions=Exception 408 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | biopython>=1.67 2 | regex>=2016.8.27 3 | tqdm>=4.8.4 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.rst -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name='perf_ssr', 7 | version='0.4.6', 8 | description='PERF is an exhaustive repeat finder', 9 | url='https://github.com/rkmlab/perf', 10 | keywords='ssr microsatellites', 11 | author='Divya Tej Sowpati', 12 | author_email='tej@ccmb.res.in', 13 | license='MIT', 14 | packages=find_packages(), 15 | install_requires=['biopython==1.69', 'tqdm>=4'], # biopython version 1.69 installs numpy 16 | entry_points={ 17 | 'console_scripts': ['PERF=PERF.core:main'] 18 | }, 19 | include_package_data=True, # change path according to package name in MANIFEST.in 20 | ) -------------------------------------------------------------------------------- /test_data/repeat_options.txt: -------------------------------------------------------------------------------- 1 | A A 1 + 2 | T A 1 - 3 | AG AG 2 + 4 | CT AG 2 - 5 | GA AG 2 + 6 | TC AG 2 - 7 | 
-------------------------------------------------------------------------------- /test_data/test_input.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RKMlab/perf/e343a1fa437033afce5b5a794079230530619983/test_data/test_input.fastq.gz -------------------------------------------------------------------------------- /test_data/unit_options.txt: -------------------------------------------------------------------------------- 1 | 1 12 2 | 2 6 3 | 3 4 4 | 4 3 5 | 5 2 6 | 6 2 7 | -------------------------------------------------------------------------------- /utils/repeat_generator.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | # pylint: disable=C0103 4 | from __future__ import print_function 5 | import sys 6 | import argparse 7 | from itertools import product 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('-m', '--min-motif-size', type=int, metavar='', default=1, help='Minimum size of a repeat motif in bp') 11 | parser.add_argument('-M', '--max-motif-size', type=int, metavar='', default=6, help='Maximum size of a repeat motif in bp') 12 | parser.add_argument('-fo', '--out', type=argparse.FileType('w'), metavar='', default=sys.stdout, help='Output file') 13 | args = parser.parse_args() 14 | 15 | def rev_comp(string): 16 | complement = string.translate(str.maketrans('ACGT', 'TGCA')) 17 | return complement[::-1] 18 | 19 | 20 | def expand_repeat(string, size): 21 | return_string = '' 22 | i = 0 23 | while len(return_string) < size: 24 | return_string += string[i] 25 | i += 1 26 | if i >= len(string): 27 | i = 0 28 | return return_string 29 | 30 | 31 | def get_cycles(string): 32 | cycles = [] 33 | for i in range(len(string)): 34 | cycles.append(string[i:] + string[:i]) 35 | return cycles 36 | 37 | 38 | def generate_repeats(min_size, max_size, output_file): 39 | alphabet = ['A', 'C', 'G', 'T'] 40 | 
expanded_set = set() 41 | repeat_set = set() 42 | for i in range(min_size, max_size+1): 43 | for combination in product(alphabet, repeat=i): 44 | repeat = ''.join(combination) 45 | repeat_revcomp = rev_comp(repeat) 46 | expanded = expand_repeat(repeat, max_size) 47 | if expanded in expanded_set: 48 | continue 49 | else: 50 | repeat_cycles = get_cycles(repeat) 51 | for cycle in repeat_cycles: 52 | strand = '+' 53 | string = expand_repeat(cycle, max_size) 54 | expanded_set.add(string) 55 | if cycle not in repeat_set: 56 | repeat_set.add(cycle) 57 | print(cycle, repeat, str(len(cycle)), strand, sep='\t', file=output_file) 58 | if repeat_revcomp == repeat: 59 | continue 60 | repeat_cycles = get_cycles(repeat_revcomp) 61 | for cycle in repeat_cycles: 62 | strand = '-' 63 | string = expand_repeat(cycle, max_size) 64 | expanded_set.add(string) 65 | if cycle not in repeat_set: 66 | repeat_set.add(cycle) 67 | print(cycle, repeat, str(len(cycle)), strand, sep='\t', file=output_file) 68 | 69 | min_motif_size = args.min_motif_size 70 | max_motif_size = args.max_motif_size 71 | generate_repeats(min_motif_size, max_motif_size, args.output) 72 | --------------------------------------------------------------------------------