├── .gitignore ├── README.md ├── sv.py ├── svimmer ├── test ├── HG002_30x_Manta_v1.6.0_test_data.vcf.gz ├── HG002_30x_Manta_v1.6.0_test_data.vcf.gz.tbi ├── HG003_30x_Manta_v1.6.0_test_data.vcf.gz ├── HG003_30x_Manta_v1.6.0_test_data.vcf.gz.tbi ├── HG004_30x_Manta_v1.6.0_test_data.vcf.gz ├── HG004_30x_Manta_v1.6.0_test_data.vcf.gz.tbi └── expected_output.vcf ├── test_vcfs └── utilities.py /.gitignore: -------------------------------------------------------------------------------- 1 | /__pycache__/ 2 | *.pyc 3 | *.pyo 4 | /*.vcf 5 | /*.vcf.gz 6 | /*.vcf.gz.tbi 7 | /*.rp 8 | /*.rpdb 9 | /.*/ 10 | /*.sh 11 | /test/actual_output.vcf 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## svimmer - SV merging tool 2 | 3 | Merges similar SVs from multiple single sample VCF files. The tool was written for merging SVs discovered using Manta calls, but should support (almost) any SV VCFs. The output is a VCF file containing all merged SV sites (with no calls). The output can be given as input into [GraphTyper](https://github.com/DecodeGenetics/graphtyper) to genotype the sites. 4 | 5 | ### Requirements 6 | 7 | * Python 3.4+ 8 | * pysam 9 | 10 | ### Usage 11 | 12 | ```sh 13 | python3 svimmer input_vcfs chrA chrB chrC ... 14 | ``` 15 | 16 | where input is a list of tabix indexed+bgzipped VCF files and chromosomes are the chromosomes to merge. For further details see the help page: 17 | 18 | ```sh 19 | python3 svimmer -h 20 | ``` 21 | 22 | ## Test data example 23 | 24 | ```sh 25 | python3 svimmer test_vcfs chr20 > test/actual_output.vcf 26 | diff test/actual_output.vcf test/expected_output.vcf 27 | ``` 28 | 29 | ## License 30 | GNU GPLv3 31 | -------------------------------------------------------------------------------- /sv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from utilities import calculate_overlap, calculate_stddev, get_most_common_item 5 | 6 | SV_TYPES = ['UNK', 'BND', 'DEL', 'INS', 'INV', 'NOT_SV'] 7 | 8 | def make_info_dictionary(info): 9 | spl_info = info.split(";") 10 | info_dict = {} 11 | 12 | for key_val in spl_info: 13 | if "=" in key_val: 14 | key, val = key_val.split("=") 15 | info_dict[key] = val 16 | else: 17 | info_dict[key_val] = "" 18 | 19 | return info_dict 20 | 21 | 22 | class SV(object): 23 | """ 24 | Constructs a structural variant (SV) candidate 25 | """ 26 | def __init__(self, 27 | vcf_record, 28 | check_type = True, 29 | join_mode = False, 30 | output_ids = False, 31 | ignore_bnd = False, 32 | ignore_inv = False): 33 | self.is_outputting_ids = output_ids 34 | self.join_mode = join_mode 35 | self.is_sv = True 36 | 37 | spl_line = vcf_record.rstrip("\n").split("\t")[:8] 38 | self.check_type = check_type 39 | self.chromosome = spl_line[0] 40 | self.begin = int(spl_line[1]) 41 | self.end = self.begin 42 | 43 | if self.is_outputting_ids: 44 | self.ids = [spl_line[2]] 45 | 46 | # If no INFO, add SVTYPE=UNK 47 | if spl_line[7] == ".": 48 | spl_line[7] = "" 49 | 50 | info_dict = make_info_dictionary(spl_line[7]) 51 | 52 | if "SVTYPE" not in info_dict: 53 | ref = spl_line[3] 54 | alt = spl_line[4] 55 | 56 | # Only biallelic 57 | alt_spl = [x for x in alt.split(",") if x != "*"] 58 | 59 | if len(alt_spl) == 1: 60 | if len(ref) >= len(alt) + 50: 61 | spl_line[4] = alt_spl[0] 62 | 63 | if len(spl_line[7]) > 0: 64 | spl_line[7] += ";" 65 | 66 | spl_line[7] = "%sSVTYPE=DEL;SVLEN=%d" % (spl_line[7], len(alt) - len(ref)) 67 | info_dict["SVTYPE"] = "DEL" 68 | info_dict["SVSIZE"] = "%d" % (len(ref) - len(alt)) 69 | info_dict["SVLEN"] = "%d" % (len(alt) - len(ref)) 70 | elif len(alt) >= len(ref) + 50: 71 | spl_line[4] = alt_spl[0] 72 | 73 | if len(spl_line[7]) > 0: 74 | spl_line[7] += ";" 75 | 76 | spl_line[7] = "%sSVTYPE=INS;SVLEN=%d" % (spl_line[7], len(alt) - len(ref)) 77 | info_dict["SVTYPE"] = "INS" 78 | info_dict["SVSIZE"] = "%d" % (len(alt) - len(ref)) 79 | info_dict["SVLEN"] = "%d" % (len(alt) - len(ref)) 80 | else: 81 | self.is_sv = False 82 | return None 83 | else: 84 | self.is_sv = False 85 | return None 86 | 87 | if "SVTYPE" not in info_dict: 88 | info_dict["SVTYPE"] = "NOT_SV" 89 | self.is_sv = False 90 | return None 91 | else: 92 | if (ignore_bnd and (info_dict["SVTYPE"] == "BND" or info_dict["SVTYPE"] == "TRA")) or \ 93 | (ignore_inv and info_dict["SVTYPE"] == "INV"): 94 | self.is_sv = False 95 | return None 96 | 97 | # Join related SV types 98 | if info_dict["SVTYPE"] == "DEL_ALU" or info_dict["SVTYPE"] == "DEL_LINE1": 99 | info_dict["SVTYPE"] = "DEL" 100 | elif info_dict["SVTYPE"] == "ALU" or info_dict["SVTYPE"] == "LINE1" or info_dict["SVTYPE"] == "SVA" or \ 101 | info_dict["SVTYPE"] == "DUP" or info_dict["SVTYPE"] == "CNV" or info_dict["SVTYPE"] == "INVDUP" or \ 102 | info_dict["SVTYPE"] == "INV": 103 | info_dict["SVTYPE"] = "INS" 104 | elif info_dict["SVTYPE"] == "TRA": 105 | info_dict["SVTYPE"] = "BND" 106 | 107 | 108 | 109 | # Remove old values 110 | self.old_num_merged_svs = -1 111 | 112 | if not join_mode and "NUM_MERGED_SVS" in info_dict: 113 | #print(int(info_dict["NUM_MERGED_SVS"])) 114 | self.old_num_merged_svs = int(info_dict["NUM_MERGED_SVS"]) 115 | spl_line[7] = spl_line[7].replace("NUM_MERGED_SVS=%s;" % info_dict["NUM_MERGED_SVS"], "") 116 | 117 | if "STDDEV_POS" in info_dict: 118 | spl_line[7] = spl_line[7].replace("STDDEV_POS=%s;" % info_dict["STDDEV_POS"], "") 119 | 120 | if "SVTYPE" in info_dict and info_dict["SVTYPE"] == "DEL": 121 | if "END" in info_dict: 122 | self.end = int(info_dict["END"]) 123 | else: 124 | if "SVSIZE" in info_dict: 125 | self.end = self.begin + abs(int(info_dict["SVSIZE"])) 126 | elif "SVLEN" in info_dict: 127 | self.end = self.begin + abs(int(info_dict["SVLEN"])) 128 | 129 | if self.end - 50 < self.begin: 130 | self.is_sv = False 131 | return None 132 | elif "SVTYPE" in info_dict and info_dict["SVTYPE"] == "INS": 133 | svlen = int(info_dict["SVLEN"]) if "SVLEN" in info_dict else -1 134 | 135 | if svlen == -1: 136 | svlen = int(info_dict["SVSIZE"]) if "SVSIZE" in info_dict else -1 137 | 138 | if svlen < 50 and "SVINSSEQ" not in info_dict and "LEFT_SVINSSEQ" not in info_dict and "RIGHT_SVINSSEQ" not in info_dict: 139 | self.is_sv = False 140 | return None 141 | 142 | if check_type: 143 | if info_dict["SVTYPE"] in SV_TYPES: 144 | self.type = SV_TYPES.index(info_dict["SVTYPE"]) 145 | else: 146 | self.type = 0 # Unknown type 147 | assert False 148 | else: 149 | self.type = 0 150 | 151 | # Get all begins and ends 152 | self.min_begin = self.begin 153 | self.max_begin = self.begin 154 | self.begins = [self.begin] 155 | self.ends = [self.end] 156 | self.infos = [spl_line[7]] 157 | self.unique_begins_and_ends = set() 158 | self.unique_begins_and_ends.add((int(self.begin), int(self.end))) 159 | self.refs = [spl_line[3]] 160 | self.alts = [spl_line[4]] 161 | 162 | 163 | """ 164 | Converts the SV to a string 165 | """ 166 | def __str__(self): 167 | info = self.infos[0] 168 | if len(info) > 0: 169 | info += ";" 170 | 171 | num_text = "JOINED" if self.join_mode else "MERGED" 172 | 173 | num_svs = len(self.begins) 174 | 175 | if self.old_num_merged_svs > 0: 176 | num_svs += self.old_num_merged_svs - 1 177 | 178 | if self.is_outputting_ids: 179 | info += "MERGED_IDS=%s;NUM_%s_SVS=%d;STDDEV_POS=%.2f,%.2f" % (",".join(self.ids), 180 | num_text, 181 | num_svs, 182 | calculate_stddev(self.begins), 183 | calculate_stddev(self.ends) 184 | ) 185 | else: 186 | info += "NUM_%s_SVS=%d;STDDEV_POS=%.2f,%.2f" % (num_text, 187 | num_svs, 188 | calculate_stddev(self.begins), 189 | calculate_stddev(self.ends) 190 | ) 191 | 192 | return "%s\t%d\t%s\t%s\t%s\t%d\t%s\t%s\n" % ( 193 | self.chromosome, 194 | self.begin, 195 | ".", # variant ID 196 | self.refs[0], 197 | self.alts[0], 198 | 0, # quality 199 | ".", # filter 200 | info 201 | ) 202 | 203 | 204 | """ 205 | Finalize SV before printing 206 | """ 207 | def finalize(self): 208 | # Get the most common begin and end combo 209 | begins_and_ends = list(zip(self.begins, self.ends)) 210 | self.begin, self.end = get_most_common_item(begins_and_ends) 211 | 212 | for i, (begin, end) in enumerate(begins_and_ends): 213 | if begin == self.begin and end == self.end: 214 | if i > 0: 215 | self.infos[0] = self.infos[i] 216 | self.refs[0] = self.refs[i] 217 | self.alts[0] = self.alts[i] 218 | return 219 | 220 | """ 221 | Defines how to represent the SV (in printing) 222 | """ 223 | def __repr__(self): 224 | return self.__str__() 225 | 226 | """ 227 | Check if some SV should merge with this one. Merging should happen when both SVs are the same type with and overlap 228 | highly with each other. 229 | 230 | :parameter otherSV The other SV to check. 231 | :returns True if the two SVs should merge. 232 | """ 233 | def should_merge(self, other_sv, max_sv_distance, max_size_difference): 234 | assert isinstance(other_sv, SV) 235 | assert len(self.begins) == len(self.ends) 236 | 237 | if other_sv.type != self.type: 238 | return False # SVs of different types or chromosome should not merge 239 | 240 | # For sanity 241 | if abs(other_sv.max_begin - self.min_begin) > 10000 or\ 242 | abs(other_sv.min_begin - self.max_begin) > 10000: 243 | return False 244 | 245 | for begin, end in self.unique_begins_and_ends: 246 | # For other SVs, calculate their overlap 247 | if abs(begin - other_sv.begin) <= max_sv_distance and\ 248 | abs(end - other_sv.end) <= max_sv_distance and\ 249 | abs((other_sv.end - other_sv.begin) - (end - begin)) <= max_size_difference: 250 | return True # Overlap is enough to merge 251 | 252 | # We could not find any interval in this SV that did not enough, we should therefore merge these two SVs 253 | return False 254 | 255 | """ 256 | Merges two SVs. Assumes that they should be merged, which can be checked with the 'should merge' function 257 | 258 | :parameter otherSV The other SV to merge with. 259 | :returns Nothing 260 | """ 261 | def merge(self, other_sv): 262 | assert isinstance(other_sv, SV) 263 | assert self.type == other_sv.type 264 | 265 | self.min_begin = min(self.min_begin, other_sv.min_begin) 266 | self.max_begin = max(self.max_begin, other_sv.max_begin) 267 | self.begins += other_sv.begins 268 | self.ends += other_sv.ends 269 | self.infos += other_sv.infos 270 | self.refs += other_sv.refs 271 | self.alts += other_sv.alts 272 | self.unique_begins_and_ends = self.unique_begins_and_ends.union(other_sv.unique_begins_and_ends) 273 | 274 | if other_sv.old_num_merged_svs > 0: 275 | self.old_num_merged_svs += other_sv.old_num_merged_svs - 1 276 | 277 | if self.is_outputting_ids: 278 | self.ids += other_sv.ids 279 | -------------------------------------------------------------------------------- /svimmer: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from __future__ import print_function 3 | 4 | import argparse 5 | import itertools 6 | import gzip 7 | import logging 8 | import operator 9 | import os 10 | import sys 11 | from pysam import TabixFile 12 | from multiprocessing import Pool 13 | 14 | from sv import SV 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def open_vcf_file(filename): 20 | _vcf_f = None 21 | 22 | if filename.endswith(".gz"): 23 | _vcf_f = gzip.open(filename, "r") 24 | else: 25 | _vcf_f = open(filename, "r") 26 | 27 | return _vcf_f 28 | 29 | 30 | def read_header(vcf_f): 31 | header = "" 32 | 33 | while True: 34 | line = vcf_f.readline() 35 | 36 | if isinstance(line, (bytes, bytearray)): 37 | line = line.decode("utf-8") 38 | 39 | if line.startswith("##"): 40 | header += line 41 | continue 42 | 43 | break 44 | 45 | return header 46 | 47 | 48 | def add_sv_to_store(ref_stored_svs, new_sv): 49 | assert isinstance(ref_stored_svs, list) 50 | assert isinstance(new_sv, SV) 51 | 52 | for stored_sv_reverse in reversed(ref_stored_svs): 53 | if stored_sv_reverse.max_begin + args.max_distance + 1000 < new_sv.begin: 54 | break 55 | elif stored_sv_reverse.should_merge(new_sv, args.max_distance, args.max_size_difference): 56 | stored_sv_reverse.merge(new_sv) 57 | return 58 | 59 | # We could not find any SV in the stored SV list that can be merged with this one, 60 | # so let's add it 61 | ref_stored_svs.append(new_sv) 62 | 63 | 64 | def add_sv_to_many_in_store(ref_stored_svs, new_sv): 65 | assert isinstance(ref_stored_svs, list) 66 | assert isinstance(new_sv, SV) 67 | 68 | for stored_sv_reverse in reversed(ref_stored_svs): 69 | #if stored_sv_reverse.max_begin + args.max_distance + 1000 < new_sv.begin: 70 | # break 71 | if stored_sv_reverse.should_merge(new_sv, args.max_distance, args.max_size_difference): 72 | stored_sv_reverse.merge(new_sv) 73 | 74 | if args.join_mode_strict: 75 | return None 76 | 77 | 78 | def append_svs_from_vcf(vcf_filename, chrom, start=None, stop=None): 79 | svs = [] 80 | vcf_filename = vcf_filename.rstrip("\n") 81 | vcf_index = vcf_filename + ".csi" 82 | 83 | # Use .tbi index if it exists, otherwise try opening .csi 84 | if os.path.exists(vcf_filename + ".tbi"): 85 | vcf_index = vcf_filename + ".tbi" 86 | 87 | try: 88 | with TabixFile(vcf_filename, "r", None, vcf_index) as new_vcf_f: 89 | for record in new_vcf_f.fetch(chrom, start, stop): 90 | new_sv = SV(record, 91 | check_type=not args.ignore_types, 92 | join_mode=args.join_mode, 93 | output_ids=args.ids, 94 | ignore_bnd=args.ignore_bnd, 95 | ignore_inv=args.ignore_inv) 96 | 97 | if new_sv.is_sv and new_sv.begin >= start: 98 | svs.append(new_sv) 99 | except ValueError as e: 100 | print(e) 101 | print("Warning: Contig '%s' was not found in file %s!" % (chrom, vcf_filename)) 102 | pass # If contig does not exists 103 | except OSError as e2: 104 | print(e2) 105 | print("Error: Input file is expected to be bgzipped and index (either with .tbi or .csi)") 106 | raise 107 | 108 | return svs 109 | 110 | 111 | if __name__ == "__main__": 112 | parser = argparse.ArgumentParser() 113 | parser.add_argument("input", help="File containing all input VCF files.") 114 | parser.add_argument("chromosomes", nargs="+", help="Which chromosome(s) to merge") 115 | parser.add_argument("--output", default="-", help="Output VCF file.") 116 | parser.add_argument("--loglevel", default="WARNING", help="Which log level to use, should be INFO, WARNING or ERROR.") 117 | parser.add_argument("--threads", default=1, type=int, help="Number of threads to use.") 118 | parser.add_argument("--max_distance", 119 | default=200, 120 | type=int, 121 | help="Maximum distance between breakpoint ends to allow merging (default 200).") 122 | parser.add_argument("--max_size_difference", 123 | default=100, 124 | type=int, 125 | help="""Maximum size difference of SVs that can be merged (default 100).\ 126 | -1 means no limit.""" 127 | ) 128 | parser.add_argument("--ignore-types", 129 | dest="ignore_types", 130 | action="store_true", 131 | help="Set if the merging should ignore the SV type." 132 | ) 133 | parser.add_argument("--ignore-bnd", 134 | dest="ignore_bnd", 135 | action="store_true", 136 | help="""Set if the merging should ignore break-end (BND) SVs.""" 137 | ) 138 | parser.add_argument("--ignore-inv", 139 | dest="ignore_inv", 140 | action="store_true", 141 | help="""Set if the merging should ignore inversion (INV) SVs.""" 142 | ) 143 | parser.add_argument("--join-mode", 144 | dest="join_mode", 145 | action="store_true", 146 | help="""Set if the merging should join VCFs from the first file to the other\ 147 | files.""" 148 | ) 149 | parser.add_argument("--join-mode-strict", 150 | dest="join_mode_strict", 151 | action="store_true", 152 | help="""Set if the merging should join VCFs from the first file to the other\ 153 | files. In strict join mode the SVs from the other files are only allowed to join against\ 154 | at most one SV from the first file.""" 155 | ) 156 | parser.add_argument("--region-start", 157 | dest="region_start", 158 | default=1, 159 | type=int, 160 | help="""Start coordinate of region to merge/join.""" 161 | ) 162 | parser.add_argument("--region-end", 163 | dest="region_end", 164 | default=17179869183, 165 | type=int, 166 | help="""End coordinate of region to merge/join. -1 means end of chromsome.""" 167 | ) 168 | parser.add_argument("--ids", action="store_true", help="""Print variant IDs in INFO field of merged SVs.""") 169 | args = parser.parse_args() 170 | 171 | # Strict join mode implies join mode 172 | if args.join_mode_strict: 173 | args.join_mode = True 174 | 175 | if args.max_size_difference < 0: 176 | args.max_size_difference = 400000000 177 | 178 | header = "" 179 | all_svs = [] # List with all processed SVs 180 | 181 | lines = [] 182 | pool = Pool(args.threads) 183 | 184 | FORMAT = '%(asctime)-15s: %(message)s' 185 | logging.basicConfig(format=FORMAT) 186 | logger.setLevel(logging.getLevelName(args.loglevel)) 187 | logger.info("Starting to merge") 188 | 189 | 190 | with open(args.input, "r") as f_in: 191 | lines = f_in.read().rstrip("\n").split("\n") 192 | vcf_f = open_vcf_file(filename=lines[0].rstrip("\n")) # Open the first VCF 193 | header = read_header(vcf_f) # Read the header of the first VCF file 194 | vcf_f.close() # Close the VCF file 195 | 196 | with sys.stdout if args.output == "-" else open(args.output, "w") as f_out: 197 | ## Write header 198 | f_out.write(header) # Print header of first VCF file 199 | 200 | # Add some lines to the output VCF header 201 | if args.ids: 202 | f_out.write("""##INFO=\n""") 203 | 204 | if args.join_mode: 205 | f_out.write("""##INFO=\n""") 206 | else: 207 | f_out.write("""##INFO=\n""") 208 | 209 | f_out.write("""##INFO=\n""") 210 | f_out.write("""#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n""") 211 | 212 | for chrom in args.chromosomes: 213 | if args.join_mode: 214 | all_svs = append_svs_from_vcf(lines[0], chrom, args.region_start, args.region_end) 215 | else: 216 | results = pool.starmap(append_svs_from_vcf, zip(lines, 217 | itertools.repeat(chrom), 218 | itertools.repeat(args.region_start), 219 | itertools.repeat(args.region_end)), chunksize=1) 220 | all_svs = [item for sublist in results for item in sublist] 221 | all_svs.sort(key = operator.attrgetter('begin', 'end')) # Sort all SVs 222 | 223 | logger.info("Finished reading all VCFs for chrom: %s" % chrom) 224 | stored_svs = [] 225 | 226 | if args.join_mode: 227 | # Join mode 228 | while len(all_svs) > 0: 229 | stored_svs.append(all_svs.pop(0)) 230 | 231 | for line_in in lines[1:]: 232 | all_svs = append_svs_from_vcf(line_in, chrom) 233 | all_svs.sort(key = operator.attrgetter('begin', 'end')) # Sort all SVs 234 | 235 | while len(all_svs) > 0: 236 | add_sv_to_many_in_store(stored_svs, all_svs.pop(0)) 237 | elif len(all_svs) > 0: 238 | # Merge mode 239 | stored_svs.append(all_svs.pop(0)) # Start merging SVs 240 | 241 | while len(all_svs) > 0: 242 | add_sv_to_store(stored_svs, all_svs.pop(0)) 243 | 244 | logger.info("Finished merging the SV sites") 245 | 246 | for stored_sv in stored_svs: 247 | stored_sv.finalize() 248 | 249 | logger.info("Finished finalizing") 250 | stored_svs.sort(key = operator.attrgetter('begin', 'end')) # Sort all before outputting 251 | logger.info("Finished sorting") 252 | 253 | for stored_sv in stored_svs: 254 | f_out.write(str(stored_sv)) 255 | 256 | logger.info("Everything is completed") 257 | -------------------------------------------------------------------------------- /test/HG002_30x_Manta_v1.6.0_test_data.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DecodeGenetics/svimmer/2af9ccf1ac056b9516c25ec4a0121657a9f9ab8e/test/HG002_30x_Manta_v1.6.0_test_data.vcf.gz -------------------------------------------------------------------------------- /test/HG002_30x_Manta_v1.6.0_test_data.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DecodeGenetics/svimmer/2af9ccf1ac056b9516c25ec4a0121657a9f9ab8e/test/HG002_30x_Manta_v1.6.0_test_data.vcf.gz.tbi -------------------------------------------------------------------------------- /test/HG003_30x_Manta_v1.6.0_test_data.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DecodeGenetics/svimmer/2af9ccf1ac056b9516c25ec4a0121657a9f9ab8e/test/HG003_30x_Manta_v1.6.0_test_data.vcf.gz -------------------------------------------------------------------------------- /test/HG003_30x_Manta_v1.6.0_test_data.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DecodeGenetics/svimmer/2af9ccf1ac056b9516c25ec4a0121657a9f9ab8e/test/HG003_30x_Manta_v1.6.0_test_data.vcf.gz.tbi -------------------------------------------------------------------------------- /test/HG004_30x_Manta_v1.6.0_test_data.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DecodeGenetics/svimmer/2af9ccf1ac056b9516c25ec4a0121657a9f9ab8e/test/HG004_30x_Manta_v1.6.0_test_data.vcf.gz -------------------------------------------------------------------------------- /test/HG004_30x_Manta_v1.6.0_test_data.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DecodeGenetics/svimmer/2af9ccf1ac056b9516c25ec4a0121657a9f9ab8e/test/HG004_30x_Manta_v1.6.0_test_data.vcf.gz.tbi -------------------------------------------------------------------------------- /test/expected_output.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##fileDate=20210107 3 | ##contig= 4 | ##INFO= 5 | ##INFO= 6 | #CHROM POS ID REF ALT QUAL FILTER INFO 7 | chr20 41196482 . T TTATAAATATATATTATATATAATATATATTTATATAAATATATATTATATATAAATATATATTATATATAATATATATTTATATAAATATATATTATATATTATATAAATATTATATATAATATATATTTA 0 . END=41196482;SVTYPE=INS;SVLEN=131;CIGAR=1M131I;CIPOS=0,20;HOMLEN=20;HOMSEQ=TATAAATATATATTATATAT;NUM_MERGED_SVS=3;STDDEV_POS=0.00,0.00 8 | chr20 41257715 . AATTCTCCTGCCTCAGCCTCCTTAGTAGCTGGGACTACAGGCACACGCCACCATGCCTGGCTAAGTTTTCGTATTTTTAGTAGAGACGGGGTTTCACCATGCTAGCCAGGCTGGTCTCGAACTCCTGACCTTGTGATCTGCCCACCTTGGCCTCCCAAAGTGCTGGGATTACAGGTGTTAGCCACCACGCCCAACCCTTTTTTTTTTTGAGACGGAGTTTTGCTCTTGTCAGCCAGGCTGGGGTACAGTGGCACAGCTCACTGCAACCTCCACCTCCCAGGTTCAAGTG A 0 . END=41258003;SVTYPE=DEL;SVLEN=-288;CIGAR=1M288D;CIPOS=0,9;HOMLEN=9;HOMSEQ=ATTCTCCTG;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 9 | chr20 41277701 . C 0 . END=41278993;SVTYPE=DEL;SVLEN=-1292;IMPRECISE;CIPOS=-393,393;CIEND=-511,511;NUM_MERGED_SVS=3;STDDEV_POS=12.49,71.07 10 | chr20 41694772 . A AATGTCAAATTGCGATAAATAGAACACCCCCACCCCCAGGATGCTACTAGAGAGAATAATGGAGGAAAATTGATACATTAGATTAGAATAAT 0 . END=41694772;SVTYPE=INS;SVLEN=91;CIGAR=1M91I;CIPOS=0,46;HOMLEN=46;HOMSEQ=ATGTCAAATTGCGATAAATAGAACACCCCCACCCCCAGGATGCTAC;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 11 | chr20 41912429 . AACTTCCACCTCCCGGGTTCAAGCAATTCTCCTTCCTCAGCCTCCCGAGTAGCTGGG A 0 . END=41912485;SVTYPE=DEL;SVLEN=-56;CIGAR=1M56D;CIPOS=0,3;HOMLEN=3;HOMSEQ=ACT;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 12 | chr20 42213265 . G GTTTCACATCATTAAAAGTTGTAGAATCTTGGATTCTCTGGCTGGAAAGGCCCTCCCA 0 . END=42213265;SVTYPE=INS;SVLEN=57;CIGAR=1M57I;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 13 | chr20 42633975 . ATATTATATATTATATTATATATATTATATATATATAATATATATATAATATATATATTATATATATTATATATATATTATAAATATATAATATATATATAATATAT A 0 . END=42634081;SVTYPE=DEL;SVLEN=-106;CIGAR=1M106D;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 14 | chr20 43162622 . TCTCCATGGTCCATCACTCTGGAGGCAGCAAGGAGTCAAGCTTCCAGGATCTGGTCTGGGGCTGCCTGGCTCCCCTTCCCTTCAGCCACACCAGTCAGTCACCCATGGTTCCAACCACACCAAAGGGTTTGGCTTTTCATGATGACCCGCTGGACAGCAGCTTGGCTCAGGCTGCTCTTCTGTCTG T 0 . END=43162807;SVTYPE=DEL;SVLEN=-185;CIGAR=1M185D;CIPOS=0,4;HOMLEN=4;HOMSEQ=CTCC;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 15 | chr20 43395954 . A 0 . END=43397152;SVTYPE=DEL;SVLEN=-1198;CIPOS=0,18;CIEND=0,18;HOMLEN=18;HOMSEQ=TGGTAAAACCCCATTTCT;NUM_MERGED_SVS=3;STDDEV_POS=0.00,0.00 16 | chr20 43643224 . A 0 . END=43645948;SVTYPE=DEL;SVLEN=-2724;CIPOS=0,35;CIEND=0,35;HOMLEN=35;HOMSEQ=CCTGTAATCCCAGCACTTTGGGAGGCCGAGGCAGG;NUM_MERGED_SVS=3;STDDEV_POS=0.00,0.00 17 | chr20 43696486 . A 0 . END=43696990;SVTYPE=DUP;SVLEN=504;CIPOS=0,23;CIEND=0,23;HOMLEN=23;HOMSEQ=CTGGGATTACAGGCGTGAGCCAC;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 18 | chr20 44047485 . TTAACATCTGCACTAAGAAAAATTCCTCTGCCTTGGGATCCTGTTGATCTGTGACCTTACCCCCAACCCTGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTAAATGGATTAAGGGCGGTGCAAGATGTGCTTTGTTAAACAGATGCTTGAAGGCAGCATGCTCGTTAAGAGTCATCACCAATCCCTAATCTCAAGTAATCAGGGACACAAACACTGCGGAAGGCCGCAGGGTCCTCTGCCTAGGAAAACCAGAGACCTTTGTTCACTTGTTTATCTGCTGACCTTCCCTCCACTATTGTCCCATGACCCTGCCAAATCCCCCTCTGTGAGAAACACCCAAGAATTATCAATAAAAAAAAATTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA T 0 . END=44047884;SVTYPE=DEL;SVLEN=-399;CIGAR=1M399D;CIPOS=0,3;HOMLEN=3;HOMSEQ=TAA;NUM_MERGED_SVS=3;STDDEV_POS=0.00,0.00 19 | chr20 44134468 . ATAAATATATATTTATAATATATATTTATTATATATAAATAATATATATTTATAATAAATATATATAAATAAATATATATTTATAATATATATAAATAAATATATATTTATTATAAATATATATAAATATATATTTATTATAAATATAATAAATATATATTTATTATAAATATATAAATATTTATTATAAATATATATAAATATATATTTATAAATATTATAAATATATATACATTAATATATATTATAAATATATATT AAA 0 . END=44134726;SVTYPE=DEL;SVLEN=-258;CIGAR=1M2I258D;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 20 | chr20 44404776 . T 0 . END=44404776;SVTYPE=INS;CIPOS=0,4;CIEND=0,4;HOMLEN=4;HOMSEQ=GTGT;LEFT_SVINSSEQ=GTGTTTGTTGATGTGTGTGTATGTGTGCGTGTGTGTGAATTGTGTGT;RIGHT_SVINSSEQ=TGTGTGTTGATGTGTGTGGAGTGTGTGGTGTGTGTGGTGTGTGGGTGTGTGGTGTGTGTGTGTGTTTGTGTGTGGTGTGTGTGCGTGTGTGTGGACTGTGTGGTGCGTGTGTGTGCTTGTGTGTGGACTGTGGATTGTGTGGTGTGTGTGTGCGCAC;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 21 | chr20 44677204 . G G]chr20:44683574] 0 . SVTYPE=BND;MATEID=MantaBND:2404:0:1:0:0:0:1;CIPOS=0,2;HOMLEN=2;HOMSEQ=CT;BND_DEPTH=19;MATE_BND_DEPTH=38;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 22 | chr20 44680251 . C [chr20:44683272[C 0 . SVTYPE=BND;MATEID=MantaBND:2409:0:1:0:0:0:1;CIPOS=0,4;HOMLEN=4;HOMSEQ=AGGG;BND_DEPTH=19;MATE_BND_DEPTH=47;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 23 | chr20 44683268 . C [chr20:44680255[C 0 . SVTYPE=BND;MATEID=MantaBND:2409:0:1:0:0:0:0;CIPOS=0,4;HOMLEN=4;HOMSEQ=CTGA;BND_DEPTH=47;MATE_BND_DEPTH=19;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 24 | chr20 44683572 . C C]chr20:44677206] 0 . SVTYPE=BND;MATEID=MantaBND:2404:0:1:0:0:0:0;CIPOS=0,2;HOMLEN=2;HOMSEQ=AG;BND_DEPTH=38;MATE_BND_DEPTH=19;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 25 | chr20 44764203 . C CATCCATCCATCCATCCATCCATCCGTCCATCCATTTATCCATCCATCCATCCATCCATCCATCCATCCATCCATTTATCCATCCGTCCATTTATCCATCCATCCATCCACCCACCCATCCATCCATCCATCCACTT 0 . END=44764203;SVTYPE=INS;SVLEN=136;CIGAR=1M136I;CIPOS=0,9;HOMLEN=9;HOMSEQ=ATCCATCCA;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 26 | chr20 45575422 . A 0 . END=45578660;SVTYPE=DEL;SVLEN=-3238;CIPOS=0,3;CIEND=0,3;HOMLEN=3;HOMSEQ=GCA;NUM_MERGED_SVS=2;STDDEV_POS=0.00,0.00 27 | chr20 45725015 . T TATACACACATATATATATATATATATATACACATATATATATATATACAC 0 . END=45725015;SVTYPE=INS;SVLEN=50;CIGAR=1M50I;CIPOS=0,3;HOMLEN=3;HOMSEQ=ATA;NUM_MERGED_SVS=3;STDDEV_POS=1.15,1.15 28 | chr20 45730773 . CTCACTGCAACCTCCGCTTCCCAGGTTCAAGCAATTCTCCTGCCTCAGCCTCCTGAGTAGCCGGGATTACAGGCACGTGCCACCACACCCAGCTAATTTTTCTATTTTTAGTA C 0 . END=45730885;SVTYPE=DEL;SVLEN=-112;CIGAR=1M112D;NUM_MERGED_SVS=2;STDDEV_POS=0.00,0.00 29 | chr20 45738910 . CTATATATAGTTATATATATAGTTATAGTTTACAAAACTATATATAGTTATATATAGTTATAGTTTACAAAACTATATATAGTTA C 0 . END=45738994;SVTYPE=DEL;SVLEN=-84;CIGAR=1M84D;CIPOS=0,14;HOMLEN=14;HOMSEQ=TATATATAGTTATA;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 30 | chr20 45906708 . CGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGTTGGGCATGGTGGTGGGCGCCTGTAATCCCAGCTACTCAGGAGGCTGAGGCAGGAGAATCGCTTGATCCTGGGAGGTGGAGGTTGCAGGGAGCCAGGATTGCACCACTGCACTCCAGCCTGGGTGACAGAGCGAGACTCCATCTCAAAAAAAAAAAAAAAAAAAAAAGATGGCATGAGGTTCCTTTCCCTTGGTTTCTCCTACCTCCTCCTTCGCTTCCCATGCCTGTGTGTTAGGTGTGATGGGAAAATACCTTTGCCCCACAGTAGACAGAGGTCATGGCTTAGAAAAAGGGAATTCATGGCCAGGCACAGTGGCTCATGCCTATAATCCCAGCACTTTGGGAGGCCAAGGTGGGCAGATCAGGAGGTCAGGAGATCGAGACCATCCCGGCCAACAT C 0 . END=45907144;SVTYPE=DEL;SVLEN=-436;CIGAR=1M436D;CIPOS=0,24;HOMLEN=24;HOMSEQ=GGTGAAACCCCGTCTCTACTAAAA;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 31 | chr20 47156115 . CACACACACACACATATATATATATATATATATATATATATATATATATAT C 0 . END=47156165;SVTYPE=DEL;SVLEN=-50;CIGAR=1M50D;CIPOS=0,1;HOMLEN=1;HOMSEQ=A;NUM_MERGED_SVS=2;STDDEV_POS=0.00,0.00 32 | chr20 47390323 . CGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCCAGGCCGGACTGCGGACTGCAGTGGCGCAATCTCGGCTCACTGCAAGCTCCGCTTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCACCCGCCACCGCGCCCGGCTAATTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCTTGTTAGCCAGGATGGTCTCGATCTCCTGACCTCATGATCCACCCGCCTCGGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGCGCCCGGCCGCCTCCTGT CTG 0 . END=47390651;SVTYPE=DEL;SVLEN=-328;CIGAR=1M2I328D;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 33 | chr20 47481053 . T TTCTTTCCTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTC 0 . END=47481053;SVTYPE=INS;SVLEN=50;CIGAR=1M50I;CIPOS=0,1;HOMLEN=1;HOMSEQ=T;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 34 | chr20 47525638 . CGGGGCGGCTGGCCGGACGGGGGGGCTGACCCCCCCCACCTCCCTCCCGGACGGGGCGGCTGGCCGGGCGGGGGGCTGACCCCCCCACCTCCATCCCGGAGGGGGCGGCTGGCCGGGCGGGGGGCTGACCCCCCCCACCTCCCTCCCAGACGGGGCGGCTGGCCGGGCAGAGGGGCTCCTCACTTCCCAGTAGGGGCGGCCGGGCAGAGGTGCCCCTCACTTCCCGGATGGGG CAGGT 0 . END=47525870;SVTYPE=DEL;SVLEN=-232;CIGAR=1M4I232D;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 35 | chr20 48010935 . C CATCACCATCATCACAACCACCATCGTCAATATCACCATCACTTTCATAATCACCACCACCACAACCACCATCAATATCACCATCACCTTAATCATCACCACAACCACCATCAAT 0 . END=48010935;SVTYPE=INS;SVLEN=114;CIGAR=1M114I;CIPOS=0,10;HOMLEN=10;HOMSEQ=ATCACCATCA;NUM_MERGED_SVS=3;STDDEV_POS=0.00,0.00 36 | chr20 48214211 . G G]chr20:48598465] 0 . SVTYPE=BND;MATEID=MantaBND:2577:0:1:0:0:0:0;IMPRECISE;CIPOS=-478,478;BND_DEPTH=22;MATE_BND_DEPTH=27;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 37 | chr20 48214631 . C [chr20:48597827[C 0 . SVTYPE=BND;MATEID=MantaBND:2577:0:2:0:0:0:0;IMPRECISE;CIPOS=-472,473;BND_DEPTH=28;MATE_BND_DEPTH=31;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 38 | chr20 48412705 . T TTCTTTCTTCTTAAGGTGTTACAGAAGGGCAAAGTCCATCTTCTGTTCCTTCCTTCCTTCTG 0 . END=48412705;SVTYPE=INS;SVLEN=61;CIGAR=1M61I;CIPOS=0,59;HOMLEN=59;HOMSEQ=TCTTTCTTCTTAAGGTGTTACAGAAGGGCAAAGTCCATCTTCTGTTCCTTCCTTCCTTC;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 39 | chr20 48597827 . A [chr20:48214631[A 0 . SVTYPE=BND;MATEID=MantaBND:2577:0:2:0:0:0:1;IMPRECISE;CIPOS=-292,292;BND_DEPTH=31;MATE_BND_DEPTH=28;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 40 | chr20 48598465 . C C]chr20:48214211] 0 . SVTYPE=BND;MATEID=MantaBND:2577:0:1:0:0:0:1;IMPRECISE;CIPOS=-311,311;BND_DEPTH=27;MATE_BND_DEPTH=22;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 41 | chr20 48811623 . CCCCCC CACACGTGTGCATGCACACTACTGACCAGACCTGGATACACACACACACGTGCATGCACACTACTGACCAGACCTGGATACACACACACACACACGTGTGCATGCACACTACTGACCAGACCTGGATA 0 . END=48811628;SVTYPE=INS;SVLEN=127;CIGAR=1M127I5D;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 42 | chr20 49108128 . CAAAAAAAAAAAAAGGCCAGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGACCGAGGCGGGTGGATCATGAGGTCAGGAGATCGAGACCATCCTGGCTAACAAGGTGAAACCCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGCGGTGGCGGGCGCGGTGGCGGGCGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATGGCGTGAACCCGGGAAGCGGAGCTTGCAGTGAGCCGAGATTGCGCCACTGCAGTCCGCAGTCCGGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAAAAAAAA C 0 . END=49108453;SVTYPE=DEL;SVLEN=-325;CIGAR=1M325D;CIPOS=0,13;HOMLEN=13;HOMSEQ=AAAAAAAAAAAAA;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 43 | chr20 49126606 . A 0 . END=49126641;SVTYPE=INS;LEFT_SVINSSEQ=AAAAAAAAAAAACTAAAAAATCAAACCAAAATAACCCAAAAAACTCAAAAAAAAAAAAAA;RIGHT_SVINSSEQ=GCTAAAGGGAGTTATGACCACCAAACTGCCCCCCAGCCTGGGAAAAAAAGAAAACCCCCTTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAACAAAAAAACTTAAAAATCTAACCGATATTAGCCAAGACACTC;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 44 | chr20 49834407 . ATATATATATACTGTATATATATACTGTATATATATATATATACTGTATATATATATACTGTG A 0 . END=49834469;SVTYPE=DEL;SVLEN=-62;CIGAR=1M62D;CIPOS=0,10;HOMLEN=10;HOMSEQ=TATATATATA;NUM_MERGED_SVS=1;STDDEV_POS=0.00,0.00 45 | chr20 49927198 . CAAAGGTTGCAGTGAGCCGAGATGGTACCACTGCACTCCAGCCTGGGAACAAAGTGAGACTCCATCTCAAAAAAAAAAATA C 0 . END=49927278;SVTYPE=DEL;SVLEN=-80;CIGAR=1M80D;CIPOS=0,2;HOMLEN=2;HOMSEQ=AA;NUM_MERGED_SVS=3;STDDEV_POS=0.00,0.00 46 | -------------------------------------------------------------------------------- /test_vcfs: -------------------------------------------------------------------------------- 1 | test/HG002_30x_Manta_v1.6.0_test_data.vcf.gz 2 | test/HG003_30x_Manta_v1.6.0_test_data.vcf.gz 3 | test/HG004_30x_Manta_v1.6.0_test_data.vcf.gz 4 | -------------------------------------------------------------------------------- /utilities.py: -------------------------------------------------------------------------------- 1 | def calculate_overlap(i1_begin, i1_end, i2_begin, i2_end): 2 | """ 3 | Calculates percentage of overlapping positions between two internvals, [i1_begin, i1_end[ and [i2_begin, i2_end[. 4 | 5 | @parameter i1_begin Begin position of interval 1. 6 | @parameter i1_end End position of interval 1. Should be greater or equal to i1_begin. 7 | @parameter i2_begin Begin position of interval 2. 8 | @parameter i2_end End position of interval 2. Should be greater or equal to i2_begin. 9 | 10 | @returns Percentage in a float number in the range [0.0, 100.0] 11 | """ 12 | 13 | # Make sure the ranges are valid 14 | assert i1_begin <= i1_end 15 | assert i2_begin <= i2_end 16 | 17 | # Calculate the number of overlapping bases 18 | overlapping_bases = max(0, min(i1_end, i2_end) - max(i1_begin, i2_begin)) 19 | 20 | # Calculate the size of the larger interval 21 | larger_interval_size = max(1, max(i1_end - i1_begin, i2_end - i2_begin)) 22 | 23 | # Divide the two numbers number of overlapping bases with the size of larger interval 24 | return float(100.0 * overlapping_bases) / float(larger_interval_size) 25 | 26 | 27 | # Unit tests for the function 'calculate_overlap()' 28 | assert calculate_overlap(0, 1, 2, 3) == 0.0 29 | assert calculate_overlap(2, 3, 0, 1) == 0.0 30 | assert calculate_overlap(0, 1, 0, 1) == 100.0 31 | assert calculate_overlap(0, 1, 1, 1) == 0.0 32 | assert calculate_overlap(0, 2, 1, 1) == 0.0 33 | assert calculate_overlap(0, 2, 1, 2) == 50.0 34 | assert calculate_overlap(0, 3, 1, 2) == float(100)/float(3) 35 | assert calculate_overlap(0, 100, 32, 62) == float(3000)/float(100) 36 | assert calculate_overlap(0, 1000, 32, 62) == float(3000)/float(1000) 37 | assert calculate_overlap(0, 1000, 32, 1042) == float(100.0 * (1000-32))/float(1010) 38 | assert calculate_overlap(32, 1042, 0, 1000) == float(100.0 * (1000-32))/float(1010) 39 | 40 | 41 | def calculate_mean(data): 42 | """ 43 | Calculates the sample mean of data. Returns a float. 44 | """ 45 | n = len(data) 46 | assert n > 0 47 | return float(sum(data)/float(len(data))) 48 | 49 | 50 | # Unit tests for the function 'calculate_mean()' 51 | assert calculate_mean([0.0]) == 0.0 52 | assert calculate_mean([2]) == 2.0 53 | assert calculate_mean([1, 2]) == 1.5 54 | assert calculate_mean([100, -100]) == 0.0 55 | assert calculate_mean(range(10)) == 4.5 56 | assert calculate_mean(range(11)) == 5.0 57 | 58 | 59 | def calculate_ssd(data): 60 | """ 61 | Calculates the sum of square deviation of data. 62 | """ 63 | c = calculate_mean(data) 64 | ss = sum((float(x) - c)**2 for x in data) 65 | return float(ss) 66 | 67 | 68 | def calculate_stddev(data, degrees_of_freedom=1.0): 69 | """ 70 | Calculates the population standard deviation of data. 71 | """ 72 | n = len(data) 73 | 74 | if n < 2: 75 | return 0.0 # variance requires at least two data points 76 | 77 | ssd = calculate_ssd(data) 78 | pvar = ssd / float(n - degrees_of_freedom) 79 | return pvar ** 0.5 80 | 81 | 82 | def calculate_stddev_no_outliers(data, degrees_of_freedom=1.0): 83 | """ 84 | Calculates the population standard deviation of data but ignores outliers 85 | """ 86 | n = len(data) 87 | 88 | if n < 2: 89 | return 0.0 # variance requires at least two data points 90 | 91 | c = calculate_mean(data) 92 | ssd_values = [(float(x) - c) ** 2 for x in data] 93 | mean_ssd_value = calculate_mean(ssd_values) 94 | ssd = float(sum(val for val in ssd_values if val <= 5 * mean_ssd_value)) 95 | pvar = ssd / float(n - degrees_of_freedom) 96 | return pvar ** 0.5 97 | 98 | 99 | def get_most_common_item(data): 100 | """ 101 | Returns the most common item of a list. The list may contain tuples 102 | """ 103 | assert isinstance(data, list) 104 | return max(set(data), key=data.count) 105 | 106 | 107 | assert get_most_common_item([0]) == 0 108 | assert get_most_common_item([0, 1, 1]) == 1 109 | assert get_most_common_item([0, 1, 1, 0, 100, 0]) == 0 110 | assert get_most_common_item([(0, 1), (0, 2), (0, 2), (0, 3)]) == (0, 2) 111 | assert get_most_common_item([(0, 1), (0, 2), (0, 2), (0, 3), (0, 1), (0, 1)]) == (0, 1) 112 | --------------------------------------------------------------------------------