├── MANIFEST.in ├── MsPAC ├── MsPAC.py ├── __init__.py ├── assemble_haplotype.py ├── bash │ ├── assemble_window_v3.sh │ ├── assign_reads_to_haplotype.sh │ ├── get_msa_coords.sh │ ├── map.sh │ └── sv_calling.sh ├── haplotype_assignment.py ├── prepping_reads.py ├── python │ ├── assign_reads_to_haplotypes.py │ ├── extract_raw_reads_from_bam_fofn.py │ ├── hmm2.py │ ├── msa_to_variants.py │ └── start_end_coordinates.py └── sv_calling.py ├── README.md ├── cfg_readme.md ├── environment.yml ├── environment_mac.yaml ├── setup.py └── testing ├── run.cfg └── run.sh /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include MsPAC/bash/* 2 | include MsPAC/python/* 3 | -------------------------------------------------------------------------------- /MsPAC/MsPAC.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | import os 3 | import sys 4 | import ConfigParser 5 | from string import Template 6 | from lsf.lsf import Lsf 7 | 8 | class Pipeline(object): 9 | def __init__(self,configfile,step): 10 | self.configfile = configfile 11 | self.step = step 12 | self.jobs = [] 13 | 14 | #### All steps params 15 | ## User mandatory input 16 | self.directory = None 17 | self.package_bash_directory = None 18 | self.package_python_directory = None 19 | self.cluster = None 20 | self.job_threads = None 21 | self.job_walltime = None 22 | self.job_memory = None 23 | self.job_queue = None 24 | 25 | #### Multi steps params 26 | ## User optional input 27 | self.phased_bamfile = None 28 | self.phased_bedfile = None 29 | 30 | #### phase-bam only params 31 | ## User mandatory input 32 | self.vcffile = None 33 | self.bamfile = None 34 | self.vcf_sample_name = None 35 | ## User optional input 36 | self.generate_phased_bedfile = None 37 | self.max_phased_window = None 38 | self.padding_size = None 39 | 40 | #### prep reads only params 41 | self.raw_reads_in_bam_format = None 42 | self.raw_reads_directory = None 43 | 44 | #### assembly only params 45 | ## User optional input 46 | self.min_phased_block = None 47 | self.windows_to_assemble = None 48 | self.haps_to_assemble = None 49 | self.assembly_directory = None 50 | self.flanking_length = None 51 | self.max_block_length = None 52 | self.tech = None 53 | ## Non user options 54 | self.hap0_assembly_fa = None 55 | self.hap0_assembly_fq = None 56 | self.hap1_assembly_fa = None 57 | self.hap1_assembly_fq = None 58 | self.hap2_assembly_fa = None 59 | self.hap2_assembly_fq = None 60 | 61 | #### sv calling only params 62 | self.sv_calling_directory = None 63 | self.reference = None 64 | ## Non user options 65 | self.hap1_assembly_split_fq = None 66 | self.hap2_assembly_split_fq = None 67 | 68 | def set_options(self,pipeline_options): 69 | for pipeline_option, option_input in pipeline_options: 70 | setattr(self,pipeline_option,option_input) 71 | 72 | def get_bash_scripts_path(self): 73 | return "%s/MsPAC/bash" % "/".join(os.path.dirname(__file__).split("/")[:-1]) 74 | 75 | def get_python_scripts_path(self): 76 | return "%s/MsPAC/python" % "/".join(os.path.dirname(__file__).split("/")[:-1]) 77 | 78 | def set_optional_options(self,config): 79 | if config.has_option("Params","chromosome"): 80 | self.chromosome = config.get("Params","chromosome") 81 | 82 | def all_steps_configure(self,config): 83 | pipeline_options = \ 84 | [("directory",os.path.abspath(config.get('Input','directory'))), 85 | ("package_python_directory",self.get_python_scripts_path()), 86 | ("package_bash_directory",self.get_bash_scripts_path()), 87 | ("cluster",config.get("Other params",'cluster')), 88 | ("job_threads",{"high":config.get("HIGH INTENSITY JOB","threads"), 89 | "low":config.get("LOW INTENSITY JOB","threads")}), 90 | ("job_memory",{"high":config.get("HIGH INTENSITY JOB","memory"), 91 | "low":config.get("LOW INTENSITY JOB","memory")}), 92 | ("job_walltime",{"high":config.get("HIGH INTENSITY JOB","walltime"), 93 | "low":config.get("LOW INTENSITY JOB","walltime")}), 94 | ("job_queue",{"high":config.get("HIGH INTENSITY JOB","queue"), 95 | "low":config.get("LOW INTENSITY JOB","queue")})] 96 | self.set_options(pipeline_options) 97 | self.create_directory(self.directory) 98 | 99 | def phase_bam_dependent_options(self): 100 | if self.phased_bedfile == "None": 101 | self.phased_bedfile = "%s.hap_blocks.bed" % self.phased_bamfile[:-4] 102 | 103 | def phase_bam_configure(self,config): 104 | pipeline_options = \ 105 | [("vcffile",config.get("Phase-bam input files",'phased vcf')), 106 | ("bamfile",config.get("Phase-bam input files",'reads aligned')), 107 | ("vcf_sample_name",config.get("Phase-bam params",'sample name in VCF')), 108 | ("phased_bamfile",config.get("Phase-bam params",'output phased bamfile'))] 109 | self.set_options(pipeline_options) 110 | #self.phase_bam_dependent_options() 111 | 112 | def assembly_dependent_options(self): 113 | self.hap0_assembly_fa = "%s/hap0_assembly.fasta" % self.assembly_directory 114 | self.hap0_assembly_fq = "%s/hap0_assembly.fastq" % self.assembly_directory 115 | self.hap1_assembly_fa = "%s/hap1_assembly.fasta" % self.assembly_directory 116 | self.hap1_assembly_fq = "%s/hap1_assembly.fastq" % self.assembly_directory 117 | self.hap2_assembly_fa = "%s/hap2_assembly.fasta" % self.assembly_directory 118 | self.hap2_assembly_fq = "%s/hap2_assembly.fastq" % self.assembly_directory 119 | 120 | def assembly_configure(self,config): 121 | pipeline_options = \ 122 | [("min_phased_block",config.get("Assembly params",'Minimum phased block length')), 123 | ("phased_bedfile",config.get("Assembly params",'Phased bedfile')), 124 | ("windows_to_assemble",[]), 125 | ("tech",config.get("Assembly params",'Technology')), 126 | ("haps_to_assemble",config.get("Assembly params",'Comma-seperated list of haplotypes')), 127 | ("phased_bamfile",os.path.abspath(config.get("Phase-bam params",'output phased bamfile'))), 128 | ("flanking_length",int(config.get("Assembly params",'Flanking length'))), 129 | ("max_block_length",int(config.get("Assembly params",'Max block length'))), 130 | ("raw_reads_directory",os.path.abspath(config.get("Prep reads params",'Raw reads directory'))), #ugh 131 | ("assembly_directory",os.path.abspath(config.get("Assembly params",'Assembly directory')))] 132 | self.set_options(pipeline_options) 133 | self.phase_bam_dependent_options() 134 | self.assembly_dependent_options() 135 | 136 | def prep_reads_configure(self,config): 137 | pipeline_options = \ 138 | [("raw_reads_in_bam_format",os.path.abspath(config.get("Prep reads params",'BAM fofn'))), 139 | ("raw_reads_directory",os.path.abspath(config.get("Prep reads params",'Raw reads directory'))), 140 | ("phased_bamfile",config.get("Phase-bam params",'output phased bamfile'))] 141 | self.set_options(pipeline_options) 142 | 143 | def sv_calling_dependent_options(self): 144 | self.hap1_assembly_split_fq = "%s/hap1_assembly_split.fastq" % self.sv_calling_directory 145 | self.hap2_assembly_split_fq = "%s/hap2_assembly_split.fastq" % self.sv_calling_directory 146 | 147 | def sv_calling_configure(self,config): 148 | pipeline_options = \ 149 | [("sv_calling_directory",os.path.abspath(config.get("SV calling params",'SV calling directory'))), 150 | ("assembly_directory",os.path.abspath(config.get("Assembly params",'Assembly directory'))), 151 | ("reference",os.path.abspath(config.get("SV calling params",'reference')))] 152 | self.set_options(pipeline_options) 153 | self.sv_calling_dependent_options() 154 | self.assembly_dependent_options() 155 | self.create_directory(self.sv_calling_directory) 156 | 157 | def configure(self): 158 | config = ConfigParser.RawConfigParser() 159 | config.read(self.configfile) 160 | self.all_steps_configure(config) 161 | if self.step == "phase-bam": 162 | self.phase_bam_configure(config) 163 | if self.step == "assembly": 164 | self.assembly_configure(config) 165 | if self.step == "prep-reads": 166 | self.prep_reads_configure(config) 167 | if self.step == "sv-calling": 168 | self.sv_calling_configure(config) 169 | 170 | def create_directory(self,directory): 171 | if not os.path.exists(directory): 172 | os.makedirs(directory) 173 | 174 | def write_to_bashfile(self,template_bash,bashfile,params): 175 | filein = open(template_bash) 176 | src = Template(filein.read()) 177 | output_lines = src.safe_substitute(params) 178 | bashfh = open(bashfile,'w') 179 | bashfh.write(output_lines) 180 | filein.close() 181 | bashfh.close() 182 | 183 | def non_emptyfile(self,checkfile): 184 | return os.path.isfile(checkfile) and os.path.getsize(checkfile) > 0 185 | 186 | def run_locally(self): 187 | use_cluster = self.cluster 188 | self.cluster = "no" 189 | self.submitjobs() 190 | self.cluster = use_cluster 191 | 192 | def submitjobs(self,wait=True): 193 | if len(self.jobs) == 0: 194 | return 195 | if self.cluster != "Yes": 196 | for job,intensity in self.jobs: 197 | os.system("sh %s" % job) 198 | self.jobs = [] 199 | return 200 | hpc = Lsf() 201 | for job,intensity in self.jobs: 202 | hpc.config(cpu=self.job_threads[intensity], 203 | walltime=self.job_walltime[intensity], 204 | memory=int(self.job_memory[intensity]) * int(self.job_threads[intensity]), 205 | queue=self.job_queue[intensity]) 206 | hpc.submit("%s" % job) 207 | if wait: 208 | hpc.wait() 209 | else: 210 | dummy=1 211 | # Bug gets overwritten if told not to wait twice 212 | #job_id_log = "%s/job.ids" % self.log_directory 213 | #hpc.write_ids(job_id_log) 214 | self.jobs = [] 215 | 216 | def map_reads(self,reads,reference,directory,name,intensity="low"): 217 | bashfile = "%s/%s.sh" % (directory,name) 218 | template_bash = "%s/map_reads.sh" % self.package_bash_directory 219 | params = { 220 | 'output': "%s/%s" % (directory,name), 221 | 'threads': self.job_threads[intensity], 222 | 'reads': reads, 223 | 'ref': reference, 224 | 'BLASR': self.fast_blasr 225 | } 226 | self.write_to_bashfile(template_bash,bashfile,params) 227 | return (bashfile,intensity) 228 | 229 | def __call__(self): 230 | if self.step == "phase-bam": 231 | from haplotype_assignment import HaplotypeAssignment 232 | print "Assigning reads to haplotypes..." 233 | assign_reads_to_haplotype = HaplotypeAssignment(self.configfile) 234 | assign_reads_to_haplotype.run() 235 | elif self.step == "prep-reads": 236 | from prepping_reads import PrepReads 237 | print "Prepping reads..." 238 | prep_reads = PrepReads(self.configfile) 239 | prep_reads.run() 240 | elif self.step == "assembly": 241 | from assemble_haplotype import HaplotypeAssembly 242 | print "Assembling haplotypes..." 243 | assemble_haps = HaplotypeAssembly(self.configfile) 244 | assemble_haps.run() 245 | elif self.step == "sv-calling": 246 | from sv_calling import SVCaller 247 | print "Calling SVs..." 248 | call_svs = SVCaller(self.configfile) 249 | call_svs.run() 250 | else: 251 | sys.exit("Choose one of the following steps: phase-bam, prep-reads, assembly, sv-calling") 252 | #print "Detecting structural variants..." 253 | #from calling_structural_variants import StructuralVariationDetection 254 | #calling_structural_variants = StructuralVariationDetection(self.configfile) 255 | #calling_structural_variants.run() 256 | 257 | def run_pipeline(step,configfile): 258 | pipepine = Pipeline(configfile,step) 259 | return pipepine() 260 | 261 | def main(): 262 | if len(sys.argv) < 3: 263 | sys.exit("Usage: MsPAC ") 264 | return run_pipeline(sys.argv[1],sys.argv[2]) 265 | -------------------------------------------------------------------------------- /MsPAC/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oscarlr/MsPAC/3a741dceba8e46efb0e0578eaf28c0ac1cb7b7ce/MsPAC/__init__.py -------------------------------------------------------------------------------- /MsPAC/assemble_haplotype.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | import sys 3 | import os 4 | import pysam 5 | import gzip 6 | from Bio import SeqIO 7 | from MsPAC import Pipeline 8 | 9 | class PhasedBlocks(): 10 | def __init__(self,line,min_phased_block,haps_to_assemble): 11 | min_cov = 5 12 | self.line = line 13 | line = line.rstrip().split('\t') 14 | self.chrom = line[0] 15 | self.start = int(line[1]) 16 | self.end = int(line[2]) 17 | self.hap = str(line[3]) 18 | self.comp_cost = line[4] 19 | self.cov = float(line[5]) 20 | self.length = self.end - self.start 21 | self.assemble = True 22 | if self.length < int(min_phased_block): 23 | self.assemble = False 24 | if self.hap not in haps_to_assemble.split(','): 25 | self.assemble = False 26 | if self.cov < min_cov: 27 | self.assemble = False 28 | 29 | class HaplotypeAssembly(Pipeline): 30 | def __init__(self,configfile): 31 | Pipeline.__init__(self,configfile,"assembly") 32 | 33 | def get_read_groups_regions(self): 34 | regions = { 35 | "0": {}, 36 | "1": {}, 37 | "2": {}, 38 | "0_1": {}, 39 | "0_2": {} 40 | } 41 | samfile = pysam.AlignmentFile(self.phased_bamfile) 42 | for read in samfile: 43 | if read.is_secondary: 44 | continue 45 | if read.is_supplementary: 46 | continue 47 | if read.is_unmapped: 48 | continue 49 | read_group = read.get_tag("RG") 50 | chrom = samfile.get_reference_name(read.reference_id) 51 | ref_start = read.reference_start 52 | ref_end = read.reference_end 53 | if read_group == "0": 54 | rgs = ["0","0_1","0_2"] 55 | if read_group == "1": 56 | rgs = ["1","0_1"] 57 | if read_group == "2": 58 | rgs = ["2","0_2"] 59 | for rg in rgs: 60 | if chrom not in regions[rg]: 61 | regions[rg][chrom] = [] 62 | regions[rg][chrom].append((ref_start,ref_end)) 63 | return regions 64 | 65 | def merge_regions(self,regions): 66 | sorted_by_lower_bound = sorted(regions, key=lambda tup: tup[0]) 67 | merged_regions = [] 68 | for higher in sorted_by_lower_bound: 69 | if len(merged_regions) == 0: 70 | merged_regions.append(higher) 71 | continue 72 | lower = merged_regions[-1] 73 | if higher[0] <= lower[1]: 74 | upper_bound = max(lower[1], higher[1]) 75 | merged_regions[-1] = (lower[0], upper_bound) 76 | else: 77 | merged_regions.append(higher) 78 | return merged_regions 79 | 80 | def merge_regions_coverage(self,regions,merged_regions): 81 | bases = {} 82 | regions_used = set() 83 | sorted_by_lower_bound = sorted(regions, key=lambda tup: tup[0]) 84 | for m_start, m_end in merged_regions: 85 | bases[(m_start,m_end)] = 0 86 | for r_start, r_end in sorted_by_lower_bound: 87 | a = [m_start,m_end] 88 | b= [r_start,r_end] 89 | bases_overlapping = max(0, min(a[1], b[1]) - max(a[0], b[0])) 90 | if bases_overlapping > 0: 91 | bases[(m_start,m_end)] += (r_end - r_start) 92 | regions_used.add((r_start,r_end)) 93 | bases[(m_start,m_end)] = bases[(m_start,m_end)]/float((m_end - m_start)) 94 | return bases 95 | 96 | def break_long_regions(self,regions,coverage): 97 | broken_regions = [] 98 | broken_coverage = {} 99 | for start,end in regions: 100 | for new_start in range(start,end,self.max_block_length): 101 | new_end = new_start + self.max_block_length 102 | if new_end > end: 103 | new_end = end 104 | broken_regions.append((new_start,new_end)) 105 | broken_coverage[(new_start,new_end)] = coverage[(start,end)] 106 | return (broken_regions,broken_coverage) 107 | 108 | # Max block length 109 | def create_phased_bedfile(self): 110 | window_size = 1000000 # 10 MB //Everything should get low 111 | read_group_regions = self.get_read_groups_regions() 112 | with open(self.phased_bedfile,'w') as fh: 113 | for read_group in read_group_regions: 114 | for chrom in read_group_regions[read_group]: 115 | merged_regions_pre_broken = self.merge_regions(read_group_regions[read_group][chrom]) 116 | merged_regions_coverage_p_b = self.merge_regions_coverage(read_group_regions[read_group][chrom],merged_regions_pre_broken) 117 | if self.max_block_length == None: 118 | merged_regions = merged_regions_pre_broken 119 | merged_regions_coverage = merged_regions_coverage_p_b 120 | else: 121 | merged_regions,merged_regions_coverage = self.break_long_regions(merged_regions_pre_broken,merged_regions_coverage_p_b) 122 | for start,end in merged_regions: 123 | comp_cost = "low" 124 | if end - start > window_size: 125 | comp_cost = "high" 126 | out = [chrom,start,end,read_group,comp_cost,merged_regions_coverage[(start,end)]] 127 | fh.write("%s\n" % "\t".join(map(str,out))) 128 | 129 | def load_haplotype_blocks(self): 130 | if not self.non_emptyfile(self.phased_bedfile): 131 | self.create_phased_bedfile() 132 | with open(self.phased_bedfile,'r') as fh: 133 | for line in fh: 134 | phased_block = PhasedBlocks(line,self.min_phased_block,self.haps_to_assemble) 135 | if phased_block.assemble == False: 136 | continue 137 | self.windows_to_assemble.append(phased_block) 138 | 139 | def assemble_windows(self): 140 | for window in self.windows_to_assemble: 141 | directory = "%s/%s/%s/%s_%s" % (self.assembly_directory,window.hap, 142 | window.chrom,window.start,window.end) 143 | self.create_directory(directory) 144 | if os.path.isfile("%s/done" % directory): 145 | continue 146 | template_bash = "%s/assemble_window_v3.sh" % self.package_bash_directory 147 | bashfile = "%s/assemble_window_v3.sh" % directory 148 | params = { 149 | 'output': directory, 150 | 'raw_reads_dir': self.raw_reads_directory, 151 | 'python_scripts': self.package_python_directory, 152 | 'hap': window.hap, 153 | 'subreads_to_ref': self.phased_bamfile, 154 | 'chrom': window.chrom, 155 | 'start': window.start, 156 | 'end': window.end, 157 | 'threads': self.job_threads[window.comp_cost], 158 | 'memory': int(self.job_threads[window.comp_cost])*int(self.job_memory[window.comp_cost]), 159 | 'size': max(6000,window.end - window.start), 160 | 'tech': self.tech 161 | } 162 | self.write_to_bashfile(template_bash,bashfile,params) 163 | self.jobs.append((bashfile,window.comp_cost)) 164 | self.submitjobs() 165 | 166 | def merge_sequences(self): 167 | records = {} 168 | for hap in ["hap0","hap1","hap2"]: 169 | records[hap] = {} 170 | records[hap]["fa"] = [] 171 | records[hap]["fq"] = [] 172 | for window in self.windows_to_assemble: 173 | directory = "%s/%s/%s/%s_%s" % (self.assembly_directory,window.hap, 174 | window.chrom,window.start,window.end) 175 | raw_contigs = "%s/canu/raw.contigs.fasta" % directory 176 | #raw_contigs = "%s/canu/raw.quivered.contigs.fasta" % directory 177 | if self.non_emptyfile(raw_contigs): 178 | for index,record in enumerate(SeqIO.parse(raw_contigs, "fasta")): 179 | record.id = "%s.%s.%s.%s.raw.%s/0/0_0" % (window.chrom,window.start,window.end,window.hap,index) 180 | record.description = "" 181 | record.name = "" 182 | for hap,hap_key in zip(["0","1","2"],["hap0","hap1","hap2"]): 183 | if hap in window.hap: 184 | records[hap_key]["fa"].append(record) 185 | raw_contigs = "%s/canu/raw.quivered.contigs.fastq" % directory 186 | if self.non_emptyfile(raw_contigs): 187 | print raw_contigs 188 | for index,record in enumerate(SeqIO.parse(raw_contigs, "fastq")): 189 | record.id = "%s.%s.%s.%s.raw.%s/0/0_0" % (window.chrom,window.start,window.end,window.hap,index) 190 | record.description = "" 191 | record.name = "" 192 | for hap,hap_key in zip(["0","1","2"],["hap0","hap1","hap2"]): 193 | if hap in window.hap: 194 | records[hap_key]["fq"].append(record) 195 | SeqIO.write(records["hap0"]["fa"],self.hap0_assembly_fa,"fasta") 196 | SeqIO.write(records["hap1"]["fa"],self.hap1_assembly_fa,"fasta") 197 | SeqIO.write(records["hap2"]["fa"],self.hap2_assembly_fa,"fasta") 198 | SeqIO.write(records["hap0"]["fq"],self.hap0_assembly_fq,"fastq") 199 | SeqIO.write(records["hap1"]["fq"],self.hap1_assembly_fq,"fastq") 200 | SeqIO.write(records["hap2"]["fq"],self.hap2_assembly_fq,"fastq") 201 | 202 | 203 | def run(self): 204 | self.configure() 205 | self.load_haplotype_blocks() 206 | self.assemble_windows() 207 | self.merge_sequences() 208 | -------------------------------------------------------------------------------- /MsPAC/bash/assemble_window_v3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | output=$1 5 | bash=$2 6 | smrtsuite=$3 7 | threads=$4 8 | bams_fofn=$5 9 | CANU=$6 10 | size=$7 11 | subreads_to_ref=$8 12 | hap=$9 13 | reference=${10} 14 | memory=${11} 15 | raw_reads_dir=${12} 16 | start=${13} 17 | end=${14} 18 | chrom=${15} 19 | python_scripts=${16} 20 | tech=${17} 21 | 22 | if [ "${tech}" == "ONT" ] 23 | then 24 | canu_tech="-nanopore" 25 | fi 26 | 27 | if [ "${tech}" == "PACB" ] 28 | then 29 | canu_tech="-pacbio" 30 | fi 31 | 32 | 33 | if [ "${hap}" == "0_1" ] 34 | then 35 | samtools view -F 3884 ${subreads_to_ref} -r 0 ${chrom}:${start}-${end} | awk '{ print ">"$1"\n"$10}' > ${output}/reads.fasta 36 | samtools view -F 3884 ${subreads_to_ref} -r 1 ${chrom}:${start}-${end} | awk '{ print ">"$1"\n"$10}' >> ${output}/reads.fasta 37 | elif [ "${hap}" == "0_2" ] 38 | then 39 | samtools view -F 3884 ${subreads_to_ref} -r 0 ${chrom}:${start}-${end} | awk '{ print ">"$1"\n"$10}' > ${output}/reads.fasta 40 | samtools view -F 3884 ${subreads_to_ref} -r 2 ${chrom}:${start}-${end} | awk '{ print ">"$1"\n"$10}' >> ${output}/reads.fasta 41 | else 42 | samtools view -F 3884 ${subreads_to_ref} -r ${hap} ${chrom}:${start}-${end} | awk '{ print ">"$1"\n"$10}' > ${output}/reads.fasta 43 | fi 44 | 45 | samtools faidx ${output}/reads.fasta 46 | 47 | if [ ! -s ${output}/canu/raw.contigs.fasta ] 48 | then 49 | #rm -fr ${output}/canu 50 | canu \ 51 | -p raw \ 52 | -d ${output}/canu \ 53 | contigFilter="2 1000 1.0 1.0 2" \ 54 | minInputCoverage=0 \ 55 | corMinCoverage=0 \ 56 | stopOnLowCoverage=0 \ 57 | minThreads=${threads} \ 58 | genomeSize=${size} \ 59 | useGrid=0 \ 60 | ${canu_tech} ${output}/reads.fasta 61 | fi 62 | 63 | if [ "${tech}" == "PACB" ] 64 | then 65 | if [ -s ${output}/canu/raw.contigs.fasta ] 66 | then 67 | samtools faidx ${output}/canu/raw.contigs.fasta 68 | if [ ! -s ${output}/subreads.bam ] 69 | then 70 | if [ "${hap}" == "0_1" ] 71 | then 72 | ls ${raw_reads_dir}/${chrom}/1/*bam > ${output}/subreads.fofn 73 | ls ${raw_reads_dir}/${chrom}/0/*bam >> ${output}/subreads.fofn 74 | elif [ "${hap}" == "0_2" ] 75 | then 76 | ls ${raw_reads_dir}/${chrom}/2/*bam > ${output}/subreads.fofn 77 | ls ${raw_reads_dir}/${chrom}/0/*bam >> ${output}/subreads.fofn 78 | else 79 | ls ${raw_reads_dir}/${chrom}/${hap}/*bam > ${output}/subreads.fofn 80 | fi 81 | if [ ! -s ${output}/subreads.fofn ] 82 | then 83 | echo "" > ${output}/done 84 | exit 0 85 | fi 86 | cut -f1 ${output}/reads.fasta.fai > ${output}/reads.id 87 | python \ 88 | ${python_scripts}/extract_raw_reads_from_bam_fofn.py \ 89 | ${output}/reads.id \ 90 | ${output}/subreads.fofn \ 91 | ${output}/subreads.bam 92 | pbindex ${output}/subreads.bam 93 | fi 94 | if [ ! -s ${output}/canu/reads_to_canu_contigs.sorted.bam.pbi ] 95 | then 96 | blasr \ 97 | ${output}/subreads.bam \ 98 | ${output}/canu/raw.contigs.fasta \ 99 | --bestn 1 \ 100 | --bam \ 101 | --nproc ${threads} \ 102 | --out ${output}/canu/reads_to_canu_contigs.bam 103 | samtools sort -@ ${threads} ${output}/canu/reads_to_canu_contigs.bam -o ${output}/canu/reads_to_canu_contigs.sorted.bam 104 | pbindex ${output}/canu/reads_to_canu_contigs.sorted.bam 105 | fi 106 | if [ ! -s ${output}/canu/raw.quivered.contigs.fastq ] 107 | then 108 | samtools faidx ${output}/canu/raw.contigs.fasta 109 | arrow \ 110 | --referenceFilename ${output}/canu/raw.contigs.fasta \ 111 | -j ${threads} \ 112 | -o ${output}/canu/raw.quivered.contigs.fastq \ 113 | -o ${output}/canu/raw.quivered.contigs.fasta \ 114 | ${output}/canu/reads_to_canu_contigs.sorted.bam 115 | fi 116 | fi 117 | fi 118 | 119 | echo "" > ${output}/done 120 | -------------------------------------------------------------------------------- /MsPAC/bash/assign_reads_to_haplotype.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -x 3 | 4 | python_scripts=$1 5 | vcffile=$2 6 | bamfile=$3 7 | phased_bamfile=$4 8 | vcf_sample_name=$5 9 | ## 10 | 11 | if [ ! -s "${phased_bamfile}.bai" ] 12 | then 13 | python ${python_scripts}/assign_reads_to_haplotypes.py \ 14 | ${vcffile} \ 15 | ${bamfile} \ 16 | ${vcf_sample_name} \ 17 | ${phased_bamfile} 18 | samtools index ${phased_bamfile} 19 | fi 20 | 21 | 22 | -------------------------------------------------------------------------------- /MsPAC/bash/get_msa_coords.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -x 3 | 4 | if [ ! -s ${ref}.fai ] 5 | then 6 | samtools faidx ${ref} 7 | fi 8 | 9 | cut -f1,2 ${ref}.fai > ${sv_calling_dir}/chrom.sizes 10 | 11 | for i in 1 2 12 | do 13 | python ${python_packages}/start_end_coordinates.py \ 14 | ${sv_calling_dir}/hap${i}_to_ref.sorted.bam > ${sv_calling_dir}/hap${i}_to_ref.bed 15 | 16 | bedtools genomecov \ 17 | -bg \ 18 | -i ${sv_calling_dir}/hap${i}_to_ref.bed \ 19 | -g ${sv_calling_dir}/chrom.sizes \ 20 | | awk '$4 == 1' \ 21 | > ${sv_calling_dir}/hap${i}_to_ref.no_overlap.bed 22 | 23 | bedtools intersect \ 24 | -a ${sv_calling_dir}/hap${i}_to_ref.bed \ 25 | -b ${sv_calling_dir}/hap${i}_to_ref.no_overlap.bed \ 26 | > ${sv_calling_dir}/hap${i}_to_ref.no_overlap.contig.bed 27 | done 28 | 29 | bedtools intersect \ 30 | -a ${sv_calling_dir}/hap1_to_ref.no_overlap.contig.bed \ 31 | -b ${sv_calling_dir}/hap2_to_ref.no_overlap.contig.bed \ 32 | | cut -f-3 \ 33 | > ${sv_calling_dir}/msa_coords.bed \ 34 | 35 | rm -f ${sv_calling_dir}/hap1_to_ref.no_overlap.bed 36 | rm -f ${sv_calling_dir}/hap2_to_ref.no_overlap.bed 37 | 38 | -------------------------------------------------------------------------------- /MsPAC/bash/map.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | if [ ! -s ${prefix}.sorted.bam.bai ] 5 | then 6 | blasr \ 7 | ${input} \ 8 | ${ref} \ 9 | --bestn 1 \ 10 | --bam \ 11 | --nproc ${threads} \ 12 | --out ${prefix}.bam 13 | 14 | samtools \ 15 | sort -@ ${threads} \ 16 | ${prefix}.bam \ 17 | -o ${prefix}.sorted.bam 18 | 19 | samtools index ${prefix}.sorted.bam 20 | 21 | rm -f ${prefix}.bam 22 | fi 23 | -------------------------------------------------------------------------------- /MsPAC/bash/sv_calling.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -x 3 | 4 | kalign -q -f clu -s 100 -e 0.85 -t 0.45 -m 0 -in ${dir}/seq.fa \ 5 | | sed 's/Kalign/CLUSTAL/g' > ${dir}/msa.clu 6 | 7 | python ${python_scripts}/msa_to_variants.py \ 8 | ${dir}/msa.clu \ 9 | ${chrom} \ 10 | ${start} \ 11 | ${end} \ 12 | ${dir}/seq.qual \ 13 | 50 > ${dir}/svs.bed 14 | 15 | -------------------------------------------------------------------------------- /MsPAC/haplotype_assignment.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | from MsPAC import Pipeline 3 | 4 | class HaplotypeAssignment(Pipeline): 5 | def __init__(self,configfile): 6 | Pipeline.__init__(self,configfile,"phase-bam") 7 | self.configure() 8 | 9 | def assign_reads_to_haplotype(self): 10 | bashfile = "%s_assign_reads_to_haplotype.sh" % self.bamfile[0:-4] 11 | template_bash = "%s/assign_reads_to_haplotype.sh" % self.package_bash_directory 12 | params = { 13 | 'python_scripts': self.package_python_directory, 14 | 'vcffile': self.vcffile, 15 | 'bamfile': self.bamfile, 16 | 'phased_bamfile': self.phased_bamfile, 17 | 'vcf_sample_name': self.vcf_sample_name 18 | } 19 | self.write_to_bashfile(template_bash,bashfile,params) 20 | self.jobs.append((bashfile,"low")) 21 | self.run_locally() 22 | 23 | def run(self): 24 | self.assign_reads_to_haplotype() 25 | 26 | -------------------------------------------------------------------------------- /MsPAC/prepping_reads.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | import sys 3 | import pysam 4 | from MsPAC import Pipeline 5 | 6 | 7 | class PrepReads(Pipeline): 8 | def __init__(self,configfile): 9 | Pipeline.__init__(self,configfile,"prep-reads") 10 | 11 | def get_reads_per_group(self): 12 | print "\tGetting reads per chrom per group..." 13 | reads_per_read_group = {} 14 | samfile = pysam.AlignmentFile(self.phased_bamfile) 15 | reads_mapped = samfile.mapped 16 | count = 0 17 | for read in samfile: 18 | count += 1.0 19 | read_group = read.get_tag("RG") 20 | chrom = samfile.get_reference_name(read.reference_id) 21 | if chrom not in reads_per_read_group: 22 | reads_per_read_group[chrom] = {} 23 | if read_group not in reads_per_read_group[chrom]: 24 | reads_per_read_group[chrom][read_group] = [] 25 | reads_per_read_group[chrom][read_group].append(read.query_name) 26 | if count % 50000 == 0: 27 | print "\t\tStatus: %s" % (count/reads_mapped) 28 | return reads_per_read_group 29 | 30 | def extract_raw_reads(self,read_names,inbamfile): 31 | raw_reads = [] 32 | read_names_indexed = pysam.IndexedReads(inbamfile) 33 | read_names_indexed.build() 34 | for name in read_names: 35 | try: 36 | read_names_indexed.find(name) 37 | except KeyError: 38 | pass 39 | else: 40 | iterator = read_names_indexed.find(name) 41 | for x in iterator: 42 | raw_reads.append(x) 43 | return raw_reads 44 | 45 | def get_raw_reads(self,read_names): 46 | headers = {} 47 | raw_reads = [] 48 | with open(self.raw_reads_in_bam_format,'r') as fofh: 49 | for i,fn in enumerate(fofh): 50 | print "\t\tLooking into bam file #: %s" % i 51 | fn = fn.rstrip() 52 | inbamfile = pysam.AlignmentFile(fn,check_sq=False) 53 | reads = self.extract_raw_reads(read_names,inbamfile) 54 | if len(reads) > 0: 55 | headers[fn] = dict(inbamfile.header) 56 | inbamfile.close() 57 | for read in reads: 58 | raw_reads.append(read) 59 | return raw_reads,headers 60 | 61 | def split_raw_reads_into_groups(self,raw_reads,max_num_reads_per_file,header,outdir): 62 | for index,i in enumerate(range(0,len(raw_reads),max_num_reads_per_file)): 63 | reads = raw_reads[i:i + max_num_reads_per_file] 64 | outbamfilefn = "%s/%s.bam" % (outdir,index) 65 | outbamfile = pysam.AlignmentFile(outbamfilefn,"wb", header=header) 66 | for read in reads: 67 | outbamfile.write(read) 68 | outbamfile.close() 69 | pysam.index(outbamfilefn) 70 | 71 | def merge_headers(self,headers): 72 | header = {} 73 | for fn in headers: 74 | if len(header) == 0: 75 | header["HD"] = headers[fn]["HD"] 76 | header["RG"] = [] 77 | header["PG"] = [] 78 | header["SQ"] = [] 79 | for rg in headers[fn]["RG"]: 80 | header["RG"].append(rg) 81 | for pg in headers[fn]["PG"]: 82 | header["PG"].append(pg) 83 | return header 84 | 85 | def split_raw_reads(self): 86 | max_num_reads = 200000 # 200,000 reads is around 5GB 87 | read_names_per_read_group = self.get_reads_per_group() 88 | for chrom in read_names_per_read_group: 89 | for read_group in read_names_per_read_group[chrom]: 90 | print "\tWorking on chrom %s and read group %s" % (chrom,read_group) 91 | print "\tExtracting %s reads" % len(read_names_per_read_group[chrom][read_group]) 92 | outdir = "%s/%s/%s" % (self.raw_reads_directory,chrom,read_group) 93 | self.create_directory(outdir) 94 | read_names = read_names_per_read_group[chrom][read_group] 95 | raw_reads,headers = self.get_raw_reads(read_names) 96 | header = self.merge_headers(headers) 97 | self.split_raw_reads_into_groups(raw_reads,max_num_reads,header,outdir) 98 | 99 | def run(self): 100 | self.configure() 101 | self.split_raw_reads() 102 | -------------------------------------------------------------------------------- /MsPAC/python/assign_reads_to_haplotypes.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | import sys 3 | import vcf 4 | import pysam 5 | import numpy as np 6 | 7 | unphased_tag = 0 8 | haplotype1_tag = 1 9 | haplotype2_tag = 2 10 | 11 | def read_phased_snps(vcffile,sample): 12 | phased_snps = {} 13 | vcf_reader = vcf.Reader(open(vcffile, 'r')) 14 | for record in vcf_reader: 15 | if record.var_subtype == "deletion": 16 | continue 17 | if record.var_subtype == "insertion": 18 | continue 19 | phased_snps.setdefault(record.CHROM, {}) 20 | try: 21 | alleles = record.genotype(sample)['PGT'].split("|") 22 | except: 23 | alleles = record.genotype(sample)['GT'].split("|") 24 | if len(alleles) == 2 and alleles[0] != alleles[1]: 25 | alleles = map(int, alleles) 26 | if max(alleles) != 1: 27 | continue 28 | allele_bases = [record.REF,record.ALT[0]] 29 | sample_bases = map(lambda x: allele_bases[x], alleles) 30 | phased_snps[record.CHROM][record.POS-1] = sample_bases 31 | return phased_snps 32 | 33 | def create_tag(hap): 34 | haptag = ("RG", str(hap), "Z") 35 | return haptag 36 | 37 | def calculate_prob(basetups,hap_dict,threshold=0.9,lr_threshold=10): 38 | ''' 39 | Returns a 0, 1 or -1 as the value of a read 40 | ''' 41 | log_lr_threshold = np.log(lr_threshold) 42 | prob0 = 1. 43 | prob1 = 1. 44 | lprob0 = 0. 45 | lprob1 = 0. 46 | for basetup in basetups: 47 | rpos, b, qual = basetup 48 | #try: 49 | if len(hap_dict[rpos]) == 0: 50 | print rpos 51 | b0, b1 = hap_dict[rpos] 52 | #except: 53 | # print rpos 54 | # sys.exit() 55 | if b == b1 or b == b0: # check to make sure the base is called 56 | if b == b0: 57 | prob0 *= (1.-10.**(-qual/10.)) 58 | prob1 *= (10.**(-qual/10.)) 59 | lprob0 += np.log(1.-10.**(-qual/10.)) 60 | lprob1 += np.log(10.**(-qual/10.)) 61 | else: 62 | prob1 *= (1.-10.**(-qual/10.)) 63 | prob0 *= (10.**(-qual/10.)) 64 | lprob1 += np.log(1.-10.**(-qual/10.)) 65 | lprob0 += np.log(10.**(-qual/10.)) 66 | prob01 = prob0 + prob1 67 | if (prob0 > 0 and prob1 > 0) and prob0 / (prob01) >= threshold: 68 | if lprob0 - lprob1 >= log_lr_threshold: 69 | return create_tag(haplotype1_tag) 70 | if (prob0 > 0 and prob1 > 0) and prob1 / (prob01) >= threshold: 71 | if lprob1 - lprob0 >= log_lr_threshold: 72 | return create_tag(haplotype2_tag) 73 | if (prob0 == 0 or prob1 == 0) and lprob0 - lprob1 >= log_lr_threshold: 74 | return create_tag(haplotype1_tag) 75 | if (prob0 == 0 or prob1 == 0) and lprob1 - lprob0 >= log_lr_threshold: 76 | return create_tag(haplotype2_tag) 77 | return create_tag(unphased_tag) 78 | 79 | def phase_read(read,phased_snps,chrom): 80 | unphased_tag = 0 81 | haplotype1_tag = 1 82 | haplotype2_tag = 2 83 | base_tuples = [] 84 | if chrom in phased_snps: 85 | for read_pos, ref_pos in read.get_aligned_pairs(): 86 | if ref_pos in phased_snps[chrom]: 87 | if read_pos is not None: 88 | if read.query_qualities == None: 89 | qual = 8 90 | else: 91 | qual = read.query_qualities[read_pos] 92 | base = read.query_sequence[read_pos] 93 | base_tuples.append((ref_pos,base,qual)) 94 | if len(base_tuples) > 0: 95 | read_group_tag = calculate_prob(base_tuples,phased_snps[chrom]) 96 | else: 97 | read_group_tag = create_tag(unphased_tag) 98 | #print "%s\t%s\t%s" % (len(base_tuples),read.query_name,read_group_tag[1]) 99 | read_tags = read.get_tags() 100 | tags_to_add = [] 101 | for tag in read_tags: 102 | if tag[0] != "RG": 103 | tags_to_add.append(tag) 104 | tags_to_add.append(read_group_tag) 105 | read.set_tags(tags_to_add) 106 | return read 107 | 108 | def create_header(unphased_bam): 109 | readgroup_unphased = { "ID": unphased_tag } 110 | readgroup_hap1 = { "ID": haplotype1_tag } 111 | readgroup_hap2 = { "ID": haplotype2_tag } 112 | if "RG" in unphased_bam.header: 113 | for group in unphased_bam.header["RG"]: 114 | for item in group: 115 | if item != "ID": 116 | readgroup_unphased[item] = group[item] 117 | readgroup_hap1[item] = group[item] 118 | readgroup_hap2[item] = group[item] 119 | phased_bam_header = unphased_bam.header.to_dict() 120 | phased_bam_header["RG"] = [readgroup_unphased,readgroup_hap1,readgroup_hap2] 121 | return phased_bam_header 122 | 123 | def main(vcffile,bamfile,vcf_sample_name,outbamfile): 124 | #outbamfile = "%s.phased.bam" % bamfile[0:-4] 125 | phased_snps = read_phased_snps(vcffile,vcf_sample_name) 126 | unphased_bam = pysam.AlignmentFile(bamfile,'rb') 127 | phased_bam_header = create_header(unphased_bam) 128 | phased_bam = pysam.AlignmentFile(outbamfile,'wb',header=phased_bam_header) 129 | for read in unphased_bam.fetch(): 130 | if read.is_secondary: 131 | continue 132 | if read.is_unmapped: 133 | continue 134 | if read.is_supplementary: 135 | continue 136 | if not read.is_secondary: 137 | tagged_read = phase_read(read,phased_snps,unphased_bam.get_reference_name(read.reference_id)) 138 | phased_bam.write(tagged_read) 139 | unphased_bam.close() 140 | phased_bam.close() 141 | 142 | if __name__ == '__main__': 143 | vcffile = sys.argv[1] 144 | bamfile = sys.argv[2] 145 | vcf_sample_name = sys.argv[3] 146 | phased_bamfile = sys.argv[4] 147 | sys.exit(main(vcffile,bamfile,vcf_sample_name,phased_bamfile)) 148 | -------------------------------------------------------------------------------- /MsPAC/python/extract_raw_reads_from_bam_fofn.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | import sys 3 | import pysam 4 | 5 | readsfn = sys.argv[1] 6 | inbamfofn = sys.argv[2] 7 | outbamfile = sys.argv[3] 8 | 9 | def merge_headers(headers): 10 | header = {} 11 | for fn in headers: 12 | if len(header) == 0: 13 | header["HD"] = headers[fn]["HD"] 14 | header["RG"] = [] 15 | header["PG"] = [] 16 | header["SQ"] = [] 17 | for rg in headers[fn]["RG"]: 18 | header["RG"].append(rg) 19 | for pg in headers[fn]["PG"]: 20 | header["PG"].append(pg) 21 | return header 22 | 23 | read_names = set() 24 | with open(readsfn,'r') as fh: 25 | for line in fh: 26 | name = line.rstrip() 27 | read_names.add(name) 28 | 29 | outreads = [] 30 | headers = {} 31 | with open(inbamfofn,'r') as fh: 32 | for i,fn in enumerate(fh): 33 | print ">>>>>>>>> %s" % i 34 | fn = fn.rstrip() 35 | inbamfile = pysam.AlignmentFile(fn,check_sq=False) 36 | read_names_indexed = pysam.IndexedReads(inbamfile) 37 | read_names_indexed.build() 38 | for name in read_names: 39 | try: 40 | read_names_indexed.find(name) 41 | except KeyError: 42 | pass 43 | else: 44 | iterator = read_names_indexed.find(name) 45 | for x in iterator: 46 | outreads.append(x) 47 | headers[fn] = dict(inbamfile.header) 48 | inbamfile.close() 49 | 50 | header = merge_headers(headers) 51 | outbamfile = pysam.AlignmentFile(outbamfile,"wb", header=header) 52 | for read in outreads: 53 | outbamfile.write(read) 54 | outbamfile.close() 55 | 56 | -------------------------------------------------------------------------------- /MsPAC/python/hmm2.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | from pomegranate import * 3 | from Bio import AlignIO 4 | import numpy as np 5 | 6 | # hap1: { hap2: { ref: ... 7 | observations = { 8 | "3": { 9 | "A" : { "A": { "A": 0,"T": 2,"C": 2,"G": 2,"N": 14,"-": 10}, 10 | "T": { "A": 3,"T": 1,"C": 4,"G": 4,"N": 14,"-": 11}, 11 | "C": { "A": 3,"T": 4,"C": 1,"G": 4,"N": 14,"-": 11}, 12 | "G": { "A": 3,"T": 4,"C": 4,"G": 1,"N": 14,"-": 11}, 13 | "N": { "A": 14,"T": 14,"C": 14,"G": 14,"N": 14,"-": 14}, 14 | "-": { "A": 7,"T": 8,"C": 8,"G": 8,"N": 14,"-": 5} 15 | }, 16 | "T" : { "T": { "T": 0,"C": 2,"G": 2,"A": 2,"N": 14,"-": 10}, 17 | "C": { "T": 3,"C": 1,"G": 4,"A": 4,"N": 14,"-": 11}, 18 | "G": { "T": 3,"C": 4,"G": 1,"A": 4,"N": 14,"-": 11}, 19 | "A": { "T": 3,"C": 4,"G": 4,"A": 1,"N": 14,"-": 11}, 20 | "N": { "T": 14,"C": 14,"G": 14,"A": 14,"N": 14,"-": 14}, 21 | "-": { "T": 7,"C": 8,"G": 8,"A": 8,"N": 14,"-": 5} 22 | }, 23 | "C" : { "C": { "C": 0,"G": 2,"A": 2,"T": 2,"N": 14,"-": 10}, 24 | "G": { "C": 3,"G": 1,"A": 4,"T": 4,"N": 14,"-": 11}, 25 | "A": { "C": 3,"G": 4,"A": 1,"T": 4,"N": 14,"-": 11}, 26 | "T": { "C": 3,"G": 4,"A": 4,"T": 1,"N": 14,"-": 11}, 27 | "N": { "C": 14,"G": 14,"A": 14,"T": 14,"N": 14,"-": 14}, 28 | "-": { "C": 7,"G": 8,"A": 8,"T": 8,"N": 14,"-": 5} 29 | }, 30 | "G" : { "G": { "G": 0,"A": 2,"T": 2,"C": 2,"N": 14,"-": 10}, 31 | "A": { "G": 3,"A": 1,"T": 4,"C": 4,"N": 14,"-": 11}, 32 | "T": { "G": 3,"A": 4,"T": 1,"C": 4,"N": 14,"-": 11}, 33 | "C": { "G": 3,"A": 4,"T": 4,"C": 1,"N": 14,"-": 11}, 34 | "N": { "G": 14,"A": 14,"T": 14,"C": 14,"N": 14,"-": 14}, 35 | "-": { "G": 7,"A": 8,"T": 8,"C": 8,"N": 14,"-": 5} 36 | }, 37 | "N" : { "G": { "G": 14,"A": 14,"T": 14,"C": 14,"N": 14,"-": 14}, 38 | "A": { "G": 14,"A": 14,"T": 14,"C": 14,"N": 14,"-": 14}, 39 | "T": { "G": 14,"A": 14,"T": 14,"C": 14,"N": 14,"-": 14}, 40 | "C": { "G": 14,"A": 14,"T": 14,"C": 14,"N": 14,"-": 14}, 41 | "N": { "G": 14,"A": 14,"T": 14,"C": 14,"N": 14,"-": 14}, 42 | "-": { "G": 14,"A": 14,"T": 14,"C": 14,"N": 14,"-": 14} 43 | }, 44 | "-" : { "-": { "-": None,"A": 12,"T": 12,"C": 12,"G": 12,"N": 14}, 45 | "A": { "-": 9,"A": 6, "T": 13, "C": 13,"G": 13,"N": 14}, 46 | "T": { "-": 9,"A": 13,"T": 6,"C": 13,"G": 13,"N": 14}, 47 | "C": { "-": 9,"A": 13,"T": 13,"C": 6,"G": 13,"N": 14}, 48 | "G": { "-": 9,"A": 13,"T": 13,"C": 13,"G": 6,"N": 14}, 49 | "N": { "-": 14,"A": 14,"T": 14,"C": 14,"G": 14,"N": 14} 50 | } 51 | }, 52 | "2": { 53 | "A" : { "A": 1, "-": 0, "G": 3, "C": 3, "T": 3, "N": 3}, 54 | "T" : { "T": 1, "A": 3, "-": 0, "G": 3, "C": 3, "N": 3}, 55 | "C" : { "C": 1, "T": 3, "A": 3, "-": 0, "G": 3, "N": 3}, 56 | "G" : { "G": 1, "C": 3, "T": 3, "A": 3, "-": 0, "N": 3}, 57 | "-" : { "-": None, "G": 2, "C": 2, "T": 2, "A": 2, "N": 3}, 58 | "N" : { "-": 3, "G": 3, "C": 3, "T": 3, "A": 3, "N": 3} 59 | } 60 | } 61 | 62 | def observation_probs(index): 63 | num_obs = 15 64 | prob = 0.95 65 | index_prob = prob/len(index) 66 | other_prob = (1-prob)/(num_obs-len(index)+1) 67 | probs = [other_prob]*num_obs 68 | for i in index: 69 | probs[i] = index_prob 70 | obs = list(range(0,15)) 71 | return dict(zip(obs,probs)) 72 | 73 | def get_obs_probs(index,complex_): 74 | obs_probs = observation_probs([index]) 75 | if complex_: 76 | obs_probs[index] = 0.94 77 | obs_probs[0] = 0.01 78 | return obs_probs 79 | 80 | def get_states(): 81 | states = [] 82 | states_with_index = { "INS_1|1": 10,"INS_1|0": 5,"INS_0|1": 9, 83 | "DEL_1|1": 12,"DEL_1|0": 6,"DEL_0|1": 7 } 84 | for state_and_index in states_with_index: 85 | index = states_with_index[state_and_index] 86 | for complex_ in [True,False]: 87 | obs_probs = get_obs_probs(index,complex_) 88 | name = state_and_index 89 | if complex_: 90 | name = "COMPLEX.%s" % name 91 | state = State(DiscreteDistribution(obs_probs),name=name) 92 | states.append(state) 93 | #error_obs_prob = observation_probs([1,2,3,4]) #[4,5,6,7,8,9,11,13]) 94 | #error_state = State(DiscreteDistribution(error_obs_prob),name="ERROR") 95 | #states.append(error_state) 96 | complex_obs_prob = observation_probs([5,6,7,9,10,12]) #list(range(0,15))) 97 | complex_state = State(DiscreteDistribution(complex_obs_prob),name="COMPLEX_.|.") 98 | states.append(complex_state) 99 | return states 100 | 101 | def three_hmm(): 102 | model = HiddenMarkovModel("Sequence Aligner") 103 | states = get_states() 104 | normal = State(DiscreteDistribution(observation_probs([0])), name="NORMAL") 105 | states.append(normal) 106 | num_states = len(states) 107 | model.add_states(states) 108 | for state in states: 109 | if state.name != "NORMAL": 110 | model.add_transition(model.start, state, 0) 111 | model.add_transition(state, model.end, 0) 112 | else: 113 | model.add_transition(model.start, state, 1) 114 | model.add_transition(state, model.end, 1) 115 | trans_probs = np.zeros((num_states,num_states)) 116 | trans_probs.fill(1e-15) 117 | np.fill_diagonal(trans_probs,.99999) # Stay in the same state 118 | trans_probs[:,len(trans_probs)-1] = 1e-15 # Transition to normal 119 | trans_probs[len(trans_probs)-1] = [4.5e-15]*num_states 120 | trans_probs[len(trans_probs)-1][-1] = 1-2e-15 # normal event 121 | #print >> sys.stderr, trans_probs 122 | for state, t_prob in zip(states,trans_probs): 123 | for s, prob in zip(states,t_prob): 124 | model.add_transition(state,s,prob) 125 | #print >> sys.stderr, state.name,s.name, prob 126 | model.add_transition(normal,normal,(1-2e-15)) 127 | model.bake() 128 | return model 129 | 130 | model = {"3": three_hmm()} 131 | -------------------------------------------------------------------------------- /MsPAC/python/msa_to_variants.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #from MsPAC.python_scripts.hmm import * 3 | from hmm2 import * 4 | from pomegranate import * 5 | from Bio import AlignIO 6 | import numpy as np 7 | import pysam 8 | import sys 9 | 10 | def get_msa_sequence(clufn): 11 | alignment = AlignIO.read(clufn,"clustal") 12 | sequences = {} 13 | for i,sequence in enumerate(alignment): 14 | if sequence.id not in ["ref","hap1","hap2"]: 15 | if i == 0: 16 | sequence.id = "ref" 17 | if i == 1: 18 | sequence.id = "hap1" 19 | if i == 2: 20 | sequence.id = "hap2" 21 | sequences[sequence.id] = str(sequence.seq).upper() 22 | return sequences 23 | 24 | def get_observations(sequence): 25 | obs = [] 26 | if len(sequence) == 3: 27 | h1_seq = sequence["hap1"] 28 | h2_seq = sequence["hap2"] 29 | chrom_seq = sequence["ref"] 30 | for h1,h2,chrom in zip(h1_seq,h2_seq,chrom_seq): 31 | obs.append(observations["3"][h1][h2][chrom]) 32 | return obs 33 | 34 | def get_quality_scores(sequence,quality_scores,start_index,end_index,padding): 35 | quality_start = start_index - sequence[:start_index].count("-") #- padding 36 | quality_end = end_index - sequence[:end_index].count("-") #+ padding 37 | #print quality_start,quality_end 38 | if quality_end - quality_start < (padding*2): 39 | quality_start = max(0,quality_start - padding) 40 | quality_end = quality_end + padding 41 | scores = quality_scores[quality_start:quality_end] 42 | #print scores,start_index,end_index,quality_start,quality_end 43 | mean = float(sum(scores))/len(scores) 44 | return mean 45 | 46 | def three_sequence_msa_variants(sequence,path,clufn,chrom,ref_start,ref_end,quality_scores,padding): 47 | start = False 48 | current_sv = None 49 | current_sv_start_index = None 50 | current_sv_current_index = None 51 | current_sv_hap1_seq = [] 52 | current_sv_hap2_seq = [] 53 | current_sv_ref_seq = [] 54 | ref_index = -1 55 | ref_sv_start = None 56 | ref_sv_index = None 57 | for i,(h1,h2,ref,p) in enumerate(zip(sequence["hap1"],sequence["hap2"],sequence["ref"],path[1:-1])): 58 | state_index, state = p 59 | if ref != "-": 60 | ref_index += 1 61 | if start == False: 62 | if "-" not in (h1,h2,ref,p) and state.name == "NORMAL": 63 | start = True 64 | continue 65 | if state.name != "NORMAL": 66 | if current_sv == None: 67 | current_sv = state.name 68 | current_sv_start_index = i 69 | current_sv_current_index = i 70 | ref_sv_start = ref_index 71 | ref_sv_index = ref_index 72 | continue 73 | current_sv_current_index += 1 74 | ref_sv_index = ref_index 75 | if h1 != "-": 76 | current_sv_hap1_seq.append(h1) 77 | if h2 != "-": 78 | current_sv_hap2_seq.append(h2) 79 | if ref != "-": 80 | current_sv_ref_seq.append(ref) 81 | if i != current_sv_current_index and current_sv != None: 82 | if i + 1 == len(sequence["hap1"]) and "-" in (h1,h2,ref): 83 | continue 84 | sv_type, genotype = current_sv.split("_") 85 | sv_len = max(len(current_sv_hap1_seq),len(current_sv_hap2_seq),len(current_sv_ref_seq)) 86 | if quality_scores != None: 87 | hap1_qual_score = get_quality_scores(sequence["hap1"],quality_scores["hap1"],current_sv_start_index,current_sv_current_index,padding) 88 | hap2_qual_score = get_quality_scores(sequence["hap2"],quality_scores["hap2"],current_sv_start_index,current_sv_current_index,padding) 89 | else: 90 | hap1_qual_score = 60 91 | hap2_qual_score = 60 92 | output = [chrom, 93 | ref_sv_start + ref_start, # 0-based/UCSC Genome format 94 | ref_sv_index + ref_start + 1, # 0-based/UCSC Genome format 95 | sv_type, 96 | genotype, 97 | sv_len, 98 | hap1_qual_score, 99 | hap2_qual_score, 100 | "".join(current_sv_ref_seq) if len("".join(current_sv_ref_seq)) > 0 else ".", 101 | "".join(current_sv_hap1_seq) if len("".join(current_sv_hap1_seq)) > 0 else ".", 102 | "".join(current_sv_hap2_seq) if len("".join(current_sv_hap2_seq)) > 0 else ".", 103 | current_sv_start_index, 104 | current_sv_current_index, 105 | clufn] 106 | print "\t".join(map(str,output)) 107 | current_sv = None 108 | current_sv_hap1_seq = [] 109 | current_sv_hap2_seq = [] 110 | current_sv_ref_seq = [] 111 | 112 | def path_to_variants(path,sequence,clufn,chrom,start,end,quality_scores,padding): 113 | if len(sequence) == 3: 114 | three_sequence_msa_variants(sequence,path,clufn,chrom,start,end,quality_scores,padding) 115 | 116 | def load_qual_scores(qual_scoresfn): 117 | qual_scores = {} 118 | with open(qual_scoresfn,'r') as fh: 119 | for line in fh: 120 | if ">" in line: 121 | name = line[1:].rstrip() 122 | else: 123 | qual = line.rstrip().split(',') 124 | qual_scores[name] = map(int,qual) 125 | return qual_scores 126 | 127 | def main(): 128 | clufn = sys.argv[1] 129 | chrom = sys.argv[2] 130 | start = int(sys.argv[3]) 131 | end = int(sys.argv[4]) 132 | if sys.argv[5] != "None": 133 | qual_scoresfn = sys.argv[5] 134 | qual_scores = load_qual_scores(qual_scoresfn) 135 | else: 136 | qual_scores = None 137 | if sys.argv[6] != "None": 138 | padding = int(sys.argv[6]) 139 | else: 140 | padding = 5 141 | sequence = get_msa_sequence(clufn) 142 | obs = get_observations(sequence) 143 | log, path = model[str(len(sequence))].viterbi(obs) 144 | path_to_variants(path,sequence,clufn,chrom,start,end,qual_scores,padding) 145 | 146 | if __name__ == "__main__": 147 | main() 148 | -------------------------------------------------------------------------------- /MsPAC/python/start_end_coordinates.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | import pysam 3 | import sys 4 | 5 | bamfile = sys.argv[1] 6 | samfile = pysam.AlignmentFile(bamfile) 7 | for i,read in enumerate(samfile): 8 | if read.is_unmapped: 9 | continue 10 | if read.is_secondary: 11 | continue 12 | if read.is_supplementary: 13 | continue 14 | output = [samfile.getrname(read.reference_id),read.reference_start,read.reference_end - 1,read.query_name] 15 | print "\t".join(map(str,output)) 16 | 17 | -------------------------------------------------------------------------------- /MsPAC/sv_calling.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | from Bio import SeqIO 3 | from Bio.Seq import Seq 4 | from Bio.SeqRecord import SeqRecord 5 | import math 6 | import pysam 7 | import os 8 | 9 | from MsPAC import Pipeline 10 | 11 | class SVCaller(Pipeline): 12 | def __init__(self,configfile): 13 | Pipeline.__init__(self,configfile,"sv-calling") 14 | 15 | def create_new_record(self,record,i,start,end): 16 | new_record_name = "%s.%s" % (i,record.id) 17 | new_record_seq = record.seq[start:end] 18 | new_record_qual = record.letter_annotations["phred_quality"][start:end] 19 | new_record = SeqRecord(new_record_seq,id=new_record_name,name=new_record_name,description="") 20 | new_record.letter_annotations["phred_quality"] = new_record_qual 21 | return new_record 22 | 23 | def create_new_fastq(self,infastq,outfastq,window): 24 | records = [] 25 | for record in SeqIO.parse(infastq, "fastq"): 26 | if len(record.seq) > window: 27 | num_windows = int(math.ceil(len(record.seq)/window)) 28 | size_of_window = len(record.seq)/num_windows 29 | i = 0 30 | for i in range(num_windows - 1): 31 | start = i*size_of_window 32 | end = (i+1)*size_of_window 33 | new_record = create_new_record(record,i,start,end) 34 | records.append(new_record) 35 | start = (i+1)*size_of_window 36 | new_record = create_new_record(record,i+1,start,None) 37 | records.append(new_record) 38 | else: 39 | records.append(record) 40 | SeqIO.write(records,outfastq,"fastq") 41 | 42 | def split_fastq_files(self): 43 | window = 500000 44 | for hap in ["hap1","hap2"]: 45 | if hap == "hap1": 46 | infastq = self.hap1_assembly_fq 47 | outfastq = self.hap1_assembly_split_fq 48 | elif hap == "hap2": 49 | infastq = self.hap2_assembly_fq 50 | outfastq = self.hap2_assembly_split_fq 51 | self.create_new_fastq(infastq,outfastq,window) 52 | 53 | def map_assembly(self): 54 | template_bash = "%s/map.sh" % self.package_bash_directory 55 | for hap in ["hap1","hap2"]: 56 | if hap == "hap1": 57 | fastq = self.hap1_assembly_split_fq 58 | elif hap == "hap2": 59 | fastq = self.hap2_assembly_split_fq 60 | prefix = "%s/%s_to_ref" % (self.sv_calling_directory,hap) 61 | bashfile = "%s/map_%s_assembly.sh" % (self.sv_calling_directory,hap) 62 | params = { 63 | 'input': fastq, 64 | 'ref': self.reference, 65 | 'prefix': prefix, 66 | 'threads': self.job_threads["high"] 67 | } 68 | self.write_to_bashfile(template_bash,bashfile,params) 69 | self.jobs.append((bashfile,self.job_threads["high"])) 70 | self.run_locally() 71 | 72 | def get_msa_coordinates(self): 73 | if os.path.isfile("%s/msa_coords.bed" % self.sv_calling_directory): 74 | return 75 | template_bash = "%s/get_msa_coords.sh" % self.package_bash_directory 76 | bashfile = "%s/get_msa_coords.sh" % self.sv_calling_directory 77 | params = { 78 | 'python_packages': self.package_python_directory, 79 | 'ref': self.reference, 80 | 'sv_calling_dir': self.sv_calling_directory 81 | } 82 | self.write_to_bashfile(template_bash,bashfile,params) 83 | self.jobs.append((bashfile,self.job_threads["high"])) 84 | self.run_locally() 85 | 86 | def read_bedfile(self,bedfile): 87 | regions = {} 88 | with open(bedfile,'r') as bedfh: 89 | for line in bedfh: 90 | line = line.rstrip().split('\t') 91 | chrom = line[0] 92 | coord = (int(line[1]),int(line[2])) 93 | if chrom not in regions: 94 | regions[chrom] = [] 95 | regions[chrom].append(coord) 96 | return regions 97 | 98 | def get_hap_sequence(self,hap_bamfile,regions): 99 | samfile = pysam.AlignmentFile(hap_bamfile,'rb') 100 | hap_sequences = {} 101 | for chrom in regions: 102 | for start,end in regions[chrom]: 103 | for contig in samfile.fetch(chrom,start,end): 104 | if contig.is_unmapped: 105 | continue 106 | if contig.is_secondary: 107 | continue 108 | if contig.is_supplementary: 109 | continue 110 | if contig.reference_start > start: 111 | continue 112 | if contig.reference_end < end: 113 | continue 114 | aligned_pairs = contig.get_aligned_pairs() 115 | query_start = None 116 | query_end = None 117 | matched_ref_start = None 118 | matched_ref_end = None 119 | for query_pos, ref_pos in aligned_pairs: 120 | if query_pos == None: 121 | continue 122 | if ref_pos == None: 123 | continue 124 | if int(ref_pos) <= int(start): 125 | query_start = query_pos 126 | matched_ref_start = ref_pos 127 | query_end = query_pos 128 | matched_ref_end = ref_pos 129 | if int(ref_pos) > int(end): 130 | break 131 | assert query_start != None 132 | assert query_end != None 133 | hap_sequence = contig.query_sequence[query_start:query_end] 134 | if contig.query_qualities != None: 135 | sequence_qual = contig.query_qualities[query_start:query_end] 136 | else: 137 | sequence_qual = [0]*len(hap_sequence) 138 | assert len(sequence_qual) == len(hap_sequence) 139 | hap_sequences[(chrom,start,end)] = (hap_sequence,sequence_qual) 140 | return hap_sequences 141 | 142 | def extract_msa_sequence(self): 143 | msa_coordsfn = "%s/msa_coords.bed" % self.sv_calling_directory 144 | msa_coords = self.read_bedfile(msa_coordsfn) 145 | hap1_bamfn = "%s/hap1_to_ref.sorted.bam" % self.sv_calling_directory 146 | hap1_sequence = self.get_hap_sequence(hap1_bamfn,msa_coords) 147 | hap2_bamfn = "%s/hap2_to_ref.sorted.bam" % self.sv_calling_directory 148 | hap2_sequence = self.get_hap_sequence(hap2_bamfn,msa_coords) 149 | fasta = pysam.FastaFile(self.reference) 150 | for chrom in msa_coords: 151 | for i,(start,end) in enumerate(msa_coords[chrom]): 152 | ref_seq = fasta.fetch(reference=chrom,start=max(1,start),end=end) 153 | h1_seq,h1_qual = hap1_sequence[(chrom,start,end)] 154 | h2_seq,h2_qual = hap2_sequence[(chrom,start,end)] 155 | directory = "%s/%s/%s_%s" % (self.sv_calling_directory,chrom,start,end) 156 | self.create_directory(directory) 157 | outseqfn = "%s/seq.fa" % directory 158 | outqualfn = "%s/seq.qual" % directory 159 | with open(outseqfn,'w') as outfh: 160 | outfh.write(">ref\n%s\n" % ref_seq) 161 | outfh.write(">hap1\n%s\n" % h1_seq) 162 | outfh.write(">hap2\n%s\n" % h2_seq) 163 | with open(outqualfn,'w') as outqualfh: 164 | outqualfh.write(">hap1\n%s\n" % ",".join(map(str,h1_qual))) 165 | outqualfh.write(">hap2\n%s\n" % ",".join(map(str,h2_qual))) 166 | 167 | def calls_svs_from_msa(self): 168 | msa_coordsfn = "%s/msa_coords.bed" % self.sv_calling_directory 169 | msa_coords = self.read_bedfile(msa_coordsfn) 170 | template_bash = "%s/sv_calling.sh" % self.package_bash_directory 171 | for chrom in msa_coords: 172 | for i,(start,end) in enumerate(msa_coords[chrom]): 173 | directory = "%s/%s/%s_%s" % (self.sv_calling_directory,chrom,start,end) 174 | bashfile = "%s/sv_calling.sh" % directory 175 | params = { 176 | 'dir': directory, 177 | 'python_scripts': self.package_python_directory, 178 | 'chrom': chrom, 179 | 'start': start, 180 | 'end': end 181 | } 182 | self.write_to_bashfile(template_bash,bashfile,params) 183 | self.jobs.append((bashfile,self.job_threads["low"])) 184 | self.submitjobs() 185 | 186 | def run(self): 187 | self.configure() 188 | self.split_fastq_files() 189 | self.map_assembly() 190 | self.get_msa_coordinates() 191 | self.extract_msa_sequence() 192 | self.calls_svs_from_msa() 193 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MsPAC 2 | **Phase reads, assemble haplotypes and detect SVs** 3 | 4 | [Introduction](#introduction) 5 | [Tool requirements](#tool-requirements) 6 | [Installation](#installation) 7 | [Cluster configuration](#cluster-configuration)
8 | [Test runs](#test-runs)
9 | [Configuration File](#configuration-file)
10 | [Quick Start](#quick-start) 11 | [Explanation of steps](#explanation-of-steps) 12 | [Example of output](#example-of-output)
13 | [Manuscript results](#manuscript-results) 14 | 15 | 16 | ## Introduction 17 | MsPAC takes in long reads and phased SNVs to separate the reads into two haplotypes, and assembles both haplotypes and detects structural variants. The output is a fasta file containing both haplotypes and VCF file with SVs. The SVs are annotated with their type, size, genotype and reference, haplotype 1 and haplotype 2 sequence. 18 | 19 | ## Tool requirements 20 | 1. Linux operating system 21 | 2. [Conda package](https://conda.io/en/latest/) 22 | 3. [cluster python package](https://github.com/oscarlr/cluster) 23 | 24 | ## Installation 25 | ``` 26 | ### Installing MsPAC and it's dependencies 27 | git clone https://github.com/oscarlr/MsPAC.git 28 | cd MsPAC 29 | conda env create -f environment.yml 30 | conda activate MsPAC 31 | python setup.py install 32 | 33 | ### Installing cluster package that's needed 34 | cd .. 35 | git clone https://github.com/oscarlr/cluster.git 36 | cd cluster 37 | python setup.py install 38 | ``` 39 | ## Cluster configuration 40 | If you don't want to use the cluster use this command before running MsPAC: 41 | ``` 42 | export SJOB_DEFALLOC="" 43 | ``` 44 | If you want to use the cluster, edit the `lsf/cluster/config.py` script in `https://github.com/oscarlr/cluster.git`. The cluster package reads from this file the default configurations to run jobs in the cluster as wells as the account to use when submitting jobs. After you edit `lsf/cluster/config.py` reinstall the package using `python setup.py install` in the cluster folder. 45 | 46 | ## Test runs 47 | ``` 48 | export SJOB_DEFALLOC="" 49 | cd testing 50 | sh run.sh 51 | ``` 52 | ## Configuration File 53 | Explanation of configuration file entries is [here](cfg_readme.md). 54 | ``` 55 | [Input] 56 | directory = 57 | 58 | [Phase-bam input files] 59 | phased vcf = 60 | reads aligned = 61 | 62 | [Phase-bam params] 63 | sample name in VCF = 64 | output phased bamfile = 65 | 66 | [Prep reads params] 67 | BAM fofn = 68 | Raw reads directory = 69 | 70 | [Assembly params] 71 | Minimum phased block length = 1000 72 | Comma-seperated list of haplotypes = 0_1,0_2 73 | Assembly directory = 74 | Flanking length = 1000 75 | Phased bedfile = None 76 | 77 | [SV calling params] 78 | SV calling directory = 79 | reference = 80 | 81 | [Other params] 82 | cluster = No 83 | 84 | [HIGH INTENSITY JOB] 85 | walltime = 24 86 | threads = 1 87 | memory = 8 88 | queue = private 89 | 90 | [LOW INTENSITY JOB] 91 | walltime = 24 92 | threads = 1 93 | memory = 8 94 | queue = private 95 | ``` 96 | 97 | ## Quick Start 98 | ``` 99 | MsPAC phase-bam run.cfg 100 | MsPAC prep-reads run.cfg 101 | MsPAC assembly run.cfg 102 | MsPAC sv-calling run.cfg 103 | ``` 104 | 105 | ## Explanation of steps 106 | MsPac is split into four steps. For each step, the input is a configuration file. A description of the configuration file is [here](cfg_readme.md). 107 | #### `phase-bam` 108 | In the first step `phase-bam`, a bam file is created. This bam file is a copy of the input bam file with a read group annotation added to the reads. A read group annotation of 1 and 2 corresponds to haplotype 1 and 2. The read group annotation of 0 corresponds to unassignable reads. 109 | #### `prep-reads` 110 | In the second step `prep-reads`, several bam files are created. These bam contain the raw reads seperated by chromosome and haplotype. It makes the process of searching for these reads much faster during the Quiver process, where haplotype specific reads are used to clean the haplotype-specific contigs. 111 | #### `assembly` 112 | In the third step `assembly`, the haplotypes are assembled. During this process folders will be created for each region. Within each folder there is a bash script that runs the assembly process. MsPAC can submit these bash scripts as a single job into the cluster (this speeds up the process). 113 | #### `sv-calling` 114 | In the last step `sv-calling`, the haplotypes and reference are aligned and the SVs are called. In this step, new directories will be made that holds the multiple sequence alignment and a BED file with the SVs. 115 | 116 | ## Example of output 117 | ### BED SV output 118 | ``` 119 | chr22 16610019 16610020 INS 1|0 46 46.6780821918 46.84 . CACTGCTGTTGGGTTCTCTTTGTTTTTCCTCACAAAGGATTCCACA . 18270 18316 /sc/orga/work/rodrio10/software/in_github/MsPAC/testing/MsPAC/sv_calling/chr22/16595201_16611082/msa.clu 120 | ``` 121 | The columns are: 122 | ``` 123 | 1. chromosome 124 | 2. SV start 125 | 3. SV end 126 | 4. SV type 127 | 5. SV genotype 128 | 6. SV size 129 | 7. Haplotype 1 SV quality score 130 | 8. Haplotype 2 SV quality score 131 | 9. Reference sequence 132 | 10. Haplotype 1 sequence 133 | 11. Haplotype 2 sequence 134 | 12. Start index position of SV in multiple sequence alignment file 135 | 13. End index position of SV in multiple sequence alignment file 136 | 14. Full path of multiple sequence alignment file 137 | ``` 138 | ### Assembled fasta haplotype 139 | ``` 140 | >22.16050007.16697745.0_1.raw.0/0/0_0 141 | GACCATGTGAAACTAAGGACAACTTCAGAGCTTCACACAGCTTCAACACTGGAGAGAAAA 142 | CAGTGAACCCACAGAAAACATCCTACAGACTGGGAGAAAATTATGGAAAACTGTGGATCT 143 | GGAAGGGCTTCTTATCTAACATATTCAAGAAACTAATGGTCCTAAGTGGACAAAAACCAA 144 | TATACAATGCTTGTCACACCTAAGTGGACAAAAACCAATACTAAAAATGCCCAAAAGACT 145 | GCGTAGGCATTTCTGAAAAAACCTGAAACAGCCTCTCAGGTAACAGAAGTTTCTCCACAT 146 | CAAGAAGAGTTTCTCCCCAGAGAACGAGTATGACCAGAAAACAGCAATAAAACTTTGGAA 147 | TAAGAGATAAGGGCAGTGTAGATTTGCAGACAGAGGAACTATTACATACTACCTGGTTTG 148 | AATGCAAATTTGTATACCCACTGGGAAACAGCTGGAGGTTTCTGAAACAATTAACAACAC 149 | AACCACCAGTTCCTCTAGCCATCCCACACTGGGTATACCTGCAAAGCCAAGGAAACCTAC 150 | ``` 151 | The fasta header has the region that was assembled with the corresponding haplotype. 152 | 153 | ### Phased BAM file 154 | ``` 155 | m150131_015113_42163R_c100780292550000001823166508251570_s1_p0/8761/14473_27456 156 | 16 157 | 22 158 | 16050008 159 | 0 160 | 15543H97S27=1I1=1X12=1I20=2D13=1I8=1I9=3I6=1I11=1D5=1D1=1D11=1I3=1D11=1I4=1X1=1D3=1I8=5I10=1I1=1I3=1I5=3I15=1I8=1D38=1X16=1X5=1I8=1X4=1I12=1I4=1I14=1I1=1I8=1D6=1I4=1I13=1I15=1D1=1I21=2I12=1I1=1X6=1X3=1D5= 161 | * 162 | 0 163 | 12343 164 | CCAATCTCCTGGCAGCCACGCAGCCGGTCGAGAAATTTCGTCACTTGTGGCGGGTTCCCAAGCCTGTTGCCATGCAGCCTCTGGAAAGAGATCTGATTAAGTCCCAGGACTTCAGAAGAGCTGTTGCGACCTTGGCCAATGTCACTTCCTCCTTCAGGAATTGCAGTGGGCCTTAAGTGCCTTCCTCTCGGGCCCACTGGTTAT 165 | --*%..)-.-/,.//*-/..,/.%+%"'-)(./*./)'/"(..%,(.,(+&)"*.(-,+-./-"///*/,.//+.//,/).(+)/*.//+/)//,//#/'./.,,./)/'&%///.+.////.(%/./.+.,*/+)(..$/////,+...,,/&&..((%.(/.////,$,/*'/.+//,//%-././..,'###(&(+($',, 166 | AS:i:-54803 167 | XS:i:15544 168 | XE:i:28527 169 | qs:i:15544 170 | qe:i:28527 171 | zm:i:-1 172 | XL:i:12884 173 | XT:i:1 174 | NM:i:0 175 | FI:i:14476 176 | XQ:i:42999 177 | iq:Z:0/,(121/113-223+/212121/,-#'//202*12*'2#)10%0(0.),&.#*0)0--/03.#232*3-1332223.2*111,2+123,3232.32#3'120/.23)2-3&3333.1223232%313211-*3-*(01$22332--101/.2&-01)0)0)313322-$-2*'30,23-22%/121211-'###,'/, 178 | dq:Z:222'22*2222222222222222&2&(22+)2222222222222-2222222)2222222222222222222222222222(222222222*22222222222222222('222222222222)222222222222222222222222222222'222)'222222222222222222222222222222222(2+2)2 179 | sq:Z:<<<<<<<<<<<<<9<<<<<<<<<<<<<<<<<<<<<<<<<%<<<<<<<<<<<<<<<<<<<<<<<$<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<5<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<3<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 180 | mq:Z:03/6>0>75,834;=1280-*:3,C.-1/*:@9C3@1}>}}19}:857}:}&}:753:564:C}>9M323=62'4>1;:}94''A?=9:}5C;6}?=}=}FG6-71;B4}DI5>6-)=D<3@17}9;40'7}D:1-}95};57=8};/A9-9B}}8:346@(8283C@7}<8}}88}963=6}74;5A8>:7&%-.}9< 181 | st:Z:AACCGAGAAGTTACTAACATACTAATTGATCTCCCGGGATGACAGGTGTTATTTGGAAACCTAAGTGGTAACGTACTAAGAGTTCCCTCTCGAGTCGGCCTGAAACTTCAGGACTCCTCTAGTGGTATCAAGGTTAACCGTGACAGGAAGAAGGACTTCCGGTACTGTTTAAGGCCTGTAAGGAAGAGATTTAAACAGT 182 | dt:Z:NNNANNCNNNNNNNNNNNNNNNNANTTNNTANNNNNNNNNNNNNTNNNNNNNANNNNNNNNNNNNNNNNNNNNNNNNNNNNANNNNNNNNNANNNNNNNNNNNNNNNNNGTNNNNNNNNNNNNGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNNNCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCNANGN 183 | ip:Z:S26,94,16,11,14,14,77,19,28,28,26,91,24,5,35,8,11,46,10,31,75,27,16,257,26,54,44,15,39,47,16,7,37,7,17,57,50,20,26,30,23,49,63,6,636,54,12,33,33,16,15,141,360,18,14,25,42,8,17,21,51,10,17,34,19,13,12 184 | RG:Z:2 185 | ``` 186 | RG tag has the haplotype information. This read can be accessed using the `samtools view` command with the `-r` option. For example, `samtools view reads.bam 22:16050008-16050108 -r 2`. 187 | ## Manuscript results 188 | [HG002 haplotype assemblies](https://rodrio10.u.hpc.mssm.edu/MsPAC/hg002_assembly/haplotypes/)
189 | [HG002 SV calls](https://rodrio10.u.hpc.mssm.edu/MsPAC/SV_calls.bed) 190 | -------------------------------------------------------------------------------- /cfg_readme.md: -------------------------------------------------------------------------------- 1 | [Example](#example) 2 | [Explanation](#explanation) 3 | 4 | # Example 5 | ``` 6 | [Input] 7 | directory = MsPAC 8 | 9 | [Phase-bam input files] 10 | phased vcf = input_data/test.vcf.gz 11 | reads aligned = input_data/test.bam 12 | 13 | [Phase-bam params] 14 | sample name in VCF = 20977 15 | output phased bamfile = input_data/test_phased.bam 16 | 17 | [Prep reads params] 18 | BAM fofn = reads.fofn 19 | Raw reads directory = MsPAC/prep_reads 20 | 21 | [Assembly params] 22 | Minimum phased block length = 1000 23 | Comma-seperated list of haplotypes = 0_1,0_2 24 | Assembly directory = MsPAC/assembly 25 | Flanking length = 1000 26 | Phased bedfile = None 27 | Technology = ONT 28 | 29 | [SV calling params] 30 | SV calling directory = MsPAC/sv_calling 31 | reference = input_data/chr22.fa 32 | 33 | [Other params] 34 | cluster = No 35 | 36 | [HIGH INTENSITY JOB] 37 | walltime = 24 38 | threads = 1 39 | memory = 8 40 | queue = private 41 | 42 | [LOW INTENSITY JOB] 43 | walltime = 24 44 | threads = 1 45 | memory = 8 46 | queue = private 47 | ``` 48 | 49 | # Explanation 50 | ``` 51 | [Input] 52 | directory = MsPAC 53 | ``` 54 | The directory that MsPAC writes to. 55 | 56 | ``` 57 | [Phase-bam input files] 58 | phased vcf = input_data/test.vcf.gz 59 | reads aligned = input_data/test.bam 60 | ``` 61 | `phase vcf` is the input phased VCF file with the phased SNPs. `reads aligned` is the input BAM file with reads aligned. 62 | 63 | ``` 64 | [Phase-bam params] 65 | sample name in VCF = 20977 66 | output phased bamfile = input_data/test_phased.bam 67 | ``` 68 | `sample name in VCF` is the sample name in the VCF file. `output phased bamfile` is the output BAM file with the input reads phased. In the `output phased bamfile`, the input reads have a new read group tag. The read group tag `0`, `1`, and `2` correspond to unphased reads, haplotype 1, and haplotype 2. 69 | 70 | ``` 71 | [Prep reads params] 72 | BAM fofn = reads.fofn 73 | Raw reads directory = MsPAC/prep_reads 74 | ``` 75 | `BAM fofn` is a file created by MsPAC that list all the BAM files created by the `prep-reads` step. `Raw reads directory` is the directory with the BAM files `prep-reads` creates. The BAM files contain the raw reads seperated by chromosome. 76 | 77 | ``` 78 | [Assembly params] 79 | Minimum phased block length = 1000 80 | Comma-seperated list of haplotypes = 0_1,0_2 81 | Assembly directory = MsPAC/assembly 82 | Flanking length = 1000 83 | Phased bedfile = None 84 | ``` 85 | `Minimum phased block length` is the minimum size that will be assembled. `Comma-seperated list of haplotypes` are the haplotypes that will be assembled. The options are: `0`,`1`,`2`,`0_1`, and `0_2`. `0`,`1`, and `2` are unambiguous regions, haplotype 1 and haplotype 2. `0_1` and `0_2` are haplotype 1 and 2 with the reads from unambiguous regions added to both haplotype 1 and 2. `Assembly directory ` is the directory with the regions assembled. `Flanking length` is an extra amount of bases added to both ends of each region to be assembled. `Phased bedfile` is a bed file with the regions to assemble. It is created by MsPAC if none is given. `Phased bedfile` should have this tab-delimited format: 86 | `chromosome start end haplotype low/high`, for example: 87 | ``` 88 | 22 16050007 16697745 1 low 89 | 22 16847850 17262375 1 low 90 | 22 17262464 18711525 1 low 91 | 22 18712024 18712281 1 low 92 | 22 50414777 51244565 0 low 93 | 22 16050007 16697745 2 low 94 | 22 50414777 51244565 2 low 95 | 22 16050007 16697745 0_2 low 96 | 22 20609570 50364777 0_1 high 97 | 22 50414777 51244565 0_1 low 98 | ``` 99 | 100 | ``` 101 | [SV calling params] 102 | SV calling directory = MsPAC/sv_calling 103 | reference = input_data/chr22.fa 104 | ``` 105 | `SV calling directory` is the directory with the output from the `sv-calling` MsPAC step. `reference` is the genome reference in fasta format. 106 | 107 | ``` 108 | [Other params] 109 | cluster = No 110 | ``` 111 | If `cluster` is "Yes", then the assembly and sv-calling jobs will be sent to the cluster. 112 | 113 | ``` 114 | [HIGH INTENSITY JOB] 115 | walltime = 24 116 | threads = 1 117 | memory = 8 118 | queue = private 119 | 120 | [LOW INTENSITY JOB] 121 | walltime = 24 122 | threads = 1 123 | memory = 8 124 | queue = private 125 | ``` 126 | Regions labelled `low` in `Phased bedfile` will use the `[LOW INTENSITY JOB]` configuration, and similarly for `high` regions. 127 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: MsPAC 2 | channels: 3 | - hcc 4 | - conda-forge 5 | - bioconda 6 | - etetoolkit 7 | - defaults 8 | dependencies: 9 | - avro-python2=1.8.2=py_1 10 | - bcftools=1.9=h4da6232_0 11 | - blas=1.0 12 | - blasr=5.3.2=h82bacf8_5 13 | - blasr_libcpp=5.3.1=h82bacf8_4 14 | - htslib=1.9=hc238db4_4 15 | - libdeflate=1.0=h470a237_0 16 | - pbbam=0.19.0=h6678c95_1 17 | - pbcommand=1.1.1=py27_2 18 | - pbcore=1.6.5=py27_0 19 | - perl-filesys-df=0.92=pl526h470a237_2 20 | - pomegranate=0.3.7=py27_2 21 | - pysam=0.15.1=py27h0380709_0 22 | - python-consensuscore=1.1.1=py27h02d93b8_2 23 | - pyvcf=0.6.8=py27_0 24 | - samtools=1.9=h46bd0b3_0 25 | - asn1crypto=0.24.0=py27_1003 26 | - biopython=1.72=py27h470a237_0 27 | - ca-certificates=2018.11.29=ha4d7672_0 28 | - cairo=1.14.10=0 29 | - certifi=2018.11.29=py27_1000 30 | - cffi=1.11.5=py27h5e8e0c9_1 31 | - chardet=3.0.4=py27_1003 32 | - cryptography=2.3.1=py27hdffb7b8_0 33 | - cryptography-vectors=2.3.1=py27_1000 34 | - curl=7.63.0=h74213dd_0 35 | - cython=0.29.1=py27hfc679d8_0 36 | - dbus=1.13.0=h3a4f0e9_0 37 | - decorator=4.3.0=py_0 38 | - enum34=1.1.6=py27_1001 39 | - expat=2.2.5=hfc679d8_2 40 | - fontconfig=2.12.6=0 41 | - freetype=2.8.1=hfa320df_1 42 | - giflib=5.1.4=h470a237_1 43 | - glib=2.55.0=h464dc38_2 44 | - gnuplot=5.2.3=0 45 | - h5py=2.8.0=py27h097b052_4 46 | - harfbuzz=1.7.6=0 47 | - hdf5=1.10.3=hc401514_2 48 | - icu=58.2=hfc679d8_0 49 | - idna=2.8=py27_1000 50 | - ipaddress=1.0.22=py_1 51 | - iso8601=0.1.12=py_1 52 | - joblib=0.13.0=py_0 53 | - jpeg=9c=h470a237_1 54 | - krb5=1.16.2=hbb41f41_0 55 | - libcurl=7.63.0=hbdb9355_0 56 | - libedit=3.1.20170329=0 57 | - libgd=2.2.5=3 58 | - libiconv=1.15=h470a237_3 59 | - libpng=1.6.34=ha92aebf_2 60 | - libssh2=1.8.0=h5b517e9_3 61 | - libtiff=4.0.9=he6b73bb_2 62 | - libwebp=0.5.2=7 63 | - libxcb=1.13=h470a237_2 64 | - libxml2=2.9.8=h422b904_5 65 | - linecache2=1.0.0=py_1 66 | - ncurses=5.9=10 67 | - networkx=1.9 68 | - openjdk=11.0.1=h470a237_14 69 | - openssl=1.0.2p=h470a237_1 70 | - pango=1.40.14=0 71 | - pcre=8.41=hfc679d8_3 72 | - pixman=0.34.0=h470a237_3 73 | - pthread-stubs=0.4=h470a237_1 74 | - pycparser=2.19=py_0 75 | - pyopenssl=18.0.0=py27_1000 76 | - pysocks=1.6.8=py27_1002 77 | - pytz=2018.7=py_0 78 | - readline=7.0=0 79 | - requests=2.21.0=py27_1000 80 | - six=1.12.0=py27_1000 81 | - traceback2=1.4.0=py27_0 82 | - unittest2=1.1.0=py_0 83 | - urllib3=1.24.1=py27_1000 84 | - xorg-libxau=1.0.8=h470a237_6 85 | - xorg-libxdmcp=1.1.2=h470a237_7 86 | - blas=1.0=mkl 87 | - intel-openmp=2019.1=144 88 | - mkl=2018.0.3=1 89 | - pip=18.1=py27_0 90 | - setuptools=40.6.2=py27_0 91 | - wheel=0.32.3=py27_0 92 | - kalign 93 | - zlib=1.2.11 94 | - bedtools=2.27.1 95 | - python-consensuscore2=3.1.0 96 | - numpy=1.15.4 97 | - libgcc-ng=8.2.0 98 | - qt=5.6.2 99 | - perl=5.26.2 100 | - mkl_random=1.0.1 101 | - python=2.7.15 102 | - bzip2=1.0.6 103 | - numpy-base=1.15.4 104 | - scipy=1.1.0 105 | - xz=5.2.4 106 | - graphite2=1.3.12 107 | - sqlite=3.25.3 108 | - mkl_fft=1.0.6 109 | - tk=8.6.8 110 | - gstreamer=1.12.5 111 | - gettext=0.19.8.1 112 | - genomicconsensus=2.3.2 113 | - canu=1.8 114 | - libgcc=7.2.0 115 | - libffi=3.2.1 116 | - libgfortran-ng=7.3.0 117 | - gst-plugins-base=1.12.5 118 | - libstdcxx-ng=8.2.0 119 | -------------------------------------------------------------------------------- /environment_mac.yaml: -------------------------------------------------------------------------------- 1 | name: MsPAC 2 | channels: 3 | - hcc 4 | - conda-forge 5 | - bioconda 6 | - defaults 7 | - etetoolkit 8 | dependencies: 9 | - avro-python2=1.8.2=py_1 10 | - bcftools=1.9=h4da6232_0 11 | - blasr=5.3.2=h82bacf8_5 12 | - blasr_libcpp=5.3.1=h82bacf8_4 13 | - htslib=1.9=hc238db4_4 14 | - libdeflate=1.0=h470a237_0 15 | - pbbam=0.19.0=h6678c95_1 16 | - pbcommand=1.1.1=py27_2 17 | - pbcore=1.6.5=py27_0 18 | - perl-filesys-df=0.92=pl526h470a237_2 19 | - pomegranate=0.3.7=py27_2 20 | - pysam=0.15.1=py27h0380709_0 21 | - python-consensuscore=1.1.1=py27h02d93b8_2 22 | - pyvcf=0.6.8=py27_0 23 | - samtools=1.9=h46bd0b3_0 24 | - asn1crypto=0.24.0=py27_1003 25 | - biopython=1.72=py27h470a237_0 26 | - ca-certificates=2018.11.29=ha4d7672_0 27 | - cairo=1.14.10=0 28 | - certifi=2018.11.29=py27_1000 29 | - cffi=1.11.5=py27h5e8e0c9_1 30 | - chardet=3.0.4=py27_1003 31 | - cryptography=2.3.1=py27hdffb7b8_0 32 | - cryptography-vectors=2.3.1=py27_1000 33 | - curl=7.63.0=h74213dd_0 34 | - cython=0.29.1=py27hfc679d8_0 35 | - dbus=1.13.0=h3a4f0e9_0 36 | - decorator=4.3.0=py_0 37 | - enum34=1.1.6=py27_1001 38 | - expat=2.2.5=hfc679d8_2 39 | - fontconfig=2.12.6=0 40 | - freetype=2.8.1=hfa320df_1 41 | - giflib=5.1.4=h470a237_1 42 | - glib=2.55.0=h464dc38_2 43 | - gnuplot=5.2.3=0 44 | - h5py=2.8.0=py27h097b052_4 45 | - harfbuzz=1.7.6=0 46 | - hdf5=1.10.3=hc401514_2 47 | - icu=58.2=hfc679d8_0 48 | - idna=2.8=py27_1000 49 | - ipaddress=1.0.22=py_1 50 | - iso8601=0.1.12=py_1 51 | - joblib=0.13.0=py_0 52 | - jpeg=9c=h470a237_1 53 | - krb5=1.16.2=hbb41f41_0 54 | - libcurl=7.63.0=hbdb9355_0 55 | - libedit=3.1.20170329=0 56 | - libgd=2.2.5=3 57 | - libiconv=1.15=h470a237_3 58 | - libpng=1.6.34=ha92aebf_2 59 | - libssh2=1.8.0=h5b517e9_3 60 | - libtiff=4.0.9=he6b73bb_2 61 | - libwebp=0.5.2=7 62 | - libxcb=1.13=h470a237_2 63 | - libxml2=2.9.8=h422b904_5 64 | - linecache2=1.0.0=py_1 65 | - ncurses=5.9=10 66 | - networkx=2.2=py_1 67 | - openjdk=11.0.1=h470a237_14 68 | - openssl=1.0.2p=h470a237_1 69 | - pango=1.40.14=0 70 | - pcre=8.41=hfc679d8_3 71 | - pixman=0.34.0=h470a237_3 72 | - pthread-stubs=0.4=h470a237_1 73 | - pycparser=2.19=py_0 74 | - pyopenssl=18.0.0=py27_1000 75 | - pysocks=1.6.8=py27_1002 76 | - pytz=2018.7=py_0 77 | - readline=7.0=0 78 | - requests=2.21.0=py27_1000 79 | - six=1.12.0=py27_1000 80 | - traceback2=1.4.0=py27_0 81 | - unittest2=1.1.0=py_0 82 | - urllib3=1.24.1=py27_1000 83 | - xorg-libxau=1.0.8=h470a237_6 84 | - xorg-libxdmcp=1.1.2=h470a237_7 85 | - blas=1.0=mkl 86 | - intel-openmp=2019.1=144 87 | - mkl=2018.0.3=1 88 | - pip=18.1=py27_0 89 | - setuptools=40.6.2=py27_0 90 | - wheel=0.32.3=py27_0 91 | - kalign=2.03 92 | - zlib=1.2.11 93 | - bedtools=2.27.1 94 | - numpy=1.15.4 95 | - qt=5.6.2 96 | - perl=5.26.2 97 | - mkl_random=1.0.1 98 | - python=2.7.15 99 | - bzip2=1.0.6 100 | - numpy-base=1.15.4 101 | - scipy=1.1.0 102 | - xz=5.2.4 103 | - graphite2=1.3.12 104 | - sqlite=3.25.3 105 | - mkl_fft=1.0.6 106 | - tk=8.6.8 107 | - gstreamer=1.14.4 108 | - gettext=0.19.8.1 109 | - genomicconsensus=2.3.2 110 | - canu=1.5 111 | - libgcc=7.2.0 112 | - libffi=3.2.1 113 | - libgfortran-ng=3.0.1 114 | - gst-plugins-base=1.12.5 115 | - libstdcxx-ng=8.2.0 116 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='MsPAC', 5 | description='', 6 | packages=find_packages(), 7 | include_package_data=True, 8 | entry_points = { 9 | 'console_scripts': ['MsPAC = MsPAC.MsPAC:main'], 10 | }, 11 | platforms='any' 12 | ) 13 | -------------------------------------------------------------------------------- /testing/run.cfg: -------------------------------------------------------------------------------- 1 | [Input] 2 | directory = MsPAC 3 | 4 | [Phase-bam input files] 5 | phased vcf = input_data/test.vcf.gz 6 | reads aligned = input_data/test.bam 7 | 8 | [Phase-bam params] 9 | sample name in VCF = 20977 10 | output phased bamfile = input_data/test_phased.bam 11 | 12 | [Prep reads params] 13 | BAM fofn = reads.fofn 14 | Raw reads directory = MsPAC/prep_reads 15 | 16 | [Assembly params] 17 | Minimum phased block length = 1000 18 | Comma-seperated list of haplotypes = 0_1,0_2 19 | Assembly directory = MsPAC/assembly 20 | Flanking length = 1000 21 | Phased bedfile = None 22 | 23 | [SV calling params] 24 | SV calling directory = MsPAC/sv_calling 25 | reference = input_data/chr22.fa 26 | 27 | [Other params] 28 | cluster = No 29 | 30 | [HIGH INTENSITY JOB] 31 | walltime = 24 32 | threads = 1 33 | memory = 8 34 | queue = private 35 | 36 | [LOW INTENSITY JOB] 37 | walltime = 24 38 | threads = 1 39 | memory = 8 40 | queue = private -------------------------------------------------------------------------------- /testing/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -x 3 | 4 | if [ -z "${SJOB_DEFALLOC}" ] 5 | then 6 | export SJOB_DEFALLOC="" 7 | fi 8 | 9 | mkdir -p input_data 10 | cd input_data 11 | 12 | if [ ! -s reads.bam.bai ] 13 | then 14 | curl -O https://rodrio10.u.hpc.mssm.edu/MsPAC/reads.bam 15 | curl -O https://rodrio10.u.hpc.mssm.edu/MsPAC/reads.bam.bai 16 | fi 17 | 18 | if [ ! -s test.bam.bai ] 19 | then 20 | curl -O https://rodrio10.u.hpc.mssm.edu/MsPAC/test.bam 21 | curl -O https://rodrio10.u.hpc.mssm.edu/MsPAC/test.bam.bai 22 | fi 23 | 24 | if [ ! -s test.vcf.gz.tbi ] 25 | then 26 | curl -O https://rodrio10.u.hpc.mssm.edu/MsPAC/test.vcf.gz 27 | curl -O https://rodrio10.u.hpc.mssm.edu/MsPAC/test.vcf.gz.tbi 28 | fi 29 | 30 | if [ ! -s chr22.fa ] 31 | then 32 | curl -O http://hgdownload.cse.ucsc.edu/goldenpath/hg19/chromosomes/chr22.fa.gz 33 | gunzip chr22.fa.gz 34 | samtools faidx chr22.fa 35 | fi 36 | cd - 37 | 38 | ls input_data/reads.bam > reads.fofn 39 | 40 | MsPAC phase-bam run.cfg 41 | MsPAC prep-reads run.cfg 42 | MsPAC assembly run.cfg 43 | MsPAC sv-calling run.cfg 44 | --------------------------------------------------------------------------------