├── MANIFEST.in
├── MsPAC
    ├── MsPAC.py
    ├── __init__.py
    ├── assemble_haplotype.py
    ├── bash
    │   ├── assemble_window_v3.sh
    │   ├── assign_reads_to_haplotype.sh
    │   ├── get_msa_coords.sh
    │   ├── map.sh
    │   └── sv_calling.sh
    ├── haplotype_assignment.py
    ├── prepping_reads.py
    ├── python
    │   ├── assign_reads_to_haplotypes.py
    │   ├── extract_raw_reads_from_bam_fofn.py
    │   ├── hmm2.py
    │   ├── msa_to_variants.py
    │   └── start_end_coordinates.py
    └── sv_calling.py
├── README.md
├── cfg_readme.md
├── environment.yml
├── environment_mac.yaml
├── setup.py
└── testing
    ├── run.cfg
    └── run.sh


/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include MsPAC/bash/*
2 | include MsPAC/python/*
3 | 


--------------------------------------------------------------------------------
/MsPAC/MsPAC.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | import os
  3 | import sys
  4 | import ConfigParser
  5 | from string import Template
  6 | from lsf.lsf import Lsf
  7 | 
  8 | class Pipeline(object):
  9 |     def __init__(self,configfile,step):
 10 |         self.configfile = configfile
 11 |         self.step = step
 12 |         self.jobs = []
 13 |         
 14 |         #### All steps params
 15 |         ## User mandatory input
 16 |         self.directory = None
 17 |         self.package_bash_directory = None
 18 |         self.package_python_directory = None
 19 |         self.cluster = None
 20 |         self.job_threads = None
 21 |         self.job_walltime = None
 22 |         self.job_memory = None
 23 |         self.job_queue = None
 24 | 
 25 |         #### Multi steps params
 26 |         ## User optional input
 27 |         self.phased_bamfile = None
 28 |         self.phased_bedfile = None
 29 | 
 30 |         #### phase-bam only params
 31 |         ## User mandatory input
 32 |         self.vcffile = None
 33 |         self.bamfile = None
 34 |         self.vcf_sample_name = None
 35 |         ## User optional input
 36 |         self.generate_phased_bedfile = None
 37 |         self.max_phased_window = None
 38 |         self.padding_size = None
 39 | 
 40 |         #### prep reads only params
 41 |         self.raw_reads_in_bam_format = None
 42 |         self.raw_reads_directory = None
 43 | 
 44 |         #### assembly only params
 45 |         ## User optional input
 46 |         self.min_phased_block = None
 47 |         self.windows_to_assemble = None
 48 |         self.haps_to_assemble = None
 49 |         self.assembly_directory = None
 50 |         self.flanking_length = None
 51 |         self.max_block_length = None
 52 |         self.tech = None
 53 |         ## Non user options
 54 |         self.hap0_assembly_fa = None
 55 |         self.hap0_assembly_fq = None
 56 |         self.hap1_assembly_fa = None
 57 |         self.hap1_assembly_fq = None
 58 |         self.hap2_assembly_fa = None
 59 |         self.hap2_assembly_fq = None
 60 | 
 61 |         #### sv calling only params 
 62 |         self.sv_calling_directory = None
 63 |         self.reference = None
 64 |         ## Non user options
 65 |         self.hap1_assembly_split_fq = None
 66 |         self.hap2_assembly_split_fq = None
 67 | 
 68 |     def set_options(self,pipeline_options):
 69 |         for pipeline_option, option_input in pipeline_options:
 70 |             setattr(self,pipeline_option,option_input)        
 71 | 
 72 |     def get_bash_scripts_path(self):
 73 |         return "%s/MsPAC/bash" % "/".join(os.path.dirname(__file__).split("/")[:-1])
 74 | 
 75 |     def get_python_scripts_path(self):
 76 |         return "%s/MsPAC/python" % "/".join(os.path.dirname(__file__).split("/")[:-1])
 77 | 
 78 |     def set_optional_options(self,config):
 79 |         if config.has_option("Params","chromosome"):
 80 |             self.chromosome = config.get("Params","chromosome")
 81 | 
 82 |     def all_steps_configure(self,config):
 83 |         pipeline_options  = \
 84 |             [("directory",os.path.abspath(config.get('Input','directory'))),
 85 |              ("package_python_directory",self.get_python_scripts_path()),
 86 |              ("package_bash_directory",self.get_bash_scripts_path()),
 87 |              ("cluster",config.get("Other params",'cluster')),
 88 |              ("job_threads",{"high":config.get("HIGH INTENSITY JOB","threads"),
 89 |                              "low":config.get("LOW INTENSITY JOB","threads")}),
 90 |              ("job_memory",{"high":config.get("HIGH INTENSITY JOB","memory"),
 91 |                             "low":config.get("LOW INTENSITY JOB","memory")}),
 92 |              ("job_walltime",{"high":config.get("HIGH INTENSITY JOB","walltime"),
 93 |                               "low":config.get("LOW INTENSITY JOB","walltime")}),
 94 |              ("job_queue",{"high":config.get("HIGH INTENSITY JOB","queue"),
 95 |                            "low":config.get("LOW INTENSITY JOB","queue")})]
 96 |         self.set_options(pipeline_options)
 97 |         self.create_directory(self.directory)
 98 | 
 99 |     def phase_bam_dependent_options(self):
100 |         if self.phased_bedfile == "None":
101 |             self.phased_bedfile = "%s.hap_blocks.bed" %  self.phased_bamfile[:-4]
102 | 
103 |     def phase_bam_configure(self,config):
104 |         pipeline_options  = \
105 |             [("vcffile",config.get("Phase-bam input files",'phased vcf')),
106 |              ("bamfile",config.get("Phase-bam input files",'reads aligned')),
107 |              ("vcf_sample_name",config.get("Phase-bam params",'sample name in VCF')),
108 |              ("phased_bamfile",config.get("Phase-bam params",'output phased bamfile'))]
109 |         self.set_options(pipeline_options)
110 |         #self.phase_bam_dependent_options()
111 | 
112 |     def assembly_dependent_options(self):
113 |         self.hap0_assembly_fa = "%s/hap0_assembly.fasta" % self.assembly_directory
114 |         self.hap0_assembly_fq = "%s/hap0_assembly.fastq" % self.assembly_directory
115 |         self.hap1_assembly_fa = "%s/hap1_assembly.fasta" % self.assembly_directory
116 |         self.hap1_assembly_fq = "%s/hap1_assembly.fastq" % self.assembly_directory
117 |         self.hap2_assembly_fa = "%s/hap2_assembly.fasta" % self.assembly_directory
118 |         self.hap2_assembly_fq = "%s/hap2_assembly.fastq" % self.assembly_directory
119 | 
120 |     def assembly_configure(self,config):
121 |         pipeline_options  = \
122 |             [("min_phased_block",config.get("Assembly params",'Minimum phased block length')),
123 |              ("phased_bedfile",config.get("Assembly params",'Phased bedfile')),
124 |              ("windows_to_assemble",[]),
125 |              ("tech",config.get("Assembly params",'Technology')),
126 |              ("haps_to_assemble",config.get("Assembly params",'Comma-seperated list of haplotypes')),
127 |              ("phased_bamfile",os.path.abspath(config.get("Phase-bam params",'output phased bamfile'))),
128 |              ("flanking_length",int(config.get("Assembly params",'Flanking length'))),
129 |              ("max_block_length",int(config.get("Assembly params",'Max block length'))),
130 |              ("raw_reads_directory",os.path.abspath(config.get("Prep reads params",'Raw reads directory'))), #ugh
131 |              ("assembly_directory",os.path.abspath(config.get("Assembly params",'Assembly directory')))]
132 |         self.set_options(pipeline_options)
133 |         self.phase_bam_dependent_options()
134 |         self.assembly_dependent_options()
135 | 
136 |     def prep_reads_configure(self,config):
137 |         pipeline_options  = \
138 |             [("raw_reads_in_bam_format",os.path.abspath(config.get("Prep reads params",'BAM fofn'))),
139 |              ("raw_reads_directory",os.path.abspath(config.get("Prep reads params",'Raw reads directory'))),
140 |              ("phased_bamfile",config.get("Phase-bam params",'output phased bamfile'))]
141 |         self.set_options(pipeline_options)
142 | 
143 |     def sv_calling_dependent_options(self):
144 |         self.hap1_assembly_split_fq = "%s/hap1_assembly_split.fastq" % self.sv_calling_directory
145 |         self.hap2_assembly_split_fq = "%s/hap2_assembly_split.fastq" % self.sv_calling_directory
146 | 
147 |     def sv_calling_configure(self,config):
148 |         pipeline_options  = \
149 |             [("sv_calling_directory",os.path.abspath(config.get("SV calling params",'SV calling directory'))),
150 |              ("assembly_directory",os.path.abspath(config.get("Assembly params",'Assembly directory'))),
151 |              ("reference",os.path.abspath(config.get("SV calling params",'reference')))]
152 |         self.set_options(pipeline_options)
153 |         self.sv_calling_dependent_options()
154 |         self.assembly_dependent_options()
155 |         self.create_directory(self.sv_calling_directory)
156 | 
157 |     def configure(self):
158 |         config = ConfigParser.RawConfigParser()
159 |         config.read(self.configfile)
160 |         self.all_steps_configure(config)        
161 |         if self.step == "phase-bam":
162 |             self.phase_bam_configure(config)
163 |         if self.step == "assembly":
164 |             self.assembly_configure(config)
165 |         if self.step == "prep-reads":
166 |             self.prep_reads_configure(config)
167 |         if self.step == "sv-calling":
168 |             self.sv_calling_configure(config)
169 | 
170 |     def create_directory(self,directory):
171 |         if not os.path.exists(directory):
172 |             os.makedirs(directory)
173 | 
174 |     def write_to_bashfile(self,template_bash,bashfile,params):
175 |         filein = open(template_bash)
176 |         src = Template(filein.read())
177 |         output_lines = src.safe_substitute(params)
178 |         bashfh = open(bashfile,'w')
179 |         bashfh.write(output_lines)
180 |         filein.close()
181 |         bashfh.close()
182 | 
183 |     def non_emptyfile(self,checkfile):
184 |         return os.path.isfile(checkfile) and os.path.getsize(checkfile) > 0
185 | 
186 |     def run_locally(self):
187 |         use_cluster = self.cluster
188 |         self.cluster = "no"
189 |         self.submitjobs()
190 |         self.cluster = use_cluster
191 | 
192 |     def submitjobs(self,wait=True):
193 |         if len(self.jobs) == 0:
194 |             return
195 |         if self.cluster != "Yes":
196 |             for job,intensity in self.jobs:
197 |                 os.system("sh %s" % job)
198 |             self.jobs = []
199 |             return
200 |         hpc = Lsf()
201 |         for job,intensity in self.jobs:
202 |             hpc.config(cpu=self.job_threads[intensity],
203 |                        walltime=self.job_walltime[intensity],
204 |                        memory=int(self.job_memory[intensity]) * int(self.job_threads[intensity]),
205 |                        queue=self.job_queue[intensity])
206 |             hpc.submit("%s" % job)
207 |         if wait:
208 |             hpc.wait()
209 |         else:
210 |             dummy=1
211 |             # Bug gets overwritten if told not to wait twice                                         
212 |             #job_id_log = "%s/job.ids" % self.log_directory
213 |             #hpc.write_ids(job_id_log)
214 |         self.jobs = []
215 | 
216 |     def map_reads(self,reads,reference,directory,name,intensity="low"):
217 |         bashfile = "%s/%s.sh" % (directory,name)
218 |         template_bash = "%s/map_reads.sh" % self.package_bash_directory
219 |         params = {
220 |             'output': "%s/%s" % (directory,name),
221 |             'threads': self.job_threads[intensity],
222 |             'reads': reads,
223 |             'ref': reference,
224 |             'BLASR': self.fast_blasr
225 |             }
226 |         self.write_to_bashfile(template_bash,bashfile,params)
227 |         return (bashfile,intensity)
228 | 
229 |     def __call__(self):
230 |         if self.step == "phase-bam":
231 |             from haplotype_assignment import HaplotypeAssignment
232 |             print "Assigning reads to haplotypes..."
233 |             assign_reads_to_haplotype = HaplotypeAssignment(self.configfile)
234 |             assign_reads_to_haplotype.run()
235 |         elif self.step == "prep-reads":
236 |             from prepping_reads import PrepReads
237 |             print "Prepping reads..."
238 |             prep_reads = PrepReads(self.configfile)
239 |             prep_reads.run()
240 |         elif self.step == "assembly":
241 |             from assemble_haplotype import HaplotypeAssembly
242 |             print "Assembling haplotypes..."
243 |             assemble_haps = HaplotypeAssembly(self.configfile)
244 |             assemble_haps.run()
245 |         elif self.step == "sv-calling":
246 |             from sv_calling import SVCaller
247 |             print "Calling SVs..."
248 |             call_svs = SVCaller(self.configfile)
249 |             call_svs.run()
250 |         else:
251 |             sys.exit("Choose one of the following steps: phase-bam, prep-reads, assembly, sv-calling")
252 |         #print "Detecting structural variants..."
253 |         #from calling_structural_variants import StructuralVariationDetection
254 |         #calling_structural_variants = StructuralVariationDetection(self.configfile)
255 |         #calling_structural_variants.run()
256 | 
def run_pipeline(step,configfile):
    """Build a Pipeline for ``step`` and ``configfile``, run it, and return its result."""
    return Pipeline(configfile,step)()
260 | 
def main():
    """Command-line entry point: ``MsPAC <step> <config file>``.

    Exits with a usage message when fewer than two arguments are given;
    any arguments after the first two are ignored.
    """
    arguments = sys.argv[1:]
    if len(arguments) < 2:
        sys.exit("Usage: MsPAC <step> <config file>")
    step, configfile = arguments[0], arguments[1]
    return run_pipeline(step,configfile)
265 | 


--------------------------------------------------------------------------------
/MsPAC/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oscarlr/MsPAC/3a741dceba8e46efb0e0578eaf28c0ac1cb7b7ce/MsPAC/__init__.py


--------------------------------------------------------------------------------
/MsPAC/assemble_haplotype.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | import sys
  3 | import os
  4 | import pysam
  5 | import gzip
  6 | from Bio import SeqIO
  7 | from MsPAC import Pipeline
  8 | 
  9 | class PhasedBlocks():
 10 |     def __init__(self,line,min_phased_block,haps_to_assemble):
 11 |         min_cov = 5
 12 |         self.line = line
 13 |         line = line.rstrip().split('\t')
 14 |         self.chrom = line[0]
 15 |         self.start = int(line[1])
 16 |         self.end = int(line[2])
 17 |         self.hap = str(line[3])
 18 |         self.comp_cost = line[4]
 19 |         self.cov = float(line[5])
 20 |         self.length = self.end - self.start
 21 |         self.assemble = True
 22 |         if self.length < int(min_phased_block):
 23 |             self.assemble = False
 24 |         if self.hap not in haps_to_assemble.split(','):
 25 |             self.assemble = False
 26 |         if self.cov < min_cov:
 27 |             self.assemble = False
 28 | 
 29 | class HaplotypeAssembly(Pipeline):
 30 |     def __init__(self,configfile):
 31 |         Pipeline.__init__(self,configfile,"assembly")
 32 |         
 33 |     def get_read_groups_regions(self):
 34 |         regions = {
 35 |             "0": {},
 36 |             "1": {},
 37 |             "2": {},
 38 |             "0_1": {},
 39 |             "0_2": {}
 40 |         }
 41 |         samfile = pysam.AlignmentFile(self.phased_bamfile)
 42 |         for read in samfile:
 43 |             if read.is_secondary:
 44 |                 continue
 45 |             if read.is_supplementary:
 46 |                 continue
 47 |             if read.is_unmapped:
 48 |                 continue
 49 |             read_group = read.get_tag("RG")
 50 |             chrom = samfile.get_reference_name(read.reference_id)
 51 |             ref_start = read.reference_start
 52 |             ref_end = read.reference_end
 53 |             if read_group == "0":
 54 |                 rgs = ["0","0_1","0_2"]
 55 |             if read_group == "1":
 56 |                 rgs = ["1","0_1"]
 57 |             if read_group == "2":
 58 |                 rgs = ["2","0_2"]
 59 |             for rg in rgs:
 60 |                 if chrom not in regions[rg]:
 61 |                     regions[rg][chrom] = []
 62 |                 regions[rg][chrom].append((ref_start,ref_end))
 63 |         return regions
 64 | 
 65 |     def merge_regions(self,regions):
 66 |         sorted_by_lower_bound = sorted(regions, key=lambda tup: tup[0])
 67 |         merged_regions = []
 68 |         for higher in sorted_by_lower_bound:
 69 |             if len(merged_regions) == 0:
 70 |                 merged_regions.append(higher)
 71 |                 continue
 72 |             lower = merged_regions[-1]
 73 |             if higher[0] <= lower[1]:
 74 |                 upper_bound = max(lower[1], higher[1])
 75 |                 merged_regions[-1] = (lower[0], upper_bound) 
 76 |             else:
 77 |                 merged_regions.append(higher)
 78 |         return merged_regions
 79 | 
 80 |     def merge_regions_coverage(self,regions,merged_regions):
 81 |         bases = {}
 82 |         regions_used = set()
 83 |         sorted_by_lower_bound = sorted(regions, key=lambda tup: tup[0])
 84 |         for m_start, m_end in merged_regions:
 85 |             bases[(m_start,m_end)] = 0
 86 |             for r_start, r_end in sorted_by_lower_bound:
 87 |                 a = [m_start,m_end]
 88 |                 b= [r_start,r_end]
 89 |                 bases_overlapping = max(0, min(a[1], b[1]) - max(a[0], b[0]))
 90 |                 if bases_overlapping > 0:
 91 |                     bases[(m_start,m_end)] += (r_end - r_start)
 92 |                     regions_used.add((r_start,r_end))
 93 |             bases[(m_start,m_end)] = bases[(m_start,m_end)]/float((m_end - m_start))
 94 |         return bases
 95 |                     
 96 |     def break_long_regions(self,regions,coverage):
 97 |         broken_regions = []
 98 |         broken_coverage = {}
 99 |         for start,end in regions:
100 |             for new_start in range(start,end,self.max_block_length):
101 |                 new_end = new_start + self.max_block_length
102 |                 if new_end > end:
103 |                     new_end = end
104 |                 broken_regions.append((new_start,new_end))
105 |                 broken_coverage[(new_start,new_end)] = coverage[(start,end)]
106 |         return (broken_regions,broken_coverage)
107 | 
108 |     # Max block length
109 |     def create_phased_bedfile(self):
110 |         window_size = 1000000 # 10 MB //Everything should get low
111 |         read_group_regions = self.get_read_groups_regions()
112 |         with open(self.phased_bedfile,'w') as fh:
113 |             for read_group in read_group_regions:
114 |                 for chrom in read_group_regions[read_group]:
115 |                     merged_regions_pre_broken = self.merge_regions(read_group_regions[read_group][chrom])
116 |                     merged_regions_coverage_p_b = self.merge_regions_coverage(read_group_regions[read_group][chrom],merged_regions_pre_broken)
117 |                     if self.max_block_length == None:
118 |                         merged_regions = merged_regions_pre_broken
119 |                         merged_regions_coverage = merged_regions_coverage_p_b
120 |                     else:
121 |                         merged_regions,merged_regions_coverage = self.break_long_regions(merged_regions_pre_broken,merged_regions_coverage_p_b)                        
122 |                     for start,end in merged_regions:
123 |                         comp_cost = "low"
124 |                         if end - start > window_size:
125 |                             comp_cost = "high"
126 |                         out = [chrom,start,end,read_group,comp_cost,merged_regions_coverage[(start,end)]]
127 |                         fh.write("%s\n" % "\t".join(map(str,out)))
128 |                         
129 |     def load_haplotype_blocks(self):
130 |         if not self.non_emptyfile(self.phased_bedfile):
131 |             self.create_phased_bedfile()
132 |         with open(self.phased_bedfile,'r') as fh:
133 |             for line in fh:
134 |                 phased_block = PhasedBlocks(line,self.min_phased_block,self.haps_to_assemble)
135 |                 if phased_block.assemble == False:
136 |                     continue
137 |                 self.windows_to_assemble.append(phased_block)
138 |                 
139 |     def assemble_windows(self):
140 |         for window in self.windows_to_assemble:
141 |             directory = "%s/%s/%s/%s_%s" % (self.assembly_directory,window.hap,
142 |                                             window.chrom,window.start,window.end)
143 |             self.create_directory(directory)
144 |             if os.path.isfile("%s/done" % directory):
145 |                 continue
146 |             template_bash = "%s/assemble_window_v3.sh" % self.package_bash_directory
147 |             bashfile = "%s/assemble_window_v3.sh" % directory
148 |             params = {
149 |                 'output': directory,
150 |                 'raw_reads_dir': self.raw_reads_directory,
151 |                 'python_scripts': self.package_python_directory,
152 |                 'hap': window.hap,
153 |                 'subreads_to_ref': self.phased_bamfile,
154 |                 'chrom': window.chrom,
155 |                 'start': window.start,
156 |                 'end': window.end,
157 |                 'threads': self.job_threads[window.comp_cost],
158 |                 'memory': int(self.job_threads[window.comp_cost])*int(self.job_memory[window.comp_cost]),
159 |                 'size': max(6000,window.end - window.start),
160 |                 'tech': self.tech
161 |                 }
162 |             self.write_to_bashfile(template_bash,bashfile,params)
163 |             self.jobs.append((bashfile,window.comp_cost))
164 |         self.submitjobs()
165 | 
166 |     def merge_sequences(self):
167 |         records = {}
168 |         for hap in ["hap0","hap1","hap2"]:
169 |             records[hap] = {}
170 |             records[hap]["fa"] = []
171 |             records[hap]["fq"] = []
172 |         for window in self.windows_to_assemble:
173 |             directory = "%s/%s/%s/%s_%s" % (self.assembly_directory,window.hap,
174 |                                             window.chrom,window.start,window.end)
175 |             raw_contigs = "%s/canu/raw.contigs.fasta" % directory
176 |             #raw_contigs = "%s/canu/raw.quivered.contigs.fasta" % directory
177 |             if self.non_emptyfile(raw_contigs):
178 |                 for index,record in enumerate(SeqIO.parse(raw_contigs, "fasta")):
179 |                     record.id = "%s.%s.%s.%s.raw.%s/0/0_0" % (window.chrom,window.start,window.end,window.hap,index)
180 |                     record.description = ""
181 |                     record.name = ""
182 |                     for hap,hap_key in zip(["0","1","2"],["hap0","hap1","hap2"]):
183 |                         if hap in window.hap:
184 |                             records[hap_key]["fa"].append(record)
185 |             raw_contigs = "%s/canu/raw.quivered.contigs.fastq" % directory
186 |             if self.non_emptyfile(raw_contigs):
187 |                 print raw_contigs
188 |                 for index,record in enumerate(SeqIO.parse(raw_contigs, "fastq")):
189 |                     record.id = "%s.%s.%s.%s.raw.%s/0/0_0" % (window.chrom,window.start,window.end,window.hap,index)
190 |                     record.description = ""
191 |                     record.name = ""
192 |                     for hap,hap_key in zip(["0","1","2"],["hap0","hap1","hap2"]):
193 |                         if hap in window.hap:
194 |                             records[hap_key]["fq"].append(record)
195 |         SeqIO.write(records["hap0"]["fa"],self.hap0_assembly_fa,"fasta")
196 |         SeqIO.write(records["hap1"]["fa"],self.hap1_assembly_fa,"fasta")
197 |         SeqIO.write(records["hap2"]["fa"],self.hap2_assembly_fa,"fasta")
198 |         SeqIO.write(records["hap0"]["fq"],self.hap0_assembly_fq,"fastq")
199 |         SeqIO.write(records["hap1"]["fq"],self.hap1_assembly_fq,"fastq")
200 |         SeqIO.write(records["hap2"]["fq"],self.hap2_assembly_fq,"fastq")
201 | 
202 | 
203 |     def run(self):
204 |         self.configure()
205 |         self.load_haplotype_blocks()        
206 |         self.assemble_windows()
207 |         self.merge_sequences()
208 | 


--------------------------------------------------------------------------------
/MsPAC/bash/assemble_window_v3.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -x
  3 | 
  4 | output=$1
  5 | bash=$2
  6 | smrtsuite=$3
  7 | threads=$4
  8 | bams_fofn=$5
  9 | CANU=$6
 10 | size=$7
 11 | subreads_to_ref=$8
 12 | hap=$9
 13 | reference=${10}
 14 | memory=${11}
 15 | raw_reads_dir=${12}
 16 | start=${13}
 17 | end=${14}
 18 | chrom=${15}
 19 | python_scripts=${16}
 20 | tech=${17}
 21 | 
 22 | if [ "${tech}" == "ONT" ]
 23 | then
 24 |     canu_tech="-nanopore"
 25 | fi
 26 | 
 27 | if [ "${tech}" == "PACB" ]
 28 | then
 29 |     canu_tech="-pacbio"
 30 | fi
 31 | 
 32 | 
 33 | if [ "${hap}" == "0_1" ]
 34 | then
 35 |     samtools view -F 3884 ${subreads_to_ref} -r 0 ${chrom}:${start}-${end} | awk '{ print ">"$1"\n"$10}' > ${output}/reads.fasta
 36 |     samtools view -F 3884 ${subreads_to_ref} -r 1 ${chrom}:${start}-${end} | awk '{ print ">"$1"\n"$10}' >> ${output}/reads.fasta
 37 | elif [ "${hap}" == "0_2" ]
 38 | then
 39 |     samtools view -F 3884 ${subreads_to_ref} -r 0 ${chrom}:${start}-${end} | awk '{ print ">"$1"\n"$10}' > ${output}/reads.fasta
 40 |     samtools view -F 3884 ${subreads_to_ref} -r 2 ${chrom}:${start}-${end} | awk '{ print ">"$1"\n"$10}' >> ${output}/reads.fasta
 41 | else
 42 |     samtools view -F 3884 ${subreads_to_ref} -r ${hap} ${chrom}:${start}-${end} | awk '{ print ">"$1"\n"$10}' > ${output}/reads.fasta
 43 | fi
 44 | 
 45 | samtools faidx ${output}/reads.fasta
 46 | 
 47 | if [ ! -s ${output}/canu/raw.contigs.fasta ]
 48 | then    
 49 |     #rm -fr ${output}/canu
 50 |     canu \
 51 | 	-p raw \
 52 | 	-d ${output}/canu \
 53 | 	contigFilter="2 1000 1.0 1.0 2" \
 54 | 	minInputCoverage=0 \
 55 | 	corMinCoverage=0 \
 56 | 	stopOnLowCoverage=0 \
 57 | 	minThreads=${threads} \
 58 | 	genomeSize=${size} \
 59 | 	useGrid=0 \
 60 | 	${canu_tech} ${output}/reads.fasta
 61 | fi
 62 | 
 63 | if [ "${tech}" == "PACB" ]
 64 | then
 65 |     if [  -s ${output}/canu/raw.contigs.fasta ]
 66 |     then
 67 | 	samtools faidx ${output}/canu/raw.contigs.fasta
 68 | 	if [ ! -s ${output}/subreads.bam ]
 69 | 	then
 70 | 	    if [ "${hap}" == "0_1" ]
 71 | 	    then
 72 | 		ls ${raw_reads_dir}/${chrom}/1/*bam > ${output}/subreads.fofn
 73 | 		ls ${raw_reads_dir}/${chrom}/0/*bam >> ${output}/subreads.fofn
 74 | 	    elif [ "${hap}" == "0_2" ]
 75 | 	    then
 76 | 		ls ${raw_reads_dir}/${chrom}/2/*bam > ${output}/subreads.fofn
 77 | 		ls ${raw_reads_dir}/${chrom}/0/*bam >> ${output}/subreads.fofn
 78 | 	    else
 79 | 		ls ${raw_reads_dir}/${chrom}/${hap}/*bam > ${output}/subreads.fofn
 80 | 	    fi
 81 | 	    if [ ! -s ${output}/subreads.fofn ]
 82 | 	    then
 83 | 		echo "" > ${output}/done
 84 | 		exit 0
 85 | 	    fi
 86 | 	    cut -f1 ${output}/reads.fasta.fai > ${output}/reads.id
 87 | 	    python \
 88 | 		${python_scripts}/extract_raw_reads_from_bam_fofn.py \
 89 | 		${output}/reads.id \
 90 | 		${output}/subreads.fofn \
 91 | 		${output}/subreads.bam
 92 | 	    pbindex ${output}/subreads.bam
 93 | 	fi
 94 | 	if [ ! -s ${output}/canu/reads_to_canu_contigs.sorted.bam.pbi ]
 95 | 	then
 96 |             blasr \
 97 | 		${output}/subreads.bam \
 98 | 		${output}/canu/raw.contigs.fasta \
 99 | 		--bestn 1 \
100 | 		--bam \
101 | 		--nproc ${threads} \
102 | 		--out ${output}/canu/reads_to_canu_contigs.bam
103 |             samtools sort -@ ${threads} ${output}/canu/reads_to_canu_contigs.bam -o ${output}/canu/reads_to_canu_contigs.sorted.bam
104 |             pbindex ${output}/canu/reads_to_canu_contigs.sorted.bam
105 | 	fi
106 | 	if [ ! -s ${output}/canu/raw.quivered.contigs.fastq ]
107 | 	then
108 |             samtools faidx ${output}/canu/raw.contigs.fasta
109 |             arrow \
110 | 		--referenceFilename ${output}/canu/raw.contigs.fasta \
111 | 		-j ${threads} \
112 | 		-o ${output}/canu/raw.quivered.contigs.fastq \
113 | 		-o ${output}/canu/raw.quivered.contigs.fasta \
114 | 		${output}/canu/reads_to_canu_contigs.sorted.bam
115 | 	fi
116 |     fi
117 | fi
118 | 
119 | echo "" > ${output}/done
120 | 


--------------------------------------------------------------------------------
/MsPAC/bash/assign_reads_to_haplotype.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e -x
 3 | 
 4 | python_scripts=$1
 5 | vcffile=$2
 6 | bamfile=$3
 7 | phased_bamfile=$4
 8 | vcf_sample_name=$5
 9 | ##
10 | 
11 | if [ ! -s "${phased_bamfile}.bai" ]
12 | then
13 |     python ${python_scripts}/assign_reads_to_haplotypes.py \
14 |     	${vcffile} \
15 |     	${bamfile} \
16 |     	${vcf_sample_name} \
17 |     	${phased_bamfile}
18 |     samtools index ${phased_bamfile}
19 | fi
20 | 
21 | 
22 | 


--------------------------------------------------------------------------------
/MsPAC/bash/get_msa_coords.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e -x
 3 | 
 4 | if [ ! -s ${ref}.fai ]
 5 | then
 6 |     samtools faidx ${ref}    
 7 | fi
 8 | 
 9 | cut -f1,2 ${ref}.fai > ${sv_calling_dir}/chrom.sizes
10 | 
11 | for i in 1 2
12 | do
13 |     python ${python_packages}/start_end_coordinates.py \
14 | 	${sv_calling_dir}/hap${i}_to_ref.sorted.bam > ${sv_calling_dir}/hap${i}_to_ref.bed
15 |     
16 |     bedtools genomecov \
17 | 	-bg \
18 | 	-i ${sv_calling_dir}/hap${i}_to_ref.bed \
19 | 	-g ${sv_calling_dir}/chrom.sizes \
20 | 	| awk '$4 == 1' \
21 | 	> ${sv_calling_dir}/hap${i}_to_ref.no_overlap.bed
22 |     
23 |     bedtools intersect \
24 | 	-a ${sv_calling_dir}/hap${i}_to_ref.bed \
25 | 	-b ${sv_calling_dir}/hap${i}_to_ref.no_overlap.bed \
26 | 	> ${sv_calling_dir}/hap${i}_to_ref.no_overlap.contig.bed
27 | done
28 | 
29 | bedtools intersect \
30 |     -a ${sv_calling_dir}/hap1_to_ref.no_overlap.contig.bed \
31 |     -b ${sv_calling_dir}/hap2_to_ref.no_overlap.contig.bed \
32 |     | cut -f-3 \
33 |     > ${sv_calling_dir}/msa_coords.bed \
34 | 
35 | rm -f ${sv_calling_dir}/hap1_to_ref.no_overlap.bed
36 | rm -f ${sv_calling_dir}/hap2_to_ref.no_overlap.bed
37 | 
38 | 


--------------------------------------------------------------------------------
/MsPAC/bash/map.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -x
 3 | 
 4 | if [ ! -s ${prefix}.sorted.bam.bai ]
 5 | then
 6 |     blasr \
 7 | 	${input} \
 8 | 	${ref} \
 9 | 	--bestn 1 \
10 | 	--bam \
11 | 	--nproc ${threads} \
12 | 	--out ${prefix}.bam
13 |     
14 |     samtools \
15 | 	sort -@ ${threads} \
16 | 	${prefix}.bam \
17 | 	-o ${prefix}.sorted.bam
18 |     
19 |     samtools index ${prefix}.sorted.bam
20 |     
21 |     rm -f ${prefix}.bam
22 | fi
23 | 


--------------------------------------------------------------------------------
/MsPAC/bash/sv_calling.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e -x
 3 | 
 4 | kalign -q -f clu -s 100 -e 0.85 -t 0.45 -m 0 -in ${dir}/seq.fa \
 5 |     | sed 's/Kalign/CLUSTAL/g' > ${dir}/msa.clu
 6 | 
 7 | python ${python_scripts}/msa_to_variants.py \
 8 |     ${dir}/msa.clu \
 9 |     ${chrom} \
10 |     ${start} \
11 |     ${end} \
12 |     ${dir}/seq.qual \
13 |     50 > ${dir}/svs.bed
14 | 
15 | 


--------------------------------------------------------------------------------
/MsPAC/haplotype_assignment.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | from MsPAC import Pipeline
 3 | 
 4 | class HaplotypeAssignment(Pipeline):
 5 |     def __init__(self,configfile):
 6 |         Pipeline.__init__(self,configfile,"phase-bam")
 7 |         self.configure()
 8 |         
 9 |     def assign_reads_to_haplotype(self):
10 |         bashfile = "%s_assign_reads_to_haplotype.sh" % self.bamfile[0:-4]
11 |         template_bash = "%s/assign_reads_to_haplotype.sh" % self.package_bash_directory
12 |         params = {
13 |             'python_scripts': self.package_python_directory,
14 |             'vcffile': self.vcffile,
15 |             'bamfile': self.bamfile,
16 |             'phased_bamfile': self.phased_bamfile,
17 |             'vcf_sample_name': self.vcf_sample_name
18 |             }
19 |         self.write_to_bashfile(template_bash,bashfile,params)
20 |         self.jobs.append((bashfile,"low"))
21 |         self.run_locally()
22 |         
23 |     def run(self):
24 |         self.assign_reads_to_haplotype()
25 | 
26 | 


--------------------------------------------------------------------------------
/MsPAC/prepping_reads.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | import sys
  3 | import pysam
  4 | from MsPAC import Pipeline
  5 | 
  6 | 
  7 | class PrepReads(Pipeline):
  8 |     def __init__(self,configfile):
  9 |         Pipeline.__init__(self,configfile,"prep-reads")
 10 | 
 11 |     def get_reads_per_group(self):
 12 |         print "\tGetting reads per chrom per group..."
 13 |         reads_per_read_group = {}
 14 |         samfile = pysam.AlignmentFile(self.phased_bamfile)
 15 |         reads_mapped = samfile.mapped
 16 |         count = 0
 17 |         for read in samfile:            
 18 |             count += 1.0
 19 |             read_group = read.get_tag("RG")
 20 |             chrom = samfile.get_reference_name(read.reference_id)
 21 |             if chrom not in reads_per_read_group:
 22 |                 reads_per_read_group[chrom] = {}
 23 |             if read_group not in reads_per_read_group[chrom]:
 24 |                 reads_per_read_group[chrom][read_group] = []
 25 |             reads_per_read_group[chrom][read_group].append(read.query_name)            
 26 |             if count % 50000  == 0:
 27 |                 print "\t\tStatus: %s" % (count/reads_mapped)
 28 |         return reads_per_read_group
 29 | 
 30 |     def extract_raw_reads(self,read_names,inbamfile):
 31 |         raw_reads = []
 32 |         read_names_indexed = pysam.IndexedReads(inbamfile)
 33 |         read_names_indexed.build()
 34 |         for name in read_names:
 35 |             try:
 36 |                 read_names_indexed.find(name)
 37 |             except KeyError:
 38 |                 pass
 39 |             else:
 40 |                 iterator = read_names_indexed.find(name)
 41 |                 for x in iterator:
 42 |                     raw_reads.append(x)
 43 |         return raw_reads
 44 | 
 45 |     def get_raw_reads(self,read_names):        
 46 |         headers = {}
 47 |         raw_reads = []
 48 |         with open(self.raw_reads_in_bam_format,'r') as fofh:
 49 |             for i,fn in enumerate(fofh):
 50 |                 print "\t\tLooking into bam file #: %s" % i
 51 |                 fn = fn.rstrip()
 52 |                 inbamfile = pysam.AlignmentFile(fn,check_sq=False)
 53 |                 reads = self.extract_raw_reads(read_names,inbamfile)
 54 |                 if len(reads) > 0:
 55 |                     headers[fn] = dict(inbamfile.header)
 56 |                 inbamfile.close()
 57 |                 for read in reads:
 58 |                     raw_reads.append(read)
 59 |         return raw_reads,headers
 60 | 
 61 |     def split_raw_reads_into_groups(self,raw_reads,max_num_reads_per_file,header,outdir):
 62 |         for index,i in enumerate(range(0,len(raw_reads),max_num_reads_per_file)):
 63 |             reads = raw_reads[i:i + max_num_reads_per_file]
 64 |             outbamfilefn = "%s/%s.bam" % (outdir,index)
 65 |             outbamfile = pysam.AlignmentFile(outbamfilefn,"wb", header=header)
 66 |             for read in reads:
 67 |                 outbamfile.write(read)
 68 |             outbamfile.close()
 69 |             pysam.index(outbamfilefn)
 70 | 
 71 |     def merge_headers(self,headers):
 72 |         header = {}
 73 |         for fn in headers:
 74 |             if len(header) == 0:
 75 |                 header["HD"] = headers[fn]["HD"]
 76 |                 header["RG"] = []
 77 |                 header["PG"] = []
 78 |                 header["SQ"] = []
 79 |             for rg in headers[fn]["RG"]:
 80 |                 header["RG"].append(rg)
 81 |             for pg in headers[fn]["PG"]:                
 82 |                 header["PG"].append(pg)
 83 |         return header
 84 | 
 85 |     def split_raw_reads(self):
 86 |         max_num_reads = 200000 # 200,000 reads is around 5GB
 87 |         read_names_per_read_group = self.get_reads_per_group()
 88 |         for chrom in read_names_per_read_group:
 89 |             for read_group in read_names_per_read_group[chrom]:        
 90 |                 print "\tWorking on chrom %s and read group %s" % (chrom,read_group)
 91 |                 print "\tExtracting %s reads" % len(read_names_per_read_group[chrom][read_group])
 92 |                 outdir = "%s/%s/%s" % (self.raw_reads_directory,chrom,read_group)
 93 |                 self.create_directory(outdir)
 94 |                 read_names = read_names_per_read_group[chrom][read_group]
 95 |                 raw_reads,headers = self.get_raw_reads(read_names)
 96 |                 header = self.merge_headers(headers)
 97 |                 self.split_raw_reads_into_groups(raw_reads,max_num_reads,header,outdir)
 98 | 
 99 |     def run(self):
100 |         self.configure()
101 |         self.split_raw_reads()
102 | 


--------------------------------------------------------------------------------
/MsPAC/python/assign_reads_to_haplotypes.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | import sys
  3 | import vcf
  4 | import pysam
  5 | import numpy as np
  6 | 
  7 | unphased_tag = 0
  8 | haplotype1_tag = 1
  9 | haplotype2_tag = 2
 10 | 
 11 | def read_phased_snps(vcffile,sample):
 12 |     phased_snps = {}
 13 |     vcf_reader = vcf.Reader(open(vcffile, 'r'))
 14 |     for record in vcf_reader:
 15 |         if record.var_subtype == "deletion":
 16 |             continue
 17 |         if record.var_subtype == "insertion":
 18 |             continue
 19 |         phased_snps.setdefault(record.CHROM, {})
 20 |         try:
 21 |             alleles = record.genotype(sample)['PGT'].split("|")
 22 |         except:
 23 |             alleles = record.genotype(sample)['GT'].split("|")
 24 |         if len(alleles) == 2 and alleles[0] != alleles[1]:            
 25 |             alleles = map(int, alleles)
 26 |             if max(alleles) != 1:
 27 |                 continue
 28 |             allele_bases = [record.REF,record.ALT[0]]
 29 |             sample_bases = map(lambda x: allele_bases[x], alleles)
 30 |             phased_snps[record.CHROM][record.POS-1] = sample_bases
 31 |     return phased_snps
 32 | 
 33 | def create_tag(hap):
 34 |     haptag = ("RG", str(hap), "Z")
 35 |     return haptag
 36 | 
 37 | def calculate_prob(basetups,hap_dict,threshold=0.9,lr_threshold=10):
 38 |     '''
 39 |     Returns a 0, 1 or -1 as the value of a read
 40 |     '''
 41 |     log_lr_threshold = np.log(lr_threshold)
 42 |     prob0  = 1.
 43 |     prob1  = 1.
 44 |     lprob0 = 0.
 45 |     lprob1 = 0.
 46 |     for basetup in basetups:
 47 |         rpos, b, qual = basetup
 48 |         #try:
 49 |         if len(hap_dict[rpos]) == 0:
 50 |             print rpos
 51 |         b0, b1 = hap_dict[rpos]
 52 |         #except:
 53 |         #    print rpos
 54 |         #    sys.exit()
 55 |         if b == b1 or b == b0: # check to make sure the base is called
 56 |             if b == b0:
 57 |                 prob0 *= (1.-10.**(-qual/10.))
 58 |                 prob1 *= (10.**(-qual/10.))
 59 |                 lprob0 += np.log(1.-10.**(-qual/10.))
 60 |                 lprob1 += np.log(10.**(-qual/10.))
 61 |             else:
 62 |                 prob1 *= (1.-10.**(-qual/10.))
 63 |                 prob0 *= (10.**(-qual/10.))
 64 |                 lprob1 += np.log(1.-10.**(-qual/10.))
 65 |                 lprob0 += np.log(10.**(-qual/10.))
 66 |     prob01 = prob0 + prob1
 67 |     if (prob0 > 0 and prob1 > 0) and prob0 / (prob01) >= threshold:
 68 |         if lprob0 - lprob1 >= log_lr_threshold:
 69 |             return create_tag(haplotype1_tag)
 70 |     if (prob0 > 0 and prob1 > 0) and prob1 / (prob01) >= threshold:
 71 |         if lprob1 - lprob0 >= log_lr_threshold:
 72 |             return create_tag(haplotype2_tag)
 73 |     if (prob0 == 0 or prob1 == 0) and lprob0 - lprob1 >= log_lr_threshold:
 74 |         return create_tag(haplotype1_tag)
 75 |     if (prob0 == 0 or prob1 == 0) and lprob1 - lprob0 >= log_lr_threshold:
 76 |         return create_tag(haplotype2_tag)
 77 |     return create_tag(unphased_tag)
 78 | 
 79 | def phase_read(read,phased_snps,chrom):
 80 |     unphased_tag = 0
 81 |     haplotype1_tag = 1
 82 |     haplotype2_tag = 2
 83 |     base_tuples = []
 84 |     if chrom in phased_snps:
 85 |         for read_pos, ref_pos in read.get_aligned_pairs():
 86 |             if ref_pos in phased_snps[chrom]:
 87 |                 if read_pos is not None:
 88 |                     if read.query_qualities == None:
 89 |                         qual = 8
 90 |                     else:
 91 |                         qual = read.query_qualities[read_pos]
 92 |                     base = read.query_sequence[read_pos]
 93 |                     base_tuples.append((ref_pos,base,qual))
 94 |     if len(base_tuples) > 0:
 95 |         read_group_tag = calculate_prob(base_tuples,phased_snps[chrom])
 96 |     else:
 97 |         read_group_tag = create_tag(unphased_tag)
 98 |     #print "%s\t%s\t%s" % (len(base_tuples),read.query_name,read_group_tag[1])
 99 |     read_tags = read.get_tags()
100 |     tags_to_add = []
101 |     for tag in read_tags:
102 |         if tag[0] != "RG":
103 |             tags_to_add.append(tag)
104 |     tags_to_add.append(read_group_tag)
105 |     read.set_tags(tags_to_add)
106 |     return read
107 | 
def create_header(unphased_bam):
    """Return a header dict for the phased BAM.

    The input header is copied via ``to_dict()`` and its ``RG`` section is
    replaced by three read groups (unphased, haplotype 1, haplotype 2). Every
    non-``ID`` field of the existing read groups is copied onto all three;
    where several input groups define the same field, the last one wins.
    """
    new_groups = [{ "ID": tag } for tag in (unphased_tag,haplotype1_tag,haplotype2_tag)]
    if "RG" in unphased_bam.header:
        for group in unphased_bam.header["RG"]:
            for item in group:
                if item == "ID":
                    continue
                for new_group in new_groups:
                    new_group[item] = group[item]
    phased_bam_header = unphased_bam.header.to_dict()
    phased_bam_header["RG"] = new_groups
    return phased_bam_header
122 | 
def main(vcffile,bamfile,vcf_sample_name,outbamfile):
    """Write a copy of ``bamfile`` whose reads carry haplotype read groups.

    Secondary, supplementary and unmapped records are dropped; every other
    record is tagged by ``phase_read`` and written to ``outbamfile``.
    """
    phased_snps = read_phased_snps(vcffile,vcf_sample_name)
    unphased_bam = pysam.AlignmentFile(bamfile,'rb')
    try:
        phased_bam_header = create_header(unphased_bam)
        phased_bam = pysam.AlignmentFile(outbamfile,'wb',header=phased_bam_header)
        try:
            for read in unphased_bam.fetch():
                if read.is_secondary or read.is_unmapped or read.is_supplementary:
                    continue
                chrom = unphased_bam.get_reference_name(read.reference_id)
                phased_bam.write(phase_read(read,phased_snps,chrom))
        finally:
            # Close the output even when tagging fails part-way through.
            phased_bam.close()
    finally:
        unphased_bam.close()
141 | 
if __name__ == '__main__':
    # Command line: <vcf> <input bam> <sample name in the vcf> <output bam>
    sys.exit(main(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4]))
148 | 


--------------------------------------------------------------------------------
/MsPAC/python/extract_raw_reads_from_bam_fofn.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | import sys
 3 | import pysam
 4 | 
 5 | readsfn = sys.argv[1]
 6 | inbamfofn = sys.argv[2] 
 7 | outbamfile = sys.argv[3]
 8 | 
 9 | def merge_headers(headers):
10 |     header = {}
11 |     for fn in headers:
12 |         if len(header) == 0:
13 |             header["HD"] = headers[fn]["HD"]
14 |             header["RG"] = []
15 |             header["PG"] = []
16 |             header["SQ"] = []
17 |         for rg in headers[fn]["RG"]:
18 |             header["RG"].append(rg)
19 |         for pg in headers[fn]["PG"]:
20 |             header["PG"].append(pg)
21 |     return header
22 | 
# Names of the reads to extract, one per line.
with open(readsfn,'r') as fh:
    read_names = set(line.rstrip() for line in fh)

outreads = []
headers = {}
with open(inbamfofn,'r') as fh:
    for i,fn in enumerate(fh):
        print(">>>>>>>>> %s" % i)
        fn = fn.rstrip()
        inbamfile = pysam.AlignmentFile(fn,check_sq=False)
        try:
            read_names_indexed = pysam.IndexedReads(inbamfile)
            read_names_indexed.build()
            found_any = False
            for name in read_names:
                # find() raises KeyError for names missing from this file,
                # so a single lookup both tests for and fetches the records.
                try:
                    matches = read_names_indexed.find(name)
                except KeyError:
                    continue
                for x in matches:
                    outreads.append(x)
                    found_any = True
            # Only files that contributed a read take part in the merged
            # header; copying the header once per file is enough.
            if found_any:
                headers[fn] = dict(inbamfile.header)
        finally:
            inbamfile.close()

header = merge_headers(headers)
outbam = pysam.AlignmentFile(outbamfile,"wb", header=header)
try:
    for read in outreads:
        outbam.write(read)
finally:
    outbam.close()
55 | 
56 | 


--------------------------------------------------------------------------------
/MsPAC/python/hmm2.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | from pomegranate import *
  3 | from Bio import AlignIO
  4 | import numpy as np
  5 | 
  6 | # hap1: { hap2: { ref: ...
  7 | observations = {
  8 |     "3": { 
  9 |         "A" : { "A": { "A": 0,"T": 2,"C": 2,"G": 2,"N": 14,"-": 10},
 10 |                 "T": { "A": 3,"T": 1,"C": 4,"G": 4,"N": 14,"-": 11},
 11 |                 "C": { "A": 3,"T": 4,"C": 1,"G": 4,"N": 14,"-": 11},
 12 |                 "G": { "A": 3,"T": 4,"C": 4,"G": 1,"N": 14,"-": 11},
 13 |                 "N": { "A": 14,"T": 14,"C": 14,"G": 14,"N": 14,"-": 14},
 14 |                 "-": { "A": 7,"T": 8,"C": 8,"G": 8,"N": 14,"-": 5}
 15 |                 },
 16 |         "T" : { "T": { "T": 0,"C": 2,"G": 2,"A": 2,"N": 14,"-": 10},
 17 |                 "C": { "T": 3,"C": 1,"G": 4,"A": 4,"N": 14,"-": 11},
 18 |                 "G": { "T": 3,"C": 4,"G": 1,"A": 4,"N": 14,"-": 11},
 19 |                 "A": { "T": 3,"C": 4,"G": 4,"A": 1,"N": 14,"-": 11},
 20 |                 "N": { "T": 14,"C": 14,"G": 14,"A": 14,"N": 14,"-": 14},
 21 |                 "-": { "T": 7,"C": 8,"G": 8,"A": 8,"N": 14,"-": 5}
 22 |                 },
 23 |         "C" : { "C": { "C": 0,"G": 2,"A": 2,"T": 2,"N": 14,"-": 10},
 24 |                 "G": { "C": 3,"G": 1,"A": 4,"T": 4,"N": 14,"-": 11},
 25 |                 "A": { "C": 3,"G": 4,"A": 1,"T": 4,"N": 14,"-": 11},
 26 |                 "T": { "C": 3,"G": 4,"A": 4,"T": 1,"N": 14,"-": 11},
 27 |                 "N": { "C": 14,"G": 14,"A": 14,"T": 14,"N": 14,"-": 14},
 28 |                 "-": { "C": 7,"G": 8,"A": 8,"T": 8,"N": 14,"-": 5}
 29 |                 },
 30 |         "G" : { "G": { "G": 0,"A": 2,"T": 2,"C": 2,"N": 14,"-": 10},
 31 |                 "A": { "G": 3,"A": 1,"T": 4,"C": 4,"N": 14,"-": 11},
 32 |                 "T": { "G": 3,"A": 4,"T": 1,"C": 4,"N": 14,"-": 11},
 33 |                 "C": { "G": 3,"A": 4,"T": 4,"C": 1,"N": 14,"-": 11},
 34 |                 "N": { "G": 14,"A": 14,"T": 14,"C": 14,"N": 14,"-": 14},
 35 |                 "-": { "G": 7,"A": 8,"T": 8,"C": 8,"N": 14,"-": 5}
 36 |                 },
 37 |         "N" : { "G": { "G": 14,"A": 14,"T": 14,"C": 14,"N": 14,"-": 14},
 38 |                 "A": { "G": 14,"A": 14,"T": 14,"C": 14,"N": 14,"-": 14},
 39 |                 "T": { "G": 14,"A": 14,"T": 14,"C": 14,"N": 14,"-": 14},
 40 |                 "C": { "G": 14,"A": 14,"T": 14,"C": 14,"N": 14,"-": 14},
 41 |                 "N": { "G": 14,"A": 14,"T": 14,"C": 14,"N": 14,"-": 14},
 42 |                 "-": { "G": 14,"A": 14,"T": 14,"C": 14,"N": 14,"-": 14}
 43 |                 },
 44 |         "-" : { "-": { "-": None,"A": 12,"T": 12,"C": 12,"G": 12,"N": 14},
 45 |                 "A": { "-": 9,"A": 6, "T": 13, "C": 13,"G": 13,"N": 14},
 46 |                 "T": { "-": 9,"A": 13,"T": 6,"C": 13,"G": 13,"N": 14},
 47 |                 "C": { "-": 9,"A": 13,"T": 13,"C": 6,"G": 13,"N": 14},
 48 |                 "G": { "-": 9,"A": 13,"T": 13,"C": 13,"G": 6,"N": 14},
 49 |                 "N": { "-": 14,"A": 14,"T": 14,"C": 14,"G": 14,"N": 14}
 50 |                 }
 51 |         },
 52 |     "2": {
 53 |         "A" : { "A": 1, "-": 0, "G": 3, "C": 3, "T": 3, "N": 3},
 54 |         "T" : { "T": 1, "A": 3, "-": 0, "G": 3, "C": 3, "N": 3},
 55 |         "C" : { "C": 1, "T": 3, "A": 3, "-": 0, "G": 3, "N": 3},
 56 |         "G" : { "G": 1, "C": 3, "T": 3, "A": 3, "-": 0, "N": 3},
 57 |         "-" : { "-": None, "G": 2, "C": 2, "T": 2, "A": 2, "N": 3},
 58 |         "N" : { "-": 3, "G": 3, "C": 3, "T": 3, "A": 3, "N": 3}
 59 |         }
 60 |     }
 61 | 
 62 | def observation_probs(index):
 63 |     num_obs = 15
 64 |     prob = 0.95
 65 |     index_prob = prob/len(index)
 66 |     other_prob = (1-prob)/(num_obs-len(index)+1)
 67 |     probs = [other_prob]*num_obs
 68 |     for i in index:
 69 |         probs[i] = index_prob
 70 |     obs = list(range(0,15))
 71 |     return dict(zip(obs,probs))
 72 | 
 73 | def get_obs_probs(index,complex_):
 74 |     obs_probs = observation_probs([index])
 75 |     if complex_:
 76 |         obs_probs[index] = 0.94
 77 |         obs_probs[0] = 0.01
 78 |     return obs_probs
 79 | 
 80 | def get_states():
 81 |     states = []
 82 |     states_with_index = { "INS_1|1": 10,"INS_1|0": 5,"INS_0|1": 9,
 83 |                           "DEL_1|1": 12,"DEL_1|0": 6,"DEL_0|1": 7 }
 84 |     for state_and_index in states_with_index:
 85 |         index = states_with_index[state_and_index]
 86 |         for complex_ in [True,False]:
 87 |             obs_probs = get_obs_probs(index,complex_)
 88 |             name = state_and_index
 89 |             if complex_:
 90 |                 name = "COMPLEX.%s" % name
 91 |             state = State(DiscreteDistribution(obs_probs),name=name)
 92 |             states.append(state)
 93 |     #error_obs_prob = observation_probs([1,2,3,4])  #[4,5,6,7,8,9,11,13])
 94 |     #error_state = State(DiscreteDistribution(error_obs_prob),name="ERROR")
 95 |     #states.append(error_state)
 96 |     complex_obs_prob = observation_probs([5,6,7,9,10,12]) #list(range(0,15)))
 97 |     complex_state = State(DiscreteDistribution(complex_obs_prob),name="COMPLEX_.|.")
 98 |     states.append(complex_state)
 99 |     return states
100 |             
def three_hmm():
    """Assemble and bake the HMM used for three-sequence alignments.

    The model holds the states from ``get_states()`` plus a NORMAL state,
    which is stored last. Only NORMAL has a start and end transition of
    weight 1; all other states get weight 0 for both.
    """
    model = HiddenMarkovModel("Sequence Aligner")
    states = get_states()
    normal = State(DiscreteDistribution(observation_probs([0])), name="NORMAL")
    states.append(normal)
    num_states = len(states)
    model.add_states(states)
    for state in states:
        weight = 1 if state.name == "NORMAL" else 0
        model.add_transition(model.start, state, weight)
        model.add_transition(state, model.end, weight)
    # Transition matrix, rows = source state, columns = target state.
    last = num_states - 1
    trans_probs = np.zeros((num_states,num_states))
    trans_probs.fill(1e-15)
    np.fill_diagonal(trans_probs,.99999) # Stay in the same state
    trans_probs[:,last] = 1e-15 # Transition to normal
    trans_probs[last] = [4.5e-15]*num_states # Leaving normal
    trans_probs[last][-1] = 1-2e-15 # normal event
    for source, row in zip(states,trans_probs):
        for target, prob in zip(states,row):
            model.add_transition(source,target,prob)
    model.add_transition(normal,normal,(1-2e-15))
    model.bake()
    return model

# Models keyed by the number of aligned sequences (as a string).
model = {"3": three_hmm()}
131 | 


--------------------------------------------------------------------------------
/MsPAC/python/msa_to_variants.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | #from MsPAC.python_scripts.hmm import *
  3 | from hmm2 import *
  4 | from pomegranate import *
  5 | from Bio import AlignIO
  6 | import numpy as np
  7 | import pysam
  8 | import sys
  9 | 
 10 | def get_msa_sequence(clufn):
 11 |     alignment = AlignIO.read(clufn,"clustal")
 12 |     sequences = {}
 13 |     for i,sequence in enumerate(alignment):        
 14 |         if sequence.id not in ["ref","hap1","hap2"]:
 15 |             if i == 0:
 16 |                 sequence.id = "ref"
 17 |             if i == 1:
 18 |                 sequence.id = "hap1"
 19 |             if i == 2:
 20 |                 sequence.id = "hap2"
 21 |         sequences[sequence.id] = str(sequence.seq).upper()
 22 |     return sequences
 23 | 
 24 | def get_observations(sequence):
 25 |     obs = []
 26 |     if len(sequence) == 3:
 27 |         h1_seq = sequence["hap1"]
 28 |         h2_seq = sequence["hap2"]
 29 |         chrom_seq = sequence["ref"]
 30 |         for h1,h2,chrom in zip(h1_seq,h2_seq,chrom_seq):
 31 |             obs.append(observations["3"][h1][h2][chrom])
 32 |         return obs
 33 | 
 34 | def get_quality_scores(sequence,quality_scores,start_index,end_index,padding):
 35 |     quality_start = start_index - sequence[:start_index].count("-") #- padding
 36 |     quality_end = end_index - sequence[:end_index].count("-") #+ padding
 37 |     #print quality_start,quality_end
 38 |     if quality_end - quality_start < (padding*2):
 39 |         quality_start = max(0,quality_start - padding)
 40 |         quality_end = quality_end + padding
 41 |     scores = quality_scores[quality_start:quality_end]
 42 |     #print scores,start_index,end_index,quality_start,quality_end
 43 |     mean = float(sum(scores))/len(scores)
 44 |     return mean
 45 | 
 46 | def three_sequence_msa_variants(sequence,path,clufn,chrom,ref_start,ref_end,quality_scores,padding):
 47 |     start = False
 48 |     current_sv = None
 49 |     current_sv_start_index = None
 50 |     current_sv_current_index = None
 51 |     current_sv_hap1_seq = []    
 52 |     current_sv_hap2_seq = []
 53 |     current_sv_ref_seq = []
 54 |     ref_index = -1
 55 |     ref_sv_start = None
 56 |     ref_sv_index = None
 57 |     for i,(h1,h2,ref,p) in enumerate(zip(sequence["hap1"],sequence["hap2"],sequence["ref"],path[1:-1])):
 58 |         state_index, state = p
 59 |         if ref != "-":
 60 |             ref_index += 1
 61 |         if start == False:
 62 |             if "-" not in (h1,h2,ref,p) and state.name == "NORMAL":
 63 |                 start = True
 64 |             continue
 65 |         if state.name != "NORMAL":
 66 |             if current_sv == None:
 67 |                 current_sv = state.name
 68 |                 current_sv_start_index = i
 69 |                 current_sv_current_index = i
 70 |                 ref_sv_start = ref_index
 71 |                 ref_sv_index = ref_index
 72 |                 continue
 73 |             current_sv_current_index += 1 
 74 |             ref_sv_index = ref_index
 75 |             if h1 != "-":
 76 |                 current_sv_hap1_seq.append(h1)
 77 |             if h2 != "-":
 78 |                 current_sv_hap2_seq.append(h2)
 79 |             if ref != "-":
 80 |                 current_sv_ref_seq.append(ref)
 81 |         if i != current_sv_current_index and current_sv != None:
 82 |             if i + 1 == len(sequence["hap1"]) and "-" in (h1,h2,ref):
 83 |                 continue
 84 |             sv_type, genotype = current_sv.split("_")
 85 |             sv_len = max(len(current_sv_hap1_seq),len(current_sv_hap2_seq),len(current_sv_ref_seq))
 86 |             if quality_scores != None:
 87 |                 hap1_qual_score = get_quality_scores(sequence["hap1"],quality_scores["hap1"],current_sv_start_index,current_sv_current_index,padding)
 88 |                 hap2_qual_score = get_quality_scores(sequence["hap2"],quality_scores["hap2"],current_sv_start_index,current_sv_current_index,padding)
 89 |             else:
 90 |                 hap1_qual_score = 60
 91 |                 hap2_qual_score = 60
 92 |             output = [chrom,                              
 93 |                       ref_sv_start + ref_start, # 0-based/UCSC Genome format
 94 |                       ref_sv_index + ref_start + 1, # 0-based/UCSC Genome format    
 95 |                       sv_type,
 96 |                       genotype,
 97 |                       sv_len,
 98 |                       hap1_qual_score,
 99 |                       hap2_qual_score,
100 |                       "".join(current_sv_ref_seq) if len("".join(current_sv_ref_seq)) > 0 else ".",
101 |                       "".join(current_sv_hap1_seq) if len("".join(current_sv_hap1_seq)) > 0 else ".",
102 |                       "".join(current_sv_hap2_seq) if len("".join(current_sv_hap2_seq)) > 0 else ".",
103 |                       current_sv_start_index,
104 |                       current_sv_current_index,
105 |                       clufn] 
106 |             print "\t".join(map(str,output))
107 |             current_sv = None
108 |             current_sv_hap1_seq = []    
109 |             current_sv_hap2_seq = []
110 |             current_sv_ref_seq = []
111 | 
def path_to_variants(path,sequence,clufn,chrom,start,end,quality_scores,padding):
    """Report the variants of a decoded path; only three-sequence alignments are handled."""
    if len(sequence) != 3:
        return
    three_sequence_msa_variants(sequence,path,clufn,chrom,start,end,quality_scores,padding)
115 | 
def load_qual_scores(qual_scoresfn):
    """Read per-base quality scores from a FASTA-like file.

    A line starting with ``>`` names a sequence; the next non-blank line holds
    its comma-separated integer scores. Blank lines are ignored. Returns
    ``{name: [int, ...]}``; the values are real lists so callers can slice
    them.
    """
    qual_scores = {}
    name = None
    with open(qual_scoresfn,'r') as fh:
        for line in fh:
            line = line.rstrip()
            if not line:
                continue
            if line.startswith(">"):
                name = line[1:]
            else:
                qual_scores[name] = [int(score) for score in line.split(',')]
    return qual_scores
126 | 
def main():
    """Call variants on one alignment.

    Command line: <clustal file> <chrom> <start> <end> <quality file|None>
    <padding|None>. "None" for the quality file disables quality lookup;
    "None" for the padding selects a padding of 5.
    """
    clufn = sys.argv[1]
    chrom = sys.argv[2]
    start = int(sys.argv[3])
    end = int(sys.argv[4])
    qual_scores = None
    if sys.argv[5] != "None":
        qual_scores = load_qual_scores(sys.argv[5])
    padding = 5
    if sys.argv[6] != "None":
        padding = int(sys.argv[6])
    sequence = get_msa_sequence(clufn)
    obs = get_observations(sequence)
    # The model is selected by the number of aligned sequences.
    hmm = model[str(len(sequence))]
    log, path = hmm.viterbi(obs)
    path_to_variants(path,sequence,clufn,chrom,start,end,qual_scores,padding)

if __name__ == "__main__":
    main()
148 | 


--------------------------------------------------------------------------------
/MsPAC/python/start_end_coordinates.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | import pysam
 3 | import sys
 4 | 
 5 | bamfile = sys.argv[1]
 6 | samfile = pysam.AlignmentFile(bamfile)
 7 | for i,read in enumerate(samfile):
 8 |     if read.is_unmapped:
 9 |         continue
10 |     if read.is_secondary:
11 |         continue
12 |     if read.is_supplementary:
13 |         continue
14 |     output = [samfile.getrname(read.reference_id),read.reference_start,read.reference_end - 1,read.query_name]
15 |     print "\t".join(map(str,output))
16 |         
17 | 


--------------------------------------------------------------------------------
/MsPAC/sv_calling.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | from Bio import SeqIO
  3 | from Bio.Seq import Seq
  4 | from Bio.SeqRecord import SeqRecord
  5 | import math
  6 | import pysam
  7 | import os
  8 | 
  9 | from MsPAC import Pipeline
 10 | 
 11 | class SVCaller(Pipeline):
 12 |     def __init__(self,configfile):
 13 |         Pipeline.__init__(self,configfile,"sv-calling")
 14 | 
 15 |     def create_new_record(self,record,i,start,end):     
 16 |         new_record_name = "%s.%s" % (i,record.id)
 17 |         new_record_seq = record.seq[start:end]
 18 |         new_record_qual = record.letter_annotations["phred_quality"][start:end]
 19 |         new_record = SeqRecord(new_record_seq,id=new_record_name,name=new_record_name,description="")
 20 |         new_record.letter_annotations["phred_quality"] = new_record_qual
 21 |         return new_record
 22 |     
 23 |     def create_new_fastq(self,infastq,outfastq,window):
 24 |         records = []            
 25 |         for record in SeqIO.parse(infastq, "fastq"):
 26 |             if len(record.seq) > window:
 27 |                 num_windows = int(math.ceil(len(record.seq)/window))
 28 |                 size_of_window = len(record.seq)/num_windows
 29 |                 i = 0
 30 |                 for i in range(num_windows - 1):
 31 |                     start = i*size_of_window
 32 |                     end = (i+1)*size_of_window
 33 |                     new_record = create_new_record(record,i,start,end)
 34 |                     records.append(new_record)
 35 |                 start = (i+1)*size_of_window
 36 |                 new_record = create_new_record(record,i+1,start,None)
 37 |                 records.append(new_record)
 38 |             else:
 39 |                 records.append(record)
 40 |         SeqIO.write(records,outfastq,"fastq")        
 41 | 
 42 |     def split_fastq_files(self):
 43 |         window = 500000
 44 |         for hap in ["hap1","hap2"]:
 45 |             if hap == "hap1":
 46 |                 infastq = self.hap1_assembly_fq
 47 |                 outfastq = self.hap1_assembly_split_fq
 48 |             elif hap == "hap2":
 49 |                 infastq = self.hap2_assembly_fq
 50 |                 outfastq = self.hap2_assembly_split_fq
 51 |             self.create_new_fastq(infastq,outfastq,window)
 52 | 
 53 |     def map_assembly(self):
 54 |         template_bash = "%s/map.sh" % self.package_bash_directory
 55 |         for hap in ["hap1","hap2"]:
 56 |             if hap == "hap1":
 57 |                 fastq = self.hap1_assembly_split_fq
 58 |             elif hap == "hap2":
 59 |                 fastq = self.hap2_assembly_split_fq
 60 |             prefix = "%s/%s_to_ref" % (self.sv_calling_directory,hap)
 61 |             bashfile = "%s/map_%s_assembly.sh" % (self.sv_calling_directory,hap)
 62 |             params = {
 63 |                 'input': fastq,
 64 |                 'ref': self.reference,
 65 |                 'prefix': prefix,
 66 |                 'threads': self.job_threads["high"]
 67 |                 }
 68 |             self.write_to_bashfile(template_bash,bashfile,params)
 69 |             self.jobs.append((bashfile,self.job_threads["high"]))
 70 |         self.run_locally()
 71 | 
 72 |     def get_msa_coordinates(self):
 73 |         if os.path.isfile("%s/msa_coords.bed" % self.sv_calling_directory):
 74 |             return
 75 |         template_bash = "%s/get_msa_coords.sh" % self.package_bash_directory
 76 |         bashfile = "%s/get_msa_coords.sh" % self.sv_calling_directory
 77 |         params = {
 78 |             'python_packages': self.package_python_directory,
 79 |             'ref': self.reference,
 80 |             'sv_calling_dir': self.sv_calling_directory
 81 |             }
 82 |         self.write_to_bashfile(template_bash,bashfile,params)
 83 |         self.jobs.append((bashfile,self.job_threads["high"]))
 84 |         self.run_locally()
 85 |     
 86 |     def read_bedfile(self,bedfile):
 87 |         regions = {}
 88 |         with open(bedfile,'r') as bedfh:
 89 |             for line in bedfh:
 90 |                 line = line.rstrip().split('\t')
 91 |                 chrom = line[0]
 92 |                 coord = (int(line[1]),int(line[2]))
 93 |                 if chrom not in regions:
 94 |                     regions[chrom] = []
 95 |                 regions[chrom].append(coord)
 96 |         return regions
 97 | 
 98 |     def get_hap_sequence(self,hap_bamfile,regions):
 99 |         samfile = pysam.AlignmentFile(hap_bamfile,'rb')
100 |         hap_sequences = {}
101 |         for chrom in regions:
102 |             for start,end in regions[chrom]:
103 |                 for contig in samfile.fetch(chrom,start,end):
104 |                     if contig.is_unmapped:
105 |                         continue
106 |                     if contig.is_secondary:
107 |                         continue
108 |                     if contig.is_supplementary:
109 |                         continue
110 |                     if contig.reference_start > start:
111 |                         continue
112 |                     if contig.reference_end < end:
113 |                         continue
114 |                     aligned_pairs = contig.get_aligned_pairs()
115 |                     query_start = None
116 |                     query_end = None
117 |                     matched_ref_start = None
118 |                     matched_ref_end = None
119 |                     for query_pos, ref_pos in aligned_pairs:
120 |                         if query_pos == None:
121 |                             continue
122 |                         if ref_pos == None:
123 |                             continue
124 |                         if int(ref_pos) <= int(start):
125 |                             query_start = query_pos
126 |                             matched_ref_start = ref_pos
127 |                         query_end = query_pos
128 |                         matched_ref_end = ref_pos
129 |                         if int(ref_pos) > int(end):
130 |                             break
131 |                     assert query_start != None
132 |                     assert query_end != None
133 |                     hap_sequence = contig.query_sequence[query_start:query_end]
134 |                     if contig.query_qualities != None:
135 |                         sequence_qual = contig.query_qualities[query_start:query_end]
136 |                     else:
137 |                         sequence_qual = [0]*len(hap_sequence)
138 |                     assert len(sequence_qual) == len(hap_sequence)
139 |                     hap_sequences[(chrom,start,end)] = (hap_sequence,sequence_qual)
140 |             return hap_sequences
141 | 
142 |     def extract_msa_sequence(self):
143 |         msa_coordsfn = "%s/msa_coords.bed" % self.sv_calling_directory
144 |         msa_coords = self.read_bedfile(msa_coordsfn)
145 |         hap1_bamfn = "%s/hap1_to_ref.sorted.bam" % self.sv_calling_directory
146 |         hap1_sequence = self.get_hap_sequence(hap1_bamfn,msa_coords)
147 |         hap2_bamfn = "%s/hap2_to_ref.sorted.bam" % self.sv_calling_directory
148 |         hap2_sequence = self.get_hap_sequence(hap2_bamfn,msa_coords)
149 |         fasta = pysam.FastaFile(self.reference)
150 |         for chrom in msa_coords:
151 |             for i,(start,end) in enumerate(msa_coords[chrom]):
152 |                 ref_seq = fasta.fetch(reference=chrom,start=max(1,start),end=end) 
153 |                 h1_seq,h1_qual = hap1_sequence[(chrom,start,end)]
154 |                 h2_seq,h2_qual = hap2_sequence[(chrom,start,end)]
155 |                 directory = "%s/%s/%s_%s" % (self.sv_calling_directory,chrom,start,end)
156 |                 self.create_directory(directory)
157 |                 outseqfn = "%s/seq.fa" % directory
158 |                 outqualfn = "%s/seq.qual" % directory
159 |                 with open(outseqfn,'w') as outfh:
160 |                     outfh.write(">ref\n%s\n" % ref_seq)
161 |                     outfh.write(">hap1\n%s\n" % h1_seq)
162 |                     outfh.write(">hap2\n%s\n" % h2_seq)
163 |                 with open(outqualfn,'w') as outqualfh:
164 |                     outqualfh.write(">hap1\n%s\n" % ",".join(map(str,h1_qual)))
165 |                     outqualfh.write(">hap2\n%s\n" % ",".join(map(str,h2_qual)))
166 | 
167 |     def calls_svs_from_msa(self):
168 |         msa_coordsfn = "%s/msa_coords.bed" % self.sv_calling_directory
169 |         msa_coords = self.read_bedfile(msa_coordsfn)
170 |         template_bash = "%s/sv_calling.sh" % self.package_bash_directory
171 |         for chrom in msa_coords:
172 |             for i,(start,end) in enumerate(msa_coords[chrom]):
173 |                 directory = "%s/%s/%s_%s" % (self.sv_calling_directory,chrom,start,end)
174 |                 bashfile = "%s/sv_calling.sh" % directory
175 |                 params = {
176 |                     'dir': directory,
177 |                     'python_scripts': self.package_python_directory,
178 |                     'chrom': chrom,
179 |                     'start': start,
180 |                     'end': end
181 |                     }
182 |                 self.write_to_bashfile(template_bash,bashfile,params)
183 |                 self.jobs.append((bashfile,self.job_threads["low"]))
184 |         self.submitjobs()        
185 | 
186 |     def run(self):
187 |         self.configure()
188 |         self.split_fastq_files()
189 |         self.map_assembly()
190 |         self.get_msa_coordinates()
191 |         self.extract_msa_sequence()
192 |         self.calls_svs_from_msa()
193 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # MsPAC
  2 | **Phase reads, assemble haplotypes and detect SVs**
  3 | 
  4 | [Introduction](#introduction)  
  5 | [Tool requirements](#tool-requirements)  
  6 | [Installation](#installation)  
  7 | [Cluster configuration](#cluster-configuration)<br/>
  8 | [Test runs](#test-runs)<br/>
  9 | [Configuration File](#configuration-file)<br/>
 10 | [Quick Start](#quick-start)       
 11 | [Explanation of steps](#explanation-of-steps)     
 12 | [Example of output](#example-of-output)<br/>
 13 | [Manuscript results](#manuscript-results)
 14 | 
 15 | 
 16 | ## Introduction
 17 | MsPAC takes in long reads and phased SNVs to separate the reads into two haplotypes, and assembles both haplotypes and detects structural variants. The output is a fasta file containing both haplotypes and VCF file with SVs. The SVs are annotated with their type, size, genotype and reference, haplotype 1 and haplotype 2 sequence.
 18 | 
 19 | ## Tool requirements
 20 | 1. Linux operating system
 21 | 2. [Conda package](https://conda.io/en/latest/)
 22 | 3. [cluster python package](https://github.com/oscarlr/cluster)
 23 | 
 24 | ## Installation
 25 | ```
 26 | ### Installing MsPAC and its dependencies
 27 | git clone https://github.com/oscarlr/MsPAC.git
 28 | cd MsPAC
 29 | conda env create -f environment.yml 
 30 | conda activate MsPAC
 31 | python setup.py install
 32 | 
 33 | ### Installing cluster package that's needed
 34 | cd ..
 35 | git clone https://github.com/oscarlr/cluster.git
 36 | cd cluster
 37 | python setup.py install
 38 | ```
 39 | ## Cluster configuration
 40 | If you don't want to use the cluster use this command before running MsPAC:
 41 | ```
 42 | export SJOB_DEFALLOC=""
 43 | ```
 44 | If you want to use the cluster, edit the `lsf/cluster/config.py` script in `https://github.com/oscarlr/cluster.git`. The cluster package reads from this file the default configuration for running jobs on the cluster, as well as the account to use when submitting jobs. After you edit `lsf/cluster/config.py`, reinstall the package using `python setup.py install` in the cluster folder.
 45 | 
 46 | ## Test runs
 47 | ```
 48 | export SJOB_DEFALLOC=""
 49 | cd testing
 50 | sh run.sh
 51 | ```
 52 | ## Configuration File
 53 | Explanation of configuration file entries is [here](cfg_readme.md).
 54 | ```
 55 | [Input]
 56 | directory = 
 57 | 
 58 | [Phase-bam input files]
 59 | phased vcf = 
 60 | reads aligned = 
 61 | 
 62 | [Phase-bam params]
 63 | sample name in VCF = 
 64 | output phased bamfile = 
 65 | 
 66 | [Prep reads params]
 67 | BAM fofn = 
 68 | Raw reads directory =
 69 | 
 70 | [Assembly params]
 71 | Minimum phased block length = 1000
 72 | Comma-seperated list of haplotypes = 0_1,0_2
 73 | Assembly directory = 
 74 | Flanking length = 1000
 75 | Phased bedfile = None
 76 | 
 77 | [SV calling params]
 78 | SV calling directory =
 79 | reference = 
 80 | 
 81 | [Other params]
 82 | cluster = No
 83 | 
 84 | [HIGH INTENSITY JOB]
 85 | walltime = 24
 86 | threads = 1
 87 | memory = 8
 88 | queue = private
 89 | 
 90 | [LOW INTENSITY JOB]
 91 | walltime = 24
 92 | threads = 1
 93 | memory = 8
 94 | queue = private
 95 | ```
 96 | 
 97 | ## Quick Start
 98 | ```
 99 | MsPAC phase-bam run.cfg
100 | MsPAC prep-reads run.cfg
101 | MsPAC assembly run.cfg
102 | MsPAC sv-calling run.cfg
103 | ```
104 | 
105 | ## Explanation of steps
106 | MsPAC is split into four steps. For each step, the input is a configuration file. A description of the configuration file is [here](cfg_readme.md).
107 | #### `phase-bam`
108 | In the first step `phase-bam`, a bam file is created. This bam file is a copy of the input bam file with a read group annotation added to the reads. A read group annotation of 1 and 2 corresponds to haplotype 1 and 2. The read group annotation of 0 corresponds to unassignable reads.
109 | #### `prep-reads`
110 | In the second step `prep-reads`, several BAM files are created. These BAM files contain the raw reads separated by chromosome and haplotype. This makes searching for these reads much faster during the Quiver process, where haplotype-specific reads are used to clean the haplotype-specific contigs.
111 | #### `assembly`
112 | In the third step `assembly`, the haplotypes are assembled. During this process folders will be created for each region. Within each folder there is a bash script that runs the assembly process. MsPAC can submit these bash scripts as a single job into the cluster (this speeds up the process).
113 | #### `sv-calling`
114 | In the last step `sv-calling`, the haplotypes and reference are aligned and the SVs are called. In this step, new directories will be made that hold the multiple sequence alignment and a BED file with the SVs.
115 | 
116 | ## Example of output
117 | ### BED SV output
118 | ```
119 | chr22	16610019	16610020	INS	1|0	46	46.6780821918	46.84	.	CACTGCTGTTGGGTTCTCTTTGTTTTTCCTCACAAAGGATTCCACA	.	18270	18316	/sc/orga/work/rodrio10/software/in_github/MsPAC/testing/MsPAC/sv_calling/chr22/16595201_16611082/msa.clu
120 | ```
121 | The columns are:
122 | ```
123 | 1. chromosome
124 | 2. SV start
125 | 3. SV end
126 | 4. SV type
127 | 5. SV genotype
128 | 6. SV size
129 | 7. Haplotype 1 SV quality score 
130 | 8. Haplotype 2 SV quality score
131 | 9. Reference sequence
132 | 10. Haplotype 1 sequence
133 | 11. Haplotype 2 sequence
134 | 12. Start index position of SV in multiple sequence alignment file 
135 | 13. End index position of SV in multiple sequence alignment file 
136 | 14. Full path of multiple sequence alignment file
137 | ```
138 | ### Assembled fasta haplotype 
139 | ```
140 | >22.16050007.16697745.0_1.raw.0/0/0_0
141 | GACCATGTGAAACTAAGGACAACTTCAGAGCTTCACACAGCTTCAACACTGGAGAGAAAA
142 | CAGTGAACCCACAGAAAACATCCTACAGACTGGGAGAAAATTATGGAAAACTGTGGATCT
143 | GGAAGGGCTTCTTATCTAACATATTCAAGAAACTAATGGTCCTAAGTGGACAAAAACCAA
144 | TATACAATGCTTGTCACACCTAAGTGGACAAAAACCAATACTAAAAATGCCCAAAAGACT
145 | GCGTAGGCATTTCTGAAAAAACCTGAAACAGCCTCTCAGGTAACAGAAGTTTCTCCACAT
146 | CAAGAAGAGTTTCTCCCCAGAGAACGAGTATGACCAGAAAACAGCAATAAAACTTTGGAA
147 | TAAGAGATAAGGGCAGTGTAGATTTGCAGACAGAGGAACTATTACATACTACCTGGTTTG
148 | AATGCAAATTTGTATACCCACTGGGAAACAGCTGGAGGTTTCTGAAACAATTAACAACAC
149 | AACCACCAGTTCCTCTAGCCATCCCACACTGGGTATACCTGCAAAGCCAAGGAAACCTAC
150 | ```
151 | The fasta header has the region that was assembled with the corresponding haplotype.
152 | 
153 | ### Phased BAM file
154 | ```
155 | m150131_015113_42163R_c100780292550000001823166508251570_s1_p0/8761/14473_27456
156 | 16
157 | 22
158 | 16050008
159 | 0
160 | 15543H97S27=1I1=1X12=1I20=2D13=1I8=1I9=3I6=1I11=1D5=1D1=1D11=1I3=1D11=1I4=1X1=1D3=1I8=5I10=1I1=1I3=1I5=3I15=1I8=1D38=1X16=1X5=1I8=1X4=1I12=1I4=1I14=1I1=1I8=1D6=1I4=1I13=1I15=1D1=1I21=2I12=1I1=1X6=1X3=1D5=
161 | *
162 | 0
163 | 12343
164 | CCAATCTCCTGGCAGCCACGCAGCCGGTCGAGAAATTTCGTCACTTGTGGCGGGTTCCCAAGCCTGTTGCCATGCAGCCTCTGGAAAGAGATCTGATTAAGTCCCAGGACTTCAGAAGAGCTGTTGCGACCTTGGCCAATGTCACTTCCTCCTTCAGGAATTGCAGTGGGCCTTAAGTGCCTTCCTCTCGGGCCCACTGGTTAT
165 | --*%..)-.-/,.//*-/..,/.%+%"'-)(./*./)'/"(..%,(.,(+&)"*.(-,+-./-"///*/,.//+.//,/).(+)/*.//+/)//,//#/'./.,,./)/'&%///.+.////.(%/./.+.,*/+)(..$/////,+...,,/&&..((%.(/.////,$,/*'/.+//,//%-././..,'###(&(+($',,
166 | AS:i:-54803
167 | XS:i:15544
168 | XE:i:28527
169 | qs:i:15544
170 | qe:i:28527
171 | zm:i:-1
172 | XL:i:12884
173 | XT:i:1
174 | NM:i:0
175 | FI:i:14476
176 | XQ:i:42999
177 | iq:Z:0/,(121/113-223+/212121/,-#'//202*12*'2#)10%0(0.),&.#*0)0--/03.#232*3-1332223.2*111,2+123,3232.32#3'120/.23)2-3&3333.1223232%313211-*3-*(01$22332--101/.2&-01)0)0)313322-$-2*'30,23-22%/121211-'###,'/,
178 | dq:Z:222'22*2222222222222222&2&(22+)2222222222222-2222222)2222222222222222222222222222(222222222*22222222222222222('222222222222)222222222222222222222222222222'222)'222222222222222222222222222222222(2+2)2
179 | sq:Z:<<<<<<<<<<<<<9<<<<<<<<<<<<<<<<<<<<<<<<<%<<<<<<<<<<<<<<<<<<<<<<<$<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<5<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<3<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
180 | mq:Z:03/6>0>75,834;=1280-*:3,C.-1/*:@9C3@1}>}}19}:857}:}&}:753:564:C}>9M323=62'4>1;:}94''A?=9:}5C;6}?=}=}FG6-71;B4}DI5>6-)=D<3@17}9;40'7}D:1-}95};57=8};/A9-9B}}8:346@(8283C@7}<8}}88}963=6}74;5A8>:7&%-.}9<
181 | st:Z:AACCGAGAAGTTACTAACATACTAATTGATCTCCCGGGATGACAGGTGTTATTTGGAAACCTAAGTGGTAACGTACTAAGAGTTCCCTCTCGAGTCGGCCTGAAACTTCAGGACTCCTCTAGTGGTATCAAGGTTAACCGTGACAGGAAGAAGGACTTCCGGTACTGTTTAAGGCCTGTAAGGAAGAGATTTAAACAGT
182 | dt:Z:NNNANNCNNNNNNNNNNNNNNNNANTTNNTANNNNNNNNNNNNNTNNNNNNNANNNNNNNNNNNNNNNNNNNNNNNNNNNNANNNNNNNNNANNNNNNNNNNNNNNNNNGTNNNNNNNNNNNNGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTNNNCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCNANGN
183 | ip:Z:S26,94,16,11,14,14,77,19,28,28,26,91,24,5,35,8,11,46,10,31,75,27,16,257,26,54,44,15,39,47,16,7,37,7,17,57,50,20,26,30,23,49,63,6,636,54,12,33,33,16,15,141,360,18,14,25,42,8,17,21,51,10,17,34,19,13,12
184 | RG:Z:2
185 | ```
186 | RG tag has the haplotype information. This read can be accessed using the `samtools view` command with the `-r` option. For example, `samtools view reads.bam 22:16050008-16050108 -r 2`.
187 | ## Manuscript results
188 | [HG002 haplotype assemblies](https://rodrio10.u.hpc.mssm.edu/MsPAC/hg002_assembly/haplotypes/)<br>
189 | [HG002 SV calls](https://rodrio10.u.hpc.mssm.edu/MsPAC/SV_calls.bed)
190 | 


--------------------------------------------------------------------------------
/cfg_readme.md:
--------------------------------------------------------------------------------
  1 | [Example](#example)  
  2 | [Explanation](#explanation)
  3 | 
  4 | # Example
  5 | ```
  6 | [Input]
  7 | directory = MsPAC
  8 | 
  9 | [Phase-bam input files]
 10 | phased vcf = input_data/test.vcf.gz
 11 | reads aligned = input_data/test.bam
 12 | 
 13 | [Phase-bam params]
 14 | sample name in VCF = 20977
 15 | output phased bamfile = input_data/test_phased.bam
 16 | 
 17 | [Prep reads params]
 18 | BAM fofn = reads.fofn
 19 | Raw reads directory = MsPAC/prep_reads
 20 | 
 21 | [Assembly params]
 22 | Minimum phased block length = 1000
 23 | Comma-seperated list of haplotypes = 0_1,0_2
 24 | Assembly directory = MsPAC/assembly
 25 | Flanking length = 1000
 26 | Phased bedfile = None
 27 | Technology = ONT
 28 | 
 29 | [SV calling params]
 30 | SV calling directory = MsPAC/sv_calling
 31 | reference = input_data/chr22.fa
 32 | 
 33 | [Other params]
 34 | cluster = No
 35 | 
 36 | [HIGH INTENSITY JOB]
 37 | walltime = 24
 38 | threads = 1
 39 | memory = 8
 40 | queue = private
 41 | 
 42 | [LOW INTENSITY JOB]
 43 | walltime = 24
 44 | threads = 1
 45 | memory = 8
 46 | queue = private
 47 | ```
 48 | 
 49 | # Explanation
 50 | ```
 51 | [Input]
 52 | directory = MsPAC
 53 | ```
 54 | The directory that MsPAC writes to.
 55 | 
 56 | ```
 57 | [Phase-bam input files]
 58 | phased vcf = input_data/test.vcf.gz
 59 | reads aligned = input_data/test.bam
 60 | ```
 61 | `phased vcf` is the input phased VCF file with the phased SNPs. `reads aligned` is the input BAM file with the aligned reads.
 62 | 
 63 | ```
 64 | [Phase-bam params]
 65 | sample name in VCF = 20977
 66 | output phased bamfile = input_data/test_phased.bam
 67 | ```
 68 | `sample name in VCF` is the sample name in the VCF file. `output phased bamfile` is the output BAM file with the input reads phased. In the `output phased bamfile`, the input reads have a new read group tag. The read group tag `0`, `1`, and `2` correspond to unphased reads, haplotype 1, and haplotype 2.
 69 | 
 70 | ```
 71 | [Prep reads params]
 72 | BAM fofn = reads.fofn
 73 | Raw reads directory = MsPAC/prep_reads
 74 | ```
 75 | `BAM fofn` is a file created by MsPAC that lists all the BAM files created by the `prep-reads` step. `Raw reads directory` is the directory with the BAM files `prep-reads` creates. The BAM files contain the raw reads separated by chromosome.
 76 | 
 77 | ```
 78 | [Assembly params]
 79 | Minimum phased block length = 1000
 80 | Comma-seperated list of haplotypes = 0_1,0_2
 81 | Assembly directory = MsPAC/assembly
 82 | Flanking length = 1000
 83 | Phased bedfile = None
 84 | ```
 85 | `Minimum phased block length` is the minimum size that will be assembled. `Comma-seperated list of haplotypes` are the haplotypes that will be assembled. The options are: `0`,`1`,`2`,`0_1`, and `0_2`. `0`,`1`, and `2` are unambiguous regions, haplotype 1 and haplotype 2. `0_1` and `0_2` are haplotype 1 and 2 with the reads from unambiguous regions added to both haplotype 1 and 2. `Assembly directory ` is the directory with the regions assembled. `Flanking length` is an extra amount of bases added to both ends of each region to be assembled. `Phased bedfile` is a bed file with the regions to assemble. It is created by MsPAC if none is given. `Phased bedfile` should have this tab-delimited format:
 86 | `chromosome start end haplotype low/high`, for example:
 87 | ```
 88 | 22	16050007	16697745	1	low
 89 | 22	16847850	17262375	1	low
 90 | 22	17262464	18711525	1	low
 91 | 22	18712024	18712281	1	low
 92 | 22	50414777	51244565	0	low
 93 | 22	16050007	16697745	2	low
 94 | 22	50414777	51244565	2	low
 95 | 22	16050007	16697745	0_2	low
 96 | 22	20609570	50364777	0_1	high
 97 | 22	50414777	51244565	0_1	low
 98 | ```
 99 | 
100 | ```
101 | [SV calling params]
102 | SV calling directory = MsPAC/sv_calling
103 | reference = input_data/chr22.fa
104 | ```
105 | `SV calling directory` is the directory with the output from the `sv-calling` MsPAC step. `reference` is the genome reference in fasta format. 
106 | 
107 | ```
108 | [Other params]
109 | cluster = No
110 | ```
111 | If `cluster` is "Yes", then the assembly and sv-calling jobs will be sent to the cluster.
112 | 
113 | ```
114 | [HIGH INTENSITY JOB]
115 | walltime = 24
116 | threads = 1
117 | memory = 8
118 | queue = private
119 | 
120 | [LOW INTENSITY JOB]
121 | walltime = 24
122 | threads = 1
123 | memory = 8
124 | queue = private
125 | ```
126 | Regions labelled `low` in `Phased bedfile` will use the `[LOW INTENSITY JOB]` configuration, and similarly for `high` regions.
127 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
  1 | name: MsPAC
  2 | channels:
  3 |   - hcc
  4 |   - conda-forge
  5 |   - bioconda
  6 |   - etetoolkit
  7 |   - defaults
  8 | dependencies:
  9 |   - avro-python2=1.8.2=py_1
 10 |   - bcftools=1.9=h4da6232_0
 11 |   - blas=1.0
 12 |   - blasr=5.3.2=h82bacf8_5
 13 |   - blasr_libcpp=5.3.1=h82bacf8_4
 14 |   - htslib=1.9=hc238db4_4
 15 |   - libdeflate=1.0=h470a237_0
 16 |   - pbbam=0.19.0=h6678c95_1
 17 |   - pbcommand=1.1.1=py27_2
 18 |   - pbcore=1.6.5=py27_0
 19 |   - perl-filesys-df=0.92=pl526h470a237_2
 20 |   - pomegranate=0.3.7=py27_2
 21 |   - pysam=0.15.1=py27h0380709_0
 22 |   - python-consensuscore=1.1.1=py27h02d93b8_2
 23 |   - pyvcf=0.6.8=py27_0
 24 |   - samtools=1.9=h46bd0b3_0
 25 |   - asn1crypto=0.24.0=py27_1003
 26 |   - biopython=1.72=py27h470a237_0
 27 |   - ca-certificates=2018.11.29=ha4d7672_0
 28 |   - cairo=1.14.10=0
 29 |   - certifi=2018.11.29=py27_1000
 30 |   - cffi=1.11.5=py27h5e8e0c9_1
 31 |   - chardet=3.0.4=py27_1003
 32 |   - cryptography=2.3.1=py27hdffb7b8_0
 33 |   - cryptography-vectors=2.3.1=py27_1000
 34 |   - curl=7.63.0=h74213dd_0
 35 |   - cython=0.29.1=py27hfc679d8_0
 36 |   - dbus=1.13.0=h3a4f0e9_0
 37 |   - decorator=4.3.0=py_0
 38 |   - enum34=1.1.6=py27_1001
 39 |   - expat=2.2.5=hfc679d8_2
 40 |   - fontconfig=2.12.6=0
 41 |   - freetype=2.8.1=hfa320df_1
 42 |   - giflib=5.1.4=h470a237_1
 43 |   - glib=2.55.0=h464dc38_2
 44 |   - gnuplot=5.2.3=0
 45 |   - h5py=2.8.0=py27h097b052_4
 46 |   - harfbuzz=1.7.6=0
 47 |   - hdf5=1.10.3=hc401514_2
 48 |   - icu=58.2=hfc679d8_0
 49 |   - idna=2.8=py27_1000
 50 |   - ipaddress=1.0.22=py_1
 51 |   - iso8601=0.1.12=py_1
 52 |   - joblib=0.13.0=py_0
 53 |   - jpeg=9c=h470a237_1
 54 |   - krb5=1.16.2=hbb41f41_0
 55 |   - libcurl=7.63.0=hbdb9355_0
 56 |   - libedit=3.1.20170329=0
 57 |   - libgd=2.2.5=3
 58 |   - libiconv=1.15=h470a237_3
 59 |   - libpng=1.6.34=ha92aebf_2
 60 |   - libssh2=1.8.0=h5b517e9_3
 61 |   - libtiff=4.0.9=he6b73bb_2
 62 |   - libwebp=0.5.2=7
 63 |   - libxcb=1.13=h470a237_2
 64 |   - libxml2=2.9.8=h422b904_5
 65 |   - linecache2=1.0.0=py_1
 66 |   - ncurses=5.9=10
 67 |   - networkx=1.9
 68 |   - openjdk=11.0.1=h470a237_14
 69 |   - openssl=1.0.2p=h470a237_1
 70 |   - pango=1.40.14=0
 71 |   - pcre=8.41=hfc679d8_3
 72 |   - pixman=0.34.0=h470a237_3
 73 |   - pthread-stubs=0.4=h470a237_1
 74 |   - pycparser=2.19=py_0
 75 |   - pyopenssl=18.0.0=py27_1000
 76 |   - pysocks=1.6.8=py27_1002
 77 |   - pytz=2018.7=py_0
 78 |   - readline=7.0=0
 79 |   - requests=2.21.0=py27_1000
 80 |   - six=1.12.0=py27_1000
 81 |   - traceback2=1.4.0=py27_0
 82 |   - unittest2=1.1.0=py_0
 83 |   - urllib3=1.24.1=py27_1000
 84 |   - xorg-libxau=1.0.8=h470a237_6
 85 |   - xorg-libxdmcp=1.1.2=h470a237_7
 86 |   - blas=1.0=mkl
 87 |   - intel-openmp=2019.1=144
 88 |   - mkl=2018.0.3=1
 89 |   - pip=18.1=py27_0
 90 |   - setuptools=40.6.2=py27_0
 91 |   - wheel=0.32.3=py27_0
 92 |   - kalign
 93 |   - zlib=1.2.11
 94 |   - bedtools=2.27.1
 95 |   - python-consensuscore2=3.1.0
 96 |   - numpy=1.15.4
 97 |   - libgcc-ng=8.2.0
 98 |   - qt=5.6.2
 99 |   - perl=5.26.2
100 |   - mkl_random=1.0.1
101 |   - python=2.7.15
102 |   - bzip2=1.0.6
103 |   - numpy-base=1.15.4
104 |   - scipy=1.1.0
105 |   - xz=5.2.4
106 |   - graphite2=1.3.12
107 |   - sqlite=3.25.3
108 |   - mkl_fft=1.0.6
109 |   - tk=8.6.8
110 |   - gstreamer=1.12.5
111 |   - gettext=0.19.8.1
112 |   - genomicconsensus=2.3.2
113 |   - canu=1.8
114 |   - libgcc=7.2.0
115 |   - libffi=3.2.1
116 |   - libgfortran-ng=7.3.0
117 |   - gst-plugins-base=1.12.5
118 |   - libstdcxx-ng=8.2.0
119 | 


--------------------------------------------------------------------------------
/environment_mac.yaml:
--------------------------------------------------------------------------------
  1 | name: MsPAC
  2 | channels:
  3 |   - hcc
  4 |   - conda-forge
  5 |   - bioconda
  6 |   - defaults
  7 |   - etetoolkit
  8 | dependencies:
  9 |   - avro-python2=1.8.2=py_1
 10 |   - bcftools=1.9=h4da6232_0
 11 |   - blasr=5.3.2=h82bacf8_5
 12 |   - blasr_libcpp=5.3.1=h82bacf8_4
 13 |   - htslib=1.9=hc238db4_4
 14 |   - libdeflate=1.0=h470a237_0
 15 |   - pbbam=0.19.0=h6678c95_1
 16 |   - pbcommand=1.1.1=py27_2
 17 |   - pbcore=1.6.5=py27_0
 18 |   - perl-filesys-df=0.92=pl526h470a237_2
 19 |   - pomegranate=0.3.7=py27_2
 20 |   - pysam=0.15.1=py27h0380709_0
 21 |   - python-consensuscore=1.1.1=py27h02d93b8_2
 22 |   - pyvcf=0.6.8=py27_0
 23 |   - samtools=1.9=h46bd0b3_0
 24 |   - asn1crypto=0.24.0=py27_1003
 25 |   - biopython=1.72=py27h470a237_0
 26 |   - ca-certificates=2018.11.29=ha4d7672_0
 27 |   - cairo=1.14.10=0
 28 |   - certifi=2018.11.29=py27_1000
 29 |   - cffi=1.11.5=py27h5e8e0c9_1
 30 |   - chardet=3.0.4=py27_1003
 31 |   - cryptography=2.3.1=py27hdffb7b8_0
 32 |   - cryptography-vectors=2.3.1=py27_1000
 33 |   - curl=7.63.0=h74213dd_0
 34 |   - cython=0.29.1=py27hfc679d8_0
 35 |   - dbus=1.13.0=h3a4f0e9_0
 36 |   - decorator=4.3.0=py_0
 37 |   - enum34=1.1.6=py27_1001
 38 |   - expat=2.2.5=hfc679d8_2
 39 |   - fontconfig=2.12.6=0
 40 |   - freetype=2.8.1=hfa320df_1
 41 |   - giflib=5.1.4=h470a237_1
 42 |   - glib=2.55.0=h464dc38_2
 43 |   - gnuplot=5.2.3=0
 44 |   - h5py=2.8.0=py27h097b052_4
 45 |   - harfbuzz=1.7.6=0
 46 |   - hdf5=1.10.3=hc401514_2
 47 |   - icu=58.2=hfc679d8_0
 48 |   - idna=2.8=py27_1000
 49 |   - ipaddress=1.0.22=py_1
 50 |   - iso8601=0.1.12=py_1
 51 |   - joblib=0.13.0=py_0
 52 |   - jpeg=9c=h470a237_1
 53 |   - krb5=1.16.2=hbb41f41_0
 54 |   - libcurl=7.63.0=hbdb9355_0
 55 |   - libedit=3.1.20170329=0
 56 |   - libgd=2.2.5=3
 57 |   - libiconv=1.15=h470a237_3
 58 |   - libpng=1.6.34=ha92aebf_2
 59 |   - libssh2=1.8.0=h5b517e9_3
 60 |   - libtiff=4.0.9=he6b73bb_2
 61 |   - libwebp=0.5.2=7
 62 |   - libxcb=1.13=h470a237_2
 63 |   - libxml2=2.9.8=h422b904_5
 64 |   - linecache2=1.0.0=py_1
 65 |   - ncurses=5.9=10
 66 |   - networkx=2.2=py_1
 67 |   - openjdk=11.0.1=h470a237_14
 68 |   - openssl=1.0.2p=h470a237_1
 69 |   - pango=1.40.14=0
 70 |   - pcre=8.41=hfc679d8_3
 71 |   - pixman=0.34.0=h470a237_3
 72 |   - pthread-stubs=0.4=h470a237_1
 73 |   - pycparser=2.19=py_0
 74 |   - pyopenssl=18.0.0=py27_1000
 75 |   - pysocks=1.6.8=py27_1002
 76 |   - pytz=2018.7=py_0
 77 |   - readline=7.0=0
 78 |   - requests=2.21.0=py27_1000
 79 |   - six=1.12.0=py27_1000
 80 |   - traceback2=1.4.0=py27_0
 81 |   - unittest2=1.1.0=py_0
 82 |   - urllib3=1.24.1=py27_1000
 83 |   - xorg-libxau=1.0.8=h470a237_6
 84 |   - xorg-libxdmcp=1.1.2=h470a237_7
 85 |   - blas=1.0=mkl
 86 |   - intel-openmp=2019.1=144
 87 |   - mkl=2018.0.3=1
 88 |   - pip=18.1=py27_0
 89 |   - setuptools=40.6.2=py27_0
 90 |   - wheel=0.32.3=py27_0
 91 |   - kalign=2.03
 92 |   - zlib=1.2.11
 93 |   - bedtools=2.27.1
 94 |   - numpy=1.15.4
 95 |   - qt=5.6.2
 96 |   - perl=5.26.2
 97 |   - mkl_random=1.0.1
 98 |   - python=2.7.15
 99 |   - bzip2=1.0.6
100 |   - numpy-base=1.15.4
101 |   - scipy=1.1.0
102 |   - xz=5.2.4
103 |   - graphite2=1.3.12
104 |   - sqlite=3.25.3
105 |   - mkl_fft=1.0.6
106 |   - tk=8.6.8
107 |   - gstreamer=1.14.4
108 |   - gettext=0.19.8.1
109 |   - genomicconsensus=2.3.2
110 |   - canu=1.5
111 |   - libgcc=7.2.0
112 |   - libffi=3.2.1
113 |   - libgfortran-ng=3.0.1
114 |   - gst-plugins-base=1.12.5
115 |   - libstdcxx-ng=8.2.0
116 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name='MsPAC',
 5 |     description='',
 6 |     packages=find_packages(),
 7 |     include_package_data=True,
 8 |     entry_points = {
 9 |         'console_scripts': ['MsPAC = MsPAC.MsPAC:main'],
10 |         },
11 |     platforms='any'
12 | )
13 | 


--------------------------------------------------------------------------------
/testing/run.cfg:
--------------------------------------------------------------------------------
 1 | [Input]
 2 | directory = MsPAC
 3 | 
 4 | [Phase-bam input files]
 5 | phased vcf = input_data/test.vcf.gz
 6 | reads aligned = input_data/test.bam
 7 | 
 8 | [Phase-bam params]
 9 | sample name in VCF = 20977
10 | output phased bamfile = input_data/test_phased.bam
11 | 
12 | [Prep reads params]
13 | BAM fofn = reads.fofn
14 | Raw reads directory = MsPAC/prep_reads
15 | 
16 | [Assembly params]
17 | Minimum phased block length = 1000
18 | Comma-seperated list of haplotypes = 0_1,0_2
19 | Assembly directory = MsPAC/assembly
20 | Flanking length = 1000
21 | Phased bedfile = None
22 | 
23 | [SV calling params]
24 | SV calling directory = MsPAC/sv_calling
25 | reference = input_data/chr22.fa
26 | 
27 | [Other params]
28 | cluster = No
29 | 
30 | [HIGH INTENSITY JOB]
31 | walltime = 24
32 | threads = 1
33 | memory = 8
34 | queue = private
35 | 
36 | [LOW INTENSITY JOB]
37 | walltime = 24
38 | threads = 1
39 | memory = 8
40 | queue = private


--------------------------------------------------------------------------------
/testing/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e -x
 3 | 
 4 | if [ -z "${SJOB_DEFALLOC}" ]
 5 | then
 6 |     export SJOB_DEFALLOC=""
 7 | fi
 8 | 
 9 | mkdir -p input_data
10 | cd input_data
11 | 
12 | if [ ! -s reads.bam.bai ]
13 | then
14 |     curl -O https://rodrio10.u.hpc.mssm.edu/MsPAC/reads.bam
15 |     curl -O https://rodrio10.u.hpc.mssm.edu/MsPAC/reads.bam.bai
16 | fi
17 | 
18 | if [ ! -s test.bam.bai ]
19 | then
20 |     curl -O https://rodrio10.u.hpc.mssm.edu/MsPAC/test.bam
21 |     curl -O https://rodrio10.u.hpc.mssm.edu/MsPAC/test.bam.bai
22 | fi
23 | 
24 | if [ ! -s test.vcf.gz.tbi ]
25 | then
26 |     curl -O https://rodrio10.u.hpc.mssm.edu/MsPAC/test.vcf.gz
27 |     curl -O https://rodrio10.u.hpc.mssm.edu/MsPAC/test.vcf.gz.tbi
28 | fi
29 | 
30 | if [ ! -s chr22.fa ]
31 | then
32 |     curl -O http://hgdownload.cse.ucsc.edu/goldenpath/hg19/chromosomes/chr22.fa.gz
33 |     gunzip chr22.fa.gz
34 |     samtools faidx chr22.fa
35 | fi
36 | cd -
37 | 
38 | ls input_data/reads.bam > reads.fofn
39 | 
40 | MsPAC phase-bam run.cfg
41 | MsPAC prep-reads run.cfg
42 | MsPAC assembly run.cfg
43 | MsPAC sv-calling run.cfg
44 | 


--------------------------------------------------------------------------------