├── cmd ├── mcorr-fit │ ├── requirements.txt │ ├── mcorr │ │ ├── __init__.py │ │ ├── writer.py │ │ ├── fit_res.py │ │ ├── corr_res.py │ │ ├── fit_report.py │ │ ├── lmfitFunctions.py │ │ ├── fit_data.py │ │ ├── cli.py │ │ ├── singleFit.py │ │ ├── lmfit_report.py │ │ ├── FitComparison.py │ │ ├── plot.py │ │ └── fit.py │ ├── setup.py │ ├── .gitignore │ └── old │ │ └── fitCorr.py ├── development │ ├── mcorr-vcf │ │ ├── vcf_record.go │ │ └── main.go │ ├── utils │ │ ├── ToDistMatrix.py │ │ └── ClusterCorrResults.py │ ├── mcorr-collect │ │ └── main.go │ ├── FitCollector │ │ ├── old │ │ │ └── getNumPairs.go │ │ └── FitCollector.go │ └── mcorr-pair │ │ └── main.go ├── mcorr-bam │ ├── mapped_read.go │ ├── codon.go │ ├── read_bam.go │ └── main.go ├── mcorr-xmfa │ ├── noncoding_calculator.go │ ├── mate_calculator.go │ ├── coding_calculator.go │ └── main.go └── mcorr-xmfa-2clades │ ├── noncoding_calculator.go │ ├── mate_calculator.go │ ├── coding_calculator.go │ └── main.go ├── .idea ├── misc.xml ├── vcs.xml ├── .gitignore ├── modules.xml └── mcorr.iml ├── .gitignore ├── corr_results.go ├── go.mod ├── bootstrap.go ├── mean_var_test.go ├── mean_var.go ├── collect.go ├── collector.go ├── nucl_cov.go └── README.md /cmd/mcorr-fit/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | matplotlib 3 | lmfit 4 | tqdm 5 | numdifftools 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default 
ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # This .gitignore file was automatically created by Microsoft(R) Visual Studio. 3 | ################################################################################ 4 | 5 | /cmd/fitting/.vs 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /cmd/development/mcorr-vcf/vcf_record.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // VCFRecord store information for a VCF. 4 | type VCFRecord struct { 5 | Chrom string // chromosome name 6 | Pos int // position in the chromosome 7 | Ref, Alt string // reference and alternative allels 8 | GTs []byte 9 | } 10 | -------------------------------------------------------------------------------- /cmd/mcorr-bam/mapped_read.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // MappedRead contains the section of a read mapped to a reference genome. 4 | type MappedRead struct { 5 | Pos int 6 | Seq []byte 7 | Qual []byte 8 | } 9 | 10 | // Len return the lenght of a sequence. 
11 | func (m MappedRead) Len() int { 12 | return len(m.Seq) 13 | } 14 | -------------------------------------------------------------------------------- /corr_results.go: -------------------------------------------------------------------------------- 1 | package mcorr 2 | 3 | // CorrResult stores a correlation result. 4 | type CorrResult struct { 5 | Lag int 6 | Mean float64 7 | Variance float64 8 | N int 9 | Type string 10 | } 11 | 12 | // CorrResults stores a list of CorrResult with an gene ID. 13 | type CorrResults struct { 14 | ID string 15 | Results []CorrResult 16 | } -------------------------------------------------------------------------------- /cmd/mcorr-fit/mcorr/__init__.py: -------------------------------------------------------------------------------- 1 | from .fit_res import FitRes 2 | from .fit_report import FitReport 3 | from .fit import fit_p2, geom_r1, const_r1 4 | from .corr_res import read_corr 5 | from .fit_data import FitData, FitDatas 6 | ##used to be .io 7 | from .writer import write_fitting_results, write_fitting_reports 8 | from .plot import plot_fit, plot_params 9 | from .fit import fit_one 10 | -------------------------------------------------------------------------------- /.idea/mcorr.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/kussell-lab/mcorr 2 | 3 | go 1.16 4 | 5 | require ( 6 | github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect 7 | github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect 8 | github.com/biogo/hts v1.4.3 9 | github.com/kussell-lab/biogo v0.0.0-20180102204004-ca4e680bc9e3 10 | github.com/kussell-lab/ncbiftp v0.0.0-20180102204232-614f5f8e9538 11 | github.com/mattn/go-colorable 
v0.1.12 // indirect 12 | golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e // indirect 13 | gonum.org/v1/gonum v0.9.3 14 | gopkg.in/VividCortex/ewma.v1 v1.1.1 // indirect 15 | gopkg.in/alecthomas/kingpin.v2 v2.2.6 16 | gopkg.in/cheggaaa/pb.v2 v2.0.7 17 | gopkg.in/fatih/color.v1 v1.7.0 // indirect 18 | gopkg.in/mattn/go-colorable.v0 v0.1.0 // indirect 19 | gopkg.in/mattn/go-isatty.v0 v0.0.4 // indirect 20 | gopkg.in/mattn/go-runewidth.v0 v0.0.4 // indirect 21 | ) 22 | -------------------------------------------------------------------------------- /cmd/mcorr-fit/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | # read requirements. 4 | requirements = [] 5 | with open("requirements.txt", 'rU') as reader: 6 | for line in reader: 7 | requirements.append(line.strip()) 8 | 9 | setup(name='mcorr', 10 | python_requires='>=3', 11 | version='20180506', 12 | description='Inferring recombination rates from correlation profiles', 13 | url='https://github.com/kussell-lab/mcorr', 14 | author='Mingzhi Lin, Asher Preska Steinberg', 15 | author_email='mingzhi9@gmail.com, apsteinberg@nyu.edu', 16 | license='MIT', 17 | packages=['mcorr'], 18 | install_requires=requirements, 19 | entry_points = { 20 | 'console_scripts' : ['mcorr-fit=mcorr.cli:main', 'mcorrFitOne=mcorr.singleFit:main', 21 | 'mcorrFitCompare=mcorr.FitComparison:main'], 22 | }, 23 | zip_safe=False) 24 | -------------------------------------------------------------------------------- /cmd/mcorr-fit/mcorr/writer.py: -------------------------------------------------------------------------------- 1 | from . import FitReport 2 | def write_fitting_results(all_results, model_params, out_file): 3 | """ 4 | write fitting results into a .csv file. 5 | """ 6 | # write fitting results. 
7 | sep = "," 8 | with open(out_file, 'w') as out: 9 | out.write(sep.join(model_params)+"\n") 10 | for fit_res in all_results: 11 | values = fit_res.get_values(model_params) 12 | out.write(sep.join([str(x) for x in values])+"\n") 13 | 14 | def write_fitting_reports(all_results, model_params, out_file): 15 | """ 16 | write fitting reports into a .txt file. 17 | """ 18 | with open(out_file, 'w') as out: 19 | for param_name in model_params: 20 | label_name = param_name 21 | if param_name == "ratio": 22 | label_name = "gamma/mu" 23 | report = FitReport(all_results, param_name, label_name) 24 | out.write(report.report()+"\n") 25 | 26 | -------------------------------------------------------------------------------- /cmd/mcorr-fit/mcorr/fit_res.py: -------------------------------------------------------------------------------- 1 | class FitRes(object): 2 | """Fitting results""" 3 | def __init__(self, group, residual, params, d_sample): 4 | self.group = group 5 | self.d_sample = d_sample 6 | self.residual = residual 7 | if "thetaP" in params: 8 | self.theta_pool = params['thetaP'] 9 | if 'phiP' in params: 10 | self.phi_pool = params['phiP'] 11 | if 'f' in params: 12 | self.fbar = params['f'] 13 | if 'phiP' in params: 14 | self.ratio = self.phi_pool / self.theta_pool 15 | if 'f' in params: 16 | self.rho = self.phi_pool * self.fbar 17 | if 'c' in params: 18 | self.c = params['c'] 19 | if 'dc' in params: 20 | self.d_clonal = params['dc'] 21 | if 'dp' in params: 22 | self.d_pool = params['dp'] 23 | if 'phiS' in params: 24 | self.phi_s = params['phiS'] 25 | if 'thetaS' in params: 26 | self.theta_s = params['thetaS'] 27 | 28 | def get_values(self, attributes): 29 | """Get attribute values""" 30 | values = [] 31 | for name in attributes: 32 | if hasattr(self, name): 33 | values.append(getattr(self, name)) 34 | else: 35 | values.append("NA") 36 | return values 37 | 38 | 39 | -------------------------------------------------------------------------------- /bootstrap.go: 
-------------------------------------------------------------------------------- 1 | package mcorr 2 | 3 | import "math/rand" 4 | import "math" 5 | 6 | // Bootstrap for one bootstrapping instance. 7 | type Bootstrap struct { 8 | ID string 9 | sampleRatio float64 10 | collector *Collector 11 | isRandom bool 12 | } 13 | 14 | // NewBootstrap creates a new Boot, given id and sample ratio. 15 | // Sample ratio must be a float64 from 0 to 1. 16 | // By default, bootstrap should do random sampling. 17 | func NewBootstrap(id string, sampleRatio float64) *Bootstrap { 18 | b := Bootstrap{} 19 | b.ID = id 20 | if sampleRatio < 0 { 21 | sampleRatio = 0 22 | } else if sampleRatio > 1 { 23 | sampleRatio = 1 24 | } 25 | b.sampleRatio = sampleRatio 26 | b.collector = NewCollector() 27 | b.isRandom = true 28 | return &b 29 | } 30 | 31 | // SetRandom set random status 32 | func (b *Bootstrap) SetRandom(r bool) { 33 | b.isRandom = r 34 | } 35 | 36 | // Add add one result into the Bootstrap. 37 | func (b *Bootstrap) Add(results CorrResults) { 38 | if b.isRandom { 39 | k := poisson(b.sampleRatio) 40 | for i := 0; i < k; i++ { 41 | b.collector.Add(results) 42 | } 43 | } else { 44 | b.collector.Add(results) 45 | } 46 | 47 | } 48 | 49 | // Results return final results. 
50 | func (b *Bootstrap) Results() (results []CorrResult) { 51 | return b.collector.Results() 52 | } 53 | 54 | func poisson(lambda float64) int { 55 | L := math.Pow(math.E, -lambda) 56 | k := 0 57 | p := 1.0 58 | for p > L { 59 | k++ 60 | p *= rand.Float64() 61 | } 62 | return k - 1 63 | } 64 | -------------------------------------------------------------------------------- /cmd/mcorr-fit/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .vscode -------------------------------------------------------------------------------- /cmd/development/utils/ToDistMatrix.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from argparse import ArgumentParser 3 | 4 | def main(): 5 | """Run fitting using lmfit""" 6 | parser = ArgumentParser(description="Convert to distant matrix") 7 | parser.add_argument("fit_res_file", type=str) 8 | parser.add_argument("output_file", type=str) 9 | parser.add_argument('--by', nargs='?', const="theta", type=str, default="theta") 10 | opts = parser.parse_args() 11 | datafile = opts.fit_res_file 12 | outfile = opts.output_file 13 | byvalue = opts.by 14 | 15 | dmap = {} 16 | with open(datafile) as reader: 17 | header = reader.readline().rstrip().split(",") 18 | for line in reader: 
19 | terms = line.rstrip().split(",") 20 | group = terms[0] 21 | if "_vs_" in group: 22 | isolates = group.split("_vs_") 23 | ddmap = dmap.get(isolates[0], {}) 24 | ddmap[isolates[1]] = terms[header.index(byvalue)] 25 | dmap[isolates[0]] = ddmap 26 | 27 | ddmap = dmap.get(isolates[1], {}) 28 | ddmap[isolates[0]] = terms[header.index(byvalue)] 29 | dmap[isolates[1]] = ddmap 30 | isolates = sorted(dmap.keys()) 31 | with open(outfile, 'w') as writer: 32 | writer.write("," + ",".join(isolates) + "\n") 33 | for isolate1 in isolates: 34 | writer.write(isolate1) 35 | for isolate2 in isolates: 36 | if isolate1 == isolate2: 37 | value = 0 38 | else: 39 | value = float(dmap[isolate1][isolate2]) 40 | writer.write(",%g" % value) 41 | writer.write("\n") 42 | 43 | 44 | 45 | if __name__ == "__main__": 46 | main() -------------------------------------------------------------------------------- /cmd/mcorr-fit/mcorr/corr_res.py: -------------------------------------------------------------------------------- 1 | class CorrRes(object): 2 | """One correlation result""" 3 | def __init__(self, terms): 4 | lag = float(terms[0]) 5 | value = float(terms[1]) 6 | variance = float(terms[2]) 7 | num = float(terms[3]) 8 | corrtype = terms[4] 9 | group = terms[5] 10 | self.lag = lag 11 | self.value = value 12 | self.variance = variance 13 | self.num = num 14 | self.corrtype = corrtype 15 | self.group = group 16 | 17 | def read_corr(csv_file): 18 | """Read corr results in a csv file""" 19 | results = [] 20 | with open(csv_file, 'r') as infile: 21 | for line in infile: 22 | if line.startswith('#'): continue 23 | terms = line.rstrip().split(",") 24 | if terms[0] == 'l': continue 25 | results.append(CorrRes(terms)) 26 | return results 27 | 28 | class GeneCorrRes(object): 29 | """One correlation result""" 30 | def __init__(self, terms): 31 | lag = float(terms[0]) 32 | value = float(terms[1]) 33 | variance = float(terms[3]) 34 | num = float(terms[2]) 35 | corrtype = terms[4] 36 | group = terms[5] 37 | 
self.lag = lag 38 | self.value = value 39 | self.variance = variance 40 | self.num = num 41 | self.corrtype = corrtype 42 | self.group = group 43 | 44 | def read_genecorr(csv_file): 45 | """Read corr results in a csv file""" 46 | results = [] 47 | with open(csv_file, 'r') as infile: 48 | for line in infile: 49 | if line.startswith('#'): continue 50 | terms = line.rstrip().split(",") 51 | if terms[0] == 'lag': continue 52 | results.append(CorrRes(terms)) 53 | return results 54 | 55 | 56 | -------------------------------------------------------------------------------- /mean_var_test.go: -------------------------------------------------------------------------------- 1 | package mcorr 2 | 3 | import ( 4 | "testing" 5 | "math" 6 | ) 7 | 8 | func TestMeanAndVariance(t *testing.T) { 9 | mv := NewMeanVar(); 10 | if mv.Mean() != 0 { 11 | t.Error("Empty MeanVar should return zero for mean\n") 12 | } 13 | 14 | if !math.IsNaN(mv.Variance()) { 15 | t.Error("Empty MeanVar should return NaN for variance\n") 16 | } 17 | 18 | mv.Add(1.0) 19 | if mv.Mean() != 1.0 { 20 | t.Errorf("Expected 1.0, but got %g\n", mv.Mean()) 21 | } 22 | 23 | if !math.IsNaN(mv.Variance()) { 24 | t.Errorf("Expected NaN, but got %g\n", mv.Variance()) 25 | } 26 | 27 | resValues := []float64{1.0, 2.0, 4.0, 7.0} 28 | sum := 1.0 29 | for _, val := range resValues { 30 | sum += val 31 | } 32 | expectedMean := sum / float64(len(resValues) + 1) 33 | expectedVariance := (1.0 - expectedMean) * (1.0 - expectedMean) 34 | for _, val := range resValues { 35 | expectedVariance += (val - expectedMean) * (val - expectedMean) 36 | } 37 | expectedVariance /= float64(len(resValues) + 1) 38 | for _, val := range resValues { 39 | mv.Add(val) 40 | } 41 | if mv.Mean() != expectedMean { 42 | t.Errorf("Expected %g, but got %g\n", expectedMean, mv.Mean()) 43 | } 44 | if mv.Variance() != expectedVariance { 45 | t.Errorf("Expected %g, but got %g\n", expectedVariance, mv.Variance()) 46 | } 47 | 48 | } 49 | 50 | func TestN(t 
*testing.T) { 51 | mv := NewMeanVar() 52 | if mv.N() != 0 { 53 | t.Error("Empty MeanVariance should return zero for N()\n") 54 | } 55 | values := []float64{1, 2, 3, 4} 56 | for _, v := range values { 57 | mv.Add(v) 58 | } 59 | if mv.N() != len(values) { 60 | t.Errorf("Expected %d, but got %d\n", len(values), mv.N()) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /cmd/mcorr-xmfa/noncoding_calculator.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/kussell-lab/biogo/seq" 5 | "github.com/kussell-lab/mcorr" 6 | ) 7 | 8 | // NoncodingCalculator for calculating noncoding sequences. 9 | type NoncodingCalculator struct { 10 | MaxLen int 11 | } 12 | 13 | // NewNoncodingCalculator return a NoncodingCalculator 14 | func NewNoncodingCalculator(maxLen int) *NoncodingCalculator { 15 | return &NoncodingCalculator{ 16 | MaxLen: maxLen, 17 | } 18 | } 19 | 20 | // CalcP2 calculate P2 21 | func (cc *NoncodingCalculator) CalcP2(alignment []seq.Sequence, others ...[]seq.Sequence) (results []mcorr.CorrResult) { 22 | return calcP2Noncoding(alignment, cc.MaxLen) 23 | } 24 | 25 | func calcP2Noncoding(aln []seq.Sequence, maxLen int) (results []mcorr.CorrResult) { 26 | for l := 0; l < maxLen; l++ { 27 | totalxy := 0.0 28 | totaln := 0 29 | for i := 0; i+l < len(aln[0].Seq); i++ { 30 | j := i + l 31 | basePairs := [][]byte{} 32 | for _, s := range aln { 33 | basePairs = append(basePairs, []byte{s.Seq[i], s.Seq[j]}) 34 | } 35 | 36 | nc := doubleCounts(basePairs) 37 | xy, n := nc.P11(0) 38 | totalxy += xy 39 | totaln += n 40 | } 41 | if totaln > 0 { 42 | res := mcorr.CorrResult{ 43 | Lag: l, 44 | Mean: totalxy / float64(totaln), 45 | N: totaln, 46 | Type: "P2"} 47 | results = append(results, res) 48 | } 49 | } 50 | 51 | return 52 | } 53 | 54 | func doubleCounts(basePairs [][]byte) *mcorr.NuclCov { 55 | alphabet := []byte{'A', 'T', 'G', 'C'} 56 | c := 
mcorr.NewNuclCov(alphabet) 57 | for _, basePair := range basePairs { 58 | a := basePair[0] 59 | b := basePair[1] 60 | if isATGC(a) && isATGC(b) { 61 | c.Add(a, b) 62 | } 63 | } 64 | return c 65 | } 66 | 67 | // ATGC is DNA alphabet. 68 | const ATGC = "ATGC" 69 | const atgc = "atgc" 70 | 71 | func isATGC(b byte) bool { 72 | yes := false 73 | for i := 0; i < len(ATGC); i++ { 74 | if b == ATGC[i] { 75 | yes = true 76 | break 77 | } 78 | } 79 | return yes 80 | } 81 | -------------------------------------------------------------------------------- /cmd/mcorr-xmfa-2clades/noncoding_calculator.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/kussell-lab/biogo/seq" 5 | "github.com/kussell-lab/mcorr" 6 | ) 7 | 8 | // NoncodingCalculator for calculating noncoding sequences. 9 | type NoncodingCalculator struct { 10 | MaxLen int 11 | } 12 | 13 | // NewNoncodingCalculator return a NoncodingCalculator 14 | func NewNoncodingCalculator(maxLen int) *NoncodingCalculator { 15 | return &NoncodingCalculator{ 16 | MaxLen: maxLen, 17 | } 18 | } 19 | 20 | // CalcP2 calculate P2 21 | func (cc *NoncodingCalculator) CalcP2(alignment []seq.Sequence, others ...[]seq.Sequence) (results []mcorr.CorrResult) { 22 | return calcP2Noncoding(alignment, cc.MaxLen) 23 | } 24 | 25 | func calcP2Noncoding(aln []seq.Sequence, maxLen int) (results []mcorr.CorrResult) { 26 | for l := 0; l < maxLen; l++ { 27 | totalxy := 0.0 28 | totaln := 0 29 | for i := 0; i+l < len(aln[0].Seq); i++ { 30 | j := i + l 31 | basePairs := [][]byte{} 32 | for _, s := range aln { 33 | basePairs = append(basePairs, []byte{s.Seq[i], s.Seq[j]}) 34 | } 35 | 36 | nc := doubleCounts(basePairs) 37 | xy, n := nc.P11(0) 38 | totalxy += xy 39 | totaln += n 40 | } 41 | if totaln > 0 { 42 | res := mcorr.CorrResult{ 43 | Lag: l, 44 | Mean: totalxy / float64(totaln), 45 | N: totaln, 46 | Type: "P2"} 47 | results = append(results, res) 48 | } 49 | } 50 | 51 | 
return 52 | } 53 | 54 | func doubleCounts(basePairs [][]byte) *mcorr.NuclCov { 55 | alphabet := []byte{'A', 'T', 'G', 'C'} 56 | c := mcorr.NewNuclCov(alphabet) 57 | for _, basePair := range basePairs { 58 | a := basePair[0] 59 | b := basePair[1] 60 | if isATGC(a) && isATGC(b) { 61 | c.Add(a, b) 62 | } 63 | } 64 | return c 65 | } 66 | 67 | // ATGC is DNA alphabet. 68 | const ATGC = "ATGC" 69 | const atgc = "atgc" 70 | 71 | func isATGC(b byte) bool { 72 | yes := false 73 | for i := 0; i < len(ATGC); i++ { 74 | if b == ATGC[i] { 75 | yes = true 76 | break 77 | } 78 | } 79 | return yes 80 | } 81 | -------------------------------------------------------------------------------- /mean_var.go: -------------------------------------------------------------------------------- 1 | package mcorr 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | // MeanVar is for calculate mean and variance in the increment way. 8 | type MeanVar struct { 9 | n int // number of values. 10 | m1 float64 // first moment. 11 | dev float64 12 | nDev float64 13 | m2 float64 // second moment. 14 | biasCorrected bool 15 | } 16 | 17 | // NewMeanVar return a new MeanVar. 18 | func NewMeanVar() *MeanVar { 19 | return &MeanVar{} 20 | } 21 | 22 | // Add adds a value. 23 | func (m *MeanVar) Add(v float64) { 24 | m.n++ 25 | m.dev = v - m.m1 26 | m.nDev = m.dev / float64(m.n) 27 | m.m1 += m.nDev 28 | m.m2 += float64(m.n-1) * m.dev * m.nDev 29 | } 30 | 31 | // Mean returns the mean result. 32 | func (m *MeanVar) Mean() float64 { 33 | return m.m1 34 | } 35 | 36 | // Variance returns the variance. 37 | func (m *MeanVar) Variance() float64 { 38 | if m.n < 2 { 39 | return math.NaN() 40 | } 41 | 42 | if m.biasCorrected { 43 | return m.m2 / float64(m.n-1) 44 | } 45 | 46 | return m.m2 / float64(m.n) 47 | } 48 | 49 | // N returns the number of values. 50 | func (m *MeanVar) N() int { 51 | return m.n 52 | } 53 | 54 | // IsBiasCorrected return true if the variance will be bias corrected. 
55 | func (m *MeanVar) IsBiasCorrected() bool { 56 | return m.biasCorrected 57 | } 58 | 59 | // SetBiasCorrected sets if bias corrected. 60 | func (m *MeanVar) SetBiasCorrected(biasCorrected bool) { 61 | m.biasCorrected = biasCorrected 62 | } 63 | 64 | // Append add another result. 65 | func (m *MeanVar) Append(m2 *MeanVar) { 66 | if m.n == 0 { 67 | m.n = m2.n 68 | m.m1 = m2.m1 69 | m.dev = m2.dev 70 | m.nDev = m2.nDev 71 | m.m2 = m2.m2 72 | } else { 73 | if m2.n > 0 { 74 | total1 := m.m1 * float64(m.n) 75 | total2 := m2.m1 * float64(m2.n) 76 | newMean := (total1 + total2) / float64(m.n+m2.n) 77 | delta1 := m.Mean() - newMean 78 | delta2 := m2.Mean() - newMean 79 | sm := (m.m2 + m2.m2) + float64(m.n)*delta1*delta1 + float64(m2.n)*delta2*delta2 80 | m.m1 = newMean 81 | m.m2 = sm 82 | m.n = m.n + m2.n 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /cmd/development/utils/ClusterCorrResults.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import numpy as np 4 | from sklearn.cluster import KMeans 5 | from argparse import ArgumentParser 6 | 7 | parser = ArgumentParser(description="Cluster correlation results.") 8 | parser.add_argument("corr_file", type=str) 9 | parser.add_argument("output_prefix", type=str) 10 | parser.add_argument('--corr_type', nargs='?', const="P4", type=str, default="P4") 11 | parser.add_argument('--xmin', nargs='?', const=3, type=int, default=3) 12 | parser.add_argument('--xmax', nargs='?', const=150, type=int, default=150) 13 | opts = parser.parse_args() 14 | labels = [] 15 | X = [] 16 | corr_results = {} 17 | with open(opts.corr_file) as reader: 18 | for line in reader: 19 | data = json.loads(line) 20 | labels.append(data['ID']) 21 | corr = [] 22 | for res in data['Results']: 23 | if res['Type'] == 'P2' and int(res['Lag']) == 0 and int(res['N']) > 0: 24 | if len(corr) == 0: 25 | corr.append(float(res['Mean'])) 26 | 
else: 27 | corr[0] = float(res['Mean']) 28 | elif res['Type'] == opts.corr_type and int(res['Lag']) > 0 and int(res['Lag']) < 150 and int(res['N']) > 0: 29 | idx = int(res['Lag']) / 3 30 | while len(corr) <= idx: 31 | corr.append(0.0) 32 | corr[idx] = float(res['Mean']) 33 | while len(corr) < 50: 34 | corr.append(0.0) 35 | X.append(corr) 36 | corr_results[data['ID']] = data 37 | X = np.array(X) 38 | n_clusters = 2 39 | kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X) 40 | 41 | for i in range(n_clusters): 42 | outfile = "%s_%d.json" % (opts.output_prefix, i) 43 | with open(outfile, 'w') as w: 44 | for idx, c in enumerate(kmeans.labels_): 45 | if c == i: 46 | w.write(json.dumps(corr_results[labels[idx]]) + "\n") 47 | outfile = "%s_%d.txt" % (opts.output_prefix, i) 48 | with open(outfile, 'w') as w: 49 | for idx, c in enumerate(kmeans.labels_): 50 | if c == i: 51 | w.write(labels[idx] + "\n") -------------------------------------------------------------------------------- /collect.go: -------------------------------------------------------------------------------- 1 | package mcorr 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "os" 7 | ) 8 | 9 | // PipeOutCorrResults pipe the the channel of CorrResults out to a file. 10 | func PipeOutCorrResults(corrResChan chan CorrResults, outFile string) chan CorrResults { 11 | c := make(chan CorrResults) 12 | go func() { 13 | defer close(c) 14 | f, err := os.Create(outFile) 15 | if err != nil { 16 | panic(err) 17 | } 18 | defer f.Close() 19 | 20 | encoder := json.NewEncoder(f) 21 | for res := range corrResChan { 22 | if err := encoder.Encode(res); err != nil { 23 | panic(err) 24 | } 25 | c <- res 26 | } 27 | }() 28 | return c 29 | } 30 | 31 | // Collect feed correlation results into boostrappers and return them. 32 | func Collect(corrResChan chan CorrResults, numBoot int) []*Bootstrap { 33 | // prepare bootstrappers. 
34 | bootstraps := []*Bootstrap{} 35 | notBootstrap := NewBootstrap("all", 1.0) 36 | notBootstrap.SetRandom(false) 37 | bootstraps = append(bootstraps, notBootstrap) 38 | for i := 0; i < numBoot; i++ { 39 | id := fmt.Sprintf("boot_%d", i) 40 | sampleRatio := 1.0 41 | bootstraps = append(bootstraps, NewBootstrap(id, sampleRatio)) 42 | } 43 | 44 | for corrResults := range corrResChan { 45 | for _, bs := range bootstraps { 46 | bs.Add(corrResults) 47 | } 48 | } 49 | return bootstraps 50 | } 51 | 52 | // CollectWrite collects and writes the correlation results. 53 | func CollectWrite(corrResChan chan CorrResults, outFile string, numBoot int) { 54 | bootstraps := Collect(corrResChan, numBoot) 55 | 56 | w, err := os.Create(outFile) 57 | if err != nil { 58 | panic(err) 59 | } 60 | defer w.Close() 61 | 62 | w.WriteString("# l: the distance between two genomic positions\n") 63 | w.WriteString("# m: the mean value of correlatio profile\n") 64 | w.WriteString("# v: the variance of correlation profile\n") 65 | w.WriteString("# n: the total number of alignments used for calculation\n") 66 | w.WriteString("# t: the type of result: Ks is for d_sample, and P2 is for correlation profile\n") 67 | w.WriteString("# b: the bootstrap number (all means used all alignments).\n") 68 | 69 | w.WriteString("l,m,v,n,t,b\n") 70 | for _, bs := range bootstraps { 71 | //this is where division by P(l = 0) happens!!!! 
72 | results := bs.Results() 73 | for _, res := range results { 74 | w.WriteString(fmt.Sprintf("%d,%g,%g,%d,%s,%s\n", res.Lag, res.Mean, res.Variance, res.N, res.Type, bs.ID)) 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /cmd/mcorr-fit/mcorr/fit_report.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | class FitReport(object): 3 | """statistics report of fitting results""" 4 | def __init__(self, fit_results, param_name, label_name=None): 5 | """generate FitReport from fit_results of the param""" 6 | self.param_name = param_name 7 | if label_name is None: 8 | self.label_name = param_name 9 | else: 10 | self.label_name = label_name 11 | 12 | self.boot_data = [] 13 | self.raw_value = None 14 | for res in fit_results: 15 | if hasattr(res, param_name): 16 | value = getattr(res, param_name) 17 | group = res.group 18 | if group == "all": 19 | self.raw_value = value 20 | else: 21 | self.boot_data.append(value) 22 | def get_param_name(self): 23 | return self.param_name 24 | 25 | def get_label_name(self): 26 | return self.label_name 27 | 28 | def get_raw_value(self): 29 | return self.raw_value 30 | 31 | def get_boot_size(self): 32 | """return the size of the bootstrapping data""" 33 | return len(self.boot_data) 34 | 35 | def get_boot_mean(self): 36 | """return mean of the bootstrapping data""" 37 | return numpy.mean(self.boot_data) 38 | 39 | def get_boot_std(self): 40 | """return standard deviation of the bootstrapping data""" 41 | return numpy.std(self.boot_data) 42 | 43 | def get_boot_median(self): 44 | """return median of the bootstrapping data""" 45 | return numpy.median(self.boot_data) 46 | 47 | def get_boot_lower_bound(self): 48 | """return bootstrapping lower bound""" 49 | return numpy.percentile(self.boot_data, 5) 50 | 51 | def get_boot_upper_bound(self): 52 | """return bootstrapping upper bound""" 53 | return numpy.percentile(self.boot_data, 95) 54 | 
55 | def report(self): 56 | value = "" 57 | value += "[%s]\n" % self.get_label_name() 58 | if self.get_raw_value(): 59 | value += "value = %g\n" % self.get_raw_value() 60 | if len(self.boot_data) >= 10: 61 | value += "bootstrapping mean = %g\n" % self.get_boot_mean() 62 | value += "bootstrapping standard deviation = %g\n" % self.get_boot_std() 63 | value += "bootstrapping size = %d\n" % self.get_boot_size() 64 | value += "bootstrapping median = %g\n" % self.get_boot_median() 65 | value += "bootstrapping lower bound (5%%) = %g\n" % \ 66 | self.get_boot_lower_bound() 67 | value += "bootstrapping upper bound (95%%) = %g\n" % \ 68 | self.get_boot_upper_bound() 69 | return value 70 | -------------------------------------------------------------------------------- /collector.go: -------------------------------------------------------------------------------- 1 | package mcorr 2 | 3 | // Collector collect correlation results. 4 | type Collector struct { 5 | m map[string][]*MeanVar 6 | minN int 7 | } 8 | 9 | // NewCollector return a new Collector. 10 | func NewCollector() *Collector { 11 | c := Collector{} 12 | c.m = make(map[string][]*MeanVar) 13 | return &c 14 | } 15 | 16 | // Add add an array of CorrResult. 17 | func (c *Collector) Add(results CorrResults) { 18 | for _, res := range results.Results { 19 | for len(c.m[res.Type]) <= res.Lag { 20 | c.m[res.Type] = append(c.m[res.Type], NewMeanVar()) 21 | } 22 | if res.N > c.minN { 23 | c.m[res.Type][res.Lag].Add(res.Mean) 24 | } 25 | } 26 | } 27 | 28 | // Means return means of a particular type. 29 | func (c *Collector) Means(corrType string) (values []float64) { 30 | for _, mv := range c.MeanVars(corrType) { 31 | values = append(values, mv.Mean()) 32 | } 33 | return 34 | } 35 | 36 | // Vars return variances of a particular type. 
37 | func (c *Collector) Vars(corrType string) (values []float64) { 38 | for _, mv := range c.MeanVars(corrType) { 39 | values = append(values, mv.Variance()) 40 | } 41 | return 42 | } 43 | 44 | // Ns return variances of a particular type. 45 | func (c *Collector) Ns(corrType string) (nums []int) { 46 | for _, mv := range c.MeanVars(corrType) { 47 | nums = append(nums, mv.N()) 48 | } 49 | return 50 | } 51 | 52 | // MeanVars return a list of meanvar.MeanVar. 53 | func (c *Collector) MeanVars(corrType string) (values []*MeanVar) { 54 | return c.m[corrType] 55 | } 56 | 57 | // CorrTypes return all corr types. 58 | func (c *Collector) CorrTypes() (corrTypes []string) { 59 | for key := range c.m { 60 | corrTypes = append(corrTypes, key) 61 | } 62 | return 63 | } 64 | 65 | // Results get results 66 | func (c *Collector) Results() (results []CorrResult) { 67 | // Failed fitting. 68 | if len(c.Means("P2")) == 0 { 69 | return nil 70 | } 71 | 72 | // calculate ks first 73 | ks := c.Means("P2")[0] 74 | results = append(results, 75 | CorrResult{ 76 | Lag: 0, 77 | N: c.Ns("P2")[0], 78 | Type: "Ks", 79 | Mean: c.Means("P2")[0], 80 | Variance: c.Vars("P2")[0], 81 | }) 82 | if ks == 0 { 83 | return 84 | } 85 | 86 | for _, ctype := range c.CorrTypes() { 87 | means := c.Means(ctype) 88 | vars := c.Vars(ctype) 89 | ns := c.Ns(ctype) 90 | for i := 0; i < len(means); i++ { 91 | if !(ctype == "P2" && i == 0) && ns[i] > 0 { 92 | res := CorrResult{} 93 | res.Lag = i 94 | res.N = ns[i] 95 | res.Type = ctype 96 | res.Mean = means[i] / ks 97 | res.Variance = vars[i] / (ks * ks) 98 | results = append(results, res) 99 | } 100 | } 101 | } 102 | 103 | return 104 | } 105 | -------------------------------------------------------------------------------- /cmd/development/mcorr-collect/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io" 7 | "os" 8 | "sort" 9 | 10 | 
"github.com/kussell-lab/mcorr" 11 | "gopkg.in/alecthomas/kingpin.v2" 12 | ) 13 | 14 | func main() { 15 | app := kingpin.New("mcorr-collect", "Collect results.") 16 | app.Version("v0.1") 17 | 18 | alnFile := app.Arg("in", "json file").Required().String() 19 | outFile := app.Arg("out", "Output file in CSV format.").Required().String() 20 | numBoot := app.Flag("num-boot", "Number of bootstrapping on genes").Default("1000").Int() 21 | corrType := app.Flag("corr-type", "correlation type").Default("P4").String() 22 | kingpin.MustParse(app.Parse(os.Args[1:])) 23 | 24 | resChan := readCorrRes(*alnFile) 25 | if *corrType == "P2" { 26 | mcorr.CollectWrite(resChan, *outFile, *numBoot) 27 | } else { 28 | bootstraps := mcorr.Collect(resChan, *numBoot) 29 | 30 | w, err := os.Create(*outFile) 31 | if err != nil { 32 | panic(err) 33 | } 34 | defer w.Close() 35 | 36 | w.WriteString("l,m,v,n,t,b\n") 37 | for _, bs := range bootstraps { 38 | results := bs.Results() 39 | qfactor := getQfactor(results) 40 | for _, res := range results { 41 | if res.Type == "Ks" || (res.Type == "P4" && res.Lag > 0) { 42 | if res.Type == "P4" { 43 | res.Mean *= qfactor 44 | res.Variance *= qfactor * qfactor 45 | res.Type = "P2" 46 | } 47 | w.WriteString(fmt.Sprintf("%d,%g,%g,%d,%s,%s\n", 48 | res.Lag, res.Mean, res.Variance, res.N, res.Type, bs.ID)) 49 | } 50 | } 51 | } 52 | } 53 | 54 | } 55 | 56 | // readCorrRes return a channel of CorrRes 57 | func readCorrRes(filename string) chan mcorr.CorrResults { 58 | c := make(chan mcorr.CorrResults) 59 | go func() { 60 | defer close(c) 61 | f, err := os.Open(filename) 62 | if err != nil { 63 | panic(err) 64 | } 65 | defer f.Close() 66 | 67 | decoder := json.NewDecoder(f) 68 | for { 69 | var corrResults mcorr.CorrResults 70 | if err := decoder.Decode(&corrResults); err != nil { 71 | if err != io.EOF { 72 | panic(err) 73 | } 74 | break 75 | } 76 | c <- corrResults 77 | } 78 | }() 79 | return c 80 | } 81 | 82 | // getQfactor return the q factor between p2 and p4. 
83 | func getQfactor(results []mcorr.CorrResult) float64 { 84 | p2values := make([]float64, 31) 85 | p4values := make([]float64, 31) 86 | for _, res := range results { 87 | if res.Lag <= 30 && res.Lag > 0 { 88 | if res.Type == "P2" { 89 | p2values[res.Lag] = res.Mean 90 | } else if res.Type == "P4" { 91 | p4values[res.Lag] = res.Mean 92 | } 93 | } 94 | } 95 | 96 | var factors []float64 97 | for i := range p2values { 98 | if p2values[i] > 0 && p4values[i] > 0 { 99 | factors = append(factors, p2values[i]/p4values[i]) 100 | } 101 | } 102 | 103 | if len(factors) == 0 { 104 | return 0 105 | } 106 | 107 | sort.Float64s(factors) 108 | if len(factors)%2 == 0 { 109 | return (factors[len(factors)/2] + factors[len(factors)/2-1]) / 2 110 | } 111 | return (factors[len(factors)/2]) 112 | } 113 | -------------------------------------------------------------------------------- /cmd/mcorr-xmfa/mate_calculator.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/kussell-lab/mcorr" 5 | "github.com/kussell-lab/ncbiftp/taxonomy" 6 | ) 7 | 8 | // MateCalculator for calculating correlation for two clusters of sequences. 
9 | type MateCalculator struct { 10 | CodingTable *taxonomy.GeneticCode 11 | MaxCodonLen int 12 | CodonOffset int 13 | CodonPosition int 14 | Synonymous bool 15 | } 16 | 17 | // NewMateCalculator returns a MateCalculator 18 | func NewMateCalculator(codingTable *taxonomy.GeneticCode, maxCodonLen, codonOffset, codonPos int, synonymous bool) *MateCalculator { 19 | return &MateCalculator{ 20 | CodingTable: codingTable, 21 | MaxCodonLen: maxCodonLen, 22 | CodonOffset: codonOffset, 23 | CodonPosition: codonPos, 24 | Synonymous: synonymous, 25 | } 26 | } 27 | 28 | // CalcP2 calcualtes P2 29 | func (cc *MateCalculator) CalcP2(aln1 Alignment, mates ...Alignment) (corrResults mcorr.CorrResults) { 30 | if len(mates) == 0 { 31 | return 32 | } 33 | 34 | var results []mcorr.CorrResult 35 | cs1 := cc.extractCodonSequences(aln1) 36 | cs2 := cc.extractCodonSequences(mates[0]) 37 | 38 | for l := 0; l < cc.MaxCodonLen; l++ { 39 | totalP2 := 0.0 40 | totaln := 0 41 | for pos := 0; pos+l < len(cs1[0]) && pos+l < len(cs2[0]); pos++ { 42 | cpList1 := cc.extractCodonPairs(cs1, pos, pos+l) 43 | cpList2 := cc.extractCodonPairs(cs2, pos, pos+l) 44 | for _, cp1 := range cpList1 { 45 | nc1 := doubleCodons(cp1, cc.CodonPosition) 46 | for _, cp2 := range cpList2 { 47 | nc2 := doubleCodons(cp2, cc.CodonPosition) 48 | if cc.Synonymous { 49 | aa1 := cc.translateCodonPair(cp1[0]) 50 | aa2 := cc.translateCodonPair(cp2[0]) 51 | if aa1 == aa2 { 52 | xy, n := nc1.MateP11(nc2, 0) 53 | totalP2 += xy 54 | totaln += n 55 | } 56 | } else { 57 | xy, n := nc1.MateP11(nc2, 0) 58 | totalP2 += xy 59 | totaln += n 60 | } 61 | } 62 | } 63 | } 64 | if totaln > 0 { 65 | res1 := mcorr.CorrResult{ 66 | Lag: l * 3, 67 | Mean: totalP2 / float64(totaln), 68 | N: totaln, 69 | Type: "P2", 70 | } 71 | results = append(results, res1) 72 | } 73 | } 74 | 75 | corrResults = mcorr.CorrResults{ID: aln1.ID, Results: results} 76 | 77 | return 78 | } 79 | 80 | func (cc *MateCalculator) translateCodonPair(cp CodonPair) string { 81 | a 
:= cc.CodingTable.Table[string(cp.A)] 82 | b := cc.CodingTable.Table[string(cp.B)] 83 | return string([]byte{a, b}) 84 | } 85 | 86 | func (cc *MateCalculator) extractCodonSequences(aln Alignment) (csList []CodonSequence) { 87 | for _, s := range aln.Sequences { 88 | csList = append(csList, extractCodons(s, cc.CodonOffset)) 89 | } 90 | return 91 | } 92 | 93 | func (cc *MateCalculator) extractCodonPairs(codonSequences []CodonSequence, i, j int) [][]CodonPair { 94 | codonPairs := []CodonPair{} 95 | for _, cc := range codonSequences { 96 | if i < len(cc) && j < len(cc) { 97 | pair := CodonPair{A: cc[i], B: cc[j]} 98 | codonPairs = append(codonPairs, pair) 99 | } 100 | } 101 | 102 | if cc.Synonymous { 103 | return synonymousSplit(codonPairs, cc.CodingTable) 104 | } 105 | 106 | return [][]CodonPair{codonPairs} 107 | } 108 | -------------------------------------------------------------------------------- /cmd/mcorr-xmfa-2clades/mate_calculator.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/kussell-lab/mcorr" 5 | "github.com/kussell-lab/ncbiftp/taxonomy" 6 | ) 7 | 8 | // MateCalculator for calculating correlation for two clusters of sequences. 
9 | type MateCalculator struct { 10 | CodingTable *taxonomy.GeneticCode 11 | MaxCodonLen int 12 | CodonOffset int 13 | CodonPosition int 14 | Synonymous bool 15 | } 16 | 17 | // NewMateCalculator returns a MateCalculator 18 | func NewMateCalculator(codingTable *taxonomy.GeneticCode, maxCodonLen, codonOffset, codonPos int, synonymous bool) *MateCalculator { 19 | return &MateCalculator{ 20 | CodingTable: codingTable, 21 | MaxCodonLen: maxCodonLen, 22 | CodonOffset: codonOffset, 23 | CodonPosition: codonPos, 24 | Synonymous: synonymous, 25 | } 26 | } 27 | 28 | // CalcP2 calcualtes P2 29 | func (cc *MateCalculator) CalcP2(aln1 Alignment, mates ...Alignment) (corrResults mcorr.CorrResults) { 30 | if len(mates) == 0 { 31 | return 32 | } 33 | 34 | var results []mcorr.CorrResult 35 | cs1 := cc.extractCodonSequences(aln1) 36 | cs2 := cc.extractCodonSequences(mates[0]) 37 | 38 | for l := 0; l < cc.MaxCodonLen; l++ { 39 | totalP2 := 0.0 40 | totaln := 0 41 | for pos := 0; pos+l < len(cs1[0]) && pos+l < len(cs2[0]); pos++ { 42 | cpList1 := cc.extractCodonPairs(cs1, pos, pos+l) 43 | cpList2 := cc.extractCodonPairs(cs2, pos, pos+l) 44 | for _, cp1 := range cpList1 { 45 | nc1 := doubleCodons(cp1, cc.CodonPosition) 46 | for _, cp2 := range cpList2 { 47 | nc2 := doubleCodons(cp2, cc.CodonPosition) 48 | if cc.Synonymous { 49 | aa1 := cc.translateCodonPair(cp1[0]) 50 | aa2 := cc.translateCodonPair(cp2[0]) 51 | if aa1 == aa2 { 52 | xy, n := nc1.MateP11(nc2, 0) 53 | totalP2 += xy 54 | totaln += n 55 | } 56 | } else { 57 | xy, n := nc1.MateP11(nc2, 0) 58 | totalP2 += xy 59 | totaln += n 60 | } 61 | } 62 | } 63 | } 64 | if totaln > 0 { 65 | res1 := mcorr.CorrResult{ 66 | Lag: l * 3, 67 | Mean: totalP2 / float64(totaln), 68 | N: totaln, 69 | Type: "P2", 70 | } 71 | results = append(results, res1) 72 | } 73 | } 74 | 75 | corrResults = mcorr.CorrResults{ID: aln1.ID, Results: results} 76 | 77 | return 78 | } 79 | 80 | func (cc *MateCalculator) translateCodonPair(cp CodonPair) string { 81 | a 
:= cc.CodingTable.Table[string(cp.A)] 82 | b := cc.CodingTable.Table[string(cp.B)] 83 | return string([]byte{a, b}) 84 | } 85 | 86 | func (cc *MateCalculator) extractCodonSequences(aln Alignment) (csList []CodonSequence) { 87 | for _, s := range aln.Sequences { 88 | csList = append(csList, extractCodons(s, cc.CodonOffset)) 89 | } 90 | return 91 | } 92 | 93 | func (cc *MateCalculator) extractCodonPairs(codonSequences []CodonSequence, i, j int) [][]CodonPair { 94 | codonPairs := []CodonPair{} 95 | for _, cc := range codonSequences { 96 | if i < len(cc) && j < len(cc) { 97 | pair := CodonPair{A: cc[i], B: cc[j]} 98 | codonPairs = append(codonPairs, pair) 99 | } 100 | } 101 | 102 | if cc.Synonymous { 103 | return synonymousSplit(codonPairs, cc.CodingTable) 104 | } 105 | 106 | return [][]CodonPair{codonPairs} 107 | } 108 | -------------------------------------------------------------------------------- /cmd/mcorr-fit/mcorr/lmfitFunctions.py: -------------------------------------------------------------------------------- 1 | """defining lmfit functions for fitCorr.py script""" 2 | from lmfit import Minimizer, Parameters, minimize 3 | import numpy as np 4 | import numdifftools 5 | 6 | def c_s(phi_s, w, f, theta_s, a): 7 | """eq 21""" 8 | c_s = (phi_s*w*f)/(1+theta_s*a+phi_s*w*f) 9 | return c_s 10 | 11 | def d_i(a, theta_i): 12 | "eq 6" 13 | d = theta_i/(1+theta_i*a) 14 | return d 15 | 16 | def c_s0(c_s1, c_s2, l): 17 | "eq 14" 18 | return np.ones(len(l))-c_s1-c_s2 19 | 20 | def c_s1(w, a, phi_s, l, theta_s, f): 21 | "eq 23" 22 | ##for l < f 23 | c_s1less = (2*phi_s*w*l)/(1+2*theta_s*a+phi_s*w*(f+l)) 24 | ## for f >= l 25 | c_s1greater = (2*phi_s*w*l)/(1+2*theta_s*a+phi_s*w) 26 | return np.where(l < f, c_s1less, c_s1greater) 27 | 28 | def c_s2(phi_s, w, f, l, theta_s, a): 29 | "eq 24" 30 | c_s2 = (phi_s*w*(f-l))/(1+2*theta_s*a+phi_s*w*(f+l)) 31 | return np.where(l < f, c_s2, 0) 32 | 33 | def Q_p(theta_p, a, phi_p, w, l): 34 | "eq 25" 35 | Q_p = 
def residual(pars, x, data=None):
    """Residuals of equation 18 for P2 -- the function to be minimized.

    pars: lmfit Parameters holding the model parameters.
    x: array of lags.
    data: observed P2 values; when None the model prediction is returned,
        otherwise model - data.
    """
    # load in parameters from lmfit (only those actually used by eq 18)
    phi_s = pars["phi_s"]
    theta_s = pars["theta_s"]
    theta_p = pars["theta_p"]
    w = pars["w"]
    a = pars["a"]
    phi_p = pars["phi_p"]
    ds = pars["d_s"]
    f = pars["f"]

    # building blocks plugged into eq 18
    d2thetas = d_i(a, 2*theta_s)            # eq 20 evaluated at 2*theta_s
    cs1 = c_s1(w, a, phi_s, x, theta_s, f)  # eq 23
    dp = d_i(a, theta_p)                    # eq 20 evaluated at theta_p
    cs2 = c_s2(phi_s, w, f, x, theta_s, a)  # eq 24
    cs0 = c_s0(cs1, cs2, x)                 # eq 14
    Qp = Q_p(theta_p, a, phi_p, w, x)       # eq 25
    # eq 18
    Qs = cs0*d2thetas*ds + cs1*ds*dp + cs2*Qp
    P2 = Qs/ds
    if data is None:
        return P2
    return P2 - data


def perform_lmfit(x, y, d_sample):
    """Fit the correlation-profile model to (x, y) with lmfit.

    x: lags; y: observed P2 values; d_sample: sample diversity (held fixed).
    Returns the lmfit MinimizerResult.
    """
    pfit = Parameters()
    # free parameters
    pfit.add(name="phi_s", vary=True, min=0, value=1e-4)  # originally had upper bound of 1
    pfit.add(name="f", vary=True, value=7.5e2, min=3, max=1e6)  # originally min=3/max=3e5; value=1e3
    pfit.add(name="theta_s", vary=True, min=0, value=1e-4)
    # fixed parameters
    pfit.add(name="w", value=2.0/3.0, vary=False)
    pfit.add(name="a", value=4.0/3.0, vary=False)
    pfit.add(name="d_s", vary=False, value=d_sample)
    # constrained parameters
    pfit.add(name="c_s", expr="(phi_s*w*f)/(1+theta_s*a+phi_s*w*f)", min=0, max=1)  # eq 21
    pfit.add(name="d_theta_s", expr="theta_s/(1+theta_s*a)", min=0)  # eq 20 for theta_s (for eq 26)
    pfit.add(name="theta_p",
             expr="((1-c_s)*d_theta_s-d_s)/(a*(d_s-d_theta_s)+c_s*(d_theta_s*a-1))", min=0)  # eq 26
    pfit.add(name="phi_p", expr="(theta_p*phi_s)/theta_s", min=0)  # eq. 27
    pfit.add(name="d_theta_p", expr="theta_p/(1+theta_p*a)", min=0)  # eq 20 for theta_p (for outputs)
    # least squares with levenberg-marquardt
    result = minimize(residual, pfit, args=(x,), kws={'data': y}, method="least_squares", max_nfev=1e6)
    return result
compute(buffer, p2arr, p2counts) 38 | buffer = buffer[1:] 39 | } 40 | } 41 | compute(buffer, p2arr, p2counts) 42 | 43 | w, err := os.Create(*outFileArg) 44 | if err != nil { 45 | panic(err) 46 | } 47 | defer w.Close() 48 | w.WriteString("l,m,n,v,t,b\n") 49 | for k := 0; k < len(p2arr); k++ { 50 | var m float64 51 | var n int64 52 | var t string 53 | n = p2counts[k] 54 | if k == 0 { 55 | m = p2arr[0] / float64(p2counts[0]) 56 | t = "Ks" 57 | } else { 58 | m = p2arr[k] / p2arr[0] 59 | t = "P2" 60 | } 61 | if n > 0 { 62 | w.WriteString(fmt.Sprintf("%d,%g,0,%d,%s,all\n", k, m, n, t)) 63 | } 64 | } 65 | } 66 | 67 | // Compute calculates correlation function. 68 | func compute(buffer []VCFRecord, p2arr []float64, p2counts []int64) { 69 | for i := 0; i < len(buffer); i++ { 70 | nc := mcorr.NewNuclCov([]byte{'0', '1'}) 71 | for k := 0; k < len(buffer[0].GTs); k++ { 72 | nc.Add(buffer[0].GTs[k], buffer[i].GTs[k]) 73 | } 74 | lag := buffer[i].Pos - buffer[0].Pos 75 | xy, n := nc.P11(0) 76 | p2arr[lag] += xy / float64(n) 77 | p2counts[lag]++ 78 | } 79 | } 80 | 81 | // readVCF return a channel of VCF record. 
82 | func readVCF(filename string) (c chan VCFRecord) { 83 | c = make(chan VCFRecord) 84 | go func() { 85 | defer close(c) 86 | f, err := os.Open(filename) 87 | if err != nil { 88 | panic(err) 89 | } 90 | defer f.Close() 91 | 92 | rd := bufio.NewReader(f) 93 | for { 94 | line, err := rd.ReadString('\n') 95 | if err != nil { 96 | if err != io.EOF { 97 | panic(err) 98 | } 99 | break 100 | } 101 | if line[0] == '#' { 102 | continue 103 | } 104 | 105 | line = strings.TrimSpace(line) 106 | terms := strings.Split(line, "\t") 107 | var rec VCFRecord 108 | rec.Chrom = terms[0] 109 | rec.Pos = atoi(terms[1]) 110 | rec.Ref = terms[3] 111 | rec.Alt = terms[4] 112 | if len(rec.Alt) == 1 && len(rec.Ref) == 1 { 113 | inGT := false 114 | for _, t := range terms { 115 | if t == "GT" { 116 | inGT = true 117 | } else if inGT { 118 | for _, gt := range t { 119 | if gt != '|' { 120 | rec.GTs = append(rec.GTs, byte(gt)) 121 | } 122 | } 123 | } 124 | } 125 | c <- rec 126 | } 127 | } 128 | }() 129 | return 130 | } 131 | 132 | func atoi(s string) int { 133 | v, err := strconv.Atoi(s) 134 | if err != nil { 135 | panic(err) 136 | } 137 | return v 138 | } 139 | -------------------------------------------------------------------------------- /cmd/mcorr-fit/mcorr/fit_data.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | class FitData(object): 4 | """Fitting data""" 5 | def __init__(self, group, xvalues, yvalues, d_sample): 6 | self.group = group 7 | self.xvalues = xvalues 8 | self.yvalues = yvalues 9 | self.d_sample = d_sample 10 | 11 | class FitDatas(object): 12 | """Fitting data""" 13 | def __init__(self, corr_results, fit_start, xmax): 14 | corr_map = {} 15 | groups = [] 16 | for row in corr_results: 17 | rows = corr_map.get(row.group, []) 18 | if len(rows) == 0: 19 | groups.append(row.group) 20 | rows.append(row) 21 | corr_map[row.group] = rows 22 | fitdata_map = {} 23 | for group, items in corr_map.items(): 24 | xvalues, 
yvalues, d_sample = prepare_fitting_data( 25 | items, fit_start, xmax) 26 | fitdata_map[group] = FitData(group, xvalues, yvalues, d_sample) 27 | self.fitdata_dict = fitdata_map 28 | self.groups = groups 29 | def has(self, group): 30 | """return True if the group is in the data""" 31 | return group in self.fitdata_dict 32 | 33 | def get(self, group): 34 | """return fit data""" 35 | fitdata = self.fitdata_dict.get(group, None) 36 | return fitdata 37 | def getall(self): 38 | """return all""" 39 | return [self.fitdata_dict[group] for group in self.groups] 40 | 41 | def prepare_fitting_data(fitdata, fit_start, xmax): 42 | """Prepare fitting xvalues and yvalues""" 43 | xvalues = [] 44 | yvalues = [] 45 | diver = 0 46 | for row in fitdata: 47 | if row.corrtype == 'P2' and row.lag >= fit_start and row.lag <= xmax: 48 | xvalues.append(row.lag) 49 | yvalues.append(row.value) 50 | elif row.corrtype == 'Ks': 51 | diver = row.value 52 | xvalues = numpy.array(xvalues) 53 | yvalues = numpy.array(yvalues) 54 | return (xvalues, yvalues, diver) 55 | 56 | class FitGeneDatas(object): 57 | """Fitting data""" 58 | def __init__(self, corr_results, fit_start, xmax): 59 | corr_map = {} 60 | groups = [] 61 | for row in corr_results: 62 | rows = corr_map.get(row.group, []) 63 | if len(rows) == 0: 64 | groups.append(row.group) 65 | rows.append(row) 66 | corr_map[row.group] = rows 67 | fitdata_map = {} 68 | for group, items in corr_map.items(): 69 | xvalues, yvalues, d_sample = prepare_fitting_genedata( 70 | items, fit_start, xmax) 71 | fitdata_map[group] = FitData(group, xvalues, yvalues, d_sample) 72 | self.fitdata_dict = fitdata_map 73 | self.groups = groups 74 | def has(self, group): 75 | """return True if the group is in the data""" 76 | return group in self.fitdata_dict 77 | 78 | def get(self, group): 79 | """return fit data""" 80 | fitdata = self.fitdata_dict.get(group, None) 81 | return fitdata 82 | def getall(self): 83 | """return all""" 84 | return [self.fitdata_dict[group] for group 
in self.groups] 85 | 86 | def prepare_fitting_genedata(fitdata, fit_start, xmax): 87 | """Prepare fitting xvalues and yvalues""" 88 | xvalues = [] 89 | yvalues = [] 90 | diver = 0 91 | for row in fitdata: 92 | if row.corrtype == 'P2' and row.lag >= fit_start and row.lag <= xmax: 93 | xvalues.append(row.lag) 94 | yvalues.append(row.value) 95 | elif row.lag == 0: 96 | diver = row.value 97 | xvalues = numpy.array(xvalues) 98 | yvalues = numpy.array(yvalues) 99 | return (xvalues, yvalues, diver) 100 | 101 | 102 | -------------------------------------------------------------------------------- /cmd/mcorr-bam/codon.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/kussell-lab/ncbiftp/taxonomy" 5 | ) 6 | 7 | // Codon stores a codon value, the position in a genome and the read id. 8 | type Codon struct { 9 | Seq string 10 | ReadID string 11 | GenePos int 12 | } 13 | 14 | // ContainsGap return true if '-' in a sequence. 15 | func (c Codon) ContainsGap() bool { 16 | for _, b := range c.Seq { 17 | if b == '-' { 18 | return true 19 | } 20 | } 21 | return false 22 | } 23 | 24 | // CodonPile stores a pile of Codon, which are at a particular genome position. 25 | type CodonPile struct { 26 | genePos int 27 | codonMap map[string]Codon 28 | } 29 | 30 | // NewCodonPile return a new CodonPile. 31 | func NewCodonPile() *CodonPile { 32 | return &CodonPile{codonMap: make(map[string]Codon)} 33 | } 34 | 35 | // Add appends a new Codon. 36 | func (cp *CodonPile) Add(c Codon) { 37 | cp.genePos = c.GenePos 38 | cp.codonMap[c.ReadID] = c 39 | } 40 | 41 | // LookUp search a codon by ReadName. If not found, it returns nil. 42 | func (cp *CodonPile) LookUp(readID string) Codon { 43 | return cp.codonMap[readID] 44 | } 45 | 46 | // Len return the lenght of pileup Codons. 47 | func (cp *CodonPile) Len() int { 48 | return len(cp.codonMap) 49 | } 50 | 51 | // GenePos return the gene position. 
52 | func (cp *CodonPile) GenePos() int { 53 | return cp.genePos 54 | } 55 | 56 | // CodonGene represents a gene with an array of CodonPile. 57 | type CodonGene struct { 58 | CodonPiles []*CodonPile 59 | } 60 | 61 | // NewCodonGene return a new CodonGene. 62 | func NewCodonGene() *CodonGene { 63 | return &CodonGene{} 64 | } 65 | 66 | // AddCodon add a codon. 67 | func (cg *CodonGene) AddCodon(c Codon) { 68 | for len(cg.CodonPiles) <= c.GenePos { 69 | cg.CodonPiles = append(cg.CodonPiles, NewCodonPile()) 70 | } 71 | cg.CodonPiles[c.GenePos].Add(c) 72 | } 73 | 74 | // DepthAt return the pile depth at position i. 75 | func (cg *CodonGene) DepthAt(i int) int { 76 | if len(cg.CodonPiles) <= i { 77 | return 0 78 | } 79 | return cg.CodonPiles[i].Len() 80 | } 81 | 82 | // Len returns length of CodonPile array. 83 | func (cg *CodonGene) Len() int { 84 | return len(cg.CodonPiles) 85 | } 86 | 87 | // CodonPair stores a pair of Codon 88 | type CodonPair struct { 89 | A, B Codon 90 | } 91 | 92 | // PairCodonAt pairs codons at positions i and j. 93 | func (cg *CodonGene) PairCodonAt(i, j int) (pairs []CodonPair) { 94 | if i >= len(cg.CodonPiles) || j >= len(cg.CodonPiles) { 95 | return 96 | } 97 | 98 | if i > j { 99 | j, i = i, j 100 | } 101 | 102 | pile1 := cg.CodonPiles[i] 103 | if i == j { 104 | for _, codon := range pile1.codonMap { 105 | pairs = append(pairs, CodonPair{A: codon, B: codon}) 106 | } 107 | } 108 | pile2 := cg.CodonPiles[j] 109 | for readID, codon1 := range pile1.codonMap { 110 | codon2 := pile2.LookUp(readID) 111 | if codon2.ReadID != "" { 112 | pairs = append(pairs, CodonPair{A: codon1, B: codon2}) 113 | } 114 | } 115 | return 116 | } 117 | 118 | // SynoumousSplitCodonPairs split codon pairs into synoumous pairs. 
119 | func SynoumousSplitCodonPairs(codonPairs []CodonPair, codeTable *taxonomy.GeneticCode) [][]CodonPair { 120 | var splittedPairs [][]CodonPair 121 | var aaArray []string 122 | for _, codonPair := range codonPairs { 123 | hasGap := false 124 | for _, codon := range []Codon{codonPair.A, codonPair.B} { 125 | for _, b := range codon.Seq { 126 | if !isATGC(byte(b)) { 127 | hasGap = true 128 | break 129 | } 130 | } 131 | if hasGap { 132 | break 133 | } 134 | } 135 | 136 | if hasGap { 137 | continue 138 | } 139 | 140 | a := codeTable.Table[codonPair.A.Seq] 141 | b := codeTable.Table[codonPair.B.Seq] 142 | ab := string([]byte{a, b}) 143 | index := -1 144 | for i, aa := range aaArray { 145 | if aa == ab { 146 | index = i 147 | } 148 | } 149 | if index == -1 { 150 | index = len(aaArray) 151 | aaArray = append(aaArray, ab) 152 | splittedPairs = append(splittedPairs, []CodonPair{}) 153 | } 154 | splittedPairs[index] = append(splittedPairs[index], codonPair) 155 | } 156 | return splittedPairs 157 | } 158 | -------------------------------------------------------------------------------- /cmd/mcorr-fit/old/fitCorr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter 4 | #from mcorr import fit_p2 5 | from mcorr import fit_p2, read_corr, FitDatas, \ 6 | write_fitting_results, plot_fit, plot_params, write_fitting_reports, \ 7 | geom_r1, const_r1 8 | from mcorr.fit import fit_model, vary_fit 9 | from lmfit import fit_report 10 | import csv 11 | import pandas as pd 12 | #from lmfit.printfuncs import report_fit 13 | from mcorr.lmfitFunctions import perform_lmfit 14 | 15 | def main(): 16 | """Run fitting using lmfit, and generate output files and plots""" 17 | parser = ArgumentParser( 18 | formatter_class=ArgumentDefaultsHelpFormatter, 19 | description="Fit the actual data (not the bootstraps) and return goodness-of fit stats") 20 | 
parser.add_argument("corr_file", type = str, help='correlation input file') 21 | parser.add_argument("output_prefix", type=str, help='output file prefix') 22 | parser.add_argument('--fit_start', type=int, default=3, 23 | help='fitting range starts at') 24 | parser.add_argument('--fit_end', type=int, default=300, 25 | help='fitting range ends at') 26 | parser.add_argument("--use_geom_frag", action="store_true", 27 | help='use geometric distribution for fragment sizes') 28 | parser.add_argument('--quiet', action="store_true") 29 | parser.add_argument("--title", type=str, help="plot title", default="") 30 | opts = parser.parse_args() 31 | corr_file = opts.corr_file 32 | prefix = opts.output_prefix 33 | fit_start = opts.fit_start 34 | fit_end = opts.fit_end 35 | quiet = opts.quiet 36 | use_geom_frag = opts.use_geom_frag 37 | title = opts.title 38 | 39 | ##for testing fixes 40 | # dir = '/Volumes/aps_timemachine/recombo/APS160.5_lmfit/cluster8_cluster221' 41 | # corr_file = os.path.join(dir, 'cluster8_cluster221_CORE_XMFA_OUT.csv') 42 | # prefix = 'cluster8_cluster221_CORE_FIT_OUT_0205test' 43 | # fit_start = 3 44 | # fit_end = 300 45 | # quiet = False 46 | # use_geom_frag = False 47 | # title="" 48 | 49 | # read correlation results and prepare fitting data 50 | corr_results = read_corr(corr_file) 51 | fitdatas = FitDatas(corr_results, fit_start, fit_end) 52 | ##do fitting 53 | r1_func = const_r1 54 | #if you want to use a geometric distribution of fragments 55 | if use_geom_frag: 56 | r1_func = geom_r1 57 | 58 | all = fitdatas.get("all") 59 | x = all.xvalues 60 | y = all.yvalues 61 | d_sample = all.d_sample 62 | fitres = perform_lmfit(x, y, d_sample) 63 | ## write a fit report as generated by lmfit (includes chi-squared, uncertainties, etc) 64 | params = fitres.params.valuesdict() 65 | thetaS = fitres.params["theta_s"] 66 | phiS = fitres.params["phi_s"] 67 | f = fitres.params["f"] 68 | lmfitfile = prefix + "_lmfit_report.csv" 69 | with open(lmfitfile, "w+") as csvfile: 
70 | lmfit_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) 71 | lmfit_writer.writerow(["fit_success", fitres.success]) 72 | lmfit_writer.writerow(["function_evals", fitres.nfev]) 73 | lmfit_writer.writerow(["data_points", fitres.ndata]) 74 | lmfit_writer.writerow(["variables", fitres.nvarys]) 75 | lmfit_writer.writerow(["message", fitres.message]) 76 | lmfit_writer.writerow(["thetaS (init)", thetaS.init_value]) 77 | lmfit_writer.writerow(["f (init)", f.init_value]) 78 | lmfit_writer.writerow(["phiS (init)", phiS.init_value]) 79 | lmfit_writer.writerow([""]) 80 | lmfit_writer.writerow(["d_s", "theta_s", "f", "phi_s", 81 | "theta_p", "phi_p", "c", "d_theta_p", 82 | "d_theta_s", "chisq", "red-chisq"]) 83 | lmfit_writer.writerow([params["d_s"], params["theta_s"], params["f"], params["phi_s"], 84 | params["theta_p"], params["phi_p"], params["c_s"], params["d_theta_p"], 85 | params["d_theta_s"], fitres.chisqr, fitres.redchi]) 86 | ##save the residuals as a .csv file 87 | residuals = fitres.residual 88 | resdat = pd.DataFrame(residuals) 89 | resdat.to_csv(prefix+"_residuals.csv", header=None) 90 | ##plot the best fit and the residuals 91 | best_fit_file = prefix + "_best_fit.svg" 92 | plot_fit(all, fitres, best_fit_file, title=title) 93 | 94 | if __name__ == "__main__": 95 | main() 96 | 97 | 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /cmd/mcorr-xmfa/coding_calculator.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/kussell-lab/biogo/seq" 5 | "github.com/kussell-lab/mcorr" 6 | "github.com/kussell-lab/ncbiftp/taxonomy" 7 | ) 8 | 9 | // Calculator define a interface for calculating correlations. 10 | type Calculator interface { 11 | CalcP2(a Alignment, others ...Alignment) (corrResults mcorr.CorrResults) 12 | } 13 | 14 | // CodingCalculator for calculating coding sequences. 
// CodingCalculator computes correlation profiles (P2) over protein-coding
// alignments, comparing nucleotides at a fixed position within each codon.
type CodingCalculator struct {
	CodingTable   *taxonomy.GeneticCode // genetic code used to translate codons
	MaxCodonLen   int                   // number of codon lags to compute (lag runs 0..MaxCodonLen-1)
	CodonOffset   int                   // nucleotide offset of the first codon in each sequence
	CodonPosition int                   // position within the codon to compare (0-based)
	Synonymous    bool                  // if true, restrict to synonymous codon pairs
}

// NewCodingCalculator return a CodingCalculator.
func NewCodingCalculator(codingTable *taxonomy.GeneticCode, maxCodonLen, codonOffset int, codonPosition int, synonymous bool) *CodingCalculator {
	return &CodingCalculator{
		CodingTable:   codingTable,
		MaxCodonLen:   maxCodonLen,
		CodonOffset:   codonOffset,
		CodonPosition: codonPosition,
		Synonymous:    synonymous,
	}
}

// CalcP2 calculate P2 for one alignment.
// NOTE(review): the variadic others parameter is accepted but never used here —
// presumably required by the Calculator interface; confirm against callers.
func (cc *CodingCalculator) CalcP2(a Alignment, others ...Alignment) mcorr.CorrResults {
	results := calcP2Coding(a, cc.CodonOffset, cc.CodonPosition, cc.MaxCodonLen, cc.CodingTable, cc.Synonymous)
	return mcorr.CorrResults{ID: a.ID, Results: results}
}

// calcP2Coding computes, for each codon lag l in [0, maxCodonLen), the mean
// P2 value over all codon-pair columns of the alignment. For each column pair
// (i, i+l) it collects one CodonPair per sequence, optionally splits them into
// synonymous groups, accumulates NuclCov.P11 over groups with at least two
// members, and emits a CorrResult with Lag = l*3 (lag in nucleotides).
func calcP2Coding(aln Alignment, codonOffset, codonPosition, maxCodonLen int, codingTable *taxonomy.GeneticCode, synonymous bool) (results []mcorr.CorrResult) {
	// Convert every aligned sequence into its codon list.
	codonSequences := [][]Codon{}
	for _, s := range aln.Sequences {
		codons := extractCodons(s, codonOffset)
		codonSequences = append(codonSequences, codons)
	}

	for l := 0; l < maxCodonLen; l++ {
		totalP2 := 0.0 // numerator: sum of P11 contributions at this lag
		totaln := 0    // denominator: number of pairs counted at this lag

		// Iterate over codon positions; the bound uses the first sequence's
		// length, and shorter sequences are guarded again below.
		for i := 0; i+l < len(codonSequences[0]); i++ {
			codonPairs := []CodonPair{}
			j := i + l
			for _, cc := range codonSequences {
				if i+l < len(cc) {
					codonPairs = append(codonPairs, CodonPair{A: cc[i], B: cc[j]})
				}
			}

			// When restricted to synonymous changes, group pairs by their
			// translated amino-acid pair; otherwise use one group with all pairs.
			multiCodonPairs := [][]CodonPair{}
			if synonymous {
				multiCodonPairs = synonymousSplit(codonPairs, codingTable)
			} else {
				multiCodonPairs = append(multiCodonPairs, codonPairs)
			}
			for _, codonPairs := range multiCodonPairs {
				// Need at least two pairs to form cross-sequence comparisons.
				if len(codonPairs) >= 2 {
					nc := doubleCodons(codonPairs, codonPosition)
					xy, n := nc.P11(0)
					totalP2 += xy
					totaln += n

				}
			}
		}

		// Only emit a result when something was counted at this lag.
		if totaln > 0 {
			res1 := mcorr.CorrResult{
				Lag:  l * 3,
				Mean: totalP2 / float64(totaln),
				N:    totaln,
				Type: "P2",
			}
			results = append(results, res1)
		}
	}

	return
}

// doubleCodons builds a NuclCov over the ATGC alphabet from the nucleotides
// found at codonPosition of each codon pair.
func doubleCodons(codonPairs []CodonPair, codonPosition int) *mcorr.NuclCov {
	alphabet := []byte{'A', 'T', 'G', 'C'}
	c := mcorr.NewNuclCov(alphabet)
	for _, codonPair := range codonPairs {
		a := codonPair.A[codonPosition]
		b := codonPair.B[codonPosition]
		c.Add(a, b)
	}
	return c
}

// Codon is a byte list of length 3.
type Codon []byte

// CodonSequence is a sequence of codons.
type CodonSequence []Codon

// CodonPair is a pair of Codons.
type CodonPair struct {
	A, B Codon
}

// extractCodons return a list of codons from a DNA sequence, starting at
// offset and taking whole triplets only (a trailing partial codon is dropped).
// Each Codon aliases the underlying sequence bytes (no copy).
func extractCodons(s seq.Sequence, offset int) (codons []Codon) {
	for i := offset; i+3 <= len(s.Seq); i += 3 {
		c := s.Seq[i:(i + 3)]
		codons = append(codons, c)
	}
	return
}
def main():
    """Run fitting using lmfit, and generate output files and plots.

    Workflow:
      1. Read the correlation-profile CSV produced by mcorr-xmfa/mcorr-bam.
      2. Fit the recombination model to the pooled ("all") profile and write
         an lmfit goodness-of-fit report CSV.
      3. Fit every group (data + bootstraps), then write the fit-results CSV,
         the best-fit plot, and the bootstrapping report.
    """
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description="Infer recombination rates\
            by fitting correlation profile of mutations.")
    parser.add_argument("corr_file", type = str, help='correlation input file')
    parser.add_argument("output_prefix", type=str, help='output file prefix')
    parser.add_argument('--fit_start', type=int, default=3,
                        help='fitting range starts at')
    parser.add_argument('--fit_end', type=int, default=300,
                        help='fitting range ends at')
    parser.add_argument("--use_geom_frag", action="store_true",
                        help='use geometric distribution for fragment sizes')
    parser.add_argument('--quiet', action="store_true")
    parser.add_argument("--title", type=str, help="plot title", default="")
    opts = parser.parse_args()
    corr_file = opts.corr_file
    prefix = opts.output_prefix
    fit_start = opts.fit_start
    fit_end = opts.fit_end
    quiet = opts.quiet
    use_geom_frag = opts.use_geom_frag
    title = opts.title

    # read correlation results and prepare fitting data
    corr_results = read_corr(corr_file)
    fitdatas = FitDatas(corr_results, fit_start, fit_end)

    # BUG FIX: choose the fragment-size model BEFORE any fitting so that
    # --use_geom_frag also applies to the lmfit report fit below.
    # (Previously r1_func was only switched to geom_r1 after fit_model had
    # already run with const_r1, silently ignoring the flag for the report.)
    r1_func = geom_r1 if use_geom_frag else const_r1

    # Fit the pooled ("all") profile and write a fit report as generated by
    # lmfit (includes chi-squared, uncertainties, etc).
    all_data = fitdatas.get("all")  # renamed from `all` (shadowed the builtin)
    actualdata = fit_model(all_data.xvalues, all_data.yvalues, all_data.d_sample, r1_func)
    params = actualdata.params.valuesdict()
    thetaS = actualdata.params["thetaS"]
    phiS = actualdata.params["phiS"]
    f = actualdata.params["f"]
    lmfitfile = prefix + "_lmfit_report.csv"
    with open(lmfitfile, "w+") as csvfile:
        lmfit_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        lmfit_writer.writerow(["fit_success", actualdata.success])
        lmfit_writer.writerow(["function_evals", actualdata.nfev])
        lmfit_writer.writerow(["data_points", actualdata.ndata])
        lmfit_writer.writerow(["variables", actualdata.nvarys])
        lmfit_writer.writerow(["message", actualdata.message])
        lmfit_writer.writerow(["thetaS (init)", thetaS.init_value])
        lmfit_writer.writerow(["f (init)", f.init_value])
        lmfit_writer.writerow(["phiS (init)", phiS.init_value])
        lmfit_writer.writerow([""])
        lmfit_writer.writerow(["d_s", "theta_s", "f", "phi_s",
                               "theta_p", "phi_p", "c", "d_theta_p",
                               "d_theta_s", "chisq", "red-chisq"])
        lmfit_writer.writerow([params["ds"], params["thetaS"], params["f"], params["phiS"],
                               params["thetaP"], params["phiP"], params["c"], params["dp"],
                               params["dc"], actualdata.chisqr, actualdata.redchi])

    # do fitting for every group (data and bootstraps)
    fit_results = fit_p2(fitdatas, r1_func=r1_func, disable_progress_bar=quiet)
    # parameters to report
    model_params = ["group", "d_sample", "theta_pool",
                    "phi_pool", "ratio", "fbar", "c", "d_pool",
                    "d_clonal", 'theta_s', 'phi_s']
    # save fitting results into csv file
    csv_file = prefix + "_fit_results.csv"
    write_fitting_results(fit_results, model_params, csv_file)
    # plot the best fit (the result whose group is "all")
    best_fit_file = prefix + "_best_fit.svg"
    fitdata = fitdatas.get("all")
    fitres = None
    for res in fit_results:
        if res.group == "all":
            fitres = res
            break
    if fitres is not None:
        plot_fit(fitdata, fitres, best_fit_file, title=title)
    # write fitting report for bootstrapping
    report_file = prefix + "_bootstrapping_report.txt"
    write_fitting_reports(fit_results, model_params[1:7], report_file)

    # plot histogram of fitted parameters
    # temporarily taking this out because it is problematic when bins for bootstraps are not well-determined
    # params_hist_file = prefix + "_parameter_histograms.svg"
    # plot_params(fit_results, model_params[1:7], params_hist_file)
def main():
    """Fit just the data (no bootstraps) using the method from the Nature
    Methods paper.

    Reads a correlation-profile CSV, fits the pooled ("all") profile with a
    configurable lmfit method and evaluation budget, writes an lmfit report
    CSV, and — only if the fit succeeded — writes the residuals CSV and the
    best-fit plot.
    """
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description="Fit just data (no bootstraps) with fitting setup from Nature Methods")
    parser.add_argument("corr_file", type = str, help='correlation input file')
    parser.add_argument("output_prefix", type=str, help='output file prefix')
    parser.add_argument('--fit_start', type=int, default=3,
                        help='fitting range starts at')
    parser.add_argument('--fit_end', type=int, default=300,
                        help='fitting range ends at')
    parser.add_argument("--use_geom_frag", action="store_true",
                        help='use geometric distribution for fragment sizes')
    parser.add_argument("--fit_method", type=str, default="least_squares", help="lmfit method (see lmfit documentation)")
    parser.add_argument("--max_nfev", type=int, default=int(1e6),
                        help='max number of function evaluations before lmfit quits')
    parser.add_argument('--quiet', action="store_true")
    parser.add_argument("--title", type=str, help="plot title", default="")
    opts = parser.parse_args()
    corr_file = opts.corr_file
    prefix = opts.output_prefix
    fit_start = opts.fit_start
    fit_end = opts.fit_end
    quiet = opts.quiet  # note: parsed but not used in this entry point
    use_geom_frag = opts.use_geom_frag
    title = opts.title
    fit_method = opts.fit_method
    max_nfev = opts.max_nfev

    # read correlation results and prepare fitting data
    corr_results = read_corr(corr_file)
    fitdatas = FitDatas(corr_results, fit_start, fit_end)
    ##do fitting
    r1_func = const_r1
    #if you want to use a geometric distribution of fragments
    if use_geom_frag:
        r1_func = geom_r1

    # pooled profile over all groups; `all` shadows the builtin of the same name
    all = fitdatas.get("all")
    x = all.xvalues
    y = all.yvalues
    d_sample = all.d_sample
    # fit with explicit lmfit method and function-evaluation cap
    fitres = fit_modelopts(x, y, d_sample, r1_func, max_nfev, fit_method)
    #fitres = fit_model(x, y, d_sample, r1_func)
    ## write a fit report as generated by lmfit (includes chi-squared, uncertainties, etc)
    params = fitres.params.valuesdict()
    thetaS = fitres.params["thetaS"]
    phiS = fitres.params["phiS"]
    f = fitres.params["f"]
    lmfitfile = prefix + "_lmfit_report.csv"
    with open(lmfitfile, "w+") as csvfile:
        lmfit_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        lmfit_writer.writerow(["fit_success", fitres.success])
        lmfit_writer.writerow(["function_evals", fitres.nfev])
        lmfit_writer.writerow(["data_points", fitres.ndata])
        lmfit_writer.writerow(["variables", fitres.nvarys])
        lmfit_writer.writerow(["message", fitres.message])
        lmfit_writer.writerow(["thetaS (init)", thetaS.init_value])
        lmfit_writer.writerow(["f (init)", f.init_value])
        lmfit_writer.writerow(["phiS (init)", phiS.init_value])
        lmfit_writer.writerow([""])
        lmfit_writer.writerow(["d_s", "theta_s", "f", "phi_s",
                               "theta_p", "phi_p", "c", "d_theta_p",
                               "d_theta_s", "chisq", "red-chisq"])
        lmfit_writer.writerow([params["ds"], params["thetaS"], params["f"], params["phiS"],
                               params["thetaP"], params["phiP"], params["c"], params["dp"],
                               params["dc"], fitres.chisqr, fitres.redchi])
    ##save the residuals as a .csv file (only on a successful fit)
    if fitres.success:
        residuals = fitres.residual
        resdat = pd.DataFrame(residuals)
        resdat.to_csv(prefix+"_residuals.csv", header=None)
        #plot the best fit and the residuals
        best_fit_file = prefix + "_best_fit.svg"
        plot_fit(all, fitres, best_fit_file, title=title)
    else:
        print("Fitting failed for %s" % corr_file)
type=float, help="set initial value for thetaS in the fitting", default=0.00001) 32 | parser.add_argument("--phiS_max", type=float, help="set initial value for f in the fitting", default=1.0) 33 | opts = parser.parse_args() 34 | corr_file = opts.corr_file 35 | prefix = opts.output_prefix 36 | fit_start = opts.fit_start 37 | fit_end = opts.fit_end 38 | quiet = opts.quiet 39 | use_geom_frag = opts.use_geom_frag 40 | title = opts.title 41 | f_init = opts.f_init 42 | thetaS_init = opts.thetaS_init 43 | phiS_init = opts.phiS_init 44 | phiS_max = opts.phiS_max 45 | 46 | ##for testing fixes 47 | # dir = '/Volumes/aps_timemachine/recombo/APS160.5_lmfit/cluster8_cluster221' 48 | # corr_file = os.path.join(dir, 'cluster8_cluster221_CORE_XMFA_OUT.csv') 49 | # prefix = 'cluster8_cluster221_CORE_FIT_OUT' 50 | # fit_start = 3 51 | # fit_end = 300 52 | # quiet = False 53 | # use_geom_frag = False 54 | # title="" 55 | 56 | # read correlation results and prepare fitting data 57 | corr_results = read_corr(corr_file) 58 | fitdatas = FitDatas(corr_results, fit_start, fit_end) 59 | ##do fitting 60 | r1_func = const_r1 61 | #if you want to use a geometric distribution of fragments 62 | if use_geom_frag: 63 | r1_func = geom_r1 64 | 65 | all = fitdatas.get("all") 66 | fitres = vary_fit(all.xvalues, all.yvalues, all.d_sample, r1_func, f_init, thetaS_init, phiS_init, phiS_max) 67 | # lmfitreport=fit_report(fitres) 68 | # print(lmfitreport) 69 | ## write a fit report as generated by lmfit (includes chi-squared, uncertainties, etc) 70 | params = fitres.params.valuesdict() 71 | thetaS = fitres.params["thetaS"] 72 | phiS = fitres.params["phiS"] 73 | f = fitres.params["f"] 74 | lmfitfile = prefix + "_lmfit_report.csv" 75 | with open(lmfitfile, "w+") as csvfile: 76 | lmfit_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) 77 | lmfit_writer.writerow(["fit_success", fitres.success]) 78 | lmfit_writer.writerow(["function_evals", fitres.nfev]) 79 | 
lmfit_writer.writerow(["data_points", fitres.ndata]) 80 | lmfit_writer.writerow(["variables", fitres.nvarys]) 81 | lmfit_writer.writerow(["message", fitres.message]) 82 | lmfit_writer.writerow(["thetaS (init)", thetaS.init_value]) 83 | lmfit_writer.writerow(["f (init)", f.init_value]) 84 | lmfit_writer.writerow(["phiS (init)", phiS.init_value]) 85 | lmfit_writer.writerow([""]) 86 | lmfit_writer.writerow(["ds", "thetaS", "f", "phiS", "thetaP", "phiP", "c", 87 | "dp", "dc", "chisq", "red-chisq"]) 88 | lmfit_writer.writerow([params["ds"], params["thetaS"], params["f"], params["phiS"], 89 | params["thetaP"], params["phiP"], 90 | params["c"], params["dp"], params["dc"], fitres.chisqr, fitres.redchi]) 91 | ##save the residuals as a .csv file 92 | residuals = fitres.residual 93 | resdat = pd.DataFrame(residuals) 94 | resdat.to_csv(prefix+"_residuals.csv", header=None) 95 | ##plot the best fit and the residuals 96 | best_fit_file = prefix + "_best_fit.svg" 97 | plot_fit(all, fitres, best_fit_file, title=title) 98 | 99 | if __name__ == "__main__": 100 | main() 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /cmd/mcorr-xmfa-2clades/coding_calculator.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/kussell-lab/biogo/seq" 5 | "github.com/kussell-lab/mcorr" 6 | "github.com/kussell-lab/ncbiftp/taxonomy" 7 | ) 8 | 9 | // Calculator define a interface for calculating correlations. 10 | type Calculator interface { 11 | CalcP2(a Alignment, others ...Alignment) (corrResults mcorr.CorrResults) 12 | } 13 | 14 | // CodingCalculator for calculating coding sequences. 
// CodingCalculator for calculating coding sequences.
type CodingCalculator struct {
	CodingTable   *taxonomy.GeneticCode // genetic code used to translate codons
	MaxCodonLen   int                   // number of codon lags to compute (lag runs 0..MaxCodonLen-1)
	CodonOffset   int                   // nucleotide offset of the first codon in each sequence
	CodonPosition int                   // position within the codon to compare (0-based)
	Synonymous    bool                  // if true, restrict to synonymous codon pairs
}

// NewCodingCalculator return a CodingCalculator.
func NewCodingCalculator(codingTable *taxonomy.GeneticCode, maxCodonLen, codonOffset int, codonPosition int, synonymous bool) *CodingCalculator {
	return &CodingCalculator{
		CodingTable:   codingTable,
		MaxCodonLen:   maxCodonLen,
		CodonOffset:   codonOffset,
		CodonPosition: codonPosition,
		Synonymous:    synonymous,
	}
}

// CalcP2 calculate P2.
// NOTE(review): the variadic others parameter is accepted but never used here —
// presumably required by the Calculator interface; confirm against callers.
func (cc *CodingCalculator) CalcP2(a Alignment, others ...Alignment) mcorr.CorrResults {
	results := calcP2Coding(a, cc.CodonOffset, cc.CodonPosition, cc.MaxCodonLen, cc.CodingTable, cc.Synonymous)
	return mcorr.CorrResults{ID: a.ID, Results: results}
}

// calcP2Coding computes the P2 correlation for each codon lag l in
// [0, maxCodonLen). Unlike the single-clade version, it caches the lag-0
// numerator (ks) and pair count (nn): when the lag-0 value is zero there is
// no variation in the sample, so higher lags short-circuit to zero instead
// of being recomputed.
func calcP2Coding(aln Alignment, codonOffset, codonPosition, maxCodonLen int, codingTable *taxonomy.GeneticCode, synonymous bool) (results []mcorr.CorrResult) {
	// Convert every aligned sequence into its codon list.
	codonSequences := [][]Codon{}
	for _, s := range aln.Sequences {
		codons := extractCodons(s, codonOffset)
		codonSequences = append(codonSequences, codons)
	}

	ks := 1.0 // lag-0 numerator, cached after the l == 0 iteration
	nn := 0   // lag-0 pair count, cached after the l == 0 iteration
	for l := 0; l < maxCodonLen; l++ {
		totalP2 := 0.0
		totaln := 0
		if l > 0 && ks == 0.0 {
			// No variation at lag 0: reuse its count and report zero.
			totalP2 = 0.0
			totaln = nn
		} else {
			for i := 0; i+l < len(codonSequences[0]); i++ {
				codonPairs := []CodonPair{}
				j := i + l
				for _, cc := range codonSequences {
					if i+l < len(cc) {
						// all codon pairs (two codons separated by l) from a list of codon sequences
						codonPairs = append(codonPairs, CodonPair{A: cc[i], B: cc[j]})
					}
				}

				multiCodonPairs := [][]CodonPair{}
				if synonymous {
					multiCodonPairs = synonymousSplit(codonPairs, codingTable)
				} else {
					multiCodonPairs = append(multiCodonPairs, codonPairs)
				}

				for _, codonPairs := range multiCodonPairs {
					if len(codonPairs) >= 2 {
						// a codon pair is a pair of codons separated by l;
						// doubleCodons aggregates the nucleotide pairs at
						// codonPosition across all sequences into a NuclCov.
						nc := doubleCodons(codonPairs, codonPosition)
						xy, n := nc.P11(0)
						totalP2 += xy
						totaln += n
					}
				}
			}
		}

		// Cache the lag-0 totals for the short-circuit above.
		if l == 0 {
			ks = totalP2
			nn = totaln
		}
		if totaln > 0 {
			res1 := mcorr.CorrResult{
				Lag:  l * 3,
				Mean: totalP2 / float64(totaln),
				N:    totaln,
				Type: "P2",
			}
			results = append(results, res1)
		}
	}

	return
}

// doubleCodons builds a NuclCov over the ATGC alphabet from the nucleotides
// found at codonPosition of each codon pair.
func doubleCodons(codonPairs []CodonPair, codonPosition int) *mcorr.NuclCov {
	alphabet := []byte{'A', 'T', 'G', 'C'}
	c := mcorr.NewNuclCov(alphabet)
	for _, codonPair := range codonPairs {
		a := codonPair.A[codonPosition]
		b := codonPair.B[codonPosition]
		c.Add(a, b)
	}
	return c
}

// Codon is a byte list of length 3.
type Codon []byte

// CodonSequence is a sequence of codons.
type CodonSequence []Codon

// CodonPair is a pair of Codons.
type CodonPair struct {
	A, B Codon
}

// extractCodons return a list of codons from a DNA sequence, starting at
// offset and taking whole triplets only (a trailing partial codon is dropped).
// Each Codon aliases the underlying sequence bytes (no copy).
func extractCodons(s seq.Sequence, offset int) (codons []Codon) {
	for i := offset; i+3 <= len(s.Seq); i += 3 {
		c := s.Seq[i:(i + 3)]
		codons = append(codons, c)
	}
	return
}
// NuclCov contains covariance of nucleotide acid in a DNA sequence.
type NuclCov struct {
	Doublets []int  // Doublets[ia*len(Alphabet)+ib] counts pairs (Alphabet[ia], Alphabet[ib])
	Alphabet []byte // allowed nucleotide letters
}

// NewNuclCov return a NuclCov given the alphabet.
func NewNuclCov(alphabet []byte) *NuclCov {
	sizeOfAlphabet := len(alphabet)
	nc := NuclCov{Alphabet: alphabet}
	nc.Doublets = make([]int, sizeOfAlphabet*sizeOfAlphabet)
	return &nc
}

// Add inserts a pair of nucleotides.
// It returns an error when a nucleotide is not in the alphabet.
func (nc *NuclCov) Add(a, b byte) error {
	indexA := bytes.IndexByte(nc.Alphabet, a)
	indexB := bytes.IndexByte(nc.Alphabet, b)
	sizeOfAlphabet := len(nc.Alphabet)
	if indexA >= 0 && indexB >= 0 {
		nc.Doublets[indexA*sizeOfAlphabet+indexB]++
		return nil
	}

	var err error
	if indexA < 0 && indexB < 0 {
		err = fmt.Errorf("%c and %c are not in Alphabet: %s", a, b, string(nc.Alphabet))
	} else if indexA < 0 {
		err = fmt.Errorf("%c is not in Alphabet: %s", a, string(nc.Alphabet))
	} else {
		err = fmt.Errorf("%c is not in Alphabet: %s", b, string(nc.Alphabet))
	}

	return err
}

// Count returns the total number of pairs.
func (nc *NuclCov) Count() int {
	n := 0
	for _, a := range nc.Doublets {
		n += a
	}
	return n
}

// P00 returns the numerator (xy) and pair count (n) for the probability of
// 00 (both sites identical). Only doublet bins with more than minAlleleNum
// entries contribute.
func (nc *NuclCov) P00(minAlleleNum int) (xy float64, n int) {
	for i := 0; i < len(nc.Doublets); i++ {
		if nc.Doublets[i] > minAlleleNum {
			for j := i + 1; j < len(nc.Doublets); j++ {
				if nc.Doublets[j] > minAlleleNum {
					n += nc.Doublets[i] * nc.Doublets[j]
				}
			}
			// pairs within the same bin are always identical at both sites
			n += nc.Doublets[i] * (nc.Doublets[i] - 1) / 2
			xy += float64(nc.Doublets[i] * (nc.Doublets[i] - 1) / 2)
		}
	}
	return
}

// P11 returns the numerator (xy) and pair count (n) for the probability of
// 11 (both sites differ). Two bins i, j contribute to xy only when they
// differ in both the first nucleotide (i/size vs j/size) and the second
// (i%size vs j%size).
func (nc *NuclCov) P11(minAlleleNum int) (xy float64, n int) {
	sizeOfAlphabet := len(nc.Alphabet)
	for i := 0; i < len(nc.Doublets); i++ {
		if nc.Doublets[i] > minAlleleNum {
			for j := i + 1; j < len(nc.Doublets); j++ {
				if nc.Doublets[j] > minAlleleNum {
					c := float64(nc.Doublets[i] * nc.Doublets[j])
					if i%sizeOfAlphabet != j%sizeOfAlphabet && i/sizeOfAlphabet != j/sizeOfAlphabet {
						xy += c
					}

					n += nc.Doublets[i] * nc.Doublets[j]
				}
			}
			n += nc.Doublets[i] * (nc.Doublets[i] - 1) / 2
		}
	}
	return
}

// MateP11APS calculates the cross-covariance numerator between two NuclCov
// matrices: every qualifying bin of nc is paired with every qualifying bin
// of nc2 (the cross-covariance matrix is not symmetric, so all off-diagonal
// combinations are included; see
// https://en.wikipedia.org/wiki/Cross-covariance_matrix).
//
// BUG FIX: n2 was previously accumulated inside the outer loop's
// `nc.Doublets[i] > minAlleleNum` branch, so the reported pair count
// n = n1*n2 was inflated by a factor equal to the number of qualifying
// bins of nc. n1 and n2 are now the plain totals of the two matrices,
// matching the (previously commented-out) intended computation and the
// normalization used by MateP11/MateP00.
func (nc *NuclCov) MateP11APS(nc2 *NuclCov, minAlleleNum int) (xy float64, n int) {
	for i := 0; i < len(nc.Doublets); i++ {
		if nc.Doublets[i] > minAlleleNum {
			for j := 0; j < len(nc2.Doublets); j++ {
				if nc2.Doublets[j] > minAlleleNum {
					xy += float64(nc.Doublets[i] * nc2.Doublets[j])
				}
			}
		}
	}
	n1, n2 := 0, 0
	for i := 0; i < len(nc.Doublets); i++ {
		n1 += nc.Doublets[i]
		n2 += nc2.Doublets[i]
	}
	n = n1 * n2
	return
}

// MateP11 calculate covariance between two clusters.
// Bins contribute to xy only when they differ in both codon positions
// (same criterion as P11); n is the product of the two totals.
func (nc *NuclCov) MateP11(nc2 *NuclCov, minAlleleNum int) (xy float64, n int) {
	sizeOfAlphabet := len(nc.Alphabet)
	for i := 0; i < len(nc.Doublets); i++ {
		if nc.Doublets[i] > minAlleleNum {
			for j := 0; j < len(nc2.Doublets); j++ {
				if i != j && nc2.Doublets[j] > minAlleleNum {
					c := float64(nc.Doublets[i] * nc2.Doublets[j])
					if i%sizeOfAlphabet != j%sizeOfAlphabet && i/sizeOfAlphabet != j/sizeOfAlphabet {
						xy += c
					}
				}
			}
		}
	}
	n1 := 0
	n2 := 0
	for i := 0; i < len(nc.Doublets); i++ {
		n1 += nc.Doublets[i]
		n2 += nc2.Doublets[i]
	}
	n = n1 * n2
	return
}

// MateP00 calculate covariance between two clusters: xy sums products of
// matching bins (identical doublets), n is the product of the two totals.
func (nc *NuclCov) MateP00(nc2 *NuclCov, minAlleleNum int) (xy float64, n int) {
	n1, n2 := 0, 0
	for i := 0; i < len(nc.Doublets); i++ {
		xy += float64(nc.Doublets[i] * nc2.Doublets[i])
		n1 += nc.Doublets[i]
		n2 += nc2.Doublets[i]
	}
	n = n1 * n2
	return
}

// Append adds another NuclCov's counts into nc.
// It returns an error if the two alphabets differ.
func (nc *NuclCov) Append(nc2 *NuclCov) error {
	// Check alphabet (fixed typo: "alphbet" -> "alphabet"; lowercase per Go convention).
	diffAlphabetError := fmt.Errorf("different alphabet %s, %s", string(nc.Alphabet), string(nc2.Alphabet))
	if len(nc.Alphabet) != len(nc2.Alphabet) {
		return diffAlphabetError
	}
	for i, a := range nc.Alphabet {
		b := nc2.Alphabet[i]
		if a != b {
			return diffAlphabetError
		}
	}

	for i := 0; i < len(nc.Doublets); i++ {
		nc.Doublets[i] += nc2.Doublets[i]
	}

	return nil
}
3 | 4 | ## Requirements 5 | * Install `git` from [https://git-scm.com](https://git-scm.com/); 6 | * Install `go` from [https://golang.org/doc/install](https://golang.org/doc/install); 7 | * Install `python3` from [https://www.python.org/](https://www.python.org/) (we found running issues using the default Python in MacOS); 8 | * Install `pip3` from [https://pip.pypa.io/en/stable/installing/](https://pip.pypa.io/en/stable/installing/). 9 | 10 | ## Installation 11 | 1. Install `mcorr-xmfa`, `mcorr-bam`, and `mcorr-fit` from your terminal: 12 | ```sh 13 | go get -u github.com/kussell-lab/mcorr/cmd/mcorr-xmfa 14 | go get -u github.com/kussell-lab/mcorr/cmd/mcorr-bam 15 | cd $HOME/go/src/github.com/kussell-lab/mcorr/cmd/mcorr-fit 16 | python3 setup.py install 17 | ``` 18 | or to install `mcorr-fit` in local directory (~/.local/bin in Linux or ~/Library/Python/3.6/bin in MacOS): 19 | ```sh 20 | python3 setup.py install --user 21 | ``` 22 | 2. Add `$HOME/go/bin` and `$HOME/.local/bin` to your `$PATH` environment. In Linux, you can do it in your terminal: 23 | ```sh 24 | export PATH=$PATH:$HOME/go/bin:$HOME/.local/bin 25 | ``` 26 | 27 | In MacOS, you can do it as follows: 28 | ```sh 29 | export PATH=$PATH:$HOME/go/bin:$HOME/Library/Python/3.6/bin 30 | ``` 31 | 32 | We have tested installation in Windows 10, Ubuntu 17.10, and MacOS Big Sur (on both Intel and M1 chips), using Python 3 and Go 1.15 and 1.16. 33 | 34 | Typical installation time on an iMac is 10 minutes. 35 | 36 | ## Basic Usage 37 | The inference of recombination parameters requires two steps: 38 | 39 | 1. Calculate _Correlation Profile_ 40 | 41 | 1. For whole-genome alignments (multiple gene alignments), use `mcorr-xmfa`: 42 | 43 | ```sh 44 | mcorr-xmfa 45 | ``` 46 | The XMFA files should contain only *coding* sequences. The description of XMFA file can be found in [http://darlinglab.org/mauve/user-guide/files.html](http://darlinglab.org/mauve/user-guide/files.html). 
We provide two useful pipelines to generate whole-genome alignments: 47 | * from multiple assemblies: [https://github.com/kussell-lab/AssemblyAlignmentGenerator](https://github.com/kussell-lab/AssemblyAlignmentGenerator); 48 | * from raw reads: [https://github.com/kussell-lab/ReferenceAlignmentGenerator](https://github.com/kussell-lab/ReferenceAlignmentGenerator) 49 | 50 | 2. For read alignments, use `mcorr-bam`: 51 | ```sh 52 | mcorr-bam 53 | ``` 54 | The GFF3 file is used for extracting the coding regions of the sorted BAM file. 55 | 56 | 3. For calculating correlation profiles between two clades or sequence clusters from 57 | whole-genome alignments, you can use `mcorr-xmfa-2clades`: 58 | 59 | ```sh 60 | mcorr-xmfa-2clades 61 | ``` 62 | Where file 1 and file 2 are the multiple gene alignments for the two clades. 63 | 64 | All programs will produce two files: 65 | * a .csv file stores the calculated Correlation Profile, which will be used for fitting in the next step; 66 | * a .json file stores the (intermediate) Correlation Profile for each gene. 67 | 2. Fit the Correlation Profile using `mcorr-fit`: 68 | 69 | 1. For fitting correlation profiles as described in the 70 | [ 2019 Nature Methods paper](https://pubmed.ncbi.nlm.nih.gov/30664775/) use 71 | `mcorr-fit`: 72 | 73 | ```sh 74 | mcorr-fit <.csv file> 75 | ``` 76 | 77 | It will produce four files: 78 | 79 | * `_best_fit.svg` shows the plots of the Correlation Profile, fitting, and residuals; 80 | * `_fit_reports.txt` shows the summary of the fitted parameters; 81 | * `_fit_results.csv` shows the table of fitted parameters; 82 | * `_lmfit_report.csv` shows goodness of fit-statistics from LMFIT 83 | 84 | 2. 
// SamReader is an interface for sam or bam reader.
type SamReader interface {
	Header() *sam.Header
	Read() (*sam.Record, error)
}

// readSamRecords opens a .sam or .bam file and streams its header and
// records over channels. The header is sent exactly once on headerChan;
// every record follows on samRecChan. Both channels are closed when the
// reader goroutine finishes, so the caller must receive the header first
// and then drain samRecChan. Any open/parse error panics inside the
// goroutine; only io.EOF terminates reading normally.
func readSamRecords(fileName string) (headerChan chan *sam.Header, samRecChan chan *sam.Record) {
	headerChan = make(chan *sam.Header)
	samRecChan = make(chan *sam.Record)
	go func() {
		defer close(headerChan)
		defer close(samRecChan)

		// Open file stream, and close it when finished.
		f, err := os.Open(fileName)
		if err != nil {
			panic(err)
		}
		defer f.Close()

		// Decide if it is a .sam or .bam file by the last three characters
		// of the name; anything not ending in "bam" is parsed as SAM.
		// NOTE(review): this slice panics for file names shorter than
		// 3 characters — consider strings.HasSuffix; confirm with callers.
		var reader SamReader
		if fileName[len(fileName)-3:] == "bam" {
			bamReader, err := bam.NewReader(f, 0) // 0: default worker count
			if err != nil {
				panic(err)
			}
			defer bamReader.Close()
			reader = bamReader
		} else {
			reader, err = sam.NewReader(f)
			if err != nil {
				panic(err)
			}
		}

		header := reader.Header()
		headerChan <- header

		// Read sam records and send them to the channel,
		// until it hit an error, which raises a panic
		// if it is not a IO EOF.
		for {
			rec, err := reader.Read()
			if err != nil {
				if err != io.EOF {
					panic(err)
				}
				break
			}
			samRecChan <- rec
		}
	}()
	return
}

// GeneSamRecords stores Sam Records for a single reference (gene).
type GeneSamRecords struct {
	ID      string        // reference (gene) name
	Start   int           // start position within the reference
	End     int           // end position within the reference
	Strand  int           // strand of the gene
	Records []*sam.Record // reads mapped to this reference
}
78 | func readPanGenomeBamFile(fileName string) (header *sam.Header, recordsChan chan GeneSamRecords) { 79 | headerChan, samRecChan := readSamRecords(fileName) 80 | header = <-headerChan 81 | recordsChan = make(chan GeneSamRecords) 82 | go func() { 83 | defer close(recordsChan) 84 | currentRefID := "" 85 | var records []*sam.Record 86 | for rec := range samRecChan { 87 | if currentRefID == "" { 88 | currentRefID = rec.Ref.Name() 89 | } 90 | if rec.Ref.Name() != currentRefID { 91 | if len(records) > 0 { 92 | recordsChan <- GeneSamRecords{Start: 0, Records: records, End: records[0].Ref.Len(), ID: currentRefID} 93 | records = []*sam.Record{} 94 | } 95 | currentRefID = rec.Ref.Name() 96 | } 97 | records = append(records, rec) 98 | } 99 | if len(records) > 0 { 100 | recordsChan <- GeneSamRecords{Start: 0, Records: records, End: records[0].Ref.Len()} 101 | } 102 | }() 103 | 104 | return 105 | } 106 | 107 | //readStrainBamFile read []sam.Record from a bam file of mapping reads to a strain genome file. 108 | func readStrainBamFile(fileName string, gffMap map[string][]*gff.Record) (header *sam.Header, resultChan chan GeneSamRecords) { 109 | headerChan, samRecChan := readSamRecords(fileName) 110 | header = <-headerChan 111 | resultChan = make(chan GeneSamRecords) 112 | go func() { 113 | defer close(resultChan) 114 | var currentRecord GeneSamRecords 115 | var currentIdx int 116 | var currentRef string 117 | for read := range samRecChan { 118 | // skip if read quality is low. 119 | if !checkReadQuality(read) { 120 | continue 121 | } 122 | 123 | // skip if read is not in the reference genomes. 124 | if _, found := gffMap[read.Ref.Name()]; !found { 125 | continue 126 | } 127 | 128 | // update current reference and current idx 129 | if currentRef != read.Ref.Name() { 130 | currentRef = read.Ref.Name() 131 | currentIdx = 0 132 | } 133 | 134 | // update current record. 
135 | if currentRecord.End <= read.Pos { 136 | if currentRecord.Start < currentRecord.End && len(currentRecord.Records) > 0 { 137 | resultChan <- currentRecord 138 | } 139 | records := gffMap[currentRef] 140 | for currentIdx < len(records) && records[currentIdx].End <= read.Pos { 141 | currentIdx++ 142 | } 143 | if currentIdx < len(records) { 144 | rec := records[currentIdx] 145 | currentRecord = GeneSamRecords{Start: rec.Start - 1, End: rec.End, ID: rec.ID(), Strand: rec.Strand} 146 | } 147 | } 148 | 149 | if read.Pos >= currentRecord.Start && read.Pos < currentRecord.End { 150 | currentRecord.Records = append(currentRecord.Records, read) 151 | } 152 | } 153 | if currentRecord.End > currentRecord.Start && len(currentRecord.Records) > 0 { 154 | resultChan <- currentRecord 155 | } 156 | 157 | }() 158 | return 159 | } 160 | 161 | func readGffs(fileName string) map[string][]*gff.Record { 162 | m := make(map[string][]*gff.Record) 163 | f, err := os.Open(fileName) 164 | if err != nil { 165 | panic(err) 166 | } 167 | defer f.Close() 168 | 169 | gffReader := gff.NewReader(f) 170 | 171 | records, err := gffReader.ReadAll() 172 | if err != nil { 173 | panic(err) 174 | } 175 | 176 | for _, rec := range records { 177 | if rec.Feature == "CDS" { 178 | m[rec.SeqName] = append(m[rec.SeqName], rec) 179 | } 180 | } 181 | return m 182 | } 183 | 184 | // checkReadQuality return false if the read fails quality check. 
185 | func checkReadQuality(read *sam.Record) bool { 186 | if int(read.MapQ) < MinMapQuality || read.Len() < MinReadLength { 187 | return false 188 | } 189 | 190 | // contains only match or mismatch 191 | for _, cigar := range read.Cigar { 192 | if cigar.Type() != sam.CigarMatch && cigar.Type() != sam.CigarSoftClipped { 193 | return false 194 | } 195 | } 196 | 197 | return true 198 | } 199 | -------------------------------------------------------------------------------- /cmd/mcorr-fit/mcorr/FitComparison.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter 4 | #from mcorr import fit_p2 5 | # from mcorr import fit_p2, read_corr, FitDatas, \ 6 | # write_fitting_results, plot_fit, plot_params, write_fitting_reports, \ 7 | # geom_r1, const_r1 8 | from .fit_data import FitDatas 9 | from .fit import fit_p2, fit_modelopts, geom_r1, const_r1, fit_zerorecombo, zero_r1, solve_zerorecombo 10 | from .writer import write_fitting_reports, write_fitting_results 11 | from .plot import plot_fit, plot_params, plot_zerorecombo 12 | from .corr_res import read_corr 13 | import numpy as np 14 | import csv 15 | import pandas as pd 16 | import scipy 17 | #from lmfit.printfuncs import report_fit 18 | 19 | def main(): 20 | """Fit just the data (no bootstraps) using the method from Nature Methods paper and 21 | compare to the zero recombination case""" 22 | parser = ArgumentParser( 23 | formatter_class=ArgumentDefaultsHelpFormatter, 24 | description="Fit just data (no bootstraps) for model w/ and w/o recombination") 25 | parser.add_argument("corr_file", type = str, help='correlation input file') 26 | parser.add_argument("output_prefix", type=str, help='output file prefix') 27 | parser.add_argument('--fit_start', type=int, default=3, 28 | help='fitting range starts at') 29 | parser.add_argument('--fit_end', type=int, default=300, 30 | help='fitting 
range ends at') 31 | parser.add_argument("--use_geom_frag", action="store_true", 32 | help='use geometric distribution for fragment sizes') 33 | parser.add_argument("--fit_method", type=str, default="least_squares", help="lmfit method (see lmfit documentation)") 34 | parser.add_argument("--max_nfev", type=int, default=int(1e6), 35 | help='max number of function evaluations before lmfit quits') 36 | parser.add_argument('--quiet', action="store_true") 37 | parser.add_argument("--title", type=str, help="plot title", default="") 38 | opts = parser.parse_args() 39 | corr_file = opts.corr_file 40 | prefix = opts.output_prefix 41 | fit_start = opts.fit_start 42 | fit_end = opts.fit_end 43 | quiet = opts.quiet 44 | use_geom_frag = opts.use_geom_frag 45 | title = opts.title 46 | fit_method = opts.fit_method 47 | max_nfev = opts.max_nfev 48 | 49 | # read correlation results and prepare fitting data 50 | corr_results = read_corr(corr_file) 51 | fitdatas = FitDatas(corr_results, fit_start, fit_end) 52 | ##do fitting 53 | r1_func = const_r1 54 | #if you want to use a geometric distribution of fragments 55 | if use_geom_frag: 56 | r1_func = geom_r1 57 | 58 | all = fitdatas.get("all") 59 | x = all.xvalues 60 | y = all.yvalues 61 | d_sample = all.d_sample 62 | ##fit with the recombination model 63 | fitres = fit_modelopts(x, y, d_sample, r1_func, max_nfev, fit_method) 64 | #fit with the zero recombination model 65 | zdata, zres, zchisq, zred_chisq, zaic, zthetaS, z_dc = solve_zerorecombo(x, y, d_sample) 66 | ## write a fit report as generated by lmfit (includes chi-squared, uncertainties, etc) 67 | ##for the recombination model 68 | params = fitres.params.valuesdict() 69 | thetaS = fitres.params["thetaS"] 70 | phiS = fitres.params["phiS"] 71 | f = fitres.params["f"] 72 | lmfitfile = prefix + "_comparemodels.csv" 73 | with open(lmfitfile, "w+") as csvfile: 74 | lmfit_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) 75 | 
lmfit_writer.writerow(["", "recombo", "zero recombo"]) 76 | lmfit_writer.writerow(["fit_success", fitres.success, "n/a"]) 77 | lmfit_writer.writerow(["function_evals", fitres.nfev, "n/a"]) 78 | lmfit_writer.writerow(["data_points", fitres.ndata, zdata]) 79 | lmfit_writer.writerow(["variables", fitres.nvarys, 1]) 80 | lmfit_writer.writerow(["message", fitres.message, "n/a"]) 81 | lmfit_writer.writerow(["thetaS (init)", thetaS.init_value, "n/a"]) 82 | lmfit_writer.writerow(["f (init)", f.init_value, 0]) 83 | lmfit_writer.writerow(["phiS (init)", phiS.init_value, 0]) 84 | lmfit_writer.writerow([""]) 85 | lmfit_writer.writerow(["recombination", "d_s", "theta_s", "f", "phi_s", 86 | "theta_p", "phi_p", "c", "d_theta_p", 87 | "d_theta_s", "chisq", "red-chisq", "AIC"]) 88 | lmfit_writer.writerow(["recombo", params["ds"], params["thetaS"], params["f"], params["phiS"], 89 | params["thetaP"], params["phiP"], params["c"], params["dp"], 90 | params["dc"], fitres.chisqr, fitres.redchi, fitres.aic]) 91 | lmfit_writer.writerow(["zero_recombo", d_sample, zthetaS, np.NAN, np.NAN, 92 | np.NAN, np.NAN, np.NAN, np.NAN, 93 | z_dc, zchisq, zred_chisq, zaic]) 94 | ##save the residuals as a .csv file 95 | residuals = zres 96 | resdat = pd.DataFrame(residuals) 97 | resdat.to_csv(prefix+"_zero-recombo_residuals.csv", header=None) 98 | #plot the best fit and the residuals 99 | best_fit_file = prefix + "_zero-recombo_best_fit.svg" 100 | plot_zerorecombo(all, zres, best_fit_file, title=title) 101 | if fitres.success: 102 | residuals = fitres.residual 103 | resdat = pd.DataFrame(residuals) 104 | resdat.to_csv(prefix+"_recombo_residuals.csv", header=None) 105 | #plot the best fit and the residuals 106 | best_fit_file = prefix + "_recombo_best_fit.svg" 107 | plot_fit(all, fitres, best_fit_file, title=title) 108 | else: 109 | print("Fitting failed for %s" % corr_file) 110 | 111 | if __name__ == "__main__": 112 | main() 113 | 114 | 115 | 116 | 117 | 118 | 119 | 
-------------------------------------------------------------------------------- /cmd/mcorr-fit/mcorr/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | mpl.use('Agg') 3 | 4 | import numpy 5 | import matplotlib.pyplot as plt 6 | from matplotlib import gridspec 7 | from mpl_toolkits.axes_grid1.inset_locator import inset_axes 8 | 9 | plt.rcParams['mathtext.fontset'] = 'cm' 10 | 11 | def plot_zerorecombo(fitdata, fitres, plot_file, title=None): 12 | """plot the analytical solution from fitting to the zero-recombination model""" 13 | xvalues = fitdata.xvalues 14 | yvalues = fitdata.yvalues 15 | fig = plt.figure(tight_layout=False) 16 | 17 | figsize = 4 18 | fig.set_figheight(figsize) 19 | fig.set_figwidth(figsize) 20 | gs = gridspec.GridSpec(2, 1, height_ratios=[3, 1], hspace=0) 21 | ax1 = plt.subplot(gs[0, 0]) 22 | ax1.scatter(xvalues, yvalues, s=20, facecolors='none', edgecolors='k') 23 | predictions = yvalues + fitres 24 | ax1.plot(xvalues, predictions, 'k') 25 | ax1.set_ylabel(r'$P$') 26 | if numpy.min(yvalues) != numpy.max(yvalues): 27 | ax1.set_ylim([numpy.min(yvalues)*0.9, numpy.max(yvalues)*1.1]) 28 | ax1.locator_params(axis='x', nbins=5) 29 | ax1.locator_params(axis='y', nbins=5) 30 | plt.setp(ax1.get_xticklabels(), visible=False) 31 | if title: plt.title(title, loc="left") 32 | 33 | ax2 = plt.subplot(gs[1, 0]) 34 | markerline, _, _ = ax2.stem(xvalues, 35 | fitres, 36 | linefmt='k-', 37 | basefmt='r-', 38 | markerfmt='ko') 39 | ax2.set_xlabel(r'$l$') 40 | ax2.set_ylabel("Residual") 41 | ax2.locator_params(axis='x', nbins=5) 42 | plt.setp(markerline, "markersize", 4) 43 | fig.tight_layout() 44 | 45 | ##remove histograms as they can cause problems for samples with widely varying bootstrap results 46 | # ax3 = inset_axes(ax1, width="50%", height="33%", loc=1) 47 | # ax3.hist(fitres, bins="auto", facecolor='green', alpha=0.5) 48 | # ax3.set_xlabel("Residual") 49 | # 
plt.setp(ax3.get_xticklabels(), rotation=20, horizontalalignment='right') 50 | # ax3.axes.get_yaxis().set_ticks([]) 51 | 52 | fig.savefig(plot_file) 53 | 54 | def plot_fit(fitdata, fitres, plot_file, title=None): 55 | """Fit all row data and do plotting for the full-recombination model""" 56 | xvalues = fitdata.xvalues 57 | yvalues = fitdata.yvalues 58 | fig = plt.figure(tight_layout=False) 59 | 60 | figsize = 4 61 | fig.set_figheight(figsize) 62 | fig.set_figwidth(figsize) 63 | gs = gridspec.GridSpec(2, 1, height_ratios=[3, 1], hspace=0) 64 | ax1 = plt.subplot(gs[0, 0]) 65 | ax1.scatter(xvalues, yvalues, s=20, facecolors='none', edgecolors='k') 66 | predictions = yvalues + fitres.residual 67 | ax1.plot(xvalues, predictions, 'k') 68 | ax1.set_ylabel(r'$P$') 69 | if numpy.min(yvalues) != numpy.max(yvalues): 70 | ax1.set_ylim([numpy.min(yvalues)*0.9, numpy.max(yvalues)*1.1]) 71 | ax1.locator_params(axis='x', nbins=5) 72 | ax1.locator_params(axis='y', nbins=5) 73 | plt.setp(ax1.get_xticklabels(), visible=False) 74 | if title: plt.title(title, loc="left") 75 | 76 | ax2 = plt.subplot(gs[1, 0]) 77 | markerline, _, _ = ax2.stem(xvalues, 78 | fitres.residual, 79 | linefmt='k-', 80 | basefmt='r-', 81 | markerfmt='ko') 82 | ax2.set_xlabel(r'$l$') 83 | ax2.set_ylabel("Residual") 84 | ax2.locator_params(axis='x', nbins=5) 85 | plt.setp(markerline, "markersize", 4) 86 | fig.tight_layout() 87 | 88 | ##remove histograms as they can cause problems for samples with widely varying bootstrap results 89 | # ax3 = inset_axes(ax1, width="50%", height="33%", loc=1) 90 | # ax3.hist(fitres.residual, bins="auto", facecolor='green', alpha=0.5) 91 | # ax3.set_xlabel("Residual") 92 | # plt.setp(ax3.get_xticklabels(), rotation=20, horizontalalignment='right') 93 | # ax3.axes.get_yaxis().set_ticks([]) 94 | 95 | fig.savefig(plot_file) 96 | 97 | def plot_params(fit_results, param_names, plot_file): 98 | """plot histogram of parameters; this function has been taken out of the main scripts for now 
99 | due to some issues with determining bins for widely-varying bootstraps""" 100 | # determine how many columns and rows of sub-plots. 101 | num_col = 3 102 | num_row = len(param_names) // num_col 103 | if len(param_names) % num_col != 0: 104 | num_row = num_row + 1 105 | # each sub plot is 6x3 106 | figheight = num_row * 8 107 | figwidth = num_col * 3 108 | # init figure. 109 | fig = plt.figure(tight_layout=True) 110 | fig.set_figheight(figheight) 111 | fig.set_figwidth(figwidth) 112 | # create greek name of parameters. 113 | label_names = { 114 | "d_sample": r"$d_{sample}$", 115 | "fbar": r"$\bar{f}$", 116 | "theta_pool": r"$\theta_{pool}$", 117 | "phi_pool": r"$\phi_{pool}$", 118 | "c": r"$c$", 119 | "ratio": r"$\gamma/\mu$", 120 | } 121 | 122 | for i, name in enumerate(param_names): 123 | boot_values = [] 124 | raw_value = None 125 | for fitres in fit_results: 126 | if hasattr(fitres, name): 127 | value = getattr(fitres, name) 128 | if fitres.group == "all": 129 | raw_value = value 130 | else: 131 | boot_values.append(value) 132 | if len(boot_values) > 0: 133 | ax1 = fig.add_subplot(num_col * num_row, num_col, i+1) 134 | ax1.hist(boot_values, histtype='bar', 135 | bins='auto', color="green") 136 | label = label_names.get(name, name) 137 | ax1.set_xlabel(label) 138 | if raw_value is not None: 139 | ax1.axvline(x=raw_value, color="red") 140 | ax1.locator_params(axis='x', nbins=6) 141 | ax1.tick_params(axis='x', rotation=30) 142 | ax1.ticklabel_format(axis='x', style='sci', scilimits=(-3, 3)) 143 | fig.savefig(plot_file) 144 | 145 | -------------------------------------------------------------------------------- /cmd/mcorr-xmfa/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "io" 5 | "os" 6 | "runtime" 7 | "strings" 8 | 9 | "github.com/kussell-lab/biogo/seq" 10 | "github.com/kussell-lab/mcorr" 11 | "github.com/kussell-lab/ncbiftp/taxonomy" 12 | "gopkg.in/alecthomas/kingpin.v2" 13 | 
"gopkg.in/cheggaaa/pb.v2" 14 | ) 15 | 16 | // global variables. 17 | func main() { 18 | app := kingpin.New("mcorr-xmfa", "Calculate mutation correlation from bacterial sequence alignments in XMFA format.") 19 | app.Version("v20180102") 20 | 21 | alnFile := app.Arg("in", "Alignment file in XMFA format.").Required().String() 22 | outPrefix := app.Arg("out", "Output prefix.").Required().String() 23 | 24 | maxl := app.Flag("max-corr-length", "Maximum distance of correlation (base pairs)").Default("300").Int() 25 | ncpu := app.Flag("num-cpu", "Number of CPUs (default: using all available cores)").Default("0").Int() 26 | numBoot := app.Flag("num-boot", "Number of bootstrapping on genes").Default("1000").Int() 27 | showProgress := app.Flag("show-progress", "Show progress").Bool() 28 | 29 | kingpin.MustParse(app.Parse(os.Args[1:])) 30 | 31 | if *ncpu <= 0 { 32 | *ncpu = runtime.NumCPU() 33 | } 34 | runtime.GOMAXPROCS(*ncpu) 35 | 36 | // show progress bar? 37 | var bar *pb.ProgressBar 38 | if *showProgress { 39 | max := getNumberOfAlignments(*alnFile) 40 | bar = pb.StartNew(max) 41 | defer bar.Finish() 42 | } 43 | 44 | // prepare calculator. 45 | var calculator Calculator 46 | codingTable := taxonomy.GeneticCodes()["11"] 47 | maxCodonLen := *maxl / 3 48 | 49 | synonymous := true 50 | codonPos := 3 51 | codonOffset := 0 52 | 53 | var alnChan chan Alignment 54 | if bar == nil { 55 | alnChan = readAlignments(*alnFile) 56 | } else { 57 | alnChan = make(chan Alignment) 58 | go func() { 59 | defer close(alnChan) 60 | count := 0 61 | c := readAlignments(*alnFile) 62 | for a := range c { 63 | alnChan <- a 64 | bar.Add(1) 65 | count++ 66 | } 67 | }() 68 | } 69 | calculator = NewCodingCalculator(codingTable, maxCodonLen, codonOffset, codonPos-1, synonymous) 70 | corrResChan := calcSingleClade(alnChan, calculator) 71 | //what's in the json is actually Qs NOT P2! 
72 | resChan := mcorr.PipeOutCorrResults(corrResChan, *outPrefix+".json") 73 | //division by d_sample or P2 is not until here!!! 74 | mcorr.CollectWrite(resChan, *outPrefix+".csv", *numBoot) 75 | } 76 | 77 | // Alignment is an array of mutliple sequences with same length. 78 | type Alignment struct { 79 | ID string 80 | Sequences []seq.Sequence 81 | } 82 | 83 | // calcSingleClade calculate correlation functions in a single cluster of sequence. 84 | func calcSingleClade(alnChan chan Alignment, calculator Calculator) (corrResChan chan mcorr.CorrResults) { 85 | corrResChan = make(chan mcorr.CorrResults) 86 | done := make(chan bool) 87 | 88 | ncpu := runtime.GOMAXPROCS(0) 89 | for i := 0; i < ncpu; i++ { 90 | go func() { 91 | for aln := range alnChan { 92 | if len(aln.Sequences) > 1 { 93 | results := calculator.CalcP2(aln) 94 | corrResChan <- results 95 | } 96 | } 97 | done <- true 98 | }() 99 | } 100 | 101 | go func() { 102 | defer close(corrResChan) 103 | for i := 0; i < ncpu; i++ { 104 | <-done 105 | } 106 | }() 107 | return 108 | } 109 | 110 | // calcTwoClade calculate correlation functions between two clades. 
111 | func calcTwoClade(alnChan, mateAlnChan chan Alignment, calculator Calculator) (corrResChan chan mcorr.CorrResults) { 112 | type job struct { 113 | A, B Alignment 114 | } 115 | jobChan := make(chan job) 116 | go func() { 117 | defer close(jobChan) 118 | for aln := range alnChan { 119 | mateAln := <-mateAlnChan 120 | if len(aln.Sequences) >= 1 && len(mateAln.Sequences) >= 1 { 121 | j := job{A: aln, B: mateAln} 122 | jobChan <- j 123 | } 124 | } 125 | }() 126 | 127 | corrResChan = make(chan mcorr.CorrResults) 128 | done := make(chan bool) 129 | 130 | ncpu := runtime.GOMAXPROCS(0) 131 | for i := 0; i < ncpu; i++ { 132 | go func() { 133 | for j := range jobChan { 134 | results := calculator.CalcP2(j.A, j.B) 135 | corrResChan <- results 136 | } 137 | done <- true 138 | }() 139 | } 140 | 141 | go func() { 142 | defer close(corrResChan) 143 | for i := 0; i < ncpu; i++ { 144 | <-done 145 | } 146 | }() 147 | return 148 | } 149 | 150 | // setNumThreads sets number of CPU cores for using. 151 | // if ncpu == 0, we will used all core avaible. 152 | func setNumThreads(ncpu int) { 153 | if ncpu == 0 { 154 | ncpu = runtime.NumCPU() 155 | } 156 | runtime.GOMAXPROCS(ncpu) 157 | } 158 | 159 | // readAlignments reads sequence alignment from a extended Multi-FASTA file, 160 | // and return a channel of alignment, which is a list of seq.Sequence 161 | func readAlignments(file string) (alnChan chan Alignment) { 162 | alnChan = make(chan Alignment) 163 | go func() { 164 | defer close(alnChan) 165 | 166 | c := readXMFA(file) 167 | for alignment := range c { 168 | alnID := strings.Split(alignment[0].Id, " ")[0] 169 | alnChan <- Alignment{ID: alnID, Sequences: alignment} 170 | } 171 | }() 172 | 173 | return 174 | } 175 | 176 | // getNumberOfAlignments return total number of alignments in a xmfa file. 
177 | func getNumberOfAlignments(file string) (count int) { 178 | c := readXMFA(file) 179 | for a := range c { 180 | if len(a) >= 2 { 181 | count++ 182 | } 183 | } 184 | return 185 | } 186 | 187 | // readXMFA reads a xmfa format file and returns a channel of []seq.Sequence. 188 | func readXMFA(file string) chan []seq.Sequence { 189 | c := make(chan []seq.Sequence) 190 | go func() { 191 | defer close(c) 192 | 193 | f := mustOpen(file) 194 | defer f.Close() 195 | 196 | rd := seq.NewXMFAReader(f) 197 | for { 198 | a, err := rd.Read() 199 | if err != nil { 200 | if err != io.EOF { 201 | panic(err) 202 | } 203 | break 204 | } 205 | if len(a) >= 2 { 206 | c <- a 207 | } 208 | } 209 | }() 210 | return c 211 | } 212 | 213 | // mustOpen is a helper function to open a file. 214 | // and panic if error occurs. 215 | func mustOpen(file string) (f *os.File) { 216 | var err error 217 | f, err = os.Open(file) 218 | if err != nil { 219 | panic(err) 220 | } 221 | return 222 | } 223 | 224 | //print funky monkey 225 | -------------------------------------------------------------------------------- /cmd/mcorr-xmfa-2clades/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // script written by Asher Preska Steinberg (apsteinberg@nyu.edu) 4 | import ( 5 | "github.com/kussell-lab/biogo/seq" 6 | "github.com/kussell-lab/mcorr" 7 | "github.com/kussell-lab/ncbiftp/taxonomy" 8 | "gopkg.in/alecthomas/kingpin.v2" 9 | "gopkg.in/cheggaaa/pb.v2" 10 | "io" 11 | "os" 12 | "runtime" 13 | "strings" 14 | ) 15 | 16 | // global variables. 
17 | func main() { 18 | app := kingpin.New("mcorr-xmfa-2clades", "Calculate mutation correlation from bacterial sequence alignments from two clades in XMFA format.") 19 | app.Version("v20200808") 20 | alnFile := app.Arg("in-1", "Alignment file in XMFA format.").Required().String() 21 | //added by Asher 22 | mateAlnFile := app.Arg("in-2", "Alignment file in XMFA format.").Required().String() 23 | outPrefix := app.Arg("out", "Output prefix.").Required().String() 24 | maxl := app.Flag("max-corr-length", "Maximum distance of correlation (base pairs)").Default("300").Int() 25 | ncpu := app.Flag("num-cpu", "Number of CPUs (default: using all available cores)").Default("0").Int() 26 | numBoot := app.Flag("num-boot", "Number of bootstrapping on genes").Default("1000").Int() 27 | showProgress := app.Flag("show-progress", "Show progress").Default("true").Bool() 28 | 29 | kingpin.MustParse(app.Parse(os.Args[1:])) 30 | 31 | if *ncpu <= 0 { 32 | *ncpu = runtime.NumCPU() 33 | } 34 | runtime.GOMAXPROCS(*ncpu) 35 | 36 | // show progress bar? 37 | var bar *pb.ProgressBar 38 | if *showProgress { 39 | max := getNumberOfAlignments(*alnFile) 40 | bar = pb.StartNew(max) 41 | defer bar.Finish() 42 | } 43 | 44 | // prepare calculator. 
45 | var calculator Calculator 46 | codingTable := taxonomy.GeneticCodes()["11"] 47 | maxCodonLen := *maxl / 3 48 | 49 | synonymous := true 50 | codonPos := 3 51 | codonOffset := 0 52 | 53 | var alnChan chan Alignment 54 | if bar == nil { 55 | alnChan = readAlignments(*alnFile) 56 | //mateAlnChan = findMateAln(*mateAlnFile, aln.ID) 57 | } else { 58 | alnChan = make(chan Alignment) 59 | go func() { 60 | defer close(alnChan) 61 | count := 0 62 | c := readAlignments(*alnFile) 63 | for a := range c { 64 | alnChan <- a 65 | bar.Add(1) 66 | count++ 67 | } 68 | }() 69 | } 70 | 71 | calculator = NewMateCalculator(codingTable, maxCodonLen, codonOffset, codonPos-1, synonymous) 72 | corrResChan := calcTwoClade(alnChan, calculator, mateAlnFile) 73 | 74 | resChan := mcorr.PipeOutCorrResults(corrResChan, *outPrefix+".json") 75 | mcorr.CollectWrite(resChan, *outPrefix+".csv", *numBoot) 76 | } 77 | 78 | // Alignment is an array of multiple sequences with same length. 79 | type Alignment struct { 80 | ID string //gene ID 81 | Pos string // position on the genome to differentiate alleles 82 | Sequences []seq.Sequence 83 | } 84 | 85 | // calcSingleClade calculate correlation functions in a single cluster of sequence. 86 | func calcSingleClade(alnChan chan Alignment, calculator Calculator) (corrResChan chan mcorr.CorrResults) { 87 | corrResChan = make(chan mcorr.CorrResults) 88 | done := make(chan bool) 89 | 90 | ncpu := runtime.GOMAXPROCS(0) 91 | for i := 0; i < ncpu; i++ { 92 | go func() { 93 | for aln := range alnChan { 94 | if len(aln.Sequences) > 1 { 95 | results := calculator.CalcP2(aln) 96 | corrResChan <- results 97 | } 98 | } 99 | done <- true 100 | }() 101 | } 102 | 103 | go func() { 104 | defer close(corrResChan) 105 | for i := 0; i < ncpu; i++ { 106 | <-done 107 | } 108 | }() 109 | return 110 | } 111 | 112 | // calcTwoClade calculate correlation functions between two clades. 
113 | func calcTwoClade(alnChan chan Alignment, calculator Calculator, mateAlnFile *string) (corrResChan chan mcorr.CorrResults) { 114 | type job struct { 115 | A, B Alignment 116 | } 117 | jobChan := make(chan job) 118 | go func() { 119 | defer close(jobChan) 120 | for aln := range alnChan { 121 | //find the same gene in the second alignment file 122 | mateAlnChan := findMateAln(*mateAlnFile, aln.ID, aln.Pos) 123 | mateAln := <-mateAlnChan 124 | if len(aln.Sequences) >= 1 && len(mateAln.Sequences) >= 1 { 125 | //double-check that you have the same gene from both files! 126 | if aln.ID == mateAln.ID { 127 | j := job{A: aln, B: mateAln} 128 | jobChan <- j 129 | //fmt.Printf("match") 130 | } 131 | } 132 | } 133 | }() 134 | 135 | corrResChan = make(chan mcorr.CorrResults) 136 | done := make(chan bool) 137 | 138 | ncpu := runtime.GOMAXPROCS(0) 139 | for i := 0; i < ncpu; i++ { 140 | go func() { 141 | for j := range jobChan { 142 | results := calculator.CalcP2(j.A, j.B) 143 | corrResChan <- results 144 | } 145 | done <- true 146 | }() 147 | } 148 | 149 | go func() { 150 | defer close(corrResChan) 151 | for i := 0; i < ncpu; i++ { 152 | <-done 153 | } 154 | }() 155 | return 156 | } 157 | 158 | // setNumThreads sets number of CPU cores for using. 159 | // if ncpu == 0, we will used all core avaible. 
160 | func setNumThreads(ncpu int) { 161 | if ncpu == 0 { 162 | ncpu = runtime.NumCPU() 163 | } 164 | runtime.GOMAXPROCS(ncpu) 165 | } 166 | 167 | // readAlignments reads sequence alignment from a extended Multi-FASTA file, 168 | // and return a channel of alignment, which is a list of seq.Sequence 169 | func readAlignments(file string) (alnChan chan Alignment) { 170 | alnChan = make(chan Alignment) 171 | go func() { 172 | defer close(alnChan) 173 | 174 | c := readXMFA(file) 175 | for alignment := range c { 176 | header := strings.Split(alignment[0].Id, " ") 177 | alnID := header[0] 178 | genomePos := header[1] 179 | alnChan <- Alignment{ID: alnID, Pos: genomePos, Sequences: alignment} 180 | } 181 | }() 182 | 183 | return 184 | } 185 | 186 | func findMateAln(file, alnID, alnGenomePos string) (mateAln chan Alignment) { 187 | mateAln = make(chan Alignment) 188 | go func() { 189 | defer close(mateAln) 190 | 191 | c := readXMFA(file) 192 | for alignment := range c { 193 | header := strings.Split(alignment[0].Id, " ") 194 | mateAlnID := header[0] 195 | mateGenomePos := header[1] 196 | if mateAlnID == alnID && mateGenomePos == alnGenomePos { 197 | mateAln <- Alignment{ID: mateAlnID, Pos: mateGenomePos, Sequences: alignment} 198 | } 199 | } 200 | }() 201 | 202 | return 203 | } 204 | 205 | // getNumberOfAlignments return total number of alignments in a xmfa file. 206 | func getNumberOfAlignments(file string) (count int) { 207 | c := readXMFA(file) 208 | for a := range c { 209 | if len(a) >= 2 { 210 | count++ 211 | } 212 | } 213 | return 214 | } 215 | 216 | // readXMFA reads a xmfa format file and returns a channel of []seq.Sequence. 
217 | func readXMFA(file string) chan []seq.Sequence { 218 | c := make(chan []seq.Sequence) 219 | go func() { 220 | defer close(c) 221 | 222 | f := mustOpen(file) 223 | defer f.Close() 224 | 225 | rd := seq.NewXMFAReader(f) 226 | for { 227 | a, err := rd.Read() 228 | if err != nil { 229 | if err != io.EOF { 230 | panic(err) 231 | } 232 | break 233 | } 234 | //temporary change from 2 to 1 235 | if len(a) >= 1 { 236 | c <- a 237 | } 238 | } 239 | }() 240 | return c 241 | } 242 | 243 | // mustOpen is a helper function to open a file. 244 | // and panic if error occurs. 245 | func mustOpen(file string) (f *os.File) { 246 | var err error 247 | f, err = os.Open(file) 248 | if err != nil { 249 | panic(err) 250 | } 251 | return 252 | } 253 | -------------------------------------------------------------------------------- /cmd/mcorr-fit/mcorr/fit.py: -------------------------------------------------------------------------------- 1 | """Infer recombination rates by fitting correlation profile""" 2 | from __future__ import print_function 3 | 4 | import math 5 | import numpy as numpy 6 | from lmfit import Parameters, Minimizer, minimize 7 | from tqdm import tqdm 8 | from . 
import FitRes 9 | 10 | def Power(a, b): 11 | """compute power""" 12 | return a**b 13 | 14 | def zero_r1(x, fBar, phiC, w): 15 | """zero r1 for the no recombination case""" 16 | return numpy.zeros(len(x)) 17 | 18 | def const_r1(x, fBar, phiC, w): 19 | """calculate r1 assuming constant fragment size""" 20 | return numpy.where(x < fBar, w*phiC*x, w*phiC*fBar) 21 | 22 | def exp_r1(x, fBar, phiC, w): 23 | """calculate r1 assuming exponetional decay of fragment size""" 24 | return w*phiC*fBar*(1.0 - numpy.exp(-x/fBar)) 25 | 26 | def geom_r1(x, fBar, phiC, w): 27 | """calculate r1 assuming geom distribution""" 28 | prob = 1.0/fBar 29 | return w*phiC*fBar*(1.0 - numpy.power(1-prob, x)) 30 | 31 | def calcP2(thetaS, r1, r2, ds, a): 32 | """ 33 | calcP2 using expression computed using Mathematica CForm 34 | """ 35 | v = (2*(r2*thetaS + ds*r1*(1 + r1 + r2 + a*thetaS))* \ 36 | (r2*Power(thetaS,2) + Power(ds,2)*(1 + r1 + r2 + a*thetaS)* \ 37 | (2*Power(r1,2) + r2 + 3*r1*r2 + Power(r2,2) + a*(r1 + 2*r2)*thetaS) - \ 38 | ds*thetaS*(2*r2 + Power(r1 + r2,2) + a*(r1 + 3*r2)*thetaS)))/ \ 39 | (Power(r1 + r2,2)*(1 + 2*r1 + r2 + 2*a*thetaS)* \ 40 | (-(thetaS*(r1 - r2 + a*thetaS)) + ds*(2*r1 + a*thetaS)* \ 41 | (1 + r1 + r2 + a*thetaS))) 42 | return v 43 | 44 | def fcn2min(params, xvalues, yvalues, r1_func): 45 | """function 2 min""" 46 | thetaS = params['thetaS'] 47 | phiS = params['phiS'] 48 | f = params['f'] 49 | w = params['w'] 50 | r1 = r1_func(xvalues, f, phiS, w) 51 | r2 = phiS * w * f - r1 52 | ds = params['ds'] 53 | a = params['a'] 54 | p2 = calcP2(thetaS, r1, r2, ds, a) / ds 55 | return p2 - yvalues 56 | 57 | def zerofcn2min(params, xvalues, yvalues): 58 | """function to 2 min for zero recombination case""" 59 | thetaS = params['thetaS'] 60 | a = params['a'] 61 | ds = params['ds'] 62 | d2thetaS = (2*thetaS)/(1+2*thetaS*a) 63 | p2 = d2thetaS*numpy.ones(len(xvalues)) 64 | return p2 - yvalues 65 | 66 | def fit_model(xvalues, yvalues, d_sample, r1_func): 67 | """fitting correlation 
profile using lmfit""" 68 | params1 = Parameters() 69 | params1.add('ds', value=d_sample, vary=False) 70 | params1.add('thetaS', value=0.00001, min=0, max=d_sample) 71 | params1.add('f', value=1000, min=3, max=300000) 72 | ## originally max was 1 73 | params1.add('phiS', value=0.00005, min=0, max=1) 74 | params1.add('w', value=2.0/3.0, vary=False) 75 | params1.add('a', value=4.0/3.0, vary=False) 76 | ##originally thetaP, phiP had no minima 77 | params1.add('thetaP', expr='(ds*(1 + phiS*w*f + a*thetaS)-thetaS)/ \ 78 | ((1 - a*ds)*(phiS*w*f + a*thetaS)-(a*ds))') 79 | params1.add('phiP', expr='phiS*thetaP/thetaS') 80 | params1.add('c', expr='w*phiS*f/(1+w*phiS*f+thetaS*a)') 81 | params1.add('dp', expr='thetaP/(1+a*thetaP)') 82 | params1.add('dc', expr='thetaS/(1+a*thetaS)') 83 | result = minimize(fcn2min, params1, args=(xvalues, yvalues, r1_func), 84 | method="least_squares", max_nfev=int(1e6)) 85 | return result 86 | 87 | def fit_modelopts(xvalues, yvalues, d_sample, r1_func, nefv, fit_method): 88 | """fitting correlation profile using lmfit""" 89 | params1 = Parameters() 90 | params1.add('ds', value=d_sample, vary=False) 91 | params1.add('thetaS', value=0.00001, min=0, max=d_sample) 92 | params1.add('f', value=1000, min=3, max=300000) 93 | ## originally max was 1 94 | params1.add('phiS', value=0.00005, min=0, max=1) 95 | params1.add('w', value=2.0/3.0, vary=False) 96 | params1.add('a', value=4.0/3.0, vary=False) 97 | ##originally thetaP, phiP had no minima 98 | params1.add('thetaP', expr='(ds*(1 + phiS*w*f + a*thetaS)-thetaS)/ \ 99 | ((1 - a*ds)*(phiS*w*f + a*thetaS)-(a*ds))') 100 | params1.add('phiP', expr='phiS*thetaP/thetaS') 101 | params1.add('c', expr='w*phiS*f/(1+w*phiS*f+thetaS*a)') 102 | params1.add('dp', expr='thetaP/(1+a*thetaP)') 103 | params1.add('dc', expr='thetaS/(1+a*thetaS)') 104 | result = minimize(fcn2min, params1, args=(xvalues, yvalues, r1_func), method=fit_method, max_nfev=nefv) 105 | return result 106 | 107 | def fit_zerorecombo(xvalues, 
yvalues, d_sample, nefv, fit_method): 108 | """fitting correlation profile with the 'null recombination model'; this is obsolete """ 109 | params1 = Parameters() 110 | params1.add('ds', value=d_sample, vary=False) 111 | params1.add('thetaS', value=0.00001, min=0) 112 | params1.add('a', value=4.0/3.0, vary=False) 113 | params1.add('dc', expr='thetaS/(1+a*thetaS)') 114 | result = minimize(zerofcn2min, params1, args=(xvalues, yvalues), method=fit_method, max_nfev=nefv) 115 | return result 116 | 117 | def solve_zerorecombo(xvalues, yvalues, d_sample): 118 | """solve the null recombination model exactly""" 119 | d2thetaS = numpy.mean(yvalues) 120 | residuals = numpy.ones(len(yvalues))*d2thetaS - yvalues 121 | chisq = numpy.sum(residuals**2) 122 | ndata = len(xvalues) 123 | red_chisq = chisq/(ndata-1) 124 | if chisq == 0: 125 | aic = -numpy.Inf 126 | else: 127 | aic = ndata*math.log(chisq/ndata)+2*1 128 | a = 4/3 129 | ##equation is classic expression of heterozygosity 130 | thetaS = d_sample/(1-a*d_sample) 131 | dc = thetaS/(1+a*thetaS) 132 | return ndata, residuals, chisq, red_chisq, aic, thetaS, dc 133 | 134 | 135 | def vary_fit(xvalues, yvalues, d_sample, r1_func, f_i, thetaS_i, phiS_i, phiS_max): 136 | """fitting correlation profile using lmfit""" 137 | params1 = Parameters() 138 | params1.add('ds', value=d_sample, vary=False) 139 | params1.add('thetaS', value=thetaS_i, min=0, max=d_sample) 140 | params1.add('f', value=f_i, min=3, max=300000) 141 | ## originally max was 1 142 | params1.add('phiS', value=phiS_i, min=0, max=phiS_max) 143 | params1.add('w', value=2.0/3.0, vary=False) 144 | params1.add('a', value=4.0/3.0, vary=False) 145 | ##originally thetaP, phiP had no minima 146 | params1.add('thetaP', expr='(ds*(1 + phiS*w*f + a*thetaS)-thetaS)/ \ 147 | ((1 - a*ds)*(phiS*w*f + a*thetaS)-(a*ds))') 148 | params1.add('phiP', expr='phiS*thetaP/thetaS') 149 | params1.add('c', expr='w*phiS*f/(1+w*phiS*f+thetaS*a)') 150 | params1.add('dp', expr='thetaP/(1+a*thetaP)') 151 
| params1.add('dc', expr='thetaS/(1+a*thetaS)') 152 | minner1 = Minimizer(fcn2min, params1, fcn_args=(xvalues, yvalues, r1_func)) 153 | try: 154 | fitres1 = minner1.minimize() 155 | except: 156 | fitres1 = None 157 | return fitres1 158 | 159 | def fit_one(fitdata, r1_func): 160 | """Fit one data set""" 161 | xvalues = fitdata.xvalues 162 | yvalues = fitdata.yvalues 163 | dsample = fitdata.d_sample 164 | fitres = fit_model(xvalues, yvalues, dsample, r1_func) 165 | if fitres is not None: 166 | try: 167 | params = fitres.params.valuesdict() 168 | residual = fitres.residual 169 | except ZeroDivisionError as error: 170 | print(error) 171 | return None 172 | return FitRes(fitdata.group, residual, params, dsample) 173 | return None 174 | 175 | def fit_p2(fitdatas, r1_func=const_r1, disable_progress_bar=False): 176 | """Fit p2""" 177 | all_results = [] 178 | for fitdata in tqdm(fitdatas.getall(), disable=disable_progress_bar): 179 | fitres = fit_one(fitdata, r1_func) 180 | if fitres is not None: 181 | all_results.append(fitres) 182 | return all_results 183 | 184 | -------------------------------------------------------------------------------- /cmd/development/FitCollector/old/getNumPairs.go: -------------------------------------------------------------------------------- 1 | package old 2 | 3 | import ( 4 | "bufio" 5 | "encoding/csv" 6 | "encoding/json" 7 | "fmt" 8 | "gonum.org/v1/gonum/stat" 9 | "io" 10 | "io/ioutil" 11 | "log" 12 | "os" 13 | "path/filepath" 14 | "strings" 15 | "sync" 16 | ) 17 | 18 | func main() { 19 | fulljson := "/Volumes/aps_timemachine/recombo/APS160.5_lmfit/cluster8_cluster221/cluster8_cluster221_CORE_XMFA_OUT.json" 20 | j, err := os.Open(fulljson) 21 | if err != nil { 22 | fmt.Println(err) 23 | } 24 | defer j.Close() 25 | 26 | //initialize the Genes array 27 | var genes CorrResults 28 | //initialize the number of pairs 29 | N := 0 30 | //initialize the number of genes 31 | numGenes := 0 32 | r := bufio.NewReader(j) 33 | for { 34 | line, err := 
r.ReadString('\n') 35 | // read our opened xmlFile as a byte array. 36 | byteValue := []byte(line) 37 | //unmarshall into genes 38 | json.Unmarshal(byteValue, &genes) 39 | //get N for l = 0 40 | if len(genes.Results) > 0 { 41 | //fmt.Println("N: ", genes.Results[0].N) 42 | N = N + genes.Results[0].N 43 | numGenes++ 44 | //if N != 0{ 45 | // numGenes++ 46 | //} 47 | } 48 | if err != nil { 49 | if err != io.EOF { 50 | log.Fatalf("Error when reading file %s: %v", j, err) 51 | } 52 | break 53 | } 54 | } 55 | avgPairs := N / numGenes 56 | fmt.Println("Average number of pairs: ", avgPairs) 57 | fmt.Println(N) 58 | fmt.Println(numGenes) 59 | 60 | done := make(chan struct{}) 61 | defer close(done) 62 | 63 | root := "/Volumes/aps_timemachine/recombo/APS160.5_lmfit" 64 | jsonSuffix := "_XMFA_OUT.json" 65 | lmfitSuffix := "_XMFA_OUT_lmfit_report.csv" 66 | numDigesters := 4 67 | clusterDirs := makeDirList(root) 68 | //make a channel for cluster directories which closes when we're out of them 69 | clusters := clusterFileChan(done, root, clusterDirs, jsonSuffix, lmfitSuffix) 70 | //start a fixed number of goroutines to send results on 71 | //make a results channel 72 | resChan := make(chan result) 73 | var wg sync.WaitGroup 74 | for i := 0; i < numDigesters; i++ { 75 | wg.Add(1) 76 | go digester(done, clusters, resChan, i, &wg) 77 | } 78 | go func() { 79 | wg.Wait() 80 | close(resChan) 81 | }() 82 | //end of pipeline 83 | //for r := range resChan { 84 | // fmt.Printf("num pairs is %f+/-%f for %s %s with %d genes\n", r.numPairs, r.StDev, r.ID, r.genome, r.numGenes) 85 | // fmt.Printf("fit values for %s %s: %s\n", r.ID, r.genome, r.fitOut) 86 | //} 87 | 88 | writeCSV(resChan, root, "blooo.csv") 89 | 90 | } 91 | 92 | //clusterFiles is a struct for filepaths for all files associated with a cluster or pair of clusters 93 | type clusterFiles struct { 94 | ID string //cluster ID 95 | genome string //type of genome 96 | json string //json with gene corr profiles 97 | lmfitOut string 
//lmfit output csv 98 | } 99 | 100 | // result struct 101 | type result struct { 102 | ID string //cluster ID 103 | genome string //type of genome 104 | numPairs float64 //average number of pairs 105 | StDev float64 //St Dev of number of pairs 106 | numGenes int //number of genes 107 | fitOut []string //values from fitting 108 | } 109 | 110 | //Genes struct which contains 111 | // 112 | type Genes struct { 113 | Genes []CorrResults //'json:gene' 114 | } 115 | 116 | // CorrResults stores a list of CorrResult with an gene ID. 117 | type CorrResults struct { 118 | ID string 119 | Results []CorrResult 120 | } 121 | 122 | // CorrResult stores a correlation result. 123 | type CorrResult struct { 124 | Lag int 125 | Mean float64 126 | Variance float64 127 | N int 128 | Type string 129 | } 130 | 131 | // makeDirList make a list of cluster directories 132 | func makeDirList(root string) (DirList []string) { 133 | c, err := ioutil.ReadDir(root) 134 | if err != nil { 135 | panic(err) 136 | } 137 | for _, entry := range c { 138 | if entry.IsDir() && strings.HasPrefix(entry.Name(), "cluster") { 139 | DirList = append(DirList, entry.Name()) 140 | } 141 | } 142 | return DirList 143 | } 144 | 145 | //clusterFileChan returns a channel of clusterFiles 146 | func clusterFileChan(done <-chan struct{}, root string, DirList []string, jsonSuffix string, lmfitSuffix string) <-chan clusterFiles { 147 | clusterFileChan := make(chan clusterFiles) 148 | 149 | go func() { 150 | defer close(clusterFileChan) 151 | for _, d := range DirList { 152 | //define the flex files 153 | flexJson := d + "_FLEX" + jsonSuffix 154 | json := filepath.Join(root, d, flexJson) 155 | flexLmfit := d + "_FLEX" + lmfitSuffix 156 | lmfit := filepath.Join(root, d, flexLmfit) 157 | flex := clusterFiles{d, "FLEX", json, lmfit} 158 | //define the core files 159 | coreJson := d + "_CORE" + jsonSuffix 160 | json = filepath.Join(root, d, coreJson) 161 | coreLmfit := d + "_CORE" + lmfitSuffix 162 | lmfit = filepath.Join(root, 
d, coreLmfit) 163 | core := clusterFiles{d, "CORE", json, lmfit} 164 | 165 | cGenomes := []clusterFiles{core, flex} 166 | for _, c := range cGenomes { 167 | select { 168 | case clusterFileChan <- c: 169 | case <-done: 170 | return 171 | } 172 | } 173 | } 174 | }() 175 | return clusterFileChan 176 | } 177 | 178 | func getNumPairs(cluster clusterFiles) (avgPairs float64) { 179 | j, err := os.Open(cluster.json) 180 | if err != nil { 181 | fmt.Println(err) 182 | } 183 | defer j.Close() 184 | 185 | //initialize the Genes array 186 | var genes CorrResults 187 | //initialize the number of pairs 188 | N := 0 189 | //initialize the number of genes 190 | numGenes := 0 191 | r := bufio.NewReader(j) 192 | for { 193 | line, err := r.ReadString('\n') 194 | // read our opened xmlFile as a byte array. 195 | byteValue := []byte(line) 196 | //unmarshall into genes 197 | json.Unmarshal(byteValue, &genes) 198 | //get N for l = 0 199 | if len(genes.Results) > 0 { 200 | //fmt.Println("N: ", genes.Results[0].N) 201 | N = N + genes.Results[0].N 202 | numGenes++ 203 | //if N != 0{ 204 | // numGenes++ 205 | //} 206 | } 207 | if err != nil { 208 | if err != io.EOF { 209 | log.Fatalf("Error when reading file %s: %v", j, err) 210 | } 211 | break 212 | } 213 | } 214 | avgPairs = float64(N) / float64(numGenes) 215 | return avgPairs 216 | } 217 | 218 | func calcMeanStDev(cluster clusterFiles) (MeanVariance []float64, numGenes int) { 219 | j, err := os.Open(cluster.json) 220 | if err != nil { 221 | fmt.Println(err) 222 | } 223 | defer j.Close() 224 | 225 | //initialize the Genes array 226 | var genes CorrResults 227 | //initialize the num of pairs array 228 | var pairs []float64 229 | //initialize the number of genes 230 | numGenes = 0 231 | r := bufio.NewReader(j) 232 | for { 233 | line, err := r.ReadString('\n') 234 | if err != nil { 235 | if err != io.EOF { 236 | log.Fatalf("Error when reading file %s: %v", j, err) 237 | } 238 | break 239 | } 240 | // read our opened xmlFile as a byte array. 
241 | byteValue := []byte(line) 242 | //unmarshall into genes 243 | json.Unmarshal(byteValue, &genes) 244 | //get N for l = 0 245 | if len(genes.Results) > 0 { 246 | pairs = append(pairs, float64(genes.Results[0].N)) 247 | numGenes++ 248 | } 249 | } 250 | m, v := stat.MeanStdDev(pairs, nil) 251 | MeanVariance = []float64{m, v} 252 | numGenes = len(pairs) 253 | return MeanVariance, numGenes 254 | } 255 | 256 | //digester aligns genomes and writes them to FASTA files until either SRAchan or done is closed 257 | func digester(done <-chan struct{}, FileChan <-chan clusterFiles, c chan<- result, id int, wg *sync.WaitGroup) { 258 | defer wg.Done() 259 | fmt.Printf("Digester %d starting\n", id) 260 | for cluster := range FileChan { 261 | MeanStDev, numGenes := calcMeanStDev(cluster) 262 | fitOut := getFitOut(cluster) 263 | select { 264 | case c <- result{cluster.ID, cluster.genome, 265 | MeanStDev[0], MeanStDev[1], numGenes, fitOut}: 266 | case <-done: 267 | return 268 | } 269 | } 270 | fmt.Printf("Digester %d done\n", id) 271 | } 272 | 273 | func getFitOut(cluster clusterFiles) (fitOut []string) { 274 | l, err := os.Open(cluster.lmfitOut) 275 | if err != nil { 276 | fmt.Println(err) 277 | } 278 | defer l.Close() 279 | 280 | reader := csv.NewReader(l) 281 | reader.FieldsPerRecord = -1 282 | i := 0 283 | for { 284 | record, err := reader.Read() 285 | if err == io.EOF { 286 | break 287 | } 288 | if err != nil { 289 | panic(err) 290 | } 291 | if i == 10 { 292 | fitOut = record 293 | break 294 | } 295 | i++ 296 | } 297 | return fitOut 298 | } 299 | 300 | func writeCSV(resChan chan result, root string, outName string) { 301 | path := filepath.Join(root, outName) 302 | recordFile, err := os.Create(path) 303 | if err != nil { 304 | fmt.Println("Error while creating the output csv ::", err) 305 | return 306 | } 307 | defer recordFile.Close() 308 | // Initialize the writer 309 | writer := csv.NewWriter(recordFile) 310 | defer writer.Flush() 311 | //write header 312 | header := 
[]string{"ID", "genome", "avg_pairs", "stdev_pairs", "num_genes", 313 | "ds", "thetaS", "f", "phiS", "thetaP", 314 | "phiP", "c", "dp", "dc", "chisq", "red-chisq"} 315 | err = writer.Write(header) 316 | for r := range resChan { 317 | // Write all the records 318 | var line []string 319 | mean := fmt.Sprintf("%f", r.numPairs) 320 | StDev := fmt.Sprintf("%f", r.StDev) 321 | numGenes := fmt.Sprintf("%d", r.numGenes) 322 | line = append(line, r.ID, r.genome, mean, StDev, numGenes) 323 | line = append(line, r.fitOut...) 324 | err = writer.Write(line) 325 | if err != nil { 326 | fmt.Println("Error while writing to the file ::", err) 327 | return 328 | } 329 | } 330 | } 331 | -------------------------------------------------------------------------------- /cmd/development/mcorr-pair/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "log" 8 | "os" 9 | "runtime" 10 | "strings" 11 | "time" 12 | 13 | "github.com/kussell-lab/biogo/seq" 14 | "github.com/kussell-lab/mcorr" 15 | "github.com/kussell-lab/ncbiftp/taxonomy" 16 | "gopkg.in/alecthomas/kingpin.v2" 17 | ) 18 | 19 | func main() { 20 | app := kingpin.New("mcorr-pair", "Calculate mutation correlation for each pair of isolates") 21 | app.Version("v20170728") 22 | 23 | alnFile := app.Arg("in", "Alignment file in XMFA format").Required().String() 24 | outFile := app.Arg("out", "Output file in CSV format").Required().String() 25 | 26 | mateFile := app.Flag("second-alignment", "Second alignment file in XMFA format").Default("").String() 27 | maxl := app.Flag("max-corr-length", "Maximum length of correlation (base pairs)").Default("300").Int() 28 | ncpu := app.Flag("num-cpu", "Number of CPUs (default: using all available cores)").Default("0").Int() 29 | codonPos := app.Flag("codon-position", "Codon position (1: first codon position; 2: second codon position; 3: third codon position; 4: synonymous at third codon 
position.").Default("4").Int() 30 | kingpin.MustParse(app.Parse(os.Args[1:])) 31 | //timer 32 | 33 | start := time.Now() 34 | 35 | if *ncpu == 0 { 36 | *ncpu = runtime.NumCPU() 37 | } 38 | runtime.GOMAXPROCS(*ncpu) 39 | 40 | synonymous := true 41 | if *codonPos == 4 { 42 | synonymous = true 43 | *codonPos = 3 44 | } 45 | if *codonPos <= 0 || *codonPos > 4 { 46 | log.Fatalln("--codon-position should be in the range of 1 to 4.") 47 | } 48 | 49 | var mateMap map[string]*seq.Sequence 50 | if *mateFile != "" { 51 | f, err := os.Open(*mateFile) 52 | if err != nil { 53 | panic(err) 54 | } 55 | rd := seq.NewFastaReader(f) 56 | sequences, err := rd.ReadAll() 57 | if err != nil { 58 | panic(err) 59 | } 60 | 61 | mateMap = make(map[string]*seq.Sequence) 62 | for _, s := range sequences { 63 | geneid := strings.Split(s.Id, " ")[0] 64 | mateMap[geneid] = s 65 | } 66 | 67 | f.Close() 68 | } 69 | 70 | //special error log -- if you want to log to a text file 71 | // If the file doesn't exist, create it or append to the file 72 | //file, err := os.OpenFile("201030-1538-mcp_logs.txt", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0666) 73 | //if err != nil { 74 | // log.Fatal(err) 75 | //} 76 | //log.SetOutput(file) 77 | 78 | alnChan := readAlignments(*alnFile) 79 | 80 | codingTable := taxonomy.GeneticCodes()["11"] 81 | maxCodonLen := *maxl / 3 82 | codonOffset := 0 83 | 84 | numJob := *ncpu 85 | done := make(chan bool) 86 | resChan := make(chan mcorr.CorrResults) 87 | for i := 0; i < numJob; i++ { 88 | go func() { 89 | for aln := range alnChan { 90 | var mateSequence *seq.Sequence 91 | if mateMap != nil { 92 | geneid, _ := getNames(aln.Sequences[0].Id) 93 | s, found := mateMap[geneid] 94 | if found { 95 | mateSequence = s 96 | } 97 | } 98 | corrRes := calcP2Coding(aln, codonOffset, maxCodonLen, codingTable, synonymous, *codonPos-1, mateSequence) 99 | for _, res := range corrRes { 100 | resChan <- res 101 | } 102 | } 103 | done <- true 104 | }() 105 | } 106 | 107 | go func() { 108 | defer 
close(resChan) 109 | for i := 0; i < numJob; i++ { 110 | <-done 111 | } 112 | }() 113 | 114 | CollectWrite(resChan, *outFile) 115 | //time it 116 | duration := time.Since(start) 117 | fmt.Println(duration) 118 | } 119 | 120 | // Alignment is an array of multiple sequences with same length 121 | type Alignment struct { 122 | ID string 123 | Sequences []seq.Sequence 124 | } 125 | 126 | // readAlignments reads sequence alignment from a extended Multi-FASTA file, 127 | // and return a channel of alignment, which is a list of seq.Sequence 128 | func readAlignments(file string) (alnChan chan Alignment) { 129 | alnChan = make(chan Alignment) 130 | read := func() { 131 | defer close(alnChan) 132 | 133 | f, err := os.Open(file) 134 | if err != nil { 135 | panic(err) 136 | } 137 | defer f.Close() 138 | xmfaReader := seq.NewXMFAReader(f) 139 | numAln := 0 140 | for { 141 | alignment, err := xmfaReader.Read() 142 | if err != nil { 143 | if err != io.EOF { 144 | panic(err) 145 | } 146 | break 147 | } 148 | if len(alignment) > 0 { 149 | numAln++ 150 | alnID := strings.Split(alignment[0].Id, " ")[0] 151 | alnChan <- Alignment{ID: alnID, Sequences: alignment} 152 | fmt.Printf("\rRead %d alignments.", numAln) 153 | fmt.Printf("\r alignment ID: %s", alnID) 154 | } 155 | } 156 | fmt.Printf(" Total alignments %d\n", numAln) 157 | } 158 | go read() 159 | return 160 | } 161 | 162 | func calcP2Coding(aln Alignment, codonOffset int, maxCodonLen int, codingTable *taxonomy.GeneticCode, synonymous bool, codonPos int, mateSequence *seq.Sequence) (results []mcorr.CorrResults) { 163 | codonSequences := [][]Codon{} 164 | sequences := []seq.Sequence{} 165 | if mateSequence != nil { 166 | sequences = append(sequences, *mateSequence) 167 | } 168 | sequences = append(sequences, aln.Sequences...) 
169 | for _, s := range sequences { 170 | codons := extractCodons(s, codonOffset) 171 | codonSequences = append(codonSequences, codons) 172 | } 173 | 174 | for i, seq1 := range codonSequences { 175 | for j := i + 1; j < len(codonSequences); j++ { 176 | _, genomeName1 := getNames(aln.Sequences[i].Id) 177 | _, genomeName2 := getNames(aln.Sequences[j].Id) 178 | if genomeName1 > genomeName2 { 179 | genomeName1, genomeName2 = genomeName2, genomeName1 180 | } 181 | //error log 182 | //log.Println(aln.ID) 183 | 184 | id := genomeName1 + "_vs_" + genomeName2 185 | seq2 := codonSequences[j] 186 | crRes := mcorr.CorrResults{ID: id} 187 | for l := 0; l < maxCodonLen; l++ { 188 | d := 0.0 189 | t := 0 190 | for k := 0; k < len(seq1)-l; k++ { 191 | c1 := seq1[k] 192 | c2 := seq2[k] 193 | a1, found1 := codingTable.Table[string(c1)] 194 | a2, found2 := codingTable.Table[string(c2)] 195 | if found1 && found2 && a1 == a2 { 196 | b1 := seq1[k+l] 197 | b2 := seq2[k+l] 198 | 199 | good := true 200 | if synonymous { 201 | //this is a mistake -- i think it should be b1 and b2 here 202 | //d1, found1 := codingTable.Table[string(c1)] 203 | //d2, found2 := codingTable.Table[string(c2)] 204 | d1, found1 := codingTable.Table[string(b1)] 205 | d2, found2 := codingTable.Table[string(b2)] 206 | if found1 && found2 && d1 == d2 { 207 | good = true 208 | } else { 209 | good = false 210 | } 211 | } 212 | if good { 213 | var codonPositions []int 214 | if codonPos < 0 || codonPos > 2 { 215 | codonPositions = []int{0, 1, 2} 216 | } else { 217 | codonPositions = append(codonPositions, codonPos) 218 | } 219 | for _, codonP := range codonPositions { 220 | if c1[codonP] != c2[codonP] { 221 | if b1[codonP] != b2[codonP] { 222 | d++ 223 | } 224 | } 225 | t++ 226 | } 227 | } 228 | } 229 | } 230 | cr := mcorr.CorrResult{} 231 | cr.Lag = l * 3 232 | cr.Mean = d / float64(t) 233 | cr.N = t 234 | //error log 235 | //log.Printf("%s %d", aln.ID, t) 236 | cr.Type = "P2" 237 | crRes.Results = append(crRes.Results, 
cr) 238 | } 239 | results = append(results, crRes) 240 | } 241 | if mateSequence != nil { 242 | break 243 | } 244 | } 245 | 246 | return 247 | } 248 | 249 | // Codon is a byte list of length 3 250 | type Codon []byte 251 | 252 | // CodonSequence is a sequence of codons. 253 | type CodonSequence []Codon 254 | 255 | // CodonPair is a pair of Codons. 256 | type CodonPair struct { 257 | A, B Codon 258 | } 259 | 260 | // extractCodons return a list of codons from a DNA sequence. 261 | func extractCodons(s seq.Sequence, offset int) (codons []Codon) { 262 | for i := offset; i+3 <= len(s.Seq); i += 3 { 263 | c := s.Seq[i:(i + 3)] 264 | codons = append(codons, c) 265 | } 266 | return 267 | } 268 | 269 | // countAlignments return total number of alignments in a file. 270 | func countAlignments(file string) (count int) { 271 | f, err := os.Open(file) 272 | if err != nil { 273 | panic(err) 274 | } 275 | defer f.Close() 276 | rd := bufio.NewReader(f) 277 | for { 278 | line, err := rd.ReadString('\n') 279 | if err != nil { 280 | if err != io.EOF { 281 | panic(err) 282 | } 283 | break 284 | } 285 | if line[0] == '=' { 286 | count++ 287 | } 288 | } 289 | return 290 | } 291 | 292 | func getNames(s string) (geneName, genomeName string) { 293 | terms := strings.Split(s, " ") 294 | //this is for the helicobacter test files 295 | //geneName = terms[0] 296 | //genomeName = terms[1] 297 | //this is the genomeName for the MSA files assembled from ReferenceAlignmentGenerator 298 | geneName = terms[0] 299 | genomeName = terms[2] 300 | return 301 | } 302 | 303 | // CollectWrite collects and writes the correlation results. 304 | func CollectWrite(corrResChan chan mcorr.CorrResults, outFile string) { 305 | // prepare bootstrappers. 
306 | bootstraps := make(map[string]*mcorr.Bootstrap) 307 | notBootstrap := mcorr.NewBootstrap("all", 1.0) 308 | notBootstrap.SetRandom(false) 309 | bootstraps["all"] = notBootstrap 310 | 311 | for corrResults := range corrResChan { 312 | id := corrResults.ID 313 | if _, found := bootstraps[id]; !found { 314 | bootstraps[id] = mcorr.NewBootstrap(id, 1.0) 315 | bootstraps[id].SetRandom(false) 316 | } 317 | bootstraps[id].Add(corrResults) 318 | bootstraps["all"].Add(corrResults) 319 | } 320 | 321 | w, err := os.Create(outFile) 322 | if err != nil { 323 | panic(err) 324 | } 325 | defer w.Close() 326 | 327 | w.WriteString("l,m,v,n,t,b\n") 328 | for _, bs := range bootstraps { 329 | results := bs.Results() 330 | for _, res := range results { 331 | w.WriteString(fmt.Sprintf("%d,%g,%g,%d,%s,%s\n", res.Lag, res.Mean, res.Variance, res.N, res.Type, bs.ID)) 332 | } 333 | } 334 | } 335 | -------------------------------------------------------------------------------- /cmd/development/FitCollector/FitCollector.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "encoding/csv" 6 | "encoding/json" 7 | "fmt" 8 | "gonum.org/v1/gonum/stat" 9 | "gopkg.in/alecthomas/kingpin.v2" 10 | "io" 11 | "io/ioutil" 12 | "log" 13 | "os" 14 | "path/filepath" 15 | "runtime" 16 | "strings" 17 | "sync" 18 | "time" 19 | ) 20 | 21 | func main() { 22 | 23 | app := kingpin.New("FitCollector", "Collect results from mcorr-fit or fitCorr for many sequence clusters") 24 | app.Version("v20210126") 25 | root := app.Arg("root", "root directory containing cluster out folders").Required().String() 26 | jsonSuffix := app.Flag("jsonSuffix", "suffix for json files").Default("_XMFA_OUT.json").String() 27 | lmfitSuffix := app.Flag("lmfitSuffix", "suffix for lmfit report output file").Default("_XMFA_OUT_lmfit_report.csv").String() 28 | ncpu := app.Flag("numCpu", "Number of CPUs (default: using all available cores)").Default("0").Int() 
29 | numDigesters := app.Flag("numDigesters", "Number threads fetching results").Default("20").Int() 30 | out := app.Flag("outCsv", "prefix for output csv").Default("mcorrfit_results").String() 31 | kingpin.MustParse(app.Parse(os.Args[1:])) 32 | 33 | if *ncpu <= 0 { 34 | *ncpu = runtime.NumCPU() 35 | } 36 | runtime.GOMAXPROCS(*ncpu) 37 | 38 | //timer 39 | start := time.Now() 40 | timeStamp := fmt.Sprintf(start.Format("060102_1504")) 41 | done := make(chan struct{}) 42 | defer close(done) 43 | 44 | //root := "/Volumes/aps_timemachine/recombo/APS160.5_lmfit" 45 | //jsonSuffix := "_XMFA_OUT.json" 46 | //lmfitSuffix := "_XMFA_OUT_lmfit_report.csv" 47 | //numDigesters := 4 48 | //out := "blarrrrgh" 49 | clusterDirs := makeDirList(*root) 50 | //make a channel for cluster directories which closes when we're out of them 51 | failOut := timeStamp + "_lmfitfailed.csv" 52 | initFailOut(*root, failOut) 53 | clusters := clusterFileChan(done, *root, clusterDirs, *jsonSuffix, *lmfitSuffix, failOut) 54 | //start a fixed number of goroutines to send results on 55 | //make a results channel 56 | resChan := make(chan result) 57 | var wg sync.WaitGroup 58 | for i := 0; i < *numDigesters; i++ { 59 | wg.Add(1) 60 | go digester(done, clusters, resChan, i, &wg) 61 | } 62 | go func() { 63 | wg.Wait() 64 | close(resChan) 65 | }() 66 | //end of pipeline 67 | 68 | outCsv := timeStamp + "_" + *out + ".csv" 69 | writeCSV(resChan, *root, outCsv) 70 | 71 | //writeFitFails(fitFailed, root, failOut) 72 | 73 | duration := time.Since(start) 74 | fmt.Println("Time to collect results:", duration) 75 | 76 | } 77 | 78 | //clusterFiles is a struct for filepaths for all files associated with a cluster or pair of clusters 79 | type clusterFiles struct { 80 | ID string //cluster ID 81 | genome string //type of genome 82 | json string //json with gene corr profiles 83 | lmfitOut string //lmfit output csv 84 | } 85 | 86 | // result struct 87 | type result struct { 88 | ID string //cluster ID 89 | genome string 
//type of genome 90 | numPairs float64 //average number of pairs 91 | StDev float64 //St Dev of number of pairs 92 | numGenes int //number of genes 93 | fitOut []string //values from fitting 94 | } 95 | 96 | //Genes struct which contains 97 | // 98 | type Genes struct { 99 | Genes []CorrResults //'json:gene' 100 | } 101 | 102 | // CorrResults stores a list of CorrResult with an gene ID. 103 | type CorrResults struct { 104 | ID string 105 | Results []CorrResult 106 | } 107 | 108 | // CorrResult stores a correlation result. 109 | type CorrResult struct { 110 | Lag int 111 | Mean float64 112 | Variance float64 113 | N int 114 | Type string 115 | } 116 | 117 | // makeDirList make a list of cluster directories 118 | func makeDirList(root string) (DirList []string) { 119 | c, err := ioutil.ReadDir(root) 120 | if err != nil { 121 | panic(err) 122 | } 123 | for _, entry := range c { 124 | if entry.IsDir() && strings.HasPrefix(entry.Name(), "cluster") { 125 | DirList = append(DirList, entry.Name()) 126 | } 127 | } 128 | return DirList 129 | } 130 | 131 | //clusterFileChan returns a channel of clusterFiles and a list of clusters where lmfit failed 132 | func clusterFileChan(done <-chan struct{}, root string, DirList []string, jsonSuffix string, lmfitSuffix string, failOut string) <-chan clusterFiles { 133 | clusterFileChan := make(chan clusterFiles) 134 | //var lmfitFailed []clusterFiles 135 | go func() { 136 | defer close(clusterFileChan) 137 | for _, d := range DirList { 138 | //define the core and flex files 139 | core, flex := makeClusterFiles(root, d, jsonSuffix, lmfitSuffix) 140 | // var cGenomes []clusterFiles 141 | //if checkLmfit(core) { 142 | // cGenomes = append(cGenomes, core) 143 | //} else { 144 | // lmfitFailed = append(lmfitFailed, core) 145 | //} 146 | //if checkLmfit(flex) { 147 | // cGenomes = append(cGenomes, flex) 148 | //} else { 149 | // lmfitFailed = append(lmfitFailed, flex) 150 | //} 151 | cGenomes := []clusterFiles{core, flex} 152 | for _, c := 
range cGenomes { 153 | if !checkLmfit(c) { 154 | writeFitFail(c, root, failOut) 155 | continue 156 | } 157 | select { 158 | case clusterFileChan <- c: 159 | case <-done: 160 | return 161 | } 162 | } 163 | } 164 | }() 165 | return clusterFileChan 166 | } 167 | 168 | //makeClusterFiles returns ClusterFiles for core and flexible genomes 169 | func makeClusterFiles(root string, d string, jsonSuffix string, lmfitSuffix string) (core clusterFiles, flex clusterFiles) { 170 | //define the flex files 171 | flexJson := d + "_FLEX" + jsonSuffix 172 | json := filepath.Join(root, d, flexJson) 173 | flexLmfit := d + "_FLEX" + lmfitSuffix 174 | lmfit := filepath.Join(root, d, flexLmfit) 175 | flex = clusterFiles{d, "FLEX", json, lmfit} 176 | //define the core files 177 | coreJson := d + "_CORE" + jsonSuffix 178 | json = filepath.Join(root, d, coreJson) 179 | coreLmfit := d + "_CORE" + lmfitSuffix 180 | lmfit = filepath.Join(root, d, coreLmfit) 181 | core = clusterFiles{d, "CORE", json, lmfit} 182 | return core, flex 183 | } 184 | 185 | //checkLmfit check to see if lmfit completed for the cluster 186 | func checkLmfit(c clusterFiles) bool { 187 | _, err := os.Stat(c.lmfitOut) 188 | if os.IsNotExist(err) { 189 | return false 190 | } else { 191 | return true 192 | } 193 | } 194 | 195 | func getNumPairs(cluster clusterFiles) (avgPairs float64) { 196 | j, err := os.Open(cluster.json) 197 | if err != nil { 198 | fmt.Println(err) 199 | } 200 | defer j.Close() 201 | 202 | //initialize the Genes array 203 | var genes CorrResults 204 | //initialize the number of pairs 205 | N := 0 206 | //initialize the number of genes 207 | numGenes := 0 208 | r := bufio.NewReader(j) 209 | for { 210 | line, err := r.ReadString('\n') 211 | // read our opened xmlFile as a byte array. 
212 | byteValue := []byte(line) 213 | //unmarshall into genes 214 | json.Unmarshal(byteValue, &genes) 215 | //get N for l = 0 216 | if len(genes.Results) > 0 { 217 | //fmt.Println("N: ", genes.Results[0].N) 218 | N = N + genes.Results[0].N 219 | numGenes++ 220 | //if N != 0{ 221 | // numGenes++ 222 | //} 223 | } 224 | if err != nil { 225 | if err != io.EOF { 226 | log.Fatalf("Error when reading file %s: %v", j, err) 227 | } 228 | break 229 | } 230 | } 231 | avgPairs = float64(N) / float64(numGenes) 232 | return avgPairs 233 | } 234 | 235 | func calcMeanStDev(cluster clusterFiles) (MeanVariance []float64, numGenes int) { 236 | j, err := os.Open(cluster.json) 237 | if err != nil { 238 | fmt.Println(err) 239 | } 240 | defer j.Close() 241 | 242 | //initialize the Genes array 243 | var genes CorrResults 244 | //initialize the num of pairs array 245 | var pairs []float64 246 | r := bufio.NewReader(j) 247 | for { 248 | line, err := r.ReadString('\n') 249 | if err != nil { 250 | if err != io.EOF { 251 | log.Fatalf("Error when reading file %s: %v", j, err) 252 | } 253 | break 254 | } 255 | // read our opened xmlFile as a byte array. 
256 | byteValue := []byte(line) 257 | //unmarshall into genes 258 | json.Unmarshal(byteValue, &genes) 259 | //get N for l = 0 260 | if len(genes.Results) > 0 { 261 | pairs = append(pairs, float64(genes.Results[0].N)) 262 | } 263 | } 264 | m, v := stat.MeanStdDev(pairs, nil) 265 | MeanVariance = []float64{m, v} 266 | numGenes = len(pairs) 267 | return MeanVariance, numGenes 268 | } 269 | 270 | //digester aligns genomes and writes them to FASTA files until either SRAchan or done is closed 271 | func digester(done <-chan struct{}, FileChan <-chan clusterFiles, c chan<- result, id int, wg *sync.WaitGroup) { 272 | defer wg.Done() 273 | fmt.Printf("Digester %d starting\n", id) 274 | for cluster := range FileChan { 275 | MeanStDev, numGenes := calcMeanStDev(cluster) 276 | fitOut := getFitOut(cluster) 277 | select { 278 | case c <- result{cluster.ID, cluster.genome, 279 | MeanStDev[0], MeanStDev[1], numGenes, fitOut}: 280 | case <-done: 281 | return 282 | } 283 | } 284 | fmt.Printf("Digester %d done\n", id) 285 | } 286 | 287 | func getFitOut(cluster clusterFiles) (fitOut []string) { 288 | l, err := os.Open(cluster.lmfitOut) 289 | if err != nil { 290 | fmt.Println(err) 291 | } 292 | defer l.Close() 293 | 294 | reader := csv.NewReader(l) 295 | reader.FieldsPerRecord = -1 296 | i := 0 297 | Loop: 298 | for { 299 | record, err := reader.Read() 300 | if err != nil { 301 | if err == io.EOF { 302 | break 303 | } else { 304 | panic(err) 305 | } 306 | } 307 | switch i { 308 | //did the fit succeed 309 | case 0: 310 | fitOut = append(fitOut, record[1]) 311 | //get the number of function evals 312 | case 1: 313 | fitOut = append(fitOut, record[1]) 314 | //get number of datapoints evaluated 315 | case 2: 316 | fitOut = append(fitOut, record[1]) 317 | case 10: 318 | fitOut = append(fitOut, record...) 
319 | break Loop 320 | } 321 | i++ 322 | //if err == io.EOF { 323 | // break 324 | //} 325 | //if err != nil { 326 | // panic(err) 327 | //} 328 | ////get number of data points 329 | //if i == 2 { 330 | // fitOut = append(fitOut, record[1]) 331 | //} 332 | //if i == 10 { 333 | // fitOut = append(fitOut, record...) 334 | // break Loop 335 | //} 336 | //i++ 337 | } 338 | return fitOut 339 | } 340 | 341 | func writeCSV(resChan chan result, root string, outName string) { 342 | path := filepath.Join(root, outName) 343 | recordFile, err := os.Create(path) 344 | if err != nil { 345 | fmt.Println("Error while creating the output csv ::", err) 346 | return 347 | } 348 | defer recordFile.Close() 349 | // Initialize the writer 350 | writer := csv.NewWriter(recordFile) 351 | defer writer.Flush() 352 | //write header 353 | header := []string{"ID", "genome", "avg_pairs", "stdev_pairs", "num_genes", 354 | "fit_success", "nefv", 355 | "datapoints", "ds", "thetaS", "f", "phiS", "thetaP", 356 | "phiP", "c", "dp", "dc", "chisq", "red-chisq"} 357 | err = writer.Write(header) 358 | for r := range resChan { 359 | // Write all the records 360 | var line []string 361 | mean := fmt.Sprintf("%f", r.numPairs) 362 | StDev := fmt.Sprintf("%f", r.StDev) 363 | numGenes := fmt.Sprintf("%d", r.numGenes) 364 | line = append(line, r.ID, r.genome, mean, StDev, numGenes) 365 | line = append(line, r.fitOut...) 
366 | err = writer.Write(line) 367 | if err != nil { 368 | fmt.Println("Error while writing to the file ::", err) 369 | return 370 | } 371 | } 372 | } 373 | 374 | func writeFitFails(fitFailed []clusterFiles, root string, outName string) { 375 | path := filepath.Join(root, outName) 376 | recordFile, err := os.Create(path) 377 | if err != nil { 378 | fmt.Println("Error while creating the output csv ::", err) 379 | return 380 | } 381 | defer recordFile.Close() 382 | // Initialize the writer 383 | writer := csv.NewWriter(recordFile) 384 | defer writer.Flush() 385 | //write header 386 | header := []string{"ID", "genome"} 387 | err = writer.Write(header) 388 | for _, c := range fitFailed { 389 | // Write all the records 390 | var line []string 391 | line = append(line, c.ID, c.genome) 392 | err = writer.Write(line) 393 | if err != nil { 394 | fmt.Println("Error while writing to the file ::", err) 395 | return 396 | } 397 | } 398 | } 399 | 400 | func writeFitFail(c clusterFiles, root string, outName string) { 401 | path := filepath.Join(root, outName) 402 | recordFile, err := os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0644) 403 | if err != nil { 404 | fmt.Println("Error while writing to fail output ::", err) 405 | return 406 | } 407 | defer recordFile.Close() 408 | // Initialize the writer 409 | writer := csv.NewWriter(recordFile) 410 | defer writer.Flush() 411 | // Write all the records 412 | var line []string 413 | line = append(line, c.ID, c.genome) 414 | err = writer.Write(line) 415 | if err != nil { 416 | fmt.Println("Error while writing to the file ::", err) 417 | return 418 | } 419 | } 420 | 421 | func initFailOut(root string, outName string) { 422 | path := filepath.Join(root, outName) 423 | recordFile, err := os.Create(path) 424 | if err != nil { 425 | fmt.Println("Error while creating the fail output ::", err) 426 | return 427 | } 428 | defer recordFile.Close() 429 | // Initialize the writer 430 | writer := csv.NewWriter(recordFile) 431 | defer writer.Flush() 
	//write header
	header := []string{"ID", "genome"}
	err = writer.Write(header)
	if err != nil {
		fmt.Println("Error while writing header for failure output ::", err)
		return
	}
}
--------------------------------------------------------------------------------
/cmd/mcorr-bam/main.go:
--------------------------------------------------------------------------------
// Calculate correlation functions (P2 and P4) from read mapping results.
package main

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"log"
	"math/rand"
	"os"
	"runtime"
	"sort"
	"strings"

	"github.com/biogo/hts/sam"
	"github.com/kussell-lab/biogo/seq"
	"github.com/kussell-lab/mcorr"
	"github.com/kussell-lab/ncbiftp/taxonomy"
	"gopkg.in/alecthomas/kingpin.v2"
)

// SubProfile Substitution/mutation profile.
// NOTE(review): not referenced in the visible portion of this file.
type SubProfile struct {
	Pos     int
	Profile []float64
}

// MinBaseQuality min base quality; bases below it are masked to '-' in Map2Ref.
var MinBaseQuality int

// MinMapQuality min map quality
var MinMapQuality int

// MinReadLength minimal read length
var MinReadLength int

// MinAlleleNumber minimal allele (pairs) number, passed to NuclCov.P11.
var MinAlleleNumber int

func main() {
	// Command variables.
	var bamFile string       // bam or sam file
	var outFile string       // output file
	var maxl int             // max length of correlation
	var ncpu int             // number of CPUs
	var minDepth int         // min depth
	var minCoverage float64  // min coveage
	var gffFile string       // gff file
	var corrChanFile string  // CorrResults channel results.

	// Parse command arguments.
	app := kingpin.New("mcorr-bam", "Calculate mutation correlation from bacterial metagenomic sequence in BAM read files.")
	app.Version("v20180102")
	gffFileArg := app.Arg("gff", "Gff3 file").Required().String()
	bamFileArg := app.Arg("in", "input file.").Required().String()
	outPrefixArg := app.Arg("out-prefix", "output prefix.").Required().String()

	maxlFlag := app.Flag("max-corr-length", "Maximum distance of correlations (base pairs).").Default("300").Int()
	ncpuFlag := app.Flag("num-cpu", "Number of CPUs (default: using all available cores).").Default("0").Int()
	minDepthFlag := app.Flag("min-depth", "Minimal depth at each position.").Default("2").Int()
	minCoverageFlag := app.Flag("min-coverage", "Minimal coverage of a gene.").Default("0.5").Float64()
	minBaseQFlag := app.Flag("min-base-qual", "Minimal base quality").Default("30").Int()
	minMapQFlag := app.Flag("min-map-qual", "Minimal mapping quality").Default("30").Int()
	minReadLenFlag := app.Flag("min-read-length", "Minimal read length").Default("60").Int()
	codonPosition := app.Flag("codon-position", "Codon position (1: first codon position; 2: second codon position; 3: third codon position; 4: synoumous at third codon position.").Default("4").Int()
	numBoot := app.Flag("num-boot", "Number of bootstrapping on genes").Default("1000").Int()
	minAlleleNumber := app.Flag("min-allele-number", "Minimal number of alleles").Default("0").Int()
	kingpin.MustParse(app.Parse(os.Args[1:]))

	bamFile = *bamFileArg
	outFile = *outPrefixArg + ".csv"
	// max-corr-length is given in base pairs; internal lags are in codons.
	maxl = *maxlFlag / 3
	if *ncpuFlag <= 0 {
		ncpu = runtime.NumCPU()
	} else {
		ncpu = *ncpuFlag
	}
	runtime.GOMAXPROCS(ncpu)
	minDepth = *minDepthFlag
	minCoverage = *minCoverageFlag
	gffFile = *gffFileArg
	MinBaseQuality = *minBaseQFlag
	MinMapQuality = *minMapQFlag
	MinReadLength = *minReadLenFlag
	MinAlleleNumber = *minAlleleNumber
	corrChanFile = *outPrefixArg + ".json"

	// codon position 4 means "synonymous changes at the third position".
	synoumous := false
	if *codonPosition == 4 {
		synoumous = true
		*codonPosition = 3
	}
	if *codonPosition <= 0 || *codonPosition > 4 {
		log.Fatalln("--codon-position should be in the range of 1 to 4.")
	}

	// Read sequence reads.
	var recordsChan chan GeneSamRecords
	gffRecMap := readGffs(gffFile)
	_, recordsChan = readStrainBamFile(bamFile, gffRecMap)

	// bacterial genetic code (translation table 11)
	codeTable := taxonomy.GeneticCodes()["11"]

	// Worker pool: ncpu goroutines compute P2/P4 per gene and send results
	// on p2Chan; each signals completion on done.
	done := make(chan bool)
	p2Chan := make(chan mcorr.CorrResults)
	for i := 0; i < ncpu; i++ {
		go func() {
			for geneRecords := range recordsChan {
				geneLen := geneRecords.End - geneRecords.Start
				gene := pileupCodons(geneRecords)
				ok := checkCoverage(gene, geneLen, minDepth, minCoverage)
				if ok {
					p2 := calcP2(gene, 10, minDepth, codeTable, *codonPosition-1, synoumous)
					p4 := calcP4(gene, maxl, minDepth, codeTable, *codonPosition-1, synoumous)
					p2 = append(p2, p4...)
					p2Chan <- mcorr.CorrResults{Results: p2, ID: geneRecords.ID}
				}
			}
			done <- true
		}()
	}

	// close p2Chan after all workers finish
	go func() {
		defer close(p2Chan)
		for i := 0; i < ncpu; i++ {
			<-done
		}
	}()

	var resChan chan mcorr.CorrResults
	resChan = mcorr.PipeOutCorrResults(p2Chan, corrChanFile)

	bootstraps := mcorr.Collect(resChan, *numBoot)

	w, err := os.Create(outFile)
	if err != nil {
		panic(err)
	}
	defer w.Close()

	w.WriteString("# l: the distance between two genomic positions\n")
	w.WriteString("# m: the mean value of correlatio profile\n")
	w.WriteString("# v: the variance of correlation profile\n")
	w.WriteString("# n: the total number of alignments used for calculation\n")
	w.WriteString("# t: the type of result: Ks is for d_sample, and P2 is for correlation profile\n")
	w.WriteString("# b: the bootstrap number (all means used all alignments).\n")
	w.WriteString("l,m,v,n,t,b\n")
	// P4 values are rescaled into P2 by the median P2/P4 ratio (q factor)
	// before being written out.
	for _, bs := range bootstraps {
		results := bs.Results()
		qfactor := getQfactor(results)
		for _, res := range results {
			if res.Type == "Ks" || (res.Type == "P4" && res.Lag > 0) {
				if res.Type == "P4" {
					res.Mean *= qfactor
					res.Variance *= qfactor * qfactor
					res.Type = "P2"
				}
				w.WriteString(fmt.Sprintf("%d,%g,%g,%d,%s,%s\n",
					res.Lag, res.Mean, res.Variance, res.N, res.Type, bs.ID))
			}
		}
	}
}

// getQfactor return the q factor between p2 and p4.
func getQfactor(results []mcorr.CorrResult) float64 {
	// Collect P2 and P4 means for lags 1..30, indexed by lag.
	p2values := make([]float64, 31)
	p4values := make([]float64, 31)
	for _, res := range results {
		if res.Lag <= 30 && res.Lag > 0 {
			if res.Type == "P2" {
				p2values[res.Lag] = res.Mean
			} else if res.Type == "P4" {
				p4values[res.Lag] = res.Mean
			}
		}
	}

	// Ratios at lags where both P2 and P4 are positive.
	var factors []float64
	for i := range p2values {
		if p2values[i] > 0 && p4values[i] > 0 {
			factors = append(factors, p2values[i]/p4values[i])
		}
	}

	if len(factors) == 0 {
		return 0
	}

	// Return the median ratio.
	sort.Float64s(factors)
	if len(factors)%2 == 0 {
		return (factors[len(factors)/2] + factors[len(factors)/2-1]) / 2
	}
	return (factors[len(factors)/2])
}

// pileupCodons pileup codons of a list of reads at a gene.
// Codons containing a gap ('-') are discarded.
func pileupCodons(geneRecords GeneSamRecords) (codonGene *CodonGene) {
	codonGene = NewCodonGene()
	for _, read := range geneRecords.Records {
		codonArray := getCodons(read, geneRecords.Start, geneRecords.Strand)
		for _, codon := range codonArray {
			if !codon.ContainsGap() {
				codonGene.AddCodon(codon)
			}
		}
	}

	return
}

// getCodons split a read into a list of Codon.
// offset is the gene start on the reference; codons are emitted only at
// reference positions that complete a codon frame relative to the gene start.
func getCodons(read *sam.Record, offset, strand int) (codonArray []Codon) {
	// get the mapped sequence of the read onto the reference.
	mappedSeq, _ := Map2Ref(read)
	for i := 2; i < len(mappedSeq); {
		// advance until i sits on the third base of a codon in gene frame
		if (read.Pos+i-offset+1)%3 == 0 {
			codonSeq := mappedSeq[i-2 : i+1]
			genePos := (read.Pos+i-offset+1)/3 - 1
			if genePos >= 0 {
				// reverse-strand genes use the reverse complement
				if strand == -1 {
					codonSeq = seq.Reverse(seq.Complement(codonSeq))
				}
				codon := Codon{ReadID: read.Name, Seq: string(codonSeq), GenePos: genePos}
				codonArray = append(codonArray, codon)
			}
			i += 3
		} else {
			i++
		}
	}

	return
}

// isATGC reports whether b is one of the four canonical (uppercase) bases.
func isATGC(b byte) bool {
	if b == 'A' {
		return true
	} else if b == 'T' {
		return true
	} else if b == 'C' {
		return true
	} else if b == 'G' {
		return true
	}

	return false
}

// P2 stores p2 calculation results.
type P2 struct {
	Total float64
	Count int
}

// doubleCount count codon pairs: accumulate the base pair at the given codon
// position for every codon pair into nc.
func doubleCount(nc *mcorr.NuclCov, codonPairArray []CodonPair, position int) {
	for _, cp := range codonPairArray {
		a := cp.A.Seq[position]
		b := cp.B.Seq[position]
		nc.Add(a, b)
	}
}

// calcP2 computes the two-point correlation P2 over all codon-pile pairs of
// the gene, up to maxl codon lags; lags are converted to base pairs (*3) at
// the end. Empty lags (N == 0) are removed.
func calcP2(gene *CodonGene, maxl, minDepth int, codeTable *taxonomy.GeneticCode, codonPosition int, synoumous bool) (p2Res []mcorr.CorrResult) {
	alphabet := []byte{'A', 'T', 'G', 'C'}
	for i := 0; i < gene.Len(); i++ {
		for j := i; j < gene.Len(); j++ {
			codonPairRaw := gene.PairCodonAt(i, j)
			if len(codonPairRaw) < 2 {
				continue
			}
			lag := codonPairRaw[0].B.GenePos - codonPairRaw[0].A.GenePos
			if lag < 0 {
				lag = -lag
			}
			if lag >= maxl {
				break
			}

			// optionally restrict to synonymous codon-pair groups
			var splittedCodonPairs [][]CodonPair
			if synoumous {
				splittedCodonPairs = SynoumousSplitCodonPairs(codonPairRaw, codeTable)
			} else {
				splittedCodonPairs = [][]CodonPair{codonPairRaw}
			}

			for _, synPairs := range splittedCodonPairs {
				if
 len(synPairs) > minDepth {
					nc := mcorr.NewNuclCov(alphabet)
					doubleCount(nc, synPairs, codonPosition)

					// grow the result slice so index `lag` exists
					for len(p2Res) <= lag {
						p2Res = append(p2Res, mcorr.CorrResult{Type: "P2", Lag: len(p2Res)})
					}
					xy, n := nc.P11(MinAlleleNumber)
					p2Res[lag].N += n
					p2Res[lag].Mean += xy
				}
			}
		}
	}

	// drop empty lags, normalize means, and convert codon lags to base pairs
	for i := 0; i < len(p2Res); {
		if p2Res[i].N == 0 {
			p2Res = append(p2Res[:i], p2Res[i+1:]...)
		} else {
			p2Res[i].Mean /= float64(p2Res[i].N)
			p2Res[i].Lag *= 3
			i++
		}
	}

	return
}

// calcP4 computes the four-point correlation P4: products of per-position
// auto-covariance means across codon-pile pairs, up to maxl codon lags.
// Lags are converted to base pairs (*3) at the end; empty lags are removed.
func calcP4(gene *CodonGene, maxl, minDepth int, codeTable *taxonomy.GeneticCode, codonPosition int, synoumous bool) (p4Res []mcorr.CorrResult) {
	var valueArray []float64
	var countArray []int
	var posArray []int
	// per-position auto-covariance totals and counts
	for i := 0; i < gene.Len(); i++ {
		value, count := autoCov(gene, i, minDepth, codeTable, codonPosition, synoumous)
		if count > 0 {
			pos := gene.CodonPiles[i].GenePos()
			valueArray = append(valueArray, value)
			countArray = append(countArray, count)
			posArray = append(posArray, pos)
		}
	}
	// accumulate products of means for each lag
	for i := 0; i < len(valueArray); i++ {
		value1 := valueArray[i]
		count1 := countArray[i]
		xbar := value1 / float64(count1)
		for j := i; j < len(valueArray); j++ {
			value2 := valueArray[j]
			count2 := countArray[j]
			ybar := value2 / float64(count2)
			lag := posArray[j] - posArray[i]
			if lag < 0 {
				lag = -lag
			}
			if lag >= maxl {
				break
			}
			for len(p4Res) <= lag {
				p4Res = append(p4Res, mcorr.CorrResult{Type: "P4", Lag: len(p4Res)})
			}
			p4Res[lag].Mean += xbar * ybar
			p4Res[lag].N++
		}
	}

	// drop empty lags, normalize means, and convert codon lags to base pairs
	for i := 0; i < len(p4Res); {
		if p4Res[i].N == 0 {
			p4Res = append(p4Res[:i], p4Res[i+1:]...)
		} else {
			p4Res[i].Mean /= float64(p4Res[i].N)
			p4Res[i].Lag *= 3
			i++
		}
	}

	return
}

// autoCov computes the auto-covariance (pairing a codon pile with itself)
// at pile index i, using the same counting machinery as calcP2.
func autoCov(gene *CodonGene, i, minDepth int, codeTable *taxonomy.GeneticCode, codonPosition int, synoumous bool) (value float64, count int) {
	alphabet := []byte{'A', 'T', 'G', 'C'}
	codonPairRaw := gene.PairCodonAt(i, i)
	if len(codonPairRaw) < 2 {
		return
	}
	// NOTE(review): lag here is always 0 for a self-pair; computed but unused.
	lag := codonPairRaw[0].B.GenePos - codonPairRaw[0].A.GenePos
	if lag < 0 {
		lag = -lag
	}

	var splittedCodonPairs [][]CodonPair
	if synoumous {
		splittedCodonPairs = SynoumousSplitCodonPairs(codonPairRaw, codeTable)
	} else {
		splittedCodonPairs = [][]CodonPair{codonPairRaw}
	}

	for _, synPairs := range splittedCodonPairs {
		if len(synPairs) > minDepth {
			nc := mcorr.NewNuclCov(alphabet)
			doubleCount(nc, synPairs, codonPosition)

			xy, n := nc.P11(MinAlleleNumber)
			value += xy
			count += n
		}
	}
	return
}

// Map2Ref Obtains a read mapping to the reference genome.
// Returns empty slices if the CIGAR-implied length disagrees with the read.
func Map2Ref(r *sam.Record) (s []byte, q []byte) {
	p := 0                 // position in the read sequence.
	read := r.Seq.Expand() // read sequence.
	qual := r.Qual
	length := 0
	// sanity check: CIGAR-consumed read length must match sequence/qual length
	for _, c := range r.Cigar {
		switch c.Type() {
		case sam.CigarMatch, sam.CigarMismatch, sam.CigarEqual, sam.CigarSoftClipped:
			length += c.Len()
		}
	}
	if length != len(read) || len(read) != len(qual) {
		return
	}

	for _, c := range r.Cigar {
		switch c.Type() {
		case sam.CigarMatch, sam.CigarMismatch, sam.CigarEqual:
			s = append(s, read[p:p+c.Len()]...)
			q = append(q, qual[p:p+c.Len()]...)
419 | p += c.Len() 420 | case sam.CigarInsertion, sam.CigarSoftClipped: 421 | p += c.Len() 422 | case sam.CigarDeletion, sam.CigarSkipped: 423 | for i := 0; i < c.Len(); i++ { 424 | s = append(s, '-') 425 | q = append(q, 0) 426 | } 427 | } 428 | } 429 | 430 | s = bytes.ToUpper(s) 431 | 432 | for i, a := range q { 433 | if int(a) < MinBaseQuality { 434 | s[i] = '-' 435 | } 436 | } 437 | 438 | return 439 | } 440 | 441 | func checkCoverage(gene *CodonGene, geneLen, minDepth int, minCoverage float64) (ok bool) { 442 | num := 0 443 | for _, pile := range gene.CodonPiles { 444 | if pile.Len() > minDepth { 445 | num++ 446 | } 447 | } 448 | coverage := float64(num) / float64(geneLen) * 3.0 // codon pile is in unit of codons (3) 449 | ok = coverage > minCoverage 450 | return 451 | } 452 | 453 | // readLines return all trimmed lines. 454 | func readLines(filename string) []string { 455 | f, err := os.Open(filename) 456 | if err != nil { 457 | log.Panic(err) 458 | } 459 | defer f.Close() 460 | 461 | rd := bufio.NewReader(f) 462 | var lines []string 463 | for { 464 | line, err := rd.ReadString('\n') 465 | if err != nil { 466 | if err != io.EOF { 467 | log.Panic(err) 468 | } 469 | break 470 | } 471 | lines = append(lines, strings.TrimSpace(line)) 472 | } 473 | return lines 474 | } 475 | 476 | // subsample 477 | func subsample(geneRecords GeneSamRecords, maxDepth float64) GeneSamRecords { 478 | length := float64(geneRecords.End - geneRecords.Start) 479 | readNum := len(geneRecords.Records) 480 | readLen := float64(geneRecords.Records[0].Len()) 481 | maxReadNum := int(length * maxDepth / readLen) 482 | if readNum <= maxReadNum { 483 | return geneRecords 484 | } 485 | 486 | oldRecords := geneRecords.Records 487 | geneRecords.Records = []*sam.Record{} 488 | ratio := float64(maxReadNum) / float64(readNum) 489 | for _, read := range oldRecords { 490 | if rand.Float64() < ratio { 491 | geneRecords.Records = append(geneRecords.Records, read) 492 | } 493 | } 494 | 495 | return 
geneRecords 496 | } 497 | --------------------------------------------------------------------------------