├── cmd
├── mcorr-fit
│ ├── requirements.txt
│ ├── mcorr
│ │ ├── __init__.py
│ │ ├── writer.py
│ │ ├── fit_res.py
│ │ ├── corr_res.py
│ │ ├── fit_report.py
│ │ ├── lmfitFunctions.py
│ │ ├── fit_data.py
│ │ ├── cli.py
│ │ ├── singleFit.py
│ │ ├── lmfit_report.py
│ │ ├── FitComparison.py
│ │ ├── plot.py
│ │ └── fit.py
│ ├── setup.py
│ ├── .gitignore
│ └── old
│ │ └── fitCorr.py
├── development
│ ├── mcorr-vcf
│ │ ├── vcf_record.go
│ │ └── main.go
│ ├── utils
│ │ ├── ToDistMatrix.py
│ │ └── ClusterCorrResults.py
│ ├── mcorr-collect
│ │ └── main.go
│ ├── FitCollector
│ │ ├── old
│ │ │ └── getNumPairs.go
│ │ └── FitCollector.go
│ └── mcorr-pair
│ │ └── main.go
├── mcorr-bam
│ ├── mapped_read.go
│ ├── codon.go
│ ├── read_bam.go
│ └── main.go
├── mcorr-xmfa
│ ├── noncoding_calculator.go
│ ├── mate_calculator.go
│ ├── coding_calculator.go
│ └── main.go
└── mcorr-xmfa-2clades
│ ├── noncoding_calculator.go
│ ├── mate_calculator.go
│ ├── coding_calculator.go
│ └── main.go
├── .idea
├── misc.xml
├── vcs.xml
├── .gitignore
├── modules.xml
└── mcorr.iml
├── .gitignore
├── corr_results.go
├── go.mod
├── bootstrap.go
├── mean_var_test.go
├── mean_var.go
├── collect.go
├── collector.go
├── nucl_cov.go
└── README.md
/cmd/mcorr-fit/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | matplotlib
3 | lmfit
4 | tqdm
5 | numdifftools
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # This .gitignore file was automatically created by Microsoft(R) Visual Studio.
3 | ################################################################################
4 |
5 | /cmd/fitting/.vs
6 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/cmd/development/mcorr-vcf/vcf_record.go:
--------------------------------------------------------------------------------
1 | package main
2 |
// VCFRecord stores the information parsed from one VCF line.
type VCFRecord struct {
	Chrom    string // chromosome name
	Pos      int    // position in the chromosome
	Ref, Alt string // reference and alternative alleles
	GTs      []byte // genotype values; presumably one byte per sample — confirm against the parser in main.go
}
10 |
--------------------------------------------------------------------------------
/cmd/mcorr-bam/mapped_read.go:
--------------------------------------------------------------------------------
1 | package main
2 |
// MappedRead contains the section of a read mapped to a reference genome.
type MappedRead struct {
	Pos  int    // mapped position on the reference
	Seq  []byte // mapped portion of the read sequence
	Qual []byte // per-base quality values, aligned with Seq
}

// Len returns the length of the mapped sequence.
func (m MappedRead) Len() int {
	return len(m.Seq)
}
14 |
--------------------------------------------------------------------------------
/corr_results.go:
--------------------------------------------------------------------------------
1 | package mcorr
2 |
// CorrResult stores a single correlation measurement.
type CorrResult struct {
	Lag      int     // distance between the two genomic positions
	Mean     float64 // mean value of the correlation at this lag
	Variance float64 // variance of the correlation at this lag
	N        int     // number of observations contributing to the mean
	Type     string  // result type tag, e.g. "P2" (see the column notes written by CollectWrite)
}

// CorrResults stores a list of CorrResult together with a gene ID.
type CorrResults struct {
	ID      string       // identifier of the gene/alignment the results came from
	Results []CorrResult // per-lag correlation results
}
--------------------------------------------------------------------------------
/cmd/mcorr-fit/mcorr/__init__.py:
--------------------------------------------------------------------------------
1 | from .fit_res import FitRes
2 | from .fit_report import FitReport
3 | from .fit import fit_p2, geom_r1, const_r1
4 | from .corr_res import read_corr
5 | from .fit_data import FitData, FitDatas
6 | ##used to be .io
7 | from .writer import write_fitting_results, write_fitting_reports
8 | from .plot import plot_fit, plot_params
9 | from .fit import fit_one
10 |
--------------------------------------------------------------------------------
/.idea/mcorr.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/kussell-lab/mcorr
2 |
3 | go 1.16
4 |
5 | require (
6 | github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect
7 | github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect
8 | github.com/biogo/hts v1.4.3
9 | github.com/kussell-lab/biogo v0.0.0-20180102204004-ca4e680bc9e3
10 | github.com/kussell-lab/ncbiftp v0.0.0-20180102204232-614f5f8e9538
11 | github.com/mattn/go-colorable v0.1.12 // indirect
12 | golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e // indirect
13 | gonum.org/v1/gonum v0.9.3
14 | gopkg.in/VividCortex/ewma.v1 v1.1.1 // indirect
15 | gopkg.in/alecthomas/kingpin.v2 v2.2.6
16 | gopkg.in/cheggaaa/pb.v2 v2.0.7
17 | gopkg.in/fatih/color.v1 v1.7.0 // indirect
18 | gopkg.in/mattn/go-colorable.v0 v0.1.0 // indirect
19 | gopkg.in/mattn/go-isatty.v0 v0.0.4 // indirect
20 | gopkg.in/mattn/go-runewidth.v0 v0.0.4 // indirect
21 | )
22 |
--------------------------------------------------------------------------------
/cmd/mcorr-fit/setup.py:
--------------------------------------------------------------------------------
"""Package setup for mcorr."""
from setuptools import setup

# Read install requirements from requirements.txt, skipping blank lines
# (the file ends with an empty line that previously became an empty
# requirement string).
# NOTE: the former mode 'rU' was removed in Python 3.11; universal
# newlines are the default for text-mode files, so plain 'r' is equivalent.
requirements = []
with open("requirements.txt") as reader:
    for line in reader:
        requirement = line.strip()
        if requirement:
            requirements.append(requirement)

setup(name='mcorr',
      python_requires='>=3',
      version='20180506',
      description='Inferring recombination rates from correlation profiles',
      url='https://github.com/kussell-lab/mcorr',
      author='Mingzhi Lin, Asher Preska Steinberg',
      author_email='mingzhi9@gmail.com, apsteinberg@nyu.edu',
      license='MIT',
      packages=['mcorr'],
      install_requires=requirements,
      entry_points={
          'console_scripts': ['mcorr-fit=mcorr.cli:main', 'mcorrFitOne=mcorr.singleFit:main',
                              'mcorrFitCompare=mcorr.FitComparison:main'],
      },
      zip_safe=False)
24 |
--------------------------------------------------------------------------------
/cmd/mcorr-fit/mcorr/writer.py:
--------------------------------------------------------------------------------
1 | from . import FitReport
def write_fitting_results(all_results, model_params, out_file):
    """Write fitting results into a .csv file.

    The header row lists ``model_params``; each subsequent row contains the
    corresponding values for one fit result (as returned by get_values).
    """
    delimiter = ","
    with open(out_file, 'w') as out:
        rows = [delimiter.join(model_params)]
        for fit_res in all_results:
            values = fit_res.get_values(model_params)
            rows.append(delimiter.join(str(value) for value in values))
        out.write("\n".join(rows) + "\n")
13 |
def write_fitting_reports(all_results, model_params, out_file):
    """Write per-parameter statistics reports into a .txt file.

    The "ratio" parameter is labelled "gamma/mu"; every other parameter is
    labelled by its own name.
    """
    labels = {"ratio": "gamma/mu"}
    with open(out_file, 'w') as out:
        for param_name in model_params:
            label_name = labels.get(param_name, param_name)
            report = FitReport(all_results, param_name, label_name)
            out.write(report.report() + "\n")
25 |
26 |
--------------------------------------------------------------------------------
/cmd/mcorr-fit/mcorr/fit_res.py:
--------------------------------------------------------------------------------
class FitRes(object):
    """Container for the results of one correlation-profile fit.

    Copies the recognised entries of ``params`` onto attributes and derives
    ``ratio`` (phi_pool / theta_pool) and ``rho`` (phi_pool * fbar).
    Attributes are only set when the corresponding parameter is present, so
    consumers should use ``get_values`` (which substitutes "NA") rather than
    direct attribute access.
    """

    # parameter key -> attribute name for the plain copies.
    _PARAM_ATTRS = {
        'thetaP': 'theta_pool',
        'phiP': 'phi_pool',
        'f': 'fbar',
        'c': 'c',
        'dc': 'd_clonal',
        'dp': 'd_pool',
        'phiS': 'phi_s',
        'thetaS': 'theta_s',
    }

    def __init__(self, group, residual, params, d_sample):
        self.group = group
        self.d_sample = d_sample
        self.residual = residual
        for key, attr in self._PARAM_ATTRS.items():
            if key in params:
                setattr(self, attr, params[key])
        # Derived quantities. Guarded on both operands: previously a params
        # dict containing 'phiP' without 'thetaP' (or 'f' without 'phiP')
        # raised AttributeError; now the derived attribute is simply unset.
        if hasattr(self, 'phi_pool') and hasattr(self, 'theta_pool'):
            self.ratio = self.phi_pool / self.theta_pool
        if hasattr(self, 'phi_pool') and hasattr(self, 'fbar'):
            self.rho = self.phi_pool * self.fbar

    def get_values(self, attributes):
        """Return the values of ``attributes`` in order, "NA" when unset."""
        return [getattr(self, name, "NA") for name in attributes]
37 |
38 |
39 |
--------------------------------------------------------------------------------
/bootstrap.go:
--------------------------------------------------------------------------------
1 | package mcorr
2 |
3 | import "math/rand"
4 | import "math"
5 |
// Bootstrap accumulates correlation results for one bootstrapping instance.
type Bootstrap struct {
	ID          string     // instance identifier (e.g. "all" or "boot_N")
	sampleRatio float64    // Poisson sampling rate, clamped into [0, 1]
	collector   *Collector // aggregates the (possibly resampled) results
	isRandom    bool       // when false, every result is added exactly once
}

// NewBootstrap creates a new Bootstrap, given an id and a sample ratio.
// Sample ratio must be a float64 from 0 to 1; out-of-range values are clamped.
// By default, a bootstrap does random (Poisson) sampling.
func NewBootstrap(id string, sampleRatio float64) *Bootstrap {
	b := Bootstrap{}
	b.ID = id
	// clamp so the Poisson rate used in Add stays meaningful.
	if sampleRatio < 0 {
		sampleRatio = 0
	} else if sampleRatio > 1 {
		sampleRatio = 1
	}
	b.sampleRatio = sampleRatio
	b.collector = NewCollector()
	b.isRandom = true
	return &b
}

// SetRandom sets whether Add resamples (true) or adds each result once (false).
func (b *Bootstrap) SetRandom(r bool) {
	b.isRandom = r
}

// Add adds one result set into the Bootstrap.
// In random mode the result is added k times, where k is Poisson-distributed
// with rate sampleRatio — the Poisson-bootstrap approximation of sampling
// with replacement.
func (b *Bootstrap) Add(results CorrResults) {
	if b.isRandom {
		k := poisson(b.sampleRatio)
		for i := 0; i < k; i++ {
			b.collector.Add(results)
		}
	} else {
		b.collector.Add(results)
	}

}

// Results returns the final aggregated results from the underlying collector.
func (b *Bootstrap) Results() (results []CorrResult) {
	return b.collector.Results()
}
53 |
// poisson draws a Poisson-distributed integer with rate lambda using
// Knuth's multiplication method: multiply uniform variates until the
// running product drops below e^-lambda. Runtime is O(lambda) draws,
// which is fine for the lambda <= 1 used by Bootstrap.Add.
func poisson(lambda float64) int {
	// Guard: the while-form of Knuth's loop returned -1 for lambda <= 0
	// (1.0 > e^0 is false). Callers only use the result as a loop bound,
	// so -1 behaved like 0; return the correct value explicitly.
	if lambda <= 0 {
		return 0
	}
	// math.Exp is clearer and more accurate than math.Pow(math.E, -lambda).
	L := math.Exp(-lambda)
	k := 0
	p := 1.0
	for p > L {
		k++
		p *= rand.Float64()
	}
	return k - 1
}
64 |
--------------------------------------------------------------------------------
/cmd/mcorr-fit/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | .vscode
--------------------------------------------------------------------------------
/cmd/development/utils/ToDistMatrix.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from argparse import ArgumentParser
3 |
def main():
    """Convert pairwise fit results into a symmetric distance matrix CSV.

    Reads a fit-results CSV whose first column is a group name of the form
    "<isolate1>_vs_<isolate2>" and writes a matrix of the column selected
    by --by (default "theta"), with zeros on the diagonal. Raises KeyError
    if a pair of isolates never appears in the input.
    """
    parser = ArgumentParser(description="Convert to distant matrix")
    parser.add_argument("fit_res_file", type=str)
    parser.add_argument("output_file", type=str)
    parser.add_argument('--by', nargs='?', const="theta", type=str, default="theta")
    opts = parser.parse_args()
    datafile = opts.fit_res_file
    outfile = opts.output_file
    byvalue = opts.by

    dmap = {}
    with open(datafile) as reader:
        header = reader.readline().rstrip().split(",")
        # Hoisted out of the loop: the column index is loop-invariant
        # (the original recomputed header.index for every data line).
        column = header.index(byvalue)
        for line in reader:
            terms = line.rstrip().split(",")
            group = terms[0]
            if "_vs_" in group:
                isolates = group.split("_vs_")
                # record the value symmetrically for both isolates.
                ddmap = dmap.get(isolates[0], {})
                ddmap[isolates[1]] = terms[column]
                dmap[isolates[0]] = ddmap

                ddmap = dmap.get(isolates[1], {})
                ddmap[isolates[0]] = terms[column]
                dmap[isolates[1]] = ddmap
    isolates = sorted(dmap.keys())
    with open(outfile, 'w') as writer:
        writer.write("," + ",".join(isolates) + "\n")
        for isolate1 in isolates:
            writer.write(isolate1)
            for isolate2 in isolates:
                if isolate1 == isolate2:
                    value = 0
                else:
                    value = float(dmap[isolate1][isolate2])
                writer.write(",%g" % value)
            writer.write("\n")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/cmd/mcorr-fit/mcorr/corr_res.py:
--------------------------------------------------------------------------------
class CorrRes(object):
    """A single correlation result parsed from one CSV row.

    Expects the columns: lag, mean value, variance, count, type, group.
    """

    def __init__(self, terms):
        self.lag = float(terms[0])
        self.value = float(terms[1])
        self.variance = float(terms[2])
        self.num = float(terms[3])
        self.corrtype = terms[4]
        self.group = terms[5]
16 |
def read_corr(csv_file):
    """Read correlation results from a CSV file.

    Skips '#'-prefixed comment lines and the header row (first column
    equal to 'l'); returns a list of CorrRes.
    """
    with open(csv_file, 'r') as infile:
        rows = [line.rstrip().split(",")
                for line in infile if not line.startswith('#')]
    return [CorrRes(terms) for terms in rows if terms[0] != 'l']
27 |
class GeneCorrRes(object):
    """A single gene correlation result parsed from one CSV row.

    Unlike CorrRes, the count column precedes the variance column:
    lag, mean value, count, variance, type, group.
    """

    def __init__(self, terms):
        self.lag = float(terms[0])
        self.value = float(terms[1])
        self.variance = float(terms[3])
        self.num = float(terms[2])
        self.corrtype = terms[4]
        self.group = terms[5]


def read_genecorr(csv_file):
    """Read gene correlation results from a CSV file.

    Skips '#'-prefixed comment lines and the header row (first column
    equal to 'lag'); returns a list of GeneCorrRes.

    BUGFIX: this previously constructed CorrRes, which silently swapped
    the variance and count columns for this file layout; GeneCorrRes is
    the intended type.
    """
    results = []
    with open(csv_file, 'r') as infile:
        for line in infile:
            if line.startswith('#'):
                continue
            terms = line.rstrip().split(",")
            if terms[0] == 'lag':
                continue
            results.append(GeneCorrRes(terms))
    return results
54 |
55 |
56 |
--------------------------------------------------------------------------------
/mean_var_test.go:
--------------------------------------------------------------------------------
1 | package mcorr
2 |
3 | import (
4 | "testing"
5 | "math"
6 | )
7 |
// TestMeanAndVariance checks MeanVar against directly computed statistics.
func TestMeanAndVariance(t *testing.T) {
	mv := NewMeanVar();
	if mv.Mean() != 0 {
		t.Error("Empty MeanVar should return zero for mean\n")
	}

	// variance is undefined for fewer than two values.
	if !math.IsNaN(mv.Variance()) {
		t.Error("Empty MeanVar should return NaN for variance\n")
	}

	mv.Add(1.0)
	if mv.Mean() != 1.0 {
		t.Errorf("Expected 1.0, but got %g\n", mv.Mean())
	}

	if !math.IsNaN(mv.Variance()) {
		t.Errorf("Expected NaN, but got %g\n", mv.Variance())
	}

	// Compute the expected mean and (population, n-divisor) variance over
	// the five values added in total: the initial 1.0 plus resValues.
	resValues := []float64{1.0, 2.0, 4.0, 7.0}
	sum := 1.0 // accounts for the 1.0 already added above
	for _, val := range resValues {
		sum += val
	}
	expectedMean := sum / float64(len(resValues) + 1)
	// squared deviation of the initial 1.0 ...
	expectedVariance := (1.0 - expectedMean) * (1.0 - expectedMean)
	// ... plus those of the remaining values.
	for _, val := range resValues {
		expectedVariance += (val - expectedMean) * (val - expectedMean)
	}
	expectedVariance /= float64(len(resValues) + 1)
	for _, val := range resValues {
		mv.Add(val)
	}
	if mv.Mean() != expectedMean {
		t.Errorf("Expected %g, but got %g\n", expectedMean, mv.Mean())
	}
	if mv.Variance() != expectedVariance {
		t.Errorf("Expected %g, but got %g\n", expectedVariance, mv.Variance())
	}

}

// TestN checks that N reports the number of values added.
func TestN(t *testing.T) {
	mv := NewMeanVar()
	if mv.N() != 0 {
		t.Error("Empty MeanVariance should return zero for N()\n")
	}
	values := []float64{1, 2, 3, 4}
	for _, v := range values {
		mv.Add(v)
	}
	if mv.N() != len(values) {
		t.Errorf("Expected %d, but got %d\n", len(values), mv.N())
	}
}
63 |
--------------------------------------------------------------------------------
/cmd/mcorr-xmfa/noncoding_calculator.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "github.com/kussell-lab/biogo/seq"
5 | "github.com/kussell-lab/mcorr"
6 | )
7 |
// NoncodingCalculator calculates correlation profiles for noncoding sequences.
type NoncodingCalculator struct {
	MaxLen int // maximum lag (distance between positions) to compute
}

// NewNoncodingCalculator returns a NoncodingCalculator with the given maximum lag.
func NewNoncodingCalculator(maxLen int) *NoncodingCalculator {
	return &NoncodingCalculator{
		MaxLen: maxLen,
	}
}

// CalcP2 calculates the P2 profile for the alignment.
// The variadic others argument is accepted for interface compatibility
// but ignored here.
func (cc *NoncodingCalculator) CalcP2(alignment []seq.Sequence, others ...[]seq.Sequence) (results []mcorr.CorrResult) {
	return calcP2Noncoding(alignment, cc.MaxLen)
}

// calcP2Noncoding computes, for each lag l in [0, maxLen), the mean of the
// P11 statistic (see mcorr.NuclCov) across all position pairs (i, i+l).
// Positions are bounded by the first sequence's length, so all sequences
// are assumed to be aligned to the same length — confirm with callers.
func calcP2Noncoding(aln []seq.Sequence, maxLen int) (results []mcorr.CorrResult) {
	for l := 0; l < maxLen; l++ {
		totalxy := 0.0
		totaln := 0
		for i := 0; i+l < len(aln[0].Seq); i++ {
			j := i + l
			// collect the (position i, position j) base pair from every sequence.
			basePairs := [][]byte{}
			for _, s := range aln {
				basePairs = append(basePairs, []byte{s.Seq[i], s.Seq[j]})
			}

			nc := doubleCounts(basePairs)
			xy, n := nc.P11(0)
			totalxy += xy
			totaln += n
		}
		// only emit a result when at least one valid pair was counted,
		// which also avoids a division by zero below.
		if totaln > 0 {
			res := mcorr.CorrResult{
				Lag:  l,
				Mean: totalxy / float64(totaln),
				N:    totaln,
				Type: "P2"}
			results = append(results, res)
		}
	}

	return
}

// doubleCounts tallies nucleotide covariance over the base pairs, skipping
// any pair containing a non-ATGC byte (gaps, ambiguity codes, lowercase).
func doubleCounts(basePairs [][]byte) *mcorr.NuclCov {
	alphabet := []byte{'A', 'T', 'G', 'C'}
	c := mcorr.NewNuclCov(alphabet)
	for _, basePair := range basePairs {
		a := basePair[0]
		b := basePair[1]
		if isATGC(a) && isATGC(b) {
			c.Add(a, b)
		}
	}
	return c
}
66 |
67 | // ATGC is DNA alphabet.
68 | const ATGC = "ATGC"
69 | const atgc = "atgc"
70 |
71 | func isATGC(b byte) bool {
72 | yes := false
73 | for i := 0; i < len(ATGC); i++ {
74 | if b == ATGC[i] {
75 | yes = true
76 | break
77 | }
78 | }
79 | return yes
80 | }
81 |
--------------------------------------------------------------------------------
/cmd/mcorr-xmfa-2clades/noncoding_calculator.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "github.com/kussell-lab/biogo/seq"
5 | "github.com/kussell-lab/mcorr"
6 | )
7 |
// NoncodingCalculator calculates correlation profiles for noncoding sequences.
type NoncodingCalculator struct {
	MaxLen int // maximum lag (distance between positions) to compute
}

// NewNoncodingCalculator returns a NoncodingCalculator with the given maximum lag.
func NewNoncodingCalculator(maxLen int) *NoncodingCalculator {
	return &NoncodingCalculator{
		MaxLen: maxLen,
	}
}

// CalcP2 calculates the P2 profile for the alignment.
// The variadic others argument is accepted for interface compatibility
// but ignored here.
func (cc *NoncodingCalculator) CalcP2(alignment []seq.Sequence, others ...[]seq.Sequence) (results []mcorr.CorrResult) {
	return calcP2Noncoding(alignment, cc.MaxLen)
}

// calcP2Noncoding computes, for each lag l in [0, maxLen), the mean of the
// P11 statistic (see mcorr.NuclCov) across all position pairs (i, i+l).
// Positions are bounded by the first sequence's length, so all sequences
// are assumed to be aligned to the same length — confirm with callers.
func calcP2Noncoding(aln []seq.Sequence, maxLen int) (results []mcorr.CorrResult) {
	for l := 0; l < maxLen; l++ {
		totalxy := 0.0
		totaln := 0
		for i := 0; i+l < len(aln[0].Seq); i++ {
			j := i + l
			// collect the (position i, position j) base pair from every sequence.
			basePairs := [][]byte{}
			for _, s := range aln {
				basePairs = append(basePairs, []byte{s.Seq[i], s.Seq[j]})
			}

			nc := doubleCounts(basePairs)
			xy, n := nc.P11(0)
			totalxy += xy
			totaln += n
		}
		// only emit a result when at least one valid pair was counted,
		// which also avoids a division by zero below.
		if totaln > 0 {
			res := mcorr.CorrResult{
				Lag:  l,
				Mean: totalxy / float64(totaln),
				N:    totaln,
				Type: "P2"}
			results = append(results, res)
		}
	}

	return
}

// doubleCounts tallies nucleotide covariance over the base pairs, skipping
// any pair containing a non-ATGC byte (gaps, ambiguity codes, lowercase).
func doubleCounts(basePairs [][]byte) *mcorr.NuclCov {
	alphabet := []byte{'A', 'T', 'G', 'C'}
	c := mcorr.NewNuclCov(alphabet)
	for _, basePair := range basePairs {
		a := basePair[0]
		b := basePair[1]
		if isATGC(a) && isATGC(b) {
			c.Add(a, b)
		}
	}
	return c
}
66 |
67 | // ATGC is DNA alphabet.
68 | const ATGC = "ATGC"
69 | const atgc = "atgc"
70 |
71 | func isATGC(b byte) bool {
72 | yes := false
73 | for i := 0; i < len(ATGC); i++ {
74 | if b == ATGC[i] {
75 | yes = true
76 | break
77 | }
78 | }
79 | return yes
80 | }
81 |
--------------------------------------------------------------------------------
/mean_var.go:
--------------------------------------------------------------------------------
1 | package mcorr
2 |
3 | import (
4 | "math"
5 | )
6 |
// MeanVar computes mean and variance incrementally (Welford's online
// algorithm): each Add updates the running mean and the sum of squared
// deviations without storing the individual values.
type MeanVar struct {
	n             int     // number of values seen
	m1            float64 // running mean (first moment)
	dev           float64 // last deviation: v - mean-before-update
	nDev          float64 // last deviation divided by n
	m2            float64 // accumulated sum of squared deviations
	biasCorrected bool    // divide by n-1 instead of n in Variance
}

// NewMeanVar returns a new, empty MeanVar.
func NewMeanVar() *MeanVar {
	return &MeanVar{}
}

// Add adds a value to the running statistics.
func (m *MeanVar) Add(v float64) {
	m.n++
	m.dev = v - m.m1
	m.nDev = m.dev / float64(m.n)
	m.m1 += m.nDev
	// Welford update of the sum of squared deviations.
	m.m2 += float64(m.n-1) * m.dev * m.nDev
}

// Mean returns the current mean (0 for an empty MeanVar).
func (m *MeanVar) Mean() float64 {
	return m.m1
}

// Variance returns the variance of the values added so far.
// It returns NaN when fewer than two values have been added.
func (m *MeanVar) Variance() float64 {
	if m.n < 2 {
		return math.NaN()
	}

	if m.biasCorrected {
		// sample variance: divide by n-1.
		return m.m2 / float64(m.n-1)
	}

	// population variance: divide by n.
	return m.m2 / float64(m.n)
}

// N returns the number of values added.
func (m *MeanVar) N() int {
	return m.n
}

// IsBiasCorrected returns true if Variance divides by n-1 rather than n.
func (m *MeanVar) IsBiasCorrected() bool {
	return m.biasCorrected
}

// SetBiasCorrected sets whether the variance is bias corrected.
func (m *MeanVar) SetBiasCorrected(biasCorrected bool) {
	m.biasCorrected = biasCorrected
}

// Append merges another MeanVar into this one using the parallel-variance
// combination formula. NOTE(review): the argument's biasCorrected flag is
// ignored, and dev/nDev are only copied when this MeanVar is empty.
func (m *MeanVar) Append(m2 *MeanVar) {
	if m.n == 0 {
		// this side is empty: take the other side's state wholesale.
		m.n = m2.n
		m.m1 = m2.m1
		m.dev = m2.dev
		m.nDev = m2.nDev
		m.m2 = m2.m2
	} else {
		if m2.n > 0 {
			// combine means weighted by counts, then shift both m2 sums
			// to the new mean.
			total1 := m.m1 * float64(m.n)
			total2 := m2.m1 * float64(m2.n)
			newMean := (total1 + total2) / float64(m.n+m2.n)
			delta1 := m.Mean() - newMean
			delta2 := m2.Mean() - newMean
			sm := (m.m2 + m2.m2) + float64(m.n)*delta1*delta1 + float64(m2.n)*delta2*delta2
			m.m1 = newMean
			m.m2 = sm
			m.n = m.n + m2.n
		}
	}
}
86 |
--------------------------------------------------------------------------------
/cmd/development/utils/ClusterCorrResults.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 | import numpy as np
4 | from sklearn.cluster import KMeans
5 | from argparse import ArgumentParser
6 |
# ---- command-line options ----
parser = ArgumentParser(description="Cluster correlation results.")
parser.add_argument("corr_file", type=str)
parser.add_argument("output_prefix", type=str)
parser.add_argument('--corr_type', nargs='?', const="P4", type=str, default="P4")
parser.add_argument('--xmin', nargs='?', const=3, type=int, default=3)
parser.add_argument('--xmax', nargs='?', const=150, type=int, default=150)
opts = parser.parse_args()

# ---- build the feature matrix: one row of per-lag correlations per record ----
labels = []
X = []
corr_results = {}
# lags are sampled every 3 positions, so each row has xmax // 3 slots
# (50 with the default xmax of 150, matching the old hard-coded pad).
num_features = opts.xmax // 3
with open(opts.corr_file) as reader:
    for line in reader:
        data = json.loads(line)
        labels.append(data['ID'])
        corr = []
        for res in data['Results']:
            if res['Type'] == 'P2' and int(res['Lag']) == 0 and int(res['N']) > 0:
                # slot 0 holds the lag-0 P2 value.
                if len(corr) == 0:
                    corr.append(float(res['Mean']))
                else:
                    corr[0] = float(res['Mean'])
            elif res['Type'] == opts.corr_type and 0 < int(res['Lag']) < opts.xmax and int(res['N']) > 0:
                # BUGFIX: "/" yields a float in Python 3 and floats cannot
                # index a list; floor-divide instead. Also honour --xmax
                # rather than the previously hard-coded 150.
                idx = int(res['Lag']) // 3
                while len(corr) <= idx:
                    corr.append(0.0)
                corr[idx] = float(res['Mean'])
        # pad so every row has the same length for KMeans.
        while len(corr) < num_features:
            corr.append(0.0)
        X.append(corr)
        corr_results[data['ID']] = data
X = np.array(X)

# ---- cluster and write one .json / .txt pair per cluster ----
n_clusters = 2
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)

for i in range(n_clusters):
    outfile = "%s_%d.json" % (opts.output_prefix, i)
    with open(outfile, 'w') as w:
        for idx, c in enumerate(kmeans.labels_):
            if c == i:
                w.write(json.dumps(corr_results[labels[idx]]) + "\n")
    outfile = "%s_%d.txt" % (opts.output_prefix, i)
    with open(outfile, 'w') as w:
        for idx, c in enumerate(kmeans.labels_):
            if c == i:
                w.write(labels[idx] + "\n")
--------------------------------------------------------------------------------
/collect.go:
--------------------------------------------------------------------------------
1 | package mcorr
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "os"
7 | )
8 |
// PipeOutCorrResults tees the channel of CorrResults out to a file:
// each received CorrResults is JSON-encoded to outFile (one object per
// line) and forwarded on the returned channel, which is closed when the
// input channel is exhausted. File-creation or encoding errors panic.
func PipeOutCorrResults(corrResChan chan CorrResults, outFile string) chan CorrResults {
	c := make(chan CorrResults)
	go func() {
		defer close(c)
		f, err := os.Create(outFile)
		if err != nil {
			panic(err)
		}
		defer f.Close()

		encoder := json.NewEncoder(f)
		for res := range corrResChan {
			if err := encoder.Encode(res); err != nil {
				panic(err)
			}
			c <- res
		}
	}()
	return c
}
30 |
// Collect feeds correlation results into bootstrappers and returns them.
// The first bootstrapper ("all") is non-random and therefore receives
// every result exactly once; the remaining numBoot instances
// Poisson-resample with ratio 1.0.
func Collect(corrResChan chan CorrResults, numBoot int) []*Bootstrap {
	// prepare bootstrappers.
	bootstraps := []*Bootstrap{}
	notBootstrap := NewBootstrap("all", 1.0)
	notBootstrap.SetRandom(false)
	bootstraps = append(bootstraps, notBootstrap)
	for i := 0; i < numBoot; i++ {
		id := fmt.Sprintf("boot_%d", i)
		sampleRatio := 1.0
		bootstraps = append(bootstraps, NewBootstrap(id, sampleRatio))
	}

	// fan every incoming result out to all bootstrappers.
	for corrResults := range corrResChan {
		for _, bs := range bootstraps {
			bs.Add(corrResults)
		}
	}
	return bootstraps
}
51 |
52 | // CollectWrite collects and writes the correlation results.
53 | func CollectWrite(corrResChan chan CorrResults, outFile string, numBoot int) {
54 | bootstraps := Collect(corrResChan, numBoot)
55 |
56 | w, err := os.Create(outFile)
57 | if err != nil {
58 | panic(err)
59 | }
60 | defer w.Close()
61 |
62 | w.WriteString("# l: the distance between two genomic positions\n")
63 | w.WriteString("# m: the mean value of correlatio profile\n")
64 | w.WriteString("# v: the variance of correlation profile\n")
65 | w.WriteString("# n: the total number of alignments used for calculation\n")
66 | w.WriteString("# t: the type of result: Ks is for d_sample, and P2 is for correlation profile\n")
67 | w.WriteString("# b: the bootstrap number (all means used all alignments).\n")
68 |
69 | w.WriteString("l,m,v,n,t,b\n")
70 | for _, bs := range bootstraps {
71 | //this is where division by P(l = 0) happens!!!!
72 | results := bs.Results()
73 | for _, res := range results {
74 | w.WriteString(fmt.Sprintf("%d,%g,%g,%d,%s,%s\n", res.Lag, res.Mean, res.Variance, res.N, res.Type, bs.ID))
75 | }
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/cmd/mcorr-fit/mcorr/fit_report.py:
--------------------------------------------------------------------------------
1 | import numpy
class FitReport(object):
    """Statistics report for one parameter of a set of fitting results.

    Splits the results into the raw value (the fit over all data, group
    "all") and the bootstrap replicates (every other group), and reports
    bootstrap mean / std / median / 5%-95% percentile bounds.
    """

    def __init__(self, fit_results, param_name, label_name=None):
        """Collect the values of ``param_name`` from ``fit_results``.

        Results lacking the attribute are skipped; ``label_name`` defaults
        to ``param_name``.
        """
        self.param_name = param_name
        self.label_name = param_name if label_name is None else label_name

        self.boot_data = []
        self.raw_value = None
        for res in fit_results:
            if hasattr(res, param_name):
                value = getattr(res, param_name)
                if res.group == "all":
                    self.raw_value = value
                else:
                    self.boot_data.append(value)

    def get_param_name(self):
        return self.param_name

    def get_label_name(self):
        return self.label_name

    def get_raw_value(self):
        """Return the all-data fit value, or None if no "all" group was seen."""
        return self.raw_value

    def get_boot_size(self):
        """Return the size of the bootstrapping data."""
        return len(self.boot_data)

    def get_boot_mean(self):
        """Return the mean of the bootstrapping data."""
        return numpy.mean(self.boot_data)

    def get_boot_std(self):
        """Return the standard deviation of the bootstrapping data."""
        return numpy.std(self.boot_data)

    def get_boot_median(self):
        """Return the median of the bootstrapping data."""
        return numpy.median(self.boot_data)

    def get_boot_lower_bound(self):
        """Return the 5th percentile of the bootstrapping data."""
        return numpy.percentile(self.boot_data, 5)

    def get_boot_upper_bound(self):
        """Return the 95th percentile of the bootstrapping data."""
        return numpy.percentile(self.boot_data, 95)

    def report(self):
        """Render the report as text.

        Bootstrap statistics are only included with at least 10 replicates
        (presumably so the percentile estimates are meaningful).
        """
        value = ""
        value += "[%s]\n" % self.get_label_name()
        # BUGFIX: test for None explicitly — a raw value of exactly 0 is
        # valid and was previously omitted by the truthiness check.
        if self.get_raw_value() is not None:
            value += "value = %g\n" % self.get_raw_value()
        if len(self.boot_data) >= 10:
            value += "bootstrapping mean = %g\n" % self.get_boot_mean()
            value += "bootstrapping standard deviation = %g\n" % self.get_boot_std()
            value += "bootstrapping size = %d\n" % self.get_boot_size()
            value += "bootstrapping median = %g\n" % self.get_boot_median()
            value += "bootstrapping lower bound (5%%) = %g\n" % \
                self.get_boot_lower_bound()
            value += "bootstrapping upper bound (95%%) = %g\n" % \
                self.get_boot_upper_bound()
        return value
--------------------------------------------------------------------------------
/collector.go:
--------------------------------------------------------------------------------
1 | package mcorr
2 |
// Collector collects correlation results, keeping one running MeanVar per
// (correlation type, lag) pair. minN is the minimum pair count a result must
// exceed to be recorded; no setter is visible in this file, so it is the zero
// value unless set elsewhere.
type Collector struct {
	m    map[string][]*MeanVar // correlation type -> per-lag statistics
	minN int                   // results with N <= minN are ignored by Add
}
8 |
9 | // NewCollector return a new Collector.
10 | func NewCollector() *Collector {
11 | c := Collector{}
12 | c.m = make(map[string][]*MeanVar)
13 | return &c
14 | }
15 |
// Add folds one gene's CorrResults into the collector. The per-type MeanVar
// slice grows on demand so that slice index == lag; results whose sample
// count N does not exceed minN are dropped.
func (c *Collector) Add(results CorrResults) {
	for _, res := range results.Results {
		for len(c.m[res.Type]) <= res.Lag {
			c.m[res.Type] = append(c.m[res.Type], NewMeanVar())
		}
		if res.N > c.minN {
			c.m[res.Type][res.Lag].Add(res.Mean)
		}
	}
}
27 |
28 | // Means return means of a particular type.
29 | func (c *Collector) Means(corrType string) (values []float64) {
30 | for _, mv := range c.MeanVars(corrType) {
31 | values = append(values, mv.Mean())
32 | }
33 | return
34 | }
35 |
36 | // Vars return variances of a particular type.
37 | func (c *Collector) Vars(corrType string) (values []float64) {
38 | for _, mv := range c.MeanVars(corrType) {
39 | values = append(values, mv.Variance())
40 | }
41 | return
42 | }
43 |
// Ns returns the sample count at each lag for the given correlation type.
// (The previous comment incorrectly said "variances".)
func (c *Collector) Ns(corrType string) (nums []int) {
	for _, mv := range c.MeanVars(corrType) {
		nums = append(nums, mv.N())
	}
	return
}
51 |
// MeanVars returns the per-lag MeanVar slice for a correlation type
// (nil if the type has never been collected).
func (c *Collector) MeanVars(corrType string) (values []*MeanVar) {
	return c.m[corrType]
}
56 |
57 | // CorrTypes return all corr types.
58 | func (c *Collector) CorrTypes() (corrTypes []string) {
59 | for key := range c.m {
60 | corrTypes = append(corrTypes, key)
61 | }
62 | return
63 | }
64 |
// Results flattens the collected statistics back into a list of CorrResult.
// The first P2 entry (lag 0) is emitted as "Ks" and its mean ks is used to
// normalize every other entry (mean/ks, variance/ks^2). Returns nil when no
// P2 data was collected, and only the Ks row when ks == 0 (normalization
// would divide by zero).
func (c *Collector) Results() (results []CorrResult) {
	// Failed fitting.
	if len(c.Means("P2")) == 0 {
		return nil
	}

	// calculate ks first
	ks := c.Means("P2")[0]
	results = append(results,
		CorrResult{
			Lag:      0,
			N:        c.Ns("P2")[0],
			Type:     "Ks",
			Mean:     c.Means("P2")[0],
			Variance: c.Vars("P2")[0],
		})
	if ks == 0 {
		return
	}

	for _, ctype := range c.CorrTypes() {
		means := c.Means(ctype)
		vars := c.Vars(ctype)
		ns := c.Ns(ctype)
		for i := 0; i < len(means); i++ {
			// skip the Ks entry itself (P2 lag 0) and lags with no observations
			if !(ctype == "P2" && i == 0) && ns[i] > 0 {
				res := CorrResult{}
				res.Lag = i
				res.N = ns[i]
				res.Type = ctype
				res.Mean = means[i] / ks
				res.Variance = vars[i] / (ks * ks)
				results = append(results, res)
			}
		}
	}

	return
}
105 |
--------------------------------------------------------------------------------
/cmd/development/mcorr-collect/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "io"
7 | "os"
8 | "sort"
9 |
10 | "github.com/kussell-lab/mcorr"
11 | "gopkg.in/alecthomas/kingpin.v2"
12 | )
13 |
// main reads a JSON stream of CorrResults, bootstraps over genes, and writes
// a CSV of correlation results. For corr-type "P2" the standard
// mcorr.CollectWrite path is used; otherwise each bootstrap's P4 values are
// rescaled into P2 estimates via the median P2/P4 ratio (the "q factor").
func main() {
	app := kingpin.New("mcorr-collect", "Collect results.")
	app.Version("v0.1")

	alnFile := app.Arg("in", "json file").Required().String()
	outFile := app.Arg("out", "Output file in CSV format.").Required().String()
	numBoot := app.Flag("num-boot", "Number of bootstrapping on genes").Default("1000").Int()
	corrType := app.Flag("corr-type", "correlation type").Default("P4").String()
	kingpin.MustParse(app.Parse(os.Args[1:]))

	resChan := readCorrRes(*alnFile)
	if *corrType == "P2" {
		mcorr.CollectWrite(resChan, *outFile, *numBoot)
	} else {
		bootstraps := mcorr.Collect(resChan, *numBoot)

		w, err := os.Create(*outFile)
		if err != nil {
			panic(err)
		}
		defer w.Close()

		// CSV header: lag, mean, variance, count, type, bootstrap ID
		w.WriteString("l,m,v,n,t,b\n")
		for _, bs := range bootstraps {
			results := bs.Results()
			qfactor := getQfactor(results)
			for _, res := range results {
				if res.Type == "Ks" || (res.Type == "P4" && res.Lag > 0) {
					if res.Type == "P4" {
						// rescale the P4 value into a P2 estimate
						res.Mean *= qfactor
						res.Variance *= qfactor * qfactor
						res.Type = "P2"
					}
					w.WriteString(fmt.Sprintf("%d,%g,%g,%d,%s,%s\n",
						res.Lag, res.Mean, res.Variance, res.N, res.Type, bs.ID))
				}
			}
		}
	}

}
55 |
// readCorrRes streams CorrResults decoded from a JSON file on a channel.
// The channel is closed on EOF; any other open or decode error panics.
func readCorrRes(filename string) chan mcorr.CorrResults {
	c := make(chan mcorr.CorrResults)
	go func() {
		defer close(c)
		f, err := os.Open(filename)
		if err != nil {
			panic(err)
		}
		defer f.Close()

		decoder := json.NewDecoder(f)
		for {
			var corrResults mcorr.CorrResults
			if err := decoder.Decode(&corrResults); err != nil {
				if err != io.EOF {
					panic(err)
				}
				break
			}
			c <- corrResults
		}
	}()
	return c
}
81 |
82 | // getQfactor return the q factor between p2 and p4.
83 | func getQfactor(results []mcorr.CorrResult) float64 {
84 | p2values := make([]float64, 31)
85 | p4values := make([]float64, 31)
86 | for _, res := range results {
87 | if res.Lag <= 30 && res.Lag > 0 {
88 | if res.Type == "P2" {
89 | p2values[res.Lag] = res.Mean
90 | } else if res.Type == "P4" {
91 | p4values[res.Lag] = res.Mean
92 | }
93 | }
94 | }
95 |
96 | var factors []float64
97 | for i := range p2values {
98 | if p2values[i] > 0 && p4values[i] > 0 {
99 | factors = append(factors, p2values[i]/p4values[i])
100 | }
101 | }
102 |
103 | if len(factors) == 0 {
104 | return 0
105 | }
106 |
107 | sort.Float64s(factors)
108 | if len(factors)%2 == 0 {
109 | return (factors[len(factors)/2] + factors[len(factors)/2-1]) / 2
110 | }
111 | return (factors[len(factors)/2])
112 | }
113 |
--------------------------------------------------------------------------------
/cmd/mcorr-xmfa/mate_calculator.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "github.com/kussell-lab/mcorr"
5 | "github.com/kussell-lab/ncbiftp/taxonomy"
6 | )
7 |
// MateCalculator calculates correlations between two clusters of sequences.
type MateCalculator struct {
	CodingTable   *taxonomy.GeneticCode // genetic code used for translation
	MaxCodonLen   int                   // maximum codon lag to evaluate
	CodonOffset   int                   // offset applied when extracting codons
	CodonPosition int                   // codon position compared by doubleCodons
	Synonymous    bool                  // restrict to synonymous codon pairs
}
16 |
17 | // NewMateCalculator returns a MateCalculator
18 | func NewMateCalculator(codingTable *taxonomy.GeneticCode, maxCodonLen, codonOffset, codonPos int, synonymous bool) *MateCalculator {
19 | return &MateCalculator{
20 | CodingTable: codingTable,
21 | MaxCodonLen: maxCodonLen,
22 | CodonOffset: codonOffset,
23 | CodonPosition: codonPos,
24 | Synonymous: synonymous,
25 | }
26 | }
27 |
// CalcP2 calculates the cross-cluster correlation profile P2 between aln1 and
// the first alignment in mates (additional mates are ignored). For each codon
// lag l in [0, MaxCodonLen) it averages MateP11 over all start positions and
// codon-pair combinations; in Synonymous mode only combinations whose first
// codon pairs translate to the same amino acids contribute. Lag is reported
// in base pairs (l * 3).
// NOTE(review): assumes both alignments contain at least one sequence --
// cs1[0]/cs2[0] would panic otherwise; confirm callers guarantee this.
func (cc *MateCalculator) CalcP2(aln1 Alignment, mates ...Alignment) (corrResults mcorr.CorrResults) {
	if len(mates) == 0 {
		return
	}

	var results []mcorr.CorrResult
	cs1 := cc.extractCodonSequences(aln1)
	cs2 := cc.extractCodonSequences(mates[0])

	for l := 0; l < cc.MaxCodonLen; l++ {
		totalP2 := 0.0
		totaln := 0
		for pos := 0; pos+l < len(cs1[0]) && pos+l < len(cs2[0]); pos++ {
			cpList1 := cc.extractCodonPairs(cs1, pos, pos+l)
			cpList2 := cc.extractCodonPairs(cs2, pos, pos+l)
			for _, cp1 := range cpList1 {
				nc1 := doubleCodons(cp1, cc.CodonPosition)
				for _, cp2 := range cpList2 {
					nc2 := doubleCodons(cp2, cc.CodonPosition)
					if cc.Synonymous {
						aa1 := cc.translateCodonPair(cp1[0])
						aa2 := cc.translateCodonPair(cp2[0])
						if aa1 == aa2 {
							xy, n := nc1.MateP11(nc2, 0)
							totalP2 += xy
							totaln += n
						}
					} else {
						xy, n := nc1.MateP11(nc2, 0)
						totalP2 += xy
						totaln += n
					}
				}
			}
		}
		if totaln > 0 {
			res1 := mcorr.CorrResult{
				Lag:  l * 3,
				Mean: totalP2 / float64(totaln),
				N:    totaln,
				Type: "P2",
			}
			results = append(results, res1)
		}
	}

	corrResults = mcorr.CorrResults{ID: aln1.ID, Results: results}

	return
}
79 |
80 | func (cc *MateCalculator) translateCodonPair(cp CodonPair) string {
81 | a := cc.CodingTable.Table[string(cp.A)]
82 | b := cc.CodingTable.Table[string(cp.B)]
83 | return string([]byte{a, b})
84 | }
85 |
86 | func (cc *MateCalculator) extractCodonSequences(aln Alignment) (csList []CodonSequence) {
87 | for _, s := range aln.Sequences {
88 | csList = append(csList, extractCodons(s, cc.CodonOffset))
89 | }
90 | return
91 | }
92 |
93 | func (cc *MateCalculator) extractCodonPairs(codonSequences []CodonSequence, i, j int) [][]CodonPair {
94 | codonPairs := []CodonPair{}
95 | for _, cc := range codonSequences {
96 | if i < len(cc) && j < len(cc) {
97 | pair := CodonPair{A: cc[i], B: cc[j]}
98 | codonPairs = append(codonPairs, pair)
99 | }
100 | }
101 |
102 | if cc.Synonymous {
103 | return synonymousSplit(codonPairs, cc.CodingTable)
104 | }
105 |
106 | return [][]CodonPair{codonPairs}
107 | }
108 |
--------------------------------------------------------------------------------
/cmd/mcorr-xmfa-2clades/mate_calculator.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "github.com/kussell-lab/mcorr"
5 | "github.com/kussell-lab/ncbiftp/taxonomy"
6 | )
7 |
// MateCalculator calculates correlations between two clusters of sequences.
type MateCalculator struct {
	CodingTable   *taxonomy.GeneticCode // genetic code used for translation
	MaxCodonLen   int                   // maximum codon lag to evaluate
	CodonOffset   int                   // offset applied when extracting codons
	CodonPosition int                   // codon position compared by doubleCodons
	Synonymous    bool                  // restrict to synonymous codon pairs
}
16 |
17 | // NewMateCalculator returns a MateCalculator
18 | func NewMateCalculator(codingTable *taxonomy.GeneticCode, maxCodonLen, codonOffset, codonPos int, synonymous bool) *MateCalculator {
19 | return &MateCalculator{
20 | CodingTable: codingTable,
21 | MaxCodonLen: maxCodonLen,
22 | CodonOffset: codonOffset,
23 | CodonPosition: codonPos,
24 | Synonymous: synonymous,
25 | }
26 | }
27 |
// CalcP2 calculates the cross-clade correlation profile P2 between aln1 and
// the first alignment in mates (additional mates are ignored). For each codon
// lag l in [0, MaxCodonLen) it averages MateP11 over all start positions and
// codon-pair combinations; in Synonymous mode only combinations whose first
// codon pairs translate to the same amino acids contribute. Lag is reported
// in base pairs (l * 3).
// NOTE(review): assumes both alignments contain at least one sequence --
// cs1[0]/cs2[0] would panic otherwise; confirm callers guarantee this.
func (cc *MateCalculator) CalcP2(aln1 Alignment, mates ...Alignment) (corrResults mcorr.CorrResults) {
	if len(mates) == 0 {
		return
	}

	var results []mcorr.CorrResult
	cs1 := cc.extractCodonSequences(aln1)
	cs2 := cc.extractCodonSequences(mates[0])

	for l := 0; l < cc.MaxCodonLen; l++ {
		totalP2 := 0.0
		totaln := 0
		for pos := 0; pos+l < len(cs1[0]) && pos+l < len(cs2[0]); pos++ {
			cpList1 := cc.extractCodonPairs(cs1, pos, pos+l)
			cpList2 := cc.extractCodonPairs(cs2, pos, pos+l)
			for _, cp1 := range cpList1 {
				nc1 := doubleCodons(cp1, cc.CodonPosition)
				for _, cp2 := range cpList2 {
					nc2 := doubleCodons(cp2, cc.CodonPosition)
					if cc.Synonymous {
						aa1 := cc.translateCodonPair(cp1[0])
						aa2 := cc.translateCodonPair(cp2[0])
						if aa1 == aa2 {
							xy, n := nc1.MateP11(nc2, 0)
							totalP2 += xy
							totaln += n
						}
					} else {
						xy, n := nc1.MateP11(nc2, 0)
						totalP2 += xy
						totaln += n
					}
				}
			}
		}
		if totaln > 0 {
			res1 := mcorr.CorrResult{
				Lag:  l * 3,
				Mean: totalP2 / float64(totaln),
				N:    totaln,
				Type: "P2",
			}
			results = append(results, res1)
		}
	}

	corrResults = mcorr.CorrResults{ID: aln1.ID, Results: results}

	return
}
79 |
80 | func (cc *MateCalculator) translateCodonPair(cp CodonPair) string {
81 | a := cc.CodingTable.Table[string(cp.A)]
82 | b := cc.CodingTable.Table[string(cp.B)]
83 | return string([]byte{a, b})
84 | }
85 |
86 | func (cc *MateCalculator) extractCodonSequences(aln Alignment) (csList []CodonSequence) {
87 | for _, s := range aln.Sequences {
88 | csList = append(csList, extractCodons(s, cc.CodonOffset))
89 | }
90 | return
91 | }
92 |
93 | func (cc *MateCalculator) extractCodonPairs(codonSequences []CodonSequence, i, j int) [][]CodonPair {
94 | codonPairs := []CodonPair{}
95 | for _, cc := range codonSequences {
96 | if i < len(cc) && j < len(cc) {
97 | pair := CodonPair{A: cc[i], B: cc[j]}
98 | codonPairs = append(codonPairs, pair)
99 | }
100 | }
101 |
102 | if cc.Synonymous {
103 | return synonymousSplit(codonPairs, cc.CodingTable)
104 | }
105 |
106 | return [][]CodonPair{codonPairs}
107 | }
108 |
--------------------------------------------------------------------------------
/cmd/mcorr-fit/mcorr/lmfitFunctions.py:
--------------------------------------------------------------------------------
1 | """defining lmfit functions for fitCorr.py script"""
2 | from lmfit import Minimizer, Parameters, minimize
3 | import numpy as np
4 | import numdifftools
5 |
def c_s(phi_s, w, f, theta_s, a):
    """Sample recombination coverage c_s (eq 21)."""
    recomb = phi_s*w*f
    return recomb/(1+theta_s*a+recomb)
10 |
def d_i(a, theta_i):
    """Expected diversity d for mutation rate theta_i (eq 6)."""
    return theta_i/(1+theta_i*a)
15 |
def c_s0(c_s1, c_s2, l):
    """Probability that neither endpoint is recombined (eq 14): 1 - c_s1 - c_s2, element-wise over lags l."""
    return np.ones(len(l))-c_s1-c_s2
19 |
def c_s1(w, a, phi_s, l, theta_s, f):
    """Probability that exactly one endpoint is recombined (eq 23).

    Element-wise over lags l: uses the l < f expression where it applies
    and the l >= f expression elsewhere.
    """
    below = (2*phi_s*w*l)/(1+2*theta_s*a+phi_s*w*(f+l))
    above = (2*phi_s*w*l)/(1+2*theta_s*a+phi_s*w)
    return np.where(l < f, below, above)
27 |
def c_s2(phi_s, w, f, l, theta_s, a):
    """Probability that both endpoints lie within one recombined fragment (eq 24); zero for l >= f."""
    inside = (phi_s*w*(f-l))/(1+2*theta_s*a+phi_s*w*(f+l))
    return np.where(l < f, inside, 0)
32 |
def Q_p(theta_p, a, phi_p, w, l):
    """Pair correlation within a recombined fragment (eq 25), element-wise over lags l."""
    d_squared = (theta_p/(1+theta_p*a))**2
    return 2*d_squared*((1+theta_p*a+phi_p*w*l)/(1+2*theta_p*a+2*phi_p*w*l))
37 |
def residual(pars, x, data=None):
    """Model / residual function for the P2 fit (eq 18).

    With data=None, returns the model P2 evaluated at lags x; otherwise
    returns (model - data) for least-squares minimization. pars is a
    mapping (lmfit Parameters or dict) holding all model parameters.
    """
    phi_s = pars["phi_s"]
    theta_s = pars["theta_s"]
    theta_p = pars["theta_p"]
    w = pars["w"]
    a = pars["a"]
    c_s_val = pars["c_s"]  # read for completeness; not used directly below
    d_theta_s = pars["d_theta_s"]
    phi_p = pars["phi_p"]
    ds = pars["d_s"]
    f = pars["f"]

    # intermediate terms feeding eq 18
    d2thetas = d_i(a, 2*theta_s)            # eq 20 at 2*theta_s
    cs1 = c_s1(w, a, phi_s, x, theta_s, f)  # eq 23
    dp = d_i(a, theta_p)                    # eq 20 at theta_p
    cs2 = c_s2(phi_s, w, f, x, theta_s, a)  # eq 24
    cs0 = c_s0(cs1, cs2, x)                 # eq 14
    Qp = Q_p(theta_p, a, phi_p, w, x)       # eq 25
    # eq 18
    Qs = cs0*d2thetas*ds+cs1*ds*dp+cs2*Qp
    P2 = Qs/ds
    if data is None:
        return P2
    return P2 - data
69 |
70 |
def perform_lmfit(x, y, d_sample):
    """Fit the P2 model to (x, y) with lmfit least squares.

    x: lags; y: observed P2 values; d_sample: sample diversity d_s (held
    fixed). Returns the lmfit MinimizerResult.
    """
    pfit = Parameters()
    # free parameters
    pfit.add(name="phi_s", vary=True, min=0, value=1e-4) ##originally had upper bound of 1
    ##inital 7.5e2
    pfit.add(name="f", vary=True, value=7.5e2, min=3, max=1e6) ##originally min=3/max=3e5; value=1e3
    pfit.add(name="theta_s", vary=True, min=0, value=1e-4)
    ##define the fixed params
    pfit.add(name="w", value=2.0/3.0, vary=False)
    pfit.add(name="a", value=4.0/3.0, vary=False)
    pfit.add(name="d_s", vary=False, value=d_sample)
    ##constrained params (derived via lmfit expr from the free/fixed ones)
    ##originally 0 to 1 for c_s
    pfit.add(name="c_s", expr="(phi_s*w*f)/(1+theta_s*a+phi_s*w*f)", min=0, max=1) #eq 21
    pfit.add(name="d_theta_s", expr="theta_s/(1+theta_s*a)", min=0) #eq 20 for theta_s (for eq 26)
    pfit.add(name="theta_p", expr="((1-c_s)*d_theta_s-d_s)/(a*(d_s-d_theta_s)+c_s*(d_theta_s*a-1))", min=0) #eq 26
    pfit.add(name="phi_p", expr="(theta_p*phi_s)/theta_s", min=0) #eq. 27
    pfit.add(name="d_theta_p", expr="theta_p/(1+theta_p*a)", min=0) #eq 20 for theta_p (for outputs)
    ##least squares with levenberg-marquardt
    result = minimize(residual, pfit, args=(x,), kws={'data': y}, method="least_squares", max_nfev=1e6)
    ##nelder-mead algorithm for least-squares minimization
    #result = minimize(residual, pfit, args=(x,), kws={'data': y}, method="nelder", max_nfev=1e6)
    return result
--------------------------------------------------------------------------------
/cmd/development/mcorr-vcf/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bufio"
5 | "fmt"
6 | "io"
7 | "os"
8 | "strconv"
9 | "strings"
10 |
11 | "github.com/kussell-lab/mcorr"
12 |
13 | "gopkg.in/alecthomas/kingpin.v2"
14 | )
15 |
16 | func main() {
17 | app := kingpin.New("mcorr-vcf", "Calculate mutational correlation from VCF files.")
18 | app.Version("v20171020")
19 | vcfFileArg := app.Arg("vcf-file", "VCF input file.").Required().String()
20 | outFileArg := app.Arg("out-prefix", "output prefix.").Required().String()
21 | maxlFlag := app.Flag("max-corr-length", "max length of correlations (bp).").Default("300").Int()
22 | regionStartFlag := app.Flag("region-start", "region start").Default("1").Int()
23 | regionEndFlag := app.Flag("region-end", "region end").Default("1000000000000").Int()
24 | kingpin.MustParse(app.Parse(os.Args[1:]))
25 |
26 | vcfChan := readVCF(*vcfFileArg)
27 | p2arr := make([]float64, *maxlFlag)
28 | p2counts := make([]int64, *maxlFlag)
29 | var buffer []VCFRecord
30 | for rec := range vcfChan {
31 | if rec.Pos < *regionStartFlag || rec.Pos > *regionEndFlag {
32 | break
33 | }
34 | if len(buffer) == 0 || rec.Pos-buffer[0].Pos < *maxlFlag {
35 | buffer = append(buffer, rec)
36 | } else {
37 | compute(buffer, p2arr, p2counts)
38 | buffer = buffer[1:]
39 | }
40 | }
41 | compute(buffer, p2arr, p2counts)
42 |
43 | w, err := os.Create(*outFileArg)
44 | if err != nil {
45 | panic(err)
46 | }
47 | defer w.Close()
48 | w.WriteString("l,m,n,v,t,b\n")
49 | for k := 0; k < len(p2arr); k++ {
50 | var m float64
51 | var n int64
52 | var t string
53 | n = p2counts[k]
54 | if k == 0 {
55 | m = p2arr[0] / float64(p2counts[0])
56 | t = "Ks"
57 | } else {
58 | m = p2arr[k] / p2arr[0]
59 | t = "P2"
60 | }
61 | if n > 0 {
62 | w.WriteString(fmt.Sprintf("%d,%g,0,%d,%s,all\n", k, m, n, t))
63 | }
64 | }
65 | }
66 |
// compute accumulates P11 terms for every record in buffer paired against the
// window head buffer[0]; lag is the genomic distance to the head. i == 0
// pairs the head with itself, contributing the lag-0 (Ks) diagonal. The
// caller must guarantee that every lag is < len(p2arr).
func compute(buffer []VCFRecord, p2arr []float64, p2counts []int64) {
	for i := 0; i < len(buffer); i++ {
		nc := mcorr.NewNuclCov([]byte{'0', '1'})
		for k := 0; k < len(buffer[0].GTs); k++ {
			nc.Add(buffer[0].GTs[k], buffer[i].GTs[k])
		}
		lag := buffer[i].Pos - buffer[0].Pos
		xy, n := nc.P11(0)
		p2arr[lag] += xy / float64(n)
		p2counts[lag]++
	}
}
80 |
// readVCF streams biallelic SNP records (single-base REF and ALT) from a VCF
// file on a channel; header lines starting with '#' are skipped. Genotype
// bytes are taken from every column after the "GT" column, with '|'
// separators removed. The channel is closed on EOF; other errors panic.
func readVCF(filename string) (c chan VCFRecord) {
	c = make(chan VCFRecord)
	go func() {
		defer close(c)
		f, err := os.Open(filename)
		if err != nil {
			panic(err)
		}
		defer f.Close()

		rd := bufio.NewReader(f)
		for {
			line, err := rd.ReadString('\n')
			if err != nil {
				if err != io.EOF {
					panic(err)
				}
				break
			}
			if line[0] == '#' {
				continue
			}

			line = strings.TrimSpace(line)
			terms := strings.Split(line, "\t")
			var rec VCFRecord
			rec.Chrom = terms[0]
			rec.Pos = atoi(terms[1])
			rec.Ref = terms[3]
			rec.Alt = terms[4]
			// keep only biallelic single-nucleotide records
			if len(rec.Alt) == 1 && len(rec.Ref) == 1 {
				inGT := false
				for _, t := range terms {
					if t == "GT" {
						inGT = true
					} else if inGT {
						for _, gt := range t {
							if gt != '|' {
								rec.GTs = append(rec.GTs, byte(gt))
							}
						}
					}
				}
				c <- rec
			}
		}
	}()
	return
}
131 |
// atoi converts s to an int, panicking on malformed input.
func atoi(s string) int {
	value, err := strconv.Atoi(s)
	if err != nil {
		panic(err)
	}
	return value
}
139 |
--------------------------------------------------------------------------------
/cmd/mcorr-fit/mcorr/fit_data.py:
--------------------------------------------------------------------------------
1 | import numpy
2 |
class FitData(object):
    """Fitting inputs for one group: lags, correlation values, and sample diversity."""
    def __init__(self, group, xvalues, yvalues, d_sample):
        # group: label (e.g. "all" or a bootstrap ID); xvalues/yvalues: lag
        # and P2 arrays; d_sample: sample diversity d_s.
        self.group, self.xvalues, self.yvalues, self.d_sample = \
            group, xvalues, yvalues, d_sample
10 |
class FitDatas(object):
    """Collection of FitData keyed by group, preserving first-seen group order."""
    def __init__(self, corr_results, fit_start, xmax):
        grouped = {}
        ordered = []
        for row in corr_results:
            if row.group not in grouped:
                grouped[row.group] = []
                ordered.append(row.group)
            grouped[row.group].append(row)
        self.fitdata_dict = {}
        for group, items in grouped.items():
            xvalues, yvalues, d_sample = prepare_fitting_data(
                items, fit_start, xmax)
            self.fitdata_dict[group] = FitData(group, xvalues, yvalues, d_sample)
        self.groups = ordered

    def has(self, group):
        """Return True if the group has fitting data."""
        return group in self.fitdata_dict

    def get(self, group):
        """Return the FitData for group, or None if absent."""
        return self.fitdata_dict.get(group, None)

    def getall(self):
        """Return all FitData objects in first-seen group order."""
        return [self.fitdata_dict[group] for group in self.groups]
40 |
def prepare_fitting_data(fitdata, fit_start, xmax):
    """Split correlation rows into fit inputs.

    Returns (xvalues, yvalues, d_sample): lags and P2 values whose lag lies
    in [fit_start, xmax], plus the diversity taken from the 'Ks' row (0 if
    no such row exists).
    """
    lags = []
    corrs = []
    diversity = 0
    for row in fitdata:
        if row.corrtype == 'P2' and fit_start <= row.lag <= xmax:
            lags.append(row.lag)
            corrs.append(row.value)
        elif row.corrtype == 'Ks':
            diversity = row.value
    return (numpy.array(lags), numpy.array(corrs), diversity)
55 |
class FitGeneDatas(object):
    """Collection of per-gene FitData keyed by group, preserving first-seen group order."""
    def __init__(self, corr_results, fit_start, xmax):
        grouped = {}
        ordered = []
        for row in corr_results:
            if row.group not in grouped:
                grouped[row.group] = []
                ordered.append(row.group)
            grouped[row.group].append(row)
        self.fitdata_dict = {}
        for group, items in grouped.items():
            xvalues, yvalues, d_sample = prepare_fitting_genedata(
                items, fit_start, xmax)
            self.fitdata_dict[group] = FitData(group, xvalues, yvalues, d_sample)
        self.groups = ordered

    def has(self, group):
        """Return True if the group has fitting data."""
        return group in self.fitdata_dict

    def get(self, group):
        """Return the FitData for group, or None if absent."""
        return self.fitdata_dict.get(group, None)

    def getall(self):
        """Return all FitData objects in first-seen group order."""
        return [self.fitdata_dict[group] for group in self.groups]
85 |
def prepare_fitting_genedata(fitdata, fit_start, xmax):
    """Split per-gene correlation rows into fit inputs.

    Returns (xvalues, yvalues, d_sample): lags and P2 values whose lag lies
    in [fit_start, xmax], plus the diversity taken from any row with lag 0
    that did not match the P2 window (branch order matters and is preserved).
    """
    lags = []
    corrs = []
    diversity = 0
    for row in fitdata:
        if row.corrtype == 'P2' and fit_start <= row.lag <= xmax:
            lags.append(row.lag)
            corrs.append(row.value)
        elif row.lag == 0:
            diversity = row.value
    return (numpy.array(lags), numpy.array(corrs), diversity)
100 |
101 |
102 |
--------------------------------------------------------------------------------
/cmd/mcorr-bam/codon.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "github.com/kussell-lab/ncbiftp/taxonomy"
5 | )
6 |
// Codon stores a codon value, the position in a gene, and the originating read ID.
type Codon struct {
	Seq     string // codon sequence
	ReadID  string // ID of the read the codon came from
	GenePos int    // codon index within the gene
}
13 |
14 | // ContainsGap return true if '-' in a sequence.
15 | func (c Codon) ContainsGap() bool {
16 | for _, b := range c.Seq {
17 | if b == '-' {
18 | return true
19 | }
20 | }
21 | return false
22 | }
23 |
// CodonPile stores a pile of Codon at one genome position, keyed by read ID.
type CodonPile struct {
	genePos  int              // gene position shared by the piled codons
	codonMap map[string]Codon // read ID -> codon at this position
}
29 |
30 | // NewCodonPile return a new CodonPile.
31 | func NewCodonPile() *CodonPile {
32 | return &CodonPile{codonMap: make(map[string]Codon)}
33 | }
34 |
// Add inserts a codon keyed by its read ID, overwriting any earlier codon
// from the same read, and records the codon's gene position on the pile.
func (cp *CodonPile) Add(c Codon) {
	cp.genePos = c.GenePos
	cp.codonMap[c.ReadID] = c
}
40 |
// LookUp searches a codon by read ID. If not found, it returns the
// zero-value Codon (empty ReadID) -- Codon is a value type, not a pointer,
// so the old "returns nil" comment was wrong.
func (cp *CodonPile) LookUp(readID string) Codon {
	return cp.codonMap[readID]
}
45 |
// Len returns the number of piled-up codons.
func (cp *CodonPile) Len() int {
	return len(cp.codonMap)
}
50 |
// GenePos returns the gene position recorded by the most recent Add.
func (cp *CodonPile) GenePos() int {
	return cp.genePos
}
55 |
// CodonGene represents a gene as an array of CodonPile indexed by codon position.
type CodonGene struct {
	CodonPiles []*CodonPile
}
60 |
61 | // NewCodonGene return a new CodonGene.
62 | func NewCodonGene() *CodonGene {
63 | return &CodonGene{}
64 | }
65 |
// AddCodon adds a codon, growing the pile slice on demand so that
// slice index == GenePos.
func (cg *CodonGene) AddCodon(c Codon) {
	for len(cg.CodonPiles) <= c.GenePos {
		cg.CodonPiles = append(cg.CodonPiles, NewCodonPile())
	}
	cg.CodonPiles[c.GenePos].Add(c)
}
73 |
74 | // DepthAt return the pile depth at position i.
75 | func (cg *CodonGene) DepthAt(i int) int {
76 | if len(cg.CodonPiles) <= i {
77 | return 0
78 | }
79 | return cg.CodonPiles[i].Len()
80 | }
81 |
// Len returns the number of codon piles, i.e. the gene length in codons.
func (cg *CodonGene) Len() int {
	return len(cg.CodonPiles)
}
86 |
// CodonPair stores a pair of Codon taken from the same read at two positions.
type CodonPair struct {
	A, B Codon
}
91 |
// PairCodonAt pairs codons at positions i and j, matching codons from the
// same read by ReadID; positions are swapped if i > j, and out-of-range
// positions yield no pairs.
// NOTE(review): when i == j each codon is first self-paired and then the
// read-matching loop pairs it with itself again, so diagonal pairs appear
// twice -- confirm whether that double count is intended.
func (cg *CodonGene) PairCodonAt(i, j int) (pairs []CodonPair) {
	if i >= len(cg.CodonPiles) || j >= len(cg.CodonPiles) {
		return
	}

	if i > j {
		j, i = i, j
	}

	pile1 := cg.CodonPiles[i]
	if i == j {
		for _, codon := range pile1.codonMap {
			pairs = append(pairs, CodonPair{A: codon, B: codon})
		}
	}
	pile2 := cg.CodonPiles[j]
	for readID, codon1 := range pile1.codonMap {
		codon2 := pile2.LookUp(readID)
		// zero-value Codon (empty ReadID) means the read does not cover j
		if codon2.ReadID != "" {
			pairs = append(pairs, CodonPair{A: codon1, B: codon2})
		}
	}
	return
}
117 |
// SynoumousSplitCodonPairs splits codon pairs into groups that translate to
// the same amino-acid pair, skipping any pair containing a non-ATGC base.
// (The exported name keeps the historical "Synoumous" misspelling; renaming
// it would break callers.)
func SynoumousSplitCodonPairs(codonPairs []CodonPair, codeTable *taxonomy.GeneticCode) [][]CodonPair {
	var splittedPairs [][]CodonPair
	var aaArray []string
	for _, codonPair := range codonPairs {
		// reject pairs containing any gap or ambiguous base
		hasGap := false
		for _, codon := range []Codon{codonPair.A, codonPair.B} {
			for _, b := range codon.Seq {
				if !isATGC(byte(b)) {
					hasGap = true
					break
				}
			}
			if hasGap {
				break
			}
		}

		if hasGap {
			continue
		}

		// translate the pair and find (or create) its amino-acid group
		a := codeTable.Table[codonPair.A.Seq]
		b := codeTable.Table[codonPair.B.Seq]
		ab := string([]byte{a, b})
		index := -1
		for i, aa := range aaArray {
			if aa == ab {
				index = i
			}
		}
		if index == -1 {
			index = len(aaArray)
			aaArray = append(aaArray, ab)
			splittedPairs = append(splittedPairs, []CodonPair{})
		}
		splittedPairs[index] = append(splittedPairs[index], codonPair)
	}
	return splittedPairs
}
158 |
--------------------------------------------------------------------------------
/cmd/mcorr-fit/old/fitCorr.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import os
3 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
4 | #from mcorr import fit_p2
5 | from mcorr import fit_p2, read_corr, FitDatas, \
6 | write_fitting_results, plot_fit, plot_params, write_fitting_reports, \
7 | geom_r1, const_r1
8 | from mcorr.fit import fit_model, vary_fit
9 | from lmfit import fit_report
10 | import csv
11 | import pandas as pd
12 | #from lmfit.printfuncs import report_fit
13 | from mcorr.lmfitFunctions import perform_lmfit
14 |
def main():
    """Run fitting using lmfit, and generate output files and plots.

    Reads a correlation CSV, fits the 'all' group with perform_lmfit, then
    writes an lmfit report CSV, a residuals CSV, and a best-fit SVG plot.
    """
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description="Fit the actual data (not the bootstraps) and return goodness-of fit stats")
    parser.add_argument("corr_file", type = str, help='correlation input file')
    parser.add_argument("output_prefix", type=str, help='output file prefix')
    parser.add_argument('--fit_start', type=int, default=3,
                        help='fitting range starts at')
    parser.add_argument('--fit_end', type=int, default=300,
                        help='fitting range ends at')
    parser.add_argument("--use_geom_frag", action="store_true",
                        help='use geometric distribution for fragment sizes')
    parser.add_argument('--quiet', action="store_true")
    parser.add_argument("--title", type=str, help="plot title", default="")
    opts = parser.parse_args()
    corr_file = opts.corr_file
    prefix = opts.output_prefix
    fit_start = opts.fit_start
    fit_end = opts.fit_end
    quiet = opts.quiet
    use_geom_frag = opts.use_geom_frag
    title = opts.title

    ##for testing fixes
    # dir = '/Volumes/aps_timemachine/recombo/APS160.5_lmfit/cluster8_cluster221'
    # corr_file = os.path.join(dir, 'cluster8_cluster221_CORE_XMFA_OUT.csv')
    # prefix = 'cluster8_cluster221_CORE_FIT_OUT_0205test'
    # fit_start = 3
    # fit_end = 300
    # quiet = False
    # use_geom_frag = False
    # title=""

    # read correlation results and prepare fitting data
    corr_results = read_corr(corr_file)
    fitdatas = FitDatas(corr_results, fit_start, fit_end)
    ##do fitting
    # NOTE(review): r1_func is selected here but never passed to the fit
    # below; perform_lmfit ignores the fragment-size choice -- confirm.
    r1_func = const_r1
    #if you want to use a geometric distribution of fragments
    if use_geom_frag:
        r1_func = geom_r1

    all = fitdatas.get("all")
    x = all.xvalues
    y = all.yvalues
    d_sample = all.d_sample
    fitres = perform_lmfit(x, y, d_sample)
    ## write a fit report as generated by lmfit (includes chi-squared, uncertainties, etc)
    params = fitres.params.valuesdict()
    thetaS = fitres.params["theta_s"]
    phiS = fitres.params["phi_s"]
    f = fitres.params["f"]
    lmfitfile = prefix + "_lmfit_report.csv"
    with open(lmfitfile, "w+") as csvfile:
        lmfit_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        lmfit_writer.writerow(["fit_success", fitres.success])
        lmfit_writer.writerow(["function_evals", fitres.nfev])
        lmfit_writer.writerow(["data_points", fitres.ndata])
        lmfit_writer.writerow(["variables", fitres.nvarys])
        lmfit_writer.writerow(["message", fitres.message])
        lmfit_writer.writerow(["thetaS (init)", thetaS.init_value])
        lmfit_writer.writerow(["f (init)", f.init_value])
        lmfit_writer.writerow(["phiS (init)", phiS.init_value])
        lmfit_writer.writerow([""])
        lmfit_writer.writerow(["d_s", "theta_s", "f", "phi_s",
                               "theta_p", "phi_p", "c", "d_theta_p",
                               "d_theta_s", "chisq", "red-chisq"])
        lmfit_writer.writerow([params["d_s"], params["theta_s"], params["f"], params["phi_s"],
                               params["theta_p"], params["phi_p"], params["c_s"], params["d_theta_p"],
                               params["d_theta_s"], fitres.chisqr, fitres.redchi])
    ##save the residuals as a .csv file
    residuals = fitres.residual
    resdat = pd.DataFrame(residuals)
    resdat.to_csv(prefix+"_residuals.csv", header=None)
    ##plot the best fit and the residuals
    best_fit_file = prefix + "_best_fit.svg"
    plot_fit(all, fitres, best_fit_file, title=title)
93 |
# Script entry point.
if __name__ == "__main__":
    main()
96 |
97 |
98 |
99 |
100 |
101 |
102 |
--------------------------------------------------------------------------------
/cmd/mcorr-xmfa/coding_calculator.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "github.com/kussell-lab/biogo/seq"
5 | "github.com/kussell-lab/mcorr"
6 | "github.com/kussell-lab/ncbiftp/taxonomy"
7 | )
8 |
// Calculator defines an interface for calculating correlations among alignments.
type Calculator interface {
	CalcP2(a Alignment, others ...Alignment) (corrResults mcorr.CorrResults)
}
13 |
14 | // CodingCalculator for calculating coding sequences.
15 | type CodingCalculator struct {
16 | CodingTable *taxonomy.GeneticCode
17 | MaxCodonLen int
18 | CodonOffset int
19 | CodonPosition int
20 | Synonymous bool
21 | }
22 |
23 | // NewCodingCalculator return a CodingCalculator
24 | func NewCodingCalculator(codingTable *taxonomy.GeneticCode, maxCodonLen, codonOffset int, codonPosition int, synonymous bool) *CodingCalculator {
25 | return &CodingCalculator{
26 | CodingTable: codingTable,
27 | MaxCodonLen: maxCodonLen,
28 | CodonOffset: codonOffset,
29 | CodonPosition: codonPosition,
30 | Synonymous: synonymous,
31 | }
32 | }
33 |
34 | // CalcP2 calculate P2
35 | func (cc *CodingCalculator) CalcP2(a Alignment, others ...Alignment) mcorr.CorrResults {
36 | results := calcP2Coding(a, cc.CodonOffset, cc.CodonPosition, cc.MaxCodonLen, cc.CodingTable, cc.Synonymous)
37 | return mcorr.CorrResults{ID: a.ID, Results: results}
38 | }
39 |
// calcP2Coding computes the P2 correlation (the joint probability of
// substitutions at two sites) averaged over all site pairs, for codon
// lags 0..maxCodonLen-1. Lags are reported in bases (3 per codon).
func calcP2Coding(aln Alignment, codonOffset, codonPosition, maxCodonLen int, codingTable *taxonomy.GeneticCode, synonymous bool) (results []mcorr.CorrResult) {
	// Split every aligned sequence into codons at the given frame offset.
	codonSequences := [][]Codon{}
	for _, s := range aln.Sequences {
		codons := extractCodons(s, codonOffset)
		codonSequences = append(codonSequences, codons)
	}

	for l := 0; l < maxCodonLen; l++ {
		totalP2 := 0.0
		totaln := 0

		// For each site i, pair it with the site at lag l (j = i+l)
		// across all sequences in the alignment.
		for i := 0; i+l < len(codonSequences[0]); i++ {
			codonPairs := []CodonPair{}
			j := i + l
			for _, cc := range codonSequences {
				if i+l < len(cc) {
					codonPairs = append(codonPairs, CodonPair{A: cc[i], B: cc[j]})
				}
			}

			// When restricted to synonymous changes, group the codon
			// pairs by their translated amino-acid pair first.
			multiCodonPairs := [][]CodonPair{}
			if synonymous {
				multiCodonPairs = synonymousSplit(codonPairs, codingTable)
			} else {
				multiCodonPairs = append(multiCodonPairs, codonPairs)
			}
			for _, codonPairs := range multiCodonPairs {
				// Need at least two codon pairs to form a comparison.
				if len(codonPairs) >= 2 {
					nc := doubleCodons(codonPairs, codonPosition)
					xy, n := nc.P11(0)
					totalP2 += xy
					totaln += n

				}
			}
		}

		if totaln > 0 {
			res1 := mcorr.CorrResult{
				Lag:  l * 3,
				Mean: totalP2 / float64(totaln),
				N:    totaln,
				Type: "P2",
			}
			results = append(results, res1)
		}
	}

	return
}
90 |
91 | func doubleCodons(codonPairs []CodonPair, codonPosition int) *mcorr.NuclCov {
92 | alphabet := []byte{'A', 'T', 'G', 'C'}
93 | c := mcorr.NewNuclCov(alphabet)
94 | for _, codonPair := range codonPairs {
95 | a := codonPair.A[codonPosition]
96 | b := codonPair.B[codonPosition]
97 | c.Add(a, b)
98 | }
99 | return c
100 | }
101 |
// Codon is a byte list of length 3
type Codon []byte

// CodonSequence is a sequence of codons.
type CodonSequence []Codon

// CodonPair is a pair of Codons.
// A and B are the codons at the two sites whose correlation is measured.
type CodonPair struct {
	A, B Codon
}
112 |
113 | // extractCodons return a list of codons from a DNA sequence.
114 | func extractCodons(s seq.Sequence, offset int) (codons []Codon) {
115 | for i := offset; i+3 <= len(s.Seq); i += 3 {
116 | c := s.Seq[i:(i + 3)]
117 | codons = append(codons, c)
118 | }
119 | return
120 | }
121 |
122 | // synonymousSplit split a list of codon pairs into multiple
123 | // synonymous pairs.
124 | func synonymousSplit(codonPairs []CodonPair, codingTable *taxonomy.GeneticCode) (multiCodonPairs [][]CodonPair) {
125 | aaList := []string{}
126 | for _, codonPair := range codonPairs {
127 | // check gap.
128 | containsGap := false
129 | for _, codon := range []Codon{codonPair.A, codonPair.B} {
130 | for i := 0; i < 3; i++ {
131 | if codon[i] == '-' || codon[i] == 'N' {
132 | containsGap = true
133 | break
134 | }
135 | }
136 | }
137 | if containsGap {
138 | continue
139 | }
140 |
141 | codonA := string(codonPair.A)
142 | codonB := string(codonPair.B)
143 | a := codingTable.Table[codonA]
144 | b := codingTable.Table[codonB]
145 | ab := string([]byte{a, b})
146 | index := -1
147 | for i := 0; i < len(aaList); i++ {
148 | if aaList[i] == ab {
149 | index = i
150 | }
151 | }
152 | if index == -1 {
153 | index = len(aaList)
154 | aaList = append(aaList, ab)
155 | multiCodonPairs = append(multiCodonPairs, []CodonPair{})
156 | }
157 |
158 | multiCodonPairs[index] = append(multiCodonPairs[index], codonPair)
159 | }
160 |
161 | return
162 | }
163 |
--------------------------------------------------------------------------------
/cmd/mcorr-fit/mcorr/cli.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import csv
3 | import os
4 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
5 | #from mcorr import fit_p2
6 | from mcorr import fit_p2, read_corr, FitDatas, \
7 | write_fitting_results, plot_fit, plot_params, write_fitting_reports, \
8 | geom_r1, const_r1
9 | from mcorr.fit import fit_model
10 | from lmfit import fit_report
11 |
def main():
    """Run fitting using lmfit, and generate output files and plots.

    Reads a correlation-profile CSV, fits the "all" group once to write an
    lmfit goodness-of-fit report (chi-squared, uncertainties, etc.), then
    runs the full bootstrapped fitting and writes results, reports and the
    best-fit plot.
    """
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description="Infer recombination rates\
            by fitting correlation profile of mutations.")
    parser.add_argument("corr_file", type=str, help='correlation input file')
    parser.add_argument("output_prefix", type=str, help='output file prefix')
    parser.add_argument('--fit_start', type=int, default=3,
                        help='fitting range starts at')
    parser.add_argument('--fit_end', type=int, default=300,
                        help='fitting range ends at')
    parser.add_argument("--use_geom_frag", action="store_true",
                        help='use geometric distribution for fragment sizes')
    parser.add_argument('--quiet', action="store_true")
    parser.add_argument("--title", type=str, help="plot title", default="")
    opts = parser.parse_args()
    corr_file = opts.corr_file
    prefix = opts.output_prefix
    fit_start = opts.fit_start
    fit_end = opts.fit_end
    quiet = opts.quiet
    use_geom_frag = opts.use_geom_frag
    title = opts.title

    # read correlation results and prepare fitting data
    corr_results = read_corr(corr_file)
    fitdatas = FitDatas(corr_results, fit_start, fit_end)

    # Choose the fragment-size model once, BEFORE any fitting, so the lmfit
    # report below and the bootstrap fits use the same r1 function.
    # (Previously the report fit always used const_r1, silently ignoring
    # --use_geom_frag for that step.)
    r1_func = geom_r1 if use_geom_frag else const_r1

    # fit the actual data ("all" group) and write a fit report as generated
    # by lmfit (includes chi-squared, uncertainties, etc)
    fitdata_all = fitdatas.get("all")
    fitres_all = fit_model(fitdata_all.xvalues, fitdata_all.yvalues,
                           fitdata_all.d_sample, r1_func)
    params = fitres_all.params.valuesdict()
    thetaS = fitres_all.params["thetaS"]
    phiS = fitres_all.params["phiS"]
    f = fitres_all.params["f"]
    lmfitfile = prefix + "_lmfit_report.csv"
    with open(lmfitfile, "w+") as csvfile:
        lmfit_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        lmfit_writer.writerow(["fit_success", fitres_all.success])
        lmfit_writer.writerow(["function_evals", fitres_all.nfev])
        lmfit_writer.writerow(["data_points", fitres_all.ndata])
        lmfit_writer.writerow(["variables", fitres_all.nvarys])
        lmfit_writer.writerow(["message", fitres_all.message])
        lmfit_writer.writerow(["thetaS (init)", thetaS.init_value])
        lmfit_writer.writerow(["f (init)", f.init_value])
        lmfit_writer.writerow(["phiS (init)", phiS.init_value])
        lmfit_writer.writerow([""])
        lmfit_writer.writerow(["d_s", "theta_s", "f", "phi_s",
                               "theta_p", "phi_p", "c", "d_theta_p",
                               "d_theta_s", "chisq", "red-chisq"])
        lmfit_writer.writerow([params["ds"], params["thetaS"], params["f"], params["phiS"],
                               params["thetaP"], params["phiP"], params["c"], params["dp"],
                               params["dc"], fitres_all.chisqr, fitres_all.redchi])

    # do the bootstrapped fitting
    fit_results = fit_p2(fitdatas, r1_func=r1_func, disable_progress_bar=quiet)
    # parameters to report
    model_params = ["group", "d_sample", "theta_pool",
                    "phi_pool", "ratio", "fbar", "c", "d_pool",
                    "d_clonal", 'theta_s', 'phi_s']
    # save fitting results into csv file
    csv_file = prefix + "_fit_results.csv"
    write_fitting_results(fit_results, model_params, csv_file)
    # plot the best fit for the "all" group, if present
    best_fit_file = prefix + "_best_fit.svg"
    fitdata = fitdatas.get("all")
    fitres = None
    for res in fit_results:
        if res.group == "all":
            fitres = res
            break
    if fitres is not None:
        plot_fit(fitdata, fitres, best_fit_file, title=title)
    # write fitting report for bootstrapping
    report_file = prefix + "_bootstrapping_report.txt"
    write_fitting_reports(fit_results, model_params[1:7], report_file)

    # plot histogram of fitted parameters
    # temporarily taking this out because it is problematic when bins for bootstraps are not well-determined
    # params_hist_file = prefix + "_parameter_histograms.svg"
    # plot_params(fit_results, model_params[1:7], params_hist_file)

if __name__ == "__main__":
    main()
99 |
--------------------------------------------------------------------------------
/cmd/mcorr-fit/mcorr/singleFit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """"
3 | written by Asher Preska Steinberg (apsteinberg@nyu.edu)
4 | """
5 | import os
6 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
7 | #from mcorr import fit_p2
8 | from mcorr import fit_p2, read_corr, FitDatas, \
9 | write_fitting_results, plot_fit, plot_params, write_fitting_reports, \
10 | geom_r1, const_r1
11 | from mcorr.fit import fit_model, vary_fit
12 | #from mcorr.fit import fit_varynefv
13 | from .fit import fit_modelopts
14 | from lmfit import fit_report
15 | import csv
16 | import pandas as pd
17 | #from lmfit.printfuncs import report_fit
18 | from mcorr.lmfitFunctions import perform_lmfit
19 |
def main():
    """Fit just the data (no bootstraps) using the method from Nature Methods paper.

    Writes an lmfit report CSV; on a successful fit, also writes the
    residuals CSV and an SVG plot of the best fit.
    """
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description="Fit just data (no bootstraps) with fitting setup from Nature Methods")
    parser.add_argument("corr_file", type = str, help='correlation input file')
    parser.add_argument("output_prefix", type=str, help='output file prefix')
    parser.add_argument('--fit_start', type=int, default=3,
                        help='fitting range starts at')
    parser.add_argument('--fit_end', type=int, default=300,
                        help='fitting range ends at')
    parser.add_argument("--use_geom_frag", action="store_true",
                        help='use geometric distribution for fragment sizes')
    parser.add_argument("--fit_method", type=str, default="least_squares", help="lmfit method (see lmfit documentation)")
    parser.add_argument("--max_nfev", type=int, default=int(1e6),
                        help='max number of function evaluations before lmfit quits')
    parser.add_argument('--quiet', action="store_true")
    parser.add_argument("--title", type=str, help="plot title", default="")
    opts = parser.parse_args()
    corr_file = opts.corr_file
    prefix = opts.output_prefix
    fit_start = opts.fit_start
    fit_end = opts.fit_end
    quiet = opts.quiet  # NOTE(review): currently unused in this script
    use_geom_frag = opts.use_geom_frag
    title = opts.title
    fit_method = opts.fit_method
    max_nfev = opts.max_nfev

    # read correlation results and prepare fitting data
    corr_results = read_corr(corr_file)
    fitdatas = FitDatas(corr_results, fit_start, fit_end)
    ##do fitting
    r1_func = const_r1
    #if you want to use a geometric distribution of fragments
    if use_geom_frag:
        r1_func = geom_r1

    # fit only the "all" group (the actual data; no bootstrap replicates)
    all = fitdatas.get("all")
    x = all.xvalues
    y = all.yvalues
    d_sample = all.d_sample
    fitres = fit_modelopts(x, y, d_sample, r1_func, max_nfev, fit_method)
    #fitres = fit_model(x, y, d_sample, r1_func)
    ## write a fit report as generated by lmfit (includes chi-squared, uncertainties, etc)
    params = fitres.params.valuesdict()
    thetaS = fitres.params["thetaS"]
    phiS = fitres.params["phiS"]
    f = fitres.params["f"]
    lmfitfile = prefix + "_lmfit_report.csv"
    with open(lmfitfile, "w+") as csvfile:
        lmfit_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        lmfit_writer.writerow(["fit_success", fitres.success])
        lmfit_writer.writerow(["function_evals", fitres.nfev])
        lmfit_writer.writerow(["data_points", fitres.ndata])
        lmfit_writer.writerow(["variables", fitres.nvarys])
        lmfit_writer.writerow(["message", fitres.message])
        # initial parameter values used by the optimizer
        lmfit_writer.writerow(["thetaS (init)", thetaS.init_value])
        lmfit_writer.writerow(["f (init)", f.init_value])
        lmfit_writer.writerow(["phiS (init)", phiS.init_value])
        lmfit_writer.writerow([""])
        lmfit_writer.writerow(["d_s", "theta_s", "f", "phi_s",
                               "theta_p", "phi_p", "c", "d_theta_p",
                               "d_theta_s", "chisq", "red-chisq"])
        lmfit_writer.writerow([params["ds"], params["thetaS"], params["f"], params["phiS"],
                               params["thetaP"], params["phiP"], params["c"], params["dp"],
                               params["dc"], fitres.chisqr, fitres.redchi])
    ##save the residuals as a .csv file
    if fitres.success:
        residuals = fitres.residual
        resdat = pd.DataFrame(residuals)
        resdat.to_csv(prefix+"_residuals.csv", header=None)
        #plot the best fit and the residuals
        best_fit_file = prefix + "_best_fit.svg"
        plot_fit(all, fitres, best_fit_file, title=title)
    else:
        print("Fitting failed for %s" % corr_file)

if __name__ == "__main__":
    main()
100 |
101 |
102 |
103 |
104 |
105 |
106 |
--------------------------------------------------------------------------------
/cmd/mcorr-fit/mcorr/lmfit_report.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import os
3 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
4 | #from mcorr import fit_p2
5 | from mcorr import fit_p2, read_corr, FitDatas, \
6 | write_fitting_results, plot_fit, plot_params, write_fitting_reports, \
7 | geom_r1, const_r1
8 | from mcorr.fit import fit_model, vary_fit
9 | from lmfit import fit_report
10 | import csv
11 | import pandas as pd
12 | from lmfit import Parameters, Minimizer
13 |
def main():
    """Run fitting using lmfit, and generate output files and plots.

    Fits only the actual data (not the bootstraps) with user-supplied
    initial parameter values, then writes an lmfit report CSV, the
    residuals CSV, and an SVG plot of the best fit.
    """
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description="Fit the actual data (not the bootstraps) and return goodness-of fit stats")
    parser.add_argument("corr_file", type = str, help='correlation input file')
    parser.add_argument("output_prefix", type=str, help='output file prefix')
    parser.add_argument('--fit_start', type=int, default=3,
                        help='fitting range starts at')
    parser.add_argument('--fit_end', type=int, default=300,
                        help='fitting range ends at')
    parser.add_argument("--use_geom_frag", action="store_true",
                        help='use geometric distribution for fragment sizes')
    parser.add_argument('--quiet', action="store_true")
    parser.add_argument("--title", type=str, help="plot title", default="")
    parser.add_argument("--phiS_init", type=float, help="set initial value for phiS in the fitting", default=0.00005)
    parser.add_argument("--f_init", type=float, help="set initial value for f in the fitting", default=1000)
    parser.add_argument("--thetaS_init", type=float, help="set initial value for thetaS in the fitting", default=0.00001)
    # NOTE(review): the help text below looks copy-pasted from --f_init;
    # this flag appears to set the upper bound for phiS — confirm and fix.
    parser.add_argument("--phiS_max", type=float, help="set initial value for f in the fitting", default=1.0)
    opts = parser.parse_args()
    corr_file = opts.corr_file
    prefix = opts.output_prefix
    fit_start = opts.fit_start
    fit_end = opts.fit_end
    quiet = opts.quiet  # NOTE(review): currently unused in this script
    use_geom_frag = opts.use_geom_frag
    title = opts.title
    f_init = opts.f_init
    thetaS_init = opts.thetaS_init
    phiS_init = opts.phiS_init
    phiS_max = opts.phiS_max

    ##for testing fixes
    # dir = '/Volumes/aps_timemachine/recombo/APS160.5_lmfit/cluster8_cluster221'
    # corr_file = os.path.join(dir, 'cluster8_cluster221_CORE_XMFA_OUT.csv')
    # prefix = 'cluster8_cluster221_CORE_FIT_OUT'
    # fit_start = 3
    # fit_end = 300
    # quiet = False
    # use_geom_frag = False
    # title=""

    # read correlation results and prepare fitting data
    corr_results = read_corr(corr_file)
    fitdatas = FitDatas(corr_results, fit_start, fit_end)
    ##do fitting
    r1_func = const_r1
    #if you want to use a geometric distribution of fragments
    if use_geom_frag:
        r1_func = geom_r1

    # fit the "all" group with the user-supplied initial values/bounds
    all = fitdatas.get("all")
    fitres = vary_fit(all.xvalues, all.yvalues, all.d_sample, r1_func, f_init, thetaS_init, phiS_init, phiS_max)
    # lmfitreport=fit_report(fitres)
    # print(lmfitreport)
    ## write a fit report as generated by lmfit (includes chi-squared, uncertainties, etc)
    params = fitres.params.valuesdict()
    thetaS = fitres.params["thetaS"]
    phiS = fitres.params["phiS"]
    f = fitres.params["f"]
    lmfitfile = prefix + "_lmfit_report.csv"
    with open(lmfitfile, "w+") as csvfile:
        lmfit_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        lmfit_writer.writerow(["fit_success", fitres.success])
        lmfit_writer.writerow(["function_evals", fitres.nfev])
        lmfit_writer.writerow(["data_points", fitres.ndata])
        lmfit_writer.writerow(["variables", fitres.nvarys])
        lmfit_writer.writerow(["message", fitres.message])
        # initial parameter values used by the optimizer
        lmfit_writer.writerow(["thetaS (init)", thetaS.init_value])
        lmfit_writer.writerow(["f (init)", f.init_value])
        lmfit_writer.writerow(["phiS (init)", phiS.init_value])
        lmfit_writer.writerow([""])
        lmfit_writer.writerow(["ds", "thetaS", "f", "phiS", "thetaP", "phiP", "c",
                               "dp", "dc", "chisq", "red-chisq"])
        lmfit_writer.writerow([params["ds"], params["thetaS"], params["f"], params["phiS"],
                               params["thetaP"], params["phiP"],
                               params["c"], params["dp"], params["dc"], fitres.chisqr, fitres.redchi])
    ##save the residuals as a .csv file
    residuals = fitres.residual
    resdat = pd.DataFrame(residuals)
    resdat.to_csv(prefix+"_residuals.csv", header=None)
    ##plot the best fit and the residuals
    best_fit_file = prefix + "_best_fit.svg"
    plot_fit(all, fitres, best_fit_file, title=title)

if __name__ == "__main__":
    main()
101 |
102 |
103 |
104 |
--------------------------------------------------------------------------------
/cmd/mcorr-xmfa-2clades/coding_calculator.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "github.com/kussell-lab/biogo/seq"
5 | "github.com/kussell-lab/mcorr"
6 | "github.com/kussell-lab/ncbiftp/taxonomy"
7 | )
8 |
// Calculator define a interface for calculating correlations.
// CalcP2 computes the P2 correlation profile of one alignment (plus any
// extra alignments) and returns it as mcorr.CorrResults.
type Calculator interface {
	CalcP2(a Alignment, others ...Alignment) (corrResults mcorr.CorrResults)
}

// CodingCalculator for calculating coding sequences.
type CodingCalculator struct {
	CodingTable   *taxonomy.GeneticCode // genetic code used to translate codons
	MaxCodonLen   int                   // maximum codon-distance (lag) computed
	CodonOffset   int                   // base offset of the reading frame
	CodonPosition int                   // position within each codon that is compared (0-2)
	Synonymous    bool                  // restrict comparisons to synonymous codon pairs
}
22 |
23 | // NewCodingCalculator return a CodingCalculator
24 | func NewCodingCalculator(codingTable *taxonomy.GeneticCode, maxCodonLen, codonOffset int, codonPosition int, synonymous bool) *CodingCalculator {
25 | return &CodingCalculator{
26 | CodingTable: codingTable,
27 | MaxCodonLen: maxCodonLen,
28 | CodonOffset: codonOffset,
29 | CodonPosition: codonPosition,
30 | Synonymous: synonymous,
31 | }
32 | }
33 |
34 | // CalcP2 calculate P2
35 | func (cc *CodingCalculator) CalcP2(a Alignment, others ...Alignment) mcorr.CorrResults {
36 | results := calcP2Coding(a, cc.CodonOffset, cc.CodonPosition, cc.MaxCodonLen, cc.CodingTable, cc.Synonymous)
37 | return mcorr.CorrResults{ID: a.ID, Results: results}
38 | }
39 |
// calcP2Coding computes the P2 correlation for codon lags 0..maxCodonLen-1.
// The lag-0 numerator is cached in ks: if it is zero, the alignment has no
// substitutions at all, so every longer lag is short-circuited to P2 = 0
// with the lag-0 sample size. Lags are reported in bases (3 per codon).
func calcP2Coding(aln Alignment, codonOffset, codonPosition, maxCodonLen int, codingTable *taxonomy.GeneticCode, synonymous bool) (results []mcorr.CorrResult) {
	// Split every aligned sequence into codons at the given frame offset.
	codonSequences := [][]Codon{}
	for _, s := range aln.Sequences {
		codons := extractCodons(s, codonOffset)
		codonSequences = append(codonSequences, codons)
	}

	ks := 1.0 // lag-0 total P2; initialized non-zero so lag 0 itself is computed
	nn := 0   // lag-0 sample size
	for l := 0; l < maxCodonLen; l++ {
		totalP2 := 0.0
		totaln := 0
		if l > 0 && ks == 0.0 {
			// no diversity at lag 0 -> correlation is zero at every lag
			totalP2 = 0.0
			totaln = nn
		} else {
			for i := 0; i+l < len(codonSequences[0]); i++ {
				codonPairs := []CodonPair{}
				j := i + l
				for _, cc := range codonSequences {
					if i+l < len(cc) {
						//all codonpairs (two codons separated by l) from a list of codon sequences
						codonPairs = append(codonPairs, CodonPair{A: cc[i], B: cc[j]})
					}
				}

				// When restricted to synonymous changes, group the codon
				// pairs by their translated amino-acid pair first.
				multiCodonPairs := [][]CodonPair{}
				if synonymous {
					multiCodonPairs = synonymousSplit(codonPairs, codingTable)
				} else {
					multiCodonPairs = append(multiCodonPairs, codonPairs)
				}

				for _, codonPairs := range multiCodonPairs {
					if len(codonPairs) >= 2 {
						// doubleCodons tallies the bases at codonPosition for
						// every codon pair across the sequences
						nc := doubleCodons(codonPairs, codonPosition)
						xy, n := nc.P11(0)
						totalP2 += xy
						totaln += n
					}
				}
			}
		}

		if l == 0 {
			// remember the lag-0 totals for the short-circuit above
			ks = totalP2
			nn = totaln
		}
		if totaln > 0 {
			res1 := mcorr.CorrResult{
				Lag:  l * 3,
				Mean: totalP2 / float64(totaln),
				N:    totaln,
				Type: "P2",
			}
			results = append(results, res1)
		}
	}

	return
}
104 |
105 | func doubleCodons(codonPairs []CodonPair, codonPosition int) *mcorr.NuclCov {
106 | alphabet := []byte{'A', 'T', 'G', 'C'}
107 | c := mcorr.NewNuclCov(alphabet)
108 | for _, codonPair := range codonPairs {
109 | a := codonPair.A[codonPosition]
110 | b := codonPair.B[codonPosition]
111 | c.Add(a, b)
112 | }
113 | return c
114 | }
115 |
// Codon is a byte list of length 3
type Codon []byte

// CodonSequence is a sequence of codons.
type CodonSequence []Codon

// CodonPair is a pair of Codons.
// A and B are the codons at the two sites whose correlation is measured.
type CodonPair struct {
	A, B Codon
}
126 |
127 | // extractCodons return a list of codons from a DNA sequence.
128 | func extractCodons(s seq.Sequence, offset int) (codons []Codon) {
129 | for i := offset; i+3 <= len(s.Seq); i += 3 {
130 | c := s.Seq[i:(i + 3)]
131 | codons = append(codons, c)
132 | }
133 | return
134 | }
135 |
136 | // synonymousSplit split a list of codon pairs into multiple
137 | // synonymous pairs.
138 | func synonymousSplit(codonPairs []CodonPair, codingTable *taxonomy.GeneticCode) (multiCodonPairs [][]CodonPair) {
139 | aaList := []string{}
140 | for _, codonPair := range codonPairs {
141 | // check gap.
142 | containsGap := false
143 | for _, codon := range []Codon{codonPair.A, codonPair.B} {
144 | for i := 0; i < 3; i++ {
145 | if codon[i] == '-' || codon[i] == 'N' {
146 | containsGap = true
147 | break
148 | }
149 | }
150 | }
151 | if containsGap {
152 | continue
153 | }
154 |
155 | codonA := string(codonPair.A)
156 | codonB := string(codonPair.B)
157 | a := codingTable.Table[codonA]
158 | b := codingTable.Table[codonB]
159 | ab := string([]byte{a, b})
160 | index := -1
161 | for i := 0; i < len(aaList); i++ {
162 | if aaList[i] == ab {
163 | index = i
164 | }
165 | }
166 | if index == -1 {
167 | index = len(aaList)
168 | aaList = append(aaList, ab)
169 | multiCodonPairs = append(multiCodonPairs, []CodonPair{})
170 | }
171 |
172 | multiCodonPairs[index] = append(multiCodonPairs[index], codonPair)
173 | }
174 |
175 | return
176 | }
177 |
--------------------------------------------------------------------------------
/nucl_cov.go:
--------------------------------------------------------------------------------
1 | package mcorr
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | )
7 |
// NuclCov contains covariance of nucleotide acid in a DNA sequence.
// Doublets is a flattened size-by-size matrix of counts over the
// alphabet: entry [indexA*size+indexB] counts pairs (a, b) seen via Add.
type NuclCov struct {
	Doublets []int
	Alphabet []byte
}

// NewNuclCov return a NuclCov given the alphabet.
func NewNuclCov(alphabet []byte) *NuclCov {
	sizeOfAlphabet := len(alphabet)
	nc := NuclCov{Alphabet: alphabet}
	// one counter per ordered pair of alphabet letters
	nc.Doublets = make([]int, sizeOfAlphabet*sizeOfAlphabet)
	return &nc
}
21 |
22 | // Add insert a pair of nucliotide acids.
23 | // It returns error when the nucliotide acid is not in the alphabet.
24 | func (nc *NuclCov) Add(a, b byte) error {
25 | indexA := bytes.IndexByte(nc.Alphabet, a)
26 | indexB := bytes.IndexByte(nc.Alphabet, b)
27 | sizeOfAlphabet := len(nc.Alphabet)
28 | if indexA >= 0 && indexB >= 0 {
29 | nc.Doublets[indexA*sizeOfAlphabet+indexB]++
30 | return nil
31 | }
32 |
33 | var err error
34 | if indexA < 0 && indexB < 0 {
35 | err = fmt.Errorf("%c and %c are not in Alphabet: %s", a, b, string(nc.Alphabet))
36 | } else if indexA < 0 {
37 | err = fmt.Errorf("%c is not in Alphabet: %s", a, string(nc.Alphabet))
38 | } else {
39 | err = fmt.Errorf("%c is not in Alphabet: %s", b, string(nc.Alphabet))
40 | }
41 |
42 | return err
43 | }
44 |
45 | // Count returns the total number of pairs.
46 | func (nc *NuclCov) Count() int {
47 | n := 0
48 | for _, a := range nc.Doublets {
49 | n += a
50 | }
51 | return n
52 | }
53 |
// P00 returns the probability of 00.
// Only doublet classes seen more than minAlleleNum times participate.
// xy accumulates within-class comparisons (choose 2 of each class count);
// n additionally accumulates cross-class comparisons, giving the total
// number of pairings over which the probability is averaged.
func (nc *NuclCov) P00(minAlleleNum int) (xy float64, n int) {
	for i := 0; i < len(nc.Doublets); i++ {
		if nc.Doublets[i] > minAlleleNum {
			for j := i + 1; j < len(nc.Doublets); j++ {
				if nc.Doublets[j] > minAlleleNum {
					// cross-class pairings count only toward the total n
					n += nc.Doublets[i] * nc.Doublets[j]
				}
			}
			// within-class pairings: C(count, 2) contribute to both xy and n
			n += nc.Doublets[i] * (nc.Doublets[i] - 1) / 2
			xy += float64(nc.Doublets[i] * (nc.Doublets[i] - 1) / 2)
		}
	}
	return
}
69 |
// P11 returns the probability of 11.
// A doublet index decodes as (first base = i/sizeOfAlphabet, second base
// = i%sizeOfAlphabet); a "11" event is a pairing of two doublet classes
// that differ at BOTH positions. Only classes seen more than
// minAlleleNum times participate.
func (nc *NuclCov) P11(minAlleleNum int) (xy float64, n int) {
	sizeOfAlphabet := len(nc.Alphabet)
	for i := 0; i < len(nc.Doublets); i++ {
		if nc.Doublets[i] > minAlleleNum {
			for j := i + 1; j < len(nc.Doublets); j++ {
				if nc.Doublets[j] > minAlleleNum {
					c := float64(nc.Doublets[i] * nc.Doublets[j])
					// count toward xy only when the classes differ at
					// both the first and the second position
					if i%sizeOfAlphabet != j%sizeOfAlphabet && i/sizeOfAlphabet != j/sizeOfAlphabet {
						xy += c
					}

					n += nc.Doublets[i] * nc.Doublets[j]
				}
			}
			// within-class pairings contribute only to the total n
			n += nc.Doublets[i] * (nc.Doublets[i] - 1) / 2
		}
	}
	return
}
90 |
91 | //I think what we actually want is the cross-covariance matrix for this
92 | //which is not symmetric, and we need all of the elements of this matrix
93 | //which represent each possible combination except the diagonal which is just the same position
94 | //https://en.wikipedia.org/wiki/Cross-covariance_matrix
95 | func (nc *NuclCov) MateP11APS(nc2 *NuclCov, minAlleleNum int) (xy float64, n int) {
96 | //sizeOfAlphabet := len(nc.Alphabet)
97 | n1 := 0
98 | n2 := 0
99 | for i := 0; i < len(nc.Doublets); i++ {
100 | if nc.Doublets[i] > minAlleleNum {
101 | for j := 0; j < len(nc2.Doublets); j++ {
102 | if nc2.Doublets[j] > minAlleleNum {
103 | c := float64(nc.Doublets[i] * nc2.Doublets[j])
104 | xy += c
105 | }
106 | n2 += nc2.Doublets[j]
107 | }
108 | }
109 | n1 += nc.Doublets[i]
110 | }
111 | //for i := 0; i < len(nc.Doublets); i++ {
112 | // n1 += nc.Doublets[i]
113 | // n2 += nc2.Doublets[i]
114 | //}
115 | n = n1 * n2
116 | return
117 | }
118 |
// MateP11 calculate covariance between two clusters.
// A doublet index decodes as (first base = i/sizeOfAlphabet, second base
// = i%sizeOfAlphabet); xy counts cross-cluster class pairings that differ
// at BOTH positions, skipping identical class indices (i == j). n is the
// product of the two clusters' total counts.
// NOTE(review): assumes both NuclCov use same-sized alphabets — confirm
// at call sites.
func (nc *NuclCov) MateP11(nc2 *NuclCov, minAlleleNum int) (xy float64, n int) {
	sizeOfAlphabet := len(nc.Alphabet)
	for i := 0; i < len(nc.Doublets); i++ {
		if nc.Doublets[i] > minAlleleNum {
			for j := 0; j < len(nc2.Doublets); j++ {
				if i != j && nc2.Doublets[j] > minAlleleNum {
					c := float64(nc.Doublets[i] * nc2.Doublets[j])
					// require a difference at both the first and second position
					if i%sizeOfAlphabet != j%sizeOfAlphabet && i/sizeOfAlphabet != j/sizeOfAlphabet {
						xy += c
					}
				}
			}
		}
	}
	// total pair count: product of the two clusters' totals
	n1 := 0
	n2 := 0
	for i := 0; i < len(nc.Doublets); i++ {
		n1 += nc.Doublets[i]
		n2 += nc2.Doublets[i]
	}
	n = n1 * n2
	return
}
143 |
144 | // MateP00 calculate covariance between two clusters.
145 | func (nc *NuclCov) MateP00(nc2 *NuclCov, minAlleleNum int) (xy float64, n int) {
146 | n1, n2 := 0, 0
147 | for i := 0; i < len(nc.Doublets); i++ {
148 | xy += float64(nc.Doublets[i] * nc2.Doublets[i])
149 | n1 += nc.Doublets[i]
150 | n2 += nc2.Doublets[i]
151 | }
152 | n = n1 * n2
153 | return
154 | }
155 |
156 | // Append another NuclCov.
157 | func (nc *NuclCov) Append(nc2 *NuclCov) error {
158 | // Check alphabet
159 | diffAlphabetError := fmt.Errorf("Different alphbet %s, %s", string(nc.Alphabet), string(nc2.Alphabet))
160 | if len(nc.Alphabet) != len(nc2.Alphabet) {
161 | return diffAlphabetError
162 | }
163 | for i, a := range nc.Alphabet {
164 | b := nc2.Alphabet[i]
165 | if a != b {
166 | return diffAlphabetError
167 | }
168 | }
169 |
170 | for i := 0; i < len(nc.Doublets); i++ {
171 | nc.Doublets[i] += nc2.Doublets[i]
172 | }
173 |
174 | return nil
175 | }
176 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # mcorr
2 | Using _Correlation Profiles_ of mutations to infer the recombination rate from large-scale sequencing data in bacteria.
3 |
4 | ## Requirements
5 | * Install `git` from [https://git-scm.com](https://git-scm.com/);
6 | * Install `go` from [https://golang.org/doc/install](https://golang.org/doc/install);
* Install `python3` from [https://www.python.org/](https://www.python.org/) (we ran into issues when using the default Python shipped with macOS);
8 | * Install `pip3` from [https://pip.pypa.io/en/stable/installing/](https://pip.pypa.io/en/stable/installing/).
9 |
10 | ## Installation
11 | 1. Install `mcorr-xmfa`, `mcorr-bam`, and `mcorr-fit` from your terminal:
12 | ```sh
13 | go get -u github.com/kussell-lab/mcorr/cmd/mcorr-xmfa
14 | go get -u github.com/kussell-lab/mcorr/cmd/mcorr-bam
15 | cd $HOME/go/src/github.com/kussell-lab/mcorr/cmd/mcorr-fit
16 | python3 setup.py install
17 | ```
18 | or to install `mcorr-fit` in local directory (~/.local/bin in Linux or ~/Library/Python/3.6/bin in MacOS):
19 | ```sh
20 | python3 setup.py install --user
21 | ```
22 | 2. Add `$HOME/go/bin` and `$HOME/.local/bin` to your `$PATH` environment. In Linux, you can do it in your terminal:
23 | ```sh
24 | export PATH=$PATH:$HOME/go/bin:$HOME/.local/bin
25 | ```
26 |
27 | In MacOS, you can do it as follows:
28 | ```sh
29 | export PATH=$PATH:$HOME/go/bin:$HOME/Library/Python/3.6/bin
30 | ```
31 |
32 | We have tested installation in Windows 10, Ubuntu 17.10, and MacOS Big Sur (on both Intel and M1 chips), using Python 3 and Go 1.15 and 1.16.
33 |
34 | Typical installation time on an iMac is 10 minutes.
35 |
36 | ## Basic Usage
37 | The inference of recombination parameters requires two steps:
38 |
39 | 1. Calculate _Correlation Profile_
40 |
41 | 1. For whole-genome alignments (multiple gene alignments), use `mcorr-xmfa`:
42 |
43 | ```sh
44 | mcorr-xmfa