├── Bio.Core.dll
├── Bio.Core.xml
├── Bio.Desktop.dll
├── Bio.Desktop.xml
├── Bio.Platform.Helpers.dll
├── Bio.Platform.Helpers.xml
├── ConPADE.XML
├── ConPADE.exe
├── LICENSE.txt
├── NOTICE.TXT
├── README.txt
├── RunTestData.bat
├── TestData.bam
├── errorModel.bin
├── src
│   ├── ConPADE MIT License.docx
│   ├── ConPADE.cs
│   ├── ConPADE.csproj
│   └── Read.cs
└── substModel.bin
/Bio.Core.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/Bio.Core.dll
--------------------------------------------------------------------------------
/Bio.Desktop.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/Bio.Desktop.dll
--------------------------------------------------------------------------------
/Bio.Platform.Helpers.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/Bio.Platform.Helpers.dll
--------------------------------------------------------------------------------
/Bio.Platform.Helpers.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Bio.Platform.Helpers
5 |
6 |
7 |
8 |
9 | .NET 4.5 desktop version of the platform services.
10 |
11 |
12 |
13 |
14 | Retrieves the assemblies in the application/package/bundle
15 |
16 |
17 |
18 |
19 |
20 | Creates a Regular Expression; pushed here because some platforms do not support compiling
21 |
22 | Regex pattern
23 | Optional options
24 |
25 |
26 |
27 |
28 | Creates a temporary stream that is deleted when disposed.
29 |
30 | Stream
31 |
32 |
33 |
34 | Constructor
35 |
36 |
37 |
38 |
39 | True if this is a 64-bit process
40 |
41 |
42 |
43 |
44 | Default buffer size for parsers
45 |
46 |
47 |
48 |
49 | Maximum sequence size for the platform.
50 |
51 |
52 |
53 |
54 | This represents a temporary stream that will auto-delete the backing storage.
55 |
56 |
57 |
58 |
59 | When overridden in a derived class, clears all buffers for this stream and causes any buffered data to be written to the underlying device.
60 |
61 | An I/O error occurs.
62 |
63 |
64 |
65 | When overridden in a derived class, sets the position within the current stream.
66 |
67 |
68 | The new position within the current stream.
69 |
70 | A byte offset relative to the parameter. A value of type indicating the reference point used to obtain the new position. An I/O error occurs. The stream does not support seeking, such as if the stream is constructed from a pipe or console output. Methods were called after the stream was closed.
71 |
72 |
73 |
74 | When overridden in a derived class, sets the length of the current stream.
75 |
76 | The desired length of the current stream in bytes. An I/O error occurs. The stream does not support both writing and seeking, such as if the stream is constructed from a pipe or console output. Methods were called after the stream was closed.
77 |
78 |
79 |
80 | When overridden in a derived class, reads a sequence of bytes from the current stream and advances the position within the stream by the number of bytes read.
81 |
82 |
83 | The total number of bytes read into the buffer. This can be less than the number of bytes requested if that many bytes are not currently available, or zero (0) if the end of the stream has been reached.
84 |
85 | An array of bytes. When this method returns, the buffer contains the specified byte array with the values between and ( + - 1) replaced by the bytes read from the current source. The zero-based byte offset in at which to begin storing the data read from the current stream. The maximum number of bytes to be read from the current stream. The sum of and is larger than the buffer length. is null. or is negative. An I/O error occurs. The stream does not support reading. Methods were called after the stream was closed.
86 |
87 |
88 |
89 | When overridden in a derived class, writes a sequence of bytes to the current stream and advances the current position within this stream by the number of bytes written.
90 |
91 | An array of bytes. This method copies bytes from to the current stream. The zero-based byte offset in at which to begin copying bytes to the current stream. The number of bytes to be written to the current stream.
92 |
93 |
94 |
95 | When overridden in a derived class, gets a value indicating whether the current stream supports reading.
96 |
97 |
98 | true if the stream supports reading; otherwise, false.
99 |
100 |
101 |
102 |
103 | When overridden in a derived class, gets a value indicating whether the current stream supports seeking.
104 |
105 |
106 | true if the stream supports seeking; otherwise, false.
107 |
108 |
109 |
110 |
111 | When overridden in a derived class, gets a value indicating whether the current stream supports writing.
112 |
113 |
114 | true if the stream supports writing; otherwise, false.
115 |
116 |
117 |
118 |
119 | When overridden in a derived class, gets the length in bytes of the stream.
120 |
121 |
122 | A long value representing the length of the stream in bytes.
123 |
124 | A class derived from Stream does not support seeking. Methods were called after the stream was closed.
125 |
126 |
127 |
128 | When overridden in a derived class, gets or sets the position within the current stream.
129 |
130 |
131 | The current position within the stream.
132 |
133 | An I/O error occurs. The stream does not support seeking. Methods were called after the stream was closed.
134 |
135 |
136 |
137 |
--------------------------------------------------------------------------------
/ConPADE.XML:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | ConPADE
5 |
6 |
7 |
8 |
9 | Class to store the two alleles, read depths, best allele
10 | dosage and posterior probability for each putative SNP.
11 |
12 |
13 |
14 |
15 | Standard constructor for class Best_Dose.
16 |
17 | Zero-based position in the reference sequence.
18 | Reference allele.
19 | Reference allele depth.
20 | Alternative allele.
21 | Alternative allele depth.
22 | Most likely allele dosage for each possible ploidy.
23 | Posterior probability of there being a SNP for each possible ploidy.
24 |
25 |
26 |
27 | Main class of ConPADE.
28 |
29 |
30 |
31 |
32 | Path to a sorted BAM file.
33 |
34 |
35 |
36 |
37 | Path to error model file.
38 |
39 |
40 |
41 |
42 | Path to substitution model file.
43 |
44 |
45 |
46 |
47 | Maximum ploidy to evaluate.
48 |
49 |
50 |
51 |
52 | Phred-like threshold for outputting a SNP.
53 |
54 |
55 |
56 |
57 | SNP density.
58 |
59 |
60 |
61 |
62 | Store results for different contigs in separate files.
63 |
64 |
65 |
66 |
67 | Run ConPADE on each contig of the input BAM file.
68 |
69 | Name of the input BAM file.
70 |
71 |
72 |
73 | Class to build and store padded reads.
74 | The CIGAR string is used to pad insertions and deletions.
75 | Read positions can then be accessed based on the alignment to the reference.
76 |
77 |
78 |
79 |
80 | Padded_Read standard constructor.
81 |
82 | An input object of type SAMAlignedSequence.
83 |
84 |
85 |
86 | Pad deletions in an aligned read.
87 | Create an index of insertions/deletions so that aligned
88 | positions may be accessed based on reference index.
89 |
90 |
91 |
92 |
93 |
--------------------------------------------------------------------------------
/ConPADE.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/ConPADE.exe
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/LICENSE.txt
--------------------------------------------------------------------------------
/NOTICE.TXT:
--------------------------------------------------------------------------------
1 | ConPADE
2 | Copyright (c) Microsoft Corporation. All rights reserved.
3 | Non-Commercial Use Only
4 | _____________________________________________________________________
5 |
6 | This software is released under the Microsoft Research License
7 | Agreement, ("MSR-LA" or the "License"); you may not use the software
8 | except in compliance with the License. You can find a copy of the
9 | License in the file LICENSE.TXT accompanying this file.
10 | _____________________________________________________________________
11 |
12 | Versions of this software may also rely on additional libraries and
13 | code distributed under their respective licenses. To re-compile or
14 | build this code or derivatives thereof, you may be required to
15 | individually download and appropriately license some or all of of
16 | these additional libraries for your specific use.
17 | _____________________________________________________________________
18 | -------------------- Third Party Notices --------------------
19 | This file is based on or incorporates material from the projects
20 | listed below (collectively, 'Third Party Code'). Microsoft is not
21 | the original author of the Third Party Code. The original copyright
22 | notice and the license under which Microsoft received such Third
23 | Party Code, are set forth below. Such licenses and notices are
24 | provided for informational purposes only. Microsoft licenses the
25 | Third Party Code to you under the terms set forth in the EULA for
26 | the Microsoft Product. Microsoft reserves all other rights not
27 | expressly granted under this agreement, whether by implication,
28 | estoppel or otherwise.
29 | _____________________________________________________________________
30 | Testdata.bam
31 | These sequence data were produced by the US Department of Energy
32 | Joint Genome Institute http://www.jgi.doe.gov/ in collaboration
33 | with the user community.
34 | _____________________________________________________________________
35 | .NET Bio
36 | Copyright (c) 2011, The Outercurve Foundation.
37 |
38 | Provided for Informational Purposes Only
39 |
40 | Apache 2.0 License
41 |
42 | Licensed under the Apache License, Version 2.0 (the "License");
43 | you may not use this file except in compliance with the License.
44 | You may obtain a copy of the License at
45 | http://www.apache.org/licenses/LICENSE-2.0
46 |
47 | THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR
48 | CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING
49 | WITHOUT LIMITATION ANY IMPLIED WARRANTIES OR CONDITIONS OF
50 | TITLE, FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABLITY OR
51 | NON-INFRINGEMENT.
52 |
53 | See the Apache Version 2.0 License for specific language
54 | governing permissions and limitations under the License.
55 |
56 | _____________________________________________________________________
57 | .NET Bio, an open source library of common bioinformatics functions,
58 | intended to simplify the creation of life science applications.
59 |
60 | Copyright (c) 2011, The Outercurve Foundation.
61 | This software is released under the Apache License 2.0 (the
62 | "License"); you may not use the software except in compliance
63 | with the License. You can find a copy of the License in the
64 | file LICENSE[.TXT] accompanying this file.
65 |
66 | Additional Copyrights:
67 | MAFFTT:
68 | -------
69 | The Weighted mixture of minimum and average linkage (weightedMAFFT)
70 | in IHierarchicalClustering.cs is adapted from MAFFT: multiple
71 | sequence alignment program -
72 | http://mafft.cbrc.jp/alignment/software/
73 | Copyright (c) 2006 Kazutaka Katoh
74 |
75 | Redistribution and use in source and binary forms, with or without
76 | modification, are permitted provided that the following conditions
77 | are met:
78 |
79 | Redistributions of source code must retain the above copyright
80 | notice, this list of conditions and the following disclaimer.
81 | Redistributions in binary form must reproduce the above copyright
82 | notice, this list of conditions and the following disclaimer in
83 | the documentation and/or other materials provided with the
84 | distribution.
85 |
86 | The name of the author may not be used to endorse or promote
87 | products derived from this software without specific prior
88 | written permission.
89 |
90 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS
91 | OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
92 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
93 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
94 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
95 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
96 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
97 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
98 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
99 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
100 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
101 |
102 | NLog:
103 | ------------
104 | The HttpUtility contains NLog (.NET logging library) that was
105 | contributed by Jaroslaw Kowalski -
106 | http://nlog-project.org/
107 |
108 | Copyright (c) 2004-2006 Jaroslaw Kowalski
109 | All rights reserved.
110 |
111 | Redistribution and use in source and binary forms, with or without
112 | modification, are permitted provided that the following conditions
113 | are met:
114 |
115 | * Redistributions of source code must retain the above copyright
116 | notice, this list of conditions and the following disclaimer.
117 |
118 | * Redistributions in binary form must reproduce the above copyright
119 | notice, this list of conditions and the following disclaimer in
120 | the documentation and/or other materials provided with the
121 | distribution.
122 |
123 | * Neither the name of Jaroslaw Kowalski nor the names of its
124 | contributors may be used to endorse or promote products derived
125 | from this software without specific prior written permission.
126 |
127 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
128 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
129 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
130 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
131 | COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
132 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
133 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
134 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
135 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
136 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
137 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
138 | OF THE POSSIBILITY OF SUCH DAMAGE.
139 |
140 | QUT:
141 | ----
142 | QUTBio in BioSequenceAssembler is a contribution from Queensland
143 | University of Technology
144 |
145 | QUT.Bio Version 1.1
146 | Copyright (c) 2009 Queensland University of Technology. All rights
147 | reserved.
148 |
149 | The QUT Bioinformatics Collection (QUT.Bio) is open source software
150 | subject to the terms of the Microsoft Public License (Ms-PL):
151 | http://www.microsoft.com/opensource/licenses.mspx.
152 |
153 | The latest version of the source code can be obtained at the project
154 | distribution point: http://qutbio.codeplex.com
155 |
156 | Additional Libraries:
157 | This software includes or is derivative of works distributed under
158 | the Apache license listed in the LICENSE.txt file. The full text
159 | for the license can be found in the LICENSE.txt file accompanying
160 | this file.
161 |
162 | AppliedBioSystems
163 | -----------------
164 | Author: Jeremy Kolpak
165 | Contact: JKolpak@its.jnj.com
166 | Company: Johnson & Johnson Pharmaceutical Research and Development
167 | Copyright: Copyright (c) 2011, The Outercurve Foundation
168 | License: Apache License 2.0
169 |
170 | FileFormatConverter
171 | -----------------
172 | Author: Daniel Ourada
173 | Contact: daniel.ourada@gmail.com
174 | Company: cBio - www.cbiocorp.com
175 | Copyright: Copyright (c) 2011, The Outercurve Foundation
176 | License: Apache License 2.0
177 |
178 | --------------- End of Third Party Notices ---------------
179 |
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 | ######################################################################
2 | # #
3 | # ConPADE: Contig Ploidy and Allele Dosage Estimation #
4 | # #
5 | ######################################################################
6 | # #
7 | # Version 1.0-4 #
8 | # Last updated: 10/30/2015 #
9 | # #
10 | ######################################################################
11 |
12 |
13 | For licensing information, please read the file LICENSE.txt distributed with this package.
14 |
15 |
16 | ######################################################################
17 |
18 |
19 | Installing from source:
20 |
21 | - ConPADE is written in C# and can be built with any compatible compiler (Visual Studio, Xamarin Studio or MonoDevelop)
22 | - To compile ConPADE from source, you need to link against the .NET Bio libraries. These can be found at https://github.com/dotnetbio/bio
23 |
24 | After successfully building the binaries, make sure you also have the error model files (errorModel.bin and substModel.bin) from the main distribution folder.
25 |
26 |
27 | ######################################################################
28 |
29 |
30 | First steps:
31 |
32 | To run a sample data set, simply open a command line, navigate to the ConPADE folder and type
33 |
34 | \YourWorkingDirectory\RunTestData.bat
35 |
36 | or issue the command
37 |
38 | \YourWorkingDirectory\ConPADE -bamName TestData.bam
39 |
40 | Make sure all downloaded files are in the folder.
41 |
42 |
43 | ######################################################################
44 |
45 |
46 | Result files:
47 |
48 | Default behavior is to produce three files from the input BAM file.
49 | - ploidy: one line per contig, with the second column indicating the most likely ploidy, followed by the log-likelihoods for each evaluated ploidy
50 | - readStats: read usage statistics, a table containing information on numbers of aligned reads and base pairs for each contig
51 | - SNP: a table with identified variants, one SNP per line
52 |
53 | Optionally, argument -splitContigs can be used to produce four files for each individual contig.
54 | - logLikelihoods: contains log-likelihoods for each evaluated ploidy
55 | - ploidy: a single integer indicating the most likely ploidy
56 | - readStats: read usage statistics
57 | - SNP: a table with identified variants
58 |
59 |
60 | ######################################################################
61 |
62 |
63 | To get help on usage and detailed information about arguments, open a command line, navigate to the ConPADE folder and type
64 |
65 | \YourWorkingDirectory\ConPADE
66 |
67 |
68 | ######################################################################
69 |
70 |
71 | Version changes:
72 |
73 | * 1.0-1
74 | - Using updated .NET Bio version 2.0 to fix BAM parsing
75 | - Output is now combined for all contigs (there is an option to split files for individual contigs)
76 | - Changed argument from -bamNames to -bamName
77 |
78 | * 1.0-2
79 | - Bug fix: properly ignore unaligned reads
80 |
81 | * 1.0-3
82 | - Bug fix: added support for dummy reads in BAM file
83 | - Bug fix: hard-clipped sequences are now correctly parsed
84 |
85 | * 1.0-4
86 | - First source code release
87 | - Removed dependency on Escience.dll
--------------------------------------------------------------------------------
/RunTestData.bat:
--------------------------------------------------------------------------------
1 | ConPADE.exe -bamName TestData.bam
--------------------------------------------------------------------------------
/TestData.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/TestData.bam
--------------------------------------------------------------------------------
/errorModel.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/errorModel.bin
--------------------------------------------------------------------------------
/src/ConPADE MIT License.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/src/ConPADE MIT License.docx
--------------------------------------------------------------------------------
/src/ConPADE.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.IO;
5 | using System.Diagnostics;
6 | using Bio.Util;
7 | using Bio.Util.Distribute;
8 | using Bio.Util.ArgumentParser;
9 | using Bio.IO.SAM;
10 | using Bio.IO.BAM;
11 |
12 |
13 | namespace ConPADE
14 | {
/// <summary>
/// Holds, for one putative SNP, the two alleles, their read depths, and the
/// per-ploidy best allele dosage and posterior probability.
/// All values are fixed at construction; the two arrays are stored by
/// reference (they are not copied).
/// </summary>
class Best_Dose
{
    private readonly long _position;
    private readonly byte _nucOne;
    private readonly int _countOne;
    private readonly byte _nucTwo;
    private readonly int _countTwo;
    private readonly byte[] _bestDose;
    private readonly double[] _snpPosterior;

    /// <summary>Zero-based position in the reference sequence.</summary>
    public long position { get { return _position; } }

    /// <summary>Reference allele.</summary>
    public byte nuc_one { get { return _nucOne; } }

    /// <summary>Reference allele depth.</summary>
    public int count_one { get { return _countOne; } }

    /// <summary>Alternative allele.</summary>
    public byte nuc_two { get { return _nucTwo; } }

    /// <summary>Alternative allele depth.</summary>
    public int count_two { get { return _countTwo; } }

    /// <summary>Most likely allele dosage for each possible ploidy.</summary>
    public byte[] best_dose { get { return _bestDose; } }

    /// <summary>Posterior probability of there being a SNP for each possible ploidy.</summary>
    public double[] SNP_posterior { get { return _snpPosterior; } }

    /// <summary>
    /// Standard constructor for class Best_Dose.
    /// </summary>
    /// <param name="position">Zero-based position in the reference sequence.</param>
    /// <param name="nuc_one">Reference allele.</param>
    /// <param name="count_one">Reference allele depth.</param>
    /// <param name="nuc_two">Alternative allele.</param>
    /// <param name="count_two">Alternative allele depth.</param>
    /// <param name="best_dose">Most likely allele dosage for each possible ploidy.</param>
    /// <param name="SNP_posterior">Posterior probability of there being a SNP for each possible ploidy.</param>
    public Best_Dose(long position, byte nuc_one, int count_one, byte nuc_two, int count_two,
        byte[] best_dose, double[] SNP_posterior)
    {
        _position = position;
        _nucOne = nuc_one;
        _countOne = count_one;
        _nucTwo = nuc_two;
        _countTwo = count_two;
        _bestDose = best_dose;
        _snpPosterior = SNP_posterior;
    }
}
51 |
52 | ///
53 | /// Main class of ConPADE.
54 | ///
55 | class ConPADE : SelfDistributable
56 | {
// Backing value returned by the JobName override below.
private string _jobName = "ConPADE";
/// <summary>
/// Name of this job; returns the value held in <c>_jobName</c>.
/// </summary>
public override string JobName
{
    get { return _jobName; }
}

/// <summary>
/// Path to a sorted BAM file.
/// </summary>
// Declared as a required argument (ParseAction.Required).
[Parse(ParseAction.Required, typeof(InputFile))]
public InputFile bamName = null;

/// <summary>
/// Path to error model file.
/// </summary>
// Optional argument; Error_Probs substitutes "errorModel.bin" when FullName is null.
[Parse(ParseAction.Optional, typeof(InputFile))]
public InputFile modelFile = new InputFile();

/// <summary>
/// Path to substitution model file.
/// </summary>
// Optional argument; Subst_Probs substitutes "substModel.bin" when FullName is null.
[Parse(ParseAction.Optional, typeof(InputFile))]
public InputFile substFile = new InputFile();

/// <summary>
/// Maximum ploidy to evaluate.
/// </summary>
// Optional argument; initialised to 4.
[Parse(ParseAction.Optional, typeof(int))]
public int max_ploidy = 4;

/// <summary>
/// Phred-like threshold for outputting a SNP.
/// </summary>
// Optional argument; initialised to 40.
[Parse(ParseAction.Optional, typeof(int))]
public int SNPthres = 40;

/// <summary>
/// SNP density.
/// </summary>
// Optional argument; initialised to 200.
// NOTE(review): the unit of this integer is not visible in this part of the file - confirm.
[Parse(ParseAction.Optional, typeof(int))]
public int snpDens = 200;

/// <summary>
/// Store results for different contigs in separate files.
/// </summary>
// Optional argument; initialised to false.
[Parse(ParseAction.Optional, typeof(bool))]
public bool splitContigs = false;
104 |
/// <summary>
/// Runs ConPADE on the input BAM file by handing <c>bamName.FullName</c> to <c>RunFile</c>.
/// </summary>
/// <param name="tasksToRun">Not read by this override.</param>
/// <param name="taskCount">Not read by this override.</param>
public override void RunTasks(RangeCollection tasksToRun, long taskCount)
{
    var bamPath = bamName.FullName;
    RunFile(bamPath);
}
109 |
/// <summary>
/// Cleanup hook; this override performs no work.
/// </summary>
/// <param name="taskCount">Not read by this override.</param>
public override void Cleanup(long taskCount)
{
    // Nothing to do.
}
114 |
115 |
/// <summary>
/// Calculates log(x + y), given log(x) and log(y) (natural logarithms),
/// without leaving log space.
/// </summary>
/// <param name="log_x">Natural logarithm of x.</param>
/// <param name="log_y">Natural logarithm of y.</param>
/// <returns>
/// The natural logarithm of x + y. When one argument exceeds the other by more
/// than 40, the larger argument is returned unchanged.
/// </returns>
private static double LogSum(double log_x, double log_y)
{
    // Two equal infinities (e.g. log(0) and log(0)) would make the subtraction
    // below evaluate to NaN; the sum is simply that same infinity.
    if (log_x == log_y && double.IsInfinity(log_x))
    {
        return log_x;
    }

    // If one term dominates by more than 40 log units, the smaller term is dropped.
    if (log_x > (log_y + 40))
    {
        return log_x;
    }
    if (log_y > (log_x + 40))
    {
        return log_y;
    }

    // log(x + y) = log(exp(log_x - log_y) + 1) + log_y
    return Math.Log(Math.Exp(log_x - log_y) + 1) + log_y;
}
136 |
137 |
/// <summary>
/// For each ploidy to be evaluated, builds every possible nucleotide
/// proportion j / ploidy (j = 0..ploidy) and stores the natural logarithm of
/// that proportion and of its complement.
/// </summary>
/// <param name="min_ploidy">Ploidy associated with index 0 of the result.</param>
/// <param name="number_of_ploidies">Number of consecutive ploidies to tabulate.</param>
/// <returns>
/// A jagged array indexed as [ploidy index][dose][0 or 1], where element 0 is
/// log(dose / ploidy) and element 1 is log(1 - dose / ploidy).
/// </returns>
private double[][][] Nuc_Props(int min_ploidy, int number_of_ploidies)
{
    double[][][] table = new double[number_of_ploidies][][];

    for (int idx = 0; idx < number_of_ploidies; idx++)
    {
        int ploidy = min_ploidy + idx;
        double[][] perDose = new double[ploidy + 1][];

        for (int dose = 0; dose <= ploidy; dose++)
        {
            double fraction = (double)dose / ploidy;
            double logFraction = Math.Log(fraction);
            double logComplement = Math.Log(1 - fraction);
            perDose[dose] = new double[2] { logFraction, logComplement };
        }

        table[idx] = perDose;
    }

    return table;
}
160 |
161 |
/// <summary>
/// Sets the model for the probability of each allele dosage.
/// Heterozygous dosages of a given ploidy all receive the same value,
/// log(SNP_density / (ploidy - 1)); the two homozygous dosages receive
/// <paramref name="no_SNP_prob"/> as passed in.
/// </summary>
/// <param name="min_ploidy">Ploidy associated with index 0 of the result.</param>
/// <param name="number_of_ploidies">Number of consecutive ploidies to tabulate.</param>
/// <param name="SNP_density">Value divided evenly among the heterozygous dosages before taking the logarithm.</param>
/// <param name="no_SNP_prob">Value stored unchanged for dosage 0 and dosage == ploidy (presumably already a logarithm - confirm against caller).</param>
/// <returns>A jagged array indexed as [ploidy index][dose].</returns>
private double[][] Dose_Probs(int min_ploidy, int number_of_ploidies, double SNP_density, double no_SNP_prob)
{
    double[][] table = new double[number_of_ploidies][];

    // Index 0 is treated as ploidy 1 (no SNP allowed): two entries of log(0.5).
    // NOTE(review): this slot always has two entries regardless of min_ploidy.
    double logHalf = Math.Log(0.5);
    table[0] = new double[2] { logHalf, logHalf };

    for (int idx = 1; idx < number_of_ploidies; idx++)
    {
        int ploidy = min_ploidy + idx;
        double[] row = new double[ploidy + 1];

        // Uniform value shared by every heterozygous dosage of this ploidy.
        double hetLogProb = Math.Log(SNP_density / (ploidy - 1));
        for (int dose = 1; dose < ploidy; dose++)
        {
            row[dose] = hetLogProb;
        }

        // Homozygous dosages at both ends.
        row[0] = no_SNP_prob;
        row[ploidy] = no_SNP_prob;

        table[idx] = row;
    }

    return table;
}
188 |
/// <summary>
/// Gets sequencing error probabilities from the error model file.
/// The file contains double values nested according to the following sequence:
/// GG precedes - quality score - neighboring quality score - true nucleotide - error/no error.
/// </summary>
/// <remarks>
/// When <c>modelFile.FullName</c> is null it is set to "errorModel.bin".
/// The file is opened read-only and shared for reading, so concurrent readers
/// do not block each other. A transient <see cref="IOException"/> (for example a
/// sharing violation) is retried after a short pause; conditions that can never
/// succeed on retry (missing file or directory, truncated file) are rethrown
/// instead of looping forever.
/// </remarks>
/// <returns>A [2, 40, 40, 4, 2] table filled in file order.</returns>
/// <exception cref="FileNotFoundException">The model file does not exist.</exception>
/// <exception cref="EndOfStreamException">The model file holds fewer values than the table requires.</exception>
private double[, , , ,] Error_Probs()
{
    if (modelFile.FullName == null)
    {
        modelFile.FullName = "errorModel.bin";
    }
    if (!File.Exists(modelFile.ToString()))
    {
        throw new FileNotFoundException(String.Format("File {0} not found.", modelFile.FullName.ToString()));
    }

    double[, , , ,] log_probs = new double[2, 40, 40, 4, 2];
    while (true)
    {
        try
        {
            // Read-only access: the default FileStream(path, mode) overload requests
            // read/write access, which conflicts with other readers and read-only files.
            using (FileStream model_stream = new FileStream(modelFile.ToString(), FileMode.Open, FileAccess.Read, FileShare.Read))
            using (BinaryReader model_reader = new BinaryReader(model_stream))
            {
                for (int is_GG = 0; is_GG < 2; is_GG++)
                {
                    for (int qual = 0; qual < 40; qual++)
                    {
                        for (int neigh_qual = 0; neigh_qual < 40; neigh_qual++)
                        {
                            for (int nuc = 0; nuc < 4; nuc++)
                            {
                                for (int is_correct = 0; is_correct < 2; is_correct++)
                                {
                                    log_probs[is_GG, qual, neigh_qual, nuc, is_correct] =
                                        model_reader.ReadDouble();
                                }
                            }
                        }
                    }
                }
            }
            return log_probs;
        }
        catch (EndOfStreamException e)
        {
            // A short file will never become longer by retrying.
            throw new EndOfStreamException(
                String.Format("File {0} ended before all error model values were read.", modelFile.FullName), e);
        }
        catch (FileNotFoundException)
        {
            // The file disappeared after the existence check; retrying cannot help.
            throw;
        }
        catch (DirectoryNotFoundException)
        {
            throw;
        }
        catch (IOException)
        {
            // Presumably a transient sharing violation: wait briefly, then retry
            // instead of spinning at full speed.
            System.Threading.Thread.Sleep(100);
        }
    }
}
242 |
/// <summary>
/// Gets nucleotide substitution probabilities from the substitution model file.
/// The file contains double values nested according to the following sequence:
/// true nucleotide - GG precedes - observed nucleotide.
/// </summary>
/// <remarks>
/// When <c>substFile.FullName</c> is null it is set to "substModel.bin".
/// The file is opened read-only and shared for reading, so concurrent readers
/// do not block each other. A transient <see cref="IOException"/> (for example a
/// sharing violation) is retried after a short pause; conditions that can never
/// succeed on retry (missing file or directory, truncated file) are rethrown
/// instead of looping forever.
/// </remarks>
/// <returns>A [4, 2, 4] table filled in file order.</returns>
/// <exception cref="FileNotFoundException">The model file does not exist.</exception>
/// <exception cref="EndOfStreamException">The model file holds fewer values than the table requires.</exception>
private double[, ,] Subst_Probs()
{
    if (substFile.FullName == null)
    {
        substFile.FullName = "substModel.bin";
    }
    if (!File.Exists(substFile.ToString()))
    {
        throw new FileNotFoundException(String.Format("File {0} not found.", substFile.FullName.ToString()));
    }

    double[, ,] log_subst_probs = new double[4, 2, 4];
    while (true)
    {
        try
        {
            // Read-only access: the default FileStream(path, mode) overload requests
            // read/write access, which conflicts with other readers and read-only files.
            using (FileStream subst_stream = new FileStream(substFile.ToString(), FileMode.Open, FileAccess.Read, FileShare.Read))
            using (BinaryReader subst_reader = new BinaryReader(subst_stream))
            {
                for (int real_nuc = 0; real_nuc < 4; real_nuc++)
                {
                    for (int is_GG = 0; is_GG < 2; is_GG++)
                    {
                        for (int obs_nuc = 0; obs_nuc < 4; obs_nuc++)
                        {
                            log_subst_probs[real_nuc, is_GG, obs_nuc] =
                                subst_reader.ReadDouble();
                        }
                    }
                }
            }
            return log_subst_probs;
        }
        catch (EndOfStreamException e)
        {
            // A short file will never become longer by retrying.
            throw new EndOfStreamException(
                String.Format("File {0} ended before all substitution model values were read.", substFile.FullName), e);
        }
        catch (FileNotFoundException)
        {
            // The file disappeared after the existence check; retrying cannot help.
            throw;
        }
        catch (DirectoryNotFoundException)
        {
            throw;
        }
        catch (IOException)
        {
            // Presumably a transient sharing violation: wait briefly, then retry
            // instead of spinning at full speed.
            System.Threading.Thread.Sleep(100);
        }
    }
}
289 |
/// <summary>
/// Consumes consecutive alignments from the BAM parser for as long as the
/// pending alignment is non-null, is not a dummy read, belongs to
/// <paramref name="contig_name"/> and has <c>Pos - 1</c> equal to
/// <paramref name="current_position"/>. Read and base-pair statistics are
/// updated along the way, and reads with a mapping quality above zero are
/// wrapped in a <c>Padded_Read</c> and queued.
/// </summary>
/// <param name="parser">BAM parser positioned after <paramref name="next_alignment"/>.</param>
/// <param name="next_alignment">Pending alignment; on return, the first alignment that was not consumed, or null once the parser reported end of file.</param>
/// <param name="contig_name">Name of the contig currently being processed.</param>
/// <param name="number_of_aligned_reads">Incremented for every consumed alignment.</param>
/// <param name="number_of_aligned_base_pairs">Increased by the query length of every consumed alignment.</param>
/// <param name="number_of_used_reads">Incremented for every consumed alignment with MapQ &gt; 0.</param>
/// <param name="number_of_used_base_pairs">Increased by the query length of every consumed alignment with MapQ &gt; 0.</param>
/// <param name="read_queue">Queue receiving the padded reads.</param>
/// <param name="current_position">Zero-based reference position being processed.</param>
// NOTE(review): the element type of read_queue is not visible here; it is
// presumably a queue of Padded_Read - confirm against the original source.
private void Search_Reads(BAMParser parser, ref SAMAlignedSequence next_alignment, string contig_name,
    ref long number_of_aligned_reads, ref long number_of_aligned_base_pairs, ref long number_of_used_reads,
    ref long number_of_used_base_pairs, Queue read_queue, long current_position)
{
    for (;;)
    {
        // Stop as soon as the pending alignment does not start at the current position.
        if (next_alignment == null ||
            next_alignment.IsDummyRead ||
            next_alignment.RName != contig_name ||
            (next_alignment.Pos - 1) != current_position)
        {
            return;
        }

        number_of_aligned_reads++;
        number_of_aligned_base_pairs += next_alignment.QuerySequence.Count;

        // Maybe the minimum alignment quality should be a parameter.
        // It is currently left to the user to pre-filter the BAM file.
        if (next_alignment.MapQ > 0)
        {
            number_of_used_reads++;
            number_of_used_base_pairs += next_alignment.QuerySequence.Count;
            read_queue.Enqueue(new Padded_Read(next_alignment));
        }

        // Advance to the next alignment, if any.
        if (parser.IsEOF())
        {
            next_alignment = null;
            continue;
        }

        // Skip null results, reads with RName "*" and dummy reads until a
        // candidate is found or the file ends.
        do
        {
            next_alignment = parser.GetAlignedSequence(true);
        }
        while ((next_alignment == null || next_alignment.RName == "*" || next_alignment.IsDummyRead)
               && !parser.IsEOF());
    }
}
331 |
// For each read in the queue that overlaps the current position,
// extract information on the nucleotide and quality score.
//
// All per-read output arrays are allocated with read_queue.Count entries, but only
// the first k entries are filled: a read contributes an entry only when it still
// overlaps current_position and shows A, C, G or T (index < 4) at that position.
//
//   obs_nucs             - nucleotide index (0..3) observed in each contributing read
//   is_GG                - 1 when the two bases preceding the current one in sequencing
//                          direction are G (forward reads) or C (reverse reads)
//   reverse              - strand flag copied from the read
//   quality_scores       - base quality, clamped (see Clamp_Quality_Score)
//   neigh_quality_scores - clamped mean quality of up to 5 bases on each side
//   scores               - per-nucleotide sum of the raw (unclamped) base qualities
//   counts               - per-nucleotide number of contributing reads
//   k                    - number of contributing reads
//
// Side effect: cur_pos_ind of every overlapping read is advanced to the next
// reference position, so this must be called exactly once per position.
private void Extract_Read_Info(Queue read_queue, long current_position, out byte[] obs_nucs,
    out byte[] is_GG, out bool[] reverse, out int[] quality_scores, out int[] neigh_quality_scores,
    out int[] scores, out int[] counts, out int k)
{
    int queued_reads = read_queue.Count;
    obs_nucs = new byte[queued_reads];
    is_GG = new byte[queued_reads];
    reverse = new bool[queued_reads];
    quality_scores = new int[queued_reads];
    neigh_quality_scores = new int[queued_reads];
    scores = new int[4];
    counts = new int[4];
    k = 0;

    foreach (Padded_Read curRead in read_queue)
    {
        // The check for overlap below may not be necessary depending on how we clear read cache.
        // Pos is compared against the 0-based current_position with an offset of 2.
        if ((curRead.alignment.Pos + curRead.alignment_length - 2) < current_position)
        {
            continue;
        }

        byte nuc = curRead.padded_sequence[curRead.cur_pos_ind];
        if (nuc < 4)
        {
            int raw_quality = curRead.padded_quality_scores[curRead.cur_pos_ind];

            obs_nucs[k] = nuc;
            counts[nuc]++;
            // The per-nucleotide score uses the raw quality, before clamping.
            scores[nuc] += raw_quality;

            quality_scores[k] = Clamp_Quality_Score(raw_quality);
            neigh_quality_scores[k] = Neighbor_Quality_Score(curRead);

            reverse[k] = curRead.is_reverse;
            if (Has_Preceding_Homodimer(curRead))
            {
                is_GG[k] = 1;
            }

            k++;
        }

        // Update cur_pos_ind in the padded read to ignore insertions: inserted bases
        // share the reference index of the base before them in located_sequence.
        do
        {
            curRead.cur_pos_ind++;
        }
        while (curRead.located_sequence[curRead.cur_pos_ind] ==
               curRead.located_sequence[curRead.cur_pos_ind - 1]);
    }
}

// Sentinel stored in padded_quality_scores for positions that carry no base quality.
private const int No_Quality_Score = -10;

// Quality scores are allowed in the [2,41] range.
private const int Min_Quality_Score = 2;
private const int Max_Quality_Score = 41;

// Value substituted for scores below the allowed range.
// NOTE(review): the floor substitutes 3 rather than 2; kept as found - confirm this is intended.
private const int Low_Quality_Substitute = 3;

// Number of neighboring bases considered on each side of the current base.
private const int Neighbor_Flank = 5;

// Restrict a quality score to the range accepted by the error model.
private static int Clamp_Quality_Score(int quality)
{
    if (quality < Min_Quality_Score)
    {
        return Low_Quality_Substitute;
    }
    if (quality > Max_Quality_Score)
    {
        return Max_Quality_Score;
    }
    return quality;
}

// Clamped mean quality of up to Neighbor_Flank scored bases on each side of the read's
// current position. Padded positions (No_Quality_Score) are skipped and do not count
// towards the flank size; the spacers at both ends of the padded read are never visited.
private static int Neighbor_Quality_Score(Padded_Read read)
{
    int max_ind = read.located_sequence.Length - 1;
    int total_qual = 0;

    int left_found = 0;
    for (int ind = read.cur_pos_ind - 1; left_found < Neighbor_Flank && ind > 0; ind--)
    {
        int qual = read.padded_quality_scores[ind];
        if (qual != No_Quality_Score)
        {
            left_found++;
            total_qual += qual;
        }
    }

    int right_found = 0;
    for (int ind = read.cur_pos_ind + 1; right_found < Neighbor_Flank && ind < max_ind; ind++)
    {
        int qual = read.padded_quality_scores[ind];
        if (qual != No_Quality_Score)
        {
            right_found++;
            total_qual += qual;
        }
    }

    int neighbors = left_found + right_found;
    if (neighbors == 0)
    {
        // No scored neighbor at all (e.g. a single-base read). Dividing by zero would
        // yield NaN, and converting NaN to int is unspecified in C#. Return the value the
        // clamp produces for any out-of-range-low result so the outcome is well defined.
        return Low_Quality_Substitute;
    }

    return Clamp_Quality_Score((int)Math.Round((double)total_qual / neighbors));
}

// True when the two bases sequenced immediately before the current one are both G.
// For reverse reads this means two C nucleotides (index 1) after the current position;
// for forward reads, two G nucleotides (index 2) before it. Padded positions (index 5)
// are skipped; any other nucleotide (index <= 4) ends the search.
private static bool Has_Preceding_Homodimer(Padded_Read read)
{
    int step = read.is_reverse ? 1 : -1;
    byte wanted = read.is_reverse ? (byte)1 : (byte)2;
    int max_ind = read.located_sequence.Length - 1;

    int nucs_found = 0;
    for (int ind = read.cur_pos_ind + step; nucs_found < 2 && ind > 0 && ind < max_ind; ind += step)
    {
        byte nuc = read.padded_sequence[ind];
        if (nuc == wanted)
        {
            nucs_found++;
        }
        else if (nuc <= 4)
        {
            break;
        }
    }

    return nucs_found == 2;
}
468 |
// Pick the two nucleotides with the highest scores for a position.
// nuc_one receives the index of the highest score and nuc_two the runner-up.
// Ties are resolved in favor of the lower index, so with all-equal scores the
// result is A (0) then C (1); A thus acts as the neutral default.
private static void Get_Two_Nucs(int[] scores, out byte nuc_one, out byte nuc_two)
{
    // Seed the ranking with A and C; C only leads when it is strictly better.
    byte leader = (byte)(scores[1] > scores[0] ? 1 : 0);
    byte runner_up = (byte)(1 - leader);

    // Fold G and T into the ranking.
    for (byte candidate = 2; candidate <= 3; candidate++)
    {
        int candidate_score = scores[candidate];
        if (candidate_score > scores[leader])
        {
            runner_up = leader;
            leader = candidate;
            continue;
        }
        if (candidate_score > scores[runner_up])
        {
            runner_up = candidate;
        }
    }

    nuc_one = leader;
    nuc_two = runner_up;
}
496 |
// For a set of aligned nucleotides and their auxiliary information,
// return the log probability of each observation under both candidate alleles.
// This includes both the sequencing error and the substitution models.
//
// Only reads showing nuc_one or nuc_two are used. The result has one row per such read,
// in input order: element 0 is the value assuming the true base is nuc_one, element 1
// assuming it is nuc_two. For reverse-strand reads both alleles are looked up by their
// complement (3 - index). In log_probs the last index is 1 when the observation matches
// the assumed allele and 0 when it does not; in the mismatch case the substitution term
// log_subst_probs[assumed, is_GG, observed] is added.
private double[][] Obs_Probs(double[, , , ,] log_probs, double[, ,] log_subst_probs, byte[] obs_nucs,
    byte[] is_GG, bool[] reverse, int[] quality_scores, int[] neigh_quality_scores, int[] counts, int k,
    byte nuc_one, byte nuc_two)
{
    byte complement_one = (byte)(3 - nuc_one);
    byte complement_two = (byte)(3 - nuc_two);

    double[][] rows = new double[counts[nuc_one] + counts[nuc_two]][];
    int filled = 0;

    for (int read = 0; read < k; read++)
    {
        bool shows_first = obs_nucs[read] == nuc_one;
        if (!shows_first && obs_nucs[read] != nuc_two)
        {
            // Neither of the two candidate alleles: this read is not used.
            continue;
        }

        byte first = reverse[read] ? complement_one : nuc_one;
        byte second = reverse[read] ? complement_two : nuc_two;

        byte context = is_GG[read];
        int qual = quality_scores[read] - 2;
        int neigh_qual = neigh_quality_scores[read] - 2;

        double given_first;
        double given_second;
        if (shows_first)
        {
            given_first = log_probs[context, qual, neigh_qual, first, 1];
            given_second = log_probs[context, qual, neigh_qual, second, 0] +
                log_subst_probs[second, context, first];
        }
        else
        {
            given_first = log_probs[context, qual, neigh_qual, first, 0] +
                log_subst_probs[first, context, second];
            given_second = log_probs[context, qual, neigh_qual, second, 1];
        }

        rows[filled] = new double[2] { given_first, given_second };
        filled++;
    }

    return rows;
}
546 |
// For each ploidy to be evaluated, calculate the likelihood of the genotypes given the observed data.
// Returns one row per ploidy (min_ploidy..max_ploidy); row i has ploidy + 1 entries, indexed by dose.
// Each entry is the sum over reads of LogSum(log_nuc_probs[read][0] + props[0],
// log_nuc_probs[read][1] + props[1]), where props is the nuc_props entry for that ploidy and dose.
private static double[][] Log_Likelihoods(int min_ploidy, int max_ploidy, double[][] log_nuc_probs,
    double[][][] nuc_props)
{
    // Likelihoods of the homozygous genotypes are the same for all ploidies,
    // so they are computed once from the table of the first ploidy.
    double dose_zero_total = Sum_Log_Likelihood_Over_Reads(log_nuc_probs, nuc_props[0][0]);
    double dose_full_total = Sum_Log_Likelihood_Over_Reads(log_nuc_probs, nuc_props[0][min_ploidy]);

    int ploidy_count = max_ploidy - min_ploidy + 1;
    double[][] result = new double[ploidy_count][];

    for (int row = 0; row < ploidy_count; row++)
    {
        int ploidy = min_ploidy + row;
        double[] by_dose = new double[ploidy + 1];

        by_dose[0] = dose_zero_total;
        by_dose[ploidy] = dose_full_total;

        // Calculate the likelihood of heterozygous genotypes.
        for (int dose = 1; dose < ploidy; dose++)
        {
            by_dose[dose] = Sum_Log_Likelihood_Over_Reads(log_nuc_probs, nuc_props[row][dose]);
        }

        result[row] = by_dose;
    }

    return result;
}

// Sum, over all reads, of the log-likelihood of the read under one genotype (props).
private static double Sum_Log_Likelihood_Over_Reads(double[][] log_nuc_probs, double[] props)
{
    double total = 0;
    foreach (double[] read_probs in log_nuc_probs)
    {
        total += LogSum(read_probs[0] + props[0], read_probs[1] + props[1]);
    }
    return total;
}
591 |
// Calculate the likelihood of each ploidy for the current position, given genotype likelihoods.
// Use this value to update the global likelihood of each ploidy (global_log_like is modified in place).
// Also, find the most likely allele dosage for each ploidy and store, alongside it, the
// normalized log posterior of the two extreme dosages (0 and ploidy) combined.
// One Best_Dose record for the position is appended to dose_queue.
private void Global_Likelihood_Keep_Dose(int min_ploidy, int number_of_ploidies, double[][] dose_probs,
    double[] global_log_like, Queue dose_queue, long current_position, int[] counts,
    byte nuc_one, byte nuc_two, double[][] log_likelihoods)
{
    byte[] top_dose = new byte[number_of_ploidies];
    double[] extreme_dose_posterior = new double[number_of_ploidies];

    for (int row = 0; row < number_of_ploidies; row++)
    {
        int ploidy = min_ploidy + row;

        // Unnormalized log posterior (prior + likelihood) of every dosage.
        double[] joint = new double[ploidy + 1];
        double evidence = double.NegativeInfinity;
        double top_joint = double.NegativeInfinity;
        byte top = 0;

        for (byte dose = 0; dose <= ploidy; dose++)
        {
            joint[dose] = dose_probs[row][dose] + log_likelihoods[row][dose];
            evidence = LogSum(evidence, joint[dose]);

            // Strict comparison: the lowest dosage wins ties.
            if (joint[dose] > top_joint)
            {
                top_joint = joint[dose];
                top = dose;
            }
        }

        top_dose[row] = top;
        global_log_like[row] += evidence;
        extreme_dose_posterior[row] = LogSum(joint[0], joint[ploidy]) - evidence;
    }

    // Store information that will be part of the called SNPs output.
    Best_Dose record = new Best_Dose(current_position, nuc_one, counts[nuc_one], nuc_two,
        counts[nuc_two], top_dose, extreme_dose_posterior);
    dose_queue.Enqueue(record);
}
632 |
/// <summary>
/// Run ConPADE on each contig of the input BAM file.
/// </summary>
/// <remarks>
/// For every contig the method walks all reference positions, accumulates the
/// log-likelihood of each candidate ploidy, and then writes the best ploidy, the called
/// SNPs and read statistics. When splitContigs is false the output goes to three files
/// shared by all contigs; otherwise four files are written per contig.
/// The input stream, the parser and any open writer are released even if processing fails.
/// </remarks>
/// <param name="bamName">Name of the input BAM file.</param>
public void RunFile(string bamName)
{
    // Current implementation requires that minimum ploidy be 1
    int min_ploidy = 1;
    int number_of_ploidies = max_ploidy - min_ploidy + 1;

    // Set nucleotide proportions (genotypes)
    double[][][] nuc_props = Nuc_Props(min_ploidy, number_of_ploidies);

    // Set dosage probabilities
    double SNP_density = (double)1 / snpDens;
    double no_SNP_prob = Math.Log((1 - SNP_density) / 2);
    double[][] dose_probs = Dose_Probs(min_ploidy, number_of_ploidies, SNP_density, no_SNP_prob);

    // Set HiSeq error model
    double[, , , ,] log_probs = Error_Probs();

    // Set substitution model
    double[, ,] log_subst_probs = Subst_Probs();

    // Set SNP calling probability (Phred-scaled threshold converted to natural log)
    double log_SNP_thres = SNPthres * Math.Log(10) / -10;

    Stopwatch clock = new Stopwatch();

    Console.WriteLine("Program started at {0}\n", DateTime.Now);

    Stream bam_stream = null;
    BAMParser parser = null;
    TextWriter writer_log_like = null;
    TextWriter writer_SNP = null;
    TextWriter writer_ploidy = null;
    TextWriter writer_reads = null;

    try
    {
        bam_stream = new FileStream(bamName, FileMode.Open, FileAccess.Read);
        parser = new BAMParser();
        SAMAlignmentHeader header = parser.GetHeader(bam_stream);
        string temp = Path.GetFileNameWithoutExtension(bamName);

        // Find first valid alignment in BAM file. Stop at end of file so that an input
        // without any usable alignment does not loop forever.
        SAMAlignedSequence next_alignment = parser.GetAlignedSequence(true);
        while ((next_alignment == null || next_alignment.RName == "*" || next_alignment.IsDummyRead)
            && !parser.IsEOF())
        {
            next_alignment = parser.GetAlignedSequence(true);
        }

        // Create global output files and write headers.
        if (!splitContigs)
        {
            string SNP_file = temp + "_SNP.txt";
            writer_SNP = new StreamWriter(SNP_file);
            writer_SNP.WriteLine("Contig\tPosition\tAlleles\tCounts\tDosage\tPhredQuality");

            string ploidy_file = temp + "_ploidy.txt";
            writer_ploidy = new StreamWriter(ploidy_file);
            writer_ploidy.Write("Contig\tBestPloidy");
            for (int i = 0; i < number_of_ploidies; i++)
            {
                writer_ploidy.Write("\tlogLike_M{0}", i + min_ploidy);
            }
            writer_ploidy.WriteLine("");

            string reads_file = temp + "_readStats.txt";
            writer_reads = new StreamWriter(reads_file);
            writer_reads.WriteLine("Contig\tAlignedReads\tAlignedBases\tUsedReads\tUsedBases");
        }

        // Run over each contig in input BAM file.
        // contig_ind only moves forward, so contigs must appear in header order.
        int contig_ind = -1;
        while (next_alignment != null && next_alignment.RName != "*" && !next_alignment.IsDummyRead)
        {
            string contig_name = next_alignment.RName;

            Console.WriteLine("Started contig {0} at {1}",
                contig_name, DateTime.Now);

            clock.Restart();

            #region Variables and file handles for current contig
            long number_of_aligned_reads = 0;
            long number_of_aligned_base_pairs = 0;
            long number_of_used_reads = 0;
            long number_of_used_base_pairs = 0;

            // Create individual output files for the current contig.
            if (splitContigs)
            {
                string name = temp + "_" + contig_name;
                string log_like_file = name + "_log_likelihoods.txt";
                writer_log_like = new StreamWriter(log_like_file);

                string SNP_file = name + "_SNP.txt";
                writer_SNP = new StreamWriter(SNP_file);

                string ploidy_file = name + "_ploidy.txt";
                writer_ploidy = new StreamWriter(ploidy_file);

                string reads_file = name + "_readStats.txt";
                writer_reads = new StreamWriter(reads_file);
            }

            double[] global_log_like = new double[number_of_ploidies];

            // Advance to the header entry of the current contig.
            do
            {
                ++contig_ind;
            }
            while (header.ReferenceSequences[contig_ind].Name != contig_name);
            long contig_length = header.ReferenceSequences[contig_ind].Length;

            // Create a queue to include all reads that overlap with a given position.
            // NOTE(review): elements are Padded_Read instances; the element type is not
            // spelled out in this declaration - confirm against Search_Reads.
            Queue read_queue = new Queue();

            // Create a queue to include best doses for each tested position.
            Queue dose_queue = new Queue((int)contig_length);
            #endregion Variables and file handles for current contig

            int positions_to_compute = 0;
            long current_position = 0;

            #region Run over every position in contig
            while (current_position < contig_length)
            {
                if ((current_position % 1000000) == 0 && current_position != 0)
                {
                    Console.WriteLine("At position {0} of {1}", current_position + 1, contig_length);
                }

                // Search for reads starting at current position.
                Search_Reads(parser, ref next_alignment, contig_name, ref number_of_aligned_reads,
                    ref number_of_aligned_base_pairs, ref number_of_used_reads, ref number_of_used_base_pairs,
                    read_queue, current_position);

                if (read_queue.Count > 0)
                {
                    positions_to_compute++;

                    // Extract information from each read in queue.
                    byte[] obs_nucs;
                    byte[] is_GG;
                    bool[] reverse;
                    int[] quality_scores;
                    int[] neigh_quality_scores;
                    int[] scores;
                    int[] counts;
                    int k;

                    Extract_Read_Info(read_queue, current_position, out obs_nucs, out is_GG, out reverse,
                        out quality_scores, out neigh_quality_scores, out scores, out counts, out k);

                    // Find two most abundant nucleotides for this position.
                    byte nuc_one;
                    byte nuc_two;
                    Get_Two_Nucs(scores, out nuc_one, out nuc_two);

                    // Calculate Pr(obs|allele1) and Pr(obs|allele2).
                    double[][] log_nuc_probs = Obs_Probs(log_probs, log_subst_probs, obs_nucs, is_GG, reverse,
                        quality_scores, neigh_quality_scores, counts, k, nuc_one, nuc_two);

                    // Calculate log_likelihoods of genotypes for current position.
                    double[][] log_likelihoods = Log_Likelihoods(min_ploidy, max_ploidy, log_nuc_probs, nuc_props);

                    // Calculate log_likelihood of each ploidy and keep most likely allele dosage.
                    Global_Likelihood_Keep_Dose(min_ploidy, number_of_ploidies, dose_probs, global_log_like,
                        dose_queue, current_position, counts, nuc_one, nuc_two, log_likelihoods);
                }

                // Remove finished reads from queue. Finished reads no longer overlap with current position.
                // Only the front of the queue is inspected, so removal stops at the first read
                // that still overlaps.
                while (read_queue.Count > 0)
                {
                    Padded_Read oldest_read = (Padded_Read)read_queue.Peek();
                    if ((oldest_read.alignment.Pos + oldest_read.alignment_length - 2) >= current_position)
                    {
                        break;
                    }
                    read_queue.Dequeue();
                }

                ++current_position;
            }
            #endregion Run over every position in contig

            // Output log_likelihoods.
            int best_log_like = 0;
            for (int i = 0; i < number_of_ploidies; i++)
            {
                if (global_log_like[i] > global_log_like[best_log_like])
                {
                    best_log_like = i;
                }

                if (splitContigs)
                {
                    writer_log_like.WriteLine("Ploidy {0} - log_likelihood {1}", i + min_ploidy, global_log_like[i]);
                }
            }

            // Output most likely ploidy.
            int best_ploidy = best_log_like + min_ploidy;
            if (splitContigs)
            {
                writer_ploidy.WriteLine(best_ploidy);
            }
            else
            {
                writer_ploidy.Write("{0}\t{1}", contig_name, best_ploidy);
                for (int i = 0; i < number_of_ploidies; i++)
                {
                    writer_ploidy.Write("\t{0}", global_log_like[i]);
                }
                writer_ploidy.WriteLine("");
            }

            // Output SNPs.
            if (splitContigs)
            {
                writer_SNP.WriteLine("Position\tAlleles\tCounts\tDosage\tPhredQuality");
            }
            char[] nuc_chars = new char[4] { 'A', 'C', 'G', 'T' };
            foreach (Best_Dose cur_doses in dose_queue)
            {
                // A position is reported when the stored posterior for the best ploidy is at or
                // below the threshold and the best dosage is neither 0 nor the full ploidy.
                double cur_SNP_posterior = cur_doses.SNP_posterior[best_log_like];
                if (cur_SNP_posterior <= log_SNP_thres)
                {
                    int cur_best_dose = cur_doses.best_dose[best_log_like];
                    if (cur_best_dose != best_ploidy && cur_best_dose != 0)
                    {
                        if (splitContigs)
                        {
                            writer_SNP.WriteLine("{0}\t{1}|{2}\t{3}|{4}\t{5}\t{6}", cur_doses.position + 1,
                                nuc_chars[cur_doses.nuc_one], nuc_chars[cur_doses.nuc_two], cur_doses.count_one,
                                cur_doses.count_two, cur_best_dose, -10 * cur_SNP_posterior / Math.Log(10));
                        }
                        else
                        {
                            writer_SNP.WriteLine("{0}\t{1}\t{2}|{3}\t{4}|{5}\t{6}\t{7}", contig_name,
                                cur_doses.position + 1, nuc_chars[cur_doses.nuc_one], nuc_chars[cur_doses.nuc_two],
                                cur_doses.count_one, cur_doses.count_two, cur_best_dose,
                                -10 * cur_SNP_posterior / Math.Log(10));
                        }
                    }
                }
            }

            // Output read statistics.
            if (splitContigs)
            {
                writer_reads.WriteLine("\nNumber of aligned reads: {0}", number_of_aligned_reads);
                writer_reads.WriteLine("Number of aligned base pairs: {0}", number_of_aligned_base_pairs);
                writer_reads.WriteLine("\nNumber of used reads: {0}", number_of_used_reads);
                writer_reads.WriteLine("Number of used base pairs: {0}", number_of_used_base_pairs);
            }
            else
            {
                writer_reads.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", contig_name, number_of_aligned_reads,
                    number_of_aligned_base_pairs, number_of_used_reads, number_of_used_base_pairs);
            }

            // Per-contig files are complete once the contig has been written.
            if (splitContigs)
            {
                Close_Writer(ref writer_log_like);
                Close_Writer(ref writer_SNP);
                Close_Writer(ref writer_ploidy);
                Close_Writer(ref writer_reads);
            }

            clock.Stop();
            Console.WriteLine("Time to run contig: {0} s\n", (double)clock.ElapsedMilliseconds / 1000);
        }
    }
    finally
    {
        // Closes the global writers on normal completion, and whatever is still open
        // (global or per-contig) when an exception interrupts processing.
        Close_Writer(ref writer_log_like);
        Close_Writer(ref writer_SNP);
        Close_Writer(ref writer_ploidy);
        Close_Writer(ref writer_reads);

        if (parser != null)
        {
            parser.Dispose();
        }
        if (bam_stream != null)
        {
            bam_stream.Dispose();
        }
    }

    Console.WriteLine("Finished at {0}\n", DateTime.Now);
}

// Close a writer if it is open and clear the reference so it is not closed twice.
private static void Close_Writer(ref TextWriter writer)
{
    if (writer != null)
    {
        writer.Close();
        writer = null;
    }
}
925 |
// Program entry point: forwards the raw command-line arguments to
// CommandArguments.ConstructAndRun, which takes over from there.
static void Main(string[] args)
{
    CommandArguments.ConstructAndRun(args);
}
930 | }
931 | }
932 |
--------------------------------------------------------------------------------
/src/ConPADE.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Debug
5 | x86
6 | 8.0.30703
7 | 2.0
8 | {DB342BEE-30EC-47F1-B3F8-394260F30ACD}
9 | Exe
10 | Properties
11 | ConPADE
12 | ConPADE
13 | v4.5
14 |
15 |
16 | 512
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 | x86
28 | true
29 | full
30 | false
31 | bin\Debug\
32 | DEBUG;TRACE
33 | prompt
34 | 4
35 | bin\Debug\ConPADE.XML
36 | false
37 |
38 |
39 | x86
40 | pdbonly
41 | true
42 | bin\Release\
43 | TRACE
44 | prompt
45 | 4
46 | false
47 |
48 |
49 | true
50 | bin\Debug\
51 | DEBUG;TRACE
52 | bin\Debug\ConPADE.XML
53 | full
54 | AnyCPU
55 | bin\Debug\ConPADE.exe.CodeAnalysisLog.xml
56 | true
57 | GlobalSuppressions.cs
58 | prompt
59 | MinimumRecommendedRules.ruleset
60 | ;C:\Program Files (x86)\Microsoft Visual Studio 10.0\Team Tools\Static Analysis Tools\\Rule Sets
61 | false
62 | ;C:\Program Files (x86)\Microsoft Visual Studio 10.0\Team Tools\Static Analysis Tools\FxCop\\Rules
63 | true
64 | false
65 |
66 |
67 | bin\Release\
68 | TRACE
69 | true
70 | pdbonly
71 | AnyCPU
72 | bin\Release\ConPADE.exe.CodeAnalysisLog.xml
73 | true
74 | GlobalSuppressions.cs
75 | prompt
76 | MinimumRecommendedRules.ruleset
77 | ;C:\Program Files (x86)\Microsoft Visual Studio 10.0\Team Tools\Static Analysis Tools\\Rule Sets
78 | true
79 | ;C:\Program Files (x86)\Microsoft Visual Studio 10.0\Team Tools\Static Analysis Tools\FxCop\\Rules
80 | true
81 | bin\Release\ConPADE.XML
82 | false
83 |
84 |
85 |
86 | .\Bio.Core.dll
87 |
88 |
89 | .\Bio.Desktop.dll
90 |
91 |
92 | .\Bio.Platform.Helpers.dll
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
118 |
--------------------------------------------------------------------------------
/src/Read.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Linq;
3 | using Bio;
4 | using Bio.IO.SAM;
5 |
6 | namespace ConPADE
7 | {
/// <summary>
/// Class to build and store padded reads.
/// The CIGAR string is used to pad insertions and deletions.
/// Read positions can then be accessed based on the alignment to the reference.
/// </summary>
/// <remarks>
/// The three parallel arrays padded_sequence, padded_quality_scores and located_sequence
/// share one layout: a spacer at index 0, one slot per query base or padded reference
/// base, and a spacer at the last index.
/// </remarks>
class Padded_Read
{
    // The alignment this padded read was built from.
    public SAMAlignedSequence alignment { get; private set; }
    // Nucleotide indices: A=0, C=1, G=2, T=3, N=4, anything else (incl. padding and spacers)=5.
    public byte[] padded_sequence;
    // Base qualities; -10 marks slots without a quality (padding and spacers).
    public int[] padded_quality_scores;
    // 0-based offset within the alignment for each slot; -1 for spacers and soft clips.
    // Inserted bases repeat the offset of the preceding slot.
    public int[] located_sequence;
    // True when the SAM flag marks the query as being on the reverse strand.
    public bool is_reverse;
    // Index of the slot for the reference position currently being processed.
    public int cur_pos_ind;
    // CIGAR operation lengths and operation letters, paired by index.
    public int[] numbers;
    public char[] letters;
    // Number of CIGAR operations stored in numbers/letters.
    public int vector_ind;
    // Number of reference positions spanned by the alignment.
    public int alignment_length;

    /// <summary>
    /// Padded_Read standard constructor.
    /// </summary>
    /// <param name="alignment">An input object of type SAMAlignedSequence.</param>
    public Padded_Read(SAMAlignedSequence alignment)
    {
        this.alignment = alignment;
        this.is_reverse = alignment.Flag.HasFlag(SAMFlags.QueryOnReverseStrand);
        this.Pad_Read();
    }

    // True for the ASCII digits that may appear in a CIGAR length.
    private static bool Is_Digit(char c)
    {
        return c >= '0' && c <= '9';
    }

    // Split CIGAR string into paired "numbers" and "letters".
    // The arrays are sized to the number of operations in the string, so CIGARs of any
    // length are supported. A CIGAR that ends in a digit is malformed and raises
    // IndexOutOfRangeException.
    private void Split_CIGAR()
    {
        string cigar = this.alignment.CIGAR;

        // Every non-digit character is one operation letter.
        int operation_count = 0;
        for (int i = 0; i < cigar.Length; i++)
        {
            if (!Is_Digit(cigar[i]))
            {
                operation_count++;
            }
        }

        this.numbers = new int[operation_count];
        this.letters = new char[operation_count];
        this.vector_ind = 0;

        int cigar_ind = -1;
        while (cigar_ind < (cigar.Length - 1))
        {
            int cur_number = 0;
            while (Is_Digit(cigar[++cigar_ind]))
            {
                cur_number = 10 * cur_number + (cigar[cigar_ind] - '0');
            }
            this.numbers[this.vector_ind] = cur_number;
            this.letters[this.vector_ind++] = cigar[cigar_ind];
        }
    }

    // Count the total number of padding slots to add to the read.
    // These are the reference bases covered by operations that consume the reference but
    // not the query: deletions ("D") and skipped regions ("N"). Silent padding ("P") adds
    // no slot because Pad_Read writes nothing for it.
    private int Count_Padding()
    {
        int extras = 0;
        for (int i = 0; i < this.vector_ind; i++)
        {
            if (this.letters[i] == 'D' || this.letters[i] == 'N')
            {
                extras += this.numbers[i];
            }
        }
        return extras;
    }

    // Map a sequence symbol (ASCII code) to its nucleotide index.
    //   Nucleotide         : Index
    //   A                  : 0
    //   C                  : 1
    //   G                  : 2
    //   T                  : 3
    //   N                  : 4
    //   Others (incl dels) : 5
    // We may want to use a separate index for deletions, if we want to call indels in the future
    private static byte Encode_Nucleotide(byte symbol)
    {
        switch (symbol)
        {
            case 65:
                return 0;
            case 67:
                return 1;
            case 71:
                return 2;
            case 84:
                return 3;
            case 78:
                return 4;
            default:
                return 5;
        }
    }

    /// <summary>
    /// Pad deletions in an aligned read.
    /// Create an index of insertions/deletions so that aligned
    /// positions may be accessed based on reference index.
    /// </summary>
    public void Pad_Read()
    {
        byte[] sequence = this.alignment.QuerySequence.ToArray();
        int[] quality_scores = (this.alignment.QuerySequence as QualitativeSequence).GetQualityScores();

        this.Split_CIGAR();

        int extras = Count_Padding();

        // size includes padded reference bases and one spacer on both sides
        int size = sequence.Length + extras + 2;
        this.padded_sequence = new byte[size];
        this.padded_quality_scores = new int[size];
        this.located_sequence = new int[size];

        // Values to use for spacers
        int no_qual = -10;
        byte spacer_nuc = 5;
        int spacer_ind = -1;

        int last_ind = size - 1;
        this.padded_sequence[0] = spacer_nuc;
        this.padded_sequence[last_ind] = spacer_nuc;
        this.padded_quality_scores[0] = no_qual;
        this.padded_quality_scores[last_ind] = no_qual;
        this.located_sequence[0] = spacer_ind;
        this.located_sequence[last_ind] = spacer_ind;

        int k = 1;  // next slot in padded_sequence / padded_quality_scores
        int l = 0;  // next base in the query sequence
        int m = 1;  // next slot in located_sequence

        int last = spacer_ind;  // offset of the most recent reference-aligned slot
        for (int i = 0; i < this.vector_ind; i++)
        {
            char cur_letter = this.letters[i];
            int cur_length = this.numbers[i];

            if (cur_letter == 'D' || cur_letter == 'N')
            {
                // Reference bases with no query base: one padded slot per reference position.
                for (int j = 0; j < cur_length; j++)
                {
                    this.located_sequence[m++] = ++last;
                    this.padded_quality_scores[k] = no_qual;
                    this.padded_sequence[k++] = spacer_nuc;
                }
                continue;
            }

            if (cur_letter == 'P' || cur_letter == 'H')
            {
                // Neither operation has bases in the stored query sequence; nothing to write.
                continue;
            }

            if (cur_letter == 'M' ||
                cur_letter == '=' ||
                cur_letter == 'X')
            {
                for (int j = 0; j < cur_length; j++)
                {
                    this.located_sequence[m++] = ++last;
                }
            }
            else if (cur_letter == 'I')
            {
                // Inserted bases share the offset of the preceding aligned base
                // (or offset 0 when the insertion comes first).
                if (last == spacer_ind)
                {
                    ++last;
                }
                for (int j = 0; j < cur_length; j++)
                {
                    this.located_sequence[m++] = last;
                }
            }
            else if (cur_letter == 'S')
            {
                for (int j = 0; j < cur_length; j++)
                {
                    this.located_sequence[m++] = spacer_ind;
                }
            }

            // Copy the query bases and qualities covered by this operation.
            for (int j = 0; j < cur_length; j++)
            {
                this.padded_quality_scores[k] = quality_scores[l];
                this.padded_sequence[k++] = Encode_Nucleotide(sequence[l++]);
            }
        }

        // Set cur_pos_ind to the position with the first aligned nucleotide.
        // This value will be updated by other methods.
        this.cur_pos_ind = 0;
        while (this.located_sequence[++this.cur_pos_ind] == spacer_ind) ;

        this.alignment_length = this.alignment.RefEndPos - this.alignment.Pos + 1;
    }
}
203 | }
--------------------------------------------------------------------------------
/substModel.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/substModel.bin
--------------------------------------------------------------------------------