├── Bio.Core.dll ├── Bio.Core.xml ├── Bio.Desktop.dll ├── Bio.Desktop.xml ├── Bio.Platform.Helpers.dll ├── Bio.Platform.Helpers.xml ├── ConPADE.XML ├── ConPADE.exe ├── LICENSE.txt ├── NOTICE.TXT ├── README.txt ├── RunTestData.bat ├── TestData.bam ├── errorModel.bin ├── src ├── ConPADE MIT License.docx ├── ConPADE.cs ├── ConPADE.csproj └── Read.cs └── substModel.bin /Bio.Core.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/Bio.Core.dll -------------------------------------------------------------------------------- /Bio.Desktop.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/Bio.Desktop.dll -------------------------------------------------------------------------------- /Bio.Platform.Helpers.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/Bio.Platform.Helpers.dll -------------------------------------------------------------------------------- /Bio.Platform.Helpers.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Bio.Platform.Helpers 5 | 6 | 7 | 8 |

9 | .NET 4.5 desktop version of the platform services. 10 |

11 | 12 | 13 |

14 | Retrieves the assemblies in the application/package/bundle 15 |

16 | 17 | 18 | 19 |

20 | Creates a Regular Expression; pushed here because some platforms do not support compiling 21 |

22 | Regex pattern 23 | Optional options 24 | 25 | 26 | 27 |

28 | Creates a temporary stream that is deleted when disposed. 29 |

30 | Stream 31 | 32 | 33 |

34 | Constructor 35 |

36 | 37 | 38 |

39 | True if this is a 64-bit process 40 |

41 | 42 | 43 |

44 | Default buffer size for parsers 45 |

46 | 47 | 48 |

49 | Maximum sequence size for the platform. 50 |

51 | 52 | 53 |

54 | This represents a temporary stream that will auto-delete the backing storage. 55 |

56 | 57 | 58 |

59 | When overridden in a derived class, clears all buffers for this stream and causes any buffered data to be written to the underlying device. 60 |

61 | An I/O error occurs. 62 | 63 | 64 |

65 | When overridden in a derived class, sets the position within the current stream. 66 |

67 | 68 | The new position within the current stream. 69 | 70 | A byte offset relative to the parameter. A value of type indicating the reference point used to obtain the new position. An I/O error occurs. The stream does not support seeking, such as if the stream is constructed from a pipe or console output. Methods were called after the stream was closed. 71 | 72 | 73 |

74 | When overridden in a derived class, sets the length of the current stream. 75 |

76 | The desired length of the current stream in bytes. An I/O error occurs. The stream does not support both writing and seeking, such as if the stream is constructed from a pipe or console output. Methods were called after the stream was closed. 77 | 78 | 79 |

80 | When overridden in a derived class, reads a sequence of bytes from the current stream and advances the position within the stream by the number of bytes read. 81 |

82 | 83 | The total number of bytes read into the buffer. This can be less than the number of bytes requested if that many bytes are not currently available, or zero (0) if the end of the stream has been reached. 84 | 85 | An array of bytes. When this method returns, the buffer contains the specified byte array with the values between and ( + - 1) replaced by the bytes read from the current source. The zero-based byte offset in at which to begin storing the data read from the current stream. The maximum number of bytes to be read from the current stream. The sum of and is larger than the buffer length. is null. or is negative. An I/O error occurs. The stream does not support reading. Methods were called after the stream was closed. 86 | 87 | 88 |

89 | When overridden in a derived class, writes a sequence of bytes to the current stream and advances the current position within this stream by the number of bytes written. 90 |

91 | An array of bytes. This method copies bytes from to the current stream. The zero-based byte offset in at which to begin copying bytes to the current stream. The number of bytes to be written to the current stream. 92 | 93 | 94 |

95 | When overridden in a derived class, gets a value indicating whether the current stream supports reading. 96 |

97 | 98 | true if the stream supports reading; otherwise, false. 99 | 100 | 101 | 102 |

103 | When overridden in a derived class, gets a value indicating whether the current stream supports seeking. 104 |

105 | 106 | true if the stream supports seeking; otherwise, false. 107 | 108 | 109 | 110 |

111 | When overridden in a derived class, gets a value indicating whether the current stream supports writing. 112 |

113 | 114 | true if the stream supports writing; otherwise, false. 115 | 116 | 117 | 118 |

119 | When overridden in a derived class, gets the length in bytes of the stream. 120 |

121 | 122 | A long value representing the length of the stream in bytes. 123 | 124 | A class derived from Stream does not support seeking. Methods were called after the stream was closed. 125 | 126 | 127 |

128 | When overridden in a derived class, gets or sets the position within the current stream. 129 |

130 | 131 | The current position within the stream. 132 | 133 | An I/O error occurs. The stream does not support seeking. Methods were called after the stream was closed. 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /ConPADE.XML: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ConPADE 5 | 6 | 7 | 8 |

9 | Class to store the two alleles, read depths, best allele 10 | dosage and posterior probability for each putative SNP. 11 |

12 | 13 | 14 |

15 | Standard constructor for class Best_Dose. 16 |

27 | Main class of ConPADE. 28 |

29 | 30 | 31 |

32 | Path to a sorted BAM file. 33 |

34 | 35 | 36 |

37 | Path to error model file. 38 |

39 | 40 | 41 |

42 | Path to substitution model file. 43 |

44 | 45 | 46 |

47 | Maximum ploidy to evaluate. 48 |

49 | 50 | 51 |

52 | Phred-like threshold for outputting a SNP. 53 |

54 | 55 | 56 |

57 | SNP density. 58 |

59 | 60 | 61 |

62 | Store results for different contigs in separate files. 63 |

64 | 65 | 66 |

67 | Run ConPADE on each contig of the input BAM file. 68 |

69 | Name of the input BAM file. 70 | 71 | 72 |

73 | Class to build and store padded reads. 74 | The CIGAR string is used to pad insertions and deletions. 75 | Read positions can then be accessed based on the alignment to the reference. 76 |

77 | 78 | 79 |

80 | Padded_Read standard constructor. 81 |

82 | An input object of type SAMAlignedSequence. 83 | 84 | 85 |

86 | Pad deletions in an aligned read. 87 | Create an index of insertions/deletions so that aligned 88 | positions may be accessed based on reference index. 89 |

90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /ConPADE.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/ConPADE.exe -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/LICENSE.txt -------------------------------------------------------------------------------- /NOTICE.TXT: -------------------------------------------------------------------------------- 1 | ConPADE 2 | Copyright (c) Microsoft Corporation. All rights reserved. 3 | Non-Commercial Use Only 4 | _____________________________________________________________________ 5 | 6 | This software is released under the Microsoft Research License 7 | Agreement, ("MSR-LA" or the "License"); you may not use the software 8 | except in compliance with the License. You can find a copy of the 9 | License in the file LICENSE.TXT accompanying this file. 10 | _____________________________________________________________________ 11 | 12 | Versions of this software may also rely on additional libraries and 13 | code distributed under their respective licenses. To re-compile or 14 | build this code or derivatives thereof, you may be required to 15 | individually download and appropriately license some or all of of 16 | these additional libraries for your specific use. 17 | _____________________________________________________________________ 18 | -------------------- Third Party Notices -------------------- 19 | This file is based on or incorporates material from the projects 20 | listed below (collectively, 'Third Party Code'). Microsoft is not 21 | the original author of the Third Party Code. 
The original copyright 22 | notice and the license under which Microsoft received such Third 23 | Party Code, are set forth below. Such licenses and notices are 24 | provided for informational purposes only. Microsoft licenses the 25 | Third Party Code to you under the terms set forth in the EULA for 26 | the Microsoft Product. Microsoft reserves all other rights not 27 | expressly granted under this agreement, whether by implication, 28 | estoppel or otherwise. 29 | _____________________________________________________________________ 30 | Testdata.bam 31 | These sequence data were produced by the US Department of Energy 32 | Joint Genome Institute http://www.jgi.doe.gov/ in collaboration 33 | with the user community. 34 | _____________________________________________________________________ 35 | .NET Bio 36 | Copyright (c) 2011, The Outercurve Foundation. 37 | 38 | Provided for Informational Purposes Only 39 | 40 | Apache 2.0 License 41 | 42 | Licensed under the Apache License, Version 2.0 (the "License"); 43 | you may not use this file except in compliance with the License. 44 | You may obtain a copy of the License at 45 | http://www.apache.org/licenses/LICENSE-2.0 46 | 47 | THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR 48 | CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING 49 | WITHOUT LIMITATION ANY IMPLIED WARRANTIES OR CONDITIONS OF 50 | TITLE, FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABLITY OR 51 | NON-INFRINGEMENT. 52 | 53 | See the Apache Version 2.0 License for specific language 54 | governing permissions and limitations under the License. 55 | 56 | _____________________________________________________________________ 57 | .NET Bio, an open source library of common bioinformatics functions, 58 | intended to simplify the creation of life science applications. 59 | 60 | Copyright (c) 2011, The Outercurve Foundation. 
61 | This software is released under the Apache License 2.0 (the 62 | "License"); you may not use the software except in compliance 63 | with the License. You can find a copy of the License in the 64 | file LICENSE[.TXT] accompanying this file. 65 | 66 | Additional Copyrights: 67 | MAFFTT: 68 | ------- 69 | The Weighted mixture of minimum and average linkage (weightedMAFFT) 70 | in IHierarchicalClustering.cs is adapted from MAFFT: multiple 71 | sequence alignment program - 72 | http://mafft.cbrc.jp/alignment/software/ 73 | Copyright (c) 2006 Kazutaka Katoh 74 | 75 | Redistribution and use in source and binary forms, with or without 76 | modification, are permitted provided that the following conditions 77 | are met: 78 | 79 | Redistributions of source code must retain the above copyright 80 | notice, this list of conditions and the following disclaimer. 81 | Redistributions in binary form must reproduce the above copyright 82 | notice, this list of conditions and the following disclaimer in 83 | the documentation and/or other materials provided with the 84 | distribution. 85 | 86 | The name of the author may not be used to endorse or promote 87 | products derived from this software without specific prior 88 | written permission. 89 | 90 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS 91 | OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 92 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 93 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 94 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 95 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 96 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 97 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 98 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 99 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 100 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 101 | 102 | NLog: 103 | ------------ 104 | The HttpUtility contains NLog (.NET logging library) that was 105 | contributed by Jaroslaw Kowalski - 106 | http://nlog-project.org/ 107 | 108 | Copyright (c) 2004-2006 Jaroslaw Kowalski 109 | All rights reserved. 110 | 111 | Redistribution and use in source and binary forms, with or without 112 | modification, are permitted provided that the following conditions 113 | are met: 114 | 115 | * Redistributions of source code must retain the above copyright 116 | notice, this list of conditions and the following disclaimer. 117 | 118 | * Redistributions in binary form must reproduce the above copyright 119 | notice, this list of conditions and the following disclaimer in 120 | the documentation and/or other materials provided with the 121 | distribution. 122 | 123 | * Neither the name of Jaroslaw Kowalski nor the names of its 124 | contributors may be used to endorse or promote products derived 125 | from this software without specific prior written permission. 126 | 127 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 128 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 129 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 130 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 131 | COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 132 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 133 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 134 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 135 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 136 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 137 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 138 | OF THE POSSIBILITY OF SUCH DAMAGE. 139 | 140 | QUT: 141 | ---- 142 | QUTBio in BioSequenceAssembler is a contribution from Queensland 143 | University of Technology 144 | 145 | QUT.Bio Version 1.1 146 | Copyright (c) 2009 Queensland University of Technology. All rights 147 | reserved. 148 | 149 | The QUT Bioinformatics Collection (QUT.Bio) is open source software 150 | subject to the terms of the Microsoft Public License (Ms-PL): 151 | http://www.microsoft.com/opensource/licenses.mspx. 152 | 153 | The latest version of the source code can be obtained at the project 154 | distribution point: http://qutbio.codeplex.com 155 | 156 | Additional Libraries: 157 | This software includes or is derivative of works distributed under 158 | the Apache license listed in the LICENSE.txt file. The full text 159 | for the license can be found in the LICENSE.txt file accompanying 160 | this file. 
161 | 162 | AppliedBioSystems 163 | ----------------- 164 | Author: Jeremy Kolpak 165 | Contact: JKolpak@its.jnj.com 166 | Company: Johnson & Johnson Pharmaceutical Research and Development 167 | Copyright: Copyright (c) 2011, The Outercurve Foundation 168 | License: Apache License 2.0 169 | 170 | FileFormatConverter 171 | ----------------- 172 | Author: Daniel Ourada 173 | Contact: daniel.ourada@gmail.com 174 | Company: cBio - www.cbiocorp.com 175 | Copyright: Copyright (c) 2011, The Outercurve Foundation 176 | License: Apache License 2.0 177 | 178 | --------------- End of Third Party Notices --------------- 179 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | # # 3 | # ConPADE: Contig Ploidy and Allele Dosage Estimation # 4 | # # 5 | ###################################################################### 6 | # # 7 | # Version 1.0-4 # 8 | # Last updated: 10/30/2015 # 9 | # # 10 | ###################################################################### 11 | 12 | 13 | For licensing information, please read file LICENSE.txt distributed with this package 14 | 15 | 16 | ###################################################################### 17 | 18 | 19 | Installing from source: 20 | 21 | - ConPADE is written in C# and can be built with any compatible compiler (Visual Studio, Xamarin Studio or MonoDevelop) 22 | - To compile ConPADE from source, you need to link against the .NET Bio libraries. These can be found at https://github.com/dotnetbio/bio 23 | 24 | After successfully building the binaries, make sure you also have the error model files (errorModel.bin and substModel.bin) from the main distribution folder. 
25 | 26 | 27 | ###################################################################### 28 | 29 | 30 | First steps: 31 | 32 | To run a sample data set, simply open a command line, navigate to the ConPADE folder and type 33 | 34 | \YourWorkingDirectory\RunTestData.bat 35 | 36 | or issue the command 37 | 38 | \YourWorkingDirectory\ConPADE -bamName TestData.bam 39 | 40 | Make sure all downloaded files are in the folder. 41 | 42 | 43 | ###################################################################### 44 | 45 | 46 | Result files: 47 | 48 | Default behavior is to produce three files from the input BAM file. 49 | - ploidy: one line per contig, with the second column indicating the most likely ploidy, followed by the log-likelihoods for each evaluated ploidy 50 | - readStats: read usage statistics, a table containing information on numbers of aligned reads and base pairs for each contig 51 | - SNP: a table with identified variants, one SNP per line 52 | 53 | Optionally, argument -splitContigs can be used to produce four files for each individual contig. 
54 | - logLikelihoods: contains log-likelihoods for each evaluated ploidy 55 | - ploidy: a single integer indicating the most likely ploidy 56 | - readStats: read usage statistics 57 | - SNP: a table with identified variants 58 | 59 | 60 | ###################################################################### 61 | 62 | 63 | To get help on usage and detailed information about arguments, open a command line, navigate to the ConPADE folder and type 64 | 65 | \YourWorkingDirectory\ConPADE 66 | 67 | 68 | ###################################################################### 69 | 70 | 71 | Version changes: 72 | 73 | * 1.0-1 74 | - Using updated .NET Bio version 2.0 to fix BAM parsing 75 | - Output is now combined for all contigs (there is an option to split files for individual contigs) 76 | - Changed argument from -bamNames to -bamName 77 | 78 | * 1.0-2 79 | - Bug fix: properly ignore unaligned reads 80 | 81 | * 1.0-3 82 | - Bug fix: added support for dummy reads in BAM file 83 | - Bug fix: hard-clipped sequences are now correctly parsed 84 | 85 | * 1.0-4 86 | - First source code release 87 | - Removed dependency on Escience.dll -------------------------------------------------------------------------------- /RunTestData.bat: -------------------------------------------------------------------------------- 1 | ConPADE.exe -bamName TestData.bam -------------------------------------------------------------------------------- /TestData.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/TestData.bam -------------------------------------------------------------------------------- /errorModel.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/errorModel.bin 
-------------------------------------------------------------------------------- /src/ConPADE MIT License.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/src/ConPADE MIT License.docx -------------------------------------------------------------------------------- /src/ConPADE.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.IO; 5 | using System.Diagnostics; 6 | using Bio.Util; 7 | using Bio.Util.Distribute; 8 | using Bio.Util.ArgumentParser; 9 | using Bio.IO.SAM; 10 | using Bio.IO.BAM; 11 | 12 | 13 | namespace ConPADE 14 | { 15 | ///

16 | /// Class to store the two alleles, read depths, best allele 17 | /// dosage and posterior probability for each putative SNP. 18 | ///

class Best_Dose
{
    // Values captured once by the constructor; they are never reassigned afterwards.
    private readonly long _position;
    private readonly byte _nuc_one;
    private readonly int _count_one;
    private readonly byte _nuc_two;
    private readonly int _count_two;
    private readonly byte[] _best_dose;
    private readonly double[] _SNP_posterior;

    /// <summary>Zero-based position in the reference sequence.</summary>
    public long position { get { return _position; } }

    /// <summary>Reference allele.</summary>
    public byte nuc_one { get { return _nuc_one; } }

    /// <summary>Reference allele depth.</summary>
    public int count_one { get { return _count_one; } }

    /// <summary>Alternative allele.</summary>
    public byte nuc_two { get { return _nuc_two; } }

    /// <summary>Alternative allele depth.</summary>
    public int count_two { get { return _count_two; } }

    /// <summary>Most likely allele dosage for each possible ploidy.</summary>
    public byte[] best_dose { get { return _best_dose; } }

    /// <summary>Posterior probability of there being a SNP for each possible ploidy.</summary>
    public double[] SNP_posterior { get { return _SNP_posterior; } }

    /// <summary>
    /// Standard constructor for class Best_Dose.
    /// The two arrays are stored by reference; no copy is made.
    /// </summary>
    /// <param name="position">Zero-based position in the reference sequence.</param>
    /// <param name="nuc_one">Reference allele.</param>
    /// <param name="count_one">Reference allele depth.</param>
    /// <param name="nuc_two">Alternative allele.</param>
    /// <param name="count_two">Alternative allele depth.</param>
    /// <param name="best_dose">Most likely allele dosage for each possible ploidy.</param>
    /// <param name="SNP_posterior">Posterior probability of there being a SNP for each possible ploidy.</param>
    public Best_Dose(long position, byte nuc_one, int count_one, byte nuc_two, int count_two,
        byte[] best_dose, double[] SNP_posterior)
    {
        _position = position;
        _nuc_one = nuc_one;
        _count_one = count_one;
        _nuc_two = nuc_two;
        _count_two = count_two;
        _best_dose = best_dose;
        _SNP_posterior = SNP_posterior;
    }
}

///

53 | /// Main class of ConPADE. 54 | ///

55 | class ConPADE : SelfDistributable 56 | { 57 | private string _jobName = "ConPADE"; 58 | public override string JobName 59 | { 60 | get { return _jobName; } 61 | } 62 | 63 | ///

64 | /// Path to a sorted BAM file. 65 | ///

66 | [Parse(ParseAction.Required, typeof(InputFile))] 67 | public InputFile bamName = null; 68 | 69 | ///

70 | /// Path to error model file. 71 | ///

72 | [Parse(ParseAction.Optional, typeof(InputFile))] 73 | public InputFile modelFile = new InputFile(); 74 | 75 | ///

76 | /// Path to substitution model file. 77 | ///

78 | [Parse(ParseAction.Optional, typeof(InputFile))] 79 | public InputFile substFile = new InputFile(); 80 | 81 | ///

82 | /// Maximum ploidy to evaluate. 83 | ///

84 | [Parse(ParseAction.Optional, typeof(int))] 85 | public int max_ploidy = 4; 86 | 87 | ///

88 | /// Phred-like threshold for outputting a SNP. 89 | ///

90 | [Parse(ParseAction.Optional, typeof(int))] 91 | public int SNPthres = 40; 92 | 93 | ///

94 | /// SNP density. 95 | ///

96 | [Parse(ParseAction.Optional, typeof(int))] 97 | public int snpDens = 200; 98 | 99 | ///

100 | /// Store results for different contigs in separate files. 101 | ///

[Parse(ParseAction.Optional, typeof(bool))]
public bool splitContigs = false;

// Runs ConPADE on the BAM file named by the bamName argument.
// tasksToRun and taskCount are not used by this implementation.
public override void RunTasks(RangeCollection tasksToRun, long taskCount)
{
    RunFile(bamName.FullName);
}

// Nothing to clean up after the tasks have run.
public override void Cleanup(long taskCount)
{

}


// Calculate log(x + y), given log(x) and log(y).
// When one term exceeds the other by more than 40 (natural-log units) the smaller
// term is dropped and the larger one is returned unchanged.
private static double LogSum(double log_x, double log_y)
{
    // A term equal to log(0) contributes nothing to the sum. Handling it up front
    // also avoids (-inf) - (-inf) = NaN when both terms are log(0); the correct
    // answer in that case is log(0) itself.
    if (double.IsNegativeInfinity(log_x))
    {
        return log_y;
    }
    if (double.IsNegativeInfinity(log_y))
    {
        return log_x;
    }

    double result;

    if (log_x > (log_y + 40))
    {
        result = log_x;
    }
    else if (log_y > (log_x + 40))
    {
        result = log_y;
    }
    else
    {
        // log(x + y) = log(x / y + 1) + log(y)
        result = Math.Log(Math.Exp(log_x - log_y) + 1) + log_y;
    }

    return result;
}


// For each ploidy to be evaluated, determine all possible nucleotide proportions/probabilities.
// Store the logarithm of each probability.
// Returns nuc_props[i][j] = { log(j / ploidy), log(1 - j / ploidy) } with ploidy = i + min_ploidy
// and j = 0..ploidy. The entries for j = 0 and j = ploidy contain Math.Log(0), i.e. negative infinity.
private double[][][] Nuc_Props(int min_ploidy, int number_of_ploidies)
{
    double[][][] nuc_props = new double[number_of_ploidies][][];
    for (int i = 0; i < number_of_ploidies; i++)
    {
        int ploidy = i + min_ploidy;
        nuc_props[i] = new double[ploidy + 1][];

        for (int j = 0; j <= ploidy; j++)
        {
            double prob = (double)j / ploidy;
            nuc_props[i][j] = new double[2]
            {
                Math.Log(prob),
                Math.Log(1-prob)
            };
        }
    }
    return nuc_props;
}


// Set the model for the probability of each allele dosage.
// We currently use uniform probabilities for each heterozygous genotype.
// Returns dose_probs[i][j] for ploidy = i + min_ploidy and dosage j = 0..ploidy:
// both homozygous dosages (j = 0 and j = ploidy) get no_SNP_prob as given, and every
// other dosage gets log(SNP_density / (ploidy - 1)).
// NOTE(review): no_SNP_prob is stored without taking a logarithm, so it is presumably
// already on the log scale - confirm against the caller.
// NOTE(review): the first entry is always sized for a ploidy of 1, regardless of
// min_ploidy - presumably callers always pass min_ploidy = 1.
private double[][] Dose_Probs(int min_ploidy, int number_of_ploidies, double SNP_density, double no_SNP_prob)
{
    double[][] dose_probs = new double[number_of_ploidies][];

    // For a ploidy of 1: no SNP allowed.
    dose_probs[0] = new double[2];
    dose_probs[0][0] = dose_probs[0][1] = Math.Log(0.5);

    for (int i = 1; i < number_of_ploidies; i++)
    {
        int ploidy = i + min_ploidy;
        dose_probs[i] = new double[ploidy + 1];

        dose_probs[i][0] = no_SNP_prob;
        dose_probs[i][ploidy] = no_SNP_prob;

        double other_probs = Math.Log(SNP_density / (ploidy - 1));
        for (int j = 1; j < ploidy; j++)
        {
            dose_probs[i][j] = other_probs;
        }
    }
    return dose_probs;
}

// Open a binary model file for reading and hand a BinaryReader over it to read_values.
// The file is opened read-only and shared for reading, so several processes can load
// the same model file at the same time and read-only files can be used.
// A plain IOException (e.g. a sharing violation while another process holds the file)
// is treated as transient: the whole read is retried after a short pause.
// Any more specific I/O failure (file or directory missing, path too long, ...) is
// permanent and is rethrown. A file that ends too early raises EndOfStreamException
// instead of being retried forever.
private static void Read_Model_File(string path, Action<BinaryReader> read_values)
{
    while (true)
    {
        try
        {
            using (FileStream model_stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                using (BinaryReader model_reader = new BinaryReader(model_stream))
                {
                    read_values(model_reader);
                }
            }
            return;
        }
        catch (EndOfStreamException e)
        {
            throw new EndOfStreamException(
                String.Format("File {0} ended before all model values were read.", path), e);
        }
        catch (IOException e)
        {
            // Subclasses of IOException describe permanent conditions; retrying cannot help.
            if (e.GetType() != typeof(IOException))
            {
                throw;
            }
            // Pause instead of spinning at full speed while the file is unavailable.
            System.Threading.Thread.Sleep(100);
        }
    }
}

// Get sequencing error probabilities from the error model file.
// Error model file contains double values nested according to the following sequence:
// GG precedes - quality score - neighboring quality score - true nucleotide - error/no error
// If no model file was given, "errorModel.bin" is used.
// Returns a [2, 40, 40, 4, 2] array holding the values exactly as stored in the file.
// Throws FileNotFoundException if the file does not exist and EndOfStreamException
// if it holds fewer values than expected.
private double[, , , ,] Error_Probs()
{
    if (modelFile.FullName == null)
    {
        modelFile.FullName = "errorModel.bin";
    }
    if (!File.Exists(modelFile.ToString()))
    {
        throw new FileNotFoundException(String.Format("File {0} not found.", modelFile.FullName.ToString()));
    }

    double[, , , ,] log_probs = new double[2, 40, 40, 4, 2];
    Read_Model_File(modelFile.ToString(), model_reader =>
    {
        for (int is_GG = 0; is_GG < 2; is_GG++)
        {
            for (int qual = 0; qual < 40; qual++)
            {
                for (int neigh_qual = 0; neigh_qual < 40; neigh_qual++)
                {
                    for (int nuc = 0; nuc < 4; nuc++)
                    {
                        for (int is_correct = 0; is_correct < 2; is_correct++)
                        {
                            log_probs[is_GG, qual, neigh_qual, nuc, is_correct] =
                                model_reader.ReadDouble();
                        }
                    }
                }
            }
        }
    });

    return log_probs;
}

// Get nucleotide substitution probabilities from the substitution model file.
// Substitution model file contains double values nested according to the following sequence:
// true nucleotide - GG precedes - observed nucleotide
// If no model file was given, "substModel.bin" is used.
// Returns a [4, 2, 4] array holding the values exactly as stored in the file.
// Throws FileNotFoundException if the file does not exist and EndOfStreamException
// if it holds fewer values than expected.
private double[, ,] Subst_Probs()
{
    if (substFile.FullName == null)
    {
        substFile.FullName = "substModel.bin";
    }
    if (!File.Exists(substFile.ToString()))
    {
        throw new FileNotFoundException(String.Format("File {0} not found.", substFile.FullName.ToString()));
    }

    double[, ,] log_subst_probs = new double[4, 2, 4];
    Read_Model_File(substFile.ToString(), subst_reader =>
    {
        for (int real_nuc = 0; real_nuc < 4; real_nuc++)
        {
            for (int is_GG = 0; is_GG < 2; is_GG++)
            {
                for (int obs_nuc = 0; obs_nuc < 4; obs_nuc++)
                {
                    log_subst_probs[real_nuc, is_GG, obs_nuc] =
                        subst_reader.ReadDouble();
                }
            }
        }
    });
    return log_subst_probs;
}

// Search the BAM file for the next valid read aligned against the current contig.
// Update read/base pairs statistics.
private void Search_Reads(BAMParser parser, ref SAMAlignedSequence next_alignment, string contig_name,
    ref long number_of_aligned_reads, ref long number_of_aligned_base_pairs, ref long number_of_used_reads,
    ref long number_of_used_base_pairs, Queue<Padded_Read> read_queue, long current_position)
{
    for (; ; )
    {
        // Keep going only while the pending alignment is a real read on this contig
        // whose Pos - 1 equals the current position.
        bool starts_here = next_alignment != null &&
            !next_alignment.IsDummyRead &&
            next_alignment.RName == contig_name &&
            (next_alignment.Pos - 1) == current_position;
        if (!starts_here)
        {
            return;
        }

        // Every such read counts towards the "aligned" statistics.
        number_of_aligned_reads += 1;
        number_of_aligned_base_pairs += next_alignment.QuerySequence.Count;

        // Only reads with a mapping quality above zero are used and queued.
        // The minimum alignment quality could become a parameter; for now any
        // stricter filtering is left to the user, who can pre-filter the BAM file.
        if (next_alignment.MapQ > 0)
        {
            number_of_used_reads += 1;
            number_of_used_base_pairs += next_alignment.QuerySequence.Count;
            read_queue.Enqueue(new Padded_Read(next_alignment));
        }

        // Nothing left in the BAM file: report "no next alignment" to the caller.
        if (parser.IsEOF())
        {
            next_alignment = null;
            return;
        }

        // Advance to the next alignment, skipping missing records, records with
        // reference name "*" and dummy reads, until one is found or the file ends.
        do
        {
            next_alignment = parser.GetAlignedSequence(true);
        }
        while ((next_alignment == null || next_alignment.RName == "*" || next_alignment.IsDummyRead) && !parser.IsEOF());
    }
}

// For each read in the queue that overlaps the current position,
// extract information on the nucleotide and quality score.
// Output arrays are allocated with one slot per queued read; only the first k slots are
// filled (one per read that overlaps current_position and shows A/C/G/T there).
//   obs_nucs             : observed nucleotide code (0..3) per used read.
//   is_GG                : 1 when two consecutive G (forward reads, looking backwards) or
//                          two consecutive C (reverse reads, looking forwards) precede the
//                          base, skipping padding entries; 0 otherwise.
//   reverse              : copy of Padded_Read.is_reverse.
//   quality_scores       : base quality after clamping (see Clamp_Quality_Score).
//   neigh_quality_scores : rounded mean quality of up to 5 neighbours on each side, clamped.
//   scores / counts      : per-nucleotide sum of raw (unclamped) qualities / read counts.
// Side effect: cur_pos_ind of every overlapping read is advanced to its next reference position.
private void Extract_Read_Info(Queue<Padded_Read> read_queue, long current_position, out byte[] obs_nucs,
    out byte[] is_GG, out bool[] reverse, out int[] quality_scores, out int[] neigh_quality_scores,
    out int[] scores, out int[] counts, out int k)
{
    int capacity = read_queue.Count;
    obs_nucs = new byte[capacity];
    is_GG = new byte[capacity];
    reverse = new bool[capacity];
    quality_scores = new int[capacity];
    neigh_quality_scores = new int[capacity];
    scores = new int[4];
    counts = new int[4];
    k = 0;

    // Quality value that marks padding entries (spacers and deletions) in a padded read.
    const int no_qual = -10;
    // We currently define the neighboring nucleotides as 5 on each side.
    const int neighbors_per_side = 5;

    foreach (Padded_Read curRead in read_queue)
    {
        // The check for overlap below may not be necessary depending on how we clear read cache.
        if ((curRead.alignment.Pos + curRead.alignment_length - 2) < current_position)
        {
            continue;
        }

        int cur_ind = curRead.cur_pos_ind;
        byte nuc = curRead.padded_sequence[cur_ind];
        if (nuc < 4)
        {
            int raw_quality = curRead.padded_quality_scores[cur_ind];
            obs_nucs[k] = nuc;
            counts[nuc]++;
            scores[nuc] += raw_quality;
            quality_scores[k] = Clamp_Quality_Score(raw_quality);

            #region Get neighboring quality scores
            // Index of the trailing spacer; index 0 is the leading spacer.
            int max_ind = curRead.located_sequence.Length - 1;
            int total_qual = 0;

            int left_found = 0;
            for (int ind = cur_ind - 1; left_found < neighbors_per_side && ind > 0; ind--)
            {
                int temp_qual = curRead.padded_quality_scores[ind];
                if (temp_qual != no_qual)
                {
                    left_found++;
                    total_qual += temp_qual;
                }
            }

            int right_found = 0;
            for (int ind = cur_ind + 1; right_found < neighbors_per_side && ind < max_ind; ind++)
            {
                int temp_qual = curRead.padded_quality_scores[ind];
                if (temp_qual != no_qual)
                {
                    right_found++;
                    total_qual += temp_qual;
                }
            }

            int neighbors_found = left_found + right_found;
            if (neighbors_found > 0)
            {
                neigh_quality_scores[k] = Clamp_Quality_Score(
                    (int)Math.Round((double)total_qual / neighbors_found));
            }
            else
            {
                // No neighbour with a quality value. The division used to yield NaN, whose
                // conversion to int is unspecified; use the clamp's floor value explicitly.
                neigh_quality_scores[k] = Clamp_Quality_Score(int.MinValue);
            }
            #endregion Get neighboring quality scores

            #region Get preceding nucleotides and update values if read is reversed
            reverse[k] = curRead.is_reverse;
            // Reverse reads: look for C after the current base; forward reads: G before it.
            int step = reverse[k] ? 1 : -1;
            byte wanted_nuc = reverse[k] ? (byte)1 : (byte)2;
            int nucs_found = 0;
            for (int ind = cur_ind + step; nucs_found < 2 && ind > 0 && ind < max_ind; ind += step)
            {
                byte temp_nuc = curRead.padded_sequence[ind];
                if (temp_nuc == wanted_nuc)
                {
                    nucs_found++;
                }
                else if (temp_nuc <= 4)
                {
                    // A different real base (A/C/G/T/N) ends the run; padding (5) is skipped.
                    break;
                }
            }
            if (nucs_found == 2)
            {
                is_GG[k] = 1;
            }
            #endregion Get preceding nucleotides and update values if read is reversed

            k++;
        }

        // Update cur_pos_ind in the padded read to ignore insertions: entries that share
        // the located_sequence value of their predecessor are skipped.
        do
        {
            curRead.cur_pos_ind++;
        }
        while (curRead.located_sequence[curRead.cur_pos_ind] ==
               curRead.located_sequence[curRead.cur_pos_ind - 1]);
    }
}

// Restrict a quality score to the range accepted by the error model.
// Values above 41 become 41; values below 2 become 3.
// NOTE(review): the original comment states the allowed range is [2,41], yet the floor
// substitutes 3 rather than 2. Behaviour is kept as found - confirm this is intended.
private static int Clamp_Quality_Score(int quality)
{
    if (quality < 2)
    {
        return 3;
    }
    if (quality > 41)
    {
        return 41;
    }
    return quality;
}

// Find the two top-ranked nucleotides for a given position. The caller passes the
// per-nucleotide sums of base qualities, so ranking is by summed quality.
// Ties keep the lower nucleotide code first.
// We currently use A as a neutral nucleotide, based on its substitution model.
private static void Get_Two_Nucs(int[] scores, out byte nuc_one, out byte nuc_two)
{
    nuc_one = 0;
    nuc_two = 1;
    if (scores[1] > scores[0])
    {
        nuc_one = 1;
        nuc_two = 0;
    }

    for (byte candidate = 2; candidate < 4; candidate++)
    {
        if (scores[candidate] > scores[nuc_one])
        {
            // New best: the previous best becomes the runner-up.
            nuc_two = nuc_one;
            nuc_one = candidate;
        }
        else if (scores[candidate] > scores[nuc_two])
        {
            nuc_two = candidate;
        }
    }
}

// For a set of aligned nucleotides and their auxiliary information,
// return the probability of the observed values.
// This includes both the sequencing error and the substitution models.
// Build, for every used read that shows nuc_one or nuc_two, the pair
//   [ log Pr(observation | true allele is nuc_one), log Pr(observation | true allele is nuc_two) ].
// The entry for the allele that matches the observation is read from log_probs with last
// index 1; the other entry is log_probs with last index 0 plus the log_subst_probs term
// from that allele to the observed one. For reverse reads both alleles are replaced
// by (3 - code) before the lookups. Reads showing any other nucleotide are skipped, so the
// result has counts[nuc_one] + counts[nuc_two] rows, in read order.
private double[][] Obs_Probs(double[, , , ,] log_probs, double[, ,] log_subst_probs, byte[] obs_nucs,
    byte[] is_GG, bool[] reverse, int[] quality_scores, int[] neigh_quality_scores, int[] counts, int k,
    byte nuc_one, byte nuc_two)
{
    byte complement_one = (byte)(3 - nuc_one);
    byte complement_two = (byte)(3 - nuc_two);

    double[][] log_nuc_probs = new double[counts[nuc_one] + counts[nuc_two]][];
    int next_row = 0;

    for (int read = 0; read < k; read++)
    {
        bool shows_first = obs_nucs[read] == nuc_one;
        if (!shows_first && obs_nucs[read] != nuc_two)
        {
            continue;
        }

        byte first = reverse[read] ? complement_one : nuc_one;
        byte second = reverse[read] ? complement_two : nuc_two;
        byte gg = is_GG[read];
        // Quality-indexed dimensions of the error model start at quality 2.
        int qual = quality_scores[read] - 2;
        int neigh = neigh_quality_scores[read] - 2;

        double[] row = new double[2];
        if (shows_first)
        {
            row[0] = log_probs[gg, qual, neigh, first, 1];
            row[1] = log_probs[gg, qual, neigh, second, 0] + log_subst_probs[second, gg, first];
        }
        else
        {
            row[0] = log_probs[gg, qual, neigh, first, 0] + log_subst_probs[first, gg, second];
            row[1] = log_probs[gg, qual, neigh, second, 1];
        }
        log_nuc_probs[next_row++] = row;
    }

    return log_nuc_probs;
}

// For each ploidy to be evaluated, calculate the likelihood of the genotypes given the observed data.
// Returns, for each ploidy in [min_ploidy, max_ploidy], an array of ploidy + 1 genotype
// log-likelihoods (index = allele dosage). Each value is the sum over reads of
// LogSum(log_nuc_probs[read][0] + proportion[0], log_nuc_probs[read][1] + proportion[1]),
// where the proportions come from nuc_props[ploidy index][dosage].
private static double[][] Log_Likelihoods(int min_ploidy, int max_ploidy, double[][] log_nuc_probs,
    double[][][] nuc_props)
{
    double[][] log_likelihoods = new double[max_ploidy - min_ploidy + 1][];

    // Likelihoods of the homozygous genotypes are the same for all ploidies,
    // so they are computed once from the entries of the smallest ploidy.
    double log_like_P0 = 0;
    double log_like_P1 = 0;
    foreach (double[] read_probs in log_nuc_probs)
    {
        log_like_P0 += LogSum(
            read_probs[0] + nuc_props[0][0][0],
            read_probs[1] + nuc_props[0][0][1]);

        log_like_P1 += LogSum(
            read_probs[0] + nuc_props[0][min_ploidy][0],
            read_probs[1] + nuc_props[0][min_ploidy][1]);
    }

    for (int ploidy = min_ploidy; ploidy <= max_ploidy; ploidy++)
    {
        int ploidy_ind = ploidy - min_ploidy;
        double[] genotype_likes = new double[ploidy + 1];
        log_likelihoods[ploidy_ind] = genotype_likes;

        genotype_likes[0] = log_like_P0;
        genotype_likes[ploidy] = log_like_P1;

        // Calculate the likelihood of heterozygous genotypes (dosages 1 .. ploidy - 1).
        for (int dose = 1; dose < ploidy; dose++)
        {
            double[] proportions = nuc_props[ploidy_ind][dose];
            foreach (double[] read_probs in log_nuc_probs)
            {
                genotype_likes[dose] += LogSum(
                    read_probs[0] + proportions[0],
                    read_probs[1] + proportions[1]);
            }
        }
    }

    return log_likelihoods;
}

// Calculate the likelihood of each ploidy for the current position, given genotype likelihoods.
// Use this value to update the global likelihood of each ploidy.
// Also, find the most likely allele dosage and store the corresponding posterior
// probability of there being a SNP in the current position, for each ploidy.
// For every ploidy i (actual ploidy = i + min_ploidy):
//   * add to global_log_like[i] the log of the summed (prior x likelihood) over all dosages,
//     i.e. LogSum over j of dose_probs[i][j] + log_likelihoods[i][j];
//   * record the dosage j with the largest dose_probs[i][j] + log_likelihoods[i][j];
//   * record in SNP_posterior[i] the log posterior of the two homozygous genotypes
//     (dosage 0 and dosage == ploidy) combined. Despite its name this is the
//     log-probability that the position is NOT a SNP; the caller reports a SNP when the
//     value is at or below its threshold.
// One Best_Dose entry holding these arrays, the position, the two alleles and their read
// counts is appended to dose_queue.
private void Global_Likelihood_Keep_Dose(int min_ploidy, int number_of_ploidies, double[][] dose_probs,
    double[] global_log_like, Queue<Best_Dose> dose_queue, long current_position, int[] counts,
    byte nuc_one, byte nuc_two, double[][] log_likelihoods)
{
    byte[] best_dose = new byte[number_of_ploidies];
    double[] SNP_posterior = new double[number_of_ploidies];

    for (int i = 0; i < number_of_ploidies; i++)
    {
        int ploidy = i + min_ploidy;
        double[] priors = dose_probs[i];
        double[] likelihoods = log_likelihoods[i];

        best_dose[i] = 0;
        double best_posterior = double.NegativeInfinity;
        double aggregate = double.NegativeInfinity;
        for (byte dose = 0; dose <= ploidy; dose++)
        {
            double posterior = priors[dose] + likelihoods[dose];
            aggregate = LogSum(aggregate, posterior);
            if (posterior > best_posterior)
            {
                best_dose[i] = dose;
                best_posterior = posterior;
            }
        }
        global_log_like[i] += aggregate;

        // Normalised log posterior of "homozygous for either allele".
        SNP_posterior[i] = LogSum(
            priors[0] + likelihoods[0],
            priors[ploidy] + likelihoods[ploidy]) - aggregate;
    }

    // Store information that will be part of the called SNPs output.
    dose_queue.Enqueue(new Best_Dose(current_position, nuc_one, counts[nuc_one], nuc_two,
        counts[nuc_two], best_dose, SNP_posterior));
}

///

/// <summary>
/// Run ConPADE on each contig of the input BAM file.
/// For every contig that has alignments, each reference position is visited in order,
/// the per-ploidy log-likelihoods are accumulated, and the best ploidy, the called SNPs
/// and read statistics are written to text files named after the BAM file
/// (one set per contig when splitContigs is set, otherwise one global set).
/// The BAM stream, the parser and all writers are released even if processing fails.
/// </summary>
/// <param name="bamName">Name of the input BAM file.</param>
/// <exception cref="System.IO.InvalidDataException">
/// An alignment names a contig that cannot be found in the BAM header after the
/// previously processed contig.
/// </exception>
public void RunFile(string bamName)
{
    // Current implementation requires that minimum ploidy be 1
    int min_ploidy = 1;
    int number_of_ploidies = max_ploidy - min_ploidy + 1;

    // Set nucleotide proportions (genotypes)
    double[][][] nuc_props = Nuc_Props(min_ploidy, number_of_ploidies);

    // Set dosage probabilities
    double SNP_density = (double)1 / snpDens;
    double no_SNP_prob = Math.Log((1 - SNP_density) / 2);
    double[][] dose_probs = Dose_Probs(min_ploidy, number_of_ploidies, SNP_density, no_SNP_prob);

    // Set HiSeq error model
    double[, , , ,] log_probs = Error_Probs();

    // Set substitution model
    double[, ,] log_subst_probs = Subst_Probs();

    // Set SNP calling probability (Phred-scaled threshold converted to natural log)
    double log_SNP_thres = SNPthres * Math.Log(10) / -10;

    Stopwatch clock = new Stopwatch();

    Console.WriteLine("Program started at {0}\n", DateTime.Now);

    Stream bam_stream = null;
    BAMParser parser = null;
    TextWriter writer_log_like = null;
    TextWriter writer_SNP = null;
    TextWriter writer_ploidy = null;
    TextWriter writer_reads = null;

    try
    {
        bam_stream = new FileStream(bamName, FileMode.Open, FileAccess.Read);
        parser = new BAMParser();
        SAMAlignmentHeader header = parser.GetHeader(bam_stream);
        string temp = Path.GetFileNameWithoutExtension(bamName);

        // Find first valid alignment in BAM file. Stop at EOF so that a file without any
        // usable alignment ends the run instead of reading past the end.
        SAMAlignedSequence next_alignment = parser.GetAlignedSequence(true);
        while ((next_alignment == null || next_alignment.RName == "*" || next_alignment.IsDummyRead) &&
               !parser.IsEOF())
        {
            next_alignment = parser.GetAlignedSequence(true);
        }

        // Create global output files and write headers.
        if (!splitContigs)
        {
            string SNP_file = temp + "_SNP.txt";
            writer_SNP = new StreamWriter(SNP_file);
            writer_SNP.WriteLine("Contig\tPosition\tAlleles\tCounts\tDosage\tPhredQuality");

            string ploidy_file = temp + "_ploidy.txt";
            writer_ploidy = new StreamWriter(ploidy_file);
            writer_ploidy.Write("Contig\tBestPloidy");
            for (int i = 0; i < number_of_ploidies; i++)
            {
                writer_ploidy.Write("\tlogLike_M{0}", i + min_ploidy);
            }
            writer_ploidy.WriteLine("");

            string reads_file = temp + "_readStats.txt";
            writer_reads = new StreamWriter(reads_file);
            writer_reads.WriteLine("Contig\tAlignedReads\tAlignedBases\tUsedReads\tUsedBases");
        }

        char[] nuc_chars = new char[4] { 'A', 'C', 'G', 'T' };

        // Run over each contig in input BAM file.
        int contig_ind = -1;
        while (next_alignment != null && next_alignment.RName != "*" && !next_alignment.IsDummyRead)
        {
            string contig_name = next_alignment.RName;

            Console.WriteLine("Started contig {0} at {1}",
                contig_name, DateTime.Now);

            clock.Restart();

            #region Variables and file handles for current contig
            long number_of_aligned_reads = 0;
            long number_of_aligned_base_pairs = 0;
            long number_of_used_reads = 0;
            long number_of_used_base_pairs = 0;

            // Create individual output files for the current contig.
            if (splitContigs)
            {
                string name = temp + "_" + contig_name;
                writer_log_like = new StreamWriter(name + "_log_likelihoods.txt");
                writer_SNP = new StreamWriter(name + "_SNP.txt");
                writer_ploidy = new StreamWriter(name + "_ploidy.txt");
                writer_reads = new StreamWriter(name + "_readStats.txt");
            }

            double[] global_log_like = new double[number_of_ploidies];

            // Contigs are looked up forward-only from the previously processed one.
            // Bound the search so that a contig that cannot be found produces a clear
            // error instead of an out-of-range index.
            do
            {
                ++contig_ind;
            }
            while (contig_ind < header.ReferenceSequences.Count &&
                   header.ReferenceSequences[contig_ind].Name != contig_name);
            if (contig_ind >= header.ReferenceSequences.Count)
            {
                throw new System.IO.InvalidDataException(string.Format(
                    "Could not locate contig '{0}' in the BAM header after the previously processed contig. " +
                    "Alignments must be sorted by coordinate, follow the contig order of the header, " +
                    "and start within their contig.", contig_name));
            }
            long contig_length = header.ReferenceSequences[contig_ind].Length;

            // Create a queue to include all reads that overlap with a given position.
            Queue<Padded_Read> read_queue = new Queue<Padded_Read>();

            // Create a queue to include best doses for each tested position.
            Queue<Best_Dose> dose_queue = new Queue<Best_Dose>((int)contig_length);
            #endregion Variables and file handles for current contig

            int positions_to_compute = 0;

            #region Run over every position in contig
            for (long current_position = 0; current_position < contig_length; ++current_position)
            {
                if ((current_position % 1000000) == 0 && current_position != 0)
                {
                    Console.WriteLine("At position {0} of {1}", current_position + 1, contig_length);
                }

                // Search for reads starting at current position.
                Search_Reads(parser, ref next_alignment, contig_name, ref number_of_aligned_reads,
                    ref number_of_aligned_base_pairs, ref number_of_used_reads, ref number_of_used_base_pairs,
                    read_queue, current_position);

                if (read_queue.Count > 0)
                {
                    positions_to_compute++;

                    // Extract information from each read in queue.
                    byte[] obs_nucs;
                    byte[] is_GG;
                    bool[] reverse;
                    int[] quality_scores;
                    int[] neigh_quality_scores;
                    int[] scores;
                    int[] counts;
                    int k;

                    Extract_Read_Info(read_queue, current_position, out obs_nucs, out is_GG, out reverse,
                        out quality_scores, out neigh_quality_scores, out scores, out counts, out k);

                    // Find two most abundant nucleotides for this position.
                    byte nuc_one;
                    byte nuc_two;
                    Get_Two_Nucs(scores, out nuc_one, out nuc_two);

                    // Calculate Pr(obs|allele1) and Pr(obs|allele2).
                    double[][] log_nuc_probs = Obs_Probs(log_probs, log_subst_probs, obs_nucs, is_GG, reverse,
                        quality_scores, neigh_quality_scores, counts, k, nuc_one, nuc_two);

                    // Calculate log_likelihoods of genotypes for current position.
                    double[][] log_likelihoods = Log_Likelihoods(min_ploidy, max_ploidy, log_nuc_probs, nuc_props);

                    // Calculate log_likelihood of each ploidy and keep most likely allele dosage.
                    Global_Likelihood_Keep_Dose(min_ploidy, number_of_ploidies, dose_probs, global_log_like,
                        dose_queue, current_position, counts, nuc_one, nuc_two, log_likelihoods);
                }

                // Remove finished reads from the front of the queue. Finished reads no longer
                // overlap with current position. Removal stops at the first unfinished read.
                while (read_queue.Count > 0)
                {
                    Padded_Read oldest = read_queue.Peek();
                    if ((oldest.alignment.Pos + oldest.alignment_length - 2) >= current_position)
                    {
                        break;
                    }
                    read_queue.Dequeue();
                }
            }
            #endregion Run over every position in contig

            // Output log_likelihoods.
            int best_log_like = 0;
            for (int i = 0; i < number_of_ploidies; i++)
            {
                if (global_log_like[i] > global_log_like[best_log_like])
                {
                    best_log_like = i;
                }

                if (splitContigs)
                {
                    writer_log_like.WriteLine("Ploidy {0} - log_likelihood {1}", i + min_ploidy, global_log_like[i]);
                }
            }

            // Output most likely ploidy.
            int best_ploidy = best_log_like + min_ploidy;
            if (splitContigs)
            {
                writer_ploidy.WriteLine(best_ploidy);
            }
            else
            {
                writer_ploidy.Write("{0}\t{1}", contig_name, best_ploidy);
                for (int i = 0; i < number_of_ploidies; i++)
                {
                    writer_ploidy.Write("\t{0}", global_log_like[i]);
                }
                writer_ploidy.WriteLine("");
            }

            // Output SNPs: positions whose stored posterior (for the best ploidy) is at or
            // below the threshold and whose best dosage is heterozygous.
            if (splitContigs)
            {
                writer_SNP.WriteLine("Position\tAlleles\tCounts\tDosage\tPhredQuality");
            }
            foreach (Best_Dose cur_doses in dose_queue)
            {
                double cur_SNP_posterior = cur_doses.SNP_posterior[best_log_like];
                if (cur_SNP_posterior > log_SNP_thres)
                {
                    continue;
                }

                int cur_best_dose = cur_doses.best_dose[best_log_like];
                if (cur_best_dose == best_ploidy || cur_best_dose == 0)
                {
                    continue;
                }

                if (splitContigs)
                {
                    writer_SNP.WriteLine("{0}\t{1}|{2}\t{3}|{4}\t{5}\t{6}", cur_doses.position + 1,
                        nuc_chars[cur_doses.nuc_one], nuc_chars[cur_doses.nuc_two], cur_doses.count_one,
                        cur_doses.count_two, cur_best_dose, -10 * cur_SNP_posterior / Math.Log(10));
                }
                else
                {
                    writer_SNP.WriteLine("{0}\t{1}\t{2}|{3}\t{4}|{5}\t{6}\t{7}", contig_name,
                        cur_doses.position + 1, nuc_chars[cur_doses.nuc_one], nuc_chars[cur_doses.nuc_two],
                        cur_doses.count_one, cur_doses.count_two, cur_best_dose,
                        -10 * cur_SNP_posterior / Math.Log(10));
                }
            }

            // Output read statistics.
            if (splitContigs)
            {
                writer_reads.WriteLine("\nNumber of aligned reads: {0}", number_of_aligned_reads);
                writer_reads.WriteLine("Number of aligned base pairs: {0}", number_of_aligned_base_pairs);
                writer_reads.WriteLine("\nNumber of used reads: {0}", number_of_used_reads);
                writer_reads.WriteLine("Number of used base pairs: {0}", number_of_used_base_pairs);
            }
            else
            {
                writer_reads.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", contig_name, number_of_aligned_reads,
                    number_of_aligned_base_pairs, number_of_used_reads, number_of_used_base_pairs);
            }

            // Per-contig files are finished here; clear the handles so the cleanup below
            // only deals with writers that are still open.
            if (splitContigs)
            {
                writer_log_like.Close();
                writer_log_like = null;
                writer_SNP.Close();
                writer_SNP = null;
                writer_ploidy.Close();
                writer_ploidy = null;
                writer_reads.Close();
                writer_reads = null;
            }

            clock.Stop();
            Console.WriteLine("Time to run contig: {0} s\n", (double)clock.ElapsedMilliseconds / 1000);
        }
    }
    finally
    {
        // Release everything that is still open, on success and on failure alike.
        if (writer_log_like != null)
        {
            writer_log_like.Close();
        }
        if (writer_SNP != null)
        {
            writer_SNP.Close();
        }
        if (writer_ploidy != null)
        {
            writer_ploidy.Close();
        }
        if (writer_reads != null)
        {
            writer_reads.Close();
        }
        if (parser != null)
        {
            parser.Dispose();
        }
        if (bam_stream != null)
        {
            bam_stream.Dispose();
        }
    }

    Console.WriteLine("Finished at {0}\n", DateTime.Now);
}

// Program entry point: hands the command-line arguments to the argument parser/runner.
// NOTE(review): ConstructAndRun normally takes a type argument; it appears to have been
// stripped from this copy of the file together with other <...> text - restore it.
static void Main(string[] args)
{
    CommandArguments.ConstructAndRun(args);
}
}
}

-------------------------------------------------------------------------------- /src/ConPADE.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Debug 5 | x86 6 | 8.0.30703 7 | 2.0 8 | {DB342BEE-30EC-47F1-B3F8-394260F30ACD} 9 | Exe 10 | Properties 11 | ConPADE 12 | ConPADE 13 | v4.5 14 | 15 | 16 | 512 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | x86 28 | true 29 | full 30 | false 31 | bin\Debug\ 32 | DEBUG;TRACE 33 | prompt 34 | 4 35 | bin\Debug\ConPADE.XML 36 | false 37 | 38 | 39 | x86 40 | pdbonly 41 | true 42 | bin\Release\ 43 | TRACE 44 | prompt 45 | 4 46 | false 47 | 48 | 49 | true 50 | bin\Debug\ 51 | DEBUG;TRACE 52 | bin\Debug\ConPADE.XML 53 | full 54 | AnyCPU 55 | 
bin\Debug\ConPADE.exe.CodeAnalysisLog.xml 56 | true 57 | GlobalSuppressions.cs 58 | prompt 59 | MinimumRecommendedRules.ruleset 60 | ;C:\Program Files (x86)\Microsoft Visual Studio 10.0\Team Tools\Static Analysis Tools\\Rule Sets 61 | false 62 | ;C:\Program Files (x86)\Microsoft Visual Studio 10.0\Team Tools\Static Analysis Tools\FxCop\\Rules 63 | true 64 | false 65 | 66 | 67 | bin\Release\ 68 | TRACE 69 | true 70 | pdbonly 71 | AnyCPU 72 | bin\Release\ConPADE.exe.CodeAnalysisLog.xml 73 | true 74 | GlobalSuppressions.cs 75 | prompt 76 | MinimumRecommendedRules.ruleset 77 | ;C:\Program Files (x86)\Microsoft Visual Studio 10.0\Team Tools\Static Analysis Tools\\Rule Sets 78 | true 79 | ;C:\Program Files (x86)\Microsoft Visual Studio 10.0\Team Tools\Static Analysis Tools\FxCop\\Rules 80 | true 81 | bin\Release\ConPADE.XML 82 | false 83 | 84 | 85 | 86 | .\Bio.Core.dll 87 | 88 | 89 | .\Bio.Desktop.dll 90 | 91 | 92 | .\Bio.Platform.Helpers.dll 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 118 | -------------------------------------------------------------------------------- /src/Read.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Linq; 3 | using Bio; 4 | using Bio.IO.SAM; 5 | 6 | namespace ConPADE 7 | { 8 | ///

9 | /// Class to build and store padded reads. 10 | /// The CIGAR string is used to pad insertions and deletions. 11 | /// Read positions can then be accessed based on the alignment to the reference. 12 | ///

class Padded_Read
{
    // The alignment this padded read was built from.
    public SAMAlignedSequence alignment { get; private set; }
    // Nucleotide codes (A:0 C:1 G:2 T:3 N:4 other/padding:5), with one spacer at each end.
    public byte[] padded_sequence;
    // Quality per entry of padded_sequence; -10 marks spacers and padded reference gaps.
    public int[] padded_quality_scores;
    // Per entry: 0-based offset of the reference position it is attached to, counted from
    // the first aligned position, or -1 for entries not attached to any position.
    public int[] located_sequence;
    // True when the alignment flag marks the query as on the reverse strand.
    public bool is_reverse;
    // Index into the padded arrays of the entry for the next reference position to read.
    public int cur_pos_ind;
    // CIGAR operation lengths and letters; the first vector_ind entries are valid.
    public int[] numbers;
    public char[] letters;
    public int vector_ind;
    // Number of reference positions covered: RefEndPos - Pos + 1.
    public int alignment_length;

    /// <summary>
    /// Padded_Read standard constructor. Stores the alignment and pads it immediately.
    /// </summary>
    /// <param name="alignment">An input object of type SAMAlignedSequence.</param>
    public Padded_Read(SAMAlignedSequence alignment)
    {
        this.alignment = alignment;
        this.is_reverse = alignment.Flag.HasFlag(SAMFlags.QueryOnReverseStrand);
        this.Pad_Read();
    }

    // Split CIGAR string into paired "numbers" and "letters".
    // The arrays are sized from the CIGAR text (an upper bound on the operation count),
    // so alignments with very many operations no longer overflow a fixed-size buffer.
    private void Split_CIGAR()
    {
        string cigar = this.alignment.CIGAR;
        int capacity = Math.Max(cigar.Length, 1);
        this.numbers = new int[capacity];
        this.letters = new char[capacity];
        this.vector_ind = 0;

        int cigar_ind = 0;
        while (cigar_ind < cigar.Length)
        {
            // Accumulate the (possibly empty) run of decimal digits before the operation.
            int cur_number = 0;
            while (cigar_ind < cigar.Length && cigar[cigar_ind] >= '0' && cigar[cigar_ind] <= '9')
            {
                cur_number = 10 * cur_number + (cigar[cigar_ind] - '0');
                cigar_ind++;
            }
            if (cigar_ind >= cigar.Length)
            {
                throw new FormatException("CIGAR string ends without an operation letter: " + cigar);
            }
            this.numbers[this.vector_ind] = cur_number;
            this.letters[this.vector_ind++] = cigar[cigar_ind++];
        }
    }

    // Count the padding entries Pad_Read will insert: one per reference position that has
    // no read base, i.e. deletions ("D") and skipped regions ("N").
    // Padding operations ("P") take up neither reference nor read positions and get no
    // entry; counting them used to leave unwritten zero-filled slots (which read as 'A'
    // with quality 0) before the trailing spacer.
    private int Count_Padding()
    {
        int extras = 0;
        for (int i = 0; i < this.vector_ind; i++)
        {
            if (this.letters[i] == 'D' || this.letters[i] == 'N')
            {
                extras += this.numbers[i];
            }
        }
        return extras;
    }

    /// <summary>
    /// Pad deletions in an aligned read.
    /// Create an index of insertions/deletions so that aligned
    /// positions may be accessed based on reference index.
    /// </summary>
    public void Pad_Read()
    {
        byte[] sequence = this.alignment.QuerySequence.ToArray();
        int[] quality_scores = (this.alignment.QuerySequence as QualitativeSequence).GetQualityScores();

        this.Split_CIGAR();

        int extras = Count_Padding();

        // size includes padded reference gaps and one spacer on both sides
        int size = sequence.Length + extras + 2;
        this.padded_sequence = new byte[size];
        this.padded_quality_scores = new int[size];
        this.located_sequence = new int[size];

        // Values to use for spacers
        const int no_qual = -10;
        const byte spacer_nuc = 5;
        const int spacer_ind = -1;

        int last_ind = size - 1;
        this.padded_sequence[0] = spacer_nuc;
        this.padded_sequence[last_ind] = spacer_nuc;
        this.padded_quality_scores[0] = no_qual;
        this.padded_quality_scores[last_ind] = no_qual;
        this.located_sequence[0] = spacer_ind;
        this.located_sequence[last_ind] = spacer_ind;

        int seq_out = 1;     // next entry of padded_sequence / padded_quality_scores
        int read_in = 0;     // next base of the original read
        int loc_out = 1;     // next entry of located_sequence

        // Nucleotide : Index
        // A : 0
        // C : 1
        // G : 2
        // T : 3
        // N : 4
        // Others (incl dels) : 5
        // We may want to use a separate index for deletions, if we want to call indels in the future

        int last = spacer_ind;
        for (int i = 0; i < this.vector_ind; i++)
        {
            char cur_letter = this.letters[i];
            int length = this.numbers[i];

            if (cur_letter == 'P' || cur_letter == 'H')
            {
                // Padding and hard clipping add nothing to the padded read.
                continue;
            }

            if (cur_letter == 'D' || cur_letter == 'N')
            {
                // Reference positions without a read base: advance the reference index and
                // emit padding. "N" used to fall through to the base-copying code below,
                // which consumed read bases that do not exist.
                for (int j = 0; j < length; j++)
                {
                    this.located_sequence[loc_out++] = ++last;
                    this.padded_quality_scores[seq_out] = no_qual;
                    this.padded_sequence[seq_out++] = spacer_nuc;
                }
                continue;
            }

            if (cur_letter == 'M' || cur_letter == '=' || cur_letter == 'X')
            {
                for (int j = 0; j < length; j++)
                {
                    this.located_sequence[loc_out++] = ++last;
                }
            }
            else if (cur_letter == 'I')
            {
                // Inserted bases share the index of the preceding aligned base so that they
                // are skipped when stepping through reference positions. An insertion
                // before the first aligned base has no such base and keeps the spacer
                // index (like soft clipping); giving it index 0 used to shift every
                // aligned base of the read by one reference position.
                for (int j = 0; j < length; j++)
                {
                    this.located_sequence[loc_out++] = last;
                }
            }
            else if (cur_letter == 'S')
            {
                for (int j = 0; j < length; j++)
                {
                    this.located_sequence[loc_out++] = spacer_ind;
                }
            }

            // Copy the read bases of this operation, translating ASCII to nucleotide codes.
            for (int j = 0; j < length; j++)
            {
                this.padded_quality_scores[seq_out] = quality_scores[read_in];
                byte code;
                switch (sequence[read_in++])
                {
                    case (byte)'A':
                        code = 0;
                        break;
                    case (byte)'C':
                        code = 1;
                        break;
                    case (byte)'G':
                        code = 2;
                        break;
                    case (byte)'T':
                        code = 3;
                        break;
                    case (byte)'N':
                        code = 4;
                        break;
                    default:
                        code = 5;
                        break;
                }
                this.padded_sequence[seq_out++] = code;
            }
        }

        // Set cur_pos_ind to the position with the first aligned nucleotide.
        // This value will be updated by other methods.
        this.cur_pos_ind = 0;
        while (this.located_sequence[++this.cur_pos_ind] == spacer_ind) ;

        this.alignment_length = this.alignment.RefEndPos - this.alignment.Pos + 1;
    }
}
}
-------------------------------------------------------------------------------- /substModel.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gramarga/ConPADE/e6022dc09ea96812101432bddb0fbac4e6bbc2bb/substModel.bin --------------------------------------------------------------------------------