├── README ├── package.xml └── src └── uk └── ac └── ebi └── gnx ├── ParseFasta.java └── CalculateGNx.java /README: -------------------------------------------------------------------------------- 1 | Basic Genome assembly statistic tool to calculate Nx values e.g. N50, N10, NG50 2 | 3 | INSTALLATION 4 | 5 | a) Download git repository 6 | b) Compile java classes 7 | # in gnx-tools folder run 8 | mkdir bin 9 | javac -d bin/ src/uk/ac/ebi/gnx/* 10 | c) package jar file 11 | # in gnx-tools foder run 12 | ant -f package.xml 13 | d) execute jar file 14 | java -jar gnx.jar 15 | 16 | DONE 17 | 18 | 19 | -------------------------------------------------------------------------------- /package.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /src/uk/ac/ebi/gnx/ParseFasta.java: -------------------------------------------------------------------------------- 1 | /** 2 | * File: ParseFasta.java 3 | * Created by: mhaimel 4 | * Created on: Feb 23, 2012 5 | * CVS: $Id: ParseFasta.java 1.0 Feb 23, 2012 2:29:04 PM mhaimel Exp $ 6 | */ 7 | package uk.ac.ebi.gnx; 8 | 9 | import java.io.IOException; 10 | import java.io.InputStream; 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | 14 | /** 15 | * @author mhaimel 16 | * 17 | */ 18 | public class ParseFasta { 19 | private volatile InputStream in = null; 20 | 21 | private volatile List len = new ArrayList(); 22 | private volatile long nsCnt = 0; 23 | private volatile long totalCnt = 0; 24 | 25 | private long minLen = 0; 26 | 27 | public ParseFasta() { 28 | // empty 29 | } 30 | public long getNsCnt() { 31 | return nsCnt; 32 | } 33 | public void setNsCnt(long nsCnt) { 34 | this.nsCnt = nsCnt; 35 | } 36 | public long getTotalCnt() { 37 | return totalCnt; 38 | } 39 | public void setTotalCnt(long totalCnt) { 40 | this.totalCnt = totalCnt; 41 | } 42 | public void setMinLen(long minLen) { 43 | this.minLen = minLen; 44 | } 45 | public InputStream getIn() { 46 | return in; 47 | } 48 | public void setIn(InputStream in) { 49 | this.in = in; 50 | } 51 | public void setLen(List len) { 52 | this.len = len; 53 | } 54 | public List getLen() { 55 | return len; 56 | } 57 | public long getMinLen() { 58 | return minLen; 59 | } 60 | 61 | public void process() throws IOException { 62 | int currSeqCnt = 0; 63 | int currNsCnt = 0; 64 | int currSeqOfOddChars = 0; 65 | while(true){ 66 | switch (in.read()) { 67 | case -1: 68 | // EOF 69 | case '>': 70 | if(currSeqCnt > 0){ 71 | if(addLength(currSeqCnt)){ 72 | this.nsCnt += currNsCnt; 73 | } 74 | } 75 | currSeqCnt = 0; 76 | currSeqOfOddChars = 0; 77 | currNsCnt = 0; 78 | boolean search = true; // ugly, but should work for the moment; 79 | while(search){ 80 | switch (in.read()) { 81 | case -1: 82 | // EOF 83 | return; 84 | case '\n': 85 | search = false; 86 | break; 87 | default: 88 | break; 89 | } 90 | } 91 | break; 92 | case '\r': 93 | case '\n': 94 | // ignore 95 | break; 96 | 97 | case 'n': 98 | case 'N': 99 | ++currNsCnt; 100 | case 'a': 101 | case 'A': 102 | case 't': 103 | case 'T': 104 | case 'g': 105 | case 'G': 106 | case 'c': 107 | case 'C': 108 | ++ currSeqCnt; 109 | break; 110 | default: 111 | ++currSeqOfOddChars; 112 | break; 113 | } 114 | } 115 | } 116 | 117 | private boolean addLength(int seqLen) { 118 | if(seqLen >= this.minLen ){ 119 | this.totalCnt += seqLen; 120 | this.len.add(seqLen); 121 | return true; 122 | } 123 | return false; 124 | } 125 | 126 | public void reset() { 127 | this.nsCnt = 0; 128 | this.totalCnt = 0; 129 | this.len.clear(); 130 | this.in = null; 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/uk/ac/ebi/gnx/CalculateGNx.java: -------------------------------------------------------------------------------- 1 | /** 2 | * File: CalculateGNx.java 3 | * Created by: mhaimel 4 | * Created on: Feb 22, 2012 5 | * CVS: $Id: CalculateGNx.java 1.0 Feb 22, 2012 4:40:13 PM mhaimel Exp $ 6 | */ 7 | package uk.ac.ebi.gnx; 8 | 9 | import java.io.BufferedInputStream; 10 | import java.io.File; 11 | import java.io.FileInputStream; 12 | import java.io.IOException; 13 | import java.io.InputStream; 14 | import java.io.PrintStream; 15 | import java.util.ArrayList; 16 | import java.util.Collections; 17 | import java.util.List; 18 | import java.util.zip.GZIPInputStream; 19 | 20 | /** 21 | * @author mhaimel 22 | * 23 | */ 24 | public class CalculateGNx { 25 | 26 | private final double nxPosition; // default 27 | 28 | public CalculateGNx() { 29 | this(0.5); 30 | } 31 | 32 | public CalculateGNx(double position) { 33 | this.nxPosition = position; 34 | } 35 | 36 | private void printResults(PrintStream out, List revSortedList, long total) { 37 | int tCnt = revSortedList.size(); 38 | long tSum = total; 39 | // for(Integer i : revSortedList){ 40 | // tSum += i; 41 | // } 42 | double cOff = ((double)tSum)*this.nxPosition; 43 | 44 | long sum = 0; 45 | int i = 0; 46 | Integer sLen = 0; 47 | for(i = 0; i < tCnt; ++i){ 48 | sLen = revSortedList.get(i); 49 | sum += sLen; 50 | if(sum >= cOff){ 51 | break; 52 | } 53 | } 54 | out.println( 55 | "N" 56 | +Double.valueOf(this.nxPosition * 100).intValue()+":\t" 57 | + sLen 58 | + "\t("+(i+1)+" sequences)" 59 | + "\t("+sum +" bp combined)"); 60 | } 61 | 62 | 63 | private static void printHelp(PrintStream out) { 64 | out.println("gnx [-min ] [-nx 25,50,75] [-g ] "); 65 | out.println("-min Minimum bp length of a sequence to be considered"); 66 | out.println("-nx Nx values to be printed seperated by ',' e.g. 50 for N50, 25 for N25"); 67 | out.println("-g genome size to be used to calculte Nx values"); 68 | out.println(" "); 69 | out.println(" o /path/to/file.fa"); 70 | out.println(" o use '-' for standard input"); 71 | out.println(" o file-a.fa file-b.fa for a list of files"); 72 | } 73 | 74 | /** 75 | * @param args 76 | * @throws IOException 77 | */ 78 | public static void main(String[] args) throws IOException { 79 | if(args.length == 0){ 80 | System.err.println("Please provide an input!"); 81 | printHelp(System.err); 82 | fail("Please provide an input!"); 83 | } 84 | 85 | List nxList = new ArrayList(); 86 | nxList.add(new CalculateGNx()); 87 | ParseFasta pfa = new ParseFasta(); 88 | long genomeSize = -1; 89 | List list = new ArrayList(); 90 | for(int i = 0; i < args.length; ++i){ 91 | String s = args[i]; 92 | if(s.equals("-min")){ 93 | pfa.setMinLen(Long.valueOf(args[++i])); 94 | } else if(s.equals("-g")){ 95 | genomeSize = Long.valueOf(args[++i]); 96 | } else if(s.equals("-nx")){ 97 | nxList.clear(); 98 | for(String v : args[++i].split(",")){ 99 | if(v.length() > 0){ 100 | nxList.add(new CalculateGNx(Double.valueOf(v)/100)); 101 | } 102 | } 103 | } else if(s.equals("-")){ 104 | list.add(new File("-")); 105 | } else { 106 | File f = new File(s); 107 | if(!f.isFile()){ 108 | fail(s + " is not a file!!!"); 109 | } else if(!f.canRead()){ 110 | fail(s + " is not Readable!!!"); 111 | } 112 | list.add(f); 113 | } 114 | } 115 | boolean isStream = false; 116 | long gSize = 0; 117 | for(File f : list){ 118 | try{ 119 | if(f.getName().equals("-")){ 120 | isStream = true; 121 | pfa.setIn(System.in); 122 | } else { 123 | pfa.setIn(openFile(f)); 124 | } 125 | pfa.process(); 126 | // TODO process results 127 | System.out.println("Results for " + f ); 128 | System.out.println("Total number of sequences: " + pfa.getLen().size()); 129 | System.out.println("Total length of sequences: " + pfa.getTotalCnt() + " bp"); 130 | // Sort 131 | Collections.sort(pfa.getLen()); 132 | System.out.println("Shortest sequence length : " + (pfa.getLen().isEmpty()?0:pfa.getLen().get(0)) + " bp"); 133 | 134 | // Reverse 135 | Collections.reverse(pfa.getLen()); 136 | System.out.println("Longest sequence length : " + (pfa.getLen().isEmpty()?0:pfa.getLen().get(0))+ " bp"); 137 | 138 | gSize = 0; 139 | if(genomeSize < 0){ 140 | gSize = pfa.getTotalCnt(); 141 | } else { 142 | gSize = genomeSize; 143 | System.out.println("-> with a provided genome size of: " + gSize + " bp"); 144 | } 145 | System.out.println("Total number of Ns in sequences: " + pfa.getNsCnt()); 146 | for(CalculateGNx nx : nxList){ 147 | nx.printResults(System.out,pfa.getLen(), gSize); 148 | } 149 | if(!isStream){ 150 | pfa.getIn().close(); 151 | } 152 | pfa.setIn(null); 153 | } finally{ 154 | if(pfa.getIn() != null){ 155 | try{ 156 | pfa.getIn().close(); 157 | } catch (Exception e) { 158 | // ignore 159 | } 160 | pfa.setIn(null); 161 | } 162 | } 163 | System.out.println(""); 164 | pfa.reset(); 165 | } 166 | } 167 | 168 | private static InputStream openFile(File f) throws IOException { 169 | InputStream in = new FileInputStream(f); 170 | if(f.getName().endsWith(".gz") || f.getName().endsWith(".gzip")){ 171 | in = new GZIPInputStream(in); 172 | } 173 | in = new BufferedInputStream(in); 174 | return in; 175 | } 176 | 177 | private static void fail(String msg) { 178 | System.err.println(msg); 179 | System.exit(1); 180 | } 181 | 182 | 183 | 184 | 185 | } 186 | --------------------------------------------------------------------------------