├── README
├── package.xml
└── src
└── uk
└── ac
└── ebi
└── gnx
├── ParseFasta.java
└── CalculateGNx.java
/README:
--------------------------------------------------------------------------------
1 | Basic genome assembly statistics tool to calculate Nx values, e.g. N50, N10, NG50
2 |
3 | INSTALLATION
4 |
5 | a) Download git repository
6 | b) Compile java classes
7 | # in gnx-tools folder run
8 | mkdir bin
9 | javac -d bin/ src/uk/ac/ebi/gnx/*
10 | c) package jar file
11 | # in gnx-tools folder run
12 | ant -f package.xml
13 | d) execute jar file
14 | java -jar gnx.jar
15 |
16 | DONE
17 |
18 |
19 |
--------------------------------------------------------------------------------
/package.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/src/uk/ac/ebi/gnx/ParseFasta.java:
--------------------------------------------------------------------------------
1 | /**
2 | * File: ParseFasta.java
3 | * Created by: mhaimel
4 | * Created on: Feb 23, 2012
5 | * CVS: $Id: ParseFasta.java 1.0 Feb 23, 2012 2:29:04 PM mhaimel Exp $
6 | */
7 | package uk.ac.ebi.gnx;
8 |
9 | import java.io.IOException;
10 | import java.io.InputStream;
11 | import java.util.ArrayList;
12 | import java.util.List;
13 |
/**
 * Streaming FASTA reader that collects per-sequence length statistics.
 *
 * Usage: set an input stream with {@link #setIn(InputStream)}, call
 * {@link #process()}, then read the results through {@link #getLen()},
 * {@link #getTotalCnt()} and {@link #getNsCnt()}. Call {@link #reset()} before
 * re-using the instance for another input.
 *
 * Only the characters A, C, G, T and N (either case) count towards a
 * sequence's length; line breaks are skipped and every other byte is ignored.
 *
 * The fields are declared volatile, but the counters are updated with
 * non-atomic compound assignments, so one instance must not be driven from
 * several threads at the same time.
 *
 * @author mhaimel
 */
public class ParseFasta {
    private volatile InputStream in = null;

    /** Lengths of all accepted sequences, in the order they were read. */
    private volatile List<Integer> len = new ArrayList<Integer>();
    /** Number of N/n characters in accepted sequences. */
    private volatile long nsCnt = 0;
    /** Sum of the lengths of all accepted sequences. */
    private volatile long totalCnt = 0;

    /** Sequences shorter than this are dropped from all statistics. */
    private long minLen = 0;

    public ParseFasta() {
        // empty
    }

    public long getNsCnt() {
        return nsCnt;
    }

    public void setNsCnt(long nsCnt) {
        this.nsCnt = nsCnt;
    }

    public long getTotalCnt() {
        return totalCnt;
    }

    public void setTotalCnt(long totalCnt) {
        this.totalCnt = totalCnt;
    }

    public void setMinLen(long minLen) {
        this.minLen = minLen;
    }

    public InputStream getIn() {
        return in;
    }

    public void setIn(InputStream in) {
        this.in = in;
    }

    public void setLen(List<Integer> len) {
        this.len = len;
    }

    /**
     * @return the live, mutable list of accepted sequence lengths (callers may
     *         sort it in place)
     */
    public List<Integer> getLen() {
        return len;
    }

    public long getMinLen() {
        return minLen;
    }

    /**
     * Reads the configured input stream to its end and accumulates the
     * statistics of every sequence found. The stream is not closed.
     *
     * @throws IOException           if reading the stream fails
     * @throws IllegalStateException if no input stream has been set
     */
    public void process() throws IOException {
        final InputStream stream = this.in;
        if (stream == null) {
            throw new IllegalStateException("No input stream set - call setIn() before process()");
        }
        int currSeqCnt = 0;
        int currNsCnt = 0;
        while (true) {
            switch (stream.read()) {
            case -1:
                // EOF: flush the last record and stop. The stream is never read
                // again after it reported EOF, which could block on a terminal.
                recordSequence(currSeqCnt, currNsCnt);
                return;
            case '>':
                // A header closes the previous record and opens a new one.
                recordSequence(currSeqCnt, currNsCnt);
                currSeqCnt = 0;
                currNsCnt = 0;
                if (!skipRestOfLine(stream)) {
                    // EOF inside the header line: nothing is pending.
                    return;
                }
                break;
            case '\r':
            case '\n':
                // line breaks inside a sequence carry no information
                break;
            case 'n':
            case 'N':
                ++currNsCnt;
                // fall through: an N also counts towards the sequence length
            case 'a':
            case 'A':
            case 't':
            case 'T':
            case 'g':
            case 'G':
            case 'c':
            case 'C':
                ++currSeqCnt;
                break;
            default:
                // any other character is ignored
                break;
            }
        }
    }

    /**
     * Consumes the stream up to and including the next line feed.
     *
     * @return true if a line feed was found, false if EOF was hit first
     */
    private static boolean skipRestOfLine(InputStream stream) throws IOException {
        while (true) {
            switch (stream.read()) {
            case -1:
                return false;
            case '\n':
                return true;
            default:
                break;
            }
        }
    }

    /**
     * Stores a finished sequence; empty records are skipped and the Ns only
     * count if the sequence passed the minimum-length filter.
     */
    private void recordSequence(int seqLen, int nCount) {
        if (seqLen > 0 && addLength(seqLen)) {
            this.nsCnt += nCount;
        }
    }

    /**
     * Adds a sequence length to the statistics if it reaches the minimum length.
     *
     * @return true if the sequence was accepted
     */
    private boolean addLength(int seqLen) {
        if (seqLen >= this.minLen) {
            this.totalCnt += seqLen;
            this.len.add(seqLen);
            return true;
        }
        return false;
    }

    /**
     * Clears all collected statistics and forgets the input stream (without
     * closing it). The minimum length setting is kept.
     */
    public void reset() {
        this.nsCnt = 0;
        this.totalCnt = 0;
        this.len.clear();
        this.in = null;
    }
}
133 |
--------------------------------------------------------------------------------
/src/uk/ac/ebi/gnx/CalculateGNx.java:
--------------------------------------------------------------------------------
1 | /**
2 | * File: CalculateGNx.java
3 | * Created by: mhaimel
4 | * Created on: Feb 22, 2012
5 | * CVS: $Id: CalculateGNx.java 1.0 Feb 22, 2012 4:40:13 PM mhaimel Exp $
6 | */
7 | package uk.ac.ebi.gnx;
8 |
9 | import java.io.BufferedInputStream;
10 | import java.io.File;
11 | import java.io.FileInputStream;
12 | import java.io.IOException;
13 | import java.io.InputStream;
14 | import java.io.PrintStream;
15 | import java.util.ArrayList;
16 | import java.util.Collections;
17 | import java.util.List;
18 | import java.util.zip.GZIPInputStream;
19 |
20 | /**
21 | * @author mhaimel
22 | *
23 | */
24 | public class CalculateGNx {
25 |
26 | private final double nxPosition; // default
27 |
28 | public CalculateGNx() {
29 | this(0.5);
30 | }
31 |
32 | public CalculateGNx(double position) {
33 | this.nxPosition = position;
34 | }
35 |
36 | private void printResults(PrintStream out, List revSortedList, long total) {
37 | int tCnt = revSortedList.size();
38 | long tSum = total;
39 | // for(Integer i : revSortedList){
40 | // tSum += i;
41 | // }
42 | double cOff = ((double)tSum)*this.nxPosition;
43 |
44 | long sum = 0;
45 | int i = 0;
46 | Integer sLen = 0;
47 | for(i = 0; i < tCnt; ++i){
48 | sLen = revSortedList.get(i);
49 | sum += sLen;
50 | if(sum >= cOff){
51 | break;
52 | }
53 | }
54 | out.println(
55 | "N"
56 | +Double.valueOf(this.nxPosition * 100).intValue()+":\t"
57 | + sLen
58 | + "\t("+(i+1)+" sequences)"
59 | + "\t("+sum +" bp combined)");
60 | }
61 |
62 |
63 | private static void printHelp(PrintStream out) {
64 | out.println("gnx [-min ] [-nx 25,50,75] [-g ] ");
65 | out.println("-min Minimum bp length of a sequence to be considered");
66 | out.println("-nx Nx values to be printed seperated by ',' e.g. 50 for N50, 25 for N25");
67 | out.println("-g genome size to be used to calculte Nx values");
68 | out.println(" ");
69 | out.println(" o /path/to/file.fa");
70 | out.println(" o use '-' for standard input");
71 | out.println(" o file-a.fa file-b.fa for a list of files");
72 | }
73 |
74 | /**
75 | * @param args
76 | * @throws IOException
77 | */
78 | public static void main(String[] args) throws IOException {
79 | if(args.length == 0){
80 | System.err.println("Please provide an input!");
81 | printHelp(System.err);
82 | fail("Please provide an input!");
83 | }
84 |
85 | List nxList = new ArrayList();
86 | nxList.add(new CalculateGNx());
87 | ParseFasta pfa = new ParseFasta();
88 | long genomeSize = -1;
89 | List list = new ArrayList();
90 | for(int i = 0; i < args.length; ++i){
91 | String s = args[i];
92 | if(s.equals("-min")){
93 | pfa.setMinLen(Long.valueOf(args[++i]));
94 | } else if(s.equals("-g")){
95 | genomeSize = Long.valueOf(args[++i]);
96 | } else if(s.equals("-nx")){
97 | nxList.clear();
98 | for(String v : args[++i].split(",")){
99 | if(v.length() > 0){
100 | nxList.add(new CalculateGNx(Double.valueOf(v)/100));
101 | }
102 | }
103 | } else if(s.equals("-")){
104 | list.add(new File("-"));
105 | } else {
106 | File f = new File(s);
107 | if(!f.isFile()){
108 | fail(s + " is not a file!!!");
109 | } else if(!f.canRead()){
110 | fail(s + " is not Readable!!!");
111 | }
112 | list.add(f);
113 | }
114 | }
115 | boolean isStream = false;
116 | long gSize = 0;
117 | for(File f : list){
118 | try{
119 | if(f.getName().equals("-")){
120 | isStream = true;
121 | pfa.setIn(System.in);
122 | } else {
123 | pfa.setIn(openFile(f));
124 | }
125 | pfa.process();
126 | // TODO process results
127 | System.out.println("Results for " + f );
128 | System.out.println("Total number of sequences: " + pfa.getLen().size());
129 | System.out.println("Total length of sequences: " + pfa.getTotalCnt() + " bp");
130 | // Sort
131 | Collections.sort(pfa.getLen());
132 | System.out.println("Shortest sequence length : " + (pfa.getLen().isEmpty()?0:pfa.getLen().get(0)) + " bp");
133 |
134 | // Reverse
135 | Collections.reverse(pfa.getLen());
136 | System.out.println("Longest sequence length : " + (pfa.getLen().isEmpty()?0:pfa.getLen().get(0))+ " bp");
137 |
138 | gSize = 0;
139 | if(genomeSize < 0){
140 | gSize = pfa.getTotalCnt();
141 | } else {
142 | gSize = genomeSize;
143 | System.out.println("-> with a provided genome size of: " + gSize + " bp");
144 | }
145 | System.out.println("Total number of Ns in sequences: " + pfa.getNsCnt());
146 | for(CalculateGNx nx : nxList){
147 | nx.printResults(System.out,pfa.getLen(), gSize);
148 | }
149 | if(!isStream){
150 | pfa.getIn().close();
151 | }
152 | pfa.setIn(null);
153 | } finally{
154 | if(pfa.getIn() != null){
155 | try{
156 | pfa.getIn().close();
157 | } catch (Exception e) {
158 | // ignore
159 | }
160 | pfa.setIn(null);
161 | }
162 | }
163 | System.out.println("");
164 | pfa.reset();
165 | }
166 | }
167 |
168 | private static InputStream openFile(File f) throws IOException {
169 | InputStream in = new FileInputStream(f);
170 | if(f.getName().endsWith(".gz") || f.getName().endsWith(".gzip")){
171 | in = new GZIPInputStream(in);
172 | }
173 | in = new BufferedInputStream(in);
174 | return in;
175 | }
176 |
177 | private static void fail(String msg) {
178 | System.err.println(msg);
179 | System.exit(1);
180 | }
181 |
182 |
183 |
184 |
185 | }
186 |
--------------------------------------------------------------------------------