├── README.md └── calN50.js /README.md: -------------------------------------------------------------------------------- 1 | ## Getting Started 2 | 3 | ```sh 4 | ## If you can run paftools.js from minimap2, you already have k8 installed. If not: 5 | 6 | ## install k8 without conda: 7 | # curl -L https://github.com/attractivechaos/k8/releases/download/v0.2.4/k8-0.2.4.tar.bz2 | tar -jxf - 8 | # cp k8-0.2.4/k8-`uname -s` k8 # or copy it to a directory on your $PATH 9 | 10 | ## install k8 via bioconda: 11 | # conda install -c bioconda minimap2 # k8 comes with minimap2 12 | 13 | k8 calN50.js ctg.fa # compute auN and N50 from FASTA 14 | ./calN50.js ctg.fa.fai # faidx index (assuming k8 on $PATH) 15 | calN50.js graph.gfa.gz # if k8 and calN50.js are on $PATH 16 | calN50.js -L3.1g ctg.fa.fai # compute auNG and NG50 for a 3.1Gbp genome 17 | calN50.js -f ref.fa.fai ctg.fa # or get the genome size from a .fai file 18 | ``` 19 | 20 | ## Introduction 21 | 22 | calN50.js is a simple script to calculate N50/NG50 and [auN][auN]/auNG. It is 23 | fast and seamlessly works with FASTA, GFA1 and faidx formats. 
// calN50.js -- compute N50/NG50 and auN/auNG from FASTA, GFA1 or faidx (.fai) input.
//
// Run as `k8 calN50.js [options] <input>`. To execute the file directly it must
// begin with a `#!/usr/bin/env k8` line. The script uses names it does not define
// itself and expects the k8 shell to supply: File, Bytes, print, warn, exit and
// the global `arguments` array.
//
// auN is described at http://lh3.github.io/2020/04/08/a-new-metric-on-assembly-contiguity

var version = "r4";

/**
 * Minimal getopt-style command-line option scanner.
 *
 * Scanning state is kept on the function object itself:
 *   getopt.ind    index of the element of `args` being examined
 *   getopt.arg    argument of the option just returned, or null
 *   getopt.place  offset inside args[getopt.ind]; -1 means "start a new element"
 * Delete getopt.place to restart scanning from the first element.
 *
 * @param args  array of command-line strings
 * @param ostr  option letters; a letter followed by ':' takes an argument
 * @returns the option letter; '?' for an unknown option or a missing argument
 *          (':' for a missing argument when ostr starts with ':'); null when
 *          the first non-option element, a lone "-" or "--" is reached.
 */
var getopt = function(args, ostr) {
	var oli; // index of the option letter within ostr
	if (typeof(getopt.place) == 'undefined')
		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
	if (getopt.place == -1) { // start scanning a new element of args
		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
			getopt.place = -1;
			return null;
		}
		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
			++getopt.ind;
			getopt.place = -1;
			return null;
		}
	}
	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
		if (optopt == '-') { // a lone "-" is an operand (stdin), not an option
			getopt.place = -1;
			return null;
		}
		// Move on to the next element once this one is exhausted. Without this,
		// the next call would read an empty option character, which indexOf()
		// finds at position 0 of ostr, and the following element would be
		// consumed as its argument.
		if (getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
		return '?';
	}
	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
		getopt.arg = null;
		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
	} else { // need an argument
		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
			getopt.arg = args[getopt.ind].substr(getopt.place); // "-l100" form
		else if (args.length <= ++getopt.ind) { // no arg
			getopt.place = -1;
			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
			return '?';
		} else getopt.arg = args[getopt.ind]; // "-l 100" form
		getopt.place = -1;
		++getopt.ind;
	}
	return optopt;
}

/**
 * Parse a non-negative number with an optional k/m/g suffix (x1e3, x1e6, x1e9),
 * e.g. "3.1g". Text after the number and suffix is ignored.
 *
 * @param s  string to parse
 * @returns the value rounded to an integer, or NaN when s does not start with a number
 */
function parseNum(s) {
	var m = /^(\d+\.?\d*|\.\d+)([mMgGkK]?)/.exec(s);
	if (m == null) return NaN;
	var x = parseFloat(m[1]);
	if (m[2] == 'k' || m[2] == 'K') x *= 1000;
	else if (m[2] == 'm' || m[2] == 'M') x *= 1000000;
	else if (m[2] == 'g' || m[2] == 'G') x *= 1000000000;
	return Math.floor(x + .499);
}

/**
 * Sum the second column of a tab-delimited file (a faidx index).
 * Lines without an integer in the second column are skipped.
 *
 * @param fn   file name
 * @param buf  Bytes object used as the line buffer
 * @returns total length
 */
function readFaiTotal(fn, buf) {
	var file = new File(fn), tot = 0;
	while (file.readline(buf) >= 0) {
		var t = buf.toString().split("\t");
		if (t.length < 2) continue;
		var l = parseInt(t[1], 10);
		if (!isNaN(l)) tot += l;
	}
	file.close();
	return tot;
}

/**
 * Consume one non-empty input line and update the parsing state.
 *
 * The format is detected line by line: a '>' header switches to FASTA mode;
 * otherwise a GFA S-line is tried first, then a "name<TAB>length" line.
 *
 * @param st  state object {a, is_fa, is_gfa, name, len}; st.a collects [name, length] pairs
 * @param s   the line, without its terminator
 */
function addSeqLine(st, s) {
	var m;
	if (s.charAt(0) == '>') { // fasta header
		if ((m = /^>(\S+)/.exec(s)) != null) {
			if (st.name != null) st.a.push([st.name, st.len]);
			st.is_fa = true, st.name = m[1], st.len = 0;
		}
	} else if (st.is_fa) { // fasta sequence line
		st.len += s.length;
	} else if ((m = /^S\t(\S+)\t(?:([a-zA-Z]+)|\*.*\tLN:i:(\d+))/.exec(s)) != null) { // GFA S-line
		// The alternation is grouped so that both forms are anchored to "S<TAB>name<TAB>":
		// either an explicit sequence, or "*" with the length in an LN:i: tag.
		st.is_gfa = true;
		st.a.push([m[1], m[3] != null? parseInt(m[3], 10) : m[2].length]);
	} else if (!st.is_gfa && (m = /^(\S+)\t(\d+)/.exec(s)) != null) { // length line
		st.a.push([m[1], parseInt(m[2], 10)]);
	}
}

/**
 * Read sequence lengths from a FASTA, GFA or "name<TAB>length" file.
 *
 * @param fn   file name; "-" constructs File with no argument (the README's
 *             usage suggests this is standard input)
 * @param buf  Bytes object used as the line buffer
 * @returns array of [name, length] pairs in input order
 */
function readSeqLengths(fn, buf) {
	var file = fn == '-'? new File() : new File(fn);
	var st = { a: [], is_fa: false, is_gfa: false, name: null, len: 0 };
	while (file.readline(buf) >= 0) {
		if (buf.length == 0) continue;
		addSeqLine(st, buf.toString());
	}
	file.close();
	// Flush the last FASTA record; an empty one is kept, like empty records
	// that are followed by another header.
	if (st.is_fa && st.name != null)
		st.a.push([st.name, st.len]);
	return st.a;
}

/**
 * Program entry point: parse options, read the input and print the report.
 *
 * @param args  command-line arguments (options first, then one input file)
 * @returns 0 on success, 1 on error; calls exit() after printing the usage
 *          message or the version
 */
function main(args) {
	var c, step = 0.1, min_len = 0, tot_len = null, fn_fai = null;
	while ((c = getopt(args, "l:L:s:f:v")) != null) {
		if (c == 's') step = parseFloat(getopt.arg);
		else if (c == 'l') min_len = parseNum(getopt.arg);
		else if (c == 'L') tot_len = parseNum(getopt.arg);
		else if (c == 'f') fn_fai = getopt.arg;
		else if (c == 'v') {
			print(version);
			exit(0);
		} else warn("WARNING: ignoring an unrecognized option or an option without its argument");
	}

	// Test the position after option parsing, not args.length: options may
	// have been given without any input file.
	if (getopt.ind >= args.length) {
		print("Usage: calN50.js [options] <in.fa>|<in.gfa>|<in.fai>");
		print("Options:");
		print(" -l NUM min length [0]");
		print(" -L NUM total length for NGx []");
		print(" -f FILE reference .fai file for NGx []");
		print(" -s FLOAT N50 step size [" + step + "]");
		print(" -v print version number");
		exit(args.length == 0? 0 : 1);
	}

	if (!(step > 0)) { // also rejects NaN; a step <= 0 would never leave the NL loop
		warn("ERROR: -s requires a positive number");
		return 1;
	}
	if (isNaN(min_len)) {
		warn("ERROR: -l requires a number");
		return 1;
	}
	if (tot_len != null && isNaN(tot_len)) {
		warn("ERROR: -L requires a number");
		return 1;
	}

	var buf = new Bytes();
	if (fn_fai) tot_len = readFaiTotal(fn_fai, buf); // -f overrides -L
	var a = readSeqLengths(args[getopt.ind], buf);
	buf.destroy();

	if (a.length == 0) {
		warn("ERROR: no sequences found");
		return 1;
	}

	a.sort(function(x,y) { return y[1]-x[1] }); // longest first
	if (min_len > 0) {
		// The array is sorted, so the sequences to keep form a prefix.
		var n_keep = 0;
		while (n_keep < a.length && a[n_keep][1] >= min_len) ++n_keep;
		a.length = n_keep;
		if (a.length == 0) {
			warn("ERROR: no sequences of length " + min_len + " or longer");
			return 1;
		}
	}

	print("CC\tGS genome_size_if_provided");
	print("CC\tSZ total_sequence_length");
	print("CC\tNN number_of_sequences");
	print("CC\tNL x Nx Lx");
	print("CC\tAU auN");
	print("CC");

	var i, sum = 0;
	for (i = 0; i < a.length; ++i)
		sum += a[i][1];
	if (tot_len != null) print("GS", tot_len);
	print("SZ", sum);
	print("NN", a.length);
	if (tot_len != null) sum = tot_len; // NGx/auNG: measure against the given size

	// n: sequences used so far; x: their cumulative length; next: next
	// fraction to report; y: running auN (sum of l*l/sum).
	var n = 0, x = 0, next = 0, y = 0;
	for (i = 0; i < a.length; ++i) {
		if (x >= sum) break;
		// Clip the last contribution so that no more than `sum` bases are counted.
		var l = x + a[i][1] <= sum? a[i][1] : sum - x;
		y += l * (l / sum);
		x += a[i][1], ++n;
		// The 0.01 slack absorbs rounding error accumulated in `next`. Fractions
		// above 1 are not reported: they can only be reached when the target
		// size is smaller than the input, and Nx is undefined for x > 100.
		while (next <= 1 + 1e-9 && x > sum * next - 0.01) {
			print("NL", Math.floor(next*100.0+.499), a[i][1], n);
			next += step;
		}
	}
	print("AU", y.toFixed(0));
	return 0;
}

// Run only when the shell-provided globals exist, so that the file can be
// loaded by other tools without being executed.
if (typeof arguments !== 'undefined' && typeof exit !== 'undefined')
	exit(main(arguments));