├── tests ├── .gitignore ├── test1.fa ├── test2.fa ├── exp.test1.0.txt ├── exp.test2.0.txt ├── exp.test2.2.txt ├── test3.fa ├── exp.test3.2.txt ├── exp.test3.3.txt ├── view_test.sh └── Makefile ├── disty.xcodeproj ├── project.xcworkspace │ └── contents.xcworkspacedata └── project.pbxproj ├── Makefile ├── .gitignore ├── LICENSE ├── readme.md └── src ├── kseq.h └── main.cpp /tests/.gitignore: -------------------------------------------------------------------------------- 1 | obs.*.txt 2 | -------------------------------------------------------------------------------- /tests/test1.fa: -------------------------------------------------------------------------------- 1 | >seq1 2 | AAA 3 | >seq2 4 | AaA 5 | >seq3 6 | aaA 7 | -------------------------------------------------------------------------------- /tests/test2.fa: -------------------------------------------------------------------------------- 1 | >seq1 2 | AAA 3 | >seq2 4 | CCC 5 | >seq3 6 | ACN 7 | -------------------------------------------------------------------------------- /tests/exp.test1.0.txt: -------------------------------------------------------------------------------- 1 | seq1 seq2 seq3 2 | seq1 0 0 0 3 | seq2 0 0 0 4 | seq3 0 0 0 5 | -------------------------------------------------------------------------------- /tests/exp.test2.0.txt: -------------------------------------------------------------------------------- 1 | seq1 seq2 seq3 2 | seq1 0 3 1 3 | seq2 3 0 1 4 | seq3 1 1 0 5 | -------------------------------------------------------------------------------- /tests/exp.test2.2.txt: -------------------------------------------------------------------------------- 1 | seq1 seq2 seq3 2 | seq1 0 2 1 3 | seq2 2 0 1 4 | seq3 1 1 0 5 | -------------------------------------------------------------------------------- /tests/test3.fa: -------------------------------------------------------------------------------- 1 | >seq1 2 | AAA 3 | >seq2 4 | AAA 5 | >seq3 6 | AAC 7 | >seq4 8 | CNN 9 | 
-------------------------------------------------------------------------------- /tests/exp.test3.2.txt: -------------------------------------------------------------------------------- 1 | seq1 seq2 seq3 seq4 2 | seq1 0 0 0 1 3 | seq2 0 0 0 1 4 | seq3 0 0 0 1 5 | seq4 1 1 1 0 6 | -------------------------------------------------------------------------------- /tests/exp.test3.3.txt: -------------------------------------------------------------------------------- 1 | seq1 seq2 seq3 seq4 2 | seq1 0 0 1 1 3 | seq2 0 0 1 1 4 | seq3 1 1 0 2 5 | seq4 1 1 2 0 6 | -------------------------------------------------------------------------------- /disty.xcodeproj/project.xcworkspace/contents.xcworkspacedata: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /tests/view_test.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | set -u 4 | set -e 5 | set -o pipefail 6 | 7 | readonly PROGNAME=$(basename $0) 8 | readonly PROGDIR=$(dirname $0) 9 | readonly ARGS="$@" 10 | readonly NARGS="$#" 11 | 12 | if [ $NARGS -ne 1 ]; then 13 | echo "usage: $PROGNAME options" 14 | exit 1 15 | fi 16 | 17 | head {test$1*,exp.test$1*} 18 | 19 | 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXX ?= g++ 2 | CXXFLAGS = -std=c++11 -Wall -Wextra -Wno-missing-field-initializers -g -O2 3 | LIBS = -lm -lz -lpthread 4 | 5 | PREFIX = $(DESTDIR)/usr/local 6 | BINDIR = $(PREFIX)/bin 7 | 8 | ofiles = src/main.cpp.o 9 | hfiles = $(wildcard src/*.h) 10 | 11 | .PHONY: all clean install 12 | 13 | all: disty 14 | 15 | install: disty 16 | install disty $(BINDIR)/disty 17 | 18 | disty: $(ofiles) 19 | $(CXX) $(CXXFLAGS) $(DFLAGS) $(ofiles) -o $@ -L. 
$(LIBS) 20 | 21 | src/%.cpp.o: src/%.cpp $(hfiles) 22 | $(CXX) $(CXXFLAGS) $(DFLAGS) -c $< -o $@ 23 | 24 | clean: 25 | rm -f src/*.o 26 | rm -f disty 27 | 28 | -------------------------------------------------------------------------------- /tests/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all clean 2 | 3 | SHELL=/usr/bin/env bash -euc -o pipefail 4 | 5 | .SECONDARY: 6 | 7 | DM=../disty 8 | SEQ=$(wildcard *.fa) 9 | 10 | all: 11 | @for seq in $(SEQ); do \ 12 | pref=$${seq%.*}; \ 13 | for s in $$(seq 0 4); do \ 14 | CMD="$(DM) -s $$s $$seq"; \ 15 | EXP="exp.$${pref}.$$s.txt"; \ 16 | OBS="obs.$${pref}.$$s.txt"; \ 17 | if [[ -f "$$EXP" ]]; then \ 18 | echo ; \ 19 | echo "$$CMD"; \ 20 | echo "======================================"; \ 21 | echo ; \ 22 | echo $$EXP; \ 23 | echo $$OBS; \ 24 | echo ; \ 25 | $$CMD > "$$OBS" 2> /dev/null; \ 26 | diff "$$OBS" "$$EXP"; \ 27 | echo "OK"; \ 28 | echo; \ 29 | fi; \ 30 | done; \ 31 | done; 32 | 33 | clean: 34 | rm -f obs.*.txt 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | disty 2 | 3 | # Xcode 4 | 5 | ## Build generated 6 | build/ 7 | DerivedData/ 8 | 9 | ## Various settings 10 | *.pbxuser 11 | !default.pbxuser 12 | *.mode1v3 13 | !default.mode1v3 14 | *.mode2v3 15 | !default.mode2v3 16 | *.perspectivev3 17 | !default.perspectivev3 18 | xcuserdata/ 19 | 20 | ## Other 21 | *.moved-aside 22 | *.xccheckout 23 | *.xcscmblueprint 24 | 25 | 26 | # Prerequisites 27 | *.d 28 | 29 | # Compiled Object files 30 | *.slo 31 | *.lo 32 | *.o 33 | *.obj 34 | 35 | # Precompiled Headers 36 | *.gch 37 | *.pch 38 | 39 | # Compiled Dynamic libraries 40 | *.so 41 | *.dylib 42 | *.dll 43 | 44 | # Fortran module files 45 | *.mod 46 | *.smod 47 | 48 | # Compiled Static libraries 49 | *.lai 50 | *.la 51 | *.a 52 | *.lib 53 | 54 | # Executables 55 | *.exe 56 | 
*.out 57 | *.app 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Karel Brinda 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Disty McMatrixface 2 | 3 | Compute a distance matrix from a core genome alignment file. 
4 | 5 | 6 | ## Prerequisites 7 | 8 | * GCC 4.8+ or equivalent 9 | * ZLib 10 | 11 | 12 | ## Getting started 13 | 14 | ```bash 15 | git clone https://github.com/c2-d2/disty 16 | cd disty && make 17 | ./disty tests/test2.fa 18 | ``` 19 | 20 | Output: 21 | ``` 22 | seq1 seq2 seq3 23 | seq1 0 3 1 24 | seq2 3 0 1 25 | seq3 1 1 0 26 | 27 | ``` 28 | 29 | ## Installation 30 | 31 | **Using Bioconda:** 32 | 33 | ```bash 34 | conda config --add channels defaults 35 | conda config --add channels conda-forge 36 | conda config --add channels bioconda 37 | 38 | conda install disty 39 | ``` 40 | 41 | **Using brew:** 42 | 43 | ```bash 44 | brew install tseemann/bioinformatics-linux/disty 45 | ``` 46 | 47 | **From GitHub:** 48 | 49 | ```bash 50 | git clone https://github.com/c2-d2/disty 51 | cd disty && make && make install 52 | ``` 53 | 54 | 55 | ## Command line parameters 56 | 57 | ``` 58 | Usage: disty 59 | 60 | Options: 61 | -n FLOAT skip columns having frequency of N > FLOAT [1.00] 62 | -i INT input format [0] 63 | 0: ACGT 64 | 1: 01 65 | -s INT strategy to deal with N's [0] 66 | 0: ignore pairwisely 67 | 1: ignore pairwisely and normalize 68 | 2: ignore globally 69 | 3: replace by the major allele 70 | 4: replace by the closest individual (not implemented yet) 71 | -h print help message and exit 72 | -v print version and exit 73 | ``` 74 | 75 | 76 | ## Author 77 | 78 | [Karel Brinda](http://brinda.cz) <kbrinda@hsph.harvard.edu> 79 | -------------------------------------------------------------------------------- /src/kseq.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | 
distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Last Modified: 05MAR2012 */ 27 | 28 | #ifndef AC_KSEQ_H 29 | #define AC_KSEQ_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 36 | #define KS_SEP_TAB 1 // isspace() && !' 
' 37 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) 38 | #define KS_SEP_MAX 2 39 | 40 | /* __KS_TYPE: declares kstream_t, a buffered reader over a handle f of type type_t. buf[begin..end) holds the bytes not yet consumed; is_eof is set once the read function has returned 0 or -1. */ #define __KS_TYPE(type_t) \ 41 | typedef struct __kstream_t { \ 42 | unsigned char *buf; \ 43 | int begin, end, is_eof; \ 44 | type_t f; \ 45 | } kstream_t; 46 | 47 | /* ks_err: the last refill returned -1. ks_eof: input is exhausted and the buffer is fully consumed. ks_rewind: clears the buffer state only; it does not touch the handle f. */ #define ks_err(ks) ((ks)->end == -1) 48 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) 49 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) 50 | 51 | /* __KS_BASIC: defines ks_init (allocates a zeroed kstream_t and a __bufsize-byte buffer; the allocation results are not checked) and ks_destroy (frees both; a null pointer is accepted). */ #define __KS_BASIC(type_t, __bufsize) \ 52 | static inline kstream_t *ks_init(type_t f) \ 53 | { \ 54 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ 55 | ks->f = f; \ 56 | ks->buf = (unsigned char*)malloc(__bufsize); \ 57 | return ks; \ 58 | } \ 59 | static inline void ks_destroy(kstream_t *ks) \ 60 | { \ 61 | if (ks) { \ 62 | free(ks->buf); \ 63 | free(ks); \ 64 | } \ 65 | } 66 | 67 | /* __KS_GETC: defines ks_getc, which returns the next byte as a non-negative int and refills the buffer through __read when it is empty. Returns -1 at end of input and -3 after a read error. */ #define __KS_GETC(__read, __bufsize) \ 68 | static inline int ks_getc(kstream_t *ks) \ 69 | { \ 70 | if (ks_err(ks)) return -3; \ 71 | if (ks->is_eof && ks->begin >= ks->end) return -1; \ 72 | if (ks->begin >= ks->end) { \ 73 | ks->begin = 0; \ 74 | ks->end = __read(ks->f, ks->buf, __bufsize); \ 75 | if (ks->end == 0) { ks->is_eof = 1; return -1;} \ 76 | if (ks->end == -1) { ks->is_eof = 1; return -3;}\ 77 | } \ 78 | return (int)ks->buf[ks->begin++]; \ 79 | } 80 | 81 | #ifndef KSTRING_T 82 | #define KSTRING_T kstring_t 83 | /* kstring_t: growable string. As used by ks_getuntil2 below, l is the current length and m the allocated size of s. */ typedef struct __kstring_t { 84 | size_t l, m; 85 | char *s; 86 | } kstring_t; 87 | #endif 88 | 89 | #ifndef kroundup32 90 | /* kroundup32: rounds x up, in place, to a power of two by smearing the highest set bit downwards. The shifts stop at 16, so the result is only a power of two for values that fit in 32 bits. */ #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 91 | #endif 92 | 93 | /* __KS_GETUNTIL: defines ks_getuntil2, which copies bytes from the stream into str up to, but not including, the first delimiter; the delimiter itself is consumed. delimiter is KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE, or a literal character code greater than KS_SEP_MAX. With append == 0 str is overwritten, otherwise it is extended. *dret (when non-null) receives the delimiter byte that was found, or 0 if the input ended first. Returns the length of str, -1 if nothing was read and the stream is at end of input, -3 on a read error. For KS_SEP_LINE a trailing '\r' is dropped when the string is longer than one character. Also defines ks_getuntil, the non-appending form. */ #define __KS_GETUNTIL(__read, __bufsize) \ 94 | static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ 95 | { \ 96 | int gotany = 0; \ 97 | if (dret) *dret = 0; \ 98 | str->l = append? 
str->l : 0; \ 99 | for (;;) { \ 100 | int i; \ 101 | if (ks_err(ks)) return -3; \ 102 | if (ks->begin >= ks->end) { \ 103 | if (!ks->is_eof) { \ 104 | ks->begin = 0; \ 105 | ks->end = __read(ks->f, ks->buf, __bufsize); \ 106 | if (ks->end == 0) { ks->is_eof = 1; break; } \ 107 | if (ks->end == -1) { ks->is_eof = 1; return -3; } \ 108 | } else break; \ 109 | } \ 110 | if (delimiter == KS_SEP_LINE) { \ 111 | for (i = ks->begin; i < ks->end; ++i) \ 112 | if (ks->buf[i] == '\n') break; \ 113 | } else if (delimiter > KS_SEP_MAX) { \ 114 | for (i = ks->begin; i < ks->end; ++i) \ 115 | if (ks->buf[i] == delimiter) break; \ 116 | } else if (delimiter == KS_SEP_SPACE) { \ 117 | for (i = ks->begin; i < ks->end; ++i) \ 118 | if (isspace(ks->buf[i])) break; \ 119 | } else if (delimiter == KS_SEP_TAB) { \ 120 | for (i = ks->begin; i < ks->end; ++i) \ 121 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 122 | } else i = 0; /* never come to here! */ \ 123 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ 124 | str->m = str->l + (i - ks->begin) + 1; \ 125 | kroundup32(str->m); \ 126 | str->s = (char*)realloc(str->s, str->m); \ 127 | } \ 128 | gotany = 1; \ 129 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 130 | str->l = str->l + (i - ks->begin); \ 131 | ks->begin = i + 1; \ 132 | if (i < ks->end) { \ 133 | if (dret) *dret = ks->buf[i]; \ 134 | break; \ 135 | } \ 136 | } \ 137 | if (!gotany && ks_eof(ks)) return -1; \ 138 | if (str->s == 0) { \ 139 | str->m = 1; \ 140 | str->s = (char*)calloc(1, 1); \ 141 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ 142 | str->s[str->l] = '\0'; \ 143 | return str->l; \ 144 | } \ 145 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ 146 | { return ks_getuntil2(ks, delimiter, str, dret, 0); } 147 | 148 | /* KSTREAM_INIT: instantiates the stream type and its functions for one handle type, read function and buffer size. */ #define KSTREAM_INIT(type_t, __read, __bufsize) \ 149 | __KS_TYPE(type_t) \ 150 | __KS_BASIC(type_t, __bufsize) \ 151 | 
__KS_GETC(__read, __bufsize) \ 152 | __KS_GETUNTIL(__read, __bufsize) 153 | 154 | #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) 155 | 156 | #define __KSEQ_BASIC(SCOPE, type_t) \ 157 | SCOPE kseq_t *kseq_init(type_t fd) \ 158 | { \ 159 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ 160 | s->f = ks_init(fd); \ 161 | return s; \ 162 | } \ 163 | SCOPE void kseq_destroy(kseq_t *ks) \ 164 | { \ 165 | if (!ks) return; \ 166 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ 167 | ks_destroy(ks->f); \ 168 | free(ks); \ 169 | } 170 | 171 | /* Return value: 172 | >=0 length of the sequence (normal) 173 | -1 end-of-file 174 | -2 truncated quality string 175 | -3 error reading stream 176 | */ 177 | #define __KSEQ_READ(SCOPE) \ 178 | SCOPE int kseq_read(kseq_t *seq) \ 179 | { \ 180 | int c,r; \ 181 | kstream_t *ks = seq->f; \ 182 | if (seq->last_char == 0) { /* then jump to the next header line */ \ 183 | while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \ 184 | if (c < 0) return c; /* end of file or error*/ \ 185 | seq->last_char = c; \ 186 | } /* else: the first header char has been read in the previous call */ \ 187 | seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ 188 | if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \ 189 | if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ 190 | if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ 191 | seq->seq.m = 256; \ 192 | seq->seq.s = (char*)malloc(seq->seq.m); \ 193 | } \ 194 | while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \ 195 | if (c == '\n') continue; /* skip empty lines */ \ 196 | seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ 197 | ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ 198 | } \ 199 | if (c == 
'>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ 200 | if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ 201 | seq->seq.m = seq->seq.l + 2; \ 202 | kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ 203 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ 204 | } \ 205 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ 206 | if (c != '+') return seq->seq.l; /* FASTA */ \ 207 | if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ 208 | seq->qual.m = seq->seq.m; \ 209 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ 210 | } \ 211 | while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \ 212 | if (c == -1) return -2; /* error: no quality string */ \ 213 | while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l)); \ 214 | if (c == -3) return -3; /* stream error */ \ 215 | seq->last_char = 0; /* we have not come to the next header line */ \ 216 | if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ 217 | return seq->seq.l; \ 218 | } 219 | 220 | #define __KSEQ_TYPE(type_t) \ 221 | typedef struct { \ 222 | kstring_t name, comment, seq, qual; \ 223 | int last_char; \ 224 | kstream_t *f; \ 225 | } kseq_t; 226 | 227 | #define KSEQ_INIT2(SCOPE, type_t, __read) \ 228 | KSTREAM_INIT(type_t, __read, 16384) \ 229 | __KSEQ_TYPE(type_t) \ 230 | __KSEQ_BASIC(SCOPE, type_t) \ 231 | __KSEQ_READ(SCOPE) 232 | 233 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) 234 | 235 | #define KSEQ_DECLARE(type_t) \ 236 | __KS_TYPE(type_t) \ 237 | __KSEQ_TYPE(type_t) \ 238 | extern kseq_t *kseq_init(type_t fd); \ 239 | void kseq_destroy(kseq_t *ks); \ 240 | int kseq_read(kseq_t *seq); 241 | 242 | #endif 243 | -------------------------------------------------------------------------------- /disty.xcodeproj/project.pbxproj: 
-------------------------------------------------------------------------------- 1 | // !$*UTF8*$! 2 | { 3 | archiveVersion = 1; 4 | classes = { 5 | }; 6 | objectVersion = 46; 7 | objects = { 8 | 9 | /* Begin PBXBuildFile section */ 10 | FC6DB7381F16D6EB00DCFDD5 /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC6DB7371F16D6EB00DCFDD5 /* main.cpp */; }; 11 | FC6DB73A1F16D71100DCFDD5 /* Makefile in Sources */ = {isa = PBXBuildFile; fileRef = FC6DB7391F16D71100DCFDD5 /* Makefile */; }; 12 | FCE6C6441F0FD8AC00EE50DE /* libz.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = FCE6C6431F0FD8AC00EE50DE /* libz.tbd */; }; 13 | /* End PBXBuildFile section */ 14 | 15 | /* Begin PBXCopyFilesBuildPhase section */ 16 | FCE6C6361F0FD04B00EE50DE /* CopyFiles */ = { 17 | isa = PBXCopyFilesBuildPhase; 18 | buildActionMask = 2147483647; 19 | dstPath = /usr/share/man/man1/; 20 | dstSubfolderSpec = 0; 21 | files = ( 22 | ); 23 | runOnlyForDeploymentPostprocessing = 1; 24 | }; 25 | /* End PBXCopyFilesBuildPhase section */ 26 | 27 | /* Begin PBXFileReference section */ 28 | FC6DB7371F16D6EB00DCFDD5 /* main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = main.cpp; path = src/main.cpp; sourceTree = SOURCE_ROOT; }; 29 | FC6DB7391F16D71100DCFDD5 /* Makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = Makefile; sourceTree = SOURCE_ROOT; }; 30 | FCE6C6381F0FD04B00EE50DE /* disty */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = disty; sourceTree = BUILT_PRODUCTS_DIR; }; 31 | FCE6C6431F0FD8AC00EE50DE /* libz.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libz.tbd; path = usr/lib/libz.tbd; sourceTree = SDKROOT; }; 32 | /* End PBXFileReference section */ 33 | 34 | /* Begin PBXFrameworksBuildPhase section */ 35 | FCE6C6351F0FD04B00EE50DE /* Frameworks */ = { 36 | isa = 
PBXFrameworksBuildPhase; 37 | buildActionMask = 2147483647; 38 | files = ( 39 | FCE6C6441F0FD8AC00EE50DE /* libz.tbd in Frameworks */, 40 | ); 41 | runOnlyForDeploymentPostprocessing = 0; 42 | }; 43 | /* End PBXFrameworksBuildPhase section */ 44 | 45 | /* Begin PBXGroup section */ 46 | FCE6C62F1F0FD04B00EE50DE = { 47 | isa = PBXGroup; 48 | children = ( 49 | FCE6C63A1F0FD04B00EE50DE /* disty */, 50 | FCE6C6391F0FD04B00EE50DE /* Products */, 51 | FCE6C6421F0FD8AC00EE50DE /* Frameworks */, 52 | ); 53 | sourceTree = ""; 54 | }; 55 | FCE6C6391F0FD04B00EE50DE /* Products */ = { 56 | isa = PBXGroup; 57 | children = ( 58 | FCE6C6381F0FD04B00EE50DE /* disty */, 59 | ); 60 | name = Products; 61 | sourceTree = ""; 62 | }; 63 | FCE6C63A1F0FD04B00EE50DE /* disty */ = { 64 | isa = PBXGroup; 65 | children = ( 66 | FC6DB7391F16D71100DCFDD5 /* Makefile */, 67 | FC6DB7371F16D6EB00DCFDD5 /* main.cpp */, 68 | ); 69 | path = disty; 70 | sourceTree = ""; 71 | }; 72 | FCE6C6421F0FD8AC00EE50DE /* Frameworks */ = { 73 | isa = PBXGroup; 74 | children = ( 75 | FCE6C6431F0FD8AC00EE50DE /* libz.tbd */, 76 | ); 77 | name = Frameworks; 78 | sourceTree = ""; 79 | }; 80 | /* End PBXGroup section */ 81 | 82 | /* Begin PBXNativeTarget section */ 83 | FCE6C6371F0FD04B00EE50DE /* disty */ = { 84 | isa = PBXNativeTarget; 85 | buildConfigurationList = FCE6C63F1F0FD04B00EE50DE /* Build configuration list for PBXNativeTarget "disty" */; 86 | buildPhases = ( 87 | FCE6C6341F0FD04B00EE50DE /* Sources */, 88 | FCE6C6351F0FD04B00EE50DE /* Frameworks */, 89 | FCE6C6361F0FD04B00EE50DE /* CopyFiles */, 90 | ); 91 | buildRules = ( 92 | ); 93 | dependencies = ( 94 | ); 95 | name = disty; 96 | productName = disty; 97 | productReference = FCE6C6381F0FD04B00EE50DE /* disty */; 98 | productType = "com.apple.product-type.tool"; 99 | }; 100 | /* End PBXNativeTarget section */ 101 | 102 | /* Begin PBXProject section */ 103 | FCE6C6301F0FD04B00EE50DE /* Project object */ = { 104 | isa = PBXProject; 105 | attributes = { 106 
| LastUpgradeCheck = 0830; 107 | ORGANIZATIONNAME = "Karel Brinda"; 108 | TargetAttributes = { 109 | FCE6C6371F0FD04B00EE50DE = { 110 | CreatedOnToolsVersion = 8.3.3; 111 | ProvisioningStyle = Automatic; 112 | }; 113 | }; 114 | }; 115 | buildConfigurationList = FCE6C6331F0FD04B00EE50DE /* Build configuration list for PBXProject "disty" */; 116 | compatibilityVersion = "Xcode 3.2"; 117 | developmentRegion = English; 118 | hasScannedForEncodings = 0; 119 | knownRegions = ( 120 | en, 121 | ); 122 | mainGroup = FCE6C62F1F0FD04B00EE50DE; 123 | productRefGroup = FCE6C6391F0FD04B00EE50DE /* Products */; 124 | projectDirPath = ""; 125 | projectRoot = ""; 126 | targets = ( 127 | FCE6C6371F0FD04B00EE50DE /* disty */, 128 | ); 129 | }; 130 | /* End PBXProject section */ 131 | 132 | /* Begin PBXSourcesBuildPhase section */ 133 | FCE6C6341F0FD04B00EE50DE /* Sources */ = { 134 | isa = PBXSourcesBuildPhase; 135 | buildActionMask = 2147483647; 136 | files = ( 137 | FC6DB73A1F16D71100DCFDD5 /* Makefile in Sources */, 138 | FC6DB7381F16D6EB00DCFDD5 /* main.cpp in Sources */, 139 | ); 140 | runOnlyForDeploymentPostprocessing = 0; 141 | }; 142 | /* End PBXSourcesBuildPhase section */ 143 | 144 | /* Begin XCBuildConfiguration section */ 145 | FCE6C63D1F0FD04B00EE50DE /* Debug */ = { 146 | isa = XCBuildConfiguration; 147 | buildSettings = { 148 | ALWAYS_SEARCH_USER_PATHS = NO; 149 | CLANG_ANALYZER_NONNULL = YES; 150 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; 151 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; 152 | CLANG_CXX_LIBRARY = "libc++"; 153 | CLANG_ENABLE_MODULES = YES; 154 | CLANG_ENABLE_OBJC_ARC = YES; 155 | CLANG_WARN_BOOL_CONVERSION = YES; 156 | CLANG_WARN_CONSTANT_CONVERSION = YES; 157 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 158 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES; 159 | CLANG_WARN_EMPTY_BODY = YES; 160 | CLANG_WARN_ENUM_CONVERSION = YES; 161 | CLANG_WARN_INFINITE_RECURSION = YES; 162 | CLANG_WARN_INT_CONVERSION = YES; 163 | 
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 164 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 165 | CLANG_WARN_UNREACHABLE_CODE = YES; 166 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 167 | CODE_SIGN_IDENTITY = "-"; 168 | COPY_PHASE_STRIP = NO; 169 | DEBUG_INFORMATION_FORMAT = dwarf; 170 | ENABLE_STRICT_OBJC_MSGSEND = YES; 171 | ENABLE_TESTABILITY = YES; 172 | GCC_C_LANGUAGE_STANDARD = gnu99; 173 | GCC_DYNAMIC_NO_PIC = NO; 174 | GCC_NO_COMMON_BLOCKS = YES; 175 | GCC_OPTIMIZATION_LEVEL = 0; 176 | GCC_PREPROCESSOR_DEFINITIONS = ( 177 | "DEBUG=1", 178 | "$(inherited)", 179 | ); 180 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 181 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 182 | GCC_WARN_UNDECLARED_SELECTOR = YES; 183 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 184 | GCC_WARN_UNUSED_FUNCTION = YES; 185 | GCC_WARN_UNUSED_VARIABLE = YES; 186 | MACOSX_DEPLOYMENT_TARGET = 10.12; 187 | MTL_ENABLE_DEBUG_INFO = YES; 188 | ONLY_ACTIVE_ARCH = YES; 189 | SDKROOT = macosx; 190 | }; 191 | name = Debug; 192 | }; 193 | FCE6C63E1F0FD04B00EE50DE /* Release */ = { 194 | isa = XCBuildConfiguration; 195 | buildSettings = { 196 | ALWAYS_SEARCH_USER_PATHS = NO; 197 | CLANG_ANALYZER_NONNULL = YES; 198 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; 199 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; 200 | CLANG_CXX_LIBRARY = "libc++"; 201 | CLANG_ENABLE_MODULES = YES; 202 | CLANG_ENABLE_OBJC_ARC = YES; 203 | CLANG_WARN_BOOL_CONVERSION = YES; 204 | CLANG_WARN_CONSTANT_CONVERSION = YES; 205 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 206 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES; 207 | CLANG_WARN_EMPTY_BODY = YES; 208 | CLANG_WARN_ENUM_CONVERSION = YES; 209 | CLANG_WARN_INFINITE_RECURSION = YES; 210 | CLANG_WARN_INT_CONVERSION = YES; 211 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 212 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 213 | CLANG_WARN_UNREACHABLE_CODE = YES; 214 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 215 | CODE_SIGN_IDENTITY = "-"; 216 | COPY_PHASE_STRIP = NO; 217 | DEBUG_INFORMATION_FORMAT 
= "dwarf-with-dsym"; 218 | ENABLE_NS_ASSERTIONS = NO; 219 | ENABLE_STRICT_OBJC_MSGSEND = YES; 220 | GCC_C_LANGUAGE_STANDARD = gnu99; 221 | GCC_NO_COMMON_BLOCKS = YES; 222 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 223 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 224 | GCC_WARN_UNDECLARED_SELECTOR = YES; 225 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 226 | GCC_WARN_UNUSED_FUNCTION = YES; 227 | GCC_WARN_UNUSED_VARIABLE = YES; 228 | MACOSX_DEPLOYMENT_TARGET = 10.12; 229 | MTL_ENABLE_DEBUG_INFO = NO; 230 | SDKROOT = macosx; 231 | }; 232 | name = Release; 233 | }; 234 | FCE6C6401F0FD04B00EE50DE /* Debug */ = { 235 | isa = XCBuildConfiguration; 236 | buildSettings = { 237 | PRODUCT_NAME = "$(TARGET_NAME)"; 238 | }; 239 | name = Debug; 240 | }; 241 | FCE6C6411F0FD04B00EE50DE /* Release */ = { 242 | isa = XCBuildConfiguration; 243 | buildSettings = { 244 | PRODUCT_NAME = "$(TARGET_NAME)"; 245 | }; 246 | name = Release; 247 | }; 248 | /* End XCBuildConfiguration section */ 249 | 250 | /* Begin XCConfigurationList section */ 251 | FCE6C6331F0FD04B00EE50DE /* Build configuration list for PBXProject "disty" */ = { 252 | isa = XCConfigurationList; 253 | buildConfigurations = ( 254 | FCE6C63D1F0FD04B00EE50DE /* Debug */, 255 | FCE6C63E1F0FD04B00EE50DE /* Release */, 256 | ); 257 | defaultConfigurationIsVisible = 0; 258 | defaultConfigurationName = Release; 259 | }; 260 | FCE6C63F1F0FD04B00EE50DE /* Build configuration list for PBXNativeTarget "disty" */ = { 261 | isa = XCConfigurationList; 262 | buildConfigurations = ( 263 | FCE6C6401F0FD04B00EE50DE /* Debug */, 264 | FCE6C6411F0FD04B00EE50DE /* Release */, 265 | ); 266 | defaultConfigurationIsVisible = 0; 267 | defaultConfigurationName = Release; 268 | }; 269 | /* End XCConfigurationList section */ 270 | }; 271 | rootObject = FCE6C6301F0FD04B00EE50DE /* Project object */; 272 | } 273 | -------------------------------------------------------------------------------- /src/main.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | The MIT License 3 | 4 | Copyright (c) 2017 Karel Brinda 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 21 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 22 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 
25 | */ 26 | 27 | 28 | #ifndef DISTY_VERSION 29 | #define DISTY_VERSION "0.1.0" 30 | #endif 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | 44 | #include "kseq.h" 45 | 46 | using namespace std; 47 | 48 | 49 | /* Input alphabet, selected with -i (the numeric option value is cast to this enum). _MAX is not a format: it is the exclusive upper bound used when the option value is validated. */ enum class input_t { 50 | ACGT, 51 | BINARY, 52 | _MAX 53 | }; 54 | 55 | /* Strategy for dealing with N's, selected with -s. The enumerators are in the same order as options 0-4 of the usage text below; _MAX is the exclusive upper bound used for validation. */ enum class n_strategy_t { 56 | IGNORE_PAIRWISE, 57 | IGNORE_PAIRWISE_NORM, 58 | IGNORE_GLOBALLY, 59 | REPLACE_MAJOR, 60 | REPLACE_CLOSEST, 61 | _MAX 62 | }; 63 | 64 | /* Help text, printed for -h and whenever the program is invoked incorrectly. */ string USAGE= 65 | "\n" 66 | "Program: Disty McMatrixface - compute a distance matrix from a core genome alignment file\n" 67 | "Version: " DISTY_VERSION "\n" 68 | "Contact: Karel Brinda \n" 69 | "\n" 70 | "Usage: disty \n" 71 | "\n" 72 | "Options:\n" 73 | " -n FLOAT skip columns having frequency of N > FLOAT [1.00]\n" 74 | " -i INT input format [0]\n" 75 | " 0: ACGT\n" 76 | " 1: 01\n" 77 | " -s INT strategy to deal with N's [0]\n" 78 | " 0: ignore pairwisely\n" 79 | " 1: ignore pairwisely and normalize\n" 80 | " 2: ignore globally\n" 81 | " 3: replace by the major allele\n" 82 | " 4: replace by the closest individual (not implemented yet)\n" 83 | " -h print help message and exit\n" 84 | " -v print version and exit\n"; 85 | 86 | /* Run-time parameters. The constructor sets the defaults shown in brackets in the usage text: ACGT input, pairwise ignoring of N's, skip_n = 1.0; fasta_fn starts empty. */ struct params_t { 87 | string fasta_fn; 88 | input_t input; 89 | n_strategy_t n_strategy; 90 | float skip_n; 91 | 92 | params_t() 93 | :fasta_fn(""), input(input_t::ACGT), n_strategy(n_strategy_t::IGNORE_PAIRWISE), skip_n(1.0) 94 | {} 95 | }; 96 | 97 | /* Three integer counters. NOTE(review): presumably per-pair counts of matching, mismatching and unknown positions, judging by the field names only - the use is not visible here, confirm against the distance computation. */ struct pair_char_t { 98 | int matches; 99 | int mismatches; 100 | int unknown; 101 | }; 102 | 103 | /* 256-entry lookup from a byte to a nucleotide code: A/a = 0, C/c = 1, G/g = 2, T/t = 3, every other byte = 4. */ static const uint8_t acgt_nt256_nt4[] = { 104 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 105 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 106 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 107 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 
4, 4, 4, 4, 108 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 109 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 110 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 111 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 112 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 113 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 114 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 115 | 116 | 117 | /* 118 | static const uint8_t acgt_nt256_nt16[] = { 119 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 120 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 121 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 122 | 1 , 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0,15,15, 123 | 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 124 | 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, 125 | 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 126 | 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, 127 | 128 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 129 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 130 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 131 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 132 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 133 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 134 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 135 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 136 | };*/ 137 | 138 | //static const uint8_t acgt_nt16_nt4[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; 139 | 140 | 141 | static const uint8_t binary_nt256_nt4[] = { 142 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 143 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 144 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 145 | 0,1,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 146 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 147 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 148 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 
2,2,2,2, 149 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 150 | 151 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 152 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 153 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 154 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 155 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 156 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 157 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 158 | 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 159 | }; 160 | 161 | 162 | KSEQ_INIT(gzFile, gzread) 163 | 164 | 165 | /* 166 | * Parse arguments. 167 | */ 168 | void parse_arguments(int argc, const char **argv, params_t ¶ms) { 169 | if (argc==1){ 170 | cerr << USAGE << endl; 171 | exit(1); 172 | } 173 | 174 | int c; 175 | while ((c = getopt(argc, (char *const *)argv, "hvi:s:n:")) >= 0) { 176 | switch (c) { 177 | case 'h': { 178 | cout << USAGE << endl; 179 | exit(0); 180 | } 181 | case 'v': { 182 | cout << DISTY_VERSION << endl; 183 | exit(0); 184 | } 185 | case 'i': { 186 | int val=atoi(optarg); 187 | assert(val>=0); 188 | assert(val<(int)input_t::_MAX); 189 | params.input=static_cast(val); 190 | break; 191 | } 192 | case 's': { 193 | int val=atoi(optarg); 194 | assert(val>=0); 195 | assert(val<(int)n_strategy_t::_MAX); 196 | params.n_strategy=static_cast(val); 197 | break; 198 | } 199 | case 'n': { 200 | float val=atof(optarg); 201 | assert(val>=0.0); 202 | assert(val<=1.0); 203 | params.skip_n=val; 204 | break; 205 | } 206 | case '?': { 207 | cerr << "Unknown error" << endl; 208 | exit(1); 209 | } 210 | default: { 211 | cerr << "Unknown option " << c << endl; 212 | exit(1); 213 | } 214 | } 215 | } 216 | 217 | argc -= optind; 218 | argv += optind; 219 | 220 | if(argc != 1){ 221 | cerr << USAGE << endl; 222 | exit(1); 223 | } 224 | else { 225 | params.fasta_fn=string(argv[0]); 226 | } 227 | } 228 | 229 | 230 | /* 231 | * Load sequences and convert nucleotides to upper case. 
232 | */ 233 | template 234 | void load_sequences(const string &fasta_fn, T &names, T &seqs) { 235 | gzFile fp; 236 | kseq_t *seq; 237 | int l; 238 | fp = gzopen(fasta_fn.c_str(), "r"); 239 | assert (fp != nullptr); 240 | seq = kseq_init(fp); 241 | 242 | int len=0; // length of sequences (for checking) 243 | 244 | while ((l = kseq_read(seq)) >= 0) { 245 | names.push_back(seq->name.s); 246 | string s(seq->seq.s); 247 | for (auto & c: s) { 248 | c = toupper(c); 249 | } 250 | if(len!=0){ 251 | assert(len==static_cast(s.size())); 252 | } 253 | else{ 254 | len=(int)s.size(); 255 | } 256 | 257 | for(char &a: s){ 258 | assert ((int)a<128); 259 | } 260 | 261 | seqs.push_back(s); 262 | } 263 | kseq_destroy(seq); 264 | gzclose(fp); 265 | 266 | assert(seqs.size()>0); 267 | } 268 | 269 | template 270 | void print_sequences(T &seqs) { 271 | cerr << endl; 272 | for (auto const &s: seqs){ 273 | cerr << s << endl; 274 | } 275 | cerr << endl; 276 | } 277 | 278 | /* 279 | * Compute pileup (len x 128). 280 | */ 281 | template 282 | void compute_pileup(const T &seqs, U &pileup) { 283 | assert(seqs[0].size()==pileup.size()); 284 | assert(pileup[0].size()==128); 285 | auto len=seqs[0].size(); 286 | for(int i=0; i(len); i++){ 287 | for(int c=0; c<128; c++){ 288 | pileup[i][c]=0; 289 | } 290 | } 291 | 292 | for (const auto &seq: seqs){ 293 | for(int i=0; i(len); i++){ 294 | unsigned char c=seq[i]; 295 | ++pileup[i][c]; 296 | } 297 | } 298 | } 299 | 300 | template 301 | void print_pileup(const T &pileup){ 302 | assert(pileup[0].size()==128); 303 | for (int i=0;i 318 | void compute_consensus(const T &pileup, string &consensus) { 319 | assert(pileup.size()==consensus.size()); 320 | assert(pileup[0].size()==128); 321 | 322 | for(int i=0; i(pileup.size()); i++){ 323 | char c='N'; 324 | int max_freq=-1; 325 | const auto &column=pileup[i]; 326 | 327 | for(int d=0;d<128;d++){ 328 | if(d!='N'){ 329 | if(column[d]>max_freq){ 330 | max_freq=column[d]; 331 | c=(char)d; 332 | } 333 | } 334 | } 335 | 
336 | consensus[i]=c; 337 | } 338 | } 339 | 340 | void print_consensus(const string &consensus){ 341 | cout << consensus << endl; 342 | } 343 | 344 | /* 345 | * Compute mask. 346 | * 347 | * if N >= skip_n, then mask the column 348 | * 349 | * 0 - position ignored 350 | * N - position non-ignored, containg Ns 351 | * 1 - position non-ignored 352 | */ 353 | template 354 | void compute_mask(string &mask, const T &pileup, int n_thres) { 355 | assert(pileup.size()==mask.size()); 356 | assert(pileup[0].size()==128); 357 | 358 | int column_sum=accumulate(pileup[0].begin(), pileup[0].end(), 0); 359 | 360 | int masked_columns=0; 361 | 362 | for(int i=0; i(pileup.size()); i++){ 363 | int ns=pileup[i]['n']+pileup[i]['N']; 364 | if (ns >= n_thres){ 365 | mask[i]='0'; 366 | masked_columns++; 367 | } 368 | else{ 369 | if(ns>0) 370 | { 371 | mask[i]='N'; 372 | } 373 | else{ 374 | mask[i]='1'; 375 | } 376 | } 377 | } 378 | 379 | cerr << "Number of masked columns: " << masked_columns << " (out of " << pileup.size() << " positions, threshold: " << n_thres << " Ns, number of samples: " << column_sum << ")" << endl; 380 | } 381 | 382 | void print_mask(const string &mask){ 383 | cerr << mask << endl; 384 | } 385 | 386 | 387 | /* 388 | * Compute pair matrix (128 x 128). 
389 | */ 390 | template 391 | void compute_pair_matrix(const string &seq1, const string &seq2, const string &mask, T &pair_matrix){ 392 | assert(seq1.size()==seq2.size()); 393 | assert(seq1.size()==mask.size()); 394 | assert(pair_matrix.size()==128); 395 | assert(pair_matrix[0].size()==128); 396 | 397 | for(int i=0; i<128; i++){ 398 | for(int j=0; j<128; j++){ 399 | pair_matrix[i][j]=0; 400 | } 401 | } 402 | int len=(int)seq1.size(); 403 | for (int i=0;i 418 | void pair_matrix_char_acgt(T &pair_matrix, pair_char_t &pair_char) { 419 | assert(pair_matrix.size()==128); 420 | assert(pair_matrix[0].size()==128); 421 | 422 | pair_char.matches=0; 423 | pair_char.mismatches=0; 424 | pair_char.unknown=0; 425 | 426 | for(unsigned char i=0;i<128;i++){ 427 | char n1_nt4=acgt_nt256_nt4[i]; 428 | for(unsigned char j=0;j<128;j++){ 429 | char n2_nt4=acgt_nt256_nt4[j]; 430 | 431 | if (n1_nt4==4 || n2_nt4==4){ 432 | pair_char.unknown+=pair_matrix[i][j]; 433 | } 434 | else{ 435 | if(n1_nt4==n2_nt4){ 436 | pair_char.matches+=pair_matrix[i][j]; 437 | } 438 | else { 439 | pair_char.mismatches+=pair_matrix[i][j]; 440 | } 441 | } 442 | } 443 | } 444 | //cout << pair_char.matches << " " << pair_char.mismatches << " " << pair_char.unknown << endl; 445 | } 446 | 447 | template 448 | void pair_matrix_char_binary(T &pair_matrix, pair_char_t &pair_char) { 449 | assert(pair_matrix.size()==128); 450 | assert(pair_matrix[0].size()==128); 451 | 452 | pair_char.matches=0; 453 | pair_char.mismatches=0; 454 | pair_char.unknown=0; 455 | 456 | for(unsigned char i=0;i<128;i++){ 457 | char n1_nt4=binary_nt256_nt4[i]; 458 | for(unsigned char j=0;j<128;j++){ 459 | char n2_nt4=binary_nt256_nt4[j]; 460 | 461 | if (n1_nt4==2 || n2_nt4==2){ 462 | pair_char.unknown++; 463 | } 464 | else{ 465 | if(n1_nt4+n2_nt4==1) 466 | { 467 | pair_char.matches++; 468 | } 469 | else { 470 | pair_char.mismatches++; 471 | } 472 | } 473 | } 474 | } 475 | } 476 | 477 | void print_pair_matrix_char(pair_char_t &pair_char){ 478 | cerr 
<< pair_char.matches << "\t" << pair_char.mismatches << "\t" << pair_char.unknown << endl; 479 | } 480 | 481 | /* 482 | * Compute distance. 483 | */ 484 | 485 | int distance(const pair_char_t &pair_char) { 486 | return pair_char.mismatches; 487 | } 488 | 489 | int distance_norm(const pair_char_t &pair_char) { 490 | float multiplicator=1.0*(pair_char.matches+pair_char.mismatches+pair_char.unknown)/(pair_char.matches+pair_char.mismatches); 491 | return round(multiplicator * pair_char.mismatches); 492 | } 493 | 494 | 495 | template 496 | void print_distance_matrix(const T &distance_matrix, const U &names){ 497 | assert(distance_matrix.size() == distance_matrix[0].size()); 498 | assert(distance_matrix.size() == names.size()); 499 | int count=static_cast(distance_matrix.size()); 500 | 501 | //cout << ""; 502 | for (const string& name : names){ 503 | cout << "\t" << name; 504 | } 505 | cout << endl; 506 | 507 | for (int i=0;i names, seqs; 525 | load_sequences(params.fasta_fn, names, seqs); 526 | //print_sequences(seqs); 527 | 528 | int count=(int)seqs.size(); 529 | int len=(int)seqs[0].size(); 530 | 531 | cerr << "Computing pileup" << endl; 532 | vector> pileup(len, vector(128)); 533 | compute_pileup(seqs, pileup); 534 | //print_pileup(pileup); 535 | 536 | string consensus(len, '?'); 537 | if(params.n_strategy==n_strategy_t::REPLACE_MAJOR){ 538 | cerr << "Computing consensus" << endl; 539 | compute_consensus(pileup, consensus); 540 | //print_consensus(consensus); 541 | } 542 | 543 | 544 | cerr << "Computing mask" << endl; 545 | string mask(len, '?'); 546 | if(params.n_strategy==n_strategy_t::IGNORE_GLOBALLY){ 547 | compute_mask(mask, pileup, 1); 548 | } 549 | else{ 550 | int min_n=ceil(count*params.skip_n); 551 | compute_mask(mask, pileup, min_n); 552 | } 553 | //print_mask(mask); 554 | 555 | 556 | cerr << "Computing distance matrix" << endl; 557 | vector> pair_matrix(128, vector(128, 0)); 558 | vector> distance_matrix(count, vector(count, 0)); 559 | pair_char_t 
pair_char; 560 | 561 | if(params.n_strategy==n_strategy_t::REPLACE_MAJOR){ 562 | cerr << "Replacing N's by major alleles" << endl; 563 | for (auto &seq: seqs){ 564 | for(int i=0;i(seq.size());i++){ 565 | if (seq[i]=='N'||seq[i]=='N'){ 566 | seq[i]=consensus[i]; 567 | } 568 | } 569 | } 570 | } 571 | 572 | 573 | /* 574 | * For each pair: 575 | */ 576 | for(int i=0;i