├── tests
├── .gitignore
├── test1.fa
├── test2.fa
├── exp.test1.0.txt
├── exp.test2.0.txt
├── exp.test2.2.txt
├── test3.fa
├── exp.test3.2.txt
├── exp.test3.3.txt
├── view_test.sh
└── Makefile
├── disty.xcodeproj
├── project.xcworkspace
│ └── contents.xcworkspacedata
└── project.pbxproj
├── Makefile
├── .gitignore
├── LICENSE
├── readme.md
└── src
├── kseq.h
└── main.cpp
/tests/.gitignore:
--------------------------------------------------------------------------------
1 | obs.*.txt
2 |
--------------------------------------------------------------------------------
/tests/test1.fa:
--------------------------------------------------------------------------------
1 | >seq1
2 | AAA
3 | >seq2
4 | AaA
5 | >seq3
6 | aaA
7 |
--------------------------------------------------------------------------------
/tests/test2.fa:
--------------------------------------------------------------------------------
1 | >seq1
2 | AAA
3 | >seq2
4 | CCC
5 | >seq3
6 | ACN
7 |
--------------------------------------------------------------------------------
/tests/exp.test1.0.txt:
--------------------------------------------------------------------------------
1 | seq1 seq2 seq3
2 | seq1 0 0 0
3 | seq2 0 0 0
4 | seq3 0 0 0
5 |
--------------------------------------------------------------------------------
/tests/exp.test2.0.txt:
--------------------------------------------------------------------------------
1 | seq1 seq2 seq3
2 | seq1 0 3 1
3 | seq2 3 0 1
4 | seq3 1 1 0
5 |
--------------------------------------------------------------------------------
/tests/exp.test2.2.txt:
--------------------------------------------------------------------------------
1 | seq1 seq2 seq3
2 | seq1 0 2 1
3 | seq2 2 0 1
4 | seq3 1 1 0
5 |
--------------------------------------------------------------------------------
/tests/test3.fa:
--------------------------------------------------------------------------------
1 | >seq1
2 | AAA
3 | >seq2
4 | AAA
5 | >seq3
6 | AAC
7 | >seq4
8 | CNN
9 |
--------------------------------------------------------------------------------
/tests/exp.test3.2.txt:
--------------------------------------------------------------------------------
1 | seq1 seq2 seq3 seq4
2 | seq1 0 0 0 1
3 | seq2 0 0 0 1
4 | seq3 0 0 0 1
5 | seq4 1 1 1 0
6 |
--------------------------------------------------------------------------------
/tests/exp.test3.3.txt:
--------------------------------------------------------------------------------
1 | seq1 seq2 seq3 seq4
2 | seq1 0 0 1 1
3 | seq2 0 0 1 1
4 | seq3 1 1 0 2
5 | seq4 1 1 2 0
6 |
--------------------------------------------------------------------------------
/disty.xcodeproj/project.xcworkspace/contents.xcworkspacedata:
--------------------------------------------------------------------------------
1 |
2 |
4 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/tests/view_test.sh:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env bash
2 | # Print the input alignment and the expected matrices of one test case, e.g. `./view_test.sh 2`.
3 | set -u
4 | set -e
5 | set -o pipefail
6 |
7 | readonly PROGNAME=$(basename "$0")
8 | readonly PROGDIR=$(dirname "$0")
9 | readonly ARGS="$*"
10 | readonly NARGS="$#"
11 |
12 | if [ "$NARGS" -ne 1 ]; then
13 | echo "usage: $PROGNAME options"
14 | exit 1
15 | fi
16 | # Only the variable part is quoted, so the brace and glob expansions still take effect.
17 | head {test"$1"*,exp.test"$1"*}
18 |
19 |
20 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | CXX ?= g++
2 | CXXFLAGS = -std=c++11 -Wall -Wextra -Wno-missing-field-initializers -g -O2
3 | LIBS = -lm -lz -lpthread
4 | # Install location; DESTDIR is prepended so staged installs land under it.
5 | PREFIX = $(DESTDIR)/usr/local
6 | BINDIR = $(PREFIX)/bin
7 | # One translation unit; every header in src/ is a prerequisite of its object file.
8 | ofiles = src/main.cpp.o
9 | hfiles = $(wildcard src/*.h)
10 |
11 | .PHONY: all clean install
12 |
13 | all: disty
14 | # Create the target directory first so installing into a fresh prefix does not fail.
15 | install: disty
16 | install -d $(BINDIR) && install disty $(BINDIR)/disty
17 | # DFLAGS is not set in this file; it is expanded as-is if supplied by the caller.
18 | disty: $(ofiles)
19 | $(CXX) $(CXXFLAGS) $(DFLAGS) $(ofiles) -o $@ -L. $(LIBS)
20 |
21 | src/%.cpp.o: src/%.cpp $(hfiles)
22 | $(CXX) $(CXXFLAGS) $(DFLAGS) -c $< -o $@
23 |
24 | clean:
25 | rm -f src/*.o
26 | rm -f disty
27 |
28 |
--------------------------------------------------------------------------------
/tests/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: all clean
2 | # Recipes run through bash with -e, -u and pipefail, so the first failing command aborts the run.
3 | SHELL=/usr/bin/env bash -euc -o pipefail
4 |
5 | .SECONDARY:
6 | # Binary under test and the alignments it is run on.
7 | DM=../disty
8 | SEQ=$(wildcard *.fa)
9 | # For each alignment and each -s value 0..4 that has an exp.<name>.<s>.txt file: run disty, save obs.<name>.<s>.txt, diff the two.
10 | all:
11 | @for seq in $(SEQ); do \
12 | pref=$${seq%.*}; \
13 | for s in $$(seq 0 4); do \
14 | CMD="$(DM) -s $$s $$seq"; \
15 | EXP="exp.$${pref}.$$s.txt"; \
16 | OBS="obs.$${pref}.$$s.txt"; \
17 | if [[ -f "$$EXP" ]]; then \
18 | echo ; \
19 | echo "$$CMD"; \
20 | echo "======================================"; \
21 | echo ; \
22 | echo $$EXP; \
23 | echo $$OBS; \
24 | echo ; \
25 | $$CMD > "$$OBS" 2> /dev/null; \
26 | diff "$$OBS" "$$EXP"; \
27 | echo "OK"; \
28 | echo; \
29 | fi; \
30 | done; \
31 | done;
32 | # Remove the observed outputs written by the "all" target.
33 | clean:
34 | rm -f obs.*.txt
35 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | disty
2 |
3 | # Xcode
4 |
5 | ## Build generated
6 | build/
7 | DerivedData/
8 |
9 | ## Various settings
10 | *.pbxuser
11 | !default.pbxuser
12 | *.mode1v3
13 | !default.mode1v3
14 | *.mode2v3
15 | !default.mode2v3
16 | *.perspectivev3
17 | !default.perspectivev3
18 | xcuserdata/
19 |
20 | ## Other
21 | *.moved-aside
22 | *.xccheckout
23 | *.xcscmblueprint
24 |
25 |
26 | # Prerequisites
27 | *.d
28 |
29 | # Compiled Object files
30 | *.slo
31 | *.lo
32 | *.o
33 | *.obj
34 |
35 | # Precompiled Headers
36 | *.gch
37 | *.pch
38 |
39 | # Compiled Dynamic libraries
40 | *.so
41 | *.dylib
42 | *.dll
43 |
44 | # Fortran module files
45 | *.mod
46 | *.smod
47 |
48 | # Compiled Static libraries
49 | *.lai
50 | *.la
51 | *.a
52 | *.lib
53 |
54 | # Executables
55 | *.exe
56 | *.out
57 | *.app
58 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Karel Brinda
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Disty McMatrixface
2 |
3 | Compute a distance matrix from a core genome alignment file.
4 |
5 |
6 | ## Prerequisites
7 |
8 | * GCC 4.8+ or equivalent
9 | * ZLib
10 |
11 |
12 | ## Getting started
13 |
14 | ```bash
15 | git clone https://github.com/c2-d2/disty
16 | cd disty && make
17 | ./disty tests/test2.fa
18 | ```
19 |
20 | Output:
21 | ```
22 | seq1 seq2 seq3
23 | seq1 0 3 1
24 | seq2 3 0 1
25 | seq3 1 1 0
26 |
27 | ```
28 |
29 | ## Installation
30 |
31 | **Using Bioconda:**
32 |
33 | ```bash
34 | conda config --add channels defaults
35 | conda config --add channels conda-forge
36 | conda config --add channels bioconda
37 |
38 | conda install disty
39 | ```
40 |
41 | **Using brew:**
42 |
43 | ```bash
44 | brew install tseemann/bioinformatics-linux/disty
45 | ```
46 |
47 | **From GitHub:**
48 |
49 | ```bash
50 | git clone https://github.com/c2-d2/disty
51 | cd disty && make && make install
52 | ```
53 |
54 |
55 | ## Command line parameters
56 |
57 | ```
58 | Usage: disty <alignment.fa>
59 |
60 | Options:
61 | -n FLOAT skip columns having frequency of N > FLOAT [1.00]
62 | -i INT input format [0]
63 | 0: ACGT
64 | 1: 01
65 | -s INT strategy to deal with N's [0]
66 | 0: ignore pairwisely
67 | 1: ignore pairwisely and normalize
68 | 2: ignore globally
69 | 3: replace by the major allele
70 | 4: replace by the closest individual (not implemented yet)
71 | -h print help message and exit
72 | -v print version and exit
73 | ```
74 |
75 |
76 | ## Author
77 |
78 | [Karel Brinda](http://brinda.cz) <kbrinda@hsph.harvard.edu>
79 |
--------------------------------------------------------------------------------
/src/kseq.h:
--------------------------------------------------------------------------------
1 | /* The MIT License
2 |
3 | Copyright (c) 2008, 2009, 2011 Attractive Chaos
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | */
25 |
26 | /* Last Modified: 05MAR2012 */
27 |
28 | #ifndef AC_KSEQ_H
29 | #define AC_KSEQ_H
30 |
31 | #include <ctype.h>
32 | #include <string.h>
33 | #include <stdlib.h>
34 | /* Delimiter classes understood by ks_getuntil()/ks_getuntil2(); a delimiter value above KS_SEP_MAX is matched as a literal byte. */
35 | #define KS_SEP_SPACE 0 // isspace(): ' ', \t, \n, \v, \f, \r
36 | #define KS_SEP_TAB 1 // isspace() && !' '
37 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
38 | #define KS_SEP_MAX 2
39 | /* Declares kstream_t, a buffered reader over a handle of type type_t. */
40 | #define __KS_TYPE(type_t) \
41 | typedef struct __kstream_t { \
42 | unsigned char *buf; /* read buffer, allocated by ks_init() */ \
43 | int begin, end, is_eof; /* unread bytes are buf[begin..end); end == -1 marks a read error */ \
44 | type_t f; /* handle handed to the read function */ \
45 | } kstream_t;
46 | /* Stream-state helpers; each takes a kstream_t pointer. */
47 | #define ks_err(ks) ((ks)->end == -1) /* true once the read function has returned -1 */
48 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) /* input exhausted and buffer drained */
49 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) /* clears buffer state only; the handle f is not touched */
50 | /* Generates ks_init()/ks_destroy() for a stream with a __bufsize-byte buffer. */
51 | #define __KS_BASIC(type_t, __bufsize) \
52 | static inline kstream_t *ks_init(type_t f) \
53 | { \
54 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); /* zeroed: begin = end = is_eof = 0 */ \
55 | ks->f = f; /* NOTE(review): neither the calloc nor the malloc result is checked for NULL */ \
56 | ks->buf = (unsigned char*)malloc(__bufsize); \
57 | return ks; \
58 | } \
59 | static inline void ks_destroy(kstream_t *ks) \
60 | { \
61 | if (ks) { /* a NULL stream is accepted and ignored */ \
62 | free(ks->buf); \
63 | free(ks); /* the handle f is not closed here */ \
64 | } \
65 | }
66 | /* Generates ks_getc(): returns the next byte as an int, -1 at end of input, or -3 after a read error. */
67 | #define __KS_GETC(__read, __bufsize) \
68 | static inline int ks_getc(kstream_t *ks) \
69 | { \
70 | if (ks_err(ks)) return -3; \
71 | if (ks->is_eof && ks->begin >= ks->end) return -1; \
72 | if (ks->begin >= ks->end) { /* buffer drained: refill it from the handle */ \
73 | ks->begin = 0; \
74 | ks->end = __read(ks->f, ks->buf, __bufsize); \
75 | if (ks->end == 0) { ks->is_eof = 1; return -1;} \
76 | if (ks->end == -1) { ks->is_eof = 1; return -3;}\
77 | } \
78 | return (int)ks->buf[ks->begin++]; \
79 | }
80 | /* Growable string buffer; the definition is skipped when KSTRING_T is already defined. */
81 | #ifndef KSTRING_T
82 | #define KSTRING_T kstring_t
83 | typedef struct __kstring_t {
84 | size_t l, m; /* l: bytes in use, m: bytes allocated for s */
85 | char *s;
86 | } kstring_t;
87 | #endif
88 | /* Rounds x up in place to a power of two by smearing the highest set bit rightward. x must be an lvalue and is evaluated several times; the shifts stop at 16, so only 32 bits are covered. */
89 | #ifndef kroundup32
90 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
91 | #endif
92 | /* Generates ks_getuntil2(): copies bytes up to, but not including, the next delimiter into str (appending when append is non-zero). Returns the resulting str->l, -1 if nothing was read at end of input, or -3 on a read error. */
93 | #define __KS_GETUNTIL(__read, __bufsize) \
94 | static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
95 | { \
96 | int gotany = 0; \
97 | if (dret) *dret = 0; /* stays 0 unless a delimiter byte is found */ \
98 | str->l = append? str->l : 0; \
99 | for (;;) { \
100 | int i; \
101 | if (ks_err(ks)) return -3; \
102 | if (ks->begin >= ks->end) { /* buffer drained: refill, or stop at end of input */ \
103 | if (!ks->is_eof) { \
104 | ks->begin = 0; \
105 | ks->end = __read(ks->f, ks->buf, __bufsize); \
106 | if (ks->end == 0) { ks->is_eof = 1; break; } \
107 | if (ks->end == -1) { ks->is_eof = 1; return -3; } \
108 | } else break; \
109 | } \
110 | if (delimiter == KS_SEP_LINE) { /* each branch leaves i at the delimiter, or at ks->end if none is buffered */ \
111 | for (i = ks->begin; i < ks->end; ++i) \
112 | if (ks->buf[i] == '\n') break; \
113 | } else if (delimiter > KS_SEP_MAX) { \
114 | for (i = ks->begin; i < ks->end; ++i) \
115 | if (ks->buf[i] == delimiter) break; \
116 | } else if (delimiter == KS_SEP_SPACE) { \
117 | for (i = ks->begin; i < ks->end; ++i) \
118 | if (isspace(ks->buf[i])) break; \
119 | } else if (delimiter == KS_SEP_TAB) { \
120 | for (i = ks->begin; i < ks->end; ++i) \
121 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
122 | } else i = 0; /* never come to here! */ \
123 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { /* grow to hold the chunk plus a terminator */ \
124 | str->m = str->l + (i - ks->begin) + 1; \
125 | kroundup32(str->m); \
126 | str->s = (char*)realloc(str->s, str->m); \
127 | } \
128 | gotany = 1; \
129 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
130 | str->l = str->l + (i - ks->begin); \
131 | ks->begin = i + 1; /* step past the delimiter */ \
132 | if (i < ks->end) { /* delimiter found inside the buffer: done */ \
133 | if (dret) *dret = ks->buf[i]; \
134 | break; \
135 | } \
136 | } \
137 | if (!gotany && ks_eof(ks)) return -1; \
138 | if (str->s == 0) { /* nothing was ever allocated: make s a valid empty string */ \
139 | str->m = 1; \
140 | str->s = (char*)calloc(1, 1); \
141 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; /* drop the '\r' of a "\r\n" ending */ \
142 | str->s[str->l] = '\0'; \
143 | return str->l; \
144 | } \
145 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) /* overwrite form: append = 0 */ \
146 | { return ks_getuntil2(ks, delimiter, str, dret, 0); }
147 | /* Expands to the kstream_t type plus ks_init, ks_destroy, ks_getc, ks_getuntil2 and ks_getuntil for one handle type and read function. */
148 | #define KSTREAM_INIT(type_t, __read, __bufsize) \
149 | __KS_TYPE(type_t) \
150 | __KS_BASIC(type_t, __bufsize) \
151 | __KS_GETC(__read, __bufsize) \
152 | __KS_GETUNTIL(__read, __bufsize)
153 | /* Clears the parser's last_char and the buffer state of its stream; takes a kseq_t pointer and does not reposition the handle itself. */
154 | #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
155 | /* Generates kseq_init()/kseq_destroy() with the given storage class (SCOPE). */
156 | #define __KSEQ_BASIC(SCOPE, type_t) \
157 | SCOPE kseq_t *kseq_init(type_t fd) \
158 | { \
159 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); /* zeroed: all four strings start empty with s == NULL */ \
160 | s->f = ks_init(fd); /* NOTE(review): the calloc result is not checked for NULL */ \
161 | return s; \
162 | } \
163 | SCOPE void kseq_destroy(kseq_t *ks) \
164 | { \
165 | if (!ks) return; \
166 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
167 | ks_destroy(ks->f); /* frees the stream and its buffer; the handle stays open */ \
168 | free(ks); \
169 | }
170 |
171 | /* Generates kseq_read(): parses the next FASTA/FASTQ record into seq. Return value:
172 | >=0 length of the sequence (normal)
173 | -1 end-of-file
174 | -2 truncated quality string
175 | -3 error reading stream
176 | */
177 | #define __KSEQ_READ(SCOPE) \
178 | SCOPE int kseq_read(kseq_t *seq) \
179 | { \
180 | int c,r; \
181 | kstream_t *ks = seq->f; \
182 | if (seq->last_char == 0) { /* then jump to the next header line */ \
183 | while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \
184 | if (c < 0) return c; /* end of file or error*/ \
185 | seq->last_char = c; \
186 | } /* else: the first header char has been read in the previous call */ \
187 | seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
188 | if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \
189 | if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
190 | if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
191 | seq->seq.m = 256; \
192 | seq->seq.s = (char*)malloc(seq->seq.m); \
193 | } \
194 | while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \
195 | if (c == '\n') continue; /* skip empty lines */ \
196 | seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
197 | ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
198 | } \
199 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
200 | if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
201 | seq->seq.m = seq->seq.l + 2; \
202 | kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
203 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
204 | } \
205 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
206 | if (c != '+') return seq->seq.l; /* FASTA */ \
207 | if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
208 | seq->qual.m = seq->seq.m; \
209 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
210 | } \
211 | while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \
212 | if (c == -1) return -2; /* error: no quality string */ \
213 | while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1)) >= 0 && seq->qual.l < seq->seq.l); /* parenthesized so c keeps the return code rather than the 0/1 result of the comparison */ \
214 | if (c == -3) return -3; /* stream error */ \
215 | seq->last_char = 0; /* we have not come to the next header line */ \
216 | if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
217 | return seq->seq.l; \
218 | }
219 | /* Declares kseq_t, the record parser state; the type_t argument is not used in the expansion. */
220 | #define __KSEQ_TYPE(type_t) \
221 | typedef struct { \
222 | kstring_t name, comment, seq, qual; /* fields of the most recently read record */ \
223 | int last_char; /* header char ('>' or '@') already consumed by the previous read, else 0 */ \
224 | kstream_t *f; /* underlying buffered stream */ \
225 | } kseq_t;
226 | /* Instantiates the whole reader (stream with a 16384-byte buffer, kseq_t, kseq_init, kseq_destroy, kseq_read) with storage class SCOPE. */
227 | #define KSEQ_INIT2(SCOPE, type_t, __read) \
228 | KSTREAM_INIT(type_t, __read, 16384) \
229 | __KSEQ_TYPE(type_t) \
230 | __KSEQ_BASIC(SCOPE, type_t) \
231 | __KSEQ_READ(SCOPE)
232 | /* Same as KSEQ_INIT2 with every generated function declared static. */
233 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
234 | /* Declares the types and the three kseq_* prototypes without defining the functions. */
235 | #define KSEQ_DECLARE(type_t) \
236 | __KS_TYPE(type_t) \
237 | __KSEQ_TYPE(type_t) \
238 | extern kseq_t *kseq_init(type_t fd); \
239 | void kseq_destroy(kseq_t *ks); \
240 | int kseq_read(kseq_t *seq);
241 |
242 | #endif
243 |
--------------------------------------------------------------------------------
/disty.xcodeproj/project.pbxproj:
--------------------------------------------------------------------------------
1 | // !$*UTF8*$!
2 | {
3 | archiveVersion = 1;
4 | classes = {
5 | };
6 | objectVersion = 46;
7 | objects = {
8 |
9 | /* Begin PBXBuildFile section */
10 | FC6DB7381F16D6EB00DCFDD5 /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC6DB7371F16D6EB00DCFDD5 /* main.cpp */; };
11 | FC6DB73A1F16D71100DCFDD5 /* Makefile in Sources */ = {isa = PBXBuildFile; fileRef = FC6DB7391F16D71100DCFDD5 /* Makefile */; };
12 | FCE6C6441F0FD8AC00EE50DE /* libz.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = FCE6C6431F0FD8AC00EE50DE /* libz.tbd */; };
13 | /* End PBXBuildFile section */
14 |
15 | /* Begin PBXCopyFilesBuildPhase section */
16 | FCE6C6361F0FD04B00EE50DE /* CopyFiles */ = {
17 | isa = PBXCopyFilesBuildPhase;
18 | buildActionMask = 2147483647;
19 | dstPath = /usr/share/man/man1/;
20 | dstSubfolderSpec = 0;
21 | files = (
22 | );
23 | runOnlyForDeploymentPostprocessing = 1;
24 | };
25 | /* End PBXCopyFilesBuildPhase section */
26 |
27 | /* Begin PBXFileReference section */
28 | FC6DB7371F16D6EB00DCFDD5 /* main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = main.cpp; path = src/main.cpp; sourceTree = SOURCE_ROOT; };
29 | FC6DB7391F16D71100DCFDD5 /* Makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = Makefile; sourceTree = SOURCE_ROOT; };
30 | FCE6C6381F0FD04B00EE50DE /* disty */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = disty; sourceTree = BUILT_PRODUCTS_DIR; };
31 | FCE6C6431F0FD8AC00EE50DE /* libz.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libz.tbd; path = usr/lib/libz.tbd; sourceTree = SDKROOT; };
32 | /* End PBXFileReference section */
33 |
34 | /* Begin PBXFrameworksBuildPhase section */
35 | FCE6C6351F0FD04B00EE50DE /* Frameworks */ = {
36 | isa = PBXFrameworksBuildPhase;
37 | buildActionMask = 2147483647;
38 | files = (
39 | FCE6C6441F0FD8AC00EE50DE /* libz.tbd in Frameworks */,
40 | );
41 | runOnlyForDeploymentPostprocessing = 0;
42 | };
43 | /* End PBXFrameworksBuildPhase section */
44 |
45 | /* Begin PBXGroup section */
46 | FCE6C62F1F0FD04B00EE50DE = {
47 | isa = PBXGroup;
48 | children = (
49 | FCE6C63A1F0FD04B00EE50DE /* disty */,
50 | FCE6C6391F0FD04B00EE50DE /* Products */,
51 | FCE6C6421F0FD8AC00EE50DE /* Frameworks */,
52 | );
53 | sourceTree = "";
54 | };
55 | FCE6C6391F0FD04B00EE50DE /* Products */ = {
56 | isa = PBXGroup;
57 | children = (
58 | FCE6C6381F0FD04B00EE50DE /* disty */,
59 | );
60 | name = Products;
61 | sourceTree = "";
62 | };
63 | FCE6C63A1F0FD04B00EE50DE /* disty */ = {
64 | isa = PBXGroup;
65 | children = (
66 | FC6DB7391F16D71100DCFDD5 /* Makefile */,
67 | FC6DB7371F16D6EB00DCFDD5 /* main.cpp */,
68 | );
69 | path = disty;
70 | sourceTree = "";
71 | };
72 | FCE6C6421F0FD8AC00EE50DE /* Frameworks */ = {
73 | isa = PBXGroup;
74 | children = (
75 | FCE6C6431F0FD8AC00EE50DE /* libz.tbd */,
76 | );
77 | name = Frameworks;
78 | sourceTree = "";
79 | };
80 | /* End PBXGroup section */
81 |
82 | /* Begin PBXNativeTarget section */
83 | FCE6C6371F0FD04B00EE50DE /* disty */ = {
84 | isa = PBXNativeTarget;
85 | buildConfigurationList = FCE6C63F1F0FD04B00EE50DE /* Build configuration list for PBXNativeTarget "disty" */;
86 | buildPhases = (
87 | FCE6C6341F0FD04B00EE50DE /* Sources */,
88 | FCE6C6351F0FD04B00EE50DE /* Frameworks */,
89 | FCE6C6361F0FD04B00EE50DE /* CopyFiles */,
90 | );
91 | buildRules = (
92 | );
93 | dependencies = (
94 | );
95 | name = disty;
96 | productName = disty;
97 | productReference = FCE6C6381F0FD04B00EE50DE /* disty */;
98 | productType = "com.apple.product-type.tool";
99 | };
100 | /* End PBXNativeTarget section */
101 |
102 | /* Begin PBXProject section */
103 | FCE6C6301F0FD04B00EE50DE /* Project object */ = {
104 | isa = PBXProject;
105 | attributes = {
106 | LastUpgradeCheck = 0830;
107 | ORGANIZATIONNAME = "Karel Brinda";
108 | TargetAttributes = {
109 | FCE6C6371F0FD04B00EE50DE = {
110 | CreatedOnToolsVersion = 8.3.3;
111 | ProvisioningStyle = Automatic;
112 | };
113 | };
114 | };
115 | buildConfigurationList = FCE6C6331F0FD04B00EE50DE /* Build configuration list for PBXProject "disty" */;
116 | compatibilityVersion = "Xcode 3.2";
117 | developmentRegion = English;
118 | hasScannedForEncodings = 0;
119 | knownRegions = (
120 | en,
121 | );
122 | mainGroup = FCE6C62F1F0FD04B00EE50DE;
123 | productRefGroup = FCE6C6391F0FD04B00EE50DE /* Products */;
124 | projectDirPath = "";
125 | projectRoot = "";
126 | targets = (
127 | FCE6C6371F0FD04B00EE50DE /* disty */,
128 | );
129 | };
130 | /* End PBXProject section */
131 |
132 | /* Begin PBXSourcesBuildPhase section */
133 | FCE6C6341F0FD04B00EE50DE /* Sources */ = {
134 | isa = PBXSourcesBuildPhase;
135 | buildActionMask = 2147483647;
136 | files = (
137 | FC6DB73A1F16D71100DCFDD5 /* Makefile in Sources */,
138 | FC6DB7381F16D6EB00DCFDD5 /* main.cpp in Sources */,
139 | );
140 | runOnlyForDeploymentPostprocessing = 0;
141 | };
142 | /* End PBXSourcesBuildPhase section */
143 |
144 | /* Begin XCBuildConfiguration section */
145 | FCE6C63D1F0FD04B00EE50DE /* Debug */ = {
146 | isa = XCBuildConfiguration;
147 | buildSettings = {
148 | ALWAYS_SEARCH_USER_PATHS = NO;
149 | CLANG_ANALYZER_NONNULL = YES;
150 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
151 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
152 | CLANG_CXX_LIBRARY = "libc++";
153 | CLANG_ENABLE_MODULES = YES;
154 | CLANG_ENABLE_OBJC_ARC = YES;
155 | CLANG_WARN_BOOL_CONVERSION = YES;
156 | CLANG_WARN_CONSTANT_CONVERSION = YES;
157 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
158 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
159 | CLANG_WARN_EMPTY_BODY = YES;
160 | CLANG_WARN_ENUM_CONVERSION = YES;
161 | CLANG_WARN_INFINITE_RECURSION = YES;
162 | CLANG_WARN_INT_CONVERSION = YES;
163 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
164 | CLANG_WARN_SUSPICIOUS_MOVE = YES;
165 | CLANG_WARN_UNREACHABLE_CODE = YES;
166 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
167 | CODE_SIGN_IDENTITY = "-";
168 | COPY_PHASE_STRIP = NO;
169 | DEBUG_INFORMATION_FORMAT = dwarf;
170 | ENABLE_STRICT_OBJC_MSGSEND = YES;
171 | ENABLE_TESTABILITY = YES;
172 | GCC_C_LANGUAGE_STANDARD = gnu99;
173 | GCC_DYNAMIC_NO_PIC = NO;
174 | GCC_NO_COMMON_BLOCKS = YES;
175 | GCC_OPTIMIZATION_LEVEL = 0;
176 | GCC_PREPROCESSOR_DEFINITIONS = (
177 | "DEBUG=1",
178 | "$(inherited)",
179 | );
180 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
181 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
182 | GCC_WARN_UNDECLARED_SELECTOR = YES;
183 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
184 | GCC_WARN_UNUSED_FUNCTION = YES;
185 | GCC_WARN_UNUSED_VARIABLE = YES;
186 | MACOSX_DEPLOYMENT_TARGET = 10.12;
187 | MTL_ENABLE_DEBUG_INFO = YES;
188 | ONLY_ACTIVE_ARCH = YES;
189 | SDKROOT = macosx;
190 | };
191 | name = Debug;
192 | };
193 | FCE6C63E1F0FD04B00EE50DE /* Release */ = {
194 | isa = XCBuildConfiguration;
195 | buildSettings = {
196 | ALWAYS_SEARCH_USER_PATHS = NO;
197 | CLANG_ANALYZER_NONNULL = YES;
198 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
199 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
200 | CLANG_CXX_LIBRARY = "libc++";
201 | CLANG_ENABLE_MODULES = YES;
202 | CLANG_ENABLE_OBJC_ARC = YES;
203 | CLANG_WARN_BOOL_CONVERSION = YES;
204 | CLANG_WARN_CONSTANT_CONVERSION = YES;
205 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
206 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
207 | CLANG_WARN_EMPTY_BODY = YES;
208 | CLANG_WARN_ENUM_CONVERSION = YES;
209 | CLANG_WARN_INFINITE_RECURSION = YES;
210 | CLANG_WARN_INT_CONVERSION = YES;
211 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
212 | CLANG_WARN_SUSPICIOUS_MOVE = YES;
213 | CLANG_WARN_UNREACHABLE_CODE = YES;
214 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
215 | CODE_SIGN_IDENTITY = "-";
216 | COPY_PHASE_STRIP = NO;
217 | DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
218 | ENABLE_NS_ASSERTIONS = NO;
219 | ENABLE_STRICT_OBJC_MSGSEND = YES;
220 | GCC_C_LANGUAGE_STANDARD = gnu99;
221 | GCC_NO_COMMON_BLOCKS = YES;
222 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
223 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
224 | GCC_WARN_UNDECLARED_SELECTOR = YES;
225 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
226 | GCC_WARN_UNUSED_FUNCTION = YES;
227 | GCC_WARN_UNUSED_VARIABLE = YES;
228 | MACOSX_DEPLOYMENT_TARGET = 10.12;
229 | MTL_ENABLE_DEBUG_INFO = NO;
230 | SDKROOT = macosx;
231 | };
232 | name = Release;
233 | };
234 | FCE6C6401F0FD04B00EE50DE /* Debug */ = {
235 | isa = XCBuildConfiguration;
236 | buildSettings = {
237 | PRODUCT_NAME = "$(TARGET_NAME)";
238 | };
239 | name = Debug;
240 | };
241 | FCE6C6411F0FD04B00EE50DE /* Release */ = {
242 | isa = XCBuildConfiguration;
243 | buildSettings = {
244 | PRODUCT_NAME = "$(TARGET_NAME)";
245 | };
246 | name = Release;
247 | };
248 | /* End XCBuildConfiguration section */
249 |
250 | /* Begin XCConfigurationList section */
251 | FCE6C6331F0FD04B00EE50DE /* Build configuration list for PBXProject "disty" */ = {
252 | isa = XCConfigurationList;
253 | buildConfigurations = (
254 | FCE6C63D1F0FD04B00EE50DE /* Debug */,
255 | FCE6C63E1F0FD04B00EE50DE /* Release */,
256 | );
257 | defaultConfigurationIsVisible = 0;
258 | defaultConfigurationName = Release;
259 | };
260 | FCE6C63F1F0FD04B00EE50DE /* Build configuration list for PBXNativeTarget "disty" */ = {
261 | isa = XCConfigurationList;
262 | buildConfigurations = (
263 | FCE6C6401F0FD04B00EE50DE /* Debug */,
264 | FCE6C6411F0FD04B00EE50DE /* Release */,
265 | );
266 | defaultConfigurationIsVisible = 0;
267 | defaultConfigurationName = Release;
268 | };
269 | /* End XCConfigurationList section */
270 | };
271 | rootObject = FCE6C6301F0FD04B00EE50DE /* Project object */;
272 | }
273 |
--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | The MIT License
3 |
4 | Copyright (c) 2017 Karel Brinda
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining
7 | a copy of this software and associated documentation files (the
8 | "Software"), to deal in the Software without restriction, including
9 | without limitation the rights to use, copy, modify, merge, publish,
10 | distribute, sublicense, and/or sell copies of the Software, and to
11 | permit persons to whom the Software is furnished to do so, subject to
12 | the following conditions:
13 |
14 | The above copyright notice and this permission notice shall be
15 | included in all copies or substantial portions of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | SOFTWARE.
25 | */
26 |
27 | // Fallback version string, used only when DISTY_VERSION is not already defined.
28 | #ifndef DISTY_VERSION
29 | #define DISTY_VERSION "0.1.0"
30 | #endif
31 |
32 | #include
33 | #include
34 | #include
35 | #include
36 | #include
37 | #include
38 | #include
39 | #include
40 | #include
41 | #include
42 | #include
43 |
44 | #include "kseq.h"
45 |
46 | using namespace std;
47 |
48 | // Input alphabets; the enumerator order mirrors the "-i" values listed in USAGE (0: ACGT, 1: 01).
49 | enum class input_t {
50 | ACGT,
51 | BINARY,
52 | _MAX // presumably a count/sentinel rather than a real format -- TODO confirm against the option parser
53 | };
54 | // Strategies for N's; the enumerator order mirrors the "-s" values listed in USAGE (0..4).
55 | enum class n_strategy_t {
56 | IGNORE_PAIRWISE, // 0: ignore pairwisely
57 | IGNORE_PAIRWISE_NORM, // 1: ignore pairwisely and normalize
58 | IGNORE_GLOBALLY, // 2: ignore globally
59 | REPLACE_MAJOR, // 3: replace by the major allele
60 | REPLACE_CLOSEST, // 4: replace by the closest individual (USAGE marks this as not implemented yet)
61 | _MAX // presumably a count/sentinel rather than a real strategy -- TODO confirm
62 | };
63 | // Usage/help text; the option list is duplicated in readme.md, so keep the two in sync.
64 | string USAGE=
65 | "\n"
66 | "Program: Disty McMatrixface - compute a distance matrix from a core genome alignment file\n"
67 | "Version: " DISTY_VERSION "\n"
68 | "Contact: Karel Brinda <kbrinda@hsph.harvard.edu>\n"
69 | "\n"
70 | "Usage: disty <alignment.fa>\n"
71 | "\n"
72 | "Options:\n"
73 | " -n FLOAT skip columns having frequency of N > FLOAT [1.00]\n"
74 | " -i INT input format [0]\n"
75 | " 0: ACGT\n"
76 | " 1: 01\n"
77 | " -s INT strategy to deal with N's [0]\n"
78 | " 0: ignore pairwisely\n"
79 | " 1: ignore pairwisely and normalize\n"
80 | " 2: ignore globally\n"
81 | " 3: replace by the major allele\n"
82 | " 4: replace by the closest individual (not implemented yet)\n"
83 | " -h print help message and exit\n"
84 | " -v print version and exit\n";
85 | // Run-time settings; the constructor's defaults line up with the bracketed defaults shown in USAGE.
86 | struct params_t {
87 | string fasta_fn; // input file name; empty by default
88 | input_t input; // presumably set from -i -- verify against the option parser
89 | n_strategy_t n_strategy; // presumably set from -s
90 | float skip_n; // presumably set from -n
91 |
92 | params_t()
93 | :fasta_fn(""), input(input_t::ACGT), n_strategy(n_strategy_t::IGNORE_PAIRWISE), skip_n(1.0)
94 | {}
95 | };
96 | // Three integer tallies; NOTE(review): presumably kept per pair of sequences -- the code that fills them is outside this view.
97 | struct pair_char_t {
98 | int matches;
99 | int mismatches;
100 | int unknown;
101 | };
102 | // 256-entry byte lookup: 'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' -> 3, every other byte -> 4.
103 | static const uint8_t acgt_nt256_nt4[] = {
104 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
105 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
106 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2,
107 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
108 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4,
109 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
110 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
111 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
112 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
113 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
114 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
115 |
116 |
117 | /*
118 | static const uint8_t acgt_nt256_nt16[] = {
119 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
120 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
121 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
122 | 1 , 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0,15,15,
123 | 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
124 | 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
125 | 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
126 | 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
127 |
128 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
129 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
130 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
131 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
132 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
133 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
134 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
135 | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
136 | };*/
137 |
138 | //static const uint8_t acgt_nt16_nt4[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
139 |
140 |
// Byte -> binary code: '0'=0, '1'=1, anything else=2.
// One row per 16 byte values; the comment gives the first byte of the row.
static const uint8_t binary_nt256_nt4[] = {
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0x00
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0x10
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0x20
    0,1,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0x30: '0', '1'
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0x40
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0x50
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0x60
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0x70

    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0x80
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0x90
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0xA0
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0xB0
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0xC0
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0xD0
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,  // 0xE0
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2   // 0xF0
};
160 |
161 |
// Instantiate the kseq.h parser for gzFile handles read through gzread.
// NOTE(review): presumably this is what declares kseq_t, kseq_init(),
// kseq_read() and kseq_destroy() used by load_sequences() - confirm in kseq.h.
KSEQ_INIT(gzFile, gzread)
163 |
164 |
165 | /*
166 | * Parse arguments.
167 | */
168 | void parse_arguments(int argc, const char **argv, params_t ¶ms) {
169 | if (argc==1){
170 | cerr << USAGE << endl;
171 | exit(1);
172 | }
173 |
174 | int c;
175 | while ((c = getopt(argc, (char *const *)argv, "hvi:s:n:")) >= 0) {
176 | switch (c) {
177 | case 'h': {
178 | cout << USAGE << endl;
179 | exit(0);
180 | }
181 | case 'v': {
182 | cout << DISTY_VERSION << endl;
183 | exit(0);
184 | }
185 | case 'i': {
186 | int val=atoi(optarg);
187 | assert(val>=0);
188 | assert(val<(int)input_t::_MAX);
189 | params.input=static_cast(val);
190 | break;
191 | }
192 | case 's': {
193 | int val=atoi(optarg);
194 | assert(val>=0);
195 | assert(val<(int)n_strategy_t::_MAX);
196 | params.n_strategy=static_cast(val);
197 | break;
198 | }
199 | case 'n': {
200 | float val=atof(optarg);
201 | assert(val>=0.0);
202 | assert(val<=1.0);
203 | params.skip_n=val;
204 | break;
205 | }
206 | case '?': {
207 | cerr << "Unknown error" << endl;
208 | exit(1);
209 | }
210 | default: {
211 | cerr << "Unknown option " << c << endl;
212 | exit(1);
213 | }
214 | }
215 | }
216 |
217 | argc -= optind;
218 | argv += optind;
219 |
220 | if(argc != 1){
221 | cerr << USAGE << endl;
222 | exit(1);
223 | }
224 | else {
225 | params.fasta_fn=string(argv[0]);
226 | }
227 | }
228 |
229 |
230 | /*
231 | * Load sequences and convert nucleotides to upper case.
232 | */
233 | template
234 | void load_sequences(const string &fasta_fn, T &names, T &seqs) {
235 | gzFile fp;
236 | kseq_t *seq;
237 | int l;
238 | fp = gzopen(fasta_fn.c_str(), "r");
239 | assert (fp != nullptr);
240 | seq = kseq_init(fp);
241 |
242 | int len=0; // length of sequences (for checking)
243 |
244 | while ((l = kseq_read(seq)) >= 0) {
245 | names.push_back(seq->name.s);
246 | string s(seq->seq.s);
247 | for (auto & c: s) {
248 | c = toupper(c);
249 | }
250 | if(len!=0){
251 | assert(len==static_cast(s.size()));
252 | }
253 | else{
254 | len=(int)s.size();
255 | }
256 |
257 | for(char &a: s){
258 | assert ((int)a<128);
259 | }
260 |
261 | seqs.push_back(s);
262 | }
263 | kseq_destroy(seq);
264 | gzclose(fp);
265 |
266 | assert(seqs.size()>0);
267 | }
268 |
/*
 * Debug helper: print every sequence on its own line to stderr, framed by
 * one empty line before and after.
 */
template <typename T>
void print_sequences(const T &seqs) {
    cerr << endl;
    for (const auto &s: seqs){
        cerr << s << endl;
    }
    cerr << endl;
}
277 |
/*
 * Compute pileup (len x 128).
 *
 * pileup[i][c] is set to the number of sequences having character c at
 * position i. The pileup must already have one 128-wide row per position of
 * the (equally long) sequences; its previous content is overwritten.
 */
template <typename T, typename U>
void compute_pileup(const T &seqs, U &pileup) {
    assert(!seqs.empty());
    assert(seqs[0].size()==pileup.size());
    assert(pileup.empty() || pileup[0].size()==128);

    const size_t len=seqs[0].size();

    for(size_t i=0; i<len; i++){
        for(int c=0; c<128; c++){
            pileup[i][c]=0;
        }
    }

    for (const auto &seq: seqs){
        for(size_t i=0; i<len; i++){
            const unsigned char c=seq[i];
            assert(c<128);  // a row has only 128 columns
            ++pileup[i][c];
        }
    }
}
299 |
300 | template
301 | void print_pileup(const T &pileup){
302 | assert(pileup[0].size()==128);
303 | for (int i=0;i
/*
 * Compute the consensus (major allele) of every column.
 *
 * consensus[i] becomes the most frequent character other than 'N' in column i
 * (the lowest character code wins ties). A column in which no other character
 * occurs keeps 'N'; only characters that actually occur can be selected.
 *
 *   pileup    - len x 128 character counts
 *   consensus - string of length len, overwritten
 */
template <typename T>
void compute_consensus(const T &pileup, string &consensus) {
    assert(pileup.size()==consensus.size());
    assert(pileup.empty() || pileup[0].size()==128);

    for(size_t i=0; i<pileup.size(); i++){
        char c='N';
        // Start at 0, not -1: otherwise character code 0 wins with a count of
        // zero and an all-N column yields '\0' instead of 'N'.
        int max_freq=0;
        const auto &column=pileup[i];

        for(int d=0;d<128;d++){
            if(d!='N' && column[d]>max_freq){
                max_freq=column[d];
                c=(char)d;
            }
        }

        consensus[i]=c;
    }
}
339 |
// Debug helper: write the consensus string and a newline to stdout, then flush.
void print_consensus(const string &consensus){
    cout << consensus;
    cout << '\n' << flush;
}
343 |
/*
 * Compute mask.
 *
 * A column is masked when its number of N's (upper or lower case) is
 * >= n_thres.
 *
 * 0 - position ignored
 * N - position non-ignored, containing Ns
 * 1 - position non-ignored
 *
 *   mask    - string with one character per column, overwritten
 *   pileup  - len x 128 character counts
 *   n_thres - minimal number of N's for a column to be masked
 *
 * A one-line summary is printed to stderr.
 */
template <typename T>
void compute_mask(string &mask, const T &pileup, int n_thres) {
    assert(pileup.size()==mask.size());
    assert(pileup.empty() || pileup[0].size()==128);

    // The counts of a column add up to the number of samples; guard against
    // an empty pileup, which has no first column to sum.
    int column_sum=0;
    if(!pileup.empty()){
        column_sum=accumulate(pileup[0].begin(), pileup[0].end(), 0);
    }

    int masked_columns=0;

    for(size_t i=0; i<pileup.size(); i++){
        const int ns=pileup[i]['n']+pileup[i]['N'];
        if (ns >= n_thres){
            mask[i]='0';
            masked_columns++;
        }
        else if (ns>0){
            mask[i]='N';
        }
        else{
            mask[i]='1';
        }
    }

    cerr << "Number of masked columns: " << masked_columns << " (out of " << pileup.size() << " positions, threshold: " << n_thres << " Ns, number of samples: " << column_sum << ")" << endl;
}
381 |
// Debug helper: write the mask string and a newline to stderr, then flush.
void print_mask(const string &mask){
    cerr << mask;
    cerr << '\n' << flush;
}
385 |
386 |
387 | /*
388 | * Compute pair matrix (128 x 128).
389 | */
390 | template
391 | void compute_pair_matrix(const string &seq1, const string &seq2, const string &mask, T &pair_matrix){
392 | assert(seq1.size()==seq2.size());
393 | assert(seq1.size()==mask.size());
394 | assert(pair_matrix.size()==128);
395 | assert(pair_matrix[0].size()==128);
396 |
397 | for(int i=0; i<128; i++){
398 | for(int j=0; j<128; j++){
399 | pair_matrix[i][j]=0;
400 | }
401 | }
402 | int len=(int)seq1.size();
403 | for (int i=0;i
418 | void pair_matrix_char_acgt(T &pair_matrix, pair_char_t &pair_char) {
419 | assert(pair_matrix.size()==128);
420 | assert(pair_matrix[0].size()==128);
421 |
422 | pair_char.matches=0;
423 | pair_char.mismatches=0;
424 | pair_char.unknown=0;
425 |
426 | for(unsigned char i=0;i<128;i++){
427 | char n1_nt4=acgt_nt256_nt4[i];
428 | for(unsigned char j=0;j<128;j++){
429 | char n2_nt4=acgt_nt256_nt4[j];
430 |
431 | if (n1_nt4==4 || n2_nt4==4){
432 | pair_char.unknown+=pair_matrix[i][j];
433 | }
434 | else{
435 | if(n1_nt4==n2_nt4){
436 | pair_char.matches+=pair_matrix[i][j];
437 | }
438 | else {
439 | pair_char.mismatches+=pair_matrix[i][j];
440 | }
441 | }
442 | }
443 | }
444 | //cout << pair_char.matches << " " << pair_char.mismatches << " " << pair_char.unknown << endl;
445 | }
446 |
447 | template
448 | void pair_matrix_char_binary(T &pair_matrix, pair_char_t &pair_char) {
449 | assert(pair_matrix.size()==128);
450 | assert(pair_matrix[0].size()==128);
451 |
452 | pair_char.matches=0;
453 | pair_char.mismatches=0;
454 | pair_char.unknown=0;
455 |
456 | for(unsigned char i=0;i<128;i++){
457 | char n1_nt4=binary_nt256_nt4[i];
458 | for(unsigned char j=0;j<128;j++){
459 | char n2_nt4=binary_nt256_nt4[j];
460 |
461 | if (n1_nt4==2 || n2_nt4==2){
462 | pair_char.unknown++;
463 | }
464 | else{
465 | if(n1_nt4+n2_nt4==1)
466 | {
467 | pair_char.matches++;
468 | }
469 | else {
470 | pair_char.mismatches++;
471 | }
472 | }
473 | }
474 | }
475 | }
476 |
477 | void print_pair_matrix_char(pair_char_t &pair_char){
478 | cerr << pair_char.matches << "\t" << pair_char.mismatches << "\t" << pair_char.unknown << endl;
479 | }
480 |
481 | /*
482 | * Compute distance.
483 | */
484 |
485 | int distance(const pair_char_t &pair_char) {
486 | return pair_char.mismatches;
487 | }
488 |
489 | int distance_norm(const pair_char_t &pair_char) {
490 | float multiplicator=1.0*(pair_char.matches+pair_char.mismatches+pair_char.unknown)/(pair_char.matches+pair_char.mismatches);
491 | return round(multiplicator * pair_char.mismatches);
492 | }
493 |
494 |
495 | template
496 | void print_distance_matrix(const T &distance_matrix, const U &names){
497 | assert(distance_matrix.size() == distance_matrix[0].size());
498 | assert(distance_matrix.size() == names.size());
499 | int count=static_cast(distance_matrix.size());
500 |
501 | //cout << "";
502 | for (const string& name : names){
503 | cout << "\t" << name;
504 | }
505 | cout << endl;
506 |
507 | for (int i=0;i names, seqs;
525 | load_sequences(params.fasta_fn, names, seqs);
526 | //print_sequences(seqs);
527 |
528 | int count=(int)seqs.size();
529 | int len=(int)seqs[0].size();
530 |
531 | cerr << "Computing pileup" << endl;
532 | vector> pileup(len, vector(128));
533 | compute_pileup(seqs, pileup);
534 | //print_pileup(pileup);
535 |
536 | string consensus(len, '?');
537 | if(params.n_strategy==n_strategy_t::REPLACE_MAJOR){
538 | cerr << "Computing consensus" << endl;
539 | compute_consensus(pileup, consensus);
540 | //print_consensus(consensus);
541 | }
542 |
543 |
544 | cerr << "Computing mask" << endl;
545 | string mask(len, '?');
546 | if(params.n_strategy==n_strategy_t::IGNORE_GLOBALLY){
547 | compute_mask(mask, pileup, 1);
548 | }
549 | else{
550 | int min_n=ceil(count*params.skip_n);
551 | compute_mask(mask, pileup, min_n);
552 | }
553 | //print_mask(mask);
554 |
555 |
556 | cerr << "Computing distance matrix" << endl;
557 | vector> pair_matrix(128, vector(128, 0));
558 | vector> distance_matrix(count, vector(count, 0));
559 | pair_char_t pair_char;
560 |
561 | if(params.n_strategy==n_strategy_t::REPLACE_MAJOR){
562 | cerr << "Replacing N's by major alleles" << endl;
563 | for (auto &seq: seqs){
564 | for(int i=0;i(seq.size());i++){
565 | if (seq[i]=='N'||seq[i]=='N'){
566 | seq[i]=consensus[i];
567 | }
568 | }
569 | }
570 | }
571 |
572 |
573 | /*
574 | * For each pair:
575 | */
576 | for(int i=0;i