├── .gitignore ├── .gitmodules ├── .travis.yml ├── Dockerfile ├── IntervalTree.h ├── LICENSE ├── Makefile ├── README.md ├── ReleaseNotes.md ├── build-tools ├── CMakeLists.txt ├── makeBinRelease ├── makeSrcRelease ├── quayTagRelease └── releaseLib.sh ├── clip-vg.cpp ├── count-vg-hap-cov.cpp ├── filter-paf-deletions.cpp ├── hal2vg.cpp ├── halMergeChroms.cpp ├── halRemoveDupes.cpp ├── halUnclip.cpp ├── include.mk ├── paf.hpp ├── subpaths.h └── tests ├── bash-tap ├── Changes ├── README.mkdn ├── bash-tap ├── bash-tap-bootstrap └── bash-tap-mock ├── chop ├── tiny-flat.gfa └── tiny-rev.gfa ├── small ├── small.maf ├── small2.maf └── truth.json └── t ├── chop.t ├── merge.t └── small.t /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | *.smod 19 | 20 | # Compiled Static libraries 21 | *.lai 22 | *.la 23 | *.a 24 | *.lib 25 | 26 | # Executables 27 | *.exe 28 | *.out 29 | *.app 30 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "deps/hal"] 2 | path = deps/hal 3 | url = https://github.com/ComparativeGenomicsToolkit/hal.git 4 | [submodule "deps/sonLib"] 5 | path = deps/sonLib 6 | url = https://github.com/ComparativeGenomicsToolkit/sonLib.git 7 | [submodule "deps/libbdsg-easy"] 8 | path = deps/libbdsg-easy 9 | url = https://github.com/vgteam/libbdsg-easy.git 10 | [submodule "deps/pinchesAndCacti"] 11 | path = deps/pinchesAndCacti 12 | url = https://github.com/ComparativeGenomicsToolkit/pinchesAndCacti.git 13 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | # Control file for continuous integration testing at http://travis-ci.org/ 2 | 3 | language: cpp 4 | compiler: gcc 5 | 6 | before_install: 7 | - git submodule update --init --recursive 8 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libomp; fi 9 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo apt-get -qq update; fi 10 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo apt-get install -y libhdf5-serial-dev python3 python3-pip libpython3-dev wget; fi 11 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install hdf5 python3.6 python3-pip || echo "a brew error code when installing gcc is expected"; fi 12 | 13 | install: 14 | - sudo pip3 install setuptools --upgrade 15 | - wget https://github.com/vgteam/vg/releases/download/v1.30.0/vg && chmod u+x vg 16 | 17 | script: 18 | - export PATH=$(pwd):$PATH 19 | - export PATH=$(pwd)/deps/hal/bin:$PATH 20 | - make test 21 | 22 | dist: bionic 23 | osx_image: xcode10.1 24 | 25 | matrix: 26 | include: 27 | - os: linux 28 | compiler: gcc 29 | #- os: osx 30 | # compiler: clang 31 | 32 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # creates an image containing vg and hal2vg 2 | 3 | # build on compatible vg image 4 | # (this is for convenience of having vg and hal2vg in the same image, as hal2vg no longer depends on vg to build) 5 | FROM quay.io/vgteam/vg:v1.25.0 6 | 7 | # update system and install dependencies not present in vg image 8 | RUN apt-get -qq update && apt-get -qq install -y libhdf5-dev build-essential python3-dev python3-pip cmake libz-dev pkg-config git 9 | 10 | # copy current directory to docker 11 | ADD . 
/hal2vg 12 | 13 | # set working directory 14 | WORKDIR /hal2vg 15 | 16 | # build 17 | RUN make clean ; make 18 | 19 | # add hal2vg to the PATH 20 | ENV PATH /hal2vg:/hal2vg/deps/hal/bin:$PATH 21 | -------------------------------------------------------------------------------- /IntervalTree.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file was taken from 3 | https://github.com/ekg/intervaltree/commit/aa5937755000f1cd007402d03b6f7ce4427c5d21 4 | 5 | It has the following license: 6 | 7 | Copyright (c) 2011 Erik Garrison 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy of 10 | this software and associated documentation files (the "Software"), to deal in 11 | the Software without restriction, including without limitation the rights to 12 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 13 | of the Software, and to permit persons to whom the Software is furnished to do 14 | so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in all 17 | copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 
26 | */ 27 | 28 | #ifndef __INTERVAL_TREE_H 29 | #define __INTERVAL_TREE_H 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #ifdef USE_INTERVAL_TREE_NAMESPACE 39 | namespace interval_tree { 40 | #endif 41 | template 42 | class Interval { 43 | public: 44 | Scalar start; 45 | Scalar stop; 46 | Value value; 47 | Interval(const Scalar& s, const Scalar& e, const Value& v) 48 | : start(std::min(s, e)) 49 | , stop(std::max(s, e)) 50 | , value(v) 51 | {} 52 | }; 53 | 54 | template 55 | Value intervalStart(const Interval& i) { 56 | return i.start; 57 | } 58 | 59 | template 60 | Value intervalStop(const Interval& i) { 61 | return i.stop; 62 | } 63 | 64 | template 65 | std::ostream& operator<<(std::ostream& out, const Interval& i) { 66 | out << "Interval(" << i.start << ", " << i.stop << "): " << i.value; 67 | return out; 68 | } 69 | 70 | template 71 | class IntervalTree { 72 | public: 73 | typedef Interval interval; 74 | typedef std::vector interval_vector; 75 | 76 | 77 | struct IntervalStartCmp { 78 | bool operator()(const interval& a, const interval& b) { 79 | return a.start < b.start; 80 | } 81 | }; 82 | 83 | struct IntervalStopCmp { 84 | bool operator()(const interval& a, const interval& b) { 85 | return a.stop < b.stop; 86 | } 87 | }; 88 | 89 | IntervalTree() 90 | : left(nullptr) 91 | , right(nullptr) 92 | , center(0) 93 | {} 94 | 95 | ~IntervalTree() = default; 96 | 97 | std::unique_ptr clone() const { 98 | return std::unique_ptr(new IntervalTree(*this)); 99 | } 100 | 101 | IntervalTree(const IntervalTree& other) 102 | : intervals(other.intervals), 103 | left(other.left ? other.left->clone() : nullptr), 104 | right(other.right ? 
other.right->clone() : nullptr), 105 | center(other.center) 106 | {} 107 | 108 | IntervalTree& operator=(IntervalTree&&) = default; 109 | IntervalTree(IntervalTree&&) = default; 110 | 111 | IntervalTree& operator=(const IntervalTree& other) { 112 | center = other.center; 113 | intervals = other.intervals; 114 | left = other.left ? other.left->clone() : nullptr; 115 | right = other.right ? other.right->clone() : nullptr; 116 | return *this; 117 | } 118 | 119 | IntervalTree( 120 | interval_vector ivals, 121 | std::size_t depth = 16, 122 | std::size_t minbucket = 64, 123 | std::size_t maxbucket = 512, 124 | Scalar leftextent = 0, 125 | Scalar rightextent = 0) 126 | : left(nullptr) 127 | , right(nullptr) 128 | { 129 | --depth; 130 | const auto minmaxStop = std::minmax_element(ivals.begin(), ivals.end(), 131 | IntervalStopCmp()); 132 | const auto minmaxStart = std::minmax_element(ivals.begin(), ivals.end(), 133 | IntervalStartCmp()); 134 | if (!ivals.empty()) { 135 | center = (minmaxStart.first->start + minmaxStop.second->stop) / 2; 136 | } 137 | if (leftextent == 0 && rightextent == 0) { 138 | // sort intervals by start 139 | std::sort(ivals.begin(), ivals.end(), IntervalStartCmp()); 140 | } else { 141 | assert(std::is_sorted(ivals.begin(), ivals.end(), IntervalStartCmp())); 142 | } 143 | if (depth == 0 || (ivals.size() < minbucket && ivals.size() < maxbucket)) { 144 | std::sort(ivals.begin(), ivals.end(), IntervalStartCmp()); 145 | intervals = std::move(ivals); 146 | assert(is_valid().first); 147 | return; 148 | } else { 149 | Scalar leftp = 0; 150 | Scalar rightp = 0; 151 | 152 | if (leftextent || rightextent) { 153 | leftp = leftextent; 154 | rightp = rightextent; 155 | } else { 156 | leftp = ivals.front().start; 157 | rightp = std::max_element(ivals.begin(), ivals.end(), 158 | IntervalStopCmp())->stop; 159 | } 160 | 161 | interval_vector lefts; 162 | interval_vector rights; 163 | 164 | for (typename interval_vector::const_iterator i = ivals.begin(); 165 | i != 
ivals.end(); ++i) { 166 | const interval& interval = *i; 167 | if (interval.stop < center) { 168 | lefts.push_back(interval); 169 | } else if (interval.start > center) { 170 | rights.push_back(interval); 171 | } else { 172 | assert(interval.start <= center); 173 | assert(center <= interval.stop); 174 | intervals.push_back(interval); 175 | } 176 | } 177 | 178 | if (!lefts.empty()) { 179 | left.reset(new IntervalTree(std::move(lefts), 180 | depth, minbucket, maxbucket, 181 | leftp, center)); 182 | } 183 | if (!rights.empty()) { 184 | right.reset(new IntervalTree(std::move(rights), 185 | depth, minbucket, maxbucket, 186 | center, rightp)); 187 | } 188 | } 189 | assert(is_valid().first); 190 | } 191 | 192 | // Call f on all intervals near the range [start, stop]: 193 | template 194 | void visit_near(const Scalar& start, const Scalar& stop, UnaryFunction f) const { 195 | if (!intervals.empty() && ! (stop < intervals.front().start)) { 196 | for (auto & i : intervals) { 197 | f(i); 198 | } 199 | } 200 | if (left && start <= center) { 201 | left->visit_near(start, stop, f); 202 | } 203 | if (right && stop >= center) { 204 | right->visit_near(start, stop, f); 205 | } 206 | } 207 | 208 | // Call f on all intervals crossing pos 209 | template 210 | void visit_overlapping(const Scalar& pos, UnaryFunction f) const { 211 | visit_overlapping(pos, pos, f); 212 | } 213 | 214 | // Call f on all intervals overlapping [start, stop] 215 | template 216 | void visit_overlapping(const Scalar& start, const Scalar& stop, UnaryFunction f) const { 217 | auto filterF = [&](const interval& interval) { 218 | if (interval.stop >= start && interval.start <= stop) { 219 | // Only apply f if overlapping 220 | f(interval); 221 | } 222 | }; 223 | visit_near(start, stop, filterF); 224 | } 225 | 226 | // Call f on all intervals contained within [start, stop] 227 | template 228 | void visit_contained(const Scalar& start, const Scalar& stop, UnaryFunction f) const { 229 | auto filterF = [&](const 
interval& interval) { 230 | if (start <= interval.start && interval.stop <= stop) { 231 | f(interval); 232 | } 233 | }; 234 | visit_near(start, stop, filterF); 235 | } 236 | 237 | interval_vector findOverlapping(const Scalar& start, const Scalar& stop) const { 238 | interval_vector result; 239 | visit_overlapping(start, stop, 240 | [&](const interval& interval) { 241 | result.emplace_back(interval); 242 | }); 243 | return result; 244 | } 245 | 246 | interval_vector findContained(const Scalar& start, const Scalar& stop) const { 247 | interval_vector result; 248 | visit_contained(start, stop, 249 | [&](const interval& interval) { 250 | result.push_back(interval); 251 | }); 252 | return result; 253 | } 254 | bool empty() const { 255 | if (left && !left->empty()) { 256 | return false; 257 | } 258 | if (!intervals.empty()) { 259 | return false; 260 | } 261 | if (right && !right->empty()) { 262 | return false; 263 | } 264 | return true; 265 | } 266 | 267 | template 268 | void visit_all(UnaryFunction f) const { 269 | if (left) { 270 | left->visit_all(f); 271 | } 272 | std::for_each(intervals.begin(), intervals.end(), f); 273 | if (right) { 274 | right->visit_all(f); 275 | } 276 | } 277 | 278 | std::pair extentBruitForce() const { 279 | struct Extent { 280 | std::pair x = {std::numeric_limits::max(), 281 | std::numeric_limits::min() }; 282 | void operator()(const interval & interval) { 283 | x.first = std::min(x.first, interval.start); 284 | x.second = std::max(x.second, interval.stop); 285 | } 286 | }; 287 | Extent extent; 288 | 289 | visit_all([&](const interval & interval) { extent(interval); }); 290 | return extent.x; 291 | } 292 | 293 | // Check all constraints. 294 | // If first is false, second is invalid. 
295 | std::pair> is_valid() const { 296 | const auto minmaxStop = std::minmax_element(intervals.begin(), intervals.end(), 297 | IntervalStopCmp()); 298 | const auto minmaxStart = std::minmax_element(intervals.begin(), intervals.end(), 299 | IntervalStartCmp()); 300 | 301 | std::pair> result = {true, { std::numeric_limits::max(), 302 | std::numeric_limits::min() }}; 303 | if (!intervals.empty()) { 304 | result.second.first = std::min(result.second.first, minmaxStart.first->start); 305 | result.second.second = std::min(result.second.second, minmaxStop.second->stop); 306 | } 307 | if (left) { 308 | auto valid = left->is_valid(); 309 | result.first &= valid.first; 310 | result.second.first = std::min(result.second.first, valid.second.first); 311 | result.second.second = std::min(result.second.second, valid.second.second); 312 | if (!result.first) { return result; } 313 | if (valid.second.second >= center) { 314 | result.first = false; 315 | return result; 316 | } 317 | } 318 | if (right) { 319 | auto valid = right->is_valid(); 320 | result.first &= valid.first; 321 | result.second.first = std::min(result.second.first, valid.second.first); 322 | result.second.second = std::min(result.second.second, valid.second.second); 323 | if (!result.first) { return result; } 324 | if (valid.second.first <= center) { 325 | result.first = false; 326 | return result; 327 | } 328 | } 329 | if (!std::is_sorted(intervals.begin(), intervals.end(), IntervalStartCmp())) { 330 | result.first = false; 331 | } 332 | return result; 333 | } 334 | 335 | friend std::ostream& operator<<(std::ostream& os, const IntervalTree& itree) { 336 | return writeOut(os, itree); 337 | } 338 | 339 | friend std::ostream& writeOut(std::ostream& os, const IntervalTree& itree, 340 | std::size_t depth = 0) { 341 | auto pad = [&]() { for (std::size_t i = 0; i != depth; ++i) { os << ' '; } }; 342 | pad(); os << "center: " << itree.center << '\n'; 343 | for (const interval & inter : itree.intervals) { 344 | pad(); os << 
inter << '\n'; 345 | } 346 | if (itree.left) { 347 | pad(); os << "left:\n"; 348 | writeOut(os, *itree.left, depth + 1); 349 | } else { 350 | pad(); os << "left: nullptr\n"; 351 | } 352 | if (itree.right) { 353 | pad(); os << "right:\n"; 354 | writeOut(os, *itree.right, depth + 1); 355 | } else { 356 | pad(); os << "right: nullptr\n"; 357 | } 358 | return os; 359 | } 360 | 361 | private: 362 | interval_vector intervals; 363 | std::unique_ptr left; 364 | std::unique_ptr right; 365 | Scalar center; 366 | }; 367 | #ifdef USE_INTERVAL_TREE_NAMESPACE 368 | } 369 | #endif 370 | 371 | #endif 372 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (C) 2020 by UCSC Computational Genomics Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # simplest possible to start. dangerous since all header deps done manually. ne 2 | rootPath = ./ 3 | include ./include.mk 4 | 5 | all : hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions count-vg-hap-cov 6 | 7 | # Note: hdf5 from apt doesn't seem to work for static builds. It should be installed 8 | # from source and configured with "--enable-static --disable-shared", then have its 9 | # bin put at the front of PATH 10 | static: 11 | CFLAGS="$${CFLAGS} -static" \ 12 | CXXFLAGS="$${CXXFLAGS} -static" \ 13 | ${MAKE} all 14 | 15 | check-static: static 16 | if [ $(shell ls hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions count-vg-hap-cov | xargs ldd 2>& 1 | grep "not a dynamic" | wc -l) = $(shell ls hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions count-vg-hap-cov | wc -l) ] ; then\ 17 | echo "ldd verified that all files in bin/ are static";\ 18 | else\ 19 | echo "ldd found dynamic linked binary in bin/";\ 20 | exit 1;\ 21 | fi 22 | 23 | cleanFast : 24 | rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o halUnclip halUnclip.o filter-paf-deletions filter-paf-deletions.o count-vg-hap-cov.o count-vg-hap-cov 25 | 26 | clean : 27 | rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o halUnclip halUnclip.o filter-paf-deletions filter-paf-deletions.o 28 | cd deps/sonLib && make clean 29 | cd deps/pinchesAndCacti && make clean 30 | cd deps/hal && make clean 31 | cd deps/libbdsg-easy && make clean 32 | 33 | hal2vg.o : hal2vg.cpp ${basicLibsDependencies} 34 | ${cpp} ${CXXFLAGS} -I . 
hal2vg.cpp -c 35 | 36 | ${sonLibPath}/sonLib.a : 37 | cd deps/sonLib && make 38 | 39 | ${halPath}/libHal.a : ${sonLibPath}/sonLib.a 40 | cd deps/hal && make 41 | 42 | ${sonLibPath}/stPinchesAndCacti.a : ${sonLibPath}/sonLib.a 43 | cd deps/pinchesAndCacti && make 44 | 45 | ${libbdsgPath}/lib/libbdsg.a : 46 | cd deps/libbdsg-easy && make 47 | 48 | hal2vg : hal2vg.o ${basicLibsDependencies} 49 | ${cpp} ${CXXFLAGS} -fopenmp -pthread hal2vg.o ${basicLibs} -o hal2vg 50 | 51 | clip-vg.o : clip-vg.cpp ${basicLibsDependencies} 52 | ${cpp} ${CXXFLAGS} -I . clip-vg.cpp -c 53 | 54 | clip-vg : clip-vg.o ${basicLibsDependencies} 55 | ${cpp} ${CXXFLAGS} -fopenmp -pthread clip-vg.o ${basicLibs} -o clip-vg 56 | 57 | halRemoveDupes.o : halRemoveDupes.cpp ${basicLibsDependencies} 58 | ${cpp} ${CXXFLAGS} -I . halRemoveDupes.cpp -c 59 | 60 | halRemoveDupes : halRemoveDupes.o ${basicLibsDependencies} 61 | ${cpp} ${CXXFLAGS} -fopenmp -pthread halRemoveDupes.o ${basicLibs} -o halRemoveDupes 62 | 63 | halMergeChroms.o : halMergeChroms.cpp ${basicLibsDependencies} 64 | ${cpp} ${CXXFLAGS} -I . halMergeChroms.cpp -c 65 | 66 | halMergeChroms : halMergeChroms.o ${basicLibsDependencies} 67 | ${cpp} ${CXXFLAGS} -fopenmp -pthread halMergeChroms.o ${basicLibs} -o halMergeChroms 68 | 69 | halUnclip.o : halUnclip.cpp subpaths.h ${basicLibsDependencies} 70 | ${cpp} ${CXXFLAGS} -I . halUnclip.cpp -c 71 | 72 | halUnclip : halUnclip.o ${basicLibsDependencies} 73 | ${cpp} ${CXXFLAGS} -fopenmp -pthread halUnclip.o ${basicLibs} -o halUnclip 74 | 75 | filter-paf-deletions.o : filter-paf-deletions.cpp subpaths.h paf.hpp ${basicLibsDependencies} 76 | ${cpp} ${CXXFLAGS} -I . filter-paf-deletions.cpp -c 77 | 78 | filter-paf-deletions : filter-paf-deletions.o ${basicLibsDependencies} 79 | ${cpp} ${CXXFLAGS} -fopenmp -pthread filter-paf-deletions.o ${basicLibs} -o filter-paf-deletions 80 | 81 | count-vg-hap-cov.o : count-vg-hap-cov.cpp ${basicLibsDependencies} 82 | ${cpp} ${CXXFLAGS} -I . 
count-vg-hap-cov.cpp -c 83 | 84 | count-vg-hap-cov : count-vg-hap-cov.o ${basicLibsDependencies} 85 | ${cpp} ${CXXFLAGS} -fopenmp -pthread count-vg-hap-cov.o ${basicLibs} -o count-vg-hap-cov 86 | 87 | test : 88 | make 89 | cd tests && prove -v t 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hal2vg 2 | [![Build Status](https://travis-ci.org/ComparativeGenomicsToolkit/hal2vg.svg?branch=master)](https://travis-ci.org/ComparativeGenomicsToolkit/hal2vg) 3 | 4 | Convert [HAL](https://github.com/glennhickey/hal) to [vg](https://github.com/vgteam/vg)-compatible sequence graph. 5 | 6 | Supports the three sequence graph formats in [libbdsg](https://github.com/vgteam/libbdsg): 7 | * PackedGraph (default) 8 | * ODGI 9 | * HashGraph 10 | 11 | ## Algorithm 12 | 13 | 1. Each sequence in the HAL is added as a thread to a [Pinch Graph](https://github.com/ComparativeGenomicsToolkit/pinchesAndCacti). 14 | 2. Exact pairwise alignment blocks (no gaps or substitutions) are extracted from each branch in the HAL tree and "pinched" in the graph 15 | 3. For each branch, bases in the child that have substitutions in the parent (snps) are aligned across the tree using the column iterator and all exact matches are extracted and pinched. 16 | 4. Pinch graph is cleaned up by merging trivial joins 17 | 5. Each HAL sequence is traced through the pinch graph, adding nodes and edges to the output sequence graph. A table is maintained to map each pinch graph block to a sequence graph node. 18 | 19 | ## Suggested Postprocessing: 20 | 21 | * Sort the output with `vg ids --sort`. 22 | 23 | ## Installation 24 | 25 | ### Binary Release 26 | 27 | You can download a standalone binary for the latest release [here](https://github.com/ComparativeGenomicsToolkit/hal2vg/releases). 
28 | 29 | ### Compile From Source 30 | 31 | You can use the [Dockerfile](Dockerfile) as a guide to see how all dependencies are installed with `apt` on Ubuntu. More details on installing HDF5 can be found in the [HAL README](https://github.com/ComparativeGenomicsToolkit/hal) 32 | 33 | **Cloning:** Don't forget to clone submodules with the `--recursive` option: 34 | 35 | git clone https://github.com/glennhickey/hal2vg.git --recursive 36 | 37 | **Compiling:** 38 | 39 | make 40 | 41 | ## Usage 42 | 43 | It is required to use the `--inMemory` option for all but trivial inputs. 44 | 45 | `vg` has been tuned to work best on graphs with nodes chopped to at most 32 bases. It is therefore recommended to use the `--chop 32` option. 46 | 47 | ``` 48 | hal2vg input.hal --inMemory --chop 32 --progress > output.pg 49 | ``` 50 | 51 | **Note**: The output graph is only readable by vg version 1.24.0 and greater. 52 | 53 | Copyright (C) 2020 by UCSC Computational Genomics Lab 54 | 55 | -------------------------------------------------------------------------------- /ReleaseNotes.md: -------------------------------------------------------------------------------- 1 | # Release 1.0.1 2020-09-07 2 | 3 | This release contains a bugfix required to use the subsetting options such as `--targetGenomes` and `--ignoreGenomes` without crashing. 4 | 5 | # Release 1.0.0 2020-08-19 6 | 7 | This release uses Cactus's Pinch Graph library to create the sequence graph. 
8 | 9 | Notable Changes: 10 | - Bespoke STL-based structures and algorithms replaced with Pinch Graph library 11 | - SNPs are aligned using the column iterator instead of tables 12 | - Much more performant than original implementation, but still only tested on smallish graphs 13 | -------------------------------------------------------------------------------- /build-tools/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Specify the minimum version for CMake 2 | cmake_minimum_required(VERSION 3.10) 3 | 4 | # This defines default install directories like "lib" 5 | include(GNUInstallDirs) 6 | 7 | # Project's name 8 | project(libhandlegraph) 9 | # We build using c++14 10 | set(CMAKE_CXX_STANDARD 14) 11 | 12 | # Use all standard-compliant optimizations 13 | set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -g") 14 | set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 15 | 16 | # Let cmake decide where to put the output files, allowing for out-of-tree builds. 17 | 18 | if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) 19 | # We are probably an external project. Don't use @rpath in Mac builds' 20 | # install_name fields (AKA LC_ID_DYLIB in otool -l output). Populate with 21 | # an absolute path instead. This will let us actually find the library when 22 | # we use it as a CMake external project and don't fully install it to any 23 | # normal lib directory. 24 | message("libhandlegraph is root project or external_project") 25 | set (CMAKE_MACOSX_RPATH OFF) 26 | else() 27 | # We are probably an add_subdirectory. We will expect to be in the root 28 | # project's lib directory, so we do want to have our non-installed 29 | # install_name use @rpath. 30 | message("libhandlegraph is add_subdirectory project") 31 | set (CMAKE_MACOSX_RPATH ON) 32 | endif() 33 | 34 | # The install_name gets modified on installation to be this. 
35 | set (CMAKE_INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") 36 | 37 | add_library(handlegraph_objs OBJECT 38 | src/deletable_handle_graph.cpp 39 | src/handle_graph.cpp 40 | src/mutable_handle_graph.cpp 41 | src/path_metadata.cpp 42 | src/mutable_path_metadata.cpp 43 | src/path_handle_graph.cpp 44 | src/path_position_handle_graph.cpp 45 | src/mutable_path_handle_graph.cpp 46 | src/ranked_handle_graph.cpp 47 | src/serializable.cpp 48 | src/snarl_decomposition.cpp 49 | src/trivially_serializable.cpp 50 | src/types.cpp 51 | src/copy_graph.cpp 52 | src/append_graph.cpp 53 | src/are_equivalent.cpp 54 | src/find_tips.cpp 55 | src/topological_sort.cpp 56 | src/apply_orientations.cpp 57 | src/is_single_stranded.cpp 58 | src/count_walks.cpp 59 | src/eades_algorithm.cpp 60 | src/dagify.cpp 61 | src/strongly_connected_components.cpp 62 | src/find_shortest_paths.cpp 63 | src/dijkstra.cpp 64 | src/is_acyclic.cpp 65 | src/reverse_complement.cpp 66 | src/split_strands.cpp 67 | src/chop.cpp 68 | src/weakly_connected_components.cpp 69 | src/extend.cpp 70 | src/include/handlegraph/handle_graph.hpp 71 | src/include/handlegraph/mutable_handle_graph.hpp 72 | src/include/handlegraph/deletable_handle_graph.hpp 73 | src/include/handlegraph/path_handle_graph.hpp 74 | src/include/handlegraph/path_position_handle_graph.hpp 75 | src/include/handlegraph/mutable_path_handle_graph.hpp 76 | src/include/handlegraph/mutable_path_mutable_handle_graph.hpp 77 | src/include/handlegraph/mutable_path_deletable_handle_graph.hpp 78 | src/include/handlegraph/expanding_overlay_graph.hpp 79 | src/include/handlegraph/util.hpp 80 | src/include/handlegraph/types.hpp 81 | src/include/handlegraph/iteratee.hpp 82 | src/include/handlegraph/algorithms/copy_graph.hpp 83 | src/include/handlegraph/algorithms/append_graph.hpp 84 | src/include/handlegraph/algorithms/are_equivalent.hpp 85 | src/include/handlegraph/algorithms/find_tips.hpp 86 | src/include/handlegraph/algorithms/topological_sort.hpp 
87 | src/include/handlegraph/algorithms/apply_orientations.hpp 88 | src/include/handlegraph/algorithms/is_single_stranded.hpp 89 | src/include/handlegraph/algorithms/count_walks.hpp 90 | src/include/handlegraph/algorithms/eades_algorithm.hpp 91 | src/include/handlegraph/algorithms/dagify.hpp 92 | src/include/handlegraph/algorithms/strongly_connected_components.hpp 93 | src/include/handlegraph/algorithms/find_shortest_paths.hpp 94 | src/include/handlegraph/algorithms/dijkstra.hpp 95 | src/include/handlegraph/algorithms/reverse_complement.hpp 96 | src/include/handlegraph/algorithms/is_acyclic.hpp 97 | src/include/handlegraph/algorithms/split_strands.hpp 98 | src/include/handlegraph/algorithms/chop.hpp 99 | src/include/handlegraph/algorithms/weakly_connected_components.hpp 100 | src/include/handlegraph/algorithms/extend.hpp 101 | ) 102 | 103 | # Use the include directory when building the objects. 104 | # It can't be picked up via dependency by the other libraries even if it's public. 105 | target_include_directories(handlegraph_objs PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/include") 106 | 107 | # Build objects position-independent to allow a shared library 108 | set_target_properties(handlegraph_objs PROPERTIES POSITION_INDEPENDENT_CODE TRUE) 109 | 110 | # Make static and shared versions with the same base name. 111 | # Make sure to give them interface include directories that depending targets can use. 
112 | #add_library(handlegraph_shared SHARED $) 113 | #set_target_properties(handlegraph_shared PROPERTIES OUTPUT_NAME handlegraph) 114 | #target_include_directories(handlegraph_shared INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/src/include") 115 | add_library(handlegraph_static STATIC $) 116 | set_target_properties(handlegraph_static PROPERTIES OUTPUT_NAME handlegraph) 117 | target_include_directories(handlegraph_static INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/src/include") 118 | 119 | # Set up for installability 120 | #install(TARGETS handlegraph_shared handlegraph_static 121 | install(TARGETS handlegraph_static 122 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 123 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 124 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) 125 | install(DIRECTORY src/include/handlegraph 126 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} 127 | FILES_MATCHING PATTERN "*.hpp" 128 | ) 129 | -------------------------------------------------------------------------------- /build-tools/makeBinRelease: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Generate release hal2vg binary 3 | # Must have a static buildable hdf5 (ie not the one from apt) 4 | # Must be run after tree is tagged and pushed to master. 5 | # Use --keep to keep working directory for debugging. 
6 | 7 | mydir=$(dirname $(which $0)) 8 | source ${mydir}/releaseLib.sh 9 | 10 | keep=no 11 | if [ $1 = '--keep' ] ; then 12 | keep=yes 13 | fi 14 | set -beEu -o pipefail 15 | 16 | buildDir=$(realpath -m build) 17 | binBuildDir="${buildDir}/bin-tmp" 18 | 19 | set -x 20 | rm -rf ${binBuildDir} 21 | mkdir -p ${binBuildDir} 22 | cd ${binBuildDir} 23 | git clone https://github.com/ComparativeGenomicsToolkit/hal2vg.git 24 | cd hal2vg 25 | git fetch --tags origin 26 | 27 | REL_TAG=$(getLatestReleaseTag) 28 | git checkout "${REL_TAG}" 29 | git submodule update --init --recursive 30 | 31 | # todo: update / figure out / remove hack: 32 | cp ./build-tools/CMakeLists.txt ./deps/libbdsg-easy/deps/libhandlegraph/CMakeLists.txt 33 | 34 | if [ $(man gcc | grep nehalem | wc -l) -ge 1 ] 35 | then 36 | # attempt to increase portability by using older architecture 37 | # this make/sed/make thing is a hack to get around a linking error that just cropped up 38 | CFLAGS="-march=nehalem" CXXFLAGS="-march=nehalem" make static || true 39 | sed -i deps/libbdsg-easy/deps/libbdsg/Makefile -e "s/-lomp//g" 40 | CFLAGS="-march=nehalem" CXXFLAGS="-march=nehalem" make check-static 41 | else 42 | make static || true 43 | sed -i deps/libbdsg-easy/deps/libbdsg/Makefile -e "s/-lomp//g" 44 | make check-static 45 | fi 46 | 47 | cp hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions count-vg-hap-cov ${buildDir}/ 48 | 49 | -------------------------------------------------------------------------------- /build-tools/makeSrcRelease: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Generate release tar file of source 3 | # Must be run after tree is tagged and pushed to master. 4 | # Use --keep to keep working directory for debugging. 
mydir=$(dirname $(which $0))
source ${mydir}/releaseLib.sh

keep=no
# ${1:-} guards the test against an unset $1 (no-argument invocation used to
# trigger a "unary operator expected" error before set -e was active).
if [ "${1:-}" = '--keep' ] ; then
    keep=yes
fi
set -beEu -o pipefail

buildDir=$(realpath -m build)
srcBuildDir="${buildDir}/src-tmp"

set -x
rm -rf ${srcBuildDir}
mkdir -p ${srcBuildDir}
cd ${srcBuildDir}
git clone --recursive https://github.com/ComparativeGenomicsToolkit/hal2vg.git
cd hal2vg
git fetch --tags origin

REL_TAG=$(getLatestReleaseTag)
git checkout "${REL_TAG}"
git submodule update --init --recursive
# strip the submodule .git dirs so the tarball is a plain source tree
find deps -name ".git" -exec rm -Rf "{}" \;
cd ..
mv hal2vg hal2vg-${REL_TAG}
tar -czf ${buildDir}/hal2vg-${REL_TAG}.tar.gz hal2vg-${REL_TAG}
if [ "$keep" = "no" ] ; then
    rm -Rf ${srcBuildDir}
fi
--------------------------------------------------------------------------------
/build-tools/quayTagRelease:
--------------------------------------------------------------------------------
#!/bin/bash
# tag the docker image at quay.io corresponding to the release

set -x
set -beEu -o pipefail
mydir=$(dirname $(which $0))
source ${mydir}/releaseLib.sh

REL_TAG=$(getLatestReleaseTag)
REL_COMMIT=$(git rev-list -n 1 ${REL_TAG})

docker image tag ${dockname}:${REL_COMMIT} ${dockname}:${REL_TAG}
--------------------------------------------------------------------------------
/build-tools/releaseLib.sh:
--------------------------------------------------------------------------------
# definitions and functions for release bash programs

PYTHON=python3.6
PIP="${PYTHON} -m pip"

dockstore="quay.io/comparative-genomics-toolkit"
dockname=${dockstore}/hal2vg


# get the tag for the latest release, in the form v1.2.3, from git
getLatestReleaseTag() {
    git describe --tags $(git rev-list --tags --max-count=10) | egrep -e '^v[0-9]+\.[0-9]+\.[0-9]+$' | head -1
}
15 | -------------------------------------------------------------------------------- /count-vg-hap-cov.cpp: -------------------------------------------------------------------------------- 1 | // Count the number of bases that aren't in a given reference sample. 2 | // Print the table of results stratisfied by number of covering samples 3 | // Assume's current cactus convertion of Sample.Haplotype.Contig 4 | 5 | //#define debug 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "bdsg/packed_graph.hpp" 18 | #include "bdsg/hash_graph.hpp" 19 | 20 | using namespace std; 21 | using namespace handlegraph; 22 | using namespace bdsg; 23 | 24 | static unique_ptr load_graph(istream& graph_stream) { 25 | 26 | char magic_bytes[4]; 27 | graph_stream.read(magic_bytes, 4); 28 | uint32_t magic_number = ntohl(*((uint32_t*) magic_bytes)); 29 | graph_stream.clear(); 30 | graph_stream.seekg(0, ios::beg); 31 | 32 | PathHandleGraph* graph; 33 | if (magic_number == PackedGraph().get_magic_number()) { 34 | graph = new PackedGraph(); 35 | } else if (magic_number == HashGraph().get_magic_number()) { 36 | graph = new HashGraph(); 37 | } else { 38 | cerr << "Unable to parse input graph with magic number " << magic_number << endl; 39 | exit(1); 40 | } 41 | dynamic_cast(graph)->deserialize(graph_stream); 42 | 43 | return unique_ptr(graph); 44 | } 45 | 46 | void help(char** argv) { 47 | cerr << "usage: " << argv[0] << " [options] [graph] [graph] [...]" << endl 48 | << "Count nodes and bp in graph covered by different sample counts\n" 49 | << "Assumes SAMPLE.HAPLOTYPE.CONTIG path name format" << endl 50 | << endl 51 | << "options: " << endl 52 | << " -r, --reference Include counts of nodes that are not present in the given reference sample prefix" << endl 53 | << " -i, --ignore Completely ignore all paths with given prefix [default: _MINIGRAPH_]" << endl 54 | << " -t, --threads Number of threads 
[default: all]" << endl 55 | << " -s, --separator Use this separator for tokenizing path name. Haplotype key will be first 2 tokens (or all tokens if fewer than 2) [default=.]" << endl 56 | << " -p, --progress Print progress" << endl 57 | << endl; 58 | } 59 | 60 | // returns SAMPLE.HAPLOTYPE 61 | // todo: vg/bdsg in progress of adpoting conventions / api 62 | // to manage stuff like this -- should switch to using that 63 | const string& get_sample_name(const PathHandleGraph* graph, path_handle_t path_handle, 64 | unordered_map& name_map, 65 | char separator) { 66 | if (!name_map.count(path_handle)) { 67 | string path_name = graph->get_path_name(path_handle); 68 | string sample; 69 | int dots = 0; 70 | for (int64_t i = 0; i < path_name.length(); ++i) { 71 | if (path_name[i] == separator) { 72 | ++dots; 73 | } 74 | if (dots == 2) { 75 | break; 76 | } 77 | sample.push_back(path_name[i]); 78 | } 79 | name_map[path_handle] = sample; 80 | } 81 | return name_map.at(path_handle); 82 | } 83 | 84 | int main(int argc, char** argv) { 85 | 86 | string ref_sample; 87 | string ignore_sample = "_MINIGRAPH_"; 88 | char separator = '.'; 89 | bool progress = false; 90 | 91 | int c; 92 | optind = 1; 93 | while (true) { 94 | 95 | static const struct option long_options[] = { 96 | {"help", no_argument, 0, 'h'}, 97 | {"ref-sample", required_argument, 0, 'r'}, 98 | {"ignore", required_argument, 0, 'i'}, 99 | {"separator", required_argument, 0, 's'}, 100 | {"threads", required_argument, 0, 't'}, 101 | {"progress", no_argument, 0, 'p'}, 102 | {0, 0, 0, 0} 103 | }; 104 | 105 | int option_index = 0; 106 | 107 | c = getopt_long (argc, argv, "hr:s:i:t:p", 108 | long_options, &option_index); 109 | 110 | // Detect the end of the options. 
111 | if (c == -1) 112 | break; 113 | 114 | switch (c) 115 | { 116 | case 'r': 117 | ref_sample = optarg; 118 | break; 119 | case 'i': 120 | ignore_sample = optarg; 121 | break; 122 | case 's': 123 | assert(strlen(optarg) == 1); 124 | separator = optarg[0]; 125 | break; 126 | case 't': 127 | { 128 | int num_threads = stoi(optarg); 129 | if (num_threads <= 0) { 130 | cerr << "error:[count-vg-hap-depth] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; 131 | exit(1); 132 | } 133 | omp_set_num_threads(num_threads); 134 | break; 135 | } 136 | case 'p': 137 | progress = true; 138 | break; 139 | case 'h': 140 | case '?': 141 | /* getopt_long already printed an error message. */ 142 | help(argv); 143 | exit(1); 144 | break; 145 | default: 146 | abort (); 147 | } 148 | } 149 | 150 | if (argc <= 1) { 151 | help(argv); 152 | return 1; 153 | } 154 | 155 | // Parse the positional argument 156 | if (optind >= argc) { 157 | cerr << "[count-vg-hap-depth] error: too few arguments" << endl; 158 | help(argv); 159 | return 1; 160 | } 161 | 162 | // depth stats (one per thread) 163 | vector> depth_base_counts(get_thread_count()); 164 | vector> depth_nfree_base_counts(get_thread_count()); 165 | vector> depth_node_counts(get_thread_count()); 166 | vector> depth_base_counts_nonref(get_thread_count()); 167 | vector> depth_nfree_base_counts_nonref(get_thread_count()); 168 | vector> depth_node_counts_nonref(get_thread_count()); 169 | 170 | // do counts for each graph arg 171 | while(optind < argc) { 172 | 173 | string graph_path = argv[optind++]; 174 | ifstream graph_stream(graph_path); 175 | if (!graph_stream) { 176 | cerr << "[count-vg-hap-depth] error: Unable to open input graph " << graph_path << endl; 177 | return 1; 178 | } 179 | unique_ptr graph = load_graph(graph_stream); 180 | graph_stream.close(); 181 | if (progress) { 182 | cerr << "[count-vg-hap-depth]: Loaded graph" << endl; 183 | } 184 | 185 | // path handle to sample key (one per 
thread) 186 | vector> name_maps(get_thread_count()); 187 | 188 | if (progress) { 189 | cerr << "[count-vg-hap-depth]: Calculating coverage with " << depth_base_counts.size() << " threads" << endl; 190 | } 191 | 192 | graph->for_each_handle([&](handle_t handle) { 193 | int64_t t = omp_get_thread_num(); 194 | // collect all the samples that step on the node 195 | set sample_set; 196 | bool ref = false; 197 | graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle) { 198 | const string& sample_name = get_sample_name(graph.get(), graph->get_path_handle_of_step(step_handle), name_maps[t], separator); 199 | if (ignore_sample.empty() || sample_name.compare(0, ignore_sample.length(), ignore_sample) != 0) { 200 | if (!ref && sample_name.compare(0, ref_sample.length(), ref_sample) == 0) { 201 | ref = true; 202 | } 203 | sample_set.insert(sample_name); 204 | } 205 | }); 206 | // update the total coverage 207 | int64_t coverage = sample_set.size(); 208 | if (depth_base_counts[t].size() <= coverage) { 209 | depth_base_counts[t].resize(coverage + 1, 0); 210 | depth_node_counts[t].resize(coverage + 1, 0); 211 | depth_nfree_base_counts[t].resize(coverage + 1, 0); 212 | } 213 | int64_t node_len = graph->get_length(handle); 214 | int64_t num_ns = 0; 215 | string node_seq = graph->get_sequence(handle); 216 | for (auto c : node_seq) { 217 | if (c == 'N' || c == 'n') { 218 | ++num_ns; 219 | } 220 | } 221 | depth_base_counts[t][coverage] += node_len; 222 | depth_nfree_base_counts[t][coverage] += node_len - num_ns; 223 | depth_node_counts[t][coverage] += 1; 224 | 225 | if (!ref && !ref_sample.empty()) { 226 | // update the nonref coverage 227 | int64_t coverage = sample_set.size(); 228 | if (depth_base_counts_nonref[t].size() <= coverage) { 229 | depth_base_counts_nonref[t].resize(coverage + 1, 0); 230 | depth_node_counts_nonref[t].resize(coverage + 1, 0); 231 | depth_nfree_base_counts_nonref[t].resize(coverage + 1, 0); 232 | } 233 | depth_base_counts_nonref[t][coverage] += 
node_len; 234 | depth_nfree_base_counts_nonref[t][coverage] += node_len - num_ns; 235 | depth_node_counts_nonref[t][coverage] += 1; 236 | } 237 | }, 238 | true); 239 | } 240 | 241 | // make sure all tables have same size 242 | size_t max_size = 0; 243 | for (int64_t t = 0; t < get_thread_count(); ++t) { 244 | max_size = std::max(max_size, depth_base_counts[t].size()); 245 | max_size = std::max(max_size, depth_base_counts_nonref[t].size()); 246 | } 247 | for (int64_t t = 0; t < get_thread_count(); ++t) { 248 | if (depth_base_counts[t].size() < max_size) { 249 | depth_base_counts[t].resize(max_size, 0); 250 | depth_nfree_base_counts[t].resize(max_size, 0); 251 | depth_node_counts[t].resize(max_size, 0); 252 | } 253 | if (depth_base_counts_nonref[t].size() < max_size) { 254 | depth_base_counts_nonref[t].resize(max_size, 0); 255 | depth_nfree_base_counts_nonref[t].resize(max_size, 0); 256 | depth_node_counts_nonref[t].resize(max_size, 0); 257 | } 258 | assert(depth_base_counts[t].size() == max_size); 259 | assert(depth_nfree_base_counts[t].size() == max_size); 260 | assert(depth_node_counts[t].size() == max_size); 261 | assert(depth_base_counts_nonref[t].size() == max_size); 262 | assert(depth_nfree_base_counts_nonref[t].size() == max_size); 263 | assert(depth_node_counts_nonref[t].size() == max_size); 264 | } 265 | 266 | if (progress) { 267 | cerr << "[count-vg-hap-depth]: Merging data from different threads" << endl; 268 | } 269 | 270 | // merge up the threads 271 | for (int64_t t = 1; t < get_thread_count(); ++t) { 272 | for (int64_t coverage = 0; coverage < depth_base_counts[t].size(); ++coverage) { 273 | assert(depth_base_counts[0].size() > coverage); 274 | depth_base_counts[0][coverage] += depth_base_counts[t][coverage]; 275 | depth_nfree_base_counts[0][coverage] += depth_nfree_base_counts[t][coverage]; 276 | depth_node_counts[0][coverage] += depth_node_counts[t][coverage]; 277 | 278 | if (!ref_sample.empty()) { 279 | assert(depth_base_counts_nonref[0].size() > 
coverage); 280 | depth_base_counts_nonref[0][coverage] += depth_base_counts_nonref[t][coverage]; 281 | depth_nfree_base_counts_nonref[0][coverage] += depth_nfree_base_counts_nonref[t][coverage]; 282 | depth_node_counts_nonref[0][coverage] += depth_node_counts_nonref[t][coverage]; 283 | } 284 | } 285 | } 286 | 287 | // there's almost certainly an stl one-line for this.. oh well 288 | function(const vector&)> get_cumul = [](const vector& v) { 289 | int64_t tot = 0; 290 | vector cumul(v.size(), 0); 291 | for (int64_t i = 0; i < v.size(); ++i) { 292 | tot += v[i]; 293 | cumul[i] = tot; 294 | } 295 | return cumul; 296 | }; 297 | function(const vector&)> get_lumuc = [](const vector& v) { 298 | int64_t tot = 0; 299 | vector cumul(v.size(), 0); 300 | for (int64_t i = v.size() - 1; i >= 0; --i) { 301 | tot += v[i]; 302 | cumul[i] = tot; 303 | } 304 | return cumul; 305 | }; 306 | 307 | // keep cumulative counts while we're at it 308 | // cumulate from 0 309 | vector node_counts_cumul = get_cumul(depth_node_counts[0]); 310 | vector base_counts_cumul = get_cumul(depth_base_counts[0]); 311 | vector nfree_base_counts_cumul = get_cumul(depth_nfree_base_counts[0]); 312 | vector node_counts_nonref_cumul = get_cumul(depth_node_counts_nonref[0]); 313 | vector base_counts_nonref_cumul = get_cumul(depth_base_counts_nonref[0]); 314 | vector nfree_base_counts_nonref_cumul = get_cumul(depth_nfree_base_counts_nonref[0]); 315 | 316 | //cumulate from end 317 | vector node_counts_lumuc = get_lumuc(depth_node_counts[0]); 318 | vector base_counts_lumuc = get_lumuc(depth_base_counts[0]); 319 | vector nfree_base_counts_lumuc = get_lumuc(depth_nfree_base_counts[0]); 320 | vector node_counts_nonref_lumuc = get_lumuc(depth_node_counts_nonref[0]); 321 | vector base_counts_nonref_lumuc = get_lumuc(depth_base_counts_nonref[0]); 322 | vector nfree_base_counts_nonref_lumuc = get_lumuc(depth_nfree_base_counts_nonref[0]); 323 | 324 | // print the results 325 | cout << "hap-depth" 326 | << "\t" << "nodes" 
<< "\t" << "bases" << "\t" << "non-n-bases" 327 | << "\t" << "nodes-cumul" << "\t" <<"bases-cumul" << "\t" << "non-n-bases-cumul" 328 | << "\t" << "nodes-cumul-rev" << "\t" << "bases-cumul-rev" << "\t" << "non-n-bases-cumul-rev"; 329 | if (!ref_sample.empty()) { 330 | cout << "\t" << "nodes-nonref" << "\t" << "bases-nonref" << "\t" << "non-n-bases-nonref" 331 | << "\t" << "nodes-cumul-nonref" << "\t" << "bases-cumul-nonref" << "\t" << "non-n-bases-cumul-nonref" 332 | << "\t" << "nodes-cumul-rev-nonref" << "\t" << "bases-cumul-rev-nonref" << "\t" << "non-n-bases-cumul-rev-nonref"; 333 | } 334 | cout << endl; 335 | 336 | for (int64_t coverage = 0; coverage < depth_base_counts[0].size(); ++coverage) { 337 | cout << coverage 338 | << "\t" << depth_node_counts[0][coverage] << "\t" << depth_base_counts[0][coverage] << "\t" << depth_nfree_base_counts[0][coverage] 339 | << "\t" << node_counts_cumul[coverage] << "\t" << base_counts_cumul[coverage] << "\t" << nfree_base_counts_cumul[coverage] 340 | << "\t" << node_counts_lumuc[coverage] << "\t" << base_counts_lumuc[coverage] << "\t" << nfree_base_counts_lumuc[coverage]; 341 | if (!ref_sample.empty()) { 342 | cout << "\t" << depth_node_counts_nonref[0][coverage] << "\t" << depth_base_counts_nonref[0][coverage] << "\t" << depth_nfree_base_counts_nonref[0][coverage] 343 | << "\t" << node_counts_nonref_cumul[coverage] << "\t" << base_counts_nonref_cumul[coverage] << "\t" << nfree_base_counts_nonref_cumul[coverage] 344 | << "\t" << node_counts_nonref_lumuc[coverage] << "\t" << base_counts_nonref_lumuc[coverage] << "\t" << nfree_base_counts_nonref_lumuc[coverage]; 345 | } 346 | cout << "\n"; 347 | } 348 | 349 | return 0; 350 | } 351 | -------------------------------------------------------------------------------- /filter-paf-deletions.cpp: -------------------------------------------------------------------------------- 1 | // Filter big deletions from PAF. These can sometimes arise from minigraph split alignments. 
They are rare, but can really 2 | // mess up topology of graph if even one gets into cactus. 3 | 4 | // 1) Estimate anchors along reference path for every node in input graph 5 | // 2) Scan every query in PAF in order, and look at target blocks 6 | // 3) Use table from 1) in order to estimate the distances between target blocks 7 | // 4) If two conesecutive target blocks span too big of a distance, delete the smaller block 8 | 9 | //#define debug 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "bdsg/packed_graph.hpp" 22 | #include "bdsg/hash_graph.hpp" 23 | 24 | #include "IntervalTree.h" 25 | #include "paf.hpp" 26 | 27 | using namespace std; 28 | using namespace handlegraph; 29 | using namespace bdsg; 30 | 31 | struct Anchor { 32 | path_handle_t path_handle; 33 | int64_t max_offset; 34 | int64_t min_offset; 35 | }; 36 | 37 | struct PafDelta { 38 | int64_t delta; 39 | int64_t ref_delta; 40 | int64_t query_delta; 41 | int64_t ref_overlap_size; 42 | int64_t prev_ref_start; 43 | int64_t prev_ref_end; 44 | int64_t cur_ref_start; 45 | int64_t cur_ref_end; 46 | int64_t query_len; 47 | }; 48 | 49 | static unique_ptr load_graph(istream& graph_stream); 50 | static pair, unordered_map> load_trans(const string& trans_path); 51 | static unordered_map index_graph(const PathHandleGraph* graph, 52 | const string& ref_prefix); 53 | static unordered_map> index_deletions(const PathHandleGraph* graph, const unordered_map& index); 54 | static pair, unordered_map> load_paf(ifstream& paf_file); 55 | static int64_t for_each_query_block(const vector& paf_lines, const vector& masking, 56 | function visit_block); 57 | static int64_t check_delta(int64_t max_deletion_threshold, int64_t max_insertion_threshold, const PafDelta& paf_delta, double overlap_threshold, 58 | double deletion_size_threshold); 59 | static PafDelta get_delta(path_handle_t ref_path, const PafLine& prev_paf, const PafLine& 
cur_paf, 60 | const unordered_map& mg_to_vg, const unordered_map& ref_index, 61 | const unordered_map>& ref_deletions); 62 | 63 | void help(char** argv) { 64 | cerr << "usage: " << argv[0] << " [options] -d \n" << endl 65 | << "Use distances from graph to filter out implied deletions from PAF (cigars not considered, only blocks)" << endl 66 | << " : minigraph as obtained from vg convert -g graph.gfa" << endl 67 | << " : node translation from vg convert -g -T" << endl 68 | << " : paf alignment from cactus-graphmap" << endl 69 | << endl 70 | << "options: " << endl 71 | << " -d --del-threshold F Only remove deletions greater than this. if < 1, then interpreted as fraction of reference path size" << endl 72 | << " -i, --ins-threshold F Like , but applied to insertions instead of deletions [-1]" << endl 73 | << " -m, --max-filter F If F* matches need to be pulled apart to resolve a single deletion, just leave it alone [1]" << endl 74 | << " -s, --del-size-threshold F Remove any deletion if the source contig size is < F* [-1: disabled]" << endl 75 | << " -r, --ref-prefix STR Only consider paths whose names start with STR" << endl 76 | << " -p, --progress Print progress" << endl 77 | << " -o, --filter-off-ref Filter mappings that aren't in dominant ref" << endl 78 | << " -v, --verbose Print deletions" << endl 79 | << " -t, --threads N Number of threads to use (used only for indexing graph) [default: all available]" << endl 80 | << endl; 81 | } 82 | 83 | int main(int argc, char** argv) { 84 | 85 | string ref_prefix; 86 | bool progress = false; 87 | bool verbose = false; 88 | bool keep_off_ref = true; 89 | // only filter deletions that don't overlap an existing deletion by at least this much 90 | // (doesn't seem to a factor -- most big deletions not in minigraph) 91 | double overlap_threshold = 0.5; 92 | double filter_threshold = 1.0; 93 | double max_insertion = -1.0; 94 | double max_deletion = -1.0; 95 | double deletion_size_threshold = -1.0; 96 | int c; 97 | optind = 1; 
98 | while (true) { 99 | 100 | static const struct option long_options[] = { 101 | {"del-threshold", required_argument, 0, 'd'}, 102 | {"ins-threshold", required_argument, 0, 'i'}, 103 | {"max-filter", required_argument, 0, 'm'}, 104 | {"del-size-threshold", required_argument, 0, 's'}, 105 | {"ref-prefix", required_argument, 0, 'r'}, 106 | {"filter-off-ref", no_argument, 0, 'o'}, 107 | {"help", no_argument, 0, 'h'}, 108 | {"progress", no_argument, 0, 'p'}, 109 | {"verbose", no_argument, 0, 'v'}, 110 | {"threads", required_argument, 0, 't'}, 111 | {0, 0, 0, 0} 112 | }; 113 | 114 | int option_index = 0; 115 | 116 | c = getopt_long (argc, argv, "d:i:m:s:r:khpvt:", 117 | long_options, &option_index); 118 | 119 | // Detect the end of the options. 120 | if (c == -1) 121 | break; 122 | 123 | switch (c) 124 | { 125 | case 'd': 126 | max_deletion = stof(optarg); 127 | break; 128 | case 'i': 129 | max_insertion = stof(optarg); 130 | break; 131 | case 'm': 132 | filter_threshold = stof(optarg); 133 | break; 134 | case 's': 135 | deletion_size_threshold = stof(optarg); 136 | break; 137 | case 'r': 138 | ref_prefix = optarg; 139 | break; 140 | case 'o': 141 | keep_off_ref = false; 142 | break; 143 | case 'v': 144 | verbose = true; 145 | break; 146 | case 'p': 147 | progress = true; 148 | break; 149 | case 't': 150 | { 151 | int num_threads = stoi(optarg); 152 | if (num_threads <= 0) { 153 | cerr << "[filter-paf-deletions] error: Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; 154 | exit(1); 155 | } 156 | omp_set_num_threads(num_threads); 157 | break; 158 | } 159 | case 'h': 160 | case '?': 161 | /* getopt_long already printed an error message. 
*/ 162 | help(argv); 163 | exit(1); 164 | break; 165 | default: 166 | abort (); 167 | } 168 | } 169 | 170 | if (argc <= 3) { 171 | cerr << "[filter-paf-deletions] error: too few arguments\n" << endl; 172 | help(argv); 173 | return 1; 174 | } 175 | 176 | // Parse the positional argument 177 | if (optind >= argc) { 178 | cerr << "[filter-paf-deletions] error: too few arguments\n" << endl; 179 | help(argv); 180 | return 1; 181 | } 182 | 183 | if (optind != argc - 3) { 184 | cerr << "[filter-paf-deletions] error: too many arguments\n" << endl; 185 | help(argv); 186 | return 1; 187 | } 188 | 189 | if (max_deletion <= 0 && max_insertion <= 0) { 190 | cerr << "[filter-paf-deletions] error: at least one of -d or -i must be set to positive value" << endl; 191 | return 1; 192 | } 193 | 194 | string graph_path = argv[optind++]; 195 | string trans_path = argv[optind++]; 196 | string paf_path = argv[optind++]; 197 | 198 | // load the graph 199 | ifstream graph_stream(graph_path); 200 | if (!graph_stream) { 201 | cerr << "[filter-paf-deletions] error: Unable to open input graph " << graph_path << endl; 202 | return 1; 203 | } 204 | unique_ptr graph = load_graph(graph_stream); 205 | graph_stream.close(); 206 | if (progress) { 207 | cerr << "[filter-paf-deletions]: Loaded graph" << endl; 208 | } 209 | 210 | // load the minigraph <-> vg id translation table (because our PAF is expressed in terms of the minigraph 211 | // ids but we lose them when converting to vg.) 212 | unordered_map mg_to_vg; 213 | unordered_map vg_to_mg; 214 | std::tie(mg_to_vg, vg_to_mg) = load_trans(trans_path); 215 | 216 | if (progress) { 217 | cerr << "[filter-paf-deletions]: Loaded " << mg_to_vg.size() << " translations." 
<< endl; 218 | } 219 | 220 | // open the paf 221 | ifstream paf_file(paf_path); 222 | if (!paf_file) { 223 | cerr << "[filter-paf-deletions] error: Unable to open PAF" << endl; 224 | return 1; 225 | } 226 | 227 | // load the paf into memory 228 | vector paf_lines; 229 | unordered_map orig_to_sorted; 230 | std::tie(paf_lines, orig_to_sorted) = load_paf(paf_file); 231 | if (progress) { 232 | cerr << "[filter-paf-deletions]: Loaded " << paf_lines.size() << " paf lines" << endl; 233 | } 234 | 235 | // index the minigraph 236 | // this maps each node in the graph to a (maximal) reference interval 237 | unordered_map ref_index = index_graph(graph.get(), ref_prefix); 238 | if (progress) { 239 | cerr << "[filter-paf-deletions]: Created reference path index" << endl; 240 | } 241 | 242 | unordered_map ref_path_to_length; 243 | if ((max_deletion > 0 && max_deletion <= 1.) || (max_insertion > 0. && max_insertion <= 1.)) { 244 | graph->for_each_path_handle([&](path_handle_t path_handle) { 245 | int64_t len = 0; 246 | graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { 247 | len += graph->get_length(graph->get_handle_of_step(step_handle)); 248 | }); 249 | ref_path_to_length[path_handle] = len; 250 | }); 251 | if (progress) { 252 | cerr << "[filter-paf-deletions]: Computed lengths for " << ref_path_to_length.size() << " reference paths" << endl; 253 | } 254 | } 255 | 256 | unordered_map> ref_deletions = index_deletions(graph.get(), ref_index); 257 | if (progress) { 258 | cerr << "[filter-paf-deletions]: Created reference deletion index" << endl; 259 | } 260 | 261 | #ifdef debug 262 | for (auto fam : ref_index) { 263 | cerr << fam.first << " -> " << graph->get_path_name(fam.second.path_handle) << " " << fam.second.min_offset << " " << fam.second.max_offset << endl; 264 | } 265 | #endif 266 | 267 | // we have everything needed to filter the paf 268 | vector filtered_lines(paf_lines.size(), false); 269 | int64_t filtered_line_total = 0; 270 | int64_t 
filtered_line_it = 0; 271 | int64_t filtered_match_total = 0; 272 | vector off_ref_filtered_lines(paf_lines.size(), false); 273 | int64_t off_ref_filtered_line_it = 0; 274 | int64_t off_ref_filtered_match_total = 0; 275 | int64_t iteration = 0; 276 | 277 | do { 278 | filtered_line_it = 0; 279 | for_each_query_block(paf_lines, filtered_lines, [&](int64_t block_start, int64_t block_end) { 280 | assert(!filtered_lines[block_start] && !filtered_lines[block_end]); 281 | // get some stats about the block 282 | unordered_map ref_path_sizes; 283 | int64_t total_matches = 0; 284 | for (int64_t i = block_start; i <= block_end; ++i) { 285 | if (!filtered_lines[i]) { 286 | const PafLine& paf = paf_lines[i]; 287 | nid_t target_id = mg_to_vg.at(paf.target_name); 288 | const Anchor& anchor = ref_index.at(target_id); 289 | ref_path_sizes[anchor.path_handle] += paf.num_matching; 290 | total_matches += paf.num_matching; 291 | } else { 292 | assert(iteration > 0); 293 | } 294 | } 295 | if (total_matches == 0) { 296 | // whole block was filtered, nothing to be done 297 | return; 298 | } 299 | // find the number one reference path by match coverage 300 | // todo: what about tie? 301 | path_handle_t ref_path; 302 | int64_t ref_path_size = -1; 303 | for (const auto& rps : ref_path_sizes) { 304 | if (rps.second > ref_path_size) { 305 | ref_path_size = rps.second; 306 | ref_path = rps.first; 307 | } 308 | } 309 | 310 | // support fracitonal thresholds which apply to path length 311 | int64_t max_deletion_threshold = max_deletion; 312 | int64_t max_insertion_threshold = max_insertion; 313 | if (max_deletion > 0. && max_deletion <= 1.) { 314 | max_deletion_threshold = max_deletion * ref_path_to_length.at(ref_path); 315 | } 316 | if (max_insertion > 0. && max_insertion <= 1.) 
{ 317 | max_insertion_threshold = max_insertion * ref_path_to_length.at(ref_path); 318 | } 319 | 320 | // mask out everything off this path 321 | // get rid of all off-reference path mappings right away 322 | // note: these are flagged to be ignored but not actually filtered 323 | // unless keep_off_ref is set to false, then they are removed 324 | int64_t off_ref_total = 0; 325 | int64_t off_ref_match_total = 0; 326 | for (int64_t i = block_start; i <= block_end; ++i) { 327 | if (!filtered_lines[i]) { 328 | nid_t cur_target_id = mg_to_vg.at(paf_lines[i].target_name); 329 | const Anchor& cur_anchor = ref_index.at(cur_target_id); 330 | if (cur_anchor.path_handle != ref_path) { 331 | off_ref_filtered_lines[i] = true; 332 | ++off_ref_total; 333 | off_ref_match_total += paf_lines[i].num_matching; 334 | } 335 | } 336 | } 337 | off_ref_filtered_line_it += off_ref_total; 338 | off_ref_filtered_match_total += off_ref_match_total; 339 | if (!keep_off_ref && verbose && off_ref_total > 0) { 340 | cerr << "[filter-paf-deletions]: filtered " << off_ref_total << " lines with " << off_ref_match_total << " bases " 341 | << " because they did not map to reference sequence " << graph->get_path_name(ref_path) << " all in block " 342 | << "\n I=" << block_start <<": " << paf_lines[block_start] 343 | << "\n J=" << block_end << ": " << paf_lines[block_end] << endl << endl; 344 | } 345 | 346 | // try to find a gap that exceeds the length 347 | int64_t prev_idx = -1; 348 | // these are the boundaries of the deletions in the block 349 | // the deletion is between cut_point[i] and cut_point[i] - 1 350 | vector cut_points; 351 | vector cut_sizes; 352 | for (int64_t i = block_start; i <= block_end; ++i) { 353 | if (filtered_lines[i] || off_ref_filtered_lines[i]) { 354 | continue; 355 | } 356 | if (prev_idx == -1) { 357 | prev_idx = i; 358 | continue; 359 | } 360 | // if we got this far that means we're on the path and we have a prev on the path too 361 | // do a rough delta check 362 | 
assert(prev_idx < i); 363 | const PafLine& cur_paf = paf_lines[i]; 364 | const PafLine& prev_paf = paf_lines[prev_idx]; 365 | PafDelta paf_delta = get_delta(ref_path, prev_paf, cur_paf, mg_to_vg, ref_index, ref_deletions); 366 | 367 | int64_t checked_delta = check_delta(max_deletion_threshold, max_insertion_threshold, paf_delta, overlap_threshold, deletion_size_threshold); 368 | 369 | if (checked_delta != 0) { 370 | if (verbose) { 371 | cerr << "[filter-paf-deletions]: detected " << (checked_delta > 0 ? "deletion" : "insertion") 372 | << " of size " << (int64_t)abs(paf_delta.delta) << " with overlap " << paf_delta.ref_overlap_size 373 | << " on ref path " << graph->get_path_name(ref_path) << " with cur anchor (" 374 | << paf_delta.cur_ref_start << ", " << paf_delta.cur_ref_end << ") and prev anchor (" << paf_delta.prev_ref_start << ", " 375 | << paf_delta.prev_ref_end << ") and threshold " << max_deletion_threshold 376 | << " on following paf line:\n I=" << (prev_idx) <<": " << prev_paf 377 | << "\n J=" << i << ": " << cur_paf << endl << endl; 378 | } 379 | cut_points.push_back(i); 380 | cut_sizes.push_back(paf_delta.delta); 381 | } 382 | 383 | prev_idx = i; 384 | } 385 | 386 | // greedy heuristic: for every deletion cut point, we try scanning forward and backward to find the minimum 387 | // resolving block of lines whose removal solves the deletion 388 | for (int64_t j = 0; j < cut_points.size(); ++j) { 389 | if (filtered_lines[cut_points[j]]) { 390 | continue; 391 | } 392 | // go backward 393 | int64_t backward_matches = 0; 394 | int64_t backward_candidate = -1; // last *unfiltered* line scanning left 395 | int64_t prev_idx = -1; 396 | for (int64_t k = cut_points[j] - 1; k >= block_start && backward_candidate == -1; --k) { 397 | if (!filtered_lines[k] && !off_ref_filtered_lines[k]) { 398 | if (prev_idx == -1) { 399 | prev_idx = k; 400 | } 401 | const PafLine& prev_paf = paf_lines[k]; 402 | const PafLine& cur_paf = paf_lines[cut_points[j]]; 403 | PafDelta 
paf_delta = get_delta(ref_path, prev_paf, cur_paf, mg_to_vg, ref_index, ref_deletions); 404 | int64_t checked_delta = check_delta(max_deletion_threshold, max_insertion_threshold, paf_delta, overlap_threshold, 405 | deletion_size_threshold); 406 | if (checked_delta == 0) { 407 | backward_candidate = k; 408 | } else { 409 | backward_matches += paf_lines[k].num_matching; 410 | } 411 | } 412 | } 413 | if (backward_candidate == -1) { 414 | // need to delete block_start too 415 | backward_candidate = block_start - 1; 416 | } 417 | 418 | // go forward 419 | int64_t forward_matches = 0; 420 | int64_t forward_candidate = -1; // last *unfiltered* line scanning right 421 | for (int64_t k = cut_points[j]; k <= block_end && forward_candidate == -1 && prev_idx != -1; ++k) { 422 | if (!filtered_lines[k] && !off_ref_filtered_lines[k]) { 423 | const PafLine& prev_paf = paf_lines[prev_idx]; 424 | const PafLine& cur_paf = paf_lines[k]; 425 | int64_t max_deletion_threshold = max_deletion; 426 | int64_t max_insertion_threshold = max_insertion; 427 | PafDelta paf_delta = get_delta(ref_path, prev_paf, cur_paf, mg_to_vg, ref_index, ref_deletions); 428 | int64_t checked_delta = check_delta(max_deletion_threshold, max_insertion_threshold, paf_delta, overlap_threshold, 429 | deletion_size_threshold); 430 | if (checked_delta == 0) { 431 | forward_candidate = k; 432 | } else { 433 | forward_matches += paf_lines[k].num_matching; 434 | } 435 | } 436 | } 437 | if (forward_candidate == -1) { 438 | // need to delete block_end too 439 | forward_candidate = block_end + 1; 440 | } 441 | 442 | assert(backward_candidate >= block_start - 1 && forward_candidate <= block_end + 1); 443 | 444 | int64_t min_segment_start = -1; 445 | int64_t min_segment_end = -1; 446 | int64_t min_segment_matches = -1; 447 | if (backward_matches < forward_matches) { 448 | min_segment_start = backward_candidate + 1; // want to filter right of candidate (not include) 449 | min_segment_end = cut_points[j] - 1; 450 | 
min_segment_matches = backward_matches; 451 | } else { 452 | min_segment_start = cut_points[j]; 453 | min_segment_end = forward_candidate - 1; // want to filter left of candidate (not include) 454 | min_segment_matches = forward_matches; 455 | } 456 | 457 | int64_t max_matches_deleted = filter_threshold * cut_sizes[j]; 458 | if (j == 0) { 459 | if (min_segment_end < min_segment_start) { 460 | cerr << "ms me " << min_segment_start << " " << min_segment_end << " bm fm " << backward_matches << " " << forward_matches 461 | << " bc fc " << backward_candidate << " " << forward_candidate << endl; 462 | } 463 | assert(min_segment_start <= min_segment_end); 464 | } 465 | if (min_segment_matches > 0) { 466 | 467 | if (min_segment_matches <= max_matches_deleted) { 468 | int64_t lines_in_segment = 0; 469 | for (int64_t k = min_segment_start; k <= min_segment_end; ++k) { 470 | if (!filtered_lines[k]) { 471 | filtered_lines[k] = true; 472 | ++filtered_line_it; 473 | filtered_match_total += paf_lines[k].num_matching; 474 | ++lines_in_segment; 475 | } 476 | } 477 | 478 | if (verbose) { 479 | cerr << "[filter-paf-deletions]: filtering " << lines_in_segment << " PAF lines between (inclusively)\n I=" 480 | << min_segment_start << ": " << paf_lines[min_segment_start] 481 | << "\n J=" << min_segment_end << ": " << paf_lines[min_segment_end] 482 | << "\nfor a total of " << min_segment_matches << " matches" << endl << endl; 483 | } 484 | } else { 485 | if (verbose) { 486 | cerr << "[filter-paf-deletions]: leaving in PAF lines between (inclusively)\n I=" 487 | << min_segment_start << ": " << paf_lines[min_segment_start] 488 | << "\n J=" << min_segment_end << ": " << paf_lines[min_segment_end] 489 | << "\nfor a total of " << min_segment_matches << " matches, which exceeds deletion threshold of " << max_matches_deleted 490 | << endl << endl; 491 | } 492 | 493 | } 494 | } 495 | } 496 | }); 497 | if (!keep_off_ref) { 498 | filtered_line_it += off_ref_filtered_line_it; 499 | } 500 | 501 | if 
(progress) { 502 | cerr << "[filter-paf-deletions]: Iteration " << iteration << ": Found " << filtered_line_it << " lines to filter" << endl; 503 | } 504 | ++iteration; 505 | filtered_line_total += filtered_line_it; 506 | if (!keep_off_ref) { 507 | filtered_line_total += off_ref_filtered_line_it; 508 | } 509 | } while (filtered_line_it > 0); 510 | 511 | if (!keep_off_ref) { 512 | filtered_match_total += off_ref_filtered_match_total; 513 | } 514 | if (progress) { 515 | cerr << "[filter-paf-deletions]: Filtering out " << filtered_line_total << " paf lines totaling " << filtered_match_total << " matches" << endl; 516 | } 517 | 518 | // output the unfiltered lines 519 | paf_file.clear(); 520 | paf_file.seekg(0, ios::beg) ; 521 | string buffer; 522 | for (int64_t line_no = 0; line_no < filtered_lines.size(); ++line_no) { 523 | int64_t sorted_line_no = orig_to_sorted.at(line_no); 524 | const auto& ret = getline(paf_file, buffer); 525 | assert(ret); 526 | 527 | // sanity check: 528 | PafLine paf_line = parse_paf_line(buffer); 529 | assert(paf_line.query_name == paf_lines[sorted_line_no].query_name); 530 | assert(paf_line.query_start == paf_lines[sorted_line_no].query_start); 531 | assert(paf_line.query_end == paf_lines[sorted_line_no].query_end); 532 | assert(paf_line.target_name == paf_lines[sorted_line_no].target_name); 533 | assert(paf_line.target_start == paf_lines[sorted_line_no].target_start); 534 | assert(paf_line.target_end == paf_lines[sorted_line_no].target_end); 535 | 536 | if (filtered_lines[sorted_line_no] == false && (keep_off_ref || off_ref_filtered_lines[sorted_line_no] == false)) { 537 | cout << buffer << "\n"; 538 | } 539 | } 540 | cout << flush; 541 | 542 | return 0; 543 | } 544 | 545 | static string strip_prefix(const string& name) { 546 | if (name.compare(0, 3, "id=") == 0) { 547 | size_t p = name.find('|', 3); 548 | assert(p != string::npos); 549 | return name.substr(p + 1); 550 | } 551 | return name; 552 | } 553 | 554 | unordered_map 
index_graph(const PathHandleGraph* graph, 555 | const string& ref_prefix) { 556 | 557 | // start by making a path position index 558 | // minigraph assumption: no more than one path per handle! 559 | unordered_map position_index; 560 | graph->for_each_path_handle([&](path_handle_t path_handle) { 561 | if (graph->get_path_name(path_handle).compare(0, ref_prefix.length(), ref_prefix) == 0) { 562 | size_t offset = 0; 563 | graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { 564 | handle_t handle = graph->get_handle_of_step(step_handle); 565 | size_t len = graph->get_length(handle); 566 | assert(len > 0); 567 | assert(!position_index.count(handle)); 568 | position_index[handle] = offset; 569 | position_index[graph->flip(handle)] = offset + len - 1; 570 | offset += len; 571 | }); 572 | } 573 | }); 574 | 575 | if (position_index.empty()) { 576 | cerr << "[filter-paf-deletions] error: no reference path found" << endl; 577 | exit(0); 578 | } 579 | 580 | vector> thread_results(get_thread_count()); 581 | 582 | // really slow brute-force relies on minigraph not having too many nodes 583 | graph->for_each_handle([&](handle_t handle) { 584 | unordered_map& ref_index = thread_results[omp_get_thread_num()]; 585 | 586 | // find all reference nodes that are connected via BFS 587 | unordered_set context; 588 | unordered_set ref_handles; 589 | vector cur_handles = {handle}; 590 | while (!cur_handles.empty()) { 591 | vector next_handles; 592 | for (auto& h : cur_handles) { 593 | if (!context.count(h)) { 594 | context.insert(h); 595 | if (position_index.count(h)) { 596 | // dead-end on reference 597 | ref_handles.insert(h); 598 | } else { 599 | graph->follow_edges(h, false, [&](handle_t n) { 600 | next_handles.push_back(n); 601 | }); 602 | graph->follow_edges(h, true, [&](handle_t p) { 603 | next_handles.push_back(p); 604 | }); 605 | } 606 | } 607 | } 608 | cur_handles = std::move(next_handles); 609 | } 610 | 611 | // update the index with reference offsets 612 | 
unordered_set ref_path_set; 613 | int64_t min_ref_offset = numeric_limits::max(); 614 | int64_t max_ref_offset = -1; 615 | for (handle_t ref_handle : ref_handles) { 616 | vector steps = graph->steps_of_handle(ref_handle); 617 | assert(steps.size() == 1); 618 | path_handle_t ref_path_handle = graph->get_path_handle_of_step(steps.back()); 619 | ref_path_set.insert(ref_path_handle); 620 | // assumption: only one reference path in component 621 | // (fair for minigraph, but may need to do better than prefix for path selection) 622 | assert(ref_path_set.size() == 1); 623 | int64_t ref_offset = position_index.at(ref_handle); 624 | int64_t ref_offset_rev = position_index.at(graph->flip(ref_handle)); 625 | min_ref_offset = std::min(min_ref_offset, min(ref_offset, ref_offset_rev)); 626 | max_ref_offset = std::max(max_ref_offset, max(ref_offset, ref_offset_rev)); 627 | assert(max_ref_offset >= min_ref_offset); 628 | } 629 | if (!ref_path_set.empty()) { 630 | assert(ref_path_set.size() == 1); 631 | Anchor& anchor = ref_index[graph->get_id(handle)]; 632 | anchor.path_handle = *ref_path_set.begin(); 633 | anchor.min_offset = min_ref_offset; 634 | anchor.max_offset = max_ref_offset; 635 | assert(anchor.max_offset >= anchor.min_offset); 636 | } 637 | }, true); 638 | 639 | // merge up the indexes 640 | for (size_t i = 1; i < thread_results.size(); ++i) { 641 | for (const auto& id_anchor : thread_results[i]) { 642 | thread_results[0][id_anchor.first] = id_anchor.second; 643 | } 644 | thread_results[i].clear(); 645 | } 646 | 647 | return thread_results[0]; 648 | } 649 | 650 | pair, unordered_map> load_paf(ifstream& paf_file) { 651 | 652 | vector> numbered_lines; 653 | string buffer; 654 | int64_t line_no = 0; 655 | while (getline(paf_file, buffer)) { 656 | PafLine paf_line = parse_paf_line(buffer); 657 | // dont use this 658 | paf_line.cigar = ""; 659 | numbered_lines.push_back(make_pair(line_no++, paf_line)); 660 | } 661 | std::sort(numbered_lines.begin(), numbered_lines.end(), 
[&](const pair& p1, const pair& p2) { 662 | return p1.second.query_name < p2.second.query_name || 663 | (p1.second.query_name == p2.second.query_name && p1.second.query_start < p2.second.query_start); 664 | }); 665 | 666 | vector paf_lines; 667 | unordered_map orig_to_sorted; 668 | for (int64_t i = 0; i < numbered_lines.size(); ++i) { 669 | paf_lines.push_back(numbered_lines[i].second); 670 | orig_to_sorted[numbered_lines[i].first] = i; 671 | } 672 | 673 | return make_pair(paf_lines, orig_to_sorted); 674 | } 675 | 676 | int64_t for_each_query_block(const vector& paf_lines, const vector& filtered_lines, 677 | function visit_block) { 678 | if (paf_lines.empty()) { 679 | assert(false); 680 | } 681 | int64_t block_start = -1; 682 | int64_t block_end = -1; 683 | string prev_query; 684 | int64_t num_visits = 0; 685 | for (int64_t i = 0; i < paf_lines.size(); ++i) { 686 | if (filtered_lines[i]) { 687 | continue; 688 | } 689 | const PafLine& paf = paf_lines[i]; 690 | if (block_start == -1) { 691 | block_start = i; 692 | } else if (paf.query_name != prev_query) { 693 | assert(!prev_query.empty()); 694 | if (block_start > -1) { 695 | // visit the previous block 696 | visit_block(block_start, block_end); 697 | } 698 | ++num_visits; 699 | //start a new block 700 | block_start = i; 701 | } 702 | // update end of current block 703 | block_end = i; 704 | prev_query = paf.query_name; 705 | } 706 | 707 | if (block_end != -1) { 708 | // visit last block if present 709 | visit_block(block_start, block_end); 710 | ++num_visits; 711 | } 712 | return num_visits; 713 | } 714 | 715 | unordered_map> index_deletions(const PathHandleGraph* graph, const unordered_map& index) { 716 | 717 | vector>>> thread_deletions(get_thread_count()); 718 | 719 | // get approximate deletion intervals using the index 720 | graph->for_each_edge([&](edge_t edge) { 721 | const Anchor& a1 = index.at(graph->get_id(edge.first)); 722 | const Anchor& a2 = index.at(graph->get_id(edge.second)); 723 | if (a1.path_handle 
== a2.path_handle) { 724 | Interval interval(0, 0, 0); 725 | if (a1.min_offset < a2.min_offset) { 726 | interval.start = a1.max_offset; 727 | interval.stop = a2.min_offset; 728 | } else { 729 | interval.start = a2.max_offset; 730 | interval.stop = a1.min_offset; 731 | } 732 | interval.value = interval.stop - interval.start; 733 | if (interval.value > 1) { 734 | thread_deletions[omp_get_thread_num()][a1.path_handle].push_back(interval); 735 | } 736 | } 737 | }, true); 738 | 739 | for (size_t i = 1; i < thread_deletions.size(); ++i) { 740 | for (const auto& pi : thread_deletions[i]) { 741 | for (const auto& interval : pi.second) { 742 | thread_deletions[0][pi.first].push_back(interval); 743 | } 744 | } 745 | thread_deletions[i].clear(); 746 | } 747 | 748 | unordered_map> path_to_tree; 749 | for (const auto& pi : thread_deletions[0]) { 750 | path_to_tree[pi.first] = IntervalTree(pi.second); 751 | } 752 | return path_to_tree; 753 | } 754 | 755 | 756 | pair, unordered_map> load_trans(const string& trans_path) { 757 | ifstream trans_file(trans_path); 758 | if (!trans_file) { 759 | cerr << "[filter-paf-deletions] error: Unable to load trans file" << endl; 760 | exit(1); 761 | } 762 | 763 | unordered_map mg_to_vg; 764 | unordered_map vg_to_mg; 765 | 766 | string buffer; 767 | while (getline(trans_file, buffer)) { 768 | vector toks; 769 | split_delims(buffer, "\t\n", toks); 770 | assert(toks.size() == 3 && toks[0] == "T"); 771 | string& mg_name = toks[1]; 772 | bool has_prefix = mg_name.compare(0, 3, "id=") == 0; 773 | mg_to_vg[mg_name] = stol(toks[2]); 774 | // hack to support prefixed or not minigraph 775 | // just by keeping both versions in the map no matter what 776 | // todo: parameterize prefix name 777 | if (has_prefix) { 778 | mg_to_vg[strip_prefix(mg_name)] = stol(toks[2]); 779 | } else { 780 | mg_to_vg["id=_MINIGRAPH_|" + mg_name] = stol(toks[2]); 781 | } 782 | vg_to_mg[stol(toks[2])] = mg_name; 783 | } 784 | 785 | return make_pair(mg_to_vg, vg_to_mg); 786 | } 
787 | 788 | unique_ptr load_graph(istream& graph_stream) { 789 | 790 | char magic_bytes[4]; 791 | graph_stream.read(magic_bytes, 4); 792 | uint32_t magic_number = ntohl(*((uint32_t*) magic_bytes)); 793 | graph_stream.clear(); 794 | graph_stream.seekg(0, ios::beg); 795 | 796 | MutablePathMutableHandleGraph* graph; 797 | if (magic_number == PackedGraph().get_magic_number()) { 798 | graph = new PackedGraph(); 799 | } else if (magic_number == HashGraph().get_magic_number()) { 800 | graph = new HashGraph(); 801 | } else { 802 | cerr << "Unable to parse input graph with magic number " << magic_number << endl; 803 | exit(1); 804 | } 805 | dynamic_cast(graph)->deserialize(graph_stream); 806 | 807 | return unique_ptr(graph); 808 | } 809 | 810 | // return -Delta for insertion, +Delta for deletion and 0 if it doesn't pass thresholds 811 | int64_t check_delta(int64_t max_deletion_threshold, int64_t max_insertion_threshold, const PafDelta& paf_delta, double overlap_threshold, 812 | double deletion_size_threshold) { 813 | int64_t ret = 0; 814 | if (paf_delta.delta != 0) { 815 | // note: paf_delta.delta is ref-query, so deletions are positive and insertions are negative 816 | if (paf_delta.delta < 0 && max_insertion_threshold > 0 && -paf_delta.delta > max_insertion_threshold) { 817 | ret = paf_delta.delta; 818 | } else if (paf_delta.delta > 0 && max_deletion_threshold > 0 && 819 | (paf_delta.delta > max_deletion_threshold || 820 | (deletion_size_threshold >= 0 && paf_delta.query_len < deletion_size_threshold * abs(paf_delta.delta))) && 821 | abs((double)paf_delta.ref_overlap_size / paf_delta.delta) < overlap_threshold) { 822 | ret = paf_delta.delta; 823 | } 824 | } 825 | return ret; 826 | } 827 | 828 | PafDelta get_delta(path_handle_t ref_path, const PafLine& prev_paf, const PafLine& cur_paf, 829 | const unordered_map& mg_to_vg, const unordered_map& ref_index, 830 | const unordered_map>& ref_deletions) { 831 | 832 | PafDelta paf_delta; 833 | 834 | paf_delta.query_len = 
prev_paf.query_len; 835 | assert(paf_delta.query_len == cur_paf.query_len); 836 | 837 | paf_delta.query_delta = cur_paf.query_start - prev_paf.query_end; // not abs because sorted 838 | 839 | nid_t prev_target_id = mg_to_vg.at(prev_paf.target_name); 840 | const Anchor& prev_anchor = ref_index.at(prev_target_id); 841 | 842 | nid_t cur_target_id = mg_to_vg.at(cur_paf.target_name); 843 | const Anchor& cur_anchor = ref_index.at(cur_target_id); 844 | 845 | // todo : verify 846 | paf_delta.cur_ref_start = cur_anchor.min_offset + cur_paf.target_start; 847 | paf_delta.cur_ref_end = cur_anchor.max_offset - (cur_paf.target_len - cur_paf.target_end); 848 | paf_delta.prev_ref_start = prev_anchor.min_offset + prev_paf.target_start; 849 | paf_delta.prev_ref_end = prev_anchor.max_offset - (prev_paf.target_len - prev_paf.target_end); 850 | 851 | int64_t cur_ref_start = paf_delta.cur_ref_start; 852 | int64_t cur_ref_end = paf_delta.cur_ref_end; 853 | int64_t prev_ref_start = paf_delta.prev_ref_start; 854 | int64_t prev_ref_end = paf_delta.prev_ref_end; 855 | 856 | // sort the ref intervals 857 | if (cur_ref_start < prev_ref_start) { 858 | swap(cur_ref_start, prev_ref_start); 859 | swap(cur_ref_end, prev_ref_end); 860 | } 861 | paf_delta.ref_delta = cur_ref_start - prev_ref_end; 862 | 863 | paf_delta.delta = paf_delta.ref_delta > 0 ? 
paf_delta.ref_delta - paf_delta.query_delta : 0; 864 | 865 | paf_delta.ref_overlap_size = 0; 866 | if (paf_delta.delta > 0) { 867 | if (ref_deletions.count(ref_path)) { 868 | vector> overlaps = ref_deletions.at(ref_path).findOverlapping(prev_ref_end, cur_ref_start); 869 | for (const auto& overlap : overlaps) { 870 | int64_t intersection_start = max(prev_ref_end, overlap.start); 871 | int64_t intersection_stop = min(cur_ref_start, overlap.stop); 872 | paf_delta.ref_overlap_size = max(paf_delta.ref_overlap_size, intersection_stop - intersection_start); 873 | } 874 | } 875 | } 876 | 877 | return paf_delta; 878 | } 879 | -------------------------------------------------------------------------------- /hal2vg.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2016 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | // This file was created by merging hal2sg.cpp and sg2vg.cpp with 8 | // a small amount of glue for the interface. 
9 | 10 | //#define debug 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "stPinchGraphs.h" 20 | #include "bdsg/packed_graph.hpp" 21 | #include "bdsg/hash_graph.hpp" 22 | #include "hal.h" 23 | 24 | using namespace std; 25 | using namespace hal; 26 | using namespace handlegraph; 27 | using namespace bdsg; 28 | using namespace handlegraph; 29 | 30 | static void initParser(CLParser* optionsParser) { 31 | optionsParser->addArgument("halFile", "input hal file"); 32 | optionsParser->addOption("refGenomes", 33 | "comma-separated (no spaces) genomes to treat as reference paths with all others as haplotype paths (default=all genomes)", 34 | "\"\""); 35 | optionsParser->addOption("rootGenome", 36 | "process only genomes in clade with specified root" 37 | " (HAL root if empty)", 38 | "\"\""); 39 | optionsParser->addOption("targetGenomes", 40 | "comma-separated (no spaces) list of target genomes " 41 | "(others are excluded) (all leaves if empty)", 42 | "\"\""); 43 | optionsParser->addOptionFlag("noAncestors", 44 | "don't write ancestral paths, nor sequence exclusive to ancestral genomes", 45 | false); 46 | optionsParser->addOption("ignoreGenomes", 47 | "comma-separated (no spaces) list of genomes to ignore", 48 | "\"\""); 49 | optionsParser->addOption("outputFormat", 50 | "output graph format in {pg, hg} [default=pg]", 51 | "pg"); 52 | optionsParser->addOption("chop", 53 | "chop up nodes in output graph so they are not longer than given length", 54 | 0); 55 | optionsParser->addOptionFlag("progress", 56 | "show progress", 57 | false); 58 | optionsParser->setDescription("Convert HAL alignment to handle graph"); 59 | 60 | } 61 | 62 | static void add_genome_threads(const Genome* genome, 63 | stPinchThreadSet* threads, 64 | vector& IDToName, 65 | unordered_map& nameToID); 66 | 67 | static void pinch_genome(const Genome* genome, 68 | stPinchThreadSet* threads, 69 | unordered_map& nameToID, 70 | const vector& targetNames, 71 
| unordered_map>& snp_cache); 72 | 73 | static void pinch_snp(const Genome* genome, 74 | stPinchThreadSet* threads, 75 | unordered_map& nameToID, 76 | const TopSegmentIteratorPtr& topIt, 77 | int64_t topOffset, 78 | ColumnIteratorPtr& colIt, 79 | char topBase, 80 | stPinchThread* topThread, 81 | unordered_map>& snp_cache); 82 | 83 | static void pinch_to_handle(const Genome* genome, 84 | stPinchThreadSet* threadSet, 85 | const vector& IDToName, 86 | const unordered_map& nameToID, 87 | unordered_map& blockToNode, 88 | MutablePathMutableHandleGraph& graph, 89 | const vector& refNames); 90 | 91 | static void chop_graph(MutablePathMutableHandleGraph& graph, size_t maxNodeLength); 92 | 93 | static subrange_t resolve_subpath_naming(string& path_name); 94 | 95 | static size_t resolve_haplotype_naming(string& genome_name); 96 | 97 | int main(int argc, char** argv) { 98 | CLParser optionsParser; 99 | initParser(&optionsParser); 100 | string halPath; 101 | string refGenomes; 102 | string rootGenomeName; 103 | string targetGenomes; 104 | bool noAncestors; 105 | string ignoreGenomes; 106 | string outputFormat; 107 | size_t maxNodeLength; 108 | bool progress; 109 | try { 110 | optionsParser.parseOptions(argc, argv); 111 | halPath = optionsParser.getArgument("halFile"); 112 | refGenomes = optionsParser.getOption("refGenomes"); 113 | rootGenomeName = optionsParser.getOption("rootGenome"); 114 | targetGenomes = optionsParser.getOption("targetGenomes"); 115 | noAncestors = optionsParser.getFlag("noAncestors"); 116 | ignoreGenomes = optionsParser.getOption("ignoreGenomes"); 117 | outputFormat = optionsParser.getOption("outputFormat"); 118 | if (outputFormat != "pg" && outputFormat != "hg") { 119 | throw hal_exception("--outputFormat must be one of {pg, hg}"); 120 | } 121 | if (ignoreGenomes != "\"\"" && targetGenomes != "\"\"") { 122 | throw hal_exception("--ignoreGenomes and --targetGenomes options are " 123 | "mutually exclusive"); 124 | } 125 | 126 | maxNodeLength = 
optionsParser.getOption("chop"); 127 | progress = optionsParser.getFlag("progress"); 128 | } 129 | catch(exception& e) { 130 | cerr << e.what() << endl; 131 | optionsParser.printUsage(cerr); 132 | exit(1); 133 | } 134 | try { 135 | AlignmentConstPtr alignment(openHalAlignment(halPath, &optionsParser)); 136 | if (alignment->getNumGenomes() == 0) { 137 | throw hal_exception("input hal alignmenet is empty"); 138 | } 139 | 140 | vector refNames; 141 | if (refGenomes != "\"\"") { 142 | refNames = chopString(refGenomes, ","); 143 | std::sort(refNames.begin(), refNames.end()); 144 | } 145 | 146 | // default to alignment root if none specified 147 | bool givenRoot = true; 148 | if (rootGenomeName == "\"\"") { 149 | givenRoot = false; 150 | rootGenomeName = alignment->getRootName(); 151 | const Genome* rootGenome = alignment->openGenome(rootGenomeName); 152 | if (rootGenome == NULL) { 153 | throw hal_exception(string("Root genome, ") + rootGenomeName + 154 | ", not found in alignment"); 155 | } 156 | alignment->closeGenome(rootGenome); 157 | } 158 | 159 | vector ignoreNames; 160 | if (ignoreGenomes != "\"\"") { 161 | ignoreNames = chopString(ignoreGenomes, ","); 162 | std::sort(ignoreNames.begin(), ignoreNames.end()); 163 | } 164 | 165 | vector targetNames; 166 | bool givenTargets; 167 | if (targetGenomes != "\"\"") { 168 | // if we're supplied targets, we use them 169 | targetNames = chopString(targetGenomes, ","); 170 | givenTargets = true; 171 | } else { 172 | // otherwise, we take all the leaves below the root, except any that are ignored 173 | vector leafNames = alignment->getLeafNamesBelow(rootGenomeName); 174 | for (size_t i = 0; i < leafNames.size(); ++i) { 175 | if (!std::binary_search(ignoreNames.begin(), ignoreNames.end(), leafNames[i])) { 176 | targetNames.push_back(leafNames[i]); 177 | } 178 | } 179 | givenTargets = false; 180 | } 181 | std::sort(targetNames.begin(), targetNames.end()); 182 | 183 | // keep track of internal nodes needed to transitively align 
our targets 184 | vector spanningNames; 185 | set targetSet; 186 | for (size_t i = 0; i < targetNames.size(); ++i) { 187 | const Genome* targetGenome = alignment->openGenome(targetNames[i]); 188 | if (targetGenome == NULL) { 189 | throw hal_exception(string("Target genome, ") + targetNames[i] + 190 | ", not found in alignment"); 191 | } 192 | targetSet.insert(targetGenome); 193 | } 194 | const Genome* rootGenome = getLowestCommonAncestor(targetSet); 195 | set targetSetCpy = targetSet; 196 | getGenomesInSpanningTree(targetSetCpy, targetSet); 197 | if (!givenRoot) { 198 | // update our root if it wasn't user-specified 199 | rootGenomeName = rootGenome->getName(); 200 | } 201 | for (set::iterator i = targetSet.begin(); i != targetSet.end(); ++i) { 202 | if ((*i)->getNumChildren() > 0) { 203 | spanningNames.push_back((*i)->getName()); 204 | } 205 | alignment->closeGenome(*i); 206 | } 207 | std::sort(spanningNames.begin(), spanningNames.end()); 208 | 209 | if (progress) { 210 | cerr << "Root: " << rootGenomeName << endl; 211 | if (!targetNames.empty()) { 212 | cerr << "Targets:"; 213 | for (size_t i = 0; i < targetNames.size(); ++i) { 214 | cerr << " " << targetNames[i]; 215 | } 216 | cerr << endl; 217 | } 218 | if (!spanningNames.empty()) { 219 | cerr << "Spanning:"; 220 | for (size_t i = 0; i < spanningNames.size(); ++i) { 221 | cerr << " " << spanningNames[i]; 222 | } 223 | cerr << endl; 224 | } 225 | if (!ignoreNames.empty()) { 226 | cerr << "Ignore:"; 227 | for (size_t i = 0; i < ignoreNames.size(); ++i) { 228 | cerr << " " << ignoreNames[i]; 229 | } 230 | cerr << endl; 231 | } 232 | } 233 | 234 | // map Sequence pointers to integers (assumes sequence pointers stable within hal) 235 | vector IDToName; 236 | unordered_map nameToID; 237 | 238 | // start up our pinch graph 239 | stPinchThreadSet* threadSet = stPinchThreadSet_construct(); 240 | 241 | const Genome* parentGenome = nullptr; 242 | string parentName; 243 | 244 | deque queue = {rootGenomeName}; 245 | 246 | 
vector pinchGenomes; 247 | 248 | while (!queue.empty()) { 249 | string genomeName = queue.front(); 250 | queue.pop_front(); 251 | 252 | // we have a target set, and this genome isn't in it, and this genome isn't needed to span it 253 | // so we can ignore it completely 254 | bool ignoreGenome = (!std::binary_search(targetNames.begin(), targetNames.end(), genomeName) && 255 | !std::binary_search(spanningNames.begin(), spanningNames.end(), genomeName) && 256 | genomeName != rootGenomeName); 257 | 258 | const Genome* genome = alignment->openGenome(genomeName); 259 | string curParent = alignment->getParentName(genomeName); 260 | 261 | // add the genome sequences as threads 262 | if (!ignoreGenome) { 263 | if (progress && !(!curParent.empty() && genomeName != rootGenomeName)) { 264 | cerr << "adding threads from " << genome->getName() << endl; 265 | } 266 | add_genome_threads(genome, threadSet, IDToName, nameToID); 267 | } 268 | 269 | if (!ignoreGenome && !curParent.empty() && genomeName != rootGenomeName) { 270 | // load up the parent genome if it's not already open, taking care 271 | // to only ever have one parent open at a time 272 | if (curParent != parentName) { 273 | if (parentGenome != nullptr) { 274 | alignment->closeGenome(parentGenome); 275 | } 276 | parentName = curParent; 277 | parentGenome = alignment->openGenome(parentName); 278 | } 279 | 280 | // pinching must now be done in second pass, so we queue up the genome here 281 | pinchGenomes.push_back(genome->getName()); 282 | } 283 | 284 | // recurse on children 285 | vector childs = alignment->getChildNames(genomeName); 286 | for (size_t i = 0; i < childs.size(); ++i) { 287 | queue.push_back(childs[i]); 288 | } 289 | 290 | // todo: this logic not very efficient for normal (ie non-star trees) 291 | alignment->closeGenome(genome); 292 | 293 | } 294 | 295 | if (parentGenome != nullptr) { 296 | alignment->closeGenome(parentGenome); 297 | } 298 | 299 | // do all the pinching 300 | unordered_map> snp_cache; 301 | 
for (size_t i = 0; i < pinchGenomes.size(); ++i) { 302 | 303 | // pinch the child with its parent 304 | if (progress) { 305 | cerr << "pinching " << pinchGenomes[i] << endl; 306 | } 307 | pinch_genome(alignment->openGenome(pinchGenomes[i]), threadSet, nameToID, targetNames, snp_cache); 308 | } 309 | snp_cache.clear(); 310 | 311 | // clean up the pinch graph 312 | if (progress) { 313 | cerr << "merging trivial segments and blocks in pinch graph" << endl; 314 | } 315 | stPinchThreadSet_joinTrivialBoundaries(threadSet); 316 | 317 | // make a handle graph 318 | unique_ptr graph; 319 | if (outputFormat == "pg") { 320 | graph = unique_ptr(new PackedGraph()); 321 | } else if (outputFormat == "hg") { 322 | graph = unique_ptr(new HashGraph()); 323 | } else { 324 | assert(false); 325 | } 326 | 327 | // keep track of where blocks fit into the handle graph 328 | unordered_map blockToNode; 329 | 330 | // start iterating over the genomes again in order to export to handle graph 331 | queue = {rootGenomeName}; 332 | while (!queue.empty()) { 333 | string genomeName = queue.front(); 334 | queue.pop_front(); 335 | 336 | // skip it if 337 | // it's an ancestor and we don't want ancestors or 338 | // if we have targets and it's not in it or 339 | // if it's on the ignore list 340 | bool ignoreGenome = ((noAncestors && !alignment->getChildNames(genomeName).empty()) || 341 | (givenTargets && !std::binary_search(targetNames.begin(), targetNames.end(), genomeName)) || 342 | (std::binary_search(ignoreNames.begin(), ignoreNames.end(), genomeName))); 343 | if (!ignoreGenome) { 344 | const Genome* genome = alignment->openGenome(genomeName); 345 | 346 | if (progress) { 347 | cerr << "converting " << genomeName << " with " << genome->getNumSequences() 348 | << " sequences and total length " << genome->getSequenceLength() << endl; 349 | } 350 | pinch_to_handle(genome, threadSet, IDToName, nameToID, blockToNode, *graph, refNames); 351 | 352 | alignment->closeGenome(genome); 353 | } 354 | 355 | 
vector childs = alignment->getChildNames(genomeName); 356 | for (size_t i = 0; i < childs.size(); ++i) { 357 | queue.push_back(childs[i]); 358 | } 359 | } 360 | 361 | // free the pinch graph 362 | stPinchThreadSet_destruct(threadSet); 363 | 364 | // free the hal 365 | alignment = AlignmentConstPtr(); 366 | 367 | // chop 368 | if (maxNodeLength > 0) { 369 | if (progress) { 370 | cerr << "chopping graph to max node size " << maxNodeLength << endl; 371 | } 372 | chop_graph(*graph, maxNodeLength); 373 | } 374 | 375 | // write out the graph 376 | if (progress) { 377 | cerr << "serializing graph" << endl; 378 | } 379 | dynamic_cast(graph.get())->serialize(cout); 380 | } 381 | catch(exception& e) { 382 | cerr << e.what() << endl; 383 | exit(1); 384 | } 385 | 386 | return 0; 387 | } 388 | 389 | // Add every sequence from the genome into the pinch graph 390 | void add_genome_threads(const Genome* genome, 391 | stPinchThreadSet* threads, 392 | vector& IDToName, 393 | unordered_map& nameToID) { 394 | 395 | for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { 396 | const Sequence *sequence = seqIt->getSequence(); 397 | hal_size_t seqLen = sequence->getSequenceLength(); 398 | string name = sequence->getFullName(); 399 | // update lookups to map hal sequence to numeric id 400 | int64_t seqID = IDToName.size(); 401 | nameToID[name] = seqID; 402 | IDToName.push_back(name); 403 | // add to thread set 404 | #ifdef debug 405 | cerr << "Adding sequence " << name << " as thread " << seqID << " with length " << seqLen << endl; 406 | #endif 407 | stPinchThreadSet_addThread(threads, seqID, 0, seqLen); 408 | } 409 | } 410 | 411 | // Use exact pairwise alginments from genome to its parent to make the pinch graph 412 | void pinch_genome(const Genome* genome, 413 | stPinchThreadSet* threads, 414 | unordered_map& nameToID, 415 | const vector& targetNames, 416 | unordered_map>& snp_cache) { 417 | 418 | TopSegmentIteratorPtr topIt = 
genome->getTopSegmentIterator(); 419 | BottomSegmentIteratorPtr botIt = genome->getParent()->getBottomSegmentIterator(); 420 | 421 | // make a target set for column iterator pinching. unfortunately this means 422 | // opening every single genome 423 | const Alignment* alignment = genome->getAlignment(); 424 | set targets; 425 | for (size_t i = 0; i < targetNames.size(); ++i) { 426 | targets.insert(alignment->openGenome(targetNames[i])); 427 | } 428 | 429 | ColumnIteratorPtr colIt = genome->getColumnIterator(&targets); 430 | 431 | // avoid thread set lookups 432 | const Sequence* topSeq = nullptr; 433 | const Sequence* botSeq = nullptr; 434 | stPinchThread* topThread = nullptr; 435 | stPinchThread* botThread = nullptr; 436 | string topString; 437 | string botString; 438 | 439 | // merge up consecutive segments for fewer pinches 440 | stPinchThread* prevTopThread = nullptr; 441 | stPinchThread* prevBotThread = nullptr; 442 | hal_index_t prevStart1 = -1; 443 | hal_index_t prevStart2 = -1; 444 | hal_index_t prevLength = -1; 445 | bool prevReversed = false; 446 | 447 | for (; not topIt->atEnd(); topIt->toRight()) { 448 | if (topIt->tseg()->hasParent()) { 449 | botIt->toParent(topIt); 450 | 451 | // todo: lots of string lookups 452 | int64_t topID = nameToID[topIt->tseg()->getSequence()->getFullName()]; 453 | int64_t botID = nameToID[botIt->bseg()->getSequence()->getFullName()]; 454 | 455 | if (topIt->tseg()->getSequence() != topSeq) { 456 | topSeq = topIt->tseg()->getSequence(); 457 | topThread = stPinchThreadSet_getThread(threads, topID); 458 | } 459 | if (botIt->bseg()->getSequence() != botSeq) { 460 | botSeq = botIt->bseg()->getSequence(); 461 | botThread = stPinchThreadSet_getThread(threads, botID); 462 | } 463 | 464 | topIt->getString(topString); 465 | botIt->getString(botString); 466 | 467 | #ifdef debug 468 | cerr << "pinching " << endl 469 | << " " << *topIt << endl 470 | << " " << topString << endl 471 | << " " << *botIt << endl 472 | << " " << botString << 
endl; 473 | #endif 474 | 475 | int64_t first_match = -1; 476 | int64_t last_match = -1; 477 | for (int64_t i = 0; i < (int64_t)topString.length(); ++i) { 478 | if (std::toupper(topString[i]) == std::toupper(botString[i])) { 479 | if (first_match == -1) { 480 | first_match = i; 481 | } 482 | last_match = i; 483 | } else if (colIt.get() != NULL) { 484 | pinch_snp(genome, threads, nameToID, topIt, i, colIt, 485 | std::toupper(topString[i]), topThread, snp_cache); 486 | } 487 | if (std::toupper(topString[i]) != std::toupper(botString[i]) || i == (int64_t)topString.length() - 1) { 488 | if (last_match >= first_match && first_match >= 0) { 489 | hal_index_t length = last_match - first_match + 1; 490 | hal_index_t start1 = topIt->tseg()->getStartPosition() + first_match - topSeq->getStartPosition(); 491 | hal_index_t start2; 492 | if (!botIt->getReversed()) { 493 | start2 = botIt->bseg()->getStartPosition() + first_match - botSeq->getStartPosition(); 494 | } else { 495 | start2 = botIt->bseg()->getEndPosition() - first_match - length + 1 - botSeq->getStartPosition(); 496 | } 497 | #ifdef debug 498 | cerr << " inserting (fm=" << first_match <<",lm=" << last_match << ", s1=" << start1 << ",s2=" << start2 << ",l=" << length 499 | << ", hl1=" << topSeq->getSequenceLength() << ",hl2=" << botSeq->getSequenceLength() << ",pl1=" << stPinchThread_getLength(topThread) 500 | << ", pl2=" << stPinchThread_getLength(botThread) << ", rev=" << botIt->getReversed() 501 | << " sp1g=" << (start1 + topSeq->getStartPosition()) << " sp2g=" << (start2 + botSeq->getStartPosition()) << endl 502 | << " " << topString.substr(first_match, length) << endl; 503 | #endif 504 | // are we dealing with two consectuive segments? 
505 | bool canMerge = topThread == prevTopThread && 506 | botThread == prevBotThread && 507 | start1 == prevStart1 + prevLength && 508 | botIt->getReversed() == prevReversed && 509 | ((!prevReversed && start2 == prevStart2 + prevLength) || 510 | (prevReversed && start2 + length == prevStart2)); 511 | 512 | if (canMerge) { 513 | // if consecutive, just merge 514 | prevLength += length; 515 | if (botIt->getReversed()) { 516 | prevStart2 = start2; 517 | } 518 | } else { 519 | // otherwise 520 | if (prevTopThread != nullptr) { 521 | // pinch the last segment 522 | stPinchThread_pinch(prevTopThread, 523 | prevBotThread, 524 | prevStart1, 525 | prevStart2, 526 | prevLength, 527 | !prevReversed); 528 | } 529 | // and update our previous 530 | prevTopThread = topThread; 531 | prevBotThread = botThread; 532 | prevStart1 = start1; 533 | prevStart2 = start2; 534 | prevLength = length; 535 | prevReversed = botIt->getReversed(); 536 | } 537 | 538 | } 539 | first_match = -1; 540 | last_match = -1; 541 | } 542 | } 543 | } 544 | } 545 | // do that last pinch 546 | if (prevTopThread != nullptr) { 547 | stPinchThread_pinch(prevTopThread, 548 | prevBotThread, 549 | prevStart1, 550 | prevStart2, 551 | prevLength, 552 | !prevReversed); 553 | } 554 | } 555 | 556 | // Use the column iterator to find all alignments of this snp and pinch accordingly 557 | // 558 | // Todo: Worried this might be too slow to use at scale. Also, it blows away all previous 559 | // efforts in hal2vg to be cache-friendly by only loading 2 genomes at a time, so it may 560 | // hog lots of memory. On a star tree, it may just be better to manually scan the siblings 561 | // before resorting to the column iterator. Or perhaps do everything in the pinch graph 562 | // by pinching snps then doing a pass over the graph to break them apart once its constructed. 
563 | void pinch_snp(const Genome* genome, 564 | stPinchThreadSet* threads, 565 | unordered_map& nameToID, 566 | const TopSegmentIteratorPtr& topIt, 567 | int64_t topOffset, 568 | ColumnIteratorPtr& colIt, 569 | char topBase, 570 | stPinchThread* topThread, 571 | unordered_map>& snp_cache) { 572 | 573 | const Sequence* topSeq = topIt->tseg()->getSequence(); 574 | hal_index_t topStart = topIt->tseg()->getStartPosition() + topOffset - topSeq->getStartPosition(); 575 | 576 | vector& cache_rec = snp_cache[topThread]; 577 | if (!cache_rec.empty() && cache_rec[topStart] == true) { 578 | // we've already pinched this base 579 | return; 580 | } 581 | 582 | // move the column iterator into position 583 | colIt->toSite(topStart + topSeq->getStartPosition(), topStart + topSeq->getStartPosition() + 1); 584 | 585 | const ColumnIterator::ColumnMap* columnMap = colIt->getColumnMap(); 586 | 587 | // remember all equivalence classes of pinches 588 | map>> base_pinches; 589 | 590 | // scan through all the homologous bases, breaking them into lists for each possible nucleotide 591 | for (ColumnIterator::ColumnMap::const_iterator cmi = columnMap->begin(); cmi != columnMap->end(); ++cmi) { 592 | const Sequence* sequence = cmi->first; 593 | for (ColumnIterator::DNASet::const_iterator dsi = cmi->second->begin(); dsi != cmi->second->end(); ++dsi) { 594 | char botBase = std::toupper((*dsi)->getBase()); 595 | 596 | int64_t otherID = nameToID[sequence->getFullName()]; 597 | stPinchThread* otherThread = stPinchThreadSet_getThread(threads, otherID); 598 | hal_index_t otherStart = (*dsi)->getArrayIndex() - sequence->getStartPosition(); 599 | 600 | base_pinches[botBase].push_back(make_tuple(otherThread, otherStart, !(*dsi)->getReversed())); 601 | 602 | } 603 | } 604 | 605 | // pinch through each nucleotde 606 | for (auto& bp : base_pinches) { 607 | vector>& other_positions = bp.second; 608 | for (size_t i = 0; i < other_positions.size(); ++i) { 609 | if (i > 0) { 610 | 
stPinchThread_pinch(get<0>(other_positions[0]), 611 | get<0>(other_positions[i]), 612 | get<1>(other_positions[0]), 613 | get<1>(other_positions[i]), 614 | 1, 615 | get<2>(other_positions[0]) == get<2>(other_positions[i])); 616 | } 617 | // update the cache 618 | vector& cache_vec = snp_cache[get<0>(other_positions[i])]; 619 | if (cache_vec.empty()) { 620 | cache_vec.resize(stPinchThread_getLength(get<0>(other_positions[i])), false); 621 | } 622 | cache_vec[get<1>(other_positions[i])] = true; 623 | } 624 | } 625 | } 626 | 627 | // create nodes and edges for a genome using the pinch graph 628 | void pinch_to_handle(const Genome* genome, 629 | stPinchThreadSet* threadSet, 630 | const vector& IDToName, 631 | const unordered_map& nameToID, 632 | unordered_map& blockToNode, 633 | MutablePathMutableHandleGraph& graph, 634 | const vector& refNames) { 635 | 636 | // iterate over the sequences of the genome 637 | for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { 638 | const Sequence *sequence = seqIt->getSequence(); 639 | PathSense sense = PathSense::REFERENCE; 640 | if (!refNames.empty() && !std::binary_search(refNames.begin(), refNames.end(), genome->getName())) { 641 | sense = PathSense::HAPLOTYPE; 642 | } 643 | int64_t seqID = nameToID.find(sequence->getFullName())->second; 644 | stPinchThread* thread = stPinchThreadSet_getThread(threadSet, seqID); 645 | 646 | // cactus_graphmap_split can make paths like contig_sub_1_3. here we convert that 647 | // into a format vg can (sometimes) understand contig[1-3]. 
648 | // (the reason we go through this is that assembly hubs can't handle any special characters apparently) 649 | string parsed_name = sequence->getName(); 650 | subrange_t subpath = resolve_subpath_naming(parsed_name); 651 | string parsed_genome_name = genome->getName(); 652 | size_t haplotype = resolve_haplotype_naming(parsed_genome_name); 653 | if (haplotype == PathMetadata::NO_HAPLOTYPE) { 654 | haplotype = 0; 655 | } 656 | // create the path 657 | path_handle_t pathHandle = graph.create_path(sense, 658 | parsed_genome_name, 659 | parsed_name, 660 | haplotype, 661 | sense == PathSense::HAPLOTYPE ? 0 : PathMetadata::NO_PHASE_BLOCK, 662 | subpath, 663 | false); 664 | string pathString; 665 | 666 | // iterate over the segments of the sequence 667 | stPinchSegment* prevSeg = nullptr; 668 | handle_t prevHandle; 669 | stPinchSegment* lastSeg = stPinchThread_getLast(thread); 670 | hal_index_t segStart = 0; 671 | string seqString; 672 | for (stPinchSegment* seg = stPinchThread_getFirst(thread); ; 673 | seg = stPinchSegment_get3Prime(seg)) { 674 | 675 | // get the segment's block. note that if it's not aligned to anything, it will have no block 676 | stPinchBlock* block = stPinchSegment_getBlock(seg); 677 | bool reversed = block != nullptr && stPinchSegment_getBlockOrientation(seg) == 0; 678 | handle_t handle; 679 | 680 | // get the segment's dna sequence from the hal 681 | sequence->getSubString(seqString, segStart, stPinchSegment_getLength(seg)); 682 | if (reversed) { 683 | // we always work in block-relative orientation 684 | reverseComplement(seqString); 685 | } 686 | 687 | // have we already converted this block? 
688 | auto bi = blockToNode.find(block); 689 | if (bi == blockToNode.end()) { 690 | // no: it is a new block 691 | handle = graph.create_handle(seqString); 692 | if (block != nullptr) { 693 | blockToNode[block] = graph.get_id(handle); 694 | } 695 | #ifdef debug 696 | cerr << "created node " << graph.get_id(handle) << " for block " << block << " from " << sequence->getFullName() << " at " << segStart 697 | << " rev=" << reversed << " len=" << seqString.length() 698 | << endl; 699 | cerr << "node seq " << graph.get_sequence(handle) << endl; 700 | #endif 701 | } else { 702 | // yes: we can find it in the table 703 | handle = graph.get_handle(bi->second); 704 | #ifdef debug 705 | cerr << "found node " << graph.get_id(handle) << " for block " << block << " from " << sequence->getFullName() << " at " << segStart 706 | << " rev=" << reversed << " len=" << seqString.length() 707 | << endl; 708 | cerr << "node seq " << graph.get_sequence(handle) << endl; 709 | cerr << "my substring " << seqString << endl; 710 | #endif 711 | } 712 | assert(!graph.get_is_reverse(handle)); 713 | if (reversed) { 714 | handle = graph.flip(handle); 715 | assert(graph.get_is_reverse(handle)); 716 | } 717 | 718 | // wire up the edge to previous 719 | if (prevSeg != nullptr) { 720 | #ifdef debug 721 | cerr << "creating edge from " << graph.get_id(prevHandle) << ":" << graph.get_is_reverse(prevHandle) << " -> " 722 | << graph.get_id(handle) << ":" << graph.get_is_reverse(handle) << endl; 723 | #endif 724 | graph.create_edge(prevHandle, handle); 725 | } 726 | 727 | // add the node to the path 728 | graph.append_step(pathHandle, handle); 729 | pathString += graph.get_sequence(handle); 730 | 731 | prevSeg = seg; 732 | prevHandle = handle; 733 | 734 | segStart += stPinchSegment_getLength(seg); 735 | 736 | if (seg == lastSeg) { 737 | break; 738 | } 739 | } 740 | 741 | // make sure the path we added is the same as the hal 742 | string halPathString; 743 | sequence->getString(halPathString); 744 | if 
(pathString.length() != halPathString.length()) { 745 | throw runtime_error("Incorrect length in coverted path for " + sequence->getFullName() + ": " + std::to_string(pathString.length()) + 746 | ". Should be: " + std::to_string(halPathString.length())); 747 | } 748 | vector mismatches; 749 | for (size_t i = 0; i < halPathString.size(); ++i) { 750 | if (toupper(pathString[i]) != toupper(halPathString[i])) { 751 | mismatches.push_back(i); 752 | } 753 | } 754 | if (!mismatches.empty()) { 755 | stringstream msg; 756 | msg << mismatches.size() << " mismatches found in converted path for " << sequence->getFullName() << ":\n"; 757 | for (size_t i = 0; i < mismatches.size() && i < 10; ++i) { 758 | msg << " path[" << mismatches[i] << "]=" << pathString[mismatches[i]] << ". should be " << halPathString[mismatches[i]] << "\n"; 759 | } 760 | throw runtime_error(msg.str()); 761 | } 762 | } 763 | } 764 | 765 | void chop_graph(MutablePathMutableHandleGraph& graph, size_t maxNodeLength) { 766 | // borrowed from https://github.com/vgteam/odgi/blob/master/src/subcommand/chop_main.cpp 767 | std::vector to_chop; 768 | graph.for_each_handle([&](const handle_t& handle) { 769 | if (graph.get_length(handle) > maxNodeLength) { 770 | to_chop.push_back(handle); 771 | } 772 | }); 773 | 774 | for (auto& handle : to_chop) { 775 | // get divide points 776 | uint64_t length = graph.get_length(handle); 777 | std::vector offsets; 778 | for (uint64_t i = maxNodeLength; i < length; i+=maxNodeLength) { 779 | offsets.push_back(i); 780 | } 781 | graph.divide_handle(handle, offsets); 782 | } 783 | } 784 | 785 | subrange_t resolve_subpath_naming(string& path_name) { 786 | size_t first_length = 0; 787 | size_t start_offset = 0; 788 | bool found_subpath = false; 789 | while (true) { 790 | size_t sp = path_name.rfind("_sub_"); 791 | if (sp != string::npos) { 792 | size_t up = path_name.rfind("_"); 793 | if (up != string::npos && up > sp + 1) { 794 | int64_t start; 795 | int64_t end; 796 | start = 
stol(path_name.substr(sp + 5, up - sp - 5)); 797 | end = stol(path_name.substr(up + 1)); 798 | stringstream new_name; 799 | start_offset += start; // final offset is sum of all nested offsets 800 | if (first_length == 0) { 801 | first_length = end - start; 802 | assert(first_length > 0); 803 | } else { 804 | // in the case of nested subpaths, the end coordinate will always 805 | // be derived from the start, plus the length of the "top" path 806 | end = start_offset + first_length; 807 | } 808 | new_name << path_name.substr(0, sp); 809 | path_name = new_name.str(); 810 | found_subpath = true; 811 | } 812 | } else { 813 | break; 814 | } 815 | } 816 | if (found_subpath) { 817 | return make_pair(start_offset, start_offset + first_length); 818 | } else { 819 | return PathMetadata::NO_SUBRANGE; 820 | } 821 | } 822 | 823 | size_t resolve_haplotype_naming(string& genome_name) { 824 | size_t haplotype = PathMetadata::NO_HAPLOTYPE; 825 | size_t dp = genome_name.rfind("."); 826 | if (dp != string::npos) { 827 | try { 828 | haplotype = stol(genome_name.substr(dp + 1)); 829 | genome_name = genome_name.substr(0, dp); 830 | } catch(...) { 831 | } 832 | } 833 | return haplotype; 834 | } 835 | -------------------------------------------------------------------------------- /halMergeChroms.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2016 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | // Merge chromosome HAL files into one big one. Only star trees with same root name supported (ie what comes out of cactus-align-batch). 
8 | 9 | //#define debug 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "hal.h" 21 | 22 | using namespace std; 23 | using namespace hal; 24 | 25 | static void initParser(CLParser* optionsParser) { 26 | optionsParser->addArgument("inFiles", "comma-separated (only way in HAL parser!) list of input HAL files to merge"); 27 | optionsParser->addArgument("outFile", "output HAL file"); 28 | optionsParser->addOptionFlag("progress", 29 | "show progress", 30 | false); 31 | optionsParser->setDescription("Merge chromosome HALs into combined file. Ancestral sequences are renamed as needed to avoid conflicts" 32 | ". Star trees only"); 33 | } 34 | 35 | // we expect to see the same ancestor sequence names in multiple input files. we uniqify them by adding 36 | // _i to them where i is the file's position in the input. 37 | static string anc_seq_name(const string& seq_name, size_t idx) { 38 | return seq_name + "_" + to_string(idx); 39 | } 40 | 41 | // get the dimensions from all genomes in all input files 42 | static pair>> get_hal_dimensions(CLParser* optionsParser, 43 | const vector& hal_paths) { 44 | // genome -> dimensions (covering all input) 45 | unordered_map> dimensions; 46 | // to check uniqueness 47 | unordered_set sequence_names; 48 | 49 | string root_name; 50 | for (size_t i = 0; i < hal_paths.size(); ++i) { 51 | const string& hal_path = hal_paths[i]; 52 | 53 | // open the hal file 54 | AlignmentConstPtr alignment(openHalAlignment(hal_path, optionsParser, READ_ACCESS)); 55 | 56 | // for every genome 57 | vector genome_names = alignment->getChildNames(alignment->getRootName()); 58 | genome_names.push_back(alignment->getRootName()); 59 | if (root_name.empty()) { 60 | root_name = alignment->getRootName(); 61 | } else if (alignment->getRootName() != root_name) { 62 | throw hal_exception("Root mismatch: " + root_name + " vs " + alignment->getRootName()); 63 | } 64 | for (const string& 
genome_name : genome_names) { 65 | const Genome* genome = alignment->openGenome(genome_name); 66 | vector& genome_dimensions = dimensions[genome_name]; 67 | // for every sequence 68 | for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { 69 | const Sequence *sequence = seqIt->getSequence(); 70 | // add a little suffix to make ancestral sequences unique 71 | string seq_name = genome->getParent() ? sequence->getName() : anc_seq_name(sequence->getName(), i); 72 | genome_dimensions.emplace_back(seq_name, 73 | sequence->getSequenceLength(), 74 | sequence->getNumTopSegments(), 75 | sequence->getNumBottomSegments()); 76 | string full_name = genome_name + "." + seq_name; 77 | if (sequence_names.count(full_name)) { 78 | throw hal_exception("Duplicate sequence name found: " + full_name); 79 | } else { 80 | sequence_names.insert(full_name); 81 | } 82 | } 83 | alignment->closeGenome(genome); 84 | } 85 | } 86 | return make_pair(root_name, dimensions); 87 | } 88 | 89 | // append each input hal to the out_alignment, one after another. all arrays are copied over, 90 | // but need to be adjsuted to reflect their new offsets. 
91 | static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, const vector& in_paths, bool progress) { 92 | 93 | // keep track of where we are in the output 94 | vector top_offsets(out_alignment->getChildNames(out_alignment->getRootName()).size(), 0); 95 | size_t bot_offset = 0; 96 | 97 | for (size_t i = 0; i < in_paths.size(); ++i) { 98 | AlignmentConstPtr in_alignment(openHalAlignment(in_paths[i], optionsParser, READ_ACCESS)); 99 | const Genome* in_root = in_alignment->openGenome(in_alignment->getRootName()); 100 | Genome* out_root = out_alignment->openGenome(in_alignment->getRootName()); 101 | assert(in_root->getName() == out_root->getName()); 102 | size_t in_root_degree = in_root->getNumChildren(); 103 | size_t out_root_degree = out_root->getNumChildren(); 104 | vector in_genomes = {in_root}; 105 | for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { 106 | in_genomes.push_back(in_alignment->openGenome(in_child_name)); 107 | } 108 | 109 | // copy the dna sequence by sequence 110 | for (const Genome* in_genome : in_genomes) { 111 | if (progress) { 112 | cerr << "[halMergeChroms]: copying dna for " << in_genome->getName() << " from " << in_paths[i] << endl; 113 | } 114 | Genome* out_genome = out_alignment->openGenome(in_genome->getName()); 115 | for (SequenceIteratorPtr in_si = in_genome->getSequenceIterator(); !in_si->atEnd(); in_si->toNext()) { 116 | const Sequence* in_sequence = in_si->getSequence(); 117 | string out_seq_name = in_genome->getParent() ? 
in_sequence->getName() : anc_seq_name(in_sequence->getName(), i); 118 | Sequence* out_sequence = out_genome->getSequence(out_seq_name); 119 | DnaIteratorPtr in_di = in_sequence->getDnaIterator(0); 120 | DnaIteratorPtr out_di = out_sequence->getDnaIterator(0); 121 | assert(in_sequence->getSequenceLength() == out_sequence->getSequenceLength()); 122 | string dna; 123 | in_sequence->getString(dna); 124 | out_sequence->setString(dna); 125 | } 126 | } 127 | 128 | // make a child index map (in -> out) for the root genome 129 | // assume : all genomes in in_genome present in out_genome 130 | vector in_ci_to_out_ci(in_root->getNumChildren()); 131 | for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { 132 | in_ci_to_out_ci.at(in_root->getChildIndex(in_alignment->openGenome(in_child_name))) = 133 | out_root->getChildIndex(out_alignment->openGenome(in_child_name)); 134 | } 135 | 136 | // copy over the bottom segments of the root 137 | if (progress) { 138 | cerr << "[halMergeChroms]: copying bottom segments for " << in_root->getName() << " from " << in_paths[i] 139 | << " with bseg offset " << bot_offset << endl; 140 | } 141 | BottomSegmentIteratorPtr in_bi = in_root->getBottomSegmentIterator(0); 142 | BottomSegmentIteratorPtr out_bi = out_root->getBottomSegmentIterator(bot_offset); 143 | for (;!in_bi->atEnd(); in_bi->toRight(), out_bi->toRight()) { 144 | // set the segment in the root genome 145 | assert(out_bi->bseg()->getArrayIndex() == in_bi->bseg()->getArrayIndex() + bot_offset); 146 | assert(out_bi->bseg()->getNumChildren() >= in_bi->bseg()->getNumChildren()); 147 | out_bi->bseg()->setTopParseIndex(NULL_INDEX); 148 | // determine the sequence-relative coordinate in the input 149 | const Sequence* in_sequence = in_bi->bseg()->getSequence(); 150 | int64_t in_start_coord = in_bi->bseg()->getStartPosition() - in_sequence->getStartPosition(); 151 | assert(in_start_coord >= 0 && in_start_coord < in_sequence->getSequenceLength()); 152 | // set 
the sequence relative coordinate in the output 153 | const Sequence* out_sequence = out_root->getSequence(anc_seq_name(in_sequence->getName(), i)); 154 | int64_t out_start_coord = out_sequence->getStartPosition() + in_start_coord; 155 | assert(out_start_coord >= 0 && out_start_coord < out_root->getSequenceLength()); 156 | out_bi->bseg()->setCoordinates(out_start_coord, in_bi->bseg()->getLength()); 157 | // set the segment in the child genomes 158 | for (size_t out_ci = 0; out_ci < out_root_degree; ++out_ci) { 159 | out_bi->bseg()->setChildIndex(out_ci, NULL_INDEX); 160 | } 161 | for (size_t in_ci = 0; in_ci < in_root_degree; ++in_ci) { 162 | size_t out_ci = in_ci_to_out_ci.at(in_ci); 163 | assert(out_ci < out_bi->bseg()->getNumChildren()); 164 | if (in_bi->bseg()->hasChild(in_ci)) { 165 | out_bi->bseg()->setChildIndex(out_ci, in_bi->bseg()->getChildIndex(in_ci) + top_offsets[out_ci]); 166 | out_bi->bseg()->setChildReversed(out_ci, in_bi->bseg()->getChildReversed(in_ci)); 167 | } 168 | } 169 | } 170 | 171 | // for every child genome, copy over the top segments 172 | for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { 173 | if (progress) { 174 | cerr << "[halMergeChroms]: copying top segments for " << in_child_name << " from " << in_paths[i] << endl; 175 | } 176 | const Genome* in_child = in_alignment->openGenome(in_child_name); 177 | Genome* out_child = out_alignment->openGenome(in_child_name); 178 | 179 | size_t in_ci = in_root->getChildIndex(in_child); 180 | size_t out_ci = in_ci_to_out_ci[in_ci]; 181 | size_t top_offset = top_offsets[out_ci]; 182 | TopSegmentIteratorPtr in_ti = in_child->getTopSegmentIterator(0); 183 | TopSegmentIteratorPtr out_ti = out_child->getTopSegmentIterator(top_offsets[out_ci]); 184 | 185 | for (;!in_ti->atEnd(); in_ti->toRight(), out_ti->toRight()) { 186 | // set the segment in the child genome 187 | assert(out_ti->tseg()->getArrayIndex() == in_ti->tseg()->getArrayIndex() + top_offset); 188 | if 
(in_ti->tseg()->hasParent()) { 189 | out_ti->tseg()->setParentIndex(in_ti->tseg()->getParentIndex() + bot_offset); 190 | out_ti->tseg()->setParentReversed(in_ti->tseg()->getParentReversed()); 191 | } else { 192 | out_ti->tseg()->setParentIndex(NULL_INDEX); 193 | } 194 | out_ti->tseg()->setBottomParseIndex(NULL_INDEX); 195 | // determine the sequence-relative coordinate in the input 196 | const Sequence* in_sequence = in_ti->tseg()->getSequence(); 197 | int64_t in_start_coord = in_ti->tseg()->getStartPosition() - in_sequence->getStartPosition(); 198 | // set the sequence relative coordinate in the output 199 | const Sequence* out_sequence = out_child->getSequence(in_sequence->getName()); 200 | int64_t out_start_coord = out_sequence->getStartPosition() + in_start_coord; 201 | out_ti->tseg()->setCoordinates(out_start_coord, in_ti->tseg()->getLength()); 202 | // set the paralogy edge 203 | if (in_ti->tseg()->hasNextParalogy()) { 204 | out_ti->tseg()->setNextParalogyIndex(in_ti->tseg()->getNextParalogyIndex() + top_offset); 205 | } else { 206 | out_ti->tseg()->setNextParalogyIndex(NULL_INDEX); 207 | } 208 | } 209 | } 210 | 211 | // update the offsets to move past the current alignment in all genomes 212 | bot_offset += in_root->getNumBottomSegments(); 213 | for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { 214 | const Genome* in_child = in_alignment->openGenome(in_child_name); 215 | size_t in_ci = in_root->getChildIndex(in_child); 216 | size_t out_ci = in_ci_to_out_ci[in_ci]; 217 | top_offsets[out_ci] += in_child->getNumTopSegments(); 218 | } 219 | } 220 | } 221 | 222 | int main(int argc, char** argv) { 223 | CLParser optionsParser(WRITE_ACCESS); 224 | initParser(&optionsParser); 225 | string in_hal_paths; 226 | string out_hal_path; 227 | bool progress; 228 | try { 229 | optionsParser.parseOptions(argc, argv); 230 | in_hal_paths = optionsParser.getArgument("inFiles"); 231 | out_hal_path = optionsParser.getArgument("outFile"); 232 | 
progress = optionsParser.getFlag("progress"); 233 | } 234 | catch(exception& e) { 235 | cerr << e.what() << endl; 236 | optionsParser.printUsage(cerr); 237 | exit(1); 238 | } 239 | 240 | vector in_paths = chopString(in_hal_paths, ","); 241 | 242 | // map genome -> dimensions for each input alignment 243 | if (progress) { 244 | cerr << "[halMergeChroms]: Scanning dimensions of " << in_paths.size() << " input files." << endl; 245 | } 246 | pair>> rd = get_hal_dimensions(&optionsParser, in_paths); 247 | string& root_name = rd.first; 248 | unordered_map>& dimensions = rd.second; 249 | 250 | // create the new file 251 | if (progress) { 252 | cerr << "[halMergeChroms]: Creating empty alignment: " << out_hal_path << endl; 253 | } 254 | AlignmentPtr alignment(openHalAlignment(out_hal_path, &optionsParser, READ_ACCESS | WRITE_ACCESS | CREATE_ACCESS)); 255 | 256 | // set up the size of each genome, staring with the root 257 | Genome* root_genome = alignment->addRootGenome(root_name); 258 | for (auto& kv : dimensions) { 259 | if (kv.first != root_name) { 260 | Genome* leaf_genome = alignment->addLeafGenome(kv.first, root_name, 1); 261 | leaf_genome->setDimensions(kv.second); 262 | } 263 | } 264 | // important to set root dimensions after adding leaves so bottom segments have right number of slots 265 | root_genome->setDimensions(dimensions.at(root_name)); 266 | 267 | // copy over over everything 268 | merge_hals(&optionsParser, alignment, in_paths, progress); 269 | 270 | if (progress) { 271 | cerr << "[halMergeChroms]: Writing merged alignment" << endl; 272 | } 273 | 274 | return 0; 275 | } 276 | 277 | -------------------------------------------------------------------------------- /halRemoveDupes.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2016 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | //#define debug 8 | 9 | #include 10 | #include 11 | 
#include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "hal.h" 17 | 18 | using namespace std; 19 | using namespace hal; 20 | 21 | static void initParser(CLParser* optionsParser) { 22 | optionsParser->addArgument("halFile", "input hal file"); 23 | optionsParser->addArgument("genome", "remove all paralogy edges from this genome"); 24 | optionsParser->setDescription("Remove paralogy edges from given genome (in place)"); 25 | } 26 | 27 | int main(int argc, char** argv) { 28 | CLParser optionsParser(WRITE_ACCESS); 29 | initParser(&optionsParser); 30 | string halPath; 31 | string genomeName; 32 | try { 33 | optionsParser.parseOptions(argc, argv); 34 | halPath = optionsParser.getArgument("halFile"); 35 | genomeName = optionsParser.getArgument("genome"); 36 | } 37 | catch(exception& e) { 38 | cerr << e.what() << endl; 39 | optionsParser.printUsage(cerr); 40 | exit(1); 41 | } 42 | try { 43 | AlignmentPtr alignment(openHalAlignment(halPath, &optionsParser, READ_ACCESS | WRITE_ACCESS)); 44 | if (alignment->getNumGenomes() == 0) { 45 | throw hal_exception("input hal alignmenet is empty"); 46 | } 47 | 48 | Genome* genome = alignment->openGenome(genomeName); 49 | if (genome == NULL) { 50 | throw hal_exception("Genome " + genomeName + " not found in alignment"); 51 | } 52 | 53 | if (genomeName == alignment->getRootName()) { 54 | throw hal_exception("Cannot run on root"); 55 | } 56 | 57 | TopSegmentIteratorPtr topIt = genome->getTopSegmentIterator(); 58 | 59 | size_t total_length = 0; 60 | size_t total_edges = 0; 61 | for (; not topIt->atEnd(); topIt->toRight()) { 62 | TopSegment* topSeg = topIt->tseg(); 63 | if (topSeg->hasNextParalogy()) { 64 | topSeg->setNextParalogyIndex(NULL_INDEX); 65 | if (!topSeg->isCanonicalParalog()) { 66 | topSeg->setParentIndex(NULL_INDEX); 67 | total_length += topSeg->getLength(); 68 | ++total_edges; 69 | } 70 | } 71 | } 72 | 73 | if (total_length > 0) { 74 | cerr << "[halRemoveDupes]: " << total_edges << " paralogy edges removed from " 
<< genomeName 75 | << " with total length " << total_length << endl; 76 | } else { 77 | cerr << "[halRemoveDupes] : No paralogy edges found in " << genomeName << endl; 78 | } 79 | } 80 | 81 | catch(exception& e) { 82 | cerr << e.what() << endl; 83 | exit(1); 84 | } 85 | 86 | return 0; 87 | } 88 | -------------------------------------------------------------------------------- /halUnclip.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2016 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | // Convert clipped sequences (like chr1_sub_110000_22220000) back to their original states 8 | 9 | //#define debug 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "hal.h" 21 | #include "commonC.h" 22 | #include "bioioC.h" 23 | #include "subpaths.h" 24 | 25 | using namespace std; 26 | using namespace hal; 27 | 28 | static void initParser(CLParser* optionsParser) { 29 | optionsParser->addArgument("inFile", "input HAL file"); 30 | optionsParser->addArgument("seqFile", "cactus-style seqfile. 1st col=genome name, 2nd col=(original) fasta file. only local paths supported"); 31 | optionsParser->addArgument("outFile", "output HAL file"); 32 | optionsParser->addOptionFlag("progress", 33 | "show progress", 34 | false); 35 | optionsParser->addOptionFlag("validate", 36 | "run a (non-exhaustive) check on the output", 37 | false); 38 | optionsParser->addOption("targetGenomes", 39 | "comma-separated (no spaces) list of target genomes " 40 | "(others are not unclipped) (all leaves if empty)", 41 | "\"\""); 42 | optionsParser->setDescription("Fill back clipped sequence (removed by cactus-preprocess) using the original fasta files" 43 | ". 
Star trees only"); 44 | } 45 | 46 | static vector split_delims(const string &s, const string& delims) { 47 | vector elems; 48 | size_t start = string::npos; 49 | for (size_t i = 0; i < s.size(); ++i) { 50 | if (delims.find(s[i]) != string::npos) { 51 | if (start != string::npos && i > start) { 52 | elems.push_back(s.substr(start, i - start)); 53 | } 54 | start = string::npos; 55 | } else if (start == string::npos) { 56 | start = i; 57 | } 58 | } 59 | if (start != string::npos && start < s.size()) { 60 | elems.push_back(s.substr(start, s.size() - start)); 61 | } 62 | return elems; 63 | } 64 | 65 | // c++ wrapper for sonlib -- load fasta file into memory 66 | static unordered_map> read_fasta(const string& fa_path) { 67 | FILE* fa_file = fopen(fa_path.c_str(), "r"); 68 | if (!fa_file) { 69 | cerr << "Unable to open fastat file: " << fa_path << endl; 70 | exit(1); 71 | } 72 | 73 | List* seqs = constructEmptyList(0, free); 74 | List* seq_lens = constructEmptyList(0, free); 75 | List* seq_names = constructEmptyList(0, free); 76 | 77 | fastaRead(fa_file, seqs, seq_lens, seq_names); 78 | 79 | // todo: should be done once, but sonlib fasta reading so slow it odesn't matter 80 | vector cmap(numeric_limits::max()); 81 | for (unsigned char i = 0; i < cmap.size(); ++i) { 82 | switch (i) { 83 | case 'a': 84 | case 'c': 85 | case 'g': 86 | case 't': 87 | case 'A': 88 | case 'C': 89 | case 'G': 90 | case 'T': 91 | cmap[i] = i; 92 | break; 93 | default: 94 | cmap[i] = 'N'; 95 | break; 96 | } 97 | } 98 | 99 | unordered_map> fa_info; 100 | for (int64_t i = 0; i < seqs->length; ++i) { 101 | string name = (char*)seq_names->list[i]; 102 | size_t len = (size_t)listGetInt(seq_lens, i); 103 | string seq = (char*)seqs->list[i]; 104 | for (size_t j = 0; j < seq.length(); ++j) { 105 | // hal doesn't like non-acgtn characters 106 | seq[j] = cmap[seq[j]]; 107 | } 108 | fa_info[name] = make_pair(len, seq); 109 | } 110 | 111 | destructList(seqs); 112 | destructList(seq_lens); 113 | 
destructList(seq_names); 114 | 115 | fclose(fa_file); 116 | 117 | return fa_info; 118 | } 119 | 120 | // do a pass over the seqfile to get the total lengths of every sequence 121 | static unordered_map get_dimensions_from_seqfile(const string& seqfile_path, const unordered_set& target_set) { 122 | unordered_map seq_map; 123 | 124 | ifstream seqfile(seqfile_path); 125 | if (!seqfile) { 126 | cerr << "[halUnclip]: Unable to open seqfile: " << seqfile_path << endl; 127 | exit(1); 128 | } 129 | 130 | string buffer; 131 | while (getline(seqfile, buffer)) { 132 | vector toks = split_delims(buffer, " \t"); 133 | if (toks.size() == 2) { 134 | string name = toks[0]; 135 | string fa_path = toks[1]; 136 | if (target_set.count(name)) { 137 | unordered_map> fa_info = read_fasta(fa_path); 138 | for (auto& fi : fa_info) { 139 | seq_map[name + "." + fi.first] = fi.second.first; 140 | } 141 | } 142 | } 143 | } 144 | 145 | return seq_map; 146 | } 147 | 148 | static unordered_map> get_filled_dimensions(AlignmentConstPtr alignment, unordered_map& seq_d, 149 | const unordered_set& target_set, bool progress) { 150 | 151 | unordered_map> dim_map; 152 | 153 | // copy root exactly as is 154 | vector& root_dims = dim_map[alignment->getRootName()]; 155 | const Genome* root_genome = alignment->openGenome(alignment->getRootName()); 156 | for (SequenceIteratorPtr seqIt = root_genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { 157 | const Sequence *sequence = seqIt->getSequence(); 158 | root_dims.push_back(Sequence::Info(sequence->getName(), sequence->getSequenceLength(), sequence->getNumTopSegments(), sequence->getNumBottomSegments())); 159 | } 160 | 161 | vector names = alignment->getChildNames(alignment->getRootName()); 162 | 163 | for (const string& name : names) { 164 | const Genome* genome = alignment->openGenome(name); 165 | vector& dimensions = dim_map[name]; 166 | if (progress) { 167 | cerr << "[halUnclip]: Scanning dimensions of genome " << genome->getName() << endl; 
168 | } 169 | 170 | // map base name to sequence fragments 171 | unordered_map> frag_map; 172 | 173 | // pass 1, map all hal sequences back to their base name and check that they correspond to a fasta sequence 174 | for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { 175 | const Sequence *sequence = seqIt->getSequence(); 176 | string sequence_name = sequence->getName(); 177 | string parsed_name = target_set.count(genome->getName()) ? parse_subpath_name(sequence_name) : sequence_name; 178 | string full_name = genome->getName() + "." + parsed_name; 179 | size_t fa_len = sequence->getSequenceLength(); 180 | if (!seq_d.count(full_name)) { 181 | if (parsed_name != sequence_name) { 182 | cerr << "[halUnclip]: Unable to find sequence (from HAL) " << full_name << " in dimension map from input fasta" << endl; 183 | exit(1); 184 | } 185 | seq_d[full_name] = fa_len; 186 | } else { 187 | fa_len = seq_d.at(full_name); 188 | } 189 | if (parsed_name == sequence_name && sequence->getSequenceLength() != fa_len) { 190 | cerr << "[halUnclip]: Sequence " << full_name << " has len=" << fa_len << " in fasta but len=" << sequence->getSequenceLength() << " in hal" << endl; 191 | exit(1); 192 | } 193 | if (parsed_name != sequence_name && sequence->getSequenceLength() > fa_len) { 194 | cerr << "[halUnclip]: Sequence " << sequence->getFullName() << " has len=" << fa_len << " in fasta but len=" << sequence->getSequenceLength() << " in hal" << endl; 195 | exit(1); 196 | } 197 | 198 | frag_map[parsed_name].push_back(sequence); 199 | } 200 | 201 | // pass 2: compute the dimensions for each base sequence 202 | for (auto& nf : frag_map) { 203 | const string& base_name = nf.first; 204 | string full_name = genome->getName() + "." 
// Copy all segments from the clipped input alignment into the pre-dimensioned
// output alignment, inserting unaligned "gap" top segments wherever the input
// fragments do not cover the full original sequence.  Root bottom segments are
// copied last, with child indexes remapped through old_to_new_tsai_vec.
static void copy_and_fill(AlignmentConstPtr in_alignment, AlignmentPtr out_alignment, const unordered_map<string, size_t>& seq_dims,
                          const unordered_set<string>& target_set, bool progress) {

    const Genome* in_root_genome = in_alignment->openGenome(in_alignment->getRootName());
    Genome* out_root_genome = out_alignment->openGenome(in_alignment->getRootName());

    vector<string> names = in_alignment->getChildNames(in_alignment->getRootName());
    // with a lot of children, the bottom segments are unwieldy. they play havoc with default settings (chunk=1000 is too small)
    // and are terribly slow even with tuning (except inmemory). so we load up everything we need in this structure in memory
    // so that the bottom segments can be set in a single pass
    // old_to_new_tsai_vec[child][old top segment array index] -> new array index
    vector<vector<hal_index_t>> old_to_new_tsai_vec(names.size());

    for (const string& name : names) {
        if (progress) {
            cerr << "[halUnclip]: Copying segments of " << name << flush;
        }

        const Genome* in_genome = in_alignment->openGenome(name);
        Genome* out_genome = out_alignment->openGenome(name);
        hal_index_t out_child_no = out_root_genome->getChildIndex(out_genome);
        hal_index_t in_child_no = in_root_genome->getChildIndex(in_genome);
        assert(in_child_no == out_child_no);

        // map base name to sequence fragments
        // todo: same thing done in above function -- generalize?
        unordered_map<string, vector<const Sequence*>> frag_map;

        // pass 1, map all hal sequences back to their base name and check that they correspond to a fasta sequence
        if (progress) {
            cerr << " [pass 1]" << flush;
        }
        for (SequenceIteratorPtr seqIt = in_genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) {
            const Sequence* sequence = seqIt->getSequence();
            string sequence_name = sequence->getName();
            string parsed_name = target_set.count(name) ? parse_subpath_name(sequence_name) : sequence_name;
            frag_map[parsed_name].push_back(sequence);
        }

        // pass 2, copy each sequence fragment by fragment
        if (progress) {
            cerr << " [pass 2]" << flush;
        }
        vector<hal_index_t>& old_to_new_tsai = old_to_new_tsai_vec[out_child_no];
        old_to_new_tsai.resize(in_genome->getNumTopSegments(), NULL_INDEX);

        for (auto& nf : frag_map) {
            const string& base_name = nf.first;
            vector<const Sequence*>& frags = nf.second;

            // sort the fragments by start position
            map<int64_t, const Sequence*> start_to_frag;
            for (const Sequence* frag : frags) {
                int64_t start = -1;
                string parsed_name = target_set.count(name) ? parse_subpath_name(frag->getName(), &start) : frag->getName();
                if (start == -1) {
                    // no _sub_ suffix: the single fragment covers the whole sequence
                    start = 0;
                    assert(frags.size() == 1);
                }
                start_to_frag[start] = frag;
            }

            // the one output sequence that corresponds to the list of fragments in the input
            Sequence* out_sequence = out_genome->getSequence(base_name);
            assert(out_sequence != nullptr);
            TopSegmentIteratorPtr out_top = out_sequence->getTopSegmentIterator();
            TopSegment* ts;

            int64_t cur_pos = 0; //position in out_sequence
            int64_t out_start = out_sequence->getStartPosition(); //offset needed when setting coordinates in out_top

            // visit the ordered input sequence fragments that correspond to out_sequence
            for (auto i = start_to_frag.begin(); i != start_to_frag.end(); ++i) {
                const Sequence* in_sequence_frag = i->second;
                int64_t frag_start = i->first;
                if (frag_start > cur_pos) {
                    // need to add a gap *before* this fragment: one unaligned top segment
                    ts = out_top->tseg();
                    ts->setCoordinates(cur_pos + out_start, frag_start - cur_pos);
                    ts->setParentIndex(NULL_INDEX);
                    ts->setNextParalogyIndex(NULL_INDEX);
                    ts->setBottomParseIndex(NULL_INDEX);
#ifdef debug
                    cerr << "cur_pos=" << cur_pos << flush;
#endif
                    cur_pos += ts->getLength();
#ifdef debug
                    cerr << " after adding start gap cur_pos=" << cur_pos << " (frag name=" << in_sequence_frag->getName() << " fragstart=" << frag_start << ")" << endl;
#endif
                    out_top->toRight();
                }
#ifdef debug
                cerr << "frag " << in_sequence_frag->getFullName() << " has " << in_sequence_frag->getNumTopSegments() << " topsegs which will map to range "
                     << out_sequence->getTopSegmentIterator()->tseg()->getArrayIndex() << " - "
                     << (out_sequence->getTopSegmentIterator()->tseg()->getArrayIndex() + in_sequence_frag->getNumTopSegments()) << endl;
#endif
                // copy the fragment. note that the ancestor coordinates haven't changed
                // any, so those coordinates can go directly
                TopSegmentIteratorPtr frag_top = in_sequence_frag->getTopSegmentIterator();
                size_t frag_top_count = in_sequence_frag->getNumTopSegments();
                for (size_t frag_top_i = 0; frag_top_i < frag_top_count; ++frag_top_i) {
                    ts = out_top->tseg();
                    ts->setCoordinates(out_start + cur_pos, frag_top->tseg()->getLength());
                    ts->setParentIndex(frag_top->tseg()->getParentIndex());
                    ts->setParentReversed(frag_top->tseg()->getParentReversed());

                    // set the bad value from input alignment, to be updated later when we have map (pass 3)
                    ts->setNextParalogyIndex(frag_top->tseg()->getNextParalogyIndex());
                    ts->setBottomParseIndex(NULL_INDEX);
#ifdef debug
                    cerr << "cur_pos=" << cur_pos << flush;
#endif
                    cur_pos += ts->getLength();
#ifdef debug
                    cerr << " after adding frag_ts " << frag_top_i << " cur_pos=" << cur_pos << endl;
#endif
                    // remember where this input top segment landed in the output array
                    old_to_new_tsai[frag_top->tseg()->getArrayIndex()] = ts->getArrayIndex();
                    frag_top->toRight();
                    out_top->toRight();
                }
            }
            if (cur_pos < (int64_t)out_sequence->getSequenceLength()) {
                // need to add a gap *after* the last fragment
                ts = out_top->tseg();
                ts->setCoordinates(out_start + cur_pos, (int64_t)out_sequence->getSequenceLength() - cur_pos);
                ts->setParentIndex(NULL_INDEX);
                ts->setNextParalogyIndex(NULL_INDEX);
                ts->setBottomParseIndex(NULL_INDEX);
#ifdef debug
                cerr << "cur_pos=" << cur_pos << flush;
#endif
                cur_pos += ts->getLength();
#ifdef debug
                cerr << " after adding end gap cur_pos=" << cur_pos << endl;
#endif
                out_top->toRight();
            }
            // sanity check: fragments + gaps must exactly tile the output sequence
            if (cur_pos != (int64_t)out_sequence->getSequenceLength()) {
                cerr << "[halUnclip]: sanity check fail for sequence " << name << "." << base_name << ". The offset after conversion is "
                     << cur_pos << " which is different than the sequence length of " << out_sequence->getSequenceLength() << endl
                     << "[halUnclip]: the fragments are\n";
                for (size_t i = 0; i < frags.size(); ++i) {
                    const Sequence* in_sequence_frag = frags[i];
                    cerr << " " << in_sequence_frag->getName() << " len=" << in_sequence_frag->getSequenceLength() << endl;
                }
            }
            assert(cur_pos == (int64_t)out_sequence->getSequenceLength());
            assert(out_top->getArrayIndex() == out_sequence->getTopSegmentIterator()->getArrayIndex() + (int64_t)out_sequence->getNumTopSegments());
        }

        //pass 3: set the paralogy indexes (rewrites the stale input-array
        //indexes stored in pass 2 using the old->new map)
        if (progress) {
            cerr << " [pass 3]" << endl;
        }
        TopSegment* ts;
        for (TopSegmentIteratorPtr out_topit = out_genome->getTopSegmentIterator(); !out_topit->atEnd(); out_topit->toRight()) {
            ts = out_topit->tseg();
            if (ts->hasNextParalogy()) {
                ts->setNextParalogyIndex(old_to_new_tsai[ts->getNextParalogyIndex()]);
            }
        }

        in_alignment->closeGenome(in_genome);
        out_alignment->closeGenome(out_genome);
    }

    // copy the root
    if (progress) {
        cerr << "[halUnclip]: Copying root segments" << endl;
    }
    BottomSegmentIteratorPtr in_botit = in_root_genome->getBottomSegmentIterator();
    BottomSegmentIteratorPtr out_botit = out_root_genome->getBottomSegmentIterator();
    assert(in_root_genome->getNumBottomSegments() == out_root_genome->getNumBottomSegments());
    assert(in_root_genome->getNumChildren() == out_root_genome->getNumChildren());
    size_t num_bottom = in_root_genome->getNumBottomSegments();
    size_t num_children = in_root_genome->getNumChildren();
    for (size_t i = 0; i < num_bottom; ++i) {
        BottomSegment* in_bs = in_botit->bseg();
        BottomSegment* out_bs = out_botit->bseg();
        out_bs->setCoordinates(in_bs->getStartPosition(), in_bs->getLength());
        for (size_t j = 0; j < num_children; ++j) {
            // everything's the same except the child index, which gets mapped via old_to_new_tsai_vec
            hal_index_t in_ci = in_bs->getChildIndex(j);
            hal_index_t out_ci = in_ci != NULL_INDEX ? old_to_new_tsai_vec[j][in_ci] : in_ci;
            out_bs->setChildIndex(j, out_ci);
            out_bs->setChildReversed(j, in_bs->getChildReversed(j));
        }
        out_bs->setTopParseIndex(NULL_INDEX);
        in_botit->toRight();
        out_botit->toRight();
    }
}
" << flush; 463 | } 464 | unordered_map> fa_info = read_fasta(fa_path); 465 | if (progress) { 466 | cerr << "and setting dna strings in output genome" << endl; 467 | } 468 | Genome* genome = out_alignment->openGenome(name); 469 | assert(genome != nullptr); 470 | for (auto& fi : fa_info) { 471 | Sequence* sequence = genome->getSequence(fi.first); 472 | if (sequence != nullptr) { 473 | assert(sequence->getSequenceLength() == fi.second.first); 474 | sequence->setString(fi.second.second); 475 | } 476 | } 477 | } 478 | } 479 | } 480 | 481 | // if there's no _sub sequences found, a genome is allowed to not be in the sequence map 482 | // this is generally the case for the root, but could be the minigraph contigs 483 | vector names = in_alignment->getChildNames(in_alignment->getRootName()); 484 | names.push_back(in_alignment->getRootName()); 485 | for (const string& name : names) { 486 | if (!done_set.count(name)) { 487 | if (progress) { 488 | cerr << "[halUnclip]: Directly copying dna strings for " << name << endl; 489 | }; 490 | const Genome* in_genome = in_alignment->openGenome(name); 491 | Genome* out_genome = out_alignment->openGenome(name); 492 | for (SequenceIteratorPtr seqIt = in_genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { 493 | const Sequence* in_sequence = seqIt->getSequence(); 494 | Sequence* out_sequence = out_genome->getSequence(in_sequence->getName()); 495 | in_sequence->getString(buffer); 496 | out_sequence->setString(buffer); 497 | } 498 | if (name != in_alignment->getRootName()) { 499 | in_alignment->closeGenome(in_genome); 500 | out_alignment->closeGenome(out_genome); 501 | } 502 | } 503 | } 504 | } 505 | 506 | 507 | // root->leaf alignments are consistent 508 | static void validate_alignments(AlignmentConstPtr in_alignment, AlignmentPtr out_alignment) { 509 | 510 | validateAlignment(out_alignment.get()); 511 | 512 | const Genome* in_root_genome = in_alignment->openGenome(in_alignment->getRootName()); 513 | Genome* 
out_root_genome = out_alignment->openGenome(in_alignment->getRootName()); 514 | assert(in_root_genome->getNumBottomSegments() == out_root_genome->getNumBottomSegments()); 515 | assert(in_root_genome->getNumChildren() == out_root_genome->getNumChildren()); 516 | // we go by genome (instead of segment) to hopefully be cache-friendlier 517 | for (size_t j = 0; j < in_root_genome->getNumChildren(); ++j) { 518 | const Genome* in_genome = in_root_genome->getChild(j); 519 | Genome* out_genome = out_root_genome->getChild(j); 520 | BottomSegmentIteratorPtr in_botit = in_root_genome->getBottomSegmentIterator(); 521 | BottomSegmentIteratorPtr out_botit = out_root_genome->getBottomSegmentIterator(); 522 | TopSegmentIteratorPtr in_topit = in_genome->getTopSegmentIterator(); 523 | TopSegmentIteratorPtr out_topit = out_genome->getTopSegmentIterator(); 524 | for (size_t i = 0; i < in_genome->getNumBottomSegments(); ++i) { 525 | in_topit->toChild(in_botit, j); 526 | out_topit->toChild(out_botit, j); 527 | 528 | string s1, s2; 529 | if (j == 0) { 530 | in_botit->getString(s1); 531 | out_botit->getString(s2); 532 | assert(s1 == s2); 533 | } 534 | in_topit->getString(s1); 535 | out_topit->getString(s2); 536 | assert(s1 == s2); 537 | 538 | string in_seq_name = in_topit->tseg()->getSequence()->getName(); 539 | string out_seq_name = out_topit->tseg()->getSequence()->getName(); 540 | int64_t start; 541 | string in_base_name = parse_subpath_name(in_seq_name, &start); 542 | assert(in_base_name == out_seq_name); 543 | assert(in_topit->getReversed() == out_topit->getReversed()); 544 | if (!in_topit->getReversed()) { 545 | // punt on reverse check for now 546 | assert(in_topit->getStartPosition() + start == out_topit->getStartPosition()); 547 | } 548 | 549 | in_botit->toRight(); 550 | out_botit->toRight(); 551 | } 552 | in_alignment->closeGenome(in_genome); 553 | out_alignment->closeGenome(out_genome); 554 | } 555 | } 556 | 557 | int main(int argc, char** argv) { 558 | CLParser 
// Driver: parse options, open input/output alignments, compute unclipped
// dimensions, build the output skeleton (root + leaves), then copy segments,
// restore dna from the fastas and optionally validate the result.
int main(int argc, char** argv) {
    CLParser optionsParser(WRITE_ACCESS);
    initParser(&optionsParser);
    string in_hal_path;
    string out_hal_path;
    string seqfile_path;
    string target_genomes;
    bool progress;
    bool validate;
    try {
        optionsParser.parseOptions(argc, argv);
        in_hal_path = optionsParser.getArgument("inFile");
        seqfile_path = optionsParser.getArgument("seqFile");
        out_hal_path = optionsParser.getArgument("outFile");
        target_genomes = optionsParser.getOption("targetGenomes");
        progress = optionsParser.getFlag("progress");
        validate = optionsParser.getFlag("validate");
    }
    catch(exception& e) {
        cerr << e.what() << endl;
        optionsParser.printUsage(cerr);
        exit(1);
    }

    // load the input genome
    if (progress) {
        cerr << "[halUnclip]: Opening input alignment" << endl;
    }
    AlignmentConstPtr in_alignment(openHalAlignment(in_hal_path, &optionsParser, READ_ACCESS));

    // and the output genome
    if (progress) {
        cerr << "[halUnclip]: Creating output alignment object" << endl;
    }
    AlignmentPtr out_alignment(openHalAlignment(out_hal_path, &optionsParser, READ_ACCESS | WRITE_ACCESS | CREATE_ACCESS));

    // check the targets, defaulting to all leaves
    // (the option default is the literal two-character string "" -- see initParser)
    vector<string> target_names;
    if (target_genomes != "\"\"") {
        target_names = chopString(target_genomes, ",");
        for (const string& name : target_names) {
            const Genome* genome = in_alignment->openGenome(name);
            if (genome == nullptr) {
                cerr << "[halUnclip]: Target genome " << name << " not present in input HAL" << endl;
                exit(1);
            }
            in_alignment->closeGenome(genome);
        }
    } else {
        target_names = in_alignment->getChildNames(in_alignment->getRootName());
    }
    unordered_set<string> target_set(target_names.begin(), target_names.end());

    // and load the fasta sequence sizes from the seqfile
    if (progress) {
        cerr << "[halUnclip]: Reading fasta dimensions from seqfile" << endl;
    }
    unordered_map<string, size_t> seq_dims = get_dimensions_from_seqfile(seqfile_path, target_set);

    if (progress) {
        cerr << "[halUnclip]: Computing new hal dimensions" << endl;
    }
    unordered_map<string, vector<Sequence::Info>> dimensions = get_filled_dimensions(in_alignment, seq_dims, target_set, progress);

    // set up the size of each genome, starting with the root
    string root_name = in_alignment->getRootName();
    Genome* root_genome = out_alignment->addRootGenome(root_name);
    // important to visit these in order, so child indexes are preserved
    vector<string> leaf_names = in_alignment->getChildNames(root_name);
    for (const string& leaf_name : leaf_names) {
        vector<Sequence::Info>& leaf_dims = dimensions.at(leaf_name);
        Genome* leaf_genome = out_alignment->addLeafGenome(leaf_name, root_name, 1);
        leaf_genome->setDimensions(leaf_dims);
        if (progress) {
            cerr << "[halUnclip]: Adding leaf genome " << leaf_name << " with length " << leaf_genome->getSequenceLength() << " and " << leaf_genome->getNumTopSegments() << " top segments" << endl;
        }
    }

    // important to set root dimensions after adding leaves so bottom segments have right number of slots
    root_genome->setDimensions(dimensions.at(root_name));
    if (progress) {
        cerr << "[halUnclip]: Adding root genome " << root_name << " with length " << root_genome->getSequenceLength() << " and " << root_genome->getNumBottomSegments() << " bottom segments" << endl;
    }

    // copy over the filled graph
    if (progress) {
        cerr << "[halUnclip]: Copying and filling the graph" << endl;
    }
    copy_and_fill(in_alignment, out_alignment, seq_dims, target_set, progress);

    // add back the fasta sequences
    if (progress) {
        cerr << "[halUnclip]: Adding fasta sequences" << endl;
    }
    add_fasta_sequences(in_alignment, out_alignment, seqfile_path, target_set, progress);

    if (validate) {
        if (progress) {
            cerr << "[halUnclip]: Validating alignment" << endl;
        }
        validate_alignments(in_alignment, out_alignment);
    }

    if (progress) {
        cerr << "[halUnclip]: Writing output alignment" << endl;
    }

    return 0;
}
#pragma once

// Minimal PAF (Pairwise mApping Format) record parsing / serialization.

#include <string>
#include <vector>
#include <cstdint>
#include <cassert>
#include <ostream>

using namespace std;

// One line of a PAF file: the 12 mandatory columns plus the optional
// "cg:Z:" CIGAR tag (empty string when the tag is absent).
struct PafLine {
    string query_name;     // col 1
    int64_t query_len;     // col 2
    int64_t query_start;   // col 3 (0-based, inclusive)
    int64_t query_end;     // col 4 (0-based, exclusive)
    char strand;           // col 5: '+' or '-'
    string target_name;    // col 6
    int64_t target_len;    // col 7
    int64_t target_start;  // col 8
    int64_t target_end;    // col 9
    int64_t num_matching;  // col 10
    int64_t num_bases;     // col 11
    int64_t mapq;          // col 12
    string cigar;          // from the optional cg:Z: tag, if present
};

// Split s on any character in delims, appending the non-empty fields to
// elems (runs of delimiters produce no empty tokens).  Returns elems.
inline vector<string> split_delims(const string& s, const string& delims, vector<string>& elems) {
    size_t start = string::npos;
    for (size_t i = 0; i < s.size(); ++i) {
        if (delims.find(s[i]) != string::npos) {
            if (start != string::npos && i > start) {
                elems.push_back(s.substr(start, i - start));
            }
            start = string::npos;
        } else if (start == string::npos) {
            start = i;
        }
    }
    if (start != string::npos && start < s.size()) {
        elems.push_back(s.substr(start, s.size() - start));
    }
    return elems;
}

// Parse one tab-separated PAF line into a PafLine.  The 12 mandatory
// columns are required; any "cg:Z:" tag among the optional columns is
// captured into paf.cigar.  Asserts (debug builds) on malformed input.
inline PafLine parse_paf_line(const string& paf_line) {
    vector<string> toks;
    split_delims(paf_line, "\t\n", toks);
    // A valid PAF line has exactly 12 mandatory columns; optional
    // SAM-style tags may follow.  (Was "> 12", which wrongly rejected
    // tag-less lines.)
    assert(toks.size() >= 12);

    PafLine paf;
    paf.query_name = toks[0];
    paf.query_len = stol(toks[1]);
    paf.query_start = stol(toks[2]);
    paf.query_end = stol(toks[3]);
    assert(toks[4] == "+" || toks[4] == "-");
    paf.strand = toks[4][0];
    paf.target_name = toks[5];
    paf.target_len = stol(toks[6]);
    paf.target_start = stol(toks[7]);
    paf.target_end = stol(toks[8]);
    paf.num_matching = stol(toks[9]);
    paf.num_bases = stol(toks[10]);
    paf.mapq = stol(toks[11]);

    // Scan optional tags for the CIGAR.  "cg:Z:" is 5 characters; the
    // previous compare(0, 3, "cg:Z:") could never return 0 (a 3-char
    // prefix is always shorter than the 5-char argument), so the cigar
    // was silently never parsed.
    for (size_t i = 12; i < toks.size(); ++i) {
        if (toks[i].compare(0, 5, "cg:Z:") == 0) {
            paf.cigar = toks[i].substr(5);
            break;
        }
    }

    return paf;
}

// Write the 12 mandatory PAF columns (no trailing tags, no newline).
inline ostream& operator<<(ostream& os, const PafLine& paf) {
    os << paf.query_name << "\t" << paf.query_len << "\t" << paf.query_start << "\t" << paf.query_end << "\t"
       << string(1, paf.strand) << "\t"
       << paf.target_name << "\t" << paf.target_len << "\t" << paf.target_start << "\t" << paf.target_end << "\t"
       << paf.num_matching << "\t" << paf.num_bases << "\t" << paf.mapq;
    return os;
}
#pragma once

// Helpers for hal2vg's "_sub_<start>_<end>" subpath naming convention.

#include <string>
#include <cstdint>
#include <cassert>

// Strip every trailing "_sub_<start>_<end>" suffix from path_name and
// return the underlying base name.  When out_start/out_end are given,
// they receive the resolved interval on the base path (-1 if the name
// carried no subpath suffix).  Nested suffixes are folded together: the
// final start is the sum of all nested starts, and the final end is that
// offset plus the length of the innermost interval.
inline std::string parse_subpath_name(const std::string& path_name, int64_t* out_start = nullptr, int64_t* out_end = nullptr) {

    std::string trimmed = path_name;
    if (out_start != nullptr) {
        *out_start = -1;
    }
    if (out_end != nullptr) {
        *out_end = -1;
    }

    size_t top_length = 0;  // interval length of the innermost (first-seen) suffix
    size_t offset_sum = 0;  // running sum of nested start offsets
    while (true) {
        const size_t tag_pos = trimmed.rfind("_sub_");
        if (tag_pos == std::string::npos) {
            break;  // no (more) subpath suffixes
        }
        const size_t sep_pos = trimmed.rfind("_");
        if (sep_pos != std::string::npos && sep_pos > tag_pos + 1) {
            int64_t lo;
            int64_t hi;
            try {
                lo = stol(trimmed.substr(tag_pos + 5, sep_pos - tag_pos - 5));
                hi = stol(trimmed.substr(sep_pos + 1));
            } catch (...) {
                // Suffix wasn't numeric after all: treat what's left as the name.
                return trimmed;
            }
            offset_sum += lo;  // final offset is the sum of all nested offsets
            if (top_length == 0) {
                top_length = hi - lo;
                assert(top_length > 0);
            } else {
                // For nested subpaths the end is always derived from the
                // accumulated start plus the innermost interval's length.
                hi = offset_sum + top_length;
            }
            if (out_start != nullptr) {
                *out_start = offset_sum;
            }
            if (out_end != nullptr) {
                *out_end = hi;
            }
            trimmed = trimmed.substr(0, tag_pos);
        }
    }
    return trimmed;
}

// Rewrite "name_sub_<start>_<end>" (possibly nested) in place into the
// bracketed form "name[<start>-<end>]".  Names without a subpath suffix
// are left untouched.
inline void resolve_subpath_naming(std::string& path_name) {
    int64_t range_start;
    int64_t range_end;
    const std::string stripped = parse_subpath_name(path_name, &range_start, &range_end);
    if (range_start != -1) {
        assert(stripped != path_name);
        path_name = stripped + "[" + std::to_string(range_start) + "-" + std::to_string(range_end) + "]";
    }
}
13 | -------------------------------------------------------------------------------- /tests/bash-tap/README.mkdn: -------------------------------------------------------------------------------- 1 | Bash-TAP 2 | ======== 3 | 4 | Bash-TAP allows you to perform TAP-compliant tests within bash 5 | using a similar test syntax to Perl's Test::More and Test::Builder, 6 | suitable to run with `prove` or any other TAP-consuming test harness. 7 | 8 | For more information about TAP (the Test Anything Protocol) visit: 9 | http://testanything.org/ 10 | 11 | Installation and Usage 12 | ---------------------- 13 | 14 | 1. Install the bash-tap files somewhere convenient for you. 15 | The default location of `../../bash-tap` relative to your 16 | test files is the easiest zero-conf way, but you can set 17 | the `$BASH_TAP_ROOT` environment variable if you want to 18 | install elsewhere. 19 | 2. If you're writing tests then copy `bash-tap-bootstrap` 20 | into your tests dir and source it inside your tests with: 21 | 22 | ```bash 23 | . $(dirname $0)/bash-tap-bootstrap 24 | ``` 25 | 26 | 3. Run your tests with `prove my_test_dir` or your favourite 27 | TAP-consuming test harness, or run them manually as a 28 | script if you just want to see the raw TAP output. 29 | 30 | Example test file 31 | ----------------- 32 | 33 | Here's example test file `01_read_rows_from_key_value_lines.t` from 34 | https://github.com/illusori/bash-snippets 35 | 36 | ```bash 37 | #!/bin/bash 38 | 39 | . $(dirname $0)/bash-tap-bootstrap 40 | . 
$(dirname $0)/../read_rows_from_key_value_lines 41 | 42 | columns_per_row=6 43 | max_rows_per_rowset=3 44 | total_rowsets=2 45 | 46 | plan tests $(((columns_per_row * max_rows_per_rowset * total_rowsets) + total_rowsets)) 47 | 48 | # Test data, resultset 1 49 | results1="artist Assemblage 23 50 | track Naked (God Module RMX) 51 | album Addendum 52 | year 2001 53 | rating 80 54 | tracktime 5:22 55 | artist Ayria 56 | track Sapphire 57 | album Debris 58 | year 59 | rating 100 60 | tracktime 6:14 61 | artist Apoptygma Berzerk 62 | track Kathy's Song 63 | album Welcome To Earth \"Extra bit for testing\" 64 | year 65 | rating 100 66 | tracktime 6:35" 67 | 68 | # Test data, resultset 2 69 | results2="artist Colony 5 70 | track The Bottle 71 | album Lifeline 72 | year 73 | rating 80 74 | tracktime 4:34" 75 | 76 | output=$(_read_rows_from_key_value_lines "track" "$results1" 2>&1) 77 | is "$output" "" "Read of rowset 1 should produce no output" 78 | # Since $() runs in a subshell, we need to run it "for real" now 79 | _read_rows_from_key_value_lines "track" "$results1" &>/dev/null 80 | 81 | # Track 1 82 | is "${track_artist[0]}" "Assemblage 23" "rowset 1 track 1 artist" 83 | is "${track_track[0]}" "Naked (God Module RMX)" "rowset 1 track 1 track" 84 | is "${track_album[0]}" "Addendum" "rowset 1 track 1 album" 85 | is "${track_year[0]}" "2001" "rowset 1 track 1 year" 86 | is "${track_rating[0]}" "80" "rowset 1 track 1 rating" 87 | is "${track_tracktime[0]}" "5:22" "rowset 1 track 1 tracktime" 88 | 89 | # Track 2 90 | is "${track_artist[1]}" "Ayria" "rowset 1 track 2 artist" 91 | is "${track_track[1]}" "Sapphire" "rowset 1 track 2 track" 92 | is "${track_album[1]}" "Debris" "rowset 1 track 2 album" 93 | is "${track_year[1]}" "" "rowset 1 track 2 year" 94 | is "${track_rating[1]}" "100" "rowset 1 track 2 rating" 95 | is "${track_tracktime[1]}" "6:14" "rowset 1 track 2 tracktime" 96 | 97 | # Track 3 98 | is "${track_artist[2]}" "Apoptygma Berzerk" "rowset 1 track 3 artist" 99 | 
is "${track_track[2]}" "Kathy's Song" "rowset 1 track 3 track" 100 | is "${track_album[2]}" "Welcome To Earth \"Extra bit for testing\"" "rowset 1 track 3 album" 101 | is "${track_year[2]}" "" "rowset 1 track 3 year" 102 | is "${track_rating[2]}" "100" "rowset 1 track 3 rating" 103 | is "${track_tracktime[2]}" "6:35" "rowset 1 track 3 tracktime" 104 | 105 | output=$(_read_rows_from_key_value_lines "track" "$results2" 2>&1) 106 | is "$output" "" "Read of rowset 2 should produce no output" 107 | # Since $() runs in a subshell, we need to run it "for real now 108 | _read_rows_from_key_value_lines "track" "$results2" &>/dev/null 109 | 110 | # Track 1 111 | is "${track_artist[0]}" "Colony 5" "rowset 2 track 1 artist" 112 | is "${track_track[0]}" "The Bottle" "rowset 2 track 1 track" 113 | is "${track_album[0]}" "Lifeline" "rowset 2 track 1 album" 114 | is "${track_year[0]}" "" "rowset 2 track 1 year" 115 | is "${track_rating[0]}" "80" "rowset 2 track 1 rating" 116 | is "${track_tracktime[0]}" "4:34" "rowset 2 track 1 tracktime" 117 | 118 | # Track 2 119 | is "${track_artist[1]}" "" "rowset 2 track 2 artist" 120 | is "${track_track[1]}" "" "rowset 2 track 2 track" 121 | is "${track_album[1]}" "" "rowset 2 track 2 album" 122 | is "${track_year[1]}" "" "rowset 2 track 2 year" 123 | is "${track_rating[1]}" "" "rowset 2 track 2 rating" 124 | is "${track_tracktime[1]}" "" "rowset 2 track 2 tracktime" 125 | 126 | # Track 3 127 | is "${track_artist[2]}" "" "rowset 2 track 3 artist" 128 | is "${track_track[2]}" "" "rowset 2 track 3 track" 129 | is "${track_album[2]}" "" "rowset 2 track 3 album" 130 | is "${track_year[2]}" "" "rowset 2 track 3 year" 131 | is "${track_rating[2]}" "" "rowset 2 track 3 rating" 132 | is "${track_tracktime[2]}" "" "rowset 2 track 3 tracktime" 133 | ``` 134 | 135 | Running this gives output: 136 | 137 | ``` 138 | $ prove ~/projects/bash-snippets/t 139 | /Users/illusori/projects/bash-snippets/t/01_read_rows_from_key_value_lines.t .. 
ok 140 | All tests successful. 141 | Files=1, Tests=38, 0 wallclock secs ( 0.04 usr 0.00 sys + 0.04 cusr 0.02 csys = 0.10 CPU) 142 | Result: PASS 143 | ``` 144 | 145 | Or the verbose output: 146 | 147 | ``` 148 | $ prove -v ~/projects/bash-snippets/t 149 | /Users/illusori/projects/bash-snippets/t/01_read_rows_from_key_value_lines.t .. 150 | 1..38 151 | ok 1 - Read of rowset 1 should produce no output 152 | ok 2 - rowset 1 track 1 artist 153 | ok 3 - rowset 1 track 1 track 154 | ok 4 - rowset 1 track 1 album 155 | ok 5 - rowset 1 track 1 year 156 | ok 6 - rowset 1 track 1 rating 157 | ok 7 - rowset 1 track 1 tracktime 158 | ok 8 - rowset 1 track 2 artist 159 | ok 9 - rowset 1 track 2 track 160 | ok 10 - rowset 1 track 2 album 161 | ok 11 - rowset 1 track 2 year 162 | ok 12 - rowset 1 track 2 rating 163 | ok 13 - rowset 1 track 2 tracktime 164 | ok 14 - rowset 1 track 3 artist 165 | ok 15 - rowset 1 track 3 track 166 | ok 16 - rowset 1 track 3 album 167 | ok 17 - rowset 1 track 3 year 168 | ok 18 - rowset 1 track 3 rating 169 | ok 19 - rowset 1 track 3 tracktime 170 | ok 20 - Read of rowset 2 should produce no output 171 | ok 21 - rowset 2 track 1 artist 172 | ok 22 - rowset 2 track 1 track 173 | ok 23 - rowset 2 track 1 album 174 | ok 24 - rowset 2 track 1 year 175 | ok 25 - rowset 2 track 1 rating 176 | ok 26 - rowset 2 track 1 tracktime 177 | ok 27 - rowset 2 track 2 artist 178 | ok 28 - rowset 2 track 2 track 179 | ok 29 - rowset 2 track 2 album 180 | ok 30 - rowset 2 track 2 year 181 | ok 31 - rowset 2 track 2 rating 182 | ok 32 - rowset 2 track 2 tracktime 183 | ok 33 - rowset 2 track 3 artist 184 | ok 34 - rowset 2 track 3 track 185 | ok 35 - rowset 2 track 3 album 186 | ok 36 - rowset 2 track 3 year 187 | ok 37 - rowset 2 track 3 rating 188 | ok 38 - rowset 2 track 3 tracktime 189 | ok 190 | All tests successful. 
191 | Files=1, Tests=38, 0 wallclock secs ( 0.04 usr 0.01 sys + 0.04 cusr 0.02 csys = 0.11 CPU) 192 | Result: PASS 193 | ``` 194 | 195 | Mocking with bash-tap-mock 196 | -------------------------- 197 | 198 | Also included in `bash-tap` is a simple function mocking framework 199 | `bash-tap-mock`, it lets you mock commands and functions with 200 | `mock_command` and `restore_mocked_command`. 201 | 202 | If you particularly care to only mock functions rather than commands 203 | (a good safeguard against typos), use `mock_function` and 204 | `restore_mocked_function`, which have some extended error checking 205 | ensuring the function you're mocking exists in the first place. 206 | 207 | An example from https://github.com/illusori/bash-itunes is clearer: 208 | 209 | ```bash 210 | #!/bin/bash 211 | 212 | . $(dirname $0)/bash-tap-bootstrap 213 | . "$BASH_TAP_ROOT/bash-tap-mock" 214 | . $(dirname $0)/../itunes 215 | 216 | plan tests 4 217 | 218 | sent_command='' 219 | function mock_osascript() { 220 | sent_command="$*" 221 | restore_mocked_function "_osascript" 222 | } 223 | mock_function "_osascript" "mock_osascript" 224 | 225 | start_output_capture 226 | _dispatch "stop" 227 | finish_output_capture stdout stderr 228 | 229 | like "$sent_command" 'stop' "sent command should contain 'stop'" 230 | like "$sent_command" 'tell application "iTunes"' "sent command should contain 'tell application \"iTunes\"'" 231 | 232 | is "$stdout" "Stopping iTunes." "stdout should tell user what happened" 233 | is "$stderr" "" "stderr should be empty" 234 | ``` 235 | -------------------------------------------------------------------------------- /tests/bash-tap/bash-tap: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bash_tap_version='1.0.2' 4 | 5 | # Our state. 
6 | 7 | _bt_plan='' 8 | _bt_expected_tests=0 9 | _bt_plan_output=0 10 | _bt_current_test=0 11 | _bt_tap_output='' 12 | _bt_has_output_plan=0 13 | _bt_done_testing=0 14 | _bt_output_capture=0 15 | 16 | # Our test results so far 17 | unset _bt_test_ok 18 | unset _bt_test_actual_ok 19 | unset _bt_test_name 20 | unset _bt_test_type 21 | unset _bt_test_reason 22 | 23 | # Cleanup stuff. 24 | declare -a _bt_on_exit_cmds 25 | trap "_bt_on_exit" EXIT 26 | 27 | # Planning functions. 28 | 29 | function _bt_output_plan() { 30 | local num_tests="$1" 31 | local directive="$2" 32 | local reason="$3" 33 | 34 | if [ "$_bt_has_output_plan" = 1 ]; then 35 | _caller_error "The plan was already output" 36 | fi 37 | 38 | _bt_clear_out 39 | _bt_out "1..$num_tests" 40 | if [ -n "$directive" ]; then 41 | _bt_out " # $directive" 42 | fi 43 | if [ -n "$reason" ]; then 44 | _bt_out " $reason" 45 | fi 46 | _bt_print_out 47 | _bt_has_output_plan=1 48 | } 49 | 50 | function plan() { 51 | local plan="$1" 52 | 53 | case "$plan" in 54 | no_plan) no_plan ;; 55 | skip_all) skip_all "$2" ;; 56 | tests) expected_tests "$2" ;; 57 | *) _bt_die "Unknown or missing plan: '$plan'" ;; 58 | esac 59 | } 60 | 61 | function expected_tests() { 62 | local num="$1" 63 | 64 | if [ -z "$num" ]; then 65 | echo $_bt_expected_tests 66 | else 67 | if [ -n "$_bt_plan" ]; then 68 | _bt_caller_error "Plan is already defined" 69 | fi 70 | # TODO: validate 71 | _bt_plan="$num" 72 | _bt_expected_tests="$num" 73 | _bt_output_plan "$_bt_expected_tests" 74 | fi 75 | } 76 | 77 | function no_plan() { 78 | if [ -n "$_bt_plan" ]; then 79 | _bt_caller_error "Plan is already defined" 80 | fi 81 | _bt_plan="no plan" 82 | } 83 | 84 | function done_testing() { 85 | local num_tests="$1" 86 | 87 | if [ -z "$num_tests" ]; then 88 | num_tests="$_bt_current_test" 89 | fi 90 | 91 | if [ "$_bt_done_testing" = 1 ]; then 92 | _bt_caller_error "done_testing was already called" 93 | fi 94 | 95 | if [ "$_bt_expected_tests" != 0 -a "$num_tests" != 
"$_bt_expected_tests" ]; then 96 | ok 0 "planned to run $_bt_expected_tests but done_testing expects $num_tests" 97 | else 98 | _bt_expected_tests="$num_tests" 99 | fi 100 | 101 | if [ "$_bt_has_output_plan" = 0 ]; then 102 | _bt_plan="done testing" 103 | _bt_output_plan "$num_tests" 104 | fi 105 | } 106 | 107 | function has_plan() { 108 | test -n "$_bt_plan" 109 | } 110 | 111 | function skip_all() { 112 | local reason="${*:?}" 113 | 114 | _bt_output_plan 0 SKIP "$reason" 115 | } 116 | 117 | # Test functions. 118 | 119 | function ok() { 120 | local result="$1" 121 | local name="$2" 122 | 123 | _bt_current_test=$((_bt_current_test + 1)) 124 | 125 | # TODO: validate $name 126 | if [ -z "$name" ]; then 127 | name='unnamed test' 128 | fi 129 | name="${name//#/\\#}" 130 | 131 | _bt_clear_out 132 | if [ "$result" = 0 ]; then 133 | _bt_out "not ok" 134 | if [ -n "$TODO" ]; then 135 | _bt_test_ok[$_bt_current_test]=1 136 | else 137 | _bt_test_ok[$_bt_current_test]=0 138 | fi 139 | _bt_test_actual_ok[$_bt_current_test]=0 140 | else 141 | _bt_out "ok" 142 | _bt_test_ok[$_bt_current_test]=1 143 | _bt_test_actual_ok[$_bt_current_test]="$result" 144 | fi 145 | 146 | _bt_out " $_bt_current_test - $name" 147 | _bt_test_name[$_bt_current_test]="$name" 148 | 149 | if [ -n "$TODO" ]; then 150 | _bt_out " # TODO $TODO" 151 | _bt_test_reason[$_bt_current_test]="$TODO" 152 | _bt_test_type[$_bt_current_test]="todo" 153 | else 154 | _bt_test_reason[$_bt_current_test]='' 155 | _bt_test_type[$_bt_current_test]='' 156 | fi 157 | 158 | _bt_print_out 159 | } 160 | 161 | function _is_diag() { 162 | local result="$1" 163 | local expected="$2" 164 | 165 | diag " got: '$result'" 166 | diag " expected: '$expected'" 167 | } 168 | 169 | function is() { 170 | local result="$1" 171 | local expected="$2" 172 | local name="$3" 173 | 174 | if [ "$result" = "$expected" ]; then 175 | ok 1 "$name" 176 | else 177 | ok 0 "$name" 178 | _is_diag "$result" "$expected" 179 | fi 180 | } 181 | 182 | function 
_isnt_diag() { 183 | local result="$1" 184 | local expected="$2" 185 | 186 | diag " got: '$result'" 187 | diag " expected: anything else" 188 | } 189 | 190 | function isnt() { 191 | local result="$1" 192 | local expected="$2" 193 | local name="$3" 194 | 195 | if [ "$result" != "$expected" ]; then 196 | ok 1 "$name" 197 | else 198 | ok 0 "$name" 199 | _isnt_diag "$result" "$expected" 200 | fi 201 | } 202 | 203 | function like() { 204 | local result="$1" 205 | local pattern="$2" 206 | local name="$3" 207 | 208 | # NOTE: leave $pattern unquoted, see http://stackoverflow.com/a/218217/870000 209 | if [[ "$result" =~ $pattern ]]; then 210 | ok 1 "$name" 211 | else 212 | ok 0 "$name" 213 | diag " got: '$result'" 214 | diag " expected: match for '$pattern'" 215 | fi 216 | } 217 | 218 | function unlike() { 219 | local result="$1" 220 | local pattern="$2" 221 | local name="$3" 222 | 223 | # NOTE: leave $pattern unquoted, see http://stackoverflow.com/a/218217/870000 224 | if [[ ! "$result" =~ $pattern ]]; then 225 | ok 1 "$name" 226 | else 227 | ok 0 "$name" 228 | diag " got: '$result'" 229 | diag " expected: no match for '$pattern'" 230 | fi 231 | } 232 | 233 | function cmp_ok() { 234 | echo TODO 235 | } 236 | 237 | # Other helper functions 238 | 239 | function BAIL_OUT() { 240 | echo TODO 241 | } 242 | 243 | function skip() { 244 | echo TODO 245 | } 246 | 247 | function todo_skip() { 248 | echo TODO 249 | } 250 | 251 | function todo_start() { 252 | echo TODO 253 | } 254 | 255 | function todo_end() { 256 | echo TODO 257 | } 258 | 259 | # Output 260 | 261 | function diag() { 262 | local message="$1" 263 | 264 | if [ -n "$message" ]; then 265 | _bt_escaped_echo "# $message" 266 | fi 267 | } 268 | 269 | # Util functions for output capture within current shell 270 | 271 | function start_output_capture() { 272 | if [ $_bt_output_capture = 1 ]; then 273 | finish_output_capture 274 | _bt_caller_error "Can't start output capture while already active" 275 | fi 276 | local 
stdout_tmpfile="/tmp/bash-itunes-test-out.$$" 277 | local stderr_tmpfile="/tmp/bash-itunes-test-err.$$" 278 | _bt_add_on_exit_cmd "rm -f '$stdout_tmpfile' '$stderr_tmpfile'" 279 | _bt_output_capture=1 280 | exec 3>&1 >$stdout_tmpfile 4>&2 2>$stderr_tmpfile 281 | } 282 | 283 | function finish_output_capture() { 284 | local capture_stdout_varname="$1" 285 | local capture_stderr_varname="$2" 286 | if [ $_bt_output_capture != 1 ]; then 287 | _bt_caller_error "Can't finish output capture when it wasn't started" 288 | fi 289 | exec 1>&3 3>&- 2>&4 4>&- 290 | _bt_output_capture=0 291 | if [ -n "$capture_stdout_varname" ]; then 292 | local stdout_tmpfile="/tmp/bash-itunes-test-out.$$" 293 | eval "$capture_stdout_varname=\$(< $stdout_tmpfile)" 294 | fi 295 | if [ -n "$capture_stderr_varname" ]; then 296 | local stderr_tmpfile="/tmp/bash-itunes-test-err.$$" 297 | eval "$capture_stderr_varname=\$(< $stderr_tmpfile)" 298 | fi 299 | } 300 | 301 | # Internals 302 | 303 | function _bt_stdout() { 304 | echo "$@" 305 | } 306 | 307 | function _bt_stderr() { 308 | echo "$@" >&2 309 | } 310 | 311 | function _bt_die() { 312 | _bt_stderr "$@" 313 | exit 255 314 | } 315 | 316 | # Report an error from the POV of the first calling point outside this file 317 | function _bt_caller_error() { 318 | local message="$*" 319 | 320 | local thisfile="${BASH_SOURCE[0]}" 321 | local file="$thisfile" 322 | local frame_num=2 323 | until [ "$file" != "$thisfile" ]; do 324 | frame=$(caller "$frame_num") 325 | IFS=' ' read line func file <<<"$frame" 326 | done 327 | 328 | _bt_die "Error: $message, on line $line of $file" 329 | } 330 | 331 | # Echo the supplied message with lines after the 332 | # first escaped as TAP comments. 
333 | function _bt_escaped_echo() { 334 | local message="$*" 335 | 336 | local output='' 337 | while IFS= read -r line; do 338 | output="$output\n# $line" 339 | done <<<"$message" 340 | echo -e "${output:4}" 341 | } 342 | 343 | function _bt_clear_out() { 344 | _bt_tap_output="" 345 | } 346 | 347 | function _bt_out() { 348 | _bt_tap_output="$_bt_tap_output$*" 349 | } 350 | 351 | function _bt_print_out() { 352 | _bt_escaped_echo "$_bt_tap_output" 353 | } 354 | 355 | # Cleanup stuff 356 | function _bt_add_on_exit_cmd() { 357 | _bt_on_exit_cmds[${#_bt_on_exit_cmds[*]}]="$*" 358 | } 359 | 360 | function _bt_on_exit() { 361 | if [ $_bt_output_capture = 1 ]; then 362 | finish_output_capture 363 | fi 364 | for exit_cmd in "${_bt_on_exit_cmds[@]}"; do 365 | diag "cleanup: $exit_cmd" 366 | eval "$exit_cmd" 367 | done 368 | # TODO: check that we've output a plan/results 369 | } 370 | -------------------------------------------------------------------------------- /tests/bash-tap/bash-tap-bootstrap: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Bash TAP Bootstrap: 4 | # Copy this file into your project tests dir and source it 5 | # from each test file with: 6 | # . $(dirname $0)/bash-tap-bootstrap 7 | # It takes care of finding bash-tap or outputing a usage message. 8 | # 9 | 10 | bash_tap_bootstrap_version='1.0.2' 11 | 12 | if [ "${BASH_SOURCE[0]}" = "$0" ]; then 13 | # Being run directly, probably by test harness running entire dir. 14 | echo "1..0 # SKIP bash-tap-bootstrap isn't a test file" 15 | exit 0 16 | fi 17 | 18 | if [ -z "$BASH_TAP_ROOT" ]; then 19 | # TODO: search likely locations. 20 | BASH_TAP_ROOT="$(dirname ${BASH_SOURCE[0]})/../../bash-tap" 21 | fi 22 | 23 | if [ -f "$BASH_TAP_ROOT/bash-tap" ]; then 24 | . "$BASH_TAP_ROOT/bash-tap" 25 | else 26 | echo "Bail out! Unable to find bash-tap. 
Install from https://github.com/illusori/bash-tap or set \$BASH_TAP_ROOT if you have it installed somewhere unusual." 27 | exit 255 28 | fi 29 | -------------------------------------------------------------------------------- /tests/bash-tap/bash-tap-mock: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # While not directly TAP-specific, being able to mock stuff 4 | # in tests is pretty useful. 5 | # 6 | # If you're using bash-tap-bootstrap, then just source this 7 | # file in your tests from the bash-tap directory found by 8 | # the bootstrap by including this line after you've sourced 9 | # bash-tap-bootstrap: 10 | # 11 | # . "$BASH_TAP_ROOT/bash-tap-mock" 12 | # 13 | # If you're not using bash-tap-bootstrap then copy this file 14 | # to your test directory and source it with: 15 | # 16 | # . $(dirname $0)/bash-tap-mock 17 | # 18 | # It's important to note that if you're capturing the arguments 19 | # passed to your mock function in a variable, and want that 20 | # variable to be accessible to your tests, you must ensure that 21 | # the mocked function is executed in the current shell and not 22 | # a subshell. In particular, this means you cannot use $() or 23 | # `` to capture output of the function at the same time, as these 24 | # invoke a subshell - the mock will happen, but any variables you 25 | # set within your mock will only exist within the subshell. 26 | # If you wish to capture output at the same time, you need to 27 | # make use of the start_output_capture and finish_output_capture 28 | # helper functons in bash-tap, or manually use file-descriptor 29 | # redirects yourself to achieve the same effect. 30 | 31 | bash_tap_mock_version='1.0.2' 32 | 33 | if [ "${BASH_SOURCE[0]}" = "$0" ]; then 34 | # Being run directly, probably by test harness running entire dir. 
35 | echo "1..0 # SKIP bash-tap-mock isn't a test file" 36 | exit 0 37 | fi 38 | 39 | function mock_function() { 40 | local original_name="$1" 41 | local mock_name="$2" 42 | local save_original_as="_btm_mocked_${original_name}" 43 | 44 | if [ -z $(declare -F "$save_original_as") ]; then 45 | _btm_copy_function "$original_name" "$save_original_as" 46 | fi 47 | _btm_copy_function "$mock_name" "$original_name" 48 | } 49 | 50 | function restore_mocked_function() { 51 | local original_name="$1" 52 | local save_original_as="_btm_mocked_${original_name}" 53 | 54 | if [ ! -z $(declare -F "$save_original_as") ]; then 55 | _btm_copy_function "$save_original_as" "$original_name" 56 | unset -f "$save_original_as" 57 | else 58 | _btm_caller_error "Can't find saved original function '$original_name' to restore" 59 | fi 60 | } 61 | 62 | function mock_command() { 63 | local command_name="$1" 64 | local mock_name="$2" 65 | 66 | if [ ! -z $(declare -F "$command_name") ]; then 67 | # It's not actually a command, it's a function, mock that 68 | mock_function "$command_name" "$mock_name" 69 | else 70 | _btm_copy_function "$mock_name" "$command_name" 71 | fi 72 | } 73 | 74 | function restore_mocked_command() { 75 | local command_name="$1" 76 | 77 | local save_original_as="_btm_mocked_${command_name}" 78 | if [ ! -z $(declare -F "$save_original_as") ]; then 79 | # Was actually a function mock not a command mock. 
80 | restore_mocked_function "$command_name" 81 | else 82 | unset -f "$command_name" >/dev/null 83 | fi 84 | } 85 | 86 | # Copied from http://stackoverflow.com/a/1203628/870000 87 | function _btm_copy_function() { 88 | declare -F $1 >/dev/null || _btm_caller_error "Can't find function '$1' to copy" 89 | eval "$(echo "${2}()"; declare -f ${1} | tail -n +2)" 90 | } 91 | 92 | # Report an error from the POV of the first calling point outside this file 93 | function _btm_caller_error() { 94 | local message="$*" 95 | 96 | local thisfile="${BASH_SOURCE[0]}" 97 | local file="$thisfile" 98 | local frame_num=2 99 | until [ "$file" != "$thisfile" ]; do 100 | frame=$(caller "$frame_num") 101 | IFS=' ' read line func file <<<"$frame" 102 | done 103 | 104 | echo "Error: $message, on line $line of $file" >&2 105 | exit 255 106 | } 107 | -------------------------------------------------------------------------------- /tests/chop/tiny-flat.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S 1 CAAATAAGGCTTGGAAATTTTCTGGAGTTCTA 3 | S 2 TTATATTCCAACTCTCTG 4 | P x 1+,2+ * 5 | L 1 + 2 + * 6 | -------------------------------------------------------------------------------- /tests/chop/tiny-rev.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S 1 CAAATAAGGCTTGGAAATTTTCTGGAGTTCTA 3 | S 2 TTATATTCCAACTCTCTG 4 | P x 2-,1- * 5 | L 1 + 2 + * 6 | -------------------------------------------------------------------------------- /tests/small/small.maf: -------------------------------------------------------------------------------- 1 | ##maf version=1 2 | 3 | # SNP 4 | a score=0 mafExtractor_splicedBlock=true splice_id=1_0 5 | s human.1 0 3 + 10 GCA 6 | s chimp.2 0 3 + 8 GCA 7 | s cat.3 0 3 + 7 GTA 8 | 9 | # Indel and strand change 10 | a score=0 mafExtractor_splicedBlock=true splice_id=1_0 11 | s human.1 3 7 + 10 GCAGAAT 12 | s chimp.2 3 5 + 8 GCAG--T 13 | s cat.3 0 4 - 7 --A-AAT 14 | 15 
| 16 | 17 | -------------------------------------------------------------------------------- /tests/small/small2.maf: -------------------------------------------------------------------------------- 1 | ##maf version=1 2 | 3 | # SNP 4 | a score=0 mafExtractor_splicedBlock=true splice_id=1_0 5 | s human.1 0 3 + 10 GCA 6 | s chimp.3 0 3 + 8 GCA 7 | 8 | # Indel and strand change 9 | a score=0 mafExtractor_splicedBlock=true splice_id=1_0 10 | s human.1 3 7 + 10 GCAGAAT 11 | s chimp.3 3 5 + 8 GCAG--T 12 | s cow.3 0 4 - 7 --A-AAT 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /tests/small/truth.json: -------------------------------------------------------------------------------- 1 | { 2 | "edge": [ 3 | { 4 | "from": "1", 5 | "to": "9" 6 | }, 7 | { 8 | "from": "1", 9 | "to": "2" 10 | }, 11 | { 12 | "from": "2", 13 | "to": "3" 14 | }, 15 | { 16 | "from": "3", 17 | "to": "8" 18 | }, 19 | { 20 | "from": "3", 21 | "to": "4" 22 | }, 23 | { 24 | "from": "3", 25 | "from_start": true, 26 | "to": "9", 27 | "to_end": true 28 | }, 29 | { 30 | "from": "4", 31 | "to": "5", 32 | "to_end": true 33 | }, 34 | { 35 | "from": "5", 36 | "from_start": true, 37 | "to": "7", 38 | "to_end": true 39 | }, 40 | { 41 | "from": "5", 42 | "from_start": true, 43 | "to": "6" 44 | }, 45 | { 46 | "from": "6", 47 | "to": "8", 48 | "to_end": true 49 | }, 50 | { 51 | "from": "6", 52 | "to": "7", 53 | "to_end": true 54 | }, 55 | { 56 | "from": "7", 57 | "from_start": true, 58 | "to": "8", 59 | "to_end": true 60 | } 61 | ], 62 | "node": [ 63 | { 64 | "id": "1", 65 | "sequence": "G" 66 | }, 67 | { 68 | "id": "2", 69 | "sequence": "C" 70 | }, 71 | { 72 | "id": "3", 73 | "sequence": "A" 74 | }, 75 | { 76 | "id": "4", 77 | "sequence": "GC" 78 | }, 79 | { 80 | "id": "5", 81 | "sequence": "T" 82 | }, 83 | { 84 | "id": "6", 85 | "sequence": "G" 86 | }, 87 | { 88 | "id": "7", 89 | "sequence": "TT" 90 | }, 91 | { 92 | "id": "8", 93 | "sequence": "A" 94 | }, 95 | { 
96 | "id": "9", 97 | "sequence": "T" 98 | } 99 | ], 100 | "path": [ 101 | { 102 | "mapping": [ 103 | { 104 | "edit": [ 105 | { 106 | "from_length": 1, 107 | "to_length": 1 108 | } 109 | ], 110 | "position": { 111 | "node_id": "1" 112 | }, 113 | "rank": "1" 114 | }, 115 | { 116 | "edit": [ 117 | { 118 | "from_length": 1, 119 | "to_length": 1 120 | } 121 | ], 122 | "position": { 123 | "node_id": "9" 124 | }, 125 | "rank": "2" 126 | }, 127 | { 128 | "edit": [ 129 | { 130 | "from_length": 1, 131 | "to_length": 1 132 | } 133 | ], 134 | "position": { 135 | "node_id": "3" 136 | }, 137 | "rank": "3" 138 | }, 139 | { 140 | "edit": [ 141 | { 142 | "from_length": 1, 143 | "to_length": 1 144 | } 145 | ], 146 | "position": { 147 | "node_id": "8" 148 | }, 149 | "rank": "4" 150 | }, 151 | { 152 | "edit": [ 153 | { 154 | "from_length": 2, 155 | "to_length": 2 156 | } 157 | ], 158 | "position": { 159 | "node_id": "7" 160 | }, 161 | "rank": "5" 162 | }, 163 | { 164 | "edit": [ 165 | { 166 | "from_length": 1, 167 | "to_length": 1 168 | } 169 | ], 170 | "position": { 171 | "node_id": "5" 172 | }, 173 | "rank": "6" 174 | } 175 | ], 176 | "name": "cat#0#3" 177 | }, 178 | { 179 | "mapping": [ 180 | { 181 | "edit": [ 182 | { 183 | "from_length": 1, 184 | "to_length": 1 185 | } 186 | ], 187 | "position": { 188 | "node_id": "1" 189 | }, 190 | "rank": "1" 191 | }, 192 | { 193 | "edit": [ 194 | { 195 | "from_length": 1, 196 | "to_length": 1 197 | } 198 | ], 199 | "position": { 200 | "node_id": "2" 201 | }, 202 | "rank": "2" 203 | }, 204 | { 205 | "edit": [ 206 | { 207 | "from_length": 1, 208 | "to_length": 1 209 | } 210 | ], 211 | "position": { 212 | "node_id": "3" 213 | }, 214 | "rank": "3" 215 | }, 216 | { 217 | "edit": [ 218 | { 219 | "from_length": 2, 220 | "to_length": 2 221 | } 222 | ], 223 | "position": { 224 | "node_id": "4" 225 | }, 226 | "rank": "4" 227 | }, 228 | { 229 | "edit": [ 230 | { 231 | "from_length": 1, 232 | "to_length": 1 233 | } 234 | ], 235 | "position": { 236 | 
"is_reverse": true, 237 | "node_id": "5" 238 | }, 239 | "rank": "5" 240 | }, 241 | { 242 | "edit": [ 243 | { 244 | "from_length": 1, 245 | "to_length": 1 246 | } 247 | ], 248 | "position": { 249 | "node_id": "6" 250 | }, 251 | "rank": "6" 252 | }, 253 | { 254 | "edit": [ 255 | { 256 | "from_length": 1, 257 | "to_length": 1 258 | } 259 | ], 260 | "position": { 261 | "is_reverse": true, 262 | "node_id": "8" 263 | }, 264 | "rank": "7" 265 | } 266 | ], 267 | "name": "chimp#0#2" 268 | }, 269 | { 270 | "mapping": [ 271 | { 272 | "edit": [ 273 | { 274 | "from_length": 1, 275 | "to_length": 1 276 | } 277 | ], 278 | "position": { 279 | "node_id": "1" 280 | }, 281 | "rank": "1" 282 | }, 283 | { 284 | "edit": [ 285 | { 286 | "from_length": 1, 287 | "to_length": 1 288 | } 289 | ], 290 | "position": { 291 | "node_id": "2" 292 | }, 293 | "rank": "2" 294 | }, 295 | { 296 | "edit": [ 297 | { 298 | "from_length": 1, 299 | "to_length": 1 300 | } 301 | ], 302 | "position": { 303 | "node_id": "3" 304 | }, 305 | "rank": "3" 306 | }, 307 | { 308 | "edit": [ 309 | { 310 | "from_length": 2, 311 | "to_length": 2 312 | } 313 | ], 314 | "position": { 315 | "node_id": "4" 316 | }, 317 | "rank": "4" 318 | }, 319 | { 320 | "edit": [ 321 | { 322 | "from_length": 1, 323 | "to_length": 1 324 | } 325 | ], 326 | "position": { 327 | "is_reverse": true, 328 | "node_id": "5" 329 | }, 330 | "rank": "5" 331 | }, 332 | { 333 | "edit": [ 334 | { 335 | "from_length": 1, 336 | "to_length": 1 337 | } 338 | ], 339 | "position": { 340 | "node_id": "6" 341 | }, 342 | "rank": "6" 343 | }, 344 | { 345 | "edit": [ 346 | { 347 | "from_length": 2, 348 | "to_length": 2 349 | } 350 | ], 351 | "position": { 352 | "is_reverse": true, 353 | "node_id": "7" 354 | }, 355 | "rank": "7" 356 | }, 357 | { 358 | "edit": [ 359 | { 360 | "from_length": 1, 361 | "to_length": 1 362 | } 363 | ], 364 | "position": { 365 | "is_reverse": true, 366 | "node_id": "8" 367 | }, 368 | "rank": "8" 369 | } 370 | ], 371 | "name": "human#0#1" 372 
| } 373 | ] 374 | } 375 | -------------------------------------------------------------------------------- /tests/t/chop.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | BASH_TAP_ROOT=./bash-tap 4 | . ${BASH_TAP_ROOT}/bash-tap-bootstrap 5 | 6 | PATH=../bin:$PATH 7 | PATH=../deps/hal:$PATH 8 | 9 | plan tests 18 10 | 11 | vg convert -g chop/tiny-flat.gfa -p > tiny-flat.vg 12 | printf "x\t0\t100\n" > all.bed 13 | clip-vg tiny-flat.vg -b all.bed | vg view - | grep -v ^H > chopped-all.gfa 14 | is "$(cat chopped-all.gfa | wc -l)" 0 "chopping everything clears out the graph" 15 | 16 | rm -f all.bed chopped-all.gfa 17 | 18 | printf "y\t0\t100\n" > none.bed 19 | clip-vg tiny-flat.vg -b none.bed | vg view - | grep -v ^H > chopped-none.gfa 20 | vg view tiny-flat.vg | grep -v ^H > orig.gfa 21 | diff chopped-none.gfa orig.gfa 22 | is "$?" 0 "chopping nothing doesn't change graph" 23 | 24 | rm -f none.bed chopped-none.gfa orig.gfa 25 | 26 | printf "x\t0\t1\n" > ends.bed 27 | printf "x\t48\t50\n" >> ends.bed 28 | clip-vg -n tiny-flat.vg -b ends.bed > chopped-ends.vg 29 | is "$(vg paths -Ev chopped-ends.vg)" "x[1-48] 47" "chopping ends gives subpath in the middle with correct length" 30 | is "$(vg stats -l chopped-ends.vg | awk '{print $2}')" "47" "chopping ends leaves correct number of bases" 31 | 32 | rm -f ends.bed chopped-ends.vg 33 | 34 | printf "x\t20\t25\n" > bits.bed 35 | printf "x\t1\t5\n" >> bits.bed 36 | printf "x\t10\t20\n" >> bits.bed 37 | printf "x\t40\t49\n" >> bits.bed 38 | clip-vg -n tiny-flat.vg -b bits.bed > chopped-bits.vg 39 | vg paths -Ev chopped-bits.vg | sed -e 's/\t/./g' > bits.paths 40 | is "$(cat bits.paths | wc -l)" "4" "correct number of paths obtained after merging consectuive intervals" 41 | is "$(grep 'x\[0-1\].1' bits.paths | wc -l)" "1" "first bit found" 42 | is "$(grep 'x\[5-10\].5' bits.paths | wc -l)" "1" "next bit found" 43 | is "$(grep 'x\[25-40\].15' bits.paths | wc -l)" "1" 
"next bit after found" 44 | is "$(grep 'x\[49-50\].1' bits.paths | wc -l)" "1" "last bit found" 45 | 46 | rm -f bits.bed chopped-bits.vg bits.paths 47 | 48 | rm -f tiny-flat.vg 49 | 50 | ########## flip path and repeat ########## 51 | 52 | vg convert -g chop/tiny-rev.gfa -p > tiny-rev.vg 53 | #vg convert -g chop/tiny-rev.gfa -o > tiny-rev.vg 54 | printf "x\t0\t100\n" > all.bed 55 | clip-vg tiny-rev.vg -b all.bed | vg view - | grep -v ^H > chopped-all.gfa 56 | is "$(cat chopped-all.gfa | wc -l)" 0 "chopping everything clears out the graph" 57 | 58 | rm -f all.bed chopped-all.gfa 59 | 60 | printf "x\t0\t1\n" > ends.bed 61 | printf "x\t48\t50\n" >> ends.bed 62 | clip-vg -n tiny-rev.vg -b ends.bed > chopped-ends.vg 63 | is "$(vg paths -Ev chopped-ends.vg)" "x[1-48] 47" "chopping ends gives subpath in the middle with correct length" 64 | is "$(vg stats -l chopped-ends.vg | awk '{print $2}')" "47" "chopping ends leaves correct number of bases" 65 | 66 | rm -f ends.bed chopped-ends.vg 67 | 68 | printf "x\t20\t25\n" > bits.bed 69 | printf "x\t1\t5\n" >> bits.bed 70 | printf "x\t10\t20\n" >> bits.bed 71 | printf "x\t40\t49\n" >> bits.bed 72 | clip-vg -n tiny-rev.vg -b bits.bed > chopped-bits.vg 73 | vg paths -Ev chopped-bits.vg | sed -e 's/\t/./g' > bits.paths 74 | is "$(cat bits.paths | wc -l)" "4" "correct number of paths obtained after merging consectuive intervals" 75 | is "$(grep 'x\[0-1\].1' bits.paths | wc -l)" "1" "first bit found" 76 | is "$(grep 'x\[5-10\].5' bits.paths | wc -l)" "1" "next bit found" 77 | is "$(grep 'x\[25-40\].15' bits.paths | wc -l)" "1" "next bit after found" 78 | is "$(grep 'x\[49-50\].1' bits.paths | wc -l)" "1" "last bit found" 79 | 80 | rm -f bits.bed chopped-bits.vg bits.paths 81 | 82 | rm -f tiny-rev.vg 83 | 84 | # quick test for forwardization 85 | vg convert -g chop/tiny-fr.gfa -p > tiny-fr.vg 86 | vg paths -Fv tiny-fr.vg > tiny-fr.fa 87 | clip-vg tiny-fr.vg -e x -p > tiny-fr-forwardized.vg 88 | vg paths -Fv tiny-fr-forwardized.vg > 
tiny-fr-forwardized.fa 89 | diff tiny-fr.fa tiny-fr-forwardized.fa 90 | is "$?" 0 "fowawrsization does not affect path sequence" 91 | 92 | rm -f tiny-fr.vg tiny-fr.fa tiny-fr-forwardized.vg tiny-fr-forwardized.fa tiny-fr-forwardized.fa 93 | -------------------------------------------------------------------------------- /tests/t/merge.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | BASH_TAP_ROOT=./bash-tap 4 | . ${BASH_TAP_ROOT}/bash-tap-bootstrap 5 | 6 | PATH=../bin:$PATH 7 | PATH=../deps/hal:$PATH 8 | 9 | plan tests 10 10 | 11 | maf2hal small/small.maf small.hal 12 | maf2hal small/small2.maf small2.hal 13 | halMergeChroms small.hal,small2.hal merged1.hal 14 | halValidate merged1.hal 15 | is $? 0 "halMergeChroms produces valid hal" 16 | hal2fasta small.hal chimp > chimp.fa 17 | hal2fasta small2.hal chimp >> chimp.fa 18 | hal2fasta merged1.hal chimp > chimp.merge.fa 19 | diff chimp.fa chimp.merge.fa 20 | is $? 0 "halMergeChroms preserves chimp sequence" 21 | hal2fasta small.hal cat > cat.fa 22 | hal2fasta merged1.hal cat > cat.merge.fa 23 | diff cat.fa cat.merge.fa 24 | is $? 0 "halMergeChroms preserves cat sequence" 25 | hal2vg small.hal | vg mod -O - | vg ids -s - > small.vg 26 | hal2vg small2.hal | vg mod -O - | vg ids -s - > small2.vg 27 | hal2vg merged1.hal | vg mod -O - | vg ids -s - > merged1.vg 28 | vg view small.vg | sort > small.gfa 29 | vg view small2.vg | sort > small2.gfa 30 | vg find -x merged1.vg -p cat#0#3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' | sed -e "s/human chimp cat/chimp human cat/g" > merged1.comp1.gfa 31 | vg find -x merged1.vg -p cow#0#3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' | sed -e "s/human cow chimp/chimp human cow/g" > merged1.comp2.gfa 32 | diff small.gfa merged1.comp1.gfa 33 | is $? 
0 "First component of merged graph identical to first input graph" 34 | diff small2.gfa merged1.comp2.gfa 35 | is $? 0 "Second component of merged graph identical to second input graph" 36 | 37 | rm -f small.hal small2.halsmall.vg small2.vg small.gfa small2.gfa 38 | rm -f merged1.hal merged1.vg merged1.comp1.gfa merged1.comp2.gfa 39 | rm -f chimp.fa chimp.merge.fa 40 | rm -f cat.fa cat.merge.fa 41 | 42 | ### copy paste above but change order ### 43 | 44 | maf2hal small/small.maf small.hal 45 | maf2hal small/small2.maf small2.hal 46 | halMergeChroms small2.hal,small.hal merged1.hal 47 | halValidate merged1.hal 48 | is $? 0 "halMergeChroms produces valid hal" 49 | hal2fasta small2.hal chimp > chimp.fa 50 | hal2fasta small.hal chimp >> chimp.fa 51 | hal2fasta merged1.hal chimp > chimp.merge.fa 52 | diff chimp.fa chimp.merge.fa 53 | is $? 0 "halMergeChroms preserves chimp sequence" 54 | hal2fasta small.hal cat > cat.fa 55 | hal2fasta merged1.hal cat > cat.merge.fa 56 | diff cat.fa cat.merge.fa 57 | is $? 0 "halMergeChroms preserves cat sequence" 58 | hal2vg small.hal | vg mod -O - | vg ids -s - > small.vg 59 | hal2vg small2.hal | vg mod -O - | vg ids -s - > small2.vg 60 | hal2vg merged1.hal | vg mod -O - | vg ids -s - > merged1.vg 61 | vg view small.vg | sort > small.gfa 62 | vg view small2.vg | sort > small2.gfa 63 | vg find -x merged1.vg -p cat#0#3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' | sed -e "s/human chimp cat/chimp human cat/g" > merged1.comp1.gfa 64 | vg find -x merged1.vg -p cow#0#3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' | sed -e "s/human cow chimp/chimp human cow/g" > merged1.comp2.gfa 65 | diff small.gfa merged1.comp1.gfa 66 | is $? 0 "First component of merged graph identical to first input graph" 67 | diff small2.gfa merged1.comp2.gfa 68 | is $? 
0 "Second component of merged graph identical to second input graph" 69 | 70 | rm -f small.hal small2.halsmall.vg small2.vg small.gfa small2.gfa 71 | rm -f merged1.hal merged1.vg merged1.comp1.gfa merged1.comp2.gfa 72 | rm -f chimp.fa chimp.merge.fa 73 | rm -f cat.fa cat.merge.fa 74 | -------------------------------------------------------------------------------- /tests/t/small.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | BASH_TAP_ROOT=./bash-tap 4 | . ${BASH_TAP_ROOT}/bash-tap-bootstrap 5 | 6 | PATH=../bin:$PATH 7 | PATH=../deps/hal:$PATH 8 | 9 | plan tests 2 10 | 11 | maf2hal small/small.maf small.hal 12 | hal2vg small.hal > small.vg 13 | vg view -j small.vg | jq . > small.json 14 | 15 | is $(vg validate small.vg | wc -l) 0 "output vg validates" 16 | 17 | # jq craziness from https://stackoverflow.com/questions/31930041/using-jq-or-alternative-command-line-tools-to-compare-json-files 18 | is $(jq --argfile a small.json --argfile b small/truth.json -n 'def post_recurse(f): def r: (f | select(. != null) | r), .; r; def post_recurse: post_recurse(.[]?); ($a | (post_recurse | arrays) |= sort) as $a | ($b | (post_recurse | arrays) |= sort) as $b | $a == $b') true "output graph identical to manually verified truth graph" 19 | 20 | rm -f small.vg small.json 21 | 22 | rm -f small.hal 23 | --------------------------------------------------------------------------------