├── .gitignore ├── .gitmodules ├── .travis.yml ├── Dockerfile ├── IntervalTree.h ├── LICENSE ├── Makefile ├── README.md ├── ReleaseNotes.md ├── build-tools ├── CMakeLists.txt ├── makeBinRelease ├── makeSrcRelease ├── quayTagRelease └── releaseLib.sh ├── clip-vg.cpp ├── count-vg-hap-cov.cpp ├── filter-paf-deletions.cpp ├── hal2vg.cpp ├── halMergeChroms.cpp ├── halRemoveDupes.cpp ├── halUnclip.cpp ├── include.mk ├── paf.hpp ├── subpaths.h └── tests ├── bash-tap ├── Changes ├── README.mkdn ├── bash-tap ├── bash-tap-bootstrap └── bash-tap-mock ├── chop ├── tiny-flat.gfa └── tiny-rev.gfa ├── small ├── small.maf ├── small2.maf └── truth.json └── t ├── chop.t ├── merge.t └── small.t /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | *.smod 19 | 20 | # Compiled Static libraries 21 | *.lai 22 | *.la 23 | *.a 24 | *.lib 25 | 26 | # Executables 27 | *.exe 28 | *.out 29 | *.app 30 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "deps/hal"] 2 | path = deps/hal 3 | url = https://github.com/ComparativeGenomicsToolkit/hal.git 4 | [submodule "deps/sonLib"] 5 | path = deps/sonLib 6 | url = https://github.com/ComparativeGenomicsToolkit/sonLib.git 7 | [submodule "deps/libbdsg-easy"] 8 | path = deps/libbdsg-easy 9 | url = https://github.com/vgteam/libbdsg-easy.git 10 | [submodule "deps/pinchesAndCacti"] 11 | path = deps/pinchesAndCacti 12 | url = https://github.com/ComparativeGenomicsToolkit/pinchesAndCacti.git 13 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | # Control file for continuous integration testing at http://travis-ci.org/ 2 | 3 | language: cpp 4 | compiler: gcc 5 | 6 | before_install: 7 | - git submodule update --init --recursive 8 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libomp; fi 9 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo apt-get -qq update; fi 10 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo apt-get install -y libhdf5-serial-dev python3 python3-pip libpython3-dev wget; fi 11 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install hdf5 python3.6 python3-pip || echo "a brew error code when installing gcc is expected"; fi 12 | 13 | install: 14 | - sudo pip3 install setuptools --upgrade 15 | - wget https://github.com/vgteam/vg/releases/download/v1.30.0/vg && chmod u+x vg 16 | 17 | script: 18 | - export PATH=$(pwd):$PATH 19 | - export PATH=$(pwd)/deps/hal/bin:$PATH 20 | - make test 21 | 22 | dist: bionic 23 | osx_image: xcode10.1 24 | 25 | matrix: 26 | include: 27 | - os: linux 28 | compiler: gcc 29 | #- os: osx 30 | # compiler: clang 31 | 32 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # creates an image containing vg and hal2vg 2 | 3 | # build on compatible vg image 4 | # (this is for convenience of having vg and hal2vg in the same image, as hal2vg no longer depends on vg to build) 5 | FROM quay.io/vgteam/vg:v1.25.0 6 | 7 | # update system and install dependencies not present in vg image 8 | RUN apt-get -qq update && apt-get -qq install -y libhdf5-dev build-essential python3-dev python3-pip cmake libz-dev pkg-config git 9 | 10 | # copy current directory to docker 11 | ADD . 
/hal2vg 12 | 13 | # set working directory 14 | WORKDIR /hal2vg 15 | 16 | # build 17 | RUN make clean ; make 18 | 19 | # add hal2vg to the PATH 20 | ENV PATH /hal2vg:/hal2vg/deps/hal/bin:$PATH 21 | -------------------------------------------------------------------------------- /IntervalTree.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file was taken from 3 | https://github.com/ekg/intervaltree/commit/aa5937755000f1cd007402d03b6f7ce4427c5d21 4 | 5 | It has the following license: 6 | 7 | Copyright (c) 2011 Erik Garrison 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy of 10 | this software and associated documentation files (the "Software"), to deal in 11 | the Software without restriction, including without limitation the rights to 12 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 13 | of the Software, and to permit persons to whom the Software is furnished to do 14 | so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in all 17 | copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 
26 | */ 27 | 28 | #ifndef __INTERVAL_TREE_H 29 | #define __INTERVAL_TREE_H 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #ifdef USE_INTERVAL_TREE_NAMESPACE 39 | namespace interval_tree { 40 | #endif 41 | template 42 | class Interval { 43 | public: 44 | Scalar start; 45 | Scalar stop; 46 | Value value; 47 | Interval(const Scalar& s, const Scalar& e, const Value& v) 48 | : start(std::min(s, e)) 49 | , stop(std::max(s, e)) 50 | , value(v) 51 | {} 52 | }; 53 | 54 | template 55 | Value intervalStart(const Interval& i) { 56 | return i.start; 57 | } 58 | 59 | template 60 | Value intervalStop(const Interval& i) { 61 | return i.stop; 62 | } 63 | 64 | template 65 | std::ostream& operator<<(std::ostream& out, const Interval& i) { 66 | out << "Interval(" << i.start << ", " << i.stop << "): " << i.value; 67 | return out; 68 | } 69 | 70 | template 71 | class IntervalTree { 72 | public: 73 | typedef Interval interval; 74 | typedef std::vector interval_vector; 75 | 76 | 77 | struct IntervalStartCmp { 78 | bool operator()(const interval& a, const interval& b) { 79 | return a.start < b.start; 80 | } 81 | }; 82 | 83 | struct IntervalStopCmp { 84 | bool operator()(const interval& a, const interval& b) { 85 | return a.stop < b.stop; 86 | } 87 | }; 88 | 89 | IntervalTree() 90 | : left(nullptr) 91 | , right(nullptr) 92 | , center(0) 93 | {} 94 | 95 | ~IntervalTree() = default; 96 | 97 | std::unique_ptr clone() const { 98 | return std::unique_ptr(new IntervalTree(*this)); 99 | } 100 | 101 | IntervalTree(const IntervalTree& other) 102 | : intervals(other.intervals), 103 | left(other.left ? other.left->clone() : nullptr), 104 | right(other.right ? 
other.right->clone() : nullptr), 105 | center(other.center) 106 | {} 107 | 108 | IntervalTree& operator=(IntervalTree&&) = default; 109 | IntervalTree(IntervalTree&&) = default; 110 | 111 | IntervalTree& operator=(const IntervalTree& other) { 112 | center = other.center; 113 | intervals = other.intervals; 114 | left = other.left ? other.left->clone() : nullptr; 115 | right = other.right ? other.right->clone() : nullptr; 116 | return *this; 117 | } 118 | 119 | IntervalTree( 120 | interval_vector ivals, 121 | std::size_t depth = 16, 122 | std::size_t minbucket = 64, 123 | std::size_t maxbucket = 512, 124 | Scalar leftextent = 0, 125 | Scalar rightextent = 0) 126 | : left(nullptr) 127 | , right(nullptr) 128 | { 129 | --depth; 130 | const auto minmaxStop = std::minmax_element(ivals.begin(), ivals.end(), 131 | IntervalStopCmp()); 132 | const auto minmaxStart = std::minmax_element(ivals.begin(), ivals.end(), 133 | IntervalStartCmp()); 134 | if (!ivals.empty()) { 135 | center = (minmaxStart.first->start + minmaxStop.second->stop) / 2; 136 | } 137 | if (leftextent == 0 && rightextent == 0) { 138 | // sort intervals by start 139 | std::sort(ivals.begin(), ivals.end(), IntervalStartCmp()); 140 | } else { 141 | assert(std::is_sorted(ivals.begin(), ivals.end(), IntervalStartCmp())); 142 | } 143 | if (depth == 0 || (ivals.size() < minbucket && ivals.size() < maxbucket)) { 144 | std::sort(ivals.begin(), ivals.end(), IntervalStartCmp()); 145 | intervals = std::move(ivals); 146 | assert(is_valid().first); 147 | return; 148 | } else { 149 | Scalar leftp = 0; 150 | Scalar rightp = 0; 151 | 152 | if (leftextent || rightextent) { 153 | leftp = leftextent; 154 | rightp = rightextent; 155 | } else { 156 | leftp = ivals.front().start; 157 | rightp = std::max_element(ivals.begin(), ivals.end(), 158 | IntervalStopCmp())->stop; 159 | } 160 | 161 | interval_vector lefts; 162 | interval_vector rights; 163 | 164 | for (typename interval_vector::const_iterator i = ivals.begin(); 165 | i != 
ivals.end(); ++i) { 166 | const interval& interval = *i; 167 | if (interval.stop < center) { 168 | lefts.push_back(interval); 169 | } else if (interval.start > center) { 170 | rights.push_back(interval); 171 | } else { 172 | assert(interval.start <= center); 173 | assert(center <= interval.stop); 174 | intervals.push_back(interval); 175 | } 176 | } 177 | 178 | if (!lefts.empty()) { 179 | left.reset(new IntervalTree(std::move(lefts), 180 | depth, minbucket, maxbucket, 181 | leftp, center)); 182 | } 183 | if (!rights.empty()) { 184 | right.reset(new IntervalTree(std::move(rights), 185 | depth, minbucket, maxbucket, 186 | center, rightp)); 187 | } 188 | } 189 | assert(is_valid().first); 190 | } 191 | 192 | // Call f on all intervals near the range [start, stop]: 193 | template 194 | void visit_near(const Scalar& start, const Scalar& stop, UnaryFunction f) const { 195 | if (!intervals.empty() && ! (stop < intervals.front().start)) { 196 | for (auto & i : intervals) { 197 | f(i); 198 | } 199 | } 200 | if (left && start <= center) { 201 | left->visit_near(start, stop, f); 202 | } 203 | if (right && stop >= center) { 204 | right->visit_near(start, stop, f); 205 | } 206 | } 207 | 208 | // Call f on all intervals crossing pos 209 | template 210 | void visit_overlapping(const Scalar& pos, UnaryFunction f) const { 211 | visit_overlapping(pos, pos, f); 212 | } 213 | 214 | // Call f on all intervals overlapping [start, stop] 215 | template 216 | void visit_overlapping(const Scalar& start, const Scalar& stop, UnaryFunction f) const { 217 | auto filterF = [&](const interval& interval) { 218 | if (interval.stop >= start && interval.start <= stop) { 219 | // Only apply f if overlapping 220 | f(interval); 221 | } 222 | }; 223 | visit_near(start, stop, filterF); 224 | } 225 | 226 | // Call f on all intervals contained within [start, stop] 227 | template 228 | void visit_contained(const Scalar& start, const Scalar& stop, UnaryFunction f) const { 229 | auto filterF = [&](const 
interval& interval) { 230 | if (start <= interval.start && interval.stop <= stop) { 231 | f(interval); 232 | } 233 | }; 234 | visit_near(start, stop, filterF); 235 | } 236 | 237 | interval_vector findOverlapping(const Scalar& start, const Scalar& stop) const { 238 | interval_vector result; 239 | visit_overlapping(start, stop, 240 | [&](const interval& interval) { 241 | result.emplace_back(interval); 242 | }); 243 | return result; 244 | } 245 | 246 | interval_vector findContained(const Scalar& start, const Scalar& stop) const { 247 | interval_vector result; 248 | visit_contained(start, stop, 249 | [&](const interval& interval) { 250 | result.push_back(interval); 251 | }); 252 | return result; 253 | } 254 | bool empty() const { 255 | if (left && !left->empty()) { 256 | return false; 257 | } 258 | if (!intervals.empty()) { 259 | return false; 260 | } 261 | if (right && !right->empty()) { 262 | return false; 263 | } 264 | return true; 265 | } 266 | 267 | template 268 | void visit_all(UnaryFunction f) const { 269 | if (left) { 270 | left->visit_all(f); 271 | } 272 | std::for_each(intervals.begin(), intervals.end(), f); 273 | if (right) { 274 | right->visit_all(f); 275 | } 276 | } 277 | 278 | std::pair extentBruitForce() const { 279 | struct Extent { 280 | std::pair x = {std::numeric_limits::max(), 281 | std::numeric_limits::min() }; 282 | void operator()(const interval & interval) { 283 | x.first = std::min(x.first, interval.start); 284 | x.second = std::max(x.second, interval.stop); 285 | } 286 | }; 287 | Extent extent; 288 | 289 | visit_all([&](const interval & interval) { extent(interval); }); 290 | return extent.x; 291 | } 292 | 293 | // Check all constraints. 294 | // If first is false, second is invalid. 
295 | std::pair> is_valid() const { 296 | const auto minmaxStop = std::minmax_element(intervals.begin(), intervals.end(), 297 | IntervalStopCmp()); 298 | const auto minmaxStart = std::minmax_element(intervals.begin(), intervals.end(), 299 | IntervalStartCmp()); 300 | 301 | std::pair> result = {true, { std::numeric_limits::max(), 302 | std::numeric_limits::min() }}; 303 | if (!intervals.empty()) { 304 | result.second.first = std::min(result.second.first, minmaxStart.first->start); 305 | result.second.second = std::min(result.second.second, minmaxStop.second->stop); 306 | } 307 | if (left) { 308 | auto valid = left->is_valid(); 309 | result.first &= valid.first; 310 | result.second.first = std::min(result.second.first, valid.second.first); 311 | result.second.second = std::min(result.second.second, valid.second.second); 312 | if (!result.first) { return result; } 313 | if (valid.second.second >= center) { 314 | result.first = false; 315 | return result; 316 | } 317 | } 318 | if (right) { 319 | auto valid = right->is_valid(); 320 | result.first &= valid.first; 321 | result.second.first = std::min(result.second.first, valid.second.first); 322 | result.second.second = std::min(result.second.second, valid.second.second); 323 | if (!result.first) { return result; } 324 | if (valid.second.first <= center) { 325 | result.first = false; 326 | return result; 327 | } 328 | } 329 | if (!std::is_sorted(intervals.begin(), intervals.end(), IntervalStartCmp())) { 330 | result.first = false; 331 | } 332 | return result; 333 | } 334 | 335 | friend std::ostream& operator<<(std::ostream& os, const IntervalTree& itree) { 336 | return writeOut(os, itree); 337 | } 338 | 339 | friend std::ostream& writeOut(std::ostream& os, const IntervalTree& itree, 340 | std::size_t depth = 0) { 341 | auto pad = [&]() { for (std::size_t i = 0; i != depth; ++i) { os << ' '; } }; 342 | pad(); os << "center: " << itree.center << '\n'; 343 | for (const interval & inter : itree.intervals) { 344 | pad(); os << 
inter << '\n'; 345 | } 346 | if (itree.left) { 347 | pad(); os << "left:\n"; 348 | writeOut(os, *itree.left, depth + 1); 349 | } else { 350 | pad(); os << "left: nullptr\n"; 351 | } 352 | if (itree.right) { 353 | pad(); os << "right:\n"; 354 | writeOut(os, *itree.right, depth + 1); 355 | } else { 356 | pad(); os << "right: nullptr\n"; 357 | } 358 | return os; 359 | } 360 | 361 | private: 362 | interval_vector intervals; 363 | std::unique_ptr left; 364 | std::unique_ptr right; 365 | Scalar center; 366 | }; 367 | #ifdef USE_INTERVAL_TREE_NAMESPACE 368 | } 369 | #endif 370 | 371 | #endif 372 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (C) 2020 by UCSC Computational Genomics Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # simplest possible to start. dangerous since all header deps done manually. ne 2 | rootPath = ./ 3 | include ./include.mk 4 | 5 | all : hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions count-vg-hap-cov 6 | 7 | # Note: hdf5 from apt doesn't seem to work for static builds. It should be installed 8 | # from source and configured with "--enable-static --disable-shared", then have its 9 | # bin put at the front of PATH 10 | static: 11 | CFLAGS="$${CFLAGS} -static" \ 12 | CXXFLAGS="$${CXXFLAGS} -static" \ 13 | ${MAKE} all 14 | 15 | check-static: static 16 | if [ $(shell ls hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions count-vg-hap-cov | xargs ldd 2>& 1 | grep "not a dynamic" | wc -l) = $(shell ls hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions count-vg-hap-cov | wc -l) ] ; then\ 17 | echo "ldd verified that all files in bin/ are static";\ 18 | else\ 19 | echo "ldd found dynamic linked binary in bin/";\ 20 | exit 1;\ 21 | fi 22 | 23 | cleanFast : 24 | rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o halUnclip halUnclip.o filter-paf-deletions filter-paf-deletions.o count-vg-hap-cov.o count-vg-hap-cov 25 | 26 | clean : 27 | rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o halUnclip halUnclip.o filter-paf-deletions filter-paf-deletions.o 28 | cd deps/sonLib && make clean 29 | cd deps/pinchesAndCacti && make clean 30 | cd deps/hal && make clean 31 | cd deps/libbdsg-easy && make clean 32 | 33 | hal2vg.o : hal2vg.cpp ${basicLibsDependencies} 34 | ${cpp} ${CXXFLAGS} -I . 
hal2vg.cpp -c 35 | 36 | ${sonLibPath}/sonLib.a : 37 | cd deps/sonLib && make 38 | 39 | ${halPath}/libHal.a : ${sonLibPath}/sonLib.a 40 | cd deps/hal && make 41 | 42 | ${sonLibPath}/stPinchesAndCacti.a : ${sonLibPath}/sonLib.a 43 | cd deps/pinchesAndCacti && make 44 | 45 | ${libbdsgPath}/lib/libbdsg.a : 46 | cd deps/libbdsg-easy && make 47 | 48 | hal2vg : hal2vg.o ${basicLibsDependencies} 49 | ${cpp} ${CXXFLAGS} -fopenmp -pthread hal2vg.o ${basicLibs} -o hal2vg 50 | 51 | clip-vg.o : clip-vg.cpp ${basicLibsDependencies} 52 | ${cpp} ${CXXFLAGS} -I . clip-vg.cpp -c 53 | 54 | clip-vg : clip-vg.o ${basicLibsDependencies} 55 | ${cpp} ${CXXFLAGS} -fopenmp -pthread clip-vg.o ${basicLibs} -o clip-vg 56 | 57 | halRemoveDupes.o : halRemoveDupes.cpp ${basicLibsDependencies} 58 | ${cpp} ${CXXFLAGS} -I . halRemoveDupes.cpp -c 59 | 60 | halRemoveDupes : halRemoveDupes.o ${basicLibsDependencies} 61 | ${cpp} ${CXXFLAGS} -fopenmp -pthread halRemoveDupes.o ${basicLibs} -o halRemoveDupes 62 | 63 | halMergeChroms.o : halMergeChroms.cpp ${basicLibsDependencies} 64 | ${cpp} ${CXXFLAGS} -I . halMergeChroms.cpp -c 65 | 66 | halMergeChroms : halMergeChroms.o ${basicLibsDependencies} 67 | ${cpp} ${CXXFLAGS} -fopenmp -pthread halMergeChroms.o ${basicLibs} -o halMergeChroms 68 | 69 | halUnclip.o : halUnclip.cpp subpaths.h ${basicLibsDependencies} 70 | ${cpp} ${CXXFLAGS} -I . halUnclip.cpp -c 71 | 72 | halUnclip : halUnclip.o ${basicLibsDependencies} 73 | ${cpp} ${CXXFLAGS} -fopenmp -pthread halUnclip.o ${basicLibs} -o halUnclip 74 | 75 | filter-paf-deletions.o : filter-paf-deletions.cpp subpaths.h paf.hpp ${basicLibsDependencies} 76 | ${cpp} ${CXXFLAGS} -I . filter-paf-deletions.cpp -c 77 | 78 | filter-paf-deletions : filter-paf-deletions.o ${basicLibsDependencies} 79 | ${cpp} ${CXXFLAGS} -fopenmp -pthread filter-paf-deletions.o ${basicLibs} -o filter-paf-deletions 80 | 81 | count-vg-hap-cov.o : count-vg-hap-cov.cpp ${basicLibsDependencies} 82 | ${cpp} ${CXXFLAGS} -I . 
count-vg-hap-cov.cpp -c 83 | 84 | count-vg-hap-cov : count-vg-hap-cov.o ${basicLibsDependencies} 85 | ${cpp} ${CXXFLAGS} -fopenmp -pthread count-vg-hap-cov.o ${basicLibs} -o count-vg-hap-cov 86 | 87 | test : 88 | make 89 | cd tests && prove -v t 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hal2vg 2 | [![Build Status](https://travis-ci.org/ComparativeGenomicsToolkit/hal2vg.svg?branch=master)](https://travis-ci.org/ComparativeGenomicsToolkit/hal2vg) 3 | 4 | Convert [HAL](https://github.com/glennhickey/hal) to [vg](https://github.com/vgteam/vg)-compatible sequence graph. 5 | 6 | Supports the three sequence graph formats in [libbdsg](https://github.com/vgteam/libbdsg): 7 | * PackedGraph (default) 8 | * ODGI 9 | * HashGraph 10 | 11 | ## Algorithm 12 | 13 | 1. Each sequence in the HAL is added as a thread to a [Pinch Graph](https://github.com/ComparativeGenomicsToolkit/pinchesAndCacti). 14 | 2. Exact pairwise alignment blocks (no gaps or substitutions) are extracted from each branch in the HAL tree and "pinched" in the graph 15 | 3. For each branch, bases in the child that have substitutions in the parent (snps) are aligned across the tree using the column iterator and all exact matches are extracted and pinched. 16 | 4. Pinch graph is cleaned up by merging trivial joins 17 | 5. Each HAL sequence is traced through the pinch graph, adding nodes and edges to the output sequence graph. A table is maintained to map each pinch graph block to a sequence graph node. 18 | 19 | ## Suggested Postprocessing: 20 | 21 | * Sort the output with `vg ids --sort`. 22 | 23 | ## Installation 24 | 25 | ### Binary Release 26 | 27 | You can download a standalone binary for the latest release [here](https://github.com/ComparativeGenomicsToolkit/hal2vg/releases). 
28 | 29 | ### Compile From Source 30 | 31 | You can use the [Dockerfile](Dockerfile) as a guide to see how all dependencies are installed with `apt` on Ubuntu. More details on installing HDF5 can be found in the [HAL README](https://github.com/ComparativeGenomicsToolkit/hal) 32 | 33 | **Cloning:** Don't forget to clone submodules with the `--recursive` option: 34 | 35 | git clone https://github.com/glennhickey/hal2vg.git --recursive 36 | 37 | **Compiling:** 38 | 39 | make 40 | 41 | ## Usage 42 | 43 | It is required to use the `--inMemory` option for all but trivial inputs. 44 | 45 | `vg` has been tuned to work best on graphs with nodes chopped to at most 32 bases. It is therefore recommended to use the `--chop 32` option. 46 | 47 | ``` 48 | hal2vg input.hal --inMemory --chop 32 --progress > output.pg 49 | ``` 50 | 51 | **Note**: The output graph is only readable by vg version 1.24.0 and greater. 52 | 53 | Copyright (C) 2020 by UCSC Computational Genomics Lab 54 | 55 | -------------------------------------------------------------------------------- /ReleaseNotes.md: -------------------------------------------------------------------------------- 1 | # Release 1.0.1 2020-09-07 2 | 3 | This release contains a bugfix required to use the subsetting options such as `--targetGenomes` and `--ignoreGenomes` without crashing. 4 | 5 | # Release 1.0.0 2020-08-19 6 | 7 | This release uses Cactus's Pinch Graph library to create the sequence graph. 
8 | 9 | Notable Changes: 10 | - Bespoke STL-based structures and algorithms replaced with Pinch Graph library 11 | - SNPs are aligned using the column iterator instead of tables 12 | - Much more performant than original implementation, but still only tested on smallish graphs 13 | -------------------------------------------------------------------------------- /build-tools/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Specify the minimum version for CMake 2 | cmake_minimum_required(VERSION 3.10) 3 | 4 | # This defines default install directories like "lib" 5 | include(GNUInstallDirs) 6 | 7 | # Project's name 8 | project(libhandlegraph) 9 | # We build using c++14 10 | set(CMAKE_CXX_STANDARD 14) 11 | 12 | # Use all standard-compliant optimizations 13 | set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -g") 14 | set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") 15 | 16 | # Let cmake decide where to put the output files, allowing for out-of-tree builds. 17 | 18 | if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) 19 | # We are probably an external project. Don't use @rpath in Mac builds' 20 | # install_name fields (AKA LC_ID_DYLIB in otool -l output). Populate with 21 | # an absolute path instead. This will let us actually find the library when 22 | # we use it as a CMake external project and don't fully install it to any 23 | # normal lib directory. 24 | message("libhandlegraph is root project or external_project") 25 | set (CMAKE_MACOSX_RPATH OFF) 26 | else() 27 | # We are probably an add_subdirectory. We will expect to be in the root 28 | # project's lib directory, so we do want to have our non-installed 29 | # install_name use @rpath. 30 | message("libhandlegraph is add_subdirectory project") 31 | set (CMAKE_MACOSX_RPATH ON) 32 | endif() 33 | 34 | # The install_name gets modified on installation to be this. 
35 | set (CMAKE_INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") 36 | 37 | add_library(handlegraph_objs OBJECT 38 | src/deletable_handle_graph.cpp 39 | src/handle_graph.cpp 40 | src/mutable_handle_graph.cpp 41 | src/path_metadata.cpp 42 | src/mutable_path_metadata.cpp 43 | src/path_handle_graph.cpp 44 | src/path_position_handle_graph.cpp 45 | src/mutable_path_handle_graph.cpp 46 | src/ranked_handle_graph.cpp 47 | src/serializable.cpp 48 | src/snarl_decomposition.cpp 49 | src/trivially_serializable.cpp 50 | src/types.cpp 51 | src/copy_graph.cpp 52 | src/append_graph.cpp 53 | src/are_equivalent.cpp 54 | src/find_tips.cpp 55 | src/topological_sort.cpp 56 | src/apply_orientations.cpp 57 | src/is_single_stranded.cpp 58 | src/count_walks.cpp 59 | src/eades_algorithm.cpp 60 | src/dagify.cpp 61 | src/strongly_connected_components.cpp 62 | src/find_shortest_paths.cpp 63 | src/dijkstra.cpp 64 | src/is_acyclic.cpp 65 | src/reverse_complement.cpp 66 | src/split_strands.cpp 67 | src/chop.cpp 68 | src/weakly_connected_components.cpp 69 | src/extend.cpp 70 | src/include/handlegraph/handle_graph.hpp 71 | src/include/handlegraph/mutable_handle_graph.hpp 72 | src/include/handlegraph/deletable_handle_graph.hpp 73 | src/include/handlegraph/path_handle_graph.hpp 74 | src/include/handlegraph/path_position_handle_graph.hpp 75 | src/include/handlegraph/mutable_path_handle_graph.hpp 76 | src/include/handlegraph/mutable_path_mutable_handle_graph.hpp 77 | src/include/handlegraph/mutable_path_deletable_handle_graph.hpp 78 | src/include/handlegraph/expanding_overlay_graph.hpp 79 | src/include/handlegraph/util.hpp 80 | src/include/handlegraph/types.hpp 81 | src/include/handlegraph/iteratee.hpp 82 | src/include/handlegraph/algorithms/copy_graph.hpp 83 | src/include/handlegraph/algorithms/append_graph.hpp 84 | src/include/handlegraph/algorithms/are_equivalent.hpp 85 | src/include/handlegraph/algorithms/find_tips.hpp 86 | src/include/handlegraph/algorithms/topological_sort.hpp 
87 | src/include/handlegraph/algorithms/apply_orientations.hpp 88 | src/include/handlegraph/algorithms/is_single_stranded.hpp 89 | src/include/handlegraph/algorithms/count_walks.hpp 90 | src/include/handlegraph/algorithms/eades_algorithm.hpp 91 | src/include/handlegraph/algorithms/dagify.hpp 92 | src/include/handlegraph/algorithms/strongly_connected_components.hpp 93 | src/include/handlegraph/algorithms/find_shortest_paths.hpp 94 | src/include/handlegraph/algorithms/dijkstra.hpp 95 | src/include/handlegraph/algorithms/reverse_complement.hpp 96 | src/include/handlegraph/algorithms/is_acyclic.hpp 97 | src/include/handlegraph/algorithms/split_strands.hpp 98 | src/include/handlegraph/algorithms/chop.hpp 99 | src/include/handlegraph/algorithms/weakly_connected_components.hpp 100 | src/include/handlegraph/algorithms/extend.hpp 101 | ) 102 | 103 | # Use the include directory when building the objects. 104 | # It can't be picked up via dependency by the other libraries even if it's public. 105 | target_include_directories(handlegraph_objs PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/include") 106 | 107 | # Build objects position-independent to allow a shared library 108 | set_target_properties(handlegraph_objs PROPERTIES POSITION_INDEPENDENT_CODE TRUE) 109 | 110 | # Make static and shared versions with the same base name. 111 | # Make sure to give them interface include directories that depending targets can use. 
112 | #add_library(handlegraph_shared SHARED $) 113 | #set_target_properties(handlegraph_shared PROPERTIES OUTPUT_NAME handlegraph) 114 | #target_include_directories(handlegraph_shared INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/src/include") 115 | add_library(handlegraph_static STATIC $) 116 | set_target_properties(handlegraph_static PROPERTIES OUTPUT_NAME handlegraph) 117 | target_include_directories(handlegraph_static INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/src/include") 118 | 119 | # Set up for installability 120 | #install(TARGETS handlegraph_shared handlegraph_static 121 | install(TARGETS handlegraph_static 122 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 123 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 124 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) 125 | install(DIRECTORY src/include/handlegraph 126 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} 127 | FILES_MATCHING PATTERN "*.hpp" 128 | ) 129 | -------------------------------------------------------------------------------- /build-tools/makeBinRelease: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Generate release hal2vg binary 3 | # Must have a static buildable hdf5 (ie not the one from apt) 4 | # Must be run after tree is tagged and pushed to master. 5 | # Use --keep to keep working directory for debugging. 
6 | 7 | mydir=$(dirname $(which $0)) 8 | source ${mydir}/releaseLib.sh 9 | 10 | keep=no 11 | if [ $1 = '--keep' ] ; then 12 | keep=yes 13 | fi 14 | set -beEu -o pipefail 15 | 16 | buildDir=$(realpath -m build) 17 | binBuildDir="${buildDir}/bin-tmp" 18 | 19 | set -x 20 | rm -rf ${binBuildDir} 21 | mkdir -p ${binBuildDir} 22 | cd ${binBuildDir} 23 | git clone https://github.com/ComparativeGenomicsToolkit/hal2vg.git 24 | cd hal2vg 25 | git fetch --tags origin 26 | 27 | REL_TAG=$(getLatestReleaseTag) 28 | git checkout "${REL_TAG}" 29 | git submodule update --init --recursive 30 | 31 | # todo: update / figure out / remove hack: 32 | cp ./build-tools/CMakeLists.txt ./deps/libbdsg-easy/deps/libhandlegraph/CMakeLists.txt 33 | 34 | if [ $(man gcc | grep nehalem | wc -l) -ge 1 ] 35 | then 36 | # attempt to increase portability by using older architecture 37 | # this make/sed/make thing is a hack to get around a linking error that just cropped up 38 | CFLAGS="-march=nehalem" CXXFLAGS="-march=nehalem" make static || true 39 | sed -i deps/libbdsg-easy/deps/libbdsg/Makefile -e "s/-lomp//g" 40 | CFLAGS="-march=nehalem" CXXFLAGS="-march=nehalem" make check-static 41 | else 42 | make static || true 43 | sed -i deps/libbdsg-easy/deps/libbdsg/Makefile -e "s/-lomp//g" 44 | make check-static 45 | fi 46 | 47 | cp hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions count-vg-hap-cov ${buildDir}/ 48 | 49 | -------------------------------------------------------------------------------- /build-tools/makeSrcRelease: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Generate release tar file of source 3 | # Must be run after tree is tagged and pushed to master. 4 | # Use --keep to keep working directory for debugging. 
mydir=$(dirname $(which $0))
source ${mydir}/releaseLib.sh

keep=no
# ${1:-} guards the test against an unset $1 (no-argument invocation used to
# trigger a "unary operator expected" error before set -e was active).
if [ "${1:-}" = '--keep' ] ; then
    keep=yes
fi
set -beEu -o pipefail

buildDir=$(realpath -m build)
srcBuildDir="${buildDir}/src-tmp"

set -x
rm -rf ${srcBuildDir}
mkdir -p ${srcBuildDir}
cd ${srcBuildDir}
git clone --recursive https://github.com/ComparativeGenomicsToolkit/hal2vg.git
cd hal2vg
git fetch --tags origin

REL_TAG=$(getLatestReleaseTag)
git checkout "${REL_TAG}"
git submodule update --init --recursive
# strip the submodule .git dirs so the tarball is a plain source tree
find deps -name ".git" -exec rm -Rf "{}" \;
cd ..
mv hal2vg hal2vg-${REL_TAG}
tar -czf ${buildDir}/hal2vg-${REL_TAG}.tar.gz hal2vg-${REL_TAG}
if [ "$keep" = "no" ] ; then
    rm -Rf ${srcBuildDir}
fi
--------------------------------------------------------------------------------
/build-tools/quayTagRelease:
--------------------------------------------------------------------------------
#!/bin/bash
# tag the docker image at quay.io corresponding to the release

set -x
set -beEu -o pipefail
mydir=$(dirname $(which $0))
source ${mydir}/releaseLib.sh

REL_TAG=$(getLatestReleaseTag)
REL_COMMIT=$(git rev-list -n 1 ${REL_TAG})

docker image tag ${dockname}:${REL_COMMIT} ${dockname}:${REL_TAG}
--------------------------------------------------------------------------------
/build-tools/releaseLib.sh:
--------------------------------------------------------------------------------
# definitions and functions for release bash programs

PYTHON=python3.6
PIP="${PYTHON} -m pip"

dockstore="quay.io/comparative-genomics-toolkit"
dockname=${dockstore}/hal2vg


# get the tag for the latest release, in the form v1.2.3, from git
getLatestReleaseTag() {
    git describe --tags $(git rev-list --tags --max-count=10) | egrep -e '^v[0-9]+\.[0-9]+\.[0-9]+$' | head -1
}
15 | -------------------------------------------------------------------------------- /count-vg-hap-cov.cpp: -------------------------------------------------------------------------------- 1 | // Count the number of bases that aren't in a given reference sample. 2 | // Print the table of results stratisfied by number of covering samples 3 | // Assume's current cactus convertion of Sample.Haplotype.Contig 4 | 5 | //#define debug 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "bdsg/packed_graph.hpp" 18 | #include "bdsg/hash_graph.hpp" 19 | 20 | using namespace std; 21 | using namespace handlegraph; 22 | using namespace bdsg; 23 | 24 | static unique_ptr load_graph(istream& graph_stream) { 25 | 26 | char magic_bytes[4]; 27 | graph_stream.read(magic_bytes, 4); 28 | uint32_t magic_number = ntohl(*((uint32_t*) magic_bytes)); 29 | graph_stream.clear(); 30 | graph_stream.seekg(0, ios::beg); 31 | 32 | PathHandleGraph* graph; 33 | if (magic_number == PackedGraph().get_magic_number()) { 34 | graph = new PackedGraph(); 35 | } else if (magic_number == HashGraph().get_magic_number()) { 36 | graph = new HashGraph(); 37 | } else { 38 | cerr << "Unable to parse input graph with magic number " << magic_number << endl; 39 | exit(1); 40 | } 41 | dynamic_cast(graph)->deserialize(graph_stream); 42 | 43 | return unique_ptr(graph); 44 | } 45 | 46 | void help(char** argv) { 47 | cerr << "usage: " << argv[0] << " [options] [graph] [graph] [...]" << endl 48 | << "Count nodes and bp in graph covered by different sample counts\n" 49 | << "Assumes SAMPLE.HAPLOTYPE.CONTIG path name format" << endl 50 | << endl 51 | << "options: " << endl 52 | << " -r, --reference Include counts of nodes that are not present in the given reference sample prefix" << endl 53 | << " -i, --ignore Completely ignore all paths with given prefix [default: _MINIGRAPH_]" << endl 54 | << " -t, --threads Number of threads 
[default: all]" << endl 55 | << " -s, --separator Use this separator for tokenizing path name. Haplotype key will be first 2 tokens (or all tokens if fewer than 2) [default=.]" << endl 56 | << " -p, --progress Print progress" << endl 57 | << endl; 58 | } 59 | 60 | // returns SAMPLE.HAPLOTYPE 61 | // todo: vg/bdsg in progress of adpoting conventions / api 62 | // to manage stuff like this -- should switch to using that 63 | const string& get_sample_name(const PathHandleGraph* graph, path_handle_t path_handle, 64 | unordered_map& name_map, 65 | char separator) { 66 | if (!name_map.count(path_handle)) { 67 | string path_name = graph->get_path_name(path_handle); 68 | string sample; 69 | int dots = 0; 70 | for (int64_t i = 0; i < path_name.length(); ++i) { 71 | if (path_name[i] == separator) { 72 | ++dots; 73 | } 74 | if (dots == 2) { 75 | break; 76 | } 77 | sample.push_back(path_name[i]); 78 | } 79 | name_map[path_handle] = sample; 80 | } 81 | return name_map.at(path_handle); 82 | } 83 | 84 | int main(int argc, char** argv) { 85 | 86 | string ref_sample; 87 | string ignore_sample = "_MINIGRAPH_"; 88 | char separator = '.'; 89 | bool progress = false; 90 | 91 | int c; 92 | optind = 1; 93 | while (true) { 94 | 95 | static const struct option long_options[] = { 96 | {"help", no_argument, 0, 'h'}, 97 | {"ref-sample", required_argument, 0, 'r'}, 98 | {"ignore", required_argument, 0, 'i'}, 99 | {"separator", required_argument, 0, 's'}, 100 | {"threads", required_argument, 0, 't'}, 101 | {"progress", no_argument, 0, 'p'}, 102 | {0, 0, 0, 0} 103 | }; 104 | 105 | int option_index = 0; 106 | 107 | c = getopt_long (argc, argv, "hr:s:i:t:p", 108 | long_options, &option_index); 109 | 110 | // Detect the end of the options. 
111 | if (c == -1) 112 | break; 113 | 114 | switch (c) 115 | { 116 | case 'r': 117 | ref_sample = optarg; 118 | break; 119 | case 'i': 120 | ignore_sample = optarg; 121 | break; 122 | case 's': 123 | assert(strlen(optarg) == 1); 124 | separator = optarg[0]; 125 | break; 126 | case 't': 127 | { 128 | int num_threads = stoi(optarg); 129 | if (num_threads <= 0) { 130 | cerr << "error:[count-vg-hap-depth] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; 131 | exit(1); 132 | } 133 | omp_set_num_threads(num_threads); 134 | break; 135 | } 136 | case 'p': 137 | progress = true; 138 | break; 139 | case 'h': 140 | case '?': 141 | /* getopt_long already printed an error message. */ 142 | help(argv); 143 | exit(1); 144 | break; 145 | default: 146 | abort (); 147 | } 148 | } 149 | 150 | if (argc <= 1) { 151 | help(argv); 152 | return 1; 153 | } 154 | 155 | // Parse the positional argument 156 | if (optind >= argc) { 157 | cerr << "[count-vg-hap-depth] error: too few arguments" << endl; 158 | help(argv); 159 | return 1; 160 | } 161 | 162 | // depth stats (one per thread) 163 | vector> depth_base_counts(get_thread_count()); 164 | vector> depth_nfree_base_counts(get_thread_count()); 165 | vector> depth_node_counts(get_thread_count()); 166 | vector> depth_base_counts_nonref(get_thread_count()); 167 | vector> depth_nfree_base_counts_nonref(get_thread_count()); 168 | vector> depth_node_counts_nonref(get_thread_count()); 169 | 170 | // do counts for each graph arg 171 | while(optind < argc) { 172 | 173 | string graph_path = argv[optind++]; 174 | ifstream graph_stream(graph_path); 175 | if (!graph_stream) { 176 | cerr << "[count-vg-hap-depth] error: Unable to open input graph " << graph_path << endl; 177 | return 1; 178 | } 179 | unique_ptr graph = load_graph(graph_stream); 180 | graph_stream.close(); 181 | if (progress) { 182 | cerr << "[count-vg-hap-depth]: Loaded graph" << endl; 183 | } 184 | 185 | // path handle to sample key (one per 
thread) 186 | vector> name_maps(get_thread_count()); 187 | 188 | if (progress) { 189 | cerr << "[count-vg-hap-depth]: Calculating coverage with " << depth_base_counts.size() << " threads" << endl; 190 | } 191 | 192 | graph->for_each_handle([&](handle_t handle) { 193 | int64_t t = omp_get_thread_num(); 194 | // collect all the samples that step on the node 195 | set sample_set; 196 | bool ref = false; 197 | graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle) { 198 | const string& sample_name = get_sample_name(graph.get(), graph->get_path_handle_of_step(step_handle), name_maps[t], separator); 199 | if (ignore_sample.empty() || sample_name.compare(0, ignore_sample.length(), ignore_sample) != 0) { 200 | if (!ref && sample_name.compare(0, ref_sample.length(), ref_sample) == 0) { 201 | ref = true; 202 | } 203 | sample_set.insert(sample_name); 204 | } 205 | }); 206 | // update the total coverage 207 | int64_t coverage = sample_set.size(); 208 | if (depth_base_counts[t].size() <= coverage) { 209 | depth_base_counts[t].resize(coverage + 1, 0); 210 | depth_node_counts[t].resize(coverage + 1, 0); 211 | depth_nfree_base_counts[t].resize(coverage + 1, 0); 212 | } 213 | int64_t node_len = graph->get_length(handle); 214 | int64_t num_ns = 0; 215 | string node_seq = graph->get_sequence(handle); 216 | for (auto c : node_seq) { 217 | if (c == 'N' || c == 'n') { 218 | ++num_ns; 219 | } 220 | } 221 | depth_base_counts[t][coverage] += node_len; 222 | depth_nfree_base_counts[t][coverage] += node_len - num_ns; 223 | depth_node_counts[t][coverage] += 1; 224 | 225 | if (!ref && !ref_sample.empty()) { 226 | // update the nonref coverage 227 | int64_t coverage = sample_set.size(); 228 | if (depth_base_counts_nonref[t].size() <= coverage) { 229 | depth_base_counts_nonref[t].resize(coverage + 1, 0); 230 | depth_node_counts_nonref[t].resize(coverage + 1, 0); 231 | depth_nfree_base_counts_nonref[t].resize(coverage + 1, 0); 232 | } 233 | depth_base_counts_nonref[t][coverage] += 
node_len; 234 | depth_nfree_base_counts_nonref[t][coverage] += node_len - num_ns; 235 | depth_node_counts_nonref[t][coverage] += 1; 236 | } 237 | }, 238 | true); 239 | } 240 | 241 | // make sure all tables have same size 242 | size_t max_size = 0; 243 | for (int64_t t = 0; t < get_thread_count(); ++t) { 244 | max_size = std::max(max_size, depth_base_counts[t].size()); 245 | max_size = std::max(max_size, depth_base_counts_nonref[t].size()); 246 | } 247 | for (int64_t t = 0; t < get_thread_count(); ++t) { 248 | if (depth_base_counts[t].size() < max_size) { 249 | depth_base_counts[t].resize(max_size, 0); 250 | depth_nfree_base_counts[t].resize(max_size, 0); 251 | depth_node_counts[t].resize(max_size, 0); 252 | } 253 | if (depth_base_counts_nonref[t].size() < max_size) { 254 | depth_base_counts_nonref[t].resize(max_size, 0); 255 | depth_nfree_base_counts_nonref[t].resize(max_size, 0); 256 | depth_node_counts_nonref[t].resize(max_size, 0); 257 | } 258 | assert(depth_base_counts[t].size() == max_size); 259 | assert(depth_nfree_base_counts[t].size() == max_size); 260 | assert(depth_node_counts[t].size() == max_size); 261 | assert(depth_base_counts_nonref[t].size() == max_size); 262 | assert(depth_nfree_base_counts_nonref[t].size() == max_size); 263 | assert(depth_node_counts_nonref[t].size() == max_size); 264 | } 265 | 266 | if (progress) { 267 | cerr << "[count-vg-hap-depth]: Merging data from different threads" << endl; 268 | } 269 | 270 | // merge up the threads 271 | for (int64_t t = 1; t < get_thread_count(); ++t) { 272 | for (int64_t coverage = 0; coverage < depth_base_counts[t].size(); ++coverage) { 273 | assert(depth_base_counts[0].size() > coverage); 274 | depth_base_counts[0][coverage] += depth_base_counts[t][coverage]; 275 | depth_nfree_base_counts[0][coverage] += depth_nfree_base_counts[t][coverage]; 276 | depth_node_counts[0][coverage] += depth_node_counts[t][coverage]; 277 | 278 | if (!ref_sample.empty()) { 279 | assert(depth_base_counts_nonref[0].size() > 
coverage); 280 | depth_base_counts_nonref[0][coverage] += depth_base_counts_nonref[t][coverage]; 281 | depth_nfree_base_counts_nonref[0][coverage] += depth_nfree_base_counts_nonref[t][coverage]; 282 | depth_node_counts_nonref[0][coverage] += depth_node_counts_nonref[t][coverage]; 283 | } 284 | } 285 | } 286 | 287 | // there's almost certainly an stl one-line for this.. oh well 288 | function(const vector&)> get_cumul = [](const vector& v) { 289 | int64_t tot = 0; 290 | vector cumul(v.size(), 0); 291 | for (int64_t i = 0; i < v.size(); ++i) { 292 | tot += v[i]; 293 | cumul[i] = tot; 294 | } 295 | return cumul; 296 | }; 297 | function(const vector&)> get_lumuc = [](const vector& v) { 298 | int64_t tot = 0; 299 | vector cumul(v.size(), 0); 300 | for (int64_t i = v.size() - 1; i >= 0; --i) { 301 | tot += v[i]; 302 | cumul[i] = tot; 303 | } 304 | return cumul; 305 | }; 306 | 307 | // keep cumulative counts while we're at it 308 | // cumulate from 0 309 | vector node_counts_cumul = get_cumul(depth_node_counts[0]); 310 | vector base_counts_cumul = get_cumul(depth_base_counts[0]); 311 | vector nfree_base_counts_cumul = get_cumul(depth_nfree_base_counts[0]); 312 | vector node_counts_nonref_cumul = get_cumul(depth_node_counts_nonref[0]); 313 | vector base_counts_nonref_cumul = get_cumul(depth_base_counts_nonref[0]); 314 | vector nfree_base_counts_nonref_cumul = get_cumul(depth_nfree_base_counts_nonref[0]); 315 | 316 | //cumulate from end 317 | vector node_counts_lumuc = get_lumuc(depth_node_counts[0]); 318 | vector base_counts_lumuc = get_lumuc(depth_base_counts[0]); 319 | vector nfree_base_counts_lumuc = get_lumuc(depth_nfree_base_counts[0]); 320 | vector node_counts_nonref_lumuc = get_lumuc(depth_node_counts_nonref[0]); 321 | vector base_counts_nonref_lumuc = get_lumuc(depth_base_counts_nonref[0]); 322 | vector nfree_base_counts_nonref_lumuc = get_lumuc(depth_nfree_base_counts_nonref[0]); 323 | 324 | // print the results 325 | cout << "hap-depth" 326 | << "\t" << "nodes" 
<< "\t" << "bases" << "\t" << "non-n-bases" 327 | << "\t" << "nodes-cumul" << "\t" <<"bases-cumul" << "\t" << "non-n-bases-cumul" 328 | << "\t" << "nodes-cumul-rev" << "\t" << "bases-cumul-rev" << "\t" << "non-n-bases-cumul-rev"; 329 | if (!ref_sample.empty()) { 330 | cout << "\t" << "nodes-nonref" << "\t" << "bases-nonref" << "\t" << "non-n-bases-nonref" 331 | << "\t" << "nodes-cumul-nonref" << "\t" << "bases-cumul-nonref" << "\t" << "non-n-bases-cumul-nonref" 332 | << "\t" << "nodes-cumul-rev-nonref" << "\t" << "bases-cumul-rev-nonref" << "\t" << "non-n-bases-cumul-rev-nonref"; 333 | } 334 | cout << endl; 335 | 336 | for (int64_t coverage = 0; coverage < depth_base_counts[0].size(); ++coverage) { 337 | cout << coverage 338 | << "\t" << depth_node_counts[0][coverage] << "\t" << depth_base_counts[0][coverage] << "\t" << depth_nfree_base_counts[0][coverage] 339 | << "\t" << node_counts_cumul[coverage] << "\t" << base_counts_cumul[coverage] << "\t" << nfree_base_counts_cumul[coverage] 340 | << "\t" << node_counts_lumuc[coverage] << "\t" << base_counts_lumuc[coverage] << "\t" << nfree_base_counts_lumuc[coverage]; 341 | if (!ref_sample.empty()) { 342 | cout << "\t" << depth_node_counts_nonref[0][coverage] << "\t" << depth_base_counts_nonref[0][coverage] << "\t" << depth_nfree_base_counts_nonref[0][coverage] 343 | << "\t" << node_counts_nonref_cumul[coverage] << "\t" << base_counts_nonref_cumul[coverage] << "\t" << nfree_base_counts_nonref_cumul[coverage] 344 | << "\t" << node_counts_nonref_lumuc[coverage] << "\t" << base_counts_nonref_lumuc[coverage] << "\t" << nfree_base_counts_nonref_lumuc[coverage]; 345 | } 346 | cout << "\n"; 347 | } 348 | 349 | return 0; 350 | } 351 | -------------------------------------------------------------------------------- /filter-paf-deletions.cpp: -------------------------------------------------------------------------------- 1 | // Filter big deletions from PAF. These can sometimes arise from minigraph split alignments. 
They are rare, but can really 2 | // mess up topology of graph if even one gets into cactus. 3 | 4 | // 1) Estimate anchors along reference path for every node in input graph 5 | // 2) Scan every query in PAF in order, and look at target blocks 6 | // 3) Use table from 1) in order to estimate the distances between target blocks 7 | // 4) If two conesecutive target blocks span too big of a distance, delete the smaller block 8 | 9 | //#define debug 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "bdsg/packed_graph.hpp" 22 | #include "bdsg/hash_graph.hpp" 23 | 24 | #include "IntervalTree.h" 25 | #include "paf.hpp" 26 | 27 | using namespace std; 28 | using namespace handlegraph; 29 | using namespace bdsg; 30 | 31 | struct Anchor { 32 | path_handle_t path_handle; 33 | int64_t max_offset; 34 | int64_t min_offset; 35 | }; 36 | 37 | struct PafDelta { 38 | int64_t delta; 39 | int64_t ref_delta; 40 | int64_t query_delta; 41 | int64_t ref_overlap_size; 42 | int64_t prev_ref_start; 43 | int64_t prev_ref_end; 44 | int64_t cur_ref_start; 45 | int64_t cur_ref_end; 46 | int64_t query_len; 47 | }; 48 | 49 | static unique_ptr load_graph(istream& graph_stream); 50 | static pair, unordered_map> load_trans(const string& trans_path); 51 | static unordered_map index_graph(const PathHandleGraph* graph, 52 | const string& ref_prefix); 53 | static unordered_map> index_deletions(const PathHandleGraph* graph, const unordered_map& index); 54 | static pair, unordered_map> load_paf(ifstream& paf_file); 55 | static int64_t for_each_query_block(const vector& paf_lines, const vector& masking, 56 | function visit_block); 57 | static int64_t check_delta(int64_t max_deletion_threshold, int64_t max_insertion_threshold, const PafDelta& paf_delta, double overlap_threshold, 58 | double deletion_size_threshold); 59 | static PafDelta get_delta(path_handle_t ref_path, const PafLine& prev_paf, const PafLine& 
cur_paf, 60 | const unordered_map& mg_to_vg, const unordered_map& ref_index, 61 | const unordered_map>& ref_deletions); 62 | 63 | void help(char** argv) { 64 | cerr << "usage: " << argv[0] << " [options] -d \n" << endl 65 | << "Use distances from graph to filter out implied deletions from PAF (cigars not considered, only blocks)" << endl 66 | << " : minigraph as obtained from vg convert -g graph.gfa" << endl 67 | << " : node translation from vg convert -g -T" << endl 68 | << " : paf alignment from cactus-graphmap" << endl 69 | << endl 70 | << "options: " << endl 71 | << " -d --del-threshold F Only remove deletions greater than this. if < 1, then interpreted as fraction of reference path size" << endl 72 | << " -i, --ins-threshold F Like , but applied to insertions instead of deletions [-1]" << endl 73 | << " -m, --max-filter F If F* matches need to be pulled apart to resolve a single deletion, just leave it alone [1]" << endl 74 | << " -s, --del-size-threshold F Remove any deletion if the source contig size is < F* [-1: disabled]" << endl 75 | << " -r, --ref-prefix STR Only consider paths whose names start with STR" << endl 76 | << " -p, --progress Print progress" << endl 77 | << " -o, --filter-off-ref Filter mappings that aren't in dominant ref" << endl 78 | << " -v, --verbose Print deletions" << endl 79 | << " -t, --threads N Number of threads to use (used only for indexing graph) [default: all available]" << endl 80 | << endl; 81 | } 82 | 83 | int main(int argc, char** argv) { 84 | 85 | string ref_prefix; 86 | bool progress = false; 87 | bool verbose = false; 88 | bool keep_off_ref = true; 89 | // only filter deletions that don't overlap an existing deletion by at least this much 90 | // (doesn't seem to a factor -- most big deletions not in minigraph) 91 | double overlap_threshold = 0.5; 92 | double filter_threshold = 1.0; 93 | double max_insertion = -1.0; 94 | double max_deletion = -1.0; 95 | double deletion_size_threshold = -1.0; 96 | int c; 97 | optind = 1; 
98 | while (true) { 99 | 100 | static const struct option long_options[] = { 101 | {"del-threshold", required_argument, 0, 'd'}, 102 | {"ins-threshold", required_argument, 0, 'i'}, 103 | {"max-filter", required_argument, 0, 'm'}, 104 | {"del-size-threshold", required_argument, 0, 's'}, 105 | {"ref-prefix", required_argument, 0, 'r'}, 106 | {"filter-off-ref", no_argument, 0, 'o'}, 107 | {"help", no_argument, 0, 'h'}, 108 | {"progress", no_argument, 0, 'p'}, 109 | {"verbose", no_argument, 0, 'v'}, 110 | {"threads", required_argument, 0, 't'}, 111 | {0, 0, 0, 0} 112 | }; 113 | 114 | int option_index = 0; 115 | 116 | c = getopt_long (argc, argv, "d:i:m:s:r:khpvt:", 117 | long_options, &option_index); 118 | 119 | // Detect the end of the options. 120 | if (c == -1) 121 | break; 122 | 123 | switch (c) 124 | { 125 | case 'd': 126 | max_deletion = stof(optarg); 127 | break; 128 | case 'i': 129 | max_insertion = stof(optarg); 130 | break; 131 | case 'm': 132 | filter_threshold = stof(optarg); 133 | break; 134 | case 's': 135 | deletion_size_threshold = stof(optarg); 136 | break; 137 | case 'r': 138 | ref_prefix = optarg; 139 | break; 140 | case 'o': 141 | keep_off_ref = false; 142 | break; 143 | case 'v': 144 | verbose = true; 145 | break; 146 | case 'p': 147 | progress = true; 148 | break; 149 | case 't': 150 | { 151 | int num_threads = stoi(optarg); 152 | if (num_threads <= 0) { 153 | cerr << "[filter-paf-deletions] error: Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; 154 | exit(1); 155 | } 156 | omp_set_num_threads(num_threads); 157 | break; 158 | } 159 | case 'h': 160 | case '?': 161 | /* getopt_long already printed an error message. 
*/ 162 | help(argv); 163 | exit(1); 164 | break; 165 | default: 166 | abort (); 167 | } 168 | } 169 | 170 | if (argc <= 3) { 171 | cerr << "[filter-paf-deletions] error: too few arguments\n" << endl; 172 | help(argv); 173 | return 1; 174 | } 175 | 176 | // Parse the positional argument 177 | if (optind >= argc) { 178 | cerr << "[filter-paf-deletions] error: too few arguments\n" << endl; 179 | help(argv); 180 | return 1; 181 | } 182 | 183 | if (optind != argc - 3) { 184 | cerr << "[filter-paf-deletions] error: too many arguments\n" << endl; 185 | help(argv); 186 | return 1; 187 | } 188 | 189 | if (max_deletion <= 0 && max_insertion <= 0) { 190 | cerr << "[filter-paf-deletions] error: at least one of -d or -i must be set to positive value" << endl; 191 | return 1; 192 | } 193 | 194 | string graph_path = argv[optind++]; 195 | string trans_path = argv[optind++]; 196 | string paf_path = argv[optind++]; 197 | 198 | // load the graph 199 | ifstream graph_stream(graph_path); 200 | if (!graph_stream) { 201 | cerr << "[filter-paf-deletions] error: Unable to open input graph " << graph_path << endl; 202 | return 1; 203 | } 204 | unique_ptr graph = load_graph(graph_stream); 205 | graph_stream.close(); 206 | if (progress) { 207 | cerr << "[filter-paf-deletions]: Loaded graph" << endl; 208 | } 209 | 210 | // load the minigraph <-> vg id translation table (because our PAF is expressed in terms of the minigraph 211 | // ids but we lose them when converting to vg.) 212 | unordered_map mg_to_vg; 213 | unordered_map vg_to_mg; 214 | std::tie(mg_to_vg, vg_to_mg) = load_trans(trans_path); 215 | 216 | if (progress) { 217 | cerr << "[filter-paf-deletions]: Loaded " << mg_to_vg.size() << " translations." 
<< endl; 218 | } 219 | 220 | // open the paf 221 | ifstream paf_file(paf_path); 222 | if (!paf_file) { 223 | cerr << "[filter-paf-deletions] error: Unable to open PAF" << endl; 224 | return 1; 225 | } 226 | 227 | // load the paf into memory 228 | vector paf_lines; 229 | unordered_map orig_to_sorted; 230 | std::tie(paf_lines, orig_to_sorted) = load_paf(paf_file); 231 | if (progress) { 232 | cerr << "[filter-paf-deletions]: Loaded " << paf_lines.size() << " paf lines" << endl; 233 | } 234 | 235 | // index the minigraph 236 | // this maps each node in the graph to a (maximal) reference interval 237 | unordered_map ref_index = index_graph(graph.get(), ref_prefix); 238 | if (progress) { 239 | cerr << "[filter-paf-deletions]: Created reference path index" << endl; 240 | } 241 | 242 | unordered_map ref_path_to_length; 243 | if ((max_deletion > 0 && max_deletion <= 1.) || (max_insertion > 0. && max_insertion <= 1.)) { 244 | graph->for_each_path_handle([&](path_handle_t path_handle) { 245 | int64_t len = 0; 246 | graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { 247 | len += graph->get_length(graph->get_handle_of_step(step_handle)); 248 | }); 249 | ref_path_to_length[path_handle] = len; 250 | }); 251 | if (progress) { 252 | cerr << "[filter-paf-deletions]: Computed lengths for " << ref_path_to_length.size() << " reference paths" << endl; 253 | } 254 | } 255 | 256 | unordered_map> ref_deletions = index_deletions(graph.get(), ref_index); 257 | if (progress) { 258 | cerr << "[filter-paf-deletions]: Created reference deletion index" << endl; 259 | } 260 | 261 | #ifdef debug 262 | for (auto fam : ref_index) { 263 | cerr << fam.first << " -> " << graph->get_path_name(fam.second.path_handle) << " " << fam.second.min_offset << " " << fam.second.max_offset << endl; 264 | } 265 | #endif 266 | 267 | // we have everything needed to filter the paf 268 | vector filtered_lines(paf_lines.size(), false); 269 | int64_t filtered_line_total = 0; 270 | int64_t 
filtered_line_it = 0; 271 | int64_t filtered_match_total = 0; 272 | vector off_ref_filtered_lines(paf_lines.size(), false); 273 | int64_t off_ref_filtered_line_it = 0; 274 | int64_t off_ref_filtered_match_total = 0; 275 | int64_t iteration = 0; 276 | 277 | do { 278 | filtered_line_it = 0; 279 | for_each_query_block(paf_lines, filtered_lines, [&](int64_t block_start, int64_t block_end) { 280 | assert(!filtered_lines[block_start] && !filtered_lines[block_end]); 281 | // get some stats about the block 282 | unordered_map ref_path_sizes; 283 | int64_t total_matches = 0; 284 | for (int64_t i = block_start; i <= block_end; ++i) { 285 | if (!filtered_lines[i]) { 286 | const PafLine& paf = paf_lines[i]; 287 | nid_t target_id = mg_to_vg.at(paf.target_name); 288 | const Anchor& anchor = ref_index.at(target_id); 289 | ref_path_sizes[anchor.path_handle] += paf.num_matching; 290 | total_matches += paf.num_matching; 291 | } else { 292 | assert(iteration > 0); 293 | } 294 | } 295 | if (total_matches == 0) { 296 | // whole block was filtered, nothing to be done 297 | return; 298 | } 299 | // find the number one reference path by match coverage 300 | // todo: what about tie? 301 | path_handle_t ref_path; 302 | int64_t ref_path_size = -1; 303 | for (const auto& rps : ref_path_sizes) { 304 | if (rps.second > ref_path_size) { 305 | ref_path_size = rps.second; 306 | ref_path = rps.first; 307 | } 308 | } 309 | 310 | // support fracitonal thresholds which apply to path length 311 | int64_t max_deletion_threshold = max_deletion; 312 | int64_t max_insertion_threshold = max_insertion; 313 | if (max_deletion > 0. && max_deletion <= 1.) { 314 | max_deletion_threshold = max_deletion * ref_path_to_length.at(ref_path); 315 | } 316 | if (max_insertion > 0. && max_insertion <= 1.) 
{ 317 | max_insertion_threshold = max_insertion * ref_path_to_length.at(ref_path); 318 | } 319 | 320 | // mask out everything off this path 321 | // get rid of all off-reference path mappings right away 322 | // note: these are flagged to be ignored but not actually filtered 323 | // unless keep_off_ref is set to false, then they are removed 324 | int64_t off_ref_total = 0; 325 | int64_t off_ref_match_total = 0; 326 | for (int64_t i = block_start; i <= block_end; ++i) { 327 | if (!filtered_lines[i]) { 328 | nid_t cur_target_id = mg_to_vg.at(paf_lines[i].target_name); 329 | const Anchor& cur_anchor = ref_index.at(cur_target_id); 330 | if (cur_anchor.path_handle != ref_path) { 331 | off_ref_filtered_lines[i] = true; 332 | ++off_ref_total; 333 | off_ref_match_total += paf_lines[i].num_matching; 334 | } 335 | } 336 | } 337 | off_ref_filtered_line_it += off_ref_total; 338 | off_ref_filtered_match_total += off_ref_match_total; 339 | if (!keep_off_ref && verbose && off_ref_total > 0) { 340 | cerr << "[filter-paf-deletions]: filtered " << off_ref_total << " lines with " << off_ref_match_total << " bases " 341 | << " because they did not map to reference sequence " << graph->get_path_name(ref_path) << " all in block " 342 | << "\n I=" << block_start <<": " << paf_lines[block_start] 343 | << "\n J=" << block_end << ": " << paf_lines[block_end] << endl << endl; 344 | } 345 | 346 | // try to find a gap that exceeds the length 347 | int64_t prev_idx = -1; 348 | // these are the boundaries of the deletions in the block 349 | // the deletion is between cut_point[i] and cut_point[i] - 1 350 | vector cut_points; 351 | vector cut_sizes; 352 | for (int64_t i = block_start; i <= block_end; ++i) { 353 | if (filtered_lines[i] || off_ref_filtered_lines[i]) { 354 | continue; 355 | } 356 | if (prev_idx == -1) { 357 | prev_idx = i; 358 | continue; 359 | } 360 | // if we got this far that means we're on the path and we have a prev on the path too 361 | // do a rough delta check 362 | 
assert(prev_idx < i); 363 | const PafLine& cur_paf = paf_lines[i]; 364 | const PafLine& prev_paf = paf_lines[prev_idx]; 365 | PafDelta paf_delta = get_delta(ref_path, prev_paf, cur_paf, mg_to_vg, ref_index, ref_deletions); 366 | 367 | int64_t checked_delta = check_delta(max_deletion_threshold, max_insertion_threshold, paf_delta, overlap_threshold, deletion_size_threshold); 368 | 369 | if (checked_delta != 0) { 370 | if (verbose) { 371 | cerr << "[filter-paf-deletions]: detected " << (checked_delta > 0 ? "deletion" : "insertion") 372 | << " of size " << (int64_t)abs(paf_delta.delta) << " with overlap " << paf_delta.ref_overlap_size 373 | << " on ref path " << graph->get_path_name(ref_path) << " with cur anchor (" 374 | << paf_delta.cur_ref_start << ", " << paf_delta.cur_ref_end << ") and prev anchor (" << paf_delta.prev_ref_start << ", " 375 | << paf_delta.prev_ref_end << ") and threshold " << max_deletion_threshold 376 | << " on following paf line:\n I=" << (prev_idx) <<": " << prev_paf 377 | << "\n J=" << i << ": " << cur_paf << endl << endl; 378 | } 379 | cut_points.push_back(i); 380 | cut_sizes.push_back(paf_delta.delta); 381 | } 382 | 383 | prev_idx = i; 384 | } 385 | 386 | // greedy heuristic: for every deletion cut point, we try scanning forward and backward to find the minimum 387 | // resolving block of lines whose removal solves the deletion 388 | for (int64_t j = 0; j < cut_points.size(); ++j) { 389 | if (filtered_lines[cut_points[j]]) { 390 | continue; 391 | } 392 | // go backward 393 | int64_t backward_matches = 0; 394 | int64_t backward_candidate = -1; // last *unfiltered* line scanning left 395 | int64_t prev_idx = -1; 396 | for (int64_t k = cut_points[j] - 1; k >= block_start && backward_candidate == -1; --k) { 397 | if (!filtered_lines[k] && !off_ref_filtered_lines[k]) { 398 | if (prev_idx == -1) { 399 | prev_idx = k; 400 | } 401 | const PafLine& prev_paf = paf_lines[k]; 402 | const PafLine& cur_paf = paf_lines[cut_points[j]]; 403 | PafDelta 
paf_delta = get_delta(ref_path, prev_paf, cur_paf, mg_to_vg, ref_index, ref_deletions); 404 | int64_t checked_delta = check_delta(max_deletion_threshold, max_insertion_threshold, paf_delta, overlap_threshold, 405 | deletion_size_threshold); 406 | if (checked_delta == 0) { 407 | backward_candidate = k; 408 | } else { 409 | backward_matches += paf_lines[k].num_matching; 410 | } 411 | } 412 | } 413 | if (backward_candidate == -1) { 414 | // need to delete block_start too 415 | backward_candidate = block_start - 1; 416 | } 417 | 418 | // go forward 419 | int64_t forward_matches = 0; 420 | int64_t forward_candidate = -1; // last *unfiltered* line scanning right 421 | for (int64_t k = cut_points[j]; k <= block_end && forward_candidate == -1 && prev_idx != -1; ++k) { 422 | if (!filtered_lines[k] && !off_ref_filtered_lines[k]) { 423 | const PafLine& prev_paf = paf_lines[prev_idx]; 424 | const PafLine& cur_paf = paf_lines[k]; 425 | int64_t max_deletion_threshold = max_deletion; 426 | int64_t max_insertion_threshold = max_insertion; 427 | PafDelta paf_delta = get_delta(ref_path, prev_paf, cur_paf, mg_to_vg, ref_index, ref_deletions); 428 | int64_t checked_delta = check_delta(max_deletion_threshold, max_insertion_threshold, paf_delta, overlap_threshold, 429 | deletion_size_threshold); 430 | if (checked_delta == 0) { 431 | forward_candidate = k; 432 | } else { 433 | forward_matches += paf_lines[k].num_matching; 434 | } 435 | } 436 | } 437 | if (forward_candidate == -1) { 438 | // need to delete block_end too 439 | forward_candidate = block_end + 1; 440 | } 441 | 442 | assert(backward_candidate >= block_start - 1 && forward_candidate <= block_end + 1); 443 | 444 | int64_t min_segment_start = -1; 445 | int64_t min_segment_end = -1; 446 | int64_t min_segment_matches = -1; 447 | if (backward_matches < forward_matches) { 448 | min_segment_start = backward_candidate + 1; // want to filter right of candidate (not include) 449 | min_segment_end = cut_points[j] - 1; 450 | 
min_segment_matches = backward_matches; 451 | } else { 452 | min_segment_start = cut_points[j]; 453 | min_segment_end = forward_candidate - 1; // want to filter left of candidate (not include) 454 | min_segment_matches = forward_matches; 455 | } 456 | 457 | int64_t max_matches_deleted = filter_threshold * cut_sizes[j]; 458 | if (j == 0) { 459 | if (min_segment_end < min_segment_start) { 460 | cerr << "ms me " << min_segment_start << " " << min_segment_end << " bm fm " << backward_matches << " " << forward_matches 461 | << " bc fc " << backward_candidate << " " << forward_candidate << endl; 462 | } 463 | assert(min_segment_start <= min_segment_end); 464 | } 465 | if (min_segment_matches > 0) { 466 | 467 | if (min_segment_matches <= max_matches_deleted) { 468 | int64_t lines_in_segment = 0; 469 | for (int64_t k = min_segment_start; k <= min_segment_end; ++k) { 470 | if (!filtered_lines[k]) { 471 | filtered_lines[k] = true; 472 | ++filtered_line_it; 473 | filtered_match_total += paf_lines[k].num_matching; 474 | ++lines_in_segment; 475 | } 476 | } 477 | 478 | if (verbose) { 479 | cerr << "[filter-paf-deletions]: filtering " << lines_in_segment << " PAF lines between (inclusively)\n I=" 480 | << min_segment_start << ": " << paf_lines[min_segment_start] 481 | << "\n J=" << min_segment_end << ": " << paf_lines[min_segment_end] 482 | << "\nfor a total of " << min_segment_matches << " matches" << endl << endl; 483 | } 484 | } else { 485 | if (verbose) { 486 | cerr << "[filter-paf-deletions]: leaving in PAF lines between (inclusively)\n I=" 487 | << min_segment_start << ": " << paf_lines[min_segment_start] 488 | << "\n J=" << min_segment_end << ": " << paf_lines[min_segment_end] 489 | << "\nfor a total of " << min_segment_matches << " matches, which exceeds deletion threshold of " << max_matches_deleted 490 | << endl << endl; 491 | } 492 | 493 | } 494 | } 495 | } 496 | }); 497 | if (!keep_off_ref) { 498 | filtered_line_it += off_ref_filtered_line_it; 499 | } 500 | 501 | if 
(progress) { 502 | cerr << "[filter-paf-deletions]: Iteration " << iteration << ": Found " << filtered_line_it << " lines to filter" << endl; 503 | } 504 | ++iteration; 505 | filtered_line_total += filtered_line_it; 506 | if (!keep_off_ref) { 507 | filtered_line_total += off_ref_filtered_line_it; 508 | } 509 | } while (filtered_line_it > 0); 510 | 511 | if (!keep_off_ref) { 512 | filtered_match_total += off_ref_filtered_match_total; 513 | } 514 | if (progress) { 515 | cerr << "[filter-paf-deletions]: Filtering out " << filtered_line_total << " paf lines totaling " << filtered_match_total << " matches" << endl; 516 | } 517 | 518 | // output the unfiltered lines 519 | paf_file.clear(); 520 | paf_file.seekg(0, ios::beg) ; 521 | string buffer; 522 | for (int64_t line_no = 0; line_no < filtered_lines.size(); ++line_no) { 523 | int64_t sorted_line_no = orig_to_sorted.at(line_no); 524 | const auto& ret = getline(paf_file, buffer); 525 | assert(ret); 526 | 527 | // sanity check: 528 | PafLine paf_line = parse_paf_line(buffer); 529 | assert(paf_line.query_name == paf_lines[sorted_line_no].query_name); 530 | assert(paf_line.query_start == paf_lines[sorted_line_no].query_start); 531 | assert(paf_line.query_end == paf_lines[sorted_line_no].query_end); 532 | assert(paf_line.target_name == paf_lines[sorted_line_no].target_name); 533 | assert(paf_line.target_start == paf_lines[sorted_line_no].target_start); 534 | assert(paf_line.target_end == paf_lines[sorted_line_no].target_end); 535 | 536 | if (filtered_lines[sorted_line_no] == false && (keep_off_ref || off_ref_filtered_lines[sorted_line_no] == false)) { 537 | cout << buffer << "\n"; 538 | } 539 | } 540 | cout << flush; 541 | 542 | return 0; 543 | } 544 | 545 | static string strip_prefix(const string& name) { 546 | if (name.compare(0, 3, "id=") == 0) { 547 | size_t p = name.find('|', 3); 548 | assert(p != string::npos); 549 | return name.substr(p + 1); 550 | } 551 | return name; 552 | } 553 | 554 | unordered_map 
index_graph(const PathHandleGraph* graph, 555 | const string& ref_prefix) { 556 | 557 | // start by making a path position index 558 | // minigraph assumption: no more than one path per handle! 559 | unordered_map position_index; 560 | graph->for_each_path_handle([&](path_handle_t path_handle) { 561 | if (graph->get_path_name(path_handle).compare(0, ref_prefix.length(), ref_prefix) == 0) { 562 | size_t offset = 0; 563 | graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { 564 | handle_t handle = graph->get_handle_of_step(step_handle); 565 | size_t len = graph->get_length(handle); 566 | assert(len > 0); 567 | assert(!position_index.count(handle)); 568 | position_index[handle] = offset; 569 | position_index[graph->flip(handle)] = offset + len - 1; 570 | offset += len; 571 | }); 572 | } 573 | }); 574 | 575 | if (position_index.empty()) { 576 | cerr << "[filter-paf-deletions] error: no reference path found" << endl; 577 | exit(0); 578 | } 579 | 580 | vector> thread_results(get_thread_count()); 581 | 582 | // really slow brute-force relies on minigraph not having too many nodes 583 | graph->for_each_handle([&](handle_t handle) { 584 | unordered_map& ref_index = thread_results[omp_get_thread_num()]; 585 | 586 | // find all reference nodes that are connected via BFS 587 | unordered_set context; 588 | unordered_set ref_handles; 589 | vector cur_handles = {handle}; 590 | while (!cur_handles.empty()) { 591 | vector next_handles; 592 | for (auto& h : cur_handles) { 593 | if (!context.count(h)) { 594 | context.insert(h); 595 | if (position_index.count(h)) { 596 | // dead-end on reference 597 | ref_handles.insert(h); 598 | } else { 599 | graph->follow_edges(h, false, [&](handle_t n) { 600 | next_handles.push_back(n); 601 | }); 602 | graph->follow_edges(h, true, [&](handle_t p) { 603 | next_handles.push_back(p); 604 | }); 605 | } 606 | } 607 | } 608 | cur_handles = std::move(next_handles); 609 | } 610 | 611 | // update the index with reference offsets 612 | 
unordered_set ref_path_set; 613 | int64_t min_ref_offset = numeric_limits::max(); 614 | int64_t max_ref_offset = -1; 615 | for (handle_t ref_handle : ref_handles) { 616 | vector steps = graph->steps_of_handle(ref_handle); 617 | assert(steps.size() == 1); 618 | path_handle_t ref_path_handle = graph->get_path_handle_of_step(steps.back()); 619 | ref_path_set.insert(ref_path_handle); 620 | // assumption: only one reference path in component 621 | // (fair for minigraph, but may need to do better than prefix for path selection) 622 | assert(ref_path_set.size() == 1); 623 | int64_t ref_offset = position_index.at(ref_handle); 624 | int64_t ref_offset_rev = position_index.at(graph->flip(ref_handle)); 625 | min_ref_offset = std::min(min_ref_offset, min(ref_offset, ref_offset_rev)); 626 | max_ref_offset = std::max(max_ref_offset, max(ref_offset, ref_offset_rev)); 627 | assert(max_ref_offset >= min_ref_offset); 628 | } 629 | if (!ref_path_set.empty()) { 630 | assert(ref_path_set.size() == 1); 631 | Anchor& anchor = ref_index[graph->get_id(handle)]; 632 | anchor.path_handle = *ref_path_set.begin(); 633 | anchor.min_offset = min_ref_offset; 634 | anchor.max_offset = max_ref_offset; 635 | assert(anchor.max_offset >= anchor.min_offset); 636 | } 637 | }, true); 638 | 639 | // merge up the indexes 640 | for (size_t i = 1; i < thread_results.size(); ++i) { 641 | for (const auto& id_anchor : thread_results[i]) { 642 | thread_results[0][id_anchor.first] = id_anchor.second; 643 | } 644 | thread_results[i].clear(); 645 | } 646 | 647 | return thread_results[0]; 648 | } 649 | 650 | pair, unordered_map> load_paf(ifstream& paf_file) { 651 | 652 | vector> numbered_lines; 653 | string buffer; 654 | int64_t line_no = 0; 655 | while (getline(paf_file, buffer)) { 656 | PafLine paf_line = parse_paf_line(buffer); 657 | // dont use this 658 | paf_line.cigar = ""; 659 | numbered_lines.push_back(make_pair(line_no++, paf_line)); 660 | } 661 | std::sort(numbered_lines.begin(), numbered_lines.end(), 
[&](const pair& p1, const pair& p2) { 662 | return p1.second.query_name < p2.second.query_name || 663 | (p1.second.query_name == p2.second.query_name && p1.second.query_start < p2.second.query_start); 664 | }); 665 | 666 | vector paf_lines; 667 | unordered_map orig_to_sorted; 668 | for (int64_t i = 0; i < numbered_lines.size(); ++i) { 669 | paf_lines.push_back(numbered_lines[i].second); 670 | orig_to_sorted[numbered_lines[i].first] = i; 671 | } 672 | 673 | return make_pair(paf_lines, orig_to_sorted); 674 | } 675 | 676 | int64_t for_each_query_block(const vector& paf_lines, const vector& filtered_lines, 677 | function visit_block) { 678 | if (paf_lines.empty()) { 679 | assert(false); 680 | } 681 | int64_t block_start = -1; 682 | int64_t block_end = -1; 683 | string prev_query; 684 | int64_t num_visits = 0; 685 | for (int64_t i = 0; i < paf_lines.size(); ++i) { 686 | if (filtered_lines[i]) { 687 | continue; 688 | } 689 | const PafLine& paf = paf_lines[i]; 690 | if (block_start == -1) { 691 | block_start = i; 692 | } else if (paf.query_name != prev_query) { 693 | assert(!prev_query.empty()); 694 | if (block_start > -1) { 695 | // visit the previous block 696 | visit_block(block_start, block_end); 697 | } 698 | ++num_visits; 699 | //start a new block 700 | block_start = i; 701 | } 702 | // update end of current block 703 | block_end = i; 704 | prev_query = paf.query_name; 705 | } 706 | 707 | if (block_end != -1) { 708 | // visit last block if present 709 | visit_block(block_start, block_end); 710 | ++num_visits; 711 | } 712 | return num_visits; 713 | } 714 | 715 | unordered_map> index_deletions(const PathHandleGraph* graph, const unordered_map& index) { 716 | 717 | vector>>> thread_deletions(get_thread_count()); 718 | 719 | // get approximate deletion intervals using the index 720 | graph->for_each_edge([&](edge_t edge) { 721 | const Anchor& a1 = index.at(graph->get_id(edge.first)); 722 | const Anchor& a2 = index.at(graph->get_id(edge.second)); 723 | if (a1.path_handle 
== a2.path_handle) { 724 | Interval interval(0, 0, 0); 725 | if (a1.min_offset < a2.min_offset) { 726 | interval.start = a1.max_offset; 727 | interval.stop = a2.min_offset; 728 | } else { 729 | interval.start = a2.max_offset; 730 | interval.stop = a1.min_offset; 731 | } 732 | interval.value = interval.stop - interval.start; 733 | if (interval.value > 1) { 734 | thread_deletions[omp_get_thread_num()][a1.path_handle].push_back(interval); 735 | } 736 | } 737 | }, true); 738 | 739 | for (size_t i = 1; i < thread_deletions.size(); ++i) { 740 | for (const auto& pi : thread_deletions[i]) { 741 | for (const auto& interval : pi.second) { 742 | thread_deletions[0][pi.first].push_back(interval); 743 | } 744 | } 745 | thread_deletions[i].clear(); 746 | } 747 | 748 | unordered_map> path_to_tree; 749 | for (const auto& pi : thread_deletions[0]) { 750 | path_to_tree[pi.first] = IntervalTree(pi.second); 751 | } 752 | return path_to_tree; 753 | } 754 | 755 | 756 | pair, unordered_map> load_trans(const string& trans_path) { 757 | ifstream trans_file(trans_path); 758 | if (!trans_file) { 759 | cerr << "[filter-paf-deletions] error: Unable to load trans file" << endl; 760 | exit(1); 761 | } 762 | 763 | unordered_map mg_to_vg; 764 | unordered_map vg_to_mg; 765 | 766 | string buffer; 767 | while (getline(trans_file, buffer)) { 768 | vector toks; 769 | split_delims(buffer, "\t\n", toks); 770 | assert(toks.size() == 3 && toks[0] == "T"); 771 | string& mg_name = toks[1]; 772 | bool has_prefix = mg_name.compare(0, 3, "id=") == 0; 773 | mg_to_vg[mg_name] = stol(toks[2]); 774 | // hack to support prefixed or not minigraph 775 | // just by keeping both versions in the map no matter what 776 | // todo: parameterize prefix name 777 | if (has_prefix) { 778 | mg_to_vg[strip_prefix(mg_name)] = stol(toks[2]); 779 | } else { 780 | mg_to_vg["id=_MINIGRAPH_|" + mg_name] = stol(toks[2]); 781 | } 782 | vg_to_mg[stol(toks[2])] = mg_name; 783 | } 784 | 785 | return make_pair(mg_to_vg, vg_to_mg); 786 | } 
787 | 788 | unique_ptr load_graph(istream& graph_stream) { 789 | 790 | char magic_bytes[4]; 791 | graph_stream.read(magic_bytes, 4); 792 | uint32_t magic_number = ntohl(*((uint32_t*) magic_bytes)); 793 | graph_stream.clear(); 794 | graph_stream.seekg(0, ios::beg); 795 | 796 | MutablePathMutableHandleGraph* graph; 797 | if (magic_number == PackedGraph().get_magic_number()) { 798 | graph = new PackedGraph(); 799 | } else if (magic_number == HashGraph().get_magic_number()) { 800 | graph = new HashGraph(); 801 | } else { 802 | cerr << "Unable to parse input graph with magic number " << magic_number << endl; 803 | exit(1); 804 | } 805 | dynamic_cast(graph)->deserialize(graph_stream); 806 | 807 | return unique_ptr(graph); 808 | } 809 | 810 | // return -Delta for insertion, +Delta for deletion and 0 if it doesn't pass thresholds 811 | int64_t check_delta(int64_t max_deletion_threshold, int64_t max_insertion_threshold, const PafDelta& paf_delta, double overlap_threshold, 812 | double deletion_size_threshold) { 813 | int64_t ret = 0; 814 | if (paf_delta.delta != 0) { 815 | // note: paf_delta.delta is ref-query, so deletions are positive and insertions are negative 816 | if (paf_delta.delta < 0 && max_insertion_threshold > 0 && -paf_delta.delta > max_insertion_threshold) { 817 | ret = paf_delta.delta; 818 | } else if (paf_delta.delta > 0 && max_deletion_threshold > 0 && 819 | (paf_delta.delta > max_deletion_threshold || 820 | (deletion_size_threshold >= 0 && paf_delta.query_len < deletion_size_threshold * abs(paf_delta.delta))) && 821 | abs((double)paf_delta.ref_overlap_size / paf_delta.delta) < overlap_threshold) { 822 | ret = paf_delta.delta; 823 | } 824 | } 825 | return ret; 826 | } 827 | 828 | PafDelta get_delta(path_handle_t ref_path, const PafLine& prev_paf, const PafLine& cur_paf, 829 | const unordered_map& mg_to_vg, const unordered_map& ref_index, 830 | const unordered_map>& ref_deletions) { 831 | 832 | PafDelta paf_delta; 833 | 834 | paf_delta.query_len = 
prev_paf.query_len; 835 | assert(paf_delta.query_len == cur_paf.query_len); 836 | 837 | paf_delta.query_delta = cur_paf.query_start - prev_paf.query_end; // not abs because sorted 838 | 839 | nid_t prev_target_id = mg_to_vg.at(prev_paf.target_name); 840 | const Anchor& prev_anchor = ref_index.at(prev_target_id); 841 | 842 | nid_t cur_target_id = mg_to_vg.at(cur_paf.target_name); 843 | const Anchor& cur_anchor = ref_index.at(cur_target_id); 844 | 845 | // todo : verify 846 | paf_delta.cur_ref_start = cur_anchor.min_offset + cur_paf.target_start; 847 | paf_delta.cur_ref_end = cur_anchor.max_offset - (cur_paf.target_len - cur_paf.target_end); 848 | paf_delta.prev_ref_start = prev_anchor.min_offset + prev_paf.target_start; 849 | paf_delta.prev_ref_end = prev_anchor.max_offset - (prev_paf.target_len - prev_paf.target_end); 850 | 851 | int64_t cur_ref_start = paf_delta.cur_ref_start; 852 | int64_t cur_ref_end = paf_delta.cur_ref_end; 853 | int64_t prev_ref_start = paf_delta.prev_ref_start; 854 | int64_t prev_ref_end = paf_delta.prev_ref_end; 855 | 856 | // sort the ref intervals 857 | if (cur_ref_start < prev_ref_start) { 858 | swap(cur_ref_start, prev_ref_start); 859 | swap(cur_ref_end, prev_ref_end); 860 | } 861 | paf_delta.ref_delta = cur_ref_start - prev_ref_end; 862 | 863 | paf_delta.delta = paf_delta.ref_delta > 0 ? 
paf_delta.ref_delta - paf_delta.query_delta : 0; 864 | 865 | paf_delta.ref_overlap_size = 0; 866 | if (paf_delta.delta > 0) { 867 | if (ref_deletions.count(ref_path)) { 868 | vector> overlaps = ref_deletions.at(ref_path).findOverlapping(prev_ref_end, cur_ref_start); 869 | for (const auto& overlap : overlaps) { 870 | int64_t intersection_start = max(prev_ref_end, overlap.start); 871 | int64_t intersection_stop = min(cur_ref_start, overlap.stop); 872 | paf_delta.ref_overlap_size = max(paf_delta.ref_overlap_size, intersection_stop - intersection_start); 873 | } 874 | } 875 | } 876 | 877 | return paf_delta; 878 | } 879 | -------------------------------------------------------------------------------- /hal2vg.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2016 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | // This file was created by merging hal2sg.cpp and sg2vg.cpp with 8 | // a small amount of glue for the interface. 
9 | 10 | //#define debug 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "stPinchGraphs.h" 20 | #include "bdsg/packed_graph.hpp" 21 | #include "bdsg/hash_graph.hpp" 22 | #include "hal.h" 23 | 24 | using namespace std; 25 | using namespace hal; 26 | using namespace handlegraph; 27 | using namespace bdsg; 28 | using namespace handlegraph; 29 | 30 | static void initParser(CLParser* optionsParser) { 31 | optionsParser->addArgument("halFile", "input hal file"); 32 | optionsParser->addOption("refGenomes", 33 | "comma-separated (no spaces) genomes to treat as reference paths with all others as haplotype paths (default=all genomes)", 34 | "\"\""); 35 | optionsParser->addOption("rootGenome", 36 | "process only genomes in clade with specified root" 37 | " (HAL root if empty)", 38 | "\"\""); 39 | optionsParser->addOption("targetGenomes", 40 | "comma-separated (no spaces) list of target genomes " 41 | "(others are excluded) (all leaves if empty)", 42 | "\"\""); 43 | optionsParser->addOptionFlag("noAncestors", 44 | "don't write ancestral paths, nor sequence exclusive to ancestral genomes", 45 | false); 46 | optionsParser->addOption("ignoreGenomes", 47 | "comma-separated (no spaces) list of genomes to ignore", 48 | "\"\""); 49 | optionsParser->addOption("outputFormat", 50 | "output graph format in {pg, hg} [default=pg]", 51 | "pg"); 52 | optionsParser->addOption("chop", 53 | "chop up nodes in output graph so they are not longer than given length", 54 | 0); 55 | optionsParser->addOptionFlag("progress", 56 | "show progress", 57 | false); 58 | optionsParser->setDescription("Convert HAL alignment to handle graph"); 59 | 60 | } 61 | 62 | static void add_genome_threads(const Genome* genome, 63 | stPinchThreadSet* threads, 64 | vector& IDToName, 65 | unordered_map& nameToID); 66 | 67 | static void pinch_genome(const Genome* genome, 68 | stPinchThreadSet* threads, 69 | unordered_map& nameToID, 70 | const vector& targetNames, 71 
| unordered_map>& snp_cache); 72 | 73 | static void pinch_snp(const Genome* genome, 74 | stPinchThreadSet* threads, 75 | unordered_map& nameToID, 76 | const TopSegmentIteratorPtr& topIt, 77 | int64_t topOffset, 78 | ColumnIteratorPtr& colIt, 79 | char topBase, 80 | stPinchThread* topThread, 81 | unordered_map>& snp_cache); 82 | 83 | static void pinch_to_handle(const Genome* genome, 84 | stPinchThreadSet* threadSet, 85 | const vector& IDToName, 86 | const unordered_map& nameToID, 87 | unordered_map& blockToNode, 88 | MutablePathMutableHandleGraph& graph, 89 | const vector& refNames); 90 | 91 | static void chop_graph(MutablePathMutableHandleGraph& graph, size_t maxNodeLength); 92 | 93 | static subrange_t resolve_subpath_naming(string& path_name); 94 | 95 | static size_t resolve_haplotype_naming(string& genome_name); 96 | 97 | int main(int argc, char** argv) { 98 | CLParser optionsParser; 99 | initParser(&optionsParser); 100 | string halPath; 101 | string refGenomes; 102 | string rootGenomeName; 103 | string targetGenomes; 104 | bool noAncestors; 105 | string ignoreGenomes; 106 | string outputFormat; 107 | size_t maxNodeLength; 108 | bool progress; 109 | try { 110 | optionsParser.parseOptions(argc, argv); 111 | halPath = optionsParser.getArgument("halFile"); 112 | refGenomes = optionsParser.getOption("refGenomes"); 113 | rootGenomeName = optionsParser.getOption("rootGenome"); 114 | targetGenomes = optionsParser.getOption("targetGenomes"); 115 | noAncestors = optionsParser.getFlag("noAncestors"); 116 | ignoreGenomes = optionsParser.getOption("ignoreGenomes"); 117 | outputFormat = optionsParser.getOption("outputFormat"); 118 | if (outputFormat != "pg" && outputFormat != "hg") { 119 | throw hal_exception("--outputFormat must be one of {pg, hg}"); 120 | } 121 | if (ignoreGenomes != "\"\"" && targetGenomes != "\"\"") { 122 | throw hal_exception("--ignoreGenomes and --targetGenomes options are " 123 | "mutually exclusive"); 124 | } 125 | 126 | maxNodeLength = 
optionsParser.getOption("chop"); 127 | progress = optionsParser.getFlag("progress"); 128 | } 129 | catch(exception& e) { 130 | cerr << e.what() << endl; 131 | optionsParser.printUsage(cerr); 132 | exit(1); 133 | } 134 | try { 135 | AlignmentConstPtr alignment(openHalAlignment(halPath, &optionsParser)); 136 | if (alignment->getNumGenomes() == 0) { 137 | throw hal_exception("input hal alignmenet is empty"); 138 | } 139 | 140 | vector refNames; 141 | if (refGenomes != "\"\"") { 142 | refNames = chopString(refGenomes, ","); 143 | std::sort(refNames.begin(), refNames.end()); 144 | } 145 | 146 | // default to alignment root if none specified 147 | bool givenRoot = true; 148 | if (rootGenomeName == "\"\"") { 149 | givenRoot = false; 150 | rootGenomeName = alignment->getRootName(); 151 | const Genome* rootGenome = alignment->openGenome(rootGenomeName); 152 | if (rootGenome == NULL) { 153 | throw hal_exception(string("Root genome, ") + rootGenomeName + 154 | ", not found in alignment"); 155 | } 156 | alignment->closeGenome(rootGenome); 157 | } 158 | 159 | vector ignoreNames; 160 | if (ignoreGenomes != "\"\"") { 161 | ignoreNames = chopString(ignoreGenomes, ","); 162 | std::sort(ignoreNames.begin(), ignoreNames.end()); 163 | } 164 | 165 | vector targetNames; 166 | bool givenTargets; 167 | if (targetGenomes != "\"\"") { 168 | // if we're supplied targets, we use them 169 | targetNames = chopString(targetGenomes, ","); 170 | givenTargets = true; 171 | } else { 172 | // otherwise, we take all the leaves below the root, except any that are ignored 173 | vector leafNames = alignment->getLeafNamesBelow(rootGenomeName); 174 | for (size_t i = 0; i < leafNames.size(); ++i) { 175 | if (!std::binary_search(ignoreNames.begin(), ignoreNames.end(), leafNames[i])) { 176 | targetNames.push_back(leafNames[i]); 177 | } 178 | } 179 | givenTargets = false; 180 | } 181 | std::sort(targetNames.begin(), targetNames.end()); 182 | 183 | // keep track of internal nodes needed to transitively align 
our targets 184 | vector spanningNames; 185 | set targetSet; 186 | for (size_t i = 0; i < targetNames.size(); ++i) { 187 | const Genome* targetGenome = alignment->openGenome(targetNames[i]); 188 | if (targetGenome == NULL) { 189 | throw hal_exception(string("Target genome, ") + targetNames[i] + 190 | ", not found in alignment"); 191 | } 192 | targetSet.insert(targetGenome); 193 | } 194 | const Genome* rootGenome = getLowestCommonAncestor(targetSet); 195 | set targetSetCpy = targetSet; 196 | getGenomesInSpanningTree(targetSetCpy, targetSet); 197 | if (!givenRoot) { 198 | // update our root if it wasn't user-specified 199 | rootGenomeName = rootGenome->getName(); 200 | } 201 | for (set::iterator i = targetSet.begin(); i != targetSet.end(); ++i) { 202 | if ((*i)->getNumChildren() > 0) { 203 | spanningNames.push_back((*i)->getName()); 204 | } 205 | alignment->closeGenome(*i); 206 | } 207 | std::sort(spanningNames.begin(), spanningNames.end()); 208 | 209 | if (progress) { 210 | cerr << "Root: " << rootGenomeName << endl; 211 | if (!targetNames.empty()) { 212 | cerr << "Targets:"; 213 | for (size_t i = 0; i < targetNames.size(); ++i) { 214 | cerr << " " << targetNames[i]; 215 | } 216 | cerr << endl; 217 | } 218 | if (!spanningNames.empty()) { 219 | cerr << "Spanning:"; 220 | for (size_t i = 0; i < spanningNames.size(); ++i) { 221 | cerr << " " << spanningNames[i]; 222 | } 223 | cerr << endl; 224 | } 225 | if (!ignoreNames.empty()) { 226 | cerr << "Ignore:"; 227 | for (size_t i = 0; i < ignoreNames.size(); ++i) { 228 | cerr << " " << ignoreNames[i]; 229 | } 230 | cerr << endl; 231 | } 232 | } 233 | 234 | // map Sequence pointers to integers (assumes sequence pointers stable within hal) 235 | vector IDToName; 236 | unordered_map nameToID; 237 | 238 | // start up our pinch graph 239 | stPinchThreadSet* threadSet = stPinchThreadSet_construct(); 240 | 241 | const Genome* parentGenome = nullptr; 242 | string parentName; 243 | 244 | deque queue = {rootGenomeName}; 245 | 246 | 
vector pinchGenomes; 247 | 248 | while (!queue.empty()) { 249 | string genomeName = queue.front(); 250 | queue.pop_front(); 251 | 252 | // we have a target set, and this genome isn't in it, and this genome isn't needed to span it 253 | // so we can ignore it completely 254 | bool ignoreGenome = (!std::binary_search(targetNames.begin(), targetNames.end(), genomeName) && 255 | !std::binary_search(spanningNames.begin(), spanningNames.end(), genomeName) && 256 | genomeName != rootGenomeName); 257 | 258 | const Genome* genome = alignment->openGenome(genomeName); 259 | string curParent = alignment->getParentName(genomeName); 260 | 261 | // add the genome sequences as threads 262 | if (!ignoreGenome) { 263 | if (progress && !(!curParent.empty() && genomeName != rootGenomeName)) { 264 | cerr << "adding threads from " << genome->getName() << endl; 265 | } 266 | add_genome_threads(genome, threadSet, IDToName, nameToID); 267 | } 268 | 269 | if (!ignoreGenome && !curParent.empty() && genomeName != rootGenomeName) { 270 | // load up the parent genome if it's not already open, taking care 271 | // to only ever have one parent open at a time 272 | if (curParent != parentName) { 273 | if (parentGenome != nullptr) { 274 | alignment->closeGenome(parentGenome); 275 | } 276 | parentName = curParent; 277 | parentGenome = alignment->openGenome(parentName); 278 | } 279 | 280 | // pinching must now be done in second pass, so we queue up the genome here 281 | pinchGenomes.push_back(genome->getName()); 282 | } 283 | 284 | // recurse on children 285 | vector childs = alignment->getChildNames(genomeName); 286 | for (size_t i = 0; i < childs.size(); ++i) { 287 | queue.push_back(childs[i]); 288 | } 289 | 290 | // todo: this logic not very efficient for normal (ie non-star trees) 291 | alignment->closeGenome(genome); 292 | 293 | } 294 | 295 | if (parentGenome != nullptr) { 296 | alignment->closeGenome(parentGenome); 297 | } 298 | 299 | // do all the pinching 300 | unordered_map> snp_cache; 301 | 
for (size_t i = 0; i < pinchGenomes.size(); ++i) { 302 | 303 | // pinch the child with its parent 304 | if (progress) { 305 | cerr << "pinching " << pinchGenomes[i] << endl; 306 | } 307 | pinch_genome(alignment->openGenome(pinchGenomes[i]), threadSet, nameToID, targetNames, snp_cache); 308 | } 309 | snp_cache.clear(); 310 | 311 | // clean up the pinch graph 312 | if (progress) { 313 | cerr << "merging trivial segments and blocks in pinch graph" << endl; 314 | } 315 | stPinchThreadSet_joinTrivialBoundaries(threadSet); 316 | 317 | // make a handle graph 318 | unique_ptr graph; 319 | if (outputFormat == "pg") { 320 | graph = unique_ptr(new PackedGraph()); 321 | } else if (outputFormat == "hg") { 322 | graph = unique_ptr(new HashGraph()); 323 | } else { 324 | assert(false); 325 | } 326 | 327 | // keep track of where blocks fit into the handle graph 328 | unordered_map blockToNode; 329 | 330 | // start iterating over the genomes again in order to export to handle graph 331 | queue = {rootGenomeName}; 332 | while (!queue.empty()) { 333 | string genomeName = queue.front(); 334 | queue.pop_front(); 335 | 336 | // skip it if 337 | // it's an ancestor and we don't want ancestors or 338 | // if we have targets and it's not in it or 339 | // if it's on the ignore list 340 | bool ignoreGenome = ((noAncestors && !alignment->getChildNames(genomeName).empty()) || 341 | (givenTargets && !std::binary_search(targetNames.begin(), targetNames.end(), genomeName)) || 342 | (std::binary_search(ignoreNames.begin(), ignoreNames.end(), genomeName))); 343 | if (!ignoreGenome) { 344 | const Genome* genome = alignment->openGenome(genomeName); 345 | 346 | if (progress) { 347 | cerr << "converting " << genomeName << " with " << genome->getNumSequences() 348 | << " sequences and total length " << genome->getSequenceLength() << endl; 349 | } 350 | pinch_to_handle(genome, threadSet, IDToName, nameToID, blockToNode, *graph, refNames); 351 | 352 | alignment->closeGenome(genome); 353 | } 354 | 355 | 
vector childs = alignment->getChildNames(genomeName); 356 | for (size_t i = 0; i < childs.size(); ++i) { 357 | queue.push_back(childs[i]); 358 | } 359 | } 360 | 361 | // free the pinch graph 362 | stPinchThreadSet_destruct(threadSet); 363 | 364 | // free the hal 365 | alignment = AlignmentConstPtr(); 366 | 367 | // chop 368 | if (maxNodeLength > 0) { 369 | if (progress) { 370 | cerr << "chopping graph to max node size " << maxNodeLength << endl; 371 | } 372 | chop_graph(*graph, maxNodeLength); 373 | } 374 | 375 | // write out the graph 376 | if (progress) { 377 | cerr << "serializing graph" << endl; 378 | } 379 | dynamic_cast(graph.get())->serialize(cout); 380 | } 381 | catch(exception& e) { 382 | cerr << e.what() << endl; 383 | exit(1); 384 | } 385 | 386 | return 0; 387 | } 388 | 389 | // Add every sequence from the genome into the pinch graph 390 | void add_genome_threads(const Genome* genome, 391 | stPinchThreadSet* threads, 392 | vector& IDToName, 393 | unordered_map& nameToID) { 394 | 395 | for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { 396 | const Sequence *sequence = seqIt->getSequence(); 397 | hal_size_t seqLen = sequence->getSequenceLength(); 398 | string name = sequence->getFullName(); 399 | // update lookups to map hal sequence to numeric id 400 | int64_t seqID = IDToName.size(); 401 | nameToID[name] = seqID; 402 | IDToName.push_back(name); 403 | // add to thread set 404 | #ifdef debug 405 | cerr << "Adding sequence " << name << " as thread " << seqID << " with length " << seqLen << endl; 406 | #endif 407 | stPinchThreadSet_addThread(threads, seqID, 0, seqLen); 408 | } 409 | } 410 | 411 | // Use exact pairwise alginments from genome to its parent to make the pinch graph 412 | void pinch_genome(const Genome* genome, 413 | stPinchThreadSet* threads, 414 | unordered_map& nameToID, 415 | const vector& targetNames, 416 | unordered_map>& snp_cache) { 417 | 418 | TopSegmentIteratorPtr topIt = 
genome->getTopSegmentIterator(); 419 | BottomSegmentIteratorPtr botIt = genome->getParent()->getBottomSegmentIterator(); 420 | 421 | // make a target set for column iterator pinching. unfortunately this means 422 | // opening every single genome 423 | const Alignment* alignment = genome->getAlignment(); 424 | set targets; 425 | for (size_t i = 0; i < targetNames.size(); ++i) { 426 | targets.insert(alignment->openGenome(targetNames[i])); 427 | } 428 | 429 | ColumnIteratorPtr colIt = genome->getColumnIterator(&targets); 430 | 431 | // avoid thread set lookups 432 | const Sequence* topSeq = nullptr; 433 | const Sequence* botSeq = nullptr; 434 | stPinchThread* topThread = nullptr; 435 | stPinchThread* botThread = nullptr; 436 | string topString; 437 | string botString; 438 | 439 | // merge up consecutive segments for fewer pinches 440 | stPinchThread* prevTopThread = nullptr; 441 | stPinchThread* prevBotThread = nullptr; 442 | hal_index_t prevStart1 = -1; 443 | hal_index_t prevStart2 = -1; 444 | hal_index_t prevLength = -1; 445 | bool prevReversed = false; 446 | 447 | for (; not topIt->atEnd(); topIt->toRight()) { 448 | if (topIt->tseg()->hasParent()) { 449 | botIt->toParent(topIt); 450 | 451 | // todo: lots of string lookups 452 | int64_t topID = nameToID[topIt->tseg()->getSequence()->getFullName()]; 453 | int64_t botID = nameToID[botIt->bseg()->getSequence()->getFullName()]; 454 | 455 | if (topIt->tseg()->getSequence() != topSeq) { 456 | topSeq = topIt->tseg()->getSequence(); 457 | topThread = stPinchThreadSet_getThread(threads, topID); 458 | } 459 | if (botIt->bseg()->getSequence() != botSeq) { 460 | botSeq = botIt->bseg()->getSequence(); 461 | botThread = stPinchThreadSet_getThread(threads, botID); 462 | } 463 | 464 | topIt->getString(topString); 465 | botIt->getString(botString); 466 | 467 | #ifdef debug 468 | cerr << "pinching " << endl 469 | << " " << *topIt << endl 470 | << " " << topString << endl 471 | << " " << *botIt << endl 472 | << " " << botString << 
endl; 473 | #endif 474 | 475 | int64_t first_match = -1; 476 | int64_t last_match = -1; 477 | for (int64_t i = 0; i < (int64_t)topString.length(); ++i) { 478 | if (std::toupper(topString[i]) == std::toupper(botString[i])) { 479 | if (first_match == -1) { 480 | first_match = i; 481 | } 482 | last_match = i; 483 | } else if (colIt.get() != NULL) { 484 | pinch_snp(genome, threads, nameToID, topIt, i, colIt, 485 | std::toupper(topString[i]), topThread, snp_cache); 486 | } 487 | if (std::toupper(topString[i]) != std::toupper(botString[i]) || i == (int64_t)topString.length() - 1) { 488 | if (last_match >= first_match && first_match >= 0) { 489 | hal_index_t length = last_match - first_match + 1; 490 | hal_index_t start1 = topIt->tseg()->getStartPosition() + first_match - topSeq->getStartPosition(); 491 | hal_index_t start2; 492 | if (!botIt->getReversed()) { 493 | start2 = botIt->bseg()->getStartPosition() + first_match - botSeq->getStartPosition(); 494 | } else { 495 | start2 = botIt->bseg()->getEndPosition() - first_match - length + 1 - botSeq->getStartPosition(); 496 | } 497 | #ifdef debug 498 | cerr << " inserting (fm=" << first_match <<",lm=" << last_match << ", s1=" << start1 << ",s2=" << start2 << ",l=" << length 499 | << ", hl1=" << topSeq->getSequenceLength() << ",hl2=" << botSeq->getSequenceLength() << ",pl1=" << stPinchThread_getLength(topThread) 500 | << ", pl2=" << stPinchThread_getLength(botThread) << ", rev=" << botIt->getReversed() 501 | << " sp1g=" << (start1 + topSeq->getStartPosition()) << " sp2g=" << (start2 + botSeq->getStartPosition()) << endl 502 | << " " << topString.substr(first_match, length) << endl; 503 | #endif 504 | // are we dealing with two consectuive segments? 
505 | bool canMerge = topThread == prevTopThread && 506 | botThread == prevBotThread && 507 | start1 == prevStart1 + prevLength && 508 | botIt->getReversed() == prevReversed && 509 | ((!prevReversed && start2 == prevStart2 + prevLength) || 510 | (prevReversed && start2 + length == prevStart2)); 511 | 512 | if (canMerge) { 513 | // if consecutive, just merge 514 | prevLength += length; 515 | if (botIt->getReversed()) { 516 | prevStart2 = start2; 517 | } 518 | } else { 519 | // otherwise 520 | if (prevTopThread != nullptr) { 521 | // pinch the last segment 522 | stPinchThread_pinch(prevTopThread, 523 | prevBotThread, 524 | prevStart1, 525 | prevStart2, 526 | prevLength, 527 | !prevReversed); 528 | } 529 | // and update our previous 530 | prevTopThread = topThread; 531 | prevBotThread = botThread; 532 | prevStart1 = start1; 533 | prevStart2 = start2; 534 | prevLength = length; 535 | prevReversed = botIt->getReversed(); 536 | } 537 | 538 | } 539 | first_match = -1; 540 | last_match = -1; 541 | } 542 | } 543 | } 544 | } 545 | // do that last pinch 546 | if (prevTopThread != nullptr) { 547 | stPinchThread_pinch(prevTopThread, 548 | prevBotThread, 549 | prevStart1, 550 | prevStart2, 551 | prevLength, 552 | !prevReversed); 553 | } 554 | } 555 | 556 | // Use the column iterator to find all alignments of this snp and pinch accordingly 557 | // 558 | // Todo: Worried this might be too slow to use at scale. Also, it blows away all previous 559 | // efforts in hal2vg to be cache-friendly by only loading 2 genomes at a time, so it may 560 | // hog lots of memory. On a star tree, it may just be better to manually scan the siblings 561 | // before resorting to the column iterator. Or perhaps do everything in the pinch graph 562 | // by pinching snps then doing a pass over the graph to break them apart once its constructed. 
563 | void pinch_snp(const Genome* genome, 564 | stPinchThreadSet* threads, 565 | unordered_map& nameToID, 566 | const TopSegmentIteratorPtr& topIt, 567 | int64_t topOffset, 568 | ColumnIteratorPtr& colIt, 569 | char topBase, 570 | stPinchThread* topThread, 571 | unordered_map>& snp_cache) { 572 | 573 | const Sequence* topSeq = topIt->tseg()->getSequence(); 574 | hal_index_t topStart = topIt->tseg()->getStartPosition() + topOffset - topSeq->getStartPosition(); 575 | 576 | vector& cache_rec = snp_cache[topThread]; 577 | if (!cache_rec.empty() && cache_rec[topStart] == true) { 578 | // we've already pinched this base 579 | return; 580 | } 581 | 582 | // move the column iterator into position 583 | colIt->toSite(topStart + topSeq->getStartPosition(), topStart + topSeq->getStartPosition() + 1); 584 | 585 | const ColumnIterator::ColumnMap* columnMap = colIt->getColumnMap(); 586 | 587 | // remember all equivalence classes of pinches 588 | map>> base_pinches; 589 | 590 | // scan through all the homologous bases, breaking them into lists for each possible nucleotide 591 | for (ColumnIterator::ColumnMap::const_iterator cmi = columnMap->begin(); cmi != columnMap->end(); ++cmi) { 592 | const Sequence* sequence = cmi->first; 593 | for (ColumnIterator::DNASet::const_iterator dsi = cmi->second->begin(); dsi != cmi->second->end(); ++dsi) { 594 | char botBase = std::toupper((*dsi)->getBase()); 595 | 596 | int64_t otherID = nameToID[sequence->getFullName()]; 597 | stPinchThread* otherThread = stPinchThreadSet_getThread(threads, otherID); 598 | hal_index_t otherStart = (*dsi)->getArrayIndex() - sequence->getStartPosition(); 599 | 600 | base_pinches[botBase].push_back(make_tuple(otherThread, otherStart, !(*dsi)->getReversed())); 601 | 602 | } 603 | } 604 | 605 | // pinch through each nucleotde 606 | for (auto& bp : base_pinches) { 607 | vector>& other_positions = bp.second; 608 | for (size_t i = 0; i < other_positions.size(); ++i) { 609 | if (i > 0) { 610 | 
stPinchThread_pinch(get<0>(other_positions[0]), 611 | get<0>(other_positions[i]), 612 | get<1>(other_positions[0]), 613 | get<1>(other_positions[i]), 614 | 1, 615 | get<2>(other_positions[0]) == get<2>(other_positions[i])); 616 | } 617 | // update the cache 618 | vector& cache_vec = snp_cache[get<0>(other_positions[i])]; 619 | if (cache_vec.empty()) { 620 | cache_vec.resize(stPinchThread_getLength(get<0>(other_positions[i])), false); 621 | } 622 | cache_vec[get<1>(other_positions[i])] = true; 623 | } 624 | } 625 | } 626 | 627 | // create nodes and edges for a genome using the pinch graph 628 | void pinch_to_handle(const Genome* genome, 629 | stPinchThreadSet* threadSet, 630 | const vector& IDToName, 631 | const unordered_map& nameToID, 632 | unordered_map& blockToNode, 633 | MutablePathMutableHandleGraph& graph, 634 | const vector& refNames) { 635 | 636 | // iterate over the sequences of the genome 637 | for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { 638 | const Sequence *sequence = seqIt->getSequence(); 639 | PathSense sense = PathSense::REFERENCE; 640 | if (!refNames.empty() && !std::binary_search(refNames.begin(), refNames.end(), genome->getName())) { 641 | sense = PathSense::HAPLOTYPE; 642 | } 643 | int64_t seqID = nameToID.find(sequence->getFullName())->second; 644 | stPinchThread* thread = stPinchThreadSet_getThread(threadSet, seqID); 645 | 646 | // cactus_graphmap_split can make paths like contig_sub_1_3. here we convert that 647 | // into a format vg can (sometimes) understand contig[1-3]. 
648 | // (the reason we go through this is that assembly hubs can't handle any special characters apparently) 649 | string parsed_name = sequence->getName(); 650 | subrange_t subpath = resolve_subpath_naming(parsed_name); 651 | string parsed_genome_name = genome->getName(); 652 | size_t haplotype = resolve_haplotype_naming(parsed_genome_name); 653 | if (haplotype == PathMetadata::NO_HAPLOTYPE) { 654 | haplotype = 0; 655 | } 656 | // create the path 657 | path_handle_t pathHandle = graph.create_path(sense, 658 | parsed_genome_name, 659 | parsed_name, 660 | haplotype, 661 | sense == PathSense::HAPLOTYPE ? 0 : PathMetadata::NO_PHASE_BLOCK, 662 | subpath, 663 | false); 664 | string pathString; 665 | 666 | // iterate over the segments of the sequence 667 | stPinchSegment* prevSeg = nullptr; 668 | handle_t prevHandle; 669 | stPinchSegment* lastSeg = stPinchThread_getLast(thread); 670 | hal_index_t segStart = 0; 671 | string seqString; 672 | for (stPinchSegment* seg = stPinchThread_getFirst(thread); ; 673 | seg = stPinchSegment_get3Prime(seg)) { 674 | 675 | // get the segment's block. note that if it's not aligned to anything, it will have no block 676 | stPinchBlock* block = stPinchSegment_getBlock(seg); 677 | bool reversed = block != nullptr && stPinchSegment_getBlockOrientation(seg) == 0; 678 | handle_t handle; 679 | 680 | // get the segment's dna sequence from the hal 681 | sequence->getSubString(seqString, segStart, stPinchSegment_getLength(seg)); 682 | if (reversed) { 683 | // we always work in block-relative orientation 684 | reverseComplement(seqString); 685 | } 686 | 687 | // have we already converted this block? 
688 | auto bi = blockToNode.find(block); 689 | if (bi == blockToNode.end()) { 690 | // no: it is a new block 691 | handle = graph.create_handle(seqString); 692 | if (block != nullptr) { 693 | blockToNode[block] = graph.get_id(handle); 694 | } 695 | #ifdef debug 696 | cerr << "created node " << graph.get_id(handle) << " for block " << block << " from " << sequence->getFullName() << " at " << segStart 697 | << " rev=" << reversed << " len=" << seqString.length() 698 | << endl; 699 | cerr << "node seq " << graph.get_sequence(handle) << endl; 700 | #endif 701 | } else { 702 | // yes: we can find it in the table 703 | handle = graph.get_handle(bi->second); 704 | #ifdef debug 705 | cerr << "found node " << graph.get_id(handle) << " for block " << block << " from " << sequence->getFullName() << " at " << segStart 706 | << " rev=" << reversed << " len=" << seqString.length() 707 | << endl; 708 | cerr << "node seq " << graph.get_sequence(handle) << endl; 709 | cerr << "my substring " << seqString << endl; 710 | #endif 711 | } 712 | assert(!graph.get_is_reverse(handle)); 713 | if (reversed) { 714 | handle = graph.flip(handle); 715 | assert(graph.get_is_reverse(handle)); 716 | } 717 | 718 | // wire up the edge to previous 719 | if (prevSeg != nullptr) { 720 | #ifdef debug 721 | cerr << "creating edge from " << graph.get_id(prevHandle) << ":" << graph.get_is_reverse(prevHandle) << " -> " 722 | << graph.get_id(handle) << ":" << graph.get_is_reverse(handle) << endl; 723 | #endif 724 | graph.create_edge(prevHandle, handle); 725 | } 726 | 727 | // add the node to the path 728 | graph.append_step(pathHandle, handle); 729 | pathString += graph.get_sequence(handle); 730 | 731 | prevSeg = seg; 732 | prevHandle = handle; 733 | 734 | segStart += stPinchSegment_getLength(seg); 735 | 736 | if (seg == lastSeg) { 737 | break; 738 | } 739 | } 740 | 741 | // make sure the path we added is the same as the hal 742 | string halPathString; 743 | sequence->getString(halPathString); 744 | if 
(pathString.length() != halPathString.length()) { 745 | throw runtime_error("Incorrect length in coverted path for " + sequence->getFullName() + ": " + std::to_string(pathString.length()) + 746 | ". Should be: " + std::to_string(halPathString.length())); 747 | } 748 | vector mismatches; 749 | for (size_t i = 0; i < halPathString.size(); ++i) { 750 | if (toupper(pathString[i]) != toupper(halPathString[i])) { 751 | mismatches.push_back(i); 752 | } 753 | } 754 | if (!mismatches.empty()) { 755 | stringstream msg; 756 | msg << mismatches.size() << " mismatches found in converted path for " << sequence->getFullName() << ":\n"; 757 | for (size_t i = 0; i < mismatches.size() && i < 10; ++i) { 758 | msg << " path[" << mismatches[i] << "]=" << pathString[mismatches[i]] << ". should be " << halPathString[mismatches[i]] << "\n"; 759 | } 760 | throw runtime_error(msg.str()); 761 | } 762 | } 763 | } 764 | 765 | void chop_graph(MutablePathMutableHandleGraph& graph, size_t maxNodeLength) { 766 | // borrowed from https://github.com/vgteam/odgi/blob/master/src/subcommand/chop_main.cpp 767 | std::vector to_chop; 768 | graph.for_each_handle([&](const handle_t& handle) { 769 | if (graph.get_length(handle) > maxNodeLength) { 770 | to_chop.push_back(handle); 771 | } 772 | }); 773 | 774 | for (auto& handle : to_chop) { 775 | // get divide points 776 | uint64_t length = graph.get_length(handle); 777 | std::vector offsets; 778 | for (uint64_t i = maxNodeLength; i < length; i+=maxNodeLength) { 779 | offsets.push_back(i); 780 | } 781 | graph.divide_handle(handle, offsets); 782 | } 783 | } 784 | 785 | subrange_t resolve_subpath_naming(string& path_name) { 786 | size_t first_length = 0; 787 | size_t start_offset = 0; 788 | bool found_subpath = false; 789 | while (true) { 790 | size_t sp = path_name.rfind("_sub_"); 791 | if (sp != string::npos) { 792 | size_t up = path_name.rfind("_"); 793 | if (up != string::npos && up > sp + 1) { 794 | int64_t start; 795 | int64_t end; 796 | start = 
stol(path_name.substr(sp + 5, up - sp - 5)); 797 | end = stol(path_name.substr(up + 1)); 798 | stringstream new_name; 799 | start_offset += start; // final offset is sum of all nested offsets 800 | if (first_length == 0) { 801 | first_length = end - start; 802 | assert(first_length > 0); 803 | } else { 804 | // in the case of nested subpaths, the end coordinate will always 805 | // be derived from the start, plus the length of the "top" path 806 | end = start_offset + first_length; 807 | } 808 | new_name << path_name.substr(0, sp); 809 | path_name = new_name.str(); 810 | found_subpath = true; 811 | } 812 | } else { 813 | break; 814 | } 815 | } 816 | if (found_subpath) { 817 | return make_pair(start_offset, start_offset + first_length); 818 | } else { 819 | return PathMetadata::NO_SUBRANGE; 820 | } 821 | } 822 | 823 | size_t resolve_haplotype_naming(string& genome_name) { 824 | size_t haplotype = PathMetadata::NO_HAPLOTYPE; 825 | size_t dp = genome_name.rfind("."); 826 | if (dp != string::npos) { 827 | try { 828 | haplotype = stol(genome_name.substr(dp + 1)); 829 | genome_name = genome_name.substr(0, dp); 830 | } catch(...) { 831 | } 832 | } 833 | return haplotype; 834 | } 835 | -------------------------------------------------------------------------------- /halMergeChroms.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2016 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | // Merge chromosome HAL files into one big one. Only star trees with same root name supported (ie what comes out of cactus-align-batch). 
8 | 9 | //#define debug 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "hal.h" 21 | 22 | using namespace std; 23 | using namespace hal; 24 | 25 | static void initParser(CLParser* optionsParser) { 26 | optionsParser->addArgument("inFiles", "comma-separated (only way in HAL parser!) list of input HAL files to merge"); 27 | optionsParser->addArgument("outFile", "output HAL file"); 28 | optionsParser->addOptionFlag("progress", 29 | "show progress", 30 | false); 31 | optionsParser->setDescription("Merge chromosome HALs into combined file. Ancestral sequences are renamed as needed to avoid conflicts" 32 | ". Star trees only"); 33 | } 34 | 35 | // we expect to see the same ancestor sequence names in multiple input files. we uniqify them by adding 36 | // _i to them where i is the file's position in the input. 37 | static string anc_seq_name(const string& seq_name, size_t idx) { 38 | return seq_name + "_" + to_string(idx); 39 | } 40 | 41 | // get the dimensions from all genomes in all input files 42 | static pair>> get_hal_dimensions(CLParser* optionsParser, 43 | const vector& hal_paths) { 44 | // genome -> dimensions (covering all input) 45 | unordered_map> dimensions; 46 | // to check uniqueness 47 | unordered_set sequence_names; 48 | 49 | string root_name; 50 | for (size_t i = 0; i < hal_paths.size(); ++i) { 51 | const string& hal_path = hal_paths[i]; 52 | 53 | // open the hal file 54 | AlignmentConstPtr alignment(openHalAlignment(hal_path, optionsParser, READ_ACCESS)); 55 | 56 | // for every genome 57 | vector genome_names = alignment->getChildNames(alignment->getRootName()); 58 | genome_names.push_back(alignment->getRootName()); 59 | if (root_name.empty()) { 60 | root_name = alignment->getRootName(); 61 | } else if (alignment->getRootName() != root_name) { 62 | throw hal_exception("Root mismatch: " + root_name + " vs " + alignment->getRootName()); 63 | } 64 | for (const string& 
genome_name : genome_names) { 65 | const Genome* genome = alignment->openGenome(genome_name); 66 | vector& genome_dimensions = dimensions[genome_name]; 67 | // for every sequence 68 | for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { 69 | const Sequence *sequence = seqIt->getSequence(); 70 | // add a little suffix to make ancestral sequences unique 71 | string seq_name = genome->getParent() ? sequence->getName() : anc_seq_name(sequence->getName(), i); 72 | genome_dimensions.emplace_back(seq_name, 73 | sequence->getSequenceLength(), 74 | sequence->getNumTopSegments(), 75 | sequence->getNumBottomSegments()); 76 | string full_name = genome_name + "." + seq_name; 77 | if (sequence_names.count(full_name)) { 78 | throw hal_exception("Duplicate sequence name found: " + full_name); 79 | } else { 80 | sequence_names.insert(full_name); 81 | } 82 | } 83 | alignment->closeGenome(genome); 84 | } 85 | } 86 | return make_pair(root_name, dimensions); 87 | } 88 | 89 | // append each input hal to the out_alignment, one after another. all arrays are copied over, 90 | // but need to be adjsuted to reflect their new offsets. 
91 | static void merge_hals(CLParser* optionsParser, AlignmentPtr out_alignment, const vector& in_paths, bool progress) { 92 | 93 | // keep track of where we are in the output 94 | vector top_offsets(out_alignment->getChildNames(out_alignment->getRootName()).size(), 0); 95 | size_t bot_offset = 0; 96 | 97 | for (size_t i = 0; i < in_paths.size(); ++i) { 98 | AlignmentConstPtr in_alignment(openHalAlignment(in_paths[i], optionsParser, READ_ACCESS)); 99 | const Genome* in_root = in_alignment->openGenome(in_alignment->getRootName()); 100 | Genome* out_root = out_alignment->openGenome(in_alignment->getRootName()); 101 | assert(in_root->getName() == out_root->getName()); 102 | size_t in_root_degree = in_root->getNumChildren(); 103 | size_t out_root_degree = out_root->getNumChildren(); 104 | vector in_genomes = {in_root}; 105 | for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { 106 | in_genomes.push_back(in_alignment->openGenome(in_child_name)); 107 | } 108 | 109 | // copy the dna sequence by sequence 110 | for (const Genome* in_genome : in_genomes) { 111 | if (progress) { 112 | cerr << "[halMergeChroms]: copying dna for " << in_genome->getName() << " from " << in_paths[i] << endl; 113 | } 114 | Genome* out_genome = out_alignment->openGenome(in_genome->getName()); 115 | for (SequenceIteratorPtr in_si = in_genome->getSequenceIterator(); !in_si->atEnd(); in_si->toNext()) { 116 | const Sequence* in_sequence = in_si->getSequence(); 117 | string out_seq_name = in_genome->getParent() ? 
in_sequence->getName() : anc_seq_name(in_sequence->getName(), i); 118 | Sequence* out_sequence = out_genome->getSequence(out_seq_name); 119 | DnaIteratorPtr in_di = in_sequence->getDnaIterator(0); 120 | DnaIteratorPtr out_di = out_sequence->getDnaIterator(0); 121 | assert(in_sequence->getSequenceLength() == out_sequence->getSequenceLength()); 122 | string dna; 123 | in_sequence->getString(dna); 124 | out_sequence->setString(dna); 125 | } 126 | } 127 | 128 | // make a child index map (in -> out) for the root genome 129 | // assume : all genomes in in_genome present in out_genome 130 | vector in_ci_to_out_ci(in_root->getNumChildren()); 131 | for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { 132 | in_ci_to_out_ci.at(in_root->getChildIndex(in_alignment->openGenome(in_child_name))) = 133 | out_root->getChildIndex(out_alignment->openGenome(in_child_name)); 134 | } 135 | 136 | // copy over the bottom segments of the root 137 | if (progress) { 138 | cerr << "[halMergeChroms]: copying bottom segments for " << in_root->getName() << " from " << in_paths[i] 139 | << " with bseg offset " << bot_offset << endl; 140 | } 141 | BottomSegmentIteratorPtr in_bi = in_root->getBottomSegmentIterator(0); 142 | BottomSegmentIteratorPtr out_bi = out_root->getBottomSegmentIterator(bot_offset); 143 | for (;!in_bi->atEnd(); in_bi->toRight(), out_bi->toRight()) { 144 | // set the segment in the root genome 145 | assert(out_bi->bseg()->getArrayIndex() == in_bi->bseg()->getArrayIndex() + bot_offset); 146 | assert(out_bi->bseg()->getNumChildren() >= in_bi->bseg()->getNumChildren()); 147 | out_bi->bseg()->setTopParseIndex(NULL_INDEX); 148 | // determine the sequence-relative coordinate in the input 149 | const Sequence* in_sequence = in_bi->bseg()->getSequence(); 150 | int64_t in_start_coord = in_bi->bseg()->getStartPosition() - in_sequence->getStartPosition(); 151 | assert(in_start_coord >= 0 && in_start_coord < in_sequence->getSequenceLength()); 152 | // set 
the sequence relative coordinate in the output 153 | const Sequence* out_sequence = out_root->getSequence(anc_seq_name(in_sequence->getName(), i)); 154 | int64_t out_start_coord = out_sequence->getStartPosition() + in_start_coord; 155 | assert(out_start_coord >= 0 && out_start_coord < out_root->getSequenceLength()); 156 | out_bi->bseg()->setCoordinates(out_start_coord, in_bi->bseg()->getLength()); 157 | // set the segment in the child genomes 158 | for (size_t out_ci = 0; out_ci < out_root_degree; ++out_ci) { 159 | out_bi->bseg()->setChildIndex(out_ci, NULL_INDEX); 160 | } 161 | for (size_t in_ci = 0; in_ci < in_root_degree; ++in_ci) { 162 | size_t out_ci = in_ci_to_out_ci.at(in_ci); 163 | assert(out_ci < out_bi->bseg()->getNumChildren()); 164 | if (in_bi->bseg()->hasChild(in_ci)) { 165 | out_bi->bseg()->setChildIndex(out_ci, in_bi->bseg()->getChildIndex(in_ci) + top_offsets[out_ci]); 166 | out_bi->bseg()->setChildReversed(out_ci, in_bi->bseg()->getChildReversed(in_ci)); 167 | } 168 | } 169 | } 170 | 171 | // for every child genome, copy over the top segments 172 | for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { 173 | if (progress) { 174 | cerr << "[halMergeChroms]: copying top segments for " << in_child_name << " from " << in_paths[i] << endl; 175 | } 176 | const Genome* in_child = in_alignment->openGenome(in_child_name); 177 | Genome* out_child = out_alignment->openGenome(in_child_name); 178 | 179 | size_t in_ci = in_root->getChildIndex(in_child); 180 | size_t out_ci = in_ci_to_out_ci[in_ci]; 181 | size_t top_offset = top_offsets[out_ci]; 182 | TopSegmentIteratorPtr in_ti = in_child->getTopSegmentIterator(0); 183 | TopSegmentIteratorPtr out_ti = out_child->getTopSegmentIterator(top_offsets[out_ci]); 184 | 185 | for (;!in_ti->atEnd(); in_ti->toRight(), out_ti->toRight()) { 186 | // set the segment in the child genome 187 | assert(out_ti->tseg()->getArrayIndex() == in_ti->tseg()->getArrayIndex() + top_offset); 188 | if 
(in_ti->tseg()->hasParent()) { 189 | out_ti->tseg()->setParentIndex(in_ti->tseg()->getParentIndex() + bot_offset); 190 | out_ti->tseg()->setParentReversed(in_ti->tseg()->getParentReversed()); 191 | } else { 192 | out_ti->tseg()->setParentIndex(NULL_INDEX); 193 | } 194 | out_ti->tseg()->setBottomParseIndex(NULL_INDEX); 195 | // determine the sequence-relative coordinate in the input 196 | const Sequence* in_sequence = in_ti->tseg()->getSequence(); 197 | int64_t in_start_coord = in_ti->tseg()->getStartPosition() - in_sequence->getStartPosition(); 198 | // set the sequence relative coordinate in the output 199 | const Sequence* out_sequence = out_child->getSequence(in_sequence->getName()); 200 | int64_t out_start_coord = out_sequence->getStartPosition() + in_start_coord; 201 | out_ti->tseg()->setCoordinates(out_start_coord, in_ti->tseg()->getLength()); 202 | // set the paralogy edge 203 | if (in_ti->tseg()->hasNextParalogy()) { 204 | out_ti->tseg()->setNextParalogyIndex(in_ti->tseg()->getNextParalogyIndex() + top_offset); 205 | } else { 206 | out_ti->tseg()->setNextParalogyIndex(NULL_INDEX); 207 | } 208 | } 209 | } 210 | 211 | // update the offsets to move past the current alignment in all genomes 212 | bot_offset += in_root->getNumBottomSegments(); 213 | for (const string& in_child_name : in_alignment->getChildNames(in_root->getName())) { 214 | const Genome* in_child = in_alignment->openGenome(in_child_name); 215 | size_t in_ci = in_root->getChildIndex(in_child); 216 | size_t out_ci = in_ci_to_out_ci[in_ci]; 217 | top_offsets[out_ci] += in_child->getNumTopSegments(); 218 | } 219 | } 220 | } 221 | 222 | int main(int argc, char** argv) { 223 | CLParser optionsParser(WRITE_ACCESS); 224 | initParser(&optionsParser); 225 | string in_hal_paths; 226 | string out_hal_path; 227 | bool progress; 228 | try { 229 | optionsParser.parseOptions(argc, argv); 230 | in_hal_paths = optionsParser.getArgument("inFiles"); 231 | out_hal_path = optionsParser.getArgument("outFile"); 232 | 
progress = optionsParser.getFlag("progress"); 233 | } 234 | catch(exception& e) { 235 | cerr << e.what() << endl; 236 | optionsParser.printUsage(cerr); 237 | exit(1); 238 | } 239 | 240 | vector in_paths = chopString(in_hal_paths, ","); 241 | 242 | // map genome -> dimensions for each input alignment 243 | if (progress) { 244 | cerr << "[halMergeChroms]: Scanning dimensions of " << in_paths.size() << " input files." << endl; 245 | } 246 | pair>> rd = get_hal_dimensions(&optionsParser, in_paths); 247 | string& root_name = rd.first; 248 | unordered_map>& dimensions = rd.second; 249 | 250 | // create the new file 251 | if (progress) { 252 | cerr << "[halMergeChroms]: Creating empty alignment: " << out_hal_path << endl; 253 | } 254 | AlignmentPtr alignment(openHalAlignment(out_hal_path, &optionsParser, READ_ACCESS | WRITE_ACCESS | CREATE_ACCESS)); 255 | 256 | // set up the size of each genome, staring with the root 257 | Genome* root_genome = alignment->addRootGenome(root_name); 258 | for (auto& kv : dimensions) { 259 | if (kv.first != root_name) { 260 | Genome* leaf_genome = alignment->addLeafGenome(kv.first, root_name, 1); 261 | leaf_genome->setDimensions(kv.second); 262 | } 263 | } 264 | // important to set root dimensions after adding leaves so bottom segments have right number of slots 265 | root_genome->setDimensions(dimensions.at(root_name)); 266 | 267 | // copy over over everything 268 | merge_hals(&optionsParser, alignment, in_paths, progress); 269 | 270 | if (progress) { 271 | cerr << "[halMergeChroms]: Writing merged alignment" << endl; 272 | } 273 | 274 | return 0; 275 | } 276 | 277 | -------------------------------------------------------------------------------- /halRemoveDupes.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2016 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | //#define debug 8 | 9 | #include 10 | #include 11 | 
#include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "hal.h" 17 | 18 | using namespace std; 19 | using namespace hal; 20 | 21 | static void initParser(CLParser* optionsParser) { 22 | optionsParser->addArgument("halFile", "input hal file"); 23 | optionsParser->addArgument("genome", "remove all paralogy edges from this genome"); 24 | optionsParser->setDescription("Remove paralogy edges from given genome (in place)"); 25 | } 26 | 27 | int main(int argc, char** argv) { 28 | CLParser optionsParser(WRITE_ACCESS); 29 | initParser(&optionsParser); 30 | string halPath; 31 | string genomeName; 32 | try { 33 | optionsParser.parseOptions(argc, argv); 34 | halPath = optionsParser.getArgument("halFile"); 35 | genomeName = optionsParser.getArgument("genome"); 36 | } 37 | catch(exception& e) { 38 | cerr << e.what() << endl; 39 | optionsParser.printUsage(cerr); 40 | exit(1); 41 | } 42 | try { 43 | AlignmentPtr alignment(openHalAlignment(halPath, &optionsParser, READ_ACCESS | WRITE_ACCESS)); 44 | if (alignment->getNumGenomes() == 0) { 45 | throw hal_exception("input hal alignmenet is empty"); 46 | } 47 | 48 | Genome* genome = alignment->openGenome(genomeName); 49 | if (genome == NULL) { 50 | throw hal_exception("Genome " + genomeName + " not found in alignment"); 51 | } 52 | 53 | if (genomeName == alignment->getRootName()) { 54 | throw hal_exception("Cannot run on root"); 55 | } 56 | 57 | TopSegmentIteratorPtr topIt = genome->getTopSegmentIterator(); 58 | 59 | size_t total_length = 0; 60 | size_t total_edges = 0; 61 | for (; not topIt->atEnd(); topIt->toRight()) { 62 | TopSegment* topSeg = topIt->tseg(); 63 | if (topSeg->hasNextParalogy()) { 64 | topSeg->setNextParalogyIndex(NULL_INDEX); 65 | if (!topSeg->isCanonicalParalog()) { 66 | topSeg->setParentIndex(NULL_INDEX); 67 | total_length += topSeg->getLength(); 68 | ++total_edges; 69 | } 70 | } 71 | } 72 | 73 | if (total_length > 0) { 74 | cerr << "[halRemoveDupes]: " << total_edges << " paralogy edges removed from " 
<< genomeName 75 | << " with total length " << total_length << endl; 76 | } else { 77 | cerr << "[halRemoveDupes] : No paralogy edges found in " << genomeName << endl; 78 | } 79 | } 80 | 81 | catch(exception& e) { 82 | cerr << e.what() << endl; 83 | exit(1); 84 | } 85 | 86 | return 0; 87 | } 88 | -------------------------------------------------------------------------------- /halUnclip.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2016 by Glenn Hickey (hickey@soe.ucsc.edu) 3 | * 4 | * Released under the MIT license, see LICENSE.txt 5 | */ 6 | 7 | // Convert clipped sequences (like chr1_sub_110000_22220000) back to their original states 8 | 9 | //#define debug 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "hal.h" 21 | #include "commonC.h" 22 | #include "bioioC.h" 23 | #include "subpaths.h" 24 | 25 | using namespace std; 26 | using namespace hal; 27 | 28 | static void initParser(CLParser* optionsParser) { 29 | optionsParser->addArgument("inFile", "input HAL file"); 30 | optionsParser->addArgument("seqFile", "cactus-style seqfile. 1st col=genome name, 2nd col=(original) fasta file. only local paths supported"); 31 | optionsParser->addArgument("outFile", "output HAL file"); 32 | optionsParser->addOptionFlag("progress", 33 | "show progress", 34 | false); 35 | optionsParser->addOptionFlag("validate", 36 | "run a (non-exhaustive) check on the output", 37 | false); 38 | optionsParser->addOption("targetGenomes", 39 | "comma-separated (no spaces) list of target genomes " 40 | "(others are not unclipped) (all leaves if empty)", 41 | "\"\""); 42 | optionsParser->setDescription("Fill back clipped sequence (removed by cactus-preprocess) using the original fasta files" 43 | ". 
Star trees only"); 44 | } 45 | 46 | static vector split_delims(const string &s, const string& delims) { 47 | vector elems; 48 | size_t start = string::npos; 49 | for (size_t i = 0; i < s.size(); ++i) { 50 | if (delims.find(s[i]) != string::npos) { 51 | if (start != string::npos && i > start) { 52 | elems.push_back(s.substr(start, i - start)); 53 | } 54 | start = string::npos; 55 | } else if (start == string::npos) { 56 | start = i; 57 | } 58 | } 59 | if (start != string::npos && start < s.size()) { 60 | elems.push_back(s.substr(start, s.size() - start)); 61 | } 62 | return elems; 63 | } 64 | 65 | // c++ wrapper for sonlib -- load fasta file into memory 66 | static unordered_map> read_fasta(const string& fa_path) { 67 | FILE* fa_file = fopen(fa_path.c_str(), "r"); 68 | if (!fa_file) { 69 | cerr << "Unable to open fastat file: " << fa_path << endl; 70 | exit(1); 71 | } 72 | 73 | List* seqs = constructEmptyList(0, free); 74 | List* seq_lens = constructEmptyList(0, free); 75 | List* seq_names = constructEmptyList(0, free); 76 | 77 | fastaRead(fa_file, seqs, seq_lens, seq_names); 78 | 79 | // todo: should be done once, but sonlib fasta reading so slow it odesn't matter 80 | vector cmap(numeric_limits::max()); 81 | for (unsigned char i = 0; i < cmap.size(); ++i) { 82 | switch (i) { 83 | case 'a': 84 | case 'c': 85 | case 'g': 86 | case 't': 87 | case 'A': 88 | case 'C': 89 | case 'G': 90 | case 'T': 91 | cmap[i] = i; 92 | break; 93 | default: 94 | cmap[i] = 'N'; 95 | break; 96 | } 97 | } 98 | 99 | unordered_map> fa_info; 100 | for (int64_t i = 0; i < seqs->length; ++i) { 101 | string name = (char*)seq_names->list[i]; 102 | size_t len = (size_t)listGetInt(seq_lens, i); 103 | string seq = (char*)seqs->list[i]; 104 | for (size_t j = 0; j < seq.length(); ++j) { 105 | // hal doesn't like non-acgtn characters 106 | seq[j] = cmap[seq[j]]; 107 | } 108 | fa_info[name] = make_pair(len, seq); 109 | } 110 | 111 | destructList(seqs); 112 | destructList(seq_lens); 113 | 
destructList(seq_names); 114 | 115 | fclose(fa_file); 116 | 117 | return fa_info; 118 | } 119 | 120 | // do a pass over the seqfile to get the total lengths of every sequence 121 | static unordered_map get_dimensions_from_seqfile(const string& seqfile_path, const unordered_set& target_set) { 122 | unordered_map seq_map; 123 | 124 | ifstream seqfile(seqfile_path); 125 | if (!seqfile) { 126 | cerr << "[halUnclip]: Unable to open seqfile: " << seqfile_path << endl; 127 | exit(1); 128 | } 129 | 130 | string buffer; 131 | while (getline(seqfile, buffer)) { 132 | vector toks = split_delims(buffer, " \t"); 133 | if (toks.size() == 2) { 134 | string name = toks[0]; 135 | string fa_path = toks[1]; 136 | if (target_set.count(name)) { 137 | unordered_map> fa_info = read_fasta(fa_path); 138 | for (auto& fi : fa_info) { 139 | seq_map[name + "." + fi.first] = fi.second.first; 140 | } 141 | } 142 | } 143 | } 144 | 145 | return seq_map; 146 | } 147 | 148 | static unordered_map> get_filled_dimensions(AlignmentConstPtr alignment, unordered_map& seq_d, 149 | const unordered_set& target_set, bool progress) { 150 | 151 | unordered_map> dim_map; 152 | 153 | // copy root exactly as is 154 | vector& root_dims = dim_map[alignment->getRootName()]; 155 | const Genome* root_genome = alignment->openGenome(alignment->getRootName()); 156 | for (SequenceIteratorPtr seqIt = root_genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { 157 | const Sequence *sequence = seqIt->getSequence(); 158 | root_dims.push_back(Sequence::Info(sequence->getName(), sequence->getSequenceLength(), sequence->getNumTopSegments(), sequence->getNumBottomSegments())); 159 | } 160 | 161 | vector names = alignment->getChildNames(alignment->getRootName()); 162 | 163 | for (const string& name : names) { 164 | const Genome* genome = alignment->openGenome(name); 165 | vector& dimensions = dim_map[name]; 166 | if (progress) { 167 | cerr << "[halUnclip]: Scanning dimensions of genome " << genome->getName() << endl; 
168 | } 169 | 170 | // map base name to sequence fragments 171 | unordered_map> frag_map; 172 | 173 | // pass 1, map all hal sequences back to their base name and check that they correspond to a fasta sequence 174 | for (SequenceIteratorPtr seqIt = genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { 175 | const Sequence *sequence = seqIt->getSequence(); 176 | string sequence_name = sequence->getName(); 177 | string parsed_name = target_set.count(genome->getName()) ? parse_subpath_name(sequence_name) : sequence_name; 178 | string full_name = genome->getName() + "." + parsed_name; 179 | size_t fa_len = sequence->getSequenceLength(); 180 | if (!seq_d.count(full_name)) { 181 | if (parsed_name != sequence_name) { 182 | cerr << "[halUnclip]: Unable to find sequence (from HAL) " << full_name << " in dimension map from input fasta" << endl; 183 | exit(1); 184 | } 185 | seq_d[full_name] = fa_len; 186 | } else { 187 | fa_len = seq_d.at(full_name); 188 | } 189 | if (parsed_name == sequence_name && sequence->getSequenceLength() != fa_len) { 190 | cerr << "[halUnclip]: Sequence " << full_name << " has len=" << fa_len << " in fasta but len=" << sequence->getSequenceLength() << " in hal" << endl; 191 | exit(1); 192 | } 193 | if (parsed_name != sequence_name && sequence->getSequenceLength() > fa_len) { 194 | cerr << "[halUnclip]: Sequence " << sequence->getFullName() << " has len=" << fa_len << " in fasta but len=" << sequence->getSequenceLength() << " in hal" << endl; 195 | exit(1); 196 | } 197 | 198 | frag_map[parsed_name].push_back(sequence); 199 | } 200 | 201 | // pass 2: compute the dimensions for each base sequence 202 | for (auto& nf : frag_map) { 203 | const string& base_name = nf.first; 204 | string full_name = genome->getName() + "." 
// Copy all segments from the clipped input alignment into the pre-dimensioned
// output alignment, inserting unaligned "gap" top segments wherever the input
// fragments do not cover the full original sequence.  Root bottom segments are
// copied last, with child indexes remapped through old_to_new_tsai_vec.
static void copy_and_fill(AlignmentConstPtr in_alignment, AlignmentPtr out_alignment, const unordered_map<string, size_t>& seq_dims,
                          const unordered_set<string>& target_set, bool progress) {

    const Genome* in_root_genome = in_alignment->openGenome(in_alignment->getRootName());
    Genome* out_root_genome = out_alignment->openGenome(in_alignment->getRootName());

    vector<string> names = in_alignment->getChildNames(in_alignment->getRootName());
    // with a lot of children, the bottom segments are unwieldy. they play havoc with default settings (chunk=1000 is too small)
    // and are terribly slow even with tuning (except inmemory). so we load up everything we need in this structure in memory
    // so that the bottom segments can be set in a single pass
    // old_to_new_tsai_vec[child][old top segment array index] -> new array index
    vector<vector<hal_index_t>> old_to_new_tsai_vec(names.size());

    for (const string& name : names) {
        if (progress) {
            cerr << "[halUnclip]: Copying segments of " << name << flush;
        }

        const Genome* in_genome = in_alignment->openGenome(name);
        Genome* out_genome = out_alignment->openGenome(name);
        hal_index_t out_child_no = out_root_genome->getChildIndex(out_genome);
        hal_index_t in_child_no = in_root_genome->getChildIndex(in_genome);
        assert(in_child_no == out_child_no);

        // map base name to sequence fragments
        // todo: same thing done in above function -- generalize?
        unordered_map<string, vector<const Sequence*>> frag_map;

        // pass 1, map all hal sequences back to their base name and check that they correspond to a fasta sequence
        if (progress) {
            cerr << " [pass 1]" << flush;
        }
        for (SequenceIteratorPtr seqIt = in_genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) {
            const Sequence* sequence = seqIt->getSequence();
            string sequence_name = sequence->getName();
            string parsed_name = target_set.count(name) ? parse_subpath_name(sequence_name) : sequence_name;
            frag_map[parsed_name].push_back(sequence);
        }

        // pass 2, copy each sequence fragment by fragment
        if (progress) {
            cerr << " [pass 2]" << flush;
        }
        vector<hal_index_t>& old_to_new_tsai = old_to_new_tsai_vec[out_child_no];
        old_to_new_tsai.resize(in_genome->getNumTopSegments(), NULL_INDEX);

        for (auto& nf : frag_map) {
            const string& base_name = nf.first;
            vector<const Sequence*>& frags = nf.second;

            // sort the fragments by start position
            map<int64_t, const Sequence*> start_to_frag;
            for (const Sequence* frag : frags) {
                int64_t start = -1;
                string parsed_name = target_set.count(name) ? parse_subpath_name(frag->getName(), &start) : frag->getName();
                if (start == -1) {
                    // no _sub_ suffix: the single fragment covers the whole sequence
                    start = 0;
                    assert(frags.size() == 1);
                }
                start_to_frag[start] = frag;
            }

            // the one output sequence that corresponds to the list of fragments in the input
            Sequence* out_sequence = out_genome->getSequence(base_name);
            assert(out_sequence != nullptr);
            TopSegmentIteratorPtr out_top = out_sequence->getTopSegmentIterator();
            TopSegment* ts;

            int64_t cur_pos = 0; //position in out_sequence
            int64_t out_start = out_sequence->getStartPosition(); //offset needed when setting coordinates in out_top

            // visit the ordered input sequence fragments that correspond to out_sequence
            for (auto i = start_to_frag.begin(); i != start_to_frag.end(); ++i) {
                const Sequence* in_sequence_frag = i->second;
                int64_t frag_start = i->first;
                if (frag_start > cur_pos) {
                    // need to add a gap *before* this fragment: one unaligned top segment
                    ts = out_top->tseg();
                    ts->setCoordinates(cur_pos + out_start, frag_start - cur_pos);
                    ts->setParentIndex(NULL_INDEX);
                    ts->setNextParalogyIndex(NULL_INDEX);
                    ts->setBottomParseIndex(NULL_INDEX);
#ifdef debug
                    cerr << "cur_pos=" << cur_pos << flush;
#endif
                    cur_pos += ts->getLength();
#ifdef debug
                    cerr << " after adding start gap cur_pos=" << cur_pos << " (frag name=" << in_sequence_frag->getName() << " fragstart=" << frag_start << ")" << endl;
#endif
                    out_top->toRight();
                }
#ifdef debug
                cerr << "frag " << in_sequence_frag->getFullName() << " has " << in_sequence_frag->getNumTopSegments() << " topsegs which will map to range "
                     << out_sequence->getTopSegmentIterator()->tseg()->getArrayIndex() << " - "
                     << (out_sequence->getTopSegmentIterator()->tseg()->getArrayIndex() + in_sequence_frag->getNumTopSegments()) << endl;
#endif
                // copy the fragment. note that the ancestor coordinates haven't changed
                // any, so those coordinates can go directly
                TopSegmentIteratorPtr frag_top = in_sequence_frag->getTopSegmentIterator();
                size_t frag_top_count = in_sequence_frag->getNumTopSegments();
                for (size_t frag_top_i = 0; frag_top_i < frag_top_count; ++frag_top_i) {
                    ts = out_top->tseg();
                    ts->setCoordinates(out_start + cur_pos, frag_top->tseg()->getLength());
                    ts->setParentIndex(frag_top->tseg()->getParentIndex());
                    ts->setParentReversed(frag_top->tseg()->getParentReversed());

                    // set the bad value from input alignment, to be updated later when we have map (pass 3)
                    ts->setNextParalogyIndex(frag_top->tseg()->getNextParalogyIndex());
                    ts->setBottomParseIndex(NULL_INDEX);
#ifdef debug
                    cerr << "cur_pos=" << cur_pos << flush;
#endif
                    cur_pos += ts->getLength();
#ifdef debug
                    cerr << " after adding frag_ts " << frag_top_i << " cur_pos=" << cur_pos << endl;
#endif
                    // remember where this input top segment landed in the output array
                    old_to_new_tsai[frag_top->tseg()->getArrayIndex()] = ts->getArrayIndex();
                    frag_top->toRight();
                    out_top->toRight();
                }
            }
            if (cur_pos < (int64_t)out_sequence->getSequenceLength()) {
                // need to add a gap *after* the last fragment
                ts = out_top->tseg();
                ts->setCoordinates(out_start + cur_pos, (int64_t)out_sequence->getSequenceLength() - cur_pos);
                ts->setParentIndex(NULL_INDEX);
                ts->setNextParalogyIndex(NULL_INDEX);
                ts->setBottomParseIndex(NULL_INDEX);
#ifdef debug
                cerr << "cur_pos=" << cur_pos << flush;
#endif
                cur_pos += ts->getLength();
#ifdef debug
                cerr << " after adding end gap cur_pos=" << cur_pos << endl;
#endif
                out_top->toRight();
            }
            // sanity check: fragments + gaps must exactly tile the output sequence
            if (cur_pos != (int64_t)out_sequence->getSequenceLength()) {
                cerr << "[halUnclip]: sanity check fail for sequence " << name << "." << base_name << ". The offset after conversion is "
                     << cur_pos << " which is different than the sequence length of " << out_sequence->getSequenceLength() << endl
                     << "[halUnclip]: the fragments are\n";
                for (size_t i = 0; i < frags.size(); ++i) {
                    const Sequence* in_sequence_frag = frags[i];
                    cerr << " " << in_sequence_frag->getName() << " len=" << in_sequence_frag->getSequenceLength() << endl;
                }
            }
            assert(cur_pos == (int64_t)out_sequence->getSequenceLength());
            assert(out_top->getArrayIndex() == out_sequence->getTopSegmentIterator()->getArrayIndex() + (int64_t)out_sequence->getNumTopSegments());
        }

        //pass 3: set the paralogy indexes (rewrites the stale input-array
        //indexes stored in pass 2 using the old->new map)
        if (progress) {
            cerr << " [pass 3]" << endl;
        }
        TopSegment* ts;
        for (TopSegmentIteratorPtr out_topit = out_genome->getTopSegmentIterator(); !out_topit->atEnd(); out_topit->toRight()) {
            ts = out_topit->tseg();
            if (ts->hasNextParalogy()) {
                ts->setNextParalogyIndex(old_to_new_tsai[ts->getNextParalogyIndex()]);
            }
        }

        in_alignment->closeGenome(in_genome);
        out_alignment->closeGenome(out_genome);
    }

    // copy the root
    if (progress) {
        cerr << "[halUnclip]: Copying root segments" << endl;
    }
    BottomSegmentIteratorPtr in_botit = in_root_genome->getBottomSegmentIterator();
    BottomSegmentIteratorPtr out_botit = out_root_genome->getBottomSegmentIterator();
    assert(in_root_genome->getNumBottomSegments() == out_root_genome->getNumBottomSegments());
    assert(in_root_genome->getNumChildren() == out_root_genome->getNumChildren());
    size_t num_bottom = in_root_genome->getNumBottomSegments();
    size_t num_children = in_root_genome->getNumChildren();
    for (size_t i = 0; i < num_bottom; ++i) {
        BottomSegment* in_bs = in_botit->bseg();
        BottomSegment* out_bs = out_botit->bseg();
        out_bs->setCoordinates(in_bs->getStartPosition(), in_bs->getLength());
        for (size_t j = 0; j < num_children; ++j) {
            // everything's the same except the child index, which gets mapped via old_to_new_tsai_vec
            hal_index_t in_ci = in_bs->getChildIndex(j);
            hal_index_t out_ci = in_ci != NULL_INDEX ? old_to_new_tsai_vec[j][in_ci] : in_ci;
            out_bs->setChildIndex(j, out_ci);
            out_bs->setChildReversed(j, in_bs->getChildReversed(j));
        }
        out_bs->setTopParseIndex(NULL_INDEX);
        in_botit->toRight();
        out_botit->toRight();
    }
}
" << flush; 463 | } 464 | unordered_map> fa_info = read_fasta(fa_path); 465 | if (progress) { 466 | cerr << "and setting dna strings in output genome" << endl; 467 | } 468 | Genome* genome = out_alignment->openGenome(name); 469 | assert(genome != nullptr); 470 | for (auto& fi : fa_info) { 471 | Sequence* sequence = genome->getSequence(fi.first); 472 | if (sequence != nullptr) { 473 | assert(sequence->getSequenceLength() == fi.second.first); 474 | sequence->setString(fi.second.second); 475 | } 476 | } 477 | } 478 | } 479 | } 480 | 481 | // if there's no _sub sequences found, a genome is allowed to not be in the sequence map 482 | // this is generally the case for the root, but could be the minigraph contigs 483 | vector names = in_alignment->getChildNames(in_alignment->getRootName()); 484 | names.push_back(in_alignment->getRootName()); 485 | for (const string& name : names) { 486 | if (!done_set.count(name)) { 487 | if (progress) { 488 | cerr << "[halUnclip]: Directly copying dna strings for " << name << endl; 489 | }; 490 | const Genome* in_genome = in_alignment->openGenome(name); 491 | Genome* out_genome = out_alignment->openGenome(name); 492 | for (SequenceIteratorPtr seqIt = in_genome->getSequenceIterator(); not seqIt->atEnd(); seqIt->toNext()) { 493 | const Sequence* in_sequence = seqIt->getSequence(); 494 | Sequence* out_sequence = out_genome->getSequence(in_sequence->getName()); 495 | in_sequence->getString(buffer); 496 | out_sequence->setString(buffer); 497 | } 498 | if (name != in_alignment->getRootName()) { 499 | in_alignment->closeGenome(in_genome); 500 | out_alignment->closeGenome(out_genome); 501 | } 502 | } 503 | } 504 | } 505 | 506 | 507 | // root->leaf alignments are consistent 508 | static void validate_alignments(AlignmentConstPtr in_alignment, AlignmentPtr out_alignment) { 509 | 510 | validateAlignment(out_alignment.get()); 511 | 512 | const Genome* in_root_genome = in_alignment->openGenome(in_alignment->getRootName()); 513 | Genome* 
out_root_genome = out_alignment->openGenome(in_alignment->getRootName()); 514 | assert(in_root_genome->getNumBottomSegments() == out_root_genome->getNumBottomSegments()); 515 | assert(in_root_genome->getNumChildren() == out_root_genome->getNumChildren()); 516 | // we go by genome (instead of segment) to hopefully be cache-friendlier 517 | for (size_t j = 0; j < in_root_genome->getNumChildren(); ++j) { 518 | const Genome* in_genome = in_root_genome->getChild(j); 519 | Genome* out_genome = out_root_genome->getChild(j); 520 | BottomSegmentIteratorPtr in_botit = in_root_genome->getBottomSegmentIterator(); 521 | BottomSegmentIteratorPtr out_botit = out_root_genome->getBottomSegmentIterator(); 522 | TopSegmentIteratorPtr in_topit = in_genome->getTopSegmentIterator(); 523 | TopSegmentIteratorPtr out_topit = out_genome->getTopSegmentIterator(); 524 | for (size_t i = 0; i < in_genome->getNumBottomSegments(); ++i) { 525 | in_topit->toChild(in_botit, j); 526 | out_topit->toChild(out_botit, j); 527 | 528 | string s1, s2; 529 | if (j == 0) { 530 | in_botit->getString(s1); 531 | out_botit->getString(s2); 532 | assert(s1 == s2); 533 | } 534 | in_topit->getString(s1); 535 | out_topit->getString(s2); 536 | assert(s1 == s2); 537 | 538 | string in_seq_name = in_topit->tseg()->getSequence()->getName(); 539 | string out_seq_name = out_topit->tseg()->getSequence()->getName(); 540 | int64_t start; 541 | string in_base_name = parse_subpath_name(in_seq_name, &start); 542 | assert(in_base_name == out_seq_name); 543 | assert(in_topit->getReversed() == out_topit->getReversed()); 544 | if (!in_topit->getReversed()) { 545 | // punt on reverse check for now 546 | assert(in_topit->getStartPosition() + start == out_topit->getStartPosition()); 547 | } 548 | 549 | in_botit->toRight(); 550 | out_botit->toRight(); 551 | } 552 | in_alignment->closeGenome(in_genome); 553 | out_alignment->closeGenome(out_genome); 554 | } 555 | } 556 | 557 | int main(int argc, char** argv) { 558 | CLParser 
// Driver: parse options, open input/output alignments, compute unclipped
// dimensions, build the output skeleton (root + leaves), then copy segments,
// restore dna from the fastas and optionally validate the result.
int main(int argc, char** argv) {
    CLParser optionsParser(WRITE_ACCESS);
    initParser(&optionsParser);
    string in_hal_path;
    string out_hal_path;
    string seqfile_path;
    string target_genomes;
    bool progress;
    bool validate;
    try {
        optionsParser.parseOptions(argc, argv);
        in_hal_path = optionsParser.getArgument("inFile");
        seqfile_path = optionsParser.getArgument("seqFile");
        out_hal_path = optionsParser.getArgument("outFile");
        target_genomes = optionsParser.getOption("targetGenomes");
        progress = optionsParser.getFlag("progress");
        validate = optionsParser.getFlag("validate");
    }
    catch(exception& e) {
        cerr << e.what() << endl;
        optionsParser.printUsage(cerr);
        exit(1);
    }

    // load the input genome
    if (progress) {
        cerr << "[halUnclip]: Opening input alignment" << endl;
    }
    AlignmentConstPtr in_alignment(openHalAlignment(in_hal_path, &optionsParser, READ_ACCESS));

    // and the output genome
    if (progress) {
        cerr << "[halUnclip]: Creating output alignment object" << endl;
    }
    AlignmentPtr out_alignment(openHalAlignment(out_hal_path, &optionsParser, READ_ACCESS | WRITE_ACCESS | CREATE_ACCESS));

    // check the targets, defaulting to all leaves
    // (the option default is the literal two-character string "" -- see initParser)
    vector<string> target_names;
    if (target_genomes != "\"\"") {
        target_names = chopString(target_genomes, ",");
        for (const string& name : target_names) {
            const Genome* genome = in_alignment->openGenome(name);
            if (genome == nullptr) {
                cerr << "[halUnclip]: Target genome " << name << " not present in input HAL" << endl;
                exit(1);
            }
            in_alignment->closeGenome(genome);
        }
    } else {
        target_names = in_alignment->getChildNames(in_alignment->getRootName());
    }
    unordered_set<string> target_set(target_names.begin(), target_names.end());

    // and load the fasta sequence sizes from the seqfile
    if (progress) {
        cerr << "[halUnclip]: Reading fasta dimensions from seqfile" << endl;
    }
    unordered_map<string, size_t> seq_dims = get_dimensions_from_seqfile(seqfile_path, target_set);

    if (progress) {
        cerr << "[halUnclip]: Computing new hal dimensions" << endl;
    }
    unordered_map<string, vector<Sequence::Info>> dimensions = get_filled_dimensions(in_alignment, seq_dims, target_set, progress);

    // set up the size of each genome, starting with the root
    string root_name = in_alignment->getRootName();
    Genome* root_genome = out_alignment->addRootGenome(root_name);
    // important to visit these in order, so child indexes are preserved
    vector<string> leaf_names = in_alignment->getChildNames(root_name);
    for (const string& leaf_name : leaf_names) {
        vector<Sequence::Info>& leaf_dims = dimensions.at(leaf_name);
        Genome* leaf_genome = out_alignment->addLeafGenome(leaf_name, root_name, 1);
        leaf_genome->setDimensions(leaf_dims);
        if (progress) {
            cerr << "[halUnclip]: Adding leaf genome " << leaf_name << " with length " << leaf_genome->getSequenceLength() << " and " << leaf_genome->getNumTopSegments() << " top segments" << endl;
        }
    }

    // important to set root dimensions after adding leaves so bottom segments have right number of slots
    root_genome->setDimensions(dimensions.at(root_name));
    if (progress) {
        cerr << "[halUnclip]: Adding root genome " << root_name << " with length " << root_genome->getSequenceLength() << " and " << root_genome->getNumBottomSegments() << " bottom segments" << endl;
    }

    // copy over the filled graph
    if (progress) {
        cerr << "[halUnclip]: Copying and filling the graph" << endl;
    }
    copy_and_fill(in_alignment, out_alignment, seq_dims, target_set, progress);

    // add back the fasta sequences
    if (progress) {
        cerr << "[halUnclip]: Adding fasta sequences" << endl;
    }
    add_fasta_sequences(in_alignment, out_alignment, seqfile_path, target_set, progress);

    if (validate) {
        if (progress) {
            cerr << "[halUnclip]: Validating alignment" << endl;
        }
        validate_alignments(in_alignment, out_alignment);
    }

    if (progress) {
        cerr << "[halUnclip]: Writing output alignment" << endl;
    }

    return 0;
}
#pragma once

// Minimal PAF (Pairwise mApping Format) record parsing / serialization.

#include <string>
#include <vector>
#include <cstdint>
#include <cassert>
#include <ostream>

using namespace std;

// One line of a PAF file: the 12 mandatory columns plus the optional
// "cg:Z:" CIGAR tag (empty string when the tag is absent).
struct PafLine {
    string query_name;     // col 1
    int64_t query_len;     // col 2
    int64_t query_start;   // col 3 (0-based, inclusive)
    int64_t query_end;     // col 4 (0-based, exclusive)
    char strand;           // col 5: '+' or '-'
    string target_name;    // col 6
    int64_t target_len;    // col 7
    int64_t target_start;  // col 8
    int64_t target_end;    // col 9
    int64_t num_matching;  // col 10
    int64_t num_bases;     // col 11
    int64_t mapq;          // col 12
    string cigar;          // from the optional cg:Z: tag, if present
};

// Split s on any character in delims, appending the non-empty fields to
// elems (runs of delimiters produce no empty tokens).  Returns elems.
inline vector<string> split_delims(const string& s, const string& delims, vector<string>& elems) {
    size_t start = string::npos;
    for (size_t i = 0; i < s.size(); ++i) {
        if (delims.find(s[i]) != string::npos) {
            if (start != string::npos && i > start) {
                elems.push_back(s.substr(start, i - start));
            }
            start = string::npos;
        } else if (start == string::npos) {
            start = i;
        }
    }
    if (start != string::npos && start < s.size()) {
        elems.push_back(s.substr(start, s.size() - start));
    }
    return elems;
}

// Parse one tab-separated PAF line into a PafLine.  The 12 mandatory
// columns are required; any "cg:Z:" tag among the optional columns is
// captured into paf.cigar.  Asserts (debug builds) on malformed input.
inline PafLine parse_paf_line(const string& paf_line) {
    vector<string> toks;
    split_delims(paf_line, "\t\n", toks);
    // A valid PAF line has exactly 12 mandatory columns; optional
    // SAM-style tags may follow.  (Was "> 12", which wrongly rejected
    // tag-less lines.)
    assert(toks.size() >= 12);

    PafLine paf;
    paf.query_name = toks[0];
    paf.query_len = stol(toks[1]);
    paf.query_start = stol(toks[2]);
    paf.query_end = stol(toks[3]);
    assert(toks[4] == "+" || toks[4] == "-");
    paf.strand = toks[4][0];
    paf.target_name = toks[5];
    paf.target_len = stol(toks[6]);
    paf.target_start = stol(toks[7]);
    paf.target_end = stol(toks[8]);
    paf.num_matching = stol(toks[9]);
    paf.num_bases = stol(toks[10]);
    paf.mapq = stol(toks[11]);

    // Scan optional tags for the CIGAR.  "cg:Z:" is 5 characters; the
    // previous compare(0, 3, "cg:Z:") could never return 0 (a 3-char
    // prefix is always shorter than the 5-char argument), so the cigar
    // was silently never parsed.
    for (size_t i = 12; i < toks.size(); ++i) {
        if (toks[i].compare(0, 5, "cg:Z:") == 0) {
            paf.cigar = toks[i].substr(5);
            break;
        }
    }

    return paf;
}

// Write the 12 mandatory PAF columns (no trailing tags, no newline).
inline ostream& operator<<(ostream& os, const PafLine& paf) {
    os << paf.query_name << "\t" << paf.query_len << "\t" << paf.query_start << "\t" << paf.query_end << "\t"
       << string(1, paf.strand) << "\t"
       << paf.target_name << "\t" << paf.target_len << "\t" << paf.target_start << "\t" << paf.target_end << "\t"
       << paf.num_matching << "\t" << paf.num_bases << "\t" << paf.mapq;
    return os;
}
#pragma once

// Helpers for hal2vg's "_sub_<start>_<end>" subpath naming convention.

#include <string>
#include <cstdint>
#include <cassert>

// Strip every trailing "_sub_<start>_<end>" suffix from path_name and
// return the underlying base name.  When out_start/out_end are given,
// they receive the resolved interval on the base path (-1 if the name
// carried no subpath suffix).  Nested suffixes are folded together: the
// final start is the sum of all nested starts, and the final end is that
// offset plus the length of the innermost interval.
inline std::string parse_subpath_name(const std::string& path_name, int64_t* out_start = nullptr, int64_t* out_end = nullptr) {

    std::string trimmed = path_name;
    if (out_start != nullptr) {
        *out_start = -1;
    }
    if (out_end != nullptr) {
        *out_end = -1;
    }

    size_t top_length = 0;  // interval length of the innermost (first-seen) suffix
    size_t offset_sum = 0;  // running sum of nested start offsets
    while (true) {
        const size_t tag_pos = trimmed.rfind("_sub_");
        if (tag_pos == std::string::npos) {
            break;  // no (more) subpath suffixes
        }
        const size_t sep_pos = trimmed.rfind("_");
        if (sep_pos != std::string::npos && sep_pos > tag_pos + 1) {
            int64_t lo;
            int64_t hi;
            try {
                lo = stol(trimmed.substr(tag_pos + 5, sep_pos - tag_pos - 5));
                hi = stol(trimmed.substr(sep_pos + 1));
            } catch (...) {
                // Suffix wasn't numeric after all: treat what's left as the name.
                return trimmed;
            }
            offset_sum += lo;  // final offset is the sum of all nested offsets
            if (top_length == 0) {
                top_length = hi - lo;
                assert(top_length > 0);
            } else {
                // For nested subpaths the end is always derived from the
                // accumulated start plus the innermost interval's length.
                hi = offset_sum + top_length;
            }
            if (out_start != nullptr) {
                *out_start = offset_sum;
            }
            if (out_end != nullptr) {
                *out_end = hi;
            }
            trimmed = trimmed.substr(0, tag_pos);
        }
    }
    return trimmed;
}

// Rewrite "name_sub_<start>_<end>" (possibly nested) in place into the
// bracketed form "name[<start>-<end>]".  Names without a subpath suffix
// are left untouched.
inline void resolve_subpath_naming(std::string& path_name) {
    int64_t range_start;
    int64_t range_end;
    const std::string stripped = parse_subpath_name(path_name, &range_start, &range_end);
    if (range_start != -1) {
        assert(stripped != path_name);
        path_name = stripped + "[" + std::to_string(range_start) + "-" + std::to_string(range_end) + "]";
    }
}
13 | -------------------------------------------------------------------------------- /tests/bash-tap/README.mkdn: -------------------------------------------------------------------------------- 1 | Bash-TAP 2 | ======== 3 | 4 | Bash-TAP allows you to perform TAP-compliant tests within bash 5 | using a similar test syntax to Perl's Test::More and Test::Builder, 6 | suitable to run with `prove` or any other TAP-consuming test harness. 7 | 8 | For more information about TAP (the Test Anything Protocol) visit: 9 | http://testanything.org/ 10 | 11 | Installation and Usage 12 | ---------------------- 13 | 14 | 1. Install the bash-tap files somewhere convenient for you. 15 | The default location of `../../bash-tap` relative to your 16 | test files is the easiest zero-conf way, but you can set 17 | the `$BASH_TAP_ROOT` environment variable if you want to 18 | install elsewhere. 19 | 2. If you're writing tests then copy `bash-tap-bootstrap` 20 | into your tests dir and source it inside your tests with: 21 | 22 | ```bash 23 | . $(dirname $0)/bash-tap-bootstrap 24 | ``` 25 | 26 | 3. Run your tests with `prove my_test_dir` or your favourite 27 | TAP-consuming test harness, or run them manually as a 28 | script if you just want to see the raw TAP output. 29 | 30 | Example test file 31 | ----------------- 32 | 33 | Here's example test file `01_read_rows_from_key_value_lines.t` from 34 | https://github.com/illusori/bash-snippets 35 | 36 | ```bash 37 | #!/bin/bash 38 | 39 | . $(dirname $0)/bash-tap-bootstrap 40 | . 
$(dirname $0)/../read_rows_from_key_value_lines 41 | 42 | columns_per_row=6 43 | max_rows_per_rowset=3 44 | total_rowsets=2 45 | 46 | plan tests $(((columns_per_row * max_rows_per_rowset * total_rowsets) + total_rowsets)) 47 | 48 | # Test data, resultset 1 49 | results1="artist Assemblage 23 50 | track Naked (God Module RMX) 51 | album Addendum 52 | year 2001 53 | rating 80 54 | tracktime 5:22 55 | artist Ayria 56 | track Sapphire 57 | album Debris 58 | year 59 | rating 100 60 | tracktime 6:14 61 | artist Apoptygma Berzerk 62 | track Kathy's Song 63 | album Welcome To Earth \"Extra bit for testing\" 64 | year 65 | rating 100 66 | tracktime 6:35" 67 | 68 | # Test data, resultset 2 69 | results2="artist Colony 5 70 | track The Bottle 71 | album Lifeline 72 | year 73 | rating 80 74 | tracktime 4:34" 75 | 76 | output=$(_read_rows_from_key_value_lines "track" "$results1" 2>&1) 77 | is "$output" "" "Read of rowset 1 should produce no output" 78 | # Since $() runs in a subshell, we need to run it "for real" now 79 | _read_rows_from_key_value_lines "track" "$results1" &>/dev/null 80 | 81 | # Track 1 82 | is "${track_artist[0]}" "Assemblage 23" "rowset 1 track 1 artist" 83 | is "${track_track[0]}" "Naked (God Module RMX)" "rowset 1 track 1 track" 84 | is "${track_album[0]}" "Addendum" "rowset 1 track 1 album" 85 | is "${track_year[0]}" "2001" "rowset 1 track 1 year" 86 | is "${track_rating[0]}" "80" "rowset 1 track 1 rating" 87 | is "${track_tracktime[0]}" "5:22" "rowset 1 track 1 tracktime" 88 | 89 | # Track 2 90 | is "${track_artist[1]}" "Ayria" "rowset 1 track 2 artist" 91 | is "${track_track[1]}" "Sapphire" "rowset 1 track 2 track" 92 | is "${track_album[1]}" "Debris" "rowset 1 track 2 album" 93 | is "${track_year[1]}" "" "rowset 1 track 2 year" 94 | is "${track_rating[1]}" "100" "rowset 1 track 2 rating" 95 | is "${track_tracktime[1]}" "6:14" "rowset 1 track 2 tracktime" 96 | 97 | # Track 3 98 | is "${track_artist[2]}" "Apoptygma Berzerk" "rowset 1 track 3 artist" 99 | 
is "${track_track[2]}" "Kathy's Song" "rowset 1 track 3 track" 100 | is "${track_album[2]}" "Welcome To Earth \"Extra bit for testing\"" "rowset 1 track 3 album" 101 | is "${track_year[2]}" "" "rowset 1 track 3 year" 102 | is "${track_rating[2]}" "100" "rowset 1 track 3 rating" 103 | is "${track_tracktime[2]}" "6:35" "rowset 1 track 3 tracktime" 104 | 105 | output=$(_read_rows_from_key_value_lines "track" "$results2" 2>&1) 106 | is "$output" "" "Read of rowset 2 should produce no output" 107 | # Since $() runs in a subshell, we need to run it "for real now 108 | _read_rows_from_key_value_lines "track" "$results2" &>/dev/null 109 | 110 | # Track 1 111 | is "${track_artist[0]}" "Colony 5" "rowset 2 track 1 artist" 112 | is "${track_track[0]}" "The Bottle" "rowset 2 track 1 track" 113 | is "${track_album[0]}" "Lifeline" "rowset 2 track 1 album" 114 | is "${track_year[0]}" "" "rowset 2 track 1 year" 115 | is "${track_rating[0]}" "80" "rowset 2 track 1 rating" 116 | is "${track_tracktime[0]}" "4:34" "rowset 2 track 1 tracktime" 117 | 118 | # Track 2 119 | is "${track_artist[1]}" "" "rowset 2 track 2 artist" 120 | is "${track_track[1]}" "" "rowset 2 track 2 track" 121 | is "${track_album[1]}" "" "rowset 2 track 2 album" 122 | is "${track_year[1]}" "" "rowset 2 track 2 year" 123 | is "${track_rating[1]}" "" "rowset 2 track 2 rating" 124 | is "${track_tracktime[1]}" "" "rowset 2 track 2 tracktime" 125 | 126 | # Track 3 127 | is "${track_artist[2]}" "" "rowset 2 track 3 artist" 128 | is "${track_track[2]}" "" "rowset 2 track 3 track" 129 | is "${track_album[2]}" "" "rowset 2 track 3 album" 130 | is "${track_year[2]}" "" "rowset 2 track 3 year" 131 | is "${track_rating[2]}" "" "rowset 2 track 3 rating" 132 | is "${track_tracktime[2]}" "" "rowset 2 track 3 tracktime" 133 | ``` 134 | 135 | Running this gives output: 136 | 137 | ``` 138 | $ prove ~/projects/bash-snippets/t 139 | /Users/illusori/projects/bash-snippets/t/01_read_rows_from_key_value_lines.t .. 
ok 140 | All tests successful. 141 | Files=1, Tests=38, 0 wallclock secs ( 0.04 usr 0.00 sys + 0.04 cusr 0.02 csys = 0.10 CPU) 142 | Result: PASS 143 | ``` 144 | 145 | Or the verbose output: 146 | 147 | ``` 148 | $ prove -v ~/projects/bash-snippets/t 149 | /Users/illusori/projects/bash-snippets/t/01_read_rows_from_key_value_lines.t .. 150 | 1..38 151 | ok 1 - Read of rowset 1 should produce no output 152 | ok 2 - rowset 1 track 1 artist 153 | ok 3 - rowset 1 track 1 track 154 | ok 4 - rowset 1 track 1 album 155 | ok 5 - rowset 1 track 1 year 156 | ok 6 - rowset 1 track 1 rating 157 | ok 7 - rowset 1 track 1 tracktime 158 | ok 8 - rowset 1 track 2 artist 159 | ok 9 - rowset 1 track 2 track 160 | ok 10 - rowset 1 track 2 album 161 | ok 11 - rowset 1 track 2 year 162 | ok 12 - rowset 1 track 2 rating 163 | ok 13 - rowset 1 track 2 tracktime 164 | ok 14 - rowset 1 track 3 artist 165 | ok 15 - rowset 1 track 3 track 166 | ok 16 - rowset 1 track 3 album 167 | ok 17 - rowset 1 track 3 year 168 | ok 18 - rowset 1 track 3 rating 169 | ok 19 - rowset 1 track 3 tracktime 170 | ok 20 - Read of rowset 2 should produce no output 171 | ok 21 - rowset 2 track 1 artist 172 | ok 22 - rowset 2 track 1 track 173 | ok 23 - rowset 2 track 1 album 174 | ok 24 - rowset 2 track 1 year 175 | ok 25 - rowset 2 track 1 rating 176 | ok 26 - rowset 2 track 1 tracktime 177 | ok 27 - rowset 2 track 2 artist 178 | ok 28 - rowset 2 track 2 track 179 | ok 29 - rowset 2 track 2 album 180 | ok 30 - rowset 2 track 2 year 181 | ok 31 - rowset 2 track 2 rating 182 | ok 32 - rowset 2 track 2 tracktime 183 | ok 33 - rowset 2 track 3 artist 184 | ok 34 - rowset 2 track 3 track 185 | ok 35 - rowset 2 track 3 album 186 | ok 36 - rowset 2 track 3 year 187 | ok 37 - rowset 2 track 3 rating 188 | ok 38 - rowset 2 track 3 tracktime 189 | ok 190 | All tests successful. 
191 | Files=1, Tests=38, 0 wallclock secs ( 0.04 usr 0.01 sys + 0.04 cusr 0.02 csys = 0.11 CPU) 192 | Result: PASS 193 | ``` 194 | 195 | Mocking with bash-tap-mock 196 | -------------------------- 197 | 198 | Also included in `bash-tap` is a simple function mocking framework 199 | `bash-tap-mock`, it lets you mock commands and functions with 200 | `mock_command` and `restore_mocked_command`. 201 | 202 | If you particularly care to only mock functions rather than commands 203 | (a good safeguard against typos), use `mock_function` and 204 | `restore_mocked_function`, which have some extended error checking 205 | ensuring the function you're mocking exists in the first place. 206 | 207 | An example from https://github.com/illusori/bash-itunes is clearer: 208 | 209 | ```bash 210 | #!/bin/bash 211 | 212 | . $(dirname $0)/bash-tap-bootstrap 213 | . "$BASH_TAP_ROOT/bash-tap-mock" 214 | . $(dirname $0)/../itunes 215 | 216 | plan tests 4 217 | 218 | sent_command='' 219 | function mock_osascript() { 220 | sent_command="$*" 221 | restore_mocked_function "_osascript" 222 | } 223 | mock_function "_osascript" "mock_osascript" 224 | 225 | start_output_capture 226 | _dispatch "stop" 227 | finish_output_capture stdout stderr 228 | 229 | like "$sent_command" 'stop' "sent command should contain 'stop'" 230 | like "$sent_command" 'tell application "iTunes"' "sent command should contain 'tell application \"iTunes\"'" 231 | 232 | is "$stdout" "Stopping iTunes." "stdout should tell user what happened" 233 | is "$stderr" "" "stderr should be empty" 234 | ``` 235 | -------------------------------------------------------------------------------- /tests/bash-tap/bash-tap: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bash_tap_version='1.0.2' 4 | 5 | # Our state. 
6 | 7 | _bt_plan='' 8 | _bt_expected_tests=0 9 | _bt_plan_output=0 10 | _bt_current_test=0 11 | _bt_tap_output='' 12 | _bt_has_output_plan=0 13 | _bt_done_testing=0 14 | _bt_output_capture=0 15 | 16 | # Our test results so far 17 | unset _bt_test_ok 18 | unset _bt_test_actual_ok 19 | unset _bt_test_name 20 | unset _bt_test_type 21 | unset _bt_test_reason 22 | 23 | # Cleanup stuff. 24 | declare -a _bt_on_exit_cmds 25 | trap "_bt_on_exit" EXIT 26 | 27 | # Planning functions. 28 | 29 | function _bt_output_plan() { 30 | local num_tests="$1" 31 | local directive="$2" 32 | local reason="$3" 33 | 34 | if [ "$_bt_has_output_plan" = 1 ]; then 35 | _caller_error "The plan was already output" 36 | fi 37 | 38 | _bt_clear_out 39 | _bt_out "1..$num_tests" 40 | if [ -n "$directive" ]; then 41 | _bt_out " # $directive" 42 | fi 43 | if [ -n "$reason" ]; then 44 | _bt_out " $reason" 45 | fi 46 | _bt_print_out 47 | _bt_has_output_plan=1 48 | } 49 | 50 | function plan() { 51 | local plan="$1" 52 | 53 | case "$plan" in 54 | no_plan) no_plan ;; 55 | skip_all) skip_all "$2" ;; 56 | tests) expected_tests "$2" ;; 57 | *) _bt_die "Unknown or missing plan: '$plan'" ;; 58 | esac 59 | } 60 | 61 | function expected_tests() { 62 | local num="$1" 63 | 64 | if [ -z "$num" ]; then 65 | echo $_bt_expected_tests 66 | else 67 | if [ -n "$_bt_plan" ]; then 68 | _bt_caller_error "Plan is already defined" 69 | fi 70 | # TODO: validate 71 | _bt_plan="$num" 72 | _bt_expected_tests="$num" 73 | _bt_output_plan "$_bt_expected_tests" 74 | fi 75 | } 76 | 77 | function no_plan() { 78 | if [ -n "$_bt_plan" ]; then 79 | _bt_caller_error "Plan is already defined" 80 | fi 81 | _bt_plan="no plan" 82 | } 83 | 84 | function done_testing() { 85 | local num_tests="$1" 86 | 87 | if [ -z "$num_tests" ]; then 88 | num_tests="$_bt_current_test" 89 | fi 90 | 91 | if [ "$_bt_done_testing" = 1 ]; then 92 | _bt_caller_error "done_testing was already called" 93 | fi 94 | 95 | if [ "$_bt_expected_tests" != 0 -a "$num_tests" != 
"$_bt_expected_tests" ]; then 96 | ok 0 "planned to run $_bt_expected_tests but done_testing expects $num_tests" 97 | else 98 | _bt_expected_tests="$num_tests" 99 | fi 100 | 101 | if [ "$_bt_has_output_plan" = 0 ]; then 102 | _bt_plan="done testing" 103 | _bt_output_plan "$num_tests" 104 | fi 105 | } 106 | 107 | function has_plan() { 108 | test -n "$_bt_plan" 109 | } 110 | 111 | function skip_all() { 112 | local reason="${*:?}" 113 | 114 | _bt_output_plan 0 SKIP "$reason" 115 | } 116 | 117 | # Test functions. 118 | 119 | function ok() { 120 | local result="$1" 121 | local name="$2" 122 | 123 | _bt_current_test=$((_bt_current_test + 1)) 124 | 125 | # TODO: validate $name 126 | if [ -z "$name" ]; then 127 | name='unnamed test' 128 | fi 129 | name="${name//#/\\#}" 130 | 131 | _bt_clear_out 132 | if [ "$result" = 0 ]; then 133 | _bt_out "not ok" 134 | if [ -n "$TODO" ]; then 135 | _bt_test_ok[$_bt_current_test]=1 136 | else 137 | _bt_test_ok[$_bt_current_test]=0 138 | fi 139 | _bt_test_actual_ok[$_bt_current_test]=0 140 | else 141 | _bt_out "ok" 142 | _bt_test_ok[$_bt_current_test]=1 143 | _bt_test_actual_ok[$_bt_current_test]="$result" 144 | fi 145 | 146 | _bt_out " $_bt_current_test - $name" 147 | _bt_test_name[$_bt_current_test]="$name" 148 | 149 | if [ -n "$TODO" ]; then 150 | _bt_out " # TODO $TODO" 151 | _bt_test_reason[$_bt_current_test]="$TODO" 152 | _bt_test_type[$_bt_current_test]="todo" 153 | else 154 | _bt_test_reason[$_bt_current_test]='' 155 | _bt_test_type[$_bt_current_test]='' 156 | fi 157 | 158 | _bt_print_out 159 | } 160 | 161 | function _is_diag() { 162 | local result="$1" 163 | local expected="$2" 164 | 165 | diag " got: '$result'" 166 | diag " expected: '$expected'" 167 | } 168 | 169 | function is() { 170 | local result="$1" 171 | local expected="$2" 172 | local name="$3" 173 | 174 | if [ "$result" = "$expected" ]; then 175 | ok 1 "$name" 176 | else 177 | ok 0 "$name" 178 | _is_diag "$result" "$expected" 179 | fi 180 | } 181 | 182 | function 
_isnt_diag() { 183 | local result="$1" 184 | local expected="$2" 185 | 186 | diag " got: '$result'" 187 | diag " expected: anything else" 188 | } 189 | 190 | function isnt() { 191 | local result="$1" 192 | local expected="$2" 193 | local name="$3" 194 | 195 | if [ "$result" != "$expected" ]; then 196 | ok 1 "$name" 197 | else 198 | ok 0 "$name" 199 | _isnt_diag "$result" "$expected" 200 | fi 201 | } 202 | 203 | function like() { 204 | local result="$1" 205 | local pattern="$2" 206 | local name="$3" 207 | 208 | # NOTE: leave $pattern unquoted, see http://stackoverflow.com/a/218217/870000 209 | if [[ "$result" =~ $pattern ]]; then 210 | ok 1 "$name" 211 | else 212 | ok 0 "$name" 213 | diag " got: '$result'" 214 | diag " expected: match for '$pattern'" 215 | fi 216 | } 217 | 218 | function unlike() { 219 | local result="$1" 220 | local pattern="$2" 221 | local name="$3" 222 | 223 | # NOTE: leave $pattern unquoted, see http://stackoverflow.com/a/218217/870000 224 | if [[ ! "$result" =~ $pattern ]]; then 225 | ok 1 "$name" 226 | else 227 | ok 0 "$name" 228 | diag " got: '$result'" 229 | diag " expected: no match for '$pattern'" 230 | fi 231 | } 232 | 233 | function cmp_ok() { 234 | echo TODO 235 | } 236 | 237 | # Other helper functions 238 | 239 | function BAIL_OUT() { 240 | echo TODO 241 | } 242 | 243 | function skip() { 244 | echo TODO 245 | } 246 | 247 | function todo_skip() { 248 | echo TODO 249 | } 250 | 251 | function todo_start() { 252 | echo TODO 253 | } 254 | 255 | function todo_end() { 256 | echo TODO 257 | } 258 | 259 | # Output 260 | 261 | function diag() { 262 | local message="$1" 263 | 264 | if [ -n "$message" ]; then 265 | _bt_escaped_echo "# $message" 266 | fi 267 | } 268 | 269 | # Util functions for output capture within current shell 270 | 271 | function start_output_capture() { 272 | if [ $_bt_output_capture = 1 ]; then 273 | finish_output_capture 274 | _bt_caller_error "Can't start output capture while already active" 275 | fi 276 | local 
stdout_tmpfile="/tmp/bash-itunes-test-out.$$" 277 | local stderr_tmpfile="/tmp/bash-itunes-test-err.$$" 278 | _bt_add_on_exit_cmd "rm -f '$stdout_tmpfile' '$stderr_tmpfile'" 279 | _bt_output_capture=1 280 | exec 3>&1 >$stdout_tmpfile 4>&2 2>$stderr_tmpfile 281 | } 282 | 283 | function finish_output_capture() { 284 | local capture_stdout_varname="$1" 285 | local capture_stderr_varname="$2" 286 | if [ $_bt_output_capture != 1 ]; then 287 | _bt_caller_error "Can't finish output capture when it wasn't started" 288 | fi 289 | exec 1>&3 3>&- 2>&4 4>&- 290 | _bt_output_capture=0 291 | if [ -n "$capture_stdout_varname" ]; then 292 | local stdout_tmpfile="/tmp/bash-itunes-test-out.$$" 293 | eval "$capture_stdout_varname=\$(< $stdout_tmpfile)" 294 | fi 295 | if [ -n "$capture_stderr_varname" ]; then 296 | local stderr_tmpfile="/tmp/bash-itunes-test-err.$$" 297 | eval "$capture_stderr_varname=\$(< $stderr_tmpfile)" 298 | fi 299 | } 300 | 301 | # Internals 302 | 303 | function _bt_stdout() { 304 | echo "$@" 305 | } 306 | 307 | function _bt_stderr() { 308 | echo "$@" >&2 309 | } 310 | 311 | function _bt_die() { 312 | _bt_stderr "$@" 313 | exit 255 314 | } 315 | 316 | # Report an error from the POV of the first calling point outside this file 317 | function _bt_caller_error() { 318 | local message="$*" 319 | 320 | local thisfile="${BASH_SOURCE[0]}" 321 | local file="$thisfile" 322 | local frame_num=2 323 | until [ "$file" != "$thisfile" ]; do 324 | frame=$(caller "$frame_num") 325 | IFS=' ' read line func file <<<"$frame" 326 | done 327 | 328 | _bt_die "Error: $message, on line $line of $file" 329 | } 330 | 331 | # Echo the supplied message with lines after the 332 | # first escaped as TAP comments. 
333 | function _bt_escaped_echo() { 334 | local message="$*" 335 | 336 | local output='' 337 | while IFS= read -r line; do 338 | output="$output\n# $line" 339 | done <<<"$message" 340 | echo -e "${output:4}" 341 | } 342 | 343 | function _bt_clear_out() { 344 | _bt_tap_output="" 345 | } 346 | 347 | function _bt_out() { 348 | _bt_tap_output="$_bt_tap_output$*" 349 | } 350 | 351 | function _bt_print_out() { 352 | _bt_escaped_echo "$_bt_tap_output" 353 | } 354 | 355 | # Cleanup stuff 356 | function _bt_add_on_exit_cmd() { 357 | _bt_on_exit_cmds[${#_bt_on_exit_cmds[*]}]="$*" 358 | } 359 | 360 | function _bt_on_exit() { 361 | if [ $_bt_output_capture = 1 ]; then 362 | finish_output_capture 363 | fi 364 | for exit_cmd in "${_bt_on_exit_cmds[@]}"; do 365 | diag "cleanup: $exit_cmd" 366 | eval "$exit_cmd" 367 | done 368 | # TODO: check that we've output a plan/results 369 | } 370 | -------------------------------------------------------------------------------- /tests/bash-tap/bash-tap-bootstrap: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Bash TAP Bootstrap: 4 | # Copy this file into your project tests dir and source it 5 | # from each test file with: 6 | # . $(dirname $0)/bash-tap-bootstrap 7 | # It takes care of finding bash-tap or outputing a usage message. 8 | # 9 | 10 | bash_tap_bootstrap_version='1.0.2' 11 | 12 | if [ "${BASH_SOURCE[0]}" = "$0" ]; then 13 | # Being run directly, probably by test harness running entire dir. 14 | echo "1..0 # SKIP bash-tap-bootstrap isn't a test file" 15 | exit 0 16 | fi 17 | 18 | if [ -z "$BASH_TAP_ROOT" ]; then 19 | # TODO: search likely locations. 20 | BASH_TAP_ROOT="$(dirname ${BASH_SOURCE[0]})/../../bash-tap" 21 | fi 22 | 23 | if [ -f "$BASH_TAP_ROOT/bash-tap" ]; then 24 | . "$BASH_TAP_ROOT/bash-tap" 25 | else 26 | echo "Bail out! Unable to find bash-tap. 
Install from https://github.com/illusori/bash-tap or set \$BASH_TAP_ROOT if you have it installed somewhere unusual." 27 | exit 255 28 | fi 29 | -------------------------------------------------------------------------------- /tests/bash-tap/bash-tap-mock: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # While not directly TAP-specific, being able to mock stuff 4 | # in tests is pretty useful. 5 | # 6 | # If you're using bash-tap-bootstrap, then just source this 7 | # file in your tests from the bash-tap directory found by 8 | # the bootstrap by including this line after you've sourced 9 | # bash-tap-bootstrap: 10 | # 11 | # . "$BASH_TAP_ROOT/bash-tap-mock" 12 | # 13 | # If you're not using bash-tap-bootstrap then copy this file 14 | # to your test directory and source it with: 15 | # 16 | # . $(dirname $0)/bash-tap-mock 17 | # 18 | # It's important to note that if you're capturing the arguments 19 | # passed to your mock function in a variable, and want that 20 | # variable to be accessible to your tests, you must ensure that 21 | # the mocked function is executed in the current shell and not 22 | # a subshell. In particular, this means you cannot use $() or 23 | # `` to capture output of the function at the same time, as these 24 | # invoke a subshell - the mock will happen, but any variables you 25 | # set within your mock will only exist within the subshell. 26 | # If you wish to capture output at the same time, you need to 27 | # make use of the start_output_capture and finish_output_capture 28 | # helper functons in bash-tap, or manually use file-descriptor 29 | # redirects yourself to achieve the same effect. 30 | 31 | bash_tap_mock_version='1.0.2' 32 | 33 | if [ "${BASH_SOURCE[0]}" = "$0" ]; then 34 | # Being run directly, probably by test harness running entire dir. 
35 | echo "1..0 # SKIP bash-tap-mock isn't a test file" 36 | exit 0 37 | fi 38 | 39 | function mock_function() { 40 | local original_name="$1" 41 | local mock_name="$2" 42 | local save_original_as="_btm_mocked_${original_name}" 43 | 44 | if [ -z $(declare -F "$save_original_as") ]; then 45 | _btm_copy_function "$original_name" "$save_original_as" 46 | fi 47 | _btm_copy_function "$mock_name" "$original_name" 48 | } 49 | 50 | function restore_mocked_function() { 51 | local original_name="$1" 52 | local save_original_as="_btm_mocked_${original_name}" 53 | 54 | if [ ! -z $(declare -F "$save_original_as") ]; then 55 | _btm_copy_function "$save_original_as" "$original_name" 56 | unset -f "$save_original_as" 57 | else 58 | _btm_caller_error "Can't find saved original function '$original_name' to restore" 59 | fi 60 | } 61 | 62 | function mock_command() { 63 | local command_name="$1" 64 | local mock_name="$2" 65 | 66 | if [ ! -z $(declare -F "$command_name") ]; then 67 | # It's not actually a command, it's a function, mock that 68 | mock_function "$command_name" "$mock_name" 69 | else 70 | _btm_copy_function "$mock_name" "$command_name" 71 | fi 72 | } 73 | 74 | function restore_mocked_command() { 75 | local command_name="$1" 76 | 77 | local save_original_as="_btm_mocked_${command_name}" 78 | if [ ! -z $(declare -F "$save_original_as") ]; then 79 | # Was actually a function mock not a command mock. 
80 | restore_mocked_function "$command_name" 81 | else 82 | unset -f "$command_name" >/dev/null 83 | fi 84 | } 85 | 86 | # Copied from http://stackoverflow.com/a/1203628/870000 87 | function _btm_copy_function() { 88 | declare -F $1 >/dev/null || _btm_caller_error "Can't find function '$1' to copy" 89 | eval "$(echo "${2}()"; declare -f ${1} | tail -n +2)" 90 | } 91 | 92 | # Report an error from the POV of the first calling point outside this file 93 | function _btm_caller_error() { 94 | local message="$*" 95 | 96 | local thisfile="${BASH_SOURCE[0]}" 97 | local file="$thisfile" 98 | local frame_num=2 99 | until [ "$file" != "$thisfile" ]; do 100 | frame=$(caller "$frame_num") 101 | IFS=' ' read line func file <<<"$frame" 102 | done 103 | 104 | echo "Error: $message, on line $line of $file" >&2 105 | exit 255 106 | } 107 | -------------------------------------------------------------------------------- /tests/chop/tiny-flat.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S 1 CAAATAAGGCTTGGAAATTTTCTGGAGTTCTA 3 | S 2 TTATATTCCAACTCTCTG 4 | P x 1+,2+ * 5 | L 1 + 2 + * 6 | -------------------------------------------------------------------------------- /tests/chop/tiny-rev.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S 1 CAAATAAGGCTTGGAAATTTTCTGGAGTTCTA 3 | S 2 TTATATTCCAACTCTCTG 4 | P x 2-,1- * 5 | L 1 + 2 + * 6 | -------------------------------------------------------------------------------- /tests/small/small.maf: -------------------------------------------------------------------------------- 1 | ##maf version=1 2 | 3 | # SNP 4 | a score=0 mafExtractor_splicedBlock=true splice_id=1_0 5 | s human.1 0 3 + 10 GCA 6 | s chimp.2 0 3 + 8 GCA 7 | s cat.3 0 3 + 7 GTA 8 | 9 | # Indel and strand change 10 | a score=0 mafExtractor_splicedBlock=true splice_id=1_0 11 | s human.1 3 7 + 10 GCAGAAT 12 | s chimp.2 3 5 + 8 GCAG--T 13 | s cat.3 0 4 - 7 --A-AAT 14 | 15 
| 16 | 17 | -------------------------------------------------------------------------------- /tests/small/small2.maf: -------------------------------------------------------------------------------- 1 | ##maf version=1 2 | 3 | # SNP 4 | a score=0 mafExtractor_splicedBlock=true splice_id=1_0 5 | s human.1 0 3 + 10 GCA 6 | s chimp.3 0 3 + 8 GCA 7 | 8 | # Indel and strand change 9 | a score=0 mafExtractor_splicedBlock=true splice_id=1_0 10 | s human.1 3 7 + 10 GCAGAAT 11 | s chimp.3 3 5 + 8 GCAG--T 12 | s cow.3 0 4 - 7 --A-AAT 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /tests/small/truth.json: -------------------------------------------------------------------------------- 1 | { 2 | "edge": [ 3 | { 4 | "from": "1", 5 | "to": "9" 6 | }, 7 | { 8 | "from": "1", 9 | "to": "2" 10 | }, 11 | { 12 | "from": "2", 13 | "to": "3" 14 | }, 15 | { 16 | "from": "3", 17 | "to": "8" 18 | }, 19 | { 20 | "from": "3", 21 | "to": "4" 22 | }, 23 | { 24 | "from": "3", 25 | "from_start": true, 26 | "to": "9", 27 | "to_end": true 28 | }, 29 | { 30 | "from": "4", 31 | "to": "5", 32 | "to_end": true 33 | }, 34 | { 35 | "from": "5", 36 | "from_start": true, 37 | "to": "7", 38 | "to_end": true 39 | }, 40 | { 41 | "from": "5", 42 | "from_start": true, 43 | "to": "6" 44 | }, 45 | { 46 | "from": "6", 47 | "to": "8", 48 | "to_end": true 49 | }, 50 | { 51 | "from": "6", 52 | "to": "7", 53 | "to_end": true 54 | }, 55 | { 56 | "from": "7", 57 | "from_start": true, 58 | "to": "8", 59 | "to_end": true 60 | } 61 | ], 62 | "node": [ 63 | { 64 | "id": "1", 65 | "sequence": "G" 66 | }, 67 | { 68 | "id": "2", 69 | "sequence": "C" 70 | }, 71 | { 72 | "id": "3", 73 | "sequence": "A" 74 | }, 75 | { 76 | "id": "4", 77 | "sequence": "GC" 78 | }, 79 | { 80 | "id": "5", 81 | "sequence": "T" 82 | }, 83 | { 84 | "id": "6", 85 | "sequence": "G" 86 | }, 87 | { 88 | "id": "7", 89 | "sequence": "TT" 90 | }, 91 | { 92 | "id": "8", 93 | "sequence": "A" 94 | }, 95 | { 
96 | "id": "9", 97 | "sequence": "T" 98 | } 99 | ], 100 | "path": [ 101 | { 102 | "mapping": [ 103 | { 104 | "edit": [ 105 | { 106 | "from_length": 1, 107 | "to_length": 1 108 | } 109 | ], 110 | "position": { 111 | "node_id": "1" 112 | }, 113 | "rank": "1" 114 | }, 115 | { 116 | "edit": [ 117 | { 118 | "from_length": 1, 119 | "to_length": 1 120 | } 121 | ], 122 | "position": { 123 | "node_id": "9" 124 | }, 125 | "rank": "2" 126 | }, 127 | { 128 | "edit": [ 129 | { 130 | "from_length": 1, 131 | "to_length": 1 132 | } 133 | ], 134 | "position": { 135 | "node_id": "3" 136 | }, 137 | "rank": "3" 138 | }, 139 | { 140 | "edit": [ 141 | { 142 | "from_length": 1, 143 | "to_length": 1 144 | } 145 | ], 146 | "position": { 147 | "node_id": "8" 148 | }, 149 | "rank": "4" 150 | }, 151 | { 152 | "edit": [ 153 | { 154 | "from_length": 2, 155 | "to_length": 2 156 | } 157 | ], 158 | "position": { 159 | "node_id": "7" 160 | }, 161 | "rank": "5" 162 | }, 163 | { 164 | "edit": [ 165 | { 166 | "from_length": 1, 167 | "to_length": 1 168 | } 169 | ], 170 | "position": { 171 | "node_id": "5" 172 | }, 173 | "rank": "6" 174 | } 175 | ], 176 | "name": "cat#0#3" 177 | }, 178 | { 179 | "mapping": [ 180 | { 181 | "edit": [ 182 | { 183 | "from_length": 1, 184 | "to_length": 1 185 | } 186 | ], 187 | "position": { 188 | "node_id": "1" 189 | }, 190 | "rank": "1" 191 | }, 192 | { 193 | "edit": [ 194 | { 195 | "from_length": 1, 196 | "to_length": 1 197 | } 198 | ], 199 | "position": { 200 | "node_id": "2" 201 | }, 202 | "rank": "2" 203 | }, 204 | { 205 | "edit": [ 206 | { 207 | "from_length": 1, 208 | "to_length": 1 209 | } 210 | ], 211 | "position": { 212 | "node_id": "3" 213 | }, 214 | "rank": "3" 215 | }, 216 | { 217 | "edit": [ 218 | { 219 | "from_length": 2, 220 | "to_length": 2 221 | } 222 | ], 223 | "position": { 224 | "node_id": "4" 225 | }, 226 | "rank": "4" 227 | }, 228 | { 229 | "edit": [ 230 | { 231 | "from_length": 1, 232 | "to_length": 1 233 | } 234 | ], 235 | "position": { 236 | 
"is_reverse": true, 237 | "node_id": "5" 238 | }, 239 | "rank": "5" 240 | }, 241 | { 242 | "edit": [ 243 | { 244 | "from_length": 1, 245 | "to_length": 1 246 | } 247 | ], 248 | "position": { 249 | "node_id": "6" 250 | }, 251 | "rank": "6" 252 | }, 253 | { 254 | "edit": [ 255 | { 256 | "from_length": 1, 257 | "to_length": 1 258 | } 259 | ], 260 | "position": { 261 | "is_reverse": true, 262 | "node_id": "8" 263 | }, 264 | "rank": "7" 265 | } 266 | ], 267 | "name": "chimp#0#2" 268 | }, 269 | { 270 | "mapping": [ 271 | { 272 | "edit": [ 273 | { 274 | "from_length": 1, 275 | "to_length": 1 276 | } 277 | ], 278 | "position": { 279 | "node_id": "1" 280 | }, 281 | "rank": "1" 282 | }, 283 | { 284 | "edit": [ 285 | { 286 | "from_length": 1, 287 | "to_length": 1 288 | } 289 | ], 290 | "position": { 291 | "node_id": "2" 292 | }, 293 | "rank": "2" 294 | }, 295 | { 296 | "edit": [ 297 | { 298 | "from_length": 1, 299 | "to_length": 1 300 | } 301 | ], 302 | "position": { 303 | "node_id": "3" 304 | }, 305 | "rank": "3" 306 | }, 307 | { 308 | "edit": [ 309 | { 310 | "from_length": 2, 311 | "to_length": 2 312 | } 313 | ], 314 | "position": { 315 | "node_id": "4" 316 | }, 317 | "rank": "4" 318 | }, 319 | { 320 | "edit": [ 321 | { 322 | "from_length": 1, 323 | "to_length": 1 324 | } 325 | ], 326 | "position": { 327 | "is_reverse": true, 328 | "node_id": "5" 329 | }, 330 | "rank": "5" 331 | }, 332 | { 333 | "edit": [ 334 | { 335 | "from_length": 1, 336 | "to_length": 1 337 | } 338 | ], 339 | "position": { 340 | "node_id": "6" 341 | }, 342 | "rank": "6" 343 | }, 344 | { 345 | "edit": [ 346 | { 347 | "from_length": 2, 348 | "to_length": 2 349 | } 350 | ], 351 | "position": { 352 | "is_reverse": true, 353 | "node_id": "7" 354 | }, 355 | "rank": "7" 356 | }, 357 | { 358 | "edit": [ 359 | { 360 | "from_length": 1, 361 | "to_length": 1 362 | } 363 | ], 364 | "position": { 365 | "is_reverse": true, 366 | "node_id": "8" 367 | }, 368 | "rank": "8" 369 | } 370 | ], 371 | "name": "human#0#1" 372 
| } 373 | ] 374 | } 375 | -------------------------------------------------------------------------------- /tests/t/chop.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | BASH_TAP_ROOT=./bash-tap 4 | . ${BASH_TAP_ROOT}/bash-tap-bootstrap 5 | 6 | PATH=../bin:$PATH 7 | PATH=../deps/hal:$PATH 8 | 9 | plan tests 18 10 | 11 | vg convert -g chop/tiny-flat.gfa -p > tiny-flat.vg 12 | printf "x\t0\t100\n" > all.bed 13 | clip-vg tiny-flat.vg -b all.bed | vg view - | grep -v ^H > chopped-all.gfa 14 | is "$(cat chopped-all.gfa | wc -l)" 0 "chopping everything clears out the graph" 15 | 16 | rm -f all.bed chopped-all.gfa 17 | 18 | printf "y\t0\t100\n" > none.bed 19 | clip-vg tiny-flat.vg -b none.bed | vg view - | grep -v ^H > chopped-none.gfa 20 | vg view tiny-flat.vg | grep -v ^H > orig.gfa 21 | diff chopped-none.gfa orig.gfa 22 | is "$?" 0 "chopping nothing doesn't change graph" 23 | 24 | rm -f none.bed chopped-none.gfa orig.gfa 25 | 26 | printf "x\t0\t1\n" > ends.bed 27 | printf "x\t48\t50\n" >> ends.bed 28 | clip-vg -n tiny-flat.vg -b ends.bed > chopped-ends.vg 29 | is "$(vg paths -Ev chopped-ends.vg)" "x[1-48] 47" "chopping ends gives subpath in the middle with correct length" 30 | is "$(vg stats -l chopped-ends.vg | awk '{print $2}')" "47" "chopping ends leaves correct number of bases" 31 | 32 | rm -f ends.bed chopped-ends.vg 33 | 34 | printf "x\t20\t25\n" > bits.bed 35 | printf "x\t1\t5\n" >> bits.bed 36 | printf "x\t10\t20\n" >> bits.bed 37 | printf "x\t40\t49\n" >> bits.bed 38 | clip-vg -n tiny-flat.vg -b bits.bed > chopped-bits.vg 39 | vg paths -Ev chopped-bits.vg | sed -e 's/\t/./g' > bits.paths 40 | is "$(cat bits.paths | wc -l)" "4" "correct number of paths obtained after merging consectuive intervals" 41 | is "$(grep 'x\[0-1\].1' bits.paths | wc -l)" "1" "first bit found" 42 | is "$(grep 'x\[5-10\].5' bits.paths | wc -l)" "1" "next bit found" 43 | is "$(grep 'x\[25-40\].15' bits.paths | wc -l)" "1" 
"next bit after found" 44 | is "$(grep 'x\[49-50\].1' bits.paths | wc -l)" "1" "last bit found" 45 | 46 | rm -f bits.bed chopped-bits.vg bits.paths 47 | 48 | rm -f tiny-flat.vg 49 | 50 | ########## flip path and repeat ########## 51 | 52 | vg convert -g chop/tiny-rev.gfa -p > tiny-rev.vg 53 | #vg convert -g chop/tiny-rev.gfa -o > tiny-rev.vg 54 | printf "x\t0\t100\n" > all.bed 55 | clip-vg tiny-rev.vg -b all.bed | vg view - | grep -v ^H > chopped-all.gfa 56 | is "$(cat chopped-all.gfa | wc -l)" 0 "chopping everything clears out the graph" 57 | 58 | rm -f all.bed chopped-all.gfa 59 | 60 | printf "x\t0\t1\n" > ends.bed 61 | printf "x\t48\t50\n" >> ends.bed 62 | clip-vg -n tiny-rev.vg -b ends.bed > chopped-ends.vg 63 | is "$(vg paths -Ev chopped-ends.vg)" "x[1-48] 47" "chopping ends gives subpath in the middle with correct length" 64 | is "$(vg stats -l chopped-ends.vg | awk '{print $2}')" "47" "chopping ends leaves correct number of bases" 65 | 66 | rm -f ends.bed chopped-ends.vg 67 | 68 | printf "x\t20\t25\n" > bits.bed 69 | printf "x\t1\t5\n" >> bits.bed 70 | printf "x\t10\t20\n" >> bits.bed 71 | printf "x\t40\t49\n" >> bits.bed 72 | clip-vg -n tiny-rev.vg -b bits.bed > chopped-bits.vg 73 | vg paths -Ev chopped-bits.vg | sed -e 's/\t/./g' > bits.paths 74 | is "$(cat bits.paths | wc -l)" "4" "correct number of paths obtained after merging consectuive intervals" 75 | is "$(grep 'x\[0-1\].1' bits.paths | wc -l)" "1" "first bit found" 76 | is "$(grep 'x\[5-10\].5' bits.paths | wc -l)" "1" "next bit found" 77 | is "$(grep 'x\[25-40\].15' bits.paths | wc -l)" "1" "next bit after found" 78 | is "$(grep 'x\[49-50\].1' bits.paths | wc -l)" "1" "last bit found" 79 | 80 | rm -f bits.bed chopped-bits.vg bits.paths 81 | 82 | rm -f tiny-rev.vg 83 | 84 | # quick test for forwardization 85 | vg convert -g chop/tiny-fr.gfa -p > tiny-fr.vg 86 | vg paths -Fv tiny-fr.vg > tiny-fr.fa 87 | clip-vg tiny-fr.vg -e x -p > tiny-fr-forwardized.vg 88 | vg paths -Fv tiny-fr-forwardized.vg > 
tiny-fr-forwardized.fa 89 | diff tiny-fr.fa tiny-fr-forwardized.fa 90 | is "$?" 0 "fowawrsization does not affect path sequence" 91 | 92 | rm -f tiny-fr.vg tiny-fr.fa tiny-fr-forwardized.vg tiny-fr-forwardized.fa tiny-fr-forwardized.fa 93 | -------------------------------------------------------------------------------- /tests/t/merge.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | BASH_TAP_ROOT=./bash-tap 4 | . ${BASH_TAP_ROOT}/bash-tap-bootstrap 5 | 6 | PATH=../bin:$PATH 7 | PATH=../deps/hal:$PATH 8 | 9 | plan tests 10 10 | 11 | maf2hal small/small.maf small.hal 12 | maf2hal small/small2.maf small2.hal 13 | halMergeChroms small.hal,small2.hal merged1.hal 14 | halValidate merged1.hal 15 | is $? 0 "halMergeChroms produces valid hal" 16 | hal2fasta small.hal chimp > chimp.fa 17 | hal2fasta small2.hal chimp >> chimp.fa 18 | hal2fasta merged1.hal chimp > chimp.merge.fa 19 | diff chimp.fa chimp.merge.fa 20 | is $? 0 "halMergeChroms preserves chimp sequence" 21 | hal2fasta small.hal cat > cat.fa 22 | hal2fasta merged1.hal cat > cat.merge.fa 23 | diff cat.fa cat.merge.fa 24 | is $? 0 "halMergeChroms preserves cat sequence" 25 | hal2vg small.hal | vg mod -O - | vg ids -s - > small.vg 26 | hal2vg small2.hal | vg mod -O - | vg ids -s - > small2.vg 27 | hal2vg merged1.hal | vg mod -O - | vg ids -s - > merged1.vg 28 | vg view small.vg | sort > small.gfa 29 | vg view small2.vg | sort > small2.gfa 30 | vg find -x merged1.vg -p cat#0#3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' | sed -e "s/human chimp cat/chimp human cat/g" > merged1.comp1.gfa 31 | vg find -x merged1.vg -p cow#0#3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' | sed -e "s/human cow chimp/chimp human cow/g" > merged1.comp2.gfa 32 | diff small.gfa merged1.comp1.gfa 33 | is $? 
0 "First component of merged graph identical to first input graph" 34 | diff small2.gfa merged1.comp2.gfa 35 | is $? 0 "Second component of merged graph identical to second input graph" 36 | 37 | rm -f small.hal small2.halsmall.vg small2.vg small.gfa small2.gfa 38 | rm -f merged1.hal merged1.vg merged1.comp1.gfa merged1.comp2.gfa 39 | rm -f chimp.fa chimp.merge.fa 40 | rm -f cat.fa cat.merge.fa 41 | 42 | ### copy paste above but change order ### 43 | 44 | maf2hal small/small.maf small.hal 45 | maf2hal small/small2.maf small2.hal 46 | halMergeChroms small2.hal,small.hal merged1.hal 47 | halValidate merged1.hal 48 | is $? 0 "halMergeChroms produces valid hal" 49 | hal2fasta small2.hal chimp > chimp.fa 50 | hal2fasta small.hal chimp >> chimp.fa 51 | hal2fasta merged1.hal chimp > chimp.merge.fa 52 | diff chimp.fa chimp.merge.fa 53 | is $? 0 "halMergeChroms preserves chimp sequence" 54 | hal2fasta small.hal cat > cat.fa 55 | hal2fasta merged1.hal cat > cat.merge.fa 56 | diff cat.fa cat.merge.fa 57 | is $? 0 "halMergeChroms preserves cat sequence" 58 | hal2vg small.hal | vg mod -O - | vg ids -s - > small.vg 59 | hal2vg small2.hal | vg mod -O - | vg ids -s - > small2.vg 60 | hal2vg merged1.hal | vg mod -O - | vg ids -s - > merged1.vg 61 | vg view small.vg | sort > small.gfa 62 | vg view small2.vg | sort > small2.gfa 63 | vg find -x merged1.vg -p cat#0#3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' | sed -e "s/human chimp cat/chimp human cat/g" > merged1.comp1.gfa 64 | vg find -x merged1.vg -p cow#0#3:1 -c 1000 | vg ids -s - | vg view - | sort | sed -e 's/_0//g' | sed -e 's/_1//g' | sed -e "s/human cow chimp/chimp human cow/g" > merged1.comp2.gfa 65 | diff small.gfa merged1.comp1.gfa 66 | is $? 0 "First component of merged graph identical to first input graph" 67 | diff small2.gfa merged1.comp2.gfa 68 | is $? 
0 "Second component of merged graph identical to second input graph" 69 | 70 | rm -f small.hal small2.halsmall.vg small2.vg small.gfa small2.gfa 71 | rm -f merged1.hal merged1.vg merged1.comp1.gfa merged1.comp2.gfa 72 | rm -f chimp.fa chimp.merge.fa 73 | rm -f cat.fa cat.merge.fa 74 | -------------------------------------------------------------------------------- /tests/t/small.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | BASH_TAP_ROOT=./bash-tap 4 | . ${BASH_TAP_ROOT}/bash-tap-bootstrap 5 | 6 | PATH=../bin:$PATH 7 | PATH=../deps/hal:$PATH 8 | 9 | plan tests 2 10 | 11 | maf2hal small/small.maf small.hal 12 | hal2vg small.hal > small.vg 13 | vg view -j small.vg | jq . > small.json 14 | 15 | is $(vg validate small.vg | wc -l) 0 "output vg validates" 16 | 17 | # jq craziness from https://stackoverflow.com/questions/31930041/using-jq-or-alternative-command-line-tools-to-compare-json-files 18 | is $(jq --argfile a small.json --argfile b small/truth.json -n 'def post_recurse(f): def r: (f | select(. != null) | r), .; r; def post_recurse: post_recurse(.[]?); ($a | (post_recurse | arrays) |= sort) as $a | ($b | (post_recurse | arrays) |= sort) as $b | $a == $b') true "output graph identical to manually verified truth graph" 19 | 20 | rm -f small.vg small.json 21 | 22 | rm -f small.hal 23 | --------------------------------------------------------------------------------