├── .github └── workflows │ ├── README.md │ ├── create_release.yml │ ├── val.bat │ ├── val.sh │ └── validate.yml ├── .gitignore ├── .gitmodules ├── LICENSE ├── Makefile ├── README.md ├── images └── graph.png ├── include ├── gfastats-global.h ├── input.h ├── main.h └── validate.h ├── instructions └── README.md ├── scaffolding └── README.md ├── scripts ├── gfastats_stats.sh ├── plot_runtime.R └── submit_gfastats_stats.sh ├── src ├── generate-random-fasta.cpp ├── generate-tests.cpp ├── input.cpp ├── main.cpp └── validate.cpp ├── testFiles ├── random1.agp ├── random1.comment.sak ├── random1.fasta ├── random1.fasta.1.bed ├── random1.fasta.2.bed ├── random1.fasta.3.bed ├── random1.fasta.4.bed ├── random1.fasta.5.bed ├── random1.fasta.gz ├── random1.fastq ├── random1.fastq.gz ├── random1.gfa2 ├── random1.gfa2.instructions.sak ├── random1.hc.sak ├── random1.hdc.sak ├── random1.instructions.sak ├── random1.mask.sak ├── random1.rename.sak ├── random1.rvcp.sak ├── random2.gfa ├── random2.gfa.gz ├── random2.gfa2 ├── random2.gfa2.agp ├── random2.gfa2.gz ├── random2.noseq.gfa ├── random3.sorting.fasta ├── random4.fasta ├── random5.findovl.gfa └── random6.circular.gfa └── validateFiles ├── README.md ├── random1.fasta.10.tst ├── random1.fasta.11.tst ├── random1.fasta.3.tst ├── random1.fasta.4.tst ├── random1.fasta.5.tst ├── random1.fasta.6.tst ├── random1.fasta.7.tst ├── random1.fasta.78.tst ├── random1.fasta.79.tst ├── random1.fasta.8.tst ├── random1.fasta.80.tst ├── random1.fasta.81.tst ├── random1.fasta.82.tst ├── random1.fasta.83.tst ├── random1.fasta.84.tst ├── random1.fasta.85.tst ├── random1.fasta.86.tst ├── random1.fasta.87.tst ├── random1.fasta.88.tst ├── random1.fasta.89.tst ├── random1.fasta.9.tst ├── random1.fasta.90.tst ├── random1.fasta.91.tst ├── random1.fasta.92.tst ├── random1.fasta.93.tst ├── random1.fasta.94.tst ├── random1.fasta.95.tst ├── random1.fasta.96.tst ├── random1.fasta.gz.100.tst ├── random1.fasta.gz.69.tst ├── random1.fasta.gz.70.tst ├── 
random1.fasta.gz.71.tst ├── random1.fasta.gz.72.tst ├── random1.fasta.gz.73.tst ├── random1.fasta.gz.74.tst ├── random1.fasta.gz.75.tst ├── random1.fasta.gz.76.tst ├── random1.fasta.gz.77.tst ├── random1.fasta.gz.97.tst ├── random1.fasta.gz.98.tst ├── random1.fasta.gz.99.tst ├── random1.fastq.101.tst ├── random1.fastq.102.tst ├── random1.fastq.103.tst ├── random1.fastq.104.tst ├── random1.fastq.54.tst ├── random1.fastq.55.tst ├── random1.fastq.56.tst ├── random1.fastq.57.tst ├── random1.fastq.58.tst ├── random1.fastq.59.tst ├── random1.fastq.60.tst ├── random1.fastq.61.tst ├── random1.fastq.62.tst ├── random1.fastq.gz.105.tst ├── random1.fastq.gz.106.tst ├── random1.fastq.gz.107.tst ├── random1.fastq.gz.108.tst ├── random1.fastq.gz.33.tst ├── random1.fastq.gz.34.tst ├── random1.fastq.gz.35.tst ├── random1.fastq.gz.36.tst ├── random1.fastq.gz.37.tst ├── random1.fastq.gz.38.tst ├── random1.fastq.gz.39.tst ├── random1.fastq.gz.40.tst ├── random1.fastq.gz.41.tst ├── random1.gfa2.109.tst ├── random1.gfa2.110.tst ├── random1.gfa2.111.tst ├── random1.gfa2.112.tst ├── random1.gfa2.113.tst ├── random1.gfa2.12.tst ├── random1.gfa2.13.tst ├── random1.gfa2.14.tst ├── random2.gfa.42.tst ├── random2.gfa.43.tst ├── random2.gfa.44.tst ├── random2.gfa.gz.45.tst ├── random2.gfa.gz.46.tst ├── random2.gfa.gz.47.tst ├── random2.gfa2.63.tst ├── random2.gfa2.64.tst ├── random2.gfa2.65.tst ├── random2.gfa2.gz.51.tst ├── random2.gfa2.gz.52.tst ├── random2.gfa2.gz.53.tst ├── random2.noseq.gfa.114.tst ├── random2.noseq.gfa.48.tst ├── random2.noseq.gfa.49.tst ├── random2.noseq.gfa.50.tst ├── random3.sorting.fasta.24.tst ├── random3.sorting.fasta.25.tst ├── random3.sorting.fasta.26.tst ├── random3.sorting.fasta.27.tst ├── random3.sorting.fasta.28.tst ├── random3.sorting.fasta.29.tst ├── random3.sorting.fasta.30.tst ├── random3.sorting.fasta.31.tst ├── random3.sorting.fasta.32.tst ├── random4.fasta.115.tst ├── random4.fasta.15.tst ├── random4.fasta.16.tst ├── random4.fasta.17.tst ├── 
random4.fasta.18.tst ├── random4.fasta.19.tst ├── random4.fasta.20.tst ├── random4.fasta.21.tst ├── random4.fasta.22.tst ├── random4.fasta.23.tst ├── random5.findovl.gfa.116.tst ├── random5.findovl.gfa.66.tst ├── random5.findovl.gfa.67.tst ├── random5.findovl.gfa.68.tst ├── random6.circular.gfa.0.tst ├── random6.circular.gfa.1.tst ├── random6.circular.gfa.117.tst └── random6.circular.gfa.2.tst /.github/workflows/README.md: -------------------------------------------------------------------------------- 1 | validate is automatically run on pushes to any branch, or pull requests to main 2 | 3 | to automatically create a new release and automatically upload mac, ubuntu, and windows builds run: 4 | `git tag v*` 5 | `git push origin v*` 6 | where * is the version number. 7 | 8 | Example: 9 | `git tag v1.2.1` 10 | `git push origin v1.2.1` 11 | 12 | -------------------------------------------------------------------------------- /.github/workflows/create_release.yml: -------------------------------------------------------------------------------- 1 | name: Create Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' # Trigger on version tags 7 | 8 | jobs: 9 | create_release: 10 | name: Create Release 11 | permissions: write-all 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout code 15 | uses: actions/checkout@v2 16 | with: 17 | submodules: recursive 18 | - name: Create Release 19 | id: create_release 20 | uses: actions/create-release@v1 21 | env: 22 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 23 | with: 24 | tag_name: ${{ github.ref }} 25 | release_name: gfastats ${{ github.ref }} 26 | body: | 27 | Changes in this Release 28 | draft: false 29 | prerelease: false 30 | outputs: 31 | upload_url: ${{ steps.create_release.outputs.upload_url }} 32 | 33 | add_resources: 34 | needs: create_release 35 | name: Add Resources 36 | strategy: 37 | matrix: 38 | OS: [macos-13, ubuntu-latest, windows-2019] 39 | include: 40 | - OS: macos-13 41 | OS_NAME: macOS 42 | - OS: 
ubuntu-latest 43 | OS_NAME: linux 44 | - OS: windows-2019 # must equal an entry of the OS list above; a non-matching value spawns an extra job instead of setting OS_NAME 45 | OS_NAME: win 46 | runs-on: ${{ matrix.OS }} 47 | steps: 48 | - name: Checkout code 49 | uses: actions/checkout@v2 50 | with: 51 | submodules: recursive 52 | - name: Build 53 | run: make -j 54 | 55 | - name: Make binary executable (Linux & macOS) 56 | if: matrix.OS_NAME != 'win' 57 | run: chmod +x build/bin/gfastats 58 | 59 | - name: Zip (Windows) 60 | if: matrix.OS_NAME == 'win' 61 | uses: papeloto/action-zip@v1 62 | with: 63 | files: build/bin/gfastats.exe 64 | dest: result.zip 65 | 66 | - name: Tar (Linux & macOS) 67 | if: matrix.OS_NAME != 'win' 68 | run: tar -czvf result.tar.gz -C build/bin gfastats 69 | 70 | - name: Add binaries & Upload tarball (Linux & macOS) 71 | if: matrix.OS_NAME != 'win' 72 | uses: actions/upload-release-asset@v1 73 | env: 74 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 75 | with: 76 | upload_url: ${{ needs.create_release.outputs.upload_url }} 77 | asset_path: result.tar.gz 78 | asset_name: gfastats.${{ github.ref_name }}-${{matrix.OS_NAME}}.tar.gz 79 | asset_content_type: application/tar 80 | 81 | - name: Add binaries & Upload tarball (Windows) 82 | if: matrix.OS_NAME == 'win' 83 | uses: actions/upload-release-asset@v1 84 | env: 85 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 86 | with: 87 | upload_url: ${{ needs.create_release.outputs.upload_url }} 88 | asset_path: result.zip 89 | asset_name: gfastats.${{ github.ref_name }}-${{matrix.OS_NAME}}.zip 90 | asset_content_type: application/zip 91 | 92 | add_submodules: 93 | needs: create_release 94 | name: Source with submodules 95 | runs-on: ubuntu-latest 96 | steps: 97 | - name: Checkout code 98 | uses: actions/checkout@v2 99 | with: 100 | submodules: true 101 | - name: Zip 102 | uses: papeloto/action-zip@v1 103 | with: 104 | files: . 
105 | dest: result.zip 106 | - name: Add files 107 | uses: actions/upload-release-asset@v1 108 | env: 109 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 110 | with: 111 | upload_url: ${{ needs.create_release.outputs.upload_url }} 112 | asset_path: result.zip 113 | asset_name: gfastats.${{ github.ref_name }}-with_submodules.zip 114 | asset_content_type: application/zip 115 | -------------------------------------------------------------------------------- /.github/workflows/val.bat: -------------------------------------------------------------------------------- 1 | "build/bin/gfastats-validate.exe" validateFiles 2 | -------------------------------------------------------------------------------- /.github/workflows/val.sh: -------------------------------------------------------------------------------- 1 | build/bin/gfastats-validate validateFiles 2 | -------------------------------------------------------------------------------- /.github/workflows/validate.yml: -------------------------------------------------------------------------------- 1 | name: Validate 2 | 3 | on: 4 | push: 5 | pull_request: 6 | branches: [ main ] 7 | 8 | jobs: 9 | validate: 10 | name: Validate 11 | strategy: 12 | matrix: 13 | include: 14 | - os: macos-latest 15 | command: ".github/workflows/val.sh" 16 | - os: ubuntu-latest 17 | command: ".github/workflows/val.sh" 18 | chmod: true 19 | - os: windows-latest 20 | command: ".github/workflows/val.bat" 21 | fail-fast: false 22 | runs-on: ${{ matrix.os }} 23 | steps: 24 | - name: Checkout code 25 | uses: actions/checkout@v3 26 | with: 27 | submodules: true 28 | - name: Build 29 | run: make all -j 30 | - name: Validate 31 | run: | 32 | chmod +x .github/workflows/val.sh 33 | ${{ matrix.command }} 34 | 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/swift,xcode 2 | 3 | ### Swift 
### 4 | # Xcode 5 | # 6 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore 7 | 8 | ## Build generated 9 | build/ 10 | DerivedData/ 11 | 12 | ## Various settings 13 | *.pbxuser 14 | !default.pbxuser 15 | *.mode1v3 16 | !default.mode1v3 17 | *.mode2v3 18 | !default.mode2v3 19 | *.perspectivev3 20 | !default.perspectivev3 21 | xcuserdata/ 22 | 23 | ## Other 24 | *.moved-aside 25 | *.xccheckout 26 | *.xcscmblueprint 27 | 28 | ## Obj-C/Swift specific 29 | *.hmap 30 | *.ipa 31 | *.dSYM.zip 32 | *.dSYM 33 | 34 | ## Playgrounds 35 | timeline.xctimeline 36 | playground.xcworkspace 37 | 38 | # Swift Package Manager 39 | # 40 | # Add this line if you want to avoid checking in source code from Swift Package Manager dependencies. 41 | # Packages/ 42 | # Package.pins 43 | .build/ 44 | 45 | # CocoaPods - Refactored to standalone file 46 | 47 | # Carthage - Refactored to standalone file 48 | 49 | # fastlane 50 | # 51 | # It is recommended to not store the screenshots in the git repo. Instead, use fastlane to re-generate the 52 | # screenshots whenever they are needed. 
53 | # For more information about the recommended setup visit: 54 | # https://docs.fastlane.tools/best-practices/source-control/#source-control 55 | 56 | fastlane/report.xml 57 | fastlane/Preview.html 58 | fastlane/screenshots 59 | fastlane/test_output 60 | 61 | ### Xcode ### 62 | # Xcode 63 | # 64 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore 65 | 66 | ## Build generated 67 | 68 | ## Various settings 69 | 70 | ## Other 71 | 72 | ### Xcode Patch ### 73 | *.xcodeproj 74 | *.xcodeproj/* 75 | !*.xcodeproj/project.pbxproj 76 | *.xcodeproj/xcshareddata/ 77 | !*.xcodeproj/xcuserdata/ 78 | !*.xcworkspace/contents.xcworkspacedata 79 | /*.gcno 80 | 81 | .DS_Store 82 | *.pbxproj 83 | *.xcworkspacedata 84 | *.plist 85 | 86 | 87 | # End of https://www.gitignore.io/api/swift,xcode,vscode 88 | 89 | tmp.txt 90 | err.txt 91 | out 92 | 93 | *.o 94 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "gfalibs"] 2 | path = gfalibs 3 | url = https://github.com/vgl-hub/gfalibs.git 4 | ignore = untracked 5 | ignore = dirty 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Giulio Formenti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 
included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXX ?= g++ 2 | INCLUDE_DIR ?= -I./include -Igfalibs/include 3 | WARNINGS = -Wall -Wextra 4 | 5 | CXXFLAGS = -g -std=gnu++14 -O3 $(INCLUDE_DIR) $(WARNINGS) $(CFLAGS) 6 | 7 | TARGET = gfastats 8 | TEST_TARGET = validate 9 | GENERATE_TARGET = generate-tests 10 | RANDOM_FASTA_TARGET = generate-random-fasta 11 | BUILD = build/bin 12 | SOURCE = src 13 | INCLUDE = include 14 | BINDIR := $(BUILD)/.o 15 | 16 | LIBS = -lz 17 | LDFLAGS= -pthread 18 | 19 | #gfalibs 20 | GFALIBS_DIR := $(CURDIR)/gfalibs 21 | 22 | OBJS := main input 23 | BINS := $(addprefix $(BINDIR)/, $(OBJS)) 24 | 25 | head: $(BINS) gfalibs | $(BUILD) 26 | $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $(BUILD)/$(TARGET) $(wildcard $(BINDIR)/*) $(GFALIBS_DIR)/*.o $(LIBS) 27 | 28 | debug: CXXFLAGS += -DDEBUG 29 | debug: CCFLAGS += -DDEBUG 30 | debug: head 31 | 32 | all: head validate regenerate random_fasta 33 | 34 | $(OBJS): %: $(BINDIR)/% 35 | @ 36 | $(BINDIR)%: $(SOURCE)/%.cpp $(INCLUDE)/%.h | $(BINDIR) 37 | $(CXX) $(CXXFLAGS) $(LDFLAGS) -c $(SOURCE)/$(notdir $@).cpp -o $@ 38 | 39 | .PHONY: gfalibs 40 | gfalibs: 41 | $(MAKE) -j -C $(GFALIBS_DIR) CXXFLAGS="$(CXXFLAGS)" 42 | 43 | validate: | $(BUILD) 44 | $(CXX) $(CXXFLAGS) -o $(BUILD)/$(TARGET)-$(TEST_TARGET) 
$(SOURCE)/$(TEST_TARGET).cpp $(LIBS) 45 | 46 | regenerate: | $(BUILD) 47 | $(CXX) $(CXXFLAGS) -o $(BUILD)/$(TARGET)-$(GENERATE_TARGET) $(SOURCE)/$(GENERATE_TARGET).cpp $(LIBS) 48 | 49 | random_fasta: | $(BUILD) 50 | $(CXX) $(CXXFLAGS) -o $(BUILD)/$(TARGET)-$(RANDOM_FASTA_TARGET) $(SOURCE)/$(RANDOM_FASTA_TARGET).cpp $(LIBS) 51 | 52 | $(BUILD): 53 | -mkdir -p $@ 54 | 55 | $(BINDIR): 56 | -mkdir -p $@ 57 | 58 | debug: CXXFLAGS += -DDEBUG -O0 59 | debug: head 60 | 61 | clean: 62 | $(RM) -r build 63 | $(MAKE) -C $(GFALIBS_DIR) clean 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gfastats 2 | 3 | The swiss army knife for genome assembly. 4 | 5 | **gfastats** is a single fast and exhaustive tool for **summary statistics** and simultaneous \*fa\* (fasta, fastq, gfa [.gz]) genome assembly file **manipulation**. **gfastats** also allows seamless fasta<>fastq<>gfa[.gz] conversion. It has been tested in genomes even >100Gbp. 
6 | 7 | Typical fast\* metrics include: 8 | 9 | - scaffold, contig and gap size 10 | - number of scaffolds, contigs and gaps 11 | - total length of scaffolds, contigs and gaps 12 | - scaffold, contig, gap N50 and statistics (full N\*/NG\* statistics with the `--nstar-report` flag) 13 | - area under the curve (AuN/AuNG) values for scaffolds, contigs and gaps 14 | - average scaffold, contig, gap size 15 | - largest scaffold, contig and gap 16 | - base composition and GC content 17 | - soft-masked base counts (lower case bases) 18 | 19 | Typical gfa metrics include (see also note below when evaluating gfa): 20 | 21 | - Number of nodes and edges 22 | - Average degree 23 | - Number of connected components, and length of the largest connected component 24 | - Number of dead ends 25 | - Number of disconnected components, and their total length 26 | - Number of bubbles 27 | 28 | Metrics for each scaffold/contig can be generated with the `--seq-report` flag. 29 | 30 | `Bed` coordinates and sizes of scaffolds, contigs and gaps can be outputted with the options `--out-coord` and `--out-size`. By default, `--out-coord` produces a full representation of the assembly in `agp` format. 31 | 32 | Additionally, input can be filtered using scaffold lists or `bed` coordinate files with the options `--include-bed` and `--exclude-bed`. 33 | 34 | Importantly, the filtered input can be outputted in any \*fa\* (fasta, fastq, gfa [.gz]) format. 35 | 36 | ## Installation 37 | 38 | Either download one of the releases or `git clone https://github.com/vgl-hub/gfastats.git --recursive` and `make -j` in the `gfastats` folder. 39 | 40 | ## Usage 41 | 42 | `gfastats input.[fasta|fastq|gfa][.gz] [expected genome size] [header[:start-end]]` 43 | 44 | To check out all options and flags use `gfastats -h`. 45 | 46 | **Note**: if you have a GFA without paths defined (e.g. as output from hifiasm) you will need to add the `--discover-paths` option in order to generate statistics for contigs and scaffolds. 
This is an attempt to clearly distinguish contigs from segments. 47 | 48 | You can test some typical usage with the files in the `testFiles` folder, e.g.: 49 | 50 | ``` 51 | gfastats testFiles/random1.fasta -o gfa // converts fasta to gfa 52 | gfastats testFiles/random2.gfa2.gfa -o fa // converts gfa to fasta 53 | ``` 54 | 55 | ## Assembly manipulation 56 | 57 | **gfastats** allows extensive assembly manipulation at the sequence level. Manipulation is achieved using a set of _instructions_ provided as an ordered list in a file to the option `-k` / `--swiss-army-knife`: 58 | 59 | ``` 60 | gfastats testFiles/random1.fasta -k testFiles/random1.instructions.sak -o gfa // reads fasta, applies a set of instructions and outputs gfa 61 | ``` 62 | 63 | The _instructions_ are sequentially processed to generate the final output. Examples of _instructions_ are: 64 | 65 | ``` 66 | JOIN contig1+ contig2+ 50 [gap1] [scaffold1] [this is a new scaffold] // introduces a new gap of 50 bp between scaffold1 and scaffold2 with optional id gap1, effectively joining the two sequences into a new sequence named scaffold1 with an optional comment 67 | SPLIT contig1+ contig2+ // splits the scaffold containing contig1 and contig2, effectively removing the existing gap between them 68 | ``` 69 | 70 | The _instructions_ directly provide the list of edits that were introduced. The _instructions_ could be from an automated tool or from manual annotation. 71 | 72 | A prime example of manipulations using input from an automated tool is overlaying AGP coordinates on top of the graph to generate new scaffolds, which can be achieved with: 73 | ``` 74 | gfastats input.fasta|input.gfa -a input.agp -o output.fasta|output.gfa 75 | ``` 76 | 77 | See the instruction wiki for a full list of _instructions_. 78 | 79 | ## Description 80 | 81 | Please refer to the **gfastats** paper for a complete description. Briefly, **gfastats** reads and stores any fasta<>fastq<>gfa[.gz] in gfa format. 
**gfastats** then builds a bidirected graph representation of the assembly using adjacency lists, where each node is a segment, and each edge is a gap (see figure below). The original sequence can be directly manipulated from the graph. Finally, walking the graph allows the generation of different kinds of outputs, including manipulated assemblies and feature coordinates. 82 | 83 |

84 | ![gfastats assembly graph](images/graph.png) 85 |

86 | 87 | ## How to cite 88 | 89 | If you use **gfastats** in your work, please cite: 90 | 91 | Gfastats: conversion, evaluation and manipulation of genome sequences using assembly graphs 92 | 93 | Giulio Formenti, Linelle Abueg, Angelo Brajuka, Nadolina Brajuka, Cristo Gallardo, Alice Giani, Olivier Fedrigo, Erich D. Jarvis 94 | 95 | doi: https://doi.org/10.1093/bioinformatics/btac460 96 | -------------------------------------------------------------------------------- /images/graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vgl-hub/gfastats/cedd755227fe0d7b6daf3fee932e325af54f3b21/images/graph.png -------------------------------------------------------------------------------- /include/gfastats-global.h: -------------------------------------------------------------------------------- 1 | #ifndef GLOBAL_H 2 | #define GLOBAL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "log.h" 11 | #include "threadpool.h" 12 | 13 | extern UserInput userInput; 14 | 15 | #endif /* GLOBAL_H */ 16 | -------------------------------------------------------------------------------- /include/input.h: -------------------------------------------------------------------------------- 1 | #ifndef INPUT_H 2 | #define INPUT_H 3 | 4 | struct UserInputGfastats : UserInput { 5 | 6 | std::vector outFiles; // output files 7 | int segmentReport_flag = 0; 8 | int pathReport_flag = 0; 9 | int outSequence_flag = 0; 10 | int nstarReport_flag = 0; 11 | int outSize_flag = 0; 12 | int outCoord_flag = 0; 13 | int outFile_flag = 0; 14 | int outBubbles_flag = 0; 15 | int cmd_flag = 0; 16 | int rmGaps_flag = 0; 17 | int extractContigs_flag = 0; 18 | int terminalOvlLen = 0; 19 | 20 | }; 21 | 22 | class Input { 23 | 24 | UserInputGfastats userInput; 25 | // stream read variable definition 26 | std::string firstLine; 27 | unsigned int seqPos = 0; // to keep track of the original sequence order 28 
| 29 | std::string newLine, seqHeader, seqComment, line, bedHeader; 30 | 31 | std::shared_ptr stream; 32 | 33 | std::vector instructions; 34 | 35 | public: 36 | 37 | void load(UserInputGfastats userInput); 38 | 39 | void read(InSequences& inSequence); 40 | 41 | }; 42 | 43 | #endif /* INPUT_H */ 44 | -------------------------------------------------------------------------------- /include/main.h: -------------------------------------------------------------------------------- 1 | #ifndef MAIN_H 2 | #define MAIN_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | #include //required for zstream 15 | #include 16 | #include 17 | #include 18 | #include //required for zstream 19 | #include //required for zstream 20 | #include // for graph manipulation 21 | #include // toupper() 22 | #include 23 | 24 | #include 25 | #include 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | #include "log.h" 32 | 33 | #include "uid-generator.h" 34 | 35 | #include "bed.h" 36 | 37 | #include "global.h" // global variables 38 | #include "struct.h" 39 | #include "functions.h" // global functions 40 | 41 | #include "threadpool.h" 42 | 43 | #include 44 | 45 | #include "zlib.h" 46 | #include 47 | #include 48 | #include 49 | 50 | #include "gfa-lines.h" 51 | 52 | #include "gfa.h" // gfa classes 53 | #include "sak.h" // swiss army knife classes 54 | 55 | #include "stream-obj.h" 56 | 57 | #include "output.h" // output classes 58 | #include "input.h" 59 | 60 | #endif /* MAIN_H */ 61 | -------------------------------------------------------------------------------- /include/validate.h: -------------------------------------------------------------------------------- 1 | #ifndef GFASTATS_VALIDATE_H 2 | #define GFASTATS_VALIDATE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | std::string getExePath(const std::string &argv0) { 14 | std::string exePath = 
argv0.substr(0, argv0.find_last_of("/\\")+1); 15 | std::replace(exePath.begin(), exePath.end(), '\\', '/'); 16 | #ifdef _WIN32 17 | exePath += "gfastats.exe"; 18 | #else 19 | exePath += "gfastats"; 20 | #endif 21 | return exePath; 22 | } 23 | 24 | std::string rmFileExt(const std::string path) { // utility to strip file extension from file 25 | if (path == "." || path == "..") 26 | return path; 27 | 28 | size_t pos = path.find_last_of("\\/."); 29 | if (pos != std::string::npos && path[pos] == '.') 30 | return path.substr(0, pos); 31 | 32 | return path; 33 | } 34 | 35 | std::string getFileExt(std::string fileName) // utility to get file extension 36 | { 37 | if(fileName.find_last_of(".") != std::string::npos) { 38 | 39 | if(fileName.substr(fileName.find_last_of(".")+1) == "gz") { 40 | 41 | fileName = rmFileExt(fileName); 42 | 43 | return getFileExt(fileName) + ".gz"; 44 | 45 | } 46 | 47 | return fileName.substr(fileName.find_last_of(".")+1); 48 | } 49 | return ""; 50 | } 51 | 52 | std::vector list_dir(const char *path) { 53 | std::vector list; 54 | struct dirent *entry; 55 | DIR *dir = opendir(path); 56 | 57 | if (dir == NULL) { 58 | std::cerr << "error: unable to access " << path << std::endl; 59 | exit(0); 60 | } 61 | while ((entry = readdir(dir)) != NULL) { 62 | DIR *f = opendir((std::string(path)+"/"+entry->d_name).c_str()); 63 | if(f == NULL) /*not a directory*/ list.push_back(std::string(entry->d_name)); 64 | else closedir(f); 65 | } 66 | closedir(dir); 67 | return list; 68 | } 69 | 70 | void get_recursive(const std::string &path, std::set &paths) { 71 | if(getFileExt(path) == "tst") { 72 | paths.insert(path); 73 | } else { 74 | DIR *dir = opendir(path.c_str()); 75 | if(dir != NULL) { 76 | for(const auto &file : list_dir(path.c_str())) { 77 | get_recursive((path+"/"+file).c_str(), paths); 78 | } 79 | closedir(dir); 80 | } 81 | } 82 | } 83 | 84 | int i = 0; 85 | 86 | void genTest(std::string exePath, const std::string &file, const std::string &args){ 87 | 
std::string tstFile = "validateFiles/"+file+"."+std::to_string(i)+".tst"; 88 | std::cout << "generating: " << tstFile << std::endl; 89 | std::ofstream ostream; 90 | ostream.open(tstFile); 91 | ostream << "testFiles/" << file << " " << args << "\nembedded" << std::endl; 92 | ostream.close(); 93 | #ifdef _WIN32 94 | std::string cmd = "\"\""+exePath+"\" testFiles/"+file+" "+args+" >> "+tstFile+"\""; 95 | #else 96 | std::string cmd = "\""+exePath+"\" testFiles/"+file+" "+args+" >> "+tstFile; 97 | #endif 98 | int exit = system(cmd.c_str()); 99 | if (exit == EXIT_SUCCESS) { 100 | ostream << cmd << std::endl; 101 | ostream << "Command executed."; 102 | } 103 | ++i; 104 | }; 105 | 106 | #endif // #ifndef GFASTATS_VALIDATE_H 107 | -------------------------------------------------------------------------------- /instructions/README.md: -------------------------------------------------------------------------------- 1 | # Instruction wiki 2 | 3 | Instructions are sequentially executed and each instruction is described by tab-separated columns. 4 | 5 | ## JOIN 6 | 7 | The JOIN instruction introduces a new gap of 50 bp between `scaffold1` and `scaffold2` (two paths) with id `gap1`, effectively joining the two sequences into a new sequence with id `new_scaffold` and an optional comment. 8 | 9 | ``` 10 | JOIN scaffold1+ scaffold2+ 50 gap1 new_scaffold 11 | JOIN scaffold1(1:100)+ scaffold2(1:100)+ 50 gap1 new_scaffold // optional subsetting 12 | ``` 13 | 14 | ## SPLIT 15 | 16 | The SPLIT instruction splits the scaffold containing `segment1` and `segment2`, effectively removing the existing gap between them. Two optional comments can be provided. 
17 | 18 | ``` 19 | SPLIT segment1+ segment2+ scaffold1 scaffold2 [this is a new scaffold1] [this is a new scaffold2] 20 | ``` 21 | 22 | ## EXCISE 23 | 24 | The EXCISE instruction removes segment1 from its scaffold, leaving it unplaced and adding a gap of 50bp with id `gap1` between the original sequences 25 | 26 | ``` 27 | EXCISE segment1 50 gap1 28 | ``` 29 | 30 | ## REMOVE 31 | 32 | The REMOVE instruction removes the paths involving the specified segment. 33 | 34 | ``` 35 | REMOVE segment1 36 | ``` 37 | 38 | ## EXCLUDE 39 | 40 | The EXCLUDE instruction removes the specified path and all its components. 41 | 42 | ``` 43 | EXCLUDE path1 44 | ``` 45 | 46 | ## ERASE 47 | 48 | The ERASE instruction trims off the sequence range specified from the given segment. 49 | 50 | ``` 51 | ERASE segment1:10-100 // deletes segment1 sequence between the coordinates provided (in bed format) 52 | ``` 53 | 54 | ## RVCP 55 | 56 | The RVCP instruction reverse-complements path1 or segment1 sequence in place 57 | 58 | ``` 59 | RVCP path1/segment1 60 | ``` 61 | 62 | ## INVERT 63 | 64 | The INVERT instruction inverts segment1 sequence in place 65 | 66 | ``` 67 | INVERT segment1 68 | ``` 69 | 70 | ## RESIZE 71 | 72 | The RESIZE instruction resizes the size of gap1 to 50 bp 73 | 74 | ``` 75 | RESIZE gap1 50 76 | ``` 77 | 78 | ## MASK 79 | 80 | The MASK instruction masks with 50 Ns a portion of a path, effectively adding a gap in the corresponding segment of optional size 5. If size is not provided, the masked size is used 81 | 82 | ``` 83 | MASK path1 10 60 [5] 84 | ``` 85 | 86 | ## CLEAVE 87 | 88 | The CLEAVE instruction breaks the specified segment at the given position generating segment2 and segment3, optionally connected by an edge 89 | 90 | ``` 91 | CLEAVE segment1 50 segment2 segment3 [edge1] 92 | ``` 93 | 94 | ## RENAME 95 | 96 | The RENAME instruction renames a path. It can be used to rename FASTA headers. 
97 | 98 | ``` 99 | RENAME path1 new_path 100 | ``` 101 | 102 | ## COMMENT 103 | 104 | The COMMENT instruction adds or replaces an existing comment associated with a specific path/header. 105 | 106 | ``` 107 | COMMENT path1 comment 108 | ``` 109 | 110 | ## Yet to be implemented 111 | 112 | ``` 113 | ADD contig3 contig1+ 50 contig2+ 50 ACGT // introduces a new contig named contig3 with sequence ACGT between contig1 and contig2 leaving 50bp gaps on each side 114 | REPLACE contig1:20-24 ACGT // replaces the sequence at coordinates contig1:20-24 with ACGT 115 | ``` 116 | -------------------------------------------------------------------------------- /scaffolding/README.md: -------------------------------------------------------------------------------- 1 | ### example data: bTaeGut2 Hifiasm (HiC) assembly 2 | right click -> download link 3 | - [hap1 contigs as GFA](https://genomeark.s3.amazonaws.com/species/Taeniopygia_guttata/bTaeGut2/assembly_vgp_hic_2.0/intermediates/hifiasm/bTaeGut2.trim.HiC.hic.hap1.p_ctg.gfa) 4 | - [hap1 s1 AGP](https://genomeark.s3.amazonaws.com/species/Taeniopygia_guttata/bTaeGut2/assembly_vgp_hic_2.0/intermediates/bionano_hap1/agp_fasta/bTaeGut2_Saphyr_DLE1_3172351_bppAdjust_cmap_bTaeGut2_trim_HiC_hic_hap1_p_ctg_fasta_NGScontigs_HYBRID_SCAFFOLD.agp) 5 | - [hap1 s2 AGP](https://genomeark.s3.amazonaws.com/species/Taeniopygia_guttata/bTaeGut2/assembly_vgp_hic_2.0/intermediates/salsa_hap1/bTaeGut2_hap1_s1.gfastats.rename_salsa/scaffolds_FINAL.original-coordinates.agp) 6 | - [hap1 s2 final fasta (to check your results)](https://genomeark.s3.amazonaws.com/species/Taeniopygia_guttata/bTaeGut2/assembly_vgp_hic_2.0/bTaeGut2.hic.hap1.s2.fasta) 7 | 8 | The starting files from the hifiasm-HiC workflow are the hap1 & hap2 GFAs: 9 | 10 | `bTaeGut2.hap1.gfa` and `bTaeGut2.hap2.gfa` 11 | 12 | Convert GFA -> FASTA, then run Bionano to obtain s1 AGPs. `bTaeGut2.hap1.fasta` into Bionano produces `bTaeGut2.hap1.s1.agp`, and the same for hap2. 
13 | 14 | NOTE: IF Bionano is cutting, then fix the subseq lines. Bionano is not cutting in Galaxy, so do not need to run `sed` command on Galaxy assemblies. 15 | ````bash 16 | # THIS IS NOT NEEDED FOR GALAXY ASSEMBLIES 17 | cat bTaeGut2_hap1_s1.agp | sed 's/W\t\(.*\)_subseq_\([0-9]*\):\([0-9]*\)\t[0-9]*\t[0-9]*\t\(.\)/W\t\1\t\2\t\3\t\4/g' | sed 's/subseq_\([0-9]*\):\([0-9]*\)/subseq_\1_\2/g' > bTaeGut2_hap1_s1.edit.agp 18 | ```` 19 | 20 | ##### UPDATE: MAY 3, 2022 21 | Newer versions of gfastats append `_path` to path names, so the Bionano AGP must be processed accordingly. **This needs to happen even if Bionano is not cutting -- i.e. this needs to happen for Galaxy assemblies!** 22 | 23 | an example of fixing the Bionano AGP to recognize `_path` in contig names: 24 | ````bash 25 | awk '{OFS = "\t"}{if ($0 ~ /^#/) print $0 }{if ($6 ~ /h1*/) print $1,$2,$3,$4,$5,$6"_path",$7,$8,$9; if ($6 ~ /^[0-9]/) print $0}' bTaeGut2.hap1.s1.edit.agp > bTaeGut2.hap1.s1.edit.path.agp 26 | ```` 27 | 28 | Overlap s1 AGP onto c1/p1 GFA. `--discover` is so gfastats finds the paths in the GFA 29 | ````bash 30 | gfastats bTaeGut2.trim.HiC.hic.hap1.p_ctg.gfa --discover -o bTaeGut2.hap1.discover.gfa 31 | gfastats bTaeGut2.hap1.discover.gfa --discover -a bTaeGut2.hap1.s1.edit.path.agp -o bTaeGut2.hap1.s1.gfa 32 | ```` 33 | 34 | Convert s1 GFA -> s1 FASTA, run salsa to obtain s2 AGP. 
35 | ````bash 36 | gfastats bTaeGut2.hap1.s1.gfa -o bTaeGut2.hap1.s1.gfastats.fasta 37 | ```` 38 | NOTE: IF Bionano is cutting, then subseq lines have colons in the names, so you need to remove those before running SALSA. 39 | ````bash 40 | ## Removing colons from bionano scaff names, because salsa doesn't like it 41 | # THIS IS NOT NEEDED FOR GALAXY ASSEMBLIES 42 | sed 's/:/_/g' bTaeGut2.hap1.s1.gfastats.fasta > bTaeGut2.hap1.s1.gfastats.nocolon.fasta 43 | ```` 44 | 45 | `bTaeGut2.hap1.s1.gfastats.fasta` into SALSA produces `bTaeGut2.hap1.s2.agp` 46 | 47 | Overlap s2 AGP onto s1 GFA to create s2 GFA (replace `path/to/salsa_output` with your SALSA output directory): 48 | ````bash 49 | cp path/to/salsa_output/scaffolds_FINAL.original-coordinates.agp ./bTaeGut2.hap1.s2.originalcoords.agp 50 | gfastats bTaeGut2.hap1.s1.gfa -a bTaeGut2.hap1.s2.originalcoords.agp -o bTaeGut2.hap1.s2.gfa 51 | ```` 52 | If you want to convert this s2 GFA to s2 FASTA: 53 | ````bash 54 | gfastats bTaeGut2.hap1.s2.gfa -o bTaeGut2.hap1.s2.gfastats.fasta 55 | ```` 56 | -------------------------------------------------------------------------------- /scripts/gfastats_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | 4 | pathlist=$1 5 | LINE=$(sed -n "$SLURM_ARRAY_TASK_ID"p $pathlist) 6 | 7 | echo "SCRIPT START FOR $LINE ----------------------------" 8 | printf "Path to genomeark file: $LINE \n\n" 9 | 10 | IFS='/' read -r -a array <<< $LINE 11 | fastaname=${array[4]} ##make more robust - basename 12 | echo "Local filename: $LINE" 13 | 14 | printf "Path to genomeark file: $LINE \n\n" 15 | 16 | statsname=$LINE.gfastats 17 | 18 | aws s3 cp s3://genomeark/${LINE} ./${fastaname} 19 | 20 | ziptime=$(wc -c $fastaname) 21 | echo "Time on compressed $fastaname" 22 | TIMEFORMAT=%R 23 | ziptime=$(time (gfastats $fastaname > ${fastaname}_temp_out.txt) 2>&1) 24 | 25 | ziplength=$(grep "Total scaffold length" ${fastaname}_temp_out.txt | grep -Eo "[0-9]+") 26 | echo "Compressed length: $ziplength" 27 | printf "$LINE\t $ziplength\t 
$ziptime\t gzip\n" >> gfastats_stats_out.txt 28 | 29 | uncomp=$(echo $fastaname | sed 's/.gz//g') 30 | echo "Decompressing $fastaname \n\n" 31 | gunzip $fastaname 32 | 33 | echo "Time on uncompressed fasta: $uncomp" 34 | unziptime=$(time (gfastats $uncomp > ${fastaname}_temp_out_2.txt) 2>&1) 35 | 36 | unziplength=$(grep "Total scaffold length" ${fastaname}_temp_out_2.txt | grep -Eo "[0-9]+") 37 | echo "Decompressed length: $unziplength" 38 | 39 | printf "$LINE\t $unziplength\t $unziptime\t plain text\n" >> gfastats_stats_out.txt 40 | 41 | rm $uncomp 42 | rm ${fastaname}_temp_out.txt 43 | rm ${fastaname}_temp_out_2.txt 44 | 45 | echo "SCRIPT COMPLETE FOR $LINE --------------------" 46 | 47 | -------------------------------------------------------------------------------- /scripts/plot_runtime.R: -------------------------------------------------------------------------------- 1 | setwd(dirname(rstudioapi::getSourceEditorContext()$path)) 2 | 3 | library(ggplot2) 4 | 5 | df<-read.csv("data.txt", header = TRUE, sep = "\t") 6 | 7 | png(file="Fig 1c.png", 8 | width=2000, height=1000) 9 | 10 | ggplot(df, aes(x=size, y=time, group=format)) + 11 | geom_point(aes(color=format), size = 3)+ 12 | scale_color_grey() + theme_classic() + 13 | geom_smooth(aes(color=format)) + 14 | theme( 15 | text = element_text(size = 60), 16 | legend.title = element_blank(), 17 | legend.key.size = unit(3,"cm"), 18 | axis.title.x = element_text(margin = margin(t = 20, r = 0, b = 0, l = 0)) 19 | ) + 20 | xlab("Genome size (Gbp)") + ylab("Time (s)") + 21 | guides(color=guide_legend(override.aes=list(fill=NA))) 22 | 23 | dev.off() 24 | -------------------------------------------------------------------------------- /scripts/submit_gfastats_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | pathlist=$1 4 | linecount=$(wc -l $pathlist | awk '{print $1}') 5 | echo $linecount 6 | 7 | log=logs/slurm_%A.log 8 | sbatch -p hpc,vgl,vgl_bigmem -c 1 
--error=$log --output=$log --array=1-$linecount gfastats_stats.sh $pathlist 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/generate-random-fasta.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | typedef unsigned long long ull; 7 | 8 | // random from 0 to unsigned long long max 9 | ull rndull() { 10 | #if RAND_MAX == 2147483647 // can get 32 bits of randomness from std::rand() 11 | return (((ull)std::rand()) < 32) | (((ull)std::rand())); 12 | #else // only guarunteed 16 bits of randomness from std::rand() 13 | return (((ull)std::rand()) < 48) | (((ull)std::rand()) < 32) | (((ull)std::rand()) < 16) | (((ull)std::rand())); 14 | #endif 15 | } 16 | 17 | ull rnd(ull min, ull max) { 18 | return rndull()%(max-min)+min; 19 | } 20 | 21 | char rndACGT() { 22 | static const char acgt[] = {'A', 'C', 'G', 'T'}; 23 | return acgt[rand()%4]; 24 | } 25 | 26 | int main(int argc, char **argv) { 27 | if(argc != 10) { 28 | std::cout << "usage: generate-random-fasta " << std::endl; 29 | } 30 | 31 | ull contig_min_size = std::stoull(argv[2]), 32 | contig_max_size = std::stoull(argv[3]), 33 | gap_min_size = std::stoull(argv[4]), 34 | gap_max_size = std::stoull(argv[5]), 35 | min_num_contigs = std::stoull(argv[6]), 36 | max_num_contigs = std::stoull(argv[7]), 37 | min_num_headers = std::stoull(argv[8]), 38 | max_num_headers = std::stoull(argv[9]); 39 | 40 | std::srand(std::time(nullptr)); 41 | 42 | std::ofstream output_file; 43 | output_file.open(argv[1]); 44 | if(!output_file.is_open()) { 45 | std::cerr << "couldn't open the specified file: <" << argv[1] << ">" << std::endl; 46 | return EXIT_FAILURE; 47 | } 48 | ull num_headers = rnd(min_num_headers, max_num_headers); 49 | for(ull h=0; hHeader" << h+1 << std::endl; 51 | 52 | if(std::rand()%2 == 1) { 53 | ull gap_size = rnd(gap_min_size, gap_max_size); 54 | for(ull i=0; i 2 | #include 3 | 
#include 4 | 5 | #include "validate.h" 6 | 7 | int main(int, char **argv) { 8 | std::cout << "WARNING: only run this program if gfastats is in a working state" << std::endl; 9 | std::cout << "WARNING: previous validate files will be deleted" << std::endl; 10 | std::cout << "continue? (Y/N) "; 11 | std::string input; 12 | std::cin >> input; 13 | if(input != "Y" && input != "y") { 14 | std::cout << "validate generation cancelled" << std::endl; 15 | std::exit(0); 16 | } 17 | std::cout << "deleting old validate files..." << std::endl; 18 | 19 | for(auto &file : list_dir("validateFiles")) { 20 | if(getFileExt(file) != "tst") continue; // dont delete README 21 | file = "validateFiles/"+file; 22 | if(remove(file.c_str()) != 0) { 23 | std::cerr << "error deleting <" << file << ">" << std::endl; 24 | return -1; 25 | } 26 | } 27 | 28 | std::cout << "generating new validate files..." << std::endl; 29 | 30 | std::string exePath = getExePath(argv[0]); 31 | 32 | const std::map, std::vector> ext_args = { 33 | {{"fasta", "fasta.gz", "fastq", "fastq.gz"}, {"", "-s s", "-s c", "-s g", "-b a", "-b s", "-b c", "-b g", "--homopolymer-compress 1 -ofa"}}, 34 | {{"gfa", "gfa.gz", "gfa2", "gfa2.gz"}, {"-o gfa2", "-o gfa", "-o fasta"}} 35 | // {{set of test file extensions}, {list of command line args to run with}} 36 | }; 37 | 38 | const std::map, std::vector> file_args = { 39 | {{"random1.fasta", "random1.fasta.gz", "random1.fastq", "random1.fastq.gz", "random1.gfa2"}, {"-a testFiles/random1.agp --stats", "-a testFiles/random1.agp -ofa", "-k testFiles/random1.rvcp.sak", "-k testFiles/random1.mask.sak"}}, 40 | {{"random1.fasta"}, {"-k testFiles/random1.instructions.sak", "-ofa -k testFiles/random1.instructions.sak", "-ofa -k testFiles/random1.hc.sak", "-ofa -k testFiles/random1.hdc.sak", "Header2", "-ofa -e testFiles/random1.fasta.1.bed", "-ofa -e testFiles/random1.fasta.2.bed", "-ofa -e testFiles/random1.fasta.3.bed", "-ofa -e testFiles/random1.fasta.4.bed", "-ofa -e 
testFiles/random1.fasta.5.bed", "-ofa -i testFiles/random1.fasta.1.bed", "-ofa -i testFiles/random1.fasta.2.bed", "-ofa -i testFiles/random1.fasta.3.bed", "-ofa -i testFiles/random1.fasta.4.bed", "-ofa -i testFiles/random1.fasta.5.bed"}}, 41 | {{"random2.noseq.gfa"}, {""}}, 42 | {{"random1.gfa2"}, {"-k testFiles/random1.gfa2.instructions.sak"}}, 43 | {{"random4.fasta"}, {""}}, 44 | {{"random5.findovl.gfa"}, {"--discover-terminal-overlaps 3 -ogfa"}}, 45 | {{"random6.circular.gfa"}, {""}} 46 | 47 | // {{set of test file paths}, {list of command line args to run with}} 48 | }; 49 | 50 | const std::set exclude {"agp", "sak"}; 51 | 52 | for(const std::string &file : list_dir("testFiles")) { 53 | std::string ext = getFileExt(file); 54 | if(exclude.count(ext)) continue; 55 | for(auto pair : ext_args) { 56 | if(!pair.first.count(ext)) continue; 57 | for(auto args : pair.second) { 58 | genTest(exePath, file, args); 59 | } 60 | } 61 | } 62 | 63 | std::fstream fstream; 64 | for(const auto &pair : file_args) { 65 | for(const std::string &file : pair.first) { 66 | fstream.open("testFiles/"+file); 67 | if(!fstream) continue; 68 | fstream.close(); 69 | for(const std::string &args : pair.second) { 70 | genTest(exePath, file, args); 71 | } 72 | } 73 | } 74 | 75 | std::exit(EXIT_SUCCESS); 76 | } 77 | -------------------------------------------------------------------------------- /src/input.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include "log.h" 11 | #include "global.h" 12 | #include "uid-generator.h" 13 | 14 | #include "bed.h" 15 | #include "struct.h" 16 | #include "functions.h" 17 | 18 | #include "gfa-lines.h" 19 | #include "gfa.h" 20 | #include "sak.h" 21 | 22 | #include "stream-obj.h" 23 | 24 | #include "input-agp.h" 25 | #include "input-filters.h" 26 | #include "input-gfa.h" 27 | #include "input.h" 28 | 29 | void 
Input::load(UserInputGfastats userInput) { 30 | 31 | this->userInput = userInput; 32 | 33 | } 34 | 35 | void Input::read(InSequences& inSequences) { 36 | 37 | if (userInput.inSequence.empty()) {return;} 38 | 39 | threadPool.init(maxThreads); // initialize threadpool 40 | 41 | if (!userInput.inSak.empty() || userInput.pipeType == 'k') { 42 | 43 | StreamObj streamObj; 44 | 45 | stream = streamObj.openStream(userInput, 'k'); 46 | 47 | SAK sak; // create a new swiss army knife 48 | 49 | while (getline(*stream, line)) { 50 | 51 | std::istringstream iss(line); 52 | 53 | instructions.push_back(sak.readInstruction(line)); // use the swiss army knife to read the instruction 54 | 55 | } 56 | 57 | lg.verbose("Finished reading SAK instructions"); 58 | 59 | } 60 | 61 | if (!userInput.inBedInclude.empty() || userInput.pipeType == 'i') { 62 | 63 | StreamObj streamObj; 64 | stream = streamObj.openStream(userInput, 'i'); 65 | 66 | while (getline(*stream, line)) { 67 | 68 | uint64_t begin = 0, end = 0; 69 | std::istringstream iss(line); 70 | iss >> bedHeader >> begin >> end; 71 | userInput.bedIncludeList.pushCoordinates(bedHeader, begin, end); 72 | } 73 | lg.verbose("Finished reading BED include list"); 74 | } 75 | 76 | BedCoordinates bedExcludeList; 77 | 78 | if (!userInput.inBedExclude.empty() || userInput.pipeType == 'e') { 79 | 80 | StreamObj streamObj; 81 | stream = streamObj.openStream(userInput, 'e'); 82 | 83 | while (getline(*stream, line)) { 84 | 85 | uint64_t begin = 0, end = 0; 86 | std::istringstream iss(line); 87 | iss >> bedHeader >> begin >> end; 88 | 89 | bedExcludeList.pushCoordinates(bedHeader, begin, end); 90 | } 91 | lg.verbose("Finished reading BED exclude list"); 92 | } 93 | 94 | if (!userInput.inSequence.empty() || userInput.pipeType == 'f') { 95 | 96 | StreamObj streamObj; 97 | 98 | stream = streamObj.openStream(userInput, 'f'); 99 | 100 | if (stream) { 101 | 102 | switch (stream->peek()) { 103 | 104 | case '>': { 105 | 106 | stream->get(); 107 | 108 | while 
(getline(*stream, newLine)) { 109 | 110 | if(userInput.bedIncludeList.size() - bedExcludeList.size() != 0 && userInput.bedIncludeList.size() - bedExcludeList.size() == inSequences.getPathN()) { // we have all the sequences needed 111 | lg.verbose("Found all sequences, stop streaming input"); 112 | break; 113 | } 114 | size_t spacePos = newLine.find(" "); 115 | seqHeader = newLine.substr(0, spacePos); 116 | if (spacePos != std::string::npos) 117 | seqComment = newLine.substr(spacePos + 1); 118 | else 119 | seqComment.clear(); 120 | 121 | std::string* inSequence = new std::string; 122 | getline(*stream, *inSequence, '>'); 123 | lg.verbose("Individual fasta sequence read"); 124 | 125 | Sequence* sequence = includeExcludeSeq(seqHeader, seqComment, inSequence, userInput.bedIncludeList, bedExcludeList); 126 | 127 | if (sequence != NULL) { 128 | sequence->seqPos = seqPos; // remember the order 129 | inSequences.appendSequence(sequence, userInput.hc_cutoff); 130 | seqPos++; 131 | } 132 | } 133 | break; 134 | } 135 | case '@': { 136 | 137 | while (getline(*stream, newLine)) { // file input 138 | 139 | if(userInput.bedIncludeList.size() - bedExcludeList.size() != 0 && userInput.bedIncludeList.size() - bedExcludeList.size() == inSequences.getPathN()) { // we have all the sequences needed 140 | lg.verbose("Found all sequences, stop streaming input"); 141 | break; 142 | 143 | } 144 | newLine.erase(0, 1); 145 | size_t spacePos = newLine.find(" "); 146 | seqHeader = newLine.substr(0, spacePos); 147 | if (spacePos != std::string::npos) 148 | seqComment = newLine.substr(spacePos + 1); 149 | else 150 | seqComment.clear(); 151 | 152 | std::string* inSequence = new std::string; 153 | getline(*stream, *inSequence); 154 | 155 | getline(*stream, newLine); 156 | 157 | std::string* inSequenceQuality = new std::string; 158 | getline(*stream, *inSequenceQuality); 159 | 160 | Sequence* sequence = includeExcludeSeq(seqHeader, seqComment, inSequence, userInput.bedIncludeList, bedExcludeList, 
inSequenceQuality); 161 | 162 | if (sequence != NULL) { 163 | 164 | sequence->seqPos = seqPos; // remember the order 165 | 166 | inSequences.appendSequence(sequence, userInput.hc_cutoff); 167 | 168 | seqPos++; 169 | 170 | } 171 | 172 | } 173 | 174 | break; 175 | 176 | } 177 | default: { 178 | 179 | readGFA(inSequences, userInput, stream, &bedExcludeList); 180 | 181 | } 182 | 183 | } 184 | 185 | lg.verbose("End of file"); 186 | 187 | }else{ 188 | 189 | fprintf(stderr, "Stream not successful: %s", userInput.inSequence.c_str()); 190 | exit(1); 191 | 192 | } 193 | 194 | } 195 | 196 | jobWait(threadPool); 197 | 198 | inSequences.sortSegmentsByOriginal(); 199 | 200 | if (userInput.rmGaps_flag) 201 | inSequences.removeTerminalGaps(); 202 | 203 | if (userInput.extractContigs_flag) { 204 | 205 | inSequences.clearGaps(); 206 | inSequences.clearPaths(); 207 | 208 | } 209 | 210 | if (userInput.extractContigs_flag || userInput.discoverPaths_flag) 211 | inSequences.discoverPaths(); 212 | 213 | if (userInput.terminalOvlLen != 0) 214 | inSequences.discoverTerminalOverlaps(userInput.terminalOvlLen); 215 | 216 | if (!instructions.empty()) { 217 | 218 | lg.verbose("\nStarted instruction execution"); 219 | 220 | SAK sak; // create a new swiss army knife 221 | 222 | for (Instruction instruction : instructions) { // execute swiss army knife instructions 223 | 224 | sak.executeInstruction(inSequences, instruction); 225 | lg.verbose(instruction.action + " instruction executed"); 226 | 227 | } 228 | 229 | } 230 | 231 | if (!userInput.inAgp.empty() || userInput.pipeType == 'a') 232 | readAgp(inSequences, userInput); 233 | 234 | if (userInput.sortType == "ascending") { 235 | inSequences.sortPathsByNameAscending(); 236 | }else if (userInput.sortType == "descending") { 237 | inSequences.sortPathsByNameDescending(); 238 | }else if (userInput.sortType == "largest") { 239 | inSequences.sortPathsBySize(0); 240 | }else if (userInput.sortType == "smallest") { 241 | inSequences.sortPathsBySize(1); 
242 | }else if (userInput.sortType != "none" && ifFileExists(userInput.sortType.c_str())){ 243 | 244 | stream = std::make_unique(std::ifstream(userInput.sortType)); 245 | 246 | std::string header; 247 | std::vector headerList; 248 | 249 | while (getline(*stream, line)) { // read the file to vector 250 | 251 | std::istringstream iss(line); 252 | iss >> header; 253 | 254 | headerList.push_back(header); 255 | 256 | } 257 | 258 | inSequences.sortPathsByList(headerList); 259 | 260 | }else if(userInput.inAgp.empty() && !(userInput.pipeType == 'a')){ 261 | inSequences.sortPathsByOriginal(); 262 | } 263 | 264 | inSequences.updateStats(); 265 | 266 | threadPool.join(); 267 | 268 | } 269 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | // 2 | //fastats.cpp 3 | // 4 | //Created by Giulio Formenti on 12/17/21. 5 | // 6 | 7 | #include "main.h" 8 | 9 | std::string version = "1.3.11"; 10 | 11 | //global 12 | std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); // immediately start the clock when the program is run 13 | 14 | int verbose_flag; 15 | Log lg; 16 | std::vector logs; 17 | int tabular_flag; 18 | 19 | int maxThreads = 0; 20 | std::mutex mtx; 21 | ThreadPool> threadPool; 22 | 23 | UserInputGfastats userInput; // initialize input object 24 | 25 | int main(int argc, char **argv) { 26 | 27 | short int c; // optarg 28 | short unsigned int pos_op = 1; // optional arguments 29 | unsigned long long int gSize = 0; // expected genome size, with 0 NG/LG* statistics are not computed 30 | 31 | char bedOutType = 'a'; // default output type with bed flag (agp) 32 | bool isPipe = false; // to check if input is from pipe 33 | 34 | if (argc == 1) { // gfastats with no arguments 35 | printf("gfastats input.[fasta|fastq|gfa][.gz] [expected genome size] [header[:start-end]]\n-h for additional help.\n"); 36 | 
exit(EXIT_SUCCESS); 37 | } 38 | static struct option long_options[] = { // struct mapping long options 39 | {"input-sequence", required_argument, 0, 'f'}, 40 | 41 | {"threads", required_argument, 0, 'j'}, 42 | 43 | {"agp-to-path", required_argument, 0, 'a'}, // agp to path conversion 44 | {"swiss-army-knife", required_argument, 0, 'k'}, // the swiss army knife 45 | {"remove-terminal-gaps", no_argument, &userInput.rmGaps_flag, 1}, // this remove all gap edges at the end of sequences 46 | {"homopolymer-compress", required_argument, 0, 0}, 47 | {"discover-paths", no_argument, &userInput.discoverPaths_flag, 1}, 48 | {"discover-terminal-overlaps", optional_argument, 0, 0}, 49 | {"sort", required_argument, 0, 0}, 50 | {"extract-contigs", no_argument, &userInput.extractContigs_flag, 1}, 51 | 52 | {"include-bed", required_argument, 0, 'i'}, 53 | {"exclude-bed", required_argument, 0, 'e'}, 54 | 55 | {"out-format", required_argument, 0, 'o'}, 56 | {"line-length", required_argument, 0, 0}, 57 | {"no-sequence", no_argument, &userInput.noSequence, 1}, 58 | {"out-sequence", no_argument, &userInput.outSequence_flag, 1}, 59 | {"out-size", required_argument, 0, 's'}, 60 | {"out-coord", required_argument, 0, 'b'}, 61 | {"out-bubbles", no_argument, &userInput.outBubbles_flag, 1}, 62 | 63 | {"stats", no_argument, &userInput.stats_flag, 1}, 64 | {"segment-report", no_argument, &userInput.segmentReport_flag, 1}, 65 | {"path-report", no_argument, &userInput.pathReport_flag, 1}, 66 | {"nstar-report", no_argument, &userInput.nstarReport_flag, 1}, 67 | {"tabular", no_argument, 0, 't'}, 68 | {"locale", required_argument, 0, 0}, 69 | 70 | {"verbose", no_argument, &verbose_flag, 1}, 71 | {"cmd", no_argument, &userInput.cmd_flag, 1}, 72 | {"version", no_argument, 0, 'v'}, 73 | {"help", no_argument, 0, 'h'}, 74 | 75 | {0, 0, 0, 0} 76 | }; 77 | 78 | while (1) { // loop through argv 79 | 80 | int option_index = 0; 81 | int curind = optind; 82 | c = getopt_long(argc, argv, 
"-:a:b:e:f:i:j:k:o:s:tvh", 83 | long_options, &option_index); 84 | 85 | if (optind < argc && !isPipe) // if pipe wasn't assigned already 86 | isPipe = isDash(argv[optind]) ? true : false; // check if the argument to the option is a '-' and set it as pipe input 87 | 88 | if (optarg != nullptr && !isPipe) // case where pipe input is given as positional argument (input sequence file) 89 | isPipe = isDash(optarg) ? true : false; 90 | 91 | if (c == -1) // exit the loop if run out of options 92 | break; 93 | 94 | switch (c) { 95 | case '?': 96 | if (optopt) 97 | printf("Unrecognized short option (%c).\n", optopt); 98 | else 99 | printf("Unrecognized long option (%s).\n", argv[curind]); 100 | exit(EXIT_FAILURE); 101 | case ':': // handle options without arguments 102 | switch (optopt) { // the command line option last matched 103 | case 'b': 104 | bedOutType = 'a'; // default bed output is agp is -b option is given without argument 105 | userInput.outCoord_flag = 1; 106 | break; 107 | 108 | case 's': 109 | bedOutType = 's'; // default size output is scaffold is -s option is given without argument 110 | userInput.outSize_flag = 1; 111 | break; 112 | 113 | case 'o': 114 | userInput.outFiles.push_back("fasta"); // default output is fasta if -o option is given without argument 115 | userInput.outFile_flag = 1; 116 | break; 117 | 118 | default: 119 | fprintf(stderr, "option -%c is missing a required argument\n", optopt); 120 | return EXIT_FAILURE; 121 | } 122 | break; 123 | default: // handle positional arguments 124 | if (pos_op == 1) { // only one positional argument given 125 | 126 | if (isPipe && userInput.pipeType == 'n') // check whether input is from pipe and that pipe input was not already set 127 | userInput.pipeType = 'f'; // pipe input is a sequence 128 | else{ // input is a regular file 129 | ifFileExists(optarg); 130 | userInput.inSequence = optarg; 131 | } 132 | pos_op++; 133 | 134 | }else if (pos_op == 2 || pos_op == 3) { // if >1 positional argument, check what 
additional positional arguments are present 135 | 136 | if (isInt(optarg)) { // if the positional argument is a number, it is likely the expected genome size 137 | 138 | gSize = atoll(optarg); pos_op++; 139 | 140 | }else{ // else it is an include argument 141 | 142 | std::tuple coordinate = parseCoordinate(std::string(optarg)); 143 | userInput.bedIncludeList.pushCoordinates(std::get<0>(coordinate), std::get<1>(coordinate), std::get<2>(coordinate)); 144 | pos_op++; 145 | 146 | } 147 | }else{ 148 | printf("Error: too many positional arguments (%s).\n",optarg); 149 | exit(EXIT_FAILURE); 150 | } 151 | break; 152 | case 0: // case for long options without short options 153 | 154 | if (strcmp(long_options[option_index].name,"discover-terminal-overlaps") == 0) { 155 | 156 | if (optarg == NULL && optind < argc && argv[optind][0] != '-') 157 | optarg = argv[optind++]; 158 | 159 | if (optarg != NULL) 160 | userInput.terminalOvlLen = atoi(optarg); 161 | else 162 | userInput.terminalOvlLen = 1000; 163 | } 164 | if (strcmp(long_options[option_index].name,"line-length") == 0) 165 | userInput.splitLength = atoi(optarg); 166 | 167 | if (strcmp(long_options[option_index].name,"sort") == 0) { 168 | 169 | std::vector options {"none", "ascending", "descending", "largest", "smallest"}; 170 | 171 | if (std::find(options.begin(), options.end(), optarg) != options.end() || ifFileExists(optarg)) 172 | userInput.sortType = optarg; 173 | else{ 174 | printf("Error: unrecognized sorting option (%s).\n", optarg); 175 | exit(1); 176 | } 177 | } 178 | if(strcmp(long_options[option_index].name,"homopolymer-compress") == 0) { 179 | userInput.hc_cutoff = atoi(optarg); 180 | userInput.stats_flag = 1; 181 | } 182 | if (strcmp(long_options[option_index].name,"locale") == 0) { 183 | 184 | setlocale(LC_ALL, optarg); 185 | std::cout.imbue(std::locale(optarg)); 186 | std::locale::global(std::locale(optarg)); 187 | userInput.stats_flag = 1; 188 | } 189 | break; 190 | case 'a': // agp to paths 191 | 192 | if 
(isPipe && userInput.pipeType == 'n') // check whether input is from pipe and that pipe input was not already set 193 | userInput.pipeType = 'a'; // pipe input is agp 194 | else{ // input is a regular file 195 | ifFileExists(optarg); 196 | userInput.inAgp = optarg; 197 | } 198 | userInput.stats_flag = 1; 199 | break; 200 | case 'b': // output bed type (agp, contig, gaps) 201 | bedOutType = *optarg; 202 | userInput.outCoord_flag = 1; 203 | break; 204 | case 'e': // bed exclude 205 | 206 | if (isPipe && userInput.pipeType == 'n') // check whether input is from pipe and that pipe input was not already set 207 | userInput.pipeType = 'e'; // pipe input is an exclude bed 208 | else{ // input is a regular file 209 | ifFileExists(optarg); 210 | userInput.inBedExclude = optarg; 211 | } 212 | userInput.stats_flag = 1; 213 | break; 214 | case 'f': // input sequence 215 | 216 | if (isPipe && userInput.pipeType == 'n') // check whether input is from pipe and that pipe input was not already set 217 | userInput.pipeType = 'f'; // pipe input is a sequence 218 | else{ // input is a regular file 219 | 220 | ifFileExists(optarg); 221 | userInput.inSequence = optarg; 222 | userInput.stats_flag = 1; 223 | } 224 | break; 225 | case 'i': // bed include 226 | 227 | if (isPipe && userInput.pipeType == 'n') // check whether input is from pipe and that pipe input was not already set 228 | userInput.pipeType = 'i'; // pipe input is an include bed 229 | else{ // input is a regular file 230 | ifFileExists(optarg); 231 | userInput.inBedInclude = optarg; 232 | } 233 | userInput.stats_flag = 1; 234 | break; 235 | case 'j': // max threads 236 | maxThreads = atoi(optarg); 237 | userInput.stats_flag = 1; 238 | break; 239 | case 'k': // the swiss army knife 240 | 241 | if (isPipe && userInput.pipeType == 'n') // check whether input is from pipe and that pipe input was not already set 242 | userInput.pipeType = 'k'; // pipe input is a set of instructions for the swiss army knife 243 | else{ // input is 
a regular file 244 | ifFileExists(optarg); 245 | userInput.inSak = optarg; 246 | } 247 | userInput.stats_flag = 1; 248 | break; 249 | case 'o': // handle output (file or stdout) 250 | 251 | userInput.outFile_flag = 1; 252 | 253 | if (isPipe && userInput.pipeType == 'n') // check whether input is from pipe and that pipe input was not already set 254 | userInput.pipeType = 'r'; // pipe input is a sequence 255 | else{ // outputs are regular files 256 | 257 | optind--; 258 | 259 | std::string file; 260 | uint8_t i = 0; 261 | 262 | for( ;optind < argc && !isInt(argv[optind]); optind++) { 263 | 264 | if (i > 0 && *argv[optind] == '-') 265 | break; 266 | 267 | file = argv[optind]; 268 | 269 | if (file.find("-o") != std::string::npos) 270 | file.erase(0, 2); // handle file name attached to option 271 | 272 | userInput.outFiles.push_back(file); 273 | ++i; 274 | } 275 | userInput.stats_flag = 1; 276 | } 277 | break; 278 | case 's': // output size of features 279 | bedOutType = *optarg; 280 | userInput.outSize_flag = 1; 281 | break; 282 | case 't': // tabular output 283 | tabular_flag = 1; 284 | break; 285 | case 'v': // software version 286 | printf("gfastats v%s\n", version.c_str()); 287 | printf("Giulio Formenti giulio.formenti@gmail.com\n"); 288 | exit(EXIT_SUCCESS); 289 | case 'h': // help 290 | printf("gfastats input.[fasta|fastq|gfa][.gz] [expected genome size] [header[:start-end]]\n"); 291 | printf("genome size: estimated genome size for NG* statistics (optional).\n"); 292 | printf("header: target specific sequence by header, optionally with coordinates (optional).\n"); 293 | printf("\nOptions:\n"); 294 | printf("\t-a --agp-to-path converts input agp to path and replaces existing paths.\n"); 295 | printf("\t-b --out-coord a|s|c|g generates bed coordinates of given feature (agp|scaffolds|contigs|gaps default:agp).\n"); 296 | printf("\t-e --exclude-bed opposite of --include-bed. 
They can be combined (no coordinates).\n"); 297 | printf("\t-f --input-sequence input file (fasta, fastq, gfa [.gz]). Also as first positional argument.\n"); 298 | printf("\t-h --help print help and exit.\n"); 299 | printf("\t-i --include-bed generates output on a subset list of headers or coordinates in 0-based bed format.\n"); 300 | printf("\t-k --swiss-army-knife set of instructions provided as an ordered list.\n"); 301 | printf("\t-j --threads numbers of threads (default: max).\n"); 302 | printf("\t-o --out-format fasta|fastq|gfa[.gz] outputs selected sequences. If more than the extension is provided the output is written to the specified file (e.g. out.fasta.gz). Multiple file outputs can be given at once.\n"); 303 | printf("\t-s --out-size s|c|g generates size list of given feature (scaffolds|contigs|gaps default:scaffolds).\n"); 304 | printf("\t-t --tabular output in tabular format.\n"); 305 | printf("\t-v --version software version.\n\n"); 306 | printf("\t--cmd print $0 to stdout.\n"); 307 | printf("\t--remove-terminal-gaps removes leading/trailing Ns from scaffolds.\n"); 308 | printf("\t--discover-paths prototype to induce paths from input.\n"); 309 | printf("\t--discover-terminal-overlaps append perfect terminal overlaps of minimum length n (default: 1000).\n"); 310 | printf("\t--homopolymer-compress compress all the homopolymers longer than n in the input.\n"); 311 | printf("\t--line-length specifies line length in when output format is fasta. Default has no line breaks.\n"); 312 | printf("\t--nstar-report generates full N* and L* statistics.\n"); 313 | printf("\t--no-sequence do not output the sequence (eg. 
in gfa).\n"); 314 | printf("\t--out-sequence reports also the actual sequence (in combination with --seq-report).\n"); 315 | printf("\t--out-bubbles outputs a potential list of bubbles in the graph.\n"); 316 | printf("\t--segment-report report statistics for each segment/contig.\n"); 317 | printf("\t--path-report report statistics for each path/scaffold.\n"); 318 | printf("\t--sort ascending|descending|largest|smallest|file sort sequences according to input. Ascending/descending used the sequence/path header.\n"); 319 | printf("\t--stats report summary statistics (default).\n"); 320 | printf("\t--verbose verbose output.\n"); 321 | printf("\t--locale set a different locale, for instance to use , for thousand separators use en_US.UTF-8.\n"); 322 | printf("\nAll input files can be piped from stdin using '-'.\n"); 323 | exit(EXIT_SUCCESS); 324 | } 325 | if (argc == 2 || // handle various cases in which the output should include summary stats 326 | (argc == 3 && pos_op == 2) || 327 | (argc == 4 && pos_op == 3) || 328 | userInput.nstarReport_flag || 329 | userInput.discoverPaths_flag) { 330 | 331 | userInput.stats_flag = 1; // default mode 'stats' 332 | } 333 | } 334 | lg.verbose("Input variables assigned"); 335 | 336 | if (userInput.cmd_flag) { // print command line 337 | for (unsigned short int arg_counter = 0; arg_counter < argc; arg_counter++) { 338 | printf("%s ", argv[arg_counter]); 339 | } 340 | printf("\n"); 341 | } 342 | 343 | Input in; 344 | 345 | in.load(userInput); // load user input 346 | lg.verbose("Loaded user input"); 347 | 348 | InSequences inSequences; // initialize sequence collection object 349 | lg.verbose("Sequence object generated"); 350 | 351 | in.read(inSequences); // read input content to inSequences container 352 | lg.verbose("Finished reading input files"); 353 | if(verbose_flag) {std::cerr<<"\n";}; 354 | 355 | Report report; 356 | 357 | if (userInput.segmentReport_flag || userInput.outSequence_flag) { // report results for each sequence 358 | 
userInput.stats_flag = 0; 359 | report.segmentReport(inSequences, userInput.outSequence_flag); 360 | } 361 | if (userInput.pathReport_flag) { // report results for each sequence 362 | userInput.stats_flag = 0; 363 | report.pathReport(inSequences); 364 | } 365 | if (userInput.outFile_flag) { // output sequences to file or stdout 366 | userInput.stats_flag = 0; 367 | for (std::string file : userInput.outFiles) 368 | report.writeToStream(inSequences, file, userInput); 369 | } 370 | if (userInput.outCoord_flag || userInput.outSize_flag) { // output coordinates 371 | userInput.stats_flag = 0; 372 | report.outCoord(inSequences, bedOutType, userInput.outSize_flag); 373 | } 374 | if (userInput.stats_flag) { // output summary statistics 375 | report.reportStats(inSequences, gSize, userInput.outBubbles_flag); 376 | } 377 | if (userInput.nstarReport_flag) { // output full N/L* statistics 378 | report.nstarReport(inSequences, gSize); 379 | } 380 | lg.verbose("Generated output"); 381 | exit(EXIT_SUCCESS); 382 | } 383 | -------------------------------------------------------------------------------- /src/validate.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | USAGE: 3 | test 4 | 5 | EXAMPLE: 6 | build/bin/gfastats-validate validateFiles 7 | build/bin/gfastats-validate validateFiles/random1.fasta0.tst 8 | 9 | 10 | */ 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | 25 | bool printCommand = false; 26 | const std::string tmp = "tmp.txt"; 27 | const std::string err = "err.txt"; 28 | bool pass = true; 29 | 30 | void printFAIL(const char *m1="", const char *m2="", const char *m3="", const char *m4="") { 31 | pass = false; 32 | std::cout << "\033[0;31mFAIL\033[0m " << m1 << " " << m2 << " " << m3 << " " << m4 << std::endl; 33 | } 34 | 35 | void printPASS(const char *m1="", const char *m2="", const char *m3="", const 
char *m4="") { 36 | std::cout << "\033[0;32mPASS\033[0m " << m1 << " " << m2 << " " << m3 << " " << m4 << std::endl; 37 | } 38 | 39 | int main(int argc, char **argv) { 40 | if (argc == 1) { // test with no arguments 41 | std::cout << "gfastats-validate " << std::endl; 42 | exit(EXIT_SUCCESS); 43 | } 44 | 45 | int opt; 46 | while((opt = getopt(argc, argv, "c")) != -1) 47 | { 48 | switch(opt) 49 | { 50 | case 'c': 51 | printCommand = true; 52 | break; 53 | } 54 | } 55 | 56 | std::set input_files; 57 | 58 | for(int i=1; i "+tmp+" 2>"+err+"\""; 81 | #else 82 | std::string cmd = "\""+exePath+"\""+" "+line+" > "+tmp+" 2>"+err; 83 | #endif 84 | if(printCommand) std::cout << cmd << std::endl; 85 | 86 | if(system(cmd.c_str()) != EXIT_SUCCESS) { 87 | printFAIL(input_file.c_str(), "runtime error"); 88 | istream.close(); 89 | std::ifstream errfstream; 90 | errfstream.open(err); 91 | if(!errfstream) { 92 | std::cout << " error: couldn't open err.txt" << std::endl; 93 | continue; 94 | } 95 | for(std::string line; std::getline(errfstream, line);) { 96 | std::cout << " " << line.c_str() << std::endl; 97 | } 98 | errfstream.close(); 99 | continue; 100 | } 101 | 102 | 103 | std::getline(istream, line); 104 | exp.open(line); 105 | if(exp) { 106 | expOutput = &exp; // seperate expected output file 107 | } else if(line == "embedded") { 108 | expOutput = &istream; 109 | } else { 110 | printFAIL("couldn't open expected output"); 111 | continue; 112 | } 113 | 114 | actOutput.open(tmp); 115 | std::string line; 116 | std::getline(*expOutput, line); 117 | if(line == "+++Summary+++: ") { 118 | std::getline(actOutput, line); 119 | std::set exp_summary, act_summary; 120 | while(!actOutput.eof()) { 121 | std::getline(actOutput, line); 122 | act_summary.insert(line); 123 | } 124 | while(!expOutput->eof()) { 125 | std::getline(*expOutput, line); 126 | exp_summary.insert(line); 127 | } 128 | std::set additions, missings; 129 | for(const auto &entry : exp_summary) { 130 | if(act_summary.count(entry) 
== 0) { 131 | missings.insert(entry); 132 | } 133 | } 134 | for(const auto &entry : act_summary) { 135 | if(exp_summary.count(entry) == 0) { 136 | additions.insert(entry); 137 | } 138 | } 139 | 140 | actOutput.close(); 141 | exp.close(); 142 | istream.close(); 143 | 144 | if(additions.size() > 0 || missings.size() > 0) { 145 | printFAIL(input_file.c_str(), "expected output did not match actual output"); 146 | std::cout << "additions:" << std::endl; 147 | for(const auto &addition : additions) { 148 | std::cout << addition << std::endl; 149 | } 150 | std::cout << "missing:" << std::endl; 151 | for(const auto &missing : missings) { 152 | std::cout << missing << std::endl; 153 | } 154 | 155 | continue; // to next validate file 156 | } 157 | } 158 | else { 159 | std::vector> diffs; 160 | 161 | std::string l1, l2; 162 | std::getline(actOutput, l1); 163 | l2 = line; 164 | if(l1 != l2) diffs.push_back(std::pair(l1, l2)); 165 | 166 | while(!actOutput.eof() || !expOutput->eof()) { 167 | std::getline(actOutput, l1); 168 | std::getline(*expOutput, l2); 169 | if(l1 != l2) diffs.push_back(std::pair(l1, l2)); 170 | } 171 | 172 | actOutput.close(); 173 | exp.close(); 174 | istream.close(); 175 | 176 | if(diffs.size() > 0) { 177 | printFAIL(input_file.c_str(), "expected output did not match actual output"); 178 | for(const auto &pair : diffs) { 179 | std::cout << " expected: " << pair.second.c_str() << std::endl << " actual: " << pair.first.c_str() << std::endl; 180 | } 181 | continue; 182 | } 183 | } 184 | 185 | printPASS(input_file.c_str()); 186 | } 187 | 188 | if(input_files.size() != 0 && remove(tmp.c_str()) != 0) { 189 | std::cerr << "error deleting temp file " << tmp.c_str() << std::endl; 190 | } 191 | 192 | exit(pass ? 
EXIT_SUCCESS : EXIT_FAILURE); 193 | } 194 | -------------------------------------------------------------------------------- /testFiles/random1.agp: -------------------------------------------------------------------------------- 1 | newpath1 1 10 1 N 10 scaffold yes 2 | newpath1 11 15 2 W Header1 2 5 + 3 | newpath1 16 20 3 N 5 scaffold yes 4 | newpath1 21 23 4 W Header2 1 3 - 5 | newpath1 24 28 5 N 5 scaffold yes 6 | newpath1 29 34 6 W Header3 4 8 + 7 | newpath1 35 40 7 N 5 scaffold yes 8 | newpath2 1 10 1 N 10 scaffold yes 9 | newpath2 11 15 2 W Header5 3 7 - 10 | newpath2 16 20 3 N 5 scaffold yes 11 | newpath2 21 35 4 W Header4 1 15 + 12 | -------------------------------------------------------------------------------- /testFiles/random1.comment.sak: -------------------------------------------------------------------------------- 1 | COMMENT Header3 This is header 3 2 | COMMENT Header2 This is header 2 3 | COMMENT Header5 This is header 5 4 | COMMENT Header4 This is header 4 5 | COMMENT Header1 This is header 1 6 | -------------------------------------------------------------------------------- /testFiles/random1.fasta: -------------------------------------------------------------------------------- 1 | >Header1 5bp sequence with no gaps and 2 lowercase bases 2 | CGa 3 | cT 4 | >Header2 5bp sequence with internal 1bp non-canonical gap 5 | CG 6 | AXT 7 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 8 | TGANA 9 | TNCTN 10 | >Header4 15bp sequence with start 3bp canonical gap and 3 lowercase bases 11 | NNNTTCC 12 | TcgCACtC 13 | >Header5 15bp sequence with terminal 3bp canonical gap 14 | AACTCGAT 15 | CACGNNN 16 | -------------------------------------------------------------------------------- /testFiles/random1.fasta.1.bed: -------------------------------------------------------------------------------- 1 | Header1 0 5 2 | Header2 0 3 3 | Header2 4 5 4 | Header3 0 3 5 | Header3 4 6 6 | Header3 7 9 7 | Header4 2 13 8 | Header5 3 14 9 | 
-------------------------------------------------------------------------------- /testFiles/random1.fasta.2.bed: -------------------------------------------------------------------------------- 1 | Header3 2 | Header4 3 | -------------------------------------------------------------------------------- /testFiles/random1.fasta.3.bed: -------------------------------------------------------------------------------- 1 | Header1 4 5 2 | Header4 3 | -------------------------------------------------------------------------------- /testFiles/random1.fasta.4.bed: -------------------------------------------------------------------------------- 1 | Header3 2 | Header4 4 6 3 | -------------------------------------------------------------------------------- /testFiles/random1.fasta.5.bed: -------------------------------------------------------------------------------- 1 | Header3 4 6 2 | Header4 3 | -------------------------------------------------------------------------------- /testFiles/random1.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vgl-hub/gfastats/cedd755227fe0d7b6daf3fee932e325af54f3b21/testFiles/random1.fasta.gz -------------------------------------------------------------------------------- /testFiles/random1.fastq: -------------------------------------------------------------------------------- 1 | @Header1 5bp sequence with no gaps 2 | CGacT 3 | + 4 | 12345 5 | @Header2 5bp sequence with internal 1bp non-canonical gap 6 | CGAXT 7 | + 8 | 56789 9 | @Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 10 | TGANATNCTN 11 | + 12 | 56789:;<=> 13 | @Header4 15bp sequence with start 3bp canonical gap 14 | NNNTTCCTcgCACtC 15 | + 16 | !!!45:;<=>?5678 17 | @Header5 15bp sequence with terminal 3bp canonical gap 18 | AACTCGATCACGNNN 19 | + 20 | 98765:;<=>?5678 21 | -------------------------------------------------------------------------------- /testFiles/random1.fastq.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vgl-hub/gfastats/cedd755227fe0d7b6daf3fee932e325af54f3b21/testFiles/random1.fastq.gz -------------------------------------------------------------------------------- /testFiles/random1.gfa2: -------------------------------------------------------------------------------- 1 | H VN:Z:2.0 2 | S Header1.1 5 CGacT 3 | S Header2.1 3 CGA 4 | S Header2.3 1 T 5 | S Header3.1 3 TGA 6 | S Header3.3 2 AT 7 | S Header3.5 2 CT 8 | S Header4.2 12 TTCCTcgCACtC 9 | S Header5.1 12 AACTCGATCACG 10 | G Header2.2 Header2.1+ Header2.3+ 1 11 | G Header3.2 Header3.1+ Header3.3+ 1 12 | G Header3.4 Header3.3+ Header3.5+ 1 13 | G Header3.6 Header3.5+ Header3.5- 1 14 | G Header4.1 Header4.2+ Header4.2+ 3 15 | G Header5.2 Header5.1+ Header5.1- 3 16 | O Header1 Header1.1+ 5bp sequence with no gaps and 2 lowercase bases 17 | O Header2 Header2.1+ Header2.2 Header2.3+ 5bp sequence with internal 1bp non-canonical gap 18 | O Header3 Header3.1+ Header3.2 Header3.3+ Header3.4 Header3.5+ Header3.6 10bp sequence with internal 4bp and 1bp terminal canonical gap 19 | O Header4 Header4.1 Header4.2+ 15bp sequence with start 3bp canonical gap and 3 lowercase bases 20 | O Header5 Header5.1+ Header5.2 15bp sequence with terminal 3bp canonical gap 21 | -------------------------------------------------------------------------------- /testFiles/random1.gfa2.instructions.sak: -------------------------------------------------------------------------------- 1 | EXCLUDE Header3 2 | -------------------------------------------------------------------------------- /testFiles/random1.hc.sak: -------------------------------------------------------------------------------- 1 | COMPRESS Header4.2 1 2 | COMPRESS Header5.1 1 3 | -------------------------------------------------------------------------------- /testFiles/random1.hdc.sak: 
-------------------------------------------------------------------------------- 1 | COMPRESS Header4.2 1 2 | DECOMPRESS Header4.2 3 | COMPRESS Header5.1 1 4 | DECOMPRESS Header5.1 5 | -------------------------------------------------------------------------------- /testFiles/random1.instructions.sak: -------------------------------------------------------------------------------- 1 | JOIN Header1+ Header2+ 5 newGap1 Scaffold1 2 | JOIN Header4+ Header5+ 5 newGap2 Scaffold2 3 | JOIN Scaffold1+ Header3+ 10 newGap3 FinalScaffold 4 | SPLIT Header2.1 Header2.3 Scaffold3 Scaffold4 5 | EXCISE Header3.3 3 newGap4 6 | INVERT Header5.1 7 | REMOVE Header1.1 8 | RESIZE newGap2 10 9 | -------------------------------------------------------------------------------- /testFiles/random1.mask.sak: -------------------------------------------------------------------------------- 1 | MASK Header5 1 3 5 2 | -------------------------------------------------------------------------------- /testFiles/random1.rename.sak: -------------------------------------------------------------------------------- 1 | RENAME Header4 newHeader4 2 | RENAME Header1 newHeader1 3 | RENAME Header5 newHeader5 4 | RENAME Header2 newHeader2 5 | RENAME Header3 newHeader3 6 | -------------------------------------------------------------------------------- /testFiles/random1.rvcp.sak: -------------------------------------------------------------------------------- 1 | RVCP Header4 2 | RVCP Header3 3 | -------------------------------------------------------------------------------- /testFiles/random2.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.2 2 | S 11 ACCTT LN:i:5 QL:Z:?@97? 
3 | S 12 TCAAGG LN:i:6 QL:Z:@6?84@ 4 | S 13 CTTgaTT LN:i:7 QL:Z:>=?@877 5 | L 11 + 12 - 4M 6 | L 12 - 13 + 5M 7 | L 11 + 13 + 3M 8 | J 11 + 13 - 5 SC:i:1 9 | J 13 - 12 + 3 SC:i:1 10 | P 14 11+;13-;12+ 5,3 11 | P 15 11+,12-,13+ 4M,5M 12 | -------------------------------------------------------------------------------- /testFiles/random2.gfa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vgl-hub/gfastats/cedd755227fe0d7b6daf3fee932e325af54f3b21/testFiles/random2.gfa.gz -------------------------------------------------------------------------------- /testFiles/random2.gfa2: -------------------------------------------------------------------------------- 1 | H VN:Z:2.0 2 | S id2 6 TCAAGG 3 | G id5 id3+ id4- 5 4 | S id3 7 CTTGATT 5 | G id6 id1+ id2+ 3 6 | S id1 5 ACCTT 7 | S id4 8 CATGACTC 8 | E id10 id1+ id2- 2 4 2 4 3M 9 | S id7 9 TGAATGAAA 10 | G id8 id2+ id3- 2 11 | G id9 id7+ id7+ 5 12 | E id11 id2- id1+ 2 4 2 4 3M 13 | O id12 id1+ id6 id2(1:3)+ id8 id3- 14 | O path1 id1+ 15 | O path2 id2+ 16 | O path3 id3+ 17 | -------------------------------------------------------------------------------- /testFiles/random2.gfa2.agp: -------------------------------------------------------------------------------- 1 | id13 1 5 1 W path1 1 5 + 2 | id13 6 8 2 N 3 gap1 yes 3 | id13 9 14 3 W path2 1 6 + 4 | id13 15 16 4 N 2 gap2 yes 5 | id13 17 23 5 W path3 1 7 - 6 | -------------------------------------------------------------------------------- /testFiles/random2.gfa2.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vgl-hub/gfastats/cedd755227fe0d7b6daf3fee932e325af54f3b21/testFiles/random2.gfa2.gz -------------------------------------------------------------------------------- /testFiles/random2.noseq.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.2 2 | S 11 * LN:i:5 QL:Z:?@97? 
3 | S 12 * LN:i:6 QL:Z:@6?84@ 4 | S 13 * LN:i:7 QL:Z:>=?@877 5 | L 11 + 12 - 4M 6 | L 12 - 13 + 5M 7 | L 11 + 13 + 3M 8 | J 11 + 13 - 5 SC:i:1 9 | J 13 - 12 + 3 SC:i:1 10 | P 14 11+;13-;12+ 5,3 11 | P 15 11+,12-,13+ 4M,5M 12 | -------------------------------------------------------------------------------- /testFiles/random3.sorting.fasta: -------------------------------------------------------------------------------- 1 | >c 2 | ACGT 3 | >d 4 | CGTA 5 | >b 6 | GTAC 7 | >a 8 | TACG 9 | -------------------------------------------------------------------------------- /testFiles/random4.fasta: -------------------------------------------------------------------------------- 1 | >Header1 5bp sequence with no gaps and 2 lowercase bases 2 | CGacT 3 | >Header2 5bp sequence with internal 1bp non-canonical gap 4 | CGAXT 5 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 6 | TGANATNCTN 7 | >Header4 15bp sequence with start 3bp canonical gap and 3 lowercase bases 8 | NNNTTCCTcgCACtC 9 | >Header5 15bp sequence with terminal 3bp canonical gap 10 | AACTCGATCACGNNN 11 | -------------------------------------------------------------------------------- /testFiles/random5.findovl.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.2 2 | S 11 CCGTTCCATGAAGGCCAGAGTTACTTACCGGCCCTTTCCATGCGCGCGCCATAAA LN:i:55 3 | S 12 GATTTAAGAATATGTTAACGGAGGATTGCACGATCTTCTCTCCTCGTGAGAGAATTTATG LN:i:60 4 | S 13 AAATCGCATAGCTATGTATTTTGCAGAGGTAGCGACATCTTGACGGGCACTTCACAGATAGTGGG LN:i:65 5 | J 11 + 13 - 5 SC:i:1 6 | J 13 - 12 + 3 SC:i:1 7 | P 14 11+;13-;12+ 5,3 8 | P 15 11+,12-,13+ 6M,5M 9 | -------------------------------------------------------------------------------- /testFiles/random6.circular.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S edge_1 * dp:i:32 3 | S edge_2 * dp:i:21 4 | S edge_3 * dp:i:0 5 | S edge_4 * dp:i:0 6 | S edge_5 * dp:i:2 7 | S edge_6 * dp:i:0 8 | S 
edge_7 * dp:i:6 9 | S edge_8 * dp:i:20 10 | S edge_9 * dp:i:0 11 | L edge_1 + edge_1 + 0M RC:i:0 12 | L edge_1 - edge_1 - 0M RC:i:18 13 | L edge_2 + edge_2 + 0M RC:i:0 14 | L edge_2 - edge_2 - 0M RC:i:0 15 | L edge_7 + edge_7 + 0M RC:i:32 16 | L edge_7 - edge_7 - 0M RC:i:40 17 | P contig_1 edge_1+ * 18 | P contig_2 edge_2+ * 19 | P contig_3 edge_3+ * 20 | P contig_4 edge_4+ * 21 | P contig_5 edge_5+ * 22 | P contig_6 edge_6+ * 23 | P contig_7 edge_7+ * 24 | P contig_8 edge_8+ * 25 | P contig_9 edge_9+ * 26 | -------------------------------------------------------------------------------- /validateFiles/README.md: -------------------------------------------------------------------------------- 1 | # gfastats validation 2 | 3 | Validation files with a .tst extension are used to ensure that gfastats is in a working state after changes to the code. 4 | A .tst file consists of a single line of command line arguments to run gfastats with, followed by either "embedded" to signify the expected output is in the same file, or a path to a file with the expected output. 5 | gfastats is run with the given command line arguments and the expected output is compared to the actual output. Any differences are printed, and gfastats-validate returns `EXIT_FAILURE`. The exact commands being run to test output can be seen with the `-c` option. 6 | 7 | Run gfastats-validate with .tst files as command line arguments or folders to be recursively searched for all .tst files. 8 | The val.sh and val.bat scripts will run gfastats-validate with all files in "validateFiles/".
9 | 10 | Example Usage: 11 | ``` 12 | gfastats-validate validateFiles/random1.fasta.3.tst // tests only random1.fasta.3.tst 13 | gfastats-validate validateFiles // tests all files in the validateFiles folder 14 | ``` 15 | 16 | Test files can be automatically generated by running `gfastats-generate-tests`, but this should only be done while gfastats is in a working state, with no known bugs or unexpected behaviour. 17 | 18 | `gfastats-validate` is run automatically with all files in "validateFiles/" on Windows, Ubuntu, and macOS virtual machines through GitHub Actions when any changes are pushed or merged to the main branch. 19 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.10.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -b g 2 | embedded 3 | Header2 3 4 4 | Header3 3 4 5 | Header3 6 7 6 | Header3 9 10 7 | Header4 0 3 8 | Header5 12 15 9 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.11.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta --homopolymer-compress 1 -ofa 2 | embedded 3 | >Header1 5bp sequence with no gaps and 2 lowercase bases 4 | CGacT 5 | >Header2 5bp sequence with internal 1bp non-canonical gap 6 | CGANT 7 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 8 | TGANATNCTN 9 | >Header4 15bp sequence with start 3bp canonical gap and 3 lowercase bases 10 | NNNTCTcgCACtC 11 | >Header5 15bp sequence with terminal 3bp canonical gap 12 | ACTCGATCACGNNN 13 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.3.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 50 6 | Average
scaffold length: 10.00 7 | Scaffold N50: 15 8 | Scaffold auN: 12.00 9 | Scaffold L50: 2 10 | Largest scaffold: 15 11 | Smallest scaffold: 5 12 | # contigs: 8 13 | Total contig length: 40 14 | Average contig length: 5.00 15 | Contig N50: 12 16 | Contig auN: 8.50 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 6 21 | Total gap length in scaffolds: 10 22 | Average gap length in scaffolds: 1.67 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 2.20 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 3 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 9:14:6:11 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 6 35 | # paths: 5 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.4.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -s s 2 | embedded 3 | Header1 5 4 | Header2 5 5 | Header3 10 6 | Header4 15 7 | Header5 15 8 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.5.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -s c 2 | embedded 3 | Header1.1 5 4 | Header2.1 3 5 | Header2.3 1 6 | Header3.1 3 7 | Header3.3 2 8 | Header3.5 2 9 | Header4.2 12 10 | Header5.1 12 11 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.6.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -s g 2 | embedded 3 | Header2.2 1 4 | Header3.2 1 5 | Header3.4 1 6 | Header3.6 1 7 | Header4.1 3 8 | Header5.2 3 9 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.7.tst: 
-------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -b a 2 | embedded 3 | Header1 1 5 1 W Header1.1 1 5 + 4 | Header2 1 3 1 W Header2.1 1 3 + 5 | Header2 4 4 2 N 1 Header2.2 yes 6 | Header2 5 5 3 W Header2.3 1 1 + 7 | Header3 1 3 1 W Header3.1 1 3 + 8 | Header3 4 4 2 N 1 Header3.2 yes 9 | Header3 5 6 3 W Header3.3 1 2 + 10 | Header3 7 7 4 N 1 Header3.4 yes 11 | Header3 8 9 5 W Header3.5 1 2 + 12 | Header3 10 10 6 N 1 Header3.6 yes 13 | Header4 1 3 1 N 3 Header4.1 yes 14 | Header4 4 15 2 W Header4.2 1 12 + 15 | Header5 1 12 1 W Header5.1 1 12 + 16 | Header5 13 15 2 N 3 Header5.2 yes 17 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.78.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -k testFiles/random1.instructions.sak 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 2 5 | Total scaffold length: 60 6 | Average scaffold length: 30.00 7 | Scaffold N50: 40 8 | Scaffold auN: 33.33 9 | Scaffold L50: 1 10 | Largest scaffold: 40 11 | Smallest scaffold: 20 12 | # contigs: 5 13 | Total contig length: 30 14 | Average contig length: 6.00 15 | Contig N50: 12 16 | Contig auN: 10.07 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 6 21 | Total gap length in scaffolds: 30 22 | Average gap length in scaffolds: 5.00 23 | Gap N50 in scaffolds: 10 24 | Gap auN in scaffolds: 7.60 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 10 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 6:11:4:9 29 | GC content %: 50.00 30 | # soft-masked bases: 3 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 7 35 | # paths: 2 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.79.tst: 
-------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -ofa -k testFiles/random1.instructions.sak 2 | embedded 3 | >Scaffold2 4 | NNNTTCCTcgCACtCNNNNNNNNNNGCACTAGCTCAANNN 5 | >Scaffold4 6 | TNNNNNNNNNNTGANNNCTN 7 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.8.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -b s 2 | embedded 3 | Header1 0 5 4 | Header2 0 5 5 | Header3 0 10 6 | Header4 0 15 7 | Header5 0 15 8 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.80.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -ofa -k testFiles/random1.hc.sak 2 | embedded 3 | >Header1 5bp sequence with no gaps and 2 lowercase bases 4 | CGacT 5 | >Header2 5bp sequence with internal 1bp non-canonical gap 6 | CGANT 7 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 8 | TGANATNCTN 9 | >Header4 15bp sequence with start 3bp canonical gap and 3 lowercase bases 10 | NNNTCTcgCACtC 11 | >Header5 15bp sequence with terminal 3bp canonical gap 12 | ACTCGATCACGNNN 13 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.81.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -ofa -k testFiles/random1.hdc.sak 2 | embedded 3 | >Header1 5bp sequence with no gaps and 2 lowercase bases 4 | CGacT 5 | >Header2 5bp sequence with internal 1bp non-canonical gap 6 | CGANT 7 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 8 | TGANATNCTN 9 | >Header4 15bp sequence with start 3bp canonical gap and 3 lowercase bases 10 | NNNTTCCTcgCACtC 11 | >Header5 15bp sequence with terminal 3bp canonical gap 12 | AACTCGATCACGNNN 13 | 
-------------------------------------------------------------------------------- /validateFiles/random1.fasta.82.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta Header2 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 1 5 | Total scaffold length: 5 6 | Average scaffold length: 5.00 7 | Scaffold N50: 5 8 | Scaffold auN: 5.00 9 | Scaffold L50: 1 10 | Largest scaffold: 5 11 | Smallest scaffold: 5 12 | # contigs: 2 13 | Total contig length: 4 14 | Average contig length: 2.00 15 | Contig N50: 3 16 | Contig auN: 2.50 17 | Contig L50: 1 18 | Largest contig: 3 19 | Smallest contig: 1 20 | # gaps in scaffolds: 1 21 | Total gap length in scaffolds: 1 22 | Average gap length in scaffolds: 1.00 23 | Gap N50 in scaffolds: 1 24 | Gap auN in scaffolds: 1.00 25 | Gap L50 in scaffolds: 1 26 | Largest gap in scaffolds: 1 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 1:1:1:1 29 | GC content %: 50.00 30 | # soft-masked bases: 0 31 | # segments: 2 32 | Total segment length: 4 33 | Average segment length: 2.00 34 | # gaps: 1 35 | # paths: 1 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.83.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -ofa -e testFiles/random1.fasta.1.bed 2 | embedded 3 | >Header2 5bp sequence with internal 1bp non-canonical gap 4 | N 5 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 6 | NNN 7 | >Header4 15bp sequence with start 3bp canonical gap and 3 lowercase bases 8 | NNtC 9 | >Header5 15bp sequence with terminal 3bp canonical gap 10 | AACN 11 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.84.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -ofa -e testFiles/random1.fasta.2.bed 2 | 
embedded 3 | >Header1 5bp sequence with no gaps and 2 lowercase bases 4 | CGacT 5 | >Header2 5bp sequence with internal 1bp non-canonical gap 6 | CGANT 7 | >Header5 15bp sequence with terminal 3bp canonical gap 8 | AACTCGATCACGNNN 9 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.85.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -ofa -e testFiles/random1.fasta.3.bed 2 | embedded 3 | >Header1 5bp sequence with no gaps and 2 lowercase bases 4 | CGac 5 | >Header2 5bp sequence with internal 1bp non-canonical gap 6 | CGANT 7 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 8 | TGANATNCTN 9 | >Header5 15bp sequence with terminal 3bp canonical gap 10 | AACTCGATCACGNNN 11 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.86.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -ofa -e testFiles/random1.fasta.4.bed 2 | embedded 3 | >Header1 5bp sequence with no gaps and 2 lowercase bases 4 | CGacT 5 | >Header2 5bp sequence with internal 1bp non-canonical gap 6 | CGANT 7 | >Header4 15bp sequence with start 3bp canonical gap and 3 lowercase bases 8 | NNNTCTcgCACtC 9 | >Header5 15bp sequence with terminal 3bp canonical gap 10 | AACTCGATCACGNNN 11 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.87.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -ofa -e testFiles/random1.fasta.5.bed 2 | embedded 3 | >Header1 5bp sequence with no gaps and 2 lowercase bases 4 | CGacT 5 | >Header2 5bp sequence with internal 1bp non-canonical gap 6 | CGANT 7 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 8 | TGANNCTN 9 | >Header5 15bp sequence with terminal 3bp canonical 
gap 10 | AACTCGATCACGNNN 11 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.88.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -ofa -i testFiles/random1.fasta.1.bed 2 | embedded 3 | >Header1 5bp sequence with no gaps and 2 lowercase bases 4 | CGacT 5 | >Header2 5bp sequence with internal 1bp non-canonical gap 6 | CGAT 7 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 8 | TGAATCT 9 | >Header4 15bp sequence with start 3bp canonical gap and 3 lowercase bases 10 | NTTCCTcgCAC 11 | >Header5 15bp sequence with terminal 3bp canonical gap 12 | TCGATCACGNN 13 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.89.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -ofa -i testFiles/random1.fasta.2.bed 2 | embedded 3 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 4 | TGANATNCTN 5 | >Header4 15bp sequence with start 3bp canonical gap and 3 lowercase bases 6 | NNNTTCCTcgCACtC 7 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.9.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -b c 2 | embedded 3 | Header1 0 5 4 | Header2 0 3 5 | Header2 4 5 6 | Header3 0 3 7 | Header3 4 6 8 | Header3 7 9 9 | Header4 3 15 10 | Header5 0 12 11 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.90.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -ofa -i testFiles/random1.fasta.3.bed 2 | embedded 3 | >Header1 5bp sequence with no gaps and 2 lowercase bases 4 | T 5 | >Header4 15bp sequence with start 3bp canonical gap and 3 lowercase bases 6 | 
NNNTTCCTcgCACtC 7 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.91.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -ofa -i testFiles/random1.fasta.4.bed 2 | embedded 3 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 4 | TGANATNCTN 5 | >Header4 15bp sequence with start 3bp canonical gap and 3 lowercase bases 6 | TC 7 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.92.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -ofa -i testFiles/random1.fasta.5.bed 2 | embedded 3 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 4 | AT 5 | >Header4 15bp sequence with start 3bp canonical gap and 3 lowercase bases 6 | NNNTTCCTcgCACtC 7 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.93.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -a testFiles/random1.agp --stats 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 2 5 | Total scaffold length: 47 6 | Average scaffold length: 23.50 7 | Scaffold N50: 25 8 | Scaffold auN: 23.60 9 | Scaffold L50: 1 10 | Largest scaffold: 25 11 | Smallest scaffold: 22 12 | # contigs: 6 13 | Total contig length: 27 14 | Average contig length: 4.50 15 | Contig N50: 5 16 | Contig auN: 7.37 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 4 21 | Total gap length in scaffolds: 20 22 | Average gap length in scaffolds: 5.00 23 | Gap N50 in scaffolds: 6 24 | Gap auN in scaffolds: 6.30 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 8 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 4:10:5:8 29 | GC content %: 55.56 30 | # soft-masked bases: 5 31 | 
# segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 9 35 | # paths: 2 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.94.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -a testFiles/random1.agp -ofa 2 | embedded 3 | >newpath1 4 | GacTNNNNNTCGNNNNNNATNC 5 | >newpath2 6 | TCGAGNNNNNNNNTTCCTcgCACtC 7 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.95.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -k testFiles/random1.rvcp.sak 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 50 6 | Average scaffold length: 10.00 7 | Scaffold N50: 15 8 | Scaffold auN: 12.00 9 | Scaffold L50: 2 10 | Largest scaffold: 15 11 | Smallest scaffold: 5 12 | # contigs: 8 13 | Total contig length: 40 14 | Average contig length: 5.00 15 | Contig N50: 12 16 | Contig auN: 8.50 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 6 21 | Total gap length in scaffolds: 10 22 | Average gap length in scaffolds: 1.67 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 2.20 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 3 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 13:9:11:7 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 6 35 | # paths: 5 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.96.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta -k testFiles/random1.mask.sak 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 53 6 | 
Average scaffold length: 10.60 7 | Scaffold N50: 15 8 | Scaffold auN: 13.19 9 | Scaffold L50: 2 10 | Largest scaffold: 18 11 | Smallest scaffold: 5 12 | # contigs: 9 13 | Total contig length: 38 14 | Average contig length: 4.22 15 | Contig N50: 9 16 | Contig auN: 7.32 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 7 21 | Total gap length in scaffolds: 15 22 | Average gap length in scaffolds: 2.14 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 3.13 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 5 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 8:13:6:11 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 7 35 | # paths: 5 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.gz.100.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta.gz -k testFiles/random1.mask.sak 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 53 6 | Average scaffold length: 10.60 7 | Scaffold N50: 15 8 | Scaffold auN: 13.19 9 | Scaffold L50: 2 10 | Largest scaffold: 18 11 | Smallest scaffold: 5 12 | # contigs: 9 13 | Total contig length: 38 14 | Average contig length: 4.22 15 | Contig N50: 9 16 | Contig auN: 7.32 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 7 21 | Total gap length in scaffolds: 15 22 | Average gap length in scaffolds: 2.14 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 3.13 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 5 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 8:13:6:11 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 7 35 | # paths: 5 36 | 
-------------------------------------------------------------------------------- /validateFiles/random1.fasta.gz.69.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta.gz 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 50 6 | Average scaffold length: 10.00 7 | Scaffold N50: 15 8 | Scaffold auN: 12.00 9 | Scaffold L50: 2 10 | Largest scaffold: 15 11 | Smallest scaffold: 5 12 | # contigs: 8 13 | Total contig length: 40 14 | Average contig length: 5.00 15 | Contig N50: 12 16 | Contig auN: 8.50 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 6 21 | Total gap length in scaffolds: 10 22 | Average gap length in scaffolds: 1.67 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 2.20 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 3 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 9:14:6:11 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 6 35 | # paths: 5 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.gz.70.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta.gz -s s 2 | embedded 3 | Header1 5 4 | Header2 5 5 | Header3 10 6 | Header4 15 7 | Header5 15 8 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.gz.71.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta.gz -s c 2 | embedded 3 | Header1.1 5 4 | Header2.1 3 5 | Header2.3 1 6 | Header3.1 3 7 | Header3.3 2 8 | Header3.5 2 9 | Header4.2 12 10 | Header5.1 12 11 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.gz.72.tst: 
-------------------------------------------------------------------------------- 1 | testFiles/random1.fasta.gz -s g 2 | embedded 3 | Header2.2 1 4 | Header3.2 1 5 | Header3.4 1 6 | Header3.6 1 7 | Header4.1 3 8 | Header5.2 3 9 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.gz.73.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta.gz -b a 2 | embedded 3 | Header1 1 5 1 W Header1.1 1 5 + 4 | Header2 1 3 1 W Header2.1 1 3 + 5 | Header2 4 4 2 N 1 Header2.2 yes 6 | Header2 5 5 3 W Header2.3 1 1 + 7 | Header3 1 3 1 W Header3.1 1 3 + 8 | Header3 4 4 2 N 1 Header3.2 yes 9 | Header3 5 6 3 W Header3.3 1 2 + 10 | Header3 7 7 4 N 1 Header3.4 yes 11 | Header3 8 9 5 W Header3.5 1 2 + 12 | Header3 10 10 6 N 1 Header3.6 yes 13 | Header4 1 3 1 N 3 Header4.1 yes 14 | Header4 4 15 2 W Header4.2 1 12 + 15 | Header5 1 12 1 W Header5.1 1 12 + 16 | Header5 13 15 2 N 3 Header5.2 yes 17 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.gz.74.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta.gz -b s 2 | embedded 3 | Header1 0 5 4 | Header2 0 5 5 | Header3 0 10 6 | Header4 0 15 7 | Header5 0 15 8 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.gz.75.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta.gz -b c 2 | embedded 3 | Header1 0 5 4 | Header2 0 3 5 | Header2 4 5 6 | Header3 0 3 7 | Header3 4 6 8 | Header3 7 9 9 | Header4 3 15 10 | Header5 0 12 11 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.gz.76.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta.gz -b g 2 | embedded 3 | Header2 3 4 4 
| Header3 3 4 5 | Header3 6 7 6 | Header3 9 10 7 | Header4 0 3 8 | Header5 12 15 9 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.gz.77.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta.gz --homopolymer-compress 1 -ofa 2 | embedded 3 | >Header1 5bp sequence with no gaps and 2 lowercase bases 4 | CGacT 5 | >Header2 5bp sequence with internal 1bp non-canonical gap 6 | CGANT 7 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 8 | TGANATNCTN 9 | >Header4 15bp sequence with start 3bp canonical gap and 3 lowercase bases 10 | NNNTCTcgCACtC 11 | >Header5 15bp sequence with terminal 3bp canonical gap 12 | ACTCGATCACGNNN 13 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.gz.97.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta.gz -a testFiles/random1.agp --stats 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 2 5 | Total scaffold length: 47 6 | Average scaffold length: 23.50 7 | Scaffold N50: 25 8 | Scaffold auN: 23.60 9 | Scaffold L50: 1 10 | Largest scaffold: 25 11 | Smallest scaffold: 22 12 | # contigs: 6 13 | Total contig length: 27 14 | Average contig length: 4.50 15 | Contig N50: 5 16 | Contig auN: 7.37 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 4 21 | Total gap length in scaffolds: 20 22 | Average gap length in scaffolds: 5.00 23 | Gap N50 in scaffolds: 6 24 | Gap auN in scaffolds: 6.30 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 8 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 4:10:5:8 29 | GC content %: 55.56 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 9 35 | # paths: 2 36 | 
-------------------------------------------------------------------------------- /validateFiles/random1.fasta.gz.98.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta.gz -a testFiles/random1.agp -ofa 2 | embedded 3 | >newpath1 4 | GacTNNNNNTCGNNNNNNATNC 5 | >newpath2 6 | TCGAGNNNNNNNNTTCCTcgCACtC 7 | -------------------------------------------------------------------------------- /validateFiles/random1.fasta.gz.99.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fasta.gz -k testFiles/random1.rvcp.sak 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 50 6 | Average scaffold length: 10.00 7 | Scaffold N50: 15 8 | Scaffold auN: 12.00 9 | Scaffold L50: 2 10 | Largest scaffold: 15 11 | Smallest scaffold: 5 12 | # contigs: 8 13 | Total contig length: 40 14 | Average contig length: 5.00 15 | Contig N50: 12 16 | Contig auN: 8.50 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 6 21 | Total gap length in scaffolds: 10 22 | Average gap length in scaffolds: 1.67 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 2.20 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 3 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 13:9:11:7 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 6 35 | # paths: 5 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.101.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq -a testFiles/random1.agp --stats 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 2 5 | Total scaffold length: 47 6 | Average scaffold length: 23.50 7 | Scaffold N50: 25 8 | Scaffold auN: 23.60 9 | Scaffold L50: 1 10 | 
Largest scaffold: 25 11 | Smallest scaffold: 22 12 | # contigs: 6 13 | Total contig length: 27 14 | Average contig length: 4.50 15 | Contig N50: 5 16 | Contig auN: 7.37 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 4 21 | Total gap length in scaffolds: 20 22 | Average gap length in scaffolds: 5.00 23 | Gap N50 in scaffolds: 6 24 | Gap auN in scaffolds: 6.30 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 8 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 4:10:5:8 29 | GC content %: 55.56 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 9 35 | # paths: 2 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.102.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq -a testFiles/random1.agp -ofa 2 | embedded 3 | >newpath1 4 | GacTNNNNNTCGNNNNNNATNC 5 | >newpath2 6 | TCGAGNNNNNNNNTTCCTcgCACtC 7 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.103.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq -k testFiles/random1.rvcp.sak 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 50 6 | Average scaffold length: 10.00 7 | Scaffold N50: 15 8 | Scaffold auN: 12.00 9 | Scaffold L50: 2 10 | Largest scaffold: 15 11 | Smallest scaffold: 5 12 | # contigs: 8 13 | Total contig length: 40 14 | Average contig length: 5.00 15 | Contig N50: 12 16 | Contig auN: 8.50 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 6 21 | Total gap length in scaffolds: 10 22 | Average gap length in scaffolds: 1.67 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 2.20 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 3 
27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 13:9:11:7 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 6 35 | # paths: 5 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.104.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq -k testFiles/random1.mask.sak 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 53 6 | Average scaffold length: 10.60 7 | Scaffold N50: 15 8 | Scaffold auN: 13.19 9 | Scaffold L50: 2 10 | Largest scaffold: 18 11 | Smallest scaffold: 5 12 | # contigs: 9 13 | Total contig length: 38 14 | Average contig length: 4.22 15 | Contig N50: 9 16 | Contig auN: 7.32 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 7 21 | Total gap length in scaffolds: 15 22 | Average gap length in scaffolds: 2.14 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 3.13 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 5 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 8:13:6:11 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 7 35 | # paths: 5 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.54.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 50 6 | Average scaffold length: 10.00 7 | Scaffold N50: 15 8 | Scaffold auN: 12.00 9 | Scaffold L50: 2 10 | Largest scaffold: 15 11 | Smallest scaffold: 5 12 | # contigs: 8 13 | Total contig length: 40 14 | Average contig length: 5.00 15 | Contig N50: 12 16 | Contig 
auN: 8.50 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 6 21 | Total gap length in scaffolds: 10 22 | Average gap length in scaffolds: 1.67 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 2.20 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 3 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 9:14:6:11 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 6 35 | # paths: 5 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.55.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq -s s 2 | embedded 3 | Header1 5 4 | Header2 5 5 | Header3 10 6 | Header4 15 7 | Header5 15 8 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.56.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq -s c 2 | embedded 3 | Header1.1 5 4 | Header2.1 3 5 | Header2.3 1 6 | Header3.1 3 7 | Header3.3 2 8 | Header3.5 2 9 | Header4.2 12 10 | Header5.1 12 11 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.57.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq -s g 2 | embedded 3 | Header2.2 1 4 | Header3.2 1 5 | Header3.4 1 6 | Header3.6 1 7 | Header4.1 3 8 | Header5.2 3 9 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.58.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq -b a 2 | embedded 3 | Header1 1 5 1 W Header1.1 1 5 + 4 | Header2 1 3 1 W Header2.1 1 3 + 5 | Header2 4 4 2 N 1 Header2.2 yes 6 | Header2 5 5 3 W Header2.3 1 1 + 7 | 
Header3 1 3 1 W Header3.1 1 3 + 8 | Header3 4 4 2 N 1 Header3.2 yes 9 | Header3 5 6 3 W Header3.3 1 2 + 10 | Header3 7 7 4 N 1 Header3.4 yes 11 | Header3 8 9 5 W Header3.5 1 2 + 12 | Header3 10 10 6 N 1 Header3.6 yes 13 | Header4 1 3 1 N 3 Header4.1 yes 14 | Header4 4 15 2 W Header4.2 1 12 + 15 | Header5 1 12 1 W Header5.1 1 12 + 16 | Header5 13 15 2 N 3 Header5.2 yes 17 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.59.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq -b s 2 | embedded 3 | Header1 0 5 4 | Header2 0 5 5 | Header3 0 10 6 | Header4 0 15 7 | Header5 0 15 8 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.60.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq -b c 2 | embedded 3 | Header1 0 5 4 | Header2 0 3 5 | Header2 4 5 6 | Header3 0 3 7 | Header3 4 6 8 | Header3 7 9 9 | Header4 3 15 10 | Header5 0 12 11 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.61.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq -b g 2 | embedded 3 | Header2 3 4 4 | Header3 3 4 5 | Header3 6 7 6 | Header3 9 10 7 | Header4 0 3 8 | Header5 12 15 9 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.62.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq --homopolymer-compress 1 -ofa 2 | embedded 3 | >Header1 5bp sequence with no gaps 4 | CGacT 5 | >Header2 5bp sequence with internal 1bp non-canonical gap 6 | CGANT 7 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 8 | TGANATNCTN 9 | >Header4 15bp sequence with start 3bp canonical gap 10 | NNNTCTcgCACtC 11 | 
>Header5 15bp sequence with terminal 3bp canonical gap 12 | ACTCGATCACGNNN 13 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.gz.105.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq.gz -a testFiles/random1.agp --stats 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 2 5 | Total scaffold length: 47 6 | Average scaffold length: 23.50 7 | Scaffold N50: 25 8 | Scaffold auN: 23.60 9 | Scaffold L50: 1 10 | Largest scaffold: 25 11 | Smallest scaffold: 22 12 | # contigs: 6 13 | Total contig length: 27 14 | Average contig length: 4.50 15 | Contig N50: 5 16 | Contig auN: 7.37 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 4 21 | Total gap length in scaffolds: 20 22 | Average gap length in scaffolds: 5.00 23 | Gap N50 in scaffolds: 6 24 | Gap auN in scaffolds: 6.30 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 8 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 4:10:5:8 29 | GC content %: 55.56 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 9 35 | # paths: 2 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.gz.106.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq.gz -a testFiles/random1.agp -ofa 2 | embedded 3 | >newpath1 4 | GacTNNNNNTCGNNNNNNATNC 5 | >newpath2 6 | TCGAGNNNNNNNNTTCCTcgCACtC 7 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.gz.107.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq.gz -k testFiles/random1.rvcp.sak 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 50 6 | Average scaffold 
length: 10.00 7 | Scaffold N50: 15 8 | Scaffold auN: 12.00 9 | Scaffold L50: 2 10 | Largest scaffold: 15 11 | Smallest scaffold: 5 12 | # contigs: 8 13 | Total contig length: 40 14 | Average contig length: 5.00 15 | Contig N50: 12 16 | Contig auN: 8.50 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 6 21 | Total gap length in scaffolds: 10 22 | Average gap length in scaffolds: 1.67 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 2.20 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 3 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 13:9:11:7 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 6 35 | # paths: 5 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.gz.108.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq.gz -k testFiles/random1.mask.sak 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 53 6 | Average scaffold length: 10.60 7 | Scaffold N50: 15 8 | Scaffold auN: 13.19 9 | Scaffold L50: 2 10 | Largest scaffold: 18 11 | Smallest scaffold: 5 12 | # contigs: 9 13 | Total contig length: 38 14 | Average contig length: 4.22 15 | Contig N50: 9 16 | Contig auN: 7.32 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 7 21 | Total gap length in scaffolds: 15 22 | Average gap length in scaffolds: 2.14 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 3.13 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 5 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 8:13:6:11 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 7 35 | # paths: 5 36 | 
-------------------------------------------------------------------------------- /validateFiles/random1.fastq.gz.33.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq.gz 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 50 6 | Average scaffold length: 10.00 7 | Scaffold N50: 15 8 | Scaffold auN: 12.00 9 | Scaffold L50: 2 10 | Largest scaffold: 15 11 | Smallest scaffold: 5 12 | # contigs: 8 13 | Total contig length: 40 14 | Average contig length: 5.00 15 | Contig N50: 12 16 | Contig auN: 8.50 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 6 21 | Total gap length in scaffolds: 10 22 | Average gap length in scaffolds: 1.67 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 2.20 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 3 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 9:14:6:11 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 6 35 | # paths: 5 36 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.gz.34.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq.gz -s s 2 | embedded 3 | Header1 5 4 | Header2 5 5 | Header3 10 6 | Header4 15 7 | Header5 15 8 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.gz.35.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq.gz -s c 2 | embedded 3 | Header1.1 5 4 | Header2.1 3 5 | Header2.3 1 6 | Header3.1 3 7 | Header3.3 2 8 | Header3.5 2 9 | Header4.2 12 10 | Header5.1 12 11 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.gz.36.tst: 
-------------------------------------------------------------------------------- 1 | testFiles/random1.fastq.gz -s g 2 | embedded 3 | Header2.2 1 4 | Header3.2 1 5 | Header3.4 1 6 | Header3.6 1 7 | Header4.1 3 8 | Header5.2 3 9 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.gz.37.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq.gz -b a 2 | embedded 3 | Header1 1 5 1 W Header1.1 1 5 + 4 | Header2 1 3 1 W Header2.1 1 3 + 5 | Header2 4 4 2 N 1 Header2.2 yes 6 | Header2 5 5 3 W Header2.3 1 1 + 7 | Header3 1 3 1 W Header3.1 1 3 + 8 | Header3 4 4 2 N 1 Header3.2 yes 9 | Header3 5 6 3 W Header3.3 1 2 + 10 | Header3 7 7 4 N 1 Header3.4 yes 11 | Header3 8 9 5 W Header3.5 1 2 + 12 | Header3 10 10 6 N 1 Header3.6 yes 13 | Header4 1 3 1 N 3 Header4.1 yes 14 | Header4 4 15 2 W Header4.2 1 12 + 15 | Header5 1 12 1 W Header5.1 1 12 + 16 | Header5 13 15 2 N 3 Header5.2 yes 17 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.gz.38.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq.gz -b s 2 | embedded 3 | Header1 0 5 4 | Header2 0 5 5 | Header3 0 10 6 | Header4 0 15 7 | Header5 0 15 8 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.gz.39.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq.gz -b c 2 | embedded 3 | Header1 0 5 4 | Header2 0 3 5 | Header2 4 5 6 | Header3 0 3 7 | Header3 4 6 8 | Header3 7 9 9 | Header4 3 15 10 | Header5 0 12 11 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.gz.40.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq.gz -b g 2 | embedded 3 | Header2 3 4 4 
| Header3 3 4 5 | Header3 6 7 6 | Header3 9 10 7 | Header4 0 3 8 | Header5 12 15 9 | -------------------------------------------------------------------------------- /validateFiles/random1.fastq.gz.41.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.fastq.gz --homopolymer-compress 1 -ofa 2 | embedded 3 | >Header1 5bp sequence with no gaps 4 | CGacT 5 | >Header2 5bp sequence with internal 1bp non-canonical gap 6 | CGANT 7 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 8 | TGANATNCTN 9 | >Header4 15bp sequence with start 3bp canonical gap 10 | NNNTCTcgCACtC 11 | >Header5 15bp sequence with terminal 3bp canonical gap 12 | ACTCGATCACGNNN 13 | -------------------------------------------------------------------------------- /validateFiles/random1.gfa2.109.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.gfa2 -a testFiles/random1.agp --stats 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 2 5 | Total scaffold length: 47 6 | Average scaffold length: 23.50 7 | Scaffold N50: 25 8 | Scaffold auN: 23.60 9 | Scaffold L50: 1 10 | Largest scaffold: 25 11 | Smallest scaffold: 22 12 | # contigs: 6 13 | Total contig length: 27 14 | Average contig length: 4.50 15 | Contig N50: 5 16 | Contig auN: 7.37 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 4 21 | Total gap length in scaffolds: 20 22 | Average gap length in scaffolds: 5.00 23 | Gap N50 in scaffolds: 6 24 | Gap auN in scaffolds: 6.30 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 8 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 4:10:5:8 29 | GC content %: 55.56 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 9 35 | # paths: 2 36 | -------------------------------------------------------------------------------- 
/validateFiles/random1.gfa2.110.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.gfa2 -a testFiles/random1.agp -ofa 2 | embedded 3 | >newpath1 4 | GacTNNNNNTCGNNNNNNATNC 5 | >newpath2 6 | TCGAGNNNNNNNNTTCCTcgCACtC 7 | -------------------------------------------------------------------------------- /validateFiles/random1.gfa2.111.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.gfa2 -k testFiles/random1.rvcp.sak 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 50 6 | Average scaffold length: 10.00 7 | Scaffold N50: 15 8 | Scaffold auN: 12.00 9 | Scaffold L50: 2 10 | Largest scaffold: 15 11 | Smallest scaffold: 5 12 | # contigs: 8 13 | Total contig length: 40 14 | Average contig length: 5.00 15 | Contig N50: 12 16 | Contig auN: 8.50 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 6 21 | Total gap length in scaffolds: 10 22 | Average gap length in scaffolds: 1.67 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 2.20 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 3 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 13:9:11:7 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 6 35 | # paths: 5 36 | -------------------------------------------------------------------------------- /validateFiles/random1.gfa2.112.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.gfa2 -k testFiles/random1.mask.sak 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 53 6 | Average scaffold length: 10.60 7 | Scaffold N50: 15 8 | Scaffold auN: 13.19 9 | Scaffold L50: 2 10 | Largest scaffold: 18 11 | Smallest scaffold: 5 12 | # contigs: 9 13 | Total contig length: 38 14 | 
Average contig length: 4.22 15 | Contig N50: 9 16 | Contig auN: 7.32 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 7 21 | Total gap length in scaffolds: 15 22 | Average gap length in scaffolds: 2.14 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 3.13 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 5 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 8:13:6:11 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 7 35 | # paths: 5 36 | -------------------------------------------------------------------------------- /validateFiles/random1.gfa2.113.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.gfa2 -k testFiles/random1.gfa2.instructions.sak 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 4 5 | Total scaffold length: 40 6 | Average scaffold length: 10.00 7 | Scaffold N50: 15 8 | Scaffold auN: 12.50 9 | Scaffold L50: 2 10 | Largest scaffold: 15 11 | Smallest scaffold: 5 12 | # contigs: 5 13 | Total contig length: 33 14 | Average contig length: 6.60 15 | Contig N50: 12 16 | Contig auN: 9.79 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 3 21 | Total gap length in scaffolds: 7 22 | Average gap length in scaffolds: 2.33 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 2.71 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 3 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 7:13:5:8 29 | GC content %: 54.55 30 | # soft-masked bases: 5 31 | # segments: 5 32 | Total segment length: 33 33 | Average segment length: 6.60 34 | # gaps: 3 35 | # paths: 4 36 | -------------------------------------------------------------------------------- /validateFiles/random1.gfa2.12.tst: -------------------------------------------------------------------------------- 1 | 
testFiles/random1.gfa2 -o gfa2 2 | embedded 3 | H VN:Z:2.0 4 | S Header1.1 5 CGacT 5 | S Header2.1 3 CGA 6 | S Header2.3 1 T 7 | S Header3.1 3 TGA 8 | S Header3.3 2 AT 9 | S Header3.5 2 CT 10 | S Header4.2 12 TTCCTcgCACtC 11 | S Header5.1 12 AACTCGATCACG 12 | G Header2.2 Header2.1+ Header2.3+ 1 13 | G Header3.2 Header3.1+ Header3.3+ 1 14 | G Header3.4 Header3.3+ Header3.5+ 1 15 | G Header3.6 Header3.5+ Header3.5- 1 16 | G Header4.1 Header4.2+ Header4.2+ 3 17 | G Header5.2 Header5.1+ Header5.1- 3 18 | O Header1 Header1.1+ 19 | O Header2 Header2.1+ Header2.2 Header2.3+ 20 | O Header3 Header3.1+ Header3.2 Header3.3+ Header3.4 Header3.5+ Header3.6 21 | O Header4 Header4.1 Header4.2+ 22 | O Header5 Header5.1+ Header5.2 23 | -------------------------------------------------------------------------------- /validateFiles/random1.gfa2.13.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.gfa2 -o gfa 2 | embedded 3 | H VN:Z:1.2 4 | S Header1.1 CGacT 5 | S Header2.1 CGA 6 | S Header2.3 T 7 | S Header3.1 TGA 8 | S Header3.3 AT 9 | S Header3.5 CT 10 | S Header4.2 TTCCTcgCACtC 11 | S Header5.1 AACTCGATCACG 12 | J Header2.1 + Header2.3 + 1 13 | J Header3.1 + Header3.3 + 1 14 | J Header3.3 + Header3.5 + 1 15 | J Header3.5 + Header3.5 - 1 16 | J Header4.2 + Header4.2 + 3 17 | J Header5.1 + Header5.1 - 3 18 | P Header1 Header1.1+ * 19 | P Header2 Header2.1+;Header2.3+ * 20 | P Header3 Header3.1+;Header3.3+;Header3.5+; * 21 | P Header4 ;Header4.2+ * 22 | P Header5 Header5.1+; * 23 | -------------------------------------------------------------------------------- /validateFiles/random1.gfa2.14.tst: -------------------------------------------------------------------------------- 1 | testFiles/random1.gfa2 -o fasta 2 | embedded 3 | >Header1 4 | CGacT 5 | >Header2 6 | CGANT 7 | >Header3 8 | TGANATNCTN 9 | >Header4 10 | NNNTTCCTcgCACtC 11 | >Header5 12 | AACTCGATCACGNNN 13 | 
-------------------------------------------------------------------------------- /validateFiles/random2.gfa.42.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.gfa -o gfa2 2 | embedded 3 | H VN:Z:2.0 4 | S 11 5 ACCTT LN:i:5 QL:Z:?@97? 5 | S 12 6 TCAAGG LN:i:6 QL:Z:@6?84@ 6 | S 13 7 CTTgaTT LN:i:7 QL:Z:>=?@877 7 | E edge0 11 + 12 - 4M 8 | E edge1 12 - 13 + 5M 9 | E edge2 11 + 13 + 3M 10 | G gap0 11+ 13- 5 SC:i:1 11 | G gap1 13- 12+ 3 SC:i:1 12 | O 14 11+ gap0 13- gap1 12+ 13 | O 15 11+ edge0 12- edge1 13+ 14 | -------------------------------------------------------------------------------- /validateFiles/random2.gfa.43.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.gfa -o gfa 2 | embedded 3 | H VN:Z:1.2 4 | S 11 ACCTT LN:i:5 QL:Z:?@97? 5 | S 12 TCAAGG LN:i:6 QL:Z:@6?84@ 6 | S 13 CTTgaTT LN:i:7 QL:Z:>=?@877 7 | L 11 + 12 - 4M 8 | L 12 - 13 + 5M 9 | L 11 + 13 + 3M 10 | J 11 + 13 - 5 SC:i:1 11 | J 13 - 12 + 3 SC:i:1 12 | P 14 11+;13-;12+ * 13 | P 15 11+,12-,13+ * 14 | -------------------------------------------------------------------------------- /validateFiles/random2.gfa.44.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.gfa -o fasta 2 | embedded 3 | >14 4 | ACCTTNNNNNAAtcAAGNNNTCAAGG 5 | >15 6 | ACCTTGATT 7 | -------------------------------------------------------------------------------- /validateFiles/random2.gfa.gz.45.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.gfa.gz -o gfa2 2 | embedded 3 | H VN:Z:2.0 4 | S 11 5 ACCTT LN:i:5 QL:Z:?@97? 
5 | S 12 6 TCAAGG LN:i:6 QL:Z:@6?84@ 6 | S 13 7 CTTgaTT LN:i:7 QL:Z:>=?@877 7 | E edge0 11 + 12 - 4M 8 | E edge1 12 - 13 + 5M 9 | E edge2 11 + 13 + 3M 10 | G gap0 11+ 13- 5 SC:i:1 11 | G gap1 13- 12+ 3 SC:i:1 12 | O 14 11+ gap0 13- gap1 12+ 13 | O 15 11+ edge0 12- edge1 13+ 14 | -------------------------------------------------------------------------------- /validateFiles/random2.gfa.gz.46.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.gfa.gz -o gfa 2 | embedded 3 | H VN:Z:1.2 4 | S 11 ACCTT LN:i:5 QL:Z:?@97? 5 | S 12 TCAAGG LN:i:6 QL:Z:@6?84@ 6 | S 13 CTTgaTT LN:i:7 QL:Z:>=?@877 7 | L 11 + 12 - 4M 8 | L 12 - 13 + 5M 9 | L 11 + 13 + 3M 10 | J 11 + 13 - 5 SC:i:1 11 | J 13 - 12 + 3 SC:i:1 12 | P 14 11+;13-;12+ * 13 | P 15 11+,12-,13+ * 14 | -------------------------------------------------------------------------------- /validateFiles/random2.gfa.gz.47.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.gfa.gz -o fasta 2 | embedded 3 | >14 4 | ACCTTNNNNNAAtcAAGNNNTCAAGG 5 | >15 6 | ACCTTGATT 7 | -------------------------------------------------------------------------------- /validateFiles/random2.gfa2.63.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.gfa2 -o gfa2 2 | embedded 3 | H VN:Z:2.0 4 | S id2 6 TCAAGG 5 | S id3 7 CTTGATT 6 | S id1 5 ACCTT 7 | S id4 8 CATGACTC 8 | S id7 9 TGAATGAAA 9 | E id10 id1 + id2 - 3M 10 | E id11 id2 - id1 + 3M 11 | G id5 id3+ id4- 5 12 | G id6 id1+ id2+ 3 13 | G id8 id2+ id3- 2 14 | G id9 id7+ id7+ 5 15 | O id12 id1+ id6 id2(1:3)+ id8 id3- 16 | O path1 id1+ 17 | O path2 id2+ 18 | O path3 id3+ 19 | -------------------------------------------------------------------------------- /validateFiles/random2.gfa2.64.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.gfa2 -o gfa 2 | embedded 3 | H VN:Z:1.2 4 | S 
id2 TCAAGG 5 | S id3 CTTGATT 6 | S id1 ACCTT 7 | S id4 CATGACTC 8 | S id7 TGAATGAAA 9 | L id1 + id2 - 3M 10 | L id2 - id1 + 3M 11 | J id3 + id4 - 5 12 | J id1 + id2 + 3 13 | J id2 + id3 - 2 14 | J id7 + id7 + 5 15 | P id12 id1+;id2(1:3)+;id3- * 16 | P path1 id1+ * 17 | P path2 id2+ * 18 | P path3 id3+ * 19 | -------------------------------------------------------------------------------- /validateFiles/random2.gfa2.65.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.gfa2 -o fasta 2 | embedded 3 | >id12 4 | ACCTTNNNTCANNAATCAAG 5 | >path1 6 | ACCTT 7 | >path2 8 | TCAAGG 9 | >path3 10 | CTTGATT 11 | -------------------------------------------------------------------------------- /validateFiles/random2.gfa2.gz.51.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.gfa2.gz -o gfa2 2 | embedded 3 | H VN:Z:2.0 4 | S id2 6 TCAAGG 5 | S id3 7 CTTGATT 6 | S id1 5 ACCTT 7 | S id4 8 CATGACTC 8 | S id7 9 TGAATGAAA 9 | E id10 id1 + id2 - 3M 10 | E id11 id2 - id1 + 3M 11 | G id5 id3+ id4- 5 12 | G id6 id1+ id2+ 3 13 | G id8 id2+ id3- 2 14 | G id9 id7+ id7+ 5 15 | O id12 id1+ id6 id2(1:3)+ id8 id3- 16 | O path1 id1+ 17 | O path2 id2+ 18 | O path3 id3+ 19 | -------------------------------------------------------------------------------- /validateFiles/random2.gfa2.gz.52.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.gfa2.gz -o gfa 2 | embedded 3 | H VN:Z:1.2 4 | S id2 TCAAGG 5 | S id3 CTTGATT 6 | S id1 ACCTT 7 | S id4 CATGACTC 8 | S id7 TGAATGAAA 9 | L id1 + id2 - 3M 10 | L id2 - id1 + 3M 11 | J id3 + id4 - 5 12 | J id1 + id2 + 3 13 | J id2 + id3 - 2 14 | J id7 + id7 + 5 15 | P id12 id1+;id2(1:3)+;id3- * 16 | P path1 id1+ * 17 | P path2 id2+ * 18 | P path3 id3+ * 19 | -------------------------------------------------------------------------------- /validateFiles/random2.gfa2.gz.53.tst: 
-------------------------------------------------------------------------------- 1 | testFiles/random2.gfa2.gz -o fasta 2 | embedded 3 | >id12 4 | ACCTTNNNTCANNAATCAAG 5 | >path1 6 | ACCTT 7 | >path2 8 | TCAAGG 9 | >path3 10 | CTTGATT 11 | -------------------------------------------------------------------------------- /validateFiles/random2.noseq.gfa.114.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.noseq.gfa 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 2 5 | Total scaffold length: 44 6 | Average scaffold length: 22.00 7 | Scaffold N50: 26 8 | Scaffold auN: 22.73 9 | Scaffold L50: 1 10 | Largest scaffold: 26 11 | Smallest scaffold: 18 12 | # contigs: 6 13 | Total contig length: 36 14 | Average contig length: 6.00 15 | Contig N50: 6 16 | Contig auN: 6.11 17 | Contig L50: 3 18 | Largest contig: 7 19 | Smallest contig: 5 20 | # gaps in scaffolds: 2 21 | Total gap length in scaffolds: 8 22 | Average gap length in scaffolds: 4.00 23 | Gap N50 in scaffolds: 5 24 | Gap auN in scaffolds: 4.25 25 | Gap L50 in scaffolds: 1 26 | Largest gap in scaffolds: 5 27 | Smallest gap in scaffolds: 3 28 | Base composition (A:C:G:T): 0:0:0:0 29 | GC content %: nan 30 | # soft-masked bases: 36 31 | # segments: 3 32 | Total segment length: 0 33 | Average segment length: 0.00 34 | # gaps: 2 35 | # paths: 2 36 | # edges: 3 37 | Average degree: 1.00 38 | # connected components: 1 39 | Largest connected component length: 18 40 | # dead ends: 2 41 | # disconnected components: 0 42 | Total length disconnected components: 0 43 | # separated components: 1 44 | # bubbles: 0 45 | # circular segments: 0 46 | # circular paths: 0 47 | -------------------------------------------------------------------------------- /validateFiles/random2.noseq.gfa.48.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.noseq.gfa -o gfa2 2 | embedded 3 | H VN:Z:2.0 4 | S 11 5 * LN:i:5 
QL:Z:?@97? 5 | S 12 6 * LN:i:6 QL:Z:@6?84@ 6 | S 13 7 * LN:i:7 QL:Z:>=?@877 7 | E edge0 11 + 12 - 4M 8 | E edge1 12 - 13 + 5M 9 | E edge2 11 + 13 + 3M 10 | G gap0 11+ 13- 5 SC:i:1 11 | G gap1 13- 12+ 3 SC:i:1 12 | O 14 11+ gap0 13- gap1 12+ 13 | O 15 11+ edge0 12- edge1 13+ 14 | -------------------------------------------------------------------------------- /validateFiles/random2.noseq.gfa.49.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.noseq.gfa -o gfa 2 | embedded 3 | H VN:Z:1.2 4 | S 11 * LN:i:5 QL:Z:?@97? 5 | S 12 * LN:i:6 QL:Z:@6?84@ 6 | S 13 * LN:i:7 QL:Z:>=?@877 7 | L 11 + 12 - 4M 8 | L 12 - 13 + 5M 9 | L 11 + 13 + 3M 10 | J 11 + 13 - 5 SC:i:1 11 | J 13 - 12 + 3 SC:i:1 12 | P 14 11+;13-;12+ * 13 | P 15 11+,12-,13+ * 14 | -------------------------------------------------------------------------------- /validateFiles/random2.noseq.gfa.50.tst: -------------------------------------------------------------------------------- 1 | testFiles/random2.noseq.gfa -o fasta 2 | embedded 3 | >14 4 | Error: Fasta output not possible without segment sequence. Terminating. 
5 | -------------------------------------------------------------------------------- /validateFiles/random3.sorting.fasta.24.tst: -------------------------------------------------------------------------------- 1 | testFiles/random3.sorting.fasta 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 4 5 | Total scaffold length: 16 6 | Average scaffold length: 4.00 7 | Scaffold N50: 4 8 | Scaffold auN: 4.00 9 | Scaffold L50: 2 10 | Largest scaffold: 4 11 | Smallest scaffold: 4 12 | # contigs: 4 13 | Total contig length: 16 14 | Average contig length: 4.00 15 | Contig N50: 4 16 | Contig auN: 4.00 17 | Contig L50: 2 18 | Largest contig: 4 19 | Smallest contig: 4 20 | # gaps in scaffolds: 0 21 | Total gap length in scaffolds: 0 22 | Average gap length in scaffolds: 0.00 23 | Gap N50 in scaffolds: 0 24 | Gap auN in scaffolds: 0.00 25 | Gap L50 in scaffolds: 0 26 | Largest gap in scaffolds: 0 27 | Smallest gap in scaffolds: 0 28 | Base composition (A:C:G:T): 4:4:4:4 29 | GC content %: 50.00 30 | # soft-masked bases: 0 31 | # segments: 4 32 | Total segment length: 16 33 | Average segment length: 4.00 34 | # gaps: 0 35 | # paths: 4 36 | -------------------------------------------------------------------------------- /validateFiles/random3.sorting.fasta.25.tst: -------------------------------------------------------------------------------- 1 | testFiles/random3.sorting.fasta -s s 2 | embedded 3 | c 4 4 | d 4 5 | b 4 6 | a 4 7 | -------------------------------------------------------------------------------- /validateFiles/random3.sorting.fasta.26.tst: -------------------------------------------------------------------------------- 1 | testFiles/random3.sorting.fasta -s c 2 | embedded 3 | c.1 4 4 | d.1 4 5 | b.1 4 6 | a.1 4 7 | -------------------------------------------------------------------------------- /validateFiles/random3.sorting.fasta.27.tst: -------------------------------------------------------------------------------- 1 | testFiles/random3.sorting.fasta -s 
g 2 | embedded 3 | -------------------------------------------------------------------------------- /validateFiles/random3.sorting.fasta.28.tst: -------------------------------------------------------------------------------- 1 | testFiles/random3.sorting.fasta -b a 2 | embedded 3 | c 1 4 1 W c.1 1 4 + 4 | d 1 4 1 W d.1 1 4 + 5 | b 1 4 1 W b.1 1 4 + 6 | a 1 4 1 W a.1 1 4 + 7 | -------------------------------------------------------------------------------- /validateFiles/random3.sorting.fasta.29.tst: -------------------------------------------------------------------------------- 1 | testFiles/random3.sorting.fasta -b s 2 | embedded 3 | c 0 4 4 | d 0 4 5 | b 0 4 6 | a 0 4 7 | -------------------------------------------------------------------------------- /validateFiles/random3.sorting.fasta.30.tst: -------------------------------------------------------------------------------- 1 | testFiles/random3.sorting.fasta -b c 2 | embedded 3 | c 0 4 4 | d 0 4 5 | b 0 4 6 | a 0 4 7 | -------------------------------------------------------------------------------- /validateFiles/random3.sorting.fasta.31.tst: -------------------------------------------------------------------------------- 1 | testFiles/random3.sorting.fasta -b g 2 | embedded 3 | -------------------------------------------------------------------------------- /validateFiles/random3.sorting.fasta.32.tst: -------------------------------------------------------------------------------- 1 | testFiles/random3.sorting.fasta --homopolymer-compress 1 -ofa 2 | embedded 3 | >c 4 | ACGT 5 | >d 6 | CGTA 7 | >b 8 | GTAC 9 | >a 10 | TACG 11 | -------------------------------------------------------------------------------- /validateFiles/random4.fasta.115.tst: -------------------------------------------------------------------------------- 1 | testFiles/random4.fasta 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 50 6 | Average scaffold length: 10.00 7 | Scaffold N50: 15 8 | Scaffold 
auN: 12.00 9 | Scaffold L50: 2 10 | Largest scaffold: 15 11 | Smallest scaffold: 5 12 | # contigs: 8 13 | Total contig length: 40 14 | Average contig length: 5.00 15 | Contig N50: 12 16 | Contig auN: 8.50 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 6 21 | Total gap length in scaffolds: 10 22 | Average gap length in scaffolds: 1.67 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 2.20 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 3 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 9:14:6:11 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 6 35 | # paths: 5 36 | -------------------------------------------------------------------------------- /validateFiles/random4.fasta.15.tst: -------------------------------------------------------------------------------- 1 | testFiles/random4.fasta 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 5 5 | Total scaffold length: 50 6 | Average scaffold length: 10.00 7 | Scaffold N50: 15 8 | Scaffold auN: 12.00 9 | Scaffold L50: 2 10 | Largest scaffold: 15 11 | Smallest scaffold: 5 12 | # contigs: 8 13 | Total contig length: 40 14 | Average contig length: 5.00 15 | Contig N50: 12 16 | Contig auN: 8.50 17 | Contig L50: 2 18 | Largest contig: 12 19 | Smallest contig: 1 20 | # gaps in scaffolds: 6 21 | Total gap length in scaffolds: 10 22 | Average gap length in scaffolds: 1.67 23 | Gap N50 in scaffolds: 3 24 | Gap auN in scaffolds: 2.20 25 | Gap L50 in scaffolds: 2 26 | Largest gap in scaffolds: 3 27 | Smallest gap in scaffolds: 1 28 | Base composition (A:C:G:T): 9:14:6:11 29 | GC content %: 50.00 30 | # soft-masked bases: 5 31 | # segments: 8 32 | Total segment length: 40 33 | Average segment length: 5.00 34 | # gaps: 6 35 | # paths: 5 36 | -------------------------------------------------------------------------------- 
/validateFiles/random4.fasta.16.tst: -------------------------------------------------------------------------------- 1 | testFiles/random4.fasta -s s 2 | embedded 3 | Header1 5 4 | Header2 5 5 | Header3 10 6 | Header4 15 7 | Header5 15 8 | -------------------------------------------------------------------------------- /validateFiles/random4.fasta.17.tst: -------------------------------------------------------------------------------- 1 | testFiles/random4.fasta -s c 2 | embedded 3 | Header1.1 5 4 | Header2.1 3 5 | Header2.3 1 6 | Header3.1 3 7 | Header3.3 2 8 | Header3.5 2 9 | Header4.2 12 10 | Header5.1 12 11 | -------------------------------------------------------------------------------- /validateFiles/random4.fasta.18.tst: -------------------------------------------------------------------------------- 1 | testFiles/random4.fasta -s g 2 | embedded 3 | Header2.2 1 4 | Header3.2 1 5 | Header3.4 1 6 | Header3.6 1 7 | Header4.1 3 8 | Header5.2 3 9 | -------------------------------------------------------------------------------- /validateFiles/random4.fasta.19.tst: -------------------------------------------------------------------------------- 1 | testFiles/random4.fasta -b a 2 | embedded 3 | Header1 1 5 1 W Header1.1 1 5 + 4 | Header2 1 3 1 W Header2.1 1 3 + 5 | Header2 4 4 2 N 1 Header2.2 yes 6 | Header2 5 5 3 W Header2.3 1 1 + 7 | Header3 1 3 1 W Header3.1 1 3 + 8 | Header3 4 4 2 N 1 Header3.2 yes 9 | Header3 5 6 3 W Header3.3 1 2 + 10 | Header3 7 7 4 N 1 Header3.4 yes 11 | Header3 8 9 5 W Header3.5 1 2 + 12 | Header3 10 10 6 N 1 Header3.6 yes 13 | Header4 1 3 1 N 3 Header4.1 yes 14 | Header4 4 15 2 W Header4.2 1 12 + 15 | Header5 1 12 1 W Header5.1 1 12 + 16 | Header5 13 15 2 N 3 Header5.2 yes 17 | -------------------------------------------------------------------------------- /validateFiles/random4.fasta.20.tst: -------------------------------------------------------------------------------- 1 | testFiles/random4.fasta -b s 2 | embedded 3 | Header1 0 5 4 
| Header2 0 5 5 | Header3 0 10 6 | Header4 0 15 7 | Header5 0 15 8 | -------------------------------------------------------------------------------- /validateFiles/random4.fasta.21.tst: -------------------------------------------------------------------------------- 1 | testFiles/random4.fasta -b c 2 | embedded 3 | Header1 0 5 4 | Header2 0 3 5 | Header2 4 5 6 | Header3 0 3 7 | Header3 4 6 8 | Header3 7 9 9 | Header4 3 15 10 | Header5 0 12 11 | -------------------------------------------------------------------------------- /validateFiles/random4.fasta.22.tst: -------------------------------------------------------------------------------- 1 | testFiles/random4.fasta -b g 2 | embedded 3 | Header2 3 4 4 | Header3 3 4 5 | Header3 6 7 6 | Header3 9 10 7 | Header4 0 3 8 | Header5 12 15 9 | -------------------------------------------------------------------------------- /validateFiles/random4.fasta.23.tst: -------------------------------------------------------------------------------- 1 | testFiles/random4.fasta --homopolymer-compress 1 -ofa 2 | embedded 3 | >Header1 5bp sequence with no gaps and 2 lowercase bases 4 | CGacT 5 | >Header2 5bp sequence with internal 1bp non-canonical gap 6 | CGANT 7 | >Header3 10bp sequence with internal 4bp and 1bp terminal canonical gap 8 | TGANATNCTN 9 | >Header4 15bp sequence with start 3bp canonical gap and 3 lowercase bases 10 | NNNTCTcgCACtC 11 | >Header5 15bp sequence with terminal 3bp canonical gap 12 | ACTCGATCACGNNN 13 | -------------------------------------------------------------------------------- /validateFiles/random5.findovl.gfa.116.tst: -------------------------------------------------------------------------------- 1 | testFiles/random5.findovl.gfa --discover-terminal-overlaps 3 -ogfa 2 | embedded 3 | H VN:Z:1.2 4 | S 11 CCGTTCCATGAAGGCCAGAGTTACTTACCGGCCCTTTCCATGCGCGCGCCATAAA LN:i:55 5 | S 12 GATTTAAGAATATGTTAACGGAGGATTGCACGATCTTCTCTCCTCGTGAGAGAATTTATG LN:i:60 6 | S 13 
AAATCGCATAGCTATGTATTTTGCAGAGGTAGCGACATCTTGACGGGCACTTCACAGATAGTGGG LN:i:65 7 | L 11 + 12 - 6M 8 | L 11 + 13 + 3M 9 | L 12 - 13 + 5M 10 | J 11 + 13 - 5 SC:i:1 11 | J 13 - 12 + 3 SC:i:1 12 | P 14 11+;13-;12+ * 13 | P 15 11+12-13+ * 14 | -------------------------------------------------------------------------------- /validateFiles/random5.findovl.gfa.66.tst: -------------------------------------------------------------------------------- 1 | testFiles/random5.findovl.gfa -o gfa2 2 | embedded 3 | H VN:Z:2.0 4 | S 11 55 CCGTTCCATGAAGGCCAGAGTTACTTACCGGCCCTTTCCATGCGCGCGCCATAAA LN:i:55 5 | S 12 60 GATTTAAGAATATGTTAACGGAGGATTGCACGATCTTCTCTCCTCGTGAGAGAATTTATG LN:i:60 6 | S 13 65 AAATCGCATAGCTATGTATTTTGCAGAGGTAGCGACATCTTGACGGGCACTTCACAGATAGTGGG LN:i:65 7 | G gap0 11+ 13- 5 SC:i:1 8 | G gap1 13- 12+ 3 SC:i:1 9 | O 14 11+ gap0 13- gap1 12+ 10 | O 15 11+ 12- 13+ 11 | -------------------------------------------------------------------------------- /validateFiles/random5.findovl.gfa.67.tst: -------------------------------------------------------------------------------- 1 | testFiles/random5.findovl.gfa -o gfa 2 | embedded 3 | H VN:Z:1.2 4 | S 11 CCGTTCCATGAAGGCCAGAGTTACTTACCGGCCCTTTCCATGCGCGCGCCATAAA LN:i:55 5 | S 12 GATTTAAGAATATGTTAACGGAGGATTGCACGATCTTCTCTCCTCGTGAGAGAATTTATG LN:i:60 6 | S 13 AAATCGCATAGCTATGTATTTTGCAGAGGTAGCGACATCTTGACGGGCACTTCACAGATAGTGGG LN:i:65 7 | J 11 + 13 - 5 SC:i:1 8 | J 13 - 12 + 3 SC:i:1 9 | P 14 11+;13-;12+ * 10 | P 15 11+12-13+ * 11 | -------------------------------------------------------------------------------- /validateFiles/random5.findovl.gfa.68.tst: -------------------------------------------------------------------------------- 1 | testFiles/random5.findovl.gfa -o fasta 2 | embedded 3 | >14 4 | CCGTTCCATGAAGGCCAGAGTTACTTACCGGCCCTTTCCATGCGCGCGCCATAAANNNNNCCCACTATCTGTGAAGTGCCCGTCAAGATGTCGCTACCTCTGCAAAATACATAGCTATGCGATTTNNNGATTTAAGAATATGTTAACGGAGGATTGCACGATCTTCTCTCCTCGTGAGAGAATTTATG 5 | >15 6 | 
CCGTTCCATGAAGGCCAGAGTTACTTACCGGCCCTTTCCATGCGCGCGCCATAAACATAAATTCTCTCACGAGGAGAGAAGATCGTGCAATCCTCCGTTAACATATTCTTAAATCAAATCGCATAGCTATGTATTTTGCAGAGGTAGCGACATCTTGACGGGCACTTCACAGATAGTGGG 7 | -------------------------------------------------------------------------------- /validateFiles/random6.circular.gfa.0.tst: -------------------------------------------------------------------------------- 1 | testFiles/random6.circular.gfa -o gfa2 2 | embedded 3 | H VN:Z:2.0 4 | S edge_1 0 * dp:i:32 5 | S edge_2 0 * dp:i:21 6 | S edge_3 0 * dp:i:0 7 | S edge_4 0 * dp:i:0 8 | S edge_5 0 * dp:i:2 9 | S edge_6 0 * dp:i:0 10 | S edge_7 0 * dp:i:6 11 | S edge_8 0 * dp:i:20 12 | S edge_9 0 * dp:i:0 13 | E edge0 edge_1 + edge_1 + 0M RC:i:0 14 | E edge1 edge_1 - edge_1 - 0M RC:i:18 15 | E edge2 edge_2 + edge_2 + 0M RC:i:0 16 | E edge3 edge_2 - edge_2 - 0M RC:i:0 17 | E edge4 edge_7 + edge_7 + 0M RC:i:32 18 | E edge5 edge_7 - edge_7 - 0M RC:i:40 19 | O contig_1 edge_1+ 20 | O contig_2 edge_2+ 21 | O contig_3 edge_3+ 22 | O contig_4 edge_4+ 23 | O contig_5 edge_5+ 24 | O contig_6 edge_6+ 25 | O contig_7 edge_7+ 26 | O contig_8 edge_8+ 27 | O contig_9 edge_9+ 28 | -------------------------------------------------------------------------------- /validateFiles/random6.circular.gfa.1.tst: -------------------------------------------------------------------------------- 1 | testFiles/random6.circular.gfa -o gfa 2 | embedded 3 | H VN:Z:1.2 4 | S edge_1 * dp:i:32 5 | S edge_2 * dp:i:21 6 | S edge_3 * dp:i:0 7 | S edge_4 * dp:i:0 8 | S edge_5 * dp:i:2 9 | S edge_6 * dp:i:0 10 | S edge_7 * dp:i:6 11 | S edge_8 * dp:i:20 12 | S edge_9 * dp:i:0 13 | L edge_1 + edge_1 + 0M RC:i:0 14 | L edge_1 - edge_1 - 0M RC:i:18 15 | L edge_2 + edge_2 + 0M RC:i:0 16 | L edge_2 - edge_2 - 0M RC:i:0 17 | L edge_7 + edge_7 + 0M RC:i:32 18 | L edge_7 - edge_7 - 0M RC:i:40 19 | P contig_1 edge_1+ * 20 | P contig_2 edge_2+ * 21 | P contig_3 edge_3+ * 22 | P contig_4 edge_4+ * 23 | P contig_5 edge_5+ * 24 | P contig_6 edge_6+ 
* 25 | P contig_7 edge_7+ * 26 | P contig_8 edge_8+ * 27 | P contig_9 edge_9+ * 28 | -------------------------------------------------------------------------------- /validateFiles/random6.circular.gfa.117.tst: -------------------------------------------------------------------------------- 1 | testFiles/random6.circular.gfa 2 | embedded 3 | +++Assembly summary+++: 4 | # scaffolds: 9 5 | Total scaffold length: 0 6 | Average scaffold length: 0.00 7 | Scaffold N50: 0 8 | Scaffold auN: nan 9 | Scaffold L50: 1 10 | Largest scaffold: 0 11 | Smallest scaffold: 0 12 | # contigs: 9 13 | Total contig length: 0 14 | Average contig length: 0.00 15 | Contig N50: 0 16 | Contig auN: nan 17 | Contig L50: 1 18 | Largest contig: 0 19 | Smallest contig: 0 20 | # gaps in scaffolds: 0 21 | Total gap length in scaffolds: 0 22 | Average gap length in scaffolds: 0.00 23 | Gap N50 in scaffolds: 0 24 | Gap auN in scaffolds: 0.00 25 | Gap L50 in scaffolds: 0 26 | Largest gap in scaffolds: 0 27 | Smallest gap in scaffolds: 0 28 | Base composition (A:C:G:T): 0:0:0:0 29 | GC content %: nan 30 | # soft-masked bases: 0 31 | # segments: 9 32 | Total segment length: 0 33 | Average segment length: 0.00 34 | # gaps: 0 35 | # paths: 9 36 | # edges: 6 37 | Average degree: 0.67 38 | # connected components: 3 39 | Largest connected component length: 0 40 | # dead ends: 12 41 | # disconnected components: 6 42 | Total length disconnected components: 0 43 | # separated components: 9 44 | # bubbles: 0 45 | # circular segments: 3 46 | # circular paths: 3 47 | -------------------------------------------------------------------------------- /validateFiles/random6.circular.gfa.2.tst: -------------------------------------------------------------------------------- 1 | testFiles/random6.circular.gfa -o fasta 2 | embedded 3 | >contig_1 4 | Error: Fasta output not possible without segment sequence. Terminating. 5 | --------------------------------------------------------------------------------