├── .github
└── workflows
│ ├── README.md
│ ├── create_release.yml
│ ├── val.bat
│ ├── val.sh
│ └── validate.yml
├── .gitignore
├── .gitmodules
├── LICENSE
├── Makefile
├── README.md
├── images
└── graph.png
├── include
├── gfastats-global.h
├── input.h
├── main.h
└── validate.h
├── instructions
└── README.md
├── scaffolding
└── README.md
├── scripts
├── gfastats_stats.sh
├── plot_runtime.R
└── submit_gfastats_stats.sh
├── src
├── generate-random-fasta.cpp
├── generate-tests.cpp
├── input.cpp
├── main.cpp
└── validate.cpp
├── testFiles
├── random1.agp
├── random1.comment.sak
├── random1.fasta
├── random1.fasta.1.bed
├── random1.fasta.2.bed
├── random1.fasta.3.bed
├── random1.fasta.4.bed
├── random1.fasta.5.bed
├── random1.fasta.gz
├── random1.fastq
├── random1.fastq.gz
├── random1.gfa2
├── random1.gfa2.instructions.sak
├── random1.hc.sak
├── random1.hdc.sak
├── random1.instructions.sak
├── random1.mask.sak
├── random1.rename.sak
├── random1.rvcp.sak
├── random2.gfa
├── random2.gfa.gz
├── random2.gfa2
├── random2.gfa2.agp
├── random2.gfa2.gz
├── random2.noseq.gfa
├── random3.sorting.fasta
├── random4.fasta
├── random5.findovl.gfa
└── random6.circular.gfa
└── validateFiles
├── README.md
├── random1.fasta.10.tst
├── random1.fasta.11.tst
├── random1.fasta.3.tst
├── random1.fasta.4.tst
├── random1.fasta.5.tst
├── random1.fasta.6.tst
├── random1.fasta.7.tst
├── random1.fasta.78.tst
├── random1.fasta.79.tst
├── random1.fasta.8.tst
├── random1.fasta.80.tst
├── random1.fasta.81.tst
├── random1.fasta.82.tst
├── random1.fasta.83.tst
├── random1.fasta.84.tst
├── random1.fasta.85.tst
├── random1.fasta.86.tst
├── random1.fasta.87.tst
├── random1.fasta.88.tst
├── random1.fasta.89.tst
├── random1.fasta.9.tst
├── random1.fasta.90.tst
├── random1.fasta.91.tst
├── random1.fasta.92.tst
├── random1.fasta.93.tst
├── random1.fasta.94.tst
├── random1.fasta.95.tst
├── random1.fasta.96.tst
├── random1.fasta.gz.100.tst
├── random1.fasta.gz.69.tst
├── random1.fasta.gz.70.tst
├── random1.fasta.gz.71.tst
├── random1.fasta.gz.72.tst
├── random1.fasta.gz.73.tst
├── random1.fasta.gz.74.tst
├── random1.fasta.gz.75.tst
├── random1.fasta.gz.76.tst
├── random1.fasta.gz.77.tst
├── random1.fasta.gz.97.tst
├── random1.fasta.gz.98.tst
├── random1.fasta.gz.99.tst
├── random1.fastq.101.tst
├── random1.fastq.102.tst
├── random1.fastq.103.tst
├── random1.fastq.104.tst
├── random1.fastq.54.tst
├── random1.fastq.55.tst
├── random1.fastq.56.tst
├── random1.fastq.57.tst
├── random1.fastq.58.tst
├── random1.fastq.59.tst
├── random1.fastq.60.tst
├── random1.fastq.61.tst
├── random1.fastq.62.tst
├── random1.fastq.gz.105.tst
├── random1.fastq.gz.106.tst
├── random1.fastq.gz.107.tst
├── random1.fastq.gz.108.tst
├── random1.fastq.gz.33.tst
├── random1.fastq.gz.34.tst
├── random1.fastq.gz.35.tst
├── random1.fastq.gz.36.tst
├── random1.fastq.gz.37.tst
├── random1.fastq.gz.38.tst
├── random1.fastq.gz.39.tst
├── random1.fastq.gz.40.tst
├── random1.fastq.gz.41.tst
├── random1.gfa2.109.tst
├── random1.gfa2.110.tst
├── random1.gfa2.111.tst
├── random1.gfa2.112.tst
├── random1.gfa2.113.tst
├── random1.gfa2.12.tst
├── random1.gfa2.13.tst
├── random1.gfa2.14.tst
├── random2.gfa.42.tst
├── random2.gfa.43.tst
├── random2.gfa.44.tst
├── random2.gfa.gz.45.tst
├── random2.gfa.gz.46.tst
├── random2.gfa.gz.47.tst
├── random2.gfa2.63.tst
├── random2.gfa2.64.tst
├── random2.gfa2.65.tst
├── random2.gfa2.gz.51.tst
├── random2.gfa2.gz.52.tst
├── random2.gfa2.gz.53.tst
├── random2.noseq.gfa.114.tst
├── random2.noseq.gfa.48.tst
├── random2.noseq.gfa.49.tst
├── random2.noseq.gfa.50.tst
├── random3.sorting.fasta.24.tst
├── random3.sorting.fasta.25.tst
├── random3.sorting.fasta.26.tst
├── random3.sorting.fasta.27.tst
├── random3.sorting.fasta.28.tst
├── random3.sorting.fasta.29.tst
├── random3.sorting.fasta.30.tst
├── random3.sorting.fasta.31.tst
├── random3.sorting.fasta.32.tst
├── random4.fasta.115.tst
├── random4.fasta.15.tst
├── random4.fasta.16.tst
├── random4.fasta.17.tst
├── random4.fasta.18.tst
├── random4.fasta.19.tst
├── random4.fasta.20.tst
├── random4.fasta.21.tst
├── random4.fasta.22.tst
├── random4.fasta.23.tst
├── random5.findovl.gfa.116.tst
├── random5.findovl.gfa.66.tst
├── random5.findovl.gfa.67.tst
├── random5.findovl.gfa.68.tst
├── random6.circular.gfa.0.tst
├── random6.circular.gfa.1.tst
├── random6.circular.gfa.117.tst
└── random6.circular.gfa.2.tst
/.github/workflows/README.md:
--------------------------------------------------------------------------------
1 | validate is automatically run on pushes to any branch, or pull requests to main
2 |
3 | To automatically create a new release and upload macOS, Ubuntu, and Windows builds, run:
4 | `git tag v*`
5 | `git push origin v*`
6 | where * is the version number.
7 |
8 | Example:
9 | `git tag v1.2.1`
10 | `git push origin v1.2.1`
11 |
12 |
--------------------------------------------------------------------------------
/.github/workflows/create_release.yml:
--------------------------------------------------------------------------------
1 | name: Create Release
2 |
3 | on:
4 | push:
5 | tags:
6 | - 'v*' # Trigger on version tags
7 |
8 | jobs:
9 | create_release:
10 | name: Create Release
11 | permissions: write-all
12 | runs-on: ubuntu-latest
13 | steps:
14 | - name: Checkout code
15 | uses: actions/checkout@v2
16 | with:
17 | submodules: recursive
18 | - name: Create Release
19 | id: create_release
20 | uses: actions/create-release@v1
21 | env:
22 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
23 | with:
24 | tag_name: ${{ github.ref }}
25 | release_name: gfastats ${{ github.ref }}
26 | body: |
27 | Changes in this Release
28 | draft: false
29 | prerelease: false
30 | outputs:
31 | upload_url: ${{ steps.create_release.outputs.upload_url }}
32 |
33 | add_resources:
34 | needs: create_release
35 | name: Add Resources
36 | strategy:
37 | matrix:
38 | OS: [macos-13, ubuntu-latest, windows-latest] # every OS listed here must have a matching `include` entry below, otherwise OS_NAME is unset for that job
39 | include:
40 | - OS: macos-13
41 | OS_NAME: macOS
42 | - OS: ubuntu-latest
43 | OS_NAME: linux
44 | - OS: windows-latest
45 | OS_NAME: win
46 | runs-on: ${{ matrix.OS }}
47 | steps:
48 | - name: Checkout code
49 | uses: actions/checkout@v2
50 | with:
51 | submodules: recursive
52 | - name: Build
53 | run: make -j
54 |
55 | - name: Make binary executable (Linux & macOS)
56 | if: matrix.OS_NAME != 'win'
57 | run: chmod +x build/bin/gfastats
58 |
59 | - name: Zip (Windows)
60 | if: matrix.OS_NAME == 'win'
61 | uses: papeloto/action-zip@v1
62 | with:
63 | files: build/bin/gfastats.exe
64 | dest: result.zip
65 |
66 | - name: Tar (Linux & macOS)
67 | if: matrix.OS_NAME != 'win'
68 | run: tar -czvf result.tar.gz -C build/bin gfastats
69 |
70 | - name: Add binaries & Upload tarball (Linux & macOS)
71 | if: matrix.OS_NAME != 'win'
72 | uses: actions/upload-release-asset@v1
73 | env:
74 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
75 | with:
76 | upload_url: ${{ needs.create_release.outputs.upload_url }}
77 | asset_path: result.tar.gz
78 | asset_name: gfastats.${{ github.ref_name }}-${{matrix.OS_NAME}}.tar.gz
79 | asset_content_type: application/tar
80 |
81 | - name: Add binaries & Upload tarball (Windows)
82 | if: matrix.OS_NAME == 'win'
83 | uses: actions/upload-release-asset@v1
84 | env:
85 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
86 | with:
87 | upload_url: ${{ needs.create_release.outputs.upload_url }}
88 | asset_path: result.zip
89 | asset_name: gfastats.${{ github.ref_name }}-${{matrix.OS_NAME}}.zip
90 | asset_content_type: application/zip
91 |
92 | add_submodules:
93 | needs: create_release
94 | name: Source with submodules
95 | runs-on: ubuntu-latest
96 | steps:
97 | - name: Checkout code
98 | uses: actions/checkout@v2
99 | with:
100 | submodules: true
101 | - name: Zip
102 | uses: papeloto/action-zip@v1
103 | with:
104 | files: .
105 | dest: result.zip
106 | - name: Add files
107 | uses: actions/upload-release-asset@v1
108 | env:
109 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
110 | with:
111 | upload_url: ${{ needs.create_release.outputs.upload_url }}
112 | asset_path: result.zip
113 | asset_name: gfastats.${{ github.ref_name }}-with_submodules.zip
114 | asset_content_type: application/zip
115 |
--------------------------------------------------------------------------------
/.github/workflows/val.bat:
--------------------------------------------------------------------------------
1 | "build/bin/gfastats-validate.exe" validateFiles
2 |
--------------------------------------------------------------------------------
/.github/workflows/val.sh:
--------------------------------------------------------------------------------
1 | build/bin/gfastats-validate validateFiles
2 |
--------------------------------------------------------------------------------
/.github/workflows/validate.yml:
--------------------------------------------------------------------------------
1 | name: Validate
2 |
3 | on:
4 | push:
5 | pull_request:
6 | branches: [ main ]
7 |
8 | jobs:
9 | validate:
10 | name: Validate
11 | strategy:
12 | matrix:
13 | include:
14 | - os: macos-latest
15 | command: ".github/workflows/val.sh"
16 | - os: ubuntu-latest
17 | command: ".github/workflows/val.sh"
18 | chmod: true
19 | - os: windows-latest
20 | command: ".github/workflows/val.bat"
21 | fail-fast: false
22 | runs-on: ${{ matrix.os }}
23 | steps:
24 | - name: Checkout code
25 | uses: actions/checkout@v3
26 | with:
27 | submodules: true
28 | - name: Build
29 | run: make all -j
30 | - name: Validate
31 | run: |
32 | chmod +x .github/workflows/val.sh
33 | ${{ matrix.command }}
34 |
35 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.gitignore.io/api/swift,xcode
2 |
3 | ### Swift ###
4 | # Xcode
5 | #
6 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore
7 |
8 | ## Build generated
9 | build/
10 | DerivedData/
11 |
12 | ## Various settings
13 | *.pbxuser
14 | !default.pbxuser
15 | *.mode1v3
16 | !default.mode1v3
17 | *.mode2v3
18 | !default.mode2v3
19 | *.perspectivev3
20 | !default.perspectivev3
21 | xcuserdata/
22 |
23 | ## Other
24 | *.moved-aside
25 | *.xccheckout
26 | *.xcscmblueprint
27 |
28 | ## Obj-C/Swift specific
29 | *.hmap
30 | *.ipa
31 | *.dSYM.zip
32 | *.dSYM
33 |
34 | ## Playgrounds
35 | timeline.xctimeline
36 | playground.xcworkspace
37 |
38 | # Swift Package Manager
39 | #
40 | # Add this line if you want to avoid checking in source code from Swift Package Manager dependencies.
41 | # Packages/
42 | # Package.pins
43 | .build/
44 |
45 | # CocoaPods - Refactored to standalone file
46 |
47 | # Carthage - Refactored to standalone file
48 |
49 | # fastlane
50 | #
51 | # It is recommended to not store the screenshots in the git repo. Instead, use fastlane to re-generate the
52 | # screenshots whenever they are needed.
53 | # For more information about the recommended setup visit:
54 | # https://docs.fastlane.tools/best-practices/source-control/#source-control
55 |
56 | fastlane/report.xml
57 | fastlane/Preview.html
58 | fastlane/screenshots
59 | fastlane/test_output
60 |
61 | ### Xcode ###
62 | # Xcode
63 | #
64 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore
65 |
66 | ## Build generated
67 |
68 | ## Various settings
69 |
70 | ## Other
71 |
72 | ### Xcode Patch ###
73 | *.xcodeproj
74 | *.xcodeproj/*
75 | !*.xcodeproj/project.pbxproj
76 | *.xcodeproj/xcshareddata/
77 | !*.xcodeproj/xcuserdata/
78 | !*.xcworkspace/contents.xcworkspacedata
79 | /*.gcno
80 |
81 | .DS_Store
82 | *.pbxproj
83 | *.xcworkspacedata
84 | *.plist
85 |
86 |
87 | # End of https://www.gitignore.io/api/swift,xcode,vscode
88 |
89 | tmp.txt
90 | err.txt
91 | out
92 |
93 | *.o
94 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "gfalibs"]
2 | path = gfalibs
3 | url = https://github.com/vgl-hub/gfalibs.git
4 | ignore = untracked
5 | ignore = dirty
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Giulio Formenti
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | CXX ?= g++
2 | INCLUDE_DIR ?= -I./include -Igfalibs/include
3 | WARNINGS = -Wall -Wextra
4 | # Compiler flags; $(CFLAGS) is appended so extra options can be passed in from the command line or environment.
5 | CXXFLAGS = -g -std=gnu++14 -O3 $(INCLUDE_DIR) $(WARNINGS) $(CFLAGS)
6 | # Names of the produced binaries and the directories used by the build.
7 | TARGET = gfastats
8 | TEST_TARGET = validate
9 | GENERATE_TARGET = generate-tests
10 | RANDOM_FASTA_TARGET = generate-random-fasta
11 | BUILD = build/bin
12 | SOURCE = src
13 | INCLUDE = include
14 | BINDIR := $(BUILD)/.o
15 | # Libraries and linker flags used when linking the binaries.
16 | LIBS = -lz
17 | LDFLAGS= -pthread
18 |
19 | #gfalibs
20 | GFALIBS_DIR := $(CURDIR)/gfalibs
21 | # Object names (one per $(SOURCE)/<name>.cpp) and their locations under $(BINDIR).
22 | OBJS := main input
23 | BINS := $(addprefix $(BINDIR)/, $(OBJS))
24 | # Default target: links everything found in $(BINDIR) with the gfalibs objects into $(BUILD)/$(TARGET).
25 | head: $(BINS) gfalibs | $(BUILD)
26 | $(CXX) $(CXXFLAGS) $(LDFLAGS) -o $(BUILD)/$(TARGET) $(wildcard $(BINDIR)/*) $(GFALIBS_DIR)/*.o $(LIBS)
27 | # NOTE(review): CCFLAGS is not referenced by any other line of this Makefile, and `debug` is extended again near the end of the file (adding -O0).
28 | debug: CXXFLAGS += -DDEBUG
29 | debug: CCFLAGS += -DDEBUG
30 | debug: head
31 | # Builds the main binary plus the three helper tools below.
32 | all: head validate regenerate random_fasta
33 | # Lets `make main` / `make input` build the corresponding object under $(BINDIR).
34 | $(OBJS): %: $(BINDIR)/%
35 | @
36 | $(BINDIR)%: $(SOURCE)/%.cpp $(INCLUDE)/%.h | $(BINDIR)
37 | $(CXX) $(CXXFLAGS) $(LDFLAGS) -c $(SOURCE)/$(notdir $@).cpp -o $@
38 | # gfalibs is built by its own Makefile with the same CXXFLAGS; it is phony so the sub-make always runs.
39 | .PHONY: gfalibs
40 | gfalibs:
41 | $(MAKE) -j -C $(GFALIBS_DIR) CXXFLAGS="$(CXXFLAGS)"
42 | # Helper tools: each is compiled from a single source file into $(BUILD)/$(TARGET)-<tool>.
43 | validate: | $(BUILD)
44 | $(CXX) $(CXXFLAGS) -o $(BUILD)/$(TARGET)-$(TEST_TARGET) $(SOURCE)/$(TEST_TARGET).cpp $(LIBS)
45 |
46 | regenerate: | $(BUILD)
47 | $(CXX) $(CXXFLAGS) -o $(BUILD)/$(TARGET)-$(GENERATE_TARGET) $(SOURCE)/$(GENERATE_TARGET).cpp $(LIBS)
48 |
49 | random_fasta: | $(BUILD)
50 | $(CXX) $(CXXFLAGS) -o $(BUILD)/$(TARGET)-$(RANDOM_FASTA_TARGET) $(SOURCE)/$(RANDOM_FASTA_TARGET).cpp $(LIBS)
51 | # Output directories are order-only prerequisites of the rules above; the leading '-' ignores mkdir errors.
52 | $(BUILD):
53 | -mkdir -p $@
54 |
55 | $(BINDIR):
56 | -mkdir -p $@
57 | # Second declaration of `debug` (see the earlier one): the flags accumulate, and -O0 comes after the -O3 set in CXXFLAGS.
58 | debug: CXXFLAGS += -DDEBUG -O0
59 | debug: head
60 | # Removes the build tree and cleans the gfalibs submodule.
61 | clean:
62 | $(RM) -r build
63 | $(MAKE) -C $(GFALIBS_DIR) clean
64 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # gfastats
2 |
3 | The swiss army knife for genome assembly.
4 |
5 | **gfastats** is a single fast and exhaustive tool for **summary statistics** and simultaneous \*fa\* (fasta, fastq, gfa [.gz]) genome assembly file **manipulation**. **gfastats** also allows seamless fasta<>fastq<>gfa[.gz] conversion. It has been tested in genomes even >100Gbp.
6 |
7 | Typical fast\* metrics include:
8 |
9 | - scaffold, contig and gap size
10 | - number of scaffolds, contigs and gaps
11 | - total length of scaffolds, contigs and gaps
12 | - scaffold, contig, gap N50 and statistics (full N\*/NG\* statistics with the `--nstar-report` flag)
13 | - area under the curve (AuN/AuNG) values for scaffolds, contigs and gaps
14 | - average scaffold, contig, gap size
15 | - largest scaffold, contig and gap
16 | - base composition and GC content
17 | - soft-masked base counts (lower case bases)
18 |
19 | Typical gfa metrics include (see also note below when evaluating gfa):
20 |
21 | - Number of nodes and edges
22 | - Average degree
23 | - Number of connected components, and length of the largest connected component
24 | - Number of dead ends
25 | - Number of disconnected components, and their total length
26 | - Number of bubbles
27 |
28 | Metrics for each scaffold/contig can be generated with the `--seq-report` flag.
29 |
30 | `Bed` coordinates and sizes of scaffolds, contigs and gaps can be outputted with the options `--out-coord` and `--out-size`. By default, `--out-coord` produces a full representation of the assembly in `agp` format.
31 |
32 | Additionally, input can be filtered using scaffold lists or `bed` coordinate files with the options `--include-bed` and `--exclude-bed`.
33 |
34 | Importantly, the filtered input can be outputted in any \*fa\* (fasta, fastq, gfa [.gz]) format.
35 |
36 | ## Installation
37 |
38 | Either download one of the releases or `git clone https://github.com/vgl-hub/gfastats.git --recursive` and `make -j` in `gfastats` folder.
39 |
40 | ## Usage
41 |
42 | `gfastats input.[fasta|fastq|gfa][.gz] [expected genome size] [header[:start-end]]`
43 |
44 | To check out all options and flags use `gfastats -h`.
45 |
46 | **Note**: if you have a GFA without paths defined (e.g. as output from hifiasm) you will need to add the `--discover-paths` options in order to generate statistics for contigs and scaffolds. This is an attempt to clearly distinguish contigs from segments.
47 |
48 | You can test some typical usage with the files in the `testFiles` folder, e.g.:
49 |
50 | ```
51 | gfastats testFiles/random1.fasta -o gfa // converts fasta to gfa
52 | gfastats testFiles/random2.gfa -o fa // converts gfa to fasta
53 | ```
54 |
55 | ## Assembly manipulation
56 |
57 | **gfastats** allows extensive assembly manipulation at the sequence level. Manipulation is achieved using a set of _instructions_ provided as an ordered list in a file to the option `-k` / `--swiss-army-knife`:
58 |
59 | ```
60 | gfastats testFiles/random1.fasta -k testFiles/random1.instructions.sak -o gfa // reads fasta applies a set of instructions and outputs gfa
61 | ```
62 |
63 | The _instructions_ are sequentially processed to generate the final output. Examples of _instructions_ are:
64 |
65 | ```
66 | JOIN contig1+ contig2+ 50 [gap1] [scaffold1] [this is a new scaffold] // introduces a new gap of 50 bp between contig1 and contig2 with optional id gap1, effectively joining the two sequences into a new sequence named scaffold1 with an optional comment
67 | SPLIT contig1+ contig2+ // splits the scaffold containing contig1 and contig2, effectively removing the existing gap between them
68 | ```
69 |
70 | The _instructions_ directly provide the list of edits that were introduced. The _instructions_ could be from an automated tool or from manual annotation.
71 |
72 | A prime example of manipulations using input from an automated tool is overlaying AGP coordinates on top of the graph to generate new scaffolds, which can be achieved with:
73 | ```
74 | gfastats input.fasta|input.gfa -a input.agp -o output.fasta|output.gfa
75 | ```
76 |
77 | See the instruction wiki for a full list of _instructions_.
78 |
79 | ## Description
80 |
81 | Please refer to the **gfastats** paper for a complete description. Briefly, **gfastats** reads and stores any fasta<>fastq<>gfa[.gz] in gfa format. **gfastats** then builds a bidirected graph representation of the assembly using adjacency lists, where each node is a segment, and each edge is a gap (see figure below). The original sequence can be directly manipulated from the graph. Finally, walking the graph makes it possible to generate different kinds of outputs, including manipulated assemblies and feature coordinates.
82 |
83 |
84 | ![Bidirected graph representation of an assembly in gfastats](images/graph.png)
85 |
86 |
87 | ## How to cite
88 |
89 | If you use **gfastats** in your work, please cite:
90 |
91 | Gfastats: conversion, evaluation and manipulation of genome sequences using assembly graphs
92 |
93 | Giulio Formenti, Linelle Abueg, Angelo Brajuka, Nadolina Brajuka, Cristo Gallardo, Alice Giani, Olivier Fedrigo, Erich D. Jarvis
94 |
95 | doi: https://doi.org/10.1093/bioinformatics/btac460
96 |
--------------------------------------------------------------------------------
/images/graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vgl-hub/gfastats/cedd755227fe0d7b6daf3fee932e325af54f3b21/images/graph.png
--------------------------------------------------------------------------------
/include/gfastats-global.h:
--------------------------------------------------------------------------------
1 | #ifndef GLOBAL_H
2 | #define GLOBAL_H
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 |
10 | #include "log.h"
11 | #include "threadpool.h"
12 |
13 | extern UserInput userInput;
14 |
15 | #endif /* GLOBAL_H */
16 |
--------------------------------------------------------------------------------
/include/input.h:
--------------------------------------------------------------------------------
1 | #ifndef INPUT_H
2 | #define INPUT_H
3 |
4 | struct UserInputGfastats : UserInput {
5 | // gfastats-specific options added on top of the UserInput base (declared outside this header). Every member below defaults to 0.
6 | std::vector outFiles; // output files
7 | int segmentReport_flag = 0; // NOTE(review): the *_flag members look like on/off switches set by command-line parsing - confirm in main.cpp
8 | int pathReport_flag = 0;
9 | int outSequence_flag = 0;
10 | int nstarReport_flag = 0;
11 | int outSize_flag = 0;
12 | int outCoord_flag = 0;
13 | int outFile_flag = 0;
14 | int outBubbles_flag = 0;
15 | int cmd_flag = 0;
16 | int rmGaps_flag = 0;
17 | int extractContigs_flag = 0;
18 | int terminalOvlLen = 0; // not a flag: presumably a length (terminal overlap) - TODO confirm meaning and units
19 |
20 | };
21 |
22 | class Input {
23 | // Holds a copy of the user options plus scratch state used while reading; load() and read() are defined outside this header.
24 | UserInputGfastats userInput;
25 | // stream read variable definition
26 | std::string firstLine;
27 | unsigned int seqPos = 0; // to keep track of the original sequence order
28 | // scratch strings; presumably reused while parsing records - confirm in input.cpp
29 | std::string newLine, seqHeader, seqComment, line, bedHeader;
30 | // shared handle to the input stream (pointee type not visible in this listing)
31 | std::shared_ptr stream;
32 | // list of instructions (element type not visible in this listing); presumably the swiss-army-knife edits - TODO confirm
33 | std::vector instructions;
34 |
35 | public:
36 | // takes the options by value; presumably stored in the userInput member - confirm in input.cpp
37 | void load(UserInputGfastats userInput);
38 | // takes the sequence container by non-const reference; presumably filled from the input files - confirm in input.cpp
39 | void read(InSequences& inSequence);
40 |
41 | };
42 |
43 | #endif /* INPUT_H */
44 |
--------------------------------------------------------------------------------
/include/main.h:
--------------------------------------------------------------------------------
1 | #ifndef MAIN_H
2 | #define MAIN_H
3 |
4 | #include
5 | #include
6 |
7 | #include
8 | #include
9 | #include
10 |
11 | #include
12 | #include
13 |
14 | #include //required for zstream
15 | #include
16 | #include
17 | #include
18 | #include //required for zstream
19 | #include //required for zstream
20 | #include // for graph manipulation
21 | #include // toupper()
22 | #include
23 |
24 | #include
25 | #include
26 |
27 | #include
28 | #include
29 | #include
30 |
31 | #include "log.h"
32 |
33 | #include "uid-generator.h"
34 |
35 | #include "bed.h"
36 |
37 | #include "global.h" // global variables
38 | #include "struct.h"
39 | #include "functions.h" // global functions
40 |
41 | #include "threadpool.h"
42 |
43 | #include
44 |
45 | #include "zlib.h"
46 | #include
47 | #include
48 | #include
49 |
50 | #include "gfa-lines.h"
51 |
52 | #include "gfa.h" // gfa classes
53 | #include "sak.h" // swiss army knife classes
54 |
55 | #include "stream-obj.h"
56 |
57 | #include "output.h" // output classes
58 | #include "input.h"
59 |
60 | #endif /* MAIN_H */
61 |
--------------------------------------------------------------------------------
/include/validate.h:
--------------------------------------------------------------------------------
1 | #ifndef GFASTATS_VALIDATE_H
2 | #define GFASTATS_VALIDATE_H
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 |
/**
 * Build the path of the gfastats executable located next to the running binary.
 *
 * @param argv0 program path as invoked (argv[0]); '/' and '\\' are both accepted
 *              as directory separators.
 * @return the directory part of argv0, with every backslash replaced by '/',
 *         followed by "gfastats" ("gfastats.exe" on Windows). When argv0 has no
 *         directory part only the executable name is returned.
 *
 * Declared inline because it is defined in a header: without it, including this
 * header from more than one translation unit produces duplicate definitions.
 */
inline std::string getExePath(const std::string &argv0) {
    // Keep everything up to and including the last separator (nothing if there is none).
    const std::string::size_type lastSep = argv0.find_last_of("/\\");
    std::string exePath = (lastSep == std::string::npos) ? std::string() : argv0.substr(0, lastSep + 1);
    std::replace(exePath.begin(), exePath.end(), '\\', '/');
#ifdef _WIN32
    exePath += "gfastats.exe";
#else
    exePath += "gfastats";
#endif
    return exePath;
}
23 |
/**
 * Strip the last file extension from a path.
 *
 * @param path file path; '/' and '\\' are treated as directory separators.
 * @return path without its final ".ext". The path is returned unchanged when it
 *         is "." or "..", or when the last of the characters '\\', '/', '.' is
 *         not a dot (i.e. the file name itself has no extension).
 *
 * Declared inline because it is defined in a header; the argument is taken by
 * const reference to avoid copying the string on every call.
 */
inline std::string rmFileExt(const std::string &path) {
    if (path == "." || path == "..")
        return path;

    // A dot only counts as an extension marker when no separator follows it.
    const std::string::size_type pos = path.find_last_of("\\/.");
    if (pos != std::string::npos && path[pos] == '.')
        return path.substr(0, pos);

    return path;
}
34 |
/**
 * Return the extension of a file name, keeping ".gz" layers attached.
 *
 * Examples: "a.fasta" -> "fasta", "a.fasta.gz" -> "fasta.gz", "a.gz" -> ".gz",
 * "noext" -> "".
 *
 * @param fileName file name or path; '/' and '\\' are treated as directory
 *                 separators.
 * @return the text after the last dot of the file name, with one ".gz" appended
 *         for every trailing ".gz" that was peeled off; empty when the file
 *         name has no extension.
 *
 * Only dots that come after the last directory separator are considered, the
 * same rule rmFileExt() applies. Previously a dot inside a directory name
 * (e.g. "dir.d/file") produced a bogus extension such as "d/file".
 * Declared inline because it is defined in a header.
 */
inline std::string getFileExt(std::string fileName) // utility to get file extension
{
    std::string gzSuffix; // one ".gz" per compressed layer already removed

    for (;;) {
        const std::string::size_type pos = fileName.find_last_of("\\/.");
        if (pos == std::string::npos || fileName[pos] != '.')
            return gzSuffix; // no (further) extension in the file name

        const std::string ext = fileName.substr(pos + 1);
        if (ext != "gz")
            return ext + gzSuffix;

        // Peel off ".gz" and look at the extension underneath it.
        gzSuffix += ".gz";
        fileName.erase(pos);
    }
}
51 |
/**
 * List the non-directory entries of a directory.
 *
 * @param path directory to read.
 * @return the names (not full paths) of every entry in path that cannot itself
 *         be opened as a directory; "." and ".." are therefore never returned.
 *         The order is whatever readdir() yields.
 *
 * If path cannot be opened, an error is printed to stderr and the process
 * terminates with EXIT_FAILURE (it used to exit with status 0, which made the
 * failure look like success to calling scripts).
 * Declared inline because it is defined in a header.
 */
inline std::vector<std::string> list_dir(const char *path) {
    DIR *dir = opendir(path);
    if (dir == nullptr) {
        std::cerr << "error: unable to access " << path << std::endl;
        exit(EXIT_FAILURE);
    }

    std::vector<std::string> list;
    const std::string prefix = std::string(path) + "/";
    while (const struct dirent *entry = readdir(dir)) {
        // An entry is treated as a directory when opendir() succeeds on it.
        DIR *sub = opendir((prefix + entry->d_name).c_str());
        if (sub == nullptr)
            list.emplace_back(entry->d_name);
        else
            closedir(sub);
    }
    closedir(dir);
    return list;
}
69 |
70 | void get_recursive(const std::string &path, std::set &paths) {
71 | if(getFileExt(path) == "tst") {
72 | paths.insert(path);
73 | } else {
74 | DIR *dir = opendir(path.c_str());
75 | if(dir != NULL) {
76 | for(const auto &file : list_dir(path.c_str())) {
77 | get_recursive((path+"/"+file).c_str(), paths);
78 | }
79 | closedir(dir);
80 | }
81 | }
82 | }
83 |
int i = 0; // index of the next generated test; used in the .tst file name and incremented by genTest()

/**
 * Generate one validation file by running gfastats and capturing its output.
 *
 * The file "validateFiles/<file>.<i>.tst" is created with a first line
 * "testFiles/<file> <args>" and a second line "embedded"; the standard output
 * of `<exePath> testFiles/<file> <args>` is then appended to it by the shell.
 * The global counter i is incremented afterwards, whether or not the command
 * succeeded.
 *
 * @param exePath path of the gfastats executable to run.
 * @param file    input file name, relative to testFiles/.
 * @param args    command-line arguments passed after the input file.
 *
 * The header stream is closed before the command runs so its contents are on
 * disk ahead of the appended output. The original code also wrote to that
 * stream after closing it; those writes never reached the file and have been
 * removed. A non-zero command status is now reported on stderr.
 * Declared inline because it is defined in a header.
 */
inline void genTest(std::string exePath, const std::string &file, const std::string &args) {
    const std::string tstFile = "validateFiles/"+file+"."+std::to_string(i)+".tst";
    std::cout << "generating: " << tstFile << std::endl;

    {
        std::ofstream header(tstFile);
        header << "testFiles/" << file << " " << args << "\nembedded" << std::endl;
    } // closed here, before the child process appends to the same file

#ifdef _WIN32
    const std::string cmd = "\"\""+exePath+"\" testFiles/"+file+" "+args+" >> "+tstFile+"\"";
#else
    const std::string cmd = "\""+exePath+"\" testFiles/"+file+" "+args+" >> "+tstFile;
#endif

    const int status = system(cmd.c_str());
    if (status != EXIT_SUCCESS)
        std::cerr << "warning: command returned status " << status << ": " << cmd << std::endl;

    ++i;
}
105 |
106 | #endif // #ifndef GFASTATS_VALIDATE_H
107 |
--------------------------------------------------------------------------------
/instructions/README.md:
--------------------------------------------------------------------------------
1 | # Instruction wiki
2 |
3 | Instructions are sequentially executed and each instruction is described by tab-separated columns.
4 |
5 | ## JOIN
6 |
7 | The JOIN instruction introduces a new gap of 50 bp between `scaffold1` and `scaffold2` (two paths) with id `gap1`, effectively joining the two sequences into a new sequence with id `new_scaffold` and an optional comment.
8 |
9 | ```
10 | JOIN scaffold1+ scaffold2+ 50 gap1 new_scaffold
11 | JOIN scaffold1(1:100)+ scaffold2(1:100)+ 50 gap1 new_scaffold // optional subsetting
12 | ```
13 |
14 | ## SPLIT
15 |
16 | The SPLIT instruction splits the scaffold containing `segment1` and `segment2`, effectively removing the existing gap between them. Two optional comments can be provided.
17 |
18 | ```
19 | SPLIT segment1+ segment2+ scaffold1 scaffold2 [this is a new scaffold1] [this is a new scaffold2]
20 | ```
21 |
22 | ## EXCISE
23 |
24 | The EXCISE instruction removes segment1 from its scaffold, leaving it unplaced and adding a gap of 50bp with id `gap1` between the original sequences
25 |
26 | ```
27 | EXCISE segment1 50 gap1
28 | ```
29 |
30 | ## REMOVE
31 |
32 | The REMOVE instruction removes the paths involving the specified segment.
33 |
34 | ```
35 | REMOVE segment1
36 | ```
37 |
38 | ## EXCLUDE
39 |
40 | The EXCLUDE instruction removes the specified path and all its components.
41 |
42 | ```
43 | EXCLUDE path1
44 | ```
45 |
46 | ## ERASE
47 |
48 | The ERASE instruction trims off the sequence range specified from the given segment.
49 |
50 | ```
51 | ERASE segment1:10-100 // deletes segment1 sequence between the coordinates provided (in bed format)
52 | ```
53 |
54 | ## RVCP
55 |
56 | The RVCP instruction reverse-complements path1 or segment1 sequence in place
57 |
58 | ```
59 | RVCP path1/segment1
60 | ```
61 |
62 | ## INVERT
63 |
64 | The INVERT instruction inverts segment1 sequence in place
65 |
66 | ```
67 | INVERT segment1
68 | ```
69 |
70 | ## RESIZE
71 |
72 | The RESIZE instruction changes the size of gap1 to 50 bp
73 |
74 | ```
75 | RESIZE gap1 50
76 | ```
77 |
78 | ## MASK
79 |
80 | The MASK instruction masks a portion of a path with Ns (here the 50 bp between coordinates 10 and 60), effectively adding a gap in the corresponding segment. An optional gap size can be given (here 5); if no size is provided, the size of the masked region is used
81 |
82 | ```
83 | MASK path1 10 60 [5]
84 | ```
85 |
86 | ## CLEAVE
87 |
88 | The CLEAVE instruction breaks the specified segment at the given position generating segment2 and segment3, optionally connected by an edge
89 |
90 | ```
91 | CLEAVE segment1 50 segment2 segment3 [edge1]
92 | ```
93 |
94 | ## RENAME
95 |
96 | The RENAME instruction renames a path. It can be used to rename FASTA headers.
97 |
98 | ```
99 | RENAME path1 new_path
100 | ```
101 |
102 | ## COMMENT
103 |
104 | The COMMENT instruction adds a comment to, or replaces the existing comment of, a specific path/header.
105 |
106 | ```
107 | COMMENT path1 comment
108 | ```
109 |
110 | ## Yet to be implemented
111 |
112 | ```
113 | ADD contig3 contig1+ 50 contig2+ 50 ACGT // introduces a new contig named contig3 with sequence ACGT between contig1 and contig2 leaving 50bp gaps on each side
114 | REPLACE contig1:20-24 ACGT // replaces the sequence at coordinates contig1:20-24 with ACGT
115 | ```
116 |
--------------------------------------------------------------------------------
/scaffolding/README.md:
--------------------------------------------------------------------------------
1 | ### example data: bTaeGut2 Hifiasm (HiC) assembly
2 | right click -> download link
3 | - [hap1 contigs as GFA](https://genomeark.s3.amazonaws.com/species/Taeniopygia_guttata/bTaeGut2/assembly_vgp_hic_2.0/intermediates/hifiasm/bTaeGut2.trim.HiC.hic.hap1.p_ctg.gfa)
4 | - [hap1 s1 AGP](https://genomeark.s3.amazonaws.com/species/Taeniopygia_guttata/bTaeGut2/assembly_vgp_hic_2.0/intermediates/bionano_hap1/agp_fasta/bTaeGut2_Saphyr_DLE1_3172351_bppAdjust_cmap_bTaeGut2_trim_HiC_hic_hap1_p_ctg_fasta_NGScontigs_HYBRID_SCAFFOLD.agp)
5 | - [hap1 s2 AGP](https://genomeark.s3.amazonaws.com/species/Taeniopygia_guttata/bTaeGut2/assembly_vgp_hic_2.0/intermediates/salsa_hap1/bTaeGut2_hap1_s1.gfastats.rename_salsa/scaffolds_FINAL.original-coordinates.agp)
6 | - [hap1 s2 final fasta (to check your results)](https://genomeark.s3.amazonaws.com/species/Taeniopygia_guttata/bTaeGut2/assembly_vgp_hic_2.0/bTaeGut2.hic.hap1.s2.fasta)
7 |
8 | The starting files from hifiasm-HiC workflow are the hap1 & hap2 GFAs:
9 |
10 | `bTaeGut2.hap1.gfa` and `bTaeGut2.hap2.gfa`
11 |
12 | Convert GFA -> FASTA, then run Bionano to obtain the s1 AGPs. Feeding `bTaeGut2.hap1.fasta` into Bionano produces `bTaeGut2.hap1.s1.agp`; the same applies to hap2.
13 |
14 | NOTE: If Bionano is cutting, then fix the subseq lines. Bionano is not cutting in Galaxy, so you do not need to run the `sed` command on Galaxy assemblies.
15 | ````bash
16 | # THIS IS NOT NEEDED FOR GALAXY ASSEMBLIES
17 | cat bTaeGut2.hap1.s1.agp | sed 's/W\t\(.*\)_subseq_\([0-9]*\):\([0-9]*\)\t[0-9]*\t[0-9]*\t\(.\)/W\t\1\t\2\t\3\t\4/g' | sed 's/subseq_\([0-9]*\):\([0-9]*\)/subseq_\1_\2/g' > bTaeGut2.hap1.s1.edit.agp
18 | ````
19 |
20 | ##### UPDATE: MAY 3, 2022
21 | Newer versions of gfastats append `_path` to path names, so the Bionano AGP must be processed accordingly. **This needs to happen even if Bionano is not cutting -- i.e. this needs to happen for Galaxy assemblies!**
22 |
23 | an example of fixing the Bionano AGP to recognize `_path` in contig names:
24 | ````bash
25 | awk '{OFS = "\t"}{if ($0 ~ /^#/) print $0 }{if ($6 ~ /h1*/) print $1,$2,$3,$4,$5,$6"_path",$7,$8,$9; if ($6 ~ /^[0-9]/) print $0}' bTaeGut2.hap1.s1.edit.agp > bTaeGut2.hap1.s1.edit.path.agp
26 | ````
27 |
28 | Overlap the s1 AGP onto the c1/p1 GFA. The `--discover` option makes gfastats find the paths in the GFA.
29 | ````bash
30 | gfastats bTaeGut2.trim.HiC.hic.hap1.p_ctg.gfa --discover -o bTaeGut2.hap1.discover.gfa
31 | gfastats bTaeGut2.hap1.discover.gfa --discover -a bTaeGut2.hap1.s1.edit.path.agp -o bTaeGut2.hap1.s1.gfa
32 | ````
33 |
34 | Convert s1 GFA -> s1 FASTA, run salsa to obtain s2 AGP.
35 | ````bash
36 | gfastats bTaeGut2.hap1.s1.gfa -o bTaeGut2.hap1.s1.gfastats.fasta
37 | ````
38 | NOTE: IF Bionano is cutting, then subseq lines have colons in the names, so you need to remove those before SALSA
39 | ````bash
40 | ## Removing colons from bionano scaff names, because salsa doesn't like it
41 | # THIS IS NOT NEEDED FOR GALAXY ASSEMBLIES
42 | sed 's/:/_/g' bTaeGut2.hap1.s1.gfastats.fasta > bTaeGut2.hap1.s1.gfastats.nocolon.fasta
43 | ````
44 |
45 | `bTaeGut2.hap1.s1.gfastats.fasta` into SALSA produces `bTaeGut2.hap1.s2.agp`
46 |
47 | Overlap s2 AGP onto s1 GFA to create s2 GFA
48 | ````bash
49 | cp /scaffolds_FINAL.original-coordinates.agp ./bTaeGut2.hap1.s2.originalcoords.agp
50 | gfastats bTaeGut2.hap1.s1.gfa -a bTaeGut2.hap1.s2.originalcoords.agp -o bTaeGut2.hap1.s2.gfa
51 | ````
52 | If you want to convert this s2 GFA to s2 FASTA:
53 | ````bash
54 | gfastats bTaeGut2.hap1.s2.gfa -o bTaeGut2.hap1.s2.gfastats.fasta
55 | ````
56 |
--------------------------------------------------------------------------------
/scripts/gfastats_stats.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Times gfastats on a single genomeark file, first gzip-compressed and then decompressed.
4 | #
5 | # Usage: gfastats_stats.sh <pathlist>
6 | #   <pathlist> holds one genomeark S3 key per line; the line whose number equals
7 | #   $SLURM_ARRAY_TASK_ID is the one processed by this task.
8 | # Each run appends a record (path, total scaffold length, elapsed seconds, format)
9 | # to gfastats_stats_out.txt in the working directory.
10 | #
11 | # Runs under bash, not sh: TIMEFORMAT and the `time` keyword are bash features.
12 |
13 | pathlist=$1
14 | LINE=$(sed -n "${SLURM_ARRAY_TASK_ID}p" "$pathlist")
15 |
16 | echo "SCRIPT START FOR $LINE ----------------------------"
17 | printf 'Path to genomeark file: %s \n\n' "$LINE"
18 |
19 | # The local copy is named after the last path component, whatever the depth of the key.
20 | fastaname=$(basename "$LINE")
21 | echo "Local filename: $fastaname"
22 |
23 | aws s3 cp "s3://genomeark/${LINE}" "./${fastaname}"
24 |
25 | echo "Time on compressed $fastaname"
26 | # %R makes `time` report only the elapsed time in seconds.
27 | TIMEFORMAT=%R
28 | # The braces route the report that `time` writes on stderr into the command substitution.
29 | ziptime=$( { time gfastats "$fastaname" > "${fastaname}_temp_out.txt"; } 2>&1 )
30 |
31 | ziplength=$(grep "Total scaffold length" "${fastaname}_temp_out.txt" | grep -Eo "[0-9]+")
32 | echo "Compressed length: $ziplength"
33 | # $LINE is passed as an argument, never as the format, so a '%' in a path is printed literally.
34 | printf '%s\t %s\t %s\t gzip\n' "$LINE" "$ziplength" "$ziptime" >> gfastats_stats_out.txt
35 |
36 | # Strip only a trailing ".gz" to get the name gunzip will leave behind.
37 | uncomp=${fastaname%.gz}
38 | printf 'Decompressing %s \n\n' "$fastaname"
39 | gunzip "$fastaname"
40 |
41 | echo "Time on uncompressed fasta: $uncomp"
42 | unziptime=$( { time gfastats "$uncomp" > "${fastaname}_temp_out_2.txt"; } 2>&1 )
43 |
44 | unziplength=$(grep "Total scaffold length" "${fastaname}_temp_out_2.txt" | grep -Eo "[0-9]+")
45 | echo "Decompressed length: $unziplength"
46 |
47 | printf '%s\t %s\t %s\t plain text\n' "$LINE" "$unziplength" "$unziptime" >> gfastats_stats_out.txt
48 |
49 | # Remove the downloaded file and both temporary gfastats outputs.
50 | rm "$uncomp"
51 | rm "${fastaname}_temp_out.txt"
52 | rm "${fastaname}_temp_out_2.txt"
53 |
54 | echo "SCRIPT COMPLETE FOR $LINE --------------------"
55 |
--------------------------------------------------------------------------------
/scripts/plot_runtime.R:
--------------------------------------------------------------------------------
1 | # Plots gfastats runtime against genome size, one series per input format,
2 | # and writes the figure to "Fig 1c.png".
3 | # Reads a tab-separated data.txt (columns used: size, time, format) from the script's directory.
4 |
5 | # Make the script's own directory the working directory so data.txt resolves.
6 | # rstudioapi is only usable from inside an RStudio session.
7 | setwd(dirname(rstudioapi::getSourceEditorContext()$path))
8 |
9 | library(ggplot2)
10 |
11 | df<-read.csv("data.txt", header = TRUE, sep = "\t")
12 |
13 | png(file="Fig 1c.png",
14 |     width=2000, height=1000)
15 |
16 | runtime_plot <- ggplot(df, aes(x=size, y=time, group=format)) +
17 |   geom_point(aes(color=format), size = 3)+
18 |   scale_color_grey() + theme_classic() +
19 |   geom_smooth(aes(color=format)) +
20 |   theme(
21 |     text = element_text(size = 60),
22 |     legend.title = element_blank(),
23 |     legend.key.size = unit(3,"cm"),
24 |     axis.title.x = element_text(margin = margin(t = 20, r = 0, b = 0, l = 0))
25 |   ) +
26 |   xlab("Genome size (Gbp)") + ylab("Time (s)") +
27 |   guides(color=guide_legend(override.aes=list(fill=NA)))
28 |
29 | # A ggplot object is drawn only when printed; source() does not auto-print
30 | # top-level values, so print explicitly or the PNG would be left empty.
31 | print(runtime_plot)
32 |
33 | dev.off()
34 |
--------------------------------------------------------------------------------
/scripts/submit_gfastats_stats.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Submits one SLURM array task per line of <pathlist>; each task runs gfastats_stats.sh on its line.
3 | # Usage: submit_gfastats_stats.sh <pathlist>
4 |
5 | pathlist=$1
6 | # grep -c '' also counts a final line that lacks a trailing newline, which `wc -l` would miss.
7 | linecount=$(grep -c '' "$pathlist")
8 | echo "$linecount"
9 |
10 | # Make sure the log directory exists before jobs try to write into it.
11 | mkdir -p logs
12 | log=logs/slurm_%A.log
13 | sbatch -p hpc,vgl,vgl_bigmem -c 1 --error="$log" --output="$log" --array=1-"$linecount" gfastats_stats.sh "$pathlist"
14 |
--------------------------------------------------------------------------------
/src/generate-random-fasta.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 |
6 | typedef unsigned long long ull;
7 |
8 | // Builds a wide pseudo-random ull by stitching several std::rand() results together with left shifts.
9 | ull rndull() {
10 | #if RAND_MAX == 2147483647 // std::rand() yields 31 random bits here, so two calls are combined
11 |     return (((ull)std::rand()) << 32) | (((ull)std::rand()));
12 | #else // only 15 bits are guaranteed (RAND_MAX >= 32767), so four calls are combined at 16-bit offsets
13 |     return (((ull)std::rand()) << 48) | (((ull)std::rand()) << 32) | (((ull)std::rand()) << 16) | (((ull)std::rand()));
14 | #endif
15 | }
16 |
17 | ull rnd(ull min, ull max) { // pseudo-random value in [min, max); an empty range (max <= min) yields min
18 |     return max > min ? rndull()%(max-min)+min : min; // guard keeps the modulus from ever being zero
19 | }
20 |
21 | char rndACGT() { // returns one of 'A', 'C', 'G', 'T', selected with std::rand()
22 |     static const char acgt[] = {'A', 'C', 'G', 'T'};
23 |     return acgt[std::rand()%4]; // std::-qualified like the other rand() calls in this file
24 | }
25 |
26 | int main(int argc, char **argv) {
27 | if(argc != 10) {
28 | std::cout << "usage: generate-random-fasta " << std::endl;
29 | }
30 |
31 | ull contig_min_size = std::stoull(argv[2]),
32 | contig_max_size = std::stoull(argv[3]),
33 | gap_min_size = std::stoull(argv[4]),
34 | gap_max_size = std::stoull(argv[5]),
35 | min_num_contigs = std::stoull(argv[6]),
36 | max_num_contigs = std::stoull(argv[7]),
37 | min_num_headers = std::stoull(argv[8]),
38 | max_num_headers = std::stoull(argv[9]);
39 |
40 | std::srand(std::time(nullptr));
41 |
42 | std::ofstream output_file;
43 | output_file.open(argv[1]);
44 | if(!output_file.is_open()) {
45 | std::cerr << "couldn't open the specified file: <" << argv[1] << ">" << std::endl;
46 | return EXIT_FAILURE;
47 | }
48 | ull num_headers = rnd(min_num_headers, max_num_headers);
49 | for(ull h=0; hHeader" << h+1 << std::endl;
51 |
52 | if(std::rand()%2 == 1) {
53 | ull gap_size = rnd(gap_min_size, gap_max_size);
54 | for(ull i=0; i
2 | #include