├── thirdparty ├── interleaved-split ├── interleave ├── Chunker ├── Trimmer.cpp ├── Splitter.cpp ├── SeqChunker-sed ├── SeqChunker-dd └── SeqChunker-perl ├── src ├── BlasrAdapter.h ├── parsingargs.h ├── HALC.h ├── parsingargs.cpp └── BlasrAdapter.cpp ├── Makefile ├── Readme.md └── runHALC.py /thirdparty/interleaved-split: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; 3 | use strict; 4 | 5 | use Pod::Usage; 6 | 7 | =head1 SYNOPSIS 8 | 9 | cat interleaved.fq | interleaved_split 1>lib_1.fq 2>lib_2.fq; 10 | 11 | =cut 12 | 13 | pod2usage if @ARGV; 14 | 15 | my ($i, $h, $s, $d, $q) = (0); 16 | 17 | while($h=<>,$s=<>,$d=<>,$q=<>){ 18 | $i++; 19 | print {$i%2 ? *STDOUT : *STDERR} $h,$s,$d,$q 20 | } 21 | 22 | =head1 AUTHOR 23 | 24 | Thomas Hackl S 25 | 26 | =cut 27 | -------------------------------------------------------------------------------- /thirdparty/interleave: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; 3 | use strict; 4 | 5 | use Pod::Usage; 6 | 7 | =head1 SYNOPSIS 8 | 9 | interleave [B_1.fq B_2.fq ...] > interleaved.fq"; 10 | 11 | =cut 12 | 13 | my @fq = @ARGV; 14 | pod2usage unless @fq; 15 | die "uneven number of libraries!" 
if @fq%2; 16 | 17 | while(@fq){ 18 | my $rf = shift(@fq); 19 | my $mf = shift(@fq); 20 | open(my $r, $rf) or die "$!: $rf"; 21 | open(my $m, $mf) or die "$!: $mf"; 22 | 23 | while(defined(my $h = <$r>)){ 24 | print $h, scalar <$r>, scalar <$r>, scalar <$r>; 25 | print scalar <$m>, scalar <$m>, scalar <$m>, scalar <$m>; 26 | } 27 | 28 | close $rf; 29 | close $mf; 30 | } 31 | 32 | =head1 AUTHOR 33 | 34 | Thomas Hackl S 35 | 36 | =cut 37 | -------------------------------------------------------------------------------- /src/BlasrAdapter.h: -------------------------------------------------------------------------------- 1 | /* 2 | * BlasrAdapter.h 3 | * 4 | * Created on: Dec 2, 2015 5 | * Author: llx 6 | */ 7 | 8 | #ifndef BLASRADAPTER_H_ 9 | #define BLASRADAPTER_H_ 10 | #include 11 | #include 12 | 13 | #include "HALC.h" 14 | 15 | inline bool ComFileindex(Ccutpoint first, Ccutpoint second) 16 | { 17 | return first.fileindex < second.fileindex; 18 | } 19 | 20 | class BlasrAdapter 21 | { 22 | private: 23 | int mythreshold; 24 | char * contigfilename; 25 | std::string outputpath; 26 | std::string prefix; 27 | public: 28 | BlasrAdapter(int threshold , char * contigfile, std::string outpath); 29 | bool RunAdapter(std::ifstream &infile); 30 | bool ChangeCutPoints(); 31 | bool ChangeCutPointsReverse(); 32 | bool GetNewBlasrFile(std::ifstream &infile); 33 | bool ReSortCutPoints(); 34 | }; 35 | 36 | 37 | 38 | 39 | 40 | #endif /* BLASRADAPTER_H_ */ 41 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # FLAGS 2 | CXXFLAGS = -fopenmp#-Wall # put compiler settings here 3 | # put linker settings here 4 | CXX = g++ -std=c++11 -g -O0 5 | RM = rm -f 6 | MV = mv 7 | CP = cp 8 | 9 | all:HALC Splitter Trimmer bin instbin 10 | 11 | HALC:src/HALC.o src/BlasrAdapter.o src/parsingargs.o 12 | $(CXX) $(CXXFLAGS) src/HALC.o src/BlasrAdapter.o src/parsingargs.o -o HALC 
13 | 14 | HALC.o:src/HALC.cpp src/HALC.h 15 | $(CXX) $(CXXFLAGS) -c src/HALC.cpp 16 | 17 | BlasrAdapter.o:src/BlasrAdapter.cpp src/BlasrAdapter.h 18 | $(CXX) $(CXXFLAGS) -c src/BlasrAdapter.cpp 19 | 20 | parsingargs.o:src/parsingargs.cpp src/parsingargs.h 21 | $(CXX) $(CXXFLAGS) -c src/parsingargs.cpp 22 | 23 | Splitter:thirdparty/Splitter.o 24 | $(CXX) thirdparty/Splitter.o -o Splitter 25 | 26 | Splitter.o:thirdparty/Splitter.cpp 27 | $(CXX) -c thirdparty/Splitter.cpp 28 | 29 | Trimmer:thirdparty/Trimmer.o 30 | $(CXX) thirdparty/Trimmer.o -o Trimmer 31 | 32 | Trimmer.o:thirdparty/Trimmer.cpp 33 | $(CXX) -c thirdparty/Trimmer.cpp 34 | 35 | bin: 36 | mkdir bin 37 | 38 | instbin: 39 | $(MV) HALC Splitter Trimmer bin 40 | $(CP) thirdparty/Chunker thirdparty/SeqChunker-dd thirdparty/SeqChunker-perl thirdparty/SeqChunker-sed thirdparty/interleave thirdparty/interleaved-split bin 41 | 42 | clean: 43 | $(RM) src/HALC.o src/BlasrAdapter.o src/parsingargs.o thirdparty/Splitter.o thirdparty/Trimmer.o 44 | 45 | purge: clean 46 | $(RM) -r bin/ 47 | 48 | -------------------------------------------------------------------------------- /thirdparty/Chunker: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Author: 4 | # Frank Foerster, frank.foerster@biozentrum.uni-wuerzburg.de 5 | # Simon Pfaff, simon.pfaff@stud-mail.uni-wuerzburg.de 6 | # Thomas Hackl, thomas.hackl@uni-wuerzburg.de 7 | # 8 | # Last Modified: Thomas Hackl, 2014-09-10 9 | 10 | # Version 0.4 11 | # reliable detect binary folder with compability to MacOSX 12 | function expand_link { 13 | FILE="$1" 14 | 15 | TRACKING_FILE=$(mktemp) 16 | 17 | echo "$FILE" >"$TRACKING_FILE" 18 | 19 | NEW_FILE=$(readlink "$FILE"); 20 | 21 | while [ "$?" 
-eq 0 ] 22 | do 23 | echo "$NEW_FILE" >>"$TRACKING_FILE" 24 | 25 | # check for duplicates 26 | NUM_DUPLICATES=$(cat "$TRACKING_FILE" | sort | uniq -d | wc -l) 27 | 28 | if [ "$NUM_DUPLICATES" -gt 0 ] 29 | then 30 | break 31 | fi 32 | 33 | FILE=$(dirname "$FILE")/"$NEW_FILE" 34 | echo "$FILE" >&2 35 | NEW_FILE=$(readlink "$FILE") 36 | done 37 | 38 | rm "$TRACKING_FILE" 39 | 40 | echo "FILE should be located at '$FILE'" >&2 41 | 42 | echo "$FILE" 43 | } 44 | 45 | Bin="$(dirname "$(expand_link "$0")")" 46 | 47 | # check if perl exists, than use the perl version 48 | if [ ! -z $(which perl) ]; then 49 | 50 | perl "$Bin/SeqChunker-perl" $@ 51 | 52 | else 53 | # use the bash only or the sed based version of SeqChunker 54 | 55 | #SeqChunker-dd need Bash Version 4 or higher, fallback to slower sed version 56 | 57 | VER=`bash --version | grep -Eo [0-9][.][0-9] | cut -d. -f1` 58 | 59 | if [ "$VER" -ge 4 ]; then 60 | 61 | bash "$Bin/SeqChunker-dd" $@ 62 | 63 | else 64 | 65 | bash "$Bin/SeqChunker-sed" $@ 66 | echo "#Bash Version older than 4.0, using SeqChunker-sed" 67 | fi; 68 | fi; 69 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | ### LATEST NEWS 2 | The HALC paper is accepted for publication in BMC Bioinformatics! 3 | 4 | ### Overview 5 | HALC is software that makes error correction for long reads with high throughput. 6 | 7 | ### Copy right 8 | HALC is under the [Artistic License 2.0](http://opensource.org/licenses/Artistic-2.0). 9 | 10 | ### Short manual 11 | 1. System requirements 12 | 13 | HALC is suitable for 32-bit or 64-bit machines with Linux operating systems. At least 4GB of system memory is recommended for correcting larger data sets. 14 | 15 | 2. 
Installation 16 | 17 | The aligner [BLASR](https://github.com/PacificBiosciences/blasr) and the error correction software [LoRDEC](http://www.atgc-montpellier.fr/lordec/) (only for -ordinary mode) are required to run HALC. 18 | * The source files in the 'src' and 'thirdparty' folders can be compiled to generate a 'bin' folder by running the Makefile: `make all`. 19 | * Add BLASR, LoRDEC and the 'bin' folder to your $PATH: `export PATH=PATH2BLASR:$PATH`, `export PATH=PATH2LoRDEC:$PATH` and `export PATH=PATH2bin:$PATH`, respectively. 20 | 21 | 3. Inputs 22 | * Long reads in FASTA format. 23 | * Contigs assembled from the corresponding short reads in FASTA format. 24 | * The initial short reads in FASTA format (only for -ordinary mode; obtained with `cat left_reads.fa >short_reads.fa` and then `cat right_reads.fa >>short_reads.fa`). 25 | 26 | 4. Using HALC 27 | 28 | ``` 29 | runHALC.py long_reads.fa contigs.fa [-options|-options] 30 | ``` 31 | 32 |

Options (default value):
33 | -o/-ordinary short_reads.fa (yes)
34 | Ordinary mode, which utilizes repeats to make corrections. The error correction software LoRDEC and the initial short reads are required to refine the repeat-corrected regions. It is mutually exclusive with the -repeat-free option.
35 | -r/-repeat-free (no)
36 | Repeat-free mode, which makes corrections without utilizing repeats. It is mutually exclusive with the -ordinary option.
37 | -b/-boundary n (4)
38 | Maximum boundary difference to split the subcontigs.
39 | -a/-accurate (yes)
40 | Accurate construction of the contig graph.
41 | -c/-coverage n (auto)
42 | Expected coverage on contigs. If not specified, it is calculated automatically.
43 | -w/-width n (4)
44 | Maximum width of the dynamic programming table.
45 | -k/-kmer n (25)
46 | Kmer length for LoRDEC refinement.
47 | -t/-threads n (auto)
48 | Number of threads for each process to create. By default, it is automatically set to the number of computing cores.
49 | -l/-log (no)
50 | Print the system log.

51 | 52 | 5. Outputs 53 | * Error corrected full long reads. 54 | * Error corrected trimmed long reads. 55 | * Error corrected split long reads. 56 | 57 | ### Chinese name 58 | HALC's Chinese name is 浩克. 59 | 60 | 61 | -------------------------------------------------------------------------------- /src/parsingargs.h: -------------------------------------------------------------------------------- 1 | #ifndef PARSINGARGS_H 2 | #define PARSINGARGS_H 3 | /* purpose @ 解析输入的参数,需先通过AddArgType将必须参数和可允许的参数key加入到判定列表中 4 | * 通过Parse中的result将结果返回,其中结果的key为合法的key,vecotr为参数列表 5 | * 参数列表支持去掉参数前后的引号和\对引号和\的转义 6 | * 7 | * 特殊合法字段: 8 | * 格式 实际存储值 9 | * \\value\" \value" 10 | * "\\\value\"" \value" 11 | * 12 | * 注意事项: 13 | * 1、输入参数列表中参数分隔以空格区分 14 | * 2、- 后跟单字符关键字,--后跟长字符串关键字 15 | * 3、关键字不能重复出现,长短关键字不能同时出现在参数列表,否则会Parse函数会提示参数错误 16 | * 17 | * 用法: 18 | * ParsingArgs pa; 19 | * pa.AddArgType('l',"getlist", ParsingArgs::NO_VALUE); //NO_VALUE关键字后不能有参数 20 | * pa.AddArgType('p',"getuser", ParsingArgs::MAYBE_VALUE); //MAYBE_VALUE 关键字后可能有关键字 21 | * pa.AddArgType('o',"outFile", ParsingArgs::MUST_VALUE); // MUST_VALUE 关键字后必须有参数 22 | * std::map > result; 23 | * int iRet = pa.Parse(tmpPara,result); //result以输入关键字为key存储相关的值序列 24 | * 25 | * date @ 2014.02.19 26 | * author @ haibin.wang 27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | 34 | class ParsingArgs 35 | { 36 | public: 37 | ParsingArgs(); 38 | ~ParsingArgs(); 39 | enum KeyFlag{ INVALID_KEY=-1, NO_VALUE, MAYBE_VALUE, MUST_VALUE}; 40 | /* pur @ 添加解释参数,一个参数可以是长参数,也可以是缩写的段参数,短参数只能为单字符,longName和shortName至少要有一个 41 | * para @ shortName 短参数名,0为不要短参数 42 | * para @ longName 长参数名 ,NULL为不要长参数 43 | * para @ flag 是否需要参数,0不需要,1必须要,2可要可不要 44 | * return @ true 添加成功,false添加失败 45 | */ 46 | bool AddArgType(const char shortName, const char * longName = NULL, KeyFlag flag=NO_VALUE); 47 | 48 | /* pur @ 根据参数类型解释传入的字符串 49 | * para @ paras 需要解释的字符串 50 | * para @ result 返回解析后的结果 51 | * para @ errPos 当错误的时候返回出错的大概位置 52 | * return @ 0 解释成功,负数 解释失败 53 
| * -1 未知参数错误 54 | -2 不能有参数的选项有参数错误 55 | * -3 必有参数选项后没有跟参数 56 | * -4 关键字没有加入到AddArgType中 57 | * -5 关键字重复 58 | */ 59 | int Parse(const std::string & paras, std::map > & result, std::string &errPos); 60 | 61 | private: 62 | /* pur @ 判定传入的参数是否是已经添加的参数类型,如果是则去掉-或--,并返回 63 | * para @ key 要判定的参数 64 | * return @ -1 不是合法参数类型 否则返回Option中的flag 65 | */ 66 | KeyFlag GetKeyFlag(std::string &key); 67 | 68 | /* pur @ 删除关键字前的-或-- 69 | */ 70 | void RemoveKeyFlag(std::string & paras); 71 | 72 | /* pur @ 从Paras中获取一个单词,自动过滤掉单词前后引号,并实现\对空格和引号的转义 73 | * para @ Paras 返回第一个单词后的所有内容 74 | * para @ word 返回第一单词 75 | * return @ 成功返回true,false失败 76 | */ 77 | bool GetWord(std::string & Paras, std::string & word); 78 | 79 | /* pur @ 检查关键字是否重复 80 | * para @ key 被检查的关键字 81 | * para @ result已存储的关键字序列 82 | * return @ true 是重复的,false不重复 83 | */ 84 | bool IsDuplicateKey(const std::string &key, const std::map > & result); 85 | 86 | struct Option 87 | { 88 | std::string m_longName; 89 | char m_shortName; 90 | KeyFlag m_flag; 91 | }; 92 | 93 | std::vector