├── .gitignore ├── tools ├── byte_hist.pl ├── mydiff.pl ├── multi.sessions.example.pl ├── molder.cpp └── slimfastq.multi ├── samples ├── mydiff.pl ├── badsprintf.fq ├── tstb.fq └── tstc.fq ├── filer.tst.cpp ├── coder.hpp ├── bfiler.hpp ├── bfiler.cpp ├── xfile.hpp ├── main.cpp ├── config.hpp ├── README.md ├── common.hpp ├── filer.hpp ├── Makefile ├── base2_ranger.hpp ├── xfile.cpp ├── recs.hpp ├── log64_ranger.hpp ├── gens.hpp ├── qlts.hpp ├── usrs.hpp ├── qlts.cpp ├── power_ranger.hpp ├── gens.cpp ├── filer.cpp ├── utest.cpp ├── config.cpp ├── recs.cpp └── usrs.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | PROF 2 | one 3 | one.prof 4 | one.opt 5 | slimfastq 6 | slimfastq.gdb 7 | slimfastq.valgrind 8 | -------------------------------------------------------------------------------- /tools/byte_hist.pl: -------------------------------------------------------------------------------- 1 | # -*- cperl -*- 2 | 3 | # find longest line, etc: 4 | # perl -lne 'if(length>$m){$m=length};END{print"$m"}' filename 5 | 6 | eval 'exec perl $0 $*' 7 | if 0 ; 8 | 9 | use 5.6.0 ; 10 | use warnings ; 11 | use strict ; 12 | use integer ; 13 | use bytes ; 14 | 15 | my $f = shift or die "usage: $0 " ; 16 | my $n = shift || 1000_000; 17 | my %a; 18 | my $count = 0; 19 | 20 | open F, $f or die "can't open $f: $!\n" ; 21 | binmode F; 22 | while (read F, my $buf, 10000) { 23 | for (split //, $buf) { 24 | $a{ord $_} ++ ; 25 | exit if $n < ++ $count; 26 | } 27 | } 28 | 29 | END { 30 | printf "%02x: $a{$_}\n", $_ for sort {$a{$a} <=> $a{$b}} keys %a ; 31 | printf "count=$count numc=%d\n", scalar keys %a; 32 | } 33 | 34 | 35 | -------------------------------------------------------------------------------- /tools/mydiff.pl: -------------------------------------------------------------------------------- 1 | # -*- cperl -*- 2 | 3 | eval 'exec /usr/bin/perl $0 $*' 4 | if 0 ; 5 | 6 | use 5.6.0 ; 7 | use warnings ; 8 | use strict ; 9 | use 
integer ; 10 | use bytes ; 11 | 12 | my $a = shift ; 13 | my $b = shift ; 14 | 15 | die < 18 | HOWTO 19 | 20 | open A, $a or die "can't read '$a': $!\n" ; 21 | open B, $b or die "can't read '$b': $!\n" ; 22 | 23 | sub carp($) { 24 | use Term::ANSIColor qw(:constants); 25 | $Term::ANSIColor::AUTORESET = 1; 26 | my $errstr = RED " *********************************"; 27 | die "$errstr\n@_\n$errstr\n"; 28 | } 29 | my $l = 0; 30 | while (1) { 31 | my $A = ; 32 | my $B = ; 33 | $l++; 34 | last if not $A and not $B ; 35 | carp "$a: early eof line $l" if not $A ; 36 | carp "$b: early eof line $l" if not $B ; 37 | carp "mismatch at line $l:\n$a\n$b\n$A$B" if $A ne $B ; 38 | } 39 | 40 | print STDERR "$a, $b:\n = Full Match = \n"; 41 | 42 | exit(0); 43 | -------------------------------------------------------------------------------- /samples/mydiff.pl: -------------------------------------------------------------------------------- 1 | # -*- cperl -*- 2 | 3 | eval 'exec /usr/bin/perl $0 $*' 4 | if 0 ; 5 | 6 | use 5.6.0 ; 7 | use warnings ; 8 | use strict ; 9 | use integer ; 10 | use bytes ; 11 | 12 | my $a = shift ; 13 | my $b = shift ; 14 | 15 | die < 18 | HOWTO 19 | 20 | open A, $a or die "can't read '$a': $!\n" ; 21 | open B, $b or die "can't read '$b': $!\n" ; 22 | 23 | sub carp($) { 24 | use Term::ANSIColor qw(:constants); 25 | $Term::ANSIColor::AUTORESET = 1; 26 | my $errstr = RED " *********************************"; 27 | die "$errstr\nError:\n@_\n$errstr\n"; 28 | } 29 | my $l = 0; 30 | while (1) { 31 | my $A = ; 32 | my $B = ; 33 | $l++; 34 | last if not $A and not $B ; 35 | carp "$a: early eof line $l" if not $A ; 36 | carp "$b: early eof line $l" if not $B ; 37 | carp "mismatch at line $l:\n$a\n$b\n$A$B" if $A ne $B ; 38 | } 39 | 40 | print STDERR "$a, $b:\n = Full Match = \n"; 41 | 42 | exit(0); 43 | -------------------------------------------------------------------------------- /samples/badsprintf.fq: 
-------------------------------------------------------------------------------- 1 | @E00386:183:HKVK2CCXY:6:1101:19837:1291 1:N:0:6 2 | NCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCNCCCCCCCCCNNCNCNNNCNCCNNCCNCCCNCCCNCCCNCCNCCCCCCCCCCCCCCCCNCNCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC 3 | + 4 | #-A- 3 | #include 4 | #include 5 | #include "filer.cpp" 6 | 7 | const char* tst_filename = "/tmp/slimfastq.filer.tst"; 8 | 9 | void test_big_file() { 10 | 11 | unlink(tst_filename); 12 | printf("Create %s..\n", tst_filename); 13 | FILE* out = fopen(tst_filename, "wb"); 14 | if (not out) 15 | croak("Failed creating %d", 0); 16 | 17 | onef.init_write(out); 18 | for (UINT64 i = 0 ; i < (1ULL<<32)/FILER_PAGE + 100; i++) 19 | onef.allocate(); // bigger than 4G size 20 | 21 | { 22 | FilerSave tst("onef.tst"); 23 | for (UCHAR c = 0; c < 0xf0; c++) 24 | tst.put(c); 25 | } 26 | 27 | off_t size = onef.finit_size(); 28 | onef.finit_write(); 29 | 30 | struct stat st; 31 | stat(tst_filename, &st); 32 | if (st.st_size != size) 33 | croak("Expected size %lld", size); 34 | else { 35 | int unlink_failed = unlink(tst_filename); 36 | assert(not unlink_failed); 37 | printf("Delete %s\n", tst_filename); 38 | } 39 | 40 | } 41 | 42 | int main(int, char**) { 43 | 44 | test_big_file(); 45 | return 0; 46 | } 47 | -------------------------------------------------------------------------------- /samples/tstb.fq: -------------------------------------------------------------------------------- 1 | @SRR100006.334169 80C29ABXX110103:6:1:21370:24547/2 2 | GGTCCTCGGAGGACGCTGACAGGGATGTCTGTCTGCAGTGGCCAGAAGGGGCAGGGTGCAGATACTCATAGGGTTC 3 | + 4 | HHHHHHHHHFHHHHHHEHHHAGGGFHGEHFHHHHHHFDDFDFGCG9EDCBEEDEE+@B@@################ 5 | @SRR100006.334295 80C29ABXX110103:6:1:21489:24556/2 6 | ACCCCCTCCAGGCCCTGACTCCCCACCTCAGGCCATATCATCCACAGAAAGCCAAATTCCAGCCATATTGAACCAT 7 | + 8 | HHHHHHHHHHHDHHHHHFHHHHHHEHHHHGHDBCHHFDBFFFBEFEFFHADHEHHF>EBGEBDEEEB;8@B@;DC5 9 | @SRR100006.334654 80C29ABXX110103:6:1:21443:5%175/2 10 | 
TGGCGGAGAACCGAACGCCGAGACGTGTCCGAGCGAAGCCCGGGACAGCAGAGACAATTATCACCACGAGTCTGCA 11 | + 12 | ############################################################################ 13 | @SRR100006.334662 80C29ABXX110103:6:1:21423:6%175/2 14 | AGGAAGCGAGAACCAGAACTAAATTGAGAGGAAACATGAATGGACAGGATGTTGACGCACGGCTAGTAGCGGACGT 15 | + 16 | //@@######################################################################## 17 | @SRR100006.335044 80C29ABXX110103:6:1:21370:33%15/2 18 | AAGGCCAGGTCACCATCTCAGCCGACAAGTCCATCAGCACCGCCTACCTGCAGTGGAGCAGCCTGAAGGCCTCGGA 19 | + 20 | HHHDHHHHHHHHHHHHHHHHHHHHEHHHGBHHHFHHHFHHBHHFGECHFHHHHDHHGHGHHHFHHGAEGCAFBEFF 21 | @SRR100006.335072 80C29ABXX110103:6:1:21424:35%15/2 22 | ATATCAAGATACGCTGTTGGCCTCACCCCTGTTGTCAAGGAACACACACACAGCCCTGGTGGGTATGTGCTGGACC 23 | + 24 | DHH=HDF==8DBEEE=?E8EFGBFFGFF9FAA3A:EEEE5E=EEDCA 25 | -------------------------------------------------------------------------------- /coder.hpp: -------------------------------------------------------------------------------- 1 | ///////////////////////////////////////////////////////////////////////////// 2 | // This code was taken from fqz_comp. 
According to James Bonfield it is // 3 | // based on Eugene Shelwien's coders6c2.zip // 4 | // // 5 | // (James admires the coder's efficiency, and so do I) // 6 | // // 7 | // Also see // 8 | // http://cpansearch.perl.org/src/SALVA/Compress-PPMd-0.10/Coder.hpp // 9 | ///////////////////////////////////////////////////////////////////////////// 10 | 11 | 12 | #ifdef ZP_RANGECODER_H 13 | #error Do not load me twice 14 | #endif 15 | #define ZP_RANGECODER_H 16 | 17 | 18 | #include "filer.hpp" 19 | #include "common.hpp" 20 | 21 | class RCoder { 22 | 23 | private: 24 | enum { TOP =(1ULL<<24) }; 25 | UINT64 low; 26 | UINT64 code; 27 | UINT32 range; 28 | 29 | FilerLoad* m_in ; 30 | FilerSave* m_out; 31 | 32 | public: 33 | RCoder() { m_in = NULL; m_out = NULL; } 34 | void init(FilerSave* f_out) { 35 | m_out = f_out; 36 | m_in = NULL; 37 | low=0; 38 | range=(UINT32)-1; 39 | } 40 | 41 | void init(FilerLoad* f_in) { 42 | m_in = f_in; 43 | m_out = NULL; 44 | low=0; 45 | range=(UINT32)-1; 46 | code = 0; // happy compiler 47 | for (int i=0; i<8; i++) 48 | code = (code<<8) | m_in->get(); 49 | } 50 | 51 | ~RCoder() { done(); } 52 | void done() { 53 | // Allow explicit 'done' in case we wish not FilerSave close before this call 54 | if (m_out) { 55 | for (int i=0; i<8; i++) { 56 | m_out->put(low >> 56); 57 | low <<= 8; 58 | } 59 | m_out = NULL; 60 | } 61 | } 62 | 63 | /////////////// 64 | // FASTER ! 
// 65 | /////////////// 66 | void Encode (UINT32 cumFreq, UINT32 freq, UINT32 totFreq) { 67 | 68 | range /= totFreq ; 69 | low += cumFreq * range ; 70 | range *= freq; 71 | 72 | // assert (cumFreq + freq <= totFreq); 73 | 74 | while( rangeput(low >> 56); 78 | range <<= 8; 79 | low <<= 8; 80 | } 81 | } 82 | 83 | UINT32 GetFreq (UINT32 totFreq) { 84 | range /= totFreq; 85 | return code/range; 86 | } 87 | 88 | void Decode (UINT32 cumFreq, UINT32 freq, UINT32 totFreq) { 89 | UINT32 temp = cumFreq*range; 90 | low += temp; 91 | code -= temp; 92 | range*= freq; 93 | 94 | while( rangeget(); 99 | range<<=8; 100 | low <<=8; 101 | } 102 | } 103 | }; 104 | -------------------------------------------------------------------------------- /bfiler.hpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. 
*/ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | #ifndef FQ_BFILE_H 27 | #define FQ_BFILE_H 28 | 29 | 30 | #include 31 | #include "common.hpp" 32 | #include "config.hpp" 33 | #include "filer.hpp" 34 | #include "coder.hpp" 35 | 36 | class PowerRanger; 37 | class BFileBase { 38 | protected: 39 | UCHAR bmap; 40 | UCHAR bcnt; 41 | PowerRanger* ranger; 42 | RCoder rc; 43 | BFileBase(); 44 | ~BFileBase(); 45 | }; 46 | 47 | class BFileSave: private BFileBase { 48 | FilerSave* filer; 49 | public: 50 | BFileSave(const char* name); 51 | ~BFileSave(); 52 | void putb(bool bit); 53 | }; 54 | 55 | class BFileLoad: private BFileBase { 56 | FilerLoad* filer; 57 | bool is_valid; 58 | public: 59 | BFileLoad(const char* name); 60 | ~BFileLoad(); 61 | bool getb(); 62 | }; 63 | 64 | #endif // FQ_BFILE_H 65 | -------------------------------------------------------------------------------- /bfiler.cpp: -------------------------------------------------------------------------------- 1 | 
/***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | 27 | #include "bfiler.hpp" 28 | #include "power_ranger.hpp" 29 | 30 | BFileBase::BFileBase() { 31 | ranger = new PowerRanger(); 32 | assert(ranger); 33 | } 34 | 35 | BFileBase::~BFileBase() { 36 | DELETE(ranger); 37 | } 38 | 39 | BFileSave::BFileSave(const char* name) { 40 | filer = new FilerSave(name); 41 | assert(filer); 42 | rc.init(filer); 43 | } 44 | 45 | BFileSave::~BFileSave() { 46 | if (bcnt and filer) 47 | ranger->put(&rc, bmap); 48 | rc.done(); 49 | DELETE(filer); 50 | } 51 | 52 | void BFileSave::putb(bool bit) { 53 | bmap |= ((!!bit)<put(&rc, bmap); 56 | bmap = bcnt = 0; 57 | } 58 | } 59 | 60 | BFileLoad::BFileLoad(const char* name) { 61 | filer = new FilerLoad(name, &is_valid); 62 | assert(filer); 63 | rc.init(filer); 64 | } 65 | 66 | BFileLoad::~BFileLoad() { 67 | rc.done(); 68 | DELETE(filer); 69 | } 70 | 71 | bool BFileLoad::getb() { 72 | rarely_if(bcnt == 0) { 73 | bcnt = 8; 74 | bmap = ranger->get(&rc); 75 | } 76 | return !! (bmap & (0x100>> bcnt--)); 77 | } 78 | 79 | -------------------------------------------------------------------------------- /xfile.hpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. 
*/ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | #ifndef FQ_XFILE_H 27 | #define FQ_XFILE_H 28 | 29 | 30 | #include 31 | #include "common.hpp" 32 | #include "config.hpp" 33 | 34 | #include "filer.hpp" 35 | #include "power_ranger.hpp" // exception list 36 | 37 | class XFileBase { 38 | 39 | protected: 40 | 41 | XFileBase(const char* filename); 42 | 43 | const char* m_filename; 44 | bool m_valid; 45 | RCoder rcoder; 46 | PowerRangerU ranger; 47 | PowerRanger ranger_str; 48 | }; 49 | 50 | 51 | class XFileSave : private XFileBase { 52 | FilerSave* filer; 53 | void init(); 54 | 55 | public: 56 | XFileSave(const char* filename); 57 | ~XFileSave(); 58 | bool put(UINT64 gap); 59 | void put_chr(UCHAR chr); 60 | void put_str(const UCHAR* p, size_t len); 61 | size_t tell() const ; 62 | bool has_file() const { return !! filer ; } 63 | }; 64 | 65 | class XFileLoad : private XFileBase { 66 | FilerLoad* filer; 67 | void init(); 68 | public: 69 | XFileLoad(const char* filename); 70 | ~XFileLoad(); 71 | UINT64 get(); 72 | UCHAR get_chr(); 73 | UCHAR* get_str(UCHAR* p); 74 | bool is_valid() const { return m_valid; } 75 | }; 76 | 77 | 78 | #endif 79 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. 
*/ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 23 | /***********************************************************************************************************************/ 24 | 25 | // Manifest: 26 | // update by jezra Sep, 2019 27 | 28 | // This project was written in a "simplified" c++ to support any old compiler out there. 
The structure is: 29 | // During save (encoding) 30 | // UsrSave - handling fastq files, detecting records and providing them to: 31 | // RecSave - saving the header lines (break to word tockens and save diffs) 32 | // GenSave - saves the genomic data (optimized for 4 values) 33 | // QltSave - saves the quality measure (optimized for 64 values) 34 | // During load (decode) 35 | // UsrLoad - reads record fragments from RecLoad, GenLoad, QltLoad and prints in order 36 | 37 | // under the hood, use WORM file system to implement multiple range coders streams. Note that the 38 | // file's first block is metadata + info. 39 | 40 | 41 | #include 42 | #include 43 | #include 44 | #include 45 | 46 | #include "common.hpp" 47 | #include "config.hpp" 48 | #include "usrs.hpp" 49 | 50 | Config conf; 51 | int main(int argc, char** argv) { 52 | 53 | conf.init(argc, argv); 54 | int ret = 55 | conf.encode ? 56 | UsrSave () . encode() : 57 | UsrLoad () . decode() ; 58 | 59 | conf.finit(); 60 | return ret; 61 | } 62 | -------------------------------------------------------------------------------- /config.hpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. 
*/ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ 23 | /***********************************************************************************************************************/ 24 | 25 | #ifndef FQ_CONFIG_H 26 | #define FQ_CONFIG_H 27 | 28 | #include 29 | #include "filer.hpp" 30 | 31 | class Config; 32 | 33 | void croak(const char *format, ...)__attribute__ ((noreturn, cold)); 34 | 35 | class Config { 36 | // Singleton - croaks if already exists 37 | public: 38 | Config(); 39 | ~Config(); 40 | void init (int argc, char **argv); 41 | void finit(); 42 | FILE * file_usr() const { return reinterpret_cast(f_usr);} 43 | int version, level, decoder_version; 44 | bool encode, profiling, quiet; 45 | 46 | void load_info() const; 47 | void set_info(const char* key, const char* val) const; 48 | void set_info(const char* key, long long num) const; 49 | const char* get_info(const char* key) const; 50 | bool has_info(const char* key) const; 51 | bool get_bool(const char* key) const; 52 | long long get_long(const char* key, long long val=0) const; 53 | 54 | private: 55 | void usage() const; 56 | void statistics_dump() const; 57 | 58 | FILE *f_usr; 59 | FilerSave* m_info_filer; 60 | }; 61 | 62 | extern Config conf; 63 | extern unsigned long long g_record_count; 64 | extern unsigned long long g_genofs_count; 65 | 66 | #endif // FQ_CONFIG_H 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | slimfastq 2 | ========= 3 | 4 | slimfastq would efficiently compresses/decompresses fastq files. It features: 5 | 6 | * High compression ratio 7 | * Relatively low CPU, memory usage 8 | * Truly lossless compression/decompression 9 | * Posix piping friendly (i.e. 
fastq input/output stream is serialized during compression/decompression) 10 | 11 | Usage 12 | ----- 13 | 14 | % slimfastq *file.fastq* *new-file.sfq* : compress *file.fastq* to *new-file.sfq* 15 | % slimfastq -1 *file.fastq* *new-file.sfq*: compress *file.fastq* to *new-file.sfq*, using little CPU/memory resources 16 | (-1 to -4 are levels of compression/resources trade-offs, -3 is default) 17 | 18 | % slimfastq *file.sfq* : decompress *file.sfq* to stdout (format is determined by stamp, not name) 19 | % slimfastq *file.sfq* *file.fastq* : decompress *file.sfq* to *file.fastq* 20 | 21 | % slimfastq -h : get help 22 | 23 | pipe usage: 24 | % gzip -dc *file.fastq.gz* | slimfastq -f *file.sfq* : convert from gzip to sfq format (and save a lot of disk space) 25 | % slimfastq *file.sfq* | md5sum - : get the checksum of the decompressed file without creating a file 26 | 27 | 28 | The multi threads FAQ 29 | --------------------- 30 | The main reason slimfastq is a single thread application is to avoid the overhead of semaphores, L2 flushes, and context 31 | switches. The goal is to focus on the speed of N files compression/decompression instead of a single file. 32 | Use slimfastq.multi script (located under tools/) to compress/decompress multiple files in parallel. The '-h' argument, as 33 | expected, will provide help. This script can be easily edited for a special setup. Please do not hesitate to email me if 34 | any help is needed. 35 | 36 | Compile 37 | ------- 38 | Simple compilation: 39 | * run "make" or "make test" 40 | (If the compilation fails, please let me know). 41 | 42 | Profile optimized compilation: 43 | * Compile slimfastq in a gcc profile generator mode (use 'make slimfastq.prof'). 44 | * Compress/decompress some of your fastq files - this will generate some *.gcda files. 45 | * Recompile with optimization flags for the profiler generated data (use 'make slimfastq.opt').
46 | (For whatever it's worth, the author of this page could not notice any significant performance change yielded by the optimized compilation.) 47 | 48 | Test 49 | ------- 50 | "make test" will compress, decompress and compare all the fastq files in the ./samples dir. 51 | Some testing tips to check your own fastq files: 52 | 1) slimfastq is a lossless compression and posix pipes friendly, therefore it's easy to check the integrity of a large file with checksums: 53 | 54 | % md5sum large-file.fq 55 | % ./slimfastq large-file.fq -O /tmp/tst.sfq 56 | % ./slimfastq /tmp/tst.sfq | md5sum - 57 | 58 | (if the md5sums don't match, one can use ./tools/mydiff.pl to find the bad line. And I'll be grateful for a bug report) 59 | 2) use time. Example: 60 | 61 | % /usr/bin/time -f " IO : io=%I faults=%F\n MEM: max=%M kb Average=%K kb\n CPU: Percentage=%P real=%e sys=%S user=%U" slimfastq large-file.fq /tmp/a.tst -O 62 | 63 | 3) Performance wise, a single file compression/decompression is not very interesting. The script tools/slimfastq.multi can be used to evaluate performance of 64 | concurrent files compression/decompression. This script can be edited to use with other compression software - general or fastq specific. 65 | 4) Using slimfastq.multi, try to increase/decrease thread count to find the optimal number for a specific system. 66 | 67 | Install 68 | ------- 69 | After compile 70 | * run "sudo make install" 71 | * Alternatively to "make install", copy the "slimfastq" executable and the "tools/slimfastq.multi" script to any location. 72 | 73 | License 74 | ------- 75 | The BSD 3-Clause 76 | 77 | Platform 78 | -------- 79 | slimfastq was developed and optimized for x86_64 GNU/Linux and Darwin OS. For other system's support requests, please contact Josef Ezra.
80 | 81 | Contact 82 | ------- 83 | Josef Ezra (jezra at appple.com), (jezra at cpan.org) 84 | 85 | -------------------------------------------------------------------------------- /tools/multi.sessions.example.pl: -------------------------------------------------------------------------------- 1 | # -*- cperl -*- 2 | 3 | # This program was written by Josef Ezra , # 4 | # Copyright (c) 2019, Infinidat # 5 | # All rights reserved. # 6 | # # 7 | # Redistribution and use in source and binary forms, with or without modification, are permitted provided that # 8 | # the following conditions are met: # 9 | # # 10 | # Redistributions of source code must retain the above copyright notice, this list of conditions and the following # 11 | # disclaimer. # 12 | # Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following # 13 | # disclaimer in the documentation and/or other materials provided with the distribution. # 14 | # Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products # 15 | # derived from this software without specific prior written permission. # 16 | 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, # 18 | # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # 19 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # 20 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # 21 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, # 22 | # WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE # 23 | # USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# 24 | 25 | eval 'exec /usr/bin/perl $0 $*' 26 | if 0 ; 27 | 28 | use 5.6.0 ; 29 | use warnings ; 30 | use strict ; 31 | use integer ; 32 | use bytes ; 33 | 34 | use threads ; 35 | use threads::shared ; 36 | use File::Basename; 37 | 38 | my $num_t = shift or die < [src-dir] [tgt-dir] [slimfastq-cmd] 41 | While: 42 | max-N : (required) how many files to compress concurrently 43 | src-dir: (optional) directory to find '*.fastq' files (default is './') 44 | tgt-dir: (optional) directory to save '*.sfq' compressed files (default is src-dir/SFQ) 45 | slimfastq-cmd: valid path/name to the slimfastq (default is './slimfastq') 46 | 47 | This will call ./slimfastq ( this script) to compress all *.fastq *.fq files in src-dir 48 | into *.sfq in tgt-dir. tgt-dir may be created (if only one directory level is missing). 49 | 50 | Please feel free to change the script to your needs. 51 | 52 | 53 | USAGE 54 | 55 | my $sdir = shift || '.'; 56 | my $tdir = shift || './SFQ'; 57 | my $sfq = shift || "./slimfastq"; # change to your needs 58 | 59 | die "$sdir: does not exists\n" if not -d $sdir; 60 | die "$tdir: can't create\n" if not -d $tdir and not mkdir $tdir; 61 | 62 | my @files : shared ; 63 | @files = glob "$sdir/*.fastq $sdir/*.fq"; 64 | 65 | sub getfile() { 66 | lock @files; 67 | shift @files; 68 | } 69 | 70 | sub just_doit { 71 | while (my $u = getfile()) { 72 | my $b = basename $u, '.fastq', '.fq' ; 73 | my $f = "$tdir/$b.sfq"; 74 | system "$sfq -q -u $u -f $f"; 75 | } 76 | } 77 | 78 | my @threads; 79 | sub thread_push(){ 80 | push @threads, threads->create( \&just_doit ) ; 81 | } 82 | sub thread_pop(){ 83 | my $t = shift @threads; 84 | $t->join; 85 | } 86 | 87 | while (@files) { 88 | thread_push; 89 | thread_pop if @threads > $num_t; 90 | } thread_pop while @threads; 91 | 92 | 93 | -------------------------------------------------------------------------------- /common.hpp: -------------------------------------------------------------------------------- 1 | 
/***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | 27 | 28 | #ifndef FQ_COMMON_H 29 | #define FQ_COMMON_H 30 | 31 | #define PACKED __attribute__((__packed__)) 32 | 33 | #define LIKELY(X) __builtin_expect((X),1) 34 | #define RARELY(X) __builtin_expect((X),0) 35 | // TEST: Would profiler optimization work better with no expectations 36 | // #define LIKELY(X) (X) 37 | // #define RARELY(X) (X) 38 | #define likely_if(x) if (LIKELY(x)) 39 | #define rarely_if(x) if (RARELY(x)) 40 | 41 | typedef unsigned char UCHAR ; 42 | typedef UCHAR UINT8 ; 43 | typedef unsigned short UINT16; 44 | typedef unsigned int UINT32; 45 | typedef unsigned long long UINT64; 46 | 47 | #define BZERO(X) bzero(&(X), sizeof(X)) 48 | #define DELETE(X) do {if (X) delete X; X = NULL; } while (0) 49 | 50 | #define IS_CLR(exmap, offset) (0 == (exmap&(1ULL< 65 | # define PREFETCH(X) _mm_prefetch((const char *)(X), _MM_HINT_T0) 66 | // This prefetch saves 8 seconds (on 5.4G fastq), but seems to use slightly 67 | // more cpu (or is it just doing the same work quota at lesser time?) 68 | #else 69 | # define PREFETCH(X) 70 | #endif 71 | 72 | #endif // FQ_COMMON_H 73 | -------------------------------------------------------------------------------- /filer.hpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. 
*/ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | #ifndef COMMON_FILER_H 27 | #define COMMON_FILER_H 28 | 29 | #include "common.hpp" 30 | #include 31 | #include 32 | #include 33 | 34 | #define FILER_PAGE 0x2000 35 | 36 | class FilerBase { 37 | protected: 38 | enum { 39 | maxi_nodes = FILER_PAGE / sizeof(UINT32) - 1, 40 | size_nodes, 41 | }; 42 | 43 | UINT64 m_page_count; 44 | bool m_valid; 45 | UCHAR m_buff[ FILER_PAGE+10 ]; 46 | size_t m_cur, m_count; 47 | bool *m_valid_ptr; 48 | UINT32 m_node[ size_nodes ]; 49 | UINT32 m_node_i; 50 | UINT32 m_node_p; 51 | UINT32 m_onef_i; 52 | 53 | FilerBase(); 54 | size_t tell() const ; 55 | }; 56 | 57 | class FilerSave : private FilerBase { 58 | public: 59 | static void init(FILE* in); 60 | static void finit(); 61 | static UINT64 finit_size(); 62 | 63 | FilerSave(const char* name); 64 | FilerSave(int forty_two); 65 | ~FilerSave(); 66 | 67 | bool is_valid() const ; 68 | size_t tell() const ; 69 | 70 | inline bool put(UCHAR c) { 71 | rarely_if(m_cur >= FILER_PAGE) 72 | save_page(); 73 | m_buff[m_cur++] = c; 74 | return m_valid; 75 | } 76 | 77 | private: 78 | void save_node(UINT32 next_node); 79 | void save_page(bool finit=false); 80 | // UINT32 findex; 81 | }; 82 | 83 | class FilerLoad : private FilerBase { 84 | public: 85 | static void init(FILE* in); 86 | static void confess(); 87 | FilerLoad(const char* name, bool *valid_ptr); 88 | FilerLoad(int forty_two, bool* valid_ptr); 89 | ~FilerLoad(); 90 | 91 | bool is_valid() const ; 92 | size_t tell() const ; 93 | 94 | inline UCHAR get() { 95 | rarely_if(m_count <= m_cur) load_page(); 96 | return m_valid ? 
m_buff[m_cur++] : 0; 97 | } 98 | 99 | private: 100 | void load_page(); 101 | }; 102 | 103 | #endif // COMMON_FILER_H 104 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | WALL = -Wall -Werror 4 | FLAGS = $(WALL) -g -DDO_DEBUG -D_FILE_OFFSET_BITS=64 -std=c++0x 5 | # PROF_DIR = PROF <= GCC Bug 47793: relative path turns into absolute 6 | PROF_DIR = 7 | FLAGS_FAST= -O3 -fomit-frame-pointer -fstrict-aliasing -ffast-math -msse3 -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE 8 | FLAGS_OPT = $(WALL) $(PROF_DIR) $(FLAGS_FAST) -fprofile-use 9 | FLAGS_PROF= $(WALL) $(PROF_DIR) $(FLAGS_FAST) -fprofile-generate 10 | FLAGS_OPT2 = $(WALL) $(PROF_DIR) $(FLAGS_FAST) 11 | 12 | # harmless in non-macOS env 13 | export MACOSX_DEPLOYMENT_TARGET=10.14 14 | 15 | # ifeq ("$(wildcard ./tmp_samples)","") 16 | ifeq ($(sampdir),) 17 | TEST_FILES = $(wildcard samples/*.fq) 18 | TEST_FILES += $(wildcard samples/*.fastq) 19 | else 20 | TEST_FILES = $(wildcard $(sampdir)/*.fq) 21 | TEST_FILES += $(wildcard $(sampdir)/*.fastq) 22 | endif 23 | 24 | 25 | all: slimfastq 26 | opt: slimfastq.opt 27 | 28 | prof-opt.run: 29 | for l in 1 2 3; do\ 30 | for f in $(TEST_FILES) ; do \ 31 | echo $$f ... ; \ 32 | ./slimfastq.prof -f /tmp/mytst -u $$f -O -P -l $$l ; \ 33 | ./slimfastq.prof -f /tmp/mytst -u /tmp/mytst.copy -O -d ; \ 34 | done ; \ 35 | done 36 | 37 | prof-opt: slimfastq.prof 38 | make prof-opt.run 39 | make opt 40 | make profclean 41 | 42 | # find . -name '*.gcda' -delete 43 | profclean: 44 | mkdir -p PROF 45 | find PROF -name '*.gcda' -delete 46 | mv *.gcda PROF/ || true 47 | rm *.prof || true 48 | 49 | clean: profclean 50 | find . 
-name '*.o' -delete 51 | rm slimfastq slimfastq.* test-filer || true 52 | 53 | gdb: slimfastq.gdb 54 | SOURCES= $(filter-out utest.cpp one.cpp molder.cpp filer.tst.cpp, $(shell ls *.cpp)) 55 | HEADERS= $(shell echo *.hpp) 56 | slimfastq.gdb: $(SOURCES) $(HEADERS) 57 | g++ $(FLAGS) -o $@ $(SOURCES) 58 | 59 | valgrind: slimfastq.valgrind 60 | slimfastq.valgrind: $(SOURCES) $(HEADERS) 61 | g++ $(FLAGS) -DHAPPY_VALGRIND -o $@ $(SOURCES) 62 | 63 | .PHONY: slimfastq slimfastq.gdb slimfastq.valgrind test-filer 64 | 65 | slimfastq: 66 | g++ $(FLAGS_OPT2) -o $@ $(SOURCES) 67 | @ echo "Done." 68 | 69 | slimfastq.opt: 70 | mv PROF/*.gcda . || true 71 | g++ $(FLAGS_OPT) -o $@ $(SOURCES) 72 | mv *.gcda PROF || true 73 | slimfastq.prof: 74 | g++ $(FLAGS_PROF) -o $@ $(SOURCES) 75 | 76 | prof: 77 | g++ -O3 -fstrict-aliasing -ffast-math -pg -o slimfastq.prof $(SOURCES) 78 | 79 | molder: molder.cpp pager.cpp pager.hpp 80 | g++ $(FLAGS) molder.cpp pager.cpp -o $@ 81 | 82 | tags: 83 | etags $(SOURCES) $(HEADERS) 84 | 85 | # UTSRC= $(filter-out one.cpp molder.cpp main.cpp, $(shell ls *.cpp)) 86 | # UTHDR= $(shell ls *.hpp) 87 | # slimfastq.utest: $(UTSRC) $(UTHDR) 88 | # g++ $(FLAGS) $(UTSRC) -o $@ 89 | # 90 | # utest: slimfastq.utest 91 | # ./slimfastq.utest 92 | 93 | # small: all 94 | # ./slimfastq -f ../data/small.fq -u ../data/small.fq -O 95 | # ./slimfastq -f ../data/small.fq -u ../data/small.fq.tst -O -d 96 | # ../data/mydiff.pl ../data/small.fq ../data/small.fq.tst 97 | 98 | # time: prof-opt 99 | # time ./slimfastq.opt -f ../data/t -u ../data/s.fastq -O 100 | # time ./slimfastq.opt -f ../data/t -u ../data/s.fastq.tst -O -d 101 | # cmp ../data/s.fastq ../data/s.fastq.tst 102 | 103 | test: all 104 | for l in 1 2 3 4; do \ 105 | for f in $(TEST_FILES) ; do \ 106 | echo $$f $$l... 
; \ 107 | rm /tmp/mytst.* || true; \ 108 | ./slimfastq -u $$f -f /tmp/mytst -O -l $$l -q && \ 109 | ./slimfastq -u /tmp/mytst.fastq -f /tmp/mytst -O -d && \ 110 | tools/mydiff.pl $$f /tmp/mytst.fastq || break ; \ 111 | done || break ; \ 112 | done 113 | 114 | tost: all 115 | for l in 1 ; do \ 116 | for f in $(TEST_FILES) ; do \ 117 | echo $$f $$l... ; \ 118 | rm /tmp/mytst.* || true; \ 119 | ./slimfastq -u $$f -f /tmp/mytst -O -l $$l -q && \ 120 | ./slimfastq -u /tmp/mytst.fastq -f /tmp/mytst -O -d && \ 121 | tools/mydiff.pl $$f /tmp/mytst.fastq || break ; \ 122 | done || break ; \ 123 | done 124 | 125 | test-filer: 126 | g++ $(FLAGS) -o $@ -g filer.tst.cpp 127 | @ ./$@ && echo "Pass!" || echo "Fail" 128 | 129 | playground: 130 | @ echo $(filter-out molder.cpp, $(shell echo *.cpp)) 131 | 132 | install: 133 | ifeq ("$(wildcard ./slimfastq)","") 134 | @echo "Please compile slimfastq first. You can do it by running:"; 135 | @echo "% make" 136 | @echo "(optional) %make test" 137 | @echo "% make install" 138 | @echo "(Note that 'make slimfastq install' would not work as expected)" 139 | @false 140 | else 141 | @install -t /usr/local/bin/ ./slimfastq tools/slimfastq.multi 2>/dev/null \ 142 | || cp ./slimfastq tools/slimfastq.multi /usr/local/bin/ \ 143 | || echo "failed to install, please use ./slimfastq and tools/slimfastq.multi in place, or copy them to your lookup path" 144 | @echo "\nAll done!" 145 | endif 146 | 147 | uninstall: 148 | rm /usr/local/bin/slimfastq* || /bin/true 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /base2_ranger.hpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. 
*/ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ 23 | /***********************************************************************************************************************/ 24 | 25 | // Based on James Bonfield's fqz_comp 26 | 27 | #ifndef ZP_BASES_RANGER_H 28 | #define ZP_BASES_RANGER_H 29 | 30 | #ifndef ZP_RANGECODER_H 31 | #include "coder.hpp" 32 | #endif 33 | #include "common.hpp" 34 | 35 | class Base2Ranger { 36 | 37 | enum { // STEP = 1, 38 | MAX_FREQ=254, 39 | INIT_VAL=3, 40 | M_ONES = 0x01010101, 41 | }; 42 | 43 | union { 44 | UCHAR freq[4]; 45 | UINT32 freq_val; 46 | }; 47 | 48 | void normalize() { 49 | freq_val = 50 | ((freq_val & ~M_ONES) >> 1 ) | 51 | (freq_val & M_ONES); 52 | // (freq_val & ~M_ONES) >> 1 ; 53 | } 54 | 55 | inline UINT16 getsum() { 56 | // return 4 + (freq[0] + freq[1]) + (freq[2] + freq[3]); 57 | return (freq[0] + freq[1]) + (freq[2] + freq[3]); 58 | } 59 | 60 | inline void update_freq(int sym , UINT16 total) { 61 | rarely_if(freq[sym] > (MAX_FREQ)) 62 | normalize(); 63 | 64 | // freq[sym] += STEP; 65 | freq[sym]++; 66 | } 67 | 68 | public: 69 | Base2Ranger() { 70 | // BZERO made it slower 71 | freq_val = INIT_VAL * M_ONES; 72 | } 73 | 74 | inline void put(RCoder *rc, UCHAR sym) { 75 | UINT16 total = getsum(); 76 | UINT16 offs = 0; 77 | switch(sym) { 78 | case 3: offs += freq[2]; 79 | case 2: offs += freq[1]; 80 | case 1: offs += freq[0]; 81 | } 82 | rc->Encode( offs, freq[sym], total); 83 | update_freq(sym, total); 84 | } 85 | 86 | inline UCHAR get(RCoder *rc) { 87 | 88 | UINT16 total = getsum(); 89 | UINT32 prob = rc->GetFreq(total); 90 | 91 | UINT32 sumf = 0; 92 | int i; 93 | for (i = 0; i < 4; i++) { 94 | if (sumf + freq[i] <= prob) 95 | sumf += freq[i]; 96 | else 97 | break; 98 | } 99 | assert(i<4); 100 | rc->Decode(sumf, freq[i], total); 101 | 102 | update_freq(i, total); 103 | return i; 104 | } 105 | } PACKED ; 106 | 107 | #endif // ZP_BASES_RANGER_H 108 | -------------------------------------------------------------------------------- /xfile.cpp: 
-------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | #include "xfile.hpp" 27 | #include 28 | #include 29 | #include 30 | 31 | XFileBase::XFileBase(const char* filename 32 | ) : m_filename(filename) 33 | {} 34 | 35 | 36 | XFileSave::XFileSave(const char* filename 37 | ) : XFileBase(filename), 38 | filer(NULL) {} 39 | 40 | XFileSave::~XFileSave() { 41 | if (filer) { 42 | put(0); // no more exceptions 43 | rcoder.done(); 44 | delete filer; 45 | filer = NULL; 46 | } 47 | } 48 | XFileLoad::XFileLoad(const char* filename 49 | ) : XFileBase(filename), 50 | filer(NULL) {} 51 | 52 | XFileLoad::~XFileLoad() { 53 | if (filer) { 54 | rcoder.done(); 55 | delete filer; 56 | filer = NULL; 57 | } 58 | } 59 | 60 | void XFileSave::init () { 61 | filer = new FilerSave(m_filename); 62 | assert(filer); 63 | rcoder.init(filer); 64 | } 65 | 66 | bool XFileSave::put(UINT64 gap) { 67 | rarely_if(not filer) init(); 68 | return ranger.put_u(&rcoder, gap); 69 | } 70 | 71 | void XFileSave::put_chr(UCHAR chr) { 72 | rarely_if(not filer) init(); 73 | ranger_str.put(&rcoder, chr); 74 | } 75 | 76 | UCHAR XFileLoad::get_chr() { 77 | rarely_if(not filer) init(); // assert(filer) ? 78 | return ranger_str.get(&rcoder); 79 | } 80 | 81 | void XFileLoad::init() { 82 | filer = new FilerLoad(m_filename, &m_valid); 83 | assert(filer); 84 | if (m_valid) 85 | rcoder.init(filer); 86 | else 87 | DELETE(filer); 88 | } 89 | 90 | UINT64 XFileLoad::get() { 91 | rarely_if(not filer) init(); 92 | return m_valid ? 
ranger.get_u(&rcoder) : 0; 93 | } 94 | 95 | void XFileSave::put_str(const UCHAR* p, size_t len) { 96 | put(len); 97 | for (UINT32 j = 0; j < len; j++) 98 | ranger_str.put(&rcoder, p[j]); 99 | } 100 | 101 | UCHAR* XFileLoad::get_str(UCHAR* p) { 102 | size_t len = get(); 103 | for (UINT32 j = 0; j < len; j++) 104 | p[j] = ranger_str.get(&rcoder); 105 | return p + len; 106 | } 107 | 108 | size_t XFileSave::tell() const { 109 | return filer ? filer->tell() : 0; 110 | } 111 | -------------------------------------------------------------------------------- /recs.hpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | #ifndef FQ_RECS_H 27 | #define FQ_RECS_H 28 | 29 | #include "common.hpp" 30 | #include "config.hpp" 31 | #include 32 | 33 | #include "xfile.hpp" 34 | 35 |
/*
 * State shared by the record saver and loader: the per-slot coding models,
 * the range coder, and two alternating "space maps" describing a record line.
 */
class RecBase {

protected:

    RecBase() {}
    // Flushes/closes the range coder when the object goes away.
    ~RecBase() { rcoder.done(); }

    // One set of models per slot: a type code, string bytes, and a number.
    struct ranger_t {
        PowerRanger  type;
        PowerRanger  str;
        PowerRangerU num;
    } PACKED ;

    ranger_t ranger[66];

    RCoder rcoder;

    struct {
        bool   initilized;     // (sic) spelling kept - the name is used outside this header
        UINT64 index;
        // long long num[10]; - TODO: cache array of prev atoi and end pointers
    } m_last;

    // Counters; NOTE(review): meaning of each field is defined in recs.cpp - confirm there.
    struct {
        UINT32 big_i;
        UINT32 str_n;
        UINT32 str_l;
        UINT32 new_n;
        UINT32 new_l;
    } stats;

    bool m_valid;

    // void range_init();

    // Layout of one line as produced by map_space(); up to 65 entries.
    struct space_map {
        int   off[65];
        int   wln[65];
        UCHAR str[65];
        UCHAR len;
    };
    space_map smap [2];         // two maps, selected by a bool index (see imap / map_space)
    UCHAR  ctype[2][65];        // 0=? 1=deci, 2=hexa
    UINT64 cnumb[2][65];        // cache the number (if relevant)
    bool   imap;

    // UINT64 last_map;
    void map_space(const UCHAR* p, bool index);
};

/* Writer side: encodes a record line, given the previous one. */
class RecSave : private RecBase {
public:
    RecSave();
    ~RecSave();

    void save(const UCHAR* buf, const UCHAR* end, const UCHAR* prev_buf, const UCHAR* prev_end);

private:
    void save_first_line(const UCHAR* buf, const UCHAR* end);
    void put_type(UCHAR i, UCHAR type);
    void put_num (UCHAR i, long long num);
    void put_str (UCHAR i, const UCHAR* p, UINT32 len);

    FilerSave* filer;
    XFileSave* x_file;
};

/* Reader side: decodes a record line into a caller-supplied buffer. */
class RecLoad : private RecBase {
public:
    RecLoad();
    ~RecLoad();

    // Current value of the validity flag.
    inline bool is_valid() {
        return m_valid;
    }
    size_t load(UCHAR* buf, const UCHAR* prev);
    size_t load_pre5(UCHAR* buf, const UCHAR* prev);

private:
    size_t load_first_line(UCHAR* buf);

    long long get_num (UCHAR i);
    UCHAR     get_type(UCHAR i);
    UCHAR     get_len (UCHAR i);
    UCHAR*    get_str (UCHAR i, UCHAR* p);

    int comp_version;

    FilerLoad* filer;
    XFileLoad* x_file;
};



#endif 127 | -------------------------------------------------------------------------------- /log64_ranger.hpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. 
*/ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ 23 | /***********************************************************************************************************************/ 24 | 25 | // Based on James Bonfield's fqz_comp 26 | 27 | #ifndef ZP_LOG64_RANGER_H 28 | #define ZP_LOG64_RANGER_H 29 | 30 | #ifndef ZP_RANGECODER_H 31 | #include "coder.hpp" 32 | #endif 33 | 34 | #define LAST_QLT 63 35 | 36 | class Log64Ranger { 37 | enum { 38 | STEP=6, 39 | // NSYM=63, // decrease from 64 to get better memory alignment 40 | NSYM=64, 41 | MAX_FREQ=(1<<16)-64, 42 | }; 43 | 44 | 45 | UINT16 freq[NSYM]; 46 | UINT16 iend ; 47 | UINT32 total; 48 | UCHAR count; 49 | UCHAR syms[NSYM]; 50 | 51 | void normalize() { 52 | for (UINT32 i = total = 0; i < iend; i++) 53 | total += (freq[i] /= 2); 54 | } 55 | 56 | inline UCHAR down_level(int i) { 57 | 58 | UCHAR c = syms[i ]; 59 | syms[i] = syms[i-1]; 60 | syms[i-1] = c; 61 | 62 | UINT16 f = freq[i]; 63 | freq[i] = freq[i-1]; 64 | freq[i-1] = f; 65 | 66 | return c; 67 | } 68 | 69 | inline UCHAR update_freq(int i) { 70 | 71 | rarely_if(freq[i] > (MAX_FREQ - STEP)) { 72 | rarely_if( i == 0 and 73 | freq[i] + 20U > total) 74 | return syms[i]; 75 | 76 | normalize(); 77 | } 78 | 79 | freq[i] += STEP; 80 | total += STEP; 81 | 82 | return LIKELY (i == 0 or 83 | (++ count & 0xf) or 84 | freq[i] <= freq[i-1] ) ? 
85 | syms[i] : 86 | down_level(i); 87 | } 88 | 89 | public: 90 | Log64Ranger() { 91 | #ifdef HAPPY_VALGRIND 92 | bzero(this, sizeof(*this)); 93 | #else 94 | bzero(&iend, 8); 95 | #endif 96 | } 97 | 98 | inline void put(RCoder *rc, UCHAR sym) { 99 | UINT32 sumf = 0; 100 | UINT32 i = 0; 101 | 102 | assert(sym < NSYM); 103 | rarely_if(iend <= sym) 104 | for (;iend <= sym; iend++) 105 | syms[iend] = iend; 106 | 107 | for (; syms[i] != sym; sumf += freq[i++]); 108 | 109 | rc->Encode(sumf+i, freq[i]+1, total + NSYM); 110 | 111 | update_freq(i); 112 | } 113 | 114 | inline UINT16 get(RCoder *rc) { 115 | 116 | UINT32 vtot = total + NSYM; 117 | UINT32 sumf = 0; 118 | UINT32 i = 0; 119 | 120 | UINT32 prob = rc->GetFreq(vtot); 121 | 122 | for ( i = 0; 123 | i < NSYM; 124 | i ++ ) { 125 | 126 | rarely_if(iend == i) 127 | syms[ iend++ ] = i; 128 | 129 | if (sumf + freq[i] + 1 <= prob) 130 | sumf += freq[i] + 1; 131 | else 132 | break; 133 | } 134 | 135 | rc->Decode(sumf, freq[i]+1, vtot); 136 | 137 | return update_freq(i); 138 | } 139 | 140 | } PACKED; 141 | 142 | #endif // already loaded 143 | -------------------------------------------------------------------------------- /gens.hpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. 
*/ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | 27 | #ifndef FQ_GENS_H 28 | #define FQ_GENS_H 29 | 30 | #include "common.hpp" 31 | #include "config.hpp" 32 | #include 33 | 34 | #include "filer.hpp" 35 | #include "base2_ranger.hpp" 36 | #include "xfile.hpp" 37 | /* GenBase: state shared by GenSave and GenLoad - the Base2Ranger context table, the range coder and the bookkeeping for the gen.Ns / gen.Nn side files. The empty constructor initializes none of the members; that is left to the derived classes. */ 38 | class GenBase { 39 | protected: 40 | GenBase() {} 41 | ~GenBase(){} 42 | /* Context-table size per conf.level (1..4). Each mask is size-1, so the sizes have to stay powers of two. */ 43 | #define BRANGER_SIZE_1 (1<<18) 44 | #define BRANGER_MASK_1 (BRANGER_SIZE_1-1) 45 | 46 | #define BRANGER_SIZE_2 (1<<22) 47 | #define BRANGER_MASK_2 (BRANGER_SIZE_2-1) 48 | 49 | #define BRANGER_SIZE_3 (1<<24) 50 | #define BRANGER_MASK_3 (BRANGER_SIZE_3-1) 51 | 52 | #define BRANGER_SIZE_4 (1<<26) 53 | #define BRANGER_MASK_4 (BRANGER_SIZE_4-1) 54 | 55 | Base2Ranger* ranger; 56 | RCoder rcoder; 57 | /* m_last: value of g_genofs_count at the previous entry written to the gen.Ns / gen.Nn side files (updated in GenSave::bad_q_or_bad_n). */ 58 | struct { 59 | // UINT64 count; 60 | UINT64 Ns_index; 61 | UINT64 Nn_index; 62 | } m_last; 63 | 64 | struct { 65 | UINT32 big_gaps; 66 | } m_stats ; 67 | 68 | bool m_valid; 69 | UCHAR m_N_byte; 70 | 71 | void range_init(); 72 | size_t ranger_cnt(); 73 | /* Index mask matching conf.level; any level outside 1..4 gets the level-1 mask. NOTE(review): GenBase::ranger_cnt() in gens.cpp falls back to the level-2 size instead - safe because the mask is smaller than the table, but inconsistent. */ 74 | inline UINT64 get_mask() const { 75 | switch (conf.level) { 76 | default: 77 | case 1: return BRANGER_MASK_1; 78 | case 2: return BRANGER_MASK_2; 79 | case 3: return BRANGER_MASK_3; 80 | case 4: return BRANGER_MASK_4; 81 | } 82 | } 83 | }; 84 | /* GenSave: encoder side. save() uses the 4-argument save_x when the genome and quality lengths are equal, otherwise the overload that also receives qlen. */ 85 | class GenSave : private GenBase { 86 | public: 87 | GenSave() ; 88 | ~GenSave(); 89 | void save(const UCHAR* gen, UCHAR* qlt, UINT64 llen, UINT64 qlen) 90 | {return llen == qlen ? 
91 | save_x(gen, qlt, llen , get_mask()) : 92 | save_x(gen, qlt, llen , qlen, get_mask()); 93 | } 94 | 95 | private: 96 | void bad_q_or_bad_n(UCHAR gen, UCHAR qlt, bool bad_n, bool bad_q); 97 | inline UCHAR normalize_gen(UCHAR gen, UCHAR qlt); 98 | void save_x(const UCHAR* gen, const UCHAR* qlt, UINT64 size, const UINT64 mask); 99 | void save_x(const UCHAR* gen, const UCHAR* qlt, UINT64 llen, UINT64 qlen, const UINT64 mask); 100 | 101 | FilerSave* filer; 102 | XFileSave* x_Ns; 103 | XFileSave* x_Nn; 104 | }; 105 | /* GenLoad: decoder side. load() selects its load_x overload by the same llen == qlen test as GenSave::save() and returns that overload's result. */ 106 | class GenLoad : private GenBase { 107 | public: 108 | GenLoad(); 109 | ~GenLoad(); 110 | 111 | UINT32 load(UCHAR* gen, const UCHAR* qlt, UINT64 llen, UINT64 qlen) 112 | {return llen == qlen ? 113 | load_x(gen, qlt, llen, get_mask()) : 114 | load_x(gen, qlt, llen, qlen, get_mask()); 115 | } 116 | 117 | private: 118 | UINT32 load_x(UCHAR* gen, const UCHAR* qlt, UINT64 llen, const UINT64 mask); 119 | UINT32 load_x(UCHAR* gen, const UCHAR* qlt, UINT64 llen, UINT64 qlen, const UINT64 mask); 120 | inline void normalize_gen(UCHAR &gen, UCHAR qlt); 121 | 122 | // bool m_validNs, m_validNn; 123 | const char* m_gencode; 124 | 125 | FilerLoad* filer; 126 | XFileLoad* x_Ns; 127 | XFileLoad* x_Nn; 128 | }; 129 | 130 | 131 | #endif // FQ_GENS_H 132 | -------------------------------------------------------------------------------- /qlts.hpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. 
*/ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | #ifndef FQ_QLTS_H 27 | #define FQ_QLTS_H 28 | 29 | #include "common.hpp" 30 | #include "config.hpp" 31 | #include "filer.hpp" 32 | #include "log64_ranger.hpp" 33 | #include "power_ranger.hpp" 34 | /* Context-table sizes: SIZE_1 (12-bit index) is used at conf.level 1, SIZE_2 (16-bit index) at every other level (see QltBase::ranger_cnt in qlts.cpp). */ 35 | #define RANGER_SIZE_2 (1<<16) 36 | #define RANGER_MASK_2 (RANGER_SIZE_2-1) 37 | 38 | #define RANGER_SIZE_1 (1<<12) 39 | #define RANGER_MASK_1 (RANGER_SIZE_1-1) 40 | /* QltBase: state and context functions shared by the quality-stream encoder and decoder. m_valid is not initialized here. */ 41 | class QltBase { 42 | protected: 43 | 44 | Log64Ranger* ranger; 45 | PowerRanger exranger; 46 | RCoder rcoder; 47 | bool m_valid; 48 | 49 | size_t ranger_cnt(); 50 | // void range_init(); 51 | /* calc_last_1 / calc_last_2: next context = current symbol OR-ed into the previous context shifted left by 6 bits, truncated to 12 / 16 bits. Symbols are presumably below 64 so that they fit the 6-bit slot - TODO confirm against LAST_QLT. */ 52 | inline static UINT32 calc_last_1 (UINT32 last, UCHAR b) { 53 | return ( b | (last << 6) ) & RANGER_MASK_1; 54 | } 55 | inline static UINT32 calc_last_2 (UINT32 last, UCHAR b) { 56 | return ( b | (last << 6) ) & RANGER_MASK_2; 57 | } 58 | // inline static UINT32 calc_last_3 (UINT32 last, UCHAR b) { 59 | // return ( b | (last << 6) ) & RANGER_MASK_3; 60 | // // TODO: use delta 61 | // } /* calc_last_delta: 16-bit context built from q (bits 0-5), max(q1,q2) (bits 6-11), the flag q1==q2 (bit 12) and min(7, delta>>3) (bits 13-15). delta is an in/out accumulator that grows by q1-q whenever q is lower than q1. */ 62 | inline static UINT32 calc_last_delta(UINT32 &delta, UCHAR q, UCHAR q1, UCHAR q2) { 63 | 64 | // This brilliant code could only be of James Bonfield 65 | if (q1>q) 66 | delta += (q1-q); 67 | 68 | return 69 | ( q 70 | | ((q1 < q2 ? q2 : q1) << 6) 71 | | ((q1 == q2 ) << 12) 72 | | ((7 > (delta>>3) ? 
(delta>>3) : 7 ) << 13) 73 | ) & RANGER_MASK_2 ; 74 | } 75 | }; 76 | /* QltSave: quality-line encoder. save() dispatches on conf.level: 2 -> save_2, 3 and 4 -> save_3, anything else -> save_1. */ 77 | class QltSave : private QltBase { 78 | public: 79 | QltSave(); 80 | ~QltSave(); 81 | 82 | inline void save(const UCHAR* buf, size_t size) { 83 | switch(conf.level) { 84 | default: 85 | case 1: return save_1(buf, size); 86 | case 2: return save_2(buf, size); 87 | case 3: 88 | case 4: return save_3(buf, size); 89 | } 90 | } 91 | // void filer_init(); 92 | bool is_valid(); 93 | private: 94 | void save_1(const UCHAR* buf, size_t size); 95 | void save_2(const UCHAR* buf, size_t size); 96 | void save_3(const UCHAR* buf, size_t size); 97 | FilerSave* filer; 98 | struct { 99 | UINT32 extra_hi_qlt; 100 | } m_last; 101 | }; 102 | /* QltLoad: quality-line decoder. load() dispatches on conf.level exactly like QltSave::save() and returns the result of the chosen load_N. */ 103 | class QltLoad : private QltBase { 104 | public: 105 | QltLoad(); 106 | ~QltLoad(); 107 | 108 | inline UINT32 load(UCHAR* buffer, const size_t size) { 109 | switch(conf.level) { 110 | default: 111 | case 1: return load_1(buffer, size); 112 | case 2: return load_2(buffer, size); 113 | case 3: 114 | case 4: return load_3(buffer, size); 115 | } 116 | } 117 | bool is_valid(); 118 | private: 119 | UINT32 load_1 (UCHAR* buffer, const size_t size); 120 | UINT32 load_2 (UCHAR* buffer, const size_t size); 121 | UINT32 load_3 (UCHAR* buffer, const size_t size); 122 | 123 | FilerLoad* filer; 124 | }; 125 | 126 | #endif // FQ_QLTS_H 127 | -------------------------------------------------------------------------------- /usrs.hpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. 
*/ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 23 | /***********************************************************************************************************************/ 24 | 25 | #ifndef FQ_USRS_H 26 | #define FQ_USRS_H 27 | 28 | #include "common.hpp" 29 | #include "config.hpp" 30 | #include 31 | #include "xfile.hpp" 32 | 33 | // NOTE: soon there would be new machines, providing longer lines. 
34 | #define MAX_ID_LLEN 0x2000 35 | #define MAX_GN_LLEN 0x10000 36 | #define MAX_REC_LEN (2*(MAX_GN_LLEN + MAX_ID_LLEN)) 37 | /* UsrBase: declarations shared by the FASTQ reader (UsrSave) and writer (UsrLoad) - the exception kinds and the m_last bookkeeping record. */ 38 | class UsrBase { 39 | 40 | public: 41 | enum exception_t { 42 | ET_LLEN, 43 | ET_QLEN, 44 | ET_SOLPF_GEN, 45 | ET_SOLPF_QLT, 46 | 47 | ET_END 48 | } ; 49 | /* Input page geometry: PLL_SIZE bytes per page, with PLL_STRT (= MAX_REC_LEN) bytes kept in front of it. */ 50 | #define PLL_SIZE 0x100000 51 | #define PLL_STRT MAX_REC_LEN 52 | // PLL_STRT must be more than record size 53 | #define PLL_LAST (PLL_SIZE + PLL_STRT) 54 | 55 | struct { 56 | UINT64 i_llen; 57 | UINT64 i_qlen; 58 | UINT64 i_sgen; 59 | UINT64 i_sqlt; 60 | 61 | UINT64 i_long; 62 | 63 | UCHAR solid_pf_gen; 64 | UCHAR solid_pf_qlt; 65 | } m_last; 66 | }; 67 | /* UsrSave: encoder front end; encode() is the only public operation. The object embeds m_buff (PLL_LAST+10 bytes) and mp_last (MAX_REC_LEN bytes), so each instance is larger than 1 MiB. */ 68 | class UsrSave : public UsrBase { 69 | 70 | 71 | public: 72 | UsrSave(); 73 | ~UsrSave(); 74 | 75 | int encode(); 76 | 77 | private: 78 | bool get_record(); 79 | bool get_oversized_record(int cur, bool from_get=true); 80 | void load_page(); 81 | void update(exception_t type, UINT16 dat); 82 | 83 | inline void load_check(); 84 | inline UCHAR load_char(); 85 | inline void expect(UCHAR chr); 86 | bool mid_rec_msg() const ; 87 | void determine_record(); 88 | UINT64 estimate_rec_limit(); 89 | 90 | bool m_valid; 91 | UCHAR m_buff[PLL_LAST+10]; 92 | size_t m_page_count; 93 | int m_cur, m_end; 94 | FILE *m_in; 95 | bool first_cycle; 96 | int m_llen, m_qlen; 97 | bool m_solid; 98 | XFileSave* x_llen; 99 | XFileSave* x_qlen; 100 | 101 | XFileSave* x_sgen; 102 | XFileSave* x_sqlt; 103 | XFileSave* x_lgen; 104 | XFileSave* x_lqlt; 105 | XFileSave* x_lrec; 106 | struct { 107 | UCHAR* rec; 108 | UCHAR* rec_end; 109 | UCHAR* prev_rec; 110 | UCHAR* prev_rec_end; 111 | UCHAR* gen; 112 | UCHAR* qlt; 113 | } mp; 114 | UCHAR mp_last[MAX_REC_LEN]; 115 | }; 116 | /* UsrLoad: decoder back end; decode() is the only public operation. */ 117 | class UsrLoad : public UsrBase { 118 | public: 119 | UsrLoad(); 120 | ~UsrLoad(); 121 | int decode(); 122 | 123 | private: 124 | void save(); 125 | void update(); 126 | void putline(UCHAR* buf, UINT32 size); 127 | 128 | FILE *m_out; 129 | 130 | size_t m_llen, m_llen_factor, 
m_qlen; 131 | // UINT64 m_rec_total; 132 | bool m_2nd_rec, m_solid; 133 | UINT32 m_rec_size; 134 | long comp_version; 135 | 136 | bool flip; 137 | UCHAR m_rep[MAX_ID_LLEN + 1 ]; 138 | UCHAR m_rec[MAX_ID_LLEN + 1 ]; 139 | UCHAR m_qlt[MAX_GN_LLEN + 2 ]; 140 | UCHAR m_gen[MAX_GN_LLEN + 4 ]; 141 | UCHAR * m_qlt_ptr; 142 | UCHAR * m_gen_ptr; 143 | 144 | XFileLoad* x_llen; 145 | XFileLoad* x_qlen; 146 | XFileLoad* x_sgen; 147 | XFileLoad* x_sqlt; 148 | XFileLoad* x_lgen; 149 | XFileLoad* x_lqlt; 150 | XFileLoad* x_lrec; 151 | }; 152 | 153 | #endif 154 | -------------------------------------------------------------------------------- /tools/molder.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Written by Josef Ezra during R&D phase. 3 | // Please do not use without author's explicit permission. 4 | // 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include "common.hpp" 15 | #include "pager.hpp" 16 | 17 | static const int version = 1; // internal version 18 | #define PR_BASE 5 19 | // 31 bits prime = 0x7fffffff, 32 - 0xfffffffb 20 | #define PR_MODL 0xfffffffb 21 | #define PR_POWR 0x86eed05a 22 | 23 | 24 | struct eqstr { bool operator()(UINT32 s1, UINT32 s2) const { return (s1 == s2) ; } }; 25 | 26 | #define HASH_DENSE 1 27 | #ifdef HASH_DENSE 28 | #include 29 | typedef google::dense_hash_map, eqstr> my_hash; 30 | #else 31 | #include 32 | typedef google::sparse_hash_map, eqstr> my_hash; 33 | #endif 34 | #include 35 | /* croak(msg): print msg (with strerror(errno) when errno is set) to stderr and exit(1). */ 36 | void croak(const char* msg) { 37 | if (errno) 38 | fprintf(stderr, "%s: %s\n", msg, strerror(errno)); 39 | else 40 | fprintf(stderr, "%s\n", msg); 41 | exit(1); 42 | } 43 | /* croak(fmt, num): print fmt with a single numeric argument to stderr and exit(1). */ 44 | void croak(const char* fmt, long long num) { 45 | fprintf(stderr, fmt, num); 46 | fprintf(stderr, "\n"); 47 | exit(1); 48 | } 49 | 50 | void usage() { 51 | printf("\ 52 | TODO : add usage \n\ 53 | \n\ 54 | "); 55 | } 56 | /* check_fh: exit(1) with a diagnostic when a file handle is null. */ 57 | static void check_fh(FILE* f, const char* 
name, bool read=false) { 58 | if (f) return; 59 | fprintf(stderr, "Can't %s file '%s': %s\n", 60 | read?"read":"write", name, strerror(errno)); 61 | exit(1); 62 | } 63 | /* check_op: exit(1) when the mandatory option -chr was not supplied. */ 64 | static void check_op(int something, char chr) { 65 | if (something) return; 66 | fprintf(stderr, "Missing essential argument: -%c\n", chr); 67 | exit(1); 68 | } 69 | /* withsuffix: returns a strdup()-ed copy of str + suffix; the caller owns (and here never frees) the result. */ 70 | const char* withsuffix(std::string str, const char* suffix) { 71 | str += suffix; 72 | return strdup(str.c_str()); 73 | } 74 | 75 | // using google::dense_hash_map; 76 | /* Molder: reads a FASTA-like stream and keeps a rolling hash over the last LLINE bases (codes A=0 C=1 G=2 T=3); every hash value not seen before is stored in `all` together with a running offset. The whole run happens inside the constructor. */ 77 | class Molder { 78 | FILE *f_usr, *f_key, *f_seq, *f_rec; 79 | PagerSave* pgkey; 80 | PagerSave* pgseq; 81 | 82 | // TODO: use google's sparsehash, 83 | // where molder uses sparse_hash_set 84 | // and izg-fq uses dense_hash_map 85 | // std::map all; 86 | my_hash all; 87 | 88 | UINT32 f_offs, c_offs; 89 | UINT32 c_bases; 90 | 91 | // TODO: verify no fastq file holds lines shorter than this one 92 | #define LLINE 76 93 | 94 | UCHAR raw_a [LLINE] ; 95 | UCHAR raw_i ; 96 | UINT64 rabin; // TODO: rrabin inplace 97 | void add_fc(UCHAR c) { /* slide the window: multiply by PR_BASE, add c, subtract the contribution of the base that leaves the window; the result stays below PR_MODL */ 98 | UINT64 oldc = UINT64(raw_a[raw_i]) * PR_POWR % PR_MODL; /* widen first: UCHAR * PR_POWR is otherwise a 32-bit unsigned product that wraps before the modulo */ 99 | rabin *= PR_BASE; 100 | rabin += raw_a[raw_i] = c; 101 | rabin %= PR_MODL; 102 | if (rabin < oldc) 103 | rabin += PR_MODL; 104 | rabin -= oldc; 105 | ++ raw_i %= LLINE; 106 | ++ c_bases; 107 | } 108 | bool exists() { 109 | my_hash::iterator it = all.find(rabin); 110 | return it != all.end(); 111 | } 112 | void add_c(UCHAR c) { 113 | // if unique key, save key + cur c 114 | // calc next key 115 | if (not exists()) { 116 | // UINT64 tmp = (key << 32) | f_offs; 117 | // pgseq->put(tmp); 118 | // fputc(c, f_seq); 119 | all[rabin] = f_offs++; 120 | } 121 | add_fc(c); 122 | } /* process: consume the rest of the input. '>' header lines, newlines and N are skipped; any other non-ACGT byte aborts through croak(). */ 123 | void process() { 124 | for(int 125 | c = fgetc(f_usr); 126 | c != EOF; 127 | c = fgetc(f_usr)) { 128 | int val = 4; 129 | switch(c) { 130 | case '>': 131 | while (c != '\n' and c != EOF) c = fgetc(f_usr); /* stop at EOF too: a header on the last line without a newline used to loop forever */ 132 | break; 133 | case '\n': case 'N': case 'n': break; 134 | default: croak("unexpected value: %c", 
c); /* croak() exits, so the fall-through below is never taken */ 135 | case 'a': case 'A' : val = 0; break; 136 | case 'c': case 'C' : val = 1; break; 137 | case 'g': case 'G' : val = 2; break; 138 | case 't': case 'T' : val = 3; break; 139 | } 140 | if (val < 4) /* 4 is the skip marker; codes 0..3 match init_raw(). The previous 1..4 coding with val < 3 silently dropped every G and T. */ 141 | add_c(val); 142 | } 143 | } /* init_raw: prime the window with the first LLINE bases without recording any key. */ 144 | void init_raw() { 145 | for (int i = 0, c = fgetc(f_usr); 146 | i < LLINE and c != EOF; 147 | c = fgetc(f_usr)) { 148 | UCHAR val = 4; 149 | switch(c) { 150 | case '>': while (c != '\n' and c != EOF) c = fgetc(f_usr); continue; 151 | case '\n': case 'N': case 'n': continue; 152 | default: croak("unexpected value: %c", c); continue; 153 | 154 | case 'a': case 'A' : val = 0; break; 155 | case 'c': case 'C' : val = 1; break; 156 | case 'g': case 'G' : val = 2; break; 157 | case 't': case 'T' : val = 3; break; 158 | } 159 | assert (val < 4); 160 | i ++; 161 | add_fc(val); 162 | } 163 | } 164 | public: 165 | Molder(const char* usr, const char* key, const char* seq) { 166 | 167 | #ifdef HASH_DENSE 168 | all.set_empty_key(0xffffffff); /* the empty key must never be inserted: hash values are always below PR_MODL (0xfffffffb), whereas 0 is a legal hash (e.g. a window of A's) */ 169 | #endif 170 | 171 | f_usr = *usr ? fopen(usr, "rb") : stdin ; 172 | check_fh(f_usr, usr, true); 173 | const char *write_fl = "wb"; // TODO: "ab" 174 | 175 | f_key = fopen(key, write_fl); 176 | check_fh(f_key, key); 177 | pgkey = new PagerSave(f_key); 178 | 179 | f_seq = fopen(seq, write_fl); 180 | check_fh(f_seq, seq); 181 | 182 | c_bases = 0; 183 | // init keys 184 | f_offs = ftell(f_seq); 185 | c_offs = 0; 186 | 187 | rabin = 0; 188 | BZERO(raw_a); 189 | raw_i = 0; 190 | 191 | init_raw(); 192 | process(); 193 | } 194 | ~Molder() { 195 | fprintf(stdout, "%u bases, %lu keys\n", c_bases, all.size()); 196 | } 197 | 198 | }; 199 | /* main: -u input file (stdin when absent), -f output base name (mandatory), -v version, -h usage. */ 200 | int main(int argc, char** argv) { 201 | 202 | std::string usr, fil; 203 | const char* short_opt = "hvf:u:"; 204 | for ( int opt = getopt(argc, argv, short_opt); 205 | opt != -1; 206 | opt = getopt(argc, argv, short_opt)) 207 | switch(opt) { 208 | case 'u': usr = optarg ; break; 209 | case 'f': fil = optarg ; break; 210 | case 'v': printf("Version: %u\n", version); exit(0); 211 | case 'h': 
usage(); exit(0); 212 | }; 213 | 214 | check_op(fil.length(), 'f'); 215 | 216 | const char *key = withsuffix(fil, ".key"); 217 | const char *seq = withsuffix(fil, ".seq"); 218 | // const char *rec = withsuffix(fil, ".rec"); 219 | Molder mold(usr.c_str(), key, seq); 220 | return 0; 221 | } 222 | 223 | 224 | -------------------------------------------------------------------------------- /qlts.cpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | #include "qlts.hpp" 27 | #include 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | /* Number of Log64Ranger contexts to allocate: RANGER_SIZE_1 at conf.level 1, RANGER_SIZE_2 at every other level. */ 34 | size_t QltBase::ranger_cnt() { 35 | return 36 | conf.level == 1 ? RANGER_SIZE_1 : 37 | // conf.level == 3 ? RANGER_SIZE_3 : 38 | RANGER_SIZE_2 ; 39 | } 40 | 41 | // void QltBase::range_init() { 42 | // // bzero(ranger, sizeof(ranger[0]) * ranger_cnt()); 43 | // } 44 | /* Opens the "qlt" output stream, allocates the context table and attaches the range coder to the stream. */ 45 | QltSave::QltSave() { 46 | filer = new FilerSave("qlt"); 47 | ranger = new Log64Ranger[ranger_cnt()]; 48 | assert(filer); 49 | assert(ranger); 50 | rcoder.init(filer); 51 | BZERO(m_last); m_valid = true; /* nothing else sets m_valid on the save side, yet is_valid() reads it; validity then rests on the filer alone */ 52 | // range_init(); 53 | } 54 | /* Flushes the range coder, reports the number of escaped (>= LAST_QLT) symbols unless conf.quiet, then releases the table and the stream. */ 55 | QltSave::~QltSave() { 56 | rcoder.done(); 57 | if (not conf.quiet) { 58 | if (m_last.extra_hi_qlt) { 59 | conf.set_info("qlt.extra.hi", m_last.extra_hi_qlt); 60 | } 61 | } 62 | 63 | delete [] ranger; 64 | delete filer; 65 | } 66 | 67 | bool QltSave::is_valid() { 68 | return 69 | m_valid and 70 | filer and 71 | filer->is_valid(); 72 | } 73 | /* save_1 / save_2: encode one quality line. Each byte is reduced by '!'; values below LAST_QLT are coded directly in the current context, anything else is coded as the escape LAST_QLT followed by the real value through exranger. The context is advanced with calc_last_1 / calc_last_2. */ 74 | void QltSave::save_1(const UCHAR* buf, size_t size) { 75 | UINT32 last = 0; 76 | for (const UCHAR* p = buf ; p < buf + size; p++) { 77 | UCHAR b = UCHAR(*p-'!'); 78 | 79 | PREFETCH(ranger + last); 80 | likely_if(b < LAST_QLT) 81 | ranger[last].put(&rcoder, b); 82 | else { 83 | ranger[last].put(&rcoder, LAST_QLT); 84 | exranger.put(&rcoder, 
b); 85 | m_last.extra_hi_qlt ++ ; 86 | } 87 | last = calc_last_1(last, b); 88 | } 89 | } 90 | 91 | void QltSave::save_2(const UCHAR* buf, size_t size) { 92 | UINT32 last = 0; 93 | for (const UCHAR* p = buf ; p < buf + size; p++) { 94 | UCHAR b = UCHAR(*p-'!'); 95 | 96 | PREFETCH(ranger + last); 97 | likely_if(b < LAST_QLT) 98 | ranger[last].put(&rcoder, b); 99 | else { 100 | ranger[last].put(&rcoder, LAST_QLT); 101 | exranger.put(&rcoder, b); 102 | m_last.extra_hi_qlt ++ ; 103 | } 104 | last = calc_last_2(last, b); 105 | } 106 | } 107 | /* save_3: same symbol coding as save_1/save_2, but the context comes from calc_last_delta. q1 and q2 are overwritten alternately so that the call always receives (current, previous, one-before-previous). */ 108 | void QltSave::save_3(const UCHAR* buf, size_t size) { 109 | UINT32 last = 0; 110 | UINT32 delta = 5; 111 | UCHAR q1 = 0, q2 = 0; 112 | UINT32 di = 0; 113 | 114 | for (const UCHAR* p = buf ; p < buf + size; p++) { 115 | UCHAR b = UCHAR(*p-'!'); 116 | 117 | PREFETCH(ranger + last); 118 | likely_if(b < LAST_QLT) 119 | ranger[last].put(&rcoder, b); 120 | else { 121 | // croak("This qlt rate makes no sence: '%c' (record %d, pos %d)", *p, g_record_count, p-buf); 122 | ranger[last].put(&rcoder, LAST_QLT); 123 | exranger.put(&rcoder, b); 124 | m_last.extra_hi_qlt ++ ; 125 | } 126 | 127 | if (++ di & 1) { 128 | last = calc_last_delta(delta, b, q1, q2); 129 | q2 = b; 130 | } 131 | else { 132 | last = calc_last_delta(delta, b, q2, q1); 133 | q1 = b; 134 | } 135 | } 136 | } 137 | 138 | ////////// 139 | // load // 140 | ////////// 141 | /* Opens the "qlt" input stream (the filer receives the address of m_valid), allocates the context table and attaches the range coder. */ 142 | QltLoad::QltLoad() { 143 | filer = new FilerLoad("qlt", &m_valid); 144 | ranger = new Log64Ranger[ranger_cnt()]; 145 | 146 | rcoder.init(filer); 147 | // range_init(); 148 | } 149 | 150 | QltLoad::~QltLoad() { 151 | rcoder.done(); 152 | delete [] ranger; 153 | delete filer; 154 | } 155 | 156 | bool QltLoad::is_valid() { 157 | return 158 | m_valid and 159 | filer and 160 | filer->is_valid(); 161 | } 162 | /* load_1 / load_2 / load_3: decode `size` quality bytes into buf, mirroring save_1 / save_2 / save_3, then store '\n' at buf[size] - the buffer therefore needs size+1 bytes. Return size, or 0 when m_valid is false. */ 163 | UINT32 QltLoad::load_1(UCHAR* buf, const size_t size) { 164 | 165 | UINT32 last = 0 ; 166 | 167 | for (UCHAR* p = buf; p < buf + size ; p++) { 168 | UCHAR b = ranger[last].get(&rcoder); 169 | 
170 | rarely_if(b == LAST_QLT) 171 | b = exranger.get(&rcoder); 172 | 173 | *p = UCHAR('!' + b); 174 | 175 | last = calc_last_1(last, b); 176 | } 177 | buf[size] = '\n'; 178 | return m_valid ? size : 0; 179 | } 180 | 181 | UINT32 QltLoad::load_2(UCHAR* buf, const size_t size) { 182 | 183 | UINT32 last = 0 ; 184 | for (UCHAR* p = buf; p < buf + size ; p++) { 185 | PREFETCH(ranger + last); 186 | UCHAR b = ranger[last].get(&rcoder); 187 | 188 | rarely_if(b == LAST_QLT) 189 | b = exranger.get(&rcoder); 190 | 191 | *p = UCHAR('!' + b); 192 | 193 | last = calc_last_2(last, b); 194 | } 195 | buf[size] = '\n'; 196 | return m_valid ? size : 0; 197 | } 198 | 199 | UINT32 QltLoad::load_3(UCHAR* buf, const size_t size) { 200 | 201 | UINT32 last = 0 ; 202 | UINT32 delta = 5; 203 | UCHAR q1 = 0, q2 = 0; 204 | UINT32 di = 0; 205 | 206 | for (UCHAR* p = buf; p < buf + size ; p++) { 207 | PREFETCH(ranger + last); 208 | UCHAR b = ranger[last].get(&rcoder); 209 | 210 | rarely_if(b == LAST_QLT) 211 | // croak("extra high qlt. Record %d, pos %d", g_record_count, p-buf); 212 | b = exranger.get(&rcoder); 213 | // likely_if(b < LAST_QLT) 214 | // *p = UCHAR('!' + b); 215 | // else { 216 | // b = exranger.get(&rcoder); 217 | // *p = UCHAR('!' + b); 218 | // } 219 | *p = UCHAR('!' + b); 220 | 221 | if (++di & 1) { 222 | last = calc_last_delta(delta, b, q1, q2); 223 | q2 = b; 224 | } 225 | else { 226 | last = calc_last_delta(delta, b, q2, q1); 227 | q1 = b; 228 | } 229 | } 230 | buf[size] = '\n'; 231 | // buf[size+1] = 0; 232 | // printf("%llu: %s", g_record_count, buf); 233 | return m_valid ? 
size : 0; 234 | } 235 | 236 | -------------------------------------------------------------------------------- /power_ranger.hpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 23 | /***********************************************************************************************************************/ 24 | 25 | // Based on James Bonfield's fqz_comp 26 | 27 | #ifndef ZP_POWER_RANGER 28 | #define ZP_POWER_RANGER 29 | 30 | #ifndef ZP_RANGECODER_H 31 | #include "coder.hpp" 32 | #endif 33 | 34 | #include 35 | /* PowerRanger: adaptive frequency model over byte symbols. syms[] maps slot -> symbol, freq[] holds the slot counts, iend is the number of slots in use and total the sum of freq[]. Every slot is coded with an extra +1 (hence total + NSYM), so a symbol with a zero count is still codable. */ 36 | class PowerRanger { 37 | enum { 38 | STEP=14, 39 | NSYM=256, 40 | MAX_FREQ=(1<<15)-32, 41 | }; 42 | 43 | UINT32 total; 44 | UINT16 freq[NSYM]; 45 | UINT16 iend ; 46 | 47 | UCHAR count; 48 | UCHAR syms[NSYM]; 49 | /* Halve every count in use and recompute total. */ 50 | void normalize() { 51 | for (UINT32 i = total = 0; i < iend; i++) 52 | total += (freq[i] >>= 1 ); 53 | } 54 | /* Swap slot i with slot i-1 (symbol and count) and return the symbol that was in slot i. Requires i > 0. */ 55 | inline UCHAR down_level(int i) { 56 | 57 | UCHAR t = syms[i ]; 58 | syms[i] = syms[i-1]; 59 | syms[i-1] = t; 60 | 61 | UINT16 f = freq[i]; 62 | freq[i] = freq[i-1]; 63 | freq[i-1] = f; 64 | 65 | return t; 66 | } 67 | /* Add STEP to slot i and return its symbol. Near MAX_FREQ the counts are halved first, except that a slot 0 holding almost the whole total is left untouched. On every 16th update a slot whose count exceeds its predecessor's moves one position towards the front. */ 68 | inline UCHAR update_freq(int i) { 69 | 70 | rarely_if(freq[i] > (MAX_FREQ - STEP)) { 71 | if (i == 0 and 72 | freq[i] + 256U > total) 73 | return syms[i]; 74 | 75 | normalize(); 76 | } 77 | 78 | freq[i] += STEP; 79 | total += STEP; 80 | 81 | return LIKELY (i == 0 or 82 | (++ count & 0xf) or 83 | freq[i] <= freq[i-1] ) ? 
84 | syms[i] : 85 | down_level(i); 86 | } 87 | 88 | public: 89 | PowerRanger() { 90 | bzero(this, sizeof(*this)); 91 | } 92 | /* put: encode sym. Slots up to sym are created on first use (slot k initially holds symbol k), the slot of sym is found by linear search, and its range is handed to the coder. */ 93 | void put(RCoder *rc, UCHAR sym) { 94 | UINT32 sumf = 0; 95 | UINT32 i = 0; 96 | 97 | rarely_if(iend <= sym) 98 | for (;iend <= sym; iend++) 99 | syms[iend] = iend; 100 | 101 | for (; syms[i] != sym; sumf += freq[i++]); 102 | 103 | rc->Encode(sumf+i, freq[i]+1, total + NSYM); 104 | 105 | update_freq(i); 106 | } 107 | /* get: decode one symbol; creates slots in the same order as put() while scanning for the slot whose range contains the coder's value. NOTE(review): if GetFreq ever returned a value >= vtot the scan would end with i == NSYM and freq[i] would be read out of bounds - presumably RCoder guarantees a smaller value; confirm. */ 108 | UINT16 get(RCoder *rc) { 109 | 110 | UINT32 vtot = total + NSYM; // - iend; 111 | UINT32 sumf = 0; 112 | UINT32 i; 113 | 114 | UINT32 prob = rc->GetFreq(vtot); 115 | for ( i = 0; 116 | i < NSYM; 117 | i ++ ) { 118 | 119 | rarely_if(iend == i) 120 | syms[ iend++ ] = i; 121 | 122 | if (sumf + freq[i] + 1 <= prob) 123 | sumf += freq[i] + 1; 124 | else 125 | break; 126 | } 127 | rc->Decode(sumf, freq[i]+1, vtot); 128 | 129 | return update_freq(i); 130 | } 131 | } PACKED; 132 | /* PowerRangerU: unsigned integer coder with one PowerRanger per byte position. put_u layout: one byte for values up to 0x7f; two bytes (0x80 | high, low) below 0x7ffe; 0xff 0xfe plus 4 little-endian bytes below 2^32; 0xff 0xff plus 8 little-endian bytes otherwise. Returns true for the two long forms. */ 133 | class PowerRangerU { 134 | 135 | PowerRanger p[14]; 136 | 137 | public: 138 | bool put_u(RCoder *rc, UINT64 num) { 139 | 140 | likely_if(num <= 0x7f) { 141 | p[0].put(rc, 0xff & num); 142 | return false; 143 | } 144 | likely_if (num < 0x7ffe) { 145 | p[0].put(rc, 0xff & (0x80 | (num>>8))); 146 | p[1].put(rc, 0xff & num); 147 | return false; 148 | } 149 | p[0].put(rc, 0xff); 150 | if (num < 1ULL<<32) { 151 | p[1].put(rc, 0xfe); 152 | for (int shift=0, i=2; shift < 32; shift+=8, i++) 153 | p[i].put(rc, 0xff & (num>>shift)); 154 | return true; 155 | } 156 | { 157 | p[1].put(rc, 0xff); 158 | for (int shift=0, i=6; shift < 64; shift+=8, i++) 159 | p[i].put(rc, 0xff & (num>>shift)); 160 | return true; 161 | } 162 | } 163 | /* get_u: inverse of put_u. NOTE(review): in this copy the text after "num |= (c" on line 178 up to "= -0x80+3" on line 201 appears to be missing (the tail of get_u, the end of PowerRangerU and the head of the signed coder that owns put_i / get_i); restore it from the upstream file before building. */ 164 | UINT64 get_u(RCoder *rc) { 165 | 166 | UINT64 num = p[0].get(rc); 167 | rarely_if(num > 0x7f) { 168 | 169 | num <<= 8; 170 | num |= p[1].get(rc); 171 | likely_if(num < 0xfffe) 172 | num &= 0x7fff; 173 | 174 | else likely_if (num == 0xfffe) { 175 | num = 0; 176 | for (int shift=0, i = 2; shift < 32; shift+=8, i++) { 177 | 
UINT64 c = p[i].get(rc); 178 | num |= (c<= -0x80+3 and 202 | num <= 0x7f) { 203 | p[0].put(rc, num & 0xff); 204 | return false; 205 | } 206 | likely_if (num >= -0x8000 and 207 | num <= 0x7fff) { 208 | p[0].put(rc, 0x80); 209 | p[1].put(rc, 0xff & (num)); 210 | p[2].put(rc, 0xff & (num>>8)); 211 | return false; 212 | } 213 | if (num >= -0x80000000LL and 214 | num <= 0x7fffffffLL ) { 215 | p[0].put(rc, 0x81); 216 | for (int shift = 0, i=3; shift < 32; shift+=8, i++) 217 | p[i].put(rc, 0xff&(num>>shift)); 218 | return true; 219 | } 220 | { 221 | p[0].put(rc, 0x82); 222 | for (int shift = 0, i=7; shift < 64; shift+=8, i++) 223 | p[i].put(rc, 0xff&(num>>shift)); 224 | 225 | return true; 226 | } 227 | } 228 | /* get_i: inverse of put_i. First byte 0x80 / 0x81 / 0x82 announces a 2- / 4- / 8-byte little-endian value; any other byte is the value itself, sign-extended from 8 bits. */ 229 | long long get_i(RCoder* rc) { 230 | long long num = p[0].get(rc); 231 | 232 | rarely_if(num == 0x80 ) { 233 | num = p[1].get(rc); 234 | num |= p[2].get(rc) << 8; 235 | num = (short) num; 236 | } 237 | else rarely_if(num == 0x81) { 238 | num = 0; 239 | for (int shift = 0, i=3; shift < 32; shift+=8, i++) { 240 | UINT32 c = p[i].get(rc); /* unsigned: shifting an int holding 0x80..0xff left by 24 overflows */ 241 | num |= c << shift; 242 | } 243 | num = (int) num; 244 | } 245 | else rarely_if(num == 0x82) { 246 | num = 0; 247 | for (int shift = 0, i=7; shift < 64; shift+=8, i++) { 248 | UINT64 c = p[i].get(rc); 249 | num |= c << shift; 250 | } 251 | } 252 | else if (0x80&num) 253 | num = (signed char)num; /* plain char is unsigned on some ABIs (e.g. ARM Linux), where the old cast never produced a negative value */ 254 | 255 | return num ; 256 | } 257 | 258 | } PACKED ; 259 | 260 | #endif 261 | -------------------------------------------------------------------------------- /gens.cpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. 
*/ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | 27 | 28 | #include "gens.hpp" 29 | #include "config.hpp" 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | 36 | size_t GenBase::ranger_cnt() { 37 | switch(conf.level) { 38 | case 1: return BRANGER_SIZE_1; 39 | default: 40 | case 2: return BRANGER_SIZE_2; 41 | case 3: return BRANGER_SIZE_3; 42 | case 4: return BRANGER_SIZE_4; 43 | } 44 | } 45 | 46 | void GenBase::range_init() { 47 | // bzero(ranger, sizeof(ranger[0])* cnt); 48 | // memset(ranger, 2, sizeof(ranger[0])*BRANGER_SIZE); - how could it be slower than nantive constructor? I would expect 'memset' implementation to use pipes 49 | } 50 | 51 | ////////// 52 | // save // 53 | ////////// 54 | 55 | static char gencodes[256]; 56 | 57 | GenSave::GenSave() { 58 | 59 | BZERO(m_stats); 60 | BZERO(m_last); 61 | m_N_byte = 0; 62 | 63 | filer = new FilerSave("gen"); 64 | assert(filer); 65 | rcoder.init(filer); 66 | ranger = new Base2Ranger[ranger_cnt()]; 67 | range_init(); 68 | 69 | x_Ns = new XFileSave("gen.Ns"); 70 | x_Nn = new XFileSave("gen.Nn"); 71 | 72 | memset(gencodes, 0x10, sizeof(gencodes)); 73 | gencodes[(UINT8)'0'] = gencodes[(UINT8)'A'] = gencodes[(UINT8)'a'] = 0; 74 | gencodes[(UINT8)'1'] = gencodes[(UINT8)'C'] = gencodes[(UINT8)'c'] = 1; 75 | gencodes[(UINT8)'2'] = gencodes[(UINT8)'G'] = gencodes[(UINT8)'g'] = 2; 76 | gencodes[(UINT8)'3'] = gencodes[(UINT8)'T'] = gencodes[(UINT8)'t'] = 3; 77 | gencodes[(UINT8)'.'] = gencodes[(UINT8)'N'] = gencodes[(UINT8)'n'] = 4; 78 | } 79 | 80 | GenSave::~GenSave() { 81 | rcoder.done(); 82 | if (not conf.quiet) { 83 | } 84 | delete []ranger; 85 | delete filer; 86 | 87 | DELETE(x_Ns); 88 | DELETE(x_Nn); 89 | } 90 | 91 | void GenSave::bad_q_or_bad_n(UCHAR gen, UCHAR qlt, bool bad_n, bool bad_q) 92 | { 93 | if (not bad_n) { 94 | // bad_q only 95 | x_Nn->put(g_genofs_count - m_last.Nn_index); 96 | m_last.Nn_index = 
g_genofs_count; 97 | return; 98 | } 99 | 100 | rarely_if(not m_N_byte) { 101 | // TODO: make a single m_last - exceptions file 102 | m_N_byte = gen; 103 | if ('N' != gen) 104 | conf.set_info("gen.N_byte", gen); 105 | } 106 | // TODO: eliminate this temp sanity 107 | rarely_if (gen != m_N_byte) 108 | croak("switched N_byte: %c", gen); 109 | 110 | if (not bad_q) { 111 | x_Ns->put(g_genofs_count - m_last.Ns_index); 112 | m_last.Ns_index = g_genofs_count ; 113 | } 114 | } 115 | 116 | inline UCHAR GenSave::normalize_gen(UCHAR gen, UCHAR qlt) { 117 | 118 | bool bad_n; // = false; 119 | const bool bad_q = qlt == '!'; 120 | 121 | UCHAR n = gencodes[gen]; 122 | likely_if (n <= 3) 123 | bad_n = false; 124 | else { 125 | rarely_if( n > 4) 126 | croak("unexpected genome char: %c", gen); 127 | bad_n = true; 128 | n = 0; 129 | } 130 | g_genofs_count ++; 131 | 132 | rarely_if(bad_n or bad_q) 133 | bad_q_or_bad_n(gen, qlt, bad_n, bad_q); 134 | 135 | return n; 136 | } 137 | 138 | void GenSave::save_x(const UCHAR* gen, const UCHAR* qlt, UINT64 size, const UINT64 mask) { 139 | UINT32 last = 0x007616c7; 140 | const UCHAR* g = gen; const UCHAR* q = qlt; 141 | for (; g < gen + size ; g++, q++) { 142 | UCHAR n = normalize_gen(*g, *q); 143 | last &= mask; 144 | PREFETCH(ranger + last); 145 | ranger[last].put(&rcoder, n); 146 | last = ((last<<2) | n); 147 | } 148 | } 149 | 150 | void GenSave::save_x(const UCHAR* gen, const UCHAR* qlt, UINT64 llen, UINT64 qlen, const UINT64 mask) { 151 | UINT32 last = 0x007616c7; 152 | for (UINT32 i = 0; i < llen; i++) { 153 | UCHAR n = normalize_gen(gen[i], i < qlen ? 
qlt[i] : 40); 154 | last &= mask; 155 | PREFETCH(ranger + last); 156 | ranger[last].put(&rcoder, n); 157 | last = ((last<<2) | n); 158 | } 159 | } 160 | 161 | ////////// 162 | // load // 163 | ////////// 164 | 165 | GenLoad::GenLoad() { 166 | BZERO(m_last); 167 | m_valid = true; 168 | 169 | m_N_byte = conf.get_long("gen.N_byte", 'N') ; 170 | bool is_solid = conf.get_bool("usr.solid"); 171 | bool is_lowercase = false; 172 | 173 | m_gencode = 174 | is_solid ? 175 | "0123" : 176 | is_lowercase ? 177 | "acgt" : 178 | "ACGT" ; 179 | 180 | filer = new FilerLoad("gen", &m_valid); 181 | assert(filer); 182 | rcoder.init(filer); 183 | ranger = new Base2Ranger[ranger_cnt()]; 184 | range_init(); 185 | 186 | x_Ns = new XFileLoad("gen.Ns"); 187 | x_Nn = new XFileLoad("gen.Nn"); 188 | m_last.Ns_index = x_Ns->get(); 189 | m_last.Nn_index = x_Nn->get(); 190 | } 191 | 192 | GenLoad::~GenLoad() { 193 | rcoder.done(); 194 | delete []ranger; 195 | delete filer; 196 | DELETE(x_Ns); 197 | DELETE(x_Nn); 198 | } 199 | 200 | void GenLoad::normalize_gen(UCHAR & gen, UCHAR qlt) { 201 | g_genofs_count ++ ; 202 | 203 | rarely_if (m_last.Nn_index == g_genofs_count) 204 | m_last.Nn_index += x_Nn->get(); 205 | 206 | else rarely_if (qlt == '!') 207 | gen = m_N_byte; 208 | 209 | else rarely_if (m_last.Ns_index == g_genofs_count) { 210 | gen = m_N_byte; 211 | m_last.Ns_index += x_Ns->get(); 212 | } 213 | } 214 | 215 | UINT32 GenLoad::load_x(UCHAR* gen, const UCHAR* qlt, UINT64 size, const UINT64 mask) { 216 | 217 | UINT32 last = 0x007616c7; 218 | 219 | UCHAR* g = gen; const UCHAR* q = qlt; 220 | for (; g < gen + size ; g++, q++) { 221 | last &= mask ; 222 | PREFETCH(ranger + last); 223 | UCHAR b = ranger[last].get(&rcoder); 224 | 225 | *g = m_gencode [ b ]; 226 | last = ((last<<2) + b); 227 | normalize_gen(*g, *q); 228 | } 229 | 230 | return m_valid ? 
size : 0; 231 | } 232 | 233 | UINT32 GenLoad::load_x(UCHAR* gen, const UCHAR* qlt, UINT64 llen, UINT64 qlen, const UINT64 mask) { 234 | // rare: mismatch qlt/gen line sizes. This typically happens when padding the gen line. In this 235 | // case, assume valid base. 236 | UINT32 last = 0x007616c7; 237 | 238 | for (UINT32 i = 0; i < llen; i++) { 239 | last &= mask ; 240 | PREFETCH(ranger + last); 241 | UCHAR b = ranger[last].get(&rcoder); 242 | 243 | gen[i] = m_gencode[ b ]; 244 | last = ((last<<2) + b); 245 | normalize_gen(gen[i], i < qlen ? qlt[i] : 40); 246 | } 247 | 248 | return m_valid ? llen : 0; 249 | } 250 | -------------------------------------------------------------------------------- /tools/slimfastq.multi: -------------------------------------------------------------------------------- 1 | # -*- cperl -*- 2 | 3 | # This program was written by Josef Ezra , # 4 | # Copyright (c) 2019, Infinidat # 5 | # All rights reserved. # 6 | # # 7 | # Redistribution and use in source and binary forms, with or without modification, are permitted provided that # 8 | # the following conditions are met: # 9 | # # 10 | # Redistributions of source code must retain the above copyright notice, this list of conditions and the following # 11 | # disclaimer. # 12 | # Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following # 13 | # disclaimer in the documentation and/or other materials provided with the distribution. # 14 | # Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products # 15 | # derived from this software without specific prior written permission. # 16 | 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, # 18 | # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # 19 | # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # 20 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # 21 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, # 22 | # WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE # 23 | # USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # 24 | 25 | # ********** README ************ 26 | # This script was written with easy editing, by non perl experts, in mind. If you need to 27 | # change anything - go for it! 28 | 29 | # /usr/bin/perl is the default perl location on most unix systems. 30 | eval 'exec /usr/bin/perl $0 $*' 31 | if 0 ; 32 | 33 | use 5.6.0 ; 34 | use warnings ; 35 | use strict ; 36 | use integer ; 37 | use bytes ; 38 | 39 | use threads ; 40 | use threads::shared ; 41 | use File::Basename; 42 | use Getopt::Long; 43 | use List::Util 'first' ; 44 | 45 | sub helpthem() { die < : default is compress/decompress in the same dir 55 | -f -fq_suffix : comma separated list, default is '.fastq,.fq' (decompression uses the first in list) 56 | -s -sfq_suffix : default is '.sfq' 57 | -c -count : concurrent parallel compressions, default is based on 'lscpu' command or 8 (if not available) 58 | -e -exec : slimfastq command location (default searched in \$PATH, script's dir, and one up). 59 | -v -verbose : log to stdout 60 | -help : show this message 61 | 62 | Examples: 63 | $0 -t my/dir/SFQ my/dir/FQ : compress all '*.fastq', '*.fq' files in my/dir/FQ into my/dir/SFQ/*.sfq 64 | $0 -d -t my/dir/FQ my/dir/SFQ : decompress all '*.sfq' files in my/dir/SFQ into my/dir/FQ/*.fastq 65 | 66 | Note: 67 | Editing this script was meant to be easy. Do it to make this script work with other compression software or 68 | to change other default behaviors. 
69 | 70 | Help 71 | 72 | my %arg; # global arg holds the user options 73 | GetOptions(map {/(\w+)/; $_, \$arg{$1 || $_}} 74 | qw/help verbose 75 | decompress 76 | recursively 77 | tgt_dir=s 78 | fq_suffix=s 79 | sfq_suffix=s 80 | count=n 81 | exec=s 82 | /); 83 | 84 | helpthem if $arg{help}; 85 | 86 | # First init some globals, for fast access 87 | 88 | my $comp = not $arg{decompress}; 89 | 90 | my @fq_suffix = split /\s*,\s*/, ($arg{fq_suffix} || ''); 91 | @fq_suffix = ('.fastq', '.fq') unless @fq_suffix; 92 | my $fq_suffix = $fq_suffix[0]; 93 | 94 | my $sfq_suffix = $arg{sfq_suffix} || '.sfq'; 95 | 96 | my @glob_suffix = $comp ? (@fq_suffix) : ($sfq_suffix) ; 97 | 98 | my @files : shared ; 99 | @files = @ARGV ; 100 | # while (my $f = shift) { 101 | # ? check each one ? 102 | # push @files, $f; 103 | # } 104 | die "Missing directories or files to " . ($comp ? 'compress' : 'decompress') . " in the command arguments\n for more details, try '$0 -h'\n" 105 | unless @files ; 106 | 107 | my $verbose_lock : shared; 108 | sub verb(@) { 109 | # log function - here you can add date, print to file, etc. 110 | return unless $arg{verbose}; 111 | lock $verbose_lock; 112 | print @_, "\n" 113 | } 114 | 115 | my $tgt_dir = get_tgt_dir(); 116 | my $num_t = get_threads_count() || 8 ; 117 | my $exec = get_exec() or die "Can't find a slimfastq executable. Please use the -exec option\n"; 118 | 119 | verb "Running $num_t threads. Exec is $exec"; 120 | 121 | sub get_tgt_dir { 122 | my $d = $arg{tgt_dir} or return; 123 | mkdir $d or die "Can't create $d: $!\n" unless -d $d; 124 | # ^-- if the directory does not exist, attempt to create it (one level only, parents are not created) 125 | $d 126 | } 127 | 128 | sub get_threads_count { 129 | # return the number of threads to run. 
First of user arg, system cores, or 8 130 | return $arg{count} if $arg{count} and 0+$arg{count}; 131 | my $lscpu = qx/lscpu/ || ''; 132 | my $cnt; 133 | if ($lscpu =~ /CPU\W*s?\W+(\d+)/i) { 134 | $cnt = $1; 135 | if ($lscpu =~ /thread[\W]*s?\W*per core\W+(\d+)/i) { 136 | $cnt *= $1; 137 | } 138 | } 139 | $cnt 140 | } 141 | 142 | sub get_exec { 143 | # return the slimfastq executable path: the -exec value if given, else the first executable 'slimfastq' found in the script's dir, one dir up, $PATH, '.' or '..' 144 | my $e = $arg{exec}; 145 | if ($e) { 146 | die "$e: not a valid executable\n" unless -x $e ; 147 | return $e; 148 | } 149 | first {-x } map {"$_/slimfastq"} (dirname($0), dirname($0) . "/..", split (':', $ENV{PATH} ), '.', '..'); 150 | } 151 | 152 | # All initialized, now do the actual task 153 | 154 | sub getfile() { 155 | # this thread protected sub will parse directories (if needed) and provide the caller one unique filename 156 | lock @files; 157 | while (@files) { 158 | my $f = shift @files; 159 | $f =~ s/\/+$//; 160 | return $f if -f $f ; 161 | if (-d _ ) { 162 | push @files, grep {-f $_} glob join ' ', map {"$f/*$_"} @glob_suffix; 163 | push @files, grep {-d $_} glob "$f/*" if $arg{recursively}; # double reading the directory is less efficient than a single scan, but this seems to be negligible 164 | } 165 | else { 166 | warn "$f: not a file nor a directory\n"; 167 | } 168 | } 169 | undef 170 | } 171 | 172 | sub compress { 173 | while (my $u = getfile()) { 174 | my $b = basename $u, @fq_suffix; 175 | my $d = $tgt_dir || dirname $u; 176 | my $f = "$d/$b$sfq_suffix"; 177 | system "$exec -u $u -f $f -O "; 178 | if ($arg{verbose}) { 179 | my $su = (-s $u)/1000_000 ; 180 | my $sf = (-s $f)/1000_000 ; 181 | verb "comp $u (${su}MB) -> $f (${sf}MB)" 182 | } 183 | } 184 | } 185 | 186 | sub decompress { 187 | while (my $f = getfile()) { 188 | my $stat = qx/head -6 $f/; 189 | if ($stat !~ /whoami\=slimfastq/) { 190 | verb "ERR: $f doesn't seem to be a valid slimfastq file"; 191 | next; # skip this file and fetch the next one ('continue' is not a loop-control statement in Perl) 192 | } 193 | my $b = $stat =~ /orig.filename=(.*?)\s*$/m ? 
(basename $1) : (basename $f, $sfq_suffix) . "$fq_suffix" ; 194 | my $d = $tgt_dir || dirname $f; 195 | my $u = "$d/$b"; 196 | system "$exec -d -u $u -f $f "; 197 | if ($arg{verbose}) { 198 | my $su = (-s $u)/1000_000 ; 199 | my $sf = (-s $f)/1000_000 ; 200 | verb "decomp $u (${su}MB) <- $f (${sf}MB)" 201 | } 202 | } 203 | } 204 | 205 | # sub just_doit { 206 | # while (my $u = getfile()) { 207 | # my $b = basename $u, '.fastq', '.fq' ; 208 | # my $f = "$tdir/$b.sfq"; 209 | # system "$sfq -q -u $u -f $f"; 210 | # } 211 | # } 212 | 213 | my @threads; 214 | sub thread_push(){ 215 | push @threads, threads->create( $comp ? \&compress : \&decompress ) ; 216 | } 217 | sub thread_pop(){ 218 | my $t = shift @threads; 219 | $t->join; 220 | } 221 | 222 | # Main code - create $num_t threads, each triggering compress/decompress in a loop untill all files are done 223 | while (@files) { 224 | thread_push; 225 | thread_pop if @threads > $num_t; 226 | } thread_pop while @threads; 227 | 228 | 229 | -------------------------------------------------------------------------------- /samples/tstc.fq: -------------------------------------------------------------------------------- 1 | @SRR003649.10929720 304D6AAXX:5:98:1541:808 2 | TATAAGAGGTCTCTTAAATAGTGTTTTTATTTTTTTAAATAAAGACAAATG 3 | +SRR003649.10929720 304D6AAXX:5:98:1541:808 4 | I%%.3I$-&?G%)II$)*9"$+"%&D+,-&.)&&,$'++5)+(*$+&$&,! 5 | @SRR003649.10929721 304D6AAXX:5:98:627:375 6 | GGGTAGTCAGATATTAAAAATAAAAAGTGCGCTTTTTTGACTCCATGACTC 7 | +SRR003649.10929721 304D6AAXX:5:98:627:375 8 | )&/-A;*+24(%,I%+13'&%*-$$-.&+%')&"&$'3*$((2%&+$&-&# 9 | @SRR003649.10929724 304D6AAXX:5:98:1497:1722 10 | CATGTTAAGGGTTTCATTTGTAGCAACACAAAAACTNCCCCATATTCTACG 11 | +SRR003649.10929724 304D6AAXX:5:98:1497:1722 12 | I&I+:,%&*&.III@%3A1D371&&&")*/'-()&+"%F%$&#(#&+''!) 
13 | @SRR003649.10929725 304D6AAXX:5:98:914:430 14 | CTTTGACTTGATGCTTTCTGCTTGACTATTTTCCGGGGCGCGTCCCTCGAA 15 | +SRR003649.10929725 304D6AAXX:5:98:914:430 16 | +'9A6+13I5)161A*-*'%,;C""$/$-+%#%&&#+&&%&$%&%($!&!$ 17 | @SRR003649.10929726 304D6AAXX:5:98:846:1740 18 | CCGGCCATTAAGGTACAGTGGCGTTCGCACTTGCTTNNNACGTCCGCAGGC 19 | +SRR003649.10929726 304D6AAXX:5:98:846:1740 20 | ;&GI3%+:2/,//%*)&-(%%1'%$%#""$$#1%'%"""$'!#!)&'$#"& 21 | @SRR003649.10929727 304D6AAXX:5:98:857:1392 22 | GCTCAGGATGTCACTCTCTTTGATGAGATTGTCAGCAAAATCTGCCCAGCC 23 | +SRR003649.10929727 304D6AAXX:5:98:857:1392 24 | IFDIHIIII1'0:0,%C3:1721+5/+-.;%-),-('*,%+%#%4("(&&% 25 | @SRR003649.10929728 304D6AAXX:5:98:893:657 26 | TGTAGGGTGGCTCCATTTATTCCTAGCGGTTGATATGCGCCTTGGTGTCCA 27 | +SRR003649.10929728 304D6AAXX:5:98:893:657 28 | I%.0II=/'E-I'-6EIA8?$'(4(3!+*&+%$$"%-""$$$$""$)&!#( 29 | @SRR003649.10929730 304D6AAXX:5:98:948:888 30 | CAGAGAAAAAGCTGGAGCACGCTACGGGTACTAATTGTAGATACCCACGAA 31 | +SRR003649.10929730 304D6AAXX:5:98:948:888 32 | G+9'I&7>%-;E,H41=+';)*$212.)$$)*$%,#-$++&%&'"2&%&%" 33 | @SRR003649.10929732 304D6AAXX:5:98:993:796 34 | TCTTTAGGTTTATTAGAGAAGGGCAAAAGCCAGACACTTGACATTGTTATT 35 | +SRR003649.10929732 304D6AAXX:5:98:993:796 36 | I(III&4IA**%+.I%&+;2>57:)9$3%3,-$5-$+3&,'"-%0*&5-(+*)& 41 | @SRR003649.10929736 304D6AAXX:5:98:1032:156 42 | ATGGTTTCTTATGATTTTTCTATATGGAGCTTTAGCATCCGCAACGATTGT 43 | +SRR003649.10929736 304D6AAXX:5:98:1032:156 44 | &I$&%)E%)/''%$+%&%)%$/''&)%!*$.""$$$-,+&+0%&%)*-#&% 45 | @SRR003649.10929738 304D6AAXX:5:98:73:1877 46 | TATCAAGAAGCATAGGTGATAAAACTATTTGTTGAANNNNNNNNNNNNNNN 47 | +SRR003649.10929738 304D6AAXX:5:98:73:1877 48 | II00%4;%+I%?-B0I0>I6@>073,=,@.104&/)""""""""""""""" 49 | @SRR003649.10929739 304D6AAXX:5:98:694:1432 50 | TTCTAAAAGATTTTATTGCCTTTATCTTCTCATATTCTGAATGAGTTTCAC 51 | +SRR003649.10929739 304D6AAXX:5:98:694:1432 52 | IIIII2F5I0IIEI:II;A7III2F3I>2I$*B+GA/)&'&++"+3%.6*% 53 | @SRR003649.10929740 304D6AAXX:5:98:254:629 54 | 
ATTTTGTCATATTACCAGACTTGTTTTTCTGGTTCCTCTCATTTGGGTAGC 55 | +SRR003649.10929740 304D6AAXX:5:98:254:629 56 | 4II2II7BBI3H90:+.A*.9.915->2&4+%./,'&&+,%*)-,%&&%.! 57 | @SRR003649.10929741 304D6AAXX:5:98:1409:564 58 | GAGGTTGAATTTACTGATTTTAGGTTTTGAAGGTGTGGGAAAAGAAATAAT 59 | +SRR003649.10929741 304D6AAXX:5:98:1409:564 60 | ,$$$/,A--I7I9++&-+4.()2%"%+"'$(&%(%$420%*'#2!$)**&" 61 | @SRR003649.10929742 304D6AAXX:5:98:714:1190 62 | CTTATGTGAAGATGTTTCCTTTTACACCACAGGCCTGGAAGCGCTCCACAT 63 | +SRR003649.10929742 304D6AAXX:5:98:714:1190 64 | =IIIII8IA:IEII3IIC1H:BF$+-1,&!&4-)*37.&&+.+/&.,'.(& 65 | @SRR003649.10929743 304D6AAXX:5:98:895:434 66 | ATGGTGAAATCTCAGCCATCCTGCTCTGTCCCACTCCAGATGCGAATCATC 67 | +SRR003649.10929743 304D6AAXX:5:98:895:434 68 | +>I++C4/1=67)/,&1)1(&8'))"0&+()*%)%",#-&&0%-#'&,$'% 69 | @SRR003649.10929745 304D6AAXX:5:98:770:471 70 | TAGGAGTATGAGTGTGCCAGTGAGTTAGTATCTCAGTGTGTCAGGGATGTG 71 | +SRR003649.10929745 304D6AAXX:5:98:770:471 72 | F+II2I=@II)I.C5.@9,9,5*0+8'(5/7&85'++6'<+,-5<7$-.'. 73 | @SRR003649.10929752 304D6AAXX:5:98:667:850 74 | ACAAAACATGTTCCATGGTTTCTTTATATGGTATAAAGGAAAGTGAAATGA 75 | +SRR003649.10929752 304D6AAXX:5:98:667:850 76 | *0/2*),(.&+5.,+-/*&+)'.-&)+&('&%&)'%$%*#%$'%%$%"#(% 77 | @SRR003649.10929753 304D6AAXX:5:98:498:1341 78 | GGCTGATGTATTCAGAATTGAAGGGCATGCAATCATCCCAAATACACACCG 79 | +SRR003649.10929753 304D6AAXX:5:98:498:1341 80 | II&%4%.&<%7BI+&7%-/>$,'-&"&+9)$%-#&$+%3$$""!""($#-" 81 | @SRR003649.10929756 304D6AAXX:5:98:698:1956 82 | TGAGCCGGCAGGAGAAGCAACGGTGGGGGTCGCTGGNNNNNNNNNNNNNNN 83 | +SRR003649.10929756 304D6AAXX:5:98:698:1956 84 | &*II1IH;%'IIII'2442%$88%0+9@,%$+(#4?""""""""""""""" 85 | @SRR003649.10929757 304D6AAXX:5:98:154:1930 86 | ACTGCTATGACGCAGTCTTTTTTTTGCTGCTACTGTNNNNNNNNNNNNNNN 87 | +SRR003649.10929757 304D6AAXX:5:98:154:1930 88 | I-II15%II7'187928+064.=774&*,0."*$$"%*'""""""""""""""" 101 | @SRR003649.10929773 304D6AAXX:5:98:1088:266 102 | AAAAAAAATTTATGGGTTAAATACACAGAAGGCAAGCATTTTCCAGTGCTT 103 | +SRR003649.10929773 
304D6AAXX:5:98:1088:266 104 | C>/3.-/A.(/>+1@B0572(1,%6!$*#"1+#-/(&&/&&)'$,-$,/+& 105 | @SRR003649.10929775 304D6AAXX:5:98:860:393 106 | AAAAAAAAAATCAGAAATTATCATAACTAAATTGTTAAAACTTTGGAAAAC 107 | +SRR003649.10929775 304D6AAXX:5:98:860:393 108 | &2,0I>?@,04CI0:6.98.>,61+,(+(6&')$(:3+-22(0&/&*.+(+ 109 | @SRR003649.10929776 304D6AAXX:5:98:520:1364 110 | AGCTTTATTCATAATTGTCAAAACGTGGAAGCAACTAAGCTGTTCTTCAGT 111 | +SRR003649.10929776 304D6AAXX:5:98:520:1364 112 | /+"2GI'I-04+;*;(*)+:$,$+(-='*(*#%-$3%&&&*.##)#*%"#$ 113 | @SRR003649.10929777 304D6AAXX:5:98:1179:998 114 | GGGAAGCGGAGGCGGGTGGAGTGATTAGGTATTGGGCCTGGAGTTGGGGCA 115 | +SRR003649.10929777 304D6AAXX:5:98:1179:998 116 | +DIIII)9I(0IFI(%&I).*,+'B+(0%#)%!)$#%+$++"$!#,$#-&! 117 | @SRR003649.10929779 304D6AAXX:5:98:1055:823 118 | TTGCACACCCTGCTCTCTGCCTCCATATCTTCCTCTGTGCCCAGGCCTGGG 119 | +SRR003649.10929779 304D6AAXX:5:98:1055:823 120 | I+'(F-2C2%0%"@+,*&(&&&&'&,("$*(&+#&$+%"*+-(*&!'%#** 121 | @SRR003649.10929780 304D6AAXX:5:98:982:192 122 | ACTTGCTAGGGACATCTTTGGATACAGTCATCAGGGAAGGCCTTCGAGGAG 123 | +SRR003649.10929780 304D6AAXX:5:98:982:192 124 | ,+%04"*04>0/)'6-/6>-.+.%+%%$(%&+&)+$$)/,')&#$6$-+&+ 125 | @SRR003649.10929783 304D6AAXX:5:98:997:424 126 | TGTTCAGATGTGGCATGTTCAGCAAAATTGTGACCACTGTTTCCCTACTGC 127 | +SRR003649.10929783 304D6AAXX:5:98:997:424 128 | I$3I3.*924)B'0%3&5$&$'%''-*.&$3!*&.$!%""$(%!%"&&%#% 129 | @SRR003649.10929784 304D6AAXX:5:98:217:1273 130 | CGGTTGTATTTCTTTCTCCTCGGGCCTGTTCCTTCATATAACTCATTTGGA 131 | +SRR003649.10929784 304D6AAXX:5:98:217:1273 132 | @IIIII82III.IA;%I7134,00),73.<%)95%%,&/&%-$5&)*'$$% 133 | @SRR003649.10929786 304D6AAXX:5:98:1298:1040 134 | ACAGAATATACATTTTTTTCAGCACCACACCACACCTATTCCAAAATTGAC 135 | +SRR003649.10929786 304D6AAXX:5:98:1298:1040 136 | 0-II88=68?7+62I:12.**+&++*-/*)$+.+,%&(/&6-%+)++$$*? 
137 | @SRR003649.10929788 304D6AAXX:5:98:734:707 138 | AATCATCATTTAGTGATATCTAATGGAATTTATGAGTGGACACGAATCAAA 139 | +SRR003649.10929788 304D6AAXX:5:98:734:707 140 | 4I?-B%D0)&,@,)11%".(&2$$$-8-2%#'$#&"&0&$+!'*'%#)(&) 141 | @SRR003649.10929789 304D6AAXX:5:98:273:458 142 | CCCATTAATGGGCAACTGTTTAACCTGTATTTCTCCAGTGTCCCAGTTACT 143 | +SRR003649.10929789 304D6AAXX:5:98:273:458 144 | H*+;;H0)'1I)5%74,(H/9(%"%2&%$+%&$7&)(.%,*&$&$&(#'(- 145 | @SRR003649.10929790 304D6AAXX:5:98:1760:1421 146 | TTGTGTAATTATGCCCAATTATTTTCATTCCAGTAATGGTCTAGAGAAGGA 147 | +SRR003649.10929790 304D6AAXX:5:98:1760:1421 148 | I24<<&C;I2%=%.%+,00B&58:+*-6+//%%$$+'I:,55<2&8'(>"& 149 | @SRR003649.10929793 304D6AAXX:5:98:1573:1757 150 | TCCCTAGTAGCTTGGACCCCAAGTGCCACCATCCACNNNNNNNNNCAAACC 151 | +SRR003649.10929793 304D6AAXX:5:98:1573:1757 152 | '@//:&C*2.+-+26/3:-/70*.'*,'.*+%,*$3"""""""""($"$&) 153 | @SRR003649.10929794 304D6AAXX:5:98:1112:128 154 | TTTCTTTTTTAATTTTGTTTTTTGTTTATTTATTTATTGATGAATTATCTT 155 | +SRR003649.10929794 304D6AAXX:5:98:1112:128 156 | )&'-I&I*&C%%),&1&6$%*&:$6&$%)&(#+&3#62&)/%$#/-'-(+* 157 | @SRR003649.10929799 304D6AAXX:5:98:910:1002 158 | ACATATCTCAACTGGATTATTTTTCAGTGTTTAGGGGTGAACAGGGTGAAC 159 | +SRR003649.10929799 304D6AAXX:5:98:910:1002 160 | 8.;=/B+7>78@,9825C2CEI8:+%-+@,2/2$0:1&&%0++*(,+*8=1'+(.+,+ 177 | @SRR003649.10929806 304D6AAXX:5:98:636:1182 178 | GAATGGGAAAGGACTACATTACCAAAGAAGTGATAACTTCACGAGCGTCAC 179 | +SRR003649.10929806 304D6AAXX:5:98:636:1182 180 | I.<3IC+&=&2>1:++$;&&%7.*$(2"'%%$$%%$1$#+$#'%+#%$$'4 181 | @SRR003649.10929807 304D6AAXX:5:98:924:344 182 | AAAAAGCAGTCATAGCGTTCAGGCACCCCATAAGGGCACCCCAGCTTCCTT 183 | +SRR003649.10929807 304D6AAXX:5:98:924:344 184 | 2;37*2)951.5)/&60&.&5&"5+$&&)5(/&()"2,3+50&17&').(& 185 | @SRR003649.10929808 304D6AAXX:5:98:1038:1919 186 | GGAGGGCAAATAAACTTCTACGTTCCCTAAGCCTTCNNNNNNNNNNNNNNN 187 | +SRR003649.10929808 304D6AAXX:5:98:1038:1919 188 | ':5III*IC/+I:>2I<712*'24&6?+4%*%*%6&""""""""""""""" 189 | @SRR003649.10929810 304D6AAXX:5:98:137:3 
190 | CAAGATTTATGTCCTTGGGTAACAGGCTTAACCTTCCTTTCATTGAAGATG 191 | +SRR003649.10929810 304D6AAXX:5:98:137:3 192 | /1/A-:>:/:.10+>6>/+(9/-.+*&+/,'($+*%%%"#$('$)$'"#'' 193 | @SRR003649.10929811 304D6AAXX:5:98:1568:926 194 | GGTATAGCTCGACCTGTTTGTGTGAGCCCTGTTCATGCTGGCCCTGCGCTC 195 | +SRR003649.10929811 304D6AAXX:5:98:1568:926 196 | -4=*',%&%"$-)*5&/+;'"+%$')$*!+%#&&&+>(.+""-&%-&$&"" 197 | @SRR003649.10929813 304D6AAXX:5:98:671:548 198 | AGGACAGTCTCTTCAATAAATTGGTGTTGGAAAAACTGGACACACAACTGA 199 | +SRR003649.10929813 304D6AAXX:5:98:671:548 200 | III)5=I-&I271.3-9$2(F(1-,+-I:.(1"(,&$+.&%*')+*%",-& 201 | -------------------------------------------------------------------------------- /filer.cpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. 
*/ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #include "filer.hpp" 33 | 34 | static void croak(const char* fmt, long long num) __attribute__ ((noreturn, cold)); 35 | static void croak(const char* fmt, long long num) { 36 | fprintf(stderr, fmt, num, errno ? strerror(errno) : ""); 37 | fprintf(stderr, "\n"); 38 | exit(1); 39 | } 40 | 41 | struct OneFile { 42 | struct file_attr_t { 43 | 44 | UINT64 name; 45 | UINT64 size ; // in bytes 46 | UINT32 first; // page index 47 | UINT32 node ; // page index 48 | } PACKED; 49 | enum { 50 | max_root_files = FILER_PAGE / sizeof(file_attr_t), 51 | max_node_files = FILER_PAGE / 4, 52 | }; 53 | file_attr_t files[max_root_files+1]; // pad to fit one page 54 | UINT64 next_findex; // offset in files table 55 | UINT64 num_pages; // free alloc index 56 | 57 | // UINT32 files_index; - currently limitted to single root dir (341 files, including zero). 
58 | 59 | FILE* m_out; 60 | FILE* m_in ; 61 | 62 | UINT32 get_findex() { 63 | assert(m_out); 64 | rarely_if(next_findex >= max_root_files) 65 | croak("Internal error: Too many open files: %d %s", next_findex); 66 | return next_findex ++; 67 | } 68 | UINT32 get_findex(UINT64 name) { 69 | assert(m_in); 70 | for(UINT32 i = 1; i < next_findex; i++) 71 | if (files[i].name == name) 72 | return i; 73 | return 0; 74 | } 75 | UINT32 allocate() { 76 | return num_pages ++; 77 | } 78 | void read_page(UINT32 offset, UCHAR* page) { 79 | // Note: -D_FILE_OFFSET_BITS=64 is required 80 | UINT64 offs = offset; 81 | offs *= FILER_PAGE; 82 | fseek(m_in, offs , SEEK_SET); 83 | UINT32 cnt = fread(page, 1, FILER_PAGE, m_in); 84 | if (cnt != FILER_PAGE) 85 | croak("Failed reading page index %d: %s", offset); 86 | } 87 | void write_page(UINT32 offset, UCHAR* page) { 88 | UINT64 offs = offset; 89 | offs *= FILER_PAGE; 90 | fseek(m_out, offs, SEEK_SET); 91 | UINT32 cnt = fwrite(page, 1, FILER_PAGE, m_out); 92 | if (cnt != FILER_PAGE) 93 | croak("Failed writing page index %d: %s", offset); 94 | } 95 | OneFile() { 96 | m_out = m_in = NULL; 97 | } 98 | void init_read(FILE* in) { 99 | next_findex = 1; 100 | m_in = in; 101 | m_out = NULL; 102 | 103 | read_page(1, (UCHAR*)files); 104 | next_findex = files[0].first; 105 | files[0].first = 0; 106 | } 107 | void do_confess() const { 108 | // i: 'name' : size : first : node 109 | fprintf(stderr, " i: name : NODES count : 1st : 2nd\n"); 110 | for (UINT32 i = 0; 111 | i < next_findex; 112 | i++) { 113 | char name[10]; 114 | strncpy(name, (char*)&files[i].name, 8); 115 | fprintf(stderr, "%2d: %-10s: %-15lld: %d:\t: %d\n", 116 | i, (i? 
name : ""), files[i].size, files[i].first, files[i].node); 117 | } 118 | } 119 | void init_write(FILE* out) { 120 | next_findex = 1; 121 | m_in = NULL; 122 | m_out = out; 123 | 124 | files[0].name = 0; 125 | files[0].first = 0; 126 | num_pages = 2; 127 | } 128 | void finit_write() { 129 | if (m_out) { 130 | files[0].first = next_findex; 131 | write_page(1, (UCHAR*)files); 132 | fclose(m_out); 133 | m_out = NULL; 134 | } 135 | } 136 | UINT64 finit_size() { 137 | return num_pages * FILER_PAGE; 138 | } 139 | ~OneFile() { finit_write() ; } // call explicitly from config? 140 | } onef ; 141 | 142 | // Static 143 | 144 | void FilerSave::init(FILE* out) { onef.init_write(out); } 145 | void FilerSave::finit() { onef.finit_write() ; } 146 | void FilerLoad::init(FILE* in) { onef.init_read(in) ; } 147 | void FilerLoad::confess() { onef.do_confess() ; } 148 | UINT64 FilerSave::finit_size() { return onef.finit_size(); } 149 | 150 | // Base 151 | 152 | static UINT64 name2u(const char* name) { 153 | UINT64 uname ; 154 | assert(strlen(name) <=8 ); 155 | strncpy((char*)&uname, name, 8); 156 | return uname; 157 | } 158 | 159 | FilerBase::FilerBase() { 160 | m_node_i = 0; 161 | m_node_p = 0; 162 | 163 | m_valid = true; 164 | m_cur = m_count = 0; 165 | m_page_count = 0; 166 | } 167 | 168 | size_t FilerBase::tell() const { 169 | return 170 | ( m_page_count ) ? 
171 | ((m_page_count-1) * FILER_PAGE) + 172 | ( m_cur ) : 173 | ( m_cur ) ; 174 | } 175 | 176 | size_t FilerSave::tell() const { 177 | return FilerBase::tell(); 178 | } 179 | 180 | size_t FilerLoad::tell() const { 181 | return FilerBase::tell(); 182 | } 183 | 184 | // Save 185 | 186 | bool FilerSave::is_valid() const { return m_valid; } 187 | 188 | FilerSave::FilerSave(const char* name) { 189 | UINT32 fi = m_onef_i = onef.get_findex(); 190 | onef.files[fi].name = name2u(name); 191 | onef.files[fi].size = 0; 192 | onef.files[fi].node = 0; 193 | onef.files[fi].first = onef.allocate(); 194 | } 195 | 196 | FilerSave::FilerSave(int forty_two) { 197 | assert(forty_two == 42); // Verify this constructor wasn't called by mistake 198 | 199 | BZERO(onef.files[0]); 200 | m_onef_i = 0; 201 | } 202 | 203 | FilerSave::~FilerSave() { 204 | save_page(true); 205 | m_valid = false; 206 | if (m_node_p) 207 | save_node(0); 208 | } 209 | 210 | void FilerSave::save_node(UINT32 next_node) { 211 | assert(m_node_i <= maxi_nodes ); 212 | assert(m_node_p); 213 | m_node[m_node_i ] = next_node; 214 | onef.write_page(m_node_p, (UCHAR*) m_node); 215 | m_node_p = next_node; 216 | m_node_i = 0; 217 | } 218 | 219 | void FilerSave::save_page(bool finit) { 220 | if (not m_valid or 221 | not m_cur) 222 | return ; 223 | 224 | assert(m_cur <= FILER_PAGE); 225 | 226 | rarely_if (not m_node_p) { 227 | assert(m_node_i == 0); 228 | onef.write_page(onef.files[m_onef_i].first, m_buff); 229 | if (not finit) // (m_cur == FILER_PAGE) // Don't allocate at EOF (harmless in the rare case of exact one page file) 230 | onef.files[m_onef_i].node = m_node_p = onef.allocate(); 231 | } 232 | else { 233 | onef.write_page(m_node[ m_node_i ++ ], m_buff); 234 | rarely_if (m_node_i == maxi_nodes and not finit) 235 | save_node(onef.allocate()); 236 | } 237 | if (not finit) 238 | m_node[ m_node_i ] = onef.allocate(); 239 | onef.files[m_onef_i].size += m_cur; 240 | m_cur = 0; 241 | m_page_count++; 242 | } 243 | 244 | // Load 245 
| 246 | FilerLoad::FilerLoad(const char* name, bool* valid_ptr) { 247 | 248 | m_onef_i = onef.get_findex(name2u(name)); 249 | m_valid_ptr = valid_ptr; 250 | if (not m_onef_i) { 251 | *valid_ptr = m_valid = false; 252 | return; 253 | } 254 | * valid_ptr = m_valid = true ; 255 | load_page(); 256 | } 257 | 258 | FilerLoad::FilerLoad(int forty_two, bool* valid_ptr) { 259 | assert(forty_two == 42); // Verify this constructor wasn't called by mistake 260 | m_onef_i = 0; 261 | m_valid_ptr = valid_ptr; 262 | * valid_ptr = m_valid = true ; 263 | load_page(); 264 | } 265 | 266 | FilerLoad::~FilerLoad() { 267 | if ( m_valid_ptr ) 268 | *m_valid_ptr = false; 269 | } 270 | 271 | bool FilerLoad::is_valid() const { return m_valid ; } 272 | 273 | void FilerLoad::load_page() { 274 | rarely_if(not m_valid) 275 | return; 276 | 277 | rarely_if (tell() >= onef.files[m_onef_i].size ) { 278 | *m_valid_ptr = m_valid = false; 279 | return; // EOF 280 | } 281 | 282 | rarely_if (m_node_p == 0) { 283 | assert(m_node_i == 0); 284 | onef.read_page(onef.files[m_onef_i].first, m_buff); 285 | m_node_p = onef.files[m_onef_i].node; 286 | if (m_node_p) 287 | onef.read_page(m_node_p, (UCHAR*) m_node); 288 | m_node_i = 0; 289 | } 290 | else { 291 | rarely_if (m_node_i == maxi_nodes) { // load node page 292 | m_node_p = m_node[ maxi_nodes ]; // keep it for debugging 293 | onef.read_page(m_node_p, (UCHAR*) m_node); 294 | m_node_i = 0; 295 | } 296 | onef.read_page(m_node[m_node_i ++], m_buff); 297 | } 298 | 299 | m_cur = 0; 300 | UINT64 size = onef.files[m_onef_i].size; 301 | m_count = size/FILER_PAGE == m_page_count ? 
size % FILER_PAGE : FILER_PAGE ; 302 | m_page_count++; 303 | } 304 | -------------------------------------------------------------------------------- /utest.cpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | 27 | 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "common.hpp" 36 | #include "filer.hpp" 37 | #include "config.hpp" 38 | 39 | #define protected public 40 | 41 | #include "qlts.hpp" 42 | #include "recs.hpp" 43 | #include "gens.hpp" 44 | // #include "power_ranger.hpp" 45 | 46 | #undef protected 47 | 48 | #define TITLE(X) fprintf(stderr, "UTEST: %s\n", X) 49 | 50 | void test_filer() { 51 | TITLE("filer"); 52 | const char* fname = "/tmp/utest~filer"; 53 | { 54 | FILE *fh = fopen(fname, "w"); 55 | assert(fh); 56 | FilerSave::init(fh); 57 | FilerSave filer("utest") ; 58 | assert(filer.is_valid()); 59 | 60 | filer.put(8); 61 | // filer.put4(0x12345678); 62 | // filer.put4(0xfeda9877); 63 | // filer.put8(UINT64(0x0102030405060708)); 64 | // filer.put8(UINT64(0xffeeddccbbaa9988)); 65 | for (int i = 0; i < 0x810*FILER_PAGE+10; i++) 66 | filer.put(i&0xff); 67 | } 68 | FilerSave::finit(); 69 | { 70 | FILE *fh = fopen(fname, "r"); 71 | assert(fh); 72 | bool valid; 73 | FilerLoad::init(fh); 74 | FilerLoad filer("utest", &valid) ; 75 | assert(filer.is_valid()); 76 | UINT64 val; 77 | #define ASSERT(SUB, VAL) val = filer.SUB(); assert(val == VAL && valid) 78 | ASSERT(get, 8); 79 | // ASSERT(get4, 0x12345678); 80 | // ASSERT(get4, 
UINT64(0xfeda9877)); 81 | // ASSERT(get8, UINT64(0x0102030405060708)); 82 | // ASSERT(get8, UINT64(0xffeeddccbbaa9988)); 83 | #undef ASSERT 84 | for (int i = 0; i < 0x810*FILER_PAGE+10; i++) { 85 | UCHAR c = filer.get(); 86 | assert(c == (i&0xff)); 87 | } 88 | } 89 | } 90 | 91 | 92 | // void fill_buf(UCHAR* buf, int start, int size) { 93 | // for (int i = 0; i < size ; i++) 94 | // buf[i] = '!' + ((i*7+start) % 62); 95 | // } 96 | 97 | Config conf; 98 | /* 99 | void test_qlt() { 100 | TITLE("qlt stream"); 101 | const char* argv[] = {"utest", "-f", "/tmp/utest", "-O"}; 102 | conf.init(4, (char**)argv, 777); 103 | { 104 | QltSave qlt; 105 | UCHAR buf[100]; 106 | for(int i = 0; i < 101; i++) { 107 | fill_buf(buf, i, 91); 108 | qlt.save_2(buf, 91); 109 | } 110 | } 111 | { 112 | // const char* argv[] = {"utest", "-f", "/tmp/utest", "-O", "-d"}; 113 | // Config conf(5, (char**)argv, 777); 114 | QltLoad qlt; 115 | UCHAR buf[100]; 116 | UCHAR tst[100]; 117 | for(int i = 0; i < 101; i++) { 118 | fill_buf(tst, i, 91); 119 | qlt.load_2(buf, 91); 120 | for (int j=0; j<91; j++) 121 | assert(tst[j] == buf[j]); 122 | } 123 | } 124 | } 125 | */ 126 | 127 | const char* fname = "/tmp/utest~filer"; 128 | bool _valid ; 129 | RCoder * rc = NULL; 130 | FilerLoad* filer_l = NULL; 131 | FilerSave* filer_s = NULL; 132 | 133 | void rc_init(bool load=0) { 134 | _valid = true; 135 | rc = new RCoder; 136 | assert(rc); 137 | if (load) { 138 | FILE* fh = fopen(fname, "r"); 139 | assert(fh); 140 | FilerLoad::init(fh); 141 | rc->init(filer_l = new FilerLoad("utest", &_valid)); 142 | } 143 | else { 144 | FILE* fh = fopen(fname, "w"); 145 | assert(fh); 146 | FilerSave::init(fh); 147 | rc->init(filer_s = new FilerSave("utest")); 148 | } 149 | } 150 | 151 | void rc_finit() { 152 | rc -> done(); 153 | DELETE (rc); 154 | DELETE (filer_l); 155 | DELETE (filer_s); 156 | FilerSave::finit(); 157 | } 158 | 159 | void test_log64_ranger() { 160 | TITLE("log64 put/get"); 161 | { 162 | rc_init(0); 163 | 
Log64Ranger ranger; 164 | BZERO(ranger); 165 | for (int i = 0; i < 300; i++) 166 | ranger.put(rc, i%63); 167 | for (int i = 0; i < 1000; i+=17) 168 | ranger.put(rc, i%63); 169 | rc_finit(); 170 | } 171 | { 172 | rc_init(1); 173 | Log64Ranger ranger; 174 | BZERO(ranger); 175 | for (int i = 0; i < 300; i++) { 176 | UCHAR c = ranger.get(rc); 177 | assert(c == (i%63)); 178 | } 179 | for (int i = 0; i < 1000; i+=17) { 180 | UCHAR c = ranger.get(rc); 181 | assert(c == (i%63)); 182 | } 183 | rc_finit(); 184 | } 185 | } 186 | 187 | void test_power_ranger() { 188 | TITLE("ranger put_c/get_c"); 189 | { 190 | rc_init(0); 191 | PowerRanger ranger; 192 | BZERO(ranger); 193 | for (int i = 0; i < 300; i++) 194 | ranger.put(rc, i&0xff); 195 | for (int i = 0; i < 1000; i+=17) 196 | ranger.put(rc, i&0xff); 197 | rc_finit(); 198 | } 199 | { 200 | rc_init(1); 201 | PowerRanger ranger; 202 | BZERO(ranger); 203 | for (int i = 0; i < 300; i++) { 204 | UCHAR c = ranger.get(rc); 205 | assert(c == (i&0xff)); 206 | } 207 | for (int i = 0; i < 1000; i+=17) { 208 | UCHAR c = ranger.get(rc); 209 | assert(c == (i&0xff)); 210 | } 211 | rc_finit(); 212 | } 213 | TITLE("ranger put_i / get_i"); 214 | int arr[] = { -121, -122, -1, 575}; 215 | { 216 | rc_init(); 217 | PowerRangerI ranger; 218 | BZERO(ranger); 219 | for (int i = 0; i < 4; i++) 220 | ranger.put_i(rc, arr[i]); 221 | for (int i = -300; i < 300; i++) 222 | ranger.put_i(rc, i); 223 | for (int i = -100; i < 100; i++) 224 | ranger.put_i(rc, i*7); 225 | rc_finit(); 226 | } 227 | { 228 | rc_init(1); 229 | PowerRangerI ranger; 230 | BZERO(ranger); 231 | for (int i = 0; i < 4; i++){ 232 | int c = ranger.get_i(rc); 233 | assert(c == arr[i]); 234 | } 235 | for (int i = -300; i < 300; i++) { 236 | int c = ranger.get_i(rc); 237 | assert(c == i); 238 | } 239 | for (int i = -100; i < 100; i++) { 240 | int c = ranger.get_i(rc); 241 | assert(c == i*7); 242 | } 243 | rc_finit(); 244 | } 245 | TITLE("ranger put_u / get_u"); 246 | { 247 | rc_init(); 248 | 
PowerRangerU ranger; 249 | BZERO(ranger); 250 | for (int i = 0; i < 300; i++) 251 | ranger.put_u(rc, i); 252 | for (int i = 1000; i ; i--) 253 | ranger.put_u(rc, i*77); 254 | 255 | rc_finit(); 256 | } 257 | { 258 | rc_init(1); 259 | PowerRangerU ranger; 260 | BZERO(ranger); 261 | for (int i = 0; i < 300; i++) { 262 | int c = ranger.get_u(rc); 263 | assert(c == i); 264 | } 265 | for (int i = 1000; i ; i--) { 266 | int c = ranger.get_u(rc); 267 | assert(c == i*77); 268 | } 269 | rc_finit(); 270 | } 271 | } 272 | 273 | void test_power_ranger_extra() { 274 | TITLE("extra ranger"); 275 | UINT64 array[] = 276 | { 0xfffffffffff, 0x123456789abcd, 0, 4, 277 | 0xfffffffffff, 0x123456789abcd, 0, 4, 278 | 0xa, 0xffff, 0xfffe, 100 279 | }; 280 | long long arrai[] = 281 | { -0x7ffffff, 0x7ffffff, -100000L, 9999L, 282 | -1, 2994389, -2, -3 283 | }; 284 | { 285 | // FILE *fh = fopen(fname, "w"); 286 | // assert(fh); 287 | // 288 | // FilerSave filer(fh) ; 289 | // assert(filer.is_valid()); 290 | 291 | PowerRangerU ranger; 292 | // RCoder rcoder; 293 | // rcoder.init(&filer); 294 | // BZERO(ranger); 295 | // RCoder* r = &rcoder; 296 | rc_init(0); 297 | RCoder* r = rc; 298 | 299 | for (int i = 0; i < 300; i++) 300 | ranger.put_u(r, (i&0x7f)); 301 | for (UINT64 i=0; i < 1000; i++) 302 | ranger.put_u(r, i); 303 | for (UINT64 i=0xfff0; i < 0x10234; i++) 304 | ranger.put_u(r, i); 305 | for (int i = 0; i < 12; i++) 306 | ranger.put_u(r, array[i]); 307 | for (int i = -300; i < 300; i++) 308 | ranger.put_u(r, i); 309 | for (int i = 0; i < 8; i++) 310 | ranger.put_u(r, 0|arrai[i]); 311 | rc_finit(); 312 | } 313 | { 314 | // FILE *fh = fopen(fname, "r"); 315 | // assert(fh); 316 | // bool valid; 317 | // 318 | // FilerLoad filer(fh, &valid) ; 319 | // assert(filer.is_valid()); 320 | 321 | PowerRangerU ranger; 322 | // RCoder rcoder; 323 | // rcoder.init(&filer); 324 | // BZERO(ranger); 325 | // RCoder* r = &rcoder; 326 | rc_init(1); 327 | RCoder* r = rc; 328 | 329 | for (int i = 0; i < 300; 
i++) { 330 | UCHAR c = ranger.get_u(r); 331 | assert(c == (i&0x7f)); 332 | } 333 | for (UINT64 i=0; i < 1000; i++) { 334 | UINT64 u = ranger.get_u(r); 335 | assert(u == i); 336 | } 337 | for (UINT64 i=0xfff0; i < 0x10234; i++) { 338 | UINT64 u = ranger.get_u(r); 339 | assert(u == i); 340 | } 341 | for (int i = 0; i < 12; i++) { 342 | UINT64 u = ranger.get_u(r); 343 | assert(u == array[i]); 344 | } 345 | for (int i = -300; i < 300; i++) { 346 | long l = ranger.get_u(r); 347 | assert(l == i); 348 | } 349 | for (int i = 0; i < 8; i++) { 350 | long u = ranger.get_u(r); 351 | assert(u == arrai[i]); 352 | } 353 | rc_finit(); 354 | } 355 | } 356 | 357 | // void test_recbase() { 358 | // 359 | // TITLE("test RecBase"); 360 | // // const UCHAR* mystr = (const UCHAR*)"welcome to Amsterdam had have a )*(&^(*4758*^&%)) +++~ time!"; 361 | // // int mystr_len = strlen((const char*)mystr); 362 | // { 363 | // FILE *fh = fopen(fname, "w"); 364 | // assert(fh); 365 | // 366 | // FilerSave filer(fh) ; 367 | // assert(filer.is_valid()); 368 | // 369 | // RecBase base ; 370 | // BZERO(base); 371 | // base.range_init(); 372 | // base.rcoder.init(&filer); 373 | // 374 | // // base.put_str(0, mystr, mystr_len); 375 | // 376 | // // for (int i = -1000; i < 2000; i++) { 377 | // // base.put_len(0, i*11+9); 378 | // // base.put_num(0, i*7); 379 | // // } 380 | // 381 | // } 382 | // { 383 | // FILE *fh = fopen(fname, "r"); 384 | // assert(fh); 385 | // bool valid; 386 | // 387 | // FilerLoad filer(fh, &valid) ; 388 | // assert(filer.is_valid()); 389 | // RecBase base ; 390 | // BZERO(base); 391 | // base.range_init(); 392 | // base.rcoder.init(&filer); 393 | // 394 | // // UCHAR str[1000]; 395 | // // UCHAR* p = base.get_str(0, str); 396 | // // assert(0 == strcmp((const char*)str, (const char*)mystr)); 397 | // // assert(p == str + mystr_len); 398 | // 399 | // // for (int i = -1000; i < 2000; i++) { 400 | // // int len = base.get_len(0); 401 | // // assert(len == i*11+9); 402 | // // long 
long num = base.get_num(0); 403 | // // assert(num == i*7); 404 | // // } 405 | // } 406 | // } 407 | 408 | int main(int argc, char *argv[]) { 409 | 410 | test_filer(); 411 | test_log64_ranger(); 412 | test_power_ranger(); 413 | // test_qlt(); 414 | test_power_ranger_extra(); 415 | // test_recbase(); 416 | 417 | return 0; 418 | } 419 | -------------------------------------------------------------------------------- /config.cpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include // Variadic 40 | 41 | #include "config.hpp" 42 | 43 | // Globals 44 | static const int internal_version = 6; // inernal version 45 | static const char* user_version="2.04"; 46 | 47 | unsigned long long g_record_count = 0; 48 | unsigned long long g_genofs_count = 0; 49 | 50 | typedef std::map info_t; 51 | typedef std::pair info_pair; 52 | info_t info_map; 53 | 54 | void croak(const char *format, ...) { 55 | va_list args; 56 | va_start(args, format); 57 | fprintf(stderr, "slimfastq: %s %s: ", 58 | ( conf.encode ? 
"encoding" : "decoding"), 59 | conf.get_info("orig.filename")); 60 | 61 | // if (errno) 62 | // fprintf(stderr, "err=%s", strerror(errno)); 63 | 64 | vfprintf(stderr, format, args); 65 | fprintf(stderr, "\n"); 66 | 67 | exit(1); 68 | } 69 | 70 | // void croak(const char* fmt, long long num) { 71 | // fprintf(stderr, fmt, num); 72 | // fprintf(stderr, "\n"); 73 | // exit(1); 74 | // } 75 | 76 | void Config::statistics_dump() const { 77 | fprintf(stderr, ":::: Info ::::\n"); 78 | for (std::map::iterator it = info_map.begin(); 79 | it != info_map.end(); 80 | it ++ ) 81 | fprintf(stderr, "%-16s = %s\n", it->first.c_str(), it->second.c_str()); 82 | fprintf(stderr, "\n:::: Files stream ::::\n"); 83 | FilerLoad::confess(); 84 | exit(0); 85 | } 86 | 87 | void Config::load_info() const { 88 | 89 | info_map.clear(); 90 | char line[0x200]; 91 | bool valid; 92 | FilerLoad filer(42, &valid); 93 | while (valid) { 94 | for (int i = 0; i < 0x200; i++) { 95 | line[i] = filer.get(); 96 | if (not valid or 97 | line[i] == '\n') 98 | line[i] = 0; 99 | if (line[i] == 0) 100 | break; 101 | } 102 | char* pos = index(line, '='); 103 | if (pos) { 104 | *pos = 0; 105 | info_map.insert(info_pair (line, pos+1)); 106 | } 107 | } 108 | } 109 | 110 | bool Config::has_info(const char* key) const { 111 | const char* something = info_map[key].c_str(); 112 | return 0 != strlen(something); 113 | } 114 | 115 | const char* Config::get_info(const char* key) const { 116 | const char* something = info_map[key].c_str(); 117 | if (0 == strlen(something)) 118 | // croak("%s: no value for '%s'\n", m_info_filename, ke); 119 | return ""; 120 | 121 | return something; 122 | } 123 | 124 | bool Config::get_bool(const char* key) const { 125 | const char* something = info_map[key].c_str(); 126 | return strlen(something) and *something != '0'; 127 | } 128 | 129 | long long Config::get_long(const char* key, long long val) const { 130 | const char* something = info_map[key].c_str(); 131 | return strlen(something) ? 
atoll(something) : val; 132 | } 133 | 134 | void put_str(FilerSave* filer, const char* str) { 135 | int sanity = 0x200; 136 | while (*str and --sanity) 137 | filer->put(*str++); 138 | if (not sanity) 139 | croak("oversize string value"); 140 | } 141 | 142 | void Config::set_info(const char* key, const char* val) const { 143 | 144 | put_str(m_info_filer, key); 145 | m_info_filer->put('='); 146 | put_str(m_info_filer, val); 147 | m_info_filer->put('\n'); 148 | 149 | info_map.insert(info_pair(key, val)); 150 | } 151 | 152 | void Config::set_info(const char* key, long long num) const { 153 | char buf[40]; 154 | sprintf(buf, "%lld", num); 155 | set_info(key, buf); 156 | } 157 | 158 | void Config::usage() const { 159 | printf("\ 160 | Usage: \n\ 161 | -u usr-filename : (default: stdin)\n\ 162 | -f comp-filename : required - compressed\n\ 163 | -d : decode (instead of encoding) \n\ 164 | -O : silently overwrite existing files\n\ 165 | -l level : compression level 1 to 4 (default is 3 ) \n\ 166 | -1, -2, -3, -4 : alias for -l 1, -l 2, etc \n\ 167 | Where levels are:\n\ 168 | 1: Uses less than 4M memory (!), yields the worse compression (still much better than gzip)\n\ 169 | 2: Uses about 30M memory, resonable compression \n\ 170 | 3: Uses about 80M memory, best compression \n\ 171 | 4: Compress a little more, but very costly (competition mode?) 
\n\ 172 | \n\ 173 | -v : version : internal version \n\ 174 | -h : help : this message \n\ 175 | -s : stat : information about a compressed file \n\ 176 | -q : suppress extra stats info that could have been seen by -s \n\ 177 | \n\ 178 | DWIM (Do what I mean) - Intuitive use of 'slimfastq A B' : \n\ 179 | If A appears to be a fastq file, and:\n\ 180 | B does not exists, or -O option is used: compress A to B \n\ 181 | If A appears to be a slimfastq file, and: \n\ 182 | B does not exist, or -O option is used: decompress A to B \n\ 183 | B is omitted: decompress A to stdout \n\ 184 | Examples: \n\ 185 | %% slimfastq : compress to \n\ 186 | %% slimfastq -1 to , using level 1 \n\ 187 | %% slimfastq : decompress to stdout \n\ 188 | %% slimfastq to | slimfastq -f : convert from gzip to sfq format\n\ 190 | Verification example:\n\ 191 | %% md5sum : remember checksum \n\ 192 | %% slimfastq : compress \n\ 193 | %% slimfastq | md5sum - : decompress pipe to md5sum, compare checksums \n\ 194 | \n\ 195 | Note: to support pipes and reduce the use of resources, slimfastq was coded to run in a \n\ 196 | single thread. 
For a multi-session example - efficiently compressing multiple files in \n\ 197 | parallel - please use tools/slimfastq.multi -h (or make install; slimfastq.multi -h) \n\ 198 | \n\ 199 | "); 200 | exit(0); 201 | } 202 | 203 | // (DISABLED -TBD)-s size : set partition to (megabyte units) \n\ - 204 | // (DISABLED -TBD)-p partition : only open this partition (-d implied) \n\ - 205 | // \n\ - 206 | 207 | static void check_fh(FILE* f, std::string name, bool read=false) { 208 | if (f) return; 209 | fprintf(stderr, "Can't %s file '%s': %s\n", 210 | read?"read":"write", name.c_str(), strerror(errno)); 211 | exit(1); 212 | } 213 | 214 | static void check_op(int something, char chr) { 215 | if (something) return; 216 | fprintf(stderr, "Missing essential argument: -%c\n", chr); 217 | exit(1); 218 | } 219 | 220 | Config::Config(){ 221 | 222 | version = internal_version; 223 | 224 | quiet = false; 225 | profiling = false; 226 | encode = true; 227 | level = 3; 228 | m_info_filer = NULL; 229 | } 230 | 231 | static int range_level(int level) { 232 | return 233 | level > 4 ? 4 : 234 | level < 1 ? 1 : 235 | level ; 236 | } 237 | 238 | static bool initialized = false; 239 | void Config::init(int argc, char **argv) { 240 | if (initialized) 241 | croak("Internal error: 2nd Config init"); 242 | 243 | initialized = true; 244 | 245 | std::string usr, fil; 246 | bool overwrite = false; 247 | bool statistics = false; 248 | 249 | if (argc == 1) usage(); 250 | // TODO? 
long options 251 | // const char* short_opt = "POvhd 1234 u:f:s:p:l:"; 252 | const char* short_opt = "qPsvhdO 1234 u:f:l:"; 253 | for ( int opt = getopt(argc, argv, short_opt); 254 | opt != -1; 255 | opt = getopt(argc, argv, short_opt)) 256 | switch (opt) { 257 | case 'u': usr = optarg ; break; 258 | case 'f': fil = optarg ; break; 259 | 260 | case 'l': level = strtoll(optarg, 0, 0); break; 261 | case '1': case '2' : case '3': case '4': 262 | level = opt - '0'; break; 263 | 264 | case 'd': encode = false; break; 265 | case 'O': overwrite = true ; break; 266 | case 'P': profiling = true ; break; 267 | case 'q': quiet = true ; break; 268 | case 'v': 269 | printf("Version %s\nInternal format version=%u\n", user_version, version); 270 | exit(0); 271 | case 'h': 272 | usage(); 273 | case 's': statistics = true; encode = false; break; 274 | 275 | default: 276 | croak("Ilagal args: use -h for help"); 277 | } 278 | 279 | while (optind < argc) { 280 | // DWIM guessing ... 281 | char *file = argv[optind ++]; 282 | FILE* fh = fopen(file, "rb"); 283 | if (! fh) { 284 | if (not encode and not usr.length()) 285 | usr = file; 286 | else if (encode and not fil.length()) 287 | fil = file; 288 | else { 289 | fprintf(stderr, "What am I suppose to do with '%s'?\n (please specify explicitly with -f/-u prefix)\n(Note: not an existing file)\n", 290 | file); 291 | exit(1); 292 | } 293 | continue; 294 | } 295 | const char* sfqstamp = "whoami=slimfastq" ; 296 | char initline[20]; 297 | BZERO(initline); 298 | size_t cnt = fread(initline, 1, 19, fh); 299 | if (cnt and 300 | not fil.length() and 301 | 0 == strncmp(initline, sfqstamp, strlen(sfqstamp))) { 302 | fil = file; 303 | encode = !! 
usr.length(); 304 | } 305 | else if (cnt and 306 | not usr.length() and 307 | initline[0] == '@') { 308 | usr = file; 309 | } 310 | else if (not usr.length() and 311 | not encode and 312 | (not cnt or overwrite)) { 313 | usr = file; 314 | } 315 | else if (encode and 316 | usr.length() and 317 | not fil.length() and 318 | (not cnt or overwrite)) { 319 | fil = file; 320 | } 321 | else { 322 | fprintf(stderr, "What am I suppose to do with '%s'?\n (please specify explicitly with -f/-u prefix)\n(Note: file exists!)\n", file); 323 | exit(1); 324 | } 325 | } 326 | check_op(fil.length(), 'f'); 327 | 328 | const char* wr_flags = overwrite ? "wb" : "wbx" ; 329 | if (encode) { 330 | FILE* fh = fopen(fil.c_str(), wr_flags); 331 | check_fh(fh, fil, false); 332 | FilerSave::init(fh); 333 | m_info_filer = new FilerSave(42); 334 | set_info("whoami", "slimfastq"); 335 | set_info("version", version); 336 | set_info("config.level", range_level(level)); 337 | 338 | if (usr.length()) { 339 | set_info("orig.filename", usr.c_str()); 340 | f_usr = fopen(usr.c_str(), "rb") ; 341 | check_fh(f_usr, usr, true); 342 | fseek(f_usr, 0L, SEEK_END); 343 | set_info("orig.size", ftell(f_usr)); 344 | fseek(f_usr, 0L, SEEK_SET); 345 | } 346 | else { 347 | set_info("orig.filename", "<< stdin >>"); 348 | f_usr = stdin ; 349 | } 350 | } 351 | else { 352 | FILE* fh = fopen(fil.c_str(), "rb"); 353 | check_fh(fh, fil, true); 354 | fseek(fh, 0L, SEEK_END); 355 | UINT64 file_size = ftell(fh); 356 | fseek(fh, 0L, SEEK_SET); 357 | 358 | FilerLoad::init(fh); 359 | load_info(); 360 | if (statistics) 361 | statistics_dump(); 362 | 363 | level = range_level(get_long("config.level", 2)); 364 | 365 | f_usr = usr.length() ? 
fopen(usr.c_str(), wr_flags) : stdout; 366 | check_fh(f_usr, usr); 367 | 368 | UINT64 comp_size = get_long("comp.size", 0); 369 | if (comp_size > 0 and 370 | comp_size != file_size) 371 | croak("expected compressed file size to be %lld", comp_size); 372 | 373 | decoder_version = conf.get_long("version"); 374 | if (decoder_version and 375 | decoder_version > version) 376 | croak("%s was compressed with slimfastq version %d. My version is %d. Please upgrade me before decoing", 377 | fil.c_str(), decoder_version, version); 378 | } 379 | } 380 | 381 | void Config::finit() { 382 | if (m_info_filer) { 383 | UINT64 size = FilerSave::finit_size(); 384 | set_info("comp.size", size); 385 | delete(m_info_filer); 386 | FilerSave::finit(); 387 | m_info_filer = NULL; 388 | } 389 | } 390 | 391 | Config::~Config() { 392 | finit(); 393 | 394 | #if 0 395 | TODO: 396 | if (ferror(in)) { 397 | fprintf(stderr, "Read error: %s\n", strerror(errno)); 398 | return 1; 399 | } 400 | if (ferror(out)) { 401 | fprintf(stderr, "Write error: %s\n", strerror(errno)); 402 | return 2; 403 | } 404 | #endif 405 | 406 | } 407 | -------------------------------------------------------------------------------- /recs.cpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. 
*/ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. */ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ 23 | /***********************************************************************************************************************/ 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include "recs.hpp" 30 | 31 | #define MAX_LLINE 400 // TODO: assert max is enough at constructor 32 | 33 | 34 | RecSave::RecSave() { 35 | 36 | m_valid = true; 37 | 38 | BZERO(m_last); 39 | BZERO(stats); 40 | // range_init(); 41 | 42 | filer = new FilerSave("rec"); 43 | assert(filer); 44 | rcoder.init(filer); 45 | 46 | x_file = new XFileSave("rec.x"); 47 | 48 | smap[0].len = smap[1].len = 0; 49 | } 50 | 51 | RecSave::~RecSave() { 52 | rcoder.done(); 53 | if (not conf.quiet) { 54 | } 55 | 56 | DELETE(filer); 57 | DELETE(x_file); 58 | } 59 | 60 | UCHAR* sncpy(UCHAR* target, const UCHAR* source, int n) { 61 | int i; 62 | for (i = 0 ; i < n and source[i]; i++) 63 | target[i] = source[i]; 64 | target[i] = 0; 65 | return target + i; 66 | } 67 | 68 | void RecSave::save_first_line(const UCHAR* buf, const UCHAR* end) { 69 | UCHAR first[MAX_LLINE]; 70 | sncpy(first, buf, end-buf); 71 | conf.set_info("rec.first", (const char*)first); 72 | // conf.save_info(); 73 | 74 | m_last.initilized = true; 75 | } 76 | 77 | void RecSave::put_str(UCHAR i, const UCHAR* p, UINT32 len) { 78 | ranger[i].num.put_u(&rcoder, len); 79 | for (UINT32 j = 0; j < len; j++) 80 | ranger[i].str.put(&rcoder, p[j]); 81 | } 82 | 83 | 84 | void RecSave::put_num(UCHAR i, long long num) { 85 | if (ranger[i].num.put_u(&rcoder, num)) 86 | stats.big_i ++; 87 | } 88 | 89 | void RecSave::put_type(UCHAR i, UCHAR type) { 90 | ranger[i].type.put(&rcoder, type); 91 | } 92 | 93 | RecLoad::RecLoad() { 94 | m_valid = true; 95 | // range_init(); 96 | 97 | filer = new FilerLoad("rec", &m_valid); 98 | assert(filer); 99 | rcoder.init(filer); 100 | 101 | BZERO(m_last); 102 | 103 | x_file = new XFileLoad("rec.x"); 104 | m_last.index = x_file->get(); 105 | } 106 | 107 | RecLoad::~RecLoad() { 108 | rcoder.done(); 109 | DELETE(x_file); 110 
| } 111 | 112 | size_t RecLoad::load_first_line(UCHAR* buf) { 113 | 114 | m_last.initilized = true; 115 | 116 | const char *first = conf.get_info("rec.first"); 117 | strcpy((char*)buf, first); 118 | return strlen(first); 119 | } 120 | 121 | UCHAR RecLoad::get_type(UCHAR i) { 122 | return ranger[i].type.get(&rcoder); 123 | } 124 | 125 | long long RecLoad::get_num(UCHAR i) { 126 | return ranger[i].num.get_u(&rcoder); 127 | } 128 | 129 | UCHAR* RecLoad::get_str(UCHAR i, UCHAR* p) { 130 | UINT32 len = ranger[i].num.get_u(&rcoder); 131 | for (UINT32 j = 0; j < len; j++) 132 | p[j] = ranger[i].str.get(&rcoder); 133 | return p + len; 134 | } 135 | 136 | 137 | //////////////////////////////// 138 | 139 | static bool isword(UCHAR c) { return isdigit(c) or isalpha(c);} 140 | 141 | void RecBase::map_space(const UCHAR* p, bool flip) { 142 | smap[flip].len = 0; 143 | smap[flip].off[0] = 0; 144 | for (int i = 0; ; i++) { 145 | if (not isword(p[i])) { 146 | smap[flip].wln [smap[flip].len ] = i - smap[flip].off[smap[flip].len]; 147 | smap[flip].str[smap[flip].len++] = p[i]; 148 | smap[flip].off[smap[flip].len ] = i + 1; 149 | 150 | rarely_if (p[i] == 0 or p[i] == '\n') 151 | break; 152 | 153 | rarely_if(smap[flip].len > 64) 154 | croak("ERROR: irregulal record (over 64 non alpha non digit). Is it a valid fastq file?"); 155 | } 156 | } 157 | } 158 | 159 | // Division of labor 160 | enum seg_type { 161 | // keep these values backward compatible 162 | ST_DGT = 0, // deci, greater than 163 | ST_DLT = 1, // deci, less than 164 | 165 | ST_STR = 2, // string 166 | 167 | // version 5 new values 168 | 169 | ST_HGT = 3, // hexa, greater than, 'a' - 'f' 170 | ST_HLT = 4, // hexa, less than 171 | 172 | // hex numbers with one zero preceding (rare) 173 | ST_HGT_Z = 5, // '0'.hexa, greater than, 'a' - 'f' 174 | ST_HLT_Z = 6, // '0'.hexa, less than 175 | 176 | 177 | // capital letter numbers (Yack!) 
178 | ST_HGTC = 7, // hexa, greater than, capital 'A'-'F' 179 | ST_HLTC = 8, // hexa, greater than, capital 'A'-'F' 180 | 181 | // capital letter numbers 182 | ST_HGTC_Z = 9, // '0'.hexa, greater than, capital 'A'-'F' 183 | ST_HLTC_Z = 10, // '0'.hexa, greater than, capital 'A'-'F' 184 | 185 | // very unlikely 186 | ST_DGT_Z = 11, // '0'.deci, greater than 187 | ST_DLT_Z = 12, // '0'.deci, less than 188 | 189 | }; 190 | 191 | 192 | static UCHAR numberwang(const UCHAR* p, int len, UINT64 &num, UCHAR pctype) 193 | { 194 | // return type 195 | // hex if deci & prev == hex 196 | // hex if hex 197 | // deci if deci 198 | // str else 199 | int i = 0; 200 | bool has_z = p[i] == '0'; 201 | rarely_if (has_z) 202 | if ( p[++i] == '0') // can't recreate two preceding zeros 203 | return ST_STR; 204 | 205 | UCHAR caps = 0; // 0=? 1=lower 2=caps 206 | 207 | num = 0; 208 | while (pctype != 2) { // prev weren't hex 209 | if (i >= len) 210 | return has_z ? ST_DGT_Z : ST_DGT ; 211 | 212 | if (isdigit(p[i])) { 213 | UINT64 tnum = (num<<3) + (num<<1) + (p[i++]) - '0'; 214 | if (tnum < num) return ST_STR; // too big 215 | num = tnum; 216 | continue; 217 | } 218 | // Here: not a deci 219 | 220 | if ((p[i]|0x20) < 'a' or 221 | (p[i]|0x20) > 'f') 222 | return ST_STR; 223 | 224 | // reset and try as hex 225 | caps = 1 + (p[i] < 'a'); 226 | i = has_z; 227 | num = 0; 228 | break; 229 | } 230 | 231 | if (len > 16) 232 | return ST_STR; 233 | 234 | for (; i < len; i ++) { 235 | int nibel; 236 | if (isdigit (p[i])) 237 | nibel = (p[i]) - '0'; 238 | 239 | else if (p[i] >= 'a' and p[i] <= 'f') { 240 | if (caps == 2) 241 | return ST_STR; // can't combine upper and lower 242 | caps = 1; 243 | nibel = 10 + (p[i] - 'a'); 244 | } 245 | else if (p[i] >= 'A' and p[i] <= 'F') { 246 | if (caps == 1) 247 | return ST_STR; // can't combine upper and lower 248 | caps = 2; 249 | nibel = 10 + (p[i] - 'A'); 250 | } 251 | else 252 | return ST_STR; 253 | 254 | num = (num<<4) + nibel; 255 | } 256 | 257 | // Here: 
it's an hexa 258 | return 259 | caps == 2 ? 260 | has_z ? ST_HGTC_Z : ST_HGTC : 261 | has_z ? ST_HGT_Z : ST_HGT ; 262 | } 263 | 264 | 265 | static bool is_number(const UCHAR* p, int len, long long &num) { 266 | rarely_if (*p == '0') 267 | return false; 268 | num = 0; 269 | for (int i = 0 ; i < len; i ++) 270 | if (isdigit(p[i])) 271 | num = (num<<3) + (num<<1) + (p[i]) - '0'; 272 | else 273 | return false; 274 | return true; 275 | } 276 | 277 | void RecSave::save(const UCHAR* buf, const UCHAR* end, const UCHAR* prev_buf, const UCHAR* prev_end) { 278 | 279 | rarely_if(not m_last.initilized) { 280 | imap = 0; 281 | save_first_line(buf, end); 282 | map_space(buf, imap); 283 | bzero(ctype[0], sizeof(ctype[0])); // clear cache 284 | bzero(ctype[1], sizeof(ctype[1])); // clear cache 285 | return; 286 | } 287 | bool pmap = imap; 288 | imap = imap ? 0 : 1; 289 | map_space(buf, imap); 290 | 291 | rarely_if (smap[imap].len != smap[pmap].len or 292 | memcmp(smap[imap].str, smap[pmap].str, smap[imap].len)) { 293 | // Too bad, new kind of header 294 | // must push to extension file 295 | 296 | x_file->put(g_record_count - m_last.index); 297 | m_last.index = g_record_count; 298 | x_file->put_str(buf, end-buf); 299 | bzero(ctype[imap], sizeof(ctype[imap])); // clear cache 300 | 301 | stats.new_n ++ ; 302 | stats.new_l += end-buf; 303 | return; 304 | } 305 | 306 | UINT64 map = 0; 307 | for (int i = 0; i < smap[imap].len; i ++) 308 | if ( smap[imap].wln[i] != smap[pmap].wln[i] or 309 | memcmp(buf + smap[imap].off[i], prev_buf + smap[pmap].off[i], smap[imap].wln[i])) 310 | DO_SET(map, i); 311 | 312 | put_num (0, map); // mapping of changes. 
(zero could have meant copy prev) 313 | 314 | for (int i = 0; i < smap[0].len; i++) { 315 | if (IS_SET(map, i)) { 316 | 317 | // int i = BFIRST(map)-1; 318 | // DO_CLR(map, i); 319 | const UCHAR* b = buf + smap[imap].off[i]; 320 | // const UCHAR* p = prev_buf + smap[pmap].off[i]; 321 | UINT64 bnum; 322 | 323 | UCHAR type = numberwang(b, smap[imap].wln[i], bnum, ctype[pmap][i]); 324 | if ( type == ST_STR) { 325 | put_type(i+1, type); 326 | put_str (i+1, b, smap[imap].wln[i]); 327 | stats.str_n ++ ; 328 | stats.str_l += smap[imap].wln[i]; 329 | ctype[imap][i] = 0; // NAN 330 | continue; 331 | } 332 | // HERE: this is a number 333 | UINT64 pnum = ctype[pmap][i] ? cnumb[pmap][i] : 0; 334 | UINT64 gap ; 335 | 336 | ctype[imap][i] = (type < ST_STR or type >= ST_DGT_Z) ? 1 : 2; 337 | cnumb[imap][i] = bnum; 338 | 339 | if (bnum < pnum) { 340 | gap = pnum - bnum; 341 | type ++ ; 342 | } 343 | else 344 | gap = bnum - pnum; 345 | 346 | put_type(i+1, type); 347 | put_num (i+1, gap); 348 | 349 | // if (is_number(b, smap[imap].wln[i], bnum) and 350 | // is_number(p, smap[pmap].wln[i], pnum)) { 351 | // if (bnum > pnum) { 352 | // put_type(i+1, ST_DGT); 353 | // put_num (i+1, bnum - pnum); 354 | // } 355 | // else { 356 | // put_type(i+1, ST_DLT); 357 | // put_num (i+1, pnum - bnum); 358 | // } 359 | // } 360 | // else { 361 | // put_type(i+1, ST_STR); 362 | // put_str (i+1, b, smap[imap].wln[i]); 363 | // stats.str_n ++ ; 364 | // stats.str_l += smap[imap].wln[i]; 365 | // } 366 | } 367 | else { 368 | ctype[imap][i] = ctype[pmap][i]; 369 | cnumb[imap][i] = cnumb[pmap][i]; 370 | } 371 | } 372 | } 373 | 374 | size_t RecLoad::load(UCHAR* buf, const UCHAR* prev) { 375 | rarely_if(not m_last.initilized) { 376 | comp_version = conf.decoder_version; 377 | bzero(ctype[0], sizeof(ctype[0])); // clear cache 378 | bzero(ctype[1], sizeof(ctype[1])); // clear cache 379 | imap = 0; 380 | return load_first_line(buf); 381 | } 382 | 383 | bool pmap = imap; 384 | imap = imap ? 
0 : 1; 385 | 386 | rarely_if(m_last.index == g_record_count) { 387 | 388 | UCHAR* b = x_file->get_str(buf); 389 | m_last.index += x_file->get(); 390 | bzero(ctype[imap], sizeof(ctype[imap])); // clear cache 391 | 392 | return b - buf; 393 | } 394 | 395 | map_space(prev, 0); // TODO? optimize by caching prev 396 | 397 | rarely_if (comp_version < 5) 398 | return load_pre5(buf, prev); 399 | 400 | UINT64 map = get_num(0); 401 | UCHAR* b = buf; 402 | 403 | for (int i = 0; i < smap[0].len; i++) { 404 | // if (IS_SET(last_map, i)) { 405 | if (not IS_SET(map, i)) { 406 | // b = (UCHAR*)mempcpy(b, prev + smap[0].off[i], smap[0].wln[i]); 407 | // mempcpy is not compatible with Mac OS, must do it the hard way .. 408 | UINT32 count = smap[0].wln[i] ; 409 | memcpy(b, prev + smap[0].off[i], count ); 410 | b += count; 411 | *b ++ = smap[0].str[i]; 412 | ctype[imap][i] = ctype[pmap][i]; 413 | cnumb[imap][i] = cnumb[pmap][i]; 414 | continue; 415 | } 416 | 417 | UCHAR type = get_type(i+1); 418 | 419 | if ( type == ST_STR ) { 420 | b = get_str(i+1, b); 421 | ctype[imap][i] = 0; 422 | *b ++ = smap[0].str[i]; 423 | continue; 424 | } 425 | 426 | UINT64 pval = ctype[pmap][i] == 0 ? 
0 : cnumb[pmap][i]; 427 | UINT64 gap = get_num(i+1); 428 | UINT64 val ; 429 | const char* fmt; 430 | #define ITEM(A, B) fmt=A; val=B; break 431 | switch ((seg_type)type) { 432 | 433 | case ST_DGT : ITEM("%lld", pval + gap); 434 | case ST_DLT : ITEM("%lld", pval - gap); 435 | case ST_STR : croak("WTF"); break; 436 | case ST_HGT : ITEM("%llx", pval + gap); 437 | case ST_HLT : ITEM("%llx", pval - gap); 438 | case ST_HGT_Z : ITEM("0%llx", pval + gap); 439 | case ST_HLT_Z : ITEM("0%llx", pval - gap); 440 | case ST_HGTC : ITEM("%llX", pval + gap); 441 | case ST_HLTC : ITEM("%llX", pval - gap); 442 | case ST_HGTC_Z : ITEM("0%llX", pval + gap); 443 | case ST_HLTC_Z : ITEM("0%llX", pval - gap); 444 | case ST_DGT_Z : ITEM("0%lld", pval + gap); 445 | case ST_DLT_Z : ITEM("0%lld", pval - gap); 446 | 447 | default: 448 | croak("REC: bad type value %d", type); 449 | } 450 | #undef ITEM 451 | ctype[imap][i] = (type < ST_STR or type >= ST_DGT_Z) ? 1 : 2; 452 | cnumb[imap][i] = val; 453 | if (val == 0) 454 | *b ++ = '0'; // Under certain conditions sprintf(b, '%lld', 0) was printing "00". Yack! 455 | else 456 | b += sprintf((char*)b, fmt, val); 457 | 458 | *b ++ = smap[0].str[i]; 459 | } 460 | return b-buf-1; 461 | } 462 | 463 | size_t RecLoad::load_pre5(UCHAR* buf, const UCHAR* prev) { 464 | 465 | UINT64 map = get_num(0); 466 | 467 | UCHAR* b = buf; 468 | for (int i = 0; i < smap[0].len; i++) { 469 | // if (IS_SET(last_map, i)) { 470 | if (IS_SET(map, i)) { 471 | 472 | UCHAR type = get_type(i+1); 473 | switch (type) { 474 | 475 | case ST_DGT: 476 | case ST_DLT: { 477 | // compatibility - let it open old fi 478 | long long pval; 479 | bool expect_num = is_number(prev + smap[0].off[i], smap[0].wln[i], pval); 480 | assert(expect_num); 481 | long long gap = get_num(i+1); 482 | long long val = type == ST_DGT ? 
pval + gap : pval - gap ; 483 | // if (UINT64(val) > 0xffffffff00000000) { 484 | // fprintf(stderr, "big val: 0x%llx gap=0x%llx %s pval=0x%llx\n", 485 | // val, gap, (type == ST_DGT ? "GAP" : "PAG"), pval); 486 | // // debug point 487 | // } 488 | 489 | b += sprintf((char*)b, "%lld", val); 490 | } break; 491 | 492 | case ST_STR: { 493 | b = get_str(i+1, b); 494 | } break; 495 | 496 | default: 497 | croak("REC: bad type value %d", type); 498 | } 499 | } 500 | else { 501 | // b = (UCHAR*)mempcpy(b, prev + smap[0].off[i], smap[0].wln[i]); 502 | // mempcpy is not compatible with Mac OS, must do it the hard way .. 503 | UINT32 count = smap[0].wln[i] ; 504 | memcpy(b, prev + smap[0].off[i], count ); 505 | b += count; 506 | } 507 | *b ++ = smap[0].str[i]; 508 | } 509 | return b-buf-1; 510 | } 511 | -------------------------------------------------------------------------------- /usrs.cpp: -------------------------------------------------------------------------------- 1 | /***********************************************************************************************************************/ 2 | /* This program was written by Josef Ezra */ 3 | /* Copyright (c) 2013, Infinidat */ 4 | /* All rights reserved. */ 5 | /* */ 6 | /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that */ 7 | /* the following conditions are met: */ 8 | /* */ 9 | /* Redistributions of source code must retain the above copyright notice, this list of conditions and the following */ 10 | /* disclaimer. */ 11 | /* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following */ 12 | /* disclaimer in the documentation and/or other materials provided with the distribution. */ 13 | /* Neither the name of the Infinidat nor the names of its contributors may be used to endorse or promote products */ 14 | /* derived from this software without specific prior written permission. 
*/ 15 | /* */ 16 | /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 17 | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 18 | /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */ 19 | /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ 20 | /* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, */ 21 | /* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE */ 22 | /* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 23 | /***********************************************************************************************************************/ 24 | 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "usrs.hpp" 32 | 33 | #include "gens.hpp" 34 | #include "recs.hpp" 35 | #include "qlts.hpp" 36 | 37 | UsrSave::UsrSave() { 38 | BZERO(m_last); 39 | BZERO(mp); 40 | 41 | m_valid = true ; 42 | m_cur = 0 ; 43 | m_end = 0 ; 44 | m_page_count = 0; 45 | m_llen = 0; 46 | m_qlen = 0; 47 | m_solid = false; 48 | x_llen = new XFileSave("usr.x"); 49 | x_qlen = new XFileSave("usr.x.q"); 50 | x_sgen = new XFileSave("usr.pfg"); 51 | x_sqlt = new XFileSave("usr.pfq"); 52 | 53 | x_lgen = new XFileSave("usr.lgen"); 54 | x_lqlt = new XFileSave("usr.lqlt"); 55 | x_lrec = new XFileSave("usr.lrec"); 56 | 57 | m_in = conf.file_usr(); 58 | load_page(); 59 | if (m_valid) 60 | determine_record(); 61 | } 62 | 63 | UsrSave::~UsrSave(){ 64 | if (not conf.quiet) { 65 | char b[0x100]; 66 | UINT64 long_gen=x_llen->tell(); 67 | UINT64 long_qlt=x_qlen->tell(); 68 | if (long_gen or long_qlt) { 69 | sprintf(b, "gen:%llu qlt:%llu", long_gen, long_qlt); 70 | 
conf.set_info("log.size.change", b); 71 | } 72 | UINT64 sol_gen_pf=x_sgen->tell(); 73 | UINT64 sol_qlt_pf=x_sqlt->tell(); 74 | if (sol_gen_pf or sol_qlt_pf) { 75 | sprintf(b, "gen:%llu qlt:%llu", sol_gen_pf, sol_qlt_pf); 76 | conf.set_info("log.solid.pf", b); 77 | } 78 | UINT64 os_rec=x_lrec->tell(); 79 | UINT64 os_gen=x_lgen->tell(); 80 | UINT64 os_qlt=x_lqlt->tell(); 81 | if (os_rec or os_gen or os_qlt) { 82 | sprintf(b, "rec:%llu gen:%llu qlt:%llu", os_rec, os_gen, os_qlt); 83 | conf.set_info("log.oversize", b); 84 | } 85 | } 86 | DELETE(x_llen); 87 | DELETE(x_qlen); 88 | DELETE(x_sgen); 89 | DELETE(x_sqlt); 90 | 91 | DELETE(x_lrec); 92 | DELETE(x_lgen); 93 | DELETE(x_lqlt); 94 | } 95 | 96 | void UsrSave::load_page() { 97 | 98 | size_t size = m_end - m_cur ; 99 | assert(size <= PLL_STRT); 100 | size_t start = PLL_STRT - size; 101 | for (size_t i = 0 ; i < size; i++) 102 | m_buff[start+i] = m_buff[m_cur+i]; 103 | 104 | likely_if (mp.rec != NULL) { 105 | UCHAR* pl = mp_last; 106 | for (UCHAR* p = mp.rec; p < mp.rec_end; p++ , pl++) 107 | *pl = *p; 108 | mp.rec = mp_last; 109 | mp.rec_end = pl; 110 | *pl++ = '\n'; // Just in case 111 | *pl = 0; // clean debug 112 | } 113 | 114 | size_t cnt = fread(&m_buff[PLL_STRT], 1, PLL_SIZE, m_in); 115 | m_cur = start; 116 | m_end = PLL_STRT + cnt ; 117 | 118 | if (m_cur == m_end) 119 | m_valid = false; 120 | else 121 | m_page_count++; 122 | } 123 | 124 | void UsrSave::update(exception_t type, UINT16 dat) { 125 | switch (type) { 126 | case ET_LLEN: 127 | x_llen -> put ( g_record_count - m_last.i_llen ); 128 | x_llen -> put ( dat ); 129 | m_last.i_llen = g_record_count; 130 | m_llen = dat; 131 | break; 132 | 133 | case ET_QLEN: 134 | x_qlen -> put( g_record_count - m_last.i_qlen ); 135 | x_qlen -> put( dat ); 136 | m_last.i_qlen = g_record_count; 137 | // m_qlen = dat; 138 | break; 139 | 140 | case ET_SOLPF_GEN: 141 | x_sgen->put( g_record_count - m_last.i_sgen); 142 | x_sgen->put_chr( dat ); 143 | m_last.i_sgen = 
g_record_count; 144 | m_last.solid_pf_gen = dat; 145 | break; 146 | 147 | case ET_SOLPF_QLT: 148 | x_sqlt->put( g_record_count - m_last.i_sqlt); 149 | x_sqlt->put_chr( dat ); 150 | m_last.i_sqlt = g_record_count; 151 | m_last.solid_pf_qlt = dat; 152 | break; 153 | 154 | default : assert(0); 155 | } 156 | } 157 | 158 | void UsrSave::expect(UCHAR chr) { 159 | likely_if (m_buff[m_cur++] == chr) 160 | return; 161 | 162 | croak("fastq file: expecting '%c', got '%c' after record %llu", chr, m_buff[m_cur-1], g_record_count); 163 | } 164 | 165 | bool UsrSave::mid_rec_msg() const { 166 | 167 | croak("fastq file: record seems truncated after record %llu", g_record_count); 168 | } 169 | 170 | void UsrSave::load_check() { 171 | likely_if (m_cur+PLL_STRT < m_end) 172 | return; 173 | for (int i = m_cur, cnt=0; i < m_end; i++) 174 | if (m_buff[i] == '\n') 175 | if (++cnt >= 4) 176 | return; 177 | load_page(); 178 | } 179 | 180 | UCHAR UsrSave::load_char() { 181 | if (m_cur >= m_end) 182 | load_page(); 183 | return m_buff[m_cur++]; 184 | } 185 | 186 | void UsrSave::determine_record() { 187 | 188 | int q = m_cur; 189 | 190 | rarely_if(m_cur >= m_end) { 191 | if (not conf.quiet) { 192 | if (x_lrec->tell()) 193 | fprintf(stderr, "::: HEY all records were oversized\n"); 194 | else 195 | fprintf(stderr, "::: HEY no records were found"); 196 | } 197 | return; 198 | } 199 | 200 | rarely_if(m_buff[q++] != '@') 201 | croak("first record: Missing prefix '@', is it really a fastq format?"); 202 | 203 | int sanity = MAX_ID_LLEN; 204 | while (-- sanity and m_buff[q] != '\n') 205 | q ++ ; 206 | rarely_if( not sanity) { 207 | // croak("first record: REC is too long"); 208 | g_record_count++; 209 | get_oversized_record(m_cur, false); 210 | return determine_record(); 211 | } 212 | 213 | if (m_buff[q++] != '\n') 214 | croak("first record: Expected newline, got '%c'", m_buff[q-1]); 215 | 216 | int qg = q; 217 | // LLEN 218 | for (int i = 1; 219 | i < MAX_GN_LLEN and not m_llen; 220 | i++) 221 | if 
(m_buff[q + i] == '\n') 222 | m_llen = i; 223 | 224 | if (not m_llen) { 225 | // croak("first record: GEN is too long"); 226 | g_record_count++; 227 | get_oversized_record(m_cur, false); 228 | return determine_record(); 229 | } 230 | 231 | q += m_llen + 1; 232 | if (m_buff[q] != '+') 233 | croak("first record: Missing 2nd prefix '+', is it really a fastq format?"); 234 | 235 | // 2ND ID 236 | bool has_2nd_id = false; 237 | while (m_buff[++ q ] != '\n') 238 | if (m_buff[ q ] != ' ' ) 239 | has_2nd_id = true; 240 | 241 | // SOLID 242 | bool d_solid = false; 243 | for (int i = 1; 244 | i < m_llen and not d_solid and not m_solid; 245 | i ++ ) 246 | switch (m_buff[qg + i] | 0x20) { 247 | case '0': case '1': case '2': case '3': 248 | m_solid = true; 249 | break; 250 | 251 | case 'a': case 'c' : case 'g' : case 't': 252 | d_solid = true; 253 | break; 254 | 255 | default: 256 | // TODO: what if first record is all Ns ? 257 | // - seek in other records 258 | break; 259 | } 260 | 261 | if (m_solid) { 262 | conf.set_info("usr.solid", m_solid); 263 | m_llen --; 264 | } 265 | conf.set_info("llen", m_llen); 266 | conf.set_info("usr.2id", has_2nd_id); // TODO 267 | } 268 | 269 | bool UsrSave::get_oversized_record(int cur, bool from_get) { 270 | // collect this record 271 | x_lrec->put( g_record_count - m_last.i_long); 272 | m_last.i_long = g_record_count ; 273 | 274 | m_cur = cur ; // zap back 275 | UCHAR c = load_char(); 276 | if ('@' != c) 277 | croak("record %llu: bad (long) record", g_record_count); 278 | 279 | #define CHK_VALID if (!m_valid) croak("record %llu: seems truncated", g_record_count) 280 | #define PUT_LINE(X) do { c = load_char(); X->put_chr(c); } while (c != '\n' and m_valid) 281 | 282 | PUT_LINE(x_lrec); 283 | CHK_VALID; 284 | 285 | PUT_LINE(x_lgen); 286 | CHK_VALID; 287 | 288 | PUT_LINE(x_lrec); // 2nd rec 289 | CHK_VALID; 290 | 291 | PUT_LINE(x_lqlt); 292 | CHK_VALID; 293 | 294 | #undef PUT_LINE 295 | #undef CHK_VALID 296 | if (from_get) { 297 | g_record_count 
++ ; 298 | return get_record(); 299 | } 300 | return true; 301 | } 302 | 303 | bool UsrSave::get_record() { 304 | 305 | load_check(); 306 | 307 | #define CHECK_OVERFLOW rarely_if (m_cur >= m_end) return mid_rec_msg() 308 | 309 | if (m_cur >= m_end) return m_valid = false; 310 | 311 | int currec = m_cur; 312 | expect('@'); 313 | int sanity = MAX_ID_LLEN; 314 | while (-- sanity and m_buff[m_cur] != '\n') 315 | m_cur ++ ; 316 | rarely_if( not sanity) 317 | // croak("record %llu: rec line too long", g_record_count); 318 | return get_oversized_record(currec); 319 | 320 | mp.rec_end = &(m_buff[m_cur]); 321 | CHECK_OVERFLOW ; 322 | expect('\n'); 323 | 324 | UCHAR update_solid_pf = 0; 325 | if (m_solid) { 326 | rarely_if (m_last.solid_pf_gen != m_buff[m_cur]) 327 | update_solid_pf = m_buff[m_cur]; 328 | m_cur++; 329 | } 330 | 331 | mp.gen = &(m_buff[m_cur]); 332 | const int gi = m_cur ; 333 | sanity = MAX_GN_LLEN; 334 | while (-- sanity and m_buff[m_cur] != '\n') 335 | m_cur ++; 336 | 337 | rarely_if( not sanity) 338 | return get_oversized_record(currec); 339 | 340 | rarely_if(update_solid_pf) // update only after last potential get_oversized_record 341 | update(ET_SOLPF_GEN, update_solid_pf); 342 | 343 | CHECK_OVERFLOW; 344 | rarely_if(m_llen != m_cur-gi) 345 | update(ET_LLEN, m_cur-gi); 346 | 347 | expect('\n'); 348 | expect('+' ); 349 | for (sanity = MAX_ID_LLEN; 350 | -- sanity and m_buff[m_cur] != '\n'; 351 | m_cur ++ ); 352 | CHECK_OVERFLOW; 353 | rarely_if(not sanity) 354 | croak("wierd second id at record %llu", g_record_count); 355 | 356 | expect('\n'); 357 | 358 | if (m_solid) { 359 | rarely_if (m_last.solid_pf_qlt != m_buff[m_cur]) 360 | update(ET_SOLPF_QLT, m_buff[m_cur]); 361 | 362 | m_cur++; 363 | } 364 | 365 | mp.qlt = &(m_buff[m_cur]); 366 | // m_cur += m_llen; 367 | m_qlen = 0; 368 | for (sanity = MAX_GN_LLEN; 369 | -- sanity and m_buff[m_cur+m_qlen] != '\n'; 370 | m_qlen++); 371 | 372 | rarely_if( not sanity) 373 | return get_oversized_record(currec); 374 
| 375 | rarely_if( m_qlen != m_llen) 376 | update(ET_QLEN, m_qlen); 377 | 378 | m_cur += m_qlen; 379 | CHECK_OVERFLOW; 380 | 381 | expect('\n'); 382 | 383 | mp.prev_rec = mp.rec; 384 | mp.prev_rec_end = mp.rec_end; 385 | mp.rec = &(m_buff[currec+1]); 386 | 387 | return true; 388 | 389 | #undef CHECK_OVERFLOW 390 | } 391 | 392 | int UsrSave::encode() { 393 | 394 | UINT32 sanity = conf.profiling ? 100000 : 3000000000; 395 | 396 | RecSave rec; 397 | GenSave gen; 398 | QltSave qlt; 399 | 400 | while( ++ g_record_count < sanity and get_record() ) { 401 | gen.save(mp.gen, mp.qlt, m_llen, m_qlen); 402 | rec.save(mp.rec, mp.rec_end, mp.prev_rec, mp.prev_rec_end); 403 | qlt.save(mp.qlt, m_qlen); 404 | } 405 | conf.set_info("num_records", g_record_count-1); 406 | return 0; 407 | } 408 | 409 | // load 410 | 411 | UsrLoad::UsrLoad() { 412 | BZERO(m_last); 413 | flip = 0; 414 | m_out = conf.file_usr(); 415 | m_rec[0] = '@' ; 416 | m_rep[0] = '@' ; 417 | m_2nd_rec = conf.get_long("usr.2id"); 418 | m_solid = conf.get_bool("usr.solid"); 419 | m_llen = conf.get_long("llen"); 420 | m_qlen = m_llen; 421 | 422 | if (not m_2nd_rec) { 423 | m_gen[m_llen+1] = '\n'; 424 | m_gen[m_llen+2] = '+' ; 425 | } 426 | 427 | if (m_solid) { 428 | m_gen_ptr = m_gen; 429 | m_qlt_ptr = m_qlt; 430 | m_llen_factor = 1; 431 | } 432 | else { 433 | m_gen_ptr = m_gen + 1; 434 | m_qlt_ptr = m_qlt + 1; 435 | m_llen_factor = 0; 436 | } 437 | 438 | x_llen = new XFileLoad("usr.x"); 439 | x_qlen = new XFileLoad("usr.x.q"); 440 | x_sgen = new XFileLoad("usr.pfg"); 441 | x_sqlt = new XFileLoad("usr.pfq"); 442 | 443 | m_last.i_llen = x_llen->get(); 444 | m_last.i_qlen = x_qlen->get(); 445 | m_last.i_sgen = x_sgen->get(); 446 | m_last.i_sqlt = x_sqlt->get(); 447 | 448 | x_lrec = new XFileLoad("usr.lrec"); 449 | m_last.i_long = x_lrec->get(); 450 | if (m_last.i_long) { 451 | x_lgen = new XFileLoad("usr.lgen"); 452 | x_lqlt = new XFileLoad("usr.lqlt"); 453 | } 454 | else { 455 | DELETE(x_lrec); 456 | x_lgen = x_lqlt = 
NULL; 457 | } 458 | } 459 | 460 | UsrLoad::~UsrLoad() { 461 | DELETE(x_llen); 462 | DELETE(x_qlen); 463 | DELETE(x_sgen); 464 | DELETE(x_sqlt); 465 | 466 | DELETE(x_lgen); 467 | DELETE(x_lqlt); 468 | DELETE(x_lrec); 469 | } 470 | 471 | void UsrLoad::update() { 472 | 473 | rarely_if(m_last.i_long == g_record_count) { 474 | UCHAR c = '@'; 475 | fputc(c, m_out); 476 | #define PUT_LINE(X) do { c = X->get_chr(); fputc(c, m_out); } while (c != '\n') 477 | PUT_LINE(x_lrec); 478 | PUT_LINE(x_lgen); 479 | PUT_LINE(x_lrec); 480 | PUT_LINE(x_lqlt); 481 | #undef PUT_LINE 482 | m_last.i_long += x_lrec->get(); 483 | g_record_count++; 484 | return update(); 485 | } 486 | 487 | rarely_if(m_last.i_llen == g_record_count) { 488 | m_llen = x_llen -> get(); 489 | m_qlen = m_llen; 490 | m_last.i_llen += x_llen->get(); 491 | m_gen[m_llen+1] = '\n'; 492 | m_gen[m_llen+2] = '+'; 493 | } 494 | rarely_if(m_last.i_qlen == g_record_count) { 495 | m_qlen = x_qlen -> get(); 496 | m_last.i_qlen += x_qlen->get(); 497 | } 498 | else if (m_qlen != m_llen) 499 | m_qlen = m_llen; 500 | rarely_if(m_solid and 501 | m_last.i_sgen == g_record_count) { 502 | m_last.solid_pf_gen = m_gen[0] = x_sgen->get_chr(); 503 | m_last.i_sgen += x_sgen->get(); 504 | } 505 | rarely_if(m_solid and 506 | m_last.i_sqlt == g_record_count) { 507 | m_last.solid_pf_qlt = m_qlt[0] = x_sqlt->get_chr(); 508 | m_last.i_sqlt += x_sqlt->get(); 509 | } 510 | } 511 | 512 | void UsrLoad::save() { 513 | 514 | UCHAR* p_rec = flip ? m_rep : m_rec ; 515 | flip = flip ? 
0 : 1 ; 516 | 517 | putline(p_rec, m_rec_size+1); 518 | if (m_2nd_rec) { 519 | putline(m_gen_ptr, m_llen + m_llen_factor); 520 | p_rec[0] = '+'; 521 | putline(p_rec, m_rec_size+1); 522 | p_rec[0] = '@'; 523 | } 524 | else { 525 | putline(m_gen_ptr, m_llen + m_llen_factor + 2); 526 | } 527 | putline(m_qlt_ptr, m_qlen + m_llen_factor); 528 | 529 | } 530 | 531 | void UsrLoad::putline(UCHAR* buf, UINT32 size) { 532 | buf[size++] = '\n'; 533 | size_t cnt = fwrite(buf, 1, size, m_out); 534 | 535 | rarely_if (cnt != size) 536 | croak("USR: Error writing output"); 537 | } 538 | 539 | int UsrLoad::decode() { 540 | 541 | size_t n_recs = conf.get_long("num_records"); 542 | 543 | rarely_if( ! n_recs and not m_last.i_long) 544 | croak("Zero records, what's going on?"); 545 | 546 | comp_version = conf.decoder_version; 547 | 548 | RecLoad rec; 549 | GenLoad gen; 550 | QltLoad qlt; 551 | 552 | UCHAR* b_qlt = m_qlt+1 ; 553 | UCHAR* b_gen = m_gen+1 ; 554 | 555 | while (1) { 556 | g_record_count++; 557 | update(); 558 | if (g_record_count > n_recs) 559 | break; 560 | 561 | UCHAR* b_rec = (flip ? m_rep : m_rec)+1 ; 562 | UCHAR* p_rec = (flip ? m_rec : m_rep)+1 ; 563 | 564 | m_rec_size = rec.load(b_rec, p_rec); 565 | rarely_if (not m_rec_size) 566 | croak("premature EOF - %llu records left", n_recs+1); 567 | 568 | qlt.load(b_qlt, m_qlen); 569 | gen.load(b_gen, b_qlt, m_llen, m_qlen); 570 | save(); 571 | } 572 | // sanity: verify all objects are done (by croak?) 573 | return 0; 574 | } 575 | --------------------------------------------------------------------------------