├── .gitmodules ├── CMakeLists.txt ├── Dockerfile ├── GenomeBiology_scripts.zip ├── LICENSE.txt ├── README.md ├── compile.sh ├── compileUKYDLX.sh ├── createToy.sh ├── dockerassets ├── BuildWrapper.sh ├── GroupWrapper.sh ├── binary_parallel.sh ├── build_grps.sh ├── build_grps_grps.sh ├── build_seqothello.sh ├── compress_aws_kmer.sh ├── compresult.sh ├── convert_kmer_to_bin_S3.sh ├── echo.sh ├── fetch_and_run.sh ├── makexml.sh ├── oneBinaryWrapper.sh ├── query.sh └── runmeasure.py ├── example ├── STEP1_Jellyfish.sh ├── STEP2_Binary.sh ├── STEP3_Group.sh ├── STEP_additional_1.sh ├── STEP_additional_2.sh ├── STEP_additional_3.sh ├── build_and_query.sh ├── experiments_list.10.txt ├── fq │ ├── experiment_00.R1.fastq │ ├── experiment_00.R2.fastq │ ├── experiment_01.R1.fastq │ ├── experiment_01.R2.fastq │ ├── experiment_02.R1.fastq │ ├── experiment_02.R2.fastq │ ├── experiment_03.R1.fastq │ ├── experiment_03.R2.fastq │ ├── experiment_04.R1.fastq │ ├── experiment_04.R2.fastq │ ├── experiment_05.R1.fastq │ ├── experiment_05.R2.fastq │ ├── experiment_06.R1.fastq │ ├── experiment_06.R2.fastq │ ├── experiment_07.R1.fastq │ ├── experiment_07.R2.fastq │ ├── experiment_08.R1.fastq │ ├── experiment_08.R2.fastq │ ├── experiment_09.R1.fastq │ └── experiment_09.R2.fastq ├── kmer_list.10.txt ├── transcripts.fa └── transpose.sh ├── genBuildFromJellyfishKmers.sh ├── genExample.sh ├── lib ├── CMakeLists.txt ├── args.hxx ├── socket.cpp ├── socket.h ├── threadpool.h ├── tinyxml2.cpp └── tinyxml2.h ├── manual.md ├── rebuild.sh ├── run.sh ├── scripts_submission └── scripts.tar ├── seqothlib ├── CMakeLists.txt ├── L1Node.cpp ├── L1Node.hpp ├── L2Node.cpp ├── L2Node.hpp ├── disjointset.h ├── filegrouper.hpp ├── io_helper.hpp ├── jellyfish_helper.hpp ├── oltnew.h ├── othello.h ├── othellotypes.hpp ├── util.cpp └── util.h ├── src ├── CMakeLists.txt ├── build.cc ├── checktidy.sh ├── client.cc ├── group.cc ├── preprocess.cc ├── printrates.cc ├── query.cc └── testL1Node.cpp └── test ├── 
CMakeLists.txt ├── clear.sh ├── comp.cc ├── datagen.cc ├── gendata.sh ├── test.sh ├── testTT.fa └── unit ├── CMakeLists.txt ├── main.cpp ├── testL1Node.cpp ├── testL1Node.h ├── testL2Node.cpp └── testL2Node.h /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "Jellyfish"] 2 | path = Jellyfish 3 | url = https://github.com/gmarcais/Jellyfish 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | if(NOT CMAKE_BUILD_TYPE) 3 | set(CMAKE_BUILD_TYPE Release) 4 | endif() 5 | 6 | set(CMAKE_CXX_FLAGS "-Wall -Wextra -pedantic") 7 | set(CMAKE_CXX_FLAGS_DEBUG "-g -O0") 8 | set(CMAKE_CXX_FLAGS_RELEASE "-O3") 9 | 10 | PROJECT(SeqOthello) 11 | find_package (Threads) 12 | find_package (ZLIB REQUIRED ) 13 | add_definitions(-std=gnu++11 -march=native) 14 | ADD_SUBDIRECTORY(lib) 15 | ADD_SUBDIRECTORY(seqothlib) 16 | ADD_SUBDIRECTORY(src bin) 17 | ADD_SUBDIRECTORY(test) 18 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM amazonlinux:latest 2 | RUN yum -y install which unzip aws-cli parallel procps 3 | # RUN yum -y install stress 4 | 5 | WORKDIR /tmp 6 | ENV BUILD_DEP 'zlib-devel gcc gcc-c++ automake libtool diffutils pkgconfig gettext-devel glibc-static' 7 | RUN yum makecache fast && yum -y install wget bzip2 file 8 | RUN yum -y install zlib-devel gcc gcc-c++ automake libtool diffutils pkgconfig gettext-devel glibc-static && \ 9 | cd /tmp && wget https://github.com/samtools/htslib/releases/download/1.8/htslib-1.8.tar.bz2 && tar jxf htslib-1.8.tar.bz2 && cd /tmp/htslib-1.8 && autoreconf -i && ./configure --disable-bz2 --disable-lzma && make -j8 && make install && \ 10 | cd /tmp && wget 
https://github.com/gmarcais/Jellyfish/releases/download/v2.2.10/jellyfish-2.2.10.tar.gz && tar zxf jellyfish-2.2.10.tar.gz && cd jellyfish-2.2.10 && export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig && autoreconf -i && ./configure && make -j8 && make install && \ 11 | yum -y remove zlib-devel gcc gcc-c++ automake libtool diffutils gettext-devel glibc-static && yum -y autoremove && yum -y autoremove 12 | 13 | COPY build/bin/* /bin/ 14 | COPY build/test/comp /bin/ 15 | COPY dockerassets/* /usr/local/bin/ 16 | 17 | # gem RUN gem install yaggo 18 | # COPY Jellyfish /tmp/ 19 | # ncurses-devel ncurses libtool file bzip2-devel xz-devel 20 | # download htslib, yaggo 21 | # RUN ./configure --disable-bz2 --disable-lzma && make -j8 && make install #/usr/local/lib/pkgconfig/htslib.pc 22 | 23 | # RUN wget https://github.com/samtools/samtools/releases/download/1.8/samtools-1.8.tar.bz2 && tar jxf samtools-1.8.tar.bz2 24 | # WORKDIR /tmp/samtools-1.8 25 | # RUN yum install gcc gcc-c++ 26 | # RUN ./configure && make -j4 && make install 27 | # WORKDIR /tmp 28 | 29 | # USER ec2-user 30 | # ENTRYPOINT /usr/local/bin/fetch_and_run.sh 31 | -------------------------------------------------------------------------------- /GenomeBiology_scripts.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuBioinfo/SeqOthello/68d47e0e562ba2b9405971be45862e7d5d2961b1/GenomeBiology_scripts.zip -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # SeqOthello 3 | 4 | __SeqOthello__ is an ultra-fast and memory-efficient indexing structure to support arbitrary sequence query against large collections of RNA-seq experiments. 
Taking a sequence as query input, SeqOthello returns either the total _k_-mer hits of the query sequence or the detailed presence/absence information of individual k-mers across all the indexed experiments. 5 | 6 | A preprint of the paper describing SeqOthello is available [here](https://www.biorxiv.org/content/biorxiv/early/2018/02/01/258772.full.pdf). 7 | 8 | ## SeqOthello Installation 9 | 10 | ### System Requirements 11 | __SeqOthello__ is tested on Linux platforms with the following system settings. 12 | The performance is optimized for Intel CPUs with SSE4.2 support. 13 | 14 | * cmake >= 2.8.4 15 | * gcc >= 4.9.1 16 | * zlib >= 1.2.3 17 | 18 | Ubuntu 19 | ``` 20 | sudo apt install cmake build-essential zlib1g-dev 21 | ``` 22 | Fedora 23 | ``` 24 | sudo yum install gcc-c++ cmake zlib-devel 25 | ``` 26 | 27 | Mac OS 10.12. Please install cmake using [brew](https://brew.sh/) 28 | ``` 29 | brew install cmake 30 | ``` 31 | 32 | ### Installation 33 | 34 | 1. Clone the repository: 35 | 36 | ``` 37 | git clone https://github.com/LiuBioinfo/SeqOthello.git 38 | ``` 39 | 40 | 1. Build and install: 41 | 42 | ``` 43 | cd SeqOthello/ 44 | ./compile.sh 45 | ``` 46 | 47 | After successfully building the code, you can find the __SeqOthello__ toolchain at ``build/bin/``. 48 | 49 | ``` 50 | ls build/bin 51 | ``` 52 | 53 | 54 | ## Manual 55 | 56 | ### Build SeqOthello with an example. 57 | 58 | The construction of __SeqOthello__ requires the input of a list of 59 | k-mer files, each of which contains the set of _k_-mers extracted from 60 | reads of the corresponding RNA-Seq experiment. 61 | Currently the k-mer file is generated by [Jellyfish](https://github.com/gmarcais/Jellyfish) from fasta/fastq files. 62 | 63 | For demonstration purposes, we provide an ``example/`` project including 10 simulated paired-end RNA-seq experiments for testing. 
You can use the pipeline script ``example/build_and_query.sh`` to build the __SeqOthello__ map for the sample project from the ``fastq`` files and query the example transcripts included in ``example/transcripts.fa`` 64 | 65 | 1. Extract _k_-mer count using [Jellyfish](https://github.com/gmarcais/Jellyfish) 66 | 67 | You may need to install [Jellyfish](https://github.com/gmarcais/Jellyfish) first. 68 | 69 | First, use Jellyfish to generate _k_-mer files for the experiments included in ``experiments_list.10.txt`` and save them in the temporary folder ``tmp/kmers``. This setup may take about 10 seconds. For the given example, we provide a shell script for this step, ``STEP1_Jellyfish.sh``. You may execute the script in the ``example`` folder: 70 | ``` 71 | cd example 72 | ./STEP1_Jellyfish.sh 73 | ``` 74 | 75 | 1. Convert _k_-mer files to __SeqOthello__ binary format. 76 | 77 | The second step is to convert k-mer files to SeqOthello binary format using ``PreProcess``. 78 | For the given example, we provide a shell script for this step, ``STEP2_Binary.sh``. You may execute the script in the ``example`` folder 79 | 80 | ``` 81 | ./STEP2_Binary.sh 82 | ``` 83 | 84 | 85 | 1. Make __SeqOthello__ group files 86 | In the third step, binary files are grouped by the ``Group`` tool, into small subsets for further processing. Generally, each group contains approximately 50 samples. Since we only have 10 samples in this example, we build two groups in tmp/grp/, where ``Grp_00`` contains experiments specified in ``tmp/binary_list.part00``, which corresponds to the first 5 lines of ``experiments_list.10.txt``. The remaining 5 examples are included in ``Grp_01``. We put these two filenames in ``tmp/grp_list``. 87 | 88 | For the given example, we provide a shell script for this step, ``STEP3_Group.sh``. You may execute the script in the ``example`` folder 89 | 90 | ``` 91 | ./STEP3_Group.sh 92 | ``` 93 | 94 | 95 | 1. 
Build __SeqOthello__ mapping 96 | 97 | Now, we can build the __SeqOthello__ mapping between the entire set of _k_-mers and their experiment ids using the ``Build`` tool. The indexed groups are specified by the ``--flist`` parameter; the order of the group files essentially determines the order of the experiments stored in SeqOthello. 98 | You may execute the following command in the ``example`` folder 99 | 100 | ``` 101 | mkdir -p map 102 | 103 | ../build/bin/Build --flist=tmp/grp_list --grp-folder=tmp/grp/ --out-folder=map/ 104 | 105 | ``` 106 | And then you will find the SeqOthello mapping in the ``map`` folder. The file ``map.xml`` provides the metadata of the SeqOthello structure. 107 | 108 | 109 | 1. An example of adding new experiments to existing __SeqOthello__. 110 | 111 | In some cases the users may choose to add additional experiments to an existing SeqOthello mapping. We recommend the users keep the __group files__ generated during SeqOthello construction, so that they can be reused for further reconstruction. 112 | 113 | For example, in addition to the 10 experiments, we have 3 new kmer files generated by Jellyfish. Execute this and you will find the additional example kmer files in tmp/kmers/ 114 | ``` STEP_additional_1.sh ``` 115 | 116 | Similarly, we need to convert these k-mer files into binary files and then convert the binary files to group files. Execute these commands. 
117 | ``` STEP_additional_2.sh ``` 118 | ``` STEP_additional_3.sh ``` 119 | ``` 120 | mkdir -p mapa 121 | 122 | ../build/bin/Build --flist=tmp/grp_list_a --grp-folder=tmp/grp/ --out-folder=mapa/ 123 | ``` 124 | 125 | ## Build __SeqOthello__ with custom experiments 126 | 127 | To build __SeqOthello__ with custom experiments, please use Jellyfish to generate the _k_-mer files; you may then use the script generator ``genBuildFromJellyfishKmers.sh`` 128 | 129 | ``` 130 | ./genBuildFromJellyfishKmers.sh 131 | ``` 132 | 133 | For all prompted input questions, use the default value for the example data set, or, enter custom values for custom experiments. Then three scripts will be generated and you can execute them one by one. 134 | 135 | ``` 136 | ./ConvertToBinary.sh 137 | ./MakeGroup.sh 138 | ./BuildSeqOthello.sh 139 | ``` 140 | 141 | ### Transcripts Query 142 | 143 | __SeqOthello__ ``Query`` takes ``.fa`` files as input and can generate output in the following two formats. 144 | 145 | The file ``transcripts.fa`` contains 3 human transcript sequences and is used in the following example to demonstrate query results. 146 | 147 | ``` 148 | grep "ENST" transcripts.fa 149 | >ENST00000431542 150 | >ENST00000428263 151 | >ENST00000628538 152 | ``` 153 | 154 | 1. Number of _k_-mer hits per sequence 155 | 156 | By default, for each transcript in the query, __SeqOthello__ returns the 157 | total number of _k_-mer hits of each experiment in a tab-delimited table. 158 | 159 | ``` 160 | ../build/bin/Query --map-folder=map/ \ 161 | --transcript=transcripts.fa \ 162 | --output=query.hits.txt \ 163 | --qthread=1 164 | ``` 165 | 166 | The query result has three rows, one transcript per row. The first column indicates the transcript index matching the order in ``transcripts.fa``. The rest of the columns are the query results for all the experiments in the __SeqOthello__ map. These experiments are in the order specified in the ``map.xml``. 
167 | 168 | ``` 169 | cat query.hits.txt 170 | transcript# 0 222 214 200 150 160 215 209 220 193 233 171 | transcript# 1 205 205 168 210 217 202 190 211 160 196 172 | transcript# 2 115 115 115 115 115 115 115 115 115 115 173 | ``` 174 | 175 | 176 | 2. Detailed _k_-mer hit map per query 177 | 178 | If ``--detail`` is used, __SeqOthello__ may return the detailed map regarding individual _k_-mer’s presence/absence information across all the indexed experiments. This mode is limited to one transcript per query due to the large amount of output generated. 179 | 180 | ``` 181 | head -2 transcripts.fa > ENST00000431542.fa 182 | 183 | ../build/bin/Query --map-folder=map/ \ 184 | --transcript=ENST00000431542.fa \ 185 | --output=ENST00000431542.hits.txt \ 186 | --detail \ 187 | --qthread=1 188 | ``` 189 | 190 | The output has two columns. The first column is the _k_-mer sequence and the following set of columns contains the detailed hit map across all indexed experiments. We use ``+`` and ``.`` signs to indicate whether a _k_-mer is present or absent in an indexed experiment respectively. Each column of the ``+`` and ``.`` map, from left to right, indicates the hit information for one experiment indexed in the SeqOthello. These experiments are in the order specified in the ``map.xml``. 191 | 192 | Here is an example output showing the first 10 _k_-mers in transcript ENST00000431542. 193 | 194 | ``` 195 | head -10 ENST00000431542.hits.txt 196 | GTGGGAGTCGCCACCGCACCC .......... 197 | CGTGGGAGTCGCCACCGCACC .........+ 198 | CCGTGGGAGTCGCCACCGCAC .........+ 199 | GCCGTGGGAGTCGCCACCGCA .......+.+ 200 | CGCCGTGGGAGTCGCCACCGC .......+.+ 201 | CCGCCGTGGGAGTCGCCACCG .......+.+ 202 | TCCGCCGTGGGAGTCGCCACC .......+.+ 203 | ATCCGCCGTGGGAGTCGCCAC +......+.+ 204 | CATCCGCCGTGGGAGTCGCCA +......+.+ 205 | CCATCCGCCGTGGGAGTCGCC +....+.+.+ 206 | ``` 207 | 208 | You may also use the gnu __datamash__ tool to view the transposed representation of this file. 
209 | ``` 210 | ./transpose.sh ENST00000431542.hits.txt 211 | ``` 212 | ## SeqOthello Online 213 | 214 | __SeqOthello__ also accommodates online features for small-batch queries. Online queries preload the entire index into memory prior to querying, and can be executed in approximately 0.09 seconds per transcript. 215 | 216 | Use the following command to start a server on the machine, (e.g., on TCP port 3322). The service will run as a daemon. 217 | 218 | ``` 219 | ../build/bin/Query \ 220 | --map-folder=map/ \ 221 | --start-server-port 3322 222 | ``` 223 | 224 | Open a new terminal, run the Client program for _k_-mer hit query. 225 | 226 | ``` 227 | ../build/bin/Client \ 228 | --transcript=transcripts.fa \ 229 | --output=online.query.txt \ 230 | --kmer-hit \ 231 | --port=3322 232 | ``` 233 | 234 | ## Build SeqOthello in parallel 235 | 236 | The ``PreProcess`` and ``Group`` steps in __SeqOthello__ construction can be easily parallelized. 237 | For example, with GNU Parallel, you can run the ``PreProcess`` commands for the 10 experiments: 238 | 239 | ``` 240 | cat experiments_list.10.txt | \ 241 | parallel ../build/bin/PreProcess \ 242 | --k=21 \ 243 | --cutoff=1 \ 244 | --in=tmp/kmers/{}.kmer \ 245 | --out=tmp/kmer_bins/{}.bin 246 | ``` 247 | 248 | You can build the 2 group files in parallel with: 249 | ``` 250 | parallel ../build/bin/Group \ 251 | --flist=tmp/binary_list.part{1} \ 252 | --folder=tmp/kmer_bins/ \ 253 | --output=tmp/grp/Grp_{1} \ 254 | ::: 01 02 255 | 256 | ``` 257 | ## License 258 | Please refer to LICENSE.txt. 
259 | 260 | ## Citation 261 | ``` 262 | SeqOthello: Query over RNA-seq experiments at scale 263 | Ye Yu, Jinpeng Liu, Xinan Liu, Yi Zhang, Eamonn Magner, Chen Qian, Jinze Liu 264 | bioRxiv 258772; doi: https://doi.org/10.1101/258772 265 | ``` 266 | 267 | ## Getting help 268 | For questions about running __SeqOthello__, please post to [SeqOthello Google Group](https://groups.google.com/forum/#!forum/seqothello) 269 | 270 | ## Known issues 271 | 272 | - Linux systems often have a limit on the number of files that can be opened simultaneously. When there are a large number of experiment files, the Build program of SeqOthello may hit the limit. You may set it to a larger number by using ``ulimit``, e.g., 273 | ``ulimit -nS 4096`` 274 | 275 | - The maximum size of k-mers supported as of now is 31. 276 | 277 | - Each binary file is associated with an xml file; the xml files should be put in the same folder as the binary files. Similarly, please put the associated xml files for the group files in the same folder as the group files. 278 | -------------------------------------------------------------------------------- /compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # you need to update Jellyfish in the Jellyfish folder and run ./configure in it. 3 | # then make here. 4 | rm -rf build 5 | mkdir build 6 | cd build 7 | cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER='g++' 8 | make -j4 9 | -------------------------------------------------------------------------------- /compileUKYDLX.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | module add cmake/2.8.4 3 | module add gcc/4.9.1 4 | sed '/test/d' -i CMakeLists.txt 5 | rm -rf build 6 | mkdir build 7 | cd build 8 | cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER='g++' 9 | make -j4 10 | cd .. 
11 | echo 'ADD_SUBDIRECTORY(test)' >> CMakeLists.txt 12 | -------------------------------------------------------------------------------- /createToy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p example 3 | echo "Create example set of kmer files ..." 4 | ./genExample.sh 5 | echo "Create scripts ..." 6 | echo "For the example set of kmer files, please use the default option for all the following settings ..." 7 | ./genBuildFromJellyfishKmers.sh 8 | -------------------------------------------------------------------------------- /dockerassets/BuildWrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | GrpFiles=$@ 3 | 4 | for i in $GrpFiles; do 5 | if [[ "$i" == *grp ]]; then 6 | echo $i; 7 | fi 8 | done > Grplist 9 | 10 | mkdir -p map 11 | Build --flist=Grplist --out-folder=map/ 12 | -------------------------------------------------------------------------------- /dockerassets/GroupWrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | GrpName=`echo $@ | md5sum | cut -c1-9` 3 | echo $GrpName 4 | BinFiles=$@ 5 | Binlist=${GrpName}.binlist 6 | echo $BinFiles 7 | 8 | for i in $BinFiles; do 9 | if [[ "$i" == *bin ]]; then 10 | echo $i; 11 | fi 12 | done > ${Binlist} 13 | cat ${Binlist} 14 | 15 | echo Group --flist=${Binlist} --output=${GrpName}.grp 16 | Group --flist=${Binlist} --output=${GrpName}.grp 17 | -------------------------------------------------------------------------------- /dockerassets/binary_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # A POSIX variable 4 | OPTIND=1 # Reset in case getopts has been used previously in the shell. 
5 | 6 | # Initialize our own variables: 7 | output_file="" 8 | verbose=0 9 | 10 | while getopts "t:k:" opt; do 11 | case "$opt" in 12 | t) q="$OPTARG" 13 | ;; 14 | k) k="$OPTARG" 15 | ;; 16 | esac 17 | done 18 | 19 | shift $((OPTIND-1)) 20 | 21 | [ "${1:-}" = "--" ] && shift 22 | 23 | parallel --jobs 100% oneBinaryWrapper.sh {1} $q $k ::: $@ 24 | # End of file 25 | 26 | -------------------------------------------------------------------------------- /dockerassets/build_grps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | BASENAME="${0##*/}" 4 | PATH="/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin:/usr/local/sbin" 5 | 6 | 7 | usage () { 8 | if [ "${#@}" -ne 0 ]; then 9 | echo "* ${*}" 10 | echo 11 | fi 12 | cat <&2 31 | rm -rf ${TMPDIR} 32 | exit 1 33 | } 34 | 35 | if [ -z "$3" ]; then 36 | usage "" 37 | fi 38 | 39 | S3_SRC_PREFIX=$1 40 | S3_DEST=$2 41 | S3_GRP_DESCRIPTION=$3 42 | 43 | GRPNAMEFULL=`basename ${S3_GRP_DESCRIPTION}` 44 | GRPNAME="${GRPNAMEFULL%.*}" 45 | 46 | #downloading all files 47 | echo GRPNAME $GRPNAME 48 | echo GRPNAMEFULL $GRPNAMEFULL 49 | aws s3 cp ${S3_GRP_DESCRIPTION} ${TMPDIR}/${GRPNAME}.GrpDescripton || error_exit "exit" 50 | 51 | while read -r i; do 52 | aws s3 cp ${S3_SRC_PREFIX}${i} ${TMPDIR}/${i} || error_exit "exit" 53 | aws s3 cp ${S3_SRC_PREFIX}${i}.xml ${TMPDIR}/${i}.xml || error_exit "exit" 54 | echo ${i} >> ${TMPDIR}/${GRPNAME}.SeqOGroup 55 | done < ${TMPDIR}/${GRPNAME}.GrpDescripton 56 | 57 | echo Running Group --flist=${TMPDIR}/${GRPNAME}.SeqOGroup --folder=${TMPDIR}/ --output=${TMPDIR}/${GRPNAME}.Grp 58 | Group --flist=${TMPDIR}/${GRPNAME}.SeqOGroup --folder=${TMPDIR}/ --output=${TMPDIR}/${GRPNAME}.Grp || error_exit "Error while grouping" 59 | 60 | aws s3 cp ${TMPDIR}/${GRPNAME}.Grp ${S3_DEST}${GRPNAME}.Grp || error_exit "exit" 61 | aws s3 cp ${TMPDIR}/${GRPNAME}.Grp.xml ${S3_DEST}${GRPNAME}.Grp.xml || error_exit "exit" 62 | TMPDIR="$(mktemp -d -t tmp.XXXXXXXXX)" || 
error_exit "Failed to create temp directory." 63 | 64 | rm -rf ${TMPDIR} 65 | -------------------------------------------------------------------------------- /dockerassets/build_grps_grps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | BASENAME="${0##*/}" 4 | PATH="/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin:/usr/local/sbin" 5 | 6 | 7 | usage () { 8 | if [ "${#@}" -ne 0 ]; then 9 | echo "* ${*}" 10 | echo 11 | fi 12 | cat <&2 31 | rm -rf ${TMPDIR} 32 | exit 1 33 | } 34 | 35 | if [ -z "$3" ]; then 36 | usage "" 37 | fi 38 | 39 | S3_SRC_PREFIX=$1 40 | S3_DEST=$2 41 | S3_GRP_DESCRIPTION=$3 42 | 43 | GRPNAMEFULL=`basename ${S3_GRP_DESCRIPTION}` 44 | GRPNAME="${GRPNAMEFULL%.*}" 45 | 46 | #downloading all files 47 | echo GRPNAME $GRPNAME 48 | echo GRPNAMEFULL $GRPNAMEFULL 49 | aws s3 cp ${S3_GRP_DESCRIPTION} ${TMPDIR}/${GRPNAME}.GrpDescripton || error_exit "exit" 50 | 51 | 52 | parallel aws s3 cp ${S3_SRC_PREFIX}{1} ${TMPDIR}/{1} :::: ${TMPDIR}/${GRPNAME}.GrpDescripton 53 | parallel aws s3 cp ${S3_SRC_PREFIX}{1}.xml ${TMPDIR}/{1}.xml :::: ${TMPDIR}/${GRPNAME}.GrpDescripton 54 | 55 | echo Running Group --flist=${TMPDIR}/${GRPNAME}.GrpDescripton --folder=${TMPDIR}/ --output=${TMPDIR}/${GRPNAME}.Grp --group 56 | Group --flist=${TMPDIR}/${GRPNAME}.GrpDescripton --folder=${TMPDIR}/ --output=${TMPDIR}/${GRPNAME}.Grp --group || error_exit "Error while grouping" 57 | 58 | aws s3 cp ${TMPDIR}/${GRPNAME}.Grp ${S3_DEST}${GRPNAME}.Grp || error_exit "exit" 59 | aws s3 cp ${TMPDIR}/${GRPNAME}.Grp.xml ${S3_DEST}${GRPNAME}.Grp.xml || error_exit "exit" 60 | TMPDIR="$(mktemp -d -t tmp.XXXXXXXXX)" || error_exit "Failed to create temp directory." 
61 | 62 | #if [ -z "$4" ]; then 63 | rm -rf ${TMPDIR} 64 | #fi 65 | 66 | -------------------------------------------------------------------------------- /dockerassets/build_seqothello.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | BASENAME="${0##*/}" 4 | PATH="/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin:/usr/local/sbin" 5 | 6 | #build_seqothello.sh s3://seqothellotest/grp_c_10/ s3://seqothellotest/grp/0test.Grplist s3://seqothellotest/map_10/ 7 | usage () { 8 | if [ "${#@}" -ne 0 ]; then 9 | echo "* ${*}" 10 | echo 11 | fi 12 | cat <&2 29 | exit 1 30 | } 31 | 32 | if [ -z "$3" ]; then 33 | usage "" 34 | fi 35 | 36 | S3_GRP_PREFIX=$1 37 | S3_GRP_LIST=$2 38 | S3_DEST=$3 39 | S3_MEASURE_OUTPUT=$4 40 | 41 | scheme="$(echo "${S3_GRP_LIST}" | cut -d: -f1)" 42 | if [ "${scheme}" != "s3" ]; then 43 | usage "S3_GRP_LIST must be for an S3 object; expecting URL starting with s3://" 44 | fi 45 | 46 | scheme="$(echo "${S3_GRP_PREFIX}" | cut -d: -f1)" 47 | if [ "${scheme}" != "s3" ]; then 48 | usage "S3_GRP_PREFIX must be a folder containing the grp files; expecting URL starting with s3://" 49 | fi 50 | 51 | scheme="$(echo "${S3_DEST}" | cut -d: -f1)" 52 | if [ "${scheme}" != "s3" ]; then 53 | usage "S3_DEST must be a valid S3 prefix; expecting URL starting with s3://" 54 | fi 55 | 56 | if [ "$4" ]; then 57 | scheme="$(echo "${S3_MEASURE_OUTPUT}" | cut -d: -f1)" 58 | if [ "${scheme}" != "s3" ]; then 59 | usage "S3_MEASURE_OUTPUT must be a valid S3 prefix; expecting URL starting with s3://" 60 | fi 61 | fi 62 | 63 | 64 | MAPNAMEFULL=`basename ${S3_GRP_LIST}` 65 | MAPNAME="${MAPNAMEFULL%.*}" 66 | 67 | #downloading all files 68 | echo MAPNAME $MAPNAME 69 | echo S3_MEASURE_OUTPUT $4 70 | echo S3_MEASURE_OUTPUT $S3_MEASURE_OUTPUT 71 | aws s3 cp ${S3_GRP_LIST} ${TMPDIR}/${MAPNAME}.GrpList 72 | 73 | 74 | if grep -q "grp" ${TMPDIR}/${MAPNAME}.GrpList 75 | then 76 | parallel aws s3 cp ${S3_GRP_PREFIX}{1} ${TMPDIR}/{1} :::: 
${TMPDIR}/${MAPNAME}.GrpList || error_exit "failed to download" 77 | parallel aws s3 cp ${S3_GRP_PREFIX}{1}.xml ${TMPDIR}/{1}.xml :::: ${TMPDIR}/${MAPNAME}.GrpList || error_exit "failed to download from s3" 78 | cp ${TMPDIR}/${MAPNAME}.GrpList ${TMPDIR}/${MAPNAME}.SeqOGroups 79 | else 80 | parallel aws s3 cp ${S3_GRP_PREFIX}{1}.Grp ${TMPDIR}/{1}.Grp :::: ${TMPDIR}/${MAPNAME}.GrpList || error_exit "failed to download" 81 | parallel aws s3 cp ${S3_GRP_PREFIX}{1}.Grp.xml ${TMPDIR}/{1}.Grp.xml :::: ${TMPDIR}/${MAPNAME}.GrpList || error_exit "failed to download from s3" 82 | while read -r i; do 83 | echo ${i}.Grp >> ${TMPDIR}/${MAPNAME}.SeqOGroups 84 | done < ${TMPDIR}/${MAPNAME}.GrpList 85 | fi 86 | #head -n 1 /tmp/${MAPNAME}.SeqOGroups > /tmp/${MAPNAME}.SeqOGroups.only1 87 | mkdir ${TMPDIR}/mapOut.${MAPNAME} 88 | 89 | 90 | if [ -z "$4" ]; then 91 | Build --flist=${TMPDIR}/${MAPNAME}.SeqOGroups --folder=${TMPDIR}/ --out-folder=${TMPDIR}/mapOut.${MAPNAME}/ || error_exit "failed during building Seqothello"; 92 | else 93 | CMD="Build --flist="${TMPDIR}"/"${MAPNAME}".SeqOGroups --folder="${TMPDIR}"/ --out-folder="${TMPDIR}"/mapOut."${MAPNAME}"/" 94 | echo "Running command" ${CMD} 95 | python /usr/local/bin/runmeasure.py --cmd """${CMD}""" --log ${TMPDIR}/runlog 2>${TMPDIR}/timelog 96 | aws s3 cp ${TMPDIR}/timelog ${S3_MEASURE_OUTPUT}${MAPNAME}.timelog 97 | aws s3 cp ${TMPDIR}/runlog ${S3_MEASURE_OUTPUT}${MAPNAME}.runlog 98 | fi 99 | 100 | #for i in `ls /tmp/mapOut.${MAPNAME}/`; do 101 | # echo aws s3 cp /tmp/${MAPNAME}/$i ${S3_DEST}`basename $i` 102 | # aws s3 cp /tmp/${MAPNAME}/$i ${S3_DEST}`basename $i` 103 | #done; 104 | parallel aws s3 cp ${TMPDIR}/mapOut.${MAPNAME}/{1} ${S3_DEST}${MAPNAME}/`basename {1}` ::: `ls ${TMPDIR}/mapOut.${MAPNAME}/` || error_exit "failed during upload" 105 | 106 | rm -rf ${TMPDIR} 107 | -------------------------------------------------------------------------------- /dockerassets/compress_aws_kmer.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | bname=`basename $1`; 3 | aws s3 cp $1 . 4 | tar zcvf ${bname}.tar.gz ${bname} 5 | aws s3 cp ${bname}.tar.gz $2 6 | rm -rf ${bname}* 7 | 8 | -------------------------------------------------------------------------------- /dockerassets/compresult.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | BASENAME="${0##*/}" 4 | PATH="/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin:/usr/local/sbin" 5 | 6 | 7 | usage () { 8 | if [ "${#@}" -ne 0 ]; then 9 | echo "* ${*}" 10 | echo 11 | fi 12 | cat <&2 32 | rm -rf ${TMPDIR} 33 | exit 1 34 | } 35 | 36 | 37 | N=198092 38 | parallel 'aws s3 cp s3://seqothellotest/2652comp/trans.tara{1} '${TMPDIR} ::: ` for i in {a..m}; do echo $i; done ` 39 | cat ${TMPDIR}/trans.tara* | tar xf - -C ${TMPDIR} 40 | aws s3 cp s3://seqothellotest/2652comp/$1.bin ${TMPDIR} 41 | ls ${TMPDIR} 42 | for i in `seq $2 $3`; do 43 | echo -n $i' '; 44 | comp ${TMPDIR}/$1.bin ${TMPDIR}/trans/$i 2>/dev/null | tail -n 1 45 | done > ${TMPDIR}/$1.CompResult.$2.$3 46 | 47 | aws s3 cp ${TMPDIR}/$1.CompResult.$2.$3 s3://seqothellotest/2652compResult/ 48 | 49 | rm -rf ${TMPDIR} 50 | -------------------------------------------------------------------------------- /dockerassets/convert_kmer_to_bin_S3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | BASENAME="${0##*/}" 4 | PATH="/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin:/usr/local/sbin" 5 | 6 | 7 | usage () { 8 | if [ "${#@}" -ne 0 ]; then 9 | echo "* ${*}" 10 | echo 11 | fi 12 | cat <&2 30 | rm -rf ${TMPDIR} 31 | } 32 | 33 | if [ -z "$2" ]; then 34 | usage "" 35 | fi 36 | 37 | error_exit () { 38 | echo "${BASENAME} - ${1}" >&2 39 | exit 1 40 | } 41 | 42 | S3_SRC=$1 43 | S3_DEST=$2 44 | CUTOFF=$3 45 | K=$4 46 | 47 | FNAMEFULL=`basename ${S3_SRC}` 48 | FNAME="${FNAMEFULL%.*}" 49 | echo $FNAMEFULL 50 | echo $FNAME 
51 | aws s3 cp ${S3_SRC} ${TMPDIR}/${FNAME}.kmer || error_exit "Failed to download jellyfish output "${FNAME}" from amazon S3" 52 | PreProcess --in=${TMPDIR}/${FNAME}.kmer --out=${TMPDIR}/${FNAME}.bin --cutoff=${CUTOFF} --k=${K} || error_exit "fail while converting kmer" 53 | aws s3 cp ${TMPDIR}/${FNAME}.bin.xml ${S3_DEST}${FNAME}.bin.xml || error_exit "fail uploading"${FNAME}".bin.xml" 54 | PreProcess --in=${TMPDIR}/${FNAME}.kmer --out=${TMPDIR}/${FNAME}.histo --k=${K} --histogram || error_exit "fail while converting kmer" 55 | aws s3 cp ${TMPDIR}/${FNAME}.bin ${S3_DEST}${FNAME}.bin || error_exit "fail uploading" 56 | aws s3 cp ${TMPDIR}/${FNAME}.histo ${S3_DEST}${FNAME}.histo || error_exit "fail uploading" 57 | rm -rf ${TMPDIR} 58 | 59 | -------------------------------------------------------------------------------- /dockerassets/echo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | date 3 | echo "Args: $@" 4 | env 5 | echo "This is my simple test job!." 6 | echo "jobId: $AWS_BATCH_JOB_ID" 7 | echo "jobQueue: $AWS_BATCH_JQ_NAME" 8 | echo "computeEnvironment: $AWS_BATCH_CE_NAME" 9 | date 10 | echo $@ 11 | echo "bye bye!!" 12 | 13 | -------------------------------------------------------------------------------- /dockerassets/fetch_and_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2013-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the 6 | # License. A copy of the License is located at 7 | # 8 | # http://aws.amazon.com/apache2.0/ 9 | # 10 | # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 11 | # OR CONDITIONS OF ANY KIND, express or implied. 
See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | # This script can help you download and run a script from S3 using aws-cli. 15 | # It can also download a zip file from S3 and run a script from inside. 16 | # See below for usage instructions. 17 | 18 | PATH="/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin:/usr/local/sbin" 19 | BASENAME="${0##*/}" 20 | 21 | usage () { 22 | if [ "${#@}" -ne 0 ]; then 23 | echo "* ${*}" 24 | echo 25 | fi 26 | cat <&2 48 | exit 1 49 | } 50 | 51 | 52 | if [ -z "${BATCH_FILE_S3_URL}" ]; then 53 | usage "BATCH_FILE_S3_URL not set. No object to download." 54 | fi 55 | 56 | if [ -z "${JOB_CPU_PERCENTILE}" ]; then 57 | SEQUENTIAL=1 58 | fi 59 | 60 | if [ -z "${AWS_BATCH_JOB_ARRAY_INDEX}" ]; then 61 | # usage "need to be run in aws batch job array" 62 | AWS_BATCH_JOB_ARRAY_INDEX=0 63 | TOTAL_JOBS=1 64 | fi 65 | 66 | scheme="$(echo "${BATCH_FILE_S3_URL}" | cut -d: -f1)" 67 | if [ "${scheme}" != "s3" ]; then 68 | usage "BATCH_FILE_S3_URL must be for an S3 object; expecting URL starting with s3://" 69 | fi 70 | 71 | # Check that necessary programs are available 72 | which aws >/dev/null 2>&1 || error_exit "Unable to find AWS CLI executable." 73 | 74 | 75 | # Create a temporary directory to hold the downloaded contents, and make sure 76 | # it's removed later, unless the user set KEEP_BATCH_FILE_CONTENTS. 77 | cleanup () { 78 | if [ -z "${KEEP_BATCH_FILE_CONTENTS}" ] \ 79 | && [ -n "${TMPDIR}" ] \ 80 | && [ "${TMPDIR}" != "/" ]; then 81 | rm -r "${TMPDIR}" 82 | fi 83 | } 84 | trap 'cleanup' EXIT HUP INT QUIT TERM 85 | # mktemp arguments are not very portable. We make a temporary directory with 86 | # portable arguments, then use a consistent filename within. 87 | TMPDIR="$(mktemp -d -t tmp.XXXXXXXXX)" || error_exit "Failed to create temp directory." 88 | TMPFILE="${TMPDIR}/batch-file-temp" 89 | install -m 0600 /dev/null "${TMPFILE}" || error_exit "Failed to create temp file." 

# Fetch and run a script
#
# Downloads the command list at BATCH_FILE_S3_URL into TMPFILE, cuts out the
# slice of lines that belongs to this array job (index
# AWS_BATCH_JOB_ARRAY_INDEX out of TOTAL_JOBS), writes it to TMPFILE.mypart and
# runs it: through GNU parallel when SEQUENTIAL is empty, otherwise with sh.
# Relies on error_exit, TMPFILE, SEQUENTIAL and JOB_CPU_PERCENTILE being set up
# earlier in this script.
fetch_and_run_script () {
  # Create a temporary file and download the script
  aws s3 cp "${BATCH_FILE_S3_URL}" - > "${TMPFILE}" || error_exit "Failed to download S3 script."

  # The slice arithmetic below divides by TOTAL_JOBS; fail with a clear message
  # instead of a shell arithmetic error when it is missing, non-numeric or zero.
  case "${TOTAL_JOBS}" in
    ''|*[!0-9]*) error_exit "TOTAL_JOBS must be a positive integer." ;;
  esac
  [ "${TOTAL_JOBS}" -gt 0 ] || error_exit "TOTAL_JOBS must be a positive integer."

  # grep -c '' counts every line, including a last line that has no trailing
  # newline (wc -l would not count it and that command would never be run).
  LINECOUNT=$(grep -c '' "${TMPFILE}")
  LOW_LINE=$(( LINECOUNT * AWS_BATCH_JOB_ARRAY_INDEX / TOTAL_JOBS + 1 ))
  HIGH_LINE=$(( LINECOUNT * (1 + AWS_BATCH_JOB_ARRAY_INDEX) / TOTAL_JOBS ))

  echo "${TMPFILE}" has "${LINECOUNT}" lines.
  if [ "${HIGH_LINE}" -ge "${LOW_LINE}" ]; then
    sed -n "${LOW_LINE},${HIGH_LINE}p" "${TMPFILE}" > "${TMPFILE}.mypart"
  else
    # Empty slice (more jobs than lines). sed must not be called here: for a
    # range whose end is before its start it still prints the start line, which
    # would make this job repeat a command owned by a neighbouring job.
    : > "${TMPFILE}.mypart"
  fi
  echo This is job with index $AWS_BATCH_JOB_ARRAY_INDEX in array. Execut lines ${LOW_LINE} to ${HIGH_LINE}.
  if [ -z "${SEQUENTIAL}" ]; then
    parallel --jobs "${JOB_CPU_PERCENTILE}%" :::: "${TMPFILE}.mypart"
  else
    echo "Run these commands"
    cat "${TMPFILE}.mypart"
    sh "${TMPFILE}.mypart"
  fi

  # Make the temporary file executable and run it with any given arguments
  # local script="./${1}"; shift
  # chmod u+x "${TMPFILE}" || error_exit "Failed to chmod script."
  # exec ${TMPFILE} "${@}" || error_exit "Failed to execute script."
}


# Log the CPU count and the first line of /proc/meminfo.
echo Starting Job with `nproc --all` CPUS. `head -n 1 /proc/meminfo`.
JOBS_PER_CPU=$JOB_CPU_PERCENTILE'%' 118 | 119 | # Main - dispatch user request to appropriate function 120 | fetch_and_run_script "${@}" 121 | -------------------------------------------------------------------------------- /dockerassets/makexml.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | aws s3 cp s3://seqothellotest/bin_c_10/$1.histo $1.histo 3 | CNT10=`awk -F',' '{ SUM += $2 } END { print SUM }' $1.histo` 4 | CNT15=`awk -F',' '{ if ($1>=15) SUM += $2; } END { print SUM }' $1.histo` 5 | CNT20=`awk -F',' '{ if ($1>=20) SUM += $2; } END { print SUM }' $1.histo` 6 | CNT25=`awk -F',' '{ if ($1>=25) SUM += $2; } END { print SUM }' $1.histo` 7 | CNT30=`awk -F',' '{ if ($1>=30) SUM += $2; } END { print SUM }' $1.histo` 8 | CNT40=`awk -F',' '{ if ($1>=40) SUM += $2; } END { print SUM }' $1.histo` 9 | CNT50=`awk -F',' '{ if ($1>=50) SUM += $2; } END { print SUM }' $1.histo` 10 | echo $CNT10 $CNT15 $CNT20 $CNT25 $CNT30 $CNT40 $CNT50 11 | TMPFILE=`mktemp` 12 | for i in 10 15 20 25 30 40 50; do 13 | echo '' > ${TMPFILE}.$i 14 | done 15 | echo ' ' >> ${TMPFILE}.10 16 | echo ' ' >> ${TMPFILE}.15 17 | echo ' ' >> ${TMPFILE}.20 18 | echo ' ' >> ${TMPFILE}.25 19 | echo ' ' >> ${TMPFILE}.30 20 | echo ' ' >> ${TMPFILE}.40 21 | echo ' ' >> ${TMPFILE}.50 22 | for i in 10 15 20 25 30 40 50; do 23 | echo '' >> ${TMPFILE}.$i 24 | aws s3 cp ${TMPFILE}.$i s3://seqothellotest/bin_c_$i/$1.bin.xml 25 | done 26 | 27 | -------------------------------------------------------------------------------- /dockerassets/oneBinaryWrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FNAMEFULL=$1 4 | CUTOFF=$2 5 | K=$3 6 | BASENAME=`basename $1` 7 | FNAME="${BASENAME%.*}" 8 | echo PreProcess --in=$1 --out=${FNAME}.bin --cutoff=${CUTOFF} --k=${K} || error_exit "fail while converting kmer" 9 | PreProcess --in=$1 --out=${FNAME}.bin --cutoff=${CUTOFF} --k=${K} || error_exit "fail while 
converting kmer" 10 | 11 | -------------------------------------------------------------------------------- /dockerassets/query.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | BASENAME="${0##*/}" 4 | PATH="/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin:/usr/local/sbin" 5 | 6 | #build_seqothello.sh s3://seqothellotest/grp_c_10/ s3://seqothellotest/grp/0test.Grplist s3://seqothellotest/map_10/ 7 | usage () { 8 | if [ "${#@}" -ne 0 ]; then 9 | echo "* ${*}" 10 | echo 11 | fi 12 | cat <&2 28 | } 29 | 30 | if [ -z "$3" ]; then 31 | usage "" 32 | fi 33 | 34 | S3_SEQOTHELLO_PATH=$1 35 | S3_TRANSCRIPT=$2 36 | S3_RESULT=$3 37 | QTHREAD=$4 38 | S3_MEASURE_OUTPUT=$5 39 | 40 | if [ -z "${QTHREAD}" ]; then 41 | QTHREAD=4 42 | fi 43 | 44 | TMPDIR="$(mktemp -d -t tmp.XXXXXXXXX)" || error_exit "Failed to create temp directory." 45 | 46 | MAPNAMEFULL=`basename ${S3_SEQOTHELLO_PATH}` 47 | MAPNAME="${MAPNAMEFULL%.*}" 48 | 49 | #downloading all files 50 | echo MAPNAME $MAPNAME 51 | #aws s3 sync ${S3_SEQOTHELLO_PATH} ${TMPDIR} 52 | aws s3 ls ${S3_SEQOTHELLO_PATH} | cut -c32- > ${TMPDIR}/flist 53 | parallel aws s3 cp ${S3_SEQOTHELLO_PATH}{1} ${TMPDIR}/ :::: ${TMPDIR}/flist 54 | aws s3 cp ${S3_TRANSCRIPT} ${TMPDIR}/transcript 55 | CMD="Query --map-folder="${TMPDIR}"/ --transcript="${TMPDIR}"/transcript --qthread="${QTHREAD}" --output "${TMPDIR}"/result" 56 | python /usr/local/bin/runmeasure.py --cmd """${CMD}""" --log ${TMPDIR}/runlog 2>${TMPDIR}/timelog 57 | tar czf ${TMPDIR}/result.tar.gz ${TMPDIR}/result 58 | if [ "${S3_MEASURE_OUTPUT}" ]; then 59 | aws s3 cp ${TMPDIR}/timelog ${S3_MEASURE_OUTPUT}${MAPNAME}.${QTHREAD}.timelog 60 | aws s3 cp ${TMPDIR}/runlog ${S3_MEASURE_OUTPUT}${MAPNAME}.${QTHREAD}.runlog 61 | fi 62 | 63 | aws s3 cp ${TMPDIR}/result.tar.gz ${S3_RESULT}${MAPNAME}.${QTHREAD}.tar.gz 64 | rm -rf ${TMPDIR} 65 | -------------------------------------------------------------------------------- 
/dockerassets/runmeasure.py: -------------------------------------------------------------------------------- 1 | import time 2 | import threading 3 | import datetime as DT 4 | import logging 5 | import argparse 6 | import subprocess 7 | logger = logging.getLogger(__name__) 8 | 9 | logging.basicConfig(level=logging.DEBUG, 10 | format='[%(asctime)s %(threadName)s] %(message)s', 11 | datefmt='%H:%M:%S') 12 | 13 | 14 | def do_work(id, stop): 15 | logger.info("starting to check subprocess {}".format(id)) 16 | logger.info(subprocess.check_output(['ps', '-p', '{}'.format(id), '-o' ,'rss,pid,cmd,pcpu'])) 17 | while True: 18 | try: 19 | logger.info(subprocess.check_output(['ps', '-p', '{}'.format(id), '-o' ,'rss,pid,pcpu','h'])) 20 | except subprocess.CalledProcessError: 21 | pass 22 | if stop(): 23 | logger.info("I am thread watching on pid {}. stopping".format(id)) 24 | break 25 | time.sleep(1) 26 | 27 | 28 | 29 | if __name__ == '__main__': 30 | parser = argparse.ArgumentParser('run and measure memory') 31 | parser.add_argument('--cmd', nargs = 1, required = True) 32 | parser.add_argument('--log', nargs = 1, required = True) 33 | arg= parser.parse_args() 34 | print(arg) 35 | stop_threads = False 36 | workers = [] 37 | f = open(arg.log[0], "w") 38 | p = subprocess.Popen(arg.cmd[0].split(' '), stdout=f) 39 | tmp = threading.Thread(target=do_work, args=(p.pid, lambda: stop_threads)) 40 | print(p.pid) 41 | workers.append(tmp) 42 | tmp.start() 43 | p.wait() 44 | stop_threads = True 45 | for worker in workers: 46 | worker.join() 47 | 48 | print('Finis.') 49 | 50 | 51 | -------------------------------------------------------------------------------- /example/STEP1_Jellyfish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p tmp/kmers 4 | while IFS=' ' read -r exp; 5 | do 6 | jellyfish count -s 10M \ 7 | -m 21 -C -t 2 -o tmp/kmers/${exp}.jf \ 8 | fq/${exp}.R1.fastq \ 9 | fq/${exp}.R2.fastq; 10 | # -s [10M] Bloom 
filter size used in Jellyfish. You may need to use larger values for real experiments. 11 | # -m 21 Length of kmers 12 | 13 | jellyfish dump -t -L 3 \ 14 | -c tmp/kmers/${exp}.jf \ 15 | -o tmp/kmers/${exp}.kmer; 16 | 17 | # rm tmp/kmers/${exp}.jf; 18 | done < experiments_list.10.txt 19 | -------------------------------------------------------------------------------- /example/STEP2_Binary.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p tmp/kmer_bins 4 | 5 | while IFS=' ' read -r exp 6 | do 7 | ../build/bin/PreProcess \ 8 | --k=21 \ 9 | --cutoff=1 \ 10 | --in=tmp/kmers/${exp}.kmer \ 11 | --out=tmp/kmer_bins/${exp}.bin 12 | 13 | done < experiments_list.10.txt 14 | -------------------------------------------------------------------------------- /example/STEP3_Group.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | #Create the group list for the binary files. 5 | head -n 5 experiments_list.10.txt | sed "s/$/\.bin/g" \ 6 | > tmp/binary_list.part00 7 | 8 | tail -n 5 experiments_list.10.txt | sed "s/$/\.bin/g" \ 9 | > tmp/binary_list.part01 10 | #Build occurrence map for the group. 
11 | mkdir -p tmp/grp 12 | 13 | ../build/bin/Group \ 14 | --flist=tmp/binary_list.part00 \ 15 | --folder=tmp/kmer_bins/ \ 16 | --output=tmp/grp/Grp_00 17 | 18 | ../build/bin/Group \ 19 | --flist=tmp/binary_list.part01 \ 20 | --folder=tmp/kmer_bins/ \ 21 | --output=tmp/grp/Grp_01 22 | 23 | echo Grp_00 > tmp/grp_list 24 | echo Grp_01 >> tmp/grp_list 25 | 26 | -------------------------------------------------------------------------------- /example/STEP_additional_1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in 0 1 2; do 3 | head -n 1024 tmp/kmers/experiment_0$i.kmer > tmp/kmers/experiment_a$i.kmer; 4 | done 5 | 6 | -------------------------------------------------------------------------------- /example/STEP_additional_2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p tmp/kmer_bins 4 | ls -p tmp/kmers/experiment_a*.kmer | xargs -n 1 basename | cut -d. -f1 | 5 | while IFS=' ' read -r exp 6 | do 7 | ../build/bin/PreProcess \ 8 | --k=21 \ 9 | --cutoff=1 \ 10 | --in=tmp/kmers/${exp}.kmer \ 11 | --out=tmp/kmer_bins/${exp}.bin 12 | 13 | done 14 | -------------------------------------------------------------------------------- /example/STEP_additional_3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Create the group list for the binary files. 4 | ls -p tmp/kmer_bins/experiment_a*.bin | xargs -n 1 basename > tmp/binary_list.parta 5 | 6 | #Build occurrence map for the group. 
7 | 8 | ../build/bin/Group \ 9 | --flist=tmp/binary_list.parta \ 10 | --folder=tmp/kmer_bins/ \ 11 | --output=tmp/grp/Grp_a 12 | 13 | cp tmp/grp_list tmp/grp_list_a 14 | echo Grp_a >> tmp/grp_list_a 15 | 16 | -------------------------------------------------------------------------------- /example/build_and_query.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Experiments configurations 4 | SEQOTHELLO="../build/bin" 5 | THREADS=8 6 | KMER_LEN=21 7 | FQ_DIR="fq" 8 | EXP_LIST="experiments_list.10.txt" 9 | KMER_DIR="kmers" 10 | KMER_COUNT_TH=1 11 | TMP='tmp' 12 | BINS_DIR="${TMP}/kmer_bins" 13 | GRP_DIR="${TMP}/grp" 14 | MAP_DIR="map" 15 | N_PER_GROUPS=10 16 | 17 | if [ -d ${TMP} ] 18 | then 19 | rm -rf ${TMP} 20 | fi 21 | 22 | mkdir -p ${KMER_DIR} ${BINS_DIR} ${GRP_DIR} ${MAP_DIR} 23 | 24 | 25 | echo "1.Generating k-mer file for each experiment, might takes a while..." 26 | while IFS= read -r exp 27 | do 28 | if [ -e ${KMER_DIR}/${exp}.${KMER_COUNT_TH}.kmer ] 29 | then 30 | echo " $exp kmer file exists, will skip" 31 | else 32 | echo " process $exp" 33 | jellyfish count -m 21 -s 1000M -C -t ${THREADS} \ 34 | -o ${KMER_DIR}/${exp}.jf \ 35 | ${FQ_DIR}/${exp}.R1.fastq ${FQ_DIR}/${exp}.R2.fastq 36 | 37 | jellyfish dump -t -L ${KMER_COUNT_TH} \ 38 | -c ${KMER_DIR}/${exp}.jf \ 39 | -o ${KMER_DIR}/${exp}.${KMER_COUNT_TH}.kmer 40 | rm ${KMER_DIR}/${exp}.jf 41 | fi 42 | 43 | done < ${EXP_LIST} 44 | 45 | echo "2.Converting k-mer files to binary" 46 | while IFS= read -r exp 47 | do 48 | ${SEQOTHELLO}/PreProcess --k=${KMER_LEN} \ 49 | --cutoff=${KMER_COUNT_TH} \ 50 | --in=${KMER_DIR}/${exp}.${KMER_COUNT_TH}.kmer \ 51 | --out=${BINS_DIR}/${exp}.bin \ 52 | 53 | 54 | 55 | done < ${EXP_LIST} 56 | 57 | echo "3. 
Create Group files" 58 | 59 | 60 | sed "s/$/\.bin/g" ${EXP_LIST} | \ 61 | split -d -l ${N_PER_GROUPS} - ${TMP}/binary_list.part 62 | 63 | for blist in $(ls -m1 ${TMP}/binary_list.part*); do 64 | ${SEQOTHELLO}/Group --flist=${blist} \ 65 | --folder=${BINS_DIR}/ \ 66 | --output=${GRP_DIR}/Grp_"${blist##*.}" 67 | 68 | echo Grp_${blist##*.} >> ${TMP}/grp_list 69 | done 70 | 71 | echo "4. Build SeqOthello" 72 | ${SEQOTHELLO}/Build \ 73 | --flist=${TMP}/grp_list \ 74 | --folder=${GRP_DIR}/ \ 75 | --out-folder=${MAP_DIR}/ 76 | 77 | echo "5. Query transcripts" 78 | 79 | ${SEQOTHELLO}/Query \ 80 | --map-folder=${MAP_DIR}/ \ 81 | --transcript=transcripts.fa \ 82 | --output=query_results.txt 83 | 84 | -------------------------------------------------------------------------------- /example/experiments_list.10.txt: -------------------------------------------------------------------------------- 1 | experiment_00 2 | experiment_01 3 | experiment_02 4 | experiment_03 5 | experiment_04 6 | experiment_05 7 | experiment_06 8 | experiment_07 9 | experiment_08 10 | experiment_09 11 | -------------------------------------------------------------------------------- /example/kmer_list.10.txt: -------------------------------------------------------------------------------- 1 | experiment_00.kmer 2 | experiment_01.kmer 3 | experiment_02.kmer 4 | experiment_03.kmer 5 | experiment_04.kmer 6 | experiment_05.kmer 7 | experiment_06.kmer 8 | experiment_07.kmer 9 | experiment_08.kmer 10 | experiment_09.kmer 11 | -------------------------------------------------------------------------------- /example/transcripts.fa: -------------------------------------------------------------------------------- 1 | >ENST00000431542 2 | GGGTGCGGTGGCGACTCCCACGGCGGATGGCTGGGTCGAGAGCACATCTTTATTTTCTCTCTAGATCATTCTTCTTCCTATTTTGATTTGAGAAAGGGAACGTGAGAATTAGGTCAACCAGAGTGTTTTCCCAGGTGTTGAAAGGCTCAGGGTGCTCGCCCACATCAGTGAGGGAGGTGCAGCTGATGAAGGAAGTGCTGGGTGCAGAAGGGGATGCTCTGAAATGGTGGCTTCTGTTTGCAGTTTCAAAGCTCAG 3 | >ENST00000428263 4 
| TGTTTTTGTTTTTTTTGAGACAGGGTCTTGTTCTGCTGTCCAGGCTGGAGTACAGTGGCACAATAATGGCTCACTGCAGCCTTGATCTCCTGGGCTCAAGTGATCCTCCTGCCTCAGCCTCCCAAGTATCCAAGACTACAGGAACCAAAGAAAGATCTTGCATTTTCTAATGAATTCTGAGCCCACAGTACAACTCATCTTTGGAAATCACTGCTCCAGAATCCAAAGCATGGTGCTCAGGTACT 5 | >ENST00000628538 6 | TGTGTGCAAGCATGTGTGTATGCATGTATACGTGTATACATCTGCATGTATTGCATGCTTATGCATGAGCATGCATGTATATGTACATGTATGCGTGTGCATGTATATGTGCACATGTGTGTACATGTGCTCACA 7 | -------------------------------------------------------------------------------- /example/transpose.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sed $1 -e 's/$/\^/g' | fold -w1 | paste -sd' ' | sed 's/\^/\n/g' | sed "1 s|^| |" | sed '$ d' | datamash transpose -t' ' 3 | -------------------------------------------------------------------------------- /genBuildFromJellyfishKmers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TOOLCHAIN_PATH_DEFAULT=`pwd`/build/bin 4 | read -e -p "Please enter the folder contains SeqOthello toolchain [default: $TOOLCHAIN_PATH_DEFAULT]: " INPUT 5 | TOOLCHAIN_PATH="${INPUT:-$TOOLCHAIN_PATH_DEFAULT}" 6 | while [ ! -f ${TOOLCHAIN_PATH}/Build ]; do 7 | echo ${TOOLCHAIN_PATH}'/Build not found.' 8 | read -e -p "Please enter the folder contains SeqOthello toolchain [default: $TOOLCHAIN_PATH_DEFAULT]: " INPUT 9 | TOOLCHAIN_PATH="${INPUT:-$TOOLCHAIN_PATH_DEFAULT}" 10 | done 11 | 12 | KMER_PATH_DEFAULT=`pwd`/example/kmer 13 | read -e -p "Please enter the folder that contains all Jellyfish generated Kmer files [default: $KMER_PATH_DEFAULT]: " INPUT 14 | KMER_PATH="${INPUT:-$KMER_PATH_DEFAULT}" 15 | while [ ! -d ${KMER_PATH} ]; do 16 | echo ${KMER_PATH} 'not found.' 
17 | read -e -p "Please enter the folder that contains all Jellyfish generated Kmer files [default: $KMER_PATH_DEFAULT]: " INPUT 18 | KMER_PATH="${INPUT:-$KMER_PATH_DEFAULT}" 19 | done 20 | 21 | KMER_FLIST_DEFAULT=${KMER_PATH}/flist 22 | read -e -p "Please enter a file that contains all filenames of the Kmer files [default: ${KMER_FLIST_DEFAULT}]: " INPUT 23 | KMER_FLIST="${INPUT:-$KMER_FLIST_DEFAULT}" 24 | 25 | while [ ! -f ${KMER_FLIST} ]; do 26 | echo ${KMER_FLIST} 'does not exist.' 27 | read -e -p "Please enter a file that contains all filenames of the Kmer files [default: ${KMER_FLIST_DEFAULT}]: " INPUT 28 | KMER_FLIST="${INPUT:-$KMER_FLIST_DEFAULT}" 29 | done 30 | 31 | read -e -p "Where can we keep some temporary files? [default: `pwd`/example]: " INPUT 32 | TEMP_FOLDER="${INPUT:-`pwd`/example}" 33 | 34 | if [ ! -d ${TEMP_FOLDER}/bin/ ]; then 35 | echo 'folder' ${TEMP_FOLDER}'/bin/ does not exist, creating.' 36 | mkdir -p ${TEMP_FOLDER}/bin 37 | fi 38 | 39 | 40 | read -e -p "Please enter the value of k [default : 20]: " INPUT 41 | k=${INPUT:-20} 42 | 43 | NUM_CONVERTED=0 44 | CONVERT_TO_BINARY=example/ConvertToBinary.sh 45 | BINARY_LIST=example/BinaryList 46 | BINARY_LIST_PREFIX=BinaryList.Part. 47 | 48 | if [ -f ${BINARY_LIST} ]; then 49 | echo ${BINARY_LIST} 'exists. rebuilding this file.' 50 | rm -rf ${BINARY_LIST}* 51 | fi 52 | if [ -f ${CONVERT_TO_BINARY} ]; then 53 | echo ${CONVERT_TO_BINARY} 'exists. rebuilding this file.' 54 | rm -rf ${CONVERT_TO_BINARY} 55 | fi 56 | 57 | for kmerfile in `cat ${KMER_FLIST}`; do 58 | binfile="${kmerfile%.*}".Bin 59 | echo ${TOOLCHAIN_PATH}/PreProcess --in=${KMER_PATH}/${kmerfile} --out=${TEMP_FOLDER}/bin/$binfile --k=${k} >> ${CONVERT_TO_BINARY} 60 | echo $binfile >> ${BINARY_LIST} 61 | ((NUM_CONVERTED++)) 62 | done 63 | 64 | echo 'Step 1: Convert the kmer files to binaryKmer files. 
' 65 | echo ' Prepared the script of converting '${NUM_CONVERTED}' kmer files' as ${CONVERT_TO_BINARY} 66 | echo ' These binaryKmer files are listed in ' ${BINARY_LIST} 67 | 68 | echo 'Step 2: Group the binaryKmer files.' 69 | echo ${TEMP_FOLDER} 70 | FILE_PER_GROUP=30 71 | while [ $((FILE_PER_GROUP * FILE_PER_GROUP)) -le $NUM_CONVERTED ] ; do 72 | ((FILE_PER_GROUP++)) 73 | done 74 | 75 | 76 | split -l ${FILE_PER_GROUP} -d ${BINARY_LIST} ${TEMP_FOLDER}/${BINARY_LIST_PREFIX} 77 | echo ' Each group contains at most '$FILE_PER_GROUP 'files.' 78 | echo ' These group description files are' ${TEMP_FOLDER}/${BINARY_LIST_PREFIX}'*' 79 | 80 | if [ ! -d ${TEMP_FOLDER}/grp ]; then 81 | echo 'folder' ${TEMP_FOLDER}'/grp/ does not exist, creating.' 82 | mkdir -p ${TEMP_FOLDER}/grp 83 | fi 84 | 85 | MAKE_GROUP=example/MakeGroup.sh 86 | if [ -f ${MAKE_GROUP} ]; then 87 | echo ${MAKE_GROUP} 'exists. rebuilding this file.' 88 | rm -rf ${MAKE_GROUP} 89 | fi 90 | 91 | GRPLIST=`pwd`/example/GrpList 92 | if [ -d ${GRPLIST} ]; then 93 | echo ${GRPLIST} 'exists.' rebuilding this file. 94 | rm -rf ${GRPLIST} 95 | fi 96 | GRP_CONVERTED=0 97 | echo ${TEMP_FOLDER} 98 | for flist in `ls -m1 ${TEMP_FOLDER}/${BINARY_LIST_PREFIX}*`; do 99 | echo ${TOOLCHAIN_PATH}/Group --flist=$flist --folder=${TEMP_FOLDER}/bin/ --output=${TEMP_FOLDER}/grp/Grp"${flist##*.}" >> ${MAKE_GROUP} 100 | echo Grp"${flist##*.}" >> ${GRPLIST} 101 | ((GRP_CONVERTED++)) 102 | done 103 | 104 | echo ' Prepared the script of making '${GRP_CONVERTED}' groups' as ${MAKE_GROUP} 105 | echo ' These group files are listed in ' ${GRPLIST} 106 | echo ' These group are located in in ' ${TEMP_FOLDER}/grp 107 | 108 | echo 'Step 3: Build the SeqOthello structure' 109 | 110 | BUILD_SCRIPT=example/BuildSeqOthello.sh 111 | 112 | EXPORT_FOLDER_DEFAULT=`pwd`/example/out 113 | read -e -p "Please enter the folder to put the SeqOthello files. 
[default: $EXPORT_FOLDER_DEFAULT]: " INPUT 114 | EXPORT_FOLDER="${INPUT:-$EXPORT_FOLDER_DEFAULT}" 115 | 116 | if [ ! -d ${EXPORT_FOLDER} ]; then 117 | echo 'folder' ${EXPORT_FOLDER} 'does not exist, creating.' 118 | mkdir -p ${EXPORT_FOLDER} 119 | fi 120 | 121 | while [ ! -d ${KMER_PATH} ]; do 122 | echo ${KMER_PATH} 'not found.' 123 | read -e -p "Please enter the folder that contains all Jellyfish generated Kmer files [default: $KMER_PATH_DEFAULT]: " INPUT 124 | done 125 | echo ${TOOLCHAIN_PATH}/Build --flist=${GRPLIST} --folder=${TEMP_FOLDER}/grp/ --out-folder=$EXPORT_FOLDER/ '>' $EXPORT_FOLDER/Build.log > ${BUILD_SCRIPT} 126 | 127 | chmod +x ${CONVERT_TO_BINARY} ${BUILD_SCRIPT} ${MAKE_GROUP} 128 | echo 129 | echo 'Summary: Generated the following three scripts to build the SeqOthello structure' 130 | echo ' 1. ' ${CONVERT_TO_BINARY} ': ' `wc ${CONVERT_TO_BINARY} -l | cut -d' ' -f1` 'lines.' 131 | echo ' 2. ' ${MAKE_GROUP} ': ' `wc ${MAKE_GROUP} -l | cut -d' ' -f1` 'lines.' 132 | echo ' 3. ' ${BUILD_SCRIPT} ': ' `wc ${BUILD_SCRIPT} -l | cut -d' ' -f1` 'lines.' 133 | echo 134 | echo ' Please run these three scripts one by one, for example, using the following commands. ' 135 | echo ' # ./'${CONVERT_TO_BINARY} 136 | echo ' # ./'${MAKE_GROUP} 137 | echo ' # ./'${BUILD_SCRIPT} 138 | echo 139 | echo ' Note: the commands within each script can be executed in parallel.' 
140 | echo ' For example, using GNU Parallel, you can run the scripts as:' 141 | echo ' # cat '${CONVERT_TO_BINARY}' | parallel -j 16' 142 | echo ' # cat '${MAKE_GROUP}' | parallel -j 16' 143 | echo ' # ./'${BUILD_SCRIPT} 144 | 145 | -------------------------------------------------------------------------------- /genExample.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | KMER_DIR=example/kmer 3 | CURR_DIR=`pwd` 4 | mkdir -p $KMER_DIR 5 | cd $KMER_DIR 6 | $CURR_DIR/build/test/datagen -f 182 -k 15300 > genlog 7 | echo '>xxxxxxxxx|yyyyyyyyyy' > test.fa 8 | tail -n 1 genlog >> test.fa 9 | for i in {0..181}; do echo F$i.Kmer ; done > flist 10 | cd $CURR_DIR 11 | -------------------------------------------------------------------------------- /lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | SET(LibTinyXML_SRC tinyxml2.cpp) 2 | ADD_LIBRARY(tinyxml2 STATIC ${LibTinyXML_SRC}) 3 | 4 | SET(LibTCP_SRC socket.cpp) 5 | ADD_LIBRARY(smalltcp STATIC ${LibTCP_SRC}) 6 | 7 | -------------------------------------------------------------------------------- /lib/socket.cpp: -------------------------------------------------------------------------------- 1 | // This file is a part of SeqOthello. 
Please refer to LICENSE.TXT for the LICENSE 2 | #include "socket.h" 3 | 4 | #ifdef WIN32 5 | #include 6 | #else 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #endif 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | using namespace std; 20 | 21 | #define THROW_IF(val) if ((val)) throw runtime_error("error in " + string(__FUNCTION__)) 22 | #define THROW_IF_MSG(msg,val) if ((val)) throw runtime_error("error in " + string(__FUNCTION__) + ":" + (msg)) 23 | 24 | #ifdef WIN32 25 | static bool initialized = false; 26 | #endif 27 | 28 | Socket::Socket(int type, int protocol) 29 | { 30 | #ifdef WIN32 31 | if (!initialized) { 32 | WORD wVersionRequested; 33 | WSADATA wsaData; 34 | wVersionRequested = MAKEWORD(2, 0); // Request WinSock v2.0 35 | THROW_IF_MSG("fail to connect", (WSAStartup(wVersionRequested, &wsaData) != 0)); 36 | initialized = true; 37 | } 38 | #endif 39 | THROW_IF_MSG("fail to connect", (sockDesc = socket(PF_INET, type, protocol)) < 0); 40 | } 41 | 42 | Socket::Socket(int sockDesc) 43 | { 44 | this->sockDesc = sockDesc; 45 | } 46 | 47 | Socket::~Socket() 48 | { 49 | #ifdef WIN32 50 | ::closesocket(sockDesc); 51 | #else 52 | ::close(sockDesc); 53 | #endif 54 | sockDesc = -1; 55 | } 56 | 57 | string Socket::getLocalAddr() 58 | { 59 | sockaddr_in addr; 60 | unsigned int addr_len = sizeof(addr); 61 | THROW_IF((getsockname(sockDesc, (sockaddr *) &addr, (socklen_t *) &addr_len) < 0)); 62 | return inet_ntoa(addr.sin_addr); 63 | } 64 | 65 | unsigned short Socket::getLocalPort() 66 | { 67 | sockaddr_in addr; 68 | unsigned int addr_len = sizeof(addr); 69 | THROW_IF(getsockname(sockDesc, (sockaddr *) &addr, (socklen_t *) &addr_len) < 0); 70 | return ntohs(addr.sin_port); 71 | } 72 | 73 | void Socket::setLocalPort(unsigned short localPort) 74 | { 75 | sockaddr_in localAddr; 76 | memset(&localAddr, 0, sizeof(localAddr)); 77 | localAddr.sin_family = AF_INET; 78 | localAddr.sin_addr.s_addr = htonl(INADDR_ANY); 79 | 
localAddr.sin_port = htons(localPort); 80 | THROW_IF_MSG("fail to bind port", (bind(sockDesc, (sockaddr *) &localAddr, sizeof(sockaddr_in)) < 0)); 81 | } 82 | 83 | void TCPSocket::connect(const string &foreignAddr, 84 | unsigned short port) 85 | { 86 | sockaddr_in destAddr; 87 | memset(&destAddr, 0, sizeof(destAddr)); 88 | destAddr.sin_family = AF_INET; 89 | hostent *host; 90 | THROW_IF_MSG( "Fail to resolve name"+ foreignAddr,((host = gethostbyname(foreignAddr.c_str())) == NULL)); 91 | destAddr.sin_addr.s_addr = *((unsigned long *) host->h_addr_list[0]); 92 | destAddr.sin_port = htons(port); 93 | THROW_IF_MSG("Fail to connect to "+foreignAddr,::connect(sockDesc, (sockaddr *) &destAddr, sizeof(destAddr)) < 0); 94 | } 95 | 96 | void TCPSocket::send(const void *buffer, int bufferLen) 97 | { 98 | THROW_IF(::send(sockDesc, (void *) buffer, bufferLen, 0) < 0); 99 | } 100 | 101 | int TCPSocket::recv(void *buffer, int bufferLen) 102 | { 103 | int rtn; 104 | THROW_IF((rtn = ::recv(sockDesc, (void *) buffer, bufferLen, 0)) < 0); 105 | return rtn; 106 | } 107 | 108 | string TCPSocket::getForeignAddr() 109 | { 110 | sockaddr_in addr; 111 | unsigned int addr_len = sizeof(addr); 112 | THROW_IF (getpeername(sockDesc, (sockaddr *) &addr,(socklen_t *) &addr_len) < 0); 113 | return inet_ntoa(addr.sin_addr); 114 | } 115 | 116 | unsigned short TCPSocket::getForeignPort() 117 | { 118 | sockaddr_in addr; 119 | unsigned int addr_len = sizeof(addr); 120 | THROW_IF(getpeername(sockDesc, (sockaddr *) &addr, (socklen_t *) &addr_len) < 0); 121 | return ntohs(addr.sin_port); 122 | } 123 | 124 | 125 | TCPSocket::TCPSocket() : Socket(SOCK_STREAM, IPPROTO_TCP) {} 126 | 127 | TCPSocket::TCPSocket(const string &foreignAddr, unsigned short foreignPort) : Socket(SOCK_STREAM, IPPROTO_TCP) 128 | { 129 | connect(foreignAddr, foreignPort); 130 | } 131 | 132 | TCPSocket::TCPSocket(int newConnSD) : Socket(newConnSD) {} 133 | 134 | void TCPSocket::sendmsg(const string &str) 135 | { 136 | 
this->sendmsg(str.c_str(), str.size()); 137 | }; 138 | 139 | void TCPSocket::sendmsg(const char * buf, uint32_t len) 140 | { 141 | if (len == 0) { 142 | int sendlen = 0; 143 | this->send((void *) (&sendlen), sizeof(int32_t)); 144 | printf("Send empty msg\n"); 145 | return; 146 | } 147 | 148 | for (unsigned int shift = 0; shift*BUFLEN =len) 151 | sendlen = len - (shift*BUFLEN); 152 | this->send((void *) (&sendlen), sizeof(int32_t)); 153 | #pragma GCC diagnostic push 154 | #pragma GCC diagnostic ignored "-Wpointer-arith" 155 | this->send(& buf[shift*BUFLEN], sendlen<0?-sendlen:sendlen); 156 | // printf("Send sub mesg: %d\n", sendlen); 157 | #pragma GCC diagnostic pop 158 | } 159 | printf("Send mesg %d\n", len); 160 | } 161 | 162 | bool TCPSocket::recvmsg(string &str) 163 | { 164 | int32_t len; 165 | str = ""; 166 | while (true) { 167 | uint32_t siz = this->recv(&len, sizeof(int32_t)); 168 | if (siz != sizeof(uint32_t)) return false; 169 | if ((len < 0 && len != -BUFLEN) || len>BUFLEN) { 170 | return false; 171 | } 172 | int recvlen = (len <0)?-len:len; 173 | // printf("Recvlen %d\n", recvlen); 174 | if (recvlen >0) { 175 | this->recv(&recvbuf[0], recvlen); 176 | // printf("recv sub mesg: %d\n", recvlen); 177 | str += string (&recvbuf[0],(&recvbuf[0])+recvlen); 178 | } 179 | if (len>=0) return true; 180 | } 181 | return true; 182 | } 183 | 184 | TCPServerSocket::TCPServerSocket(unsigned short localPort, int queueLen) 185 | : Socket(SOCK_STREAM, IPPROTO_TCP) 186 | { 187 | setLocalPort(localPort); 188 | THROW_IF_MSG("Fail while setting listening socket", listen(sockDesc, queueLen) < 0); 189 | } 190 | 191 | TCPSocket *TCPServerSocket::accept() 192 | { 193 | int newConnSD; 194 | THROW_IF((newConnSD = ::accept(sockDesc, NULL, 0)) < 0); 195 | return new TCPSocket(newConnSD); 196 | } 197 | 198 | 199 | -------------------------------------------------------------------------------- /lib/socket.h: -------------------------------------------------------------------------------- 
1 | /* 2 | * C++ sockets on Unix and Windows 3 | */ 4 | 5 | #ifndef __SOCKET_UTIL_H 6 | #define __SOCKET_UTIL_H 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | static string TYPE_CONTAINMENT = "CONTAINMENT"; 16 | static string TYPE_COVERAGE = "COVERAGE"; 17 | class Socket 18 | { 19 | public: 20 | ~Socket(); 21 | string getLocalAddr(); 22 | unsigned short getLocalPort(); 23 | void setLocalPort(unsigned short localPort); 24 | private: 25 | Socket(const Socket &sock); 26 | void operator=(const Socket &sock); 27 | protected: 28 | int sockDesc; // Socket descriptor 29 | Socket(int type, int protocol); 30 | Socket(int sockDesc); 31 | }; 32 | 33 | 34 | class TCPSocket : public Socket 35 | { 36 | public: 37 | TCPSocket(); 38 | TCPSocket(const string &foreignAddr, unsigned short foreignPort); 39 | void connect(const string &foreignAddr, unsigned short foreignPort); 40 | unsigned short getForeignPort(); 41 | void sendmsg(const string &str); 42 | void sendmsg(const char * buf, uint32_t len); 43 | static constexpr int BUFLEN=65536; 44 | char recvbuf[BUFLEN+64]; 45 | bool recvmsg(string &msg); 46 | string getForeignAddr(); 47 | private: 48 | void send(const void *buffer, int bufferLen); 49 | int recv(void *buffer, int bufferLen); 50 | friend class TCPServerSocket; 51 | TCPSocket(int newConnSD); 52 | }; 53 | 54 | class TCPServerSocket : public Socket 55 | { 56 | public: 57 | TCPServerSocket(unsigned short localPort, int queueLen = 5); 58 | TCPSocket *accept(); 59 | }; 60 | 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /lib/threadpool.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | /* 3 | * This file is modified from https://github.com/progschj/ThreadPool 4 | * ----------------------------------------------------------------- 5 | *Copyright (c) 2012 Jakob Progsch, Václav Zeman 6 | 7 | This software is provided 
'as-is', without any express or implied 8 | warranty. In no event will the authors be held liable for any damages 9 | arising from the use of this software. 10 | 11 | Permission is granted to anyone to use this software for any purpose, 12 | including commercial applications, and to alter it and redistribute it 13 | freely, subject to the following restrictions: 14 | 15 | 1. The origin of this software must not be misrepresented; you must not 16 | claim that you wrote the original software. If you use this software 17 | in a product, an acknowledgment in the product documentation would be 18 | appreciated but is not required. 19 | 20 | 2. Altered source versions must be plainly marked as such, and must not be 21 | misrepresented as being the original software. 22 | 23 | 3. This notice may not be removed or altered from any source 24 | distribution. 25 | ------------------------------------- 26 | */ 27 | 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | class ThreadPool { 40 | public: 41 | using PIF = std::pair >; 42 | class Compare { 43 | public: 44 | bool operator()(PIF& a , PIF& b) { 45 | return a.first > b.first; 46 | } 47 | }; 48 | std::mutex write_mutex; 49 | private: 50 | // need to keep track of threads so we can join them 51 | std::vector< std::thread > workers; 52 | // the task queue 53 | 54 | std::priority_queue< 55 | PIF, 56 | std::vector, 57 | Compare 58 | > tasks; 59 | 60 | // synchronization 61 | std::mutex queue_mutex; 62 | std::condition_variable condition, condition_resource; 63 | int resource; 64 | bool stop; 65 | 66 | public: 67 | // the constructor just launches some amount of workers 68 | ThreadPool(size_t threads, size_t _resource) 69 | : resource(_resource), stop(false) 70 | { 71 | for(size_t i = 0; i task; 78 | int pri; 79 | { 80 | std::unique_lock lock(this->queue_mutex); 81 | this->condition.wait(lock, 82 | [this] { return this->stop || 
!this->tasks.empty(); }); 83 | if(this->stop && this->tasks.empty()) 84 | return; 85 | task = std::move(this->tasks.top().second); 86 | pri = this->tasks.top().first; 87 | this->tasks.pop(); 88 | } 89 | // { 90 | // std::unique_lock lock(this->queue_mutex); 91 | // this->condition_resource.wait(lock, 92 | // [this] { return this->resource > 0; }); 93 | // this->resource-=pri; 94 | // } 95 | printf("Starting task ID %d\n",pri); 96 | task(); 97 | printf("Finish task ID %d\n",pri); 98 | // { 99 | // std::unique_lock lock(this->queue_mutex); 100 | // this->resource+=pri; 101 | // } 102 | condition_resource.notify_all(); 103 | } 104 | } 105 | ); 106 | } 107 | 108 | // add new work item to the pool 109 | template 110 | std::future enqueue(int x, F&& f) 111 | { 112 | using return_type = int; 113 | 114 | auto task = std::make_shared< std::packaged_task >( 115 | std::bind(std::forward(f)) 116 | ); 117 | 118 | std::future res = task->get_future(); 119 | { 120 | std::unique_lock lock(queue_mutex); 121 | 122 | // don't allow enqueueing after stopping the pool 123 | if(stop) 124 | throw std::runtime_error("enqueue on stopped ThreadPool"); 125 | 126 | tasks.emplace(std::make_pair(x, [task]() { 127 | (*task)(); 128 | })); 129 | } 130 | condition.notify_one(); 131 | return res; 132 | } 133 | 134 | // the destructor joins all threads 135 | ~ThreadPool() 136 | { 137 | { 138 | std::unique_lock lock(queue_mutex); 139 | stop = true; 140 | } 141 | condition.notify_all(); 142 | for(std::thread &worker: workers) 143 | worker.join(); 144 | } 145 | 146 | }; 147 | 148 | -------------------------------------------------------------------------------- /manual.md: -------------------------------------------------------------------------------- 1 | # Manual 2 | __SeqOthello__ toolchain includes the following modules: 3 | 4 | * ``PreProcess`` converts _k_-mer files to __SeqOthello__ 5 | 64 bit binary format. 6 | 7 | * ``Group`` combines a subset of the 64 bit binary kmer files into 8 | group. 
9 | 10 | * ``Build`` creates __SeqOthello__ mapping between the entire set 11 | of _k_-mers and their experiment ids. 12 | 13 | * ``Query`` looks up transcripts in the __SeqOthello__ mapping. 14 | 15 | 16 | ``` 17 | PreProcess {OPTIONS} 18 | 19 | Convert a Jellyfish output file to a sorted binary file. 20 | 21 | OPTIONS: 22 | 23 | -h, --help Display this help menu 24 | --in=[string] filename for the input kmer file 25 | --out=[string] filename for the output binary kmer file 26 | --k=[integer] k, length of kmer 27 | --cutoff=[integer] cutoff, minimal expression value for 28 | kmer to be included into the file. 29 | --histogram get histogram 30 | ``` 31 | 32 | 33 | ``` 34 | Group {OPTIONS} 35 | 36 | Preprocess binary files to grouped files. 37 | Each line of the file must contain exactly one file name, e.g, xxxx.bin 38 | The file should be in 64bit kmer format, the xml file must present in the 39 | same folder. xxxx.bin.xml 40 | 41 | OPTIONS: 42 | 43 | -h, --help Display this help menu 44 | --flist=[string] a file containing the filenames 45 | --folder=[string] where to find this file 46 | --output=[string] output file 47 | --limit=[integer] stop after getting this number of kmers. 48 | Note: for test small data set only. 49 | --group Create a group using some group files 50 | 51 | ``` 52 | 53 | 54 | ``` 55 | Build {OPTIONS} 56 | 57 | Build SeqOthello! 58 | 59 | OPTIONS: 60 | 61 | -h, --help Display this help menu 62 | --flist=[string] a file containing the filenames of Grp 63 | files, these Grp files should be created 64 | by the Preprocess tool. Each line should 65 | contain one file name. 66 | --folder=[string] where to find these Grp files. i.e. , a 67 | path that contains the Grp files. 68 | --out-folder=[string] a folder to put the generated SeqOthello 69 | map. 70 | --estimate-limit=[int] read this number of Kmers to estimate 71 | the distribution. 72 | --count-only only count the keys and the histogram, 73 | do not build the seqOthello.
74 | 75 | ``` 76 | 77 | 78 | ``` 79 | Query {OPTIONS} 80 | 81 | Query SeqOthello! 82 | 83 | OPTIONS: 84 | 85 | -h, --help Display this help menu 86 | --map-folder=[string] the path contains SeqOthello mapping 87 | file. 88 | --transcript=[string] file containing transcripts 89 | --output=[string] where to put the results 90 | --detail Show the detailed query results for the 91 | transcripts 92 | --noreverse do not use reverse complement 93 | --qthread=[int] how many threads to use for query, 94 | default = 1 95 | --start-server-port=[int] start a SeqOthello Server at port 96 | --print-kmers-index=[int] printout kmers that matches a sample 97 | with index. 98 | 99 | 100 | ``` 101 | -------------------------------------------------------------------------------- /rebuild.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd build 3 | make -j4 4 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## Where are the compiled tool-chain located? 4 | TOOL_FOLDER=./build/bin/ 5 | 6 | ## Where are the kmer files generated by Jellyfish? 7 | RAW_KMER_FOLDER=./test/ 8 | 9 | ## Where can we put some temporary files? (need write permission). 10 | TEMP_FOLDER=./ 11 | 12 | ## Where do we put the generated SeqOthelloMap? 
13 | 14 | -------------------------------------------------------------------------------- /seqothlib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(libL2Node_SRCS 2 | L2Node.hpp 3 | L2Node.cpp 4 | ) 5 | 6 | set (libUtil_SRCS 7 | util.h 8 | util.cpp 9 | ) 10 | 11 | set (libL1Node_SRCS 12 | L1Node.hpp 13 | L1Node.cpp 14 | ) 15 | # Declare the library 16 | 17 | add_library(libUtil STATIC ${libUtil_SRCS}) 18 | 19 | add_library(libL2Node STATIC ${libL2Node_SRCS}) 20 | 21 | add_library(libL1Node STATIC ${libL1Node_SRCS}) 22 | 23 | TARGET_LINK_LIBRARIES(libL2Node libUtil tinyxml2 z) 24 | 25 | TARGET_LINK_LIBRARIES(libL1Node libUtil tinyxml2 z pthread) 26 | 27 | # Specify here the include directories exported 28 | # by this library 29 | include_directories(libL2Node PUBLIC 30 | ${CMAKE_CURRENT_SOURCE_DIR} 31 | ../lib/ 32 | ) 33 | 34 | include_directories(libL1Node PUBLIC 35 | ${CMAKE_CURRENT_SOURCE_DIR} 36 | ../lib/ 37 | ) 38 | -------------------------------------------------------------------------------- /seqothlib/L1Node.cpp: -------------------------------------------------------------------------------- 1 | // This file is a part of SeqOthello. 
Please refer to LICENSE.TXT for the LICENSE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | using namespace std; 9 | 10 | L1Node::~L1Node() { 11 | for (uint32_t i = 0 ; i < othellos.size(); i++) { 12 | delete othellos[i]; 13 | } 14 | othellos.clear(); 15 | for (uint32_t i = 0 ; i >(shift); 25 | if (grp >= grpidlimit) 26 | throw std::invalid_argument("invalid key to put in L1"); 27 | kV[grp]->push_back(k); 28 | vV[grp]->push_back(v); 29 | } 30 | 31 | uint64_t L1Node::queryInt(uint64_t k) { 32 | uint32_t grp = (k)>>(shift); 33 | if (othellos[grp] == NULL) 34 | return 0; 35 | return othellos[grp]->queryInt(k); 36 | } 37 | 38 | void L1Node::constructothello(uint32_t id, uint32_t L, string fname) { 39 | Othello * othello = NULL; 40 | printf("%s : start to construct L1 Node part %u\n", get_thid().c_str(), id); 41 | if (kV[id]->size()) 42 | othello = new Othello(L, *kV[id], *vV[id], true, 200); 43 | printf("%s : Write to Gzip File %s.%d\n", get_thid().c_str(),fname.c_str(),id); 44 | char cbuf[0x400]; 45 | memset(cbuf,0,sizeof(cbuf)); 46 | sprintf(cbuf,"%s.%d",fname.c_str(), id); 47 | gzFile fout = gzopen(cbuf, "wb"); 48 | unsigned char buf[0x20]; 49 | memset(buf,0,sizeof(buf)); 50 | if (kV[id]->size()) { 51 | othello->exportInfo(buf); 52 | gzwrite(fout, buf, sizeof(buf)); 53 | othello->writeDataToGzipFile(fout); 54 | kV[id]->release(); 55 | vV[id]->release(); 56 | } 57 | else 58 | gzwrite(fout,buf,sizeof(buf)); 59 | 60 | gzclose(fout); 61 | delete othello; 62 | printf("%s : L1 part %u consturction finished.\n", get_thid().c_str(), id); 63 | } 64 | void L1Node::constructAndWrite(uint32_t L, uint32_t threads, string fname) { 65 | vector vthreadL1; 66 | uint64_t curreintInQ = 0; 67 | 68 | for (uint32_t i = 0 ; i < grpidlimit; i++) { 69 | if (curreintInQ > L1InQlimit || vthreadL1.size()>= threads) { 70 | for (auto &th : vthreadL1) 71 | th.join(); 72 | vthreadL1.clear(); 73 | curreintInQ = 0; 74 | } 75 | curreintInQ += kV[i]->size(); 76 | 
vthreadL1.push_back(std::thread(&L1Node::constructothello, this, i, L, fname)); 77 | } 78 | for (auto &th : vthreadL1) 79 | th.join(); 80 | } 81 | void L1Node::loadFromFile(string fname) { 82 | grpidlimit = (1< (buf); 98 | othellos[i]->loadDataFromGzipFile(fin); 99 | if (!othellos[i]->loaded) { 100 | delete othellos[i]; 101 | othellos[i] = NULL; 102 | } 103 | } 104 | gzclose(fin); 105 | } 106 | } 107 | 108 | void L1Node::putInfoToXml(tinyxml2::XMLElement *pe, string fname) { 109 | for (unsigned int i = 0 ; i < grpidlimit; i++) 110 | if (kV[i]->size()) { 111 | auto pNode = pe->GetDocument()->NewElement("L1NodePart"); 112 | char cbuf[0x400]; 113 | memset(cbuf,0,sizeof(cbuf)); 114 | sprintf(cbuf,"%s.%d",fname.c_str(), i); 115 | pNode->SetAttribute("Filename", cbuf); 116 | pNode->SetAttribute("KeyCount", (uint32_t) kV[i]->size()); 117 | pe->InsertEndChild(pNode); 118 | } 119 | } 120 | 121 | map L1Node::printrates() { 122 | map sum; 123 | for (auto *p: othellos) { 124 | map tmap; 125 | p->getrates(tmap); 126 | for (auto &x: tmap) 127 | sum[x.first] += x.second; 128 | } 129 | for (auto &p: sum) 130 | p.second/=othellos.size(); 131 | return sum; 132 | } 133 | 134 | void L1Node::setfname(string str) { 135 | fname = str; 136 | } 137 | 138 | int queryThreadInPool(Othello &oth, vector> &ans, const vector> &kmers, const unsigned int grp, const unsigned int st, const unsigned int ed, const uint32_t shift) { 139 | printf("Query L1 grp %d for transcripts from %d to %d\n", grp,st,ed-1); 140 | int totcnt = 0; 141 | for (unsigned int i = st ; i < ed; i++) 142 | for (unsigned int j = 0 ; j < (kmers)[i].size(); j++) 143 | if (grp == ((kmers)[i][j] >> shift)) { 144 | totcnt++; 145 | (ans)[i][j] = oth.queryInt((kmers)[i][j]); 146 | } 147 | return totcnt; 148 | } 149 | void L1Node::queryPartAndPutToVV(vector> &ans, vector> &kmers, unsigned int grp, unsigned int threads) { 150 | if (grp >= (1U< *oth; 164 | if (memcmp(buf, buf0, 0x20) ==0) { 165 | return; 166 | } 167 | else { 168 | oth = 
new Othello (buf); 169 | oth->loadDataFromGzipFile(fin); 170 | if (!oth->loaded) { 171 | delete oth; 172 | return; 173 | } 174 | } 175 | int maxs = 32; 176 | vector loc; 177 | for (int i = 0 ; i<=maxs; i++) 178 | loc.push_back(kmers.size()*i/maxs); 179 | std::vector> results; 180 | for (int thd = 0; thd < maxs; thd++) { 181 | int st = loc[thd]; 182 | int ed = loc[thd+1]; 183 | if (st == ed) continue; 184 | auto lambda = std::bind(queryThreadInPool, 185 | std::ref(*oth), std::ref(ans), std::ref(kmers), (grp), (st), (ed), (shift)); 186 | std::future x = pool.enqueue(thd, lambda); 187 | results.emplace_back(std::move(x)); 188 | } 189 | for (auto && result: results) 190 | result.get(); 191 | 192 | delete oth; 193 | return; 194 | } 195 | -------------------------------------------------------------------------------- /seqothlib/L1Node.hpp: -------------------------------------------------------------------------------- 1 | // This file is a part of SeqOthello. Please refer to LICENSE.TXT for the LICENSE 2 | #pragma once 3 | #include "othello.h" 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | class L1Node { 11 | void constructothello(uint32_t, uint32_t, string); 12 | private: 13 | uint32_t splitbit; 14 | uint32_t shift; 15 | string fname; 16 | public: 17 | uint32_t kmerLength; 18 | vector *> othellos; 19 | vector *> kV; 20 | vector *> vV; 21 | uint32_t grpidlimit; 22 | constexpr static uint64_t L1Partlimit = 1048576*128; 23 | constexpr static uint64_t L1InQlimit = 1048576*512; 24 | L1Node() {} 25 | L1Node(uint64_t estimatedKmerCount, int _kmerlength, const string &buf) : kmerLength(_kmerlength) { 26 | splitbit = 0; 27 | while( (estimatedKmerCount >> splitbit) > L1Partlimit) splitbit ++; 28 | 29 | if (splitbit >= kmerLength) 30 | throw std::invalid_argument("invalid parameter for L1Node"); 31 | setsplitbit(kmerLength, splitbit); 32 | grpidlimit = (1<< splitbit); 33 | kV.clear(); 34 | vV.clear(); 35 | for (unsigned int i = 0 ; i < grpidlimit; 
i++) { 36 | stringstream ss; 37 | ss << buf << i; 38 | string fstr; 39 | ss >> fstr; 40 | kV.push_back(new IOBuf((fstr+".keys").c_str())); 41 | vV.push_back(new IOBuf((fstr+".values").c_str())); 42 | } 43 | othellos.resize(grpidlimit); 44 | } 45 | 46 | uint64_t queryInt(uint64_t k); 47 | void add(uint64_t &k, uint16_t v); 48 | void writeToFile(string fname); 49 | ~L1Node(); 50 | void constructAndWrite(uint32_t, uint32_t, string); 51 | void loadFromFile(string fname); 52 | void putInfoToXml(tinyxml2::XMLElement *, string); 53 | void setsplitbit(uint32_t _kmerlength, uint32_t t) { 54 | kmerLength = _kmerlength; 55 | splitbit = t; 56 | shift = kmerLength*2 - splitbit; 57 | } 58 | uint32_t getsplitbit() { 59 | return splitbit; 60 | } 61 | map printrates(); 62 | void setfname(string); 63 | void queryPartAndPutToVV(vector> &ans, vector> &kmers, unsigned int grp, unsigned int threads); 64 | }; 65 | 66 | 67 | -------------------------------------------------------------------------------- /seqothlib/L2Node.cpp: -------------------------------------------------------------------------------- 1 | // This file is a part of SeqOthello. 
// -----------------------------------------------------------------------------
// Nibble-oriented variable-length codec for value lists.
//
// Values are stored as 4-bit nibbles; within a byte the LOW nibble comes first.
// The first nibble of a value selects its length:
//   1vvv                       1 nibble   values 1..7 (1000 terminates the list)
//   01vv vvvv                  2 nibbles  value 0 and values 8..0x3F
//   001v vvvv vvvv             3 nibbles  values 0x40..0x1FF
//   0001 + 3 payload nibbles   4 nibbles  values 0x200..0xFFF
//   0000 + 5 payload nibbles   6 nibbles  larger values (only the low 20 bits)
// A value 1 is always followed by an encoded run length: the number of
// consecutive 1s that it represents.
// -----------------------------------------------------------------------------

// Read the next nibble.  `hasvalue`/`buff` carry the pending high nibble of the
// current byte; the pointer advances once both halves have been consumed.
inline uint8_t get4b(uint8_t **pp, bool &hasvalue, uint8_t &buff) {
    if (hasvalue) {
        hasvalue = false;
        (*pp)++;
        return buff;
    }
    else {
        buff = ((**pp)>>4);
        hasvalue = true;
        return (**pp) & 0xF;
    }
}

// Write one nibble.  The first nibble of a byte overwrites the byte (clearing
// its high half); the second one is OR-ed in and advances the pointer.
inline void put4b(uint8_t **pp, bool &filledhalf, uint8_t val) {
    if (filledhalf) {
        (**pp) |= ((val & 0xF)<<4);
        (*pp)++;
        filledhalf = false;
    } else {
        (**pp) = (val & 0xF);
        filledhalf = true;
    }
}

// Decode one value.  Sets `finished` when the terminator nibble (8) is read.
inline uint32_t getvalue(uint8_t **pp, bool & hasvalue, uint8_t &buff, bool & finished) {
    uint8_t v = get4b(pp, hasvalue, buff);
    if (v & 0x8) {
        if (v == 8) finished = true;
        return (v&0x7);
    }
    uint32_t v2 = get4b(pp, hasvalue, buff);
    if (v & 0x4) {
        uint32_t a = (v & 0x3);
        return ( (a<<4) | v2);
    }
    uint32_t v3 = get4b(pp, hasvalue, buff);
    if (v & 0x2) {
        uint32_t a = (v & 0x1);
        return ( (a<<8) | (v2<<4) | v3);
    }
    uint32_t v4 = get4b(pp, hasvalue, buff);
    if (v == 1) {
        return (v2<<8) | (v3<<4) | v4;
    }
    uint32_t v5 = get4b(pp, hasvalue, buff);
    uint32_t v6 = get4b(pp, hasvalue, buff);
    return ( (v2<<16) | (v3<<12) | (v4<<8) | (v5<<4) | v6);
}

// Number of nibbles getvalue() consumes for a value whose first nibble is `first`.
static inline uint32_t encodedNibbleCount(uint8_t first) {
    if (first & 0x8) return 1;
    if (first & 0x4) return 2;
    if (first & 0x2) return 3;
    if (first == 1) return 4;
    return 6;
}

/**
 * Decode a value list.
 *
 * @param p       start of the encoded data
 * @param val     receives the decoded values (cleared first)
 * @param maxmem  number of readable bytes at `p`; decoding never reads beyond it
 * @return        number of decoded values
 *
 * Decoding stops at the terminator nibble, or - for data without a terminator
 * or with a truncated last value - as soon as the next value would not fit
 * into the `maxmem` bytes.
 */
uint32_t valuelistDecode(uint8_t *p, std::vector<uint32_t> &val, uint32_t maxmem) {
    val.clear();
    // The cursor is a separate variable: the former code advanced `p` itself
    // and compared it against `p + maxmem`, a bound that moved along with the
    // cursor and therefore never triggered.
    uint8_t *cur = p;
    uint8_t **pp = &cur;
    const uint64_t nibbleLimit = 2ULL * maxmem;
    bool hasvalue = false;
    uint8_t buff = 0;
    bool finished = false;

    // True when one more complete value lies inside the readable area.
    auto nextValueFits = [&]() -> bool {
        const uint64_t used = 2ULL * static_cast<uint64_t>(cur - p) + (hasvalue ? 1 : 0);
        if (used >= nibbleLimit)
            return false;
        const uint8_t first = hasvalue ? buff : static_cast<uint8_t>(*cur & 0xF);
        return used + encodedNibbleCount(first) <= nibbleLimit;
    };

    while (nextValueFits()) {
        uint32_t x = getvalue(pp, hasvalue, buff, finished);
        if (finished)
            break;
        val.push_back(x);
        if (x == 1) {
            // a 1 is followed by its run length
            if (!nextValueFits())
                break;
            uint32_t dup = getvalue(pp, hasvalue, buff, finished);
            if (finished)
                break;
            while (dup > 1) {
                val.push_back(1);
                dup--;
            }
        }
    }
    return val.size();
}

// Encode one value (really == true) or only add its nibble count to `ans`
// (really == false).
inline void putvalue(uint32_t x, uint8_t **pp, bool &filledhalf, bool really, uint32_t & ans) {
    if (x>0xFFF) { //>12bits
        if (really) {
            put4b(pp, filledhalf, 0);
            put4b(pp, filledhalf, 0xF & (x >> 16));
            put4b(pp, filledhalf, 0xF & (x >> 12));
            put4b(pp, filledhalf, 0xF & (x >> 8));
            put4b(pp, filledhalf, 0xF & (x >> 4));
            put4b(pp, filledhalf, 0xF & x);
        }
        else ans += 6;
    }
    else if (x>0x1FF) { //10~12bits
        if (really) {
            put4b(pp, filledhalf, 1);
            put4b(pp, filledhalf, 0xF & (x >> 8));
            put4b(pp, filledhalf, 0xF & (x >> 4));
            put4b(pp, filledhalf, 0xF & x);
        }
        else ans += 4;
    }
    else if (x>0x3F) { // 7~9 bits
        if (really) {
            put4b(pp, filledhalf, 2 | (x>>8) );
            put4b(pp, filledhalf, 0xF & (x >> 4));
            put4b(pp, filledhalf, 0xF & x);
        }
        else ans += 3;
    }
    else if (x>0x7 || x==0) { // 4~6 bits or 0
        if (really) {
            put4b(pp, filledhalf, 4 | (x>>4) );
            put4b(pp, filledhalf, 0xF & x);
        }
        else ans += 2;
    }
    else { //<=3bits
        if (really)
            put4b(pp, filledhalf, 0x8 | x);
        else
            ans ++;
    }

}

/**
 * Encode a value list, or compute the size of its encoding.
 *
 * @param p       output buffer; must be large enough for the whole encoding
 *                (only touched when `really` is true)
 * @param val     values to encode; runs of 1s are run-length compressed
 * @param really  true: write the encoding; false: dry run, nothing is written
 * @return        length of the encoding in bytes, terminator included
 */
uint32_t valuelistEncode(uint8_t *p, std::vector<uint32_t> &val, bool really) {
    uint32_t ans = 0;

    uint8_t **pp = &p;
    uint8_t *p0 = p; //starting location
    bool filledhalf = false;
    for (unsigned int i = 0 ; i < val.size();) {
        auto x = val[i];
        putvalue(x, pp, filledhalf, really, ans);
        if (x==1) {
            // measure the run of 1s starting at i and emit its length
            unsigned int j = i;
            while (val[j] == 1) {
                j++;
                if (j == val.size()) break;
            }
            putvalue(j-i, pp, filledhalf, really, ans);
            i=j;
        }
        else
            i++;
    }
    if (really) {
        put4b(pp, filledhalf, 8);   // terminator
        return (p-p0) + (filledhalf);
    }
    ans ++;                         // terminator nibble
    return (ans>>1) + (ans & 1);
}
"-Wsign-compare" 151 | while ((1<size(), L2NodeTypes::typestr.at(this->getType()).c_str(), L, entrycnt); 154 | oth = new Othello (L, *keys, *values, true, 0); 155 | for (auto &k: oth->removedKeys) { 156 | printf("%s: Removed key vNode %lx\n", get_thid().c_str(), k); 157 | } 158 | keys->release(); 159 | values->release(); 160 | } 161 | 162 | #pragma GCC diagnostic push 163 | #pragma GCC diagnostic ignored "-Wunused-parameter" 164 | bool L2ShortValueListNode::smartQuery(const keyType *k, vector &ret, vector &retmap) { 165 | uint64_t index = L2Node::oth->queryInt(*k); 166 | ret.clear(); 167 | if (index >= uint64list.size()) return true; 168 | uint64_t vl = uint64list[index]; 169 | uint32_t valcnt = valuecnt; 170 | while (valcnt -- ) { 171 | uint32_t pq = vl & mask; 172 | vl >>= maxnl; 173 | ret.push_back(pq); 174 | } 175 | return true; 176 | } 177 | #pragma GCC diagnostic pop 178 | 179 | bool L2EncodedValueListNode::smartQuery(const keyType *k, vector &ret, vector &retmap) { 180 | uint64_t index = L2Node::oth->queryInt(*k); 181 | if (encodetype == L2NodeTypes::VALUE_INDEX_ENCODED) { 182 | ret.clear(); 183 | if (IOLengthInBytes*index >= lines.size()) return true; 184 | if (index==0) return true; 185 | vector decode; 186 | valuelistDecode(&lines[IOLengthInBytes*index], decode, IOLengthInBytes); 187 | if (decode.size()==0) return true; 188 | uint32_t last; 189 | ret.push_back(last = decode[0]); 190 | for (uint32_t i = 1; i< decode.size(); i++) { 191 | last += decode[i]; 192 | ret.push_back(last); 193 | } 194 | return true; 195 | } 196 | else { 197 | //MAPP 198 | if (IOLengthInBytes*index >= lines.size()) { 199 | ret.clear(); 200 | return true; 201 | } 202 | retmap = vector (lines.begin()+IOLengthInBytes * index, lines.begin() + IOLengthInBytes * (index+1)); 203 | return false; 204 | } 205 | } 206 | 207 | void L2ShortValueListNode::add(keyType &k, vector & valuelist) { 208 | if (fdata==NULL) { 209 | fdata = gzopen((gzfname+".dat").c_str(),"wb"); 210 | // 
gzbuffer(fdata,64*1024); 211 | if (fdata == NULL) { 212 | fprintf(stderr,"failed to open file %s to write\n", (gzfname+".dat").c_str()); 213 | return; 214 | } 215 | } 216 | keycnt++; 217 | uint64_t value = 0ULL; 218 | for (auto pval = valuelist.rbegin(); pval!=valuelist.rend(); pval++) { 219 | value <<= maxnl; 220 | value |= (*pval & mask); 221 | } 222 | if (valuemap.count(value) == 0) { 223 | //we always prepend one to avoid \tau result = 0; 224 | if (siz == 0) { 225 | uint64_t u0 = 0; 226 | siz ++; 227 | gzwrite(fdata, &u0, IOLengthInBytes); 228 | } 229 | //} 230 | valuemap[value] = siz; 231 | gzwrite(fdata, &value, IOLengthInBytes); 232 | siz++; 233 | entrycnt = siz; 234 | } 235 | values->push_back(valuemap[value]); 236 | keys->push_back(k); 237 | return; 238 | 239 | } 240 | 241 | void L2EncodedValueListNode::add(keyType &k, vector & valuelist) { // this valuelist is diff. 242 | vector buff(IOLengthInBytes,0); 243 | if (encodetype!= L2NodeTypes::VALUE_INDEX_ENCODED) 244 | throw invalid_argument("can not add value list L2EncodedValueListNode"); 245 | if (fdata==NULL) { 246 | fdata = gzopen((gzfname+".dat").c_str(),"wb"); 247 | // gzbuffer(fdata,64*1024); 248 | if (fdata == NULL) { 249 | fprintf(stderr,"failed to open file %s to write\n", (gzfname+".dat").c_str()); 250 | return; 251 | } 252 | } 253 | keys->push_back(k); 254 | keycnt++; 255 | valuelistEncode(&buff[0], valuelist, true); 256 | if (IOLengthInBytes<=8) { 257 | uint64_t v64 = 0; 258 | memcpy(&v64, &buff[0], IOLengthInBytes); 259 | if (valuemap.count(v64) ==0) { 260 | if (siz == 0) { 261 | siz += IOLengthInBytes; 262 | entrycnt++; 263 | gzwrite(fdata,&buff[0], IOLengthInBytes); 264 | } 265 | valuemap[v64] = entrycnt; 266 | entrycnt++; 267 | siz += IOLengthInBytes; 268 | gzwrite(fdata,&buff[0], IOLengthInBytes); 269 | } 270 | values->push_back(valuemap[v64]); 271 | } 272 | else { 273 | if (siz == 0) { //lines.size() == 0) { 274 | //lines.resize(IOLengthInBytes); 275 | siz += IOLengthInBytes; 276 | 
entrycnt++; 277 | gzwrite(fdata,&buff[0], IOLengthInBytes); 278 | } 279 | //uint32_t curr = lines.size(); 280 | //lines.resize(lines.size() + IOLengthInBytes); 281 | siz += IOLengthInBytes; 282 | entrycnt++; 283 | gzwrite(fdata,&buff[0], IOLengthInBytes); 284 | values->push_back(keycnt); 285 | } 286 | 287 | } 288 | 289 | void L2EncodedValueListNode::addMAPP(keyType &k, vector &mapp) { 290 | if (encodetype!= L2NodeTypes::MAPP) 291 | throw invalid_argument("can not add bitmap to L2EncodedValueListNode"); 292 | if (fdata==NULL) { 293 | fdata = gzopen((gzfname+".dat").c_str(),"wb"); 294 | // gzbuffer(fdata,256*1024); 295 | if (fdata == NULL) { 296 | fprintf(stderr,"failed to open file %s to write\n", (gzfname+".dat").c_str()); 297 | return; 298 | } 299 | } 300 | keys->push_back(k); 301 | //TODO :: Need to pre-pend a record for false positives! 302 | keycnt++; 303 | entrycnt++; 304 | if (siz == 0) { //lines.size() == 0) { 305 | //lines.resize(mapp.size()); 306 | siz += IOLengthInBytes; 307 | vector buff(IOLengthInBytes); 308 | gzwrite(fdata,&buff[0], IOLengthInBytes); 309 | entrycnt++; 310 | } 311 | if (mapp.size() != IOLengthInBytes) { 312 | throw invalid_argument("can not add bitmap to L2ShortValuelist type"); 313 | } 314 | //uint32_t curr = lines.size(); 315 | //lines.insert(lines.end(), mapp.begin(), mapp.end()); 316 | gzwrite(fdata,&mapp[0], IOLengthInBytes); 317 | values->push_back(keycnt); 318 | siz += IOLengthInBytes; 319 | } 320 | 321 | void L2ShortValueListNode::writeDataToGzipFile() { 322 | printf("%s : writing to L2 Gzip File %s\n", get_thid().c_str(), gzfname.c_str()); 323 | gzFile fout = gzopen(gzfname.c_str(), "wb"); 324 | unsigned char buf[0x20]; 325 | memset(buf,0,sizeof(buf)); 326 | memcpy(buf, &valuecnt, 4); 327 | memcpy(&buf[4], &maxnl, 4); 328 | memcpy(&buf[8], &siz, 4); 329 | gzwrite(fout, buf,sizeof(buf)); 330 | L2Node::oth->exportInfo(buf); 331 | gzwrite(fout, buf,sizeof(buf)); 332 | L2Node::oth->writeDataToGzipFile(fout); 333 | gzclose(fout); 
334 | uint64list.clear(); 335 | valuemap.clear(); 336 | delete L2Node::oth; 337 | delete keys; 338 | delete values; 339 | //for (auto const & vl: uint64list) { 340 | // uint64_t rvl = vl; 341 | // gzwrite(fdata, &rvl, IOLengthInBytes); 342 | //} 343 | gzclose(fdata); 344 | } 345 | 346 | void L2EncodedValueListNode::writeDataToGzipFile() { 347 | printf("%s: Write L2 Node %s\n", get_thid().c_str(), gzfname.c_str()); 348 | gzFile fout = gzopen(gzfname.c_str(), "wb"); 349 | unsigned char buf[0x20]; 350 | memset(buf,0,sizeof(buf)); 351 | memcpy(buf, &IOLengthInBytes, 4); 352 | memcpy(buf+4, &encodetype, 4); 353 | memcpy(buf+8, &siz, 4); 354 | gzwrite(fout, buf,sizeof(buf)); 355 | L2Node::oth->exportInfo(buf); 356 | gzwrite(fout, buf,sizeof(buf)); 357 | L2Node::oth->writeDataToGzipFile(fout); 358 | lines.clear(); 359 | //gzwrite(fdata, &lines[0], lines.size()); 360 | delete L2Node::oth; 361 | delete keys; 362 | delete values; 363 | gzclose(fout); 364 | gzclose(fdata); 365 | } 366 | 367 | 368 | void L2ShortValueListNode::loadDataFromGzipFile() { 369 | printf("%s: Load L2 Node %s\n", get_thid().c_str(), gzfname.c_str()); 370 | gzFile fin = gzopen(gzfname.c_str(), "rb"); 371 | // gzbuffer(fin,256*1024); 372 | unsigned char buf[0x20]; 373 | memset(buf,0,sizeof(buf)); 374 | gzread(fin, buf,sizeof(buf)); 375 | memcpy(&valuecnt, buf, 4); 376 | memcpy(&maxnl, &buf[4], 4); 377 | uint32_t siz; 378 | memcpy(&siz, &buf[8], 4); 379 | gzread(fin, buf,sizeof(buf)); 380 | L2Node::oth = new Othello (buf); 381 | L2Node::oth->loadDataFromGzipFile(fin); 382 | gzFile fin2 = gzopen((gzfname+".dat").c_str(), "rb"); 383 | // gzbuffer(fin2,256*1024); 384 | uint64list.resize(0);//ShortVLcount); 385 | for (uint32_t i = 0 ; i < siz; i++) { 386 | uint64_t vl = 0ULL; 387 | gzread(fin2, &vl, IOLengthInBytes); 388 | uint64list.push_back(vl); 389 | } 390 | gzclose(fin); 391 | gzclose(fin2); 392 | } 393 | 394 | 395 | void L2EncodedValueListNode::loadDataFromGzipFile() { 396 | printf("%s: Load L2 Node 
%s\n", get_thid().c_str(), gzfname.c_str()); 397 | gzFile fin = gzopen(gzfname.c_str(), "rb"); 398 | // gzbuffer(fin,256*1024); 399 | unsigned char buf[0x20]; 400 | memset(buf,0,sizeof(buf)); 401 | gzread(fin, buf,sizeof(buf)); 402 | memcpy(&IOLengthInBytes, buf, 4); 403 | memcpy(&encodetype, &buf[4], 4); 404 | uint32_t siz; 405 | memcpy(&siz, &buf[8], 4); 406 | gzread(fin, buf,sizeof(buf)); 407 | L2Node::oth = new Othello (buf); 408 | L2Node::oth->loadDataFromGzipFile(fin); 409 | lines.resize(siz);//ShortVLcount); 410 | gzFile fin2 = gzopen((gzfname+".dat").c_str(), "rb"); 411 | // gzbuffer(fin2,256*1024); 412 | gzread(fin2, &lines[0], siz); 413 | gzclose(fin); 414 | gzclose(fin2); 415 | } 416 | 417 | void L2ShortValueListNode::putInfoToXml(tinyxml2::XMLElement * pe) { 418 | string typestr = L2NodeTypes::typestr.at(this->getType()); 419 | pe->SetAttribute("Type", typestr.c_str()); 420 | pe->SetAttribute("ValueCnt", valuecnt); 421 | pe->SetAttribute("BitsPerValue", maxnl); 422 | pe->SetAttribute("Keycount", keycnt); 423 | pe->SetAttribute("EntryCount", entrycnt); 424 | pe->SetAttribute("L2FileName", gzfname.c_str()); 425 | } 426 | 427 | void L2EncodedValueListNode::putInfoToXml(tinyxml2::XMLElement *pe) { 428 | string typestr = L2NodeTypes::typestr.at(this->getType()); 429 | pe->SetAttribute("Type", typestr.c_str()); 430 | pe->SetAttribute("IOLengthInBytes", IOLengthInBytes); 431 | pe->SetAttribute("Keycount", keycnt); 432 | pe->SetAttribute("EntryCount", entrycnt); 433 | pe->SetAttribute("L2FileName", gzfname.c_str()); 434 | } 435 | 436 | 437 | std::shared_ptr 438 | L2Node::createL2Node( tinyxml2::XMLElement *p, string folder) { 439 | std::shared_ptr ptr(nullptr); 440 | string fname(p->Attribute("L2FileName")); 441 | if (!folder.empty()) { 442 | auto pos = fname.find_last_of('/'); 443 | fname = folder + fname.substr(pos+1); 444 | } 445 | 446 | if (strcmp(p->Attribute("Type"), L2NodeTypes::typestr.at(L2NodeTypes::VALUE_INDEX_SHORT).c_str()) == 0) { 447 | int 
valuecnt = p->IntAttribute("ValueCnt"); 448 | int maxnl = p->IntAttribute("BitsPerValue"); 449 | int entrycnt = p->IntAttribute("EntryCount"); 450 | if (entrycnt) 451 | ptr = make_shared(valuecnt, maxnl,fname); 452 | } 453 | 454 | if (strcmp(p->Attribute("Type"), L2NodeTypes::typestr.at(L2NodeTypes::VALUE_INDEX_ENCODED).c_str()) == 0) { 455 | int IOL = p->IntAttribute("IOLengthInBytes"); 456 | int type = L2NodeTypes::VALUE_INDEX_ENCODED; 457 | int entrycnt = p->IntAttribute("EntryCount"); 458 | if (entrycnt) 459 | ptr = make_shared(IOL, type,fname); 460 | } 461 | 462 | if (strcmp(p->Attribute("Type"), L2NodeTypes::typestr.at(L2NodeTypes::MAPP).c_str()) == 0) { 463 | int IOL = p->IntAttribute("IOLengthInBytes"); 464 | int type = L2NodeTypes::MAPP; 465 | int entrycnt = p->IntAttribute("EntryCount"); 466 | if (entrycnt) 467 | ptr = make_shared(IOL, type,fname); 468 | } 469 | return ptr; 470 | } 471 | 472 | uint64_t 473 | L2EncodedValueListNode:: getvalcnt() { 474 | return lines.size(); 475 | } 476 | 477 | uint64_t 478 | L2ShortValueListNode::getvalcnt() { 479 | return uint64list.size() * IOLengthInBytes; 480 | } 481 | 482 | double L2ShortValueListNode::expectedOnes(double &prb) { 483 | map tmap; 484 | L2Node::oth->getrates(tmap); 485 | prb = 0; 486 | double ans = 0; 487 | for (unsigned int i = 1 ; i < uint64list.size(); i++) { 488 | ans += tmap[i]*valuecnt; 489 | prb += tmap[i]; 490 | } 491 | return ans; 492 | } 493 | 494 | double L2EncodedValueListNode::expectedOnes(double &prb) { 495 | map tmap; 496 | L2Node::oth->getrates(tmap); 497 | vector cnt8(256); 498 | for (int t = 0 ; t < 256; t++) { 499 | int tt = t; 500 | while (tt) { 501 | cnt8[t]+=(tt&1); 502 | tt>>=1; 503 | } 504 | } 505 | double ans = 0; 506 | prb = 0.0; 507 | for (unsigned int index = 1; index< lines.size()/IOLengthInBytes; index++) { 508 | prb += tmap[index]; 509 | if (encodetype == L2NodeTypes::VALUE_INDEX_ENCODED) { 510 | vector decode; 511 | valuelistDecode(&lines[IOLengthInBytes*index], decode, 
IOLengthInBytes); 512 | ans += decode.size() * tmap[index]; 513 | } 514 | else { 515 | int tot = 0; 516 | for (unsigned int j = index*IOLengthInBytes; j < IOLengthInBytes*(index+1); j++) 517 | tot += cnt8[lines[j]]; 518 | ans += tot * tmap[index]; 519 | } 520 | } 521 | return ans; 522 | } 523 | 524 | map L2Node::getRates() { 525 | map tmap; 526 | oth->getrates(tmap); 527 | return tmap; 528 | } 529 | 530 | int L2ShortValueListNode::getEntrycnt() { 531 | return uint64list.size(); 532 | } 533 | 534 | int L2EncodedValueListNode::getEntrycnt() { 535 | return lines.size()/IOLengthInBytes; 536 | } 537 | 538 | map L2ShortValueListNode::computeProb(map &p) { 539 | map ret; 540 | for (int i = 1; i< uint64list.size(); i++) { 541 | uint64_t vl = uint64list[i]; 542 | uint32_t valcnt = valuecnt; 543 | while (valcnt -- ) { 544 | uint32_t pq = vl & mask; 545 | vl >>= maxnl; 546 | ret[pq] += p[i]; 547 | } 548 | } 549 | return ret; 550 | } 551 | 552 | map L2EncodedValueListNode::computeProb(map &p) { 553 | map ret; 554 | if (encodetype == L2NodeTypes::VALUE_INDEX_ENCODED) { 555 | int high = lines.size()/IOLengthInBytes; 556 | for (int i = 1; i decode; 558 | valuelistDecode(&lines[IOLengthInBytes*i], decode, IOLengthInBytes); 559 | if (decode.size()==0) continue; 560 | uint32_t last=0; 561 | for (uint32_t v = 0; v< decode.size(); v++) { 562 | last += decode[v]; 563 | ret[last] += p[i]; 564 | } 565 | } 566 | } 567 | if (encodetype == L2NodeTypes::MAPP) { 568 | int high = lines.size()/IOLengthInBytes; 569 | for (int i = 1; i (lines.begin()+IOLengthInBytes * i, lines.begin() + IOLengthInBytes * (i+1)); 571 | for (int q = 0; q 3 | #include 4 | #include 5 | #include 6 | #include "othello.h" 7 | #include "util.h" 8 | #include 9 | #include 10 | 11 | using namespace std; 12 | 13 | uint32_t valuelistEncode(uint8_t *, vector &val, bool really); //return encode length in byte. 
14 | 15 | uint32_t valuelistDecode(uint8_t *, vector &val, uint32_t maxmem); 16 | 17 | typedef uint64_t keyType; 18 | namespace L2NodeTypes { 19 | static const int VALUE_INDEX_SHORT = 16; 20 | static const int VALUE_INDEX_ENCODED = 17; 21 | static const int MAPP = 4; 22 | const map typestr= { {VALUE_INDEX_SHORT, "ShortValueList"}, {VALUE_INDEX_ENCODED,"EncodedValueList"}, {MAPP,"Bitmap"}}; 23 | }; 24 | 25 | class L2Node { 26 | public: 27 | virtual int getType() = 0; 28 | virtual bool smartQuery(const keyType *k, vector &ret, vector &retmap) = 0; 29 | virtual void add(keyType &k, vector &) = 0; 30 | virtual void addMAPP(keyType &k, vector &mapp) = 0; 31 | virtual void writeDataToGzipFile() = 0; 32 | virtual void loadDataFromGzipFile() = 0; 33 | uint32_t keycnt = 0; 34 | IOBuf * values; 35 | IOBuf * keys; 36 | void constructOth(); 37 | uint32_t entrycnt = 0; 38 | Othello *oth = NULL; 39 | virtual void putInfoToXml(tinyxml2::XMLElement *) = 0; 40 | virtual uint64_t getvalcnt() = 0; 41 | static std::shared_ptr createL2Node( tinyxml2::XMLElement *p, string folder=""); 42 | string gzfname; 43 | virtual double expectedOnes(double &) = 0; 44 | map getRates(); 45 | virtual map computeProb(map &) = 0; 46 | virtual int getEntrycnt() = 0; 47 | }; 48 | 49 | class L2ShortValueListNode : public L2Node { 50 | vector uint64list; 51 | uint32_t valuecnt, maxnl, mask; 52 | uint32_t siz = 0; 53 | map valuemap; 54 | uint32_t IOLengthInBytes; 55 | void definetypes() { 56 | mask = (1<((fname+".keys").c_str()); 70 | values = new IOBuf((fname+".values").c_str()); 71 | definetypes(); 72 | } 73 | ~L2ShortValueListNode() {} 74 | bool smartQuery(const keyType *k, vector &ret, vector &retmap) override; 75 | void add(keyType &k, vector &) override; 76 | void addMAPP(keyType &, vector &) override { 77 | throw invalid_argument("can not add bitmap to L2ShortValuelist type"); 78 | } 79 | void writeDataToGzipFile() override; 80 | void loadDataFromGzipFile() override; 81 | void 
putInfoToXml(tinyxml2::XMLElement *) override; 82 | uint64_t getvalcnt() override; 83 | double expectedOnes(double &) override; 84 | map computeProb(map &); 85 | int getEntrycnt(); 86 | }; 87 | 88 | class L2EncodedValueListNode : public L2Node { 89 | vector lines; 90 | uint32_t siz = 0; 91 | uint32_t IOLengthInBytes, encodetype; 92 | uint32_t keycnt = 0; 93 | map valuemap; 94 | public: 95 | int getType() override { 96 | return encodetype; 97 | } 98 | gzFile fdata = NULL; 99 | L2EncodedValueListNode(uint32_t _IOLengthInBytes, uint32_t _encodetype, string fname) : IOLengthInBytes(_IOLengthInBytes), encodetype(_encodetype) { 100 | if (encodetype != L2NodeTypes::MAPP && encodetype!= L2NodeTypes::VALUE_INDEX_ENCODED) 101 | throw invalid_argument("can not add bitmap to L2ShortValuelist type"); 102 | L2Node::gzfname = fname; 103 | keys = new IOBuf((fname+".keys").c_str()); 104 | values = new IOBuf((fname+".values").c_str()); 105 | } 106 | ~L2EncodedValueListNode() {} 107 | bool smartQuery(const keyType *k, vector &ret, vector &retmap) override; 108 | void add(keyType &k, vector &) override; 109 | void addMAPP(keyType &k, vector &mapp) override; 110 | void writeDataToGzipFile() override; 111 | void loadDataFromGzipFile() override; 112 | void putInfoToXml(tinyxml2::XMLElement *) override; 113 | uint64_t getvalcnt() override; 114 | double expectedOnes(double &) override; 115 | map computeProb(map &) override; 116 | int getEntrycnt(); 117 | }; 118 | 119 | -------------------------------------------------------------------------------- /seqothlib/disjointset.h: -------------------------------------------------------------------------------- 1 | // This file is a part of SeqOthello. Please refer to LICENSE.TXT for the LICENSE 2 | #pragma once 3 | #include 4 | using namespace std; 5 | /*! \file disjointset.h 6 | * Disjoint Set data structure. 7 | */ 8 | /*! 9 | * \brief Disjoint Set data structure. Helps to test the acyclicity of the graph during construction. 
/*!
 * \brief Disjoint-set (union-find) forest over the integer ids [0, n).
 *
 * Used to test the acyclicity of the graph during construction.
 *
 * Every slot starts as -1 ("never looked up"). The first getfa() on such a
 * slot turns it into a singleton root, so isroot() is false for an element
 * until getfa(), merge() or sameset() has touched it.
 *
 * The parent table is a value member, so the object releases its memory on
 * destruction and can be copied safely; finish() remains available to free
 * the table early.
 */
class DisjointSet {
    //! Parent link per element; -1 marks an element that was never looked up.
    std::vector<int> fa;
public:
    /*!
     * \brief Find the representative of the set containing \p i.
     *
     * Iterative (no recursion, so arbitrarily long parent chains cannot
     * overflow the call stack) and applies full path compression: every node
     * visited ends up pointing directly at the root.
     * \param i element id, must be in [0, n).
     * \return id of the root of i's set.
     */
    uint32_t getfa(int i) {
        // Pass 1: walk up until a root (self-link) or an untouched slot (-1).
        int root = i;
        while (fa[root] >= 0 && fa[root] != root)
            root = fa[root];
        fa[root] = root;   // an untouched slot becomes its own root
        // Pass 2: re-point every node on the path straight at the root.
        while (fa[i] != root) {
            const int next = fa[i];
            fa[i] = root;
            i = next;
        }
        return root;
    }
    //! Release the memory to save some space. Safe to call more than once.
    void finish() {
        std::vector<int>().swap(fa);
    }
    //! (Re)size the structure to \p n elements and mark all of them untouched.
    void setLength(int n) {
        fa.assign(n, -1);
    }
    //! Re-initialize the disjoint sets: every element becomes untouched again.
    void clear() {
        for (auto &parent : fa)
            parent = -1;
    }
    /*!
     * \brief Union the sets of \p a and \p b; the root of a's set becomes the
     * root of the merged set. When a == 0 the two arguments are exchanged
     * first, so b's root is the one that survives in that case.
     */
    void merge(int a, int b) {
        if (a == 0) std::swap(a, b); //a!=0
        const uint32_t rootA = getfa(a);
        fa[getfa(b)] = rootA;
    }
    //! True when \p a and \p b currently belong to the same set.
    bool sameset(int a, int b) {
        return getfa(a) == getfa(b);
    }
    //! True when \p a is a root that has already been looked up at least once.
    bool isroot(int a) const {
        return fa[a] == a;
    }
};
29 | priority_queue PQ; 30 | bool combineMode = false; //used when there are >=800 files; 31 | uint32_t combineCount; // split the file into combineCount groups, 32 | bool getFileIsSorted() { 33 | return true; 34 | } 35 | KmerReader *filter = NULL; 36 | protected: 37 | keyType nextPossibleKey; 38 | bool hasNext = true; 39 | void setKmerFilter(KmerReader * _reader) { 40 | filter = _reader; 41 | hasNext = filter->getNext(&nextPossibleKey); 42 | } 43 | 44 | void groupFile(string fname, vector lf, string prefix, int32_t idshift, bool useBinaryKmerFile, const char * tmpfolder) { 45 | vector *> readers; 46 | priority_queue PQN; 47 | for (string s: lf) { 48 | string fname = prefix + s ; 49 | if (useBinaryKmerFile) 50 | readers.push_back(new BinaryKmerReader(fname.c_str())); 51 | else { 52 | string tmpfname(tmpfolder); 53 | tmpfname = tmpfname + s + ".bintmp"; 54 | readers.push_back(new SortedKmerTxtReader(fname.c_str(),KmerLength,NULL)); 55 | } 56 | keyType key; 57 | readers[readers.size()-1]->getNext(&key); 58 | KIDpair kid = {key, (uint32_t) (idshift+readers.size()-1), false}; 59 | PQN.push(kid); 60 | } 61 | 62 | MultivalueFileReaderWriter * writer; 63 | writer = new MultivalueFileReaderWriter (fname.c_str(),8,2,false,filter); 64 | 65 | 66 | // Loop key for these files; 67 | while (true) { 68 | keyType key = PQN.top().k; 69 | uint32_t id = PQN.top().id; 70 | vector ret; 71 | if (PQN.top().finished) { 72 | for (auto r: readers) { 73 | r->finish(); 74 | delete r; 75 | } 76 | printf("find %d keys\n", writer->getKeycount()); 77 | writer->finish(); 78 | delete writer; 79 | return; 80 | } 81 | while (PQN.top().k == key && !PQN.top().finished) { 82 | int tid = PQN.top().id; 83 | if (detailmap) ret.push_back(tid); 84 | keyType nextk; 85 | bool finish = !readers[tid-idshift]->getNext(&nextk); 86 | PQN.pop(); 87 | KIDpair kid = {nextk, (uint32_t) tid, finish}; 88 | PQN.push(kid); 89 | } 90 | writer->write(&key, ret); 91 | } 92 | } 93 | vector< vector > grpTmpValue; 94 | 95 | 
ConstantLengthKmerHelper *pubhelper; 96 | 97 | public: 98 | uint32_t KmerLength; 99 | virtual int getFileCount() { 100 | return fnamecnt; 101 | } 102 | virtual void finish() { 103 | for (auto f: fV) fclose(f); 104 | } 105 | virtual void reset() { 106 | printf(" Do not support reset() \n"); 107 | } 108 | KmerGroupedReader() {} 109 | KmerGroupedReader(const char * NCBIfname, const char * fnameprefix, const char * tmpFileDirectory, uint32_t _KmerLength, bool useBinaryKmerFile = true, bool _Detailmap = true, KmerReader *_filter = NULL ) { 110 | detailmap = _Detailmap; 111 | KmerLength = _KmerLength; 112 | ConstantLengthKmerHelper helper(KmerLength,-1); 113 | pubhelper = new ConstantLengthKmerHelper(KmerLength,-1); 114 | if (_filter!=NULL) 115 | setKmerFilter(_filter); 116 | FILE * fNCBI; 117 | string prefix ( fnameprefix); 118 | fNCBI = fopen(NCBIfname, "r"); 119 | //Assuming each line of the file contains a filename. unless , if the first line is '=XX', then this file describes a list of intermediate files created during the construction., and there are XX samples. 
e.g, '=2560' 120 | char buf[4096]; 121 | readers.clear(); 122 | vector fnames; 123 | while (true) { 124 | if (fgets(buf, 4096, fNCBI) == NULL) break; // read a Species 125 | string fname(buf); 126 | if (*fname.rbegin() == '\n') fname = fname.substr(0,fname.size()-1); 127 | fnames.push_back(fname); 128 | } 129 | bool combineFromIntermediate = (fnames[0][0] == '='); 130 | if (combineFromIntermediate) { 131 | char buf[1024]; 132 | strcpy(buf, fnames[0].c_str()); 133 | sscanf(buf+1,"%d",&fnamecnt); 134 | } 135 | else 136 | fnamecnt = fnames.size(); 137 | #ifndef FILE_PER_GRP 138 | #define FILE_PER_GRP 500 139 | #endif 140 | int nn = FILE_PER_GRP; 141 | combineMode = (fnames.size()>nn) || combineFromIntermediate; 142 | if (combineMode) { 143 | int curr = 0; 144 | int combineCount = 0; 145 | vector grpfnames; 146 | if (combineFromIntermediate) { 147 | fnames.erase(fnames.begin()); 148 | string tmpFolder(tmpFileDirectory); 149 | for (auto &s: fnames) { 150 | grpfnames.push_back(tmpFolder+s); 151 | } 152 | } 153 | else 154 | while (curr < fnames.size()) { 155 | vector * fnamesInThisgrp ; 156 | if (curr + nn < fnames.size()) 157 | fnamesInThisgrp = new vector (fnames.begin()+curr, fnames.begin()+curr+nn); 158 | else 159 | fnamesInThisgrp = new vector (fnames.begin()+curr, fnames.end()); 160 | stringstream ss; 161 | string tmpFolder(tmpFileDirectory); 162 | 163 | ss<> fnamegrp; 166 | grpfnames.push_back(fnamegrp); 167 | printf("merge kmer files %d %d to grp %s\n", curr, curr+fnamesInThisgrp->size()-1, fnamegrp.c_str()); 168 | groupFile(fnamegrp, *fnamesInThisgrp, prefix, curr, useBinaryKmerFile,tmpFileDirectory); 169 | curr += fnamesInThisgrp->size(); 170 | delete fnamesInThisgrp; 171 | } 172 | combineCount = grpfnames.size(); 173 | for (string v: grpfnames) { 174 | grpreaders.push_back( new MultivalueFileReaderWriter(v.c_str(), sizeof(keyType),2, true)); 175 | keyType key; 176 | uint16_t valuebuf[1024]; 177 | grpreaders[grpreaders.size()-1]->getNext(&key, valuebuf); 178 | 
vector Vvaluebuf; 179 | for (int i = 0 ; grpreaders[0]->valid(valuebuf[i]); i++) 180 | Vvaluebuf.push_back(valuebuf[i]); 181 | grpTmpValue.push_back(Vvaluebuf); 182 | KIDpair kid = {key, (uint32_t) (grpreaders.size()-1), false}; 183 | PQ.push(kid); 184 | } 185 | } 186 | else 187 | for (int i = 0 ; i < fnames.size(); i++) { 188 | string fname = prefix + fnames[i] ; 189 | if (useBinaryKmerFile) 190 | readers.push_back(new BinaryKmerReader(fname.c_str())); 191 | else { 192 | string tmpfname(tmpFileDirectory); 193 | tmpfname = tmpfname + fnames[i] + ".bintmp"; 194 | readers.push_back(new SortedKmerTxtReader(fname.c_str(),KmerLength,tmpfname.c_str())); 195 | } 196 | keyType key; 197 | readers[readers.size()-1]->getNext(&key); 198 | KIDpair kid = {key, (uint32_t) (readers.size()-1), false}; 199 | PQ.push(kid); 200 | } 201 | filter->reset(); 202 | fclose(fNCBI); 203 | } 204 | 205 | ~KmerGroupedReader() { 206 | if (combineMode) { 207 | for (int i = 0 ; i < grpreaders.size(); i++) 208 | delete grpreaders[i]; 209 | } 210 | else 211 | for (int i = 0 ; i < readers.size(); i++) 212 | delete readers[i]; 213 | } 214 | unsigned long long keycount = 0; 215 | unsigned long long matchcount = 0; 216 | virtual bool getNext( BinaryBitSet *kvpair) { 217 | if (filter == NULL) 218 | return getNextImpl(kvpair); 219 | while (hasNext) { 220 | if (!getNextImpl(kvpair)) return hasNext = false; 221 | if (kvpair->k < nextPossibleKey) 222 | while ((kvpair->k < nextPossibleKey)) { 223 | if (!getNextImpl(kvpair)) return false; 224 | } 225 | if (kvpair->k == nextPossibleKey) { 226 | hasNext = filter->getNext(&nextPossibleKey); 227 | matchcount++; 228 | return true; 229 | } 230 | while (hasNext && (kvpair->k > nextPossibleKey)) { 231 | hasNext = filter->getNext(&nextPossibleKey); 232 | } 233 | if (kvpair->k == nextPossibleKey) { 234 | if (hasNext) 235 | hasNext = filter->getNext(&nextPossibleKey); 236 | matchcount++; 237 | return true; 238 | } 239 | } 240 | return false; 241 | } 242 | private: 243 | 
244 | template 245 | bool getNextValueListImpl(keyType *k, vector &ret) { 246 | *k = PQ.top().k; 247 | if (PQ.top().finished) { 248 | finish(); 249 | return false; 250 | } 251 | ret.clear(); 252 | while (PQ.top().k == *k && !PQ.top().finished) { 253 | int tid; 254 | tid = PQ.top().id; 255 | keyType nextk; 256 | bool finish; 257 | if (combineMode) { 258 | ret.insert(ret.end(),grpTmpValue[tid].begin(),grpTmpValue[tid].end()); 259 | int ll = grpTmpValue[tid].size(); 260 | uint16_t valuebuf[1024]; 261 | finish = !grpreaders[tid]->getNext(&nextk, valuebuf); 262 | if (detailmap) { 263 | grpTmpValue[tid].clear(); 264 | for (int i = 0; grpreaders[tid]->valid(valuebuf[i]); i++) 265 | grpTmpValue[tid].push_back(valuebuf[i]); 266 | } 267 | } 268 | else { 269 | if (detailmap) ret.push_back(tid); 270 | finish = !readers[tid]->getNext(&nextk); 271 | } 272 | PQ.pop(); 273 | KIDpair kid = {nextk, (uint32_t) tid, finish}; 274 | PQ.push(kid); 275 | } 276 | updatekeycount(); 277 | return true; 278 | } 279 | public: 280 | virtual bool getNextImpl( BinaryBitSet *kvpair) 281 | { 282 | vector ret; 283 | if (!getNextValueListImpl( & kvpair->k, ret)) return false; 284 | kvpair -> reset(); 285 | if (detailmap) 286 | for (auto a: ret) { 287 | kvpair->setvalue(a); 288 | } 289 | return true; 290 | } 291 | 292 | template 293 | bool getNextValueList(keyType *k, vector &v) { 294 | v.clear(); 295 | if (filter == NULL) 296 | return getNextValueListImpl(k,v); 297 | while (hasNext) { 298 | if (!getNextValueListImpl(k,v)) return hasNext = false; 299 | if (*k < nextPossibleKey) 300 | while ((*k < nextPossibleKey)) { 301 | if (!getNextValueListImpl(k,v)) return false; 302 | } 303 | if (*k == nextPossibleKey) { 304 | hasNext = filter->getNext(&nextPossibleKey); 305 | matchcount++; 306 | return true; 307 | } 308 | while (hasNext && (*k > nextPossibleKey)) { 309 | hasNext = filter->getNext(&nextPossibleKey); 310 | } 311 | if (*k == nextPossibleKey) { 312 | if (hasNext) 313 | hasNext = 
filter->getNext(&nextPossibleKey); 314 | matchcount++; 315 | return true; 316 | } 317 | } 318 | 319 | } 320 | protected: 321 | virtual void updatekeycount() { 322 | keycount ++; 323 | if (keycount > 1000000) 324 | if ((keycount & (keycount-1))==0) { 325 | printcurrtime(); 326 | printf("Got %lld keys\n", keycount); 327 | if (filter!=NULL) 328 | printf("Passed %lld keys\n", matchcount); 329 | if (combineMode) { 330 | printf("Currpos:\t"); 331 | for (auto a: grpreaders) 332 | printf("%lld\t", a->getpos()); 333 | printf("\n"); 334 | } 335 | 336 | } 337 | } 338 | }; 339 | 340 | 341 | template 342 | class KmerExpressionGroupedReader : KmerGroupedReader { 343 | struct KVID3 { 344 | keyType k; 345 | uint16_t v; 346 | uint32_t id; 347 | bool finished; 348 | bool friend operator <( const KVID3 &a, const KVID3 &b) { 349 | if (a.finished != b.finished) return (((int) a.finished) > ((int) b.finished)); 350 | return a.k>b.k; 351 | } 352 | }; 353 | int keycount = 0; 354 | priority_queue PQ; 355 | vector *> readers; 356 | // combinemode not suported yet. 357 | public: 358 | KmerExpressionGroupedReader(const char * NCBIfname, const char * fnameprefix, const char * tmpFileDirectory, uint32_t _KmerLength, bool useBinaryKmerFile = true, ValueConverter * valueConverter = NULL, KmerReader * _filter = NULL ) { 359 | KmerGroupedReader:: KmerLength = _KmerLength; 360 | ConstantLengthKmerHelper helper(_KmerLength,-1); 361 | FILE * fNCBI; 362 | string prefix ( fnameprefix); 363 | fNCBI = fopen(NCBIfname, "r"); 364 | //Assuming each line of the file contains a filename. 
365 | char buf[4096]; 366 | readers.clear(); 367 | vector fnames; 368 | if (_filter!=NULL) 369 | this->setKmerFilter(_filter); 370 | while (true) { 371 | if (fgets(buf, 4096, fNCBI) == NULL) break; // read a Species 372 | string fname(buf); 373 | fnames.push_back(fname); 374 | } 375 | KmerGroupedReader::fnamecnt = fnames.size(); 376 | if (fnames.size()> 500) { 377 | printf("Do not support >=500 names yet\n"); 378 | return; 379 | } 380 | if (!useBinaryKmerFile) { 381 | printf("File type not supported. Please use 64-16 binary files"); 382 | return; 383 | } 384 | for (int i = 0 ; i < fnames.size(); i++) { 385 | string fname = prefix + fnames[i] ; 386 | readers.push_back(new SinglevalueFileReaderWriter(fname.c_str(),true, valueConverter)); 387 | keyType key; 388 | uint16_t value; 389 | readers[readers.size()-1]->getNext(&key,&value); 390 | KVID3 kid = {key, value, (uint32_t) (readers.size()-1), false}; 391 | PQ.push(kid); 392 | } 393 | fclose(fNCBI); 394 | } 395 | 396 | virtual bool getNextImpl( BinaryBitSet *kvpair) override { 397 | keyType key = PQ.top().k; 398 | vector> ret; 399 | if (PQ.top().finished) { 400 | 401 | KmerGroupedReader::finish(); 402 | return false; 403 | } 404 | while (PQ.top().k == key && !PQ.top().finished) { 405 | int tid; 406 | tid = PQ.top().id; 407 | 408 | keyType nextk; 409 | bool finish; 410 | ret.push_back({tid,PQ.top().v}); 411 | uint16_t nextv; 412 | finish = !readers[tid]->getNext(&nextk, &nextv); 413 | PQ.pop(); 414 | KVID3 kid = {nextk, nextv, (uint32_t) tid, finish}; 415 | PQ.push(kid); 416 | } 417 | kvpair -> reset(); 418 | kvpair -> k = key; 419 | for (auto &a: ret) { 420 | if (VALUEBIT == 1) 421 | kvpair->setvalue(a.first); 422 | if (VALUEBIT > 1) { 423 | int id = (a.first); 424 | (*kvpair).setvalue(id, a.second & ((1<:: updatekeycount(); 428 | return true; 429 | } 430 | }; 431 | 432 | template 433 | class TestKmerGroupedReader : public KmerGroupedReader { //: public FileReader { 434 | public: 435 | int fncnt; 436 | int keyleft; 
437 | TestKmerGroupedReader(int _fn, int _kl) { 438 | fncnt = _fn; 439 | keyleft = _kl; 440 | } 441 | int getFileCount() override { 442 | return fncnt; 443 | } 444 | void finish() override {} 445 | void reset() override {} 446 | int keycount = 0; 447 | bool getNext( BinaryBitSet *kvpair) override { 448 | if (keyleft ==0) return false; 449 | kvpair -> reset(); 450 | kvpair -> k = (((keyType) keyleft << 32) | keycount ^ 0x49218c3); 451 | int i = 1; 452 | uint32_t tmp = keycount^(keycount*keycount) ^ 0x18c83; 453 | kvpair -> setvalue(tmp % fncnt); 454 | keyleft --; 455 | keycount ++; 456 | if ((keycount & (keycount-1))==0) { 457 | printf("Got %lld keys\n", keycount); 458 | } 459 | return true; 460 | } 461 | }; 462 | 463 | -------------------------------------------------------------------------------- /seqothlib/jellyfish_helper.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define HAVE_NUMERIC_LIMITS128 13 | 14 | #include 15 | #include 16 | //#include 17 | #include 18 | 19 | 20 | template 21 | class JellyfishFileReader : public FileReader { 22 | jellyfish::file_header header; 23 | std::ifstream ifs; 24 | ConstantLengthKmerHelper *io_helper; 25 | binary_reader *reader; 26 | public: 27 | int kmerlength = 0; 28 | JellyfishFileReader(const char * fname) : ifs(fname) { 29 | header.read(ifs); 30 | jellyfish::mer_dna::k(header.key_len() / 2); 31 | io_helper = new ConstantLengthKmerHelper(header.key_len()/2, 0); 32 | kmerlength = header.key_len() / 2; 33 | reader = new binary_reader(ifs, &header); 34 | } 35 | void finish() { 36 | } 37 | void reset() { 38 | ifs.clear(); 39 | ifs.seekg(0); 40 | header.read(ifs); 41 | } 42 | #pragma GCC push_options 43 | #pragma GCC optimize ("O0") 44 | bool getNext(keyType *T, valueType *V) { 45 | if (!reader->next()) return false; 46 | // stringstream ss; ss<< 
reader->key(); 47 | // string s; 48 | // ss >> s; 49 | // string s = reader->key(); 50 | *V = reader->val(); 51 | *T = *(reader->key().data()); 52 | // keyType T2; valueType V2; 53 | // char buf[60]; 54 | // strcpy(buf, s.c_str()); 55 | // io_helper->convert(buf, &T2, &V2); 56 | // cout << "Reader: " << reader->key(); 57 | // cout << "Reader key:" << *T << " Reconstruct" << T2 << endl; 58 | } 59 | #pragma GCC pop_options 60 | bool getFileIsSorted() { 61 | return false; 62 | } 63 | }; 64 | -------------------------------------------------------------------------------- /seqothlib/othellotypes.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "io_helper.hpp" 7 | using namespace std; 8 | struct KV6432 { 9 | uint64_t k; 10 | uint32_t v; 11 | bool friend operator <( const KV6432 &a, const KV6432 &b) { 12 | return a.k < b.k; 13 | } 14 | } __attribute__((packed)); 15 | 16 | 17 | template //filereader supports ->get(k,v) 18 | //this->getNext.. 
gets vector> 19 | class SGroupReader { 20 | struct KIDpair { 21 | keyType k; 22 | valueType v; 23 | uint32_t id; 24 | bool finished; 25 | bool friend operator <( const KIDpair &a, const KIDpair &b) { 26 | if (a.finished != b.finished) return (((int) a.finished) > ((int) b.finished)); 27 | return a.k>b.k; 28 | } 29 | }; 30 | vector readers; 31 | priority_queue PQ; 32 | protected: 33 | bool hasNext = true; 34 | 35 | public: 36 | SGroupReader() {} 37 | SGroupReader(vector & fnames) { 38 | for (auto const fname : fnames) { 39 | readers.push_back(new fileReader(fname.c_str())); 40 | keyType k; 41 | valueType v; 42 | readers[readers.size()-1]->getNext(&k, &v); 43 | KIDpair kid = {k, v, (uint32_t) (readers.size()-1), false}; 44 | PQ.push(kid); 45 | } 46 | } 47 | 48 | ~SGroupReader() { 49 | for (int i = 0 ; i < readers.size(); i++) 50 | delete readers[i]; 51 | } 52 | unsigned long long keycount = 0; 53 | unsigned long long matchcount = 0; 54 | public: 55 | bool getNextValueList(keyType &k, vector > &ret) { 56 | k = PQ.top().k; 57 | if (PQ.top().finished) { 58 | return false; 59 | } 60 | ret.clear(); 61 | while (PQ.top().k == k && !PQ.top().finished) { 62 | int tid; 63 | tid = PQ.top().id; 64 | keyType nextk; 65 | valueType nextv; 66 | ret.push_back(make_pair(tid,PQ.top().v)); 67 | bool finish = !readers[tid]->getNext(&nextk, &nextv); 68 | PQ.pop(); 69 | KIDpair kid = {nextk, nextv, (uint32_t) tid, finish}; 70 | PQ.push(kid); 71 | } 72 | updatekeycount(); 73 | return true; 74 | } 75 | 76 | protected: 77 | virtual void updatekeycount() { 78 | keycount ++; 79 | if (keycount > 1000000) 80 | if ((keycount & (keycount-1))==0) { 81 | printcurrtime(); 82 | printf("Got %lld keys\n", keycount); 83 | } 84 | } 85 | }; 86 | 87 | /* 88 | template 89 | class GrpReader { 90 | public: 91 | std::shared_ptr< 92 | SGroupReader< 93 | uint64_t, //key 94 | vector>, //each value is a list of ID/value 95 | ValuelistExpressionReader 96 | > > reader; 97 | int fnamecnt; 98 | int maxlimit = 
0x7FFFFFFF; 99 | GrpReader(string fname, string folder, int _maxlimit = -1) { 100 | if (_maxlimit >=0) 101 | maxlimit = _maxlimit; 102 | 103 | FILE * fNCBI; 104 | fNCBI = fopen(fname.c_str(), "r"); 105 | char buf[4096]; 106 | vector fnames; 107 | while (true) { 108 | if (fgets(buf, 4096, fNCBI) == NULL) break; 109 | string fname(buf); 110 | if (*fname.rbegin() == '\n') 111 | fname = fname.substr(0,fname.size()-1); 112 | fnames.push_back(folder+fname); 113 | } 114 | reader = make_shared< SGroupReader< 115 | uint64_t, //key 116 | vector>, //each value is a list of ID/value 117 | ValuelistExpressionReader 118 | >>(fnames); 119 | fnamecnt = fnames.size(); 120 | } 121 | bool getNextValueList( 122 | keyType &k, 123 | vector< pair > & ans) 124 | { 125 | vector> >> ret; //return is a list of pair, where V is a vector> 126 | bool res = reader->getNextValueList(k, ret); 127 | if (res) { 128 | ans.clear(); 129 | for (auto const &v: ret) { 130 | ans.insert(ans.end(), v.second.begin(), v.second.end()); 131 | } 132 | } 133 | return res & (maxlimit-- > 0); 134 | } 135 | }; 136 | */ 137 | 138 | inline int encodelengths(const vector &data, int &bestk) { 139 | //encode this as: 140 | //4 bits: k (max = 16, min=2) 141 | //next k*n bits: 142 | // encode the 'gaps': x0,x1,x2....: 143 | // index of 1's: x0, x0+x1+1, x0+x1+x2+2, ... 144 | // range of x_i: >=0 145 | // encoded as: (say k=4), 146 | // 0..0xE: 0..14 147 | // 15t+14: t times of F, and a cell of 0..E 148 | // at the end: all F. 
//! Split a C-style string on the delimiter character \p deli.
//! Empty tokens (produced by leading, trailing or consecutive delimiters) are
//! dropped. A null \p str yields an empty vector instead of being passed to
//! the std::string constructor, which would be undefined behaviour.
std::vector<std::string> split(const char * str, char deli) {
    std::vector<std::string> ret;
    if (str == nullptr)
        return ret;
    std::istringstream ss(str);
    std::string token;
    while (std::getline(ss, token, deli)) {
        if (!token.empty())
            ret.push_back(token);
    }
    return ret;
}
//! Render a 64-bit count in K/M/G units (powers of 1024), e.g. 102400 -> "100K".
//! Values up to 1024 are printed verbatim. Within each unit, values up to ten
//! units are printed as a fraction and larger ones as a truncated integer.
//! Only the fractional K tier limits the output to two significant digits;
//! the fractional M tier and the G tier use the stream's default precision.
std::string human(uint64_t word) {
    const uint64_t kKilo = 1024;
    const uint64_t kMega = 1048576;
    const uint64_t kGiga = 1073741824;

    std::stringstream text;
    if (word > kGiga) {
        text << word * 1.0 / kGiga << "G";
    } else if (word > 10 * kMega) {
        text << word / kMega << "M";
    } else if (word > kMega) {
        text << word * 1.0 / kMega << "M";
    } else if (word > 10 * kKilo) {
        text << word / kKilo << "K";
    } else if (word > kKilo) {
        text << std::setprecision(2) << word * 1.0 / kKilo << "K";
    } else {
        text << word;
    }
    return text.str();
}
Jellyfish_Matrix Jellyfish_Mer_DNA) 17 | 18 | ADD_EXECUTABLE(Group ${GROUP_SRC}) 19 | TARGET_LINK_LIBRARIES(Group pthread tinyxml2 libUtil) 20 | 21 | ADD_EXECUTABLE(Build ${BUILD_SRC}) 22 | TARGET_LINK_LIBRARIES(Build tinyxml2 z pthread libL2Node libL1Node) 23 | 24 | ADD_EXECUTABLE(Query ${QUERY_SRC}) 25 | TARGET_LINK_LIBRARIES(Query tinyxml2 z pthread libL2Node libL1Node smalltcp) 26 | 27 | ADD_EXECUTABLE(PrintRates ${PRINT_SRC}) 28 | TARGET_LINK_LIBRARIES(PrintRates tinyxml2 z pthread libL2Node libL1Node smalltcp) 29 | 30 | ADD_EXECUTABLE(Client client.cc) 31 | TARGET_LINK_LIBRARIES(Client pthread smalltcp) 32 | 33 | ADD_EXECUTABLE(Test testL1Node.cpp) 34 | TARGET_LINK_LIBRARIES(Test tinyxml2 z pthread libL2Node libL1Node) 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/build.cc: -------------------------------------------------------------------------------- 1 | // This file is a part of SeqOthello. Please refer to LICENSE.TXT for the LICENSE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | using namespace std; 19 | 20 | 21 | int main(int argc, char ** argv) { 22 | args::ArgumentParser parser("Build SeqOthello map from Group files.\n"); 23 | args::HelpFlag help(parser, "help", "Display the help menu.", {'h', "help"}); 24 | args::ValueFlag argInputname(parser, "string", "The file list containing the names of Group files created by the Group function.", {"flist"}); 25 | args::ValueFlag argFolder(parser, "string", "The directory to the Group files.", {"folder","grp-folder"}); 26 | args::ValueFlag argOutputname(parser, "string", "The directory to the SeqOthello map.", {"out-folder"}); 27 | //args::ValueFlag argThread(parser, "int", "number of parallel threads to build SeqOthello", {"thread"}); 28 | args::ValueFlag argLimit(parser, "int", "Nuumber of 
kmers used to estimate the distribution. Default 10485760.", {"estimate-limit"}); 29 | args::Flag argCountOnly(parser, "count-only", "Only count the keys and the histogram, do not build the seqOthello.", {"count-only"}); 30 | //args::ValueFlag argEXP(parser, "int", "Expression bits, optional: None, 1, 2, 4", {"exp"}); 31 | 32 | 33 | try 34 | { 35 | parser.ParseCLI(argc, argv); 36 | } 37 | catch (args::Help) 38 | { 39 | std::cout << parser; 40 | return 0; 41 | } 42 | catch (args::ParseError e) 43 | { 44 | std::cerr << e.what() << std::endl; 45 | std::cerr << parser; 46 | return 1; 47 | } 48 | catch (args::ValidationError e) 49 | { 50 | std::cerr << e.what() << std::endl; 51 | std::cerr << parser; 52 | return 1; 53 | } 54 | if (!(argInputname && argOutputname)) { 55 | std::cerr << parser; 56 | return 1; 57 | } 58 | int nThreads = 1; 59 | //if (argThread) 60 | // nThreads = args::get(argThread); 61 | vector keyHisto, encodeHisto; 62 | 63 | string prefix = ""; 64 | if (argFolder) prefix = args::get(argFolder); 65 | string fname = args::get(argInputname); 66 | FILE * ffnames = fopen(args::get(argInputname).c_str(), "r"); 67 | if (ffnames == NULL) 68 | throw std::invalid_argument("Error reading file"+argInputname); 69 | char buf[4096]; 70 | vector fnames; 71 | while (true) { 72 | if (fgets(buf, 4096, ffnames) == NULL) break; 73 | string fname(buf); 74 | if (*fname.rbegin() == '\n') fname = fname.substr(0,fname.size()-1); 75 | fnames.push_back(prefix+fname); 76 | } 77 | auto reader = make_shared>(fnames); 78 | reader->verbose = true; 79 | vector ret; 80 | vector encodebuf; 81 | 82 | uint32_t samplecount = reader->gethigh(); 83 | printf("samplecount = %d\n", samplecount); 84 | keyHisto.resize(samplecount+5); 85 | encodeHisto.resize(samplecount+5); 86 | tinyxml2::XMLDocument * xml = new tinyxml2::XMLDocument(); 87 | 88 | if (argCountOnly) { 89 | uint64_t k= 0; 90 | int64_t cnt = 0; 91 | vector< vector > detailedHisto(samplecount, vector(samplecount+1, 0)); 92 | while 
(reader->getNextValueList(k, ret)) { 93 | uint32_t keycnt = ret.size(); 94 | if (keycnt > samplecount) { 95 | printf("%d \n", keycnt); 96 | for (auto &x: ret) 97 | printf("%d ", x); 98 | printf("\n"); 99 | } 100 | keyHisto[keycnt]++; 101 | for (auto &x: ret) 102 | detailedHisto[x][keycnt]++; 103 | cnt ++; 104 | } 105 | auto pRoot = xml->NewElement("Root"); 106 | 107 | auto pcountInfo = xml->NewElement("KeyDistributionInfo"); 108 | pcountInfo->SetAttribute("TotalKeycount", cnt); 109 | for (unsigned int i = 0; i < reader->gethigh(); i++) 110 | if (keyHisto[i]) { 111 | auto pHisNode = xml->NewElement("entry"); 112 | pHisNode->SetAttribute("freq", i); 113 | pHisNode->SetAttribute("value", (uint32_t) keyHisto[i]); 114 | pcountInfo->InsertEndChild(pHisNode); 115 | } 116 | pRoot->InsertEndChild(pcountInfo); 117 | 118 | xml->InsertFirstChild(pRoot); 119 | string output = "keydistribution.xml"; 120 | xml->SaveFile(output.c_str()); 121 | auto vres = reader->getSampleInfo(); 122 | FILE *fout = fopen("histo.txt","w"); 123 | for (unsigned int i = 0 ; i < samplecount; i++) { 124 | fprintf(fout, "%s,", vres[i].c_str()); 125 | for (unsigned int j = 1; j<=samplecount; j++) 126 | fprintf(fout, "%d,", detailedHisto[i][j]); 127 | fprintf(fout,"\n"); 128 | } 129 | fclose (fout); 130 | return 0; 131 | } 132 | int limit = 10485760; 133 | if (argLimit) 134 | limit = args::get(argLimit); 135 | printf("Estimate the distribution with the first %d Kmers. 
\n", limit); 136 | uint64_t keycount = 0; 137 | auto distr = SeqOthello::estimateParameters(reader.get(), limit, keycount); 138 | /* 139 | for (int i = 0 ; i < distr.size(); i++) { 140 | printf("%d->%d\n", i, distr[i]); 141 | }*/ 142 | printf("We estimate there are %lu keys\n", keycount); 143 | reader->reset(); 144 | // auto reader = make_shared> (args::get(argInputname), args::get(argFolder)); 145 | auto seqoth = make_shared (); 146 | 147 | seqoth->constructFromReader(reader.get(), args::get(argOutputname), nThreads, distr, keycount); 148 | return 0; 149 | } 150 | -------------------------------------------------------------------------------- /src/checktidy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | clang-tidy $1 -- -I../include/ -I../lib/ --std=gnu++11 -header-filter=.* 3 | 4 | -------------------------------------------------------------------------------- /src/client.cc: -------------------------------------------------------------------------------- 1 | // This file is a part of SeqOthello. Please refer to LICENSE.TXT for the LICENSE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "socket.h" 17 | 18 | using namespace std; 19 | 20 | 21 | int main(int argc, char ** argv) { 22 | args::ArgumentParser parser("The client to query SeqOthello from a server. 
\n"); 23 | args::HelpFlag help(parser, "help", "Display the help menu.", {'h', "help"}); 24 | args::ValueFlag argTranscriptName(parser, "string", "The filename of the transcript fasta file.", {"transcript"}); 25 | args::ValueFlag resultsName(parser, "string", "The filename of the output.", {"output"}); 26 | args::ValueFlag argServerPort(parser, "int", "The port number of the SeqOthello Server.", {"port"}); 27 | args::ValueFlag argServerAdd(parser, "string", "Start a SeqOthello Server at address (default: localhost).", {"server"}); 28 | args::Flag argInteractive(parser, "", "Start interactive CLI.", {"interactive"}); 29 | args::Flag argContainmentQuery(parser, "", "Return the total number of k-mer hits.", {"kmer-hit"}); 30 | args::Flag argCoverageQuery(parser, "", "Return detailed k-mer presence/absence information for the transcript, limited to one trascript per query.", {"kmer-hit-map"}); 31 | 32 | try 33 | { 34 | parser.ParseCLI(argc, argv); 35 | } 36 | catch (args::Help) 37 | { 38 | std::cout << parser; 39 | return 0; 40 | } 41 | catch (args::ParseError e) 42 | { 43 | std::cerr << e.what() << std::endl; 44 | std::cerr << parser; 45 | return 1; 46 | } 47 | catch (args::ValidationError e) 48 | { 49 | std::cerr << e.what() << std::endl; 50 | std::cerr << parser; 51 | return 1; 52 | } 53 | FILE *fin, *fout; 54 | string helpstr = "usage: \t Q ATGCATGC..................... : Show Containment query results for transcript.\n" 55 | " : \t D ATGCATGC..................... 
: Show Coverage query results for transcript.\n" 56 | " : \t H : Show help info\n"; 57 | int x = ((int) (argInteractive)) + ((int) argContainmentQuery) + ((int) argCoverageQuery); 58 | if (x != 1) { 59 | std::cerr << " must be one of --interactive, --containment, --coverage"<< std::endl; 60 | return 1; 61 | } 62 | string query_type; 63 | if (!argInteractive) { 64 | if (!(argTranscriptName && resultsName && argServerPort)) { 65 | std::cerr << "must specify args --transcript, --output, --port" << std::endl; 66 | return 1; 67 | } 68 | #ifdef __APPLE__ 69 | fin = fopen(args::get(argTranscriptName).c_str(),"rb"); 70 | #else 71 | fin = fopen64(args::get(argTranscriptName).c_str(),"rb"); 72 | #endif 73 | string fnameout = args::get(resultsName); 74 | fout = fopen(fnameout.c_str(), "w"); 75 | 76 | if (fin == NULL) { 77 | std:: cerr << "Error reading file " << args::get(argTranscriptName) << std::endl; 78 | return 1; 79 | } 80 | if (argContainmentQuery) query_type = TYPE_CONTAINMENT; 81 | if (argCoverageQuery) query_type = TYPE_COVERAGE; 82 | } 83 | else { 84 | if ((!argServerPort) || argTranscriptName || resultsName) { 85 | std::cerr << "wrong arg" << std::endl; 86 | return 1; 87 | } 88 | fin = stdin; 89 | fout = stdout; 90 | fprintf(fout,"%s", helpstr.c_str()); 91 | } 92 | 93 | 94 | char buf[1048576]; 95 | memset(buf,0,sizeof(buf)); 96 | 97 | try { 98 | string servadd = "localhost"; 99 | if (argServerAdd) servadd = args::get(argServerAdd); 100 | TCPSocket sock(servadd.c_str(), args::get(argServerPort)); 101 | printf("Connecting to %s : %d\n", servadd.c_str(), args::get(argServerPort)); 102 | while ( fgets(buf,sizeof(buf),fin)!= NULL) { 103 | char * p, *p0; 104 | p0 = p = &buf[0]; 105 | if (argInteractive) { 106 | if ((buf[0] != 'Q' && buf[0]!='D') || buf[1]!=' ') { 107 | printf("%s\n",helpstr.c_str()); 108 | continue; 109 | } 110 | p0 = p = &buf[2]; 111 | if (buf[0] == 'Q') 112 | query_type = TYPE_CONTAINMENT; 113 | if (buf[0] == 'D') 114 | query_type = TYPE_COVERAGE; 115 
| } 116 | 117 | if (*p !='A' && *p!='T' && *p !='G' && *p != 'C') continue; 118 | while (*p == 'A' || *p == 'T' || *p == 'G' || *p == 'C' || *p == 'N') p++; 119 | *p = '\0'; 120 | if (strlen(buf)<=3) break; 121 | int strl; 122 | sock.sendmsg(query_type); 123 | sock.sendmsg(p0, strl = strlen(p0)); 124 | printf("Sent a query %s with %d bases.\n", query_type.c_str(), strl ); 125 | string buf; 126 | int tot = 0 ; 127 | while (sock.recvmsg(buf)) { 128 | if (buf.size()==0) break; 129 | fprintf(fout, "%s\n", buf.c_str()); 130 | tot += buf.size(); 131 | printf("received reponse of %d Bytes.\n", tot); 132 | } 133 | if (!argInteractive) 134 | printf("received reponse of %d Bytes.\n", tot); 135 | } 136 | sock.sendmsg(""); 137 | } 138 | catch(std::runtime_error e) { 139 | fclose(fin); 140 | fclose(fout); 141 | cerr << e.what() << endl; 142 | exit(1); 143 | } 144 | fclose(fin); 145 | fclose(fout); 146 | 147 | 148 | 149 | 150 | } 151 | -------------------------------------------------------------------------------- /src/group.cc: -------------------------------------------------------------------------------- 1 | // This file is a part of SeqOthello. 
Please refer to LICENSE.TXT for the LICENSE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | using namespace std; 19 | 20 | int getKmerLengthfromxml(string fname,bool argGroup) { 21 | fname += ".xml"; 22 | tinyxml2::XMLDocument doc; 23 | doc.LoadFile( fname.c_str() ); 24 | int ret = 0; 25 | if (argGroup) { 26 | const tinyxml2::XMLElement * pSampleInfo = doc.FirstChildElement( "Root" )->FirstChildElement( "GroupInfo" ); 27 | pSampleInfo->QueryIntAttribute("KmerLength", &ret); 28 | } else { 29 | const tinyxml2::XMLElement * pSampleInfo = doc.FirstChildElement( "Root" )->FirstChildElement( "SampleInfo" ); 30 | pSampleInfo->QueryIntAttribute("KmerLength", &ret); 31 | } 32 | return ret; 33 | } 34 | int main(int argc, char ** argv) { 35 | args::ArgumentParser parser("Use a subset of the experiments to build the SeqOthello Group file."); 36 | args::HelpFlag help(parser, "help", "Display this help menu", {'h', "help"}); 37 | args::ValueFlag argFname(parser, "string", "a file containing the filenames of the binary files. Each line of the [flist] must contain exactly one file name, e.g, xxxx.bin", {"flist"}); 38 | args::ValueFlag argFolder(parser, "string", "The directory to the output binary files.", {"folder"}); 39 | args::ValueFlag argOut(parser, "string", "The filename of the output Group file.", {"output"}); 40 | args::ValueFlag argMaxKmerCount(parser, "integer", "The maximum number of kmers used to build the Group file. 
Note: this function is for testing purpose only.", {"limit"}); 41 | args::Flag argGroup(parser,"","Create a group using some group files (debugging only).", {"group"}); 42 | try 43 | { 44 | parser.ParseCLI(argc, argv); 45 | } 46 | catch (args::Help) 47 | { 48 | std::cout << parser; 49 | return 0; 50 | } 51 | catch (args::ParseError e) 52 | { 53 | std::cerr << e.what() << std::endl; 54 | std::cerr << parser; 55 | return 1; 56 | } 57 | catch (args::ValidationError e) 58 | { 59 | std::cerr << e.what() << std::endl; 60 | std::cerr << parser; 61 | return 1; 62 | } 63 | if (!(argFname && argOut)) { 64 | // std::cerr << "Must specify args. Try --help." << std::endl; 65 | std::cerr << parser; 66 | return 1; 67 | } 68 | int limit = 0x7FFFFFFF; 69 | if (argMaxKmerCount) { 70 | limit = args::get(argMaxKmerCount); 71 | } 72 | string prefix=""; 73 | if (argFolder) 74 | prefix = args::get(argFolder); 75 | FILE * ffnames = fopen(args::get(argFname).c_str(), "r"); 76 | char buf[4096]; 77 | vector fnames; 78 | while (true) { 79 | if (fgets(buf, 4096, ffnames) == NULL) break; 80 | string fname(buf); 81 | if (*fname.rbegin() == '\n') fname = fname.substr(0,fname.size()-1); 82 | fnames.push_back(prefix+fname); 83 | } 84 | if (fnames.size()>250) { 85 | std::cerr << "Too many files in the group." << std::endl; 86 | return 1; 87 | } 88 | 89 | //check kmer length consistent; 90 | set kmerlengthset; 91 | for (auto s:fnames) { 92 | int kl = getKmerLengthfromxml(s, argGroup); 93 | kmerlengthset.insert(kl); 94 | } 95 | if (kmerlengthset.size() > 1) { 96 | std::cerr << "Kmer length not consistent." << std::endl; 97 | return 1; 98 | } 99 | if (fnames.size() == 0) { 100 | std::cerr << "fail to get kmer filelist." 
<< std::endl; 101 | return 1; 102 | 103 | } 104 | int KmerLength = *kmerlengthset.begin(); 105 | std::function &)> func; 106 | if (!argGroup) { 107 | auto const reader = new KmerGroupReader< uint64_t, BinaryKmerReader >(fnames); 108 | func = std::bind(& KmerGroupReader< uint64_t, BinaryKmerReader>::getNextValueList, reader, placeholders::_1, placeholders::_2); 109 | } 110 | else { 111 | auto const reader = new KmerGroupComposer(fnames); 112 | func = std::bind( & KmerGroupComposer::getNextValueList, reader, placeholders::_1, placeholders::_2); 113 | } 114 | uint64_t k; 115 | vector ret; 116 | auto writer = new MultivalueFileReaderWriter (args::get(argOut).c_str(), sizeof(uint64_t), sizeof(uint8_t), false); 117 | 118 | vector vhistogram(16384,0); 119 | uint64_t cnt = 0; 120 | while (func(k, ret) && (limit -- >0)) { 121 | vhistogram[ret.size()] ++; 122 | sort(ret.begin(), ret.end()); 123 | vector res; 124 | res.reserve(ret.size()); 125 | for (auto &x: ret) res.push_back(x); 126 | writer->write(&k,res); 127 | /*for (auto &r: ret) 128 | r.first+= from; 129 | writer->write(k, ret);*/ 130 | /*printf("%llx\t", k); 131 | for (auto &x: ret) 132 | printf("%d ", x); 133 | printf("\n");*/ 134 | cnt++; 135 | } 136 | writer->finish(); 137 | tinyxml2::XMLDocument xml; 138 | int filecnt = 0; 139 | auto pRoot = xml.NewElement("Root"); 140 | auto pSamples = xml.NewElement("Samples"); 141 | for (auto &fname : fnames) { 142 | string sampleXmlF = fname +".xml"; 143 | printf("%s\n", sampleXmlF.c_str()); 144 | tinyxml2::XMLDocument doc; 145 | doc.LoadFile( sampleXmlF.c_str() ); 146 | if (!argGroup) { 147 | filecnt++; 148 | const tinyxml2::XMLElement * pSampleInfo = doc.FirstChildElement( "Root" )->FirstChildElement( "SampleInfo" ); 149 | string str; 150 | const auto attr = pSampleInfo->FindAttribute("KmerFile"); 151 | if (attr) 152 | printf("query: %s\n", attr->Value()); 153 | tinyxml2::XMLNode * cpyNode = pSampleInfo->DeepClone(&xml); 154 | pSamples->InsertEndChild(cpyNode); 155 | } 156 
| else { 157 | tinyxml2::XMLElement * pSamplesFrom = doc.FirstChildElement( "Root" )->FirstChildElement( "Samples" ); 158 | for (tinyxml2::XMLElement * child = pSamplesFrom->FirstChildElement("SampleInfo"); child != NULL; child = child->NextSiblingElement()) { 159 | tinyxml2::XMLNode * cpyNode = child->DeepClone(&xml); 160 | pSamples->InsertEndChild(cpyNode); 161 | filecnt++; 162 | } 163 | 164 | } 165 | } 166 | auto pGroupInfo = xml.NewElement("GroupInfo"); 167 | pGroupInfo->SetAttribute("TotalSamples", (uint32_t) filecnt); 168 | pGroupInfo->SetAttribute("KmerLength", (uint32_t) KmerLength); 169 | pGroupInfo->SetAttribute("GroupFile", args::get(argOut).c_str() ); 170 | pGroupInfo->SetAttribute("Keycount", (int64_t) cnt); 171 | pRoot->InsertFirstChild(pGroupInfo); 172 | auto pHistogram = xml.NewElement("Histogram"); 173 | for (int i = 1; i< 16384; i++) 174 | if (vhistogram[i]) { 175 | auto pHisNode = xml.NewElement("entry"); 176 | pHisNode->SetAttribute("freq", i); 177 | pHisNode->SetAttribute("value", (uint32_t) vhistogram[i]); 178 | pHistogram->InsertEndChild(pHisNode); 179 | } 180 | pRoot->InsertEndChild(pHistogram); 181 | pRoot->InsertEndChild(pSamples); 182 | xml.InsertFirstChild(pRoot); 183 | auto xmlName = args::get(argOut) + ".xml"; 184 | xml.SaveFile(xmlName.c_str()); 185 | 186 | delete writer; 187 | cout << "wrote "< 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | int main(int argc, char * argv[]) { 14 | args::ArgumentParser parser("Convert a Jellyfish output file to binary format supported by SeqOthello.", ""); 15 | args::HelpFlag help(parser, "help", "Display this help menu", {'h', "help"}); 16 | args::ValueFlag argInputname(parser, "string", "filename for the input kmer file", {"in"}); 17 | args::ValueFlag argOutputname(parser, "string", "filename for the output binary kmer file", {"out"}); 18 | args::ValueFlag argKmerlength(parser, "integer", "k, length of 
kmer", {"k"}); 19 | args::ValueFlag nCutoff(parser, "integer", "Optional value. Only k-mers with at least [cutoff] counts are kept for building SeqOthello. ", {"cutoff"}); 20 | args::Flag argHistogram(parser, "", "Use this command to generate a histogram of k-mer expression.", {"histogram"}); 21 | args::Flag argJellyfishOutput(parser, "", "use jellyfish output file.", {"jellyfish"}); 22 | 23 | try 24 | { 25 | parser.ParseCLI(argc, argv); 26 | } 27 | catch (args::Help) 28 | { 29 | std::cout << parser; 30 | return 0; 31 | } 32 | catch (args::ParseError e) 33 | { 34 | std::cerr << e.what() << std::endl; 35 | std::cerr << parser; 36 | return 1; 37 | } 38 | catch (args::ValidationError e) 39 | { 40 | std::cerr << e.what() << std::endl; 41 | std::cerr << parser; 42 | return 1; 43 | } 44 | if (!(argInputname && argOutputname && argKmerlength)) { 45 | // std::cerr << "must specify args" << std::endl; 46 | std::cerr << parser; 47 | return 1; 48 | } 49 | 50 | int kmerlength = args::get(argKmerlength); 51 | 52 | ConstantLengthKmerHelper iohelper(kmerlength,0); 53 | 54 | vector VKmer; 55 | FileReader *freader; 56 | string finName = args::get(argInputname); 57 | string foutName = args::get(argOutputname); 58 | uint32_t cutoff = 0; 59 | if (nCutoff) 60 | cutoff = args::get(nCutoff); 61 | printf("Read files from %s\n", finName.c_str()); 62 | if (argJellyfishOutput) { 63 | auto p = new JellyfishFileReader(finName.c_str()); 64 | freader = p; 65 | kmerlength = p->kmerlength; 66 | } 67 | else { 68 | freader = new KmerFileReader (finName.c_str(), &iohelper,false); 69 | } 70 | uint32_t minInputExpression = 0x7FFFFFFF; 71 | uint64_t k; 72 | uint32_t v; 73 | if (argHistogram) { 74 | map his; 75 | while (freader->getNext(&k, &v)) { 76 | his[v]++; 77 | } 78 | FILE *fout = fopen(foutName.c_str(),"w"); 79 | for (auto &x: his) { 80 | fprintf(fout, "%d,%d\n", x.first, x.second); 81 | } 82 | fclose(fout); 83 | return 0; 84 | } 85 | while (freader->getNext(&k, &v)) { 86 | if (v < 
minInputExpression) 87 | minInputExpression = v; 88 | if (v >= cutoff) 89 | VKmer.push_back(k); 90 | } 91 | if (VKmer.size() >0 ) { 92 | printf("Sorting %lu keys\n", VKmer.size()); 93 | sort(VKmer.begin(),VKmer.end()); 94 | } 95 | else { 96 | printf("Empty kmer files\n"); 97 | } 98 | unsigned long long cnt = 0; 99 | if (VKmer.size()) { 100 | cnt ++; 101 | for (unsigned int i = 1; i < VKmer.size(); i++ ) { 102 | if (VKmer[i] != VKmer[cnt-1]) { 103 | VKmer[cnt] = VKmer[i]; 104 | cnt++; 105 | } 106 | } 107 | } 108 | printf("Writing %lld keys to %s\n", cnt, foutName.c_str()); 109 | FILE *fout = fopen(foutName.c_str(),"wb"); 110 | fwrite(&VKmer[0], cnt, sizeof(VKmer[0]), fout); 111 | fclose(fout); 112 | 113 | tinyxml2::XMLDocument xml; 114 | auto pRoot = xml.NewElement("Root"); 115 | auto pElement = xml.NewElement("SampleInfo"); 116 | pElement->SetAttribute("KmerFile", finName.c_str()); 117 | pElement->SetAttribute("KmerLength", kmerlength); 118 | pElement->SetAttribute("BinaryFile", foutName.c_str()); 119 | pElement->SetAttribute("KmerCount", (unsigned int) VKmer.size()); 120 | if (cnt) { 121 | pElement->SetAttribute("Cutoff", cutoff); 122 | pElement->SetAttribute("MinExpressionInKmerFile", minInputExpression); 123 | pElement->SetAttribute("UniqueKmerCount",(unsigned int) cnt); 124 | } 125 | pRoot->InsertEndChild(pElement); 126 | xml.InsertFirstChild(pRoot); 127 | auto xmlName = foutName + ".xml"; 128 | xml.SaveFile(xmlName.c_str()); 129 | return 0; 130 | } 131 | -------------------------------------------------------------------------------- /src/printrates.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "oltnew.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace std; 18 | 19 | 20 | int main(int argc, char ** argv) { 21 | args::ArgumentParser parser("Query 
SeqOthello。 \n"); 22 | args::HelpFlag help(parser, "help", "Display the help menu.", {'h', "help"}); 23 | args::ValueFlag argSeqOthName(parser, "string", "The directory to SeqOthello map.", {"map-folder"}); 24 | args::ValueFlag argL2(parser, "int", "L2 Node ID", {"L2"}); 25 | try 26 | { 27 | parser.ParseCLI(argc, argv); 28 | } 29 | catch (args::Help) 30 | { 31 | std::cout << parser; 32 | return 0; 33 | } 34 | catch (args::ParseError e) 35 | { 36 | std::cerr << e.what() << std::endl; 37 | std::cerr << parser; 38 | return 1; 39 | } 40 | catch (args::ValidationError e) 41 | { 42 | std::cerr << e.what() << std::endl; 43 | std::cerr << parser; 44 | return 1; 45 | } 46 | SeqOthello * seqoth; 47 | string filename = args::get(argSeqOthName); 48 | seqoth = new SeqOthello (filename, 16 ,false); 49 | if (!argL2) { 50 | seqoth->loadL1(seqoth->kmerLength); 51 | seqoth->printrates(); 52 | } 53 | if (argL2) { 54 | int L2id = args::get(argL2); 55 | seqoth->loadL2Node(L2id); 56 | if (!seqoth->vNodes[L2id]) return 0; 57 | auto othProvalues = seqoth->vNodes[L2id]->getRates(); 58 | auto retmap = seqoth->vNodes[L2id]->computeProb(othProvalues); 59 | int entrycnt = seqoth->vNodes[L2id]->getEntrycnt(); 60 | double zeroRate =0; 61 | for (auto &x:othProvalues) { 62 | if (x.first ==0 || x.first>entrycnt) 63 | zeroRate += x.second; 64 | } 65 | printf("L2 Node %d, entrycnt %d\n", L2id, entrycnt); 66 | for (auto &x: retmap) 67 | printf("%d %.8lf\n", x.first,x.second); 68 | printf("Zero %.8lf\n", zeroRate); 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/testL1Node.cpp: -------------------------------------------------------------------------------- 1 | // This file is a part of SeqOthello. 
Please refer to LICENSE.TXT for the LICENSE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | int main(int argc, char* argv [] ) { 9 | int n = 10485760; 10 | int thread = 0; 11 | sscanf(argv[1], "%d", &n); 12 | sscanf(argv[2], "%d", &thread); 13 | vector k; 14 | vector v; 15 | for (int i = 0 ; i < n ; i++) { 16 | k.push_back((rand()^(rand()<<12))&0xFFFFFFFULL); 17 | v.push_back(rand()%0xFFFF); 18 | } 19 | sort(k.begin(), k.end()); 20 | L1Node * p = new L1Node(1048575*128*4, 14, "testloc"); 21 | for (int i = 0 ; i < n ; i++) { 22 | p->add(k[i], v[i]); 23 | } 24 | p->constructAndWrite(14, thread, "test"); 25 | int splitbit = p->getsplitbit(); 26 | printf("%d", splitbit); 27 | /* 28 | L1Node *q = new L1Node(0,14); 29 | q->setsplitbit(14, splitbit); 30 | q->loadFromFile("test"); 31 | int uneq = 0; 32 | for (int i = 0 ; i < n; i++) { 33 | uint64_t res = q->queryInt(k[i]); 34 | if ((res ^ v[i])& 0xFFF) { 35 | printf("query result for %lx : %lx, %x", k[i], res, v[i]); 36 | uneq++; 37 | } 38 | } 39 | */ 40 | } 41 | /* 42 | void testVAL(vector val) { 43 | 44 | vector buf(64,0); 45 | vector buf0(64,0); 46 | uint32_t q = valuelistEncode(&buf[0], val, false); 47 | EXPECT_EQ(buf,buf0); 48 | uint32_t q2 = valuelistEncode(&buf[0], val, true); 49 | EXPECT_EQ(q,q2); 50 | vector valret; 51 | uint32_t ql = valuelistDecode(&buf[0], valret, 64); 52 | EXPECT_EQ(ql, valret.size()); 53 | EXPECT_EQ(valret, val); 54 | 55 | } 56 | TEST_F(L2NodeTest, TestEncodeDecode) { 57 | for (int i = 0 ; i < 100; i++) { 58 | int l = 1+ i % 6; 59 | vector val; 60 | for (int j = 0 ; j < l ; j++) { 61 | int x = rand() % (0xF); 62 | if (rand() & 1) x = ((x<<4) | (rand () % 0xF)); 63 | if (rand() & 1) x = ((x<<4) | (rand () % 0xF)); 64 | if (rand() & 1) x = ((x<<4) | (rand () % 0xF)); 65 | if (rand() & 1) x = ((x<<4) | (rand () % 0xF)); 66 | val.push_back(x+1); 67 | } 68 | testVAL(val); 69 | } 70 | vector val1 = { 57, 13758, 5 }; 71 | testVAL(val1); 72 | } 73 | 74 | TEST_F(L2NodeTest, 
TestL2Short) { 75 | 76 | L2Node *N = new L2ShortValueListNode (5,8); 77 | int NN = 20; 78 | std::random_device rd; //Will be used to obtain a seed for the random number engine 79 | std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() 80 | std::uniform_int_distribution<> dis(0, 0x6FFFFFFFULL); 81 | vector vK; 82 | for (uint64_t i=0; i v; 88 | for (int j = 1; j<=5; j++) v.push_back((i+j*37)%100); 89 | sort(v.begin(), v.end()); 90 | N->add(k,v); 91 | } 92 | N->constructOth(); 93 | 94 | for (uint64_t i=0; i v,vret; 97 | vector vretmap; 98 | for (int j = 1; j<=5; j++) v.push_back((i+j*37)%100); 99 | sort(v.begin(), v.end()); 100 | 101 | bool ret = N->smartQuery(&k, vret, vretmap); 102 | EXPECT_EQ(ret, true); 103 | EXPECT_EQ(vret, v); 104 | } 105 | 106 | N->gzfname = "test.gz"; 107 | N->writeDataToGzipFile(); 108 | 109 | L2Node *N2 = new L2ShortValueListNode (5,8); 110 | N2->gzfname = "test.gz"; 111 | N2->loadDataFromGzipFile(); 112 | for (uint64_t i = 0; i < NN; i++) { 113 | uint64_t k = vK[i]; 114 | vector v,vret; 115 | vector vretmap; 116 | for (int j = 1; j<=5; j++) v.push_back((i+j*37)%100); 117 | sort(v.begin(), v.end()); 118 | bool ret = N2->smartQuery(&k, vret, vretmap); 119 | 120 | EXPECT_EQ(ret, true); 121 | EXPECT_EQ(vret, v); 122 | } 123 | } 124 | 125 | TEST_F(L2NodeTest, TestL2MAPP) { 126 | std::random_device rd; //Will be used to obtain a seed for the random number engine 127 | std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() 128 | std::uniform_int_distribution<> dis(0, 255); 129 | std::uniform_int_distribution<> dis2(0, 0x6FFFFFFFULL); 130 | int totN = 101; 131 | int totK = 256; 132 | int L = 12; 133 | vector buf; 134 | vector vK; 135 | for (int i = 0; i <= totN*L; i++) { 136 | buf.push_back(dis(gen)); 137 | } 138 | for (uint64_t i=0; i tmp(buf.begin()+(i*L), buf.begin() + ((i+1)*L)); 146 | N->addMAPP(k,tmp); 147 | } 148 | N->constructOth(); 149 | 150 | N->gzfname = "test.gz"; 151 | 
N->writeDataToGzipFile(); 152 | 153 | gzFile fin = gzopen("test.gz", "rb"); 154 | L2Node *N2 = new L2ShortValueListNode (4,6); 155 | N2->gzfname = "test.gz"; 156 | N2->loadDataFromGzipFile(); 157 | 158 | for (uint64_t i=0; i vret; 161 | vector vretmap, vretmap2; 162 | vector tmp(buf.begin()+(i*L), buf.begin() + ((i+1)*L)); 163 | bool ret = N->smartQuery(&k, vret, vretmap); 164 | bool ret2 = N->smartQuery(&k, vret, vretmap2); 165 | EXPECT_EQ(ret, false); 166 | EXPECT_EQ(ret2, false); 167 | EXPECT_EQ(vretmap, tmp); 168 | EXPECT_EQ(vretmap2, tmp); 169 | } 170 | } 171 | 172 | TEST_F(L2NodeTest, TestL2EncodeLong) { 173 | std::random_device rd; //Will be used to obtain a seed for the random number engine 174 | std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() 175 | std::uniform_int_distribution<> dis(0, 255); 176 | std::uniform_int_distribution<> dis2(0, 0x6FFFFFFFULL); 177 | int totN = 101; 178 | int totK = 256; 179 | int L = 12; 180 | vector> vlists; 181 | vector vK; 182 | uint32_t maxlength = 0; 183 | for (int i = 0; i < totN; i++) { 184 | vector vec; 185 | uint32_t last; 186 | vec.push_back(last = dis(gen)); 187 | uint32_t upper = dis2(gen) % 15 + 1; 188 | for (int i = 1 ; i<= upper; i++) { 189 | last += (dis(gen) + 1); 190 | if (dis2(gen) & 1) 191 | last += dis(gen); 192 | if (dis2(gen) & 3) 193 | last += dis(gen); 194 | vec.push_back(last); 195 | } 196 | vlists.push_back(vec); 197 | vector diff; diff.push_back(vec[0]); 198 | for (int i = 1; i< vec.size(); i++) 199 | diff.push_back(vec[i] - vec[i-1]); 200 | uint32_t encodelength = valuelistEncode(NULL, diff, false); 201 | if (encodelength > maxlength) maxlength = encodelength + 1; 202 | 203 | } 204 | for (uint64_t i=0; i vec = vlists[i]; 213 | vector diff; diff.push_back(vec[0]); 214 | for (int i = 1; i< vec.size(); i++) 215 | diff.push_back(vec[i] - vec[i-1]); 216 | N->add(k,diff); 217 | } 218 | N->constructOth(); 219 | 220 | gzFile fout = gzopen("test.gz", "wb"); 221 | N->gzfname = 
"test.gz"; 222 | N->writeDataToGzipFile(); 223 | gzclose(fout); 224 | gzFile fin = gzopen("test.gz", "rb"); 225 | L2Node *N2 = new L2EncodedValueListNode (maxlength,L2NodeTypes::VALUE_INDEX_ENCODED); 226 | N2->gzfname = "test.gz"; 227 | N2->loadDataFromGzipFile(); 228 | 229 | for (uint64_t i=0; i vret, vret2; 232 | vector vretmap, vretmap2; 233 | bool ret = N->smartQuery(&k, vret, vretmap); 234 | bool ret2 = N->smartQuery(&k, vret2, vretmap2); 235 | EXPECT_EQ(ret, true); 236 | EXPECT_EQ(ret2, true); 237 | vector vl = vlists[i]; 238 | EXPECT_EQ(vret, vl); 239 | EXPECT_EQ(vret2, vl); 240 | } 241 | 242 | } 243 | */ 244 | 245 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | INCLUDE_DIRECTORIES(../lib ../seqothlib) 2 | 3 | find_package(Threads REQUIRED) 4 | 5 | ADD_EXECUTABLE(datagen datagen.cc) 6 | ADD_EXECUTABLE(comp comp.cc) 7 | 8 | 9 | if (CMAKE_BUILD_TYPE MATCHES Debug) 10 | # Enable ExternalProject CMake module 11 | include(ExternalProject) 12 | 13 | # Download and install GoogleTest 14 | ExternalProject_Add( 15 | gtest 16 | URL https://github.com/google/googletest/archive/master.zip 17 | PREFIX ${CMAKE_CURRENT_BINARY_DIR}/gtest 18 | # Disable install step 19 | INSTALL_COMMAND "" 20 | ) 21 | 22 | # Get GTest source and binary directories from CMake project 23 | ExternalProject_Get_Property(gtest source_dir binary_dir) 24 | 25 | # Create a libgtest target to be used as a dependency by test programs 26 | add_library(libgtest IMPORTED STATIC GLOBAL) 27 | add_dependencies(libgtest gtest) 28 | 29 | # Set libgtest properties 30 | set_target_properties(libgtest PROPERTIES 31 | "IMPORTED_LOCATION" "${binary_dir}/googlemock/gtest/libgtest.a" 32 | "IMPORTED_LINK_INTERFACE_LIBRARIES" "${CMAKE_THREAD_LIBS_INIT}" 33 | ) 34 | 35 | # Create a libgmock target to be used as a dependency by test programs 36 | add_library(libgmock IMPORTED 
STATIC GLOBAL) 37 | add_dependencies(libgmock gtest) 38 | 39 | # Set libgmock properties 40 | set_target_properties(libgmock PROPERTIES 41 | "IMPORTED_LOCATION" "${binary_dir}/googlemock/libgmock.a" 42 | "IMPORTED_LINK_INTERFACE_LIBRARIES" "${CMAKE_THREAD_LIBS_INIT}" 43 | ) 44 | 45 | # I couldn't make it work with INTERFACE_INCLUDE_DIRECTORIES 46 | include_directories("${source_dir}/googletest/include" 47 | "${source_dir}/googlemock/include") 48 | 49 | add_subdirectory(unit) 50 | endif() 51 | -------------------------------------------------------------------------------- /test/clear.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm -f *.xml 3 | rm -f *.grp 4 | rm -rf raw bin.64 5 | rm flist* 6 | -------------------------------------------------------------------------------- /test/comp.cc: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace std; 3 | int main(int argc, char ** argv) { 4 | auto readerA = new BinaryKmerReader(argv[1]); 5 | auto readerB = new BinaryKmerReader(argv[2]); 6 | uint64_t va=0, vb=0; 7 | bool ra = true, rb=true; 8 | int CAP = 0, DA =0, DB = 0; 9 | while (ra && rb) { 10 | if (va == vb) { 11 | uint64_t v0 = va; 12 | uint32_t cnta=0,cntb=0; 13 | while (va == v0 && ra) { 14 | ra = readerA->getNext(&va); 15 | cnta++; 16 | } 17 | while (vb == v0 && rb) { 18 | rb = readerB->getNext(&vb); 19 | cntb++; 20 | } 21 | CAP+= (cnta getNext(&va); 27 | } 28 | else { 29 | DB++; 30 | rb = readerB->getNext(&vb); 31 | } 32 | } 33 | while (ra) { DA++; ra = readerA->getNext(&va);} 34 | while (rb) { DB++; rb = readerB->getNext(&vb);} 35 | printf("%d\t%d\t%d\n", CAP,DA,DB); 36 | return 0; 37 | } 38 | -------------------------------------------------------------------------------- /test/datagen.cc: -------------------------------------------------------------------------------- 1 | // This file is a part of SeqOthello. 
Please refer to LICENSE.TXT for the LICENSE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | int main(int argc, char ** argv) { 10 | 11 | args::ArgumentParser parser("Generate a list of kmer data (k=20) for the purpose of testing.", ""); 12 | args::HelpFlag help(parser, "help", "Display this help menu", {'h', "help"}); 13 | args::ValueFlag nFiles(parser, "integer", "number of files", {'f'}); 14 | args::ValueFlag nKmers(parser, "integer", "number of kmers", {'k'}); 15 | args::Flag boolUnique(parser,"","these are unique",{'u'}); 16 | try 17 | { 18 | parser.ParseCLI(argc, argv); 19 | } 20 | catch (args::Help) 21 | { 22 | std::cout << parser; 23 | return 0; 24 | } 25 | catch (args::ParseError e) 26 | { 27 | std::cerr << e.what() << std::endl; 28 | std::cerr << parser; 29 | return 1; 30 | } 31 | catch (args::ValidationError e) 32 | { 33 | std::cerr << e.what() << std::endl; 34 | std::cerr << parser; 35 | return 1; 36 | } 37 | 38 | int files = (nFiles)?(args::get(nFiles)):(20); 39 | unsigned kmers = (nKmers)?(args::get(nKmers)):(100); 40 | auto helper = new ConstantLengthKmerHelper(20,0); 41 | 42 | 43 | uint64_t start_state = 0x185abd8c71u; /* Any nonzero start state will work. 
*/ 44 | uint64_t lfsr = start_state; 45 | uint64_t bit; /* Must be 16bit to allow bit<<15 later in the code */ 46 | 47 | 48 | 49 | 50 | char buf[32]; 51 | memset(buf,0,sizeof(buf)); 52 | uint64_t vv = 0; 53 | std::set sss; 54 | std::vector vkmer; 55 | std::string str; 56 | std::vector> vp; 57 | char fname[30]; 58 | memset(fname,0,sizeof(fname)); 59 | for (uint64_t i = 0; i< kmers; i++) { 60 | // x^40 + x ^ 38 + x^ 21 + x^9 61 | bit = ((lfsr >> 0) ^ (lfsr >> 2) ^ (lfsr >> 19) ^ (lfsr >> 21) ) & 1; 62 | lfsr = (lfsr >> 1) | (bit << 39); 63 | bit = ((lfsr >> 0) ^ (lfsr >> 2) ^ (lfsr >> 19) ^ (lfsr >> 21) ) & 1; 64 | lfsr = (lfsr >> 1) | (bit << 39); 65 | vv = lfsr; 66 | sss.insert(vv); 67 | helper->convertstring(buf,&vv); 68 | if (i == kmers+files) { 69 | std::string tot(buf); 70 | str = tot + str; 71 | } 72 | else { 73 | std::string std=" "; 74 | std[0] = buf[19]; 75 | str = std + str; 76 | vkmer.push_back(vv); 77 | } 78 | } 79 | for (unsigned int i = 0; i < kmers; i++) { 80 | set sint; 81 | if (boolUnique) { 82 | sint.insert(i % files); 83 | } 84 | else { 85 | for (unsigned int j = 0; j<= i/files/3; j++) { 86 | sint.insert((i+j)%files); 87 | // sint.insert((i/files+(i/files+3)*j)%files); 88 | } 89 | //sint.insert(i*3 % files); 90 | for (int j = 0; j<=10; j++) 91 | if (i & (1<minSelfAndRevcomp(k); 107 | helper->convertstring(buf,&k); 108 | fprintf(fout, "%s %d\n", buf, (j-i+2)*i); 109 | } 110 | fclose(fout); 111 | } 112 | printf("%lud %d\n", sss.size(), kmers+files); 113 | std::cout << str << endl; 114 | return 0; 115 | 116 | } 117 | -------------------------------------------------------------------------------- /test/gendata.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ../build/test/datagen -f 182 -k 15300 > genlog 3 | echo '>11' > testTT.fa 4 | tail -n 1 genlog >> testTT.fa 5 | echo '>11' >> testTT.fa 6 | tail -n 1 genlog | cut -c1-100 >> testTT.fa 7 | echo '>11' >> testTT.fa 8 | tail -n 1 genlog | cut 
-c1-75 >> testTT.fa 9 | echo '>11' >> testTT.fa 10 | tail -n 1 genlog | cut -c1-50 >> testTT.fa 11 | echo '>11' >> testTT.fa 12 | tail -n 1 genlog | cut -c1-25 >> testTT.fa 13 | echo '>11' >> testTT.fa 14 | for j in {1..50}; do rm -rf tmp; for i in A T G C; do echo $i >> tmp; done; echo `sort tmp -R | awk '{print}' ORS=''`; done | awk '{print}' ORS='' >> testTT.fa 15 | rm 16.Kmer 16 | touch 16.Kmer 17 | for i in *.Kmer; do 18 | ../build/bin/PreProcess --in=$i --out=$i.bin --k=20; 19 | done 20 | rm -rf raw bin.64 21 | mkdir raw 22 | mkdir bin.64 23 | mv *.Kmer raw/ 24 | mv *.bin bin.64/ 25 | mv *.bin.xml bin.64/ 26 | rm -f flistA flistB flistC flistD 27 | for i in {0..49}; do 28 | echo F$i.Kmer.bin >> flistA 29 | done 30 | for i in {50..99}; do 31 | echo F$i.Kmer.bin >> flistB 32 | done 33 | 34 | for i in {100..149}; do 35 | echo F$i.Kmer.bin >> flistC 36 | done 37 | 38 | for i in {150..181}; do 39 | echo F$i.Kmer.bin >> flistD 40 | done 41 | 42 | ../build/test/datagen -f 100 -k 220 -u > genlog.unique 43 | tail -n 1 genlog.unique > testTT.fau 44 | for i in *.Kmer; do 45 | ../build/bin/PreProcess --in=$i --out=$i.bin --k=20; 46 | done 47 | rm -rf raw.unique bin.64.unique 48 | mkdir raw.unique bin.64.unique 49 | mv *.Kmer raw.unique/ 50 | mv *.bin bin.64.unique/ 51 | mv *.bin.xml bin.64.unique/ 52 | rm -rf flistUnique 53 | 54 | -------------------------------------------------------------------------------- /test/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm -rf grp 3 | mkdir grp 4 | 5 | echo group 6 | 7 | ../build/bin/Group --flist=flistA --folder=./bin.64/ --output=GrpA.grp > processlog 8 | ../build/bin/Group --flist=flistB --folder=./bin.64/ --output=GrpB.grp >> processlog 9 | ../build/bin/Group --flist=flistC --folder=./bin.64/ --output=GrpC.grp >> processlog 10 | ../build/bin/Group --flist=flistD --folder=./bin.64/ --output=GrpD.grp >> processlog 11 | 12 | ls *.grp > grplist 13 | mv *.grp grp/ 14 
| mv *.grp.xml grp/ 15 | 16 | echo buildLargeGroup 17 | ../build/bin/Group --flist=grplist --folder=./grp/ --output=GrpAll.grp --group > grplog 18 | mv *.grp grp/ 19 | mv *.grp.xml grp/ 20 | echo 'GrpAll.grp' > grplistsingle 21 | 22 | #echo Count 23 | #../countkey --flist=grplist --folder=./grp/ -e > keycountlog 24 | rm -rf mapOut mapOutGrp 25 | mkdir mapOut 26 | mkdir mapOutGrp 27 | echo Build 28 | ../build/bin/Build --flist=grplist --folder=./grp/ --out-folder=mapOut/ --count-only> Buildlog 29 | ../build/bin/Build --flist=grplist --folder=./grp/ --out-folder=mapOut/ > Buildlog.tt 30 | echo BuildOneGroup 31 | ../build/bin/Build --flist=grplistsingle --folder=./grp/ --out-folder=mapOutGrp/ --count-only> BuildlogGrp 32 | ../build/bin/Build --flist=grplistsingle --folder=./grp/ --out-folder=mapOutGrp/ > Buildlog.ttGrp 33 | echo Query 34 | ../build/bin/Query --map-folder=mapOut/ --transcript=testTT.fa --detail --output queryresult > querylog 35 | ../build/bin/Query --map-folder=mapOut/ --transcript=testTT.fa --qthread=4 --output queryresultAgg > querylogAgg 36 | # rm ./mapOut/map.L2.7 37 | 38 | ../build/bin/Group --flist=flistA --folder=./bin.64.unique/ --output=GrpAU.grp > processlogU 39 | ../build/bin/Group --flist=flistB --folder=./bin.64.unique/ --output=GrpBU.grp >> processlogU 40 | ls *U.grp > grplistU 41 | rm -rf grpU 42 | mkdir grpU 43 | mv *U.grp grpU/ 44 | mv *U.grp.xml grpU/ 45 | 46 | rm -rf mapOutU 47 | mkdir mapOutU 48 | echo Build 49 | ../build/bin/Build --flist=grplistU --folder=./grpU/ --out-folder=mapOutU/ > Buildlog.ttU 50 | 51 | echo Query 52 | ../build/bin/Query --map-folder=mapOutU/ --transcript=testTT.fau --detail --output queryresultU > querylogU 53 | ../build/bin/Query --map-folder=mapOutU/ --transcript=testTT.fau --qthread=4 --output queryresultAggU > querylogAggU 54 | 55 | # ../build/bin/Query --map-folder=mapOut/ --start-server-port 3322 56 | -------------------------------------------------------------------------------- /test/testTT.fa: 
-------------------------------------------------------------------------------- 1 | >11 2 | CGTCCGTACCCGGGCTATCCCCGGATTACACGGCCACATAGCGGGTGCTAACCACTCGAGATCCAGCTCACTGGAGATTTATAACGGTGTCCCTAAACAAATGTAGACCGGCGGCAATGACCCTCCATCTTGCGTTAATTGAGTAAGATAATGTGACAACGAACGGGATATAGGGGGTAGTAACTCTAAAAGACGCGTTCTGGAAGTCACCCACGTGCATAGGCTTTATCTGTCTATCGAAACCTTTGGTAGTTAATAGCGGCCAGGTAAGCATAATAAATTCTCGTGGTTTGACTGAAGCGATCTTCGAAATAATACGACGCAAACTGGCGTCCGCAACTTCCATACTGTATGACGCCGGGCTTTATCCGTTCCCAGTAAAATATATCGATCTCTACTACGTCAGAAAATGATCTTGCATGAGTCCGATGTGCTAGTTCAATGAGTTGCCTTCTTTAATGGATTCTGCGCTCGAGCGCCATGGAAACGAGTATTCACTTTTGGTTCTCCTAGCAACCGAGGCCATAGAATGGTATCCTGGGTCGGGCCTAGTCTAATCGTCCCGGCTTGCTAACATACAAATGGTCACCAGAGTATCGGCAGACCTGCCTGTCACGAATGACCGAAGGCCTGGCATTGTAATTTAGAAGCTTCCGATAGCCTAGCGCGCGAGAACGGCAGGAAGAAAAGACTGTGAGGTCAAGGCAAACCGAGAAGAGTTACGAACAATTTACCCCTACGTAAAAACTTGGACACAAACCTTCTCTGCATCAACCCGAGATGTGTCAACTAGATCAAACAATCATAAATGTGGGAGGTAGACACTTTAATAAACCTACACTGGCTCAGAAAGTCCGCCCAGACAGAGAATGACTTAACCTGGAATCTCATTGTCCACAGTCCGGCGTCCCGCAAGAGGAGATTAATAAAATACTATTCCGCAACGTGGTACTGGACACTTCCTTCAACGTGAGCCGGACAGGACCTGTTTCAGCCTTCACTGTAATATGGTCATTTTAGTTCCTTACCTTCACGACACTCGCTTAGCTCTCGTATTAAGGCGGGATAGTCTTACTTAACGTAACCTCTTAACCGAGTTTTAGTCCGCGCATGAGTGGGTTCAGTAACTACGTTGTCATGGCTCATTCAGGTCGTGGGGAGAGCTGCACGCTTGCAAACGTTCTGGTTCCGAGATTCACGGAAGGCGTCGCCGGAATAACCAGCTTGTGGTCGTTTGAGTCAATATGAGTATAAACACTTCGATTGGCTCAACTTACTCGCTATCCAAGATGAATTAGGCGGGATTAATACTGTGAGCATGAAGGACTTCTATATTGTGATATGGGCAAAATGATCCAGGAGGCAGTAACATAGGTCTCCGTGTTGACCCTGAAGGTGTTCCTCGAGCGGCGTGTAGTGAGCATGCTACGGGGGGCGTGAGTTATCAGATCCGTGTGATAGACTGCGCGGGTGCAACGTTCTCAACATTATCTAAGAAGGAGGACCCTACTGAAGACAACCAGTTGCAATACCTCGCTGATCGAGTCCGATTCTGTGACACCGGGTCAAGTCGCTTCTTTCCACTCCCCGCAAGATGCTTATGGATCTGCTCTACCGTTTGATTCTACTGGGTGGGGGCAGGTGTAAACAGACCAAGATATCAGTGTGATGGACTCCGAGTTTGAGACATTCCTCCAATTCGATTCTTCTCTTGGGGTCTTCCATGCCCGGCGTTGGCCAGATCGGGCACGGTCAGCGCGGGGTGCAGCGAGCGCCTAGTATGGGGTGTGCCGACACTTTCTGAAAAGCGGTGGTTTCCACAAAGGTCAAACTATAACGGTACGTTAGAATAAGTGCCTAAGGACCTACTCCAGTAATTGGACCTAGACAACTAGCATGGATTCTCA
GCACGACGTAGGGCCGACTAGTGCGCACTAGGTCTTCTCGATGGAACCTATTGGCGCTTGATTATAGTATCATGACAAACCCATAGGCCACCCATGTCCGCAAAGGTGGCGCACGACGGTCTCGTCCCTCGGTACCATAGCGAACAGAAACATTAGTAAGCAGTCCAGTGCGCCCTTTCAGCCGGTTATGGGGCACTTAGACCCCCGTTAGGACCCACACCCATGTTGTCTCGCGCCGCCCCGGTGACACGGGTTCAATCGTCCTCTTAAACTCTTTGTGAAAATGCAGCCATTGGTGAAATCCAGCGACCGCGTAACCTGGATAACCACGCCACCTGCCAAACTGTCTGGTGCATTACGGATAAAGCCCGACGGAGACGTTTCACGTCGGGGAATTAAATAGTGCGTGCACTCGGTCGACGCTTGTCACGAACTTTGGCTCAATCGCAGCCTTTCCCAACCCCTGACTCGTAGCTGACATCTTTCGTGACTTCCTTCTTTGATCATCTTGGAGTTTTATGCATAATTTGTTACAGTAACCCTGCCCATTAGGAGATAGGACTAAAGCGAACGAGTCGTCCTTGGCGCAACAGATTCCACGGATTTCAGCGTAGCTGACTTGCTTAGGTGAATCCCTTAATAGCGTCGTATAGATCTGCGGCGGTCACCTCGGGGTGGGCGCTCGCTGGGACGCAAGGAAGAATTTATCGTGCGGTGTGAATACGTCAGAGTACCCTTACCCTAGACAAGAGTCCACACCCAAAGATCTCCCGGGTCAACACTTCGCGCGCTCGCCTATCGGAATAACTATCATCGCGAGTCGCTATCTTTTAAACTAAGCCCATAGCTCATTATCTACTCGGCCATAAACAGATCCTGCGTAAGGAATATAACGGTCAGTGGCCTAAAACTGCGCACCTTCAGACTCAAATGGAGTGCGATCGTGTCGCAATTCCCGAAGATATCTCTTGTTTCCCGGGGATGCAGCGCGACCGAACACAACGAAGCCGGGAGACATGCCAATTCATCCTTGGTGCCTTGGATGAGACACTAAATTGTGTAGCGAGATCTTGCGAGCCGCCAGTGCTCTGCACCGTTTCTAGGAATAAGCTTTGCCAGGGGAGCTATTCAGAGTTCTAGTACTAGTCCATTATTAGCCACAGTGTCGTCAAACATATCCGGACAAGTAAGTTAGAAATTCCAGTTCTCAGCCCCACATTTAAGCATTAATAAATATGTCCTTAGCACGGAACGAGTGTTGCCGCAGCCTAGGTCAGGAGTGTGTAGGGCTTTGATCTGTAGCAGCGAAGGACACCTATACTAGGGCGCCACTTGGGCAATTTCCTTTAAGTCGCTCCTGCCTCTCGAATTCCGGAGTCCCAGCTTCGGCGAACTGCAAGAAGATTAATTCACCATACTAAGGGGGATCAAACAAGTCGAAGGGCTAGAACCACGCCCTTCTTGACCGGTTGGAGCCGCACAGCATTGGTGTAGTACCAGCGTGCACTGACAGTGAGTAGCATTCACCACGCCTCTTAGTCTCTTTTCCTAAATGCTTAGCGGTTTAAGATCCAGGTCATCTCCGCGGTAGGTGTGTGCCATATTAACAACAAAGGCCGTCAAGCAGGATGACAACAATGAAGGTCCGAAGTGGCACGCGGAATGCTGCTATAAGCACTACGAGCGTGGTAACACGATATAGAAGTCACCGTGTGTTGAGAAGAACATTTCTATCAAAATGTGCCGGCTGCATGGACGGGGGGCCTGCGACCTTAAGTGAATCCGAGCAAAGCTACGTTTGAGGGCCGGGCGTACGCATCAGTTGATGTCTCAAGCGAAACTGATAAGCAACGTGCGTCTGGGAATCAAAGTATAGACAGACATAGTCGTCATTAGCCTGGAGATACCTGGTCTAATTAAGCGTGTAACGCCGTCGTATAAAAATGCGAACAGATGGCGGATGTGGGTCAAGGACTCTCCATATGCGACCGTGAGGATAA
GTAGAATATTCCACTATTCAGCTGTACATTTCCGGACCGGCAGGAATGTCAGTAGTATGAGGTGGCCCCCCTGATGGCTTAGGCCGGACCTATCTTACAGTGTTGCCTCACTGTTGTTTTTCACTATACCCGGGACCACGATTTAGCTCTTTCGCCTGGAAGAAGCACCGAGTAATGGCGTCGATAGGAATAACGGATGCCGATGTAATCTACGGTCTGGGGCAGGTTGTACGTCGCGGTTGCGCAACTGGACCGGTGACTCGCGGAGCATGAGGCTAGACTACGTGTTTTAGTTACTCCCCCGCACTATAATTGAGAGATTAGACCTGAGTTCTCGGTGCACGTTATTAACCGTTGGAACTTTGCTCGGCACTTTATCATGGCTTTCTGCTTTATGACGGAGGGTTGGGGATAACTCAGCACCTTCTGCCTGTTTCATTTTATAGTTTGTCAAATAATGGCTACACTGGAGTCTGCCATACCACGCGTGAATAACTAGGGGAGTGGTAGGTCCTATACTCAATCACATAGTTCGACGAACTGAGGATAACGGGACCAGGACTGAAAAACACTGGTGGAAGATTTCTGGGTTGATCTTTTTTTCCCGAGATATTAAAGAGGGTTACGTAATTGAGCTGATTCCTGAGGTCTATTGGCTAGAGCGGTGCCCTGGTCCTGGCCCACTCTAATAGGGGGAATCTGGTAATGCTTTCAATCGGTATCACGAACCCCAGCCAGGCCGACGATTTAAGGTCCTCGCCTTCTGAGGACTAGTCTGCAGAACCGGTTAATTTCGCCCTAGGGGTTAGTATATGGGACCGTGGGCTTTCTGATACTTTTCCGTTCGCGGCGACAAAACTCTCACAGTGCCGGCTAAGACCAATTCGAGCAATAGTATGCGTAATCAGTCTACACGCGTTACTCCTAACCATTTTATCATATACCCGGAGCCACGTGGTCTAGATAAGCGGACCACGCCGAAAACGCCCAGGTGCTTGTGGGATCTGCGAGCTGGCCAATACCAACTATAATAAGAATACAGCACAGCAGCTCTTACATTATTTTCCTCGACATCCCAGGGTTTACAACTTAAGCGCCAGCAGATCTTCTGCATCGTTGAGACGATGCTCGTAAAAAATGGCGAATCCATAATTGCCCCAAAAATGTGGTTTAGGCGCAGGTAAGTTTGCGGATCGACGTTCATAATTTATCGGCGACGGGGAGCGCCAATTATCAGTTAATACACATGAGGACGACAACCTTCGTCAGATCAAATGCAAGCACGTCTTGATCGAATGGCGACTCTACTGCGATGTGTCTGCCATGAGAGAAAAAATCCTCACCGCGACGGAGTATTGACCACTGCTACCTCATAAGGAAAAGGGGCACATTTAACCCTAGGCCTTGTTATCACCATAGCAGCGCCAAGAAATCGGTGCAATCGCAGCCTGCTCCAGGGGCCCTGCTGTCCAGTCGAACTGACCGATCCAGGGAGCGGTTCCCCGCACGTCTCACGGTGGGGTACGGCCGTAGAGGCATTGTCACGGATCATCTGGGTTACCCCGAGAGCTACTGCGAGCCGACATGTATCGTGTAATGAAAGCGTACGCATCACGAACTATCGTTTGTTAAAGCTTCGAGCCGGCTAAGTCAAAAGCGGTGCTCCTAGACTACCATCGAATACAGGTGGGGATCCGCCCCTGAATGTCTAGCGGCCTTTCTGACATACTGAGAAGACCGAGCCCATCCCGTTTTGCACGAACACGTAGAGCACCCTAATGATCATCTAGTTACTCTGCGGTGACTGGCCGTACCACGCGGGGACACAGCTACATCGTTGCCCGACGAGACCCGCTGAGATGGTACACGTATGTACCGGCCCCCTACGCAATCTGCTGGCCTTGGCAAAGTACATACTACTGTGTATGACGATCCGCCCTTAAGATGACATCTTTTCATTTGGAAGATAATAGCTCCGTCCATACTGTAATACGGTACAA
GAAGTTTGACTTTCCGGAATCCAGACTCAGCGGGTGCCGAGCCTCTGGCGGGCGAGCCAGAAGGGAAGATCGTCAGGGTGTCAGTTTCTGAGGTCTATACTGGTCTTCACCCTTTACGATATCGGTTCAGAGACGAAGCTGACACGAATCGAGCATTAGACAAGAACGACCTAAATTAGAACTCAGGTGTGGTTATAGGAGTGGCACGCTCCAGACGTGTCGTACGCCGGGCGGCTTCCGGTGTCTTGCCTAACCGCTATACTGGGGCTCATTACTTTGCGCTTGAGTTCGGCCTCGTAAAACTACCCACTCTTGCTAGCACATAGGACGTAAACAGGTATAATCAGGACCTGCATCGCTCCGTCGCATTGAAGTTACGCTGCGCGCGGGATACCCGCTTCATTGCTGATTTCAGAGAGTGTAATTCGGTGGCAACGAGTTCTCGGATCATAACTTATCCCGCTCGCCAAGCGGAAGCGGGTAGTAAGAAGACTAGCCTATATAACTACGTAGTTGGCCTTATCCGAGGAGCATCATGGAGAAGTGAATGCACGCGCGGCCTACCATCTTTGCCACTAGTGGACGCTCGTGCAGCTCAACAATTGGCGACCCTCGAGCCGGTCTGCCCATGAGTGAGTCCAGTGTTGAATGGGTTTGCCGAGCCGCAGTTATTCCACATATTTGATCAAAGCGAATGATGAAAGTGACTAGTAAGGATCAATCGCGTCGATTGCAATCTGATCCGCCCTAGGAATCGGACCTGGTTAATCGAGGTTTATTCTCTGCATGGCATCCAAGTCGGATCGTACGCAGAAGCCCTTGTAATGGGACTGACCGTTAAGGGAGGGCTTAGTTGAGGGTGAAACGACGTAGCGTAGTTTGTAGACCGATCAGCGATTCCTTGTAATAAGTTTCCAACCACGCGGATCCCCCCGGTACAACTTTCGTATGACATACGAGCACCCGTTGAAGCCTCGATCCATTTAAGTTTGCAGCGGTGTACCTAGACAAATGCTAGTTACGAACAATGATTCACTACCATAAGACTGATTGGTGGGAAGCTTTCAGACTTCTCTGCCGAAGAGTAACTAGGACCCAGCGGCCCGGCCCTAAAATTCTTAACAAATTGGTTCTTCACATCAAACCACACGTGCACTAGGCTAGCTGAAACTGGCAAAGAGCTATGACACCCGTATTCCTCCATTTGTGGTGTGGCATATATCAGTGGTGGTACTAGCTGGGCGCCGTAAGATCACCTGCGCTCGCGCGCATCCGAGGTACTAACCTGTATCGTTGAAATCTCTGATCTAGGCGGCGTGGTCTGCGTACCATGTGCACCTTACCTTGAACACCTCATGTGTCCCGGGAGCGTGATTCGACATCCCGCGATGGAAGGACATCTGGTACCGGCTCCAAGGAGAGAGTGTTATTCCAGAATAACTTGAGAAGCCACGCAAGTCACCAGGTTGTAGTGTTCTGATGCTTAGCAGACCTCCCACATTTCGGATCAGTTCGGGCATACTTTCACTAAATGCTTCACCCAAACACAGGGTCACATACACGGATAGAAACAACTAGGTTAAGCTAGGACGTGAGCTATTCTGGATACTTATTGCTGCGAGTCATATCCTTTACTTGTAAGATATTGATCTCGACGTCGTACCCCACACTGCTTGGTACGCTGTAAGAGGTGTTTAGTGATGGCATCTGCAAGTTATACTGCTACCTGTGCGCACATCGGTCAGCTTACTCTCAGGATGCTTTGTCTCTTGCAGTACTCAACCTTAAATGATGAGACTATGCTCTTACCCGGCAGTACCTTTGATTCGGTCTCAAGTACCTCTCTAGGGAAGGTGTAAACTGCGTAAGATTGTGTGTGTATTAACTCCGGTACTTATACCCATTTTCGGATATATGAGGGATTTGGGAGTCTAACTGCTGTCATGACGACTATCTACTACTCGATCTAGAGTTTTGGGTTATCCGCGGGTGGCTTGGAT
GAGACTATAAATACCTGTCGCGGAATCCATACACCCTAGTAAAGGAAGGGACACCAGGAGGAGGATTATCAAAAAGCATCCCTCAGTAACCACCGAGATACGTTATCATAGATTGCTAATTACGCCCTCTAGGGGTCACTTTGACTGTCGGACTGCGCCAGCAAGGTGTGCTAGACCGCGTTACTTCAAATCTGGGTTGCCTTATATTAGACTGTGTCAACCGACTTAGAGGGAAGAGTGTGCCCTCCCTTGGGTAAGAGCCACTGGGGACTTGCTTCAGACCTTCGACACAGAAGGTCCTCCGACTACAAATAACGTATAAGGGACGATCATTGTGGTTTAAGGATCTAAGATAGTAGTCACACCCTATTTCAGAGTGTGTGCCCACTGTGAGAAAAGTTTAACAAATTATCTCTTGGAAACAAAAGATTATGTAGCGACGCTATGTATTAATTCCTGAAAACATATATAAACCTCCATCCCGTCCGCCCCATCGGAGCACTGATTAATAGGGGGCTAGCATCCTTGCTCACTAGTTATGAAAGCATCAGGTCAAAAAAGGCCCAATGTCGTGGACTCGCCGCCCGATTATTTCAGATAGCTTGATGGGTTATCAAGCCCCCCTCTAAGCGGAACAAAAGCCATACCCGGGATATCCAGAAATTACAGCTCACACGACACCTGTAGGGCAAATAGAAAGCAGTTGAGCCTAACCACAGCCTCATTCAATCAAGCACTAGTGGTGTCATATCCCGCAATGGAGACATGTCTTTGGAGAATCCTCCAGAGATCTCTATGTCTTCCCGGACATTACTCTCAGTCGAATAAGGGATGAGTTTCTTTACGAATAGGAATGCTTCGATGCATTCACGATGTAGCGTAGGCGGAGAGTTCACATTATCGATTGATACTTTCATCCTTGACGGTTCCAATTAACCATGCTTTCATCGCGCGTTCTGGTCCACCATAGAGTTGCTTGCTGCCTCCGCAACAGCAGGGCGGAACTGAATCCCTCCAGCTCTGTGCACGAACGTCGACTACCCCGAAGCTCAGAGACCCACTTTAAGACAGGACCAACGCAGCAAGATCTGCTAACAGCATGGAAATCGGGCTTTTATAAGTCCCAAGGCTCGCGTGCATAGAGCTAACTTTAACTTGCGGCTCTAATCAGGTGGTCACAGTGCGCCATAGGGTGTGCTAGCTAATCGATACGTAATCTAAGTAGAGCACACAACAGCAGAACCCGCGCCCAGGGCAGAGGCACCCTAACCTTTCCAAAATGGGTGTTTTACGCGCTTTGTGTGGTATACTAATCTTTGTCCTGCGGCTCGTTTCAGGCTATATGTCCCCAAACGCTCATCGGGTTTGAGTCCAACGACGAGAATTTCGTCCACTGCATGGAGCGGTCCAAACTGAGGCCTTGTGCGGCAAATGTAATATAGGTGCGTCGCCGGGCTTGGATTATCTATAATACAAGGTTGGTCAATTGGATACTGGACTACAGGAGCATCATGATAGAAAGACGCAAATTGCATGAGTCCAGAAGTTATACCCTCTAAGCTCACGACAGCCATTCGATATAGAATAATGAGTAGAACATCCATGCCTTGGAAGTCACCAACATGTACGGAGCCAGAGCACCTACTGGGGATTATCTCACCGATCTAGGGTTCGAAACCATACCATTTCAACGATCCCAGCGAATATAGTAGACCTCTTGATTCTGAAGGTTGGTTGATCTACACTACATCAAAAGGAAGCTGGGCACAGAAGGTTTCGCAGAGGTGGCCGTAGCACCCCTGCGGTTCAAGGCCAGGTAATATCATAGACACTGCACTTCATAGACGACATCCGTAGCAGCGTTAGCTCAATATTGTTTTTGGTTTATGGGTTGCGGTGCGGAAACCACAAGCTGGACCGCGCCTTGAATACGAGATGGTACTTCTTCCCTCTGCCTCTCAATTAGAAACAACGAATGCGAAAAATCGCTTTGACGACTC
CCGGCCACCGCTTGCTCGAGACTGAACTTACCGTGCTCAAGGTTATTACACAAGTAGCTCCCTATTCGGGTGTGTGCGCATACTGGACCAAAAGTACCAGCCCGTTCAGTCCAACACTTGTGCTGTCCTGCTAATACTACCGCTGTCCAAAATTGCATCACGCGTGAGGATAACCTGCCAGGTTCTAAGCGGAAAGGGGCCGCGGCCCGCGTAGTTCGTCGTCTTTTACCCCAGCGGGTGATCTTGCCGAAGTCACCGGACGATGGGCGGCAGCTGGTTGCACCTTCGACGAAATATTACTTGGGGCGAGAAGGCAAACCGAGCCCAGATACGGTTACCTTAACGGAACGTAGTGAACTGGCCGTCCGTTCTACAACGTATTAACAGGGGCTCAGCAGCAAGGCCTGCCCCGGATGCCAACCGGCCCATGCGGTGACTCGATCCTGACTGGTCTCGTTGGGGGGAAGTGAGCAGTATGTATTGATAACATTGCGGATTTCGGTCTGACAGTCTAGCGTTGTAGAGTCGGGGAGTCTACGAGTCGCTACATCGCCTCTACGCCGCCTGCGGCGATCAATATCTAGTGCTGTTATATATAGTTTACCAGGACGTGATATGAACATTCGGTTGGGGTCTAAAGTTAAATTAGAATTAACATGAGTGAACCTGTTGGGGCATTCTGAACGACGATGAGTCCCCCACAAGAACTTGGATCACTGTCCATCTACAAGATTTGCAAGGGATGGATGGACTGTAAGATCAACAACTACCGCCAATAGTTAGTCAGGGCCACTTATTACGACGGCCGGAGGACTCTGTTCACATTGGGTCCTCACAGCCACCCACATTATATGATAAAGCTAGTCTAGTTGGATCATTTGTGCCAAGATGGTCGCAGACCGTTCTATTGCCAATTAAAGCTGATCTAGCTGCAAGTACACCCTCAAGGCAGGTTTCCCTCACACAAGCCCCCACAAATACCAGCTTAGCGTCAGGTAGCGAACATTATCATGACCTACACTACTCTCATAGGCCCTTCGATGCGGCCAACCAGCCATAGATGGGAGATGGCTCCCAGAACCGAGGTGCTTTTTGTCATTGTCCCATGCGTTATCCGCCAAGAGTGGGTTCTGCGCTGCACTACCAGTGTCTTAGATTATTACTTGGACAACCCTCTAGTAGTCTATACGCAGGAGGACCTGCCTTCCGCATTAATCAGCCCCGAACTATTAGCCGACATTCCCATGTACTTCAGGAGAATTGAAGCTTTGCTTATAGGGGGACGTCGCGAATTCCACACTAGGATAATGTATCCTCCTAATGACTTGAAGAACTACTACATCGGTTAGACGTATCCCACGTCACGCCCGCATCCTACAGGGAAATATCTGTACCCGACTTGAGGATAGCTTGTTTGCCCGAGGAAAGAAAGAGCGCCACGTATATCTCGACGTCAAACTCCTTTGTAGCCCTACTATTGTTCAATTCTATTAGGTGAGTGAGGGTTTAGGAAGACTGTATTATCCGTACCATGGGTGTCGGACACCATTACGGGGCACCCAGTTTAGTAAGAAGCTTCGGTCTCCCGAGGTGTCAGGGAGTAACTGACGAAATCCAGTAGCATGCAAGTGATGATCGCTGGAATCATGCCATATAGGACTGCGGAAGAGTAAGAATACAAAATCATTCCGGTGGCACACTTCAAGTGCTCGCCGTCTAAGGCAGTCGCTGCGAGGGGCCACTGATGCGCCTCGCCACTACCTGATTCGCGACAATGGGCTGTTGTGTAGAGCCTGGTGGATGAGTCAGCAAGGCAGGCCTCTGGGTGGGTGAACCCCACAGCTTCGAAAATAAAAGTTGTGCGCCTCCTCCGCTGCGATATCACTTCCCTCTAAGGAAGTCCTTAGTTGGTCCACTCCCGACGTATATACGATAGGAGCATTGGGACTCTTTGGCTATTTATTACAGTGAACGGCCCAATTTGTAAGTAGGTTGCCATA
TACTTCCAATGGGCGTGAGACCCCATCACGGAGTGGAGTCCCATCTTTCTCCAATCCGAACAGAGCTTGCGACAGACTATCCCGCAAGTCTTTAAGGCGACAAAAACAACACGGGAATCTATCCAAATGGCTCTTTGTAGCTTGGGAACGATGTAGATACCACACGTATTTGGTAATTCACTGCGCACGTTCAATCAATCCGGGTGAACGATTTGAAAAGGGAGGAAAAGGAGCGGGGTCGCACCCCCGCAGGCTCTAATGTGAAAATATCCACGTCGCCTAATCGTTCTTACTACGCCAGATCGTTGTTCAGAGCTATGATCCGGACAAATCAAATCAGTTGCTATGCCTAGCATGAGCTTATTCCGGATGTACGCTAACATGGAGCCCACAGGCCAAATCCTAGATGGCACGTTTCTGTAGTCTGAGGATGTCGGGCGAGGGGTCACACGATGTCCGGCATTTTGTACGTCGATAGGTATCCTTGCCTGGTAATAATAGCTAGGGGAATGATTAACGTTCTCGGACTCTACTGTTCGAGCCAAACCCCTCACTACAGTACGCAGCTGACTATTGAACGCCATTGTACGCGCATTAGACTCTTGCCACAATGGGTTTCGATGCTACAATGTGTACACCCTGGACTAGGAGTATAATGGGCACGCTCCTGGTAATGCGAACGCGCCTCGACCGCATTATTATCCTAGAACTCTTGGAATTGACACAGTCAACACCAATACTCGACTAATACCGTCAGATACCCGTTGTTCGTACGGGAGTAGCACGCTAGTCTACACACGCGGATTACGTTAGTACGCGGGTCGTTAGGCGCGTCCACGTGGCATAGTTAAAAAGAATACTCACTTAACGTTCGACTATACCAGCCATCCATCGACCGCCTCACATGGCCATAATATATGGAGGCCACGCAGGTGGTAAACCTTACGCTGATCTTGGCCGGAGCCGCGAACTTGACAGTCAAGACCTCGGCTCTCCATGGCATGTGGCCGCCGTATGTACGTGTTTTCGTATTGGCGAGACAGTCCCCGCTTACTGACATGTTTGACGTACAACCTATACCATGCAGTGACGAGCCCATGATCGCCTTTAACACGGTTCCCATGCACTAAAGAGGGGATGGTCTGGGAGAACAGGATGAGACTATAATACCTGTGAGTCCACAGGTTTTGCACGAGATGAGCGTAGACCGGCGTATTGCTAGGGCCCTCGTGAGATCGGTCATAAGGTGTACGGTACCCCGCAATTTAGATAATGGGAAACGTCATCCTTTCGCACTGGGGCCGAACGCGATAGGTTGTGCTCTTCAATGCTGATACGTGATCTTATCACCCGGATCTGGGATCTAGTCAAACGTTGGAAGGGCAAAGTGGGGGTATCTCTCTGCGCGATATCAATTCTCCTAAAAATGCAGCCCGATTAGAGGTACATCATGTCCTTTTGAACGTGGCATAGGCGTCAACGGGTGTCGGGAGATTTTCCCGCGTCGGAGCGTCCTGAAAGGAGACAGGCATCCACAACTGTTCTAGACAGTTTGACCGGACGTTTTGCAGGTGAACTTGGTTTATTCAGAATATCCGGCTCCCGCTACACCGTTCCCCGTTATACGGGATTGGTCGTTACGATAAAATCCCGTGTCTTTGCTGATGGCAGAGGCATCAATTATCCAGCAACACAAAAGATGACGTTAACTCCTCTGAGCTCAATAACAGACGAGTCTGGCTACGCACTATTTACATTACGCATTCTTGCCAAGGTATATCGTATCCCAGGATGTAATATTACTTAAGCCGAAGAGTCTCCCCTCCGCCAGGTAATTATCACTGGCGAGGGTAGTTTGCAAGGAACTAAATACTAAGGTGCCACTTAAGCAATTTGGCTTAAGTACATCCTGCCAACTAGGCCGACGAGTCCCTTAGAACTGGACTAAGGTAATTGGTTGGGCTGTTCTTACGCAATCGCCAGTTCCCCAAGATGGCAGC
CATGTTAGGAACTGGCTTACATAAGGGCCTAGTAGTCACCGCGCTGATAGCAATCCCGAAATAGCTGGGAATAGACGACCAATACCAGGGAGCCTGTTGGTGTGTAGAAGAATAACTATATAAATATATGCTACGAATGCCAGCCCTCCCAACGACATCATTGACGGACTCCTCGCATCATCGTCCATGGAACTCATGCATGTCAGAAACTCTAGTAGTAGCTCCATTAGAGAAGGATTGCCGGTGACCAGCTTTGGTACCTCTAAAGCTAAGACTAGCCGAATACGCGATTATAATGAAACGGAAAGAATTCTTTCGGTCCGAACAAATAGATACGATTCCGCTCGTCTTATTGGGCTAAGGTGTAAACTTACCAACGCCGGTCAATATCAAGTCAGTCAATAGGCGCGCATGCCGTTCTACGGCGGGCCTTTTCCGATCCAGCATGCTGCCTGATGCGATGAGTGTCCGCTCTCACATAATGCGGTCTTTGAATTCATTTTCCCGCTGCGCCCTTCCCCCACCATGCTGATACCAGACTTATAAATGGACCGCACCTTACAAGAGAGCTATAACCCTTGATCTCTGACCAACAACTACTCGCACCGAGGTATCTTTGTAGTCACAGACATATACCTCATGGAGCATTCTGTCATCAACGGTTGAACGACTGTATGCGTGACCAGCTTCTAATAGTGCTAGGATCGGGGGGTCGCAGTAAGTGGCAGTTAGTATATCACGAACAGCTGGCTAGGTTGAACATGGGTTCTAAGGATGCACGTATAAACGGATGTCGAGAATGAACGTTCGAAGCTTCGTATACGTAGTCATCATTTGTATTTCCGGGGAGCCCTGCTCAGTAACACGATCTACAAGGGCGAGATGTCAAGACCTCAGGGCCCAGAACCAAACTCAAAGGTGACTCCCAAGCAAAAAAATCCTGCACACGAAATCTCTAACAGTTGGTAGATTGTGTTACCTCACCCTCCAGCCAGAGGTCTAAGGGCAGTATTTCAACGAAGGCTTGCATGCTTGGCGTCGTCACTGCGAAGCACTCAACGATCTCTAGCGCTGGAATGACGCGACTCACAGGTCGTAAGAGTGTAATGGTCCAGGAGAGCTACGTGATAAATAAATATTGATAGACGTTACCGTTTCAGGAGAAAAGGCAAAGCGACAAAATGCGGGGTTGACTTTGGAATACAGAATAGTCGGATACGGGTACGCTTCTAGGTAAAATACGGAACGAGTAATAATACGCGCAGTCCGTTAGGGCGTAAACAATTAGGCGAATTTGCATACAGCGCCAGGATCCCCATACCGACTGATTTGGTCAGTGGTAGCAAGACGGTACGTATGGCGGAGCAGCGTGTTACGACCGGGTTCGATACTA 3 | >11 4 | CGTCCGTACCCGGGCTATCCCCGGATTACACGGCCACATAGCGGGTGCTAACCACTCGAGATCCAGCTCACTGGAGATTTATAACGGTGTCCCTAAACAA 5 | >11 6 | CGTCCGTACCCGGGCTATCCCCGGATTACACGGCCACATAGCGGGTGCTAACCACTCGAGATCCAGCTCACTGGA 7 | >11 8 | CGTCCGTACCCGGGCTATCCCCGGATTACACGGCCACATAGCGGGTGCTA 9 | >11 10 | CGTCCGTACCCGGGCTATCCCCGGA 11 | >11 12 | GACTCTAGGCTATCAGGATCTGACTCAGCTGAGATCACTGAGTCATCGTCGAAGCTCGATGATCTCGAACGTGTACATCGCTGAAGCTAGCTTAGCATGCAGTCGTACGACTTAGCCGTACAGTGACTCGATGACTTCGAATGCAGTCTCAGCTGAATGCGACTGCTACTAGTCAGCTGACGTAGCTAGTCACTGACGAT -------------------------------------------------------------------------------- 
/test/unit/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file(GLOB SRCS *.cpp) 2 | 3 | INCLUDE_DIRECTORIES(../../lib ../../seqothlib) 4 | 5 | ADD_EXECUTABLE(testL2Node testL2Node.cpp main.cpp) 6 | ADD_EXECUTABLE(testL1Node testL1Node.cpp main.cpp) 7 | 8 | TARGET_LINK_LIBRARIES(testL2Node 9 | libL2Node 10 | libgtest 11 | libgmock 12 | z 13 | ) 14 | 15 | TARGET_LINK_LIBRARIES(testL1Node 16 | libL1Node 17 | libgtest 18 | libgmock 19 | z 20 | ) 21 | add_test(NAME testfoo 22 | COMMAND testfoo) 23 | -------------------------------------------------------------------------------- /test/unit/main.cpp: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | int main(int argc, char **argv) 4 | { 5 | ::testing::InitGoogleTest(&argc, argv); 6 | int ret = RUN_ALL_TESTS(); 7 | return ret; 8 | } 9 | -------------------------------------------------------------------------------- /test/unit/testL1Node.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "testL1Node.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | L1NodeTest::L1NodeTest() {} 9 | L1NodeTest::~L1NodeTest() {} 10 | 11 | void L1NodeTest::SetUp() {} 12 | void L1NodeTest::TearDown() {} 13 | 14 | TEST_F(L1NodeTest, TestL1BuildQuery) { 15 | int n = 100; 16 | vector k; 17 | vector v; 18 | for (int i = 0 ; i < n ; i++) { 19 | k.push_back(rand()&0xFFFFFFULL); 20 | v.push_back(rand()%0xFFFF); 21 | } 22 | sort(k.begin(), k.end()); 23 | L1Node * p = new L1Node(1048576*128*4, 12,"testtmp"); 24 | for (int i = 0 ; i < n ; i++) { 25 | p->add(k[i], v[i]); 26 | } 27 | p->constructAndWrite(12, 4, "test"); 28 | int splitbit = p->getsplitbit(); 29 | printf("%d", splitbit); 30 | L1Node *q = new L1Node(0,12,"testtmp"); 31 | q->setsplitbit(12, splitbit); 32 | q->loadFromFile("test"); 33 | int uneq = 0; 34 | for (int i = 0 ; i < n; i++) { 35 | uint64_t 
res = q->queryInt(k[i]); 36 | if ((res ^ v[i])& 0xFFF) { 37 | printf("query result for %lx : %lx, %x", k[i], res, v[i]); 38 | uneq++; 39 | } 40 | } 41 | EXPECT_EQ(uneq,0); 42 | } 43 | /* 44 | void testVAL(vector val) { 45 | 46 | vector buf(64,0); 47 | vector buf0(64,0); 48 | uint32_t q = valuelistEncode(&buf[0], val, false); 49 | EXPECT_EQ(buf,buf0); 50 | uint32_t q2 = valuelistEncode(&buf[0], val, true); 51 | EXPECT_EQ(q,q2); 52 | vector valret; 53 | uint32_t ql = valuelistDecode(&buf[0], valret, 64); 54 | EXPECT_EQ(ql, valret.size()); 55 | EXPECT_EQ(valret, val); 56 | 57 | } 58 | TEST_F(L2NodeTest, TestEncodeDecode) { 59 | for (int i = 0 ; i < 100; i++) { 60 | int l = 1+ i % 6; 61 | vector val; 62 | for (int j = 0 ; j < l ; j++) { 63 | int x = rand() % (0xF); 64 | if (rand() & 1) x = ((x<<4) | (rand () % 0xF)); 65 | if (rand() & 1) x = ((x<<4) | (rand () % 0xF)); 66 | if (rand() & 1) x = ((x<<4) | (rand () % 0xF)); 67 | if (rand() & 1) x = ((x<<4) | (rand () % 0xF)); 68 | val.push_back(x+1); 69 | } 70 | testVAL(val); 71 | } 72 | vector val1 = { 57, 13758, 5 }; 73 | testVAL(val1); 74 | } 75 | 76 | TEST_F(L2NodeTest, TestL2Short) { 77 | 78 | L2Node *N = new L2ShortValueListNode (5,8); 79 | int NN = 20; 80 | std::random_device rd; //Will be used to obtain a seed for the random number engine 81 | std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() 82 | std::uniform_int_distribution<> dis(0, 0x6FFFFFFFULL); 83 | vector vK; 84 | for (uint64_t i=0; i v; 90 | for (int j = 1; j<=5; j++) v.push_back((i+j*37)%100); 91 | sort(v.begin(), v.end()); 92 | N->add(k,v); 93 | } 94 | N->constructOth(); 95 | 96 | for (uint64_t i=0; i v,vret; 99 | vector vretmap; 100 | for (int j = 1; j<=5; j++) v.push_back((i+j*37)%100); 101 | sort(v.begin(), v.end()); 102 | 103 | bool ret = N->smartQuery(&k, vret, vretmap); 104 | EXPECT_EQ(ret, true); 105 | EXPECT_EQ(vret, v); 106 | } 107 | 108 | N->gzfname = "test.gz"; 109 | N->writeDataToGzipFile(); 110 | 111 | L2Node *N2 = 
new L2ShortValueListNode (5,8); 112 | N2->gzfname = "test.gz"; 113 | N2->loadDataFromGzipFile(); 114 | for (uint64_t i = 0; i < NN; i++) { 115 | uint64_t k = vK[i]; 116 | vector v,vret; 117 | vector vretmap; 118 | for (int j = 1; j<=5; j++) v.push_back((i+j*37)%100); 119 | sort(v.begin(), v.end()); 120 | bool ret = N2->smartQuery(&k, vret, vretmap); 121 | 122 | EXPECT_EQ(ret, true); 123 | EXPECT_EQ(vret, v); 124 | } 125 | } 126 | 127 | TEST_F(L2NodeTest, TestL2MAPP) { 128 | std::random_device rd; //Will be used to obtain a seed for the random number engine 129 | std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() 130 | std::uniform_int_distribution<> dis(0, 255); 131 | std::uniform_int_distribution<> dis2(0, 0x6FFFFFFFULL); 132 | int totN = 101; 133 | int totK = 256; 134 | int L = 12; 135 | vector buf; 136 | vector vK; 137 | for (int i = 0; i <= totN*L; i++) { 138 | buf.push_back(dis(gen)); 139 | } 140 | for (uint64_t i=0; i tmp(buf.begin()+(i*L), buf.begin() + ((i+1)*L)); 148 | N->addMAPP(k,tmp); 149 | } 150 | N->constructOth(); 151 | 152 | N->gzfname = "test.gz"; 153 | N->writeDataToGzipFile(); 154 | 155 | gzFile fin = gzopen("test.gz", "rb"); 156 | L2Node *N2 = new L2ShortValueListNode (4,6); 157 | N2->gzfname = "test.gz"; 158 | N2->loadDataFromGzipFile(); 159 | 160 | for (uint64_t i=0; i vret; 163 | vector vretmap, vretmap2; 164 | vector tmp(buf.begin()+(i*L), buf.begin() + ((i+1)*L)); 165 | bool ret = N->smartQuery(&k, vret, vretmap); 166 | bool ret2 = N->smartQuery(&k, vret, vretmap2); 167 | EXPECT_EQ(ret, false); 168 | EXPECT_EQ(ret2, false); 169 | EXPECT_EQ(vretmap, tmp); 170 | EXPECT_EQ(vretmap2, tmp); 171 | } 172 | } 173 | 174 | TEST_F(L2NodeTest, TestL2EncodeLong) { 175 | std::random_device rd; //Will be used to obtain a seed for the random number engine 176 | std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() 177 | std::uniform_int_distribution<> dis(0, 255); 178 | std::uniform_int_distribution<> dis2(0, 
0x6FFFFFFFULL); 179 | int totN = 101; 180 | int totK = 256; 181 | int L = 12; 182 | vector> vlists; 183 | vector vK; 184 | uint32_t maxlength = 0; 185 | for (int i = 0; i < totN; i++) { 186 | vector vec; 187 | uint32_t last; 188 | vec.push_back(last = dis(gen)); 189 | uint32_t upper = dis2(gen) % 15 + 1; 190 | for (int i = 1 ; i<= upper; i++) { 191 | last += (dis(gen) + 1); 192 | if (dis2(gen) & 1) 193 | last += dis(gen); 194 | if (dis2(gen) & 3) 195 | last += dis(gen); 196 | vec.push_back(last); 197 | } 198 | vlists.push_back(vec); 199 | vector diff; diff.push_back(vec[0]); 200 | for (int i = 1; i< vec.size(); i++) 201 | diff.push_back(vec[i] - vec[i-1]); 202 | uint32_t encodelength = valuelistEncode(NULL, diff, false); 203 | if (encodelength > maxlength) maxlength = encodelength + 1; 204 | 205 | } 206 | for (uint64_t i=0; i vec = vlists[i]; 215 | vector diff; diff.push_back(vec[0]); 216 | for (int i = 1; i< vec.size(); i++) 217 | diff.push_back(vec[i] - vec[i-1]); 218 | N->add(k,diff); 219 | } 220 | N->constructOth(); 221 | 222 | gzFile fout = gzopen("test.gz", "wb"); 223 | N->gzfname = "test.gz"; 224 | N->writeDataToGzipFile(); 225 | gzclose(fout); 226 | gzFile fin = gzopen("test.gz", "rb"); 227 | L2Node *N2 = new L2EncodedValueListNode (maxlength,L2NodeTypes::VALUE_INDEX_ENCODED); 228 | N2->gzfname = "test.gz"; 229 | N2->loadDataFromGzipFile(); 230 | 231 | for (uint64_t i=0; i vret, vret2; 234 | vector vretmap, vretmap2; 235 | bool ret = N->smartQuery(&k, vret, vretmap); 236 | bool ret2 = N->smartQuery(&k, vret2, vretmap2); 237 | EXPECT_EQ(ret, true); 238 | EXPECT_EQ(ret2, true); 239 | vector vl = vlists[i]; 240 | EXPECT_EQ(vret, vl); 241 | EXPECT_EQ(vret2, vl); 242 | } 243 | 244 | } 245 | */ 246 | 247 | -------------------------------------------------------------------------------- /test/unit/testL1Node.h: -------------------------------------------------------------------------------- 1 | #pragma GCC diagnostic push 2 | #pragma GCC diagnostic ignored 
"-Wunused-parameter" 3 | #include "gtest/gtest.h" 4 | 5 | // The fixture for testing class Foo. 6 | class L1NodeTest : public ::testing::Test { 7 | 8 | protected: 9 | 10 | // You can do set-up work for each test here. 11 | L1NodeTest(); 12 | 13 | // You can do clean-up work that doesn't throw exceptions here. 14 | virtual ~L1NodeTest(); 15 | 16 | // If the constructor and destructor are not enough for setting up 17 | // and cleaning up each test, you can define the following methods: 18 | 19 | // Code here will be called immediately after the constructor (right 20 | // before each test). 21 | virtual void SetUp(); 22 | 23 | // Code here will be called immediately after each test (right 24 | // before the destructor). 25 | virtual void TearDown(); 26 | 27 | }; 28 | -------------------------------------------------------------------------------- /test/unit/testL2Node.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "testL2Node.h" 3 | #include 4 | #include 5 | #include 6 | 7 | L2NodeTest::L2NodeTest() { 8 | 9 | } 10 | 11 | L2NodeTest::~L2NodeTest() {}; 12 | 13 | void L2NodeTest::SetUp() {}; 14 | 15 | void L2NodeTest::TearDown() {}; 16 | 17 | void testVAL(vector val) { 18 | 19 | vector buf(64,0); 20 | vector buf0(64,0); 21 | uint32_t q = valuelistEncode(&buf[0], val, false); 22 | EXPECT_EQ(buf,buf0); 23 | uint32_t q2 = valuelistEncode(&buf[0], val, true); 24 | EXPECT_EQ(q,q2); 25 | vector valret; 26 | uint32_t ql = valuelistDecode(&buf[0], valret, 64); 27 | EXPECT_EQ(ql, valret.size()); 28 | EXPECT_EQ(valret, val); 29 | 30 | } 31 | TEST_F(L2NodeTest, TestEncodeDecode) { 32 | for (int i = 0 ; i < 100; i++) { 33 | int l = 1+ i % 6; 34 | vector val; 35 | for (int j = 0 ; j < l ; j++) { 36 | int x = rand() % (0xF); 37 | if (rand() & 1) x = ((x<<4) | (rand () % 0xF)); 38 | if (rand() & 1) x = ((x<<4) | (rand () % 0xF)); 39 | if (rand() & 1) x = ((x<<4) | (rand () % 0xF)); 40 | if (rand() & 1) x = ((x<<4) | (rand 
() % 0xF)); 41 | val.push_back(x+1); 42 | } 43 | testVAL(val); 44 | } 45 | vector val1 = { 57, 13758, 5 }; 46 | testVAL(val1); 47 | } 48 | 49 | TEST_F(L2NodeTest, TestL2Short) { 50 | 51 | L2Node *N = new L2ShortValueListNode (5,8,"test.gz"); 52 | unsigned int NN = 20; 53 | std::random_device rd; //Will be used to obtain a seed for the random number engine 54 | std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() 55 | std::uniform_int_distribution<> dis(0, 0x6FFFFFFFULL); 56 | vector vK; 57 | for (uint64_t i=0; i v; 63 | for (int j = 1; j<=5; j++) v.push_back((i+j*37)%100); 64 | sort(v.begin(), v.end()); 65 | N->add(k,v); 66 | } 67 | N->constructOth(); 68 | 69 | for (uint64_t i=0; i v,vret; 72 | vector vretmap; 73 | for (int j = 1; j<=5; j++) v.push_back((i+j*37)%100); 74 | sort(v.begin(), v.end()); 75 | 76 | bool ret = N->smartQuery(&k, vret, vretmap); 77 | EXPECT_EQ(ret, true); 78 | EXPECT_EQ(vret, v); 79 | } 80 | 81 | N->writeDataToGzipFile(); 82 | 83 | L2Node *N2 = new L2ShortValueListNode (5,8,"test.gz"); 84 | N2->loadDataFromGzipFile(); 85 | for (uint64_t i = 0; i < NN; i++) { 86 | uint64_t k = vK[i]; 87 | vector v,vret; 88 | vector vretmap; 89 | for (int j = 1; j<=5; j++) v.push_back((i+j*37)%100); 90 | sort(v.begin(), v.end()); 91 | bool ret = N2->smartQuery(&k, vret, vretmap); 92 | 93 | EXPECT_EQ(ret, true); 94 | EXPECT_EQ(vret, v); 95 | } 96 | } 97 | 98 | TEST_F(L2NodeTest, TestL2MAPP) { 99 | std::random_device rd; //Will be used to obtain a seed for the random number engine 100 | std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() 101 | std::uniform_int_distribution<> dis(0, 255); 102 | std::uniform_int_distribution<> dis2(0, 0x6FFFFFFFULL); 103 | unsigned int totN = 101; 104 | unsigned int L = 12; 105 | vector buf; 106 | vector vK; 107 | for (unsigned int i = 0; i <= totN*L; i++) { 108 | buf.push_back(dis(gen)); 109 | } 110 | for (uint64_t i=0; i tmp(buf.begin()+(i*L), buf.begin() + ((i+1)*L)); 118 | 
N->addMAPP(k,tmp); 119 | } 120 | N->constructOth(); 121 | 122 | N->writeDataToGzipFile(); 123 | 124 | L2Node *N2 = new L2ShortValueListNode (4,6,"test.gz"); 125 | N2->loadDataFromGzipFile(); 126 | 127 | for (uint64_t i=0; i vret; 130 | vector vretmap, vretmap2; 131 | vector tmp(buf.begin()+(i*L), buf.begin() + ((i+1)*L)); 132 | bool ret = N->smartQuery(&k, vret, vretmap); 133 | bool ret2 = N->smartQuery(&k, vret, vretmap2); 134 | EXPECT_EQ(ret, false); 135 | EXPECT_EQ(ret2, false); 136 | EXPECT_EQ(vretmap, tmp); 137 | EXPECT_EQ(vretmap2, tmp); 138 | } 139 | } 140 | 141 | TEST_F(L2NodeTest, TestL2EncodeLong) { 142 | std::random_device rd; //Will be used to obtain a seed for the random number engine 143 | std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() 144 | std::uniform_int_distribution<> dis(0, 255); 145 | std::uniform_int_distribution<> dis2(0, 0x6FFFFFFFULL); 146 | unsigned int totN = 1001; 147 | vector> vlists; 148 | vector vK; 149 | uint32_t maxlength = 0; 150 | for (unsigned int i = 0; i < totN; i++) { 151 | vector vec; 152 | uint32_t last; 153 | vec.push_back(last = dis(gen)); 154 | uint32_t upper = dis2(gen) % 15 + 1; 155 | for (unsigned int i = 1 ; i<= upper; i++) { 156 | last += (dis(gen) + 1); 157 | if (dis2(gen) & 1) 158 | last += dis(gen); 159 | if (dis2(gen) & 3) 160 | last += dis(gen); 161 | vec.push_back(last); 162 | } 163 | vlists.push_back(vec); 164 | vector diff; diff.push_back(vec[0]); 165 | for (unsigned int i = 1; i< vec.size(); i++) 166 | diff.push_back(vec[i] - vec[i-1]); 167 | uint32_t encodelength = valuelistEncode(NULL, diff, false); 168 | if (encodelength > maxlength) maxlength = encodelength + 1; 169 | 170 | } 171 | for (uint64_t i=0; i vec = vlists[i]; 180 | vector diff; diff.push_back(vec[0]); 181 | for (unsigned int i = 1; i< vec.size(); i++) 182 | diff.push_back(vec[i] - vec[i-1]); 183 | N->add(k,diff); 184 | } 185 | N->constructOth(); 186 | 187 | N->writeDataToGzipFile(); 188 | L2Node *N2 = new 
L2EncodedValueListNode (maxlength,L2NodeTypes::VALUE_INDEX_ENCODED,"test.gz"); 189 | N2->loadDataFromGzipFile(); 190 | 191 | for (uint64_t i=0; i vret, vret2; 194 | vector vretmap, vretmap2; 195 | bool ret = N->smartQuery(&k, vret, vretmap); 196 | bool ret2 = N->smartQuery(&k, vret2, vretmap2); 197 | EXPECT_EQ(ret, true); 198 | EXPECT_EQ(ret2, true); 199 | vector vl = vlists[i]; 200 | EXPECT_EQ(vret, vl); 201 | EXPECT_EQ(vret2, vl); 202 | } 203 | 204 | } 205 | 206 | -------------------------------------------------------------------------------- /test/unit/testL2Node.h: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | // The fixture for testing class Foo. 4 | class L2NodeTest : public ::testing::Test { 5 | 6 | protected: 7 | 8 | // You can do set-up work for each test here. 9 | L2NodeTest(); 10 | 11 | // You can do clean-up work that doesn't throw exceptions here. 12 | virtual ~L2NodeTest(); 13 | 14 | // If the constructor and destructor are not enough for setting up 15 | // and cleaning up each test, you can define the following methods: 16 | 17 | // Code here will be called immediately after the constructor (right 18 | // before each test). 19 | virtual void SetUp(); 20 | 21 | // Code here will be called immediately after each test (right 22 | // before the destructor). 23 | virtual void TearDown(); 24 | 25 | }; 26 | --------------------------------------------------------------------------------