├── py ├── .gitattributes ├── MANIFEST.in ├── peregrine │ ├── __init__.py │ ├── build_falcon4py.py │ ├── build_shimmer4py.py │ └── utils.py ├── setup_pypy.py ├── setup.cfg ├── setup.py └── scripts │ ├── path_to_contig.py │ └── pg_asm_cns.py ├── misc ├── logo.png └── logo.svg ├── docker ├── entry_dev.sh ├── entry.sh ├── test │ ├── run_test.sh │ ├── Makefile │ └── simulate_reads.py ├── bashrc ├── install_with_conda.sh ├── Dockerfile ├── Dockerfile.dockerhub ├── LICENSE.minimap2 └── LICENSE.falcon ├── test ├── genome_mapping │ ├── Makefile │ └── run_test.sh └── ecoli_K12 │ ├── Makefile │ ├── simulate_reads.py │ ├── run_test.sh │ └── run_test_one_level.sh ├── nim-mini ├── mmer_count.py ├── mmer_graph.py └── dump_mmmer.nim ├── src ├── kalloc.h ├── shmr_end_filter.c ├── Makefile ├── shmr_reduce.c ├── shmr_gather_mc.c ├── kvec.h ├── shmr_dedup.c ├── shmr_mkseqdb.c ├── shimmer.h ├── shmr_align.c ├── shimmer4py.c ├── DWmatch.c ├── mm_sketch.c ├── kalloc.c ├── shmr_index.c ├── kseq.h └── shmr_map.c ├── falcon ├── kalloc.h ├── falcon.h ├── kvec.h ├── common.h ├── kalloc.c └── DW_banded.c ├── install_with_conda.sh ├── .github └── workflows │ ├── build_docker_image.yml │ └── build_docker_image_release.yml ├── py-utils ├── simread.py ├── check_ovlp.py ├── dump_L0.py ├── FastaReader.py └── process_L2.py ├── LICENSE.minimap2 ├── LICENSE.falcon └── README.md /py/.gitattributes: -------------------------------------------------------------------------------- 1 | peregrine/_version.py export-subst 2 | -------------------------------------------------------------------------------- /misc/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschin/Peregrine/HEAD/misc/logo.png -------------------------------------------------------------------------------- /py/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include versioneer.py 2 | include peregrine/_version.py 3 | 
-------------------------------------------------------------------------------- /docker/entry_dev.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/conda/etc/profile.d/conda.sh 3 | conda activate peregrine 4 | pg_run_dev.py $@ 5 | -------------------------------------------------------------------------------- /py/peregrine/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from ._version import get_versions 3 | __version__ = get_versions()['version'] 4 | sys.stderr.write(f"Peregrine Assembler & SHIMMER ASMKit({__version__})\n") 5 | del get_versions 6 | -------------------------------------------------------------------------------- /docker/entry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/conda/etc/profile.d/conda.sh 3 | conda activate peregrine 4 | if [ $1 == "test" ]; then 5 | cd /opt/test 6 | bash /opt/test/run_test.sh 7 | else 8 | pg_run.py $@ 9 | fi 10 | -------------------------------------------------------------------------------- /test/genome_mapping/Makefile: -------------------------------------------------------------------------------- 1 | all: test 2 | bogus: clean test 3 | 4 | test: 5 | rm -rf ./wd ./logs 6 | /usr/bin/time ./run_test.sh > all.log 2>&1; mkdir -p logs; mv *.log logs 7 | 8 | clean: 9 | rm -rf ./wd/ ./logs/ seq_dataset.lst reads2ref.out ref2ref.out 10 | -------------------------------------------------------------------------------- /py/setup_pypy.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import os 3 | os.environ["peregrine_base"] = os.path.abspath(os.path.pardir) 4 | 5 | setup(name='peregrine_pypy', 6 | version='0.1', 7 | install_requires=["networkx==2.4"], 8 | scripts = ["scripts/ovlp_to_graph.py", "scripts/graph_to_path.py"]) 9 | 
-------------------------------------------------------------------------------- /py/setup.cfg: -------------------------------------------------------------------------------- 1 | 2 | # See the docstring in versioneer.py for instructions. Note that you must 3 | # re-run 'versioneer.py setup' after changing this section, and commit the 4 | # resulting files. 5 | 6 | [versioneer] 7 | VCS = git 8 | style = pep440 9 | versionfile_source = peregrine/_version.py 10 | versionfile_build = peregrine/_version.py 11 | tag_prefix = pg 12 | #parentdir_prefix = 13 | 14 | -------------------------------------------------------------------------------- /docker/test/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -o pipefail 3 | . /root/.bashrc 4 | make simreads 5 | make test-pypeflow 6 | mkdir -p /wd 7 | cp -a ./wd-pf/ /wd/ecoli_test_results/ 8 | cp K12MG1655.fa /wd/ecoli_test_results/ 9 | apt-get install -y mummer 10 | cd /wd/ecoli_test_results/ 11 | dnadiff K12MG1655.fa p_ctg_cns.fa 12 | echo 13 | echo dnadiff output of the assembled contig to the e. 
coli genome used for the simulated reads 14 | cat out.report 15 | -------------------------------------------------------------------------------- /docker/test/Makefile: -------------------------------------------------------------------------------- 1 | all: test test-pypeflow 2 | bogus: simreads clean test test-pypeflow 3 | 4 | K12MG1655.fa: 5 | wget https://s3.amazonaws.com//biologicaldatascience.org/data/ecoli-k12/K12MG1655.fa 6 | 7 | simreads: K12MG1655.fa 8 | mkdir -p ./reads 9 | python simulate_reads.py 10 | 11 | reads_0.fa: simreads 12 | 13 | test-pypeflow: 14 | rm -rf ./wd-pf 15 | find ${PWD}/reads/ -name "reads_*.fa" > seq_dataset.lst 16 | echo yes | /usr/bin/time pg_run.py asm seq_dataset.lst 12 4 8 4 1 1 1 1 1 --with-consensus --output ./wd-pf 17 | 18 | clean: 19 | rm -rf ./wd/ ./logs/ ./reads/ seq_dataset.lst 20 | -------------------------------------------------------------------------------- /nim-mini/mmer_count.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | fn = "preads4falcon_mer" 4 | mer_count = {} 5 | with open(fn) as f: 6 | for row in f: 7 | row = row.strip() 8 | if row[0] == ">": 9 | continue 10 | row = row.split() 11 | mer_count.setdefault(row[2], 0) 12 | mer_count[row[2]] += 1 13 | 14 | with open(fn) as f: 15 | for row in f: 16 | row = row.strip() 17 | if row[0] == ">": 18 | print(row) 19 | continue 20 | else: 21 | row = row.split() 22 | count = mer_count[row[2]] 23 | print(" ".join(row), count) 24 | 25 | -------------------------------------------------------------------------------- /src/kalloc.h: -------------------------------------------------------------------------------- 1 | #ifndef _KALLOC_H_ 2 | #define _KALLOC_H_ 3 | 4 | #include /* for size_t */ 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | typedef struct { 11 | size_t capacity, available, n_blocks, n_cores, largest; 12 | } km_stat_t; 13 | 14 | void *kmalloc(void *km, size_t size); 15 | void *krealloc(void *km, 
void *ptr, size_t size); 16 | void *kcalloc(void *km, size_t count, size_t size); 17 | void kfree(void *km, void *ptr); 18 | 19 | void *km_init(void); 20 | void km_destroy(void *km); 21 | void km_stat(const void *_km, km_stat_t *s); 22 | 23 | #ifdef __cplusplus 24 | } 25 | #endif 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /falcon/kalloc.h: -------------------------------------------------------------------------------- 1 | #ifndef _KALLOC_H_ 2 | #define _KALLOC_H_ 3 | 4 | #include /* for size_t */ 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | typedef struct { 11 | size_t capacity, available, n_blocks, n_cores, largest; 12 | } km_stat_t; 13 | 14 | void *kmalloc(void *km, size_t size); 15 | void *krealloc(void *km, void *ptr, size_t size); 16 | void *kcalloc(void *km, size_t count, size_t size); 17 | void kfree(void *km, void *ptr); 18 | 19 | void *km_init(void); 20 | void km_destroy(void *km); 21 | void km_stat(const void *_km, km_stat_t *s); 22 | 23 | #ifdef __cplusplus 24 | } 25 | #endif 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /docker/bashrc: -------------------------------------------------------------------------------- 1 | # ~/.bashrc: executed by bash(1) for non-login shells. 2 | 3 | # Note: PS1 and umask are already set in /etc/profile. You should not 4 | # need this unless you want different defaults for root. 5 | # PS1='${debian_chroot:+($debian_chroot)}\h:\w\$ ' 6 | # umask 022 7 | 8 | # You may uncomment the following lines if you want `ls' to be colorized: 9 | # export LS_OPTIONS='--color=auto' 10 | # eval "`dircolors`" 11 | # alias ls='ls $LS_OPTIONS' 12 | # alias ll='ls $LS_OPTIONS -l' 13 | # alias l='ls $LS_OPTIONS -lA' 14 | # 15 | # Some more alias to avoid making mistakes: 16 | # alias rm='rm -i' 17 | # alias cp='cp -i' 18 | # alias mv='mv -i' 19 | . 
/opt/conda/etc/profile.d/conda.sh 20 | conda activate peregrine 21 | -------------------------------------------------------------------------------- /install_with_conda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . ~/anaconda3/bin/activate 3 | conda create -n peregrine -y python=3.7 4 | 5 | conda activate peregrine 6 | conda install -c conda-forge -y pypy3.6 7 | 8 | pushd py 9 | rm -rf .eggs/ dist/ build/ peregrine.egg-info/ peregrine_pypy.egg-info get-pip.py 10 | python3 setup.py install 11 | python3 setup.py clean --all 12 | popd 13 | git clone -b peregrine https://github.com/cschin/pypeFLOW.git 14 | pushd pypeFLOW 15 | python3 setup.py install 16 | popd 17 | pushd py 18 | wget -q https://bootstrap.pypa.io/get-pip.py 19 | pypy3 get-pip.py 20 | pypy3 setup_pypy.py install 21 | popd 22 | 23 | pushd src 24 | make all 25 | make install 26 | popd 27 | 28 | #python3 -m pip install cffi==1.12.2 29 | -------------------------------------------------------------------------------- /docker/install_with_conda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . 
/opt/conda/bin/activate 3 | conda create -n peregrine -y python=3.7 4 | 5 | conda activate peregrine 6 | conda install -c conda-forge -y pypy3.6 7 | 8 | pushd py 9 | rm -rf .eggs/ dist/ build/ peregrine.egg-info/ peregrine_pypy.egg-info get-pip.py 10 | python3 setup.py install 11 | python3 setup.py clean --all 12 | popd 13 | git clone -b peregrine https://github.com/cschin/pypeFLOW.git 14 | pushd pypeFLOW 15 | python3 setup.py install 16 | popd 17 | pushd py 18 | wget -q https://bootstrap.pypa.io/get-pip.py 19 | wget -q https://bootstrap.pypa.io/get-pip.py 20 | pypy3 get-pip.py 21 | pypy3 setup_pypy.py install 22 | popd 23 | 24 | pushd src 25 | make all 26 | make install 27 | popd 28 | 29 | #python3 -m pip install cffi==1.12.2 30 | -------------------------------------------------------------------------------- /py/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import versioneer 3 | import os 4 | os.environ["peregrine_base"] = os.path.abspath(os.path.pardir) 5 | 6 | setup(name='peregrine', 7 | version=versioneer.get_version(), 8 | cmdclass=versioneer.get_cmdclass(), 9 | packages=['peregrine'], 10 | package_dir = {'peregrine': 'peregrine'}, 11 | scripts = ["scripts/path_to_contig.py", 12 | "scripts/pg_asm_cns.py", 13 | "scripts/pg_run.py", 14 | "scripts/pg_run_dev.py"], 15 | setup_requires=["cffi>=1.12.0", 16 | "versioneer==0.18"], 17 | cffi_modules=["peregrine/build_shimmer4py.py:ffibuilder", 18 | "peregrine/build_falcon4py.py:ffibuilder"], 19 | install_requires=["cffi>=1.12.0", 20 | "docopt>=0.6.2", 21 | "numpy>=1.16.2"]) 22 | -------------------------------------------------------------------------------- /src/shmr_end_filter.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #define __STDC_LIMIT_MACROS 7 | #include "khash.h" 8 | #include "kvec.h" 9 | #include "shimmer.h" 10 | 11 | 
void mm_end_filter(mm128_v *p, mm128_v *p_out_5, mm128_v *p_out_3, 12 | khash_t(RLEN) * rlmap, uint32_t end_length) { 13 | uint32_t idx; 14 | uint32_t rid; 15 | uint32_t rlen; 16 | uint32_t pos, r_pos, span; 17 | khiter_t k; 18 | mm128_t mmer; 19 | 20 | for (idx = 0; idx < p->n; idx++) { 21 | mmer = p->a[idx]; 22 | rid = mmer.y >> 32; 23 | span = mmer.x & 0xFF; 24 | k = kh_get(RLEN, rlmap, rid); 25 | // is_missing = (k == kh_end(hmap)); 26 | rlen = kh_value(rlmap, k).len; 27 | pos = ((mmer.y & 0xFFFFFFFF) >> 1) + 1; 28 | r_pos = rlen - pos + span; 29 | if (pos < end_length) { 30 | kv_push(mm128_t, NULL, *p_out_5, mmer); 31 | }; 32 | if (r_pos < end_length) { 33 | kv_push(mm128_t, NULL, *p_out_3, mmer); 34 | }; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | RUN apt-get update 3 | RUN apt-get install -y build-essential zlib1g zlib1g-dev 4 | RUN mkdir /opt/build 5 | COPY src.tgz /opt/build 6 | COPY install_with_conda.sh /opt/build 7 | RUN cd /opt/build; tar zxvf src.tgz; bash install_with_conda.sh 8 | RUN . /opt/conda/bin/activate; conda clean --all 9 | 10 | FROM continuumio/miniconda3 11 | COPY --from=0 /opt/conda /opt/conda 12 | RUN apt-get update 13 | RUN apt-get install -y parallel time 14 | RUN . 
/opt/conda/bin/activate; conda activate peregrine; python3 -m pip install cffi==1.12.2 15 | RUN apt-get install -y make 16 | RUN mkdir /opt/licenses 17 | COPY LICENSE /opt/licenses/LICENSE 18 | COPY LICENSE.falcon /opt/licenses/LICENSE.falcon 19 | COPY LICENSE.minimap2 /opt/licenses/LICENSE.minimap2 20 | RUN mkdir /opt/test 21 | COPY test/Makefile /opt/test 22 | COPY test/run_test.sh /opt/test 23 | COPY test/simulate_reads.py /opt/test 24 | COPY bashrc /root/.bashrc 25 | COPY entry.sh /opt/ 26 | COPY entry_dev.sh /opt/ 27 | WORKDIR /opt/test 28 | ENTRYPOINT ["/opt/entry.sh"] 29 | -------------------------------------------------------------------------------- /test/ecoli_K12/Makefile: -------------------------------------------------------------------------------- 1 | all: test test-pypeflow 2 | bogus: simreads clean test test-pypeflow 3 | 4 | K12MG1655.fa: 5 | wget https://www.dropbox.com/s/wqqnzachbdk4d3r/K12MG1655.fa 6 | 7 | simreads: K12MG1655.fa 8 | mkdir -p ./reads 9 | python simulate_reads.py 10 | 11 | reads_0.fa: simreads 12 | 13 | test: 14 | rm -rf ./wd ./logs 15 | /usr/bin/time ./run_test.sh > all.log 2>&1; mkdir -p logs; mv *.log logs 16 | 17 | test-pypeflow: 18 | rm -rf ./wd-pf 19 | find ${PWD}/reads/ -name "reads_*.fa" > seq_dataset.lst 20 | /usr/bin/time pg_run.py asm seq_dataset.lst 12 4 8 4 1 1 1 1 1 --with-consensus --output ./wd-pf 21 | 22 | test-pypeflow-with-L0: 23 | rm -rf ./wd-pf 24 | find ${PWD}/reads/ -name "reads_*.fa" > seq_dataset.lst 25 | /usr/bin/time pg_run.py asm seq_dataset.lst 12 4 8 4 1 1 1 1 1 --with-L0-index --with-consensus --output ./wd-pf 26 | 27 | test-pypeflow-l1: 28 | rm -rf ./wd-pf-l1 29 | find ${PWD}/reads/ -name "reads_*.fa" > seq_dataset.lst 30 | /usr/bin/time pg_run.py asm seq_dataset.lst 12 4 8 4 1 1 1 1 1 --shimmer-r 24 --with-consensus --shimmer-l 1 --output ./wd-pf-l1 31 | 32 | clean: 33 | rm -rf ./wd/ ./logs/ ./reads/ seq_dataset.lst 34 | 
-------------------------------------------------------------------------------- /docker/Dockerfile.dockerhub: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | RUN apt-get update 3 | RUN apt-get install -y build-essential zlib1g zlib1g-dev 4 | RUN mkdir /opt/build 5 | COPY docker/install_with_conda.sh /opt/build 6 | COPY py/ /opt/build/py/ 7 | COPY src/ /opt/build/src/ 8 | COPY falcon/ /opt/build/falcon/ 9 | RUN cd /opt/build; bash install_with_conda.sh 10 | RUN . /opt/conda/bin/activate; conda clean --all 11 | 12 | FROM continuumio/miniconda3 13 | COPY --from=0 /opt/conda /opt/conda 14 | RUN apt-get update 15 | RUN apt-get install -y parallel time 16 | RUN . /opt/conda/bin/activate; conda activate peregrine; python3 -m pip install cffi==1.12.2 17 | RUN apt-get install -y make 18 | RUN mkdir /opt/licenses 19 | COPY docker/LICENSE /opt/licenses/LICENSE 20 | COPY docker/LICENSE.falcon /opt/licenses/LICENSE.falcon 21 | COPY docker/LICENSE.minimap2 /opt/licenses/LICENSE.minimap2 22 | RUN mkdir /opt/test 23 | COPY docker/test/Makefile /opt/test 24 | COPY docker/test/run_test.sh /opt/test 25 | COPY docker/test/simulate_reads.py /opt/test 26 | COPY docker/bashrc /root/.bashrc 27 | COPY docker/entry.sh /opt/ 28 | COPY docker/entry_dev.sh /opt/ 29 | WORKDIR /opt/test 30 | ENTRYPOINT ["/opt/entry.sh"] 31 | -------------------------------------------------------------------------------- /.github/workflows/build_docker_image.yml: -------------------------------------------------------------------------------- 1 | name: build-and-test-docker-image-master-branch 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | 7 | pull_request: 8 | branches: [ master ] 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - name: build docker for the master branch 18 | shell: bash 19 | run: bash build_docker.sh master 20 | 21 | - name: test assembling E. 
coli 22 | if: success() 23 | shell: bash 24 | run: | 25 | mkdir -p $HOME/wd 26 | docker run -v ${GITHUB_WORKSPACE}/wd:/wd cschin/peregrine:latest test 27 | ls ${GITHUB_WORKSPACE}/wd/ecoli_test_results/ 28 | 29 | - uses: actions/upload-artifact@v2 30 | if: success() 31 | with: 32 | name: E. coli dnadiff results 33 | path: wd/ecoli_test_results/out.report 34 | 35 | - name: push image to docker hub 36 | if: ${{ success() && github.event_name == 'push' }} 37 | run: | 38 | echo '${{ secrets.docker_password }}' | docker login --username '${{ secrets.docker_user }}' --password-stdin 39 | docker push cschin/peregrine:latest 40 | 41 | -------------------------------------------------------------------------------- /py-utils/simread.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | rcmap = dict(zip("ACGT","TGCA")) 4 | 5 | def rc_seq(seq): 6 | return "".join([rcmap[c] for c in seq[::-1]]) 7 | 8 | def sim_error(seq): 9 | out_seq = [] 10 | for c in seq: 11 | if random.uniform(0, 1) < 0.01: 12 | c = random.choice( ('A','C','G','T', '', c+'A', c+'C', c+'G', c+'T') ) 13 | out_seq.append(c) 14 | return "".join(out_seq) 15 | 16 | seq = [] 17 | with open("K12MG1655.fa") as f: 18 | for row in f: 19 | row = row.strip() 20 | if len(row) < 1: 21 | continue 22 | if ">" == row[0]: 23 | continue 24 | seq.append(row) 25 | 26 | seq = "".join(seq) 27 | seq = seq + seq[:40000] 28 | 29 | rl = 15000 30 | read_count = 15 * len(seq) // rl 31 | 32 | sim_record = open("reads.bed","w") 33 | import random 34 | for i in range(read_count): 35 | rl2 = int(rl + random.gauss(0, 1500)) 36 | s = random.randint(0, len(seq)-40000) 37 | print(">{:06d}".format(i)) 38 | seq_tmp = sim_error(seq[s:s+rl2]) 39 | if random.randint(0,1) == 1: 40 | seq_tmp = rc_seq(seq_tmp) 41 | print(seq_tmp) 42 | print("{:06d}".format(i), s, s+rl2, sep="\t", file=sim_record) 43 | sim_record.close() 44 | 45 | 
-------------------------------------------------------------------------------- /.github/workflows/build_docker_image_release.yml: -------------------------------------------------------------------------------- 1 | name: build-and-test-docker-image-tagged-release 2 | 3 | on: 4 | push: 5 | tags: 6 | - pg* 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | 15 | - name: build docker image for the tagged commit 16 | shell: bash 17 | run: bash build_docker.sh tag 18 | 19 | - name: test assembling E. coli 20 | if: success() 21 | shell: bash 22 | run: | 23 | mkdir -p ${GITHUB_WORKSPACE}/wd 24 | tag=$(git describe --always --abbrev=0 --tags) 25 | tag=${tag:2} 26 | docker run -v ${GITHUB_WORKSPACE}/wd:/wd cschin/peregrine:${tag} test 27 | ls ${GITHUB_WORKSPACE}/wd/ecoli_test_results/ 28 | 29 | - uses: actions/upload-artifact@v2 30 | if: success() 31 | with: 32 | name: E. coli dnadiff results 33 | path: wd/ecoli_test_results/out.report 34 | 35 | - name: push image to docker hub 36 | if: ${{ success() && github.event_name == 'push' }} 37 | run: | 38 | echo '${{ secrets.docker_password }}' | docker login --username '${{ secrets.docker_user }}' --password-stdin 39 | tag=$(git describe --always --abbrev=0 --tags) 40 | tag=${tag:2} 41 | docker push cschin/peregrine:${tag} 42 | 43 | -------------------------------------------------------------------------------- /LICENSE.minimap2: -------------------------------------------------------------------------------- 1 | 2 | This software uses the following library from Heng Li's Minimap2 3 | code under MIT License: 4 | 5 | mm_sketch.c kvec.h kseq.h khash.h kalloc.h kalloc.c 6 | 7 | The MIT License 8 | 9 | Copyright (c) 2018- Dana-Farber Cancer Institute 10 | 2017-2018 Broad Institute, Inc. 
11 | 12 | Permission is hereby granted, free of charge, to any person obtaining 13 | a copy of this software and associated documentation files (the 14 | "Software"), to deal in the Software without restriction, including 15 | without limitation the rights to use, copy, modify, merge, publish, 16 | distribute, sublicense, and/or sell copies of the Software, and to 17 | permit persons to whom the Software is furnished to do so, subject to 18 | the following conditions: 19 | 20 | The above copyright notice and this permission notice shall be 21 | included in all copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | SOFTWARE. 31 | -------------------------------------------------------------------------------- /docker/LICENSE.minimap2: -------------------------------------------------------------------------------- 1 | 2 | This software uses the following library from Heng Li's Minimap2 code under 3 | MIT License 4 | 5 | mm_sketch.c kvec.h kseq.h khash.h kalloc.h kalloc.c 6 | 7 | The MIT License 8 | 9 | Copyright (c) 2018- Dana-Farber Cancer Institute 10 | 2017-2018 Broad Institute, Inc. 
11 | 12 | Permission is hereby granted, free of charge, to any person obtaining 13 | a copy of this software and associated documentation files (the 14 | "Software"), to deal in the Software without restriction, including 15 | without limitation the rights to use, copy, modify, merge, publish, 16 | distribute, sublicense, and/or sell copies of the Software, and to 17 | permit persons to whom the Software is furnished to do so, subject to 18 | the following conditions: 19 | 20 | The above copyright notice and this permission notice shall be 21 | included in all copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | SOFTWARE. 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /test/genome_mapping/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | ln -sf ../ecoli_K12/reads/ . 4 | ln -sf ../ecoli_K12/K12MG1655.fa . 5 | find ./reads/ -name "reads_*.fa" > seq_dataset.lst 6 | SHIMMER=../../.. 
7 | SHIMMER=$(cd "$(dirname "../../../")"; pwd)/$(basename "$1") 8 | SHIMMERBIN=$SHIMMER/src 9 | WORKDIR=./wd/ 10 | INDEX=$WORKDIR/index 11 | pushd $SHIMMER 12 | echo SHIMMER revision: $(git rev-parse HEAD) 13 | popd 14 | echo get SHIMMER binaries from $SHIMMER 15 | mkdir -p $INDEX 16 | 17 | echo 18 | echo build read index 19 | time (/usr/bin/time $SHIMMERBIN/shmr_mkseqdb -p $INDEX/seq_dataset -d seq_dataset.lst 2> build_db.log) 20 | 21 | echo 22 | echo build ref index 23 | echo K12MG1655.fa > ref.lst 24 | time (/usr/bin/time $SHIMMERBIN/shmr_mkseqdb -p $INDEX/ref -d ref.lst 2> build_ref_db.log) 25 | 26 | echo build ref shimmer index 27 | time (for c in `seq 1 6`; do echo "/usr/bin/time $SHIMMERBIN/shmr_index -p $INDEX/seq_dataset -t 6 -c $c -o $INDEX/read 2> build_index.$c.log" ; done | parallel -j 4) 28 | 29 | echo build ref shimmer index 30 | time (for c in `seq 1 2`; do echo "/usr/bin/time $SHIMMERBIN/shmr_index -p $INDEX/ref -t 2 -c $c -o $INDEX/ref 2> build_ref_index.$c.log" ; done | parallel -j 2) 31 | 32 | echo run shimmer_map 33 | $SHIMMERBIN/shmr_map -r $INDEX/ref -m $INDEX/ref-L2 -p $INDEX/seq_dataset -l $INDEX/read-L2 -t 1 -c 1 > reads2ref.out 34 | 35 | $SHIMMERBIN/shmr_map -r $INDEX/ref -m $INDEX/ref-L2 -p $INDEX/ref -l $INDEX/ref-L2 -t 1 -c 1 > ref2ref.out 36 | -------------------------------------------------------------------------------- /falcon/falcon.h: -------------------------------------------------------------------------------- 1 | #include "kvec.h" 2 | #include "khash.h" 3 | #include "kalloc.h" 4 | 5 | typedef struct { 6 | seq_coor_t t_pos; 7 | uint8_t delta; 8 | char q_base; 9 | seq_coor_t p_t_pos; // the tag position of the previous base 10 | uint8_t p_delta; // the tag delta of the previous base 11 | char p_q_base; // the previous base 12 | unsigned q_id; 13 | } align_tag_t; 14 | 15 | typedef struct { 16 | seq_coor_t len; 17 | align_tag_t * align_tags; 18 | } align_tags_t; 19 | 20 | 21 | 22 | typedef struct { size_t n, m; uint64_t *a; } 
uint64_v; 23 | 24 | typedef struct { 25 | uint64_t ctag_key; 26 | uint64_t ptag_key; 27 | uint16_t coverage; 28 | uint16_t count; 29 | double score; 30 | } align_edge_t; 31 | 32 | typedef struct { size_t n, m; align_edge_t *a; } align_edge_v; 33 | 34 | KHASH_MAP_INIT_INT64(PTAG, uint16_t); 35 | typedef khash_t(PTAG) ptag_to_count_t; 36 | KHASH_MAP_INIT_INT64(CTAG, khash_t(PTAG) *); 37 | typedef khash_t(CTAG) ctag_to_ptag_t; 38 | 39 | typedef struct { 40 | uint64_t ctag_key; 41 | align_edge_t * best_edge; 42 | double best_score; 43 | } align_node_t; 44 | 45 | KHASH_MAP_INIT_INT64(NODE, align_node_t *); 46 | typedef khash_t(NODE) align_node_map_t; 47 | 48 | align_tags_t * get_align_tags( char *, char *, seq_coor_t, aln_range *, unsigned, seq_coor_t); 49 | void free_align_tags( align_tags_t * tags); 50 | consensus_data * get_cns_from_align_tags( align_tags_t **, unsigned, unsigned, unsigned ); 51 | -------------------------------------------------------------------------------- /docker/test/simulate_reads.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | random.seed(42) 4 | 5 | rcmap = dict(zip("ACGT","TGCA")) 6 | 7 | def rc_seq(seq): 8 | return "".join([rcmap[c] for c in seq[::-1]]) 9 | 10 | def sim_error(seq): 11 | out_seq = [] 12 | for c in seq: 13 | if random.uniform(0, 1) < 0.01: 14 | c = random.choice( ('A','C','G','T', '', c+'A', c+'C', c+'G', c+'T') ) 15 | out_seq.append(c) 16 | return "".join(out_seq) 17 | 18 | seq = [] 19 | with open("./K12MG1655.fa") as f: 20 | for row in f: 21 | row = row.strip() 22 | if len(row) < 1: 23 | continue 24 | if ">" == row[0]: 25 | continue 26 | seq.append(row) 27 | 28 | seq = "".join(seq) 29 | seq = seq + seq[:40000] 30 | 31 | rl = 15000 32 | for j in range(8): 33 | read_count = 2 * len(seq) // rl 34 | sim_record = open(f"reads/reads_{j}.bed","w") 35 | read_file = open(f"reads/reads_{j}.fa","w") 36 | import random 37 | for i in range(read_count): 38 | rl2 = int(rl + 
random.gauss(0, 1500)) 39 | s = random.randint(0, len(seq)-40000) 40 | print(">{:02d}/{:06d}/{}_{}".format(j,i,0,rl2), file=read_file) 41 | seq_tmp = sim_error(seq[s:s+rl2]) 42 | if random.randint(0,1) == 1: 43 | seq_tmp = rc_seq(seq_tmp) 44 | print(seq_tmp, file=read_file) 45 | print("{:02d}/{:06d}/{}_{}".format(j,i,0,rl2), s, s+rl2, sep="\t", file=sim_record) 46 | sim_record.close() 47 | read_file.close() 48 | 49 | -------------------------------------------------------------------------------- /test/ecoli_K12/simulate_reads.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | random.seed(42) 4 | 5 | rcmap = dict(zip("ACGT","TGCA")) 6 | 7 | def rc_seq(seq): 8 | return "".join([rcmap[c] for c in seq[::-1]]) 9 | 10 | def sim_error(seq): 11 | out_seq = [] 12 | for c in seq: 13 | if random.uniform(0, 1) < 0.01: 14 | c = random.choice( ('A','C','G','T', '', c+'A', c+'C', c+'G', c+'T') ) 15 | out_seq.append(c) 16 | return "".join(out_seq) 17 | 18 | seq = [] 19 | with open("./K12MG1655.fa") as f: 20 | for row in f: 21 | row = row.strip() 22 | if len(row) < 1: 23 | continue 24 | if ">" == row[0]: 25 | continue 26 | seq.append(row) 27 | 28 | seq = "".join(seq) 29 | seq = seq + seq[:40000] 30 | 31 | rl = 15000 32 | for j in range(8): 33 | read_count = 2 * len(seq) // rl 34 | sim_record = open(f"reads/reads_{j}.bed","w") 35 | read_file = open(f"reads/reads_{j}.fa","w") 36 | import random 37 | for i in range(read_count): 38 | rl2 = int(rl + random.gauss(0, 1500)) 39 | s = random.randint(0, len(seq)-40000) 40 | print(">{:02d}/{:06d}/{}_{}".format(j,i,0,rl2), file=read_file) 41 | seq_tmp = sim_error(seq[s:s+rl2]) 42 | if random.randint(0,1) == 1: 43 | seq_tmp = rc_seq(seq_tmp) 44 | print(seq_tmp, file=read_file) 45 | print("{:02d}/{:06d}/{}_{}".format(j,i,0,rl2), s, s+rl2, sep="\t", file=sim_record) 46 | sim_record.close() 47 | read_file.close() 48 | 49 | 
"""Cross-check reported overlaps against read placements.

Inputs (paths are fixed):
  * wd-pf/0-seqdb/seq_dataset.idx -- column 1 is the read id, column 2 the
    read name.
  * reads/*.bed                   -- column 1 is the read name, columns 2-3
    its begin/end coordinates.
  * wd-pf/3-asm/preads.ovl        -- overlap records whose first two columns
    are read ids; a line starting with "-" is skipped.

Output (stdout):
  * "X a b"      every ordered pair of distinct reads whose intervals intersect
  * "Y <record> f" every overlap record, f = 1 if the pair intersects, else 0
  * "Z a b len f" every intersecting pair, f = 1 if it was reported, else 0
"""
from intervaltree import Interval, IntervalTree
import glob

# Interval index over the read placements; the payload of each interval is
# the read id.
tree = IntervalTree()

rname2rid = {}
with open("wd-pf/0-seqdb/seq_dataset.idx") as idx_file:
    for line in idx_file:
        fields = line.split()
        rname2rid[fields[1]] = fields[0]

read_range = {}
for bed_path in glob.glob("reads/*.bed"):
    with open(bed_path) as bed_file:
        for line in bed_file:
            fields = line.split()
            rname = fields[0]
            bgn = int(fields[1])
            end = int(fields[2])
            rid = rname2rid[rname]
            tree.addi(bgn, end, rid)
            read_range[rid] = (bgn, end)
            # NOTE(review): presumably 4639694 is the length of a circular
            # reference, so reads starting in the first 40000 bases are also
            # indexed one full length later -- confirm.
            if bgn < 40000:
                tree.addi(bgn + 4639694, end + 4639694, rid)

# Both orientations of every intersecting pair are stored.
readpair = set()
for rid, (bgn, end) in read_range.items():
    for hit in tree[bgn:end]:
        other = hit.data
        if other != rid:
            print("X", rid, other)
            readpair.update(((rid, other), (other, rid)))

ovlppair = set()
with open("wd-pf/3-asm/preads.ovl") as ovl_file:
    for line in ovl_file:
        fields = line.split()
        if fields[0] == "-":
            continue
        fields.append("1" if (fields[0], fields[1]) in readpair else "0")
        print("Y", " ".join(fields))
        ovlppair.add((fields[1], fields[0]))
        ovlppair.add((fields[0], fields[1]))

for pair in readpair:
    first = read_range[pair[0]]
    second = read_range[pair[1]]
    # Length from the later start to the end of the read that starts first.
    if first[0] < second[0]:
        olen = first[1] - second[0]
    else:
        olen = second[1] - first[0]
    template = "Z {} {} {} 1" if pair in ovlppair else "Z {} {} {} 0"
    print(template.format(pair[0], pair[1], olen))
5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted (subject to the limitations in the 8 | # disclaimer below) provided that the following conditions are met: 9 | # 10 | # * Redistributions of source code must retain the above copyright 11 | # notice, this list of conditions and the following disclaimer. 12 | # 13 | # * Redistributions in binary form must reproduce the above 14 | # copyright notice, this list of conditions and the following 15 | # disclaimer in the documentation and/or other materials provided 16 | # with the distribution. 17 | # 18 | # * Neither the name of Pacific Biosciences nor the names of its 19 | # contributors may be used to endorse or promote products derived 20 | # from this software without specific prior written permission. 21 | # 22 | # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE 23 | # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC 24 | # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 25 | # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 26 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 27 | # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS 28 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 29 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 30 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 31 | # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 32 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 33 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 34 | # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 | # SUCH DAMAGE. 
36 | #################################################################################$$ 37 | -------------------------------------------------------------------------------- /docker/LICENSE.falcon: -------------------------------------------------------------------------------- 1 | #################################################################################$$ 2 | # Copyright (c) 2011-2015, Pacific Biosciences of California, Inc. 3 | # 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted (subject to the limitations in the 8 | # disclaimer below) provided that the following conditions are met: 9 | # 10 | # * Redistributions of source code must retain the above copyright 11 | # notice, this list of conditions and the following disclaimer. 12 | # 13 | # * Redistributions in binary form must reproduce the above 14 | # copyright notice, this list of conditions and the following 15 | # disclaimer in the documentation and/or other materials provided 16 | # with the distribution. 17 | # 18 | # * Neither the name of Pacific Biosciences nor the names of its 19 | # contributors may be used to endorse or promote products derived 20 | # from this software without specific prior written permission. 21 | # 22 | # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE 23 | # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC 24 | # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 25 | # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 26 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 27 | # DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS 28 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 29 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 30 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 31 | # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 32 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 33 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 34 | # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 | # SUCH DAMAGE. 36 | #################################################################################$$ 37 | -------------------------------------------------------------------------------- /test/ecoli_K12/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | find $PWD/reads/ -name "reads_*.fa" > seq_dataset.lst 4 | WORKDIR=$PWD/wd 5 | INDEX=$WORKDIR/index 6 | OVLOUT=$WORKDIR/ovlp 7 | ASM=$WORKDIR/asm 8 | SHIMMER=../../.. 
9 | pushd $SHIMMER 10 | echo SHIMMER revision: $(git rev-parse HEAD) 11 | popd 12 | echo get SHIMMER binaries from $SHIMMER 13 | mkdir -p $INDEX 14 | mkdir -p $OVLOUT 15 | mkdir -p $ASM 16 | echo 17 | echo build read index 18 | time (/usr/bin/time shmr_mkseqdb -p $INDEX/seq_dataset -d seq_dataset.lst 2> build_db.log) 19 | echo 20 | echo build shimmer index 21 | time (for c in `seq 1 12`; do echo "/usr/bin/time shmr_index -p $INDEX/seq_dataset -r 6 -t 12 -c $c -o $INDEX/shmr 2> build_index.$c.log" ; done | parallel -j 4) 22 | #time (for c in `seq 1 12`; do echo "/usr/bin/time shmr_index -p $INDEX/seq_dataset -l 1 -t 12 -c $c -o $INDEX/shmr 2> build_index.$c.log" ; done | parallel -j 4) 23 | echo 24 | echo build overlaps 25 | time (for c in `seq -f "%02g" 1 8`; do echo "/usr/bin/time shmr_overlap -p $INDEX/seq_dataset -l $INDEX/shmr-L2 -t 8 -c $c -o $OVLOUT/ovlp.$c 2> ovlp.$c.log"; done | parallel -j 4) 26 | echo 27 | echo faclon ovlp to graph 28 | cd $ASM 29 | time (cat ../ovlp/ovlp.* | shmr_dedup > preads.ovl; echo "-" >> preads.ovl) 30 | /usr/bin/time ovlp_to_graph.py >& asm.log 31 | ln -sf ../index/seq_dataset.* . 
"""Build a graph of consecutive mmers and write it to ``test.gexf``.

The input file holds records separated by header lines that start with ">".
The third whitespace-separated column of every other line is used as the
node label. An edge v -> w is added for each pair of consecutive lines in a
record, provided both labels occur more than MIN_COUNT and fewer than
MAX_COUNT times in the whole file. The graph is then pruned and one line per
remaining node is printed: component size, node, in-degree, out-degree.
"""
import networkx as nx

# Occurrence-count window (exclusive on both sides) a label must fall in
# before it may contribute an edge.
MIN_COUNT = 5
MAX_COUNT = 60

# Alternative input used previously: "preads4falcon_mer"
fn = "H08_mer"


def weak_components(graph):
    """Yield the induced subgraph of each weakly connected component.

    ``nx.weakly_connected_component_subgraphs`` was removed from NetworkX,
    so the components are taken from ``nx.weakly_connected_components`` and
    turned into subgraphs explicitly.
    """
    for nodes in nx.weakly_connected_components(graph):
        yield graph.subgraph(nodes)


# Pass 1: count how often each label occurs.
m_count = {}
with open(fn) as f:
    for row in f:
        row = row.strip()
        # Blank lines carry no record; indexing them used to raise IndexError.
        if not row or row.startswith(">"):
            continue
        mmer = row.split()[2]
        m_count[mmer] = m_count.get(mmer, 0) + 1

# Pass 2: link consecutive labels within each record.
G = nx.DiGraph()
v = None  # previous label in the current record; None right after a header
with open(fn) as f:
    for row in f:
        row = row.strip()
        if not row:
            continue
        if row.startswith(">"):
            v = None
            continue
        w = row.split()[2]
        if (v is not None
                and MIN_COUNT < m_count[v] < MAX_COUNT
                and MIN_COUNT < m_count[w] < MAX_COUNT):
            if G.has_edge(v, w):
                G[v][w]["count"] += 1
            else:
                G.add_edge(v, w, count=1)
        # The chain always advances, whether or not an edge was recorded.
        v = w

# Nodes with more than one successor or predecessor.
remove_nodes = {
    n for n in G.nodes() if G.out_degree(n) > 1 or G.in_degree(n) > 1
}

# Drop them from a copy only, to find nodes left isolated once they are gone.
G2 = G.copy()
G2.remove_nodes_from(remove_nodes)

remove_nodes = set()
for subG in weak_components(G2):
    if len(subG) == 1:
        remove_nodes.update(subG.nodes())
G.remove_nodes_from(remove_nodes)

# Collect first, then delete, so degrees are evaluated on the same graph
# state for every node.
remove_nodes = {
    n for n in G.nodes() if G.out_degree(n) == 0 or G.in_degree(n) == 0
}
G.remove_nodes_from(remove_nodes)

for subG in weak_components(G):
    subG_size = len(subG)
    for n in subG.nodes():
        print(subG_size, n, subG.in_degree(n), subG.out_degree(n))

nx.write_gexf(G, "test.gexf")
9 | pushd $SHIMMER 10 | echo SHIMMER revision: $(git rev-parse HEAD) 11 | popd 12 | echo get SHIMMER binaries from $SHIMMER 13 | mkdir -p $INDEX 14 | mkdir -p $OVLOUT 15 | mkdir -p $ASM 16 | echo 17 | echo build read index 18 | time (/usr/bin/time shmr_mkseqdb -p $INDEX/seq_dataset -d seq_dataset.lst 2> build_db.log) 19 | echo 20 | echo build shimmer index 21 | #time (for c in `seq 1 12`; do echo "/usr/bin/time shmr_index -p $INDEX/seq_dataset -t 12 -c $c -o $INDEX/shmr 2> build_index.$c.log" ; done | parallel -j 4) 22 | time (for c in `seq 1 12`; do echo "/usr/bin/time shmr_index -p $INDEX/seq_dataset -l 1 -r 36 -t 12 -c $c -o $INDEX/shmr 2> build_index.$c.log" ; done | parallel -j 4) 23 | echo 24 | echo build overlaps 25 | time (for c in `seq -f "%02g" 1 8`; do echo "/usr/bin/time shmr_overlap -p $INDEX/seq_dataset -l $INDEX/shmr-L1 -t 8 -c $c -o $OVLOUT/ovlp.$c 2> ovlp.$c.log"; done | parallel -j 4) 26 | echo 27 | echo faclon ovlp to graph 28 | cd $ASM 29 | time (cat ../ovlp/ovlp.* | shmr_dedup > preads.ovl; echo "-" >> preads.ovl) 30 | /usr/bin/time ovlp_to_graph.py >& asm.log 31 | ln -sf ../index/seq_dataset.* . 
32 | #/usr/bin/time pypy graph_to_contig.py >& to_contig.log 33 | /usr/bin/time graph_to_path.py >& to_path.log 34 | /usr/bin/time path_to_contig.py $INDEX/seq_dataset p_ctg_tiling_path > p_ctg.fa 2> to_contig.log 35 | echo $PWD/p_ctg.fa > p_ctg.lst 36 | time (/usr/bin/time shmr_mkseqdb -p $INDEX/p_ctg -d p_ctg.lst 2> build_p_ctg_db.log) 37 | time (for c in `seq 1 1`; do echo "/usr/bin/time shmr_index -p $INDEX/p_ctg -t 1 -c $c -o $INDEX/p_ctg 2> build_p_ctg_index.$c.log" ; done | parallel -j 4) 38 | time (/usr/bin/time shmr_map -r $INDEX/p_ctg -m $INDEX/p_ctg-L2 -p $INDEX/seq_dataset -l $INDEX/shmr-L2 -t 1 -c 1 > read_map.txt 2> map.log) 39 | time (/usr/bin/time cns_prototype.py $INDEX/seq_dataset $INDEX/p_ctg read_map.txt 1 1 > p_ctg_cns.fa 2> cns.log) 40 | 41 | -------------------------------------------------------------------------------- /src/shmr_reduce.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #define __STDC_LIMIT_MACROS 7 | #include "kvec.h" 8 | #include "shimmer.h" 9 | 10 | typedef struct { 11 | uint8_t size; 12 | uint8_t head; 13 | mm128_t *mers; 14 | } small_m_buffer_t; 15 | 16 | static inline uint64_t hash64(uint64_t key, uint64_t mask) { 17 | key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1; 18 | key = key ^ key >> 24; 19 | key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265 20 | key = key ^ key >> 14; 21 | key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21 22 | key = key ^ key >> 28; 23 | key = (key + (key << 31)) & mask; 24 | return key; 25 | } 26 | 27 | void pop_push(small_m_buffer_t *smb, mm128_t mer) { 28 | smb->mers[smb->head] = mer; 29 | smb->head++; 30 | smb->head %= smb->size; 31 | } 32 | 33 | void find_minimizer(small_m_buffer_t *smb, mm128_t *mmer) { 34 | uint32_t i = 0; 35 | uint64_t min_val = UINT64_MAX; 36 | uint64_t h; 37 | 38 | mmer->x = smb->mers[0].x; 39 | mmer->y = smb->mers[0].y; 40 | 
min_val = smb->mers[0].x >> 8; 41 | 42 | for (i = 1; i < smb->size; i++) { 43 | h = smb->mers[i].x >> 8; 44 | if (h < min_val) { 45 | min_val = h; 46 | mmer->x = smb->mers[i].x; 47 | mmer->y = smb->mers[i].y; 48 | } 49 | } 50 | } 51 | 52 | /* rs: reduction size */ 53 | void mm_reduce(mm128_v *p, mm128_v *p_out, uint8_t rs) { 54 | uint32_t idx; 55 | uint32_t rid; 56 | uint32_t rid_ = UINT32_MAX; 57 | uint32_t r_offset = 0; 58 | mm128_t mmer, mmer_; 59 | small_m_buffer_t smb; 60 | 61 | kv_resize(mm128_t, NULL, *p_out, p->n); 62 | 63 | smb.size = rs; 64 | smb.head = 0; 65 | smb.mers = (mm128_t *)alloca(sizeof(mm128_t) * smb.size); 66 | memset(smb.mers, UINT8_MAX, rs * 16); 67 | 68 | mmer_.y = UINT64_MAX; 69 | 70 | for (idx = 0; idx < p->n; idx++, r_offset++) { 71 | rid = p->a[idx].y >> 32; 72 | if (rid != rid_) { 73 | r_offset = 0; 74 | memset(smb.mers, UINT8_MAX, rs * 16); 75 | smb.head = 0; 76 | rid_ = rid; 77 | } 78 | pop_push(&smb, p->a[idx]); 79 | if (r_offset < rs - 1) { 80 | continue; 81 | } 82 | find_minimizer(&smb, &mmer); 83 | if (mmer.y != mmer_.y) { 84 | // printf("%lu\n", mmer.x >> 8); 85 | kv_push(mm128_t, NULL, *p_out, mmer); 86 | mmer_.x = mmer.x; 87 | mmer_.y = mmer.y; 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/shmr_gather_mc.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "khash.h" 8 | #include "kvec.h" 9 | #include "shimmer.h" 10 | 11 | extern char *optarg; 12 | extern int optind, opterr, optopt; 13 | 14 | int main(int argc, char *argv[]) { 15 | char *data_path_prefix = NULL; 16 | char mc_chunk_file_path[8192]; 17 | char mc_file_path[8192]; 18 | int written; 19 | int total_chunk = 1; 20 | int chunk = 1; 21 | int c; 22 | 23 | opterr = 0; 24 | 25 | while ((c = getopt(argc, argv, "p:t:")) != -1) { 26 | switch (c) { 27 | case 'p': 28 | data_path_prefix = 
optarg; 29 | break; 30 | case 't': 31 | total_chunk = atoi(optarg); 32 | break; 33 | case '?': 34 | if (optopt == 'd') { 35 | fprintf(stderr, 36 | "Option -%c not specified, please specify a prefix of file " 37 | "path the data filis\n", 38 | optopt); 39 | } 40 | if (optopt == 't') { 41 | fprintf(stderr, 42 | "Option -%c not specified, please specify the total number " 43 | "of chunks\n", 44 | optopt); 45 | } 46 | return 1; 47 | default: 48 | abort(); 49 | } 50 | } 51 | 52 | assert(total_chunk > 0); 53 | 54 | if (data_path_prefix == NULL) { 55 | data_path_prefix = (char *)calloc(8192, 1); 56 | snprintf(data_path_prefix, 8191, "shimmer"); 57 | } 58 | 59 | khash_t(MMC) *mcmap = kh_init(MMC); 60 | 61 | for (chunk = 1; chunk <= total_chunk; chunk++) { 62 | mm_count_v mc = {0, 0, 0}; 63 | written = snprintf(mc_chunk_file_path, sizeof mc_chunk_file_path, 64 | "%s-MC-%02d-of-%02d.dat", data_path_prefix, chunk, 65 | total_chunk); 66 | assert(written < sizeof(mc_chunk_file_path)); 67 | fprintf(stderr, "input data file: %s\n", mc_chunk_file_path); 68 | mc = read_mm_count(mc_chunk_file_path); 69 | aggregate_mm_count(mcmap, &mc); 70 | kv_destroy(mc); 71 | } 72 | 73 | mm_count_v mc_all = {0, 0, 0}; 74 | mm_count_to_vec(mcmap, &mc_all); 75 | 76 | written = snprintf(mc_file_path, sizeof mc_file_path, "%s-MC-all.dat", 77 | data_path_prefix); 78 | assert(written < sizeof(mc_file_path)); 79 | fprintf(stderr, "output data file: %s\n", mc_file_path); 80 | 81 | write_mm_count(mc_file_path, &mc_all); 82 | 83 | kv_destroy(mc_all); 84 | kh_destroy(MMC, mcmap); 85 | 86 | if (!data_path_prefix) free(data_path_prefix); 87 | return 0; 88 | } 89 | -------------------------------------------------------------------------------- /py/peregrine/build_falcon4py.py: -------------------------------------------------------------------------------- 1 | from cffi import FFI 2 | import os 3 | 4 | basedir = os.environ["peregrine_base"] 5 | 6 | ffibuilder = FFI() 7 | 8 | ffibuilder.cdef(""" 9 | 10 | 
typedef int seq_coor_t; 11 | 12 | typedef struct { 13 | seq_coor_t aln_str_size ; 14 | seq_coor_t dist ; 15 | seq_coor_t aln_q_s; 16 | seq_coor_t aln_q_e; 17 | seq_coor_t aln_t_s; 18 | seq_coor_t aln_t_e; 19 | char * q_aln_str; 20 | char * t_aln_str; 21 | 22 | } alignment; 23 | 24 | typedef struct { 25 | seq_coor_t t_pos; 26 | uint8_t delta; 27 | char q_base; 28 | seq_coor_t p_t_pos; // the tag position of the previous base 29 | uint8_t p_delta; // the tag delta of the previous base 30 | char p_q_base; // the previous base 31 | unsigned q_id; 32 | } align_tag_t; 33 | 34 | typedef struct { 35 | seq_coor_t len; 36 | align_tag_t * align_tags; 37 | } align_tags_t; 38 | 39 | typedef struct { 40 | seq_coor_t s1; 41 | seq_coor_t e1; 42 | seq_coor_t s2; 43 | seq_coor_t e2; 44 | long int score; 45 | } aln_range; 46 | 47 | typedef struct { 48 | char * sequence; 49 | uint8_t * eqv; 50 | } consensus_data; 51 | 52 | 53 | align_tags_t * get_align_tags( char * aln_q_seq, 54 | char * aln_t_seq, 55 | seq_coor_t aln_seq_len, 56 | aln_range * range, 57 | unsigned q_id, 58 | seq_coor_t t_offset); 59 | 60 | void free_align_tags( align_tags_t * tags); 61 | 62 | consensus_data * get_cns_from_align_tags( align_tags_t ** tag_seqs, 63 | unsigned n_tag_seqs, 64 | unsigned t_len, 65 | unsigned min_cov ); 66 | 67 | void free_consensus_data( consensus_data * consensus ); 68 | 69 | alignment * align(char * query_seq, seq_coor_t q_len, 70 | char * target_seq, seq_coor_t t_len, 71 | seq_coor_t band_tolerance, 72 | int get_aln_str); 73 | 74 | void free_alignment(alignment *); 75 | void *malloc(size_t size); 76 | void free(void *ptr); 77 | """) 78 | 79 | ffibuilder.set_source("peregrine._falcon4py", 80 | f""" 81 | #include "{basedir}/falcon/common.h" 82 | #include "{basedir}/falcon/falcon.h" 83 | """, sources = [f'{basedir}/falcon/falcon.c', 84 | f'{basedir}/falcon/DW_banded.c', 85 | f'{basedir}/falcon/kalloc.c']) # library name, for the linker 86 | 87 | if __name__ == "__main__": 88 | import sys 89 
"""CFFI build script for the ``peregrine._shimmer4py`` extension module.

The environment variable ``peregrine_base`` must name the directory that
contains ``src/``; the C sources and ``shimmer.h`` are located relative to it.
Run this file directly to compile the extension.
"""
from cffi import FFI
import os

try:
    basedir = os.environ["peregrine_base"]
except KeyError:
    # Same exception type as before, but with a message that says what to do.
    raise KeyError(
        "environment variable 'peregrine_base' must be set to the directory "
        "containing src/ before building peregrine._shimmer4py"
    ) from None

ffibuilder = FFI()

# Declarations exposed to Python. They have to agree with the definitions in
# the C sources listed in set_source() below.
ffibuilder.cdef("""
void decode_biseq(uint8_t * src, char * seq,
                  size_t len, uint8_t strand);

typedef int32_t seq_coor_t;

typedef struct {
    seq_coor_t m_size, dist ;
    seq_coor_t q_bgn, q_end;
    seq_coor_t t_bgn, t_end;
    seq_coor_t t_m_end, q_m_end;
} ovlp_match_t;

ovlp_match_t * ovlp_match(uint8_t * query_seq,
                          seq_coor_t q_len,
                          uint8_t q_strand,
                          uint8_t * target_seq,
                          seq_coor_t t_len,
                          uint8_t t_strand,
                          seq_coor_t band_tolerance);

void free_ovlp_match(ovlp_match_t * match);

typedef struct { uint64_t x, y; } mm128_t;

typedef struct { size_t n, m; mm128_t *a; } mm128_v;

mm128_v read_mmlist(char *);

void free(void *ptr);

typedef unsigned int khint32_t;

typedef unsigned long khint64_t;

typedef khint32_t khint_t;

typedef struct {
    mm128_v * mmers;
    void * mmer0_map;
    void * rlmap;
    void * mcmap;
    void * ridmm;} py_mmer_t;

void build_shimmer_map4py(py_mmer_t *,
        char *, char *,
        uint32_t, uint32_t, uint32_t, uint32_t);

void get_shimmers_for_read(mm128_v *, py_mmer_t *, uint32_t);

typedef struct { uint64_t x0, x1, y0, y1; uint8_t direction;} mp256_t;
typedef struct { size_t n, m; mp256_t *a; } mp256_v;

uint32_t get_mmer_count(py_mmer_t * , uint64_t);
void get_shimmer_hits(mp256_v *, py_mmer_t *, uint64_t, uint32_t);

typedef uint32_t mm_idx_t;
typedef struct { size_t n, m; mm_idx_t *a; } mm_idx_v;

typedef struct {
    mm_idx_v idx0;
    mm_idx_v idx1;
} shmr_aln_t;

typedef struct { size_t n, m; shmr_aln_t *a; } shmr_aln_v;

shmr_aln_v * shmr_aln( mm128_v *, mm128_v *, uint8_t, uint32_t, uint32_t, uint32_t);

void free_shmr_alns(shmr_aln_v *);

// from mm_sketch.c
void mm_sketch(void *, const char *, int , int, int , uint32_t , int , mm128_v *);

// from shmr_reduce.c
void mm_reduce(mm128_v *, mm128_v *, uint8_t);

""")

ffibuilder.set_source("peregrine._shimmer4py",
                      f"""
                      #include "{basedir}/src/shimmer.h"
                      """,
                      # C sources compiled into the extension module.
                      sources=[f'{basedir}/src/shimmer4py.c',
                               f'{basedir}/src/DWmatch.c',
                               f'{basedir}/src/shmr_align.c',
                               f'{basedir}/src/shmr_utils.c',
                               f'{basedir}/src/shmr_reduce.c',
                               f'{basedir}/src/mm_sketch.c',
                               f'{basedir}/src/kalloc.c'])

if __name__ == "__main__":
    ffibuilder.compile(verbose=True)
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* 27 | An example: 28 | 29 | #include "kvec.h" 30 | int main() { 31 | kvec_t(int) array; 32 | kv_init(array); 33 | kv_push(int, array, 10); // append 34 | kv_a(int, array, 20) = 5; // dynamic 35 | kv_A(array, 20) = 4; // static 36 | kv_destroy(array); 37 | return 0; 38 | } 39 | */ 40 | 41 | /* 42 | 2008-09-22 (0.1.0): 43 | 44 | * The initial version. 45 | 46 | */ 47 | 48 | #ifndef AC_KVEC_H 49 | #define AC_KVEC_H 50 | 51 | #include 52 | #include "kalloc.h" 53 | 54 | #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 55 | 56 | #define kvec_t(type) struct { size_t n, m; type *a; } 57 | #define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) 58 | #define kv_destroy(v) free((v).a) 59 | #define kv_A(v, i) ((v).a[(i)]) 60 | #define kv_pop(v) ((v).a[--(v).n]) 61 | #define kv_size(v) ((v).n) 62 | #define kv_max(v) ((v).m) 63 | 64 | #define kv_resize(type, km, v, s) do { \ 65 | if ((v).m < (s)) { \ 66 | (v).m = (s); \ 67 | kv_roundup32((v).m); \ 68 | (v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \ 69 | } \ 70 | } while (0) 71 | 72 | #define kv_copy(type, km, v1, v0) do { \ 73 | if ((v1).m < (v0).n) kv_resize(type, (km), (v1), (v0).n); \ 74 | (v1).n = (v0).n; \ 75 | memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ 76 | } while (0) \ 77 | 78 | #define kv_push(type, km, v, x) do { \ 79 | if ((v).n == (v).m) { \ 80 | (v).m = (v).m? 
(v).m<<1 : 2; \ 81 | (v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \ 82 | } \ 83 | (v).a[(v).n++] = (x); \ 84 | } while (0) 85 | 86 | #define kv_pushp(type, km, v, p) do { \ 87 | if ((v).n == (v).m) { \ 88 | (v).m = (v).m? (v).m<<1 : 2; \ 89 | (v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \ 90 | } \ 91 | *(p) = &(v).a[(v).n++]; \ 92 | } while (0) 93 | 94 | #define kv_reverse(type, v, start) do { \ 95 | if ((v).m > 0 && (v).n > (start)) { \ 96 | size_t __i, __end = (v).n - (start); \ 97 | type *__a = (v).a + (start); \ 98 | for (__i = 0; __i < __end>>1; ++__i) { \ 99 | type __t = __a[__end - 1 - __i]; \ 100 | __a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \ 101 | } \ 102 | } \ 103 | } while (0) 104 | 105 | #endif 106 | -------------------------------------------------------------------------------- /falcon/kvec.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, by Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* 27 | An example: 28 | 29 | #include "kvec.h" 30 | int main() { 31 | kvec_t(int) array; 32 | kv_init(array); 33 | kv_push(int, array, 10); // append 34 | kv_a(int, array, 20) = 5; // dynamic 35 | kv_A(array, 20) = 4; // static 36 | kv_destroy(array); 37 | return 0; 38 | } 39 | */ 40 | 41 | /* 42 | 2008-09-22 (0.1.0): 43 | 44 | * The initial version. 45 | 46 | */ 47 | 48 | #ifndef AC_KVEC_H 49 | #define AC_KVEC_H 50 | 51 | #include 52 | #include "kalloc.h" 53 | 54 | #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 55 | 56 | #define kvec_t(type) struct { size_t n, m; type *a; } 57 | #define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) 58 | #define kv_destroy(v) free((v).a) 59 | #define kv_A(v, i) ((v).a[(i)]) 60 | #define kv_pop(v) ((v).a[--(v).n]) 61 | #define kv_size(v) ((v).n) 62 | #define kv_max(v) ((v).m) 63 | 64 | #define kv_resize(type, km, v, s) do { \ 65 | if ((v).m < (s)) { \ 66 | (v).m = (s); \ 67 | kv_roundup32((v).m); \ 68 | (v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \ 69 | } \ 70 | } while (0) 71 | 72 | #define kv_copy(type, km, v1, v0) do { \ 73 | if ((v1).m < (v0).n) kv_resize(type, (km), (v1), (v0).n); \ 74 | (v1).n = (v0).n; \ 75 | memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ 76 | } while (0) \ 77 | 78 | #define kv_push(type, km, v, x) do { \ 79 | if ((v).n == (v).m) { \ 80 | (v).m = (v).m? (v).m<<1 : 2; \ 81 | (v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \ 82 | } \ 83 | (v).a[(v).n++] = (x); \ 84 | } while (0) 85 | 86 | #define kv_pushp(type, km, v, p) do { \ 87 | if ((v).n == (v).m) { \ 88 | (v).m = (v).m? 
"""Dump the level-0 minimizer list and report reads that share end minimizers.

Reads ``../test/hmmer-L0-01-of-01.dat`` through ``read_mmlist`` from
``../src/mm_utils.so`` and ``../test/seq_dataset.idx`` (read id, read name,
read length per line), then writes:

  * ``L0.txt``    -- one line per minimizer: read name, end position, end
                     position measured from the other read end, strand,
                     minimizer value as 14 hex digits.
  * ``L0_dt.txt`` -- pairs of reads that each carry, near one of their ends,
                     a minimizer also present in the other read
                     (presumably dovetail-overlap candidates -- confirm).
"""
from cffi import FFI

ffi = FFI()

ffi.cdef("""
typedef struct { uint64_t x, y; } mm128_t;
typedef struct { size_t n, m; mm128_t *a; } mm128_v;
mm128_v read_mmlist(char *);
void free(void *ptr);
""")

C = ffi.dlopen(None)
mm_utils = ffi.dlopen("../src/mm_utils.so")

# A minimizer ending fewer than this many bases from a read end is recorded
# for that end.
END_WINDOW = 250

# Field layout of each mm128_t entry:
#   x = kMer<<8 | kmerSpan
#   y = rid<<32 | lastPos<<1 | strand
# where lastPos is the position of the last base of the minimizer and strand
# tells whether it comes from the top or the bottom strand.

hmmerL0 = mm_utils.read_mmlist(b"../test/hmmer-L0-01-of-01.dat")
try:
    rid2name = {}
    rid2len = {}
    with open("../test/seq_dataset.idx") as f:
        for row in f:
            rid, rname, rlen = row.strip().split()
            rid = int(rid)
            rid2name[rid] = rname
            rid2len[rid] = int(rlen)

    mer_five = {}   # minimizer -> names of reads carrying it near one end
    mer_three = {}  # minimizer -> names of reads carrying it near the other
    with open("L0.txt", "w") as L0dump:
        for i in range(hmmerL0.n):
            entry = hmmerL0.a[i]
            span = entry.x & 0xFF
            mmer = entry.x >> 8
            rid = entry.y >> 32
            pos_end = ((entry.y & 0xFFFFFFFF) >> 1) + 1
            strand = entry.y & 0x1
            mm_str = "{:014X}".format(mmer)
            # The same end position counted from the opposite end of the read.
            r_pos_end = rid2len[rid] - pos_end + span
            name = rid2name[rid]

            if pos_end < END_WINDOW:
                mer_five.setdefault(mmer, []).append(name)
            if r_pos_end < END_WINDOW:
                mer_three.setdefault(mmer, []).append(name)

            print(name, pos_end, r_pos_end,
                  strand, mm_str, file=L0dump)

    # dovetail_end[r0] holds (end, r): read r contains a minimizer that lies
    # near the given end (5 or 3) of read r0.
    dovetail_end = {}
    for i in range(hmmerL0.n):
        entry = hmmerL0.a[i]
        mmer = entry.x >> 8
        rname = rid2name[entry.y >> 32]
        for rname0 in mer_five.get(mmer, ()):
            dovetail_end.setdefault(rname0, set()).add((5, rname))
        for rname0 in mer_three.get(mmer, ()):
            dovetail_end.setdefault(rname0, set()).add((3, rname))

    with open("L0_dt.txt", "w") as dt_file:
        for rname, partners in dovetail_end.items():
            for e, rname0 in partners:
                if rname == rname0:
                    continue
                reverse = dovetail_end.get(rname0, ())
                # Keep the original reporting order: the opposite end first.
                for other_end in ((3, 5) if e == 5 else (5, 3)):
                    if (other_end, rname) in reverse:
                        print(e, rname0, other_end, rname, file=dt_file)
finally:
    # The array was allocated by the C library; release it even on failure.
    C.free(hmmerL0.a)
(((uint64_t)rid0) << 32) | ((uint64_t)rid1) 40 | : (((uint64_t)rid1) << 32) | ((uint64_t)rid0); 41 | k = kh_get(RPAIR, rid_pairs, ridp); 42 | if (k == kh_end(rid_pairs)) { 43 | uint32_t pos0 = (uint32_t)((ovlp.y0 & 0xFFFFFFFF) >> 1) + 1; 44 | uint32_t rlen0 = ovlp.rl0; 45 | uint8_t strand0 = ovlp.strand0; 46 | 47 | uint32_t pos1 = (uint32_t)((ovlp.y1 & 0xFFFFFFFF) >> 1) + 1; 48 | uint32_t rlen1 = ovlp.rl1; 49 | uint8_t strand1 = ovlp.strand1; 50 | 51 | ovlp_match_t match = ovlp.match; 52 | /* Dump raw alignment results for debugging */ 53 | /* 54 | fprintf(stdout,"X %09d %u %u %d %d %d %09d %u %u %d %d %d %d %d %u\n", 55 | rid0, pos0, strand0, match.q_bgn, match.q_end, rlen0, 56 | rid1, pos1, strand1, match.t_bgn, match.t_end, rlen1, 57 | match.m_size, match.dist, ovlp.ovlp_type); 58 | */ 59 | seq_coor_t q_bgn, q_end, t_bgn, t_end; 60 | q_bgn = match.q_bgn; 61 | q_end = match.q_end; 62 | t_bgn = match.t_bgn; 63 | t_end = match.t_end; 64 | q_bgn -= t_bgn; 65 | t_bgn = 0; 66 | if (strand0 == ORIGINAL) { 67 | a_bgn = (seq_coor_t)(pos0 - pos1) + q_bgn; 68 | a_end = (seq_coor_t)(pos0 - pos1) + q_end; 69 | a_bgn = a_bgn < 0 ? 0 : a_bgn; // this ad-hoc fix, read should be 70 | // stiched by alignment 71 | a_end = a_end >= rlen0 ? rlen0 : a_end; 72 | } else { 73 | q_bgn -= t_bgn; 74 | t_bgn = 0; 75 | a_bgn = (seq_coor_t)rlen0 - (seq_coor_t)(pos0 - pos1) - q_end; 76 | a_end = (seq_coor_t)rlen0 - (seq_coor_t)(pos0 - pos1) - q_bgn; 77 | a_bgn = a_bgn < 0 ? 0 : a_bgn; // this ad-hoc fix 78 | a_end = a_end >= rlen0 ? rlen0 : a_end; 79 | } 80 | if (strand1 == ORIGINAL) { 81 | b_bgn = t_bgn; 82 | b_end = t_end; 83 | b_bgn = b_bgn < 0 ? 0 : b_bgn; // this ad-hoc fix 84 | b_end = b_end >= rlen1 ? rlen1 : b_end; 85 | } else { 86 | b_bgn = (seq_coor_t)rlen1 - t_end; 87 | b_end = (seq_coor_t)rlen1 - t_bgn; 88 | b_bgn = b_bgn < 0 ? 0 : b_bgn; // this ad-hoc fix 89 | b_end = b_end >= rlen1 ? 
rlen1 : b_end; 90 | } 91 | double err_est; 92 | err_est = 100.0 - 100.0 * (double)(match.dist) / (double)(match.m_size); 93 | fprintf(stdout, "%09d %09d %d %0.1f %u %d %d %u %u %d %d %u %s\n", rid0, 94 | rid1, -(match.m_size), err_est, ORIGINAL, a_bgn, a_end, rlen0, 95 | (strand0 == ORIGINAL ? strand1 : 1 - strand1), b_bgn, b_end, 96 | rlen1, 97 | ovlp.ovlp_type == OVERLAP 98 | ? "overlap" 99 | : (ovlp.ovlp_type == CONTAINS ? "contains" : "contained")); 100 | kh_put(RPAIR, rid_pairs, ridp, &absent); 101 | } 102 | } 103 | kh_destroy(RPAIR, rid_pairs); 104 | } 105 | -------------------------------------------------------------------------------- /src/shmr_mkseqdb.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "kseq.h" 8 | #include "shimmer.h" 9 | 10 | KSEQ_INIT(gzFile, gzread); 11 | 12 | extern char *optarg; 13 | extern int optind, opterr, optopt; 14 | 15 | int main(int argc, char *argv[]) { 16 | gzFile fp; 17 | FILE *seq_dataset_file; 18 | FILE *index_file; 19 | FILE *seqdb_file; 20 | kseq_t *seq; 21 | uint32_t rid; 22 | char *seq_dataset_path = NULL; 23 | char *seqdb_prefix = NULL; 24 | char index_fn[8192]; 25 | char seqdb_fn[8192]; 26 | char fn[8192]; 27 | int l, c; 28 | 29 | opterr = 0; 30 | 31 | while ((c = getopt(argc, argv, "d:p:")) != -1) { 32 | switch (c) { 33 | case 'd': 34 | seq_dataset_path = optarg; 35 | break; 36 | case 'p': 37 | seqdb_prefix = optarg; 38 | break; 39 | 40 | case '?': 41 | if (optopt == 'd') { 42 | fprintf(stderr, 43 | "Option -%c not specified, using 'seq_dataset.lst' as the " 44 | "input file\n", 45 | optopt); 46 | } else if (optopt == 'p') { 47 | fprintf(stderr, 48 | "Option -%c not specified, using 'seq_dataset' as the output " 49 | "prefix\n", 50 | optopt); 51 | } else { 52 | fprintf( 53 | stderr, 54 | "Usage: shmr_mkseqdb -d seq_dataset.lst -p seq_dataset_prefix\n"); 55 | } 56 | return 1; 57 | default: 58 
| abort(); 59 | } 60 | } 61 | 62 | if (seq_dataset_path == NULL) { 63 | seq_dataset_path = (char *)calloc(8192, 1); 64 | snprintf(seq_dataset_path, 8191, "seq_dataset.lst"); 65 | } 66 | 67 | if (seqdb_prefix == NULL) { 68 | seqdb_prefix = (char *)calloc(8192, 1); 69 | snprintf(seqdb_prefix, 8191, "seq_dataset"); 70 | } 71 | 72 | seq_dataset_file = fopen(seq_dataset_path, "r"); 73 | printf("input sequence dataset file list: '%s'\n", seq_dataset_path); 74 | if (!seq_dataset_file) { 75 | fprintf(stderr, "file '%s' open error: %s\n", seq_dataset_path, 76 | strerror(errno)); 77 | exit(1); 78 | } 79 | 80 | int written; 81 | written = snprintf(index_fn, sizeof(index_fn), "%s.idx", seqdb_prefix); 82 | assert(written < sizeof(index_fn)); 83 | printf("output index file: %s\n", index_fn); 84 | index_file = fopen(index_fn, "w"); // use text file for now 85 | if (!index_file) { 86 | fprintf(stderr, "file '%s' open error: %s\n", index_fn, strerror(errno)); 87 | exit(1); 88 | } 89 | 90 | written = snprintf(seqdb_fn, sizeof(index_fn), "%s.seqdb", seqdb_prefix); 91 | assert(written < sizeof(seqdb_fn)); 92 | printf("output seqdb file: %s\n", index_fn); 93 | seqdb_file = fopen(seqdb_fn, "wb"); // use text file for now 94 | if (!index_file) { 95 | fprintf(stderr, "file '%s' open error: %s\n", index_fn, strerror(errno)); 96 | exit(1); 97 | } 98 | 99 | rid = 0; 100 | size_t offset = 0; 101 | while (fscanf(seq_dataset_file, "%s", fn) != EOF) { 102 | fp = gzopen(fn, "r"); 103 | if (!fp) { 104 | fprintf(stderr, "file '%s' open error: %s\n", fn, strerror(errno)); 105 | exit(1); 106 | } 107 | seq = kseq_init(fp); 108 | while ((l = kseq_read(seq)) >= 0) { 109 | uint8_t *encoded; 110 | encoded = malloc(seq->seq.l); 111 | encode_biseq(encoded, seq->seq.s, seq->seq.l); 112 | fprintf(index_file, "%09d %s %u %lu\n", rid, seq->name.s, seq->seq.l, 113 | offset); 114 | fwrite(encoded, sizeof(uint8_t), seq->seq.l, seqdb_file); 115 | rid += 1; 116 | offset += seq->seq.l; 117 | free(encoded); 118 | } 
119 | kseq_destroy(seq); 120 | gzclose(fp); 121 | } 122 | fclose(seq_dataset_file); 123 | fclose(index_file); 124 | fclose(seqdb_file); 125 | if (!seq_dataset_path) free(seq_dataset_path); 126 | if (!seqdb_prefix) free(seqdb_prefix); 127 | return 0; 128 | } 129 | -------------------------------------------------------------------------------- /src/shimmer.h: -------------------------------------------------------------------------------- 1 | #ifndef SHIMMER_H 2 | #define SHIMMER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "khash.h" 9 | #include "kvec.h" 10 | 11 | #define ORIGINAL 0 12 | #define REVERSED 1 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif 17 | 18 | void encode_biseq(uint8_t *, char *, size_t); 19 | 20 | void decode_biseq(uint8_t *, char *, size_t, uint8_t); 21 | 22 | void reverse_complement(char *, size_t); 23 | 24 | typedef struct { 25 | uint64_t x, y; 26 | } mm128_t; 27 | typedef struct { 28 | size_t n, m; 29 | mm128_t *a; 30 | } mm128_v; 31 | void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, 32 | int is_hpc, mm128_v *p); 33 | void mm_reduce(mm128_v *, mm128_v *, uint8_t); 34 | uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); 35 | 36 | void write_mmlist(char *, mm128_v *); 37 | mm128_v read_mmlist(char *); 38 | void append_mmlist(mm128_v *, mm128_v *); 39 | 40 | typedef struct { 41 | char *name; 42 | uint32_t rid; 43 | } seq_data_t; 44 | typedef struct { 45 | size_t n, m; 46 | seq_data_t *a; 47 | } seq_data_v; 48 | 49 | typedef struct { 50 | uint32_t len; 51 | size_t offset; 52 | } rl_t; 53 | KHASH_MAP_INIT_STR(RID, uint32_t); 54 | KHASH_MAP_INIT_INT(RLEN, rl_t); 55 | khash_t(RID) * build_read_index(char *, seq_data_v *, khash_t(RLEN) *); 56 | khash_t(RLEN) * get_read_length_map(char *); 57 | 58 | void mm_end_filter(mm128_v *, mm128_v *, mm128_v *, khash_t(RLEN) *, uint32_t); 59 | 60 | KHASH_MAP_INIT_INT64(MMC, uint32_t); 61 | typedef struct { 62 | uint64_t mer; 63 | 
uint32_t count; 64 | } mm_count_t; 65 | typedef struct { 66 | size_t n, m; 67 | mm_count_t *a; 68 | } mm_count_v; 69 | void mm_count(mm128_v *, khash_t(MMC) *, mm_count_v *); 70 | void write_mm_count(char *, mm_count_v *); 71 | void mm_count_to_vec(khash_t(MMC) *, mm_count_v *); 72 | mm_count_v read_mm_count(char *fn); 73 | 74 | void aggregate_mm_count(khash_t(MMC) *, mm_count_v *); 75 | 76 | typedef struct { 77 | uint64_t y0, y1; 78 | uint8_t direction; 79 | } mp128_t; 80 | typedef struct { 81 | size_t n, m; 82 | mp128_t *a; 83 | } mp128_v; 84 | KHASH_MAP_INIT_INT64(MMER1, mp128_v *); 85 | typedef khash_t(MMER1) * mmert1_p_t; 86 | KHASH_MAP_INIT_INT64(MMER0, mmert1_p_t); 87 | 88 | void build_map(mm128_v *, khash_t(MMER0) *, khash_t(RLEN) *, khash_t(MMC) *, 89 | uint32_t, uint32_t, uint32_t, uint32_t); 90 | 91 | char *get_read_seq(FILE *, uint32_t, khash_t(RLEN) *); 92 | uint8_t *get_read_seq_mmap_ptr(uint8_t *, uint32_t, khash_t(RLEN) *); 93 | 94 | // For DWmatch 95 | typedef int32_t seq_coor_t; 96 | 97 | typedef struct { 98 | seq_coor_t m_size, dist; 99 | seq_coor_t q_bgn, q_end; 100 | seq_coor_t t_bgn, t_end; 101 | seq_coor_t t_m_end, q_m_end; 102 | } ovlp_match_t; 103 | 104 | typedef struct { 105 | uint64_t y0, y1; 106 | uint32_t rl0, rl1; 107 | uint8_t strand0, strand1; 108 | uint8_t ovlp_type; 109 | ovlp_match_t match; 110 | } ovlp_t; 111 | 112 | typedef struct { 113 | seq_coor_t s1, e1; 114 | seq_coor_t s2, e2; 115 | long int score; 116 | } match_range; 117 | 118 | ovlp_match_t *ovlp_match(uint8_t *, seq_coor_t, uint8_t, uint8_t *, seq_coor_t, 119 | uint8_t, seq_coor_t); 120 | 121 | void free_ovlp_match(ovlp_match_t *); 122 | 123 | typedef struct { 124 | uint64_t x0, x1, y0, y1; 125 | uint8_t direction; 126 | } mp256_t; 127 | typedef struct { 128 | size_t n, m; 129 | mp256_t *a; 130 | } mp256_v; 131 | 132 | typedef struct { 133 | mm128_v *mmers; 134 | void *mmer0_map; 135 | void *rlmap; 136 | void *mcmap; 137 | void *ridmm; 138 | } py_mmer_t; 139 | 140 | // 
#!/usr/bin/env python3
"""Stitch the reads of a tiling path into contigs, written as FASTA to stdout.

Usage: path_to_contig.py <seqdb_prefix> <tiling_path_file>

``<seqdb_prefix>.seqdb`` and ``<seqdb_prefix>.idx`` are the read database and
its text index (columns: rid, name, length, offset).  Every tiling-path row has
ten whitespace-separated columns; the first is the contig id and rows sharing
an id form one contig, in file order.
"""

import mmap
import sys
import numpy
from peregrine._shimmer4py import ffi, lib

basemap = {1:"A",2:"C",4:"G",8:"T"}
stitching_overhang_size = 500


def _load_read_index(path):
    """Return {rid: {"name", "length", "offset"}} parsed from the .idx file."""
    read_idx = {}
    with open(path) as idx_file:
        for line in idx_file:
            rid, rname, rlen, offset = line.strip().split()
            read_idx[int(rid)] = {"name": rname,
                                  "length": int(rlen),
                                  "offset": int(offset)}
    return read_idx


def _load_tiling_paths(path):
    """Return {contig id: [row, ...]} with rows kept in file order."""
    tiling_path_data = {}
    with open(path) as path_file:
        for line in path_file:
            row = line.strip().split()
            tiling_path_data.setdefault(row[0], []).append(row)
    return tiling_path_data


def _fetch_read(seqdb, read_idx, node):
    """Return (encoded bytes, length, strand) for a "rid:E" / "rid:<other>" node."""
    fields = node.split(":")
    entry = read_idx[int(fields[0])]
    start = entry["offset"]
    length = entry["length"]
    strand = 0 if fields[1] == "E" else 1
    return seqdb[start:start + length], length, strand


def _decode(bseq, length, strand):
    """Decode `length` encoded bases into a freshly allocated cffi char array."""
    seq = ffi.new("char[{}]".format(length))
    lib.decode_biseq(bseq, seq, length, strand)
    return seq


def _build_contig(seqdb, read_idx, rows):
    """Return the stitched sequence (str) for one contig's tiling-path rows."""
    segments = []  # (start offset in the contig, decoded cffi char array)

    # I don't like to have the first read as it breaks the string formulation,
    # but people like it for no reason, so I will just do it
    ctg_id, v, w, r, s, e, olen, idt, _1, _2 = rows[0]
    bseq0, slen0, strand0 = _fetch_read(seqdb, read_idx, v)
    seq = _decode(bseq0, slen0, strand0)
    ctg_len = len(seq)
    segments.append((0, seq))

    for row in rows:
        ctg_id, v, w, r, s, e, olen, idt, _1, _2 = row
        s = int(s)
        e = int(e)
        bseq0, slen0, strand0 = _fetch_read(seqdb, read_idx, v)
        bseq1, slen1, strand1 = _fetch_read(seqdb, read_idx, w)

        # NOTE(review): both offsets go negative when a read is shorter than
        # the overhang (or than the edge span plus the overhang); the slices
        # then no longer match the lengths handed to ovlp_match -- confirm
        # that such inputs cannot occur.
        offset1 = slen0 - stitching_overhang_size
        offset2 = slen1 - abs(e - s) - stitching_overhang_size
        match = lib.ovlp_match(bseq0[offset1:], slen0 - offset1, strand0,
                               bseq1[offset2:], slen1 - offset2, strand1,
                               100)
        # Copy out what is needed so the C result can be released right away.
        q_m_end = match.q_m_end
        t_m_end = match.t_m_end
        lib.free_ovlp_match(match)

        if strand1 == 1:
            s, e = slen1 - s, slen1 - e
        assert(e > s)
        seg_size = e - s + stitching_overhang_size - t_m_end
        seq = _decode(bseq1[e - seg_size:e], seg_size, strand1)
        segments.append((ctg_len - stitching_overhang_size + q_m_end, seq))
        ctg_len -= (stitching_overhang_size - q_m_end)
        ctg_len += (stitching_overhang_size - t_m_end) + e - s

    # Positions not covered by any segment stay 'N'.
    ctg_str = numpy.full(ctg_len, ord('N'), dtype=numpy.byte)
    for start, seq in segments:
        bases = ffi.string(seq)
        ctg_str[start:start + len(bases)] = list(bases)
        ffi.release(seq)
    return ctg_str.tobytes().decode("ascii")


def main(argv):
    """Write every contig of the tiling path file to stdout as FASTA."""
    if len(argv) < 3:
        sys.exit("usage: {} <seqdb_prefix> <tiling_path_file>".format(argv[0]))
    seqdb_prefix = argv[1]
    tiling_path_fn = argv[2]

    with open("{}.seqdb".format(seqdb_prefix), "rb") as seqdb_file:
        seqdb = mmap.mmap(seqdb_file.fileno(), 0,
                          flags=mmap.MAP_SHARED, prot=mmap.PROT_READ)
        try:
            read_idx = _load_read_index("{}.idx".format(seqdb_prefix))
            tiling_path_data = _load_tiling_paths(tiling_path_fn)
            for ctg, rows in tiling_path_data.items():
                contig = _build_contig(seqdb, read_idx, rows)
                print(">{}".format(ctg))
                print(contig)
        finally:
            seqdb.close()


if __name__ == "__main__":
    main(sys.argv)
c_minimizer.pos = 0.uint32 118 | c_minimizer.kmer = 0xFFFFFFFF.uint32 119 | w_start = 0 120 | w_end = ws 121 | for pos in w_start ..< w_end: 122 | if mmer_seq[pos.int] < c_minimizer.kmer: 123 | c_minimizer.pos = pos 124 | c_minimizer.kmer = mmer_seq[pos.int] 125 | echo "0 ", c_minimizer.pos, " ", c_minimizer.kmer, " ", decode_hash(c_minimizer.kmer) 126 | 127 | for pos in ws ..< dna_seq.len.uint32 - ksize: 128 | # echo "X ", pos.int, " ", mmer_seq[pos.int], " ", c_minimizer.kmer 129 | if mmer_seq[pos.int] < c_minimizer.kmer: 130 | c_minimizer.pos = pos 131 | c_minimizer.kmer = mmer_seq[pos.int] 132 | echo "1 ",c_minimizer.pos, " ", c_minimizer.kmer, " ", decode_hash(c_minimizer.kmer) 133 | continue 134 | 135 | if pos.int - c_minimizer.pos.int >= ws.int: 136 | w_start = c_minimizer.pos + 1 137 | w_end = w_start + ws 138 | c_minimizer.kmer = 0xFFFFFFFF.uint32 139 | for pos2 in w_start ..< w_end: 140 | if mmer_seq[pos2.int] < c_minimizer.kmer: 141 | c_minimizer.pos = pos2 142 | c_minimizer.kmer = mmer_seq[pos2.int] 143 | echo "2 ", c_minimizer.pos, " ", c_minimizer.kmer, " ", decode_hash(c_minimizer.kmer) 144 | 145 | 146 | var 147 | dna_seq: string 148 | seq_name: string 149 | 150 | 151 | if not isNil(fs): 152 | while fs.readLine(line): 153 | if line[0] == '>': 154 | if not isNil(seq_name): 155 | if dna_seq.len < ws.int: 156 | seq_name = line.strip 157 | dna_seq = "" 158 | continue 159 | echo seq_name, "|", "n" 160 | find_minimizers(dna_seq) 161 | rc_DNA_seq(dna_seq) 162 | echo seq_name, "|", "c" 163 | find_minimizers(dna_seq) 164 | seq_name = line.strip 165 | dna_seq = "" 166 | continue 167 | 168 | if line[0] != '>': 169 | dna_seq.add(line.strip) 170 | 171 | echo seq_name, "|", "n" 172 | find_minimizers(dna_seq) 173 | rc_DNA_seq(dna_seq) 174 | echo seq_name, "|", "r" 175 | find_minimizers(dna_seq) 176 | fs.close() 177 | -------------------------------------------------------------------------------- /src/shmr_align.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "kalloc.h" 11 | #include "khash.h" 12 | #include "kvec.h" 13 | #include "shimmer.h" 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #define MAX_SMALL_ALNS 4800 20 | 21 | shmr_aln_v *shmr_aln(mm128_v *mmers0, mm128_v *mmers1, uint8_t direction, 22 | uint32_t max_diff, uint32_t max_dist, 23 | uint32_t max_repeat) { 24 | /* generate a list of co-aligned mimimizer from two 25 | * minimizer lists: mv1 and mv2 26 | */ 27 | uint64_t mhash; 28 | mm128_t mmer0, mmer1; 29 | khash_t(MMIDX) *mmidx_map = kh_init(MMIDX); 30 | shmr_aln_v *alns; 31 | khiter_t k; 32 | int32_t absent; 33 | 34 | mm_idx_v *idx_tmp; 35 | 36 | alns = calloc(sizeof(shmr_aln_v), 1); 37 | 38 | mm_idx_t s = 0; 39 | /* build a hasmap from mhash to the index of the minimizer array */ 40 | for (;;) { 41 | if (s >= mmers0->n) break; 42 | mmer0 = mmers0->a[s]; 43 | mhash = mmer0.x >> 8; 44 | 45 | k = kh_put(MMIDX, mmidx_map, mhash, &absent); 46 | if (absent) { 47 | idx_tmp = calloc(sizeof(mm_idx_v), 1); 48 | kv_push(mm_idx_t, 0, *idx_tmp, s); 49 | kh_val(mmidx_map, k) = idx_tmp; 50 | } else { 51 | k = kh_get(MMIDX, mmidx_map, mhash); 52 | assert(k != kh_end(mmidx_map)); 53 | idx_tmp = kh_val(mmidx_map, k); 54 | kv_push(mm_idx_t, 0, *idx_tmp, s); 55 | } 56 | s++; 57 | } 58 | 59 | /* loop through 2nd shimmer list to build alginements */ 60 | mm_idx_t ss = 0; 61 | uint32_t small_aln_count = 0; 62 | for (;;) { 63 | if (ss >= mmers1->n) break; 64 | if (direction == 1) { // reversed 65 | s = mmers1->n - ss; 66 | } else { 67 | s = ss; 68 | } 69 | mmer1 = mmers1->a[s]; 70 | mhash = mmer1.x >> 8; 71 | k = kh_get(MMIDX, mmidx_map, mhash); 72 | if (k == kh_end(mmidx_map)) { 73 | ss++; 74 | continue; 75 | } 76 | idx_tmp = kh_val(mmidx_map, k); 77 | if (idx_tmp->n > max_repeat) { 78 | ss++; 79 | continue; 80 
| } 81 | 82 | for (uint32_t i = 0; i < idx_tmp->n; i++) { 83 | mmer0 = mmers0->a[idx_tmp->a[i]]; 84 | int64_t delta0, delta1; 85 | int64_t mm_dist; 86 | if (direction == 0 && (mmer0.y & 0x1) != (mmer1.y & 0x1)) { 87 | continue; 88 | } 89 | 90 | if (direction == 1 && (mmer0.y & 0x1) == (mmer1.y & 0x1)) { 91 | continue; 92 | } 93 | 94 | if (direction == 1) { 95 | delta0 = abs(mmer_pos(&mmer0) + mmer_pos(&mmer1)); 96 | } else { 97 | delta0 = abs(mmer_pos(&mmer0) - mmer_pos(&mmer1)); 98 | } 99 | uint32_t best_aln_idx = UINT32_MAX; 100 | double min_diff = max_diff; 101 | uint8_t best_found = 0; 102 | small_aln_count = 0; 103 | for (uint32_t aln_idx = 0; aln_idx < alns->n; aln_idx++) { 104 | mm128_t m0, m1; 105 | shmr_aln_t *aln; 106 | size_t n; 107 | aln = alns->a + aln_idx; 108 | n = aln->idx0.n; 109 | 110 | if (n < 3) small_aln_count++; 111 | 112 | if (idx_tmp->a[i] < aln->idx0.a[n - 1]) continue; 113 | 114 | m0 = mmers0->a[aln->idx0.a[n - 1]]; 115 | m1 = mmers1->a[aln->idx1.a[n - 1]]; 116 | 117 | mm_dist = abs(mmer_pos(&mmer0) - mmer_pos(&m0)); 118 | if (mm_dist >= max_dist) continue; 119 | 120 | if (direction == 1) { 121 | delta1 = abs(mmer_pos(&m0) + mmer_pos(&m1)); 122 | } else { 123 | delta1 = abs(mmer_pos(&m0) - mmer_pos(&m1)); 124 | } 125 | // double diff = (double) abs(delta0 - delta1) / (double) (mm_dist); 126 | uint32_t diff = (uint32_t)abs((int32_t)delta0 - (int32_t)delta1); 127 | if (diff < max_diff && diff < min_diff && mm_dist < max_dist) { 128 | min_diff = diff; 129 | best_aln_idx = aln_idx; 130 | best_found = 1; 131 | } 132 | } 133 | if (best_found == 1) { 134 | shmr_aln_t *aln; 135 | aln = alns->a + best_aln_idx; 136 | kv_push(mm_idx_t, 0, aln->idx0, idx_tmp->a[i]); 137 | kv_push(mm_idx_t, 0, aln->idx1, s); 138 | // printf("best %d %d %d\n", best_aln_idx, idx_tmp->a[i], s); 139 | } else { 140 | shmr_aln_t *aln; 141 | aln = calloc(sizeof(shmr_aln_t), 1); 142 | kv_push(mm_idx_t, 0, aln->idx0, idx_tmp->a[i]); 143 | kv_push(mm_idx_t, 0, aln->idx1, s); 144 
| kv_push(shmr_aln_t, 0, *alns, *aln); 145 | // printf("new %d %d %d\n", best_aln_idx, idx_tmp->a[i], s); 146 | } 147 | } 148 | ss++; 149 | if (small_aln_count > MAX_SMALL_ALNS) break; 150 | } 151 | 152 | for (khiter_t __i = kh_begin(mmidx_map); __i != kh_end(mmidx_map); ++__i) { 153 | if (!kh_exist(mmidx_map, __i)) continue; 154 | idx_tmp = kh_val(mmidx_map, __i); 155 | kv_destroy(*idx_tmp); 156 | free(idx_tmp); 157 | } 158 | kh_destroy(MMIDX, mmidx_map); 159 | return alns; 160 | } 161 | 162 | void free_shmr_alns(shmr_aln_v *alns) { 163 | for (uint32_t aln_idx = 0; aln_idx < alns->n; aln_idx++) { 164 | kv_destroy(alns->a[aln_idx].idx0); 165 | kv_destroy(alns->a[aln_idx].idx1); 166 | } 167 | kv_destroy(*alns); 168 | free(alns); 169 | } 170 | -------------------------------------------------------------------------------- /falcon/common.h: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * ===================================================================================== 4 | * 5 | * Filename: common.h 6 | * 7 | * Description: Common delclaration for the code base 8 | * 9 | * Version: 0.1 10 | * Created: 07/16/2013 07:46:23 AM 11 | * Revision: none 12 | * Compiler: gcc 13 | * 14 | * Author: Jason Chin, 15 | * Company: 16 | * 17 | * ===================================================================================== 18 | 19 | #################################################################################$$ 20 | # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc. 21 | # 22 | # All rights reserved. 23 | # 24 | # Redistribution and use in source and binary forms, with or without 25 | # modification, are permitted (subject to the limitations in the 26 | # disclaimer below) provided that the following conditions are met: 27 | # 28 | # * Redistributions of source code must retain the above copyright 29 | # notice, this list of conditions and the following disclaimer. 
30 | # 31 | # * Redistributions in binary form must reproduce the above 32 | # copyright notice, this list of conditions and the following 33 | # disclaimer in the documentation and/or other materials provided 34 | # with the distribution. 35 | # 36 | # * Neither the name of Pacific Biosciences nor the names of its 37 | # contributors may be used to endorse or promote products derived 38 | # from this software without specific prior written permission. 39 | # 40 | # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE 41 | # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC 42 | # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 43 | # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 44 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 45 | # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS 46 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 47 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 48 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 49 | # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 50 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 51 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 52 | # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 53 | # SUCH DAMAGE. 
54 | #################################################################################$$ 55 | */ 56 | 57 | #include 58 | 59 | typedef int seq_coor_t; 60 | 61 | typedef struct { 62 | seq_coor_t aln_str_size ; 63 | seq_coor_t dist ; 64 | seq_coor_t aln_q_s; 65 | seq_coor_t aln_q_e; 66 | seq_coor_t aln_t_s; 67 | seq_coor_t aln_t_e; 68 | char * q_aln_str; 69 | char * t_aln_str; 70 | 71 | } alignment; 72 | 73 | 74 | typedef struct { 75 | seq_coor_t pre_k; 76 | seq_coor_t x1; 77 | seq_coor_t y1; 78 | seq_coor_t x2; 79 | seq_coor_t y2; 80 | } d_path_data; 81 | 82 | typedef struct { 83 | seq_coor_t d; 84 | seq_coor_t k; 85 | seq_coor_t pre_k; 86 | seq_coor_t x1; 87 | seq_coor_t y1; 88 | seq_coor_t x2; 89 | seq_coor_t y2; 90 | } d_path_data2; 91 | 92 | typedef struct { 93 | seq_coor_t x; 94 | seq_coor_t y; 95 | } path_point; 96 | 97 | typedef struct { 98 | seq_coor_t start; 99 | seq_coor_t last; 100 | seq_coor_t count; 101 | } kmer_lookup; 102 | 103 | typedef unsigned char base; 104 | typedef base * seq_array; 105 | typedef seq_coor_t seq_addr; 106 | typedef seq_addr * seq_addr_array; 107 | 108 | 109 | typedef struct { 110 | seq_coor_t count; 111 | seq_coor_t * query_pos; 112 | seq_coor_t * target_pos; 113 | } kmer_match; 114 | 115 | 116 | typedef struct { 117 | seq_coor_t s1; 118 | seq_coor_t e1; 119 | seq_coor_t s2; 120 | seq_coor_t e2; 121 | long int score; 122 | } aln_range; 123 | 124 | 125 | typedef struct { 126 | char * sequence; 127 | uint8_t * eqv; 128 | } consensus_data; 129 | 130 | kmer_lookup * allocate_kmer_lookup (seq_coor_t); 131 | void init_kmer_lookup ( kmer_lookup *, seq_coor_t ); 132 | void free_kmer_lookup(kmer_lookup *); 133 | 134 | seq_array allocate_seq(seq_coor_t); 135 | void init_seq_array( seq_array, seq_coor_t); 136 | void free_seq_array(seq_array); 137 | 138 | seq_addr_array allocate_seq_addr(seq_coor_t size); 139 | 140 | void free_seq_addr_array(seq_addr_array); 141 | 142 | 143 | aln_range * find_best_aln_range(kmer_match *, 144 | seq_coor_t, 145 
| seq_coor_t, 146 | seq_coor_t); 147 | 148 | void free_aln_range( aln_range *); 149 | 150 | kmer_match * find_kmer_pos_for_seq( char *, 151 | seq_coor_t, 152 | unsigned int K, 153 | seq_addr_array, 154 | kmer_lookup * ); 155 | 156 | void free_kmer_match( kmer_match * ptr); 157 | void free_kmer_lookup(kmer_lookup * ); 158 | 159 | 160 | 161 | void add_sequence( seq_coor_t, 162 | unsigned int, 163 | char *, 164 | seq_coor_t, 165 | seq_addr_array, 166 | seq_array, 167 | kmer_lookup *); 168 | 169 | void mask_k_mer(seq_coor_t, kmer_lookup *, seq_coor_t); 170 | 171 | alignment * align(char *, seq_coor_t, 172 | char *, seq_coor_t, 173 | seq_coor_t, 174 | int); 175 | 176 | void free_alignment(alignment *); 177 | 178 | void free_consensus_data(consensus_data *); 179 | 180 | -------------------------------------------------------------------------------- /src/shimmer4py.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "kalloc.h" 10 | #include "khash.h" 11 | #include "kvec.h" 12 | #include "shimmer.h" 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | extern char *optarg; 19 | extern int optind, opterr, optopt; 20 | 21 | #define handle_error(msg) \ 22 | do { \ 23 | perror(msg); \ 24 | exit(EXIT_FAILURE); \ 25 | } while (0) 26 | 27 | #define MMER_COUNT_LOWER_BOUND 2 28 | #define MMER_COUNT_UPPER_BOUND 240 29 | #define ORIGINAL 0 30 | #define REVERSED 1 31 | #define READ_END_FUZZINESS 48 32 | #define LOCAL_OVERLAP_UPPERBOUND 120 33 | #define OVERLAP 0 34 | #define CONTAINMENT 1 35 | 36 | KHASH_MAP_INIT_INT64(RPAIR, uint8_t); 37 | 38 | int mp128_comp(const void *a, const void *b) { 39 | mp128_t *a0 = (mp128_t *)a; 40 | mp128_t *b0 = (mp128_t *)b; 41 | return ((a0->y0 & 0xFFFFFFFF) >> 1) < ((b0->y0 & 0xFFFFFFFF) >> 1); 42 | } 43 | 44 | void build_shimmer_map4py(py_mmer_t *py_mmer, char *seqdb_prefix, 45 | char 
*shimmer_prefix, uint32_t mychunk, 46 | uint32_t total_chunk, uint32_t lowerbound, 47 | uint32_t upperbound) { 48 | char mmc_file_path[8192]; 49 | char mmer_file_path[8192]; 50 | char seq_idx_file_path[8192]; 51 | char seqdb_file_path[8291]; 52 | 53 | wordexp_t p; 54 | char **mmc_fns; 55 | char **shimmer_fns; 56 | 57 | mm128_v mmers_; 58 | mm_count_v mmc; 59 | 60 | khash_t(RLEN) * rlmap_; 61 | khash_t(MMC) *mcmap_ = kh_init(MMC); 62 | khash_t(MMER0) * mmer0_map_; 63 | khash_t(RIDMM) *ridmm_ = kh_init(RIDMM); 64 | 65 | assert(total_chunk > 0); 66 | assert(mychunk > 0 && mychunk <= total_chunk); 67 | 68 | if (seqdb_prefix == NULL) { 69 | seqdb_prefix = (char *)calloc(8192, 1); 70 | snprintf(seqdb_prefix, 8191, "seq_dataset"); 71 | } 72 | 73 | if (shimmer_prefix == NULL) { 74 | seqdb_prefix = (char *)calloc(8192, 1); 75 | snprintf(shimmer_prefix, 8191, "shimmer-L2"); 76 | } 77 | 78 | int written; 79 | written = snprintf(seq_idx_file_path, sizeof(seq_idx_file_path), "%s.idx", 80 | seqdb_prefix); 81 | assert(written < sizeof(seq_idx_file_path)); 82 | fprintf(stderr, "using index file: %s\n", seq_idx_file_path); 83 | 84 | rlmap_ = get_read_length_map(seq_idx_file_path); 85 | 86 | written = snprintf(seqdb_file_path, sizeof(seqdb_file_path), "%s.seqdb", 87 | seqdb_prefix); 88 | assert(written < sizeof(seqdb_file_path)); 89 | fprintf(stderr, "using seqdb file: %s\n", seqdb_file_path); 90 | 91 | py_mmer->mmers = malloc(sizeof(mm128_v)); 92 | py_mmer->mmers->n = 0; 93 | py_mmer->mmers->m = 0; 94 | py_mmer->mmers->a = 0; 95 | 96 | written = snprintf(mmer_file_path, sizeof(mmer_file_path), 97 | "%s-[0-9]*-of-[0-9]*.dat", shimmer_prefix); 98 | assert(written < sizeof(mmer_file_path)); 99 | wordexp(mmer_file_path, &p, 0); 100 | shimmer_fns = p.we_wordv; 101 | for (uint8_t i = 0; i < p.we_wordc; i++) { 102 | fprintf(stderr, "using shimmer data file: %s\n", shimmer_fns[i]); 103 | mmers_ = read_mmlist(shimmer_fns[i]); 104 | append_mmlist(py_mmer->mmers, &mmers_); 105 | 
kv_destroy(mmers_); 106 | } 107 | wordfree(&p); 108 | get_ridmm(ridmm_, py_mmer->mmers); 109 | 110 | written = snprintf(mmc_file_path, sizeof(mmc_file_path), 111 | "%s-MC-[0-9]*-of-[0-9]*.dat", shimmer_prefix); 112 | assert(written < sizeof(mmc_file_path)); 113 | wordexp(mmc_file_path, &p, 0); 114 | mmc_fns = p.we_wordv; 115 | for (uint8_t i = 0; i < p.we_wordc; i++) { 116 | fprintf(stderr, "using shimmer count file: %s\n", mmc_fns[i]); 117 | mmc = read_mm_count(mmc_fns[i]); 118 | aggregate_mm_count(mcmap_, &mmc); 119 | kv_destroy(mmc); 120 | } 121 | 122 | wordfree(&p); 123 | 124 | mmer0_map_ = kh_init(MMER0); 125 | 126 | build_map(py_mmer->mmers, mmer0_map_, rlmap_, mcmap_, mychunk, total_chunk, 127 | lowerbound, upperbound); 128 | py_mmer->mmer0_map = (void *)mmer0_map_; 129 | py_mmer->rlmap = (void *)rlmap_; 130 | py_mmer->mcmap = (void *)mcmap_; 131 | py_mmer->ridmm = (void *)ridmm_; 132 | } 133 | 134 | void get_shimmers_for_read(mm128_v *mmer, py_mmer_t *py_mmer, uint32_t rid) { 135 | khiter_t k; 136 | khash_t(RIDMM) * ridmm; 137 | mm128_v *mmer_; 138 | ridmm = (khash_t(RIDMM) *)py_mmer->ridmm; 139 | k = kh_get(RIDMM, ridmm, rid); 140 | if (k != kh_end(ridmm)) { 141 | mmer_ = kh_val(ridmm, k); 142 | } else { 143 | mmer_ = calloc(sizeof(mm128_v), 1); 144 | } 145 | mmer->n = mmer_->n; 146 | mmer->m = mmer_->m; 147 | mmer->a = mmer_->a; 148 | } 149 | 150 | uint32_t get_mmer_count(py_mmer_t *py_mmer, uint64_t mhash) { 151 | khash_t(MMC) *mcmap_ = (khash_t(MMC) *)py_mmer->mcmap; 152 | khiter_t k = kh_get(MMC, mcmap_, mhash); 153 | if (k != kh_end(mcmap_)) { 154 | return kh_val(mcmap_, k); 155 | } else { 156 | return 0; 157 | } 158 | } 159 | 160 | void get_shimmer_hits(mp256_v *mpv_out, py_mmer_t *py_mmer, uint64_t mhash0, 161 | uint32_t span) { 162 | khash_t(MMER0) *mmer0_map_ = (khash_t(MMER0) *)py_mmer->mmer0_map; 163 | // khash_t(RLEN) * rlmap_ = (khash_t(RLEN) *) rlmap_; 164 | // khash_t(MMC) * mcmap_ = (khash_t(MMC) *) mcmap_; 165 | 166 | mp128_v *mpv; 167 | 
mp256_t mp256; 168 | uint64_t mhash1; 169 | khiter_t k; 170 | 171 | khash_t(MMER1) * mmer1_map; 172 | mhash0 <<= 8; 173 | mhash0 |= span; 174 | mp256.x0 = mhash0; 175 | 176 | k = kh_get(MMER0, mmer0_map_, mhash0); 177 | if (k == kh_end(mmer0_map_)) { 178 | return; 179 | } 180 | mmer1_map = kh_val(mmer0_map_, k); 181 | for (khiter_t __j = kh_begin(mmer1_map); __j != kh_end(mmer1_map); ++__j) { 182 | if (!kh_exist(mmer1_map, __j)) continue; 183 | mhash1 = kh_key(mmer1_map, __j); 184 | mp256.x1 = mhash1; 185 | mhash1 >>= 8; 186 | mpv = kh_val(mmer1_map, __j); 187 | qsort(mpv->a, mpv->n, sizeof(mp128_t), mp128_comp); 188 | for (size_t __k0 = 0; __k0 < (mpv->n); __k0++) { 189 | mp256.y0 = mpv->a[__k0].y0; 190 | mp256.y1 = mpv->a[__k0].y1; 191 | mp256.direction = mpv->a[__k0].direction; 192 | 193 | kv_push(mp256_t, NULL, *mpv_out, mp256); 194 | } 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /src/DWmatch.c: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * 4 | ===================================================================================== 5 | * 6 | * Filename: DW_banded.c 7 | * 8 | * Description: A banded version for the O(ND) greedy sequence alignment 9 | algorithm 10 | * 11 | * Version: 0.1 12 | * Created: 07/20/2013 17:00:00 13 | * Revision: none 14 | * Compiler: gcc 15 | * 16 | * Author: Jason Chin, 17 | * Company: 18 | * 19 | * 20 | ===================================================================================== 21 | 22 | #################################################################################$$ 23 | # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc. 24 | # 25 | # All rights reserved. 
26 | # 27 | # Redistribution and use in source and binary forms, with or without 28 | # modification, are permitted (subject to the limitations in the 29 | # disclaimer below) provided that the following conditions are met: 30 | # 31 | # * Redistributions of source code must retain the above copyright 32 | # notice, this list of conditions and the following disclaimer. 33 | # 34 | # * Redistributions in binary form must reproduce the above 35 | # copyright notice, this list of conditions and the following 36 | # disclaimer in the documentation and/or other materials provided 37 | # with the distribution. 38 | # 39 | # * Neither the name of Pacific Biosciences nor the names of its 40 | # contributors may be used to endorse or promote products derived 41 | # from this software without specific prior written permission. 42 | # 43 | # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE 44 | # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC 45 | # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 46 | # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 47 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 48 | # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS 49 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 50 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 51 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 52 | # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 53 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 54 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 55 | # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 56 | # SUCH DAMAGE. 
/*
 * Banded greedy (O(ND)-style) overlap search between a query and a target.
 *
 * Each input byte carries two 4-bit codes; q_strand / t_strand select which
 * one is compared: 0 reads the low nibble, any other value reads the high
 * nibble (presumably forward vs. reverse-complement encoding -- confirm
 * against the code that builds the sequence database).
 *
 * Returns a calloc'ed ovlp_match_t that the caller releases with
 * free_ovlp_match().  The search counts as "matched" once either sequence
 * has been consumed to its end; in that case q_end/t_end, dist and m_size
 * are filled in.  Otherwise q_bgn and t_bgn are reset to 0 and the remaining
 * fields keep whatever was recorded during the search.
 *
 * NOTE(review): none of the three calloc() results is checked before use.
 */
ovlp_match_t *ovlp_match(uint8_t *query_seq, seq_coor_t q_len, uint8_t q_strand,
                         uint8_t *target_seq, seq_coor_t t_len,
                         uint8_t t_strand, seq_coor_t band_tolerance) {
  seq_coor_t *V;  // furthest x reached on each diagonal k (where y = x - k)
  seq_coor_t *U;  // x + y reached on each diagonal k
  seq_coor_t k_offset;  // added to k so negative diagonals index into V/U
  seq_coor_t d;
  seq_coor_t k, k2;
  seq_coor_t best_m;  // largest x + y seen so far
  seq_coor_t min_k, new_min_k;
  seq_coor_t max_k, new_max_k;
  seq_coor_t x, y;
  seq_coor_t x1, y1;  // position before the exact-match extension
  seq_coor_t max_d;
  seq_coor_t band_size;

  uint8_t q_shift = 0;
  uint8_t t_shift = 0;

  bool start = false;  // set once q_bgn/t_bgn have been recorded

  ovlp_match_t *rtn;
  bool matched = false;

  // pick the low (shift 0) or high (shift 4) nibble of every byte
  q_shift = q_strand == 0 ? 0 : 4;
  t_shift = t_strand == 0 ? 0 : 4;

  // printf("debug: %ld %ld\n", q_len, t_len);
  // printf("%s\n", query_seq);

  // give up after this many rounds: 30% of the combined length
  max_d = (int)(0.3 * (q_len + t_len));

  band_size = band_tolerance * 2;

  V = calloc(max_d * 2 + 1, sizeof(seq_coor_t));
  U = calloc(max_d * 2 + 1, sizeof(seq_coor_t));

  k_offset = max_d;

  rtn = calloc(1, sizeof(ovlp_match_t));
  rtn->m_size = 0;
  rtn->q_bgn = 0;
  rtn->q_end = 0;
  rtn->t_bgn = 0;
  rtn->t_end = 0;
  rtn->q_m_end = 0;
  rtn->t_m_end = 0;
  uint32_t longest_match = 0;

  // printf("max_d: %lu, band_size: %lu\n", max_d, band_size);
  best_m = -1;
  min_k = 0;
  max_k = 0;
  for (d = 0; d < max_d; d++) {
    // stop once the band of live diagonals has grown wider than allowed
    if (max_k - min_k > band_size) {
      break;
    }

    for (k = min_k; k <= max_k; k += 2) {
      // start from whichever neighbouring diagonal reaches further
      if ((k == min_k) ||
          ((k != max_k) && (V[k - 1 + k_offset] < V[k + 1 + k_offset]))) {
        x = V[k + 1 + k_offset];
      } else {
        x = V[k - 1 + k_offset] + 1;
      }
      y = x - k;
      x1 = x;
      y1 = y;

      // slide along the diagonal while the selected nibbles agree
      while (x < q_len && y < t_len &&
             ((query_seq[x] >> q_shift) & 0x0F) ==
                 ((target_seq[y] >> t_shift) & 0x0F)) {
        x++;
        y++;
      }

      // the first exact run longer than 16 fixes the reported begin point
      if ((x - x1 > 16) && (start == false)) {
        rtn->q_bgn = x1;
        rtn->t_bgn = y1;
        start = true;
      }

      // remember where the longest exact run seen so far ends
      if ((x - x1 > longest_match)) {
        longest_match = x - x1;
        rtn->q_m_end = x;
        rtn->t_m_end = y;
      }

      V[k + k_offset] = x;
      U[k + k_offset] = x + y;

      if (x + y > best_m) {
        best_m = x + y;
      }

      // reaching the end of either sequence completes the overlap
      if (x >= q_len || y >= t_len) {
        matched = true;
        break;
      }
    }

    // For banding: keep only diagonals within band_tolerance of the best
    new_min_k = max_k;
    new_max_k = min_k;

    for (k2 = min_k; k2 <= max_k; k2 += 2) {
      if (U[k2 + k_offset] >= best_m - band_tolerance) {
        if (k2 < new_min_k) {
          new_min_k = k2;
        }
        if (k2 > new_max_k) {
          new_max_k = k2;
        }
      }
    }

    // widen by one diagonal on each side for the next round
    max_k = new_max_k + 1;
    min_k = new_min_k - 1;

    if (matched == true) {
      rtn->q_end = x;
      rtn->t_end = y;
      rtn->dist = d;
      // we don't really generate the alignment path here, so we can only
      // estimate the alignment string size
      rtn->m_size =
          (rtn->q_end - rtn->q_bgn + rtn->t_end - rtn->t_bgn + 2 * d) / 2;
      break;
    }
  }
  if (matched == false) {
    rtn->q_bgn = 0;
    rtn->t_bgn = 0;
  }

  free(V);
  free(U);
  return rtn;
}

/* Release a result returned by ovlp_match(). */
void free_ovlp_match(ovlp_match_t *match) { free(match); }
def wrap(s, columns):
    """Return *s* broken into newline-separated lines of *columns* chars."""
    return "\n".join(s[start:start + columns]
                     for start in range(0, len(s), columns))


def splitFastaHeader(name):
    """
    Split a FASTA/FASTQ header into its id and metadata components.

    The id is everything up to the first whitespace character; the metadata
    is the stripped remainder, or None when the header has no whitespace.
    """
    nameParts = re.split(r'\s', name, maxsplit=1)
    id_ = nameParts[0]
    metadata = nameParts[1].strip() if len(nameParts) > 1 else None
    return (id_, metadata)


def splitFileContents(f, delimiter, BLOCKSIZE=8192):
    """
    Same semantics as f.read().split(delimiter), but with memory usage
    determined by largest chunk rather than entire file size.

    *f* must return str from read().  The last len(delimiter) - 1 characters
    of every read are held back and prepended to the next one, so a
    multi-character delimiter that straddles two reads is still found.
    """
    holdback = len(delimiter) - 1
    remainder = StringIO()
    carry = ""
    while True:
        block = f.read(BLOCKSIZE)
        if not block:
            break
        parts = (carry + block).split(delimiter)
        if holdback > 0:
            # The tail of the last part may be the beginning of a delimiter
            # that is completed by the next read; keep it out of `remainder`
            # until that is known.
            tail = parts[-1]
            cut = max(len(tail) - holdback, 0)
            carry = tail[cut:]
            parts[-1] = tail[:cut]
        remainder.write(parts[0])
        for part in parts[1:]:
            yield remainder.getvalue()
            remainder = StringIO()
            remainder.write(part)
    remainder.write(carry)
    yield remainder.getvalue()


class FastaRecord(object):
    """
    A FastaRecord object models a named sequence in a FASTA file.
    """
    DELIMITER = ">"
    COLUMNS = 60

    def __init__(self, name, sequence):
        """
        Raises ValueError if *name* or *sequence* contains a newline, or if
        *sequence* contains the record delimiter.
        """
        # Explicit checks rather than `assert`, so that validation is not
        # stripped when Python runs with -O.
        if ("\n" in name or "\n" in sequence
                or self.DELIMITER in sequence):
            raise ValueError("Invalid FASTA record data")
        self._name = name
        self._sequence = sequence
        self._id, self._metadata = splitFastaHeader(name)

    @property
    def name(self):
        """
        The name of the sequence in the FASTA file, equal to the entire
        FASTA header following the '>' character
        """
        return self._name

    @property
    def id(self):
        """
        The id of the sequence in the FASTA file, equal to the FASTA header
        up to the first whitespace.
        """
        return self._id

    @property
    def metadata(self):
        """
        The metadata associated with the sequence in the FASTA file, equal to
        the contents of the FASTA header following the first whitespace
        """
        return self._metadata

    @property
    def sequence(self):
        """
        The sequence for the record as present in the FASTA file.
        (Newlines are removed but otherwise no sequence normalization
        is performed).
        """
        return self._sequence

    @property
    def length(self):
        """
        Get the length of the FASTA sequence
        """
        return len(self._sequence)

    @classmethod
    def fromString(cls, s):
        """
        Interprets a string as a FASTA record.  Does not make any
        assumptions about wrapping of the sequence string.

        Raises ValueError unless *s* has a header line starting with '>'
        followed by at least one more line.
        """
        lines = s.splitlines()
        # startswith() also covers an empty first line, which used to
        # escape as IndexError.
        if len(lines) < 2 or not lines[0].startswith(cls.DELIMITER):
            raise ValueError("String not recognized as a valid FASTA record")
        name = lines[0][1:]
        sequence = "".join(lines[1:])
        return FastaRecord(name, sequence)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return (self.name == other.name and
                    self.sequence == other.sequence)
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Defined alongside __eq__ so records stay usable in sets and as
        # dict keys; equal records hash equally.
        return hash((self._name, self._sequence))

    def __str__(self):
        """
        Output a string representation of this FASTA record, observing
        standard conventions about sequence wrapping.
        """
        return (">%s\n" % self.name) + \
            wrap(self.sequence, self.COLUMNS)
def yield_fasta_records(f, fn):
    """
    Yield one FastaRecord per '>'-delimited entry read from *f*.

    f: fileobj returning str
    fn: str - filename (for exceptions)

    Raises Exception("Invalid FASTA file ...") when there is any text before
    the first '>'.
    """
    parts = splitFileContents(f, ">")
    # Whatever precedes the first '>' must be empty.  Checked explicitly
    # rather than with `assert`, which is stripped under -O.
    if next(parts) != "":
        raise Exception("Invalid FASTA file {!r}".format(fn))
    for part in parts:
        yield FastaRecord.fromString(">" + part)


def stream_stdout(call, fn):
    """
    Run the command line *call* with the file *fn* as its stdin and return
    the process's stdout as a text stream.
    """
    args = call.split()
    # The child keeps its own copy of the descriptor, so the parent's handle
    # can be closed as soon as the process has been started.
    with open(fn, 'rb') as stdin:
        # universal_newlines=True makes stdout yield str, which is what the
        # str-based record splitter requires.
        proc = subprocess.Popen(args, stdin=stdin, stdout=subprocess.PIPE,
                                universal_newlines=True)
    return proc.stdout


@contextlib.contextmanager
def open_fasta_reader(fn):
    """
    fn: str - filename

    Note: If you already have a fileobj, you can iterate over
    yield_fasta_records() directly.

    Streaming reader for FASTA files, useable as a one-shot iterator
    over FastaRecord objects.  Agnostic about line wrapping.  Files ending
    in ".gz" are decompressed with gzip; files ending in ".dexta" are piped
    through the external "undexta" command.  The underlying stream is closed
    when the `with` block exits, including when it exits with an exception.

    Example::

        with open_fasta_reader(filename) as r:
            for record in r:
                print(record.name, len(record.sequence))
    """
    filename = abspath(expanduser(fn))
    if filename.endswith(".gz"):
        # 'rt': gzip.open defaults to binary, the parser needs text.
        ofs = gzip.open(filename, 'rt')
    elif filename.endswith(".dexta"):
        ofs = stream_stdout("undexta -vkU -w60 -i", filename)
    else:
        ofs = open(filename, 'r')
    try:
        yield yield_fasta_records(ofs, filename)
    finally:
        ofs.close()


class FastaReader(object):
    """Deprecated, but should still work (with filenames).
    """

    def __iter__(self):
        with open_fasta_reader(self.filename) as reader:
            for rec in reader:
                yield rec

    def __init__(self, f):
        # f: filename handed to open_fasta_reader() on every iteration
        self.filename = f
/*
 * Byte -> 2-bit nucleotide code: 'A'/'a' = 0, 'C'/'c' = 1, 'G'/'g' = 2,
 * 'T'/'t'/'U'/'u' = 3.  Bytes 0..3 map to themselves and everything else
 * maps to 4, which mm_sketch() treats as an ambiguous base.
 */
unsigned char seq_nt4_table[256] = {
    0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};

/*
 * Scramble `key` with a fixed sequence of shift/add and shift/xor rounds.
 * The result is masked with `mask` after every additive round, so it never
 * has bits set outside `mask`.
 */
static inline uint64_t hash64(uint64_t key, uint64_t mask) {
  key = (~key + (key << 21)) & mask;  // key = (key << 21) - key - 1;
  key = key ^ key >> 24;
  key = ((key + (key << 3)) + (key << 8)) & mask;  // key * 265
  key = key ^ key >> 14;
  key = ((key + (key << 2)) + (key << 4)) & mask;  // key * 21
  key = key ^ key >> 28;
  key = (key + (key << 31)) & mask;
  return key;
}

typedef struct {  // a simplified version of kdq
  int front, count;  // index of the oldest element; number of stored elements
  int a[32];         // ring buffer, indexed modulo 32
} tiny_queue_t;

/* Append x at the tail of the ring; no check is made for a full queue. */
static inline void tq_push(tiny_queue_t *q, int x) {
  q->a[((q->count++) + q->front) & 0x1f] = x;
}

/* Remove and return the oldest element, or -1 when the queue is empty. */
static inline int tq_shift(tiny_queue_t *q) {
  int x;
  if (q->count == 0) return -1;
  x = q->a[q->front++];
  q->front &= 0x1f;
  --q->count;
  return x;
}

/**
 * Find symmetric (w,k)-minimizers on a DNA sequence
 *
 * @param km     thread-local memory pool; using NULL falls back to malloc()
 * @param str    DNA sequence
 * @param len    length of $str
 * @param w      find a minimizer for every $w consecutive k-mers
 *               (asserted: 0 < w < 256)
 * @param k      k-mer size (asserted: 0 < k <= 28)
 * @param rid    reference ID; will be copied to the output $p array
 * @param is_hpc homopolymer-compressed or not
 * @param p      minimizers
 *               p->a[i].x = hash64(kMer)<<8 | kmerSpan
 *               p->a[i].y = rid<<32 | lastPos<<1 | strand
 *               where lastPos is the position of the last base of the i-th
 *               minimizer, and strand indicates whether the minimizer comes
 *               from the top or the bottom strand. Callers may want to set
 *               "p->n = 0"; otherwise results are appended to p
 */
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid,
               int is_hpc, mm128_v *p) {
  uint64_t shift1 = 2 * (k - 1), mask = (1ULL << 2 * k) - 1, kmer[2] = {0, 0};
  int i, j, l, buf_pos, min_pos, kmer_span = 0;
  // buf is a ring of the last w candidates; min is the current window minimum
  mm128_t buf[256], min = {UINT64_MAX, UINT64_MAX};
  tiny_queue_t tq;  // run lengths of the last k bases (used only when is_hpc)

  assert(len > 0 && (w > 0 && w < 256) &&
         (k > 0 && k <= 28));  // 56 bits for k-mer; could use long k-mers, but
                               // 28 enough in practice
  memset(buf, 0xff, w * 16);  // every slot starts as {UINT64_MAX, UINT64_MAX}
  memset(&tq, 0, sizeof(tiny_queue_t));
  kv_resize(mm128_t, km, *p, p->n + len / w);

  // l counts the bases accepted since the last ambiguous base
  for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
    int c = seq_nt4_table[(uint8_t)str[i]];
    mm128_t info = {UINT64_MAX, UINT64_MAX};
    if (c < 4) {  // not an ambiguous base
      int z;
      if (is_hpc) {
        int skip_len = 1;
        if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) {
          for (skip_len = 2; i + skip_len < len; ++skip_len)
            if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c) break;
          i +=
              skip_len - 1;  // put $i at the end of the current homopolymer run
        }
        tq_push(&tq, skip_len);
        kmer_span += skip_len;
        if (tq.count > k) kmer_span -= tq_shift(&tq);
      } else
        kmer_span = l + 1 < k ? l + 1 : k;
      kmer[0] = (kmer[0] << 2 | c) & mask;              // forward k-mer
      kmer[1] = (kmer[1] >> 2) | (3ULL ^ c) << shift1;  // reverse k-mer
      if (kmer[0] == kmer[1])
        continue;  // skip "symmetric k-mers" as we don't know its strand
      z = kmer[0] < kmer[1] ? 0 : 1;  // strand
      ++l;
      // only spans that fit the low 8 bits of x are recorded
      if (l >= k && kmer_span < 256) {
        info.x = hash64(kmer[z], mask) << 8 | kmer_span;
        info.y = (uint64_t)rid << 32 | (uint32_t)i << 1 | z;
      }
    } else  // ambiguous base: restart k-mer accumulation
      l = 0, tq.count = tq.front = 0, kmer_span = 0;
    buf[buf_pos] = info;  // need to do this here as appropriate buf_pos and
                          // buf[buf_pos] are needed below
    if (l == w + k - 1 &&
        min.x != UINT64_MAX) {  // special case for the first window - because
                                // identical k-mers are not stored yet
      for (j = buf_pos + 1; j < w; ++j)
        if (min.x == buf[j].x && buf[j].y != min.y)
          kv_push(mm128_t, km, *p, buf[j]);
      for (j = 0; j < buf_pos; ++j)
        if (min.x == buf[j].x && buf[j].y != min.y)
          kv_push(mm128_t, km, *p, buf[j]);
    }
    if (info.x <= min.x) {  // a new minimum; then write the old min
      if (l >= w + k && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
      min = info, min_pos = buf_pos;
    } else if (buf_pos == min_pos) {  // old min has moved outside the window
      if (l >= w + k - 1 && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
      for (j = buf_pos + 1, min.x = UINT64_MAX; j < w;
           ++j)  // the two loops are necessary when there are identical k-mers
        if (min.x >= buf[j].x)
          min = buf[j],
          min_pos = j;  // >= is important s.t. min is always the closest k-mer
      for (j = 0; j <= buf_pos; ++j)
        if (min.x >= buf[j].x) min = buf[j], min_pos = j;
      if (l >= w + k - 1 && min.x != UINT64_MAX) {  // write identical k-mers
        for (j = buf_pos + 1; j < w;
             ++j)  // these two loops make sure the output is sorted
          if (min.x == buf[j].x && min.y != buf[j].y)
            kv_push(mm128_t, km, *p, buf[j]);
        for (j = 0; j <= buf_pos; ++j)
          if (min.x == buf[j].x && min.y != buf[j].y)
            kv_push(mm128_t, km, *p, buf[j]);
      }
    }
    if (++buf_pos == w) buf_pos = 0;  // advance the ring position
  }
  // flush the minimum of the final window
  if (min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
}
# Complement of each base, keyed and valued by byte value (int).
rmap = dict(list(zip(b"ACGT", b"TGCA")))


def rc(seq):
    """Return the reverse complement of the bytes sequence *seq*.

    Raises KeyError for any byte other than A, C, G or T (upper case).
    """
    return bytes([rmap[c] for c in seq[::-1]])


def mmer2tuple(mmer):
    """Unpack a packed minimizer into (mmer, span, rid, pos_end, direction).

    *mmer* needs integer attributes ``x`` and ``y`` laid out as
    ``x = hash << 8 | span`` and ``y = rid << 32 | last_pos << 1 | strand``.
    ``pos_end`` is ``last_pos + 1``.
    """
    x = mmer.x
    y = mmer.y
    span = x & 0xFF
    mhash = x >> 8
    rid = y >> 32
    pos_end = ((y & 0xFFFFFFFF) >> 1) + 1
    direction = y & 0x1
    return (mhash, span, rid, pos_end, direction)


def get_shimmers_from_seq(seq, rid=0,
                          levels=2, reduction_factor=3,
                          k=16, w=80):
    """Sketch *seq* and reduce the minimizers *levels* times (0, 1 or 2).

    Returns a cffi ``mm128_v *``.  Callers in this module release it with
    ``shimmer4py.free(v.a)`` followed by ``shimmer_ffi.release(v)``.

    NOTE(review): the intermediate vectors are released without first
    freeing their ``.a`` arrays; whether that leaks depends on how
    ``mm_reduce`` treats its input -- confirm against the C implementation.
    """
    assert levels <= 2
    c_null = shimmer_ffi.NULL
    mmers = shimmer_ffi.new("mm128_v *")
    shimmer4py.mm_sketch(c_null, seq, len(seq), w, k, rid, 0, mmers)
    if levels == 0:
        return mmers
    elif levels == 1:
        mmers_L1 = shimmer_ffi.new("mm128_v *")
        shimmer4py.mm_reduce(mmers, mmers_L1, reduction_factor)
        shimmer_ffi.release(mmers)
        return mmers_L1
    elif levels == 2:
        mmers_L1 = shimmer_ffi.new("mm128_v *")
        mmers_L2 = shimmer_ffi.new("mm128_v *")
        shimmer4py.mm_reduce(mmers, mmers_L1, reduction_factor)
        shimmer4py.mm_reduce(mmers_L1, mmers_L2, reduction_factor)
        shimmer_ffi.release(mmers_L1)
        shimmer_ffi.release(mmers)
        return mmers_L2


def get_shimmer_alns(shimmers0, shimmers1, direction=0,
                     max_diff=100, max_dist=1200,
                     max_repeat=1):
    """Chain matching shimmers between two minimizer vectors.

    Returns a list of ``(chain, max_offset, mean_offset, min_offset)``
    tuples.  ``chain`` is a list of ``(mmer0, mmer1)`` pairs as produced by
    mmer2tuple(); the offset of a pair is ``pos_end0 - pos_end1`` when
    *direction* is 0 and ``pos_end0 + pos_end1`` otherwise.  Chains without
    any anchor are skipped because they carry no offset.
    """
    aln = shimmer4py.shmr_aln(shimmers0, shimmers1, direction,
                              max_diff, max_dist, max_repeat)
    aln_chains = []
    try:
        for i in range(aln.n):
            idx0_v = aln.a[i].idx0
            idx1_v = aln.a[i].idx1
            n_anchor = idx0_v.n
            if n_anchor == 0:
                continue
            chain = []
            # builtin float: the np.float alias no longer exists in NumPy
            offsets = np.zeros(n_anchor, dtype=float)
            for j in range(n_anchor):
                mmer0 = mmer2tuple(shimmers0.a[idx0_v.a[j]])
                mmer1 = mmer2tuple(shimmers1.a[idx1_v.a[j]])
                chain.append((mmer0, mmer1))
                if direction == 0:  # same direction
                    offsets[j] = mmer0[3] - mmer1[3]
                else:
                    offsets[j] = mmer0[3] + mmer1[3]
            # Statistics over every anchor of the chain; the earlier code
            # applied them to the last offset only.
            aln_chains.append((chain, np.max(offsets), np.mean(offsets),
                               np.min(offsets)))
    finally:
        shimmer4py.free_shmr_alns(aln)
    return aln_chains


def get_tag_from_seqs(read_seq, ref_seq, read_offset):
    """Align *read_seq* to *ref_seq* and return falcon alignment tags.

    A negative *read_offset* drops that many leading bases of the read; a
    non-negative one drops that many leading bases of the reference and is
    passed on as the target offset of the tags.  Returns None when the
    aligned query span is not within 48 bases of the expected length.
    """
    read_len = len(read_seq)
    ref_len = len(ref_seq)
    if read_offset < 0:
        skip = abs(read_offset)
        query, q_len = read_seq[skip:read_len], read_len - skip
        target, t_len = ref_seq, ref_len
        t_offset = 0
    else:
        query, q_len = read_seq, read_len
        target, t_len = ref_seq[read_offset:ref_len], ref_len - read_offset
        t_offset = read_offset

    aln = falcon4py.align(query, q_len, target, t_len, 150, 1)
    try:
        q_span = abs(aln.aln_q_e - aln.aln_q_s)
        aligned = abs(q_span - q_len) < 48
        if not aligned and read_offset >= 0:
            # Only the non-negative-offset case also accepts an alignment
            # that covers the remaining reference.
            aligned = abs(t_len - q_span) < 48
        if not aligned:
            return None

        rng = falcon_ffi.new("aln_range[1]")
        try:
            rng[0].s1 = aln.aln_q_s
            rng[0].e1 = aln.aln_q_e
            rng[0].s2 = aln.aln_t_s
            rng[0].e2 = aln.aln_t_e
            return falcon4py.get_align_tags(aln.q_aln_str,
                                            aln.t_aln_str,
                                            aln.aln_str_size,
                                            rng, 0, t_offset)
        finally:
            falcon_ffi.release(rng)
    finally:
        falcon4py.free_alignment(aln)


def _tag_for_read(shimmers0, seq0, seq, rid):
    """Return alignment tags of *seq* against *seq0*, or None if unplaced."""
    shimmers1 = get_shimmers_from_seq(seq, rid=rid, levels=2)
    try:
        alns = get_shimmer_alns(shimmers0, shimmers1, 0)
        if not alns:
            return None
        # Longest chain wins; max() keeps the first one on ties.
        best = max(alns, key=lambda x: len(x[0]))
        mmer0, mmer1 = best[0][0]
        read_offset = mmer0[3] - mmer1[3]
        return get_tag_from_seqs(seq, seq0, read_offset)
    finally:
        shimmer4py.free(shimmers1.a)
        shimmer_ffi.release(shimmers1)


def get_cns_from_reads(seqs):
    """Build a consensus of ``seqs[0]`` from all reads in *seqs*.

    The first read is the template.  Every other read is tried in both
    orientations and contributes whenever a shimmer chain places it on the
    template and the base-level alignment passes get_tag_from_seqs().
    Returns the consensus as bytes.

    NOTE(review): the tag objects are not freed here; presumably
    get_cns_from_align_tags takes ownership -- confirm in the C code.
    """
    seq0 = seqs[0]
    # The template plus two orientations of every other read can each
    # contribute one tag.
    tags = falcon_ffi.new("align_tags_t * [{}]".format(2 * len(seqs)))
    aln_count = 0
    shimmers0 = get_shimmers_from_seq(seq0, rid=0, levels=2)
    try:
        # The template aligned to itself.
        alns = get_shimmer_alns(shimmers0, shimmers0, 0)
        read_offset = 0
        if alns:
            mmer0, mmer1 = alns[0][0][0]
            read_offset = mmer0[3] - mmer1[3]
        tag = get_tag_from_seqs(seq0, seq0, read_offset)
        if tag is not None:
            tags[aln_count] = tag
            aln_count += 1

        for i, seq in enumerate(seqs):
            if i == 0:
                continue
            # Tag the read itself (not the template) in both orientations.
            for rid, oriented in ((i * 2, seq), (i * 2 + 1, rc(seq))):
                tag = _tag_for_read(shimmers0, seq0, oriented, rid)
                if tag is not None:
                    tags[aln_count] = tag
                    aln_count += 1

        cns = falcon4py.get_cns_from_align_tags(tags,
                                                aln_count,
                                                len(seq0), 1)
        cns_seq = falcon_ffi.string(cns.sequence)
        falcon4py.free_consensus_data(cns)
    finally:
        shimmer4py.free(shimmers0.a)
        shimmer_ffi.release(shimmers0)

    return cns_seq
46 | * Callers may want to set "p->n = 0"; otherwise results are appended to p 47 | """ 48 | 49 | mmer_count = {} 50 | for i in range(hmmerL0.n): 51 | span = hmmerL0.a[i].x & 0xFF 52 | mmer = hmmerL0.a[i].x >> 8 53 | rid = hmmerL0.a[i].y >> 32 54 | pos_end = ((hmmerL0.a[i].y & 0xFFFFFFFF) >> 1) + 1 55 | strand = hmmerL0.a[i].y & 0x1 56 | mm_str = "{:014X}".format(mmer) 57 | mmer_count.setdefault(mm_str, 0) 58 | mmer_count[mm_str] += 1 59 | # 60 | # kmer = bseq[pos_end-span:pos_end] 61 | # kmer_r = bytes([rmap[c] for c in kmer[::-1]]) 62 | r_pos_end = rid2len[rid] - pos_end + span 63 | name = rid2name[rid] 64 | 65 | mmer_count_L2 = {} 66 | L2list = {} 67 | for i in range(hmmerL2.n): 68 | span = hmmerL2.a[i].x & 0xFF 69 | mmer = hmmerL2.a[i].x >> 8 70 | rid = hmmerL2.a[i].y >> 32 71 | pos_end = ((hmmerL2.a[i].y & 0xFFFFFFFF) >> 1) + 1 72 | strand = hmmerL2.a[i].y & 0x1 73 | r_pos_end = rid2len[rid] - pos_end + span 74 | name = rid2name[rid] 75 | mm_str = "{:014X}".format(mmer) 76 | mmer_count_L2.setdefault(mm_str, 0) 77 | mmer_count_L2[mm_str] += 1 78 | print(name, pos_end, r_pos_end, strand, 79 | mm_str, mmer_count[mm_str], file=L2dump) 80 | L2list.setdefault(rid, []) 81 | L2list[rid].append((pos_end, r_pos_end, strand, 82 | mm_str, name)) 83 | L2dump.close() 84 | 85 | L2map = {} 86 | rspan = {} 87 | for rid in L2list: 88 | lst = L2list[rid] 89 | if len(lst) < 2: 90 | continue 91 | rspan[rid] = lst[0][-2], lst[-1][-2] 92 | v = lst[0] 93 | for w in lst[1:]: 94 | v_pos_end, v_r_pos_end, v_strand, v_mmer, v_name = v 95 | # 96 | if mmer_count_L2[v_mmer] < 2: 97 | v = w 98 | continue 99 | w_pos_end, w_r_pos_end, w_strand, w_mmer, w_name = w 100 | # 101 | if mmer_count_L2[w_mmer] < 2: 102 | continue 103 | key = v_mmer, v_strand, w_mmer, w_strand 104 | L2map.setdefault(key, []) 105 | L2map[key].append((v_name, rid, 0, v_pos_end, w_pos_end)) 106 | v = w 107 | 108 | v = lst[-1] 109 | for w in lst[-2::-1]: 110 | v_pos_end, v_r_pos_end, v_strand, v_mmer, v_name = v 111 | # 112 
| if mmer_count_L2[v_mmer] < 2: 113 | v = w 114 | continue 115 | w_pos_end, w_r_pos_end, w_strand, w_mmer, w_name = w 116 | # 117 | if mmer_count_L2[w_mmer] < 2: 118 | continue 119 | key = v_mmer, 1-v_strand, w_mmer, 1-w_strand 120 | L2map.setdefault(key, []) 121 | L2map[key].append((v_name, rid, 1, v_r_pos_end, w_r_pos_end)) 122 | v = w 123 | 124 | dt_pairs = set() 125 | with open("L0_dt.txt") as f: 126 | for row in f: 127 | row = row.strip().split() 128 | dt_pairs.add( (row[1], row[3]) ) 129 | dt_pairs.add( (row[3], row[1]) ) 130 | 131 | import networkx as nx 132 | G = nx.DiGraph() 133 | 134 | r_dimer_set = {} 135 | read_ovlp = {} 136 | for key in L2map.keys(): 137 | v_mmer, v_strand, w_mmer, w_strand = key 138 | for r in L2map[key]: 139 | v_name, v_rid, r_strand, v_pos_end, w_pos_end = r 140 | if v_name == "ref": continue 141 | r_dimer_set.setdefault(v_rid, set()) 142 | r_dimer_set[v_rid].add(key) 143 | 144 | for key in L2map.keys(): 145 | v_mmer, v_strand, w_mmer, w_strand = key 146 | rlist = [] 147 | n = len(L2map[key]) 148 | for r in L2map[key]: 149 | v_name, v_rid, r_strand, v_pos_end, w_pos_end = r 150 | if v_name == "ref": continue 151 | left_offset_v = -v_pos_end 152 | right_offset_v = rid2len[v_rid]-v_pos_end 153 | dist = w_pos_end - v_pos_end 154 | print("X", v_mmer, v_strand, w_mmer, w_strand, 155 | v_name, r_strand, rid2len[v_rid], 156 | v_pos_end, w_pos_end, 157 | mmer_count_L2[v_mmer], mmer_count_L2[w_mmer], n, 158 | left_offset_v, right_offset_v) 159 | 160 | if mmer_count_L2[v_mmer] >= 30 or mmer_count_L2[v_mmer] <= 1: 161 | continue 162 | if mmer_count_L2[w_mmer] >= 30 or mmer_count_L2[w_mmer] <= 1: 163 | continue 164 | 165 | rlist.append((left_offset_v, right_offset_v, v_name, 166 | v_rid, r_strand, dist )) 167 | 168 | if len(rlist) == 0: continue 169 | rlist.sort() 170 | 171 | p_set = set() 172 | left_offset_v0, right_offset_v0, r_name0, r_id0, r_strand0, dist0 = rlist[0] 173 | p_set = r_dimer_set[r_id0] 174 | for left_offset_v, right_offset_v, 
r_name, r_id, r_strand, dist in rlist[1:]: 175 | if right_offset_v0 < right_offset_v: 176 | overlap_count = len(r_dimer_set[r_id] & p_set) 177 | overlap_len = rid2len[r_id0] - abs(left_offset_v0-left_offset_v) 178 | dt_match = 1 if (r_name0, r_name) in dt_pairs else 0 179 | print("Y", v_mmer, v_strand, w_mmer, w_strand, 180 | r_name0, r_strand0, r_name, r_strand, 181 | overlap_count, overlap_len, left_offset_v0, left_offset_v, dt_match) 182 | if dt_match == 1: 183 | read_ovlp.setdefault((r_name0, r_strand0), []) 184 | read_ovlp.setdefault((r_name, 1-r_strand), []) 185 | read_ovlp[(r_name0, r_strand0)].append((overlap_len, (r_name, r_strand))) 186 | read_ovlp[(r_name, 1-r_strand)].append((-overlap_len, (r_name0, 1-r_strand0))) 187 | 188 | # G.add_edge("{}-{}".format(r_name, r_strand), "{}-{}".format(r_name0, r_strand0)) 189 | # G.add_edge("{}-{}".format(r_name0, r_strand0), "{}-{}".format(r_name, r_strand)) 190 | p_set = r_dimer_set[r_id] 191 | right_offset_v0 = right_offset_v 192 | left_offset_v0 = left_offset_v 193 | r_name0 = r_name 194 | r_strand0 = r_strand 195 | 196 | for k in read_ovlp: 197 | read_ovlp[k].sort() 198 | 199 | for v in read_ovlp[k][:]: 200 | G.add_edge( "{}-{}".format(*k), "{}-{}".format(*v[-1])) 201 | #for v in read_ovlp[k][-3:]: 202 | # G.add_edge( "{}-{}".format(*k), "{}-{}".format(*v[-1])) 203 | 204 | nx.write_gexf(G, "test.gexf") 205 | 206 | C.free(hmmerL0.a) 207 | C.free(hmmerL2.a) 208 | -------------------------------------------------------------------------------- /src/kalloc.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "kalloc.h" 5 | 6 | /* In kalloc, a *core* is a large chunk of contiguous memory. Each core is 7 | * associated with a master header, which keeps the size of the current core 8 | * and the pointer to next core. 
Kalloc allocates small *blocks* of memory from 9 | * the cores and organizes free memory blocks in a circular single-linked list. 10 | * 11 | * In the following diagram, "@" stands for the header of a free block (of type 12 | * header_t), "#" for the header of an allocated block (of type size_t), "-" 13 | * for free memory, and "+" for allocated memory. 14 | * 15 | * master This region is core 1. master This region is core 2. 16 | * | | 17 | * *@-------#++++++#++++++++++++@-------- *@----------#++++++++++++#+++++++@------------ 18 | * | | | | 19 | * p=p->ptr->ptr->ptr->ptr p->ptr p->ptr->ptr p->ptr->ptr->ptr 20 | */ 21 | 22 | #define MIN_CORE_SIZE 0x80000 23 | 24 | typedef struct header_t { 25 | size_t size; 26 | struct header_t *ptr; 27 | } header_t; 28 | 29 | typedef struct { 30 | header_t base, *loop_head, *core_head; /* base is a zero-sized block always kept in the loop */ 31 | } kmem_t; 32 | 33 | static void panic(const char *s) 34 | { 35 | fprintf(stderr, "%s\n", s); 36 | abort(); 37 | } 38 | 39 | void *km_init(void) 40 | { 41 | return calloc(1, sizeof(kmem_t)); 42 | } 43 | 44 | void km_destroy(void *_km) 45 | { 46 | kmem_t *km = (kmem_t*)_km; 47 | header_t *p, *q; 48 | if (km == NULL) return; 49 | for (p = km->core_head; p != NULL;) { 50 | q = p->ptr; 51 | free(p); 52 | p = q; 53 | } 54 | free(km); 55 | } 56 | 57 | static header_t *morecore(kmem_t *km, size_t nu) 58 | { 59 | header_t *q; 60 | size_t bytes, *p; 61 | nu = (nu + 1 + (MIN_CORE_SIZE - 1)) / MIN_CORE_SIZE * MIN_CORE_SIZE; /* the first +1 for core header */ 62 | bytes = nu * sizeof(header_t); 63 | q = (header_t*)malloc(bytes); 64 | if (!q) panic("[morecore] insufficient memory"); 65 | q->ptr = km->core_head, q->size = nu, km->core_head = q; 66 | p = (size_t*)(q + 1); 67 | *p = nu - 1; /* the size of the free block; -1 because the first unit is used for the core header */ 68 | kfree(km, p + 1); /* initialize the new "core"; NB: the core header is not looped. 
*/ 69 | return km->loop_head; 70 | } 71 | 72 | void kfree(void *_km, void *ap) /* kfree() also adds a new core to the circular list */ 73 | { 74 | header_t *p, *q; 75 | kmem_t *km = (kmem_t*)_km; 76 | 77 | if (!ap) return; 78 | if (km == NULL) { 79 | free(ap); 80 | return; 81 | } 82 | p = (header_t*)((size_t*)ap - 1); 83 | p->size = *((size_t*)ap - 1); 84 | /* Find the pointer that points to the block to be freed. The following loop can stop on two conditions: 85 | * 86 | * a) "p>q && pptr": @------#++++++++#+++++++@------- @---------------#+++++++@------- 87 | * (can also be in | | | -> | | 88 | * two cores) q p q->ptr q q->ptr 89 | * 90 | * @-------- #+++++++++@-------- @-------- @------------------ 91 | * | | | -> | | 92 | * q p q->ptr q q->ptr 93 | * 94 | * b) "q>=q->ptr && (p>q || pptr)": @-------#+++++ @--------#+++++++ @-------#+++++ @---------------- 95 | * | | | -> | | 96 | * q->ptr q p q->ptr q 97 | * 98 | * #+++++++@----- #++++++++@------- @------------- #++++++++@------- 99 | * | | | -> | | 100 | * p q->ptr q q->ptr q 101 | */ 102 | for (q = km->loop_head; !(p > q && p < q->ptr); q = q->ptr) 103 | if (q >= q->ptr && (p > q || p < q->ptr)) break; 104 | if (p + p->size == q->ptr) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */ 105 | p->size += q->ptr->size; 106 | p->ptr = q->ptr->ptr; 107 | } else if (p + p->size > q->ptr && q->ptr >= p) { 108 | panic("[kfree] The end of the allocated block enters a free block."); 109 | } else p->ptr = q->ptr; /* backup q->ptr */ 110 | 111 | if (q + q->size == p) { /* two adjacent blocks, merge q and p (the other two cases) */ 112 | q->size += p->size; 113 | q->ptr = p->ptr; 114 | km->loop_head = q; 115 | } else if (q + q->size > p && p >= q) { 116 | panic("[kfree] The end of a free block enters the allocated block."); 117 | } else km->loop_head = p, q->ptr = p; /* in two cores, cannot be merged; create a new block in the list */ 118 | } 119 | 120 | void *kmalloc(void *_km, size_t n_bytes) 121 | { 
122 | kmem_t *km = (kmem_t*)_km; 123 | size_t n_units; 124 | header_t *p, *q; 125 | 126 | if (n_bytes == 0) return 0; 127 | if (km == NULL) return malloc(n_bytes); 128 | n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t) + 1; 129 | 130 | if (!(q = km->loop_head)) /* the first time when kmalloc() is called, intialize it */ 131 | q = km->loop_head = km->base.ptr = &km->base; 132 | for (p = q->ptr;; q = p, p = p->ptr) { /* search for a suitable block */ 133 | if (p->size >= n_units) { /* p->size if the size of current block. This line means the current block is large enough. */ 134 | if (p->size == n_units) q->ptr = p->ptr; /* no need to split the block */ 135 | else { /* split the block. NB: memory is allocated at the end of the block! */ 136 | p->size -= n_units; /* reduce the size of the free block */ 137 | p += p->size; /* p points to the allocated block */ 138 | *(size_t*)p = n_units; /* set the size */ 139 | } 140 | km->loop_head = q; /* set the end of chain */ 141 | return (size_t*)p + 1; 142 | } 143 | if (p == km->loop_head) { /* then ask for more "cores" */ 144 | if ((p = morecore(km, n_units)) == 0) return 0; 145 | } 146 | } 147 | } 148 | 149 | void *kcalloc(void *_km, size_t count, size_t size) 150 | { 151 | kmem_t *km = (kmem_t*)_km; 152 | void *p; 153 | if (size == 0 || count == 0) return 0; 154 | if (km == NULL) return calloc(count, size); 155 | p = kmalloc(km, count * size); 156 | memset(p, 0, count * size); 157 | return p; 158 | } 159 | 160 | void *krealloc(void *_km, void *ap, size_t n_bytes) // TODO: this can be made more efficient in principle 161 | { 162 | kmem_t *km = (kmem_t*)_km; 163 | size_t n_units, *p, *q; 164 | 165 | if (n_bytes == 0) { 166 | kfree(km, ap); return 0; 167 | } 168 | if (km == NULL) return realloc(ap, n_bytes); 169 | if (ap == NULL) return kmalloc(km, n_bytes); 170 | n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t); 171 | p = (size_t*)ap - 1; 172 | if (*p >= n_units) 
return ap; /* TODO: this prevents shrinking */ 173 | q = (size_t*)kmalloc(km, n_bytes); 174 | memcpy(q, ap, (*p - 1) * sizeof(header_t)); 175 | kfree(km, ap); 176 | return q; 177 | } 178 | 179 | void km_stat(const void *_km, km_stat_t *s) 180 | { 181 | kmem_t *km = (kmem_t*)_km; 182 | header_t *p; 183 | memset(s, 0, sizeof(km_stat_t)); 184 | if (km == NULL || km->loop_head == NULL) return; 185 | for (p = km->loop_head;; p = p->ptr) { 186 | s->available += p->size * sizeof(header_t); 187 | if (p->size != 0) ++s->n_blocks; /* &kmem_t::base is always one of the cores. It is zero-sized. */ 188 | if (p->ptr > p && p + p->size > p->ptr) 189 | panic("[km_stat] The end of a free block enters another free block."); 190 | if (p->ptr == km->loop_head) break; 191 | } 192 | for (p = km->core_head; p != NULL; p = p->ptr) { 193 | size_t size = p->size * sizeof(header_t); 194 | ++s->n_cores; 195 | s->capacity += size; 196 | s->largest = s->largest > size? s->largest : size; 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /falcon/kalloc.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "kalloc.h" 5 | 6 | /* In kalloc, a *core* is a large chunk of contiguous memory. Each core is 7 | * associated with a master header, which keeps the size of the current core 8 | * and the pointer to next core. Kalloc allocates small *blocks* of memory from 9 | * the cores and organizes free memory blocks in a circular single-linked list. 10 | * 11 | * In the following diagram, "@" stands for the header of a free block (of type 12 | * header_t), "#" for the header of an allocated block (of type size_t), "-" 13 | * for free memory, and "+" for allocated memory. 14 | * 15 | * master This region is core 1. master This region is core 2. 
16 | * | | 17 | * *@-------#++++++#++++++++++++@-------- *@----------#++++++++++++#+++++++@------------ 18 | * | | | | 19 | * p=p->ptr->ptr->ptr->ptr p->ptr p->ptr->ptr p->ptr->ptr->ptr 20 | */ 21 | 22 | #define MIN_CORE_SIZE 0x80000 23 | 24 | typedef struct header_t { 25 | size_t size; 26 | struct header_t *ptr; 27 | } header_t; 28 | 29 | typedef struct { 30 | header_t base, *loop_head, *core_head; /* base is a zero-sized block always kept in the loop */ 31 | } kmem_t; 32 | 33 | static void panic(const char *s) 34 | { 35 | fprintf(stderr, "%s\n", s); 36 | abort(); 37 | } 38 | 39 | void *km_init(void) 40 | { 41 | return calloc(1, sizeof(kmem_t)); 42 | } 43 | 44 | void km_destroy(void *_km) 45 | { 46 | kmem_t *km = (kmem_t*)_km; 47 | header_t *p, *q; 48 | if (km == NULL) return; 49 | for (p = km->core_head; p != NULL;) { 50 | q = p->ptr; 51 | free(p); 52 | p = q; 53 | } 54 | free(km); 55 | } 56 | 57 | static header_t *morecore(kmem_t *km, size_t nu) 58 | { 59 | header_t *q; 60 | size_t bytes, *p; 61 | nu = (nu + 1 + (MIN_CORE_SIZE - 1)) / MIN_CORE_SIZE * MIN_CORE_SIZE; /* the first +1 for core header */ 62 | bytes = nu * sizeof(header_t); 63 | q = (header_t*)malloc(bytes); 64 | if (!q) panic("[morecore] insufficient memory"); 65 | q->ptr = km->core_head, q->size = nu, km->core_head = q; 66 | p = (size_t*)(q + 1); 67 | *p = nu - 1; /* the size of the free block; -1 because the first unit is used for the core header */ 68 | kfree(km, p + 1); /* initialize the new "core"; NB: the core header is not looped. */ 69 | return km->loop_head; 70 | } 71 | 72 | void kfree(void *_km, void *ap) /* kfree() also adds a new core to the circular list */ 73 | { 74 | header_t *p, *q; 75 | kmem_t *km = (kmem_t*)_km; 76 | 77 | if (!ap) return; 78 | if (km == NULL) { 79 | free(ap); 80 | return; 81 | } 82 | p = (header_t*)((size_t*)ap - 1); 83 | p->size = *((size_t*)ap - 1); 84 | /* Find the pointer that points to the block to be freed. 
The following loop can stop on two conditions: 85 | * 86 | * a) "p>q && pptr": @------#++++++++#+++++++@------- @---------------#+++++++@------- 87 | * (can also be in | | | -> | | 88 | * two cores) q p q->ptr q q->ptr 89 | * 90 | * @-------- #+++++++++@-------- @-------- @------------------ 91 | * | | | -> | | 92 | * q p q->ptr q q->ptr 93 | * 94 | * b) "q>=q->ptr && (p>q || pptr)": @-------#+++++ @--------#+++++++ @-------#+++++ @---------------- 95 | * | | | -> | | 96 | * q->ptr q p q->ptr q 97 | * 98 | * #+++++++@----- #++++++++@------- @------------- #++++++++@------- 99 | * | | | -> | | 100 | * p q->ptr q q->ptr q 101 | */ 102 | for (q = km->loop_head; !(p > q && p < q->ptr); q = q->ptr) 103 | if (q >= q->ptr && (p > q || p < q->ptr)) break; 104 | if (p + p->size == q->ptr) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */ 105 | p->size += q->ptr->size; 106 | p->ptr = q->ptr->ptr; 107 | } else if (p + p->size > q->ptr && q->ptr >= p) { 108 | panic("[kfree] The end of the allocated block enters a free block."); 109 | } else p->ptr = q->ptr; /* backup q->ptr */ 110 | 111 | if (q + q->size == p) { /* two adjacent blocks, merge q and p (the other two cases) */ 112 | q->size += p->size; 113 | q->ptr = p->ptr; 114 | km->loop_head = q; 115 | } else if (q + q->size > p && p >= q) { 116 | panic("[kfree] The end of a free block enters the allocated block."); 117 | } else km->loop_head = p, q->ptr = p; /* in two cores, cannot be merged; create a new block in the list */ 118 | } 119 | 120 | void *kmalloc(void *_km, size_t n_bytes) 121 | { 122 | kmem_t *km = (kmem_t*)_km; 123 | size_t n_units; 124 | header_t *p, *q; 125 | 126 | if (n_bytes == 0) return 0; 127 | if (km == NULL) return malloc(n_bytes); 128 | n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t) + 1; 129 | 130 | if (!(q = km->loop_head)) /* the first time when kmalloc() is called, intialize it */ 131 | q = km->loop_head = km->base.ptr = &km->base; 132 | for (p = 
q->ptr;; q = p, p = p->ptr) { /* search for a suitable block */ 133 | if (p->size >= n_units) { /* p->size if the size of current block. This line means the current block is large enough. */ 134 | if (p->size == n_units) q->ptr = p->ptr; /* no need to split the block */ 135 | else { /* split the block. NB: memory is allocated at the end of the block! */ 136 | p->size -= n_units; /* reduce the size of the free block */ 137 | p += p->size; /* p points to the allocated block */ 138 | *(size_t*)p = n_units; /* set the size */ 139 | } 140 | km->loop_head = q; /* set the end of chain */ 141 | return (size_t*)p + 1; 142 | } 143 | if (p == km->loop_head) { /* then ask for more "cores" */ 144 | if ((p = morecore(km, n_units)) == 0) return 0; 145 | } 146 | } 147 | } 148 | 149 | void *kcalloc(void *_km, size_t count, size_t size) 150 | { 151 | kmem_t *km = (kmem_t*)_km; 152 | void *p; 153 | if (size == 0 || count == 0) return 0; 154 | if (km == NULL) return calloc(count, size); 155 | p = kmalloc(km, count * size); 156 | memset(p, 0, count * size); 157 | return p; 158 | } 159 | 160 | void *krealloc(void *_km, void *ap, size_t n_bytes) // TODO: this can be made more efficient in principle 161 | { 162 | kmem_t *km = (kmem_t*)_km; 163 | size_t n_units, *p, *q; 164 | 165 | if (n_bytes == 0) { 166 | kfree(km, ap); return 0; 167 | } 168 | if (km == NULL) return realloc(ap, n_bytes); 169 | if (ap == NULL) return kmalloc(km, n_bytes); 170 | n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t); 171 | p = (size_t*)ap - 1; 172 | if (*p >= n_units) return ap; /* TODO: this prevents shrinking */ 173 | q = (size_t*)kmalloc(km, n_bytes); 174 | memcpy(q, ap, (*p - 1) * sizeof(header_t)); 175 | kfree(km, ap); 176 | return q; 177 | } 178 | 179 | void km_stat(const void *_km, km_stat_t *s) 180 | { 181 | kmem_t *km = (kmem_t*)_km; 182 | header_t *p; 183 | memset(s, 0, sizeof(km_stat_t)); 184 | if (km == NULL || km->loop_head == NULL) return; 185 | for (p = 
km->loop_head;; p = p->ptr) { 186 | s->available += p->size * sizeof(header_t); 187 | if (p->size != 0) ++s->n_blocks; /* &kmem_t::base is always one of the cores. It is zero-sized. */ 188 | if (p->ptr > p && p + p->size > p->ptr) 189 | panic("[km_stat] The end of a free block enters another free block."); 190 | if (p->ptr == km->loop_head) break; 191 | } 192 | for (p = km->core_head; p != NULL; p = p->ptr) { 193 | size_t size = p->size * sizeof(header_t); 194 | ++s->n_cores; 195 | s->capacity += size; 196 | s->largest = s->largest > size? s->largest : size; 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /misc/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | Peregrine 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/shmr_index.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "khash.h" 11 | #include "kseq.h" 12 | #include "kvec.h" 13 | #include "shimmer.h" 14 | 15 | #define handle_error(msg) \ 16 | do { \ 17 | perror(msg); \ 18 | exit(EXIT_FAILURE); \ 19 | } while (0) 20 | 21 | #define REDUCTION_FACTOR 6 22 | #define DEFAULT_WINDOW_SIZE 80 23 | #define DEFAULT_KMER_SIZE 16 24 | 25 | extern char *optarg; 26 | extern int optind, opterr, optopt; 27 | 28 | void write_mc_count_mm128(char *fn, mm128_v *shimmer) { 29 | mm_count_v mmc = {0, 0, 0}; 30 | khash_t(MMC) *mcmap = kh_init(MMC); 31 | mm_count(shimmer, mcmap, &mmc); 32 | write_mm_count(fn, &mmc); 33 | kv_destroy(mmc); 34 | kh_destroy(MMC, mcmap); 35 | } 36 | 37 | int main(int argc, char *argv[]) { 38 | char *seqdb_prefix = NULL; 39 | char 
*shimmer_prefix = NULL; 40 | 41 | char seq_idx_file_path[8192]; 42 | char seqdb_file_path[8291]; 43 | char shimmer_output_path[8192]; 44 | 45 | FILE *seq_index_file; 46 | uint32_t rid; 47 | 48 | int c, written; 49 | int total_chunk = 1; 50 | int mychunk = 1; 51 | int reduction_factor = REDUCTION_FACTOR; 52 | int number_layers = 2; 53 | int output_L0 = 1; 54 | int window_size = DEFAULT_WINDOW_SIZE; 55 | int kmer_size = DEFAULT_KMER_SIZE; 56 | mm128_v shimmerL0 = {0, 0, 0}; 57 | mm128_v shimmerL1 = {0, 0, 0}; 58 | mm128_v shimmerL2 = {0, 0, 0}; 59 | seq_data_v seq_data = {0, 0, 0}; 60 | 61 | khash_t(RLEN) *rlmap = kh_init(RLEN); 62 | opterr = 0; 63 | 64 | while ((c = getopt(argc, argv, "p:o:t:c:l:r:m:w:k:")) != -1) { 65 | switch (c) { 66 | case 'p': 67 | seqdb_prefix = optarg; 68 | break; 69 | case 'o': 70 | shimmer_prefix = optarg; 71 | break; 72 | case 't': 73 | total_chunk = atoi(optarg); 74 | break; 75 | case 'c': 76 | mychunk = atoi(optarg); 77 | break; 78 | case 'r': 79 | reduction_factor = atoi(optarg); 80 | break; 81 | case 'l': 82 | number_layers = atoi(optarg); 83 | break; 84 | case 'm': 85 | output_L0 = atoi(optarg); 86 | break; 87 | case 'w': 88 | window_size = atoi(optarg); 89 | break; 90 | case 'k': 91 | kmer_size = atoi(optarg); 92 | break; 93 | case '?': 94 | if (optopt == 'p') { 95 | fprintf(stderr, 96 | "Option -%c not specified, using 'seq_dataset' as the " 97 | "sequence db prefix\n", 98 | optopt); 99 | } else if (optopt == 'o') { 100 | fprintf(stderr, 101 | "Option -%c not specified, using 'shimmer' as the output " 102 | "prefix\n", 103 | optopt); 104 | } 105 | return 1; 106 | default: 107 | abort(); 108 | } 109 | } 110 | 111 | assert(total_chunk > 0); 112 | assert(mychunk > 0 && mychunk <= total_chunk); 113 | assert(reduction_factor < 256); 114 | assert(window_size >= 24 && kmer_size >= 12 && window_size > kmer_size); 115 | 116 | fprintf(stderr, "reduction factor= %d\n", reduction_factor); 117 | 118 | if (seqdb_prefix == NULL) { 119 | 
seqdb_prefix = (char *)calloc(8192, 1); 120 | snprintf(seqdb_prefix, 8191, "seq_dataset"); 121 | } 122 | 123 | if (shimmer_prefix == NULL) { 124 | shimmer_prefix = (char *)calloc(8192, 1); 125 | snprintf(shimmer_prefix, 8191, "shimmer"); 126 | } 127 | 128 | written = snprintf(seq_idx_file_path, sizeof(seq_idx_file_path), "%s.idx", 129 | seqdb_prefix); 130 | assert(written < sizeof(seq_idx_file_path)); 131 | fprintf(stderr, "using index file: %s\n", seq_idx_file_path); 132 | 133 | rlmap = get_read_length_map(seq_idx_file_path); 134 | 135 | written = snprintf(seqdb_file_path, sizeof(seqdb_file_path), "%s.seqdb", 136 | seqdb_prefix); 137 | assert(written < sizeof(seqdb_file_path)); 138 | fprintf(stderr, "using seqdb file: %s\n", seqdb_file_path); 139 | 140 | int fd; 141 | struct stat sb; 142 | uint8_t *seq_p; 143 | fd = open(seqdb_file_path, O_RDONLY); 144 | if (fd == -1) handle_error("open"); 145 | 146 | if (fstat(fd, &sb) == -1) /* To obtain file size */ 147 | handle_error("fstat"); 148 | 149 | seq_p = (uint8_t *)mmap((void *)0, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); 150 | 151 | seq_index_file = fopen(seq_idx_file_path, "r"); 152 | char name_buf[512]; 153 | uint32_t rlen; 154 | size_t offset; 155 | while (fscanf(seq_index_file, "%u %255s %u %lu", &rid, name_buf, &rlen, 156 | &offset) != EOF) { 157 | if ((rid % total_chunk) != (mychunk % total_chunk)) continue; 158 | char *seq = malloc(rlen + 1); 159 | decode_biseq(seq_p + offset, seq, rlen, 0); 160 | seq[rlen] = '\0'; 161 | mm_sketch(NULL, seq, rlen, window_size, kmer_size, rid, 0, &shimmerL0); 162 | free(seq); 163 | } 164 | 165 | if (output_L0 == 1) { 166 | written = snprintf(shimmer_output_path, sizeof shimmer_output_path, 167 | "%s-L0-%02d-of-%02d.dat", shimmer_prefix, mychunk, 168 | total_chunk); 169 | assert(written < sizeof(shimmer_output_path)); 170 | fprintf(stderr, "output data file: %s\n", shimmer_output_path); 171 | write_mmlist(shimmer_output_path, &shimmerL0); 172 | 173 | /* temporary disable this 
as it is not used for now 174 | mm128_v shimmerE5 = {0,0,0}; 175 | mm128_v shimmerE3 = {0,0,0}; 176 | 177 | mm_end_filter(&shimmerL0, &shimmerE5, &shimmerE3, rlmap, 250); 178 | written = snprintf(shimmer_output_path, sizeof shimmer_output_path, 179 | "%s-E5-%02d-of-%02d.dat", shimmer_prefix, mychunk, total_chunk); assert(written 180 | < sizeof(shimmer_output_path)); printf("output data file: %s\n", 181 | shimmer_output_path); write_mmlist(shimmer_output_path, &shimmerE5); 182 | kv_destroy(shimmerE5); 183 | 184 | written = snprintf(shimmer_output_path, sizeof shimmer_output_path, 185 | "%s-E3-%02d-of-%02d.dat", shimmer_prefix, mychunk, total_chunk); assert(written 186 | < sizeof(shimmer_output_path)); printf("output data file: %s\n", 187 | shimmer_output_path); write_mmlist(shimmer_output_path, &shimmerE3); 188 | kv_destroy(shimmerE3); 189 | */ 190 | 191 | written = snprintf(shimmer_output_path, sizeof shimmer_output_path, 192 | "%s-L0-MC-%02d-of-%02d.dat", shimmer_prefix, mychunk, 193 | total_chunk); 194 | assert(written < sizeof(shimmer_output_path)); 195 | printf("output data file: %s\n", shimmer_output_path); 196 | write_mc_count_mm128(shimmer_output_path, &shimmerL0); 197 | } 198 | 199 | mm_reduce(&shimmerL0, &shimmerL1, reduction_factor); 200 | kv_destroy(shimmerL0); 201 | if (number_layers == 1) { 202 | written = snprintf(shimmer_output_path, sizeof shimmer_output_path, 203 | "%s-L1-%02d-of-%02d.dat", shimmer_prefix, mychunk, 204 | total_chunk); 205 | assert(written < sizeof(shimmer_output_path)); 206 | printf("output data file: %s\n", shimmer_output_path); 207 | write_mmlist(shimmer_output_path, &shimmerL1); 208 | 209 | written = snprintf(shimmer_output_path, sizeof shimmer_output_path, 210 | "%s-L1-MC-%02d-of-%02d.dat", shimmer_prefix, mychunk, 211 | total_chunk); 212 | assert(written < sizeof(shimmer_output_path)); 213 | printf("output data file: %s\n", shimmer_output_path); 214 | write_mc_count_mm128(shimmer_output_path, &shimmerL1); 215 | } else if 
(number_layers > 1) { 216 | mm_reduce(&shimmerL1, &shimmerL2, reduction_factor); 217 | kv_destroy(shimmerL1); 218 | written = snprintf(shimmer_output_path, sizeof shimmer_output_path, 219 | "%s-L2-%02d-of-%02d.dat", shimmer_prefix, mychunk, 220 | total_chunk); 221 | assert(written < sizeof(shimmer_output_path)); 222 | fprintf(stderr, "output data file: %s\n", shimmer_output_path); 223 | write_mmlist(shimmer_output_path, &shimmerL2); 224 | 225 | written = snprintf(shimmer_output_path, sizeof shimmer_output_path, 226 | "%s-L2-MC-%02d-of-%02d.dat", shimmer_prefix, mychunk, 227 | total_chunk); 228 | assert(written < sizeof(shimmer_output_path)); 229 | printf("output data file: %s\n", shimmer_output_path); 230 | write_mc_count_mm128(shimmer_output_path, &shimmerL2); 231 | 232 | kv_destroy(shimmerL2); 233 | } 234 | 235 | kh_destroy(RLEN, rlmap); 236 | for (size_t _i = 0; _i < seq_data.n; _i++) { 237 | kfree(NULL, seq_data.a[_i].name); 238 | } 239 | kv_destroy(seq_data); 240 | 241 | munmap(seq_p, sb.st_size); 242 | if (!seqdb_prefix) free(seqdb_prefix); 243 | if (!shimmer_prefix) free(shimmer_prefix); 244 | return 0; 245 | } 246 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # This Repo Is Archived (on Feb 6, 2022) 2 | 3 | If you are interested in an updated Peregrine Genome Assembler, please check [https://github.com/cschin/peregrine-2021](https://github.com/cschin/peregrine-2021) 4 | 5 | 6 | PeregrineLogo 7 | 8 | [![Actions Status](https://github.com/cschin/Peregrine/workflows/build-and-test-docker-image-master-branch/badge.svg)](https://github.com/cschin/peregrine/actions) 9 | 10 | [![Actions Status](https://github.com/cschin/Peregrine/workflows/build-and-test-docker-image-tagged-release/badge.svg)](https://github.com/cschin/peregrine/actions) 11 | 12 | # Peregrine & SHIMMER Genome Assembly Toolkit 13 | 14 | Peregrine is a fast genome 
assembler for accurate long reads (length > 10kb, 15 | accuracy > 99%). It can assemble a human genome from 30x reads within 20 cpu 16 | hours from reads to polished consensus. It uses Sparse HIerarchical MiniMizER 17 | (SHIMMER) for fast read-to-read overlapping without quadratic comparisons used 18 | in other OLC assemblers. 19 | 20 | This code base includes code that uses SHIMMER (Sparse HIerarchical MiniMizER) 21 | for genome assembly and other related applications. 22 | 23 | Currently, the assembly graph process is more or less identical to the 24 | approaches used in the FALCON assembler developed by Jason Chin and others in 25 | Pacific Biosciences, Inc. There are a number of other possible ways to generate 26 | contigs without a string graph but it will need some research work to make it 27 | happen. The FALCON graph module is also not very efficient as python scripts 28 | are running in single thread mode. 29 | 30 | 31 | ## Install 32 | 33 | We *do not* recommend that you install the software from the source code unless 34 | you are comfortable handling the required dependencies for your system 35 | independently. Unless you have full control (e.g. root access) of the computer 36 | system you use to build Peregrine and you can install the proper GCC 37 | compiler/python/pypy/conda version, you should try to learn to use the [Docker 38 | images](https://hub.docker.com/r/cschin/peregrine/tags) that 39 | we provide; it will make your life and our life easier. 40 | 41 | As independent developers with limited resources, we cannot provide free support for 42 | solving dependency problems on your specific system. Instead, we can provide 43 | a Docker image so you can run the executables and their dependencies using Docker. 44 | 45 | If you want to build for your system without using Docker, please see the 46 | `docker/Dockerfile` and `docker/install_with_conda.sh` as examples to 47 | install from scratch within a clean Conda environment.
48 | 49 | ## Run the assembler 50 | 51 | Peregrine is designed to run on a single compute node. It does not need a grid 52 | computing job scheduling system. It uses Pypeflow to coordinate multiple 53 | concurrent processes. 54 | 55 | After revision 0.1.5.3, you can test a small assembly using simulated E. coli 56 | reads with Docker: 57 | 58 | ``` 59 | # please substitute $PWD and $IMAGETAG with proper values 60 | docker run -it --rm -v $PWD:/wd cschin/peregrine:$IMAGETAG test 61 | ``` 62 | 63 | The assembly results are in `$PWD/ecoli_test_results/`. The test case will 64 | download an E. coli reference and generate simulated reads. After the assembly 65 | is done, it also installs `nucmer` to run `dnadiff`, comparing the assembled 66 | contigs with the original E. coli reference. You can check the output by using 67 | the `cat $PWD/ecoli_test_results/out.report` command. 68 | 69 | Here is the general usage for `pg_run.py` which starts the workflow for 70 | assembling a genome from input `fasta`, `fastq`, `fasta.gz` or 71 | `fastq.gz` files.
72 | 73 | ``` 74 | Usage: 75 | pg_run.py asm <reads.lst> <index_nchunk> <index_nproc> 76 | <ovlp_nchunk> <ovlp_nproc> 77 | <mapping_nchunk> <mapping_nproc> 78 | <cns_nchunk> <cns_nproc> 79 | <sort_nproc> 80 | [--with-consensus] 81 | [--with-L0-index] 82 | [--output <output>] 83 | [--shimmer-k <shimmer_k>] 84 | [--shimmer-w <shimmer_w>] 85 | [--shimmer-r <shimmer_r>] 86 | [--shimmer-l <shimmer_l>] 87 | [--best_n_ovlp <n_ovlp>] 88 | [--mc_lower <mc_lower>] 89 | [--mc_upper <mc_upper>] 90 | [--aln_bw <aln_bw>] 91 | [--ovlp_upper <ovlp_upper>] 92 | pg_run.py (-h | --help) 93 | pg_run.py --version 94 | 95 | Options: 96 | -h --help Show this help 97 | --version Show version 98 | --with-consensus Generate consensus after getting the draft contigs 99 | --with-L0-index Keep level-0 index 100 | --output <output> Set output directory (will be created if it does not exist) [default: ./wd] 101 | --shimmer-k <shimmer_k> Level 0 k-mer size [default: 16] 102 | --shimmer-w <shimmer_w> Level 0 window size [default: 80] 103 | --shimmer-r <shimmer_r> Reduction factor for high level SHIMMER [default: 6] 104 | --shimmer-l <shimmer_l> Number of levels of SHIMMER used, the value should be 1 or 2 [default: 2] 105 | --best_n_ovlp <n_ovlp> Find best n_ovlp overlaps [default: 4] 106 | --mc_lower <mc_lower> Do not consider SHIMMER with count less than mc_lower [default: 2] 107 | --mc_upper <mc_upper> Do not consider SHIMMER with count greater than mc_upper [default: 240] 108 | --aln_bw <aln_bw> Max off-diagonal gap allowed during overlap confirmation [default: 100] 109 | --ovlp_upper <ovlp_upper> Ignore clusters with overlap count greater than ovlp_upper [default: 120] 110 | ``` 111 | 112 | The first required option is `reads.lst`. The `reads.lst` should be a 113 | path to a file that contains the list of the paths of the input sequence files. 114 | 115 | The remaining required options specify how to partition the data for different parts 116 | of the pipeline and the number of processors used for each of the steps. 117 | 118 | `<index_nchunk>` and `<index_nproc>` control the number of "chunks" and the 119 | number of cpu used concurrently for the initial SHIMMER index generation. 120 | 121 | `<ovlp_nchunk>` and `<ovlp_nproc>` control the number of "chunks" and the 122 | number of cpu used concurrently for generating overlap information between 123 | reads.
This part typically uses the most memory and the exact size of RAM used 124 | concurrently depends on the size of input sequence data and the index file 125 | size. 126 | 127 | You can use a larger number of `<ovlp_nchunk>` and a smaller number of 128 | `<ovlp_nproc>` on a smaller memory machine. For example, I was able to finish 129 | this part using a machine with 32G RAM with `ovlp_nchunk=24` and 130 | `ovlp_nproc=1`. 131 | 132 | If there is enough memory, for example, both AWS m5d.metal and r5d.12xlarge 133 | have 384G RAM, they can support running 24 to 48 cpu cores at once. However, 134 | the overlap step needs random access to the sequence data through a shared 135 | memory mapped file, so it will be great to reserve some RAM to cache the sequence 136 | data in RAM. In our test, 48 cores do not provide a significant speedup 137 | compared to using 24 cores. Also, if there is not enough memory, you may need 138 | fast SSD or nvme drives and to reduce the number of CPU cores concurrently 139 | accessing the sequence data. 140 | 141 | `<mapping_nchunk>` and `<mapping_nproc>` control the partitioning and the 142 | number of cores used for mapping the sequence reads to draft contigs for the 143 | following consensus step. 144 | 145 | `<sort_nproc>` controls the number of cpu cores used for sorting the reads to 146 | contigs map. 147 | 148 | `<cns_nchunk>` and `<cns_nproc>` control the partitioning and the number of 149 | cores used for generating the consensus from draft contigs. 150 | 151 | 152 | ## Running Peregrine Using Docker 153 | 154 | Here is an example running Peregrine with Docker for a Peregrine build 155 | of tag 0.1.5.0 using an AWS m5d.metal or r5d.12xlarge instance. (You will 156 | need to configure the AWS instance to utilize the NVME drives and a 157 | docker environment.)
158 | 159 | ``` 160 | find /wd/chm13-fastq/ -name "*.fastq" | sort > chm13-seqdata.lst 161 | 162 | docker run -it -v /wd:/wd --user $(id -u):$(id -g) cschin/peregrine:0.1.5.0 asm \ 163 | /wd/chm13-seqdata.lst 24 24 24 24 24 24 24 24 24 \ 164 | --with-consensus --shimmer-r 3 --best_n_ovlp 8 \ 165 | --output /wd/chm13-asm-r3-pg0.1.5.0 166 | ``` 167 | 168 | Note that the paths in the `<reads.lst>` should be the full paths to the 169 | sequence files inside the docker container. 170 | 171 | 172 | ## LICENSE 173 | 174 | ### Peregrine & SHIMMER Genome Assembly Toolkit 175 | 176 | Peregrine Assembler and SHIMMER Genome Assembly Toolkit 177 | Copyright (c) 2019- by Jason, Chen-Shan, Chin 178 | 179 | Peregrine Assembler and SHIMMER Genome Assembly Toolkit 180 | is licensed under a Creative Commons 181 | Attribution-NonCommercial-ShareAlike 4.0 International 182 | License. 183 | 184 | You should have received a copy of the license along with this 185 | work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>. 186 | 187 | 188 | ### Minimap2 189 | 190 | SHIMMER genome assembly toolkit uses C library developed by 191 | Heng Li for Minimap2. 
See LICENSE.minimap2 192 | 193 | 194 | ### FALCON 195 | 196 | See LICENSE.falcon for license for the code from FALCON 197 | -------------------------------------------------------------------------------- /src/kseq.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Last Modified: 05MAR2012 */ 27 | 28 | #ifndef AC_KSEQ_H 29 | #define AC_KSEQ_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 36 | #define KS_SEP_TAB 1 // isspace() && !' 
' 37 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) 38 | #define KS_SEP_MAX 2 39 | 40 | #define __KS_TYPE(type_t) \ 41 | typedef struct __kstream_t { \ 42 | int begin, end; \ 43 | int is_eof:2, bufsize:30; \ 44 | type_t f; \ 45 | unsigned char *buf; \ 46 | } kstream_t; 47 | 48 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) 49 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) 50 | 51 | #define __KS_BASIC(SCOPE, type_t, __bufsize) \ 52 | SCOPE kstream_t *ks_init(type_t f) \ 53 | { \ 54 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ 55 | ks->f = f; ks->bufsize = __bufsize; \ 56 | ks->buf = (unsigned char*)malloc(__bufsize); \ 57 | return ks; \ 58 | } \ 59 | SCOPE void ks_destroy(kstream_t *ks) \ 60 | { \ 61 | if (!ks) return; \ 62 | free(ks->buf); \ 63 | free(ks); \ 64 | } 65 | 66 | #define __KS_INLINED(__read) \ 67 | static inline int ks_getc(kstream_t *ks) \ 68 | { \ 69 | if (ks->is_eof && ks->begin >= ks->end) return -1; \ 70 | if (ks->begin >= ks->end) { \ 71 | ks->begin = 0; \ 72 | ks->end = __read(ks->f, ks->buf, ks->bufsize); \ 73 | if (ks->end < ks->bufsize) ks->is_eof = 1; \ 74 | if (ks->end == 0) return -1; \ 75 | } \ 76 | return (int)ks->buf[ks->begin++]; \ 77 | } \ 78 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ 79 | { return ks_getuntil2(ks, delimiter, str, dret, 0); } 80 | 81 | #ifndef KSTRING_T 82 | #define KSTRING_T kstring_t 83 | typedef struct __kstring_t { 84 | unsigned l, m; 85 | char *s; 86 | } kstring_t; 87 | #endif 88 | 89 | #ifndef kroundup32 90 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 91 | #endif 92 | 93 | #define __KS_GETUNTIL(SCOPE, __read) \ 94 | SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ 95 | { \ 96 | if (dret) *dret = 0; \ 97 | str->l = append? 
str->l : 0; \ 98 | if (ks->begin >= ks->end && ks->is_eof) return -1; \ 99 | for (;;) { \ 100 | int i; \ 101 | if (ks->begin >= ks->end) { \ 102 | if (!ks->is_eof) { \ 103 | ks->begin = 0; \ 104 | ks->end = __read(ks->f, ks->buf, ks->bufsize); \ 105 | if (ks->end < ks->bufsize) ks->is_eof = 1; \ 106 | if (ks->end == 0) break; \ 107 | } else break; \ 108 | } \ 109 | if (delimiter == KS_SEP_LINE) { \ 110 | for (i = ks->begin; i < ks->end; ++i) \ 111 | if (ks->buf[i] == '\n') break; \ 112 | } else if (delimiter > KS_SEP_MAX) { \ 113 | for (i = ks->begin; i < ks->end; ++i) \ 114 | if (ks->buf[i] == delimiter) break; \ 115 | } else if (delimiter == KS_SEP_SPACE) { \ 116 | for (i = ks->begin; i < ks->end; ++i) \ 117 | if (isspace(ks->buf[i])) break; \ 118 | } else if (delimiter == KS_SEP_TAB) { \ 119 | for (i = ks->begin; i < ks->end; ++i) \ 120 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 121 | } else i = 0; /* never come to here! */ \ 122 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ 123 | str->m = str->l + (i - ks->begin) + 1; \ 124 | kroundup32(str->m); \ 125 | str->s = (char*)realloc(str->s, str->m); \ 126 | } \ 127 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 128 | str->l = str->l + (i - ks->begin); \ 129 | ks->begin = i + 1; \ 130 | if (i < ks->end) { \ 131 | if (dret) *dret = ks->buf[i]; \ 132 | break; \ 133 | } \ 134 | } \ 135 | if (str->s == 0) { \ 136 | str->m = 1; \ 137 | str->s = (char*)calloc(1, 1); \ 138 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ 139 | str->s[str->l] = '\0'; \ 140 | return str->l; \ 141 | } 142 | 143 | #define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ 144 | __KS_TYPE(type_t) \ 145 | __KS_BASIC(SCOPE, type_t, __bufsize) \ 146 | __KS_GETUNTIL(SCOPE, __read) \ 147 | __KS_INLINED(__read) 148 | 149 | #define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) 150 | 151 | #define KSTREAM_DECLARE(type_t, 
__read) \ 152 | __KS_TYPE(type_t) \ 153 | extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ 154 | extern kstream_t *ks_init(type_t f); \ 155 | extern void ks_destroy(kstream_t *ks); \ 156 | __KS_INLINED(__read) 157 | 158 | /****************** 159 | * FASTA/Q parser * 160 | ******************/ 161 | 162 | #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) 163 | 164 | #define __KSEQ_BASIC(SCOPE, type_t) \ 165 | SCOPE kseq_t *kseq_init(type_t fd) \ 166 | { \ 167 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ 168 | s->f = ks_init(fd); \ 169 | return s; \ 170 | } \ 171 | SCOPE void kseq_destroy(kseq_t *ks) \ 172 | { \ 173 | if (!ks) return; \ 174 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ 175 | ks_destroy(ks->f); \ 176 | free(ks); \ 177 | } 178 | 179 | /* Return value: 180 | >=0 length of the sequence (normal) 181 | -1 end-of-file 182 | -2 truncated quality string 183 | */ 184 | #define __KSEQ_READ(SCOPE) \ 185 | SCOPE int kseq_read(kseq_t *seq) \ 186 | { \ 187 | int c; \ 188 | kstream_t *ks = seq->f; \ 189 | if (seq->last_char == 0) { /* then jump to the next header line */ \ 190 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ 191 | if (c == -1) return -1; /* end of file */ \ 192 | seq->last_char = c; \ 193 | } /* else: the first header char has been read in the previous call */ \ 194 | seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ 195 | if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ 196 | if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ 197 | if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ 198 | seq->seq.m = 256; \ 199 | seq->seq.s = (char*)malloc(seq->seq.m); \ 200 | } \ 201 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ 202 | if (c == '\n') continue; /* skip 
empty lines */ \ 203 | seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ 204 | ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ 205 | } \ 206 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ 207 | if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ 208 | seq->seq.m = seq->seq.l + 2; \ 209 | kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ 210 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ 211 | } \ 212 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ 213 | if (c != '+') return seq->seq.l; /* FASTA */ \ 214 | if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ 215 | seq->qual.m = seq->seq.m; \ 216 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ 217 | } \ 218 | while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ 219 | if (c == -1) return -2; /* error: no quality string */ \ 220 | while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ 221 | seq->last_char = 0; /* we have not come to the next header line */ \ 222 | if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ 223 | return seq->seq.l; \ 224 | } 225 | 226 | #define __KSEQ_TYPE(type_t) \ 227 | typedef struct { \ 228 | kstring_t name, comment, seq, qual; \ 229 | int last_char; \ 230 | kstream_t *f; \ 231 | } kseq_t; 232 | 233 | #define KSEQ_INIT2(SCOPE, type_t, __read) \ 234 | KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \ 235 | __KSEQ_TYPE(type_t) \ 236 | __KSEQ_BASIC(SCOPE, type_t) \ 237 | __KSEQ_READ(SCOPE) 238 | 239 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) 240 | 241 | #define KSEQ_DECLARE(type_t) \ 242 | __KS_TYPE(type_t) \ 243 | __KSEQ_TYPE(type_t) \ 244 | extern kseq_t *kseq_init(type_t fd); \ 245 | void kseq_destroy(kseq_t *ks); \ 246 | 
int kseq_read(kseq_t *seq); 247 | 248 | #endif 249 | -------------------------------------------------------------------------------- /py/scripts/pg_asm_cns.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import mmap 4 | import sys 5 | from peregrine._falcon4py import ffi 6 | from peregrine._falcon4py import lib as falcon 7 | from peregrine._shimmer4py import lib as shimmer 8 | import numpy as np 9 | from collections import OrderedDict 10 | 11 | ## No option parsing at thie moment, perhaps letter 12 | 13 | read_db_prefix = sys.argv[1] 14 | ref_db_prefix = sys.argv[2] 15 | read_to_contig_map = sys.argv[3] 16 | total_chunks = int(sys.argv[4]) 17 | my_chunk = int(sys.argv[5]) 18 | 19 | 20 | f = open("{}.seqdb".format(read_db_prefix), "rb") 21 | seqdb = mmap.mmap(f.fileno(), 0, flags=mmap.MAP_SHARED, prot=mmap.PROT_READ) 22 | 23 | f = open("{}.seqdb".format(ref_db_prefix), "rb") 24 | refdb = mmap.mmap(f.fileno(), 0, flags=mmap.MAP_SHARED, prot=mmap.PROT_READ) 25 | 26 | read_idx = {} 27 | with open("{}.idx".format(read_db_prefix)) as f: 28 | for row in f: 29 | row = row.strip().split() 30 | rid, rname, rlen, offset = row 31 | rid = int(rid) 32 | rlen = int(rlen) 33 | offset = int(offset) 34 | read_idx.setdefault(rid, {}) 35 | read_idx[rid]["name"] = rname 36 | read_idx[rid]["length"] = rlen 37 | read_idx[rid]["offset"] = offset 38 | 39 | 40 | ref_idx = {} 41 | with open("{}.idx".format(ref_db_prefix)) as f: 42 | for row in f: 43 | row = row.strip().split() 44 | rid, rname, rlen, offset = row 45 | rid = int(rid) 46 | rlen = int(rlen) 47 | offset = int(offset) 48 | ref_idx.setdefault(rid, {}) 49 | ref_idx[rid]["name"] = rname 50 | ref_idx[rid]["length"] = rlen 51 | ref_idx[rid]["offset"] = offset 52 | 53 | contig_to_read_map = OrderedDict() 54 | with open(read_to_contig_map) as f: 55 | for row in f: 56 | row = row.strip().split() 57 | row = tuple(int(c) for c in row) 58 | ctg_id = row[0] 59 | if 
(my_chunk % total_chunks) != (ctg_id % total_chunks): 60 | continue 61 | contig_to_read_map.setdefault(ctg_id, []) 62 | contig_to_read_map[ctg_id].append(row) 63 | 64 | rng = ffi.new("aln_range[1]") 65 | 66 | 67 | # TODO: we need to refactor this loop 68 | for ctg in contig_to_read_map: 69 | print("-\n", "ctg {}".format(ref_idx[ctg]["name"]), file=sys.stderr) 70 | contig_to_read_map[ctg].sort(key=lambda x: x[1]) 71 | read_map_groups = [] 72 | left_anchor = 1000 73 | map_group = [] 74 | 75 | for row in contig_to_read_map[ctg]: 76 | ref_p1 = row[1] 77 | if ref_p1 - left_anchor < 50000: 78 | map_group.append(row) 79 | else: 80 | if ref_p1 - left_anchor < 100000: 81 | read_map_groups.append([left_anchor, ref_p1, map_group]) 82 | else: 83 | read_map_groups.append([left_anchor, ref_p1, []]) 84 | map_group = [] 85 | left_anchor = ref_p1 86 | 87 | if ref_idx[ctg]["length"] - left_anchor < 100000: #current max template size for consensus 88 | if ref_idx[ctg]["length"] - left_anchor > 1000: 89 | read_map_groups.append((left_anchor, 90 | ref_idx[ctg]["length"], 91 | map_group)) 92 | elif len(read_map_groups) > 0: 93 | read_map_groups[-1][1] = ref_idx[ctg]["length"] 94 | read_map_groups[-1][2].extend(map_group) 95 | else: 96 | read_map_groups.append((left_anchor, ref_idx[ctg]["length"], [])) 97 | else: 98 | read_map_groups.append((left_anchor, ref_idx[ctg]["length"], [])) 99 | 100 | print("ctg {}".format(ref_idx[ctg]["name"]), 101 | len(read_map_groups), 102 | file=sys.stderr) 103 | 104 | #if len(read_map_groups) <= 2: #ignore short contig for now 105 | # continue 106 | 107 | cns_segments = [] 108 | j = 0 109 | for left, right, mapped in read_map_groups: 110 | print(f"--\n sg{j:03d}", left, right, right-left, len(mapped), file=sys.stderr) 111 | 112 | j += 1 113 | left = left-1000 114 | assert(left >= 0) 115 | rmap = {} 116 | 117 | for d in mapped: 118 | #print(d) 119 | read_id = d[3] 120 | read_offset = d[1] - d[4] 121 | read_strand = d[6] 122 | rmap.setdefault((read_id, 
read_strand), []) 123 | rmap[(read_id, read_strand)].append(read_offset) 124 | 125 | reads = [] 126 | 127 | for (read_id, read_strand), v in rmap.items(): 128 | v.sort() 129 | v_current = v[0] 130 | reads.append((read_id, read_strand, v_current - left, len(v))) 131 | print( (read_id, read_strand), v_current, file=sys.stderr); 132 | for vv in v: 133 | if vv > v_current + 50: 134 | v_current = vv 135 | reads.append((read_id, read_strand, v_current - left, len(v))) 136 | print( (read_id, read_strand), v_current, file=sys.stderr); 137 | 138 | 139 | reads.sort(key=lambda x: x[2]) 140 | s = ref_idx[ctg]["offset"] + left 141 | ref_len = right-left 142 | 143 | bseq0 = refdb[s:s+ref_len] 144 | 145 | ref_seq = ffi.new("char[{}]".format(ref_len)) 146 | 147 | shimmer.decode_biseq(bseq0, ref_seq, ref_len, 0) 148 | 149 | tags = ffi.new("align_tags_t * [{}]".format(len(reads)+1)) 150 | 151 | # need a back bone for some boundary case 152 | aln = falcon.align(ref_seq, ref_len, 153 | ref_seq, ref_len, 154 | 50, 1) 155 | rng[0].s1 = aln.aln_q_s 156 | rng[0].e1 = aln.aln_q_e 157 | rng[0].s2 = aln.aln_t_s 158 | rng[0].e2 = aln.aln_t_e 159 | tag = falcon.get_align_tags(aln.q_aln_str, 160 | aln.t_aln_str, 161 | aln.aln_str_size, 162 | rng, 0, 0) 163 | aln_count = 0 164 | tags[aln_count] = tag 165 | aln_count += 1 166 | falcon.free_alignment(aln) 167 | 168 | aln_base = 0 169 | for d in reads: 170 | #print(d) 171 | read_id = d[0] 172 | read_strand = d[1] 173 | read_shift = int(d[2]) 174 | s = read_idx[read_id]["offset"] 175 | read_len = read_idx[read_id]["length"] 176 | bseq1 = seqdb[s:s+read_len] 177 | read_seq = ffi.new("char[{}]".format(read_len)) 178 | shimmer.decode_biseq(bseq1, read_seq, read_len, read_strand) 179 | 180 | aligned = False 181 | t_offset = 0 182 | if read_shift < 0: 183 | aln = falcon.align(read_seq[abs(read_shift):read_len], 184 | read_len - abs(read_shift), 185 | ref_seq, 186 | ref_len, 187 | 150, 1) 188 | 189 | if abs(abs(aln.aln_q_e-aln.aln_q_s) - 190 | (read_len - 
abs(read_shift))) < 48: 191 | aligned = True 192 | 193 | rng[0].s1 = aln.aln_q_s 194 | rng[0].e1 = aln.aln_q_e 195 | rng[0].s2 = aln.aln_t_s 196 | rng[0].e2 = aln.aln_t_e 197 | t_offset = 0 198 | else: 199 | falcon.free_alignment(aln) 200 | else: 201 | aln = falcon.align(read_seq, 202 | read_len, 203 | ref_seq[read_shift:ref_len], 204 | ref_len-read_shift, 205 | 150, 1) 206 | 207 | if abs(abs(aln.aln_q_e-aln.aln_q_s)-read_len) < 48 or \ 208 | abs(ref_len-read_shift-abs(aln.aln_q_e-aln.aln_q_s)) < 48: 209 | aligned = True 210 | rng[0].s1 = aln.aln_q_s 211 | rng[0].e1 = aln.aln_q_e 212 | rng[0].s2 = aln.aln_t_s 213 | rng[0].e2 = aln.aln_t_e 214 | t_offset = read_shift 215 | else: 216 | falcon.free_alignment(aln) 217 | if aligned: 218 | print(f"{read_id} is algined", 219 | rng[0].s1 , rng[0].e1, rng[0].s2, rng[0].e2, file=sys.stderr) 220 | # print(ffi.string(aln.q_aln_str), file=sys.stderr) 221 | # rint(ffi.string(aln.t_aln_str), file=sys.stderr) 222 | sys.stderr.flush() 223 | tag = falcon.get_align_tags(aln.q_aln_str, 224 | aln.t_aln_str, 225 | aln.aln_str_size, 226 | rng, 0, t_offset) 227 | tags[aln_count] = tag 228 | aln_count += 1 229 | aln_base += abs(rng[0].e2 - rng[0].s2) 230 | falcon.free_alignment(aln) 231 | ffi.release(read_seq) 232 | aln_cov = aln_base/ref_len 233 | print(f"aln_count:{aln_count}, aln_base: {aln_base}, aln_cov: {aln_cov}", file=sys.stderr) 234 | 235 | if aln_base/ref_len < 3: 236 | cns_seq = ffi.string(ref_seq) 237 | cns_seq = cns_seq.lower() 238 | else: 239 | cns = falcon.get_cns_from_align_tags(tags, 240 | aln_count, len(ref_seq), 1) 241 | cns_seq = ffi.string(cns.sequence) 242 | falcon.free_consensus_data(cns) 243 | 244 | cns_segments.append(cns_seq) 245 | 246 | for i in range(aln_count): 247 | falcon.free_align_tags(tags[i]) 248 | ffi.release(tags) 249 | ffi.release(ref_seq) 250 | 251 | s0 = cns_segments[0] 252 | stiched_segments = [s0] 253 | p = 0 254 | for s1 in cns_segments[1:]: 255 | aln = falcon.align(s0[-1000:], 1000, 256 | 
s1[:1050], 1050, 400, 0) 257 | # print(aln.aln_q_s, aln.aln_q_e, aln.aln_t_s, aln.aln_t_e, aln.dist) 258 | if aln.aln_q_e < 1000: 259 | stiched_segments[-1] = stiched_segments[-1][:-(1000-aln.aln_q_e)] 260 | 261 | stiched_segments.append(s1[aln.aln_t_e:]) 262 | p += len(s1[aln.aln_t_e:]) 263 | print("stiching point:", p, file=sys.stderr) 264 | print("aln.aln_q_e:", aln.aln_q_e, file=sys.stderr) 265 | print("aln.aln_t_e:", aln.aln_t_e, file=sys.stderr) 266 | # print(ffi.string(aln.q_aln_str), file=sys.stderr) 267 | # print(ffi.string(aln.t_aln_str), file=sys.stderr) 268 | s0 = s1 269 | falcon.free_alignment(aln) 270 | 271 | contig = b"".join(stiched_segments) 272 | print(">{}".format(ref_idx[ctg]["name"])) 273 | print(contig.decode("ascii")) 274 | ffi.release(rng) 275 | -------------------------------------------------------------------------------- /src/shmr_map.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "kalloc.h" 11 | #include "khash.h" 12 | #include "kvec.h" 13 | #include "shimmer.h" 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | extern char *optarg; 20 | extern int optind, opterr, optopt; 21 | 22 | #define handle_error(msg) \ 23 | do { \ 24 | perror(msg); \ 25 | exit(EXIT_FAILURE); \ 26 | } while (0) 27 | 28 | #define MMER_COUNT_LOWER_BOUND 1 29 | #define MMER_COUNT_UPPER_BOUND 240 30 | #ifndef ORIGINAL 31 | #define ORIGINAL 0 32 | #endif 33 | #ifndef REVERSED 34 | #define REVERSED 1 35 | #endif 36 | #define READ_END_FUZZINESS 48 37 | #define LOCAL_OVERLAP_UPPERBOUND 120 38 | #define ALNBANDSIZE 100 39 | 40 | KHASH_MAP_INIT_INT64(RPAIR, uint8_t); 41 | 42 | int mp128_comp(const void *a, const void *b) { 43 | mp128_t *a0 = (mp128_t *)a; 44 | mp128_t *b0 = (mp128_t *)b; 45 | return ((a0->y0 & 0xFFFFFFFF) >> 1) < ((b0->y0 & 0xFFFFFFFF) >> 1); 46 | } 47 | 48 | void process_map(char 
*refdb_file_path, char *seqdb_file_path, 49 | mm128_v *ref_mmers, khash_t(RLEN) * ref_lmap, 50 | khash_t(MMER0) * mmer0_map, khash_t(RLEN) * rlmap, 51 | khash_t(MMC) * mcmap, uint32_t lowerbound, 52 | uint32_t upperbound) { 53 | int rfd, sfd; 54 | struct stat rsb, ssb; 55 | uint8_t *rseq_p, *seq_p; 56 | mp128_v *mpv; 57 | 58 | khash_t(MMER1) * mmer1_map; 59 | /* map the reference and read sequence databases read-only; open/fstat failures abort via handle_error */ 60 | rfd = open(refdb_file_path, O_RDONLY); 61 | if (rfd == -1) handle_error("open"); 62 | 63 | if (fstat(rfd, &rsb) == -1) /* To obtain file size */ 64 | handle_error("fstat"); 65 | 66 | rseq_p = 67 | (uint8_t *)mmap((void *)0, rsb.st_size, PROT_READ, MAP_SHARED, rfd, 0); 68 | 69 | sfd = open(seqdb_file_path, O_RDONLY); 70 | if (sfd == -1) handle_error("open"); 71 | 72 | if (fstat(sfd, &ssb) == -1) /* To obtain file size */ 73 | handle_error("fstat"); 74 | 75 | seq_p = 76 | (uint8_t *)mmap((void *)0, ssb.st_size, PROT_READ, MAP_SHARED, sfd, 0); 77 | 78 | // clock_t begin = clock(); 79 | // clock_t end; 80 | mm128_t mmer0, mmer1; 81 | khiter_t k; 82 | /* advance s to the first reference shimmer whose hash is a key of mmer0_map */ 83 | size_t s = 0; 84 | assert(ref_mmers->n > 0); 85 | for (;;) { 86 | if (s >= ref_mmers->n) break; /* bounds check first: a[s] must not be read at s == n */ 87 | mmer0 = ref_mmers->a[s]; 88 | k = kh_get(MMER0, mmer0_map, mmer0.x); 89 | if (k != kh_end(mmer0_map)) break; 90 | s++; 91 | } 92 | /* walk the remaining reference shimmers; mmer0 is the previous anchor, mmer1 the current candidate */ 93 | for (size_t i = s + 1; i < ref_mmers->n; i++) { 94 | mmer1 = ref_mmers->a[i]; 95 | uint64_t mhash = mmer1.x >> 8; 96 | k = kh_get(MMC, mcmap, mhash); 97 | if (k == kh_end(mcmap)) continue; 98 | uint32_t mcount = kh_val(mcmap, k); 99 | if (mcount < lowerbound || mcount > upperbound) continue; 100 | 101 | if ((mmer0.y >> 32) != (mmer1.y >> 32)) { 102 | mmer0 = mmer1; 103 | continue; // the two shimmers are on different reference sequences; restart from mmer1 104 | } 105 | 106 | k = kh_get(MMER0, mmer0_map, mmer0.x); 107 | if (k == kh_end(mmer0_map)) { 108 | mmer0 = mmer1; 109 | continue; 110 | } 111 | 112 | mmer1_map = kh_val(mmer0_map, k); 113 | k = kh_get(MMER1, mmer1_map, mmer1.x); 114 | if (k == kh_end(mmer1_map)) { 115 | mmer0 = mmer1; 116 | continue; 117 | } 118 | 
119 | if (((mmer1.y & 0xFFFFFFFF) >> 1) - ((mmer0.y & 0xFFFFFFFF) >> 1) < 100) { /* same position extraction as ref_bgn/ref_end below */ 120 | mmer0 = mmer1; 121 | continue; 122 | } 123 | 124 | mpv = kh_val(mmer1_map, k); 125 | 126 | uint32_t ref_id; 127 | uint32_t ref_bgn; 128 | uint32_t ref_end; 129 | ref_id = (uint32_t)(mmer0.y >> 32); 130 | ref_bgn = (uint32_t)((mmer0.y & 0xFFFFFFFF) >> 1); 131 | ref_end = (uint32_t)((mmer1.y & 0xFFFFFFFF) >> 1); 132 | 133 | for (size_t j = 0; j < mpv->n; j++) { 134 | uint32_t read_id; 135 | uint32_t read_bgn; 136 | uint32_t read_end; 137 | uint8_t read_direction; 138 | 139 | read_id = mpv->a[j].y0 >> 32; 140 | read_bgn = (uint32_t)((mpv->a[j].y0 & 0xFFFFFFFF) >> 1); 141 | read_end = (uint32_t)((mpv->a[j].y1 & 0xFFFFFFFF) >> 1); 142 | read_direction = mpv->a[j].direction; 143 | assert(read_bgn < read_end); 144 | 145 | uint64_t mhash = mmer0.x >> 8; 146 | k = kh_get(MMC, mcmap, mhash); 147 | assert(k != kh_end(mcmap)); 148 | uint32_t mcount0 = kh_val(mcmap, k); 149 | mhash = mmer1.x >> 8; 150 | k = kh_get(MMC, mcmap, mhash); 151 | assert(k != kh_end(mcmap)); 152 | uint32_t mcount1 = kh_val(mcmap, k); 153 | printf("%u %u %u %u %u %u %d %u %u\n", ref_id, ref_bgn, ref_end, read_id, 154 | read_bgn, read_end, read_direction, mcount0, mcount1); 155 | } 156 | mmer0 = mmer1; 157 | } 158 | 159 | munmap(rseq_p, rsb.st_size); 160 | munmap(seq_p, ssb.st_size); 161 | } 162 | /* Entry point: parse options, load the reference and read shimmer lists and the shimmer counts, build the pair map, then print matches through process_map. */ 163 | int main(int argc, char *argv[]) { 164 | char *refdb_prefix = NULL; 165 | char *seqdb_prefix = NULL; 166 | char *ref_shimmer_prefix = NULL; 167 | char *shimmer_prefix = NULL; 168 | 169 | char mmc_file_path[8192]; 170 | char mmer_file_path[8192]; 171 | char ref_idx_file_path[8192]; 172 | char refdb_file_path[8192]; 173 | char seq_idx_file_path[8192]; 174 | char seqdb_file_path[8192]; 175 | int c; 176 | uint32_t total_chunk = 1, mychunk = 1; 177 | 178 | uint32_t mc_upper = MMER_COUNT_UPPER_BOUND; 179 | uint32_t mc_lower = MMER_COUNT_LOWER_BOUND; 180 | 181 | wordexp_t p; 182 | char **mmc_fns; 183 | char **shimmer_fns; 184 | 185 
| mm128_v ref_mmers = {0, 0, 0}; 186 | mm128_v mmers = {0, 0, 0}; 187 | mm128_v mmers_; 188 | mm_count_v mmc; 189 | 190 | khash_t(RLEN) * ref_lmap; 191 | khash_t(RLEN) * rlmap; 192 | khash_t(MMC) *mcmap = kh_init(MMC); 193 | 194 | khash_t(MMER0) * mmer0_map; 195 | khash_t(MMER1) * mmer1_map; 196 | 197 | mp128_v *mpv; 198 | 199 | opterr = 0; 200 | 201 | while ((c = getopt(argc, argv, "r:m:p:l:M:n:t:c:b:")) != -1) { 202 | switch (c) { 203 | case 'r': 204 | refdb_prefix = optarg; 205 | break; 206 | case 'm': 207 | ref_shimmer_prefix = optarg; 208 | break; 209 | case 'p': 210 | seqdb_prefix = optarg; 211 | break; 212 | case 'l': 213 | shimmer_prefix = optarg; 214 | break; 215 | case 'M': 216 | mc_upper = atoi(optarg); 217 | break; 218 | case 'n': 219 | mc_lower = atoi(optarg); 220 | break; 221 | case 't': 222 | total_chunk = atoi(optarg); 223 | break; 224 | case 'c': 225 | mychunk = atoi(optarg); 226 | break; 227 | case '?': 228 | if (optopt == 'r') { 229 | fprintf(stderr, 230 | "Option -%c not specified, using 'ref' as the ref sequence " 231 | "db prefix\n", 232 | optopt); 233 | } 234 | if (optopt == 'p') { 235 | fprintf(stderr, 236 | "Option -%c not specified, using 'seq_dataset' as the " 237 | "sequence db prefix\n", 238 | optopt); 239 | } 240 | if (optopt == 'l') { 241 | fprintf(stderr, 242 | "Option -%c not specified, using 'shimmer-L2' as the L2 " 243 | "index prefix\n", 244 | optopt); 245 | } 246 | return 1; 247 | default: 248 | abort(); 249 | } 250 | } 251 | 252 | assert(total_chunk > 0); 253 | assert(mychunk > 0 && mychunk <= total_chunk); 254 | 255 | if (refdb_prefix == NULL) { 256 | refdb_prefix = (char *)calloc(8192, 1); 257 | snprintf(refdb_prefix, 8191, "ref"); 258 | } 259 | 260 | if (ref_shimmer_prefix == NULL) { 261 | ref_shimmer_prefix = (char *)calloc(8192, 1); 262 | snprintf(ref_shimmer_prefix, 8191, "ref-L2"); 263 | } 264 | 265 | if (seqdb_prefix == NULL) { 266 | seqdb_prefix = (char *)calloc(8192, 1); 267 | snprintf(seqdb_prefix, 8191, 
"seq_dataset"); 268 | } 269 | 270 | if (shimmer_prefix == NULL) { 271 | shimmer_prefix = (char *)calloc(8192, 1); 272 | snprintf(shimmer_prefix, 8191, "shimmer-L2"); 273 | } 274 | 275 | int written; 276 | written = snprintf(ref_idx_file_path, sizeof(ref_idx_file_path), "%s.idx", 277 | refdb_prefix); 278 | assert(written < sizeof(ref_idx_file_path)); 279 | fprintf(stderr, "using ref index file: %s\n", ref_idx_file_path); 280 | 281 | ref_lmap = get_read_length_map(ref_idx_file_path); 282 | 283 | written = snprintf(refdb_file_path, sizeof(refdb_file_path), "%s.seqdb", 284 | refdb_prefix); 285 | assert(written < sizeof(refdb_file_path)); 286 | fprintf(stderr, "using ref seqdb file: %s\n", refdb_file_path); 287 | 288 | written = snprintf(mmer_file_path, sizeof(mmer_file_path), 289 | "%s-[0-9]*-of-[0-9]*.dat", ref_shimmer_prefix); 290 | assert(written < sizeof(mmer_file_path)); 291 | wordexp(mmer_file_path, &p, 0); 292 | shimmer_fns = p.we_wordv; 293 | for (int i = 0; i < p.we_wordc; i++) { 294 | fprintf(stderr, "using ref shimmer data file: %s\n", shimmer_fns[i]); 295 | mmers_ = read_mmlist(shimmer_fns[i]); 296 | fprintf(stderr, "number of shimmers load: %lu\n", mmers_.n); 297 | append_mmlist(&ref_mmers, &mmers_); 298 | kv_destroy(mmers_); 299 | } 300 | wordfree(&p); 301 | 302 | written = snprintf(seq_idx_file_path, sizeof(seq_idx_file_path), "%s.idx", 303 | seqdb_prefix); 304 | assert(written < sizeof(seq_idx_file_path)); 305 | fprintf(stderr, "using index file: %s\n", seq_idx_file_path); 306 | 307 | rlmap = get_read_length_map(seq_idx_file_path); 308 | 309 | written = snprintf(seqdb_file_path, sizeof(seqdb_file_path), "%s.seqdb", 310 | seqdb_prefix); 311 | assert(written < sizeof(seqdb_file_path)); 312 | fprintf(stderr, "using seqdb file: %s\n", seqdb_file_path); 313 | 314 | written = snprintf(mmer_file_path, sizeof(mmer_file_path), 315 | "%s-[0-9]*-of-[0-9]*.dat", shimmer_prefix); 316 | 317 | assert(written < sizeof(mmer_file_path)); 318 | wordexp(mmer_file_path, &p, 
0); 319 | shimmer_fns = p.we_wordv; 320 | for (int i = 0; i < p.we_wordc; i++) { 321 | fprintf(stderr, "using shimmer data file: %s\n", shimmer_fns[i]); 322 | mmers_ = read_mmlist(shimmer_fns[i]); 323 | fprintf(stderr, "number of shimmers load: %lu\n", mmers_.n); 324 | append_mmlist(&mmers, &mmers_); 325 | kv_destroy(mmers_); 326 | } 327 | wordfree(&p); 328 | /* static: a buffer handed to setvbuf must outlive the stream, and stdout is only closed after main returns */ 329 | static char buffer[32768]; 330 | 331 | setvbuf(stdout, buffer, _IOFBF, sizeof(buffer)); 332 | 333 | written = snprintf(mmc_file_path, sizeof(mmc_file_path), 334 | "%s-MC-[0-9]*-of-[0-9]*.dat", shimmer_prefix); 335 | 336 | assert(written < sizeof(mmc_file_path)); 337 | wordexp(mmc_file_path, &p, 0); 338 | mmc_fns = p.we_wordv; 339 | for (int i = 0; i < p.we_wordc; i++) { 340 | fprintf(stderr, "using shimmer count file: %s\n", mmc_fns[i]); 341 | mmc = read_mm_count(mmc_fns[i]); 342 | aggregate_mm_count(mcmap, &mmc); 343 | kv_destroy(mmc); 344 | } 345 | 346 | wordfree(&p); 347 | 348 | mmer0_map = kh_init(MMER0); 349 | 350 | build_map(&mmers, mmer0_map, rlmap, mcmap, mychunk, total_chunk, mc_lower, 351 | mc_upper); 352 | 353 | process_map(refdb_file_path, seqdb_file_path, &ref_mmers, ref_lmap, mmer0_map, 354 | rlmap, mcmap, mc_lower, mc_upper); 355 | /* release the two-level pair map: every pair vector, then each inner table, then the outer table */ 356 | for (khiter_t __i = kh_begin(mmer0_map); __i != kh_end(mmer0_map); ++__i) { 357 | if (!kh_exist(mmer0_map, __i)) continue; 358 | mmer1_map = kh_val(mmer0_map, __i); 359 | for (khiter_t __j = kh_begin(mmer1_map); __j != kh_end(mmer1_map); ++__j) { 360 | if (!kh_exist(mmer1_map, __j)) continue; 361 | mpv = kh_val(mmer1_map, __j); 362 | kv_destroy(*mpv); 363 | } 364 | kh_destroy(MMER1, mmer1_map); 365 | } 366 | kh_destroy(RLEN, ref_lmap); 367 | kh_destroy(MMER0, mmer0_map); 368 | kh_destroy(MMC, mcmap); 369 | kh_destroy(RLEN, rlmap); 370 | kv_destroy(mmers); 371 | kv_destroy(ref_mmers); 372 | fflush(stdout); 373 | } 374 | -------------------------------------------------------------------------------- /falcon/DW_banded.c: 
-------------------------------------------------------------------------------- 1 | 2 | /* 3 | * ===================================================================================== 4 | * 5 | * Filename: DW_banded.c 6 | * 7 | * Description: A banded version for the O(ND) greedy sequence alignment algorithm 8 | * 9 | * Version: 0.1 10 | * Created: 07/20/2013 17:00:00 11 | * Revision: none 12 | * Compiler: gcc 13 | * 14 | * Author: Jason Chin, 15 | * Company: 16 | * 17 | * ===================================================================================== 18 | 19 | #################################################################################$$ 20 | # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc. 21 | # 22 | # All rights reserved. 23 | # 24 | # Redistribution and use in source and binary forms, with or without 25 | # modification, are permitted (subject to the limitations in the 26 | # disclaimer below) provided that the following conditions are met: 27 | # 28 | # * Redistributions of source code must retain the above copyright 29 | # notice, this list of conditions and the following disclaimer. 30 | # 31 | # * Redistributions in binary form must reproduce the above 32 | # copyright notice, this list of conditions and the following 33 | # disclaimer in the documentation and/or other materials provided 34 | # with the distribution. 35 | # 36 | # * Neither the name of Pacific Biosciences nor the names of its 37 | # contributors may be used to endorse or promote products derived 38 | # from this software without specific prior written permission. 39 | # 40 | # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE 41 | # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC 42 | # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 43 | # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 44 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 45 | # DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS 46 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 47 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 48 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 49 | # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 50 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 51 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 52 | # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 53 | # SUCH DAMAGE. 54 | #################################################################################$$ 55 | 56 | 57 | */ 58 | 59 | #include 60 | #include 61 | #include 62 | #include 63 | #include "common.h" 64 | 65 | int compare_d_path(const void * a, const void * b) 66 | { 67 | const d_path_data2 * arg1 = a; 68 | const d_path_data2 * arg2 = b; 69 | if (arg1->d - arg2->d == 0) { 70 | return arg1->k - arg2->k; 71 | } else { 72 | return arg1->d - arg2->d; 73 | } 74 | } 75 | 76 | 77 | void d_path_sort( d_path_data2 * base, unsigned long max_idx) { 78 | qsort(base, max_idx, sizeof(d_path_data2), compare_d_path); 79 | } 80 | 81 | d_path_data2 * get_dpath_idx( seq_coor_t d, seq_coor_t k, unsigned long max_idx, d_path_data2 * base) { 82 | d_path_data2 d_tmp; 83 | d_path_data2 *rtn; 84 | d_tmp.d = d; 85 | d_tmp.k = k; 86 | rtn = (d_path_data2 *) bsearch( &d_tmp, base, max_idx, sizeof(d_path_data2), compare_d_path); 87 | //printf("dp %ld %ld %ld %ld %ld %ld %ld\n", (rtn)->d, (rtn)->k, (rtn)->x1, (rtn)->y1, (rtn)->x2, (rtn)->y2, (rtn)->pre_k); 88 | 89 | return rtn; 90 | 91 | } 92 | 93 | void print_d_path( d_path_data2 * base, unsigned long max_idx) { 94 | unsigned long idx; 95 | for (idx = 0; idx < max_idx; idx++){ 96 | printf("dp %ld %d %d %d %d %d %d %d\n",idx, 97 | (base+idx)->d, (base+idx)->k, 98 | (base+idx)->x1, (base+idx)->y1, 99 | (base+idx)->x2, (base+idx)->y2, (base+idx)->pre_k); 100 | } 101 | } 102 | 103 | 104 | 
/*
 * Banded O(ND) greedy alignment of `query_seq` against `target_seq`.
 *
 * Both sequences are aligned starting from position 0. The search gives up
 * once the edit distance d reaches max_d = 30% of (q_len + t_len), or once
 * the band of active diagonals becomes wider than 2 * band_tolerance.
 *
 * Parameters:
 *   query_seq, q_len    query bases and their count
 *   target_seq, t_len   target bases and their count
 *   band_tolerance      a diagonal stays in the band only while its x + y
 *                       is within band_tolerance of the best x + y seen
 *   get_aln_str         when > 0, the gapped alignment strings are built
 *
 * Returns a newly allocated alignment that the caller releases with
 * free_alignment():
 *   - when the end of either sequence was reached: aln_q_e / aln_t_e hold
 *     the end coordinates, dist the number of differences and aln_str_size
 *     the alignment length ((x + y + d) / 2, or the length of the strings
 *     actually written when get_aln_str > 0);
 *   - when no alignment was found: all coordinates and aln_str_size are 0
 *     and both strings are empty;
 *   - when the traceback could not be reconstructed: aln_str_size is 0 and
 *     both strings are empty, while aln_q_e / aln_t_e / dist stay set.
 * Returns NULL when a length or band_tolerance is negative or when memory
 * could not be allocated.
 */
alignment * align(char * query_seq, seq_coor_t q_len,
                  char * target_seq, seq_coor_t t_len,
                  seq_coor_t band_tolerance,
                  int get_aln_str) {
    seq_coor_t * V = NULL;   // V[k + k_offset]: furthest x reached on diagonal k (k = x - y)
    seq_coor_t * U = NULL;   // U[k + k_offset]: x + y reached on diagonal k
    seq_coor_t k_offset;
    seq_coor_t d;
    seq_coor_t k, k2;
    seq_coor_t best_m;       // the best x + y seen so far
    seq_coor_t min_k, new_min_k;
    seq_coor_t max_k, new_max_k;
    seq_coor_t pre_k;
    seq_coor_t x = 0, y = 0;
    seq_coor_t cd;
    seq_coor_t ck;
    seq_coor_t cx, cy, nx, ny;
    seq_coor_t max_d;
    seq_coor_t band_size;
    unsigned long d_path_idx = 0;
    unsigned long max_idx = 0;
    size_t v_size;
    size_t d_path_size;
    size_t aln_size;

    d_path_data2 * d_path = NULL;
    d_path_data2 * d_path_aux;
    path_point * aln_path = NULL;
    seq_coor_t aln_path_idx;
    alignment * align_rtn = NULL;
    seq_coor_t aln_pos;
    seq_coor_t i;
    bool aligned = false;
    bool traced;

    // Negative sizes would turn into huge allocation requests below.
    if (q_len < 0 || t_len < 0 || band_tolerance < 0) {
        return NULL;
    }

    // Sum in double so that q_len + t_len cannot overflow seq_coor_t.
    max_d = (int) (0.3 * ((double) q_len + (double) t_len));

    band_size = band_tolerance * 2;

    // All buffer sizes are computed in size_t to avoid integer overflow for
    // long sequences or wide bands.
    v_size = (size_t) max_d * 2 + 1;
    d_path_size = (size_t) max_d * ((size_t) band_size + 1) * 2 + 1;
    aln_size = (size_t) q_len + (size_t) t_len + 1;

    V = calloc( v_size, sizeof(seq_coor_t) );
    U = calloc( v_size, sizeof(seq_coor_t) );

    k_offset = max_d;

    // We should probably use a hashmap to store the backtracing information to save memory allocation time.
    // This block allocation scheme is convenient for now but it is slower for very long sequences.
    d_path = calloc( d_path_size, sizeof(d_path_data2) );

    aln_path = calloc( aln_size, sizeof(path_point) );

    align_rtn = calloc( 1, sizeof(alignment));
    if (align_rtn != NULL) {
        align_rtn->t_aln_str = calloc( aln_size, sizeof(char));
        align_rtn->q_aln_str = calloc( aln_size, sizeof(char));
    }

    // Bail out cleanly if any allocation failed instead of dereferencing NULL.
    if (V == NULL || U == NULL || d_path == NULL || aln_path == NULL ||
        align_rtn == NULL ||
        align_rtn->t_aln_str == NULL || align_rtn->q_aln_str == NULL) {
        free(V);
        free(U);
        free(d_path);
        free(aln_path);
        if (align_rtn != NULL) {
            free(align_rtn->q_aln_str);
            free(align_rtn->t_aln_str);
            free(align_rtn);
        }
        return NULL;
    }

    align_rtn->aln_str_size = 0;
    align_rtn->aln_q_s = 0;
    align_rtn->aln_q_e = 0;
    align_rtn->aln_t_s = 0;
    align_rtn->aln_t_e = 0;

    best_m = -1;
    min_k = 0;
    max_k = 0;
    d_path_idx = 0;
    max_idx = 0;
    for (d = 0; d < max_d; d ++ ) {
        // Give up once the band of active diagonals has grown too wide.
        if (max_k - min_k > band_size) {
            break;
        }

        for (k = min_k; k <= max_k; k += 2) {

            // Pick the neighbouring diagonal to extend from: k + 1 keeps x
            // (a step in y), k - 1 advances x by one (a step in x).
            if ( (k == min_k) || ((k != max_k) && (V[ k - 1 + k_offset ] < V[ k + 1 + k_offset])) ) {
                pre_k = k + 1;
                x = V[ k + 1 + k_offset];
            } else {
                pre_k = k - 1;
                x = V[ k - 1 + k_offset] + 1;
            }
            y = x - k;
            d_path[d_path_idx].d = d;
            d_path[d_path_idx].k = k;
            d_path[d_path_idx].x1 = x;
            d_path[d_path_idx].y1 = y;

            // Slide along the diagonal while the bases match.
            while ( x < q_len && y < t_len && query_seq[x] == target_seq[y] ){
                x++;
                y++;
            }

            d_path[d_path_idx].x2 = x;
            d_path[d_path_idx].y2 = y;
            d_path[d_path_idx].pre_k = pre_k;
            d_path_idx ++;

            V[ k + k_offset ] = x;
            U[ k + k_offset ] = x + y;

            if ( x + y > best_m) {
                best_m = x + y;
            }

            // Reaching the end of either sequence completes the alignment.
            if ( x >= q_len || y >= t_len) {
                aligned = true;
                max_idx = d_path_idx;
                break;
            }
        }

        // For banding: keep only the diagonals whose progress is within
        // band_tolerance of the best one, then widen by one on each side.
        new_min_k = max_k;
        new_max_k = min_k;

        for (k2 = min_k; k2 <= max_k; k2 += 2) {
            if (U[ k2 + k_offset] >= best_m - band_tolerance ) {
                if ( k2 < new_min_k ) {
                    new_min_k = k2;
                }
                if ( k2 > new_max_k ) {
                    new_max_k = k2;
                }
            }
        }

        max_k = new_max_k + 1;
        min_k = new_min_k - 1;

        // For no banding
        // max_k ++;
        // min_k --;

        if (aligned == true) {
            align_rtn->aln_q_e = x;
            align_rtn->aln_t_e = y;
            align_rtn->dist = d;
            align_rtn->aln_str_size = (x + y + d) / 2;
            align_rtn->aln_q_s = 0;
            align_rtn->aln_t_s = 0;

            // get_dpath_idx() binary-searches, so the records must be sorted.
            d_path_sort(d_path, max_idx);

            if (get_aln_str > 0) {
                // Walk back from the final (d, k) to d == 0, recording the
                // end point and the start point of every snake.
                cd = d;
                ck = k;
                aln_path_idx = 0;
                traced = true;
                // Each step stores two points, so two free slots are needed.
                while (cd >= 0 && (size_t) aln_path_idx + 1 < aln_size) {
                    d_path_aux = get_dpath_idx( cd, ck, max_idx, d_path);
                    if (d_path_aux == NULL) {
                        // Missing record: the path cannot be reconstructed.
                        traced = false;
                        break;
                    }
                    aln_path[aln_path_idx].x = d_path_aux -> x2;
                    aln_path[aln_path_idx].y = d_path_aux -> y2;
                    aln_path_idx ++;
                    aln_path[aln_path_idx].x = d_path_aux -> x1;
                    aln_path[aln_path_idx].y = d_path_aux -> y1;
                    aln_path_idx ++;
                    ck = d_path_aux -> pre_k;
                    cd -= 1;
                }

                if (!traced || aln_path_idx == 0) {
                    // No usable path: report empty alignment strings.
                    align_rtn->aln_str_size = 0;
                } else {
                    // Replay the recorded points from the start of the
                    // alignment towards its end, emitting one column per base.
                    aln_path_idx --;
                    cx = aln_path[aln_path_idx].x;
                    cy = aln_path[aln_path_idx].y;
                    align_rtn->aln_q_s = cx;
                    align_rtn->aln_t_s = cy;
                    aln_pos = 0;
                    while ( aln_path_idx > 0 ) {
                        aln_path_idx --;
                        nx = aln_path[aln_path_idx].x;
                        ny = aln_path[aln_path_idx].y;
                        if (cx == nx && cy == ny){
                            continue;
                        }
                        if (nx == cx && ny != cy){ //advance in y: gap in the query
                            for (i = 0; i <  ny - cy; i++) {
                                align_rtn->q_aln_str[aln_pos + i] = '-';
                            }
                            for (i = 0; i <  ny - cy; i++) {
                                align_rtn->t_aln_str[aln_pos + i] = target_seq[cy + i];
                            }
                            aln_pos += ny - cy;
                        } else if (nx != cx && ny == cy){ //advance in x: gap in the target
                            for (i = 0; i <  nx - cx; i++) {
                                align_rtn->q_aln_str[aln_pos + i] = query_seq[cx + i];
                            }
                            for (i = 0; i <  nx - cx; i++) {
                                align_rtn->t_aln_str[aln_pos + i] = '-';
                            }
                            aln_pos += nx - cx;
                        } else { //advance in both: copy both sequences
                            for (i = 0; i <  nx - cx; i++) {
                                align_rtn->q_aln_str[aln_pos + i] = query_seq[cx + i];
                            }
                            for (i = 0; i <  ny - cy; i++) {
                                align_rtn->t_aln_str[aln_pos + i] = target_seq[cy + i];
                            }
                            aln_pos += ny - cy;
                        }
                        cx = nx;
                        cy = ny;
                    }
                    align_rtn->aln_str_size = aln_pos;
                }
            }
            break;
        }
    }

    free(V);
    free(U);
    free(d_path);
    free(aln_path);
    return align_rtn;
}


/*
 * Release an alignment returned by align(), including both alignment
 * strings. Passing NULL is allowed and does nothing.
 */
void free_alignment(alignment * aln) {
    if (aln == NULL) {
        return;
    }
    free(aln->q_aln_str);
    free(aln->t_aln_str);
    free(aln);
}