├── py
    ├── .gitattributes
    ├── MANIFEST.in
    ├── peregrine
    │   ├── __init__.py
    │   ├── build_falcon4py.py
    │   ├── build_shimmer4py.py
    │   └── utils.py
    ├── setup_pypy.py
    ├── setup.cfg
    ├── setup.py
    └── scripts
    │   ├── path_to_contig.py
    │   └── pg_asm_cns.py
├── misc
    ├── logo.png
    └── logo.svg
├── docker
    ├── entry_dev.sh
    ├── entry.sh
    ├── test
    │   ├── run_test.sh
    │   ├── Makefile
    │   └── simulate_reads.py
    ├── bashrc
    ├── install_with_conda.sh
    ├── Dockerfile
    ├── Dockerfile.dockerhub
    ├── LICENSE.minimap2
    └── LICENSE.falcon
├── test
    ├── genome_mapping
    │   ├── Makefile
    │   └── run_test.sh
    └── ecoli_K12
    │   ├── Makefile
    │   ├── simulate_reads.py
    │   ├── run_test.sh
    │   └── run_test_one_level.sh
├── nim-mini
    ├── mmer_count.py
    ├── mmer_graph.py
    └── dump_mmmer.nim
├── src
    ├── kalloc.h
    ├── shmr_end_filter.c
    ├── Makefile
    ├── shmr_reduce.c
    ├── shmr_gather_mc.c
    ├── kvec.h
    ├── shmr_dedup.c
    ├── shmr_mkseqdb.c
    ├── shimmer.h
    ├── shmr_align.c
    ├── shimmer4py.c
    ├── DWmatch.c
    ├── mm_sketch.c
    ├── kalloc.c
    ├── shmr_index.c
    ├── kseq.h
    └── shmr_map.c
├── falcon
    ├── kalloc.h
    ├── falcon.h
    ├── kvec.h
    ├── common.h
    ├── kalloc.c
    └── DW_banded.c
├── install_with_conda.sh
├── .github
    └── workflows
    │   ├── build_docker_image.yml
    │   └── build_docker_image_release.yml
├── py-utils
    ├── simread.py
    ├── check_ovlp.py
    ├── dump_L0.py
    ├── FastaReader.py
    └── process_L2.py
├── LICENSE.minimap2
├── LICENSE.falcon
└── README.md


/py/.gitattributes:
--------------------------------------------------------------------------------
1 | peregrine/_version.py export-subst
2 | 


--------------------------------------------------------------------------------
/misc/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cschin/Peregrine/HEAD/misc/logo.png


--------------------------------------------------------------------------------
/py/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include versioneer.py
2 | include peregrine/_version.py
3 | 


--------------------------------------------------------------------------------
/docker/entry_dev.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Development entry point: activate the "peregrine" conda environment,
3 | # then forward all container arguments to pg_run_dev.py.
4 | . /opt/conda/etc/profile.d/conda.sh
5 | conda activate peregrine
6 | # Quoted "$@" preserves arguments that contain whitespace.
7 | pg_run_dev.py "$@"
8 | 


--------------------------------------------------------------------------------
/py/peregrine/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from ._version import get_versions
3 | __version__ = get_versions()['version']
4 | sys.stderr.write(f"Peregrine Assembler & SHIMMER ASMKit({__version__})\n")
5 | del get_versions
6 | 


--------------------------------------------------------------------------------
/docker/entry.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Container entry point: "test" as the first argument runs the bundled
 3 | # test script; any other arguments are forwarded to pg_run.py.
 4 | . /opt/conda/etc/profile.d/conda.sh
 5 | conda activate peregrine
 6 | # Quote "$1" so the comparison stays well-formed when the container is
 7 | # started without arguments.
 8 | if [ "$1" == "test" ]; then
 9 |   cd /opt/test
10 |   bash /opt/test/run_test.sh
11 | else
12 |   # Quoted "$@" preserves arguments that contain whitespace.
13 |   pg_run.py "$@"
14 | fi
15 | 


--------------------------------------------------------------------------------
/test/genome_mapping/Makefile:
--------------------------------------------------------------------------------
 1 | all: test
 2 | bogus: clean test
 3 | 
 4 | test:
 5 | 	rm -rf ./wd ./logs
 6 | 	/usr/bin/time ./run_test.sh > all.log 2>&1;  mkdir -p logs;  mv *.log logs 
 7 | 
 8 | clean:
 9 | 	rm -rf ./wd/ ./logs/ seq_dataset.lst reads2ref.out ref2ref.out
10 | 


--------------------------------------------------------------------------------
/py/setup_pypy.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | import os
3 | os.environ["peregrine_base"] = os.path.abspath(os.path.pardir)
4 | 
5 | setup(name='peregrine_pypy',
6 |       version='0.1',
7 |       install_requires=["networkx==2.4"],
8 |       scripts = ["scripts/ovlp_to_graph.py", "scripts/graph_to_path.py"])
9 | 


--------------------------------------------------------------------------------
/py/setup.cfg:
--------------------------------------------------------------------------------
 1 | 
 2 | # See the docstring in versioneer.py for instructions. Note that you must
 3 | # re-run 'versioneer.py setup' after changing this section, and commit the
 4 | # resulting files.
 5 | 
 6 | [versioneer]
 7 | VCS = git
 8 | style = pep440
 9 | versionfile_source = peregrine/_version.py
10 | versionfile_build = peregrine/_version.py
11 | tag_prefix = pg
12 | #parentdir_prefix =
13 | 
14 | 


--------------------------------------------------------------------------------
/docker/test/run_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e -o pipefail
 3 | . /root/.bashrc
 4 | make simreads
 5 | make test-pypeflow
 6 | mkdir -p /wd
 7 | cp -a ./wd-pf/ /wd/ecoli_test_results/
 8 | cp K12MG1655.fa /wd/ecoli_test_results/
 9 | apt-get install -y mummer
10 | cd /wd/ecoli_test_results/
11 | dnadiff K12MG1655.fa p_ctg_cns.fa 
12 | echo 
13 | echo dnadiff output of the assembled contig to the e. coli genome used for the simulated reads
14 | cat out.report
15 | 


--------------------------------------------------------------------------------
/docker/test/Makefile:
--------------------------------------------------------------------------------
 1 | all: test test-pypeflow
 2 | bogus: simreads clean test test-pypeflow
 3 | 
 4 | K12MG1655.fa:
 5 | 	wget https://s3.amazonaws.com//biologicaldatascience.org/data/ecoli-k12/K12MG1655.fa 
 6 | 
 7 | simreads: K12MG1655.fa
 8 | 	mkdir -p ./reads
 9 | 	python simulate_reads.py
10 | 
11 | reads_0.fa: simreads
12 | 
13 | test-pypeflow:
14 | 	rm -rf ./wd-pf
15 | 	find ${PWD}/reads/ -name "reads_*.fa" > seq_dataset.lst
16 | 	echo yes | /usr/bin/time pg_run.py asm seq_dataset.lst 12 4 8 4 1 1 1 1 1 --with-consensus --output ./wd-pf
17 | 
18 | clean:
19 | 	rm -rf ./wd/ ./logs/ ./reads/ seq_dataset.lst
20 | 


--------------------------------------------------------------------------------
/nim-mini/mmer_count.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | fn = "preads4falcon_mer"
 4 | mer_count = {}
 5 | with open(fn) as f:
 6 |     for row in f:
 7 |         row = row.strip()
 8 |         if row[0] == ">":
 9 |             continue
10 |         row = row.split()
11 |         mer_count.setdefault(row[2], 0)
12 |         mer_count[row[2]] += 1
13 | 
14 | with open(fn) as f:
15 |     for row in f:
16 |         row = row.strip()
17 |         if row[0] == ">":
18 |             print(row)
19 |             continue
20 |         else:
21 |             row = row.split()
22 |             count = mer_count[row[2]]
23 |             print(" ".join(row), count)
24 | 
25 | 


--------------------------------------------------------------------------------
/src/kalloc.h:
--------------------------------------------------------------------------------
 1 | #ifndef _KALLOC_H_
 2 | #define _KALLOC_H_
 3 | 
 4 | #include <stddef.h> /* for size_t */
 5 | 
 6 | #ifdef __cplusplus
 7 | extern "C" {
 8 | #endif
 9 | 
10 | typedef struct {
11 | 	size_t capacity, available, n_blocks, n_cores, largest;
12 | } km_stat_t;
13 | 
14 | void *kmalloc(void *km, size_t size);
15 | void *krealloc(void *km, void *ptr, size_t size);
16 | void *kcalloc(void *km, size_t count, size_t size);
17 | void kfree(void *km, void *ptr);
18 | 
19 | void *km_init(void);
20 | void km_destroy(void *km);
21 | void km_stat(const void *_km, km_stat_t *s);
22 | 
23 | #ifdef __cplusplus
24 | }
25 | #endif
26 | 
27 | #endif
28 | 


--------------------------------------------------------------------------------
/falcon/kalloc.h:
--------------------------------------------------------------------------------
 1 | #ifndef _KALLOC_H_
 2 | #define _KALLOC_H_
 3 | 
 4 | #include <stddef.h> /* for size_t */
 5 | 
 6 | #ifdef __cplusplus
 7 | extern "C" {
 8 | #endif
 9 | 
10 | typedef struct {
11 | 	size_t capacity, available, n_blocks, n_cores, largest;
12 | } km_stat_t;
13 | 
14 | void *kmalloc(void *km, size_t size);
15 | void *krealloc(void *km, void *ptr, size_t size);
16 | void *kcalloc(void *km, size_t count, size_t size);
17 | void kfree(void *km, void *ptr);
18 | 
19 | void *km_init(void);
20 | void km_destroy(void *km);
21 | void km_stat(const void *_km, km_stat_t *s);
22 | 
23 | #ifdef __cplusplus
24 | }
25 | #endif
26 | 
27 | #endif
28 | 


--------------------------------------------------------------------------------
/docker/bashrc:
--------------------------------------------------------------------------------
 1 | # ~/.bashrc: executed by bash(1) for non-login shells.
 2 | 
 3 | # Note: PS1 and umask are already set in /etc/profile. You should not
 4 | # need this unless you want different defaults for root.
 5 | # PS1='${debian_chroot:+($debian_chroot)}\h:\w\$ '
 6 | # umask 022
 7 | 
 8 | # You may uncomment the following lines if you want `ls' to be colorized:
 9 | # export LS_OPTIONS='--color=auto'
10 | # eval "`dircolors`"
11 | # alias ls='ls $LS_OPTIONS'
12 | # alias ll='ls $LS_OPTIONS -l'
13 | # alias l='ls $LS_OPTIONS -lA'
14 | #
15 | # Some more alias to avoid making mistakes:
16 | # alias rm='rm -i'
17 | # alias cp='cp -i'
18 | # alias mv='mv -i'
19 | . /opt/conda/etc/profile.d/conda.sh
20 | conda activate peregrine
21 | 


--------------------------------------------------------------------------------
/install_with_conda.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | . ~/anaconda3/bin/activate
 3 | conda create -n peregrine -y python=3.7
 4 | 
 5 | conda activate peregrine
 6 | conda install -c conda-forge -y pypy3.6
 7 | 
 8 | pushd py
 9 | rm -rf .eggs/ dist/ build/ peregrine.egg-info/ peregrine_pypy.egg-info get-pip.py
10 | python3 setup.py install
11 | python3 setup.py clean --all
12 | popd
13 | git clone -b peregrine https://github.com/cschin/pypeFLOW.git
14 | pushd pypeFLOW
15 | python3 setup.py install
16 | popd
17 | pushd py
18 | wget -q https://bootstrap.pypa.io/get-pip.py
19 | pypy3 get-pip.py
20 | pypy3 setup_pypy.py install
21 | popd
22 | 
23 | pushd src
24 | make all
25 | make install
26 | popd
27 | 
28 | #python3 -m pip install cffi==1.12.2
29 | 


--------------------------------------------------------------------------------
/docker/install_with_conda.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Build and install Peregrine for the Docker image: create the "peregrine"
 3 | # conda environment, install the Python package and pypeFLOW, install the
 4 | # PyPy-side scripts, then build and install the C tools under src/.
 5 | . /opt/conda/bin/activate
 6 | conda create -n peregrine -y python=3.7
 7 | 
 8 | conda activate peregrine
 9 | conda install -c conda-forge -y pypy3.6
10 | 
11 | pushd py
12 | rm -rf .eggs/ dist/ build/ peregrine.egg-info/ peregrine_pypy.egg-info get-pip.py
13 | python3 setup.py install
14 | python3 setup.py clean --all
15 | popd
16 | git clone -b peregrine https://github.com/cschin/pypeFLOW.git
17 | pushd pypeFLOW
18 | python3 setup.py install
19 | popd
20 | pushd py
21 | # Fetch get-pip.py once; a repeated wget only leaves a stray get-pip.py.1.
22 | wget -q https://bootstrap.pypa.io/get-pip.py
23 | pypy3 get-pip.py
24 | pypy3 setup_pypy.py install
25 | popd
26 | 
27 | pushd src
28 | make all
29 | make install
30 | popd
31 | 
32 | #python3 -m pip install cffi==1.12.2
33 | 


--------------------------------------------------------------------------------
/py/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | import versioneer
 3 | import os
 4 | os.environ["peregrine_base"] = os.path.abspath(os.path.pardir)
 5 | 
 6 | setup(name='peregrine',
 7 |       version=versioneer.get_version(),
 8 |       cmdclass=versioneer.get_cmdclass(),
 9 |       packages=['peregrine'],
10 |       package_dir = {'peregrine': 'peregrine'},
11 |       scripts = ["scripts/path_to_contig.py",
12 |                  "scripts/pg_asm_cns.py",
13 |                  "scripts/pg_run.py",
14 |                  "scripts/pg_run_dev.py"],
15 |       setup_requires=["cffi>=1.12.0",
16 |                       "versioneer==0.18"],
17 |       cffi_modules=["peregrine/build_shimmer4py.py:ffibuilder",
18 |                     "peregrine/build_falcon4py.py:ffibuilder"],
19 |       install_requires=["cffi>=1.12.0",
20 |                         "docopt>=0.6.2",
21 |                         "numpy>=1.16.2"])
22 | 


--------------------------------------------------------------------------------
/src/shmr_end_filter.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdint.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <string.h>
 6 | #define __STDC_LIMIT_MACROS
 7 | #include "khash.h"
 8 | #include "kvec.h"
 9 | #include "shimmer.h"
10 | 
11 | /*
12 |  * Split the minimizers in `p` by proximity to the ends of their reads.
13 |  *
14 |  * For each entry the read id is taken from the upper 32 bits of `y`, the
15 |  * position from bits 1..31 of `y` (plus one), and the span from the low
16 |  * byte of `x`. Entries with pos < end_length are appended to `p_out_5`;
17 |  * entries with (read length - pos + span) < end_length are appended to
18 |  * `p_out_3`. An entry can be appended to both outputs.
19 |  *
20 |  * Entries whose read id is not present in `rlmap` are skipped, because
21 |  * kh_value() is only meaningful for an iterator that refers to an
22 |  * existing entry.
23 |  */
24 | void mm_end_filter(mm128_v *p, mm128_v *p_out_5, mm128_v *p_out_3,
25 |                    khash_t(RLEN) * rlmap, uint32_t end_length) {
26 |   uint32_t idx;
27 |   uint32_t rid;
28 |   uint32_t rlen;
29 |   uint32_t pos, r_pos, span;
30 |   khiter_t k;
31 |   mm128_t mmer;
32 | 
33 |   for (idx = 0; idx < p->n; idx++) {
34 |     mmer = p->a[idx];
35 |     rid = mmer.y >> 32;
36 |     span = mmer.x & 0xFF;
37 |     k = kh_get(RLEN, rlmap, rid);
38 |     if (k == kh_end(rlmap)) {
39 |       continue; /* unknown read id: no length to compare against */
40 |     }
41 |     rlen = kh_value(rlmap, k).len;
42 |     pos = ((mmer.y & 0xFFFFFFFF) >> 1) + 1;
43 |     r_pos = rlen - pos + span;
44 |     if (pos < end_length) {
45 |       kv_push(mm128_t, NULL, *p_out_5, mmer);
46 |     }
47 |     if (r_pos < end_length) {
48 |       kv_push(mm128_t, NULL, *p_out_3, mmer);
49 |     }
50 |   }
51 | }
52 | 


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM continuumio/miniconda3
 2 | RUN apt-get update
 3 | RUN apt-get install -y build-essential zlib1g zlib1g-dev
 4 | RUN mkdir /opt/build
 5 | COPY src.tgz /opt/build
 6 | COPY install_with_conda.sh /opt/build
 7 | RUN cd /opt/build; tar zxvf src.tgz; bash install_with_conda.sh
 8 | RUN . /opt/conda/bin/activate; conda clean --all
 9 | 
10 | FROM continuumio/miniconda3
11 | COPY --from=0 /opt/conda /opt/conda
12 | RUN apt-get update
13 | RUN apt-get install -y parallel time
14 | RUN . /opt/conda/bin/activate; conda activate peregrine; python3 -m pip install cffi==1.12.2
15 | RUN apt-get install -y make
16 | RUN mkdir /opt/licenses
17 | COPY LICENSE /opt/licenses/LICENSE
18 | COPY LICENSE.falcon /opt/licenses/LICENSE.falcon
19 | COPY LICENSE.minimap2 /opt/licenses/LICENSE.minimap2
20 | RUN mkdir /opt/test
21 | COPY test/Makefile /opt/test
22 | COPY test/run_test.sh /opt/test
23 | COPY test/simulate_reads.py /opt/test
24 | COPY bashrc /root/.bashrc
25 | COPY entry.sh /opt/
26 | COPY entry_dev.sh /opt/
27 | WORKDIR /opt/test
28 | ENTRYPOINT ["/opt/entry.sh"]
29 | 


--------------------------------------------------------------------------------
/test/ecoli_K12/Makefile:
--------------------------------------------------------------------------------
 1 | all: test test-pypeflow
 2 | bogus: simreads clean test test-pypeflow
 3 | 
 4 | K12MG1655.fa:
 5 | 	wget https://www.dropbox.com/s/wqqnzachbdk4d3r/K12MG1655.fa
 6 | 
 7 | simreads: K12MG1655.fa
 8 | 	mkdir -p ./reads
 9 | 	python simulate_reads.py
10 | 
11 | reads_0.fa: simreads
12 | 
13 | test:
14 | 	rm -rf ./wd ./logs
15 | 	/usr/bin/time ./run_test.sh > all.log 2>&1;  mkdir -p logs;  mv *.log logs 
16 | 
17 | test-pypeflow:
18 | 	rm -rf ./wd-pf
19 | 	find ${PWD}/reads/ -name "reads_*.fa" > seq_dataset.lst
20 | 	/usr/bin/time pg_run.py asm seq_dataset.lst 12 4 8 4 1 1 1 1 1 --with-consensus --output ./wd-pf
21 | 
22 | test-pypeflow-with-L0:
23 | 	rm -rf ./wd-pf
24 | 	find ${PWD}/reads/ -name "reads_*.fa" > seq_dataset.lst
25 | 	/usr/bin/time pg_run.py asm seq_dataset.lst 12 4 8 4 1 1 1 1 1 --with-L0-index --with-consensus --output ./wd-pf
26 | 
27 | test-pypeflow-l1:
28 | 	rm -rf ./wd-pf-l1
29 | 	find ${PWD}/reads/ -name "reads_*.fa" > seq_dataset.lst
30 | 	/usr/bin/time pg_run.py asm seq_dataset.lst 12 4 8 4 1 1 1 1 1 --shimmer-r 24 --with-consensus --shimmer-l 1 --output ./wd-pf-l1
31 | 
32 | clean:
33 | 	rm -rf ./wd/ ./logs/ ./reads/ seq_dataset.lst
34 | 


--------------------------------------------------------------------------------
/docker/Dockerfile.dockerhub:
--------------------------------------------------------------------------------
 1 | FROM continuumio/miniconda3
 2 | RUN apt-get update
 3 | RUN apt-get install -y build-essential zlib1g zlib1g-dev
 4 | RUN mkdir /opt/build
 5 | COPY docker/install_with_conda.sh /opt/build
 6 | COPY py/ /opt/build/py/
 7 | COPY src/ /opt/build/src/
 8 | COPY falcon/ /opt/build/falcon/
 9 | RUN cd /opt/build; bash install_with_conda.sh
10 | RUN . /opt/conda/bin/activate; conda clean --all
11 | 
12 | FROM continuumio/miniconda3
13 | COPY --from=0 /opt/conda /opt/conda
14 | RUN apt-get update
15 | RUN apt-get install -y parallel time
16 | RUN . /opt/conda/bin/activate; conda activate peregrine; python3 -m pip install cffi==1.12.2
17 | RUN apt-get install -y make
18 | RUN mkdir /opt/licenses
19 | COPY docker/LICENSE /opt/licenses/LICENSE
20 | COPY docker/LICENSE.falcon /opt/licenses/LICENSE.falcon
21 | COPY docker/LICENSE.minimap2 /opt/licenses/LICENSE.minimap2
22 | RUN mkdir /opt/test
23 | COPY docker/test/Makefile /opt/test
24 | COPY docker/test/run_test.sh /opt/test
25 | COPY docker/test/simulate_reads.py /opt/test
26 | COPY docker/bashrc /root/.bashrc
27 | COPY docker/entry.sh /opt/
28 | COPY docker/entry_dev.sh /opt/
29 | WORKDIR /opt/test
30 | ENTRYPOINT ["/opt/entry.sh"]
31 | 


--------------------------------------------------------------------------------
/.github/workflows/build_docker_image.yml:
--------------------------------------------------------------------------------
 1 | name: build-and-test-docker-image-master-branch
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ master ]
 6 | 
 7 |   pull_request:
 8 |     branches: [ master ]
 9 | 
10 | jobs:
11 |   build:
12 |     runs-on: ubuntu-latest
13 | 
14 |     steps:
15 |     - uses: actions/checkout@v2
16 | 
17 |     - name: build docker for the master branch
18 |       shell: bash
19 |       run: bash build_docker.sh master
20 | 
21 |     - name: test assembling E. coli
22 |       if: success()
23 |       shell: bash
24 |       run: |
25 |         mkdir -p ${GITHUB_WORKSPACE}/wd  # host side of the /wd bind mount below
26 |         docker run -v ${GITHUB_WORKSPACE}/wd:/wd cschin/peregrine:latest test
27 |         ls ${GITHUB_WORKSPACE}/wd/ecoli_test_results/
28 | 
29 |     - uses: actions/upload-artifact@v2
30 |       if: success()
31 |       with:
32 |         name: E. coli dnadiff results
33 |         path: wd/ecoli_test_results/out.report
34 |    
35 |     - name: push image to docker hub
36 |       if: ${{ success() &&  github.event_name == 'push' }}
37 |       run: |
38 |         echo '${{ secrets.docker_password }}' | docker login --username '${{ secrets.docker_user }}'  --password-stdin
39 |         docker push cschin/peregrine:latest
40 | 
41 | 


--------------------------------------------------------------------------------
/py-utils/simread.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | rcmap = dict(zip("ACGT","TGCA"))
 4 | 
 5 | def rc_seq(seq):
 6 |     return "".join([rcmap[c] for c in seq[::-1]])
 7 | 
 8 | def sim_error(seq):
 9 |     out_seq = []
10 |     for c in seq:
11 |         if random.uniform(0, 1) < 0.01:
12 |             c = random.choice( ('A','C','G','T', '', c+'A', c+'C', c+'G', c+'T') )
13 |         out_seq.append(c)
14 |     return "".join(out_seq)
15 | 
16 | seq = []
17 | with open("K12MG1655.fa") as f:
18 |     for row in f:
19 |         row = row.strip()
20 |         if len(row) < 1:
21 |             continue
22 |         if ">" == row[0]:
23 |             continue
24 |         seq.append(row)
25 | 
26 | seq = "".join(seq)
27 | seq = seq + seq[:40000]
28 | 
29 | rl = 15000
30 | read_count = 15 * len(seq) // rl
31 | 
32 | sim_record = open("reads.bed","w")
33 | import random
34 | for i in range(read_count):
35 |     rl2 = int(rl + random.gauss(0, 1500))
36 |     s  = random.randint(0, len(seq)-40000)
37 |     print(">{:06d}".format(i))
38 |     seq_tmp = sim_error(seq[s:s+rl2])
39 |     if random.randint(0,1) == 1:
40 |         seq_tmp = rc_seq(seq_tmp)
41 |     print(seq_tmp)
42 |     print("{:06d}".format(i), s, s+rl2, sep="\t", file=sim_record)
43 | sim_record.close()
44 | 
45 | 


--------------------------------------------------------------------------------
/.github/workflows/build_docker_image_release.yml:
--------------------------------------------------------------------------------
 1 | name: build-and-test-docker-image-tagged-release
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - pg*
 7 | 
 8 | jobs:
 9 |   build:
10 |     runs-on: ubuntu-latest
11 | 
12 |     steps:
13 |     - uses: actions/checkout@v2
14 | 
15 |     - name: build docker image for the tagged commit
16 |       shell: bash
17 |       run: bash build_docker.sh tag
18 | 
19 |     - name: test assembling E. coli
20 |       if: success()
21 |       shell: bash
22 |       run: |
23 |         mkdir -p ${GITHUB_WORKSPACE}/wd  
24 |         tag=$(git describe --always --abbrev=0 --tags)
25 |         tag=${tag:2}
26 |         docker run -v ${GITHUB_WORKSPACE}/wd:/wd cschin/peregrine:${tag} test
27 |         ls ${GITHUB_WORKSPACE}/wd/ecoli_test_results/
28 | 
29 |     - uses: actions/upload-artifact@v2
30 |       if: success()
31 |       with:
32 |         name: E. coli dnadiff results
33 |         path: wd/ecoli_test_results/out.report
34 |    
35 |     - name: push image to docker hub
36 |       if: ${{ success() &&  github.event_name == 'push' }}
37 |       run: |
38 |         echo '${{ secrets.docker_password }}' | docker login --username '${{ secrets.docker_user }}'  --password-stdin
39 |         tag=$(git describe --always --abbrev=0 --tags)
40 |         tag=${tag:2}
41 |         docker push cschin/peregrine:${tag}
42 | 
43 | 


--------------------------------------------------------------------------------
/LICENSE.minimap2:
--------------------------------------------------------------------------------
 1 | 
 2 | This software uses the following library from Heng Li's Minimap2
 3 | code under MIT License:
 4 | 
 5 | mm_sketch.c kvec.h kseq.h khash.h kalloc.h kalloc.c
 6 | 
 7 | The MIT License
 8 | 
 9 | Copyright (c) 2018-     Dana-Farber Cancer Institute
10 |               2017-2018 Broad Institute, Inc.
11 | 
12 | Permission is hereby granted, free of charge, to any person obtaining
13 | a copy of this software and associated documentation files (the
14 | "Software"), to deal in the Software without restriction, including
15 | without limitation the rights to use, copy, modify, merge, publish,
16 | distribute, sublicense, and/or sell copies of the Software, and to
17 | permit persons to whom the Software is furnished to do so, subject to
18 | the following conditions:
19 | 
20 | The above copyright notice and this permission notice shall be
21 | included in all copies or substantial portions of the Software.
22 | 
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 | SOFTWARE.
31 | 


--------------------------------------------------------------------------------
/docker/LICENSE.minimap2:
--------------------------------------------------------------------------------
 1 | 
 2 | This software uses the following library from Heng Li's Minimap2 code under
 3 | MIT License:
 4 | 
 5 | mm_sketch.c kvec.h kseq.h khash.h kalloc.h kalloc.c
 6 | 
 7 | The MIT License
 8 | 
 9 | Copyright (c) 2018-     Dana-Farber Cancer Institute
10 |               2017-2018 Broad Institute, Inc.
11 | 
12 | Permission is hereby granted, free of charge, to any person obtaining
13 | a copy of this software and associated documentation files (the
14 | "Software"), to deal in the Software without restriction, including
15 | without limitation the rights to use, copy, modify, merge, publish,
16 | distribute, sublicense, and/or sell copies of the Software, and to
17 | permit persons to whom the Software is furnished to do so, subject to
18 | the following conditions:
19 | 
20 | The above copyright notice and this permission notice shall be
21 | included in all copies or substantial portions of the Software.
22 | 
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 | SOFTWARE.
31 | 
32 | 
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/test/genome_mapping/run_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | ln -sf ../ecoli_K12/reads/ .
 4 | ln -sf ../ecoli_K12/K12MG1655.fa .
 5 | find ./reads/ -name "reads_*.fa" > seq_dataset.lst
 6 | SHIMMER=../../..
 7 | SHIMMER=$(cd "$(dirname "../../../")"; pwd)/$(basename "$1")
 8 | SHIMMERBIN=$SHIMMER/src
 9 | WORKDIR=./wd/
10 | INDEX=$WORKDIR/index
11 | pushd $SHIMMER
12 | echo SHIMMER revision: $(git rev-parse HEAD)
13 | popd
14 | echo get SHIMMER binaries from $SHIMMER
15 | mkdir -p $INDEX
16 | 
17 | echo
18 | echo build read index
19 | time (/usr/bin/time $SHIMMERBIN/shmr_mkseqdb -p $INDEX/seq_dataset -d seq_dataset.lst 2> build_db.log)
20 | 
21 | echo
22 | echo build ref index
23 | echo K12MG1655.fa > ref.lst
24 | time (/usr/bin/time $SHIMMERBIN/shmr_mkseqdb -p $INDEX/ref -d ref.lst 2> build_ref_db.log)
25 | 
26 | echo build read shimmer index
27 | time (for c in `seq 1 6`; do echo "/usr/bin/time $SHIMMERBIN/shmr_index -p $INDEX/seq_dataset -t 6 -c $c -o $INDEX/read 2> build_index.$c.log" ; done | parallel -j 4)
28 | 
29 | echo build ref shimmer index
30 | time (for c in `seq 1 2`; do echo "/usr/bin/time $SHIMMERBIN/shmr_index -p $INDEX/ref -t 2 -c $c -o $INDEX/ref 2> build_ref_index.$c.log" ; done | parallel -j 2)
31 | 
32 | echo run shimmer_map
33 | $SHIMMERBIN/shmr_map -r $INDEX/ref -m $INDEX/ref-L2 -p $INDEX/seq_dataset  -l $INDEX/read-L2  -t 1 -c 1  >  reads2ref.out
34 | 
35 | $SHIMMERBIN/shmr_map -r $INDEX/ref -m $INDEX/ref-L2 -p $INDEX/ref  -l $INDEX/ref-L2  -t 1 -c 1 > ref2ref.out
36 | 


--------------------------------------------------------------------------------
/falcon/falcon.h:
--------------------------------------------------------------------------------
 1 | #include "kvec.h"
 2 | #include "khash.h"
 3 | #include "kalloc.h"
 4 | 
 5 | typedef struct {
 6 |     seq_coor_t t_pos;
 7 |     uint8_t delta;
 8 |     char q_base;
 9 |     seq_coor_t p_t_pos;   // the tag position of the previous base
10 |     uint8_t p_delta;      // the tag delta of the previous base
11 |     char p_q_base;        // the previous base
12 |     unsigned q_id;
13 | } align_tag_t;
14 | 
15 | typedef struct {
16 |     seq_coor_t len;
17 |     align_tag_t * align_tags;
18 | } align_tags_t;
19 | 
20 | 
21 | 
22 | typedef struct { size_t n, m; uint64_t *a; } uint64_v;
23 | 
24 | typedef struct {
25 |     uint64_t ctag_key;
26 |     uint64_t ptag_key;
27 |     uint16_t coverage;
28 |     uint16_t count;
29 |     double score;
30 | } align_edge_t; 
31 | 
32 | typedef struct { size_t n, m; align_edge_t *a; } align_edge_v;
33 | 
34 | KHASH_MAP_INIT_INT64(PTAG, uint16_t);
35 | typedef khash_t(PTAG) ptag_to_count_t; 
36 | KHASH_MAP_INIT_INT64(CTAG, khash_t(PTAG) *);
37 | typedef khash_t(CTAG) ctag_to_ptag_t; 
38 | 
39 | typedef struct {
40 |     uint64_t ctag_key;
41 |     align_edge_t * best_edge;
42 |     double best_score;
43 | } align_node_t;
44 | 
45 | KHASH_MAP_INIT_INT64(NODE, align_node_t *);
46 | typedef khash_t(NODE) align_node_map_t; 
47 | 
48 | align_tags_t * get_align_tags( char *, char *, seq_coor_t, aln_range *, unsigned, seq_coor_t);
49 | void free_align_tags( align_tags_t * tags);
50 | consensus_data * get_cns_from_align_tags( align_tags_t **, unsigned, unsigned, unsigned ); 
51 | 


--------------------------------------------------------------------------------
/docker/test/simulate_reads.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | random.seed(42)
 4 | 
 5 | rcmap = dict(zip("ACGT","TGCA"))
 6 | 
 7 | def rc_seq(seq):
 8 |     return "".join([rcmap[c] for c in seq[::-1]])
 9 | 
10 | def sim_error(seq):
11 |     out_seq = []
12 |     for c in seq:
13 |         if random.uniform(0, 1) < 0.01:
14 |             c = random.choice( ('A','C','G','T', '', c+'A', c+'C', c+'G', c+'T') )
15 |         out_seq.append(c)
16 |     return "".join(out_seq)
17 | 
18 | seq = []
19 | with open("./K12MG1655.fa") as f:
20 |     for row in f:
21 |         row = row.strip()
22 |         if len(row) < 1:
23 |             continue
24 |         if ">" == row[0]:
25 |             continue
26 |         seq.append(row)
27 | 
28 | seq = "".join(seq)
29 | seq = seq + seq[:40000]
30 | 
31 | rl = 15000
32 | for j in range(8):
33 |     read_count = 2 * len(seq) // rl
34 |     sim_record = open(f"reads/reads_{j}.bed","w")
35 |     read_file = open(f"reads/reads_{j}.fa","w")
36 |     import random
37 |     for i in range(read_count):
38 |         rl2 = int(rl + random.gauss(0, 1500))
39 |         s  = random.randint(0, len(seq)-40000)
40 |         print(">{:02d}/{:06d}/{}_{}".format(j,i,0,rl2), file=read_file)
41 |         seq_tmp = sim_error(seq[s:s+rl2])
42 |         if random.randint(0,1) == 1:
43 |             seq_tmp = rc_seq(seq_tmp)
44 |         print(seq_tmp, file=read_file)
45 |         print("{:02d}/{:06d}/{}_{}".format(j,i,0,rl2), s, s+rl2, sep="\t", file=sim_record)
46 |     sim_record.close()
47 |     read_file.close()
48 | 
49 | 


--------------------------------------------------------------------------------
/test/ecoli_K12/simulate_reads.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | random.seed(42)
 4 | 
 5 | rcmap = dict(zip("ACGT","TGCA"))
 6 | 
 7 | def rc_seq(seq):
 8 |     return "".join([rcmap[c] for c in seq[::-1]])
 9 | 
10 | def sim_error(seq):
11 |     out_seq = []
12 |     for c in seq:
13 |         if random.uniform(0, 1) < 0.01:
14 |             c = random.choice( ('A','C','G','T', '', c+'A', c+'C', c+'G', c+'T') )
15 |         out_seq.append(c)
16 |     return "".join(out_seq)
17 | 
18 | seq = []
19 | with open("./K12MG1655.fa") as f:
20 |     for row in f:
21 |         row = row.strip()
22 |         if len(row) < 1:
23 |             continue
24 |         if ">" == row[0]:
25 |             continue
26 |         seq.append(row)
27 | 
28 | seq = "".join(seq)
29 | seq = seq + seq[:40000]
30 | 
# Mean read length; individual lengths are drawn from N(rl, 1500).
rl = 15000
for j in range(8):
    # Number of reads written to each of the 8 output files.
    read_count = 2 * len(seq) // rl
    # Context managers guarantee both files are flushed and closed even if
    # read generation raises part-way through.
    with open(f"reads/reads_{j}.bed", "w") as sim_record, \
            open(f"reads/reads_{j}.fa", "w") as read_file:
        for i in range(read_count):
            # The order of the random draws is kept unchanged so that the
            # seeded run reproduces the same reads.
            rl2 = int(rl + random.gauss(0, 1500))
            s = random.randint(0, len(seq) - 40000)
            read_name = "{:02d}/{:06d}/{}_{}".format(j, i, 0, rl2)
            print(">" + read_name, file=read_file)
            seq_tmp = sim_error(seq[s:s+rl2])
            # Half of the reads are emitted as reverse complement.
            if random.randint(0, 1) == 1:
                seq_tmp = rc_seq(seq_tmp)
            print(seq_tmp, file=read_file)
            # Record where the read was sampled from: name, start, end.
            print(read_name, s, s + rl2, sep="\t", file=sim_record)
48 | 
49 | 


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
 1 | 
# Build file for the shmr_* command line tools.
# Targets without a recipe rely on make's built-in implicit rules, which use
# $(CC), $(CFLAGS) and $(LDLIBS) as set below.
.PHONY:all clean install 

CC=gcc
CFLAGS=-O3 
# Warnings are fatal (-Werror); the commented-out line below is the same set
# of flags without -Werror.
CFLAGS+=-Wall -Wno-unused-result -Wno-unused-function -Werror
#CFLAGS+=-Wall -Wno-unused-result -Wno-unused-function
LDLIBS=-lz
# Executables built by `all` and copied by `install`.
ALL=shmr_mkseqdb shmr_index shmr_gather_mc shmr_overlap shmr_dedup shmr_map

all: $(ALL)

# Object files (compiled by the implicit %.o: %.c rule).
kalloc.o: kalloc.c

shmr_reduce.o: shmr_reduce.c

mm_sketch.o: mm_sketch.c

shmr_utils.o: shmr_utils.c

# Shared libraries. These two recipes call gcc directly and do not use
# $(CC) or $(CFLAGS).
shmr_utils.so: shmr_utils.c kalloc.c
	gcc -O3 -shared -fPIC -Wall kalloc.c shmr_utils.c -o shmr_utils.so

shimmer4py.so: shimmer4py.c shmr_utils.c kalloc.c
	gcc -O3 -shared -fPIC -Wall kalloc.c shmr_utils.c shimmer4py.c -o shimmer4py.so

shmr_end_filter.o: shmr_end_filter.c

shmr_gather_mc.o: shmr_gather_mc.c

DWmatch.o: DWmatch.c

# Executables (linked by the implicit rules from the listed prerequisites).
shmr_mkseqdb: shmr_mkseqdb.c kalloc.o shmr_utils.o

shmr_index: shmr_index.c kalloc.o shmr_reduce.o mm_sketch.o shmr_utils.o shmr_end_filter.o

shmr_gather_mc: shmr_gather_mc.o kalloc.o shmr_utils.o

shmr_overlap: shmr_overlap.c shmr_utils.o kalloc.o  DWmatch.o

shmr_map: shmr_map.c shmr_utils.o kalloc.o  DWmatch.o

shmr_dedup: shmr_dedup.c kalloc.o

# Not part of $(ALL); only built when requested explicitly.
shmr_dedup2: shmr_dedup2.c kalloc.o

# Install next to whichever `python` is first on PATH.
BINDIR=$(shell dirname $(shell which python))
install:
	cp $(ALL) $(BINDIR) 
50 | 
# Remove everything this Makefile builds. The executables come from $(ALL),
# so this list cannot drift from the `all` target (the hand-written list
# named the same six binaries, one of them twice).
clean:
	rm -f $(ALL) *.o *.so; rm -rf ./bin/
55 | 


--------------------------------------------------------------------------------
/py-utils/check_ovlp.py:
--------------------------------------------------------------------------------
 1 | from intervaltree import Interval, IntervalTree
 2 | import glob
 3 | tree = IntervalTree()
 4 | 
 5 | rname2rid = {}
 6 | with open("wd-pf/0-seqdb/seq_dataset.idx") as f:
 7 |     for row in f:
 8 |         row = row.strip().split()
 9 |         rname2rid[row[1]]=row[0]
10 | 
11 | read_range = {}
12 | for fn in glob.glob("reads/*.bed"):
13 |     with open(fn) as f:
14 |         for row in f:
15 |             row = row.strip().split()
16 |             rname = row[0]
17 |             s = int(row[1])
18 |             e = int(row[2])
19 |             tree.addi(s, e, rname2rid[rname])
20 |             read_range[rname2rid[rname]] = (s, e)
21 |             if s < 40000:
22 |                 tree.addi(s+4639694, e+4639694, rname2rid[rname])
# Every ordered pair of distinct reads whose indexed intervals intersect.
readpair = set()
for rid, (s, e) in read_range.items():
    for itvl in tree[s:e]:
        other = itvl.data
        if other == rid:
            # A read always intersects its own interval; skip it.
            continue
        print("X", rid, other)
        # Store both orientations so later lookups are order-independent.
        readpair.update(((rid, other), (other, rid)))
32 | 
# Pairs reported in the overlap file, stored in both orientations.
ovlppair = set()
with open("wd-pf/3-asm/preads.ovl") as ovl_file:
    for line in ovl_file:
        row = line.split()
        if row[0] == "-":
            # Lines starting with "-" carry no overlap record.
            continue
        # Flag the record: "1" when the pair is also expected from the
        # simulated coordinates, "0" otherwise.
        row.append("1" if (row[0], row[1]) in readpair else "0")
        print("Y", " ".join(row))
        ovlppair.update(((row[1], row[0]), (row[0], row[1])))
46 | 
# For every expected pair print "Z <rid1> <rid2> <overlap length> <found>",
# where <found> is 1 when the pair also appears in the overlap file.
for op in readpair:
    s1, e1 = read_range[op[0]]
    s2, e2 = read_range[op[1]]
    # Length of the intersection of the two intervals. The previous
    # end-minus-start form over-reported the overlap when one read was
    # contained in the other or both started at the same position.
    # NOTE(review): pairs that only intersect through the shifted
    # (wrap-around) interval still come out negative here, as before.
    olen = min(e1, e2) - max(s1, s2)
    found = 1 if op in ovlppair else 0
    print("Z {} {} {} {}".format(op[0], op[1], olen, found))
60 | 
61 | 
62 | 
63 | 
64 | 


--------------------------------------------------------------------------------
/LICENSE.falcon:
--------------------------------------------------------------------------------
 1 | #################################################################################$$
 2 | # Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
 3 | #
 4 | # All rights reserved.
 5 | #
 6 | # Redistribution and use in source and binary forms, with or without
 7 | # modification, are permitted (subject to the limitations in the
 8 | # disclaimer below) provided that the following conditions are met:
 9 | #
10 | #  * Redistributions of source code must retain the above copyright
11 | #  notice, this list of conditions and the following disclaimer.
12 | #
13 | #  * Redistributions in binary form must reproduce the above
14 | #  copyright notice, this list of conditions and the following
15 | #  disclaimer in the documentation and/or other materials provided
16 | #  with the distribution.
17 | #
18 | #  * Neither the name of Pacific Biosciences nor the names of its
19 | #  contributors may be used to endorse or promote products derived
20 | #  from this software without specific prior written permission.
21 | #
22 | # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
23 | # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
24 | # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
25 | # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
27 | # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
28 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
29 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
30 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
31 | # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
32 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
34 | # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 | # SUCH DAMAGE.
36 | #################################################################################$$
37 | 


--------------------------------------------------------------------------------
/docker/LICENSE.falcon:
--------------------------------------------------------------------------------
 1 | #################################################################################$$
 2 | # Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
 3 | #
 4 | # All rights reserved.
 5 | #
 6 | # Redistribution and use in source and binary forms, with or without
 7 | # modification, are permitted (subject to the limitations in the
 8 | # disclaimer below) provided that the following conditions are met:
 9 | #
10 | #  * Redistributions of source code must retain the above copyright
11 | #  notice, this list of conditions and the following disclaimer.
12 | #
13 | #  * Redistributions in binary form must reproduce the above
14 | #  copyright notice, this list of conditions and the following
15 | #  disclaimer in the documentation and/or other materials provided
16 | #  with the distribution.
17 | #
18 | #  * Neither the name of Pacific Biosciences nor the names of its
19 | #  contributors may be used to endorse or promote products derived
20 | #  from this software without specific prior written permission.
21 | #
22 | # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
23 | # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
24 | # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
25 | # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
27 | # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
28 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
29 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
30 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
31 | # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
32 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
34 | # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 | # SUCH DAMAGE.
36 | #################################################################################$$
37 | 


--------------------------------------------------------------------------------
/test/ecoli_K12/run_test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# End-to-end test pipeline: builds the read database and index, computes
# overlaps, builds contigs and finally a consensus. All tools are expected
# to be on PATH; intermediate results go under $PWD/wd.
set -e
# List every simulated read file as input for the sequence database.
find $PWD/reads/ -name "reads_*.fa" > seq_dataset.lst
WORKDIR=$PWD/wd
INDEX=$WORKDIR/index
OVLOUT=$WORKDIR/ovlp
ASM=$WORKDIR/asm
SHIMMER=../../..
# Log the git revision of the checkout at $SHIMMER.
pushd $SHIMMER
echo SHIMMER revision: $(git rev-parse HEAD)
popd
echo get SHIMMER binaries from $SHIMMER
mkdir -p $INDEX
mkdir -p $OVLOUT
mkdir -p $ASM
echo
echo build read index
time (/usr/bin/time shmr_mkseqdb -p $INDEX/seq_dataset -d seq_dataset.lst 2> build_db.log)
echo
echo build shimmer index
# The 12 chunk commands are printed and piped to GNU parallel, which runs at
# most 4 of them at a time.
time (for c in `seq 1 12`; do echo "/usr/bin/time shmr_index -p $INDEX/seq_dataset -r 6 -t 12 -c $c -o $INDEX/shmr 2> build_index.$c.log" ; done | parallel -j 4)
#time (for c in `seq 1 12`; do echo "/usr/bin/time shmr_index -p $INDEX/seq_dataset -l 1 -t 12 -c $c -o $INDEX/shmr 2> build_index.$c.log" ; done | parallel -j 4)
echo
echo build overlaps
# 8 overlap chunks (zero-padded 01..08), again at most 4 in parallel.
time (for c in `seq -f "%02g" 1 8`; do echo "/usr/bin/time shmr_overlap -p $INDEX/seq_dataset -l $INDEX/shmr-L2 -t 8 -c $c -o $OVLOUT/ovlp.$c 2> ovlp.$c.log"; done | parallel -j 4)
echo
echo faclon ovlp to graph
cd $ASM
# Merge all overlap chunks through shmr_dedup and append a lone "-" line to
# the result.
time (cat ../ovlp/ovlp.* | shmr_dedup > preads.ovl; echo "-" >> preads.ovl)
/usr/bin/time ovlp_to_graph.py >& asm.log
# Make the sequence database files visible in the assembly directory.
ln -sf ../index/seq_dataset.* .
#/usr/bin/time pypy graph_to_contig.py >& to_contig.log
/usr/bin/time graph_to_path.py >& to_path.log
/usr/bin/time path_to_contig.py $INDEX/seq_dataset p_ctg_tiling_path > p_ctg.fa 2> to_contig.log
# Build a database and index for the contigs, map the reads onto them and
# compute the consensus.
echo $PWD/p_ctg.fa > p_ctg.lst
time (/usr/bin/time shmr_mkseqdb -p $INDEX/p_ctg -d p_ctg.lst 2> build_p_ctg_db.log)
time (for c in `seq 1 1`; do echo "/usr/bin/time shmr_index -p $INDEX/p_ctg -r 6 -t 1 -c $c -o $INDEX/p_ctg 2> build_p_ctg_index.$c.log" ; done | parallel -j 4)
time (/usr/bin/time shmr_map -r $INDEX/p_ctg -m $INDEX/p_ctg-L2 -p $INDEX/seq_dataset -l $INDEX/shmr-L2 -t 1 -c 1 > read_map.txt 2> map.log)
time (/usr/bin/time cns_prototype.py $INDEX/seq_dataset $INDEX/p_ctg read_map.txt 1 1 > p_ctg_cns.fa 2> cns.log)
40 | 
41 | 


--------------------------------------------------------------------------------
/nim-mini/mmer_graph.py:
--------------------------------------------------------------------------------
 1 | import networkx as nx
 2 | 
 3 | 
 4 | G = nx.DiGraph()
 5 | m_count = {}
 6 | 
 7 | #fn = "preads4falcon_mer"
 8 | 
 9 | fn = "H08_mer"
10 | with open(fn) as f:
11 |     for row in f:
12 |         row = row.strip()
13 |         if row[0] == ">":
14 |             continue
15 |         row = row.split()
16 |         m_count.setdefault(row[2], 0)
17 |         m_count[row[2]] += 1
18 | 
19 | 
20 | with open(fn) as f:
21 |     for row in f:
22 |         row = row.strip()
23 |         if row[0] == ">":
24 |             v = None
25 |             w = None
26 |         else:
27 |             row = row.split()
28 |             if v is not None:
29 |                 w = row[2]
30 |                 if m_count[v] > 5 and m_count[v] < 60 and \
31 |                    m_count[w] > 5 and m_count[w] < 60:
32 |                     G.add_edge(v, w)
33 |                     if "count" not in G[v][w]:
34 |                         G[v][w]["count"] = 0
35 |                     G[v][w]["count"] += 1
36 |                 v = w
37 |             else:
38 |                 v = row[2]
39 | 
40 | #for v, w in G.edges():
41 | #    print(v, w, G[v][w]["count"], G.out_degree(v), G.in_degree(w))
42 | 
43 | remove_nodes = set()
44 | for v in G.nodes():
45 |     if G.out_degree(v) > 1 or G.in_degree(v) > 1:
46 |         remove_nodes.add(v)
47 | 
48 | G2 = G.copy()
49 | for v in list(remove_nodes):
50 |     G2.remove_node(v)
51 | 
52 | remove_nodes = set()
53 | for subG in nx.weakly_connected_component_subgraphs(G2):
54 |     if len(subG) == 1:
55 |         remove_nodes.update(subG.nodes())
56 | 
57 | for v in list(remove_nodes):
58 |     G.remove_node(v)
59 | 
60 | remove_nodes = set()
61 | for v in G.nodes():
62 |     if G.out_degree(v) == 0 or G.in_degree(v) == 0:
63 |         remove_nodes.add(v)
64 | 
65 | for v in list(remove_nodes):
66 |     G.remove_node(v)
67 | 
68 | for subG in nx.weakly_connected_component_subgraphs(G):
69 |     subG_size = len(subG.nodes())
70 |     for v in subG.nodes():
71 |         print(subG_size, v, subG.in_degree(v), subG.out_degree(v))
72 | 
73 | 
74 | nx.write_gexf(G, "test.gexf")
75 | 


--------------------------------------------------------------------------------
/test/ecoli_K12/run_test_one_level.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Variant of the end-to-end test pipeline that builds the read index with
# `-l 1 -r 36` and computes overlaps from the resulting shmr-L1 index.
# Intermediate results go under $PWD/wd-l1.
set -e
# List every simulated read file as input for the sequence database.
find $PWD/reads/ -name "reads_*.fa" > seq_dataset.lst
WORKDIR=$PWD/wd-l1
INDEX=$WORKDIR/index
OVLOUT=$WORKDIR/ovlp
ASM=$WORKDIR/asm
SHIMMER=../../..
# Log the git revision of the checkout at $SHIMMER.
pushd $SHIMMER
echo SHIMMER revision: $(git rev-parse HEAD)
popd
echo get SHIMMER binaries from $SHIMMER
mkdir -p $INDEX
mkdir -p $OVLOUT
mkdir -p $ASM
echo
echo build read index
time (/usr/bin/time shmr_mkseqdb -p $INDEX/seq_dataset -d seq_dataset.lst 2> build_db.log)
echo
echo build shimmer index
# The 12 chunk commands are printed and piped to GNU parallel, which runs at
# most 4 of them at a time.
#time (for c in `seq 1 12`; do echo "/usr/bin/time shmr_index -p $INDEX/seq_dataset -t 12 -c $c -o $INDEX/shmr 2> build_index.$c.log" ; done | parallel -j 4)
time (for c in `seq 1 12`; do echo "/usr/bin/time shmr_index -p $INDEX/seq_dataset -l 1 -r 36 -t 12 -c $c -o $INDEX/shmr 2> build_index.$c.log" ; done | parallel -j 4)
echo
echo build overlaps
# 8 overlap chunks (zero-padded 01..08), again at most 4 in parallel.
time (for c in `seq -f "%02g" 1 8`; do echo "/usr/bin/time shmr_overlap -p $INDEX/seq_dataset -l $INDEX/shmr-L1 -t 8 -c $c -o $OVLOUT/ovlp.$c 2> ovlp.$c.log"; done | parallel -j 4)
echo
echo faclon ovlp to graph
cd $ASM
# Merge all overlap chunks through shmr_dedup and append a lone "-" line to
# the result.
time (cat ../ovlp/ovlp.* | shmr_dedup > preads.ovl; echo "-" >> preads.ovl)
/usr/bin/time ovlp_to_graph.py >& asm.log
# Make the sequence database files visible in the assembly directory.
ln -sf ../index/seq_dataset.* .
#/usr/bin/time pypy graph_to_contig.py >& to_contig.log
/usr/bin/time graph_to_path.py >& to_path.log
/usr/bin/time path_to_contig.py $INDEX/seq_dataset p_ctg_tiling_path > p_ctg.fa 2> to_contig.log
# Build a database and index for the contigs, map the reads onto them and
# compute the consensus.
echo $PWD/p_ctg.fa > p_ctg.lst
time (/usr/bin/time shmr_mkseqdb -p $INDEX/p_ctg -d p_ctg.lst 2> build_p_ctg_db.log)
time (for c in `seq 1 1`; do echo "/usr/bin/time shmr_index -p $INDEX/p_ctg -t 1 -c $c -o $INDEX/p_ctg 2> build_p_ctg_index.$c.log" ; done | parallel -j 4)
# NOTE(review): the read index above was built with `-l 1` and is consumed as
# shmr-L1 by shmr_overlap, yet this step reads $INDEX/shmr-L2 and
# $INDEX/p_ctg-L2 -- confirm these files exist in the one-level run.
time (/usr/bin/time shmr_map -r $INDEX/p_ctg -m $INDEX/p_ctg-L2 -p $INDEX/seq_dataset -l $INDEX/shmr-L2 -t 1 -c 1 > read_map.txt 2> map.log)
time (/usr/bin/time cns_prototype.py $INDEX/seq_dataset $INDEX/p_ctg read_map.txt 1 1 > p_ctg_cns.fa 2> cns.log)
40 | 
41 | 


--------------------------------------------------------------------------------
/src/shmr_reduce.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <stdint.h>
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <string.h>
 6 | #define __STDC_LIMIT_MACROS
 7 | #include "kvec.h"
 8 | #include "shimmer.h"
 9 | 
/* Small fixed-size ring buffer of minimizers used as a sliding window. */
typedef struct {
  uint8_t size;   /* number of slots in `mers` */
  uint8_t head;   /* slot that the next pop_push() will overwrite */
  mm128_t *mers;  /* storage for `size` entries, owned by the caller */
} small_m_buffer_t;
15 | 
/*
 * Integer mixing function: scrambles `key` with a fixed sequence of
 * shift/add/xor rounds, truncating to `mask` after every additive round.
 */
static inline uint64_t hash64(uint64_t key, uint64_t mask) {
  uint64_t h = key;

  h = ((h << 21) + ~h) & mask;           /* (h << 21) - h - 1 */
  h ^= h >> 24;
  h = (h + (h << 3) + (h << 8)) & mask;  /* h * 265 */
  h ^= h >> 14;
  h = (h + (h << 2) + (h << 4)) & mask;  /* h * 21 */
  h ^= h >> 28;
  h = (h + (h << 31)) & mask;
  return h;
}
26 | 
27 | void pop_push(small_m_buffer_t *smb, mm128_t mer) {
28 |   smb->mers[smb->head] = mer;
29 |   smb->head++;
30 |   smb->head %= smb->size;
31 | }
32 | 
33 | void find_minimizer(small_m_buffer_t *smb, mm128_t *mmer) {
34 |   uint32_t i = 0;
35 |   uint64_t min_val = UINT64_MAX;
36 |   uint64_t h;
37 | 
38 |   mmer->x = smb->mers[0].x;
39 |   mmer->y = smb->mers[0].y;
40 |   min_val = smb->mers[0].x >> 8;
41 | 
42 |   for (i = 1; i < smb->size; i++) {
43 |     h = smb->mers[i].x >> 8;
44 |     if (h < min_val) {
45 |       min_val = h;
46 |       mmer->x = smb->mers[i].x;
47 |       mmer->y = smb->mers[i].y;
48 |     }
49 |   }
50 | }
51 | 
52 | /* rs: reduction size */
53 | void mm_reduce(mm128_v *p, mm128_v *p_out, uint8_t rs) {
54 |   uint32_t idx;
55 |   uint32_t rid;
56 |   uint32_t rid_ = UINT32_MAX;
57 |   uint32_t r_offset = 0;
58 |   mm128_t mmer, mmer_;
59 |   small_m_buffer_t smb;
60 | 
61 |   kv_resize(mm128_t, NULL, *p_out, p->n);
62 | 
63 |   smb.size = rs;
64 |   smb.head = 0;
65 |   smb.mers = (mm128_t *)alloca(sizeof(mm128_t) * smb.size);
66 |   memset(smb.mers, UINT8_MAX, rs * 16);
67 | 
68 |   mmer_.y = UINT64_MAX;
69 | 
70 |   for (idx = 0; idx < p->n; idx++, r_offset++) {
71 |     rid = p->a[idx].y >> 32;
72 |     if (rid != rid_) {
73 |       r_offset = 0;
74 |       memset(smb.mers, UINT8_MAX, rs * 16);
75 |       smb.head = 0;
76 |       rid_ = rid;
77 |     }
78 |     pop_push(&smb, p->a[idx]);
79 |     if (r_offset < rs - 1) {
80 |       continue;
81 |     }
82 |     find_minimizer(&smb, &mmer);
83 |     if (mmer.y != mmer_.y) {
84 |       // printf("%lu\n", mmer.x >> 8);
85 |       kv_push(mm128_t, NULL, *p_out, mmer);
86 |       mmer_.x = mmer.x;
87 |       mmer_.y = mmer.y;
88 |     }
89 |   }
90 | }
91 | 


--------------------------------------------------------------------------------
/src/shmr_gather_mc.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <errno.h>
 3 | #include <stdint.h>
 4 | #include <stdio.h>
 5 | #include <unistd.h>
 6 | #include <zlib.h>
 7 | #include "khash.h"
 8 | #include "kvec.h"
 9 | #include "shimmer.h"
10 | 
11 | extern char *optarg;
12 | extern int optind, opterr, optopt;
13 | 
14 | int main(int argc, char *argv[]) {
15 |   char *data_path_prefix = NULL;
16 |   char mc_chunk_file_path[8192];
17 |   char mc_file_path[8192];
18 |   int written;
19 |   int total_chunk = 1;
20 |   int chunk = 1;
21 |   int c;
22 | 
23 |   opterr = 0;
24 | 
25 |   while ((c = getopt(argc, argv, "p:t:")) != -1) {
26 |     switch (c) {
27 |       case 'p':
28 |         data_path_prefix = optarg;
29 |         break;
30 |       case 't':
31 |         total_chunk = atoi(optarg);
32 |         break;
33 |       case '?':
34 |         if (optopt == 'd') {
35 |           fprintf(stderr,
36 |                   "Option -%c not specified, please specify a prefix of file "
37 |                   "path the data filis\n",
38 |                   optopt);
39 |         }
40 |         if (optopt == 't') {
41 |           fprintf(stderr,
42 |                   "Option -%c not specified, please specify the total number "
43 |                   "of chunks\n",
44 |                   optopt);
45 |         }
46 |         return 1;
47 |       default:
48 |         abort();
49 |     }
50 |   }
51 | 
52 |   assert(total_chunk > 0);
53 | 
54 |   if (data_path_prefix == NULL) {
55 |     data_path_prefix = (char *)calloc(8192, 1);
56 |     snprintf(data_path_prefix, 8191, "shimmer");
57 |   }
58 | 
59 |   khash_t(MMC) *mcmap = kh_init(MMC);
60 | 
61 |   for (chunk = 1; chunk <= total_chunk; chunk++) {
62 |     mm_count_v mc = {0, 0, 0};
63 |     written = snprintf(mc_chunk_file_path, sizeof mc_chunk_file_path,
64 |                        "%s-MC-%02d-of-%02d.dat", data_path_prefix, chunk,
65 |                        total_chunk);
66 |     assert(written < sizeof(mc_chunk_file_path));
67 |     fprintf(stderr, "input data file: %s\n", mc_chunk_file_path);
68 |     mc = read_mm_count(mc_chunk_file_path);
69 |     aggregate_mm_count(mcmap, &mc);
70 |     kv_destroy(mc);
71 |   }
72 | 
73 |   mm_count_v mc_all = {0, 0, 0};
74 |   mm_count_to_vec(mcmap, &mc_all);
75 | 
76 |   written = snprintf(mc_file_path, sizeof mc_file_path, "%s-MC-all.dat",
77 |                      data_path_prefix);
78 |   assert(written < sizeof(mc_file_path));
79 |   fprintf(stderr, "output data file: %s\n", mc_file_path);
80 | 
81 |   write_mm_count(mc_file_path, &mc_all);
82 | 
83 |   kv_destroy(mc_all);
84 |   kh_destroy(MMC, mcmap);
85 | 
86 |   if (!data_path_prefix) free(data_path_prefix);
87 |   return 0;
88 | }
89 | 


--------------------------------------------------------------------------------
/py/peregrine/build_falcon4py.py:
--------------------------------------------------------------------------------
 1 | from cffi import FFI
 2 | import os
 3 | 
 4 | basedir = os.environ["peregrine_base"]
 5 | 
 6 | ffibuilder = FFI()
 7 | 
 8 | ffibuilder.cdef("""
 9 | 
10 | typedef int seq_coor_t;
11 | 
12 | typedef struct {
13 |     seq_coor_t aln_str_size ;
14 |     seq_coor_t dist ;
15 |     seq_coor_t aln_q_s;
16 |     seq_coor_t aln_q_e;
17 |     seq_coor_t aln_t_s;
18 |     seq_coor_t aln_t_e;
19 |     char * q_aln_str;
20 |     char * t_aln_str;
21 | 
22 | } alignment;
23 | 
24 | typedef struct {
25 |     seq_coor_t t_pos;
26 |     uint8_t delta;
27 |     char q_base;
28 |     seq_coor_t p_t_pos;   // the tag position of the previous base
29 |     uint8_t p_delta; // the tag delta of the previous base
30 |     char p_q_base;        // the previous base
31 |     unsigned q_id;
32 | } align_tag_t;
33 | 
34 | typedef struct {
35 |     seq_coor_t len;
36 |     align_tag_t * align_tags;
37 | } align_tags_t;
38 | 
39 | typedef struct {
40 |     seq_coor_t s1;
41 |     seq_coor_t e1;
42 |     seq_coor_t s2;
43 |     seq_coor_t e2;
44 |     long int score;
45 | } aln_range;
46 | 
47 | typedef struct {
48 |     char * sequence;
49 |     uint8_t * eqv;
50 | } consensus_data;
51 | 
52 | 
53 | align_tags_t * get_align_tags( char * aln_q_seq,
54 |                             char * aln_t_seq,
55 |                             seq_coor_t aln_seq_len,
56 |                             aln_range * range,
57 |                             unsigned q_id,
58 |                             seq_coor_t t_offset);
59 | 
60 | void free_align_tags( align_tags_t * tags);
61 | 
62 | consensus_data * get_cns_from_align_tags( align_tags_t ** tag_seqs,
63 |                                           unsigned n_tag_seqs,
64 |                                           unsigned t_len,
65 |                                           unsigned min_cov );
66 | 
67 | void free_consensus_data( consensus_data * consensus );
68 | 
69 | alignment * align(char * query_seq, seq_coor_t q_len,
70 |                   char * target_seq, seq_coor_t t_len,
71 |                   seq_coor_t band_tolerance,
72 |                   int get_aln_str);
73 | 
74 | void free_alignment(alignment *);
75 | void *malloc(size_t size);
76 | void free(void *ptr);
77 | """)
78 | 
79 | ffibuilder.set_source("peregrine._falcon4py",
80 |                f"""
81 |                #include "{basedir}/falcon/common.h"
82 |                #include "{basedir}/falcon/falcon.h"
83 |                """, sources = [f'{basedir}/falcon/falcon.c',
84 |                                f'{basedir}/falcon/DW_banded.c',
85 |                                f'{basedir}/falcon/kalloc.c'])   # library name, for the linker
86 | 
87 | if __name__ == "__main__":
88 |     import sys
89 |     ffibuilder.compile(verbose=True)
90 | 


--------------------------------------------------------------------------------
/py/peregrine/build_shimmer4py.py:
--------------------------------------------------------------------------------
  1 | from cffi import FFI
  2 | import os
  3 | 
# Root of the source checkout; must be provided through the
# `peregrine_base` environment variable (KeyError otherwise).
basedir = os.environ["peregrine_base"]

ffibuilder = FFI()

# C declarations that the generated extension module exposes to Python.
ffibuilder.cdef("""
void decode_biseq(uint8_t * src, char * seq,
                  size_t len, uint8_t strand);

typedef int32_t seq_coor_t;

typedef struct {
	seq_coor_t m_size, dist ;
	seq_coor_t q_bgn, q_end;
	seq_coor_t t_bgn, t_end;
    seq_coor_t t_m_end, q_m_end;
} ovlp_match_t;

ovlp_match_t * ovlp_match(uint8_t * query_seq,
                          seq_coor_t q_len,
                          uint8_t q_strand,
                          uint8_t * target_seq,
                          seq_coor_t t_len,
                          uint8_t t_strand,
                          seq_coor_t band_tolerance);

void free_ovlp_match(ovlp_match_t * match);

typedef struct { uint64_t x, y; } mm128_t;

typedef struct { size_t n, m; mm128_t *a; } mm128_v;

mm128_v read_mmlist(char *);

void free(void *ptr);

typedef unsigned int khint32_t;

typedef unsigned long khint64_t;

typedef khint32_t khint_t;

typedef struct {
    mm128_v * mmers;
    void * mmer0_map;
    void * rlmap;
    void * mcmap;
    void * ridmm;} py_mmer_t;

void build_shimmer_map4py(py_mmer_t *,
        char *, char *,
        uint32_t, uint32_t, uint32_t, uint32_t);

void get_shimmers_for_read(mm128_v *, py_mmer_t *, uint32_t);

typedef struct { uint64_t x0, x1, y0, y1; uint8_t direction;} mp256_t;
typedef struct { size_t n, m; mp256_t *a; } mp256_v;

uint32_t get_mmer_count(py_mmer_t * , uint64_t);
void get_shimmer_hits(mp256_v *, py_mmer_t *, uint64_t, uint32_t);

typedef uint32_t mm_idx_t;
typedef struct { size_t n, m; mm_idx_t *a; } mm_idx_v;

typedef struct {
        mm_idx_v idx0;
        mm_idx_v idx1;
} shmr_aln_t;

typedef struct { size_t n, m; shmr_aln_t *a; } shmr_aln_v;

shmr_aln_v * shmr_aln( mm128_v *, mm128_v *, uint8_t, uint32_t, uint32_t, uint32_t);

void free_shmr_alns(shmr_aln_v *);

// from mm_sketch.c
void mm_sketch(void *, const char *, int , int, int , uint32_t , int , mm128_v *);

// from shmr_reduce.c
void mm_reduce(mm128_v *, mm128_v *, uint8_t);

""")

# The extension is built as `peregrine._shimmer4py`; the listed C files are
# compiled into it and shimmer.h is included by absolute path.
ffibuilder.set_source("peregrine._shimmer4py",
               f"""
               #include "{basedir}/src/shimmer.h"
               """,
               sources=[f'{basedir}/src/shimmer4py.c',
                        f'{basedir}/src/DWmatch.c',
                        f'{basedir}/src/shmr_align.c',
                        f'{basedir}/src/shmr_utils.c',
                        f'{basedir}/src/shmr_reduce.c',
                        f'{basedir}/src/mm_sketch.c',
                        f'{basedir}/src/kalloc.c'])   # C sources compiled into the module

if __name__ == "__main__":
    ffibuilder.compile(verbose=True)
100 | 


--------------------------------------------------------------------------------
/src/kvec.h:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
 26 | /*
 27 |   An example:
 28 | 
 29 | #include "kvec.h"
 30 | int main() {
 31 | 	kvec_t(int) array;
 32 | 	kv_init(array);
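	kv_push(int, 0, array, 10); // append (second argument is the `km` handle passed on to krealloc)
	kv_a(int, array, 20) = 5; // dynamic (note: kv_a is not defined in this version of the header)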
 35 | 	kv_A(array, 20) = 4; // static
 36 | 	kv_destroy(array);
 37 | 	return 0;
 38 | }
 39 | */
 40 | 
 41 | /*
 42 |   2008-09-22 (0.1.0):
 43 | 
 44 | 	* The initial version.
 45 | 
 46 | */
 47 | 
 48 | #ifndef AC_KVEC_H
 49 | #define AC_KVEC_H
 50 | 
 51 | #include <stdlib.h>
 52 | #include "kalloc.h"
 53 | 
/* Round x up, in place, to the next power of two. Only shifts up to 16 are
 * applied, so the result is only correct for values that fit in 32 bits. */
#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))

/* Vector type: n = number of elements in use, m = allocated capacity,
 * a = element storage. */
#define kvec_t(type) struct { size_t n, m; type *a; }
#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
/* NOTE(review): storage is released with free() although it is grown through
 * krealloc(km, ...) -- confirm this is correct when km is not NULL. */
#define kv_destroy(v) free((v).a)
/* Unchecked element access. */
#define kv_A(v, i) ((v).a[(i)])
/* Remove and yield the last element; the vector must not be empty. */
#define kv_pop(v) ((v).a[--(v).n])
#define kv_size(v) ((v).n)
#define kv_max(v) ((v).m)

/* Grow the capacity of v to at least s (rounded up by kv_roundup32); never
 * shrinks. */
#define kv_resize(type, km, v, s) do { \
		if ((v).m < (s)) { \
			(v).m = (s); \
			kv_roundup32((v).m); \
			(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \
		} \
	} while (0)

/* Copy the elements of v0 into v1, growing v1 first if needed. Uses memcpy,
 * so <string.h> must be included by the user of this macro. The definition
 * ends with a line continuation: the line after it must stay blank. */
#define kv_copy(type, km, v1, v0) do { \
		if ((v1).m < (v0).n) kv_resize(type, (km), (v1), (v0).n); \
		(v1).n = (v0).n; \
		memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
	} while (0) \

/* Append x to v, doubling the capacity (starting at 2) when v is full. */
#define kv_push(type, km, v, x) do { \
		if ((v).n == (v).m) { \
			(v).m = (v).m? (v).m<<1 : 2; \
			(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \
		} \
		(v).a[(v).n++] = (x); \
	} while (0)

/* Append one uninitialised element to v and store its address in *p. */
#define kv_pushp(type, km, v, p) do { \
		if ((v).n == (v).m) { \
			(v).m = (v).m? (v).m<<1 : 2; \
			(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \
		} \
		*(p) = &(v).a[(v).n++]; \
	} while (0)

/* Reverse, in place, the elements of v from index `start` to the end. */
#define kv_reverse(type, v, start) do { \
		if ((v).m > 0 && (v).n > (start)) { \
			size_t __i, __end = (v).n - (start); \
			type *__a = (v).a + (start); \
			for (__i = 0; __i < __end>>1; ++__i) { \
				type __t = __a[__end - 1 - __i]; \
				__a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \
			} \
		} \
	} while (0)
104 | 
105 | #endif
106 | 


--------------------------------------------------------------------------------
/falcon/kvec.h:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
/*
  An example:

#include "kvec.h"
int main() {
	kvec_t(int) array;
	kv_init(array);
	kv_push(int, 0, array, 10); // append; the second argument is the kalloc handle (km)
	kv_resize(int, 0, array, 21); // make room for index 20 (this variant has no dynamic kv_a)
	kv_A(array, 20) = 4; // static
	kv_destroy(array);
	return 0;
}
*/
 40 | 
 41 | /*
 42 |   2008-09-22 (0.1.0):
 43 | 
 44 | 	* The initial version.
 45 | 
 46 | */
 47 | 
 48 | #ifndef AC_KVEC_H
 49 | #define AC_KVEC_H
 50 | 
 51 | #include <stdlib.h>
 52 | #include "kalloc.h"
 53 | 
 54 | #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
 55 | 
 56 | #define kvec_t(type) struct { size_t n, m; type *a; }
 57 | #define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
 58 | #define kv_destroy(v) free((v).a)
 59 | #define kv_A(v, i) ((v).a[(i)])
 60 | #define kv_pop(v) ((v).a[--(v).n])
 61 | #define kv_size(v) ((v).n)
 62 | #define kv_max(v) ((v).m)
 63 | 
 64 | #define kv_resize(type, km, v, s) do { \
 65 | 		if ((v).m < (s)) { \
 66 | 			(v).m = (s); \
 67 | 			kv_roundup32((v).m); \
 68 | 			(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \
 69 | 		} \
 70 | 	} while (0)
 71 | 
 72 | #define kv_copy(type, km, v1, v0) do { \
 73 | 		if ((v1).m < (v0).n) kv_resize(type, (km), (v1), (v0).n); \
 74 | 		(v1).n = (v0).n; \
 75 | 		memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
 76 | 	} while (0) \
 77 | 
 78 | #define kv_push(type, km, v, x) do { \
 79 | 		if ((v).n == (v).m) { \
 80 | 			(v).m = (v).m? (v).m<<1 : 2; \
 81 | 			(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \
 82 | 		} \
 83 | 		(v).a[(v).n++] = (x); \
 84 | 	} while (0)
 85 | 
 86 | #define kv_pushp(type, km, v, p) do { \
 87 | 		if ((v).n == (v).m) { \
 88 | 			(v).m = (v).m? (v).m<<1 : 2; \
 89 | 			(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \
 90 | 		} \
 91 | 		*(p) = &(v).a[(v).n++]; \
 92 | 	} while (0)
 93 | 
 94 | #define kv_reverse(type, v, start) do { \
 95 | 		if ((v).m > 0 && (v).n > (start)) { \
 96 | 			size_t __i, __end = (v).n - (start); \
 97 | 			type *__a = (v).a + (start); \
 98 | 			for (__i = 0; __i < __end>>1; ++__i) { \
 99 | 				type __t = __a[__end - 1 - __i]; \
100 | 				__a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \
101 | 			} \
102 | 		} \
103 | 	} while (0)
104 | 
105 | #endif
106 | 


--------------------------------------------------------------------------------
/py-utils/dump_L0.py:
--------------------------------------------------------------------------------
  1 | from cffi import FFI
  2 | # import redis
  3 | 
  4 | ffi = FFI()
  5 | 
  6 | ffi.cdef("""
  7 | typedef struct { uint64_t x, y; } mm128_t;
  8 | typedef struct { size_t n, m; mm128_t *a; } mm128_v;
  9 | mm128_v read_mmlist(char *);
 10 | void free(void *ptr);
 11 | """)
 12 | 
 13 | C = ffi.dlopen(None)
 14 | mm_utils = ffi.dlopen("../src/mm_utils.so")
 15 | # r_conn = redis.Redis(host='127.0.0.1', port=6379, db=0)
 16 | 
 17 | rmap = dict(zip(b"ACGT", b"TGCA"))
 18 | 
 19 | L0dump = open("L0.txt", "w")
 20 | 
 21 | #hmmerL0 = ffi.new("mm128_v *")
 22 | #hmmerL2 = ffi.new("mm128_v *")
 23 | 
 24 | hmmerL0 = mm_utils.read_mmlist(b"../test/hmmer-L0-01-of-01.dat")
 25 | 
 26 | rid2name = {}
 27 | rid2len = {}
 28 | # rid2seq = {}
 29 | 
 30 | with open("../test/seq_dataset.idx") as f:
 31 |     for row in f:
 32 |         row = row.strip().split()
 33 |         rid, rname, rlen = row
 34 |         rid = int(rid)
 35 |         rlen = int(rlen)
 36 |         rid2name[rid] = rname
 37 |         rid2len[rid] = rlen
 38 | 
 39 | """
 40 | * @param p      minimizers
 41 | *               p->a[i].x = kMer<<8 | kmerSpan
 42 | *               p->a[i].y = rid<<32 | lastPos<<1 | strand
 43 | *               where lastPos is the position of the last base of the i-th minimizer,
 44 | *               and strand indicates whether the minimizer comes from the top or the bottom strand.
 45 | *               Callers may want to set "p->n = 0"; otherwise results are appended to p
 46 | """
 47 | 
 48 | mmer_count = {}
 49 | mer_five = {}
 50 | mer_three = {}
 51 | for i in range(hmmerL0.n):
 52 |     span = hmmerL0.a[i].x & 0xFF
 53 |     mmer = hmmerL0.a[i].x >> 8
 54 |     rid = hmmerL0.a[i].y >> 32
 55 |     pos_end = ((hmmerL0.a[i].y & 0xFFFFFFFF) >> 1) + 1
 56 |     strand = hmmerL0.a[i].y & 0x1
 57 |     mm_str = "{:014X}".format(mmer)
 58 |     #
 59 |     # mmer_count.setdefault(mm_str, 0)
 60 |     # mmer_count[mm_str] += 1
 61 |     #
 62 |     # kmer = bseq[pos_end-span:pos_end]
 63 |     # kmer_r =  bytes([rmap[c] for c in kmer[::-1]])
 64 |     r_pos_end = rid2len[rid] - pos_end + span
 65 |     name = rid2name[rid]
 66 | 
 67 |     if pos_end < 250:
 68 |         mer_five.setdefault(mmer, [])
 69 |         mer_five[mmer].append(name)
 70 |     if r_pos_end < 250:
 71 |         mer_three.setdefault(mmer, [])
 72 |         mer_three[mmer].append(name)
 73 | 
 74 |     print(name, pos_end, r_pos_end,
 75 |           strand, mm_str, file=L0dump)
 76 | 
 77 | L0dump.close()
 78 | 
 79 | dovetail_end = {}
 80 | for i in range(hmmerL0.n):
 81 |     mmer = hmmerL0.a[i].x >> 8
 82 |     rid = hmmerL0.a[i].y >> 32
 83 |     rname = rid2name[rid]
 84 |     if mmer in mer_five:
 85 |         for rname0 in mer_five[mmer]:
 86 |             dovetail_end.setdefault(rname0, set())
 87 |             dovetail_end[rname0].add( (5, rname) )
 88 |     if mmer in mer_three:
 89 |         for rname0 in mer_three[mmer]:
 90 |             dovetail_end.setdefault(rname0, set())
 91 |             dovetail_end[rname0].add( (3, rname) )
 92 | 
 93 | dt_file = open("L0_dt.txt","w")
 94 | for rname in dovetail_end:
 95 |     for e, rname0 in list(dovetail_end[rname]):
 96 |         if rname == rname0:
 97 |             continue
 98 |         intersect = 0
 99 |         if e == 5 and (3, rname) in dovetail_end.get(rname0, {}):
100 |             print ( 5, rname0, 3, rname, file=dt_file)
101 |         if e == 5 and (5, rname) in dovetail_end.get(rname0, {}):
102 |             print ( 5, rname0, 5, rname, file=dt_file)
103 |         if e == 3 and (5, rname) in dovetail_end.get(rname0, {}):
104 |             print ( 3, rname0, 5, rname, file=dt_file)
105 |         if e == 3 and (3, rname) in dovetail_end.get(rname0, {}):
106 |             print ( 3, rname0, 3, rname, file=dt_file)
107 | 
108 | dt_file.close()
109 | 
110 | C.free(hmmerL0.a)
111 | 
112 | 


--------------------------------------------------------------------------------
/src/shmr_dedup.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <errno.h>
  3 | #include <stdbool.h>
  4 | #include <stdint.h>
  5 | #include <stdio.h>
  6 | #include <string.h>
  7 | #include <unistd.h>
  8 | #include "kalloc.h"
  9 | #include "khash.h"
 10 | #include "kvec.h"
 11 | #include "shimmer.h"
 12 | 
 13 | #define OVERLAP 0
 14 | #define CONTAINS 1
 15 | #define CONTAINED 2
 16 | 
 17 | KHASH_MAP_INIT_INT64(RPAIR, uint8_t);
 18 | 
 19 | int main(int argc, char *argv[]) {
 20 |   // 002408115 004118624 -14416 99.6 0 27 14387 15129 1 0 14392 14392 contains
 21 | 
 22 |   uint32_t a_bgn, a_end;
 23 |   uint32_t b_bgn, b_end;
 24 |   uint32_t rid0;
 25 |   uint32_t rid1;
 26 |   uint64_t ridp;
 27 |   int32_t absent;
 28 |   khiter_t k;
 29 | 
 30 |   khash_t(RPAIR) *rid_pairs = kh_init(RPAIR);
 31 | 
 32 |   while (!feof(stdin)) {
 33 |     ovlp_t ovlp;
 34 |     fread(&ovlp, sizeof(ovlp), 1, stdin);
 35 | 
 36 |     rid0 = (uint32_t)(ovlp.y0 >> 32);
 37 |     rid1 = (uint32_t)(ovlp.y1 >> 32);
 38 | 
 39 |     ridp = rid0 < rid1 ? (((uint64_t)rid0) << 32) | ((uint64_t)rid1)
 40 |                        : (((uint64_t)rid1) << 32) | ((uint64_t)rid0);
 41 |     k = kh_get(RPAIR, rid_pairs, ridp);
 42 |     if (k == kh_end(rid_pairs)) {
 43 |       uint32_t pos0 = (uint32_t)((ovlp.y0 & 0xFFFFFFFF) >> 1) + 1;
 44 |       uint32_t rlen0 = ovlp.rl0;
 45 |       uint8_t strand0 = ovlp.strand0;
 46 | 
 47 |       uint32_t pos1 = (uint32_t)((ovlp.y1 & 0xFFFFFFFF) >> 1) + 1;
 48 |       uint32_t rlen1 = ovlp.rl1;
 49 |       uint8_t strand1 = ovlp.strand1;
 50 | 
 51 |       ovlp_match_t match = ovlp.match;
 52 |       /* Dump raw alignment results for debugging */
 53 |       /*
 54 |       fprintf(stdout,"X %09d %u %u %d %d %d %09d %u %u %d %d %d %d %d %u\n",
 55 |                       rid0, pos0, strand0, match.q_bgn, match.q_end, rlen0,
 56 |                       rid1, pos1, strand1, match.t_bgn, match.t_end, rlen1,
 57 |                       match.m_size, match.dist, ovlp.ovlp_type);
 58 |       */
 59 |       seq_coor_t q_bgn, q_end, t_bgn, t_end;
 60 |       q_bgn = match.q_bgn;
 61 |       q_end = match.q_end;
 62 |       t_bgn = match.t_bgn;
 63 |       t_end = match.t_end;
 64 |       q_bgn -= t_bgn;
 65 |       t_bgn = 0;
 66 |       if (strand0 == ORIGINAL) {
 67 |         a_bgn = (seq_coor_t)(pos0 - pos1) + q_bgn;
 68 |         a_end = (seq_coor_t)(pos0 - pos1) + q_end;
 69 |         a_bgn = a_bgn < 0 ? 0 : a_bgn;  // this ad-hoc fix, read should be
 70 |                                         // stiched by alignment
 71 |         a_end = a_end >= rlen0 ? rlen0 : a_end;
 72 |       } else {
 73 |         q_bgn -= t_bgn;
 74 |         t_bgn = 0;
 75 |         a_bgn = (seq_coor_t)rlen0 - (seq_coor_t)(pos0 - pos1) - q_end;
 76 |         a_end = (seq_coor_t)rlen0 - (seq_coor_t)(pos0 - pos1) - q_bgn;
 77 |         a_bgn = a_bgn < 0 ? 0 : a_bgn;  // this ad-hoc fix
 78 |         a_end = a_end >= rlen0 ? rlen0 : a_end;
 79 |       }
 80 |       if (strand1 == ORIGINAL) {
 81 |         b_bgn = t_bgn;
 82 |         b_end = t_end;
 83 |         b_bgn = b_bgn < 0 ? 0 : b_bgn;  // this ad-hoc fix
 84 |         b_end = b_end >= rlen1 ? rlen1 : b_end;
 85 |       } else {
 86 |         b_bgn = (seq_coor_t)rlen1 - t_end;
 87 |         b_end = (seq_coor_t)rlen1 - t_bgn;
 88 |         b_bgn = b_bgn < 0 ? 0 : b_bgn;  // this ad-hoc fix
 89 |         b_end = b_end >= rlen1 ? rlen1 : b_end;
 90 |       }
 91 |       double err_est;
 92 |       err_est = 100.0 - 100.0 * (double)(match.dist) / (double)(match.m_size);
 93 |       fprintf(stdout, "%09d %09d %d %0.1f %u %d %d %u %u %d %d %u %s\n", rid0,
 94 |               rid1, -(match.m_size), err_est, ORIGINAL, a_bgn, a_end, rlen0,
 95 |               (strand0 == ORIGINAL ? strand1 : 1 - strand1), b_bgn, b_end,
 96 |               rlen1,
 97 |               ovlp.ovlp_type == OVERLAP
 98 |                   ? "overlap"
 99 |                   : (ovlp.ovlp_type == CONTAINS ? "contains" : "contained"));
100 |       kh_put(RPAIR, rid_pairs, ridp, &absent);
101 |     }
102 |   }
103 |   kh_destroy(RPAIR, rid_pairs);
104 | }
105 | 


--------------------------------------------------------------------------------
/src/shmr_mkseqdb.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <errno.h>
  3 | #include <stdint.h>
  4 | #include <stdio.h>
  5 | #include <unistd.h>
  6 | #include <zlib.h>
  7 | #include "kseq.h"
  8 | #include "shimmer.h"
  9 | 
 10 | KSEQ_INIT(gzFile, gzread);
 11 | 
 12 | extern char *optarg;
 13 | extern int optind, opterr, optopt;
 14 | 
 15 | int main(int argc, char *argv[]) {
 16 |   gzFile fp;
 17 |   FILE *seq_dataset_file;
 18 |   FILE *index_file;
 19 |   FILE *seqdb_file;
 20 |   kseq_t *seq;
 21 |   uint32_t rid;
 22 |   char *seq_dataset_path = NULL;
 23 |   char *seqdb_prefix = NULL;
 24 |   char index_fn[8192];
 25 |   char seqdb_fn[8192];
 26 |   char fn[8192];
 27 |   int l, c;
 28 | 
 29 |   opterr = 0;
 30 | 
 31 |   while ((c = getopt(argc, argv, "d:p:")) != -1) {
 32 |     switch (c) {
 33 |       case 'd':
 34 |         seq_dataset_path = optarg;
 35 |         break;
 36 |       case 'p':
 37 |         seqdb_prefix = optarg;
 38 |         break;
 39 | 
 40 |       case '?':
 41 |         if (optopt == 'd') {
 42 |           fprintf(stderr,
 43 |                   "Option -%c not specified, using 'seq_dataset.lst' as the "
 44 |                   "input file\n",
 45 |                   optopt);
 46 |         } else if (optopt == 'p') {
 47 |           fprintf(stderr,
 48 |                   "Option -%c not specified, using 'seq_dataset' as the output "
 49 |                   "prefix\n",
 50 |                   optopt);
 51 |         } else {
 52 |           fprintf(
 53 |               stderr,
 54 |               "Usage: shmr_mkseqdb -d seq_dataset.lst -p seq_dataset_prefix\n");
 55 |         }
 56 |         return 1;
 57 |       default:
 58 |         abort();
 59 |     }
 60 |   }
 61 | 
 62 |   if (seq_dataset_path == NULL) {
 63 |     seq_dataset_path = (char *)calloc(8192, 1);
 64 |     snprintf(seq_dataset_path, 8191, "seq_dataset.lst");
 65 |   }
 66 | 
 67 |   if (seqdb_prefix == NULL) {
 68 |     seqdb_prefix = (char *)calloc(8192, 1);
 69 |     snprintf(seqdb_prefix, 8191, "seq_dataset");
 70 |   }
 71 | 
 72 |   seq_dataset_file = fopen(seq_dataset_path, "r");
 73 |   printf("input sequence dataset file list: '%s'\n", seq_dataset_path);
 74 |   if (!seq_dataset_file) {
 75 |     fprintf(stderr, "file '%s' open error: %s\n", seq_dataset_path,
 76 |             strerror(errno));
 77 |     exit(1);
 78 |   }
 79 | 
 80 |   int written;
 81 |   written = snprintf(index_fn, sizeof(index_fn), "%s.idx", seqdb_prefix);
 82 |   assert(written < sizeof(index_fn));
 83 |   printf("output index file: %s\n", index_fn);
 84 |   index_file = fopen(index_fn, "w");  // use text file for now
 85 |   if (!index_file) {
 86 |     fprintf(stderr, "file '%s' open error: %s\n", index_fn, strerror(errno));
 87 |     exit(1);
 88 |   }
 89 | 
 90 |   written = snprintf(seqdb_fn, sizeof(index_fn), "%s.seqdb", seqdb_prefix);
 91 |   assert(written < sizeof(seqdb_fn));
 92 |   printf("output seqdb file: %s\n", index_fn);
 93 |   seqdb_file = fopen(seqdb_fn, "wb");  // use text file for now
 94 |   if (!index_file) {
 95 |     fprintf(stderr, "file '%s' open error: %s\n", index_fn, strerror(errno));
 96 |     exit(1);
 97 |   }
 98 | 
 99 |   rid = 0;
100 |   size_t offset = 0;
101 |   while (fscanf(seq_dataset_file, "%s", fn) != EOF) {
102 |     fp = gzopen(fn, "r");
103 |     if (!fp) {
104 |       fprintf(stderr, "file '%s' open error: %s\n", fn, strerror(errno));
105 |       exit(1);
106 |     }
107 |     seq = kseq_init(fp);
108 |     while ((l = kseq_read(seq)) >= 0) {
109 |       uint8_t *encoded;
110 |       encoded = malloc(seq->seq.l);
111 |       encode_biseq(encoded, seq->seq.s, seq->seq.l);
112 |       fprintf(index_file, "%09d %s %u %lu\n", rid, seq->name.s, seq->seq.l,
113 |               offset);
114 |       fwrite(encoded, sizeof(uint8_t), seq->seq.l, seqdb_file);
115 |       rid += 1;
116 |       offset += seq->seq.l;
117 |       free(encoded);
118 |     }
119 |     kseq_destroy(seq);
120 |     gzclose(fp);
121 |   }
122 |   fclose(seq_dataset_file);
123 |   fclose(index_file);
124 |   fclose(seqdb_file);
125 |   if (!seq_dataset_path) free(seq_dataset_path);
126 |   if (!seqdb_prefix) free(seqdb_prefix);
127 |   return 0;
128 | }
129 | 


--------------------------------------------------------------------------------
/src/shimmer.h:
--------------------------------------------------------------------------------
/* Shared type definitions and function prototypes for the shimmer tools. */
#ifndef SHIMMER_H
#define SHIMMER_H

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include "khash.h"
#include "kvec.h"

/* Strand codes; shmr_dedup.c compares ovlp_t.strand0/strand1 against them. */
#define ORIGINAL 0
#define REVERSED 1

#ifdef __cplusplus
extern "C" {
#endif

/* Called as encode_biseq(out, seq, len) in shmr_mkseqdb.c, where out is a
 * len-byte buffer. */
void encode_biseq(uint8_t *, char *, size_t);

/* Called as decode_biseq(encoded, out, len, strand) in path_to_contig.py. */
void decode_biseq(uint8_t *, char *, size_t, uint8_t);

void reverse_complement(char *, size_t);

/* One minimizer record. As documented in py-utils/dump_L0.py:
 *   x = kMer<<8 | kmerSpan
 *   y = rid<<32 | lastPos<<1 | strand */
typedef struct {
  uint64_t x, y;
} mm128_t;
/* kvec-style vector of minimizers: n used, m allocated, a storage. */
typedef struct {
  size_t n, m;
  mm128_t *a;
} mm128_v;
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid,
               int is_hpc, mm128_v *p);
void mm_reduce(mm128_v *, mm128_v *, uint8_t);
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);

/* Minimizer list file I/O. */
void write_mmlist(char *, mm128_v *);
mm128_v read_mmlist(char *);
void append_mmlist(mm128_v *, mm128_v *);

typedef struct {
  char *name;
  uint32_t rid;
} seq_data_t;
typedef struct {
  size_t n, m;
  seq_data_t *a;
} seq_data_v;

/* NOTE(review): presumably a read's length and its offset into the .seqdb
 * file, matching the columns shmr_mkseqdb.c writes to the index - confirm. */
typedef struct {
  uint32_t len;
  size_t offset;
} rl_t;
KHASH_MAP_INIT_STR(RID, uint32_t);
KHASH_MAP_INIT_INT(RLEN, rl_t);
khash_t(RID) * build_read_index(char *, seq_data_v *, khash_t(RLEN) *);
khash_t(RLEN) * get_read_length_map(char *);

void mm_end_filter(mm128_v *, mm128_v *, mm128_v *, khash_t(RLEN) *, uint32_t);

/* Minimizer occurrence counting. */
KHASH_MAP_INIT_INT64(MMC, uint32_t);
typedef struct {
  uint64_t mer;
  uint32_t count;
} mm_count_t;
typedef struct {
  size_t n, m;
  mm_count_t *a;
} mm_count_v;
void mm_count(mm128_v *, khash_t(MMC) *, mm_count_v *);
void write_mm_count(char *, mm_count_v *);
void mm_count_to_vec(khash_t(MMC) *, mm_count_v *);
mm_count_v read_mm_count(char *fn);

void aggregate_mm_count(khash_t(MMC) *, mm_count_v *);

typedef struct {
  uint64_t y0, y1;
  uint8_t direction;
} mp128_t;
typedef struct {
  size_t n, m;
  mp128_t *a;
} mp128_v;
/* Two-level map: MMER0 maps a 64-bit key to an MMER1 map, which maps a
 * 64-bit key to a vector of mp128_t. */
KHASH_MAP_INIT_INT64(MMER1, mp128_v *);
typedef khash_t(MMER1) * mmert1_p_t;
KHASH_MAP_INIT_INT64(MMER0, mmert1_p_t);

void build_map(mm128_v *, khash_t(MMER0) *, khash_t(RLEN) *, khash_t(MMC) *,
               uint32_t, uint32_t, uint32_t, uint32_t);

char *get_read_seq(FILE *, uint32_t, khash_t(RLEN) *);
uint8_t *get_read_seq_mmap_ptr(uint8_t *, uint32_t, khash_t(RLEN) *);

// For DWmatch
typedef int32_t seq_coor_t;

/* Result of ovlp_match(). shmr_dedup.c derives percent identity as
 * 100 - 100 * dist / m_size and reads q_bgn..q_end / t_bgn..t_end as the
 * matched ranges; path_to_contig.py uses q_m_end / t_m_end when stitching. */
typedef struct {
  seq_coor_t m_size, dist;
  seq_coor_t q_bgn, q_end;
  seq_coor_t t_bgn, t_end;
  seq_coor_t t_m_end, q_m_end;
} ovlp_match_t;

/* Binary overlap record, read from stdin by shmr_dedup.c. There, y0/y1 are
 * decoded like mm128_t.y (rid in the upper 32 bits), rl0/rl1 are used as the
 * read lengths, and ovlp_type is 0 = overlap, 1 = contains, 2 = contained. */
typedef struct {
  uint64_t y0, y1;
  uint32_t rl0, rl1;
  uint8_t strand0, strand1;
  uint8_t ovlp_type;
  ovlp_match_t match;
} ovlp_t;

typedef struct {
  seq_coor_t s1, e1;
  seq_coor_t s2, e2;
  long int score;
} match_range;

/* Called as ovlp_match(seq0, len0, strand0, seq1, len1, strand1, 100) in
 * path_to_contig.py; the result is released with free_ovlp_match(). */
ovlp_match_t *ovlp_match(uint8_t *, seq_coor_t, uint8_t, uint8_t *, seq_coor_t,
                         uint8_t, seq_coor_t);

void free_ovlp_match(ovlp_match_t *);

typedef struct {
  uint64_t x0, x1, y0, y1;
  uint8_t direction;
} mp256_t;
typedef struct {
  size_t n, m;
  mp256_t *a;
} mp256_v;

/* Bundle of opaque pointers. NOTE(review): the name suggests it is handed to
 * the Python bindings - confirm against shimmer4py.c. */
typedef struct {
  mm128_v *mmers;
  void *mmer0_map;
  void *rlmap;
  void *mcmap;
  void *ridmm;
} py_mmer_t;

// for shmr_align
/* Index into a minimizer array. */
typedef uint32_t mm_idx_t;
typedef kvec_t(mm_idx_t) mm_idx_v;

/* One alignment: two index vectors, one per minimizer list (idx0, idx1). */
typedef struct {
  mm_idx_v idx0;
  mm_idx_v idx1;
} shmr_aln_t;

typedef kvec_t(shmr_aln_t) shmr_aln_v;

/* Maps a minimizer hash (mm128_t.x >> 8 in shmr_align.c) to the indices at
 * which it occurs. */
KHASH_MAP_INIT_INT64(MMIDX, mm_idx_v *);
/* Generates a list of co-aligned minimizers from two minimizer lists;
 * arguments are (mmers0, mmers1, direction, max_diff, max_dist, max_repeat). */
shmr_aln_v *shmr_aln(mm128_v *, mm128_v *, uint8_t, uint32_t, uint32_t,
                     uint32_t);

void free_shmr_alns(shmr_aln_v *);

KHASH_MAP_INIT_INT(RIDMM, mm128_v *);
void get_ridmm(khash_t(RIDMM) *, mm128_v *);
uint32_t mmer_pos(mm128_t *);

#ifdef __cplusplus
}
#endif

#endif
166 | 


--------------------------------------------------------------------------------
/py/scripts/path_to_contig.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import mmap
  4 | import sys
  5 | import numpy
  6 | from peregrine._shimmer4py import ffi, lib
  7 | 
  8 | basemap = {1:"A",2:"C",4:"G",8:"T"}
  9 | stitching_overhang_size = 500
 10 | 
 11 | if __name__ == "__main__":
 12 |     seqdb_prefix = sys.argv[1]
 13 |     tiling_path_fn = sys.argv[2]
 14 | 
 15 |     f = open("{}.seqdb".format(seqdb_prefix), "rb")
 16 |     seqdb = mmap.mmap(f.fileno(), 0,
 17 |                       flags=mmap.MAP_SHARED, prot=mmap.PROT_READ)
 18 | 
 19 |     read_idx = {}
 20 |     with open("{}.idx".format(seqdb_prefix)) as f:
 21 |         for row in f:
 22 |             row = row.strip().split()
 23 |             rid, rname, rlen, offset = row
 24 |             rid = int(rid)
 25 |             rlen = int(rlen)
 26 |             offset = int(offset)
 27 |             read_idx.setdefault(rid, {})
 28 |             read_idx[rid]["name"] = rname
 29 |             read_idx[rid]["length"] = rlen
 30 |             read_idx[rid]["offset"] = offset
 31 | 
 32 |     tiling_path_data = {}
 33 |     with open(tiling_path_fn) as f:
 34 |         for row in f:
 35 |             row = row.strip().split()
 36 |             tiling_path_data.setdefault(row[0], [])
 37 |             tiling_path_data[row[0]].append(row)
 38 | 
 39 |     for ctg in tiling_path_data:
 40 |         segments = []
 41 |         # I don't like to have the first read as it breaks the string formulation,
 42 |         # but poeple like it for no reason, so I will just do it
 43 |         ctg_id, v, w, r, s, e, olen, idt, _1, _2 = tiling_path_data[ctg][0]
 44 |         v = v.split(":")
 45 |         rid0 = int(v[0])
 46 |         s0 = read_idx[rid0]["offset"]
 47 |         slen0 = read_idx[rid0]["length"]
 48 |         e0 = s0 + slen0
 49 |         bseq0 = seqdb[s0:e0]
 50 |         strand0 = 0 if v[1] == "E" else 1
 51 | 
 52 |         seq = ffi.new("char[{}]".format(slen0))
 53 |         lib.decode_biseq(bseq0, seq, slen0, strand0)
 54 | 
 55 |         ctg_len = len(seq)
 56 |         segments.append((ctg_len, 0, seq))
 57 |         for row in tiling_path_data[ctg]:
 58 |             ctg_id, v, w, r, s, e, olen, idt, _1, _2 = row
 59 |             v = v.split(":")
 60 |             w = w.split(":")
 61 |             s = int(s)
 62 |             e = int(e)
 63 |             olen = int(olen)
 64 |             idt = float(idt)
 65 | 
 66 |             rid0 = int(v[0])
 67 |             s0 = read_idx[rid0]["offset"]
 68 |             slen0 = read_idx[rid0]["length"]
 69 |             e0 = s0 + slen0
 70 |             bseq0 = seqdb[s0:e0]
 71 |             strand0 = 0 if v[1] == "E" else 1
 72 | 
 73 |             rid1 = int(w[0])
 74 |             s1 = read_idx[rid1]["offset"]
 75 |             slen1 = read_idx[rid1]["length"]
 76 |             e1 = s1 + slen1
 77 |             bseq1 = seqdb[s1:e1]
 78 |             strand1 = 0 if w[1] == "E" else 1
 79 | 
 80 |             offset1 = slen0 - stitching_overhang_size
 81 |             offset2 = slen1 - abs(e-s) - stitching_overhang_size
 82 |             match = lib.ovlp_match(bseq0[offset1:], slen0 - offset1, strand0,
 83 |                                    bseq1[offset2:], slen1 - offset2, strand1,
 84 |                                    100)
 85 | 
 86 |             if strand1 == 1:
 87 |                 s, e = slen1 - s, slen1 - e
 88 |             assert(e > s)
 89 |             seg_size = e - s + stitching_overhang_size - match.t_m_end
 90 |             seq = ffi.new("char[{}]".format(seg_size))
 91 |             lib.decode_biseq(bseq1[e-seg_size:e],
 92 |                              seq,
 93 |                              seg_size,
 94 |                              strand1)
 95 |             segments.append((ctg_len,
 96 |                              ctg_len - stitching_overhang_size + match.q_m_end,
 97 |                             seq))
 98 |             # print(row)
 99 |             # print((ctg_len, match.q_m_end, match.t_m_end,
100 |             #       ctg_len - stitching_overhang_size + match.q_m_end,
101 |             #       ffi.string(seq)))
102 |             ctg_len -= (stitching_overhang_size - match.q_m_end)
103 |             ctg_len += (stitching_overhang_size - match.t_m_end) + e - s
104 | 
105 |             lib.free_ovlp_match(match)
106 | 
107 |         ctg_str = numpy.ones(ctg_len, dtype=numpy.byte)
108 |         ctg_str *= ord('N')
109 |         print(">{}".format(ctg_id))
110 |         for seg in segments:
111 |             s = seg[1]
112 |             e = seg[1] + len(ffi.string(seg[2]))
113 |             ctg_str[s:e] = list(ffi.string(seg[2]))
114 |             ffi.release(seg[2])
115 |         print("".join((chr(x) for x in ctg_str)))
116 | 
117 | 


--------------------------------------------------------------------------------
/nim-mini/dump_mmmer.nim:
--------------------------------------------------------------------------------
  1 | let doc = """
  2 | dump_mmmer
  3 | 
  4 | Usage:
  5 |   dump_mmmer [options] <fasta_file_name>
  6 | 
  7 | Options:
  8 |   -h --help                       Show this screen
  9 |   -w --windowsize <windowsize>    Window size [default: 64]
 10 | """
 11 | 
 12 | import streams
 13 | import strfmt
 14 | import tables
 15 | import strutils
 16 | import sequtils
 17 | import docopt
 18 | import parseutils
 19 | 
# Parse the command line according to the usage text in `doc`.
let args = docopt(doc, version = "dump minimizer")

type
  # A k-mer hash together with the position it was stored at.
  pos_kmer = tuple[pos:uint32, kmer:uint32]

var 
  # Input FASTA path and minimizer window size, both from the command line.
  fn = $args["<fasta_file_name>"]
  ws = parseInt($args["--windowsize"]).uint32

echo fn

var
  # Input stream and the line buffer reused by the read loop at the bottom.
  fs = newFileStream(fn, fmRead)
  line = ""

# not used
#[
  base_to_code = {'A':0.uint32, 'C':1.uint32,
                  'G':2.uint32, 'T':3.uint32,
                  'a':0.uint32, 'c':1.uint32,
                  'g':2.uint32, 't':3.uint32,
                  'N':0.uint32}.toTable
]#

  # 2-bit code -> base, used by decode_hash.
  code_to_base = {0.uint32:'A', 1.uint32:'C',
                  2.uint32:'G', 3.uint32:'T'}.toTable


# A k-mer's hash is its 2-bit packed value XORed with this key.
let xor_key = 0x7ed55d16.uint32
# k-mer length in bases (2 bits each, so 16 bases fill a uint32).
let ksize = 16.uint32

# Keeps the low 2*ksize bits; with ksize = 16 this is 0xFFFFFFFF.
var mask = 0xFFFFFFFF.uint32 shr (32.uint32 - ksize * 2)
 52 | 
 53 | 
proc rc_DNA_seq(dna_seq:var string) : void {.inline.} =
  ## Reverse-complement `dna_seq` in place.
  ##
  ## Only A/C/G/T (either case) and 'N' are present in the complement table.
  ## NOTE(review): any other character presumably makes the table lookup
  ## fail - confirm how the input is sanitized.
  var rc_map = {'A':'T', 'C':'G',
                'G':'C', 'T':'A',
                'a':'t', 'c':'g',
                'g':'c', 't':'a',
                'N':'N'}.toTable
  # Step 1: reverse the string by swapping characters from both ends.
  for i in 0..toInt(dna_seq.len/2-1):
    swap dna_seq[i], dna_seq[^(i+1)]
  # Step 2: replace every base by its complement.
  for i in 0..<dna_seq.len:
    dna_seq[i] = rc_map[dna_seq[i]]
 64 | 
 65 | 
proc decode_hash(hashcode:uint32) : string  {.inline.} =
  ## Turn a k-mer hash back into its base string.
  ##
  ## Undoes the XOR with `xor_key`, then reads `ksize` 2-bit codes starting
  ## from the least significant bits and reverses them so that the base held
  ## in the most significant bits comes first.
  var hc: uint32
  hc = hashcode xor xor_key
  hc = hc and mask
  var t: uint32
  var s = newSeq[char](0)
  for i in 0..<ksize:
    s.add(code_to_base[hc and 0x3])
    # `t` is only referenced by the commented-out debug echo below.
    t = hc and 0x3
    #echo hashcode, " ", hashcode xor xor_key, " ",hc, " ",t
    hc = hc shr 2
  # Bases were collected last-to-first; reverse them in place.
  for i in 0..toInt(s.len/2-1):
    swap s[i], s[^(i+1)]
  return s.mapIt(string, $it).join
 80 | 
 81 | 
proc find_minimizers(dna_seq:string): void =
  ## Print the window minimizers of `dna_seq`.
  ##
  ## Every output line is "<tag> <pos> <hash> <decoded k-mer>", where tag is
  ## 0 for the minimizer of the first window, 1 when a smaller hash enters
  ## the window, and 2 when the previous minimizer left the window and the
  ## window was rescanned.
  var 
    c: char
    p_mer: uint32
    h_mer: uint32
    w_start: uint32
    w_end: uint32
    mmer_seq: seq[uint32]
    pos: uint32
    pos2: uint32
    pm: pos_kmer 
    c_minimizer : pos_kmer

  mmer_seq= newSeq[uint32](dna_seq.len)
  p_mer = 0x00000000.uint32
  pos = 0.uint32

  # Pass 1: roll a 2-bit packed k-mer over the sequence and store its hash.
  # Bases other than C/G/T (including N) are encoded as 0, i.e. like A.
  for c in dna_seq:
    p_mer = p_mer shl 2 
    case c
    of 'C', 'c':
      p_mer = p_mer or 1.uint32
    of 'G', 'g':
      p_mer = p_mer or 2.uint32
    of 'T', 't':
      p_mer = p_mer or 3.uint32
    else:
      p_mer = p_mer or 0.uint32

    p_mer = p_mer and mask
    h_mer = p_mer xor xor_key
    # The hash of the k-mer ending at `pos` is stored at index pos - ksize.
    # NOTE(review): the k-mer ending at pos == ksize - 1 is already complete
    # but is never stored - confirm whether this offset is intended.
    if pos.int - ksize.int >= 0:
      mmer_seq[pos.int - ksize.int] = h_mer
    inc(pos)

  # Minimizer of the first window [0, ws): the first smallest hash wins.
  # The caller must ensure the sequence holds at least ws entries.
  c_minimizer.pos = 0.uint32
  c_minimizer.kmer = 0xFFFFFFFF.uint32 
  w_start = 0
  w_end = ws
  for pos in w_start ..< w_end:
    if mmer_seq[pos.int] < c_minimizer.kmer:
      c_minimizer.pos = pos
      c_minimizer.kmer = mmer_seq[pos.int]
  echo "0 ", c_minimizer.pos,  " ", c_minimizer.kmer, " ", decode_hash(c_minimizer.kmer)

  # Slide over the remaining positions.
  # NOTE(review): `dna_seq.len.uint32 - ksize` is unsigned arithmetic and
  # would wrap for sequences shorter than ksize - confirm callers prevent it.
  for pos in ws ..< dna_seq.len.uint32 - ksize:
    # echo "X ", pos.int, " ", mmer_seq[pos.int], " ", c_minimizer.kmer
    # A strictly smaller hash immediately becomes the new minimizer.
    if mmer_seq[pos.int] < c_minimizer.kmer:
      c_minimizer.pos = pos
      c_minimizer.kmer = mmer_seq[pos.int]
      echo "1 ",c_minimizer.pos,  " ", c_minimizer.kmer, " ", decode_hash(c_minimizer.kmer)
      continue

    # The current minimizer is ws or more positions behind: rescan the ws
    # positions that follow it to pick the next one.
    if pos.int - c_minimizer.pos.int >= ws.int:
      w_start = c_minimizer.pos + 1
      w_end = w_start + ws
      c_minimizer.kmer = 0xFFFFFFFF.uint32 
      for pos2 in w_start ..< w_end:
        if mmer_seq[pos2.int] < c_minimizer.kmer:
          c_minimizer.pos = pos2
          c_minimizer.kmer = mmer_seq[pos2.int]
      echo  "2 ", c_minimizer.pos,  " ", c_minimizer.kmer, " ", decode_hash(c_minimizer.kmer)
144 | 
145 | 
# Driver: read the FASTA stream record by record and print the minimizers of
# each sequence, first as given ("|n") and then reverse-complemented.
var 
  dna_seq: string
  seq_name: string


if not isNil(fs):
  while fs.readLine(line):
    # NOTE(review): `line[0]` assumes the line is not empty - confirm how
    # blank lines behave with the Nim version in use.
    if line[0] == '>':
      # A new header: flush the previous record, if there was one.
      if not isNil(seq_name):
        # Sequences shorter than one window are skipped.
        if dna_seq.len < ws.int:
          seq_name = line.strip
          dna_seq = ""
          continue
        echo seq_name, "|", "n"
        find_minimizers(dna_seq)
        rc_DNA_seq(dna_seq)
        echo seq_name, "|", "c"
        find_minimizers(dna_seq)
      seq_name = line.strip
      dna_seq = ""
      continue

    # Sequence line: append it to the current record.
    if line[0] != '>':
      dna_seq.add(line.strip)

  # Flush the last record.
  # NOTE(review): unlike the loop above there is no length check here, and
  # the reverse-complement label is "r" instead of "c" - confirm which
  # behavior and label are intended.
  echo seq_name, "|", "n"
  find_minimizers(dna_seq)
  rc_DNA_seq(dna_seq)
  echo seq_name, "|", "r"
  find_minimizers(dna_seq)
  fs.close()
177 | 


--------------------------------------------------------------------------------
/src/shmr_align.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <errno.h>
  3 | #include <stdbool.h>
  4 | #include <stdint.h>
  5 | #include <stdio.h>
  6 | #include <string.h>
  7 | #include <time.h>
  8 | #include <unistd.h>
  9 | #include <wordexp.h>
 10 | #include "kalloc.h"
 11 | #include "khash.h"
 12 | #include "kvec.h"
 13 | #include "shimmer.h"
 14 | 
 15 | #include <fcntl.h>
 16 | #include <sys/mman.h>
 17 | #include <sys/stat.h>
 18 | 
 19 | #define MAX_SMALL_ALNS 4800
 20 | 
 21 | shmr_aln_v *shmr_aln(mm128_v *mmers0, mm128_v *mmers1, uint8_t direction,
 22 |                      uint32_t max_diff, uint32_t max_dist,
 23 |                      uint32_t max_repeat) {
 24 |   /* generate a list of co-aligned mimimizer from two
 25 |    * minimizer lists: mv1 and mv2
 26 |    */
 27 |   uint64_t mhash;
 28 |   mm128_t mmer0, mmer1;
 29 |   khash_t(MMIDX) *mmidx_map = kh_init(MMIDX);
 30 |   shmr_aln_v *alns;
 31 |   khiter_t k;
 32 |   int32_t absent;
 33 | 
 34 |   mm_idx_v *idx_tmp;
 35 | 
 36 |   alns = calloc(sizeof(shmr_aln_v), 1);
 37 | 
 38 |   mm_idx_t s = 0;
 39 |   /* build a hasmap from mhash to the index of the minimizer array */
 40 |   for (;;) {
 41 |     if (s >= mmers0->n) break;
 42 |     mmer0 = mmers0->a[s];
 43 |     mhash = mmer0.x >> 8;
 44 | 
 45 |     k = kh_put(MMIDX, mmidx_map, mhash, &absent);
 46 |     if (absent) {
 47 |       idx_tmp = calloc(sizeof(mm_idx_v), 1);
 48 |       kv_push(mm_idx_t, 0, *idx_tmp, s);
 49 |       kh_val(mmidx_map, k) = idx_tmp;
 50 |     } else {
 51 |       k = kh_get(MMIDX, mmidx_map, mhash);
 52 |       assert(k != kh_end(mmidx_map));
 53 |       idx_tmp = kh_val(mmidx_map, k);
 54 |       kv_push(mm_idx_t, 0, *idx_tmp, s);
 55 |     }
 56 |     s++;
 57 |   }
 58 | 
 59 |   /* loop through 2nd shimmer list to build alginements */
 60 |   mm_idx_t ss = 0;
 61 |   uint32_t small_aln_count = 0;
 62 |   for (;;) {
 63 |     if (ss >= mmers1->n) break;
 64 |     if (direction == 1) {  // reversed
 65 |       s = mmers1->n - ss;
 66 |     } else {
 67 |       s = ss;
 68 |     }
 69 |     mmer1 = mmers1->a[s];
 70 |     mhash = mmer1.x >> 8;
 71 |     k = kh_get(MMIDX, mmidx_map, mhash);
 72 |     if (k == kh_end(mmidx_map)) {
 73 |       ss++;
 74 |       continue;
 75 |     }
 76 |     idx_tmp = kh_val(mmidx_map, k);
 77 |     if (idx_tmp->n > max_repeat) {
 78 |       ss++;
 79 |       continue;
 80 |     }
 81 | 
 82 |     for (uint32_t i = 0; i < idx_tmp->n; i++) {
 83 |       mmer0 = mmers0->a[idx_tmp->a[i]];
 84 |       int64_t delta0, delta1;
 85 |       int64_t mm_dist;
 86 |       if (direction == 0 && (mmer0.y & 0x1) != (mmer1.y & 0x1)) {
 87 |         continue;
 88 |       }
 89 | 
 90 |       if (direction == 1 && (mmer0.y & 0x1) == (mmer1.y & 0x1)) {
 91 |         continue;
 92 |       }
 93 | 
 94 |       if (direction == 1) {
 95 |         delta0 = abs(mmer_pos(&mmer0) + mmer_pos(&mmer1));
 96 |       } else {
 97 |         delta0 = abs(mmer_pos(&mmer0) - mmer_pos(&mmer1));
 98 |       }
 99 |       uint32_t best_aln_idx = UINT32_MAX;
100 |       double min_diff = max_diff;
101 |       uint8_t best_found = 0;
102 |       small_aln_count = 0;
103 |       for (uint32_t aln_idx = 0; aln_idx < alns->n; aln_idx++) {
104 |         mm128_t m0, m1;
105 |         shmr_aln_t *aln;
106 |         size_t n;
107 |         aln = alns->a + aln_idx;
108 |         n = aln->idx0.n;
109 | 
110 |         if (n < 3) small_aln_count++;
111 | 
112 |         if (idx_tmp->a[i] < aln->idx0.a[n - 1]) continue;
113 | 
114 |         m0 = mmers0->a[aln->idx0.a[n - 1]];
115 |         m1 = mmers1->a[aln->idx1.a[n - 1]];
116 | 
117 |         mm_dist = abs(mmer_pos(&mmer0) - mmer_pos(&m0));
118 |         if (mm_dist >= max_dist) continue;
119 | 
120 |         if (direction == 1) {
121 |           delta1 = abs(mmer_pos(&m0) + mmer_pos(&m1));
122 |         } else {
123 |           delta1 = abs(mmer_pos(&m0) - mmer_pos(&m1));
124 |         }
125 |         // double diff = (double) abs(delta0 - delta1) / (double) (mm_dist);
126 |         uint32_t diff = (uint32_t)abs((int32_t)delta0 - (int32_t)delta1);
127 |         if (diff < max_diff && diff < min_diff && mm_dist < max_dist) {
128 |           min_diff = diff;
129 |           best_aln_idx = aln_idx;
130 |           best_found = 1;
131 |         }
132 |       }
133 |       if (best_found == 1) {
134 |         shmr_aln_t *aln;
135 |         aln = alns->a + best_aln_idx;
136 |         kv_push(mm_idx_t, 0, aln->idx0, idx_tmp->a[i]);
137 |         kv_push(mm_idx_t, 0, aln->idx1, s);
138 |         // printf("best %d %d %d\n", best_aln_idx, idx_tmp->a[i], s);
139 |       } else {
140 |         shmr_aln_t *aln;
141 |         aln = calloc(sizeof(shmr_aln_t), 1);
142 |         kv_push(mm_idx_t, 0, aln->idx0, idx_tmp->a[i]);
143 |         kv_push(mm_idx_t, 0, aln->idx1, s);
144 |         kv_push(shmr_aln_t, 0, *alns, *aln);
145 |         // printf("new %d %d %d\n", best_aln_idx, idx_tmp->a[i], s);
146 |       }
147 |     }
148 |     ss++;
149 |     if (small_aln_count > MAX_SMALL_ALNS) break;
150 |   }
151 | 
152 |   for (khiter_t __i = kh_begin(mmidx_map); __i != kh_end(mmidx_map); ++__i) {
153 |     if (!kh_exist(mmidx_map, __i)) continue;
154 |     idx_tmp = kh_val(mmidx_map, __i);
155 |     kv_destroy(*idx_tmp);
156 |     free(idx_tmp);
157 |   }
158 |   kh_destroy(MMIDX, mmidx_map);
159 |   return alns;
160 | }
161 | 
162 | void free_shmr_alns(shmr_aln_v *alns) {
163 |   for (uint32_t aln_idx = 0; aln_idx < alns->n; aln_idx++) {
164 |     kv_destroy(alns->a[aln_idx].idx0);
165 |     kv_destroy(alns->a[aln_idx].idx1);
166 |   }
167 |   kv_destroy(*alns);
168 |   free(alns);
169 | }
170 | 


--------------------------------------------------------------------------------
/falcon/common.h:
--------------------------------------------------------------------------------
  1 | 
  2 | /*
  3 |  * =====================================================================================
  4 |  *
  5 |  *       Filename:  common.h
  6 |  *
  7 |  *    Description:  Common declarations for the code base 
  8 |  *
  9 |  *        Version:  0.1
 10 |  *        Created:  07/16/2013 07:46:23 AM
 11 |  *       Revision:  none
 12 |  *       Compiler:  gcc
 13 |  *
 14 |  *         Author:  Jason Chin, 
 15 |  *        Company:  
 16 |  *
 17 |  * =====================================================================================
 18 | 
 19 | #################################################################################$$
 20 | # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
 21 | #
 22 | # All rights reserved.
 23 | #
 24 | # Redistribution and use in source and binary forms, with or without
 25 | # modification, are permitted (subject to the limitations in the
 26 | # disclaimer below) provided that the following conditions are met:
 27 | #
 28 | #  * Redistributions of source code must retain the above copyright
 29 | #  notice, this list of conditions and the following disclaimer.
 30 | #
 31 | #  * Redistributions in binary form must reproduce the above
 32 | #  copyright notice, this list of conditions and the following
 33 | #  disclaimer in the documentation and/or other materials provided
 34 | #  with the distribution.
 35 | #
 36 | #  * Neither the name of Pacific Biosciences nor the names of its
 37 | #  contributors may be used to endorse or promote products derived
 38 | #  from this software without specific prior written permission.
 39 | #
 40 | # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
 41 | # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
 42 | # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
 43 | # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 44 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 45 | # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
 46 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 47 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 48 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 49 | # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 50 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 51 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 52 | # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 53 | # SUCH DAMAGE.
 54 | #################################################################################$$
 55 | */
 56 | 
 57 | #include <stdint.h>
 58 | 
 59 | typedef int seq_coor_t; 
 60 | 
 61 | typedef struct {    
 62 |     seq_coor_t aln_str_size ;
 63 |     seq_coor_t dist ;
 64 |     seq_coor_t aln_q_s;
 65 |     seq_coor_t aln_q_e;
 66 |     seq_coor_t aln_t_s;
 67 |     seq_coor_t aln_t_e;
 68 |     char * q_aln_str;
 69 |     char * t_aln_str;
 70 | 
 71 | } alignment;
 72 | 
 73 | 
 74 | typedef struct {
 75 |     seq_coor_t pre_k;
 76 |     seq_coor_t x1;
 77 |     seq_coor_t y1;
 78 |     seq_coor_t x2;
 79 |     seq_coor_t y2;
 80 | } d_path_data;
 81 | 
 82 | typedef struct {
 83 |     seq_coor_t d;
 84 |     seq_coor_t k;
 85 |     seq_coor_t pre_k;
 86 |     seq_coor_t x1;
 87 |     seq_coor_t y1;
 88 |     seq_coor_t x2;
 89 |     seq_coor_t y2;
 90 | } d_path_data2;
 91 | 
 92 | typedef struct {
 93 |     seq_coor_t x;
 94 |     seq_coor_t y;
 95 | } path_point;
 96 | 
 97 | typedef struct {    
 98 |     seq_coor_t start;
 99 |     seq_coor_t last;
100 |     seq_coor_t count;
101 | } kmer_lookup;
102 | 
103 | typedef unsigned char base;
104 | typedef base * seq_array;
105 | typedef seq_coor_t seq_addr;
106 | typedef seq_addr * seq_addr_array;
107 | 
108 | 
109 | typedef struct {
110 |     seq_coor_t count;
111 |     seq_coor_t * query_pos;
112 |     seq_coor_t * target_pos;
113 | } kmer_match;
114 | 
115 | 
116 | typedef struct {
117 |     seq_coor_t s1;
118 |     seq_coor_t e1;
119 |     seq_coor_t s2;
120 |     seq_coor_t e2;
121 |     long int score;
122 | } aln_range;
123 | 
124 | 
125 | typedef struct {
126 |     char * sequence;
127 |     uint8_t * eqv;
128 | } consensus_data;
129 | 
130 | kmer_lookup * allocate_kmer_lookup (seq_coor_t);
131 | void init_kmer_lookup ( kmer_lookup *,  seq_coor_t );
132 | void free_kmer_lookup(kmer_lookup *);
133 | 
134 | seq_array allocate_seq(seq_coor_t);
135 | void init_seq_array( seq_array, seq_coor_t);
136 | void free_seq_array(seq_array);
137 | 
138 | seq_addr_array allocate_seq_addr(seq_coor_t size); 
139 | 
140 | void free_seq_addr_array(seq_addr_array);
141 | 
142 | 
143 | aln_range * find_best_aln_range(kmer_match *, 
144 |         seq_coor_t, 
145 |         seq_coor_t, 
146 |         seq_coor_t); 
147 | 
148 | void free_aln_range( aln_range *);
149 | 
150 | kmer_match * find_kmer_pos_for_seq( char *, 
151 |         seq_coor_t, 
152 |         unsigned int K, 
153 |         seq_addr_array, 
154 |         kmer_lookup * );
155 | 
156 | void free_kmer_match( kmer_match * ptr);
157 | void free_kmer_lookup(kmer_lookup * );
158 | 
159 | 
160 | 
161 | void add_sequence( seq_coor_t, 
162 |         unsigned int, 
163 |         char *, 
164 |         seq_coor_t,
165 |         seq_addr_array, 
166 |         seq_array, 
167 |         kmer_lookup *); 
168 | 
169 | void mask_k_mer(seq_coor_t, kmer_lookup *, seq_coor_t);
170 | 
171 | alignment * align(char *, seq_coor_t,
172 |         char *, seq_coor_t,
173 |         seq_coor_t,
174 |         int); 
175 | 
176 | void free_alignment(alignment *);
177 | 
178 | void free_consensus_data(consensus_data *);
179 | 
180 | 


--------------------------------------------------------------------------------
/src/shimmer4py.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <errno.h>
  3 | #include <stdbool.h>
  4 | #include <stdint.h>
  5 | #include <stdio.h>
  6 | #include <string.h>
  7 | #include <unistd.h>
  8 | #include <wordexp.h>
  9 | #include "kalloc.h"
 10 | #include "khash.h"
 11 | #include "kvec.h"
 12 | #include "shimmer.h"
 13 | 
 14 | #include <fcntl.h>
 15 | #include <sys/mman.h>
 16 | #include <sys/stat.h>
 17 | 
 18 | extern char *optarg;
 19 | extern int optind, opterr, optopt;
 20 | 
 21 | #define handle_error(msg) \
 22 |   do {                    \
 23 |     perror(msg);          \
 24 |     exit(EXIT_FAILURE);   \
 25 |   } while (0)
 26 | 
 27 | #define MMER_COUNT_LOWER_BOUND 2
 28 | #define MMER_COUNT_UPPER_BOUND 240
 29 | #define ORIGINAL 0
 30 | #define REVERSED 1
 31 | #define READ_END_FUZZINESS 48
 32 | #define LOCAL_OVERLAP_UPPERBOUND 120
 33 | #define OVERLAP 0
 34 | #define CONTAINMENT 1
 35 | 
 36 | KHASH_MAP_INIT_INT64(RPAIR, uint8_t);
 37 | 
 38 | int mp128_comp(const void *a, const void *b) {
 39 |   mp128_t *a0 = (mp128_t *)a;
 40 |   mp128_t *b0 = (mp128_t *)b;
 41 |   return ((a0->y0 & 0xFFFFFFFF) >> 1) < ((b0->y0 & 0xFFFFFFFF) >> 1);
 42 | }
 43 | 
 44 | void build_shimmer_map4py(py_mmer_t *py_mmer, char *seqdb_prefix,
 45 |                           char *shimmer_prefix, uint32_t mychunk,
 46 |                           uint32_t total_chunk, uint32_t lowerbound,
 47 |                           uint32_t upperbound) {
 48 |   char mmc_file_path[8192];
 49 |   char mmer_file_path[8192];
 50 |   char seq_idx_file_path[8192];
 51 |   char seqdb_file_path[8291];
 52 | 
 53 |   wordexp_t p;
 54 |   char **mmc_fns;
 55 |   char **shimmer_fns;
 56 | 
 57 |   mm128_v mmers_;
 58 |   mm_count_v mmc;
 59 | 
 60 |   khash_t(RLEN) * rlmap_;
 61 |   khash_t(MMC) *mcmap_ = kh_init(MMC);
 62 |   khash_t(MMER0) * mmer0_map_;
 63 |   khash_t(RIDMM) *ridmm_ = kh_init(RIDMM);
 64 | 
 65 |   assert(total_chunk > 0);
 66 |   assert(mychunk > 0 && mychunk <= total_chunk);
 67 | 
 68 |   if (seqdb_prefix == NULL) {
 69 |     seqdb_prefix = (char *)calloc(8192, 1);
 70 |     snprintf(seqdb_prefix, 8191, "seq_dataset");
 71 |   }
 72 | 
 73 |   if (shimmer_prefix == NULL) {
 74 |     seqdb_prefix = (char *)calloc(8192, 1);
 75 |     snprintf(shimmer_prefix, 8191, "shimmer-L2");
 76 |   }
 77 | 
 78 |   int written;
 79 |   written = snprintf(seq_idx_file_path, sizeof(seq_idx_file_path), "%s.idx",
 80 |                      seqdb_prefix);
 81 |   assert(written < sizeof(seq_idx_file_path));
 82 |   fprintf(stderr, "using index file: %s\n", seq_idx_file_path);
 83 | 
 84 |   rlmap_ = get_read_length_map(seq_idx_file_path);
 85 | 
 86 |   written = snprintf(seqdb_file_path, sizeof(seqdb_file_path), "%s.seqdb",
 87 |                      seqdb_prefix);
 88 |   assert(written < sizeof(seqdb_file_path));
 89 |   fprintf(stderr, "using seqdb file: %s\n", seqdb_file_path);
 90 | 
 91 |   py_mmer->mmers = malloc(sizeof(mm128_v));
 92 |   py_mmer->mmers->n = 0;
 93 |   py_mmer->mmers->m = 0;
 94 |   py_mmer->mmers->a = 0;
 95 | 
 96 |   written = snprintf(mmer_file_path, sizeof(mmer_file_path),
 97 |                      "%s-[0-9]*-of-[0-9]*.dat", shimmer_prefix);
 98 |   assert(written < sizeof(mmer_file_path));
 99 |   wordexp(mmer_file_path, &p, 0);
100 |   shimmer_fns = p.we_wordv;
101 |   for (uint8_t i = 0; i < p.we_wordc; i++) {
102 |     fprintf(stderr, "using shimmer data file: %s\n", shimmer_fns[i]);
103 |     mmers_ = read_mmlist(shimmer_fns[i]);
104 |     append_mmlist(py_mmer->mmers, &mmers_);
105 |     kv_destroy(mmers_);
106 |   }
107 |   wordfree(&p);
108 |   get_ridmm(ridmm_, py_mmer->mmers);
109 | 
110 |   written = snprintf(mmc_file_path, sizeof(mmc_file_path),
111 |                      "%s-MC-[0-9]*-of-[0-9]*.dat", shimmer_prefix);
112 |   assert(written < sizeof(mmc_file_path));
113 |   wordexp(mmc_file_path, &p, 0);
114 |   mmc_fns = p.we_wordv;
115 |   for (uint8_t i = 0; i < p.we_wordc; i++) {
116 |     fprintf(stderr, "using shimmer count file: %s\n", mmc_fns[i]);
117 |     mmc = read_mm_count(mmc_fns[i]);
118 |     aggregate_mm_count(mcmap_, &mmc);
119 |     kv_destroy(mmc);
120 |   }
121 | 
122 |   wordfree(&p);
123 | 
124 |   mmer0_map_ = kh_init(MMER0);
125 | 
126 |   build_map(py_mmer->mmers, mmer0_map_, rlmap_, mcmap_, mychunk, total_chunk,
127 |             lowerbound, upperbound);
128 |   py_mmer->mmer0_map = (void *)mmer0_map_;
129 |   py_mmer->rlmap = (void *)rlmap_;
130 |   py_mmer->mcmap = (void *)mcmap_;
131 |   py_mmer->ridmm = (void *)ridmm_;
132 | }
133 | 
134 | void get_shimmers_for_read(mm128_v *mmer, py_mmer_t *py_mmer, uint32_t rid) {
135 |   khiter_t k;
136 |   khash_t(RIDMM) * ridmm;
137 |   mm128_v *mmer_;
138 |   ridmm = (khash_t(RIDMM) *)py_mmer->ridmm;
139 |   k = kh_get(RIDMM, ridmm, rid);
140 |   if (k != kh_end(ridmm)) {
141 |     mmer_ = kh_val(ridmm, k);
142 |   } else {
143 |     mmer_ = calloc(sizeof(mm128_v), 1);
144 |   }
145 |   mmer->n = mmer_->n;
146 |   mmer->m = mmer_->m;
147 |   mmer->a = mmer_->a;
148 | }
149 | 
150 | uint32_t get_mmer_count(py_mmer_t *py_mmer, uint64_t mhash) {
151 |   khash_t(MMC) *mcmap_ = (khash_t(MMC) *)py_mmer->mcmap;
152 |   khiter_t k = kh_get(MMC, mcmap_, mhash);
153 |   if (k != kh_end(mcmap_)) {
154 |     return kh_val(mcmap_, k);
155 |   } else {
156 |     return 0;
157 |   }
158 | }
159 | 
160 | void get_shimmer_hits(mp256_v *mpv_out, py_mmer_t *py_mmer, uint64_t mhash0,
161 |                       uint32_t span) {
162 |   khash_t(MMER0) *mmer0_map_ = (khash_t(MMER0) *)py_mmer->mmer0_map;
163 |   // khash_t(RLEN) * rlmap_ = (khash_t(RLEN) *) rlmap_;
164 |   // khash_t(MMC) * mcmap_ = (khash_t(MMC) *) mcmap_;
165 | 
166 |   mp128_v *mpv;
167 |   mp256_t mp256;
168 |   uint64_t mhash1;
169 |   khiter_t k;
170 | 
171 |   khash_t(MMER1) * mmer1_map;
172 |   mhash0 <<= 8;
173 |   mhash0 |= span;
174 |   mp256.x0 = mhash0;
175 | 
176 |   k = kh_get(MMER0, mmer0_map_, mhash0);
177 |   if (k == kh_end(mmer0_map_)) {
178 |     return;
179 |   }
180 |   mmer1_map = kh_val(mmer0_map_, k);
181 |   for (khiter_t __j = kh_begin(mmer1_map); __j != kh_end(mmer1_map); ++__j) {
182 |     if (!kh_exist(mmer1_map, __j)) continue;
183 |     mhash1 = kh_key(mmer1_map, __j);
184 |     mp256.x1 = mhash1;
185 |     mhash1 >>= 8;
186 |     mpv = kh_val(mmer1_map, __j);
187 |     qsort(mpv->a, mpv->n, sizeof(mp128_t), mp128_comp);
188 |     for (size_t __k0 = 0; __k0 < (mpv->n); __k0++) {
189 |       mp256.y0 = mpv->a[__k0].y0;
190 |       mp256.y1 = mpv->a[__k0].y1;
191 |       mp256.direction = mpv->a[__k0].direction;
192 | 
193 |       kv_push(mp256_t, NULL, *mpv_out, mp256);
194 |     }
195 |   }
196 | }
197 | 


--------------------------------------------------------------------------------
/src/DWmatch.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /*
  3 |  *
  4 |  =====================================================================================
  5 |  *
  6 |  *       Filename:  DWmatch.c
  7 |  *
  8 |  *    Description:  A banded version for the O(ND) greedy sequence alignment
  9 |  algorithm
 10 |  *
 11 |  *        Version:  0.1
 12 |  *        Created:  07/20/2013 17:00:00
 13 |  *       Revision:  none
 14 |  *       Compiler:  gcc
 15 |  *
 16 |  *         Author:  Jason Chin,
 17 |  *        Company:
 18 |  *
 19 |  *
 20 |  =====================================================================================
 21 | 
 22 |  #################################################################################$$
 23 |  # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
 24 |  #
 25 |  # All rights reserved.
 26 |  #
 27 |  # Redistribution and use in source and binary forms, with or without
 28 |  # modification, are permitted (subject to the limitations in the
 29 |  # disclaimer below) provided that the following conditions are met:
 30 |  #
 31 |  #  * Redistributions of source code must retain the above copyright
 32 |  #  notice, this list of conditions and the following disclaimer.
 33 |  #
 34 |  #  * Redistributions in binary form must reproduce the above
 35 |  #  copyright notice, this list of conditions and the following
 36 |  #  disclaimer in the documentation and/or other materials provided
 37 |  #  with the distribution.
 38 |  #
 39 |  #  * Neither the name of Pacific Biosciences nor the names of its
 40 |  #  contributors may be used to endorse or promote products derived
 41 |  #  from this software without specific prior written permission.
 42 |  #
 43 |  # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
 44 |  # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
 45 |  # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
 46 |  # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 47 |  # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 48 |  # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
 49 |  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 50 |  # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 51 |  # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 52 |  # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 53 |  # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 54 |  # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 55 |  # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 56 |  # SUCH DAMAGE.
 57 |  #################################################################################$$
 58 | */
 59 | 
 60 | #include <limits.h>
 61 | #include <stdbool.h>
 62 | #include <stdio.h>
 63 | #include <stdlib.h>
 64 | #include "shimmer.h"
 65 | 
/*
 * Banded greedy O(ND) overlap match between a query and a target sequence.
 *
 * query_seq / target_seq : one byte per base; the low nibble is compared when
 *                          the corresponding strand argument is 0, the high
 *                          nibble otherwise.
 * q_len / t_len          : number of bases in each sequence.
 * q_strand / t_strand    : nibble selector for each sequence (see above).
 * band_tolerance         : diagonals whose progress (x + y) falls more than
 *                          this below the best one are dropped; the search
 *                          stops once the band is wider than twice this value.
 *
 * Returns a calloc'ed ovlp_match_t (release with free_ovlp_match()).  A match
 * is reported when some diagonal reaches the end of either sequence; in that
 * case q_end/t_end, dist and m_size are set.  When no match is found, q_bgn
 * and t_bgn are reset to 0 and m_size, q_end, t_end keep their initial 0.
 * q_m_end/t_m_end record where the longest exact run seen ended.
 *
 * NOTE(review): the results of calloc() are not checked.
 */
ovlp_match_t *ovlp_match(uint8_t *query_seq, seq_coor_t q_len, uint8_t q_strand,
                         uint8_t *target_seq, seq_coor_t t_len,
                         uint8_t t_strand, seq_coor_t band_tolerance) {
  seq_coor_t *V;  // furthest x reached on each diagonal "k"
  seq_coor_t *U;  // array of matched bases for each "k"
  seq_coor_t k_offset;
  seq_coor_t d;
  seq_coor_t k, k2;
  seq_coor_t best_m;  // the best "matches" for each d
  seq_coor_t min_k, new_min_k;
  seq_coor_t max_k, new_max_k;
  seq_coor_t x, y;
  seq_coor_t x1, y1;
  seq_coor_t max_d;
  seq_coor_t band_size;

  uint8_t q_shift = 0;
  uint8_t t_shift = 0;

  bool start = false;

  ovlp_match_t *rtn;
  bool matched = false;

  // strand 0 reads the low nibble of each byte, any other value the high one
  q_shift = q_strand == 0 ? 0 : 4;
  t_shift = t_strand == 0 ? 0 : 4;

  // printf("debug: %ld %ld\n", q_len, t_len);
  // printf("%s\n", query_seq);

  // give up after 30% of the combined length in differences
  max_d = (int)(0.3 * (q_len + t_len));

  band_size = band_tolerance * 2;

  // diagonals k in [-max_d, max_d] are stored at index k + k_offset
  V = calloc(max_d * 2 + 1, sizeof(seq_coor_t));
  U = calloc(max_d * 2 + 1, sizeof(seq_coor_t));

  k_offset = max_d;

  rtn = calloc(1, sizeof(ovlp_match_t));
  rtn->m_size = 0;
  rtn->q_bgn = 0;
  rtn->q_end = 0;
  rtn->t_bgn = 0;
  rtn->t_end = 0;
  rtn->q_m_end = 0;
  rtn->t_m_end = 0;
  uint32_t longest_match = 0;

  // printf("max_d: %lu, band_size: %lu\n", max_d, band_size);
  best_m = -1;
  min_k = 0;
  max_k = 0;
  for (d = 0; d < max_d; d++) {
    // stop once the set of live diagonals is wider than the allowed band
    if (max_k - min_k > band_size) {
      break;
    }

    for (k = min_k; k <= max_k; k += 2) {
      // pick the neighbouring diagonal to extend from
      if ((k == min_k) ||
          ((k != max_k) && (V[k - 1 + k_offset] < V[k + 1 + k_offset]))) {
        x = V[k + 1 + k_offset];
      } else {
        x = V[k - 1 + k_offset] + 1;
      }
      y = x - k;
      x1 = x;
      y1 = y;

      // slide along the diagonal while the selected nibbles are equal
      while (x < q_len && y < t_len &&
             ((query_seq[x] >> q_shift) & 0x0F) ==
                 ((target_seq[y] >> t_shift) & 0x0F)) {
        x++;
        y++;
      }

      // the first exact run longer than 16 bases marks the overlap start
      if ((x - x1 > 16) && (start == false)) {
        rtn->q_bgn = x1;
        rtn->t_bgn = y1;
        start = true;
      }

      // remember where the longest exact run seen so far ended
      if ((x - x1 > longest_match)) {
        longest_match = x - x1;
        rtn->q_m_end = x;
        rtn->t_m_end = y;
      }

      V[k + k_offset] = x;
      U[k + k_offset] = x + y;

      if (x + y > best_m) {
        best_m = x + y;
      }

      // reaching the end of either sequence finishes the search
      if (x >= q_len || y >= t_len) {
        matched = true;
        break;
      }
    }

    // For banding
    // keep only diagonals within band_tolerance of the best progress
    new_min_k = max_k;
    new_max_k = min_k;

    for (k2 = min_k; k2 <= max_k; k2 += 2) {
      if (U[k2 + k_offset] >= best_m - band_tolerance) {
        if (k2 < new_min_k) {
          new_min_k = k2;
        }
        if (k2 > new_max_k) {
          new_max_k = k2;
        }
      }
    }

    // widen by one diagonal on each side for the next round
    max_k = new_max_k + 1;
    min_k = new_min_k - 1;

    if (matched == true) {
      rtn->q_end = x;
      rtn->t_end = y;
      rtn->dist = d;
      // we don't really generate the alignment path here, so we can only
      // estimate the alignment string size
      rtn->m_size =
          (rtn->q_end - rtn->q_bgn + rtn->t_end - rtn->t_bgn + 2 * d) / 2;
      break;
    }
  }
  // no match: clear any start recorded along the way
  if (matched == false) {
    rtn->q_bgn = 0;
    rtn->t_bgn = 0;
  }

  free(V);
  free(U);
  return rtn;
}
205 | 
206 | void free_ovlp_match(ovlp_match_t *match) { free(match); }
207 | 


--------------------------------------------------------------------------------
/py-utils/FastaReader.py:
--------------------------------------------------------------------------------
  1 | from os.path import abspath, expanduser
  2 | from io import StringIO
  3 | import contextlib
  4 | import gzip
  5 | import re
  6 | import subprocess
  7 | 
  8 | ##
  9 | # Utility functions for FastaReader
 10 | ##
 11 | 
 12 | 
 13 | def wrap(s, columns):
 14 |     return "\n".join(s[start:start + columns]
 15 |                      for start in range(0, len(s), columns))
 16 | 
 17 | 
 18 | def splitFastaHeader(name):
 19 |     """
 20 |     Split a FASTA/FASTQ header into its id and metadata components
 21 |     """
 22 |     nameParts = re.split('\s', name, maxsplit=1)
 23 |     id_ = nameParts[0]
 24 |     if len(nameParts) > 1:
 25 |         metadata = nameParts[1].strip()
 26 |     else:
 27 |         metadata = None
 28 |     return (id_, metadata)
 29 | 
 30 | 
 31 | def splitFileContents(f, delimiter, BLOCKSIZE=8192):
 32 |     """
 33 |     Same semantics as f.read().split(delimiter), but with memory usage
 34 |     determined by largest chunk rather than entire file size
 35 |     """
 36 |     remainder = StringIO()
 37 |     while True:
 38 |         block = f.read(BLOCKSIZE)
 39 |         if not block:
 40 |             break
 41 |         parts = block.split(delimiter)
 42 |         remainder.write(parts[0])
 43 |         for part in parts[1:]:
 44 |             yield remainder.getvalue()
 45 |             remainder = StringIO()
 46 |             remainder.write(part)
 47 |     yield remainder.getvalue()
 48 | 
 49 | 
 50 | class FastaRecord(object):
 51 |     """
 52 |     A FastaRecord object models a named sequence in a FASTA file.
 53 |     """
 54 |     DELIMITER = ">"
 55 |     COLUMNS = 60
 56 | 
 57 |     def __init__(self, name, sequence):
 58 |         try:
 59 |             assert "\n" not in name
 60 |             assert "\n" not in sequence
 61 |             assert self.DELIMITER not in sequence
 62 |             self._name = name
 63 |             self._sequence = sequence
 64 |             self._id, self._metadata = splitFastaHeader(name)
 65 |         except AssertionError:
 66 |             raise ValueError("Invalid FASTA record data")
 67 | 
 68 |     @property
 69 |     def name(self):
 70 |         """
 71 |         The name of the sequence in the FASTA file, equal to the entire
 72 |         FASTA header following the '>' character
 73 |         """
 74 |         return self._name
 75 | 
 76 |     @property
 77 |     def id(self):
 78 |         """
 79 |         The id of the sequence in the FASTA file, equal to the FASTA header
 80 |         up to the first whitespace.
 81 |         """
 82 |         return self._id
 83 | 
 84 |     @property
 85 |     def metadata(self):
 86 |         """
 87 |         The metadata associated with the sequence in the FASTA file, equal to
 88 |         the contents of the FASTA header following the first whitespace
 89 |         """
 90 |         return self._metadata
 91 | 
 92 |     @property
 93 |     def sequence(self):
 94 |         """
 95 |         The sequence for the record as present in the FASTA file.
 96 |         (Newlines are removed but otherwise no sequence normalization
 97 |         is performed).
 98 |         """
 99 |         return self._sequence
100 | 
101 |     @property
102 |     def length(self):
103 |         """
104 |         Get the length of the FASTA sequence
105 |         """
106 |         return len(self._sequence)
107 | 
108 |     @classmethod
109 |     def fromString(cls, s):
110 |         """
111 |         Interprets a string as a FASTA record.  Does not make any
112 |         assumptions about wrapping of the sequence string.
113 |         """
114 |         try:
115 |             lines = s.splitlines()
116 |             assert len(lines) > 1
117 |             assert lines[0][0] == cls.DELIMITER
118 |             name = lines[0][1:]
119 |             sequence = "".join(lines[1:])
120 |             return FastaRecord(name, sequence)
121 |         except AssertionError:
122 |             raise ValueError("String not recognized as a valid FASTA record")
123 | 
124 |     def __eq__(self, other):
125 |         if isinstance(other, self.__class__):
126 |             return (self.name == other.name and
127 |                     self.sequence == other.sequence)
128 |         else:
129 |             return False
130 | 
131 |     def __ne__(self, other):
132 |         return not self.__eq__(other)
133 | 
134 |     def __str__(self):
135 |         """
136 |         Output a string representation of this FASTA record, observing
137 |         standard conventions about sequence wrapping.
138 |         """
139 |         return (">%s\n" % self.name) + \
140 |             wrap(self.sequence, self.COLUMNS)
141 | 
142 | 
143 | # These are refactored from ReaderBase/FastaReader.
144 | 
def yield_fasta_records(f, fn):
    """
    Yield one FastaRecord per '>'-delimited entry read from *f*.

    f: fileobj
    fn: str - filename (for exceptions)

    Raises Exception when the stream has any text before the first '>'.
    Malformed individual records surface as the ValueError raised by
    FastaRecord.fromString.
    """
    parts = splitFileContents(f, ">")
    # Everything before the first '>' must be empty.  This is an explicit
    # check rather than `assert "" == next(parts)`: under -O the assert and
    # the next() call inside it are both removed, so the leading chunk would
    # never be consumed.
    if next(parts) != "":
        raise Exception("Invalid FASTA file {!r}".format(fn))
    for part in parts:
        yield FastaRecord.fromString(">" + part)
157 | 
158 | 
def stream_stdout(call, fn):
    """
    Run the command line *call* with file *fn* as its standard input and
    return the command's standard output as a text stream.

    call: str - command and arguments, split on whitespace (no shell)
    fn: str - path of the file fed to the command's stdin
    """
    args = call.split()
    # The child process gets its own copy of the descriptor, so the parent's
    # handle can be closed as soon as the process has been started.
    with open(fn) as stdin:
        proc = subprocess.Popen(args, stdin=stdin, stdout=subprocess.PIPE,
                                universal_newlines=True)
    # Text mode: the FASTA parser splits the stream on the str ">".
    return proc.stdout
163 | 
164 | 
@contextlib.contextmanager
def open_fasta_reader(fn):
    """
    fn: str - filename

    Note: If you already have a fileobj, you can iterate over yield_fasta_records() directly.

    Streaming reader for FASTA files, useable as a one-shot iterator
    over FastaRecord objects.  Agnostic about line wrapping.

    Files ending in ".gz" are read through gzip, files ending in ".dexta"
    are piped through the external ``undexta`` command, anything else is
    opened as plain text.  The underlying stream is closed when the ``with``
    block exits, including when it exits with an exception.

    Example:
    .. doctest::
        TODO: Get data.
        > from pbcore import data
        > filename = data.getTinyFasta()
        > with open_fasta_reader(filename) as r:
        ...  for record in r:
        ...     print(record.name, len(record.sequence))
        ref000001|EGFR_Exon_2 183
        ref000002|EGFR_Exon_3 203
        ref000003|EGFR_Exon_4 215
        ref000004|EGFR_Exon_5 157
    """
    filename = abspath(expanduser(fn))
    if filename.endswith(".gz"):
        # gzip defaults to binary; the parser needs str, so ask for text.
        ofs = gzip.open(filename, 'rt')
    elif filename.endswith(".dexta"):
        ofs = stream_stdout("undexta -vkU -w60 -i", filename)
    else:
        ofs = open(filename, 'r')
    try:
        yield yield_fasta_records(ofs, filename)
    finally:
        ofs.close()
198 | 
199 | 
class FastaReader(object):
    """Deprecated filename-based wrapper around open_fasta_reader().

    The file is opened anew each time iteration starts.
    """

    def __init__(self, f):
        # Only the name is stored; nothing is opened until iteration.
        self.filename = f

    def __iter__(self):
        with open_fasta_reader(self.filename) as records:
            for record in records:
                yield record
211 | 


--------------------------------------------------------------------------------
/src/mm_sketch.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <stdint.h>
  3 | #include <stdio.h>
  4 | #include <stdlib.h>
  5 | #include <string.h>
  6 | #define __STDC_LIMIT_MACROS
  7 | #include "kvec.h"
  8 | #include "shimmer.h"
  9 | 
/* Lookup table from a byte value to a nucleotide code.
 * 'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' and 'U'/'u' -> 3.
 * The raw byte values 0..3 map to themselves; every other byte maps to 4,
 * which mm_sketch() treats as an ambiguous base. */
unsigned char seq_nt4_table[256] = {
    0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
 22 | 
 23 | static inline uint64_t hash64(uint64_t key, uint64_t mask) {
 24 |   key = (~key + (key << 21)) & mask;  // key = (key << 21) - key - 1;
 25 |   key = key ^ key >> 24;
 26 |   key = ((key + (key << 3)) + (key << 8)) & mask;  // key * 265
 27 |   key = key ^ key >> 14;
 28 |   key = ((key + (key << 2)) + (key << 4)) & mask;  // key * 21
 29 |   key = key ^ key >> 28;
 30 |   key = (key + (key << 31)) & mask;
 31 |   return key;
 32 | }
 33 | 
 34 | typedef struct {  // a simplified version of kdq
 35 |   int front, count;
 36 |   int a[32];
 37 | } tiny_queue_t;
 38 | 
 39 | static inline void tq_push(tiny_queue_t *q, int x) {
 40 |   q->a[((q->count++) + q->front) & 0x1f] = x;
 41 | }
 42 | 
 43 | static inline int tq_shift(tiny_queue_t *q) {
 44 |   int x;
 45 |   if (q->count == 0) return -1;
 46 |   x = q->a[q->front++];
 47 |   q->front &= 0x1f;
 48 |   --q->count;
 49 |   return x;
 50 | }
 51 | 
/**
 * Find symmetric (w,k)-minimizers on a DNA sequence
 *
 * @param km     thread-local memory pool; using NULL falls back to malloc()
 * @param str    DNA sequence
 * @param len    length of $str
 * @param w      find a minimizer for every $w consecutive k-mers
 *               (must satisfy 0 < w < 256, the size of the window buffer)
 * @param k      k-mer size (must satisfy 0 < k <= 28)
 * @param rid    reference ID; will be copied to the output $p array
 * @param is_hpc homopolymer-compressed or not; when non-zero each run of
 *               identical bases is consumed as a single base
 * @param p      minimizers
 *               p->a[i].x = hash64(kMer)<<8 | kmerSpan
 *               p->a[i].y = rid<<32 | lastPos<<1 | strand
 *               where kMer is the smaller of the forward and reverse
 *               complement k-mers, lastPos is the position of the last base
 *               of the i-th minimizer, and strand indicates whether the
 *               minimizer comes from the top or the bottom strand.
 *               Callers may want to set "p->n = 0"; otherwise results are
 *               appended to p
 */
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid,
               int is_hpc, mm128_v *p) {
  uint64_t shift1 = 2 * (k - 1), mask = (1ULL << 2 * k) - 1, kmer[2] = {0, 0};
  int i, j, l, buf_pos, min_pos, kmer_span = 0;
  mm128_t buf[256], min = {UINT64_MAX, UINT64_MAX};
  tiny_queue_t tq;

  assert(len > 0 && (w > 0 && w < 256) &&
         (k > 0 && k <= 28));  // 56 bits for k-mer; could use long k-mers, but
                               // 28 enough in practice
  // Fill the first w window slots with all-one bytes so they compare equal to
  // the UINT64_MAX "empty" sentinel (assumes sizeof(mm128_t) == 16).
  memset(buf, 0xff, w * 16);
  memset(&tq, 0, sizeof(tiny_queue_t));
  kv_resize(mm128_t, km, *p, p->n + len / w);

  // l counts the consecutive unambiguous bases seen so far; buf is a circular
  // buffer over the last w k-mers and min/min_pos track the current minimum.
  for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
    int c = seq_nt4_table[(uint8_t)str[i]];
    mm128_t info = {UINT64_MAX, UINT64_MAX};
    if (c < 4) {  // not an ambiguous base
      int z;
      if (is_hpc) {
        int skip_len = 1;
        if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) {
          for (skip_len = 2; i + skip_len < len; ++skip_len)
            if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c) break;
          i +=
              skip_len - 1;  // put $i at the end of the current homopolymer run
        }
        // tq remembers the run length of each of the last k compressed bases
        // so kmer_span stays the uncompressed length covered by the k-mer.
        tq_push(&tq, skip_len);
        kmer_span += skip_len;
        if (tq.count > k) kmer_span -= tq_shift(&tq);
      } else
        kmer_span = l + 1 < k ? l + 1 : k;
      kmer[0] = (kmer[0] << 2 | c) & mask;              // forward k-mer
      kmer[1] = (kmer[1] >> 2) | (3ULL ^ c) << shift1;  // reverse k-mer
      if (kmer[0] == kmer[1])
        continue;  // skip "symmetric k-mers" as we don't know it strand
      z = kmer[0] < kmer[1] ? 0 : 1;  // strand
      ++l;
      if (l >= k && kmer_span < 256) {  // span must fit in the low 8 bits of x
        info.x = hash64(kmer[z], mask) << 8 | kmer_span;
        info.y = (uint64_t)rid << 32 | (uint32_t)i << 1 | z;
      }
    } else  // ambiguous base: restart k-mer accumulation
      l = 0, tq.count = tq.front = 0, kmer_span = 0;
    buf[buf_pos] = info;  // need to do this here as appropriate buf_pos and
                          // buf[buf_pos] are needed below
    if (l == w + k - 1 &&
        min.x != UINT64_MAX) {  // special case for the first window - because
                                // identical k-mers are not stored yet
      for (j = buf_pos + 1; j < w; ++j)
        if (min.x == buf[j].x && buf[j].y != min.y)
          kv_push(mm128_t, km, *p, buf[j]);
      for (j = 0; j < buf_pos; ++j)
        if (min.x == buf[j].x && buf[j].y != min.y)
          kv_push(mm128_t, km, *p, buf[j]);
    }
    if (info.x <= min.x) {  // a new minimum; then write the old min
      if (l >= w + k && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
      min = info, min_pos = buf_pos;
    } else if (buf_pos == min_pos) {  // old min has moved outside the window
      if (l >= w + k - 1 && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
      for (j = buf_pos + 1, min.x = UINT64_MAX; j < w;
           ++j)  // the two loops are necessary when there are identical k-mers
        if (min.x >= buf[j].x)
          min = buf[j],
          min_pos = j;  // >= is important s.t. min is always the closest k-mer
      for (j = 0; j <= buf_pos; ++j)
        if (min.x >= buf[j].x) min = buf[j], min_pos = j;
      if (l >= w + k - 1 && min.x != UINT64_MAX) {  // write identical k-mers
        for (j = buf_pos + 1; j < w;
             ++j)  // these two loops make sure the output is sorted
          if (min.x == buf[j].x && min.y != buf[j].y)
            kv_push(mm128_t, km, *p, buf[j]);
        for (j = 0; j <= buf_pos; ++j)
          if (min.x == buf[j].x && min.y != buf[j].y)
            kv_push(mm128_t, km, *p, buf[j]);
      }
    }
    if (++buf_pos == w) buf_pos = 0;  // advance the circular window index
  }
  // Flush the minimum that is still pending when the sequence ends.
  if (min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
}
152 | 


--------------------------------------------------------------------------------
/py/peregrine/utils.py:
--------------------------------------------------------------------------------
  1 | import peregrine
  2 | import sys, os
  3 | import numpy as np
  4 | from peregrine._shimmer4py import ffi as shimmer_ffi
  5 | from peregrine._shimmer4py import lib as shimmer4py
  6 | from peregrine._falcon4py import ffi as falcon_ffi
  7 | from peregrine._falcon4py import lib as falcon4py
  8 | 
  9 | 
 10 | rmap = dict(list(zip(b"ACGT", b"TGCA")))
 11 | 
 12 | 
 13 | def rc(seq):
 14 |     return bytes([rmap[c] for c in seq[::-1]])
 15 | 
 16 | 
 17 | def mmer2tuple(mmer):
 18 |     x = mmer.x
 19 |     y = mmer.y
 20 |     span = x & 0xFF
 21 |     mmer = x >> 8
 22 |     rid = y >> 32
 23 |     pos_end = ((y & 0xFFFFFFFF) >> 1) + 1
 24 |     direction = y & 0x1
 25 |     return (mmer, span, rid, pos_end, direction)
 26 | 
 27 | 
 28 | def get_shimmers_from_seq(seq, rid=0,
 29 |                           levels=2, reduction_factor=3,
 30 |                           k=16, w=80):
 31 |     assert levels <= 2
 32 |     c_null = shimmer_ffi.NULL
 33 |     mmers = shimmer_ffi.new("mm128_v *")
 34 |     shimmer4py.mm_sketch(c_null, seq, len(seq), w, k, rid, 0, mmers)
 35 |     if levels == 0:
 36 |         return mmers
 37 |     elif levels == 1:
 38 |         mmers_L1 = shimmer_ffi.new("mm128_v *")
 39 |         shimmer4py.mm_reduce(mmers, mmers_L1, reduction_factor)
 40 |         shimmer_ffi.release(mmers)
 41 |         return mmers_L1
 42 |     elif levels == 2:
 43 |         mmers_L1 = shimmer_ffi.new("mm128_v *")
 44 |         mmers_L2 = shimmer_ffi.new("mm128_v *")
 45 |         shimmer4py.mm_reduce(mmers, mmers_L1, reduction_factor)
 46 |         shimmer4py.mm_reduce(mmers_L1, mmers_L2, reduction_factor)
 47 |         shimmer_ffi.release(mmers_L1)
 48 |         shimmer_ffi.release(mmers)
 49 |         return mmers_L2
 50 | 
 51 | 
 52 | def get_shimmer_alns(shimmers0, shimmers1, direction=0,
 53 |                      max_diff=100, max_dist=1200,
 54 |                      max_repeat=1):
 55 |     aln = shimmer4py.shmr_aln(shimmers0, shimmers1, direction,
 56 |                               max_diff, max_dist, max_repeat)
 57 |     aln_chains = []
 58 |     for i in range(aln.n):
 59 |         chain = []
 60 |         offsets = np.zeros(aln.a[i].idx0.n, dtype=np.float)
 61 |         for j in range(aln.a[i].idx0.n):
 62 |             idx0, idx1 = aln.a[i].idx0.a[j], aln.a[i].idx1.a[j]
 63 |             mmer0 = mmer2tuple(shimmers0.a[idx0])
 64 |             mmer1 = mmer2tuple(shimmers1.a[idx1])
 65 |             chain.append((mmer0, mmer1))
 66 |             if direction == 0:  # same direction
 67 |                 d = mmer0[3] - mmer1[3]
 68 |             else:
 69 |                 d = mmer0[3] + mmer1[3]
 70 |             offsets[j] = d
 71 |         aln_chains.append((chain, np.max(d), np.mean(d), np.min(d)))
 72 |     shimmer4py.free_shmr_alns(aln)
 73 |     return aln_chains
 74 | 
 75 | 
 76 | def get_tag_from_seqs(read_seq, ref_seq, read_offset):
 77 |     rng = falcon_ffi.new("aln_range[1]")
 78 |     read_len = len(read_seq)
 79 |     ref_len = len(ref_seq)
 80 |     aligned = False
 81 |     if read_offset < 0:
 82 |         aln = falcon4py.align(read_seq[abs(read_offset):read_len],
 83 |                               read_len - abs(read_offset),
 84 |                               ref_seq,
 85 |                               len(ref_seq),
 86 |                               150, 1)
 87 |         if abs(abs(aln.aln_q_e-aln.aln_q_s) -
 88 |                 (read_len - abs(read_offset))) < 48:
 89 |             aligned = True
 90 |             rng[0].s1 = aln.aln_q_s
 91 |             rng[0].e1 = aln.aln_q_e
 92 |             rng[0].s2 = aln.aln_t_s
 93 |             rng[0].e2 = aln.aln_t_e
 94 |             t_offset = 0
 95 |         else:
 96 |             falcon4py.free_alignment(aln)
 97 |     else:
 98 |         aln = falcon4py.align(read_seq,
 99 |                               read_len,
100 |                               ref_seq[read_offset:ref_len],
101 |                               ref_len-read_offset,
102 |                               150, 1)
103 |         if abs(abs(aln.aln_q_e-aln.aln_q_s)-read_len) < 48 or \
104 |             abs(ref_len-read_offset-abs(aln.aln_q_e-aln.aln_q_s)) < 48:
105 |             aligned = True
106 |             rng[0].s1 = aln.aln_q_s
107 |             rng[0].e1 = aln.aln_q_e
108 |             rng[0].s2 = aln.aln_t_s
109 |             rng[0].e2 = aln.aln_t_e
110 |             t_offset = read_offset
111 |         else:
112 |             falcon4py.free_alignment(aln)
113 |     tag = None
114 |     if aligned:
115 |         tag = falcon4py.get_align_tags(aln.q_aln_str,
116 |                                        aln.t_aln_str,
117 |                                        aln.aln_str_size,
118 |                                        rng, 0, t_offset)
119 |         falcon4py.free_alignment(aln)
120 |     falcon_ffi.release(rng)
121 | 
122 |     return tag
123 | 
124 | 
def _first_anchor_offset(chain):
    """Return the read offset implied by the first anchor pair of a chain."""
    mmer0, mmer1 = chain[0]
    return mmer0[3] - mmer1[3]


def _tag_against_seed(seq, rid, seq0, shimmers0):
    """Align one read orientation to the seed; return its tag or None."""
    shimmers1 = get_shimmers_from_seq(seq, rid=rid, levels=2)
    try:
        alns = get_shimmer_alns(shimmers0, shimmers1, 0)
    finally:
        # get_shimmer_alns() returns plain Python data, so the C vector is
        # no longer needed once it has returned.
        shimmer4py.free(shimmers1.a)
        shimmer_ffi.release(shimmers1)
    chains = [a[0] for a in alns if a[0]]
    if not chains:
        return None
    # max() keeps the first of equally long chains, like a stable sort by
    # descending length followed by taking element 0.
    longest = max(chains, key=len)
    return get_tag_from_seqs(seq, seq0, _first_anchor_offset(longest))


def get_cns_from_reads(seqs):
    """Build a consensus of ``seqs`` using ``seqs[0]`` as the seed.

    The seed is aligned to itself, then every other read is aligned to the
    seed in both its given orientation and its reverse complement; each
    accepted alignment contributes one set of align tags to the consensus.

    seqs: sequence of bytes objects; must contain at least the seed.
    Returns the consensus sequence as bytes.
    """
    seq0 = seqs[0]
    # Seed plus two orientations for each remaining read gives at most
    # 2 * len(seqs) - 1 tags; keep one spare slot.
    tags = falcon_ffi.new("align_tags_t * [{}]".format(2 * len(seqs) + 1))
    aln_count = 0
    shimmers0 = get_shimmers_from_seq(seq0, rid=0, levels=2)

    alns = get_shimmer_alns(shimmers0, shimmers0, 0)
    if alns and alns[0][0]:
        read_offset = _first_anchor_offset(alns[0][0])
    else:
        # No usable self-chain (e.g. too few shimmers): the seed overlaps
        # itself at offset 0.
        read_offset = 0
    tag = get_tag_from_seqs(seq0, seq0, read_offset)
    if tag is not None:
        tags[aln_count] = tag
        aln_count += 1

    for i, seq in enumerate(seqs):
        if i == 0:
            continue
        # Even rid: the read as given; odd rid: its reverse complement.
        # The read itself (not the seed) is what gets aligned to the seed.
        for rid, oriented in ((i * 2, seq), (i * 2 + 1, rc(seq))):
            tag = _tag_against_seed(oriented, rid, seq0, shimmers0)
            if tag is not None:
                tags[aln_count] = tag
                aln_count += 1

    cns = falcon4py.get_cns_from_align_tags(tags,
                                            aln_count,
                                            len(seq0), 1)
    cns_seq = falcon_ffi.string(cns.sequence)
    falcon4py.free_consensus_data(cns)
    shimmer4py.free(shimmers0.a)
    shimmer_ffi.release(shimmers0)

    return cns_seq
182 | 


--------------------------------------------------------------------------------
/py-utils/process_L2.py:
--------------------------------------------------------------------------------
  1 | from cffi import FFI
  2 | # import redis
  3 | 
# Declare the C types and functions used through cffi: the minimizer record
# (mm128_t), the vector of records (mm128_v), the list reader, and free().
ffi = FFI()

ffi.cdef("""
typedef struct { uint64_t x, y; } mm128_t;
typedef struct { size_t n, m; mm128_t *a; } mm128_v;
mm128_v read_mmlist(char *);
void free(void *ptr);
""")

# C gives access to the symbols of the current process (used for free());
# mm_utils is the shared library that provides read_mmlist().
C = ffi.dlopen(None)
mm_utils = ffi.dlopen("../src/mm_utils.so")
# r_conn = redis.Redis(host='127.0.0.1', port=6379, db=0)

# Byte value of each base -> byte value of its complement.
rmap = dict(zip(b"ACGT", b"TGCA"))

# Text dump of the L2 minimizers; written and closed further down.
L2dump = open("L2.txt", "w")

#hmmerL0 = ffi.new("mm128_v *")
#hmmerL2 = ffi.new("mm128_v *")

# Load the level-0 and level-2 minimizer lists; their `.a` arrays are released
# with C.free() at the end of the script.
hmmerL0 = mm_utils.read_mmlist(b"../test/hmmer-L0-01-of-01.dat")
hmmerL2 = mm_utils.read_mmlist(b"../test/hmmer-L2-01-of-01.dat")
 26 | 
 27 | rid2name = {}
 28 | rid2len = {}
 29 | # rid2seq = {}
 30 | 
 31 | with open("../test/seq_dataset.idx") as f:
 32 |     for row in f:
 33 |         row = row.strip().split()
 34 |         rid, rname, rlen = row
 35 |         rid = int(rid)
 36 |         rlen = int(rlen)
 37 |         rid2name[rid] = rname
 38 |         rid2len[rid] = rlen
 39 | 
 40 | """
 41 | * @param p      minimizers
 42 | *               p->a[i].x = kMer<<8 | kmerSpan
 43 | *               p->a[i].y = rid<<32 | lastPos<<1 | strand
 44 | *               where lastPos is the position of the last base of the i-th minimizer,
 45 | *               and strand indicates whether the minimizer comes from the top or the bottom strand.
 46 | *               Callers may want to set "p->n = 0"; otherwise results are appended to p
 47 | """
 48 | 
 49 | mmer_count = {}
 50 | for i in range(hmmerL0.n):
 51 |     span = hmmerL0.a[i].x & 0xFF
 52 |     mmer = hmmerL0.a[i].x >> 8
 53 |     rid = hmmerL0.a[i].y >> 32
 54 |     pos_end = ((hmmerL0.a[i].y & 0xFFFFFFFF) >> 1) + 1
 55 |     strand = hmmerL0.a[i].y & 0x1
 56 |     mm_str = "{:014X}".format(mmer)
 57 |     mmer_count.setdefault(mm_str, 0)
 58 |     mmer_count[mm_str] += 1
 59 |     #
 60 |     # kmer = bseq[pos_end-span:pos_end]
 61 |     # kmer_r =  bytes([rmap[c] for c in kmer[::-1]])
 62 |     r_pos_end = rid2len[rid] - pos_end + span
 63 |     name = rid2name[rid]
 64 | 
 65 | mmer_count_L2 = {}
 66 | L2list = {}
 67 | for i in range(hmmerL2.n):
 68 |     span = hmmerL2.a[i].x & 0xFF
 69 |     mmer = hmmerL2.a[i].x >> 8
 70 |     rid = hmmerL2.a[i].y >> 32
 71 |     pos_end = ((hmmerL2.a[i].y & 0xFFFFFFFF) >> 1) + 1
 72 |     strand = hmmerL2.a[i].y & 0x1
 73 |     r_pos_end = rid2len[rid] - pos_end + span
 74 |     name = rid2name[rid]
 75 |     mm_str = "{:014X}".format(mmer)
 76 |     mmer_count_L2.setdefault(mm_str, 0)
 77 |     mmer_count_L2[mm_str] += 1
 78 |     print(name, pos_end, r_pos_end, strand,
 79 |           mm_str, mmer_count[mm_str], file=L2dump)
 80 |     L2list.setdefault(rid, [])
 81 |     L2list[rid].append((pos_end, r_pos_end, strand,
 82 |                         mm_str, name))
 83 | L2dump.close()
 84 | 
# L2map: (v_mmer, v_strand, w_mmer, w_strand) -> list of
# (read name, rid, pass flag, v position, w position) for consecutive pairs of
# level-2 minimizers within a read. Pass flag 0 walks the read front to back
# using pos_end; pass flag 1 walks it back to front using r_pos_end and the
# flipped strand bits.
# rspan: rid -> (first minimizer string, last minimizer string) of the read.
L2map = {}
rspan = {}
for rid in L2list:
    lst = L2list[rid]
    if len(lst) < 2:
        continue
    rspan[rid] = lst[0][-2], lst[-1][-2]
    v = lst[0]
    for w in lst[1:]:
        v_pos_end, v_r_pos_end, v_strand, v_mmer, v_name = v
        # v occurs only once among all L2 minimizers: move on to w
        if mmer_count_L2[v_mmer] < 2:
            v = w
            continue
        w_pos_end, w_r_pos_end, w_strand, w_mmer, w_name = w
        # w occurs only once: skip it but keep v as the left partner
        if mmer_count_L2[w_mmer] < 2:
            continue
        key = v_mmer, v_strand, w_mmer, w_strand
        L2map.setdefault(key, [])
        L2map[key].append((v_name, rid, 0, v_pos_end, w_pos_end))
        v = w

    # Second pass over the same read in reverse order.
    v = lst[-1]
    for w in lst[-2::-1]:
        v_pos_end, v_r_pos_end, v_strand, v_mmer, v_name = v
        # v occurs only once among all L2 minimizers: move on to w
        if mmer_count_L2[v_mmer] < 2:
            v = w
            continue
        w_pos_end, w_r_pos_end, w_strand, w_mmer, w_name = w
        # w occurs only once: skip it but keep v as the left partner
        if mmer_count_L2[w_mmer] < 2:
            continue
        key = v_mmer, 1-v_strand, w_mmer, 1-w_strand
        L2map.setdefault(key, [])
        L2map[key].append((v_name, rid, 1, v_r_pos_end, w_r_pos_end))
        v = w
123 | 
# Pairs taken from columns 1 and 3 of L0_dt.txt, stored in both orders so
# that membership tests are symmetric.
dt_pairs = set()
with open("L0_dt.txt") as f:
    for row in f:
        row = row.strip().split()
        first, second = row[1], row[3]
        dt_pairs.update([(first, second), (second, first)])
130 | 
import networkx as nx
G = nx.DiGraph()

# r_dimer_set: rid -> set of minimizer-pair keys observed in that read
# (entries named "ref" are ignored).
r_dimer_set = {}
# read_ovlp: (read name, strand) -> list of (overlap length, (name, strand)).
read_ovlp = {}
for key in L2map.keys():
    v_mmer, v_strand, w_mmer, w_strand = key
    for r in L2map[key]:
        v_name, v_rid, r_strand, v_pos_end, w_pos_end = r
        if v_name == "ref": continue
        r_dimer_set.setdefault(v_rid, set())
        r_dimer_set[v_rid].add(key)

# For every minimizer pair, order the reads containing it by their offsets
# relative to the pair and compare each read with the previous one in that
# order.
for key in L2map.keys():
    v_mmer, v_strand, w_mmer, w_strand = key
    rlist = []
    n = len(L2map[key])
    for r in L2map[key]:
        v_name, v_rid, r_strand, v_pos_end, w_pos_end = r
        if v_name == "ref": continue
        left_offset_v = -v_pos_end
        right_offset_v = rid2len[v_rid]-v_pos_end
        dist = w_pos_end - v_pos_end
        print("X", v_mmer, v_strand, w_mmer, w_strand,
              v_name, r_strand, rid2len[v_rid],
              v_pos_end, w_pos_end,
              mmer_count_L2[v_mmer], mmer_count_L2[w_mmer], n,
              left_offset_v, right_offset_v)

        # Ignore minimizers that are unique or occur 30 or more times.
        if mmer_count_L2[v_mmer] >= 30 or mmer_count_L2[v_mmer] <= 1:
            continue
        if mmer_count_L2[w_mmer] >= 30 or mmer_count_L2[w_mmer] <= 1:
            continue

        rlist.append((left_offset_v, right_offset_v, v_name,
                      v_rid, r_strand, dist ))

    if len(rlist) == 0: continue
    rlist.sort()

    # The "...0" variables always describe the previous read in rlist.
    left_offset_v0, right_offset_v0, r_name0, r_id0, r_strand0, _ = rlist[0]
    p_set = r_dimer_set[r_id0]
    for left_offset_v, right_offset_v, r_name, r_id, r_strand, dist in rlist[1:]:
        if right_offset_v0 < right_offset_v:
            overlap_count = len(r_dimer_set[r_id] & p_set)
            overlap_len = rid2len[r_id0] - abs(left_offset_v0-left_offset_v)
            dt_match = 1 if (r_name0, r_name) in dt_pairs else 0
            print("Y", v_mmer, v_strand, w_mmer, w_strand,
                  r_name0, r_strand0, r_name, r_strand,
                  overlap_count, overlap_len, left_offset_v0, left_offset_v, dt_match)
            if dt_match == 1:
                read_ovlp.setdefault((r_name0, r_strand0), [])
                read_ovlp.setdefault((r_name, 1-r_strand), [])
                read_ovlp[(r_name0, r_strand0)].append((overlap_len, (r_name, r_strand)))
                read_ovlp[(r_name, 1-r_strand)].append((-overlap_len, (r_name0, 1-r_strand0)))

                # G.add_edge("{}-{}".format(r_name, r_strand), "{}-{}".format(r_name0, r_strand0))
                # G.add_edge("{}-{}".format(r_name0, r_strand0), "{}-{}".format(r_name, r_strand))
        p_set = r_dimer_set[r_id]
        right_offset_v0 = right_offset_v
        left_offset_v0 = left_offset_v
        r_name0 = r_name
        # Advance the id with the rest of the previous-read state; otherwise
        # overlap_len keeps using the length of the first read in rlist.
        r_id0 = r_id
        r_strand0 = r_strand
198 | 
199 |     for v in read_ovlp[k][:]:
200 |         G.add_edge( "{}-{}".format(*k), "{}-{}".format(*v[-1]))
201 |     #for v in read_ovlp[k][-3:]:
202 |     #    G.add_edge( "{}-{}".format(*k), "{}-{}".format(*v[-1]))
203 | 
204 | nx.write_gexf(G, "test.gexf")
205 | 
206 | C.free(hmmerL0.a)
207 | C.free(hmmerL2.a)
208 | 


--------------------------------------------------------------------------------
/src/kalloc.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include "kalloc.h"
  5 | 
  6 | /* In kalloc, a *core* is a large chunk of contiguous memory. Each core is
  7 |  * associated with a master header, which keeps the size of the current core
  8 |  * and the pointer to next core. Kalloc allocates small *blocks* of memory from
  9 |  * the cores and organizes free memory blocks in a circular single-linked list.
 10 |  *
 11 |  * In the following diagram, "@" stands for the header of a free block (of type
 12 |  * header_t), "#" for the header of an allocated block (of type size_t), "-"
 13 |  * for free memory, and "+" for allocated memory.
 14 |  *
 15 |  * master        This region is core 1.          master           This region is core 2.
 16 |  *      |                                             |
 17 |  *      *@-------#++++++#++++++++++++@--------        *@----------#++++++++++++#+++++++@------------
 18 |  *       |                           |                 |                               |
 19 |  *       p=p->ptr->ptr->ptr->ptr     p->ptr            p->ptr->ptr                     p->ptr->ptr->ptr
 20 |  */
 21 | 
 22 | #define MIN_CORE_SIZE 0x80000
 23 | 
/* Header of a free block and of a core. For a free block, `size` is the block
 * size in units of sizeof(header_t) and `ptr` is the next free block in the
 * circular list; for a core header, `ptr` is the next core. */
typedef struct header_t {
	size_t size;
	struct header_t *ptr;
} header_t;

/* Allocator state: `loop_head` is the current entry point into the circular
 * free list, `core_head` the head of the linked list of cores. */
typedef struct {
	header_t base, *loop_head, *core_head; /* base is a zero-sized block always kept in the loop */
} kmem_t;
 32 | 
/* Print the message `s` followed by a newline to stderr and abort the
 * process. Does not return. */
static void panic(const char *s)
{
	fprintf(stderr, "%s\n", s);
	abort();
}
 38 | 
 39 | void *km_init(void)
 40 | {
 41 | 	return calloc(1, sizeof(kmem_t));
 42 | }
 43 | 
 44 | void km_destroy(void *_km)
 45 | {
 46 | 	kmem_t *km = (kmem_t*)_km;
 47 | 	header_t *p, *q;
 48 | 	if (km == NULL) return;
 49 | 	for (p = km->core_head; p != NULL;) {
 50 | 		q = p->ptr;
 51 | 		free(p);
 52 | 		p = q;
 53 | 	}
 54 | 	free(km);
 55 | }
 56 | 
/* Allocate a new core able to hold at least `nu` units (rounded up to a
 * multiple of MIN_CORE_SIZE units), link it at the head of the core list, and
 * hand its usable part to the free list through kfree(). Aborts through
 * panic() if malloc() fails. Returns the updated loop head. */
static header_t *morecore(kmem_t *km, size_t nu)
{
	header_t *q;
	size_t bytes, *p;
	nu = (nu + 1 + (MIN_CORE_SIZE - 1)) / MIN_CORE_SIZE * MIN_CORE_SIZE; /* the first +1 for core header */
	bytes = nu * sizeof(header_t);
	q = (header_t*)malloc(bytes);
	if (!q) panic("[morecore] insufficient memory");
	q->ptr = km->core_head, q->size = nu, km->core_head = q;
	p = (size_t*)(q + 1);
	*p = nu - 1; /* the size of the free block; -1 because the first unit is used for the core header */
	kfree(km, p + 1); /* initialize the new "core"; NB: the core header is not looped. */
	return km->loop_head;
}
 71 | 
/* Return the block `ap` to the pool `_km`, merging it with an adjacent free
 * block on either side when possible. With a NULL pool this is plain free();
 * a NULL `ap` is ignored. Aborts through panic() when the block overlaps a
 * neighbouring free block. */
void kfree(void *_km, void *ap) /* kfree() also adds a new core to the circular list */
{
	header_t *p, *q;
	kmem_t *km = (kmem_t*)_km;
	
	if (!ap) return;
	if (km == NULL) {
		free(ap);
		return;
	}
	p = (header_t*)((size_t*)ap - 1); /* the size word sits right before the user pointer */
	p->size = *((size_t*)ap - 1);
	/* Find the pointer that points to the block to be freed. The following loop can stop on two conditions:
	 *
	 * a) "p>q && p<q->ptr": @------#++++++++#+++++++@-------    @---------------#+++++++@-------
	 *    (can also be in    |      |                |        -> |                       |
	 *     two cores)        q      p           q->ptr           q                  q->ptr
	 *
	 *                       @--------    #+++++++++@--------    @--------    @------------------
	 *                       |            |         |         -> |            |
	 *                       q            p    q->ptr            q       q->ptr
	 *
	 * b) "q>=q->ptr && (p>q || p<q->ptr)":  @-------#+++++   @--------#+++++++     @-------#+++++   @----------------
	 *                                       |                |        |         -> |                |
	 *                                  q->ptr                q        p       q->ptr                q
	 *
	 *                                       #+++++++@-----   #++++++++@-------     @-------------   #++++++++@-------
	 *                                       |       |                 |         -> |                         |
	 *                                       p  q->ptr                 q       q->ptr                         q
	 */
	for (q = km->loop_head; !(p > q && p < q->ptr); q = q->ptr)
		if (q >= q->ptr && (p > q || p < q->ptr)) break;
	if (p + p->size == q->ptr) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */
		p->size += q->ptr->size;
		p->ptr = q->ptr->ptr;
	} else if (p + p->size > q->ptr && q->ptr >= p) {
		panic("[kfree] The end of the allocated block enters a free block.");
	} else p->ptr = q->ptr; /* backup q->ptr */

	if (q + q->size == p) { /* two adjacent blocks, merge q and p (the other two cases) */
		q->size += p->size;
		q->ptr = p->ptr;
		km->loop_head = q;
	} else if (q + q->size > p && p >= q) {
		panic("[kfree] The end of a free block enters the allocated block.");
	} else km->loop_head = p, q->ptr = p; /* in two cores, cannot be merged; create a new block in the list */
}
119 | 
/* Allocate `n_bytes` from the pool `_km` with a first-fit search of the
 * circular free list, asking morecore() for a new core when a full lap finds
 * no block that is large enough. With a NULL pool this is plain malloc().
 * Returns 0 for a zero-byte request. */
void *kmalloc(void *_km, size_t n_bytes)
{
	kmem_t *km = (kmem_t*)_km;
	size_t n_units;
	header_t *p, *q;

	if (n_bytes == 0) return 0;
	if (km == NULL) return malloc(n_bytes);
	/* request size in header_t units, including room for the size word */
	n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t) + 1;

	if (!(q = km->loop_head)) /* the first time kmalloc() is called, initialize it */
		q = km->loop_head = km->base.ptr = &km->base;
	for (p = q->ptr;; q = p, p = p->ptr) { /* search for a suitable block */
		if (p->size >= n_units) { /* p->size is the size of the current block. This line means the current block is large enough. */
			if (p->size == n_units) q->ptr = p->ptr; /* no need to split the block */
			else { /* split the block. NB: memory is allocated at the end of the block! */
				p->size -= n_units; /* reduce the size of the free block */
				p += p->size; /* p points to the allocated block */
				*(size_t*)p = n_units; /* set the size */
			}
			km->loop_head = q; /* set the end of chain */
			return (size_t*)p + 1;
		}
		if (p == km->loop_head) { /* then ask for more "cores" */
			if ((p = morecore(km, n_units)) == 0) return 0;
		}
	}
}
148 | 
149 | void *kcalloc(void *_km, size_t count, size_t size)
150 | {
151 | 	kmem_t *km = (kmem_t*)_km;
152 | 	void *p;
153 | 	if (size == 0 || count == 0) return 0;
154 | 	if (km == NULL) return calloc(count, size);
155 | 	p = kmalloc(km, count * size);
156 | 	memset(p, 0, count * size);
157 | 	return p;
158 | }
159 | 
/* Resize the block `ap` of pool `_km` to `n_bytes`. With a NULL pool this is
 * plain realloc(); a zero size frees the block and returns 0; a NULL `ap`
 * behaves like kmalloc(). A block that is already large enough is returned
 * unchanged, otherwise a new block is allocated, the old contents are copied
 * and the old block is freed. */
void *krealloc(void *_km, void *ap, size_t n_bytes) // TODO: this can be made more efficient in principle
{
	kmem_t *km = (kmem_t*)_km;
	size_t n_units, *p, *q;

	if (n_bytes == 0) {
		kfree(km, ap); return 0;
	}
	if (km == NULL) return realloc(ap, n_bytes);
	if (ap == NULL) return kmalloc(km, n_bytes);
	n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t);
	p = (size_t*)ap - 1; /* the size word (in units) stored before the user pointer */
	if (*p >= n_units) return ap; /* TODO: this prevents shrinking */
	q = (size_t*)kmalloc(km, n_bytes);
	memcpy(q, ap, (*p - 1) * sizeof(header_t)); /* copy one unit less than the old block size */
	kfree(km, ap);
	return q;
}
178 | 
179 | void km_stat(const void *_km, km_stat_t *s)
180 | {
181 | 	kmem_t *km = (kmem_t*)_km;
182 | 	header_t *p;
183 | 	memset(s, 0, sizeof(km_stat_t));
184 | 	if (km == NULL || km->loop_head == NULL) return;
185 | 	for (p = km->loop_head;; p = p->ptr) {
186 | 		s->available += p->size * sizeof(header_t);
187 | 		if (p->size != 0) ++s->n_blocks; /* &kmem_t::base is always one of the cores. It is zero-sized. */
188 | 		if (p->ptr > p && p + p->size > p->ptr)
189 | 			panic("[km_stat] The end of a free block enters another free block.");
190 | 		if (p->ptr == km->loop_head) break;
191 | 	}
192 | 	for (p = km->core_head; p != NULL; p = p->ptr) {
193 | 		size_t size = p->size * sizeof(header_t);
194 | 		++s->n_cores;
195 | 		s->capacity += size;
196 | 		s->largest = s->largest > size? s->largest : size;
197 | 	}
198 | }
199 | 


--------------------------------------------------------------------------------
/falcon/kalloc.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include "kalloc.h"
  5 | 
  6 | /* In kalloc, a *core* is a large chunk of contiguous memory. Each core is
  7 |  * associated with a master header, which keeps the size of the current core
  8 |  * and the pointer to next core. Kalloc allocates small *blocks* of memory from
  9 |  * the cores and organizes free memory blocks in a circular single-linked list.
 10 |  *
 11 |  * In the following diagram, "@" stands for the header of a free block (of type
 12 |  * header_t), "#" for the header of an allocated block (of type size_t), "-"
 13 |  * for free memory, and "+" for allocated memory.
 14 |  *
 15 |  * master        This region is core 1.          master           This region is core 2.
 16 |  *      |                                             |
 17 |  *      *@-------#++++++#++++++++++++@--------        *@----------#++++++++++++#+++++++@------------
 18 |  *       |                           |                 |                               |
 19 |  *       p=p->ptr->ptr->ptr->ptr     p->ptr            p->ptr->ptr                     p->ptr->ptr->ptr
 20 |  */
 21 | 
 22 | #define MIN_CORE_SIZE 0x80000
 23 | 
/* Header of a free block and, as the first unit of a core, the core's master
 * header. All sizes are counted in units of sizeof(header_t). */
typedef struct header_t {
	size_t size; /* size of the block (or of the whole core) in header_t units */
	struct header_t *ptr; /* next free block in the circular list, or next core in the core list */
} header_t;
 28 | 
/* Allocator state: `loop_head` is the current entry point into the circular
 * list of free blocks, `core_head` is the head of the singly-linked list of
 * cores obtained from malloc(). */
typedef struct {
	header_t base, *loop_head, *core_head; /* base is a zero-sized block always kept in the loop */
} kmem_t;
 32 | 
 33 | static void panic(const char *s)
 34 | {
 35 | 	fprintf(stderr, "%s\n", s);
 36 | 	abort();
 37 | }
 38 | 
 39 | void *km_init(void)
 40 | {
 41 | 	return calloc(1, sizeof(kmem_t));
 42 | }
 43 | 
 44 | void km_destroy(void *_km)
 45 | {
 46 | 	kmem_t *km = (kmem_t*)_km;
 47 | 	header_t *p, *q;
 48 | 	if (km == NULL) return;
 49 | 	for (p = km->core_head; p != NULL;) {
 50 | 		q = p->ptr;
 51 | 		free(p);
 52 | 		p = q;
 53 | 	}
 54 | 	free(km);
 55 | }
 56 | 
 57 | static header_t *morecore(kmem_t *km, size_t nu)
 58 | {
 59 | 	header_t *q;
 60 | 	size_t bytes, *p;
 61 | 	nu = (nu + 1 + (MIN_CORE_SIZE - 1)) / MIN_CORE_SIZE * MIN_CORE_SIZE; /* the first +1 for core header */
 62 | 	bytes = nu * sizeof(header_t);
 63 | 	q = (header_t*)malloc(bytes);
 64 | 	if (!q) panic("[morecore] insufficient memory");
 65 | 	q->ptr = km->core_head, q->size = nu, km->core_head = q;
 66 | 	p = (size_t*)(q + 1);
 67 | 	*p = nu - 1; /* the size of the free block; -1 because the first unit is used for the core header */
 68 | 	kfree(km, p + 1); /* initialize the new "core"; NB: the core header is not looped. */
 69 | 	return km->loop_head;
 70 | }
 71 | 
/* Return the block `ap` to the allocator `_km`.
 *
 * A NULL `ap` is a no-op; a NULL allocator forwards to free(). Otherwise the
 * block is linked back into the circular free list at its address-ordered
 * position and merged with the following and/or preceding free block when
 * they are directly adjacent. Panics when the block overlaps a neighbouring
 * free block. On return km->loop_head is the freed block or the block it was
 * merged into.
 */
void kfree(void *_km, void *ap) /* kfree() also adds a new core to the circular list */
{
	header_t *p, *q;
	kmem_t *km = (kmem_t*)_km;
	
	if (!ap) return;
	if (km == NULL) {
		free(ap);
		return;
	}
	p = (header_t*)((size_t*)ap - 1); /* step back over the size_t header in front of the payload */
	p->size = *((size_t*)ap - 1);
	/* Find the pointer that points to the block to be freed. The following loop can stop on two conditions:
	 *
	 * a) "p>q && p<q->ptr": @------#++++++++#+++++++@-------    @---------------#+++++++@-------
	 *    (can also be in    |      |                |        -> |                       |
	 *     two cores)        q      p           q->ptr           q                  q->ptr
	 *
	 *                       @--------    #+++++++++@--------    @--------    @------------------
	 *                       |            |         |         -> |            |
	 *                       q            p    q->ptr            q       q->ptr
	 *
	 * b) "q>=q->ptr && (p>q || p<q->ptr)":  @-------#+++++   @--------#+++++++     @-------#+++++   @----------------
	 *                                       |                |        |         -> |                |
	 *                                  q->ptr                q        p       q->ptr                q
	 *
	 *                                       #+++++++@-----   #++++++++@-------     @-------------   #++++++++@-------
	 *                                       |       |                 |         -> |                         |
	 *                                       p  q->ptr                 q       q->ptr                         q
	 */
	for (q = km->loop_head; !(p > q && p < q->ptr); q = q->ptr)
		if (q >= q->ptr && (p > q || p < q->ptr)) break;
	if (p + p->size == q->ptr) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */
		p->size += q->ptr->size;
		p->ptr = q->ptr->ptr;
	} else if (p + p->size > q->ptr && q->ptr >= p) {
		panic("[kfree] The end of the allocated block enters a free block.");
	} else p->ptr = q->ptr; /* backup q->ptr */

	if (q + q->size == p) { /* two adjacent blocks, merge q and p (the other two cases) */
		q->size += p->size;
		q->ptr = p->ptr;
		km->loop_head = q;
	} else if (q + q->size > p && p >= q) {
		panic("[kfree] The end of a free block enters the allocated block.");
	} else km->loop_head = p, q->ptr = p; /* in two cores, cannot be merged; create a new block in the list */
}
119 | 
/* Allocate a block of at least n_bytes from the allocator `_km`.
 *
 * Returns 0 for a zero-byte request and forwards to malloc() when the
 * allocator is NULL. Otherwise the circular free list is scanned, starting
 * after km->loop_head, for the first block that is large enough; when the
 * scan wraps around without success a new core is requested from morecore().
 * The returned pointer is the address right after the block's size_t header.
 */
void *kmalloc(void *_km, size_t n_bytes)
{
	kmem_t *km = (kmem_t*)_km;
	size_t n_units;
	header_t *p, *q;

	if (n_bytes == 0) return 0;
	if (km == NULL) return malloc(n_bytes);
	/* payload plus the size_t header, rounded up to whole header_t units, plus one extra unit */
	n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t) + 1;

	if (!(q = km->loop_head)) /* the first time kmalloc() is called, initialize the loop with the zero-sized base block */
		q = km->loop_head = km->base.ptr = &km->base;
	for (p = q->ptr;; q = p, p = p->ptr) { /* search for a suitable block */
		if (p->size >= n_units) { /* p->size is the size of the current block. This line means the current block is large enough. */
			if (p->size == n_units) q->ptr = p->ptr; /* no need to split the block */
			else { /* split the block. NB: memory is allocated at the end of the block! */
				p->size -= n_units; /* reduce the size of the free block */
				p += p->size; /* p points to the allocated block */
				*(size_t*)p = n_units; /* set the size */
			}
			km->loop_head = q; /* set the end of chain */
			return (size_t*)p + 1;
		}
		if (p == km->loop_head) { /* then ask for more "cores" */
			if ((p = morecore(km, n_units)) == 0) return 0;
		}
	}
}
148 | 
149 | void *kcalloc(void *_km, size_t count, size_t size)
150 | {
151 | 	kmem_t *km = (kmem_t*)_km;
152 | 	void *p;
153 | 	if (size == 0 || count == 0) return 0;
154 | 	if (km == NULL) return calloc(count, size);
155 | 	p = kmalloc(km, count * size);
156 | 	memset(p, 0, count * size);
157 | 	return p;
158 | }
159 | 
160 | void *krealloc(void *_km, void *ap, size_t n_bytes) // TODO: this can be made more efficient in principle
161 | {
162 | 	kmem_t *km = (kmem_t*)_km;
163 | 	size_t n_units, *p, *q;
164 | 
165 | 	if (n_bytes == 0) {
166 | 		kfree(km, ap); return 0;
167 | 	}
168 | 	if (km == NULL) return realloc(ap, n_bytes);
169 | 	if (ap == NULL) return kmalloc(km, n_bytes);
170 | 	n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t);
171 | 	p = (size_t*)ap - 1;
172 | 	if (*p >= n_units) return ap; /* TODO: this prevents shrinking */
173 | 	q = (size_t*)kmalloc(km, n_bytes);
174 | 	memcpy(q, ap, (*p - 1) * sizeof(header_t));
175 | 	kfree(km, ap);
176 | 	return q;
177 | }
178 | 
179 | void km_stat(const void *_km, km_stat_t *s)
180 | {
181 | 	kmem_t *km = (kmem_t*)_km;
182 | 	header_t *p;
183 | 	memset(s, 0, sizeof(km_stat_t));
184 | 	if (km == NULL || km->loop_head == NULL) return;
185 | 	for (p = km->loop_head;; p = p->ptr) {
186 | 		s->available += p->size * sizeof(header_t);
187 | 		if (p->size != 0) ++s->n_blocks; /* &kmem_t::base is always one of the cores. It is zero-sized. */
188 | 		if (p->ptr > p && p + p->size > p->ptr)
189 | 			panic("[km_stat] The end of a free block enters another free block.");
190 | 		if (p->ptr == km->loop_head) break;
191 | 	}
192 | 	for (p = km->core_head; p != NULL; p = p->ptr) {
193 | 		size_t size = p->size * sizeof(header_t);
194 | 		++s->n_cores;
195 | 		s->capacity += size;
196 | 		s->largest = s->largest > size? s->largest : size;
197 | 	}
198 | }
199 | 


--------------------------------------------------------------------------------
/misc/logo.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 2 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
 3 | <svg width="100%" height="100%" viewBox="0 0 247 246" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" xmlns:serif="http://www.serif.com/" style="fill-rule:evenodd;clip-rule:evenodd;">
 4 |     <g transform="matrix(1,0,0,1,11.0332,10.8976)">
 5 |         <g transform="matrix(1,0,0,1,-873.121,-487.926)">
 6 |             <g transform="matrix(1,0,0,1,485.08,304.02)">
 7 |                 <g>
 8 |                     <path d="M579.228,185.5C596.8,185.5 611,200.102 611,218.059L611,373.941C611,391.898 596.8,406.5 579.228,406.5L421.272,406.5C403.7,406.5 389.5,391.898 389.5,373.941L389.5,218.059C389.5,200.102 403.7,185.5 421.272,185.5L579.228,185.5Z" style="fill-opacity:0.643137;fill-rule:nonzero;"/>
 9 |                     <path d="M421.272,189.5L579.228,189.5C594.566,189.5 607,202.286 607,218.059L607,373.941C607,389.714 594.566,402.5 579.228,402.5L421.272,402.5C405.934,402.5 393.5,389.714 393.5,373.941L393.5,218.059C393.5,202.286 405.934,189.5 421.272,189.5Z" style="fill:url(#_Linear1);fill-rule:nonzero;"/>
10 |                     <path d="M421.272,189.5L579.228,189.5C594.566,189.5 607,202.286 607,218.059L607,373.941C607,389.714 594.566,402.5 579.228,402.5L421.272,402.5C405.934,402.5 393.5,389.714 393.5,373.941L393.5,218.059C393.5,202.286 405.934,189.5 421.272,189.5Z" style="fill-opacity:0;fill-rule:nonzero;stroke:rgb(41,82,139);stroke-width:8px;"/>
11 |                 </g>
12 |                 <g transform="matrix(-0.002,-1,1,-0.002,445.783,382.839)">
13 |                     <g>
14 |                         <text x="0px" y="0px" style="font-family:'Optima-BoldItalic', 'Optima';font-weight:700;font-style:italic;font-size:42px;fill:rgb(67,89,165);">Peregrine</text>
15 |                     </g>
16 |                 </g>
17 |                 <g>
18 |                     <path d="M498.431,224.439C498.928,232.061 500.585,231.398 498.762,234.878C496.94,238.357 496.774,240.843 491.637,252.773C486.501,264.703 486.335,265.862 480.536,275.141C474.737,284.42 469.931,289.888 476.062,284.752C482.193,279.615 479.707,282.598 485.838,276.301C491.969,270.005 489.318,273.319 496.277,265.531C503.236,257.743 500.585,260.395 507.378,253.767C514.172,247.139 516.657,248.133 518.646,243.825C520.634,239.517 521.297,241.505 519.64,237.86C517.983,234.215 518.314,231.564 512.349,228.913C506.384,226.261 498.431,224.439 498.431,224.439Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
19 |                     <path d="M499.094,255.369C498.276,257.496 496.967,264.695 496.803,269.276C496.64,273.857 495.331,275.493 495.658,280.401C495.985,285.309 495.331,283.509 495.658,290.872C495.985,298.234 493.531,295.616 497.131,302.978C500.73,310.341 500.566,312.14 503.348,311.65C506.129,311.159 505.643,310.116 507.481,307.108C510.906,301.501 510.219,307.723 508.91,300.197C507.601,292.671 508.316,294.079 507.274,286.728C506.364,280.31 506.231,283.658 506.364,276.867C506.507,269.579 506.253,267.321 506.829,262.258C507.318,257.967 508.281,257.537 507.378,253.767C506.661,250.767 499.094,255.369 499.094,255.369Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
20 |                     <path d="M530.179,218.722C526.742,221.947 526.991,218.727 527.68,224.439C528.037,227.4 528.927,229.819 529.581,232.764C530.236,235.709 530.992,236.009 530.79,238.17C530.599,240.205 531.108,240.078 528.706,242.772C524.677,247.292 516.763,253.079 524.943,248.334C533.124,243.59 533.402,244.638 535.087,241.626C536.362,239.348 536.293,238.984 538.828,238.514C542.964,237.747 543.343,237.924 546.191,235.736C548.746,233.773 549.992,233.223 549.873,229.276C549.81,227.19 550.281,225.75 547.928,223.753C543.833,220.28 544.654,220.632 539.037,219.343C533.985,218.183 530.179,218.722 530.179,218.722Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
21 |                     <path d="M508.91,253.687C513.979,255.996 512.208,258.715 517.498,256.994C521.942,255.549 521.434,254.514 526.274,252.897C529.648,251.769 527.398,252.768 535.414,250.142C543.431,247.516 544.713,247.025 549.634,246.698C554.556,246.371 553.619,246.392 557.055,246.883C560.491,247.373 565.759,248.456 564.941,244.529C564.123,240.603 564.621,239.683 560.489,239.315C553.317,238.674 554.758,238.997 549.634,240.014C542.093,241.512 547.769,240.703 539.121,242.685C531.215,244.497 536.802,244.15 528.691,245.801C521.091,247.349 518.958,248.981 514.769,250.142C510.828,251.234 508.91,253.687 508.91,253.687Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
22 |                     <path d="M521.114,248.189C522.825,250.142 522.942,250.142 524.453,252.897C527.007,257.553 526.945,254.904 527.451,261.485C527.958,268.065 527.424,263.942 527.804,268.877C528.183,273.812 528.579,272.681 528.563,277.735C528.534,286.759 525.81,284.193 526.791,288.238C527.413,290.798 527.474,293.256 530.133,293.335C535.067,293.482 535.795,291.152 538.92,290.293C542.03,289.437 541.564,289.554 542.356,286.728C543.902,281.215 543.495,283.429 543.622,277.355C543.748,271.281 543.242,275.078 543.116,269.003C542.989,262.929 543.495,266.128 543.116,259.512C542.736,252.897 545.037,255.036 542.938,251.312C539.827,245.794 543.347,246.259 538.081,244.629C532.85,243.01 533.472,243.847 531.231,243.965C525.432,244.268 521.114,248.189 521.114,248.189Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
23 |                     <path d="M512.664,267.17C514.072,269.636 514.356,269.652 516.413,271.098C518.906,272.851 519.724,272.197 521.434,271.914C523.136,271.632 521.223,272.04 526.285,269.51C531.347,266.979 528.213,268.24 534.16,266.089C540.108,263.938 537.996,264.398 543.833,263.481C549.209,262.637 546.515,262.812 549.634,263.1C552.452,263.359 556.434,263.147 556.419,260.492C556.405,257.886 556.311,257.134 555.847,256.804C552.581,254.48 550.723,254.617 547.388,255.532C543.933,256.479 546.086,256.482 540.138,258.203C534.191,259.923 539.405,258.595 532.825,260.619C526.244,262.644 531.267,260.947 525.066,262.845C518.865,264.743 512.664,267.17 512.664,267.17Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
24 |                     <path d="M514.769,282.67C516.035,284.821 515.782,284.687 517.553,285.707C519.325,286.728 517.847,286.728 521.434,286.728C525.02,286.728 522.236,286.973 526.285,285.328C530.335,283.683 529.702,283.556 533.372,282.544C537.041,281.531 535.97,281.406 540.205,280.392C542.998,279.724 542.51,279.959 544.469,279.886C546.381,279.815 549.637,280.019 551.721,280.139C553.92,280.266 554.264,280.14 556.029,279C558.538,277.381 559.281,276.669 559.054,275.861C558.673,274.499 558.364,274.069 558.301,273.939C557.036,271.31 558.662,272.351 556.029,270.685C555.081,270.084 553.977,270.372 549.634,270.648C541.881,271.141 547.038,270.031 540.332,272.554C533.625,275.078 535.902,276.469 529.828,277.608C523.754,278.747 523.858,279.633 520.579,280.266C517.3,280.899 514.769,282.67 514.769,282.67Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
25 |                     <path d="M508.281,298.723C509.91,301.983 508.844,299.983 511.016,302.856C512.82,305.242 510.932,306.125 518.524,303.088C526.117,300.051 525.189,300.717 528.361,300.062C533.716,298.956 534.026,298.543 539.012,298.141C548.221,297.399 544.88,297.279 549.634,297.349C555.891,297.443 554.772,298.849 559.093,299.072C562.468,299.247 563.194,299.754 564.86,296.464C566.526,293.173 566.232,294.535 564.86,291.956C563.489,289.377 565.382,289.496 560.705,288.112C556.029,286.728 555.684,286.974 551.41,287.781C545.24,288.945 548.101,288.532 540.002,290.225C531.903,291.919 531.002,291.725 527.371,293.31C522.834,295.291 525.987,294.031 519.165,295.929C512.343,297.828 508.281,298.723 508.281,298.723Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
26 |                     <path d="M460.899,328.125L469.755,334.29L479.956,339.111L494.193,332.609L503.049,328.349L514.769,324.426L521.114,322.632L532.083,319.942L543.833,317.363L555.513,316.915L566.002,317.475L576.139,320.39L583.202,322.296L588.807,319.942L592.369,314.673L589.703,308.619L583.426,305.027L574.121,305.027L563.808,305.929L549.88,308.731L536.343,312.52L519.08,314.561L504.731,317.139L492.175,319.493L479.844,322.071L460.899,328.125Z" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
27 |                     <path d="M519.08,301.669L520.131,306.938L520.131,312.296L520.931,321.511L520.131,331.712L520.931,342.362L522.219,350.321C522.219,350.321 523.003,355.029 523.115,356.374C523.227,357.72 526.366,366.688 526.366,366.688L529.617,370.163L533.653,367.248L536.904,361.195L538.809,354.357L540.379,346.285L541.164,336.42L541.836,327.677L541.052,319.942L540.491,309.292L537.688,304.803L528.945,300.884" style="fill:rgb(67,89,165);fill-rule:nonzero;"/>
28 |                 </g>
29 |             </g>
30 |         </g>
31 |     </g>
32 |     <defs>
33 |         <linearGradient id="_Linear1" x1="0" y1="0" x2="1" y2="0" gradientUnits="userSpaceOnUse" gradientTransform="matrix(1.30425e-14,213,-213,1.30425e-14,500.25,189.5)"><stop offset="0" style="stop-color:white;stop-opacity:1"/><stop offset="1" style="stop-color:rgb(212,212,212);stop-opacity:1"/></linearGradient>
34 |     </defs>
35 | </svg>
36 | 


--------------------------------------------------------------------------------
/src/shmr_index.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <errno.h>
  3 | #include <fcntl.h>
  4 | #include <stdint.h>
  5 | #include <stdio.h>
  6 | #include <sys/mman.h>
  7 | #include <sys/stat.h>
  8 | #include <unistd.h>
  9 | #include <zlib.h>
 10 | #include "khash.h"
 11 | #include "kseq.h"
 12 | #include "kvec.h"
 13 | #include "shimmer.h"
 14 | 
/* Print `msg` together with the description of the current errno via perror()
 * and terminate the process with EXIT_FAILURE. Wrapped in do { } while (0) so
 * that it can be used like a single statement. */
#define handle_error(msg) \
  do {                    \
    perror(msg);          \
    exit(EXIT_FAILURE);   \
  } while (0)
 20 | 
 21 | #define REDUCTION_FACTOR 6
 22 | #define DEFAULT_WINDOW_SIZE 80
 23 | #define DEFAULT_KMER_SIZE 16
 24 | 
 25 | extern char *optarg;
 26 | extern int optind, opterr, optopt;
 27 | 
 28 | void write_mc_count_mm128(char *fn, mm128_v *shimmer) {
 29 |   mm_count_v mmc = {0, 0, 0};
 30 |   khash_t(MMC) *mcmap = kh_init(MMC);
 31 |   mm_count(shimmer, mcmap, &mmc);
 32 |   write_mm_count(fn, &mmc);
 33 |   kv_destroy(mmc);
 34 |   kh_destroy(MMC, mcmap);
 35 | }
 36 | 
 37 | int main(int argc, char *argv[]) {
 38 |   char *seqdb_prefix = NULL;
 39 |   char *shimmer_prefix = NULL;
 40 | 
 41 |   char seq_idx_file_path[8192];
 42 |   char seqdb_file_path[8291];
 43 |   char shimmer_output_path[8192];
 44 | 
 45 |   FILE *seq_index_file;
 46 |   uint32_t rid;
 47 | 
 48 |   int c, written;
 49 |   int total_chunk = 1;
 50 |   int mychunk = 1;
 51 |   int reduction_factor = REDUCTION_FACTOR;
 52 |   int number_layers = 2;
 53 |   int output_L0 = 1;
 54 |   int window_size = DEFAULT_WINDOW_SIZE;
 55 |   int kmer_size = DEFAULT_KMER_SIZE;
 56 |   mm128_v shimmerL0 = {0, 0, 0};
 57 |   mm128_v shimmerL1 = {0, 0, 0};
 58 |   mm128_v shimmerL2 = {0, 0, 0};
 59 |   seq_data_v seq_data = {0, 0, 0};
 60 | 
 61 |   khash_t(RLEN) *rlmap = kh_init(RLEN);
 62 |   opterr = 0;
 63 | 
 64 |   while ((c = getopt(argc, argv, "p:o:t:c:l:r:m:w:k:")) != -1) {
 65 |     switch (c) {
 66 |       case 'p':
 67 |         seqdb_prefix = optarg;
 68 |         break;
 69 |       case 'o':
 70 |         shimmer_prefix = optarg;
 71 |         break;
 72 |       case 't':
 73 |         total_chunk = atoi(optarg);
 74 |         break;
 75 |       case 'c':
 76 |         mychunk = atoi(optarg);
 77 |         break;
 78 |       case 'r':
 79 |         reduction_factor = atoi(optarg);
 80 |         break;
 81 |       case 'l':
 82 |         number_layers = atoi(optarg);
 83 |         break;
 84 |       case 'm':
 85 |         output_L0 = atoi(optarg);
 86 |         break;
 87 |       case 'w':
 88 |         window_size = atoi(optarg);
 89 |         break;
 90 |       case 'k':
 91 |         kmer_size = atoi(optarg);
 92 |         break;
 93 |       case '?':
 94 |         if (optopt == 'p') {
 95 |           fprintf(stderr,
 96 |                   "Option -%c not specified, using 'seq_dataset' as the "
 97 |                   "sequence db prefix\n",
 98 |                   optopt);
 99 |         } else if (optopt == 'o') {
100 |           fprintf(stderr,
101 |                   "Option -%c not specified, using 'shimmer' as the output "
102 |                   "prefix\n",
103 |                   optopt);
104 |         }
105 |         return 1;
106 |       default:
107 |         abort();
108 |     }
109 |   }
110 | 
111 |   assert(total_chunk > 0);
112 |   assert(mychunk > 0 && mychunk <= total_chunk);
113 |   assert(reduction_factor < 256);
114 |   assert(window_size >= 24 && kmer_size >= 12 && window_size > kmer_size);
115 | 
116 |   fprintf(stderr, "reduction factor= %d\n", reduction_factor);
117 | 
118 |   if (seqdb_prefix == NULL) {
119 |     seqdb_prefix = (char *)calloc(8192, 1);
120 |     snprintf(seqdb_prefix, 8191, "seq_dataset");
121 |   }
122 | 
123 |   if (shimmer_prefix == NULL) {
124 |     shimmer_prefix = (char *)calloc(8192, 1);
125 |     snprintf(shimmer_prefix, 8191, "shimmer");
126 |   }
127 | 
128 |   written = snprintf(seq_idx_file_path, sizeof(seq_idx_file_path), "%s.idx",
129 |                      seqdb_prefix);
130 |   assert(written < sizeof(seq_idx_file_path));
131 |   fprintf(stderr, "using index file: %s\n", seq_idx_file_path);
132 | 
133 |   rlmap = get_read_length_map(seq_idx_file_path);
134 | 
135 |   written = snprintf(seqdb_file_path, sizeof(seqdb_file_path), "%s.seqdb",
136 |                      seqdb_prefix);
137 |   assert(written < sizeof(seqdb_file_path));
138 |   fprintf(stderr, "using seqdb file: %s\n", seqdb_file_path);
139 | 
140 |   int fd;
141 |   struct stat sb;
142 |   uint8_t *seq_p;
143 |   fd = open(seqdb_file_path, O_RDONLY);
144 |   if (fd == -1) handle_error("open");
145 | 
146 |   if (fstat(fd, &sb) == -1) /* To obtain file size */
147 |     handle_error("fstat");
148 | 
149 |   seq_p = (uint8_t *)mmap((void *)0, sb.st_size, PROT_READ, MAP_SHARED, fd, 0);
150 | 
151 |   seq_index_file = fopen(seq_idx_file_path, "r");
152 |   char name_buf[512];
153 |   uint32_t rlen;
154 |   size_t offset;
155 |   while (fscanf(seq_index_file, "%u %255s %u %lu", &rid, name_buf, &rlen,
156 |                 &offset) != EOF) {
157 |     if ((rid % total_chunk) != (mychunk % total_chunk)) continue;
158 |     char *seq = malloc(rlen + 1);
159 |     decode_biseq(seq_p + offset, seq, rlen, 0);
160 |     seq[rlen] = '\0';
161 |     mm_sketch(NULL, seq, rlen, window_size, kmer_size, rid, 0, &shimmerL0);
162 |     free(seq);
163 |   }
164 | 
165 |   if (output_L0 == 1) {
166 |     written = snprintf(shimmer_output_path, sizeof shimmer_output_path,
167 |                        "%s-L0-%02d-of-%02d.dat", shimmer_prefix, mychunk,
168 |                        total_chunk);
169 |     assert(written < sizeof(shimmer_output_path));
170 |     fprintf(stderr, "output data file: %s\n", shimmer_output_path);
171 |     write_mmlist(shimmer_output_path, &shimmerL0);
172 | 
173 |     /* temporary disable this as it is not used for now
174 | mm128_v shimmerE5 = {0,0,0};
175 | mm128_v shimmerE3 = {0,0,0};
176 | 
177 | mm_end_filter(&shimmerL0, &shimmerE5, &shimmerE3, rlmap, 250);
178 | written = snprintf(shimmer_output_path, sizeof shimmer_output_path,
179 | "%s-E5-%02d-of-%02d.dat", shimmer_prefix, mychunk, total_chunk); assert(written
180 | < sizeof(shimmer_output_path)); printf("output data file: %s\n",
181 | shimmer_output_path); write_mmlist(shimmer_output_path, &shimmerE5);
182 | kv_destroy(shimmerE5);
183 | 
184 | written = snprintf(shimmer_output_path, sizeof shimmer_output_path,
185 | "%s-E3-%02d-of-%02d.dat", shimmer_prefix, mychunk, total_chunk); assert(written
186 | < sizeof(shimmer_output_path)); printf("output data file: %s\n",
187 | shimmer_output_path); write_mmlist(shimmer_output_path, &shimmerE3);
188 | kv_destroy(shimmerE3);
189 | */
190 | 
191 |     written = snprintf(shimmer_output_path, sizeof shimmer_output_path,
192 |                        "%s-L0-MC-%02d-of-%02d.dat", shimmer_prefix, mychunk,
193 |                        total_chunk);
194 |     assert(written < sizeof(shimmer_output_path));
195 |     printf("output data file: %s\n", shimmer_output_path);
196 |     write_mc_count_mm128(shimmer_output_path, &shimmerL0);
197 |   }
198 | 
199 |   mm_reduce(&shimmerL0, &shimmerL1, reduction_factor);
200 |   kv_destroy(shimmerL0);
201 |   if (number_layers == 1) {
202 |     written = snprintf(shimmer_output_path, sizeof shimmer_output_path,
203 |                        "%s-L1-%02d-of-%02d.dat", shimmer_prefix, mychunk,
204 |                        total_chunk);
205 |     assert(written < sizeof(shimmer_output_path));
206 |     printf("output data file: %s\n", shimmer_output_path);
207 |     write_mmlist(shimmer_output_path, &shimmerL1);
208 | 
209 |     written = snprintf(shimmer_output_path, sizeof shimmer_output_path,
210 |                        "%s-L1-MC-%02d-of-%02d.dat", shimmer_prefix, mychunk,
211 |                        total_chunk);
212 |     assert(written < sizeof(shimmer_output_path));
213 |     printf("output data file: %s\n", shimmer_output_path);
214 |     write_mc_count_mm128(shimmer_output_path, &shimmerL1);
215 |   } else if (number_layers > 1) {
216 |     mm_reduce(&shimmerL1, &shimmerL2, reduction_factor);
217 |     kv_destroy(shimmerL1);
218 |     written = snprintf(shimmer_output_path, sizeof shimmer_output_path,
219 |                        "%s-L2-%02d-of-%02d.dat", shimmer_prefix, mychunk,
220 |                        total_chunk);
221 |     assert(written < sizeof(shimmer_output_path));
222 |     fprintf(stderr, "output data file: %s\n", shimmer_output_path);
223 |     write_mmlist(shimmer_output_path, &shimmerL2);
224 | 
225 |     written = snprintf(shimmer_output_path, sizeof shimmer_output_path,
226 |                        "%s-L2-MC-%02d-of-%02d.dat", shimmer_prefix, mychunk,
227 |                        total_chunk);
228 |     assert(written < sizeof(shimmer_output_path));
229 |     printf("output data file: %s\n", shimmer_output_path);
230 |     write_mc_count_mm128(shimmer_output_path, &shimmerL2);
231 | 
232 |     kv_destroy(shimmerL2);
233 |   }
234 | 
235 |   kh_destroy(RLEN, rlmap);
236 |   for (size_t _i = 0; _i < seq_data.n; _i++) {
237 |     kfree(NULL, seq_data.a[_i].name);
238 |   }
239 |   kv_destroy(seq_data);
240 | 
241 |   munmap(seq_p, sb.st_size);
242 |   if (!seqdb_prefix) free(seqdb_prefix);
243 |   if (!shimmer_prefix) free(shimmer_prefix);
244 |   return 0;
245 | }
246 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # This Repo Is Archived (on Feb 6, 2022)
  2 | 
  3 | If you are interested in an updated Peregrine Genome Assembler, please check [https://github.com/cschin/peregrine-2021](https://github.com/cschin/peregrine-2021)
  4 | 
  5 | 
  6 | <img src="misc/logo.png" alt="PeregrineLogo" width="120"/>
  7 | 
  8 | [![Actions Status](https://github.com/cschin/Peregrine/workflows/build-and-test-docker-image-master-branch/badge.svg)](https://github.com/cschin/peregrine/actions)
  9 | 
 10 | [![Actions Status](https://github.com/cschin/Peregrine/workflows/build-and-test-docker-image-tagged-release/badge.svg)](https://github.com/cschin/peregrine/actions)
 11 | 
 12 | # Peregrine & SHIMMER Genome Assembly Toolkit
 13 | 
Peregrine is a fast genome assembler for accurate long reads (length > 10kb,
accuracy > 99%). It can assemble a human genome from 30x reads within 20 CPU
hours from reads to polished consensus. It uses Sparse HIerarchical MiniMizER
(SHIMMER) for fast read-to-read overlapping without the quadratic comparisons
used in other OLC assemblers.
 19 | 
This code base includes code that uses SHIMMER (Sparse HIerarchical MiniMizER)
for genome assembly and other related applications.
 22 | 
Currently, the assembly graph process is more or less identical to the
approaches used in the FALCON assembler developed by Jason Chin and others at
Pacific Biosciences, Inc. There are a number of other possible ways to generate
contigs without a string graph, but it will need some research work to make
that happen. The FALCON graph module is also not very efficient, as the python
scripts run in single-thread mode.
 29 | 
 30 | 
 31 | ## Install
 32 | 
We *do not* recommend that you install the software from the source code unless
you are comfortable handling the required dependencies for your system
independently. Unless you have full control (e.g. root access) of the computer
system you use to build Peregrine and you can install the proper GCC
compiler/python/pypy/conda versions, you should learn to use the [Docker
images](https://hub.docker.com/r/cschin/peregrine/tags) that
we provide; it will make your life and our lives easier.
 40 | 
As independent developers with limited resources, we cannot provide free support for
solving dependency problems on your specific system. Instead, we provide
Docker images so you can run the executables and their dependencies using Docker.
 44 | 
If you want to build for your system without using Docker, please see the
`docker/Dockerfile` and `docker/install_with_conda.sh` as examples to
install from scratch within a clean Conda environment.
 48 | 
 49 | ## Run the assembler
 50 | 
Peregrine is designed to run on a single compute node. It does not need a grid
computing job scheduling system. It uses Pypeflow to coordinate multiple
concurrent processes.
 54 | 
After revision 0.1.5.3, you can test a small assembly using simulated E. coli
reads with Docker:
 57 | 
 58 | ```
 59 | # please substitute $PWD and $IMAGETAG with proper values
 60 | docker run -it --rm -v $PWD:/wd cschin/peregrine:$IMAGETAG test
 61 | ```
 62 | 
The assembly results are in `$PWD/ecoli_test_results/`. The test case will
download an E. coli reference and generate simulated reads. After the assembly
is done, it also installs `nucmer` to run `dnadiff`, comparing the assembled
contigs with the original E. coli reference. You can check the output by using
the `cat $PWD/ecoli_test_results/out.report` command.
 68 | 
 69 | Here is the general usage for `pg_run.py` which starts the workflow for 
 70 | assembling a genome from input `fasta`, `fastq`, `fasta.gz` or 
 71 | `fastq.gz` files. 
 72 | 
 73 | ```
 74 | Usage:
 75 |   pg_run.py asm <reads.lst> <index_nchunk> <index_nproc>
 76 |                             <ovlp_nchunk> <ovlp_nproc>
 77 |                             <mapping_nchunk> <mapping_nproc>
 78 |                             <cns_nchunk> <cns_nproc>
 79 |                             <sort_nproc>
 80 |                             [--with-consensus]
 81 |                             [--with-L0-index]
 82 |                             [--output <output>]
 83 |                             [--shimmer-k <shimmer_k>]
 84 |                             [--shimmer-w <shimmer_w>]
 85 |                             [--shimmer-r <shimmer_r>]
 86 |                             [--shimmer-l <shimmer_l>]
 87 |                             [--best_n_ovlp <n_ovlp>]
 88 |                             [--mc_lower <mc_lower>]
 89 |                             [--mc_upper <mc_upper>]
 90 |                             [--aln_bw <aln_bw>]
 91 |                             [--ovlp_upper <ovlp_upper>]
 92 |   pg_run.py (-h | --help)
  pg_run.py --version
 94 | 
 95 | Options:
 96 |   -h --help                   Show this help
 97 |   --version                   Show version
 98 |   --with-consensus            Generate consensus after getting the draft contigs
 99 |   --with-L0-index             Keep level-0 index
100 |   --output <output>           Set output directory (will be created if not exist) [default: ./wd]
101 |   --shimmer-k <shimmer_k>     Level 0 k-mer size [default: 16]
102 |   --shimmer-w <shimmer_w>     Level 0 window size [default: 80]
  --shimmer-r <shimmer_r>     Reduction factor for high level SHIMMER [default: 6]
104 |   --shimmer-l <shimmer_l>     number of level of shimmer used, the value should be 1 or 2 [default: 2]
105 |   --best_n_ovlp <n_ovlp>      Find best n_ovlp overlap [default: 4]
  --mc_lower <mc_lower>       Do not consider SHIMMER with count less than mc_lower [default: 2]
  --mc_upper <mc_upper>       Do not consider SHIMMER with count greater than mc_upper [default: 240]
  --aln_bw <aln_bw>           Max off-diagonal gap allowed during overlap confirmation [default: 100]
  --ovlp_upper <ovlp_upper>   Ignore clusters with overlap count greater than ovlp_upper [default: 120]
110 | ```
111 | 
The first required option is `reads.lst`. `reads.lst` should be a
path to a file that contains the list of the paths of the input sequence files.

The remaining required options specify how to partition the data for the different
parts of the pipeline and the number of processors used for each step.
117 | 
118 | `<index_nchunk>`  and `<index_nproc>` control the number of "chunks" and the
119 | number of cpu used concurrently for the initial SHIMMER index generation.
120 | 
`<ovlp_nchunk>`  and `<ovlp_nproc>` control the number of "chunks" and the
number of cpu used concurrently for generating overlap information between
reads. This part typically uses the most memory, and the exact amount of RAM used
concurrently depends on the size of the input sequence data and the index file
size.
126 | 
You can use a larger number of `<ovlp_nchunk>` and a smaller number of
`<ovlp_nproc>` on a machine with less memory. For example, I was able to finish
this part using a machine with 32G RAM with `ovlp_nchunk=24` and
`ovlp_nproc=1`.
131 | 
If there is enough memory (for example, AWS m5d.metal and r5d.12xlarge both
have 384G RAM), you can run 24 to 48 cpu cores at once. However,
the overlap step needs random access to the sequence data through a shared
memory-mapped file, so it is best to reserve some RAM to cache the sequence
data in memory. In our tests, 48 cores did not provide a significant speedup
compared to using 24 cores. Also, if there is not enough memory, you may need
fast SSD or NVMe drives and should reduce the number of CPU cores concurrently
accessing the sequence data.
140 | 
141 | `<mapping_nchunk>` and `<mapping_nproc>` control the partitioning and the
142 | number of cores used for mapping the sequence reads to draft contigs for the
143 | following consensus step.
144 | 
145 | `<sort_nproc>` controls the number of cpu cores used for sorting the reads to
146 | contigs map.
147 | 
148 | `<cns_nchunk>` and  `<cns_nproc>` control the partitioning and the number of
149 | cores used for generating the consensus from draft contigs.
150 | 
151 | 
## Running Peregrine Using Docker

Here is an example of running Peregrine with Docker for a Peregrine build
of tag 0.1.5.0 using an AWS m5d.metal or r5d.12xlarge instance. (You will
need to configure the AWS instance to utilize the NVMe drives and a
docker environment.)
158 | 
159 | ```
160 | find /wd/chm13-fastq/ -name "*.fastq" | sort > chm13-seqdata.lst 
161 | 
docker run -it -v /wd:/wd --user $(id -u):$(id -g) cschin/peregrine:0.1.5.0 asm \
    /wd/chm13-seqdata.lst 24 24 24 24 24 24 24 24 24 \
    --with-consensus --shimmer-r 3 --best_n_ovlp 8 \
    --output /wd/chm13-asm-r3-pg0.1.5.0
166 | ```
167 | 
Note that the paths in the `<reads.lst>` should be the full paths to the
sequence files inside the docker container.
170 | 
171 | 
172 | ## LICENSE
173 | 
174 | ### Peregrine & SHIMMER Genome Assembly Toolkit
175 | 
176 | Peregrine Assembler and SHIMMER Genome Assembly Toolkit
177 | Copyright (c) 2019- by Jason, Chen-Shan, Chin
178 | 
179 | Peregrine Assembler and  SHIMMER Genome Assembly Toolkit 
180 | is licensed under a Creative Commons
181 | Attribution-NonCommercial-ShareAlike 4.0 International 
182 | License.
183 | 
184 | You should have received a copy of the license along with this
185 | work. If not, see <http://creativecommons.org/licenses/by-nc-sa/4.0/>.
186 | 
187 | 
188 | ### Minimap2
189 | 
190 | SHIMMER genome assembly toolkit uses C library developed by
191 | Heng Li for Minimap2.  See LICENSE.minimap2
192 | 
193 | 
194 | ### FALCON
195 | 
196 | See LICENSE.falcon for license for the code from FALCON 
197 | 


--------------------------------------------------------------------------------
/src/kseq.h:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
 26 | /* Last Modified: 05MAR2012 */
 27 | 
 28 | #ifndef AC_KSEQ_H
 29 | #define AC_KSEQ_H
 30 | 
 31 | #include <ctype.h>
 32 | #include <string.h>
 33 | #include <stdlib.h>
 34 | 
/* Delimiter selectors understood by ks_getuntil()/ks_getuntil2(). A delimiter
 * value greater than KS_SEP_MAX is matched as a literal byte instead. */
#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
#define KS_SEP_TAB   1 // isspace() && !' '
#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
#define KS_SEP_MAX   2
 39 | 
/* Declares kstream_t, a buffered input stream over a handle of type type_t.
 *   begin, end  index range of the unread bytes currently held in buf
 *   is_eof      set once a refill returned fewer than bufsize bytes
 *   bufsize     capacity of buf in bytes (30-bit bit-field)
 *   f           the underlying handle handed to the read function
 *   buf         the read buffer, allocated by ks_init() */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		int begin, end; \
		int is_eof:2, bufsize:30; \
		type_t f; \
		unsigned char *buf; \
	} kstream_t;
 47 | 
/* Non-zero when the stream is flagged EOF and its buffer is fully consumed. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* Clears the EOF flag and empties the buffer. Only the kstream_t fields are
 * reset; the underlying handle is not touched by this macro. */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
 50 | 
/* Emits the stream constructor and destructor with the given linkage (SCOPE).
 *   ks_init(f)      allocates a zeroed kstream_t plus a buffer of __bufsize
 *                   bytes and stores the handle f in it.
 *   ks_destroy(ks)  frees the buffer and the stream; a NULL ks is a no-op.
 *                   The handle f itself is not closed here.
 * NOTE(review): the calloc/malloc results are used without a NULL check. */
#define __KS_BASIC(SCOPE, type_t, __bufsize) \
	SCOPE kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		ks->f = f; ks->bufsize = __bufsize; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		return ks; \
	} \
	SCOPE void ks_destroy(kstream_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->buf); \
		free(ks); \
	}
 65 | 
 66 | #define __KS_INLINED(__read) \
 67 | 	static inline int ks_getc(kstream_t *ks) \
 68 | 	{ \
 69 | 		if (ks->is_eof && ks->begin >= ks->end) return -1; \
 70 | 		if (ks->begin >= ks->end) { \
 71 | 			ks->begin = 0; \
 72 | 			ks->end = __read(ks->f, ks->buf, ks->bufsize); \
 73 | 			if (ks->end < ks->bufsize) ks->is_eof = 1; \
 74 | 			if (ks->end == 0) return -1; \
 75 | 		} \
 76 | 		return (int)ks->buf[ks->begin++]; \
 77 | 	} \
 78 | 	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
 79 | 	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
 80 | 
/* Growable string used by the readers below. Guarded by KSTRING_T so that
 * another header providing the same type can be included alongside. */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	unsigned l, m; /* l: bytes in use (excluding the NUL); m: bytes allocated for s */
	char *s;       /* heap buffer, grown with realloc(); NULL until first use */
} kstring_t;
#endif
 88 | 
/* Rounds a 32-bit x up, in place, to the next power of two by smearing the
 * highest set bit into every lower bit. x must be a modifiable lvalue and is
 * evaluated several times. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
 92 | 
/* Emits ks_getuntil2(): copy bytes from the stream into str up to, but not
 * including, the next delimiter.
 *   delimiter  KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE, or (when greater than
 *              KS_SEP_MAX) a literal byte value to stop at
 *   str        destination; truncated first unless append is non-zero, grown
 *              with realloc() as needed and always NUL-terminated on return
 *   dret       if non-NULL, receives the delimiter byte that ended the token,
 *              or 0 when input ran out before a delimiter was seen
 *   append     non-zero to extend the current content of str
 * Returns str->l, or -1 if the stream was already exhausted on entry.
 * The scan may span several buffer refills; the delimiter itself is consumed.
 * For KS_SEP_LINE a trailing '\r' is dropped when the result is longer than
 * one byte.
 * NOTE(review): the realloc() result is not checked for NULL. */
#define __KS_GETUNTIL(SCOPE, __read) \
	SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{ \
		if (dret) *dret = 0; \
		str->l = append? str->l : 0; \
		if (ks->begin >= ks->end && ks->is_eof) return -1; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { \
				if (!ks->is_eof) { \
					ks->begin = 0; \
					ks->end = __read(ks->f, ks->buf, ks->bufsize); \
					if (ks->end < ks->bufsize) ks->is_eof = 1; \
					if (ks->end == 0) break; \
				} else break; \
			} \
			if (delimiter == KS_SEP_LINE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == '\n') break; \
			} else if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */ \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
				str->m = str->l + (i - ks->begin) + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; \
			if (i < ks->end) { \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (str->s == 0) { \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0'; \
		return str->l; \
	}
142 | 
/* Instantiates the complete stream API (type, ks_init/ks_destroy,
 * ks_getuntil2, ks_getc/ks_getuntil) for handle type type_t and read function
 * __read, with the given linkage SCOPE and buffer size __bufsize. */
#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(SCOPE, type_t, __bufsize) \
	__KS_GETUNTIL(SCOPE, __read) \
	__KS_INLINED(__read)

/* Same as KSTREAM_INIT2 with static linkage. */
#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)

/* Declares (without defining) the non-inline stream functions as extern and
 * emits the inline helpers; for translation units that use an instance
 * defined elsewhere. */
#define KSTREAM_DECLARE(type_t, __read) \
	__KS_TYPE(type_t) \
	extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
	extern kstream_t *ks_init(type_t f); \
	extern void ks_destroy(kstream_t *ks); \
	__KS_INLINED(__read)
157 | 
158 | /******************
159 |  * FASTA/Q parser *
160 |  ******************/
161 | 
/* Resets the parser state (last_char) and the embedded stream's buffer/EOF
 * state. Only fields are cleared; the underlying handle is not repositioned
 * by this macro. */
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
163 | 
/* Emits the record-reader constructor and destructor with linkage SCOPE.
 *   kseq_init(fd)     allocates a zeroed kseq_t and wraps fd in a new stream.
 *   kseq_destroy(ks)  frees the four string buffers, the stream and the
 *                     kseq_t itself; a NULL ks is a no-op. The handle fd is
 *                     not closed here.
 * NOTE(review): the calloc result is used without a NULL check. */
#define __KSEQ_BASIC(SCOPE, type_t) \
	SCOPE kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		s->f = ks_init(fd); \
		return s; \
	} \
	SCOPE void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}
178 | 
/* Emits kseq_read(): parse the next FASTA or FASTQ record into seq.
 *
 * A record starts at a '>' or '@' header character. The first
 * whitespace-delimited token after it becomes seq->name and the remainder of
 * the header line becomes seq->comment. Sequence lines are concatenated into
 * seq->seq until the next '>', '@' or '+' is met at the start of a line. A
 * '+' switches to FASTQ mode: the rest of that line is skipped and quality
 * lines are appended to seq->qual until it is at least as long as the
 * sequence. seq->last_char remembers a header character already consumed
 * while scanning, so that the next call can resume from it.
 *
 * Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
		seq->last_char = 0;	/* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}
225 | 
/* Declares kseq_t, the state of one FASTA/FASTQ reader.
 *   name, comment, seq, qual  fields of the most recently parsed record;
 *                             their buffers are reused between calls
 *   last_char                 header character ('>' or '@') already consumed
 *                             by the previous kseq_read(), or 0 if none
 *   f                         the buffered stream the records are read from */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name, comment, seq, qual; \
		int last_char; \
		kstream_t *f; \
	} kseq_t;
232 | 
/* Instantiates the whole FASTA/FASTQ reader (stream with a 16384-byte buffer,
 * kseq_t, kseq_init/kseq_destroy, kseq_read) for handle type type_t and read
 * function __read, with linkage SCOPE. */
#define KSEQ_INIT2(SCOPE, type_t, __read) \
	KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(SCOPE, type_t) \
	__KSEQ_READ(SCOPE)

/* Same as KSEQ_INIT2 with static linkage. */
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)

/* Declares the types and the reader entry points without defining them, for
 * translation units that link against an instance defined elsewhere. */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	void kseq_destroy(kseq_t *ks); \
	int kseq_read(kseq_t *seq);
247 | 
248 | #endif
249 | 


--------------------------------------------------------------------------------
/py/scripts/pg_asm_cns.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import mmap
  4 | import sys
  5 | from peregrine._falcon4py import ffi
  6 | from peregrine._falcon4py import lib as falcon
  7 | from peregrine._shimmer4py import lib as shimmer
  8 | import numpy as np
  9 | from collections import OrderedDict
 10 | 
 11 | ## No option parsing at thie moment, perhaps letter
 12 | 
 13 | read_db_prefix = sys.argv[1]
 14 | ref_db_prefix = sys.argv[2]
 15 | read_to_contig_map = sys.argv[3]
 16 | total_chunks = int(sys.argv[4])
 17 | my_chunk = int(sys.argv[5])
 18 | 
 19 | 
 20 | f = open("{}.seqdb".format(read_db_prefix), "rb")
 21 | seqdb = mmap.mmap(f.fileno(), 0, flags=mmap.MAP_SHARED, prot=mmap.PROT_READ)
 22 | 
 23 | f = open("{}.seqdb".format(ref_db_prefix), "rb")
 24 | refdb = mmap.mmap(f.fileno(), 0, flags=mmap.MAP_SHARED, prot=mmap.PROT_READ)
 25 | 
 26 | read_idx = {}
 27 | with open("{}.idx".format(read_db_prefix)) as f:
 28 |     for row in f:
 29 |         row = row.strip().split()
 30 |         rid, rname, rlen, offset = row
 31 |         rid = int(rid)
 32 |         rlen = int(rlen)
 33 |         offset = int(offset)
 34 |         read_idx.setdefault(rid, {})
 35 |         read_idx[rid]["name"] = rname
 36 |         read_idx[rid]["length"] = rlen
 37 |         read_idx[rid]["offset"] = offset
 38 | 
 39 | 
 40 | ref_idx = {}
 41 | with open("{}.idx".format(ref_db_prefix)) as f:
 42 |     for row in f:
 43 |         row = row.strip().split()
 44 |         rid, rname, rlen, offset = row
 45 |         rid = int(rid)
 46 |         rlen = int(rlen)
 47 |         offset = int(offset)
 48 |         ref_idx.setdefault(rid, {})
 49 |         ref_idx[rid]["name"] = rname
 50 |         ref_idx[rid]["length"] = rlen
 51 |         ref_idx[rid]["offset"] = offset
 52 | 
 53 | contig_to_read_map = OrderedDict()
 54 | with open(read_to_contig_map) as f:
 55 |     for row in f:
 56 |         row = row.strip().split()
 57 |         row = tuple(int(c) for c in row)
 58 |         ctg_id = row[0]
 59 |         if (my_chunk % total_chunks) != (ctg_id % total_chunks):
 60 |             continue
 61 |         contig_to_read_map.setdefault(ctg_id, [])
 62 |         contig_to_read_map[ctg_id].append(row)
 63 | 
 64 | rng = ffi.new("aln_range[1]")
 65 | 
 66 | 
 67 | # TODO: we need to refactor this loop
 68 | for ctg in contig_to_read_map:
 69 |     print("-\n", "ctg {}".format(ref_idx[ctg]["name"]), file=sys.stderr)
 70 |     contig_to_read_map[ctg].sort(key=lambda x: x[1])
 71 |     read_map_groups = []
 72 |     left_anchor = 1000
 73 |     map_group = []
 74 | 
 75 |     for row in contig_to_read_map[ctg]:
 76 |         ref_p1 = row[1]
 77 |         if ref_p1 - left_anchor < 50000:
 78 |             map_group.append(row)
 79 |         else:
 80 |             if ref_p1 - left_anchor < 100000:
 81 |                 read_map_groups.append([left_anchor, ref_p1, map_group])
 82 |             else:
 83 |                 read_map_groups.append([left_anchor, ref_p1, []])
 84 |             map_group = []
 85 |             left_anchor = ref_p1
 86 | 
 87 |     if ref_idx[ctg]["length"] - left_anchor < 100000:  #current max template size for consensus
 88 |         if ref_idx[ctg]["length"] - left_anchor > 1000:
 89 |             read_map_groups.append((left_anchor,
 90 |                                     ref_idx[ctg]["length"],
 91 |                                     map_group))
 92 |         elif len(read_map_groups) > 0:
 93 |             read_map_groups[-1][1] = ref_idx[ctg]["length"]
 94 |             read_map_groups[-1][2].extend(map_group)
 95 |         else:
 96 |             read_map_groups.append((left_anchor, ref_idx[ctg]["length"], []))
 97 |     else:
 98 |         read_map_groups.append((left_anchor, ref_idx[ctg]["length"], []))
 99 | 
100 |     print("ctg {}".format(ref_idx[ctg]["name"]),
101 |           len(read_map_groups),
102 |           file=sys.stderr)
103 | 
104 |     #if len(read_map_groups) <= 2: #ignore short contig for now
105 |     #    continue
106 | 
107 |     cns_segments = []
108 |     j = 0
109 |     for left, right, mapped in read_map_groups:
110 |         print(f"--\n sg{j:03d}", left, right, right-left, len(mapped), file=sys.stderr)
111 | 
112 |         j += 1
113 |         left = left-1000
114 |         assert(left >= 0)
115 |         rmap = {}
116 | 
117 |         for d in mapped:
118 |             #print(d)
119 |             read_id = d[3]
120 |             read_offset = d[1] - d[4]
121 |             read_strand = d[6]
122 |             rmap.setdefault((read_id, read_strand), [])
123 |             rmap[(read_id, read_strand)].append(read_offset)
124 | 
125 |         reads = []
126 | 
127 |         for (read_id, read_strand), v in rmap.items():
128 |             v.sort()
129 |             v_current = v[0]
130 |             reads.append((read_id, read_strand, v_current - left, len(v)))
131 |             print( (read_id, read_strand), v_current, file=sys.stderr);
132 |             for vv in v:
133 |                 if vv > v_current + 50:
134 |                     v_current = vv
135 |                     reads.append((read_id, read_strand, v_current - left, len(v)))
136 |                     print( (read_id, read_strand), v_current, file=sys.stderr);
137 | 
138 | 
139 |         reads.sort(key=lambda x: x[2])
140 |         s = ref_idx[ctg]["offset"] + left
141 |         ref_len = right-left
142 | 
143 |         bseq0 = refdb[s:s+ref_len]
144 | 
145 |         ref_seq = ffi.new("char[{}]".format(ref_len))
146 | 
147 |         shimmer.decode_biseq(bseq0, ref_seq, ref_len, 0)
148 | 
149 |         tags = ffi.new("align_tags_t * [{}]".format(len(reads)+1))
150 | 
151 |         # need a back bone for some boundary case
152 |         aln = falcon.align(ref_seq, ref_len,
153 |                            ref_seq, ref_len,
154 |                            50, 1)
155 |         rng[0].s1 = aln.aln_q_s
156 |         rng[0].e1 = aln.aln_q_e
157 |         rng[0].s2 = aln.aln_t_s
158 |         rng[0].e2 = aln.aln_t_e
159 |         tag = falcon.get_align_tags(aln.q_aln_str,
160 |                                     aln.t_aln_str,
161 |                                     aln.aln_str_size,
162 |                                     rng, 0, 0)
163 |         aln_count = 0
164 |         tags[aln_count] = tag
165 |         aln_count += 1
166 |         falcon.free_alignment(aln)
167 | 
168 |         aln_base = 0
169 |         for d in reads:
170 |             #print(d)
171 |             read_id = d[0]
172 |             read_strand = d[1]
173 |             read_shift = int(d[2])
174 |             s = read_idx[read_id]["offset"]
175 |             read_len = read_idx[read_id]["length"]
176 |             bseq1 = seqdb[s:s+read_len]
177 |             read_seq = ffi.new("char[{}]".format(read_len))
178 |             shimmer.decode_biseq(bseq1, read_seq, read_len, read_strand)
179 | 
180 |             aligned = False
181 |             t_offset = 0
182 |             if read_shift < 0:
183 |                 aln = falcon.align(read_seq[abs(read_shift):read_len],
184 |                                    read_len - abs(read_shift),
185 |                                    ref_seq,
186 |                                    ref_len,
187 |                                    150, 1)
188 | 
189 |                 if abs(abs(aln.aln_q_e-aln.aln_q_s) -
190 |                        (read_len - abs(read_shift))) < 48:
191 |                     aligned = True
192 | 
193 |                     rng[0].s1 = aln.aln_q_s
194 |                     rng[0].e1 = aln.aln_q_e
195 |                     rng[0].s2 = aln.aln_t_s
196 |                     rng[0].e2 = aln.aln_t_e
197 |                     t_offset = 0
198 |                 else:
199 |                     falcon.free_alignment(aln)
200 |             else:
201 |                 aln = falcon.align(read_seq,
202 |                                    read_len,
203 |                                    ref_seq[read_shift:ref_len],
204 |                                    ref_len-read_shift,
205 |                                    150, 1)
206 | 
207 |                 if abs(abs(aln.aln_q_e-aln.aln_q_s)-read_len) < 48 or \
208 |                    abs(ref_len-read_shift-abs(aln.aln_q_e-aln.aln_q_s)) < 48:
209 |                     aligned = True
210 |                     rng[0].s1 = aln.aln_q_s
211 |                     rng[0].e1 = aln.aln_q_e
212 |                     rng[0].s2 = aln.aln_t_s
213 |                     rng[0].e2 = aln.aln_t_e
214 |                     t_offset = read_shift
215 |                 else:
216 |                     falcon.free_alignment(aln)
217 |             if aligned:
218 |                 print(f"{read_id} is algined",
219 |                       rng[0].s1 , rng[0].e1, rng[0].s2, rng[0].e2, file=sys.stderr)
220 |                 # print(ffi.string(aln.q_aln_str), file=sys.stderr)
221 |                 # rint(ffi.string(aln.t_aln_str), file=sys.stderr)
222 |                 sys.stderr.flush()
223 |                 tag = falcon.get_align_tags(aln.q_aln_str,
224 |                                             aln.t_aln_str,
225 |                                             aln.aln_str_size,
226 |                                             rng, 0, t_offset)
227 |                 tags[aln_count] = tag
228 |                 aln_count += 1
229 |                 aln_base += abs(rng[0].e2 - rng[0].s2)
230 |                 falcon.free_alignment(aln)
231 |             ffi.release(read_seq)
232 |         aln_cov = aln_base/ref_len
233 |         print(f"aln_count:{aln_count}, aln_base: {aln_base}, aln_cov: {aln_cov}", file=sys.stderr)
234 | 
235 |         if aln_base/ref_len < 3:
236 |             cns_seq = ffi.string(ref_seq)
237 |             cns_seq = cns_seq.lower()
238 |         else:
239 |             cns = falcon.get_cns_from_align_tags(tags,
240 |                                                  aln_count, len(ref_seq), 1)
241 |             cns_seq = ffi.string(cns.sequence)
242 |             falcon.free_consensus_data(cns)
243 | 
244 |         cns_segments.append(cns_seq)
245 | 
246 |         for i in range(aln_count):
247 |             falcon.free_align_tags(tags[i])
248 |         ffi.release(tags)
249 |         ffi.release(ref_seq)
250 | 
251 |     s0 = cns_segments[0]
252 |     stiched_segments = [s0]
253 |     p = 0
254 |     for s1 in cns_segments[1:]:
255 |         aln = falcon.align(s0[-1000:], 1000,
256 |                            s1[:1050], 1050, 400, 0)
257 |         # print(aln.aln_q_s, aln.aln_q_e, aln.aln_t_s, aln.aln_t_e, aln.dist)
258 |         if aln.aln_q_e < 1000:
259 |             stiched_segments[-1] = stiched_segments[-1][:-(1000-aln.aln_q_e)]
260 | 
261 |         stiched_segments.append(s1[aln.aln_t_e:])
262 |         p += len(s1[aln.aln_t_e:])
263 |         print("stiching point:", p, file=sys.stderr)
264 |         print("aln.aln_q_e:", aln.aln_q_e, file=sys.stderr)
265 |         print("aln.aln_t_e:", aln.aln_t_e, file=sys.stderr)
266 |         # print(ffi.string(aln.q_aln_str), file=sys.stderr)
267 |         # print(ffi.string(aln.t_aln_str), file=sys.stderr)
268 |         s0 = s1
269 |         falcon.free_alignment(aln)
270 | 
271 |     contig = b"".join(stiched_segments)
272 |     print(">{}".format(ref_idx[ctg]["name"]))
273 |     print(contig.decode("ascii"))
274 | ffi.release(rng)
275 | 


--------------------------------------------------------------------------------
/src/shmr_map.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <errno.h>
  3 | #include <stdbool.h>
  4 | #include <stdint.h>
  5 | #include <stdio.h>
  6 | #include <string.h>
  7 | #include <time.h>
  8 | #include <unistd.h>
  9 | #include <wordexp.h>
 10 | #include "kalloc.h"
 11 | #include "khash.h"
 12 | #include "kvec.h"
 13 | #include "shimmer.h"
 14 | 
 15 | #include <fcntl.h>
 16 | #include <sys/mman.h>
 17 | #include <sys/stat.h>
 18 | 
 19 | extern char *optarg;
 20 | extern int optind, opterr, optopt;
 21 | 
/* Print msg followed by the description of the current errno (via perror)
 * and terminate the process with EXIT_FAILURE. Wrapped in do { } while (0)
 * so it behaves as a single statement after an unbraced if. */
#define handle_error(msg) \
  do {                    \
    perror(msg);          \
    exit(EXIT_FAILURE);   \
  } while (0)
 27 | 
/* Default bounds on a minimizer's occurrence count; main() uses them to
 * initialize mc_lower and mc_upper. */
#define MMER_COUNT_LOWER_BOUND 1
#define MMER_COUNT_UPPER_BOUND 240
/* Strand/direction flags; guarded so that an earlier definition wins.
 * NOTE(review): presumably these mirror values from shimmer.h -- confirm. */
#ifndef ORIGINAL
#define ORIGINAL 0
#endif
#ifndef REVERSED
#define REVERSED 1
#endif
/* NOTE(review): how the three tuning constants below are used was not
 * verified in this file -- confirm before changing them. */
#define READ_END_FUZZINESS 48
#define LOCAL_OVERLAP_UPPERBOUND 120
#define ALNBANDSIZE 100

/* Instantiate a khash map type named RPAIR with 64-bit integer keys and
 * uint8_t values. */
KHASH_MAP_INIT_INT64(RPAIR, uint8_t);
 41 | 
 42 | int mp128_comp(const void *a, const void *b) {
 43 |   mp128_t *a0 = (mp128_t *)a;
 44 |   mp128_t *b0 = (mp128_t *)b;
 45 |   return ((a0->y0 & 0xFFFFFFFF) >> 1) < ((b0->y0 & 0xFFFFFFFF) >> 1);
 46 | }
 47 | 
 48 | void process_map(char *refdb_file_path, char *seqdb_file_path,
 49 |                  mm128_v *ref_mmers, khash_t(RLEN) * ref_lmap,
 50 |                  khash_t(MMER0) * mmer0_map, khash_t(RLEN) * rlmap,
 51 |                  khash_t(MMC) * mcmap, uint32_t lowerbound,
 52 |                  uint32_t upperbound) {
 53 |   int rfd, sfd;
 54 |   struct stat rsb, ssb;
 55 |   uint8_t *rseq_p, *seq_p;
 56 |   mp128_v *mpv;
 57 | 
 58 |   khash_t(MMER1) * mmer1_map;
 59 | 
 60 |   rfd = open(refdb_file_path, O_RDONLY);
 61 |   if (rfd == -1) handle_error("open");
 62 | 
 63 |   if (fstat(rfd, &rsb) == -1) /* To obtain file size */
 64 |     handle_error("fstat");
 65 | 
 66 |   rseq_p =
 67 |       (uint8_t *)mmap((void *)0, rsb.st_size, PROT_READ, MAP_SHARED, rfd, 0);
 68 | 
 69 |   sfd = open(seqdb_file_path, O_RDONLY);
 70 |   if (sfd == -1) handle_error("open");
 71 | 
 72 |   if (fstat(sfd, &ssb) == -1) /* To obtain file size */
 73 |     handle_error("fstat");
 74 | 
 75 |   seq_p =
 76 |       (uint8_t *)mmap((void *)0, ssb.st_size, PROT_READ, MAP_SHARED, sfd, 0);
 77 | 
 78 |   // clock_t begin = clock();
 79 |   // clock_t end;
 80 |   mm128_t mmer0, mmer1;
 81 |   khiter_t k;
 82 | 
 83 |   size_t s = 0;
 84 |   assert(ref_mmers->n > 0);
 85 |   for (;;) {
 86 |     mmer0 = ref_mmers->a[s];
 87 |     if (s >= ref_mmers->n) break;
 88 |     k = kh_get(MMER0, mmer0_map, mmer0.x);
 89 |     if (k != kh_end(mmer0_map)) break;
 90 |     s++;
 91 |   }
 92 | 
 93 |   for (size_t i = s + 1; i < ref_mmers->n; i++) {
 94 |     mmer1 = ref_mmers->a[i];
 95 |     uint64_t mhash = mmer1.x >> 8;
 96 |     k = kh_get(MMC, mcmap, mhash);
 97 |     if (k == kh_end(mcmap)) continue;
 98 |     uint32_t mcount = kh_val(mcmap, k);
 99 |     if (mcount < lowerbound || mcount > upperbound) continue;
100 | 
101 |     if ((mmer0.y >> 32) != (mmer1.y >> 32)) {
102 |       mmer0 = mmer1;
103 |       continue;  // the pairs are in the same read
104 |     }
105 | 
106 |     k = kh_get(MMER0, mmer0_map, mmer0.x);
107 |     if (k == kh_end(mmer0_map)) {
108 |       mmer0 = mmer1;
109 |       continue;
110 |     }
111 | 
112 |     mmer1_map = kh_val(mmer0_map, k);
113 |     k = kh_get(MMER1, mmer1_map, mmer1.x);
114 |     if (k == kh_end(mmer1_map)) {
115 |       mmer0 = mmer1;
116 |       continue;
117 |     }
118 | 
119 |     if (((mmer1.y >> 1) & 0xFFFFFFF) - ((mmer0.y >> 1) & 0xFFFFFFF) < 100) {
120 |       mmer0 = mmer1;
121 |       continue;
122 |     }
123 | 
124 |     mpv = kh_val(mmer1_map, k);
125 | 
126 |     uint32_t ref_id;
127 |     uint32_t ref_bgn;
128 |     uint32_t ref_end;
129 |     ref_id = (uint32_t)(mmer0.y >> 32);
130 |     ref_bgn = (uint32_t)((mmer0.y & 0xFFFFFFFF) >> 1);
131 |     ref_end = (uint32_t)((mmer1.y & 0xFFFFFFFF) >> 1);
132 | 
133 |     for (int j = 0; j < mpv->n; j++) {
134 |       uint32_t read_id;
135 |       uint32_t read_bgn;
136 |       uint32_t read_end;
137 |       uint8_t read_direction;
138 | 
139 |       read_id = mpv->a[j].y0 >> 32;
140 |       read_bgn = (uint32_t)((mpv->a[j].y0 & 0xFFFFFFFF) >> 1);
141 |       read_end = (uint32_t)((mpv->a[j].y1 & 0xFFFFFFFF) >> 1);
142 |       read_direction = mpv->a[j].direction;
143 |       assert(read_bgn < read_end);
144 | 
145 |       uint64_t mhash = mmer0.x >> 8;
146 |       k = kh_get(MMC, mcmap, mhash);
147 |       assert(k != kh_end(mcmap));
148 |       uint32_t mcount0 = kh_val(mcmap, k);
149 |       mhash = mmer1.x >> 8;
150 |       k = kh_get(MMC, mcmap, mhash);
151 |       assert(k != kh_end(mcmap));
152 |       uint32_t mcount1 = kh_val(mcmap, k);
153 |       printf("%u %u %u %u %u %u %d %u %u\n", ref_id, ref_bgn, ref_end, read_id,
154 |              read_bgn, read_end, read_direction, mcount0, mcount1);
155 |     }
156 |     mmer0 = mmer1;
157 |   }
158 | 
159 |   munmap(rseq_p, rsb.st_size);
160 |   munmap(seq_p, ssb.st_size);
161 | }
162 | 
163 | int main(int argc, char *argv[]) {
164 |   char *refdb_prefix = NULL;
165 |   char *seqdb_prefix = NULL;
166 |   char *ref_shimmer_prefix = NULL;
167 |   char *shimmer_prefix = NULL;
168 | 
169 |   char mmc_file_path[8192];
170 |   char mmer_file_path[8192];
171 |   char ref_idx_file_path[8192];
172 |   char refdb_file_path[8192];
173 |   char seq_idx_file_path[8192];
174 |   char seqdb_file_path[8192];
175 |   int c;
176 |   uint32_t total_chunk = 1, mychunk = 1;
177 | 
178 |   uint32_t mc_upper = MMER_COUNT_UPPER_BOUND;
179 |   uint32_t mc_lower = MMER_COUNT_LOWER_BOUND;
180 | 
181 |   wordexp_t p;
182 |   char **mmc_fns;
183 |   char **shimmer_fns;
184 | 
185 |   mm128_v ref_mmers = {0, 0, 0};
186 |   mm128_v mmers = {0, 0, 0};
187 |   mm128_v mmers_;
188 |   mm_count_v mmc;
189 | 
190 |   khash_t(RLEN) * ref_lmap;
191 |   khash_t(RLEN) * rlmap;
192 |   khash_t(MMC) *mcmap = kh_init(MMC);
193 | 
194 |   khash_t(MMER0) * mmer0_map;
195 |   khash_t(MMER1) * mmer1_map;
196 | 
197 |   mp128_v *mpv;
198 | 
199 |   opterr = 0;
200 | 
201 |   while ((c = getopt(argc, argv, "r:m:p:l:M:n:t:c:b:")) != -1) {
202 |     switch (c) {
203 |       case 'r':
204 |         refdb_prefix = optarg;
205 |         break;
206 |       case 'm':
207 |         ref_shimmer_prefix = optarg;
208 |         break;
209 |       case 'p':
210 |         seqdb_prefix = optarg;
211 |         break;
212 |       case 'l':
213 |         shimmer_prefix = optarg;
214 |         break;
215 |       case 'M':
216 |         mc_upper = atoi(optarg);
217 |         break;
218 |       case 'n':
219 |         mc_lower = atoi(optarg);
220 |         break;
221 |       case 't':
222 |         total_chunk = atoi(optarg);
223 |         break;
224 |       case 'c':
225 |         mychunk = atoi(optarg);
226 |         break;
227 |       case '?':
228 |         if (optopt == 'r') {
229 |           fprintf(stderr,
230 |                   "Option -%c not specified, using 'ref' as the ref sequence "
231 |                   "db prefix\n",
232 |                   optopt);
233 |         }
234 |         if (optopt == 'p') {
235 |           fprintf(stderr,
236 |                   "Option -%c not specified, using 'seq_dataset' as the "
237 |                   "sequence db prefix\n",
238 |                   optopt);
239 |         }
240 |         if (optopt == 'l') {
241 |           fprintf(stderr,
242 |                   "Option -%c not specified, using 'shimmer-L2' as the L2 "
243 |                   "index prefix\n",
244 |                   optopt);
245 |         }
246 |         return 1;
247 |       default:
248 |         abort();
249 |     }
250 |   }
251 | 
252 |   assert(total_chunk > 0);
253 |   assert(mychunk > 0 && mychunk <= total_chunk);
254 | 
255 |   if (refdb_prefix == NULL) {
256 |     refdb_prefix = (char *)calloc(8192, 1);
257 |     snprintf(refdb_prefix, 8191, "ref");
258 |   }
259 | 
260 |   if (ref_shimmer_prefix == NULL) {
261 |     ref_shimmer_prefix = (char *)calloc(8192, 1);
262 |     snprintf(ref_shimmer_prefix, 8191, "ref-L2");
263 |   }
264 | 
265 |   if (seqdb_prefix == NULL) {
266 |     seqdb_prefix = (char *)calloc(8192, 1);
267 |     snprintf(seqdb_prefix, 8191, "seq_dataset");
268 |   }
269 | 
270 |   if (shimmer_prefix == NULL) {
271 |     shimmer_prefix = (char *)calloc(8192, 1);
272 |     snprintf(shimmer_prefix, 8191, "shimmer-L2");
273 |   }
274 | 
275 |   int written;
276 |   written = snprintf(ref_idx_file_path, sizeof(ref_idx_file_path), "%s.idx",
277 |                      refdb_prefix);
278 |   assert(written < sizeof(ref_idx_file_path));
279 |   fprintf(stderr, "using ref index file: %s\n", ref_idx_file_path);
280 | 
281 |   ref_lmap = get_read_length_map(ref_idx_file_path);
282 | 
283 |   written = snprintf(refdb_file_path, sizeof(seqdb_file_path), "%s.seqdb",
284 |                      refdb_prefix);
285 |   assert(written < sizeof(refdb_file_path));
286 |   fprintf(stderr, "using ref seqdb file: %s\n", refdb_file_path);
287 | 
288 |   written = snprintf(mmer_file_path, sizeof(mmer_file_path),
289 |                      "%s-[0-9]*-of-[0-9]*.dat", ref_shimmer_prefix);
290 |   assert(written < sizeof(mmer_file_path));
291 |   wordexp(mmer_file_path, &p, 0);
292 |   shimmer_fns = p.we_wordv;
293 |   for (int i = 0; i < p.we_wordc; i++) {
294 |     fprintf(stderr, "using ref shimmer data file: %s\n", shimmer_fns[i]);
295 |     mmers_ = read_mmlist(shimmer_fns[i]);
296 |     fprintf(stderr, "number of shimmers load: %lu\n", mmers_.n);
297 |     append_mmlist(&ref_mmers, &mmers_);
298 |     kv_destroy(mmers_);
299 |   }
300 |   wordfree(&p);
301 | 
302 |   written = snprintf(seq_idx_file_path, sizeof(seq_idx_file_path), "%s.idx",
303 |                      seqdb_prefix);
304 |   assert(written < sizeof(seq_idx_file_path));
305 |   fprintf(stderr, "using index file: %s\n", seq_idx_file_path);
306 | 
307 |   rlmap = get_read_length_map(seq_idx_file_path);
308 | 
309 |   written = snprintf(seqdb_file_path, sizeof(seqdb_file_path), "%s.seqdb",
310 |                      seqdb_prefix);
311 |   assert(written < sizeof(seqdb_file_path));
312 |   fprintf(stderr, "using seqdb file: %s\n", seqdb_file_path);
313 | 
314 |   written = snprintf(mmer_file_path, sizeof(mmer_file_path),
315 |                      "%s-[0-9]*-of-[0-9]*.dat", shimmer_prefix);
316 | 
317 |   assert(written < sizeof(mmer_file_path));
318 |   wordexp(mmer_file_path, &p, 0);
319 |   shimmer_fns = p.we_wordv;
320 |   for (int i = 0; i < p.we_wordc; i++) {
321 |     fprintf(stderr, "using shimmer data file: %s\n", shimmer_fns[i]);
322 |     mmers_ = read_mmlist(shimmer_fns[i]);
323 |     fprintf(stderr, "number of shimmers load: %lu\n", mmers_.n);
324 |     append_mmlist(&mmers, &mmers_);
325 |     kv_destroy(mmers_);
326 |   }
327 |   wordfree(&p);
328 | 
329 |   char buffer[32768];
330 | 
331 |   setvbuf(stdout, buffer, _IOFBF, sizeof(buffer));
332 | 
333 |   written = snprintf(mmc_file_path, sizeof(mmc_file_path),
334 |                      "%s-MC-[0-9]*-of-[0-9]*.dat", shimmer_prefix);
335 | 
336 |   assert(written < sizeof(mmc_file_path));
337 |   wordexp(mmc_file_path, &p, 0);
338 |   mmc_fns = p.we_wordv;
339 |   for (int i = 0; i < p.we_wordc; i++) {
340 |     fprintf(stderr, "using shimmer count file: %s\n", mmc_fns[i]);
341 |     mmc = read_mm_count(mmc_fns[i]);
342 |     aggregate_mm_count(mcmap, &mmc);
343 |     kv_destroy(mmc);
344 |   }
345 | 
346 |   wordfree(&p);
347 | 
348 |   mmer0_map = kh_init(MMER0);
349 | 
350 |   build_map(&mmers, mmer0_map, rlmap, mcmap, mychunk, total_chunk, mc_lower,
351 |             mc_upper);
352 | 
353 |   process_map(refdb_file_path, seqdb_file_path, &ref_mmers, ref_lmap, mmer0_map,
354 |               rlmap, mcmap, mc_lower, mc_upper);
355 | 
356 |   for (khiter_t __i = kh_begin(mmer0_map); __i != kh_end(mmer0_map); ++__i) {
357 |     if (!kh_exist(mmer0_map, __i)) continue;
358 |     mmer1_map = kh_val(mmer0_map, __i);
359 |     for (khiter_t __j = kh_begin(mmer1_map); __j != kh_end(mmer1_map); ++__j) {
360 |       if (!kh_exist(mmer1_map, __j)) continue;
361 |       mpv = kh_val(mmer1_map, __j);
362 |       kv_destroy(*mpv);
363 |     }
364 |     kh_destroy(MMER1, mmer1_map);
365 |   }
366 | 
367 |   kh_destroy(MMER0, mmer0_map);
368 |   kh_destroy(MMC, mcmap);
369 |   kh_destroy(RLEN, rlmap);
370 |   kv_destroy(mmers);
371 |   kv_destroy(ref_mmers);
372 |   fflush(stdout);
373 | }
374 | 


--------------------------------------------------------------------------------
/falcon/DW_banded.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /*
  3 |  * =====================================================================================
  4 |  *
  5 |  *       Filename:  DW_banded.c
  6 |  *
  7 |  *    Description:  A banded version for the O(ND) greedy sequence alignment algorithm
  8 |  *
  9 |  *        Version:  0.1
 10 |  *        Created:  07/20/2013 17:00:00
 11 |  *       Revision:  none
 12 |  *       Compiler:  gcc
 13 |  *
 14 |  *         Author:  Jason Chin,
 15 |  *        Company:
 16 |  *
 17 |  * =====================================================================================
 18 | 
 19 | #################################################################################$$
 20 | # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
 21 | #
 22 | # All rights reserved.
 23 | #
 24 | # Redistribution and use in source and binary forms, with or without
 25 | # modification, are permitted (subject to the limitations in the
 26 | # disclaimer below) provided that the following conditions are met:
 27 | #
 28 | #  * Redistributions of source code must retain the above copyright
 29 | #  notice, this list of conditions and the following disclaimer.
 30 | #
 31 | #  * Redistributions in binary form must reproduce the above
 32 | #  copyright notice, this list of conditions and the following
 33 | #  disclaimer in the documentation and/or other materials provided
 34 | #  with the distribution.
 35 | #
 36 | #  * Neither the name of Pacific Biosciences nor the names of its
 37 | #  contributors may be used to endorse or promote products derived
 38 | #  from this software without specific prior written permission.
 39 | #
 40 | # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
 41 | # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
 42 | # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
 43 | # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 44 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 45 | # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
 46 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 47 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 48 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 49 | # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 50 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 51 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 52 | # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 53 | # SUCH DAMAGE.
 54 | #################################################################################$$
 55 | 
 56 | 
 57 | */
 58 | 
 59 | #include <stdlib.h>
 60 | #include <stdio.h>
 61 | #include <limits.h>
 62 | #include <stdbool.h>
 63 | #include "common.h"
 64 | 
 65 | int compare_d_path(const void * a, const void * b)
 66 | {
 67 |     const d_path_data2 * arg1 = a;
 68 |     const d_path_data2 * arg2 = b;
 69 |     if (arg1->d - arg2->d == 0) {
 70 |         return  arg1->k - arg2->k;
 71 |     } else {
 72 |         return arg1->d - arg2->d;
 73 |     }
 74 | }
 75 | 
 76 | 
 77 | void d_path_sort( d_path_data2 * base, unsigned long max_idx) {
 78 |     qsort(base, max_idx, sizeof(d_path_data2), compare_d_path);
 79 | }
 80 | 
 81 | d_path_data2 * get_dpath_idx( seq_coor_t d, seq_coor_t k, unsigned long max_idx, d_path_data2 * base) {
 82 |     d_path_data2 d_tmp;
 83 |     d_path_data2 *rtn;
 84 |     d_tmp.d = d;
 85 |     d_tmp.k = k;
 86 |     rtn = (d_path_data2 *)  bsearch( &d_tmp, base, max_idx, sizeof(d_path_data2), compare_d_path);
 87 |     //printf("dp %ld %ld %ld %ld %ld %ld %ld\n", (rtn)->d, (rtn)->k, (rtn)->x1, (rtn)->y1, (rtn)->x2, (rtn)->y2, (rtn)->pre_k);
 88 | 
 89 |     return rtn;
 90 | 
 91 | }
 92 | 
 93 | void print_d_path(  d_path_data2 * base, unsigned long max_idx) {
 94 |     unsigned long idx;
 95 |     for (idx = 0; idx < max_idx; idx++){
 96 |         printf("dp %ld %d %d %d %d %d %d %d\n",idx, 
 97 |                 (base+idx)->d, (base+idx)->k, 
 98 |                 (base+idx)->x1, (base+idx)->y1, 
 99 |                 (base+idx)->x2, (base+idx)->y2, (base+idx)->pre_k);
100 |     }
101 | }
102 | 
103 | 
104 | alignment * align(char * query_seq, seq_coor_t q_len,
105 |         char * target_seq, seq_coor_t t_len,
106 |         seq_coor_t band_tolerance,
107 |         int get_aln_str) {
108 |     seq_coor_t * V;
109 |     seq_coor_t * U;  // array of matched bases for each "k"
110 |     seq_coor_t k_offset;
111 |     seq_coor_t d;
112 |     seq_coor_t k, k2;
113 |     seq_coor_t best_m;  // the best "matches" for each d
114 |     seq_coor_t min_k, new_min_k;
115 |     seq_coor_t max_k, new_max_k;
116 |     seq_coor_t pre_k;
117 |     seq_coor_t x, y;
118 |     seq_coor_t cd;
119 |     seq_coor_t ck;
120 |     seq_coor_t cx, cy, nx, ny;
121 |     seq_coor_t max_d;
122 |     seq_coor_t band_size;
123 |     unsigned long d_path_idx = 0;
124 |     unsigned long max_idx = 0;
125 | 
126 |     d_path_data2 * d_path;
127 |     d_path_data2 * d_path_aux;
128 |     path_point * aln_path;
129 |     seq_coor_t aln_path_idx;
130 |     alignment * align_rtn;
131 |     seq_coor_t aln_pos;
132 |     seq_coor_t i;
133 |     bool aligned = false;
134 | 
135 |     //printf("debug: %ld %ld\n", q_len, t_len);
136 |     //printf("%s\n", query_seq);
137 | 
138 |     max_d = (int) (0.3*(q_len + t_len));
139 | 
140 |     band_size = band_tolerance * 2;
141 | 
142 |     V = calloc( max_d * 2 + 1, sizeof(seq_coor_t) );
143 |     U = calloc( max_d * 2 + 1, sizeof(seq_coor_t) );
144 | 
145 |     k_offset = max_d;
146 | 
147 |     // We should probably use hashmap to store the backtracing information to save memory allocation time
148 |     // This O(MN) block allocation scheme is convient for now but it is slower for very long sequences
149 |     d_path = calloc( max_d * (band_size + 1 ) * 2 + 1, sizeof(d_path_data2) );
150 | 
151 |     aln_path = calloc( q_len + t_len + 1, sizeof(path_point) );
152 | 
153 |     align_rtn = calloc( 1, sizeof(alignment));
154 |     align_rtn->t_aln_str = calloc( q_len + t_len + 1, sizeof(char));
155 |     align_rtn->q_aln_str = calloc( q_len + t_len + 1, sizeof(char));
156 |     align_rtn->aln_str_size = 0;
157 |     align_rtn->aln_q_s = 0;
158 |     align_rtn->aln_q_e = 0;
159 |     align_rtn->aln_t_s = 0;
160 |     align_rtn->aln_t_e = 0;
161 | 
162 |     //printf("max_d: %lu, band_size: %lu\n", max_d, band_size);
163 |     best_m = -1;
164 |     min_k = 0;
165 |     max_k = 0;
166 |     d_path_idx = 0;
167 |     max_idx = 0;
168 |     for (d = 0; d < max_d; d ++ ) {
169 |         if (max_k - min_k > band_size) {
170 |             break;
171 |         }
172 | 
173 |         for (k = min_k; k <= max_k;  k += 2) {
174 | 
175 |             if ( (k == min_k) || ((k != max_k) && (V[ k - 1 + k_offset ] < V[ k + 1 + k_offset])) ) {
176 |                 pre_k = k + 1;
177 |                 x = V[ k + 1 + k_offset];
178 |             } else {
179 |                 pre_k = k - 1;
180 |                 x = V[ k - 1 + k_offset] + 1;
181 |             }
182 |             y = x - k;
183 |             d_path[d_path_idx].d = d;
184 |             d_path[d_path_idx].k = k;
185 |             d_path[d_path_idx].x1 = x;
186 |             d_path[d_path_idx].y1 = y;
187 | 
188 |             while ( x < q_len && y < t_len && query_seq[x] == target_seq[y] ){
189 |                 x++;
190 |                 y++;
191 |             }
192 | 
193 |             d_path[d_path_idx].x2 = x;
194 |             d_path[d_path_idx].y2 = y;
195 |             d_path[d_path_idx].pre_k = pre_k;
196 |             d_path_idx ++;
197 | 
198 |             V[ k + k_offset ] = x;
199 |             U[ k + k_offset ] = x + y;
200 | 
201 |             if ( x + y > best_m) {
202 |                 best_m = x + y;
203 |             }
204 | 
205 |             if ( x >= q_len || y >= t_len) {
206 |                 aligned = true;
207 |                 max_idx = d_path_idx;
208 |                 break;
209 |             }
210 |         }
211 | 
212 |         // For banding
213 |         new_min_k = max_k;
214 |         new_max_k = min_k;
215 | 
216 |         for (k2 = min_k; k2 <= max_k;  k2 += 2) {
217 |             if (U[ k2 + k_offset] >= best_m - band_tolerance ) {
218 |                 if ( k2 < new_min_k ) {
219 |                     new_min_k = k2;
220 |                 }
221 |                 if ( k2 > new_max_k ) {
222 |                     new_max_k = k2;
223 |                 }
224 |             }
225 |         }
226 | 
227 |         max_k = new_max_k + 1;
228 |         min_k = new_min_k - 1;
229 | 
230 |         // For no banding
231 |         // max_k ++;
232 |         // min_k --;
233 | 
234 |         // For debuging
235 |         // printf("min_max_k,d, %ld %ld %ld\n", min_k, max_k, d);
236 | 
237 |         if (aligned == true) {
238 |             align_rtn->aln_q_e = x;
239 |             align_rtn->aln_t_e = y;
240 |             align_rtn->dist = d;
241 |             align_rtn->aln_str_size = (x + y + d) / 2;
242 |             align_rtn->aln_q_s = 0;
243 |             align_rtn->aln_t_s = 0;
244 | 
245 |             d_path_sort(d_path, max_idx);
246 |             //print_d_path(d_path, max_idx);
247 | 
248 |             if (get_aln_str > 0) {
249 |                 cd = d;
250 |                 ck = k;
251 |                 aln_path_idx = 0;
252 |                 while (cd >= 0 && aln_path_idx < q_len + t_len + 1) {
253 |                     d_path_aux = (d_path_data2 *) get_dpath_idx( cd, ck, max_idx, d_path);
254 |                     aln_path[aln_path_idx].x = d_path_aux -> x2;
255 |                     aln_path[aln_path_idx].y = d_path_aux -> y2;
256 |                     aln_path_idx ++;
257 |                     aln_path[aln_path_idx].x = d_path_aux -> x1;
258 |                     aln_path[aln_path_idx].y = d_path_aux -> y1;
259 |                     aln_path_idx ++;
260 |                     ck = d_path_aux -> pre_k;
261 |                     cd -= 1;
262 |                 }
263 |                 aln_path_idx --;
264 |                 cx = aln_path[aln_path_idx].x;
265 |                 cy = aln_path[aln_path_idx].y;
266 |                 align_rtn->aln_q_s = cx;
267 |                 align_rtn->aln_t_s = cy;
268 |                 aln_pos = 0;
269 |                 while ( aln_path_idx > 0 ) {
270 |                     aln_path_idx --;
271 |                     nx = aln_path[aln_path_idx].x;
272 |                     ny = aln_path[aln_path_idx].y;
273 |                     if (cx == nx && cy == ny){
274 |                         continue;
275 |                     }
276 |                     if (nx == cx && ny != cy){ //advance in y
277 |                         for (i = 0; i <  ny - cy; i++) {
278 |                             align_rtn->q_aln_str[aln_pos + i] = '-';
279 |                         }
280 |                         for (i = 0; i <  ny - cy; i++) {
281 |                             align_rtn->t_aln_str[aln_pos + i] = target_seq[cy + i];
282 |                         }
283 |                         aln_pos += ny - cy;
284 |                     } else if (nx != cx && ny == cy){ //advance in x
285 |                         for (i = 0; i <  nx - cx; i++) {
286 |                             align_rtn->q_aln_str[aln_pos + i] = query_seq[cx + i];
287 |                         }
288 |                         for (i = 0; i <  nx - cx; i++) {
289 |                             align_rtn->t_aln_str[aln_pos + i] = '-';
290 |                         }
291 |                         aln_pos += nx - cx;
292 |                     } else {
293 |                         for (i = 0; i <  nx - cx; i++) {
294 |                             align_rtn->q_aln_str[aln_pos + i] = query_seq[cx + i];
295 |                         }
296 |                         for (i = 0; i <  ny - cy; i++) {
297 |                             align_rtn->t_aln_str[aln_pos + i] = target_seq[cy + i];
298 |                         }
299 |                         aln_pos += ny - cy;
300 |                     }
301 |                     cx = nx;
302 |                     cy = ny;
303 |                 }
304 |                 align_rtn->aln_str_size = aln_pos;
305 |             }
306 |             break;
307 |         }
308 |     }
309 | 
310 |     free(V);
311 |     free(U);
312 |     free(d_path);
313 |     free(aln_path);
314 |     return align_rtn;
315 | }
316 | 
317 | 
318 | void free_alignment(alignment * aln) {
319 |     free(aln->q_aln_str);
320 |     free(aln->t_aln_str);
321 |     free(aln);
322 | }
323 | 


--------------------------------------------------------------------------------