├── .github
│   └── workflows
│       ├── create_artifect_for_release.yml
│       └── test_and_build.yml
├── .gitignore
├── .gitmodules
├── Cargo.toml
├── LICENSE
├── LICENSE-GeneDX
├── README.md
├── aws_image_builder
│   └── pgr-tk.yaml
├── build.sh
├── build_no_agc.sh
├── build_no_agc_apple_silicon.sh
├── docker
│   ├── Dockerfile
│   ├── Dockerfile.build_env-22.04
│   └── github-build.pub
├── docker_exec_env
│   ├── Dockerfile
│   └── build.sh
├── docs_src
│   └── alnmap_formap.md
├── images
│   ├── AMY1A_example.png
│   └── PGR_TK_Sketch_MAPG_construction.png
├── justfile
├── pgr-bin
│   ├── Cargo.toml
│   ├── build.rs
│   ├── file_format_documents
│   │   ├── ctgsv.bed.md
│   │   ├── for_pgr-alnmap
│   │   │   ├── alnmap.md
│   │   │   ├── ctgmap.bed.md
│   │   │   ├── ctgmap.json.md
│   │   │   └── svcnd.bed.md
│   │   ├── gfa_format.md
│   │   ├── input_file_formats_for_pgr-pbundle-bed.md
│   │   ├── output_files_for_pgr-pbundle-decomp.md
│   │   └── principal_bundle_bed_file.md
│   ├── src
│   │   ├── _bin
│   │   │   ├── README.txt
│   │   │   ├── pgr-fasta-smp-count.rs
│   │   │   ├── pgr-filter.rs
│   │   │   ├── pgr-multifilter.rs
│   │   │   ├── pgr-probe-match.rs
│   │   │   ├── pgr-shmmr-pair-count.rs
│   │   │   └── pgr-test.rs
│   │   └── bin
│   │       ├── pgr-alnmap.rs
│   │       ├── pgr-annotate-bed-file.rs
│   │       ├── pgr-annotate-vcf-file.rs
│   │       ├── pgr-compare-cov.rs
│   │       ├── pgr-compare-cov2.rs
│   │       ├── pgr-fetch-seqs.rs
│   │       ├── pgr-generate-chr-aln-plot.rs
│   │       ├── pgr-generate-diploid-vcf.rs
│   │       ├── pgr-generate-sv-analysis.rs
│   │       ├── pgr-make-frgdb.rs
│   │       ├── pgr-map-coordinate.rs
│   │       ├── pgr-mdb.rs
│   │       ├── pgr-merge-svcnd-bed.rs
│   │       ├── pgr-pbundle-aln.rs
│   │       ├── pgr-pbundle-bed2dist.rs
│   │       ├── pgr-pbundle-bed2offset.rs
│   │       ├── pgr-pbundle-bed2sorted.rs
│   │       ├── pgr-pbundle-bed2svg.rs
│   │       ├── pgr-pbundle-decomp.rs
│   │       ├── pgr-pbundle-shmmr2dist.rs
│   │       ├── pgr-query.rs
│   │       └── pgr-shmmr-count.rs
│   └── utility_scripts
│       └── get_cytoband_to_json.py
├── pgr-db
│   ├── Cargo.toml
│   ├── build.rs
│   ├── src
│   │   ├── agc_io.rs
│   │   ├── aln.rs
│   │   ├── bindings.rs
│   │   ├── ec.rs
│   │   ├── ext.rs
│   │   ├── fasta_io.rs
│   │   ├── frag_file_io.rs
│   │   ├── gff_db.rs
│   │   ├── graph_utils.rs
│   │   ├── kmer_filter.rs
│   │   ├── lib.rs
│   │   ├── seq_db.rs
│   │   ├── seqs2variants.rs
│   │   └── shmmrutils.rs
│   ├── test
│   │   └── test_data
│   │       ├── consensus_test.fa
│   │       ├── consensus_test2.fa
│   │       ├── consensus_test3.fa
│   │       ├── consensus_test4.fa
│   │       ├── consensus_test5.fa
│   │       ├── gen_agc.sh
│   │       ├── gen_frag_db.py
│   │       ├── seq0
│   │       ├── seq1
│   │       ├── test.agc
│   │       ├── test.gff3.gz
│   │       ├── test_agc_ref.fa
│   │       ├── test_agc_seqs.fa
│   │       ├── test_hits
│   │       ├── test_rev.fa
│   │       ├── test_seqs.fa
│   │       ├── test_seqs2.fa.gz
│   │       ├── test_seqs_frag.frg
│   │       ├── test_seqs_frag.mdb
│   │       ├── test_seqs_frag.midx
│   │       └── test_seqs_frag.sdx
│   └── wrapper.h
├── pgr-tk-workstation
│   ├── Dockerfile
│   ├── Readme.md
│   ├── build.sh
│   └── jupyterlab.sh
├── pgr-tk
│   ├── Cargo.toml
│   ├── Readme.md
│   ├── build.rs
│   ├── build.sh
│   ├── examples
│   │   ├── generate_gfa_for_CMRG.py
│   │   └── get_variants.py
│   ├── pgrtk
│   │   └── __init__.py
│   └── src
│       └── lib.rs
└── pgr-web
    ├── frontend
    │   ├── Cargo.toml
    │   ├── Trunk.toml
    │   ├── index.html
    │   └── src
    │       ├── data
    │       │   └── ROIs.json
    │       └── main.rs
    ├── pgr-server
    │   ├── Cargo.toml
    │   └── src
    │       ├── ROIs.json
    │       ├── bundle_processing.rs
    │       └── main.rs
    ├── prod.sh
    ├── prod_no_agc.sh
    └── scripts
        ├── ROIs_examples.json
        ├── generare_ROIs.sh
        └── get_ROIs.py

--------------------------------------------------------------------------------
/.github/workflows/create_artifect_for_release.yml:
--------------------------------------------------------------------------------
name: create_artifect_for_release

on:
  workflow_dispatch:

env:
  CARGO_TERM_COLOR: always

jobs:
  build:
    runs-on: ubuntu-latest
    container: docker.io/cschin/pgr-tk-build-env
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: recursive
      - name: setup
        run: |
          rustup default stable
      - name: Run tests
        run: cargo test --verbose --workspace --exclude pgrtk
      - name: Build
        run: bash build.sh
      - uses: actions/upload-artifact@v3
        with:
          name: build-artifact
          path: |
            target/release/pgr-mdb
            target/release/pgr-fetch-seqs
            target/release/pgr-make-frgdb
            target/release/pgr-pbundle-bed2dist
            target/release/pgr-pbundle-bed2offset
            target/release/pgr-pbundle-bed2sorted
            target/release/pgr-pbundle-bed2svg
            target/release/pgr-pbundle-decomp
            target/release/pgr-query
            target/wheels/*.whl

--------------------------------------------------------------------------------
/.github/workflows/test_and_build.yml:
--------------------------------------------------------------------------------
name: test_and_build

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

env:
  CARGO_TERM_COLOR: always

jobs:
  build:
    runs-on: ubuntu-latest
    container: docker.io/cschin/pgr-tk-build-env
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - name: setup
        run: /opt/cargo/bin/rustup default stable
      - name: Run tests
        run: /opt/cargo/bin/cargo test --verbose --workspace --exclude pgrtk
      - name: Build
        run: bash build.sh
      - uses: actions/upload-artifact@v4
        with:
          name: build-artifact
          path: |
            target/release/pgr-mdb
            target/release/pgr-fetch-seqs
            target/release/pgr-make-frgdb
            target/release/pgr-pbundle-bed2dist
            target/release/pgr-pbundle-bed2offset
            target/release/pgr-pbundle-bed2sorted
            target/release/pgr-pbundle-bed2svg
            target/release/pgr-pbundle-decomp
            target/release/pgr-query
            target/wheels/*.whl

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/target
*/target
*/Cargo.lock
*/libagc.so
*/test/test_data/test_shmmr.db
libagc.so
*.pyc
Cargo.lock
pgr-tk/pgrtk/*.so
pgr-tk-workstation/*.whl
docker/github-build
.vscode/settings.json
.gitignore
pgr-web/dist/*

--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "agc"]
	path = agc
	url = https://github.com/cschin/agc.git

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[workspace]
members = ["pgr-db", "pgr-bin", 'pgr-tk', 'pgr-web/pgr-server', "pgr-web/frontend"]
resolver = "2"

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Pangenome Research ToolKit

2023-2024 (c) Jason Chin

MIT License

Copyright (c) 2024 GeneDx

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/LICENSE-GeneDX:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 GeneDx

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PGR-tk: A PanGenomic Research Tool Kit

[![test_and_build](https://github.com/cschin/pgr-tk/actions/workflows/test_and_build.yml/badge.svg)](https://github.com/cschin/pgr-tk/actions/workflows/test_and_build.yml)

This repository is a project to provide Python and Rust libraries to facilitate pangenomics analysis. Several algorithms and data structures used for the Peregrine Genome Assembler are useful for pangenomics analysis as well. This repo takes those algorithms and data structures and combines them with other handy third-party tools to expose them as a Python library (with Rust code for the computing parts that need performance).

## What is PGR-tk?

Research Preprint:

[Multiscale Analysis of Pangenome Enables Improved Representation of Genomic Diversity For Repetitive And Clinically Relevant Genes](https://www.biorxiv.org/content/10.1101/2022.08.05.502980v2)

PGR-TK provides pangenome assembly management, querying, and Minimizer Anchored Pangenome (MAP) graph generation.

![Pangenome Data Management and Minimizer Anchored Pangenome Graph Generation](/images/PGR_TK_Sketch_MAPG_construction.png)

With the MAP graph, we can use the "principal bundle decomposition" to study complicated structural variants and genome rearrangements in human populations.

![AMY1A Example](/images/AMY1A_example.png)


## Documentation, Usage and Examples

Command Line Tools:

PGR-TK provides the following tools to

- create the PGR-TK sequence and index database
  - `pgr-mdb`: create a pgr minimizer database with the AGC backend
  - `pgr-make-frgdb`: create a PGR-TK fragment minimizer database with the frg format backend
- query the database to fetch sequences
  - `pgr-query`: query a PGR-TK pangenome sequence database, output the hit summary and generate fasta files from the target sequences
- generate the MAP-graph in GFA format and the principal bundle decomposition bed file
  - `pgr-pbundle-decomp`: generate the principal bundle decomposition through the MAP graph from a fasta file
- generate SVG from the principal bundle decomposition bed file
  - `pgr-pbundle-bed2svg`: generate SVG from a principal bundle bed file
- auxiliary tools
  - `pgr-pbundle-bed2sorted`: generate an annotation file with a sorting order from the principal bundle decomposition
  - `pgr-pbundle-bed2dist`: generate alignment scores between sequences using bundle decomposition from a principal bundle bed file

For each command, `command --help` provides the detailed usage information.

The API documentation is at https://genedx.github.io/pgr-tk/

A collection of Jupyter Notebooks is at https://github.com/genedx/pgr-tk-notebooks/

## Built Binaries

Check https://github.com/genedx/pgr-tk/releases


## Build

See `docker/Dockerfile.build_env-20.04` for a build environment under Ubuntu 20.04.
With the proper build environment, just run `bash build.sh` to build everything.

For example, on macOS with Docker installed, you can clone the repository and build a Linux binary
within an Ubuntu 20.04 Linux distribution as follows:

1. Build the Docker image for a build environment:

```
git clone --recursive git@github.com:cschin/pgr-tk.git # clone the repo
cd pgr-tk/docker
ln -s Dockerfile.build_env-20.04 Dockerfile
docker build -t pgr-tk-build .
```

2. In the root directory of the repo `pgr-tk`:

Execute
```
docker run -it --rm -v $PWD:/wd/pgr-tk pgr-tk-build /bin/bash
```

3. Build `pgr-tk` inside the Docker container from the image `pgr-tk-build`:

```
cd /wd/pgr-tk
bash build.sh
```

The built Python wheels will be in `target/wheels`, which can be installed for an Ubuntu 20.04 Python 3.8 distribution. You can install them in the `pgr-tk-build` image as well to test them out.


### Build Singularity image

If you have built pgr-tk in a Docker container, you can use the following steps to build a Singularity image based on your Docker container.

**Step 1: Commit Docker container to image**

```bash
docker commit <container_id> <image_name>:<tag>
```

**Step 2: Push Docker image to Docker Hub**

```bash
docker login # if not already logged in
docker push <image_name>:<tag>
```

**Step 3: Build Singularity image**

```bash
singularity build ./pgr-tk.v0.5.1.sif docker://<username>/<image_name>:<tag>
```

This will generate a .sif file in the current directory.

**Step 4: Execute**

```bash
singularity exec --fakeroot -B <path>:/<mount_point> ./pgr-tk.v0.5.1.sif pgr-mdb test.input test_idx
```

Replace `<path>` with the actual path you wish to bind to the container.

The `--fakeroot` option allows you to build and run images as a "fake" root user.

## Install stable version v0.3.6 with Bioconda

If you have a conda install, you can try this to build a conda environment to use pgr-tk v0.3.6 (on Linux only):

```
conda create -n pgr-tk python=3.8
conda activate pgr-tk
conda install -c bioconda -c conda-forge python_abi libstdcxx-ng=12 libclang13 pgr-tk=0.3.6
```

## Troubleshooting

`Segmentation fault (core dumped)`

Usually, the issue arises because AGC encounters a version incompatibility when called by pgr-tk. The version of AGC that has been well-tested is [453c0afd](https://github.com/cschin/agc/tree/453c0afdc54b4aa00fa8e97a63f196931fdb81c4). To address this error, consider the following potential solutions:

1. Compile pgr-tk using Docker or Singularity instead of directly on your computer. Ensure that the Docker container is based on Ubuntu 20.04.

2. When cloning the pgr-tk repository, make sure to use the `--recursive` option. This will clone the AGC dependency as well.

--------------------------------------------------------------------------------
/aws_image_builder/pgr-tk.yaml:
--------------------------------------------------------------------------------
# Document Start
# This provides an AWS Image Builder component with an Ubuntu 22.04 base image
name: "pgr-tk"
description: "Image with PGR-TK pre-installed with pangenomic data"
schemaVersion: 1.0
phases:
  - name: build
    steps:
      - name: InstallBuildingToolChain
        action: ExecuteBash
        inputs:
          commands:
            - apt-get update
            - DEBIAN_FRONTEND=noninteractive
            - TZ=Etc/UTC
            - apt-get install -y build-essential git ssh curl clang-14 cmake libssl-dev libssl3 pkg-config python3-pip
            - mkdir -p /opt
            - export RUSTUP_HOME=/opt/rustup
            - export CARGO_HOME=/opt/cargo
            - bash -c "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
            - source /opt/cargo/env && rustup default stable
            - source /opt/cargo/env && cargo install --locked maturin
            - GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no" cd /opt/ && git clone --recursive https://github.com/cschin/pgr-tk.git
            - source /opt/cargo/env && cd /opt/pgr-tk/pgr-bin && cargo install --path .
            - source /opt/cargo/env && cd /opt/pgr-tk && bash build.sh
            - pip install numpy
            - pip install /opt/pgr-tk/target/wheels/pgrtk-*-*-*-linux_x86_64.whl
            - chown ubuntu:ubuntu -R ${CARGO_HOME}
            - echo source /opt/cargo/env >> /home/ubuntu/.bashrc

# Document End

--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
rustup default stable

## if necessary, you can install libclang / clang using Anaconda
## and set LIBCLANG_PATH to point to the libclang for the cbindgen dependency clang-sys
# export LIBCLANG_PATH=$HOME/miniconda3/lib

## if necessary, install maturin with `cargo install --locked maturin`
# cargo install --locked maturin

cargo build -p pgr-db --release
cargo build -p pgr-bin --release
cargo install --path pgr-bin

pushd pgr-tk/
maturin build --release
maturin build --release --skip-auditwheel
popd

--------------------------------------------------------------------------------
/build_no_agc.sh:
--------------------------------------------------------------------------------
#pushd WFA2-lib
#make all
#popd

rustup default stable
cargo build -p pgr-db --release --no-default-features
cargo build -p pgr-bin --release --no-default-features
cargo install --path pgr-bin --no-default-features

pushd pgr-tk/
maturin build --release --no-default-features
maturin build --release --skip-auditwheel --no-default-features
popd

--------------------------------------------------------------------------------
/build_no_agc_apple_silicon.sh:
--------------------------------------------------------------------------------
#pushd WFA2-lib
#make all
#popd

#rustup default stable
rustup default stable-aarch64-apple-darwin
cargo build -p pgr-db --release --no-default-features
cargo build -p pgr-bin --release --no-default-features
cargo install --path pgr-bin --no-default-features

pushd pgr-tk/
maturin build --release --no-default-features
maturin build --release --skip-auditwheel --no-default-features
popd

--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
Dockerfile.build_env-22.04

--------------------------------------------------------------------------------
/docker/Dockerfile.build_env-22.04:
--------------------------------------------------------------------------------
FROM ubuntu:22.04
RUN apt-get update
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
RUN apt-get install -y build-essential git ssh curl clang-14 cmake libssl-dev libssl3 pkg-config libzstd-dev zstd
RUN mkdir -p /opt
ENV RUSTUP_HOME=/opt/rustup
ENV CARGO_HOME=/opt/cargo
RUN RUSTUP_HOME=${RUSTUP_HOME} CARGO_HOME=${CARGO_HOME} bash -c "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
RUN . /opt/cargo/env && rustup default stable
RUN . /opt/cargo/env && cargo install --locked maturin
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
RUN . /opt/cargo/env && rustup toolchain list
RUN apt-get install -y zlib1g-dev zlib1g libdeflate-dev

--------------------------------------------------------------------------------
/docker/github-build.pub:
--------------------------------------------------------------------------------
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCRJ3R0ssNmojNFh2C72gptRMNA5+2eXEG9V8vJzPByXV55iaM4e+AqPzbPXuCRyQUh9tTp1aHx61OJOIxVKonZyAC/mL0o2VKWgBMoRmOs/WQSlY083uenUe9lkTYhCnnRWcLyxJXd6NfysJQ8odmD7ZcNq/2yA4RLcQ02OH8xZsEGxUxsC+LUaOjIMvHFP9yyoYEhY8CNDwiRCBoPJtNM48826uFP8aTvAe0OnalouA200QVQpDqHaxNvGmUg6GmwmLN2yU1DIPdXpCkCER63Uhz+kYPL6ZBDWmFc9ipr0MzaWFuLrISSmb1sAhrS/TLsBN90WK7+32bYbU4ArHPjVzR6SXobKVqf6Vd3WNbqW51b6LIuAGG85CU4hRKK51E016p47VTVwt4mFLGdQSXwRFXoXkEpNDjz2U3+gdXt+7VnsxlwUSw1lv9qZ6F8t798BVaHmurkeQKA7K0xlGYl0EBVVEnkKljl6m5Xsz7CBc7YDq6S8/YJDbCUbEPC9Nc= cschin@umf01

--------------------------------------------------------------------------------
/docker_exec_env/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:24.04
RUN apt-get update
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
RUN apt-get install -y build-essential git ssh curl clang-14 cmake libssl-dev libssl3 pkg-config libzstd-dev zstd
RUN mkdir -p /opt
ENV RUSTUP_HOME=/opt/rustup
ENV CARGO_HOME=/opt/cargo
RUN RUSTUP_HOME=${RUSTUP_HOME} CARGO_HOME=${CARGO_HOME} bash -c "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
RUN . /opt/cargo/env && rustup default stable
RUN . /opt/cargo/env && cargo install --locked maturin
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
RUN . /opt/cargo/env && rustup toolchain list
RUN apt-get install -y zlib1g-dev zlib1g libdeflate-dev
RUN cd /opt/ && git clone --recursive https://github.com/cschin/pgr-tk.git
RUN cd /opt/pgr-tk/ && . /opt/cargo/env && cargo install maturin && bash build.sh
RUN apt-get install -y python3-pip python3.12-venv
RUN cd /opt/pgr-tk/ && python3 -m venv /opt/pgr-tk-py/ && /opt/pgr-tk-py/bin/pip install target/wheels/pgrtk-0.6.0-cp312-cp312-linux_x86_64.whl numpy
RUN cd /opt/pgr-tk/ && . /opt/cargo/env && cargo clean
ENV PATH="/opt/cargo/bin:$PATH"

--------------------------------------------------------------------------------
/docker_exec_env/build.sh:
--------------------------------------------------------------------------------
docker build -t cschin/pgrtk_env:latest .

--------------------------------------------------------------------------------
/docs_src/alnmap_formap.md:
--------------------------------------------------------------------------------

The command line tool `pgr-alnmap` can generate an alignment map between a set of assembled contigs
and a reference file. It creates a number of files that are useful for downstream analysis.
This document describes the format of the generated files.

## `*.alnmap` files

A `*.alnmap` file contains the blocks mapped from the assembly contigs (also called the "query") to
the reference file (also called the "target") by the PGR-TK-WGS whole genome alignment code. (We will
describe the algorithm behind it in a separate document.) The file contains chains of alignments. Each
chain is a set of blocks.

Each chain of align blocks has an integer id, which is the first field of the line (also called a record.)
The second field is the record type:

- "B": the record represents the beginning of a chain of align blocks.
- "E": the record represents the end of a chain of align blocks.
- "M": the record represents a full alignment (no variant) block.
- "M_D": the record represents a full alignment (no variant) block; however, two or more query blocks are mapped to the same target block.
- "M_O": the record represents a full alignment (no variant) block; however, two or more query blocks are mapped to the same target block from overlapped alignment chains.
- "V": the record represents variants between the query and the target; the variant information is appended.
- "V_D": the record represents variants between the query and the target; the variant information is appended. There are other query blocks mapping to the same target block.
- "V_O": the record represents variants between the query and the target; the variant information is appended. There are other query blocks mapping to the same target block from overlapped alignment chains.
- "S": the record represents potential structural variants between the query and the target block.
- "S_D": the record represents potential structural variants between the query and the target block. There are other query blocks mapping to the same target block.
- "S_O": the record represents potential structural variants between the query and the target block. There are other query blocks mapping to the same target block from overlapped alignment chains.

All records share the following nine common fields, separated by tabs:

`aligned_chain_id, block_type, target_name, target_start, target_end, query_name, query_start, query_end, query_strand`


The following command generates the uniquely mapped blocks:

```
cat grch38_to_chm13.alnmap | awk '$2 == "V" || $2 =="M" || $2 == "S" ' | cut -f1-9 | sort -k3,3 -k4,4n -u > grch38_to_chm13_unique_blocks.alnmap

```

and this one the duplicated mapped blocks:

```
cat grch38_to_chm13.alnmap | awk '$2 == "V_D" || $2 =="M_D" || $2 == "S_D" ' | cut -f1-9 | sort -k3,3 -k4,4n -u > grch38_to_chm13_dup_blocks.alnmap
```

For the "V", "V_D" and "V_O" records, six additional fields are appended:
(`variant_position_in_the_target_block`, `variant_position_in_the_query_block`, `variant_position_in_the_target_sequence`, `variant_type`, `ref_seq`, `variant_seq`)

For the "B" and "E" records, two additional fields are appended: `query_sequence_length`, `the_alignment_orientation_of_the_contig`.

For the "S", "S_D" and "S_O" records, two additional fields are appended: `the_alignment_orientation_of_the_contig`, `sv_candidate_type`.
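
As an illustration, here is a minimal Python sketch that collects the variant ("V", "V_D", "V_O") records using the field layout described above. The column unpacking and the numeric conversions are assumptions based on this document, not a shipped API:

```python
# Minimal sketch: collect variant records from a *.alnmap file.
# Assumes the nine common fields described above, followed by the six
# appended variant fields for "V", "V_D", and "V_O" records.
import csv

def read_variants(alnmap_path):
    variants = []
    with open(alnmap_path) as f:
        for row in csv.reader(f, delimiter="\t"):
            if row[1] not in ("V", "V_D", "V_O"):
                continue
            target_name, target_start, target_end = row[2], int(row[3]), int(row[4])
            query_name, query_strand = row[5], row[8]
            # the six appended variant fields
            t_block_pos, q_block_pos = row[9], row[10]
            t_seq_pos = int(row[11])  # assumed to be an integer coordinate
            variant_type, ref_seq, variant_seq = row[12], row[13], row[14]
            variants.append((target_name, t_seq_pos, variant_type,
                             ref_seq, variant_seq, query_name, query_strand))
    return variants

if __name__ == "__main__":
    for v in read_variants("grch38_to_chm13.alnmap")[:10]:
        print(*v, sep="\t")
```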

--------------------------------------------------------------------------------
/images/AMY1A_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/images/AMY1A_example.png

--------------------------------------------------------------------------------
/images/PGR_TK_Sketch_MAPG_construction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/images/PGR_TK_Sketch_MAPG_construction.png

--------------------------------------------------------------------------------
/justfile:
--------------------------------------------------------------------------------
build_no_agc:
    #!/usr/bin/env bash
    #pushd WFA2-lib
    #make all
    #popd

    rustup default stable
    cargo build -p pgr-db --release --no-default-features
    cargo build -p pgr-bin --release --no-default-features
    cargo install --path pgr-bin --no-default-features

    pushd pgr-tk/
    maturin build --release --no-default-features
    maturin build --release --skip-auditwheel --no-default-features
    popd

install_bin_no_agc:
    cargo install --path pgr-bin --no-default-features

install_bin:
    cargo install --path pgr-bin/

build:
    #!/usr/bin/env bash
    rustup default stable

    ## if necessary, you can install libclang / clang using Anaconda
    ## and set LIBCLANG_PATH to point to the libclang for the cbindgen dependency clang-sys
    # export LIBCLANG_PATH=$HOME/miniconda3/lib

    ## if necessary, install maturin with `cargo install --locked maturin`
    # cargo install --locked maturin

    cargo build -p pgr-db --release
    cargo build -p pgr-bin --release
    cargo install --path pgr-bin

    pushd pgr-tk/
    maturin build --release
    maturin build --release --skip-auditwheel
    popd

--------------------------------------------------------------------------------
/pgr-bin/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "pgr-bin"
version = "0.6.0"
edition = "2021"
authors = ["Jason Chin "]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
pgr-db = {path = "../pgr-db", default-features = false }
flate2 = { version = "1.0.17", features = ["zlib-ng-compat"], default-features = false }
log = { version = "^0.4.5", features = ["std"] }
clap = { version = "4.0.30", features = ["derive"] }
rustc-hash = "1.1.0"
rayon = "1.5.2"
regex = "1"
svg = "0.16"
kodama = "0.2.3"
memmap2 = "0.5.10"
bincode = { version = "2.0.0-rc.1", features = ["alloc"] }
serde_json = "1.0.96"
serde = "1.0.163"
iset = "0.2.2"

[features]
default = ["with_agc"]
with_agc = ["pgr-db/with_agc"]

--------------------------------------------------------------------------------
/pgr-bin/build.rs:
--------------------------------------------------------------------------------
// from https://vallentin.dev/2019/06/06/versioning

use std::env::consts::{ARCH, OS};
use std::process::Command;

#[cfg(debug_assertions)]
const BUILD_TYPE: &str = "debug";
#[cfg(not(debug_assertions))]
const BUILD_TYPE: &'static str = "release";
"release"; 10 | 11 | fn main() { 12 | let branch_name = get_branch_name(); 13 | if branch_name != *"bioconda" { 14 | let version_string = format!( 15 | "{} {} ({}:{}{}, {} build, {} [{}] [{}])", 16 | env!("CARGO_PKG_NAME"), 17 | env!("CARGO_PKG_VERSION"), 18 | get_branch_name(), 19 | get_commit_hash(), 20 | if is_working_tree_clean() { "" } else { "+" }, 21 | BUILD_TYPE, 22 | OS, 23 | ARCH, 24 | get_rustc_version() 25 | ); 26 | 27 | println!("cargo:rustc-env=VERSION_STRING={}", version_string); 28 | } else { 29 | let version_string = format!( 30 | "{} {} (bioconda {} build, {} [{}] [{}])", 31 | env!("CARGO_PKG_NAME"), 32 | env!("CARGO_PKG_VERSION"), 33 | BUILD_TYPE, 34 | OS, 35 | ARCH, 36 | get_rustc_version() 37 | ); 38 | println!("cargo:rustc-env=VERSION_STRING={}", version_string); 39 | } 40 | } 41 | 42 | fn get_rustc_version() -> String { 43 | let output = Command::new("rustc") 44 | .arg("--version") 45 | .current_dir(env!("CARGO_MANIFEST_DIR")) 46 | .output() 47 | .unwrap(); 48 | 49 | assert!(output.status.success()); 50 | 51 | String::from_utf8_lossy(&output.stdout) 52 | .trim_end() 53 | .to_string() 54 | } 55 | 56 | fn get_commit_hash() -> String { 57 | let output = Command::new("git") 58 | .arg("log") 59 | .arg("-1") 60 | .arg("--pretty=format:%h") // Abbreviated commit hash 61 | // .arg("--pretty=format:%H") // Full commit hash 62 | .current_dir(env!("CARGO_MANIFEST_DIR")) 63 | .output() 64 | .unwrap(); 65 | 66 | // assert!(output.status.success()); 67 | if output.status.success() { 68 | String::from_utf8_lossy(&output.stdout).to_string() 69 | } else { 70 | String::from("bioconda") 71 | } 72 | } 73 | 74 | fn get_branch_name() -> String { 75 | let output = Command::new("git") 76 | .arg("rev-parse") 77 | .arg("--abbrev-ref") 78 | .arg("HEAD") 79 | .current_dir(env!("CARGO_MANIFEST_DIR")) 80 | .output() 81 | .unwrap(); 82 | 83 | //assert!(output.status.success()); 84 | if output.status.success() { 85 | String::from_utf8_lossy(&output.stdout) 86 | .trim_end() 87 | .to_string() 88 | } else { 89 | String::from("bioconda") 90 | } 91 | } 92 | 93 | fn is_working_tree_clean() -> bool { 94 | let status = Command::new("git") 95 | .arg("diff") 96 | .arg("--quiet") 97 | .arg("--exit-code") 98 | .current_dir(env!("CARGO_MANIFEST_DIR")) 99 | .status() 100 | .unwrap(); 101 | 102 | if status.success() { 103 | status.code().unwrap() == 0 104 | } else { 105 | true 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /pgr-bin/file_format_documents/ctgsv.bed.md: -------------------------------------------------------------------------------- 1 | # ctgsv.bed File Format 2 | 3 | The "ctgsv.bed" file is a tab-separated values (TSV) file that contains information about query contig alignments and potential structural variations. Each line in the file represents a region of interest in the query sequence. 4 | 5 | ## File Structure 6 | 7 | Each line in the file consists of four columns: 8 | 9 | 1. Query Name 10 | 2. Start Position 11 | 3. End Position 12 | 4. Annotation 13 | 14 | ## Column Descriptions 15 | 16 | 1. **Query Name**: The name or identifier of the query contig. 17 | 18 | 2. **Start Position**: The starting position of the region in the query contig (0-based). 19 | 20 | 3. **End Position**: The ending position of the region in the query contig (exclusive). 21 | 22 | 4. **Annotation**: A string containing information about the region, with fields separated by colons. 
   The annotation format is as follows:

   `<type>:<target_name>:<target_start>-<target_end>:<orientation>:<ctg_orientation>:<additional_info>`

   - `<type>`: Indicates the type of region:
     - `QG`: Query Gap
     - `QD`: Query Duplicate
     - `QO`: Query Overlap

   - `<target_name>`: The name of the target sequence this region aligns to. (In the examples below it is written as `<previous_anchor>><target_name>`, recording the preceding aligned target as well.)

   - `<target_start>-<target_end>`: The start and end positions in the target sequence.

   - `<orientation>`: The orientation of the alignment (0 for forward, 1 for reverse).

   - `<ctg_orientation>`: The overall orientation of the contig (0 for forward, 1 for reverse).

   - `<additional_info>`: Any additional information (may vary depending on the type).

## Example

```
contig1 0 1000 QG:BGN>chr1:500-1500:0:0:32000
contig1 1000 2000 QD:chr1>chr2:1000-2000:1:0:32000
contig1 2000 3000 QO:chr2>chr3:2000-3000:0:0:32000
contig1 3000 32000 QG:chr3>END
```

In this example:
- The first line shows a query gap at the beginning of contig1, aligning to chr1.
- The second line indicates a duplicated region in contig1, aligning to chr2 in reverse orientation.
- The third line shows an overlapping region in contig1, aligning to chr3.
- The last line represents the end of the contig, with a gap from the last alignment to the end of the sequence.

This file format provides a comprehensive view of how query contigs align to the target sequences, highlighting potential structural variations, duplications, and gaps in the query assembly.
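
The colon-separated annotation can be unpacked programmatically. Below is a small Python sketch based on the layout above; the `<previous_anchor>><target_name>` convention is an assumption drawn from the examples, and splitting naively on ":" is safe only because none of the fields here contain extra colons:

```python
# Sketch: unpack the 4th column of a ctgsv.bed record.
def parse_ctgsv_annotation(annotation):
    parts = annotation.split(":")
    region_type, target = parts[0], parts[1]
    prev_anchor, _, target_name = target.partition(">")
    record = {"type": region_type, "prev_anchor": prev_anchor,
              "target_name": target_name}
    if len(parts) >= 6:  # "QG:chr3>END"-style records carry no coordinates
        start, _, end = parts[2].partition("-")
        record.update(
            target_start=int(start), target_end=int(end),
            orientation=int(parts[3]), ctg_orientation=int(parts[4]),
            additional_info=parts[5],
        )
    return record

print(parse_ctgsv_annotation("QD:chr1>chr2:1000-2000:1:0:32000"))
```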

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/for_pgr-alnmap/alnmap.md:
--------------------------------------------------------------------------------
# alnmap Output File Documentation

The alnmap output file contains detailed information about the alignment between a reference sequence and query sequences. Each line in the file represents a different type of record, with fields separated by tabs.

## Record Types

1. **Begin Record (B)**
   Format: `<aln_idx>\tB\t<target_name>\t<target_start>\t<target_end>\t<query_name>\t<query_start>\t<query_end>\t<orientation>\t<query_length>\t<contig_orientation>\t<target_duplicate>\t<target_overlap>\t<query_duplicate>\t<query_overlap>`

2. **End Record (E)**
   Format: `<aln_idx>\tE\t<target_name>\t<target_start>\t<target_end>\t<query_name>\t<query_start>\t<query_end>\t<orientation>\t<query_length>\t<contig_orientation>`

3. **Match Record (M, M_D, or M_O)**
   Format: `<aln_idx>\t<match_type>\t<target_name>\t<target_start>\t<target_end>\t<query_name>\t<query_start>\t<query_end>\t<orientation>`

4. **SV Candidate Record (S, S_D, or S_O)**
   Format: `<aln_idx>\t<sv_type>\t<target_name>\t<target_start>\t<target_end>\t<query_name>\t<query_start>\t<query_end>\t<orientation>\t<contig_orientation>\t<diff_type>`

5. **Variant Record (V, V_D, or V_O)**
   Format: `<aln_idx>\t<variant_type>\t<target_name>\t<target_start>\t<target_end>\t<query_name>\t<query_start>\t<query_end>\t<orientation>\t<target_diff>\t<query_diff>\t<target_coord>\t<variant_type>\t<target_variant_seq>\t<query_variant_seq>`

## Field Descriptions

- `aln_idx`: Alignment index (6-digit zero-padded number)
- `target_name`: Name of the reference sequence
- `target_start`: Start position in the reference sequence
- `target_end`: End position in the reference sequence
- `query_name`: Name of the query sequence
- `query_start`: Start position in the query sequence
- `query_end`: End position in the query sequence
- `orientation`: Orientation of the alignment (0 for forward, 1 for reverse)
- `query_length`: Length of the query sequence
- `contig_orientation`: Orientation of the contig
- `target_duplicate`: Whether the target region is duplicated (1) or not (0)
- `target_overlap`: Whether the target region overlaps (1) or not (0)
- `query_duplicate`: Whether the query region is duplicated (1) or not (0)
- `query_overlap`: Whether the query region overlaps (1) or not (0)
- `match_type`: Type of match (M, M_D, or M_O)
- `sv_type`: Type of structural variant candidate (S, S_D, or S_O)
- `diff_type`: Type of difference (A: FailAln, E: FailEndMatch, S: FailShortSeq, L: FailLengthDiff, U: Unknown)
- `variant_type`: Type of variant (V, V_D, or V_O)
- `target_diff`: Difference in the target sequence
- `query_diff`: Difference in the query sequence
- `target_coord`: Coordinate in the target sequence
- `variant_type`: Type of variant (single character)
- `target_variant_seq`: Variant sequence in the target
- `query_variant_seq`: Variant sequence in the query

## Notes

- Records with `_D` suffix indicate duplicated regions
- Records with `_O` suffix indicate overlapping regions
- The order of records in the file follows the alignment process

This documentation should help users understand the structure and content of the alnmap output file generated by the provided code.

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/for_pgr-alnmap/ctgmap.bed.md:
--------------------------------------------------------------------------------
# *.ctgmap.bed File Format

The *.ctgmap.bed file is a tab-separated values (TSV) file that describes the alignment of query contigs to a reference genome. It follows a modified BED format with additional fields to provide detailed information about the alignments.

## File Structure

Each line in the file represents a single alignment between a query contig and the reference genome. The fields are separated by tabs and are ordered as follows:

1. Reference sequence name
2. Start position on the reference (0-based)
3. End position on the reference (exclusive)
4. Additional information (colon-separated)

## Fields

1. **Reference sequence name**: The name of the reference sequence (chromosome or scaffold).

2. **Start position**: The start position of the alignment on the reference sequence (0-based).

3. **End position**: The end position of the alignment on the reference sequence (exclusive).

4. **Additional information**: A colon-separated string containing the following fields:
   a. Query sequence name
   b. Start position on the query sequence
   c. End position on the query sequence
   d. Query contig length
   e. Orientation (0 for forward, 1 for reverse)
   f. Contig orientation (0 for forward, 1 for reverse)
   g. Target duplication flag (0 for unique, 1 for duplicated)
   h. Target overlap flag (0 for non-overlapping, 1 for overlapping)
   i. Query duplication flag (0 for unique, 1 for duplicated)
   j. Query overlap flag (0 for non-overlapping, 1 for overlapping)

## Example

```
chr1 1000 2000 contig1:500:1500:3000:0:0:0:0:0:0
```

This example line can be interpreted as follows:
- The alignment is on reference sequence "chr1" from position 1000 to 2000.
- The query contig name is "contig1".
- The alignment covers positions 500 to 1500 on the query contig.
- The total length of the query contig is 3000 base pairs.
- The alignment is in the forward orientation (0) for both the reference and the query.
- The alignment is unique and non-overlapping on both the reference and the query (all flags are 0).

## Usage

This file format is useful for visualizing and analyzing the alignment of query contigs to a reference genome. It can be used to identify structural variations, assess the quality of genome assemblies, and compare different genome versions or assemblies.
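
To make the column order concrete, here is a short Python sketch that reads a `*.ctgmap.bed` file and reports, for each query contig, the fraction of its length covered by alignments. The input file name is a placeholder, and the simple sum ignores overlaps between alignments:

```python
# Sketch: per-contig aligned fraction from a *.ctgmap.bed file.
from collections import defaultdict

def aligned_fraction(path):
    covered = defaultdict(int)
    ctg_len = {}
    with open(path) as f:
        for line in f:
            ref_name, ref_start, ref_end, info = line.rstrip("\n").split("\t")
            fields = info.split(":")
            q_name = fields[0]
            q_start, q_end, q_len = int(fields[1]), int(fields[2]), int(fields[3])
            covered[q_name] += q_end - q_start  # ignores overlapping alignments
            ctg_len[q_name] = q_len
    return {name: covered[name] / ctg_len[name] for name in ctg_len}

for name, frac in sorted(aligned_fraction("asm_to_ref.ctgmap.bed").items()):
    print(f"{name}\t{frac:.3f}")
```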

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/for_pgr-alnmap/ctgmap.json.md:
--------------------------------------------------------------------------------
# ctgmap.json Documentation

The `ctgmap.json` file contains information about contig mappings between a reference genome and query contigs. It is generated by the `pgr-alnmap` tool and provides detailed alignment information in a JSON format.

## File Structure

The `ctgmap.json` file contains a single JSON object with the following structure:

```json
{
  "records": [...],
  "query_length": [...],
  "target_length": [...]
}
```

### Records

The `records` field is an array of objects, where each object represents a contig mapping. Each mapping object has the following structure:

```json
{
  "t_name": "string",
  "ts": number,
  "te": number,
  "q_name": "string",
  "qs": number,
  "qe": number,
  "ctg_len": number,
  "orientation": number,
  "ctg_orientation": number,
  "t_dup": boolean,
  "t_ovlp": boolean,
  "q_dup": boolean,
  "q_ovlp": boolean
}
```

- `t_name`: Name of the target (reference) sequence
- `ts`: Start position of the alignment on the target sequence
- `te`: End position of the alignment on the target sequence
- `q_name`: Name of the query contig
- `qs`: Start position of the alignment on the query contig
- `qe`: End position of the alignment on the query contig
- `ctg_len`: Length of the query contig
- `orientation`: Orientation of the alignment (0 for forward, 1 for reverse)
- `ctg_orientation`: Orientation of the entire contig alignment
- `t_dup`: Boolean indicating if the target region is duplicated
- `t_ovlp`: Boolean indicating if the target region overlaps with other alignments
- `q_dup`: Boolean indicating if the query region is duplicated
- `q_ovlp`: Boolean indicating if the query region overlaps with other alignments

### Query Length

The `query_length` field is an array of tuples containing information about query contig lengths:

```json
[
  [id, "name", length],
  ...
]
```

- `id`: Numeric identifier for the query contig
- `name`: Name of the query contig
- `length`: Length of the query contig in base pairs

### Target Length

The `target_length` field is an array of tuples containing information about target (reference) sequence lengths:

```json
[
  [id, "name", length],
  ...
]
```

- `id`: Numeric identifier for the target sequence
- `name`: Name of the target sequence
- `length`: Length of the target sequence in base pairs

## Usage

The `ctgmap.json` file can be used for various downstream analyses, including:

1. Visualizing contig alignments
2. Identifying potential structural variations
3. Assessing the quality of genome assemblies
4. Comparing different assemblies or genome versions

To work with the `ctgmap.json` file, you can use any JSON parser in your preferred programming language to load and process the data.

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/for_pgr-alnmap/svcnd.bed.md:
--------------------------------------------------------------------------------
The "svcnd.bed" file is a BED (Browser Extensible Data) format file that contains information about structural variant candidates (SVCs) and alignment regions. Each line in the file represents a feature and has four tab-separated fields:

1. Chromosome/Contig name (target sequence name)
2. Start position (0-based)
3. End position (1-based)
4. Feature annotation

The feature annotation field contains detailed information about the SV candidate or alignment region. It can have different formats depending on the type of feature:

1. SV Candidates:
   Format: `<SVC_TYPE>:<QUERY_NAME>:<QUERY_START>-<QUERY_END>:<ORIENTATION>:<CTG_ORIENTATION>:<DIFF_TYPE>`

   - SVC_TYPE: Can be "SVC" (regular SV candidate), "SVC_D" (SV in duplicated region), or "SVC_O" (SV in overlapped region)
   - QUERY_NAME: Name of the query sequence
   - QUERY_START and QUERY_END: Start and end positions in the query sequence
   - ORIENTATION: Alignment orientation (0 for forward, 1 for reverse)
   - CTG_ORIENTATION: Contig orientation
   - DIFF_TYPE: Type of difference ('A' for alignment failure, 'E' for end mismatch, 'S' for short sequence, 'L' for length difference)

2. Target Alignment Regions:
   Format: `<TYPE>:<PREV_CTG>><NEXT_CTG>:<QUERY_START>:<QUERY_END>:<CTG_LEN>:<ORIENTATION>:<CTG_ORIENTATION>`

   - TYPE: Can be "TG" (gap), "TD" (duplication), or "TO" (overlap)
   - PREV_CTG and NEXT_CTG: Names of the previous and next contigs
   - QUERY_START and QUERY_END: Start and end positions in the query sequence
   - CTG_LEN: Length of the contig
   - ORIENTATION: Alignment orientation
   - CTG_ORIENTATION: Contig orientation

The "svcnd.bed" file combines information about structural variant candidates and alignment regions, providing a comprehensive view of potential genomic variations and how query sequences align to the target reference. This format allows for easy visualization in genome browsers and can be used for further analysis of structural variations and alignment characteristics.
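
For orientation, the sketch below tallies SV-candidate records per target sequence and per `DIFF_TYPE`, following the annotation layout just described; the input file name is a placeholder:

```python
# Sketch: count SV candidates per target and difference type in a svcnd.bed file.
from collections import Counter

counts = Counter()
with open("sample.svcnd.bed") as f:
    for line in f:
        target, start, end, annotation = line.rstrip("\n").split("\t")
        fields = annotation.split(":")
        if fields[0] in ("SVC", "SVC_D", "SVC_O"):
            diff_type = fields[-1]  # 'A', 'E', 'S', or 'L'
            counts[(target, fields[0], diff_type)] += 1

for (target, svc_type, diff_type), n in sorted(counts.items()):
    print(target, svc_type, diff_type, n, sep="\t")
```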

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/gfa_format.md:
--------------------------------------------------------------------------------
# GFA Output Format in PGR-TK

## Overview

PGR-TK generates Graph Fragment Assembly (GFA) files to represent pangenome assembly graphs. These graphs are based on the Minimizer Anchored Pangenome (MAP) concept, which uses shimmer pairs (minimizers) to anchor paths through the genome.

Two main types of GFA files are produced:
- **MAP Graph GFA** (.mapg.gfa) - representing the full minimizer-anchored paths graph
- **Principal MAP Graph GFA** (.pmapg.gfa) - representing a simplified version with principal bundles

## File Format Specification

GFA files in PGR-TK follow the standard GFA specification with customizations for MAP graphs:

### Header Line (H)
```
H  VN:Z:1.0  CM:Z:Sparse Genome Graph Generated By pgr-tk
```
- `VN:Z:1.0` - Version number of the format
- `CM:Z:...` - Comment describing the file

### Segments (S lines)
Segments represent nodes in the graph:

```
S  <segment_id>  *  LN:i:<length>  SN:Z:<shimmer_pair>  [BN:i:<bundle_id>  BP:i:<bundle_pos>]
```

Fields:
- `<segment_id>` - Numerical identifier for the segment
- `*` - Placeholder for sequence (not stored in this representation)
- `LN:i:<length>` - Length of the segment (average length plus k-mer size)
- `SN:Z:<shimmer_pair>` - Hexadecimal representation of the shimmer pair (hash0_hash1)

Optional fields for principal bundle GFA:
- `BN:i:<bundle_id>` - Principal bundle identifier
- `BP:i:<bundle_pos>` - Position within the bundle

### Links (L lines)
Links represent edges connecting segments:

```
L  <segment_id1>  <orientation1>  <segment_id2>  <orientation2>  <cigar>  SC:i:<count>
```

Fields:
- `<segment_id1>`, `<segment_id2>` - IDs of connected segments
- `<orientation1>`, `<orientation2>` - Orientation (`+` or `-`)
- `<cigar>` - CIGAR string (typically `kM` where k is the k-mer size)
- `SC:i:<count>` - Number of sequences supporting this connection

## MAP Graph Index Format (.mapg.idx)

The MAP Graph index (.mapg.idx) file complements the GFA representation with a tab-delimited text format. Each line begins with a single character that identifies the record type:

### K Record (SHIMMER Parameters)
```
K\t<w>\t<k>\t<r>\t<min_span>\t<sketch>
```

Fields:
- `w` - Window size parameter (integer)
- `k` - K-mer size parameter (integer)
- `r` - Reduction factor parameter (integer)
- `min_span` - Minimum span length parameter (integer)
- `sketch` - Boolean flag (true/false) indicating if sketching was used

### C Record (Contig Information)
```
C\t<id>\t<contig_name>\t<source>\t<length>
```

Fields:
- `id` - Unique identifier for the contig (integer)
- `contig_name` - Name of the contig
- `source` - Source sample name or "NA" if not available
- `length` - Length of the contig (integer)

### F Record (Fragment Information)
```
F\t<shimmer_pair_id>\t<seq_id>\t<direction>\t<start>\t<end>\t<fragment_id>
```

Fields:
- `shimmer_pair_id` - SHIMMER pair identifier in hex format (e.g., "0123456789ABCDEF_0123456789ABCDEF")
- `seq_id` - Sequence identifier (integer)
- `direction` - Direction/orientation of the fragment (integer)
- `start` - Start position of the fragment (integer)
- `end` - End position of the fragment (integer)
- `fragment_id` - Fragment identifier (integer)

The index file allows for efficient loading and querying of the graph structure without having to parse the entire GFA file, which can be much larger.
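
As a usage illustration, this Python sketch parses the three record types of a `.mapg.idx` file into simple structures, following the tab-separated layouts above. The field names mirror this document, and the file name is a placeholder:

```python
# Sketch: read K/C/F records from a .mapg.idx file.
def read_mapg_idx(path):
    params, contigs, fragments = {}, {}, []
    with open(path) as f:
        for line in f:
            rec = line.rstrip("\n").split("\t")
            if rec[0] == "K":
                w, k, r, min_span, sketch = rec[1:6]
                params = dict(w=int(w), k=int(k), r=int(r),
                              min_span=int(min_span), sketch=(sketch == "true"))
            elif rec[0] == "C":
                cid, name, source, length = rec[1:5]
                contigs[int(cid)] = (name, source, int(length))
            elif rec[0] == "F":
                pair_id, seq_id, direction, start, end, frag_id = rec[1:7]
                fragments.append((pair_id, int(seq_id), int(direction),
                                  int(start), int(end), int(frag_id)))
    return params, contigs, fragments

params, contigs, fragments = read_mapg_idx("sample.mapg.idx")
print(params, len(contigs), "contigs,", len(fragments), "fragments")
```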

## Use Cases

- Representing complex genomic regions across multiple samples
- Analyzing structural variations and sequence relationships
- Visualizing pangenome structure
- Identifying shared genomic regions (principal bundles)

The GFA format enables visualization and algorithmic analysis of the graph structure, facilitating exploration of sequence relationships across multiple genomes, particularly in repetitive and structurally complex regions.

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/input_file_formats_for_pgr-pbundle-bed.md:
--------------------------------------------------------------------------------
# Input File Formats for pgr-pbundle-bed2svg

## 1. Principal Bundle BED File (Required)

This is the main input file containing bundle information.

- Format: Tab-separated values
- Each line represents a bundle segment
- Fields:
  1. Contig name
  2. Start position
  3. End position
  4. Bundle information (colon-separated):
     - Bundle ID
     - Bundle vertex count (not used)
     - Bundle direction (0 or 1)
     - Bundle vertex start (not used)
     - Bundle vertex end (not used)

Example:
```
ctg1 1000 2000 1:5:0:1000:2000
ctg1 3000 4000 2:3:1:3000:4000
```

## 2. Annotation File (Optional)

Provides additional annotation text for each contig.

- Format: Tab-separated values
- Fields:
  1. Contig name
  2. Annotation text

Example:
```
ctg1 Annotation for contig 1
ctg2 Annotation for contig 2
```

## 3. Annotation Region BED File (Optional)

Defines regions for annotation tracks.

- Format: Tab-separated values
- Fields:
  1. Contig name
  2. Start position
  3. End position
  4. Title
  5. Color

Example:
```
ctg1 5000 6000 Region_A #FF0000
ctg1 7000 8000 Region_B #00FF00
```


## 4. Offset File (Optional)

Provides offset values for each contig.

- Format: Tab-separated values
- Fields:
  1. Contig name
  2. Offset value (integer)

Example:
```
ctg1 1000
ctg2 -500
```


## 5. Dendrogram File (Optional)

Describes the hierarchical clustering of contigs.

- Format: Tab-separated values
- Three types of lines:
  1. Leaf nodes (L):
     - L [node_id] [contig_name]
  2. Internal nodes (I):
     - I [node_id] [child_node0] [child_node1] [node_size] [node_height]
  3. Node positions (P):
     - P [node_id] [node_position] [node_height] [node_size]

Example:
```
L 1 ctg1
L 2 ctg2
I 3 1 2 2 0.5
P 1 0.0 0.0 1
P 2 1.0 0.0 1
P 3 0.5 0.5 2
```
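
The dendrogram file can be loaded into a small tree structure for plotting or ordering. The following Python sketch follows the L/I/P record definitions above; the file name is illustrative:

```python
# Sketch: parse a dendrogram file with L (leaf), I (internal), and P (position) records.
def read_dendrogram(path):
    leaves, internals, positions = {}, {}, {}
    with open(path) as f:
        for line in f:
            rec = line.split()
            if rec[0] == "L":
                leaves[int(rec[1])] = rec[2]  # node_id -> contig_name
            elif rec[0] == "I":
                # node_id -> (child0, child1, node_size, node_height)
                internals[int(rec[1])] = (int(rec[2]), int(rec[3]),
                                          int(rec[4]), float(rec[5]))
            elif rec[0] == "P":
                # node_id -> (position, height, size)
                positions[int(rec[1])] = (float(rec[2]), float(rec[3]), int(rec[4]))
    return leaves, internals, positions

leaves, internals, positions = read_dendrogram("sample.dendrogram")
print(len(leaves), "leaves,", len(internals), "internal nodes")
```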

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/output_files_for_pgr-pbundle-decomp.md:
--------------------------------------------------------------------------------
# Output Files for pgr-pbundle-decomp

The `pgr-pbundle-decomp` tool generates several output files when decomposing principal bundles from sequence data. Each file serves a specific purpose in the analysis pipeline. Below is a comprehensive description of each output file format:

## 1. Principal Bundle BED File (`[prefix].bed`)

This file contains the principal bundle decomposition results in BED format, which identifies regions in the genome that share similar sequence patterns. For more detailed information, see [Principal Bundle BED File Format](principal_bundle_bed_file.md).

### Format:
```
<contig_name>\t<start>\t<end>\t<bundle_id>:<bundle_size>:<direction>:<start_pos>:<end_pos>:<repeat_flag>
```

### Fields:
- `contig_name`: Name of the contig or chromosome
- `start`: 0-based start position of the bundle on the contig (inclusive)
- `end`: 0-based end position of the bundle on the contig (exclusive)
- `bundle_id`: Unique identifier for the bundle
- `bundle_size`: Number of sequences contained in the bundle
- `direction`: Direction of the bundle (0 for forward, 1 for reverse)
- `start_pos`: Start position within the principal bundle coordinate system
- `end_pos`: End position within the principal bundle coordinate system
- `repeat_flag`: Classification flag - 'R' for repeat regions, 'U' for unique regions

The first line of the file contains a comment with the command used to run the tool.
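
Here is a compact Python sketch that groups the bundle segments of such a BED file by contig, using the nine components described above. It is an illustration of the format with a placeholder file name, not part of the tool suite:

```python
# Sketch: group principal-bundle BED records by contig.
from collections import defaultdict

def read_bundle_bed(path):
    per_ctg = defaultdict(list)
    with open(path) as f:
        for line in f:
            if line.startswith("#"):  # first line holds the generating command
                continue
            ctg, start, end, info = line.rstrip("\n").split("\t")
            bundle_id, bundle_size, direction, bstart, bend, repeat_flag = info.split(":")
            per_ctg[ctg].append((int(start), int(end), int(bundle_id),
                                 int(direction), repeat_flag == "R"))
    return per_ctg

bundles = read_bundle_bed("sample.bed")
for ctg, segments in bundles.items():
    n_repeat = sum(1 for s in segments if s[4])
    print(f"{ctg}: {len(segments)} segments, {n_repeat} in repeat bundles")
```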

## 2. Contig Summary File (`[prefix].ctg.summary.tsv`)

This tab-separated file provides detailed statistical information about bundle distribution across each contig, useful for quantitative analysis. This file is often used in conjunction with the principal bundle BED file for comprehensive analysis.

### Header:
```
#ctg length repeat_bundle_count repeat_bundle_sum repeat_bundle_percentage repeat_bundle_mean repeat_bundle_min repeat_bundle_max non_repeat_bundle_count non_repeat_bundle_sum non_repeat_bundle_percentage non_repeat_bundle_mean non_repeat_bundle_min non_repeat_bundle_max total_bundle_count total_bundle_coverage_percentage
```


### Fields:
- `ctg`: Contig name
- `length`: Total length of the contig in base pairs
- `repeat_bundle_count`: Number of repeat bundles identified
- `repeat_bundle_sum`: Total base pairs covered by repeat bundles
- `repeat_bundle_percentage`: Percentage of contig covered by repeat bundles
- `repeat_bundle_mean`: Mean length of repeat bundles
- `repeat_bundle_min`: Minimum length of repeat bundles
- `repeat_bundle_max`: Maximum length of repeat bundles
- `non_repeat_bundle_count`: Number of non-repeat (unique) bundles
- `non_repeat_bundle_sum`: Total base pairs covered by non-repeat bundles
- `non_repeat_bundle_percentage`: Percentage of contig covered by non-repeat bundles
- `non_repeat_bundle_mean`: Mean length of non-repeat bundles
- `non_repeat_bundle_min`: Minimum length of non-repeat bundles
- `non_repeat_bundle_max`: Maximum length of non-repeat bundles
- `total_bundle_count`: Total number of bundles (repeat + non-repeat)
- `total_bundle_coverage_percentage`: Percentage of contig covered by all bundles

### Example:
```
#ctg length repeat_bundle_count repeat_bundle_sum repeat_bundle_percentage repeat_bundle_mean repeat_bundle_min repeat_bundle_max non_repeat_bundle_count non_repeat_bundle_sum non_repeat_bundle_percentage non_repeat_bundle_mean non_repeat_bundle_min non_repeat_bundle_max total_bundle_count total_bundle_coverage_percentage
chr1 248956422 2156 42568912 17.1 19744 500 125680 5842 192458700 77.3 32943 200 258462 7998 94.4
```

## 3. MAP Graph GFA File (.mapg.gfa)

This file contains the Minimizer Anchor Profile (MAP) graph in GFA (Graphical Fragment Assembly) format. For detailed information about the GFA format used in PGR-TK, see [GFA Output Format in PGR-TK](gfa_format.md).

## 4. MAP Graph Index File (.mapg.idx)

This file contains the index for the MAP graph, enabling efficient querying and traversal. The index format is described in detail in the [GFA Output Format in PGR-TK](gfa_format.md) document.

## 5. Principal MAP Graph GFA File (.pmapg.gfa)

This file contains the principal MAP graph in GFA format, which is a simplified version focusing on the principal bundles. See [GFA Output Format in PGR-TK](gfa_format.md) for format details.

## 6. Principal Bundle Data File (`[prefix].pdb`)

This binary file contains the complete principal bundle data, essential for downstream analysis tools in the PGR-TK suite. It includes:

- SHIMMER parameters used for minimizer generation (k-mer size, window size, reduction factor)
- Bundle information with coordinate mappings between reference and bundle spaces
- Fragment boundary information
- Bundle connectivity data
- Sequence mapping metadata

## Related Documentation

- [Principal Bundle BED File Format](principal_bundle_bed_file.md) - Detailed explanation of the principal bundle BED file format
- [Input File Formats for pgr-pbundle-bed2svg](input_file_formats_for_pgr-pbundle-bed.md) - Documentation for the input files required by pgr-pbundle-bed2svg
- [GFA Output Format in PGR-TK](gfa_format.md) - Information about the GFA format used in PGR-TK
- [Contig SV BED Format](ctgsv.bed.md) - Documentation for the contig SV BED format, another important file format in the PGR-TK ecosystem

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/principal_bundle_bed_file.md:
--------------------------------------------------------------------------------
# Principal Bundle BED File Format

## Overview

This document describes the format of principal bundle BED files generated by the `pgr-pbundle-decomp` tool. These files contain decomposition information about genomic regions organized into principal bundles, which represent similar sequence regions across different contigs or assemblies.

## File Format

The principal bundle BED file is a tab-separated text file where each line represents a bundle region in a contig.

### Header

The file begins with a comment line containing the command used to generate the file:
```
# cmd: <command_used_to_generate_the_file>
```

### Data Columns

Each subsequent line contains the following tab-separated fields:

1. **Contig Name** (string)
   - The name of the contig or sequence

2. **Start Position** (integer)
   - The 0-based start position of the bundle region on the contig

3. **End Position** (integer)
   - The end position of the bundle region on the contig

4. **Bundle Information** (string)
   - A colon-separated string containing the following components:

   a. **Bundle ID** (integer)
      - The unique identifier for the bundle

   b. **Bundle Size** (integer)
      - The total size of the bundle in base pairs

   c. **Direction** (0 or 1)
      - 0: Forward direction
      - 1: Reverse direction (reverse complement)

   d.
**Start Position in Bundle** (integer) 45 | - The start position of this region within the bundle coordinates 46 | 47 | e. **End Position in Bundle** (integer) 48 | - The end position of this region within the bundle coordinates 49 | 50 | f. **Repeat Status** (R or U) 51 | - R: Repeat region 52 | - U: Unique (non-repeat) region 53 | 54 | ## Example 55 | 56 | ``` 57 | contig1 1000 2000 42:5000:0:100:200:U 58 | ``` 59 | 60 | This line indicates: 61 | - Region is on `contig1` 62 | - Region spans from position 1000 to 2000 on the contig 63 | - Region belongs to bundle with ID 42 64 | - The total bundle size is 5000 bp 65 | - Region is in the forward direction (0) 66 | - Within the bundle, the region spans from position 100 to 200 67 | - The region is unique (U), not a repeat 68 | 69 | ## Usage 70 | 71 | Principal bundle BED files are typically used for: 72 | - Analyzing sequence similarity across different assemblies 73 | - Identifying structural variants 74 | - Visualizing genome alignments 75 | - Mapping coordinates between different assemblies 76 | 77 | For related tools that work with these files, see: 78 | - `pgr-pbundle-bed2svg`: Generates SVG visualizations from bundle BED files 79 | - `pgr-pbundle-bed2dist`: Calculates distances between bundle regions 80 | - `pgr-pbundle-bed2sorted`: Sorts bundle BED files for more efficient processing 81 | - `pgr-pbundle-bed2offset`: Computes coordinate offsets between bundle regions -------------------------------------------------------------------------------- /pgr-bin/src/_bin/README.txt: -------------------------------------------------------------------------------- 1 | experimental binaries 2 | -------------------------------------------------------------------------------- /pgr-bin/src/_bin/pgr-fasta-smp-count.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 2 | 3 | //use std::path::PathBuf; 4 | use clap::{self, CommandFactory, Parser}; 5 | 6 | use pgr_db::shmmrutils::ShmmrSpec; 7 | use std::fs::File; 8 | use std::io::{BufWriter, Write}; 9 | 10 | use pgr_db::seq_db; 11 | 12 | #[derive(Parser, Debug)] 13 | #[clap(name = "pgr-seq-smp-count")] 14 | #[clap(author, version)] 15 | #[clap(about = "count shimmer pairs from fasta files", long_about = None)] 16 | struct CmdOptions { 17 | #[clap(long, short)] 18 | in_fasta: String, 19 | #[clap(long, short)] 20 | output_path: String, 21 | //max_unique_count 22 | #[clap(long, short, default_value_t = 4)] 23 | min_count: usize, 24 | #[clap(long, short, default_value_t = 31)] 25 | w: u32, 26 | #[clap(long, short, default_value_t = 31)] 27 | k: u32, 28 | #[clap(long, short, default_value_t = 1)] 29 | r: u32, 30 | #[clap(long, default_value_t = 0)] 31 | min_span: u32, 32 | } 33 | 34 | fn main() -> Result<(), std::io::Error> { 35 | CmdOptions::command().version(VERSION_STRING).get_matches(); 36 | let args = CmdOptions::parse(); 37 | let filepath = args.in_fasta; 38 | let spec = ShmmrSpec { 39 | w: args.w, 40 | k: args.k, 41 | r: args.r, 42 | min_span: args.min_span, 43 | sketch: false, 44 | }; 45 | let mut sdb = seq_db::CompactSeqDB::new(spec.clone()); 46 | sdb.load_seqs_from_fastx(filepath)?; 47 | let mut out_buf = BufWriter::new(File::create(args.output_path)?); 48 | sdb.frag_map 49 | .into_iter() 50 | .try_for_each(|(k, v)| -> Result<(), std::io::Error> { 51 | let c = v.len(); 52 | if c >= args.min_count { 53 | out_buf.write_fmt(format_args!("{:016x} {:016x} {}\n", k.0, k.1, c))?; 54 | }; 55 | Ok(()) 56 | 
})?; 57 | Ok(()) 58 | } 59 | -------------------------------------------------------------------------------- /pgr-bin/src/_bin/pgr-filter.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 2 | use clap::{self, CommandFactory, Parser}; 3 | use flate2::bufread::MultiGzDecoder; 4 | use pgr_db::fasta_io::{FastaReader, FastaStreamReader, FastqStreamReader, SeqRec}; 5 | use pgr_db::kmer_filter::MinimizerFilter; 6 | use rayon::prelude::*; 7 | use std::fs::File; 8 | use std::io::{self, BufReader, Read}; 9 | 10 | enum GZFastaReader { 11 | GZFile(FastaReader<BufReader<MultiGzDecoder<BufReader<File>>>>), 12 | RegularFile(FastaReader<BufReader<BufReader<File>>>), 13 | } 14 | 15 | #[derive(Parser, Debug)] 16 | #[clap(name = "pgr-filter")] 17 | #[clap(author, version)] 18 | #[clap(about = "using a minimizer filter for matching reads to a reference set of sequences", long_about = None)] 19 | struct CmdOptions { 20 | ref_fasta_path: String, 21 | #[clap(long, short)] 22 | query_fastx_path: Option<String>, 23 | /// k-mer size 24 | #[clap(long, short, default_value_t = 32)] 25 | k: usize, 26 | /// count threshold 27 | #[clap(long, short, default_value_t = 0.8)] 28 | threshold: f32, 29 | #[clap(long)] 30 | fasta_stdin: bool, 31 | } 32 | 33 | fn get_fastx_reader(filepath: String) -> Result<GZFastaReader, std::io::Error> { 34 | let file = File::open(&filepath)?; 35 | let mut reader = BufReader::new(file); 36 | let mut is_gzfile = false; 37 | { 38 | let r = reader.by_ref(); 39 | let mut buf = Vec::<u8>::new(); 40 | let _ = r.take(2).read_to_end(&mut buf); 41 | if buf == [0x1F_u8, 0x8B_u8] { 42 | log::info!("input file: {} detected as gz-compressed file", filepath); 43 | is_gzfile = true; 44 | } 45 | } 46 | drop(reader); 47 | 48 | let file = File::open(&filepath)?; 49 | let reader = BufReader::new(file); 50 | let gz_buf = BufReader::new(MultiGzDecoder::new(reader)); 51 | 52 | let file = File::open(&filepath)?; 53 | let reader = BufReader::new(file); 54 | let std_buf = BufReader::new(reader); 55 | 56 | if is_gzfile { 57 | drop(std_buf); 58 | Ok(GZFastaReader::GZFile( 59 | FastaReader::new(gz_buf, &filepath, 256, false).unwrap(), 60 | )) 61 | } else { 62 | drop(gz_buf); 63 | Ok(GZFastaReader::RegularFile( 64 | FastaReader::new(std_buf, &filepath, 256, false).unwrap(), 65 | )) 66 | } 67 | } 68 | 69 | fn main() -> Result<(), std::io::Error> { 70 | CmdOptions::command().version(VERSION_STRING).get_matches(); 71 | let args = CmdOptions::parse(); 72 | //let mut filter = KmerFilter::with_capacity(args.k, 1_usize << 24); 73 | let mut filter = MinimizerFilter::new(args.k); 74 | let mut add_seqs = |seq_iter: &mut dyn Iterator<Item = io::Result<SeqRec>>| { 75 | seq_iter.into_iter().for_each(|r| { 76 | if let Ok(r) = r { 77 | filter.add_seq_mmers(&r.seq); 78 | }; 79 | }); 80 | }; 81 | 82 | match get_fastx_reader(args.ref_fasta_path)? 
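// The reference set is streamed once to build the minimizer filter; both
// match arms below call the same `add_seqs` closure and differ only in
// whether the reader decompresses gzip input.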
{ 83 | GZFastaReader::GZFile(reader) => add_seqs(&mut reader.into_iter()), 84 | 85 | GZFastaReader::RegularFile(reader) => add_seqs(&mut reader.into_iter()), 86 | }; 87 | 88 | let check_seqs = |seq_iter: &mut dyn Iterator<Item = io::Result<SeqRec>>| { 89 | let mut seq_data = Vec::<SeqRec>::new(); 90 | for r in seq_iter { 91 | if let Ok(r) = r { 92 | seq_data.push(r); 93 | }; 94 | if seq_data.len() == 64 { 95 | seq_data 96 | .par_iter() 97 | .map(|r| { 98 | let (total, c) = filter.check_seq_mmers(&r.seq); 99 | (r.clone(), total, c) 100 | }) 101 | .collect::<Vec<(SeqRec, usize, usize)>>() 102 | .iter() 103 | .for_each(|(r, total, c)| { 104 | if *total > 0 { 105 | if (*c as f32) / (*total as f32) > args.threshold { 106 | println!(">{} {} {}", String::from_utf8_lossy(&r.id), total, c); 107 | println!("{}", String::from_utf8_lossy(&r.seq[..])); 108 | } 109 | } 110 | }); 111 | seq_data.clear(); 112 | } 113 | } 114 | 115 | seq_data 116 | .into_par_iter() 117 | .map(|r| { 118 | let (total, c) = filter.check_seq_mmers(&r.seq); 119 | (r, total, c) 120 | }) 121 | .collect::<Vec<(SeqRec, usize, usize)>>() 122 | .iter() 123 | .for_each(|(r, total, c)| { 124 | if *total > 0 { 125 | if (*c as f32) / (*total as f32) > args.threshold { 126 | println!(">{} {} {}", String::from_utf8_lossy(&r.id), total, c); 127 | println!("{}", String::from_utf8_lossy(&r.seq[..])); 128 | } 129 | } 130 | }); 131 | }; 132 | 133 | if args.query_fastx_path.is_some() { 134 | match get_fastx_reader(args.query_fastx_path.unwrap())? { 135 | GZFastaReader::GZFile(reader) => check_seqs(&mut reader.into_iter()), 136 | GZFastaReader::RegularFile(reader) => check_seqs(&mut reader.into_iter()), 137 | } 138 | } else { 139 | if args.fasta_stdin { 140 | let reader = FastaStreamReader::new(256); 141 | check_seqs(&mut reader.into_iter()); 142 | } else { 143 | let reader = FastqStreamReader::new(256); 144 | check_seqs(&mut reader.into_iter()); 145 | } 146 | } 147 | 148 | Ok(()) 149 | } 150 | -------------------------------------------------------------------------------- /pgr-bin/src/_bin/pgr-multifilter.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 2 | use clap::{self, CommandFactory, Parser}; 3 | use flate2::bufread::MultiGzDecoder; 4 | use pgr_db::kmer_filter::KmerFilter; 5 | use pgr_db::fasta_io::{reverse_complement, FastaReader, FastqStreamReader, SeqRec}; 6 | use rayon::prelude::*; 7 | use rustc_hash::FxHashMap; 8 | use std::fs::File; 9 | use std::io::{self, BufRead, BufReader, BufWriter, Error, ErrorKind, Read, Write}; 10 | 11 | 12 | enum GZFastaReader { 13 | GZFile(FastaReader<BufReader<MultiGzDecoder<BufReader<File>>>>), 14 | RegularFile(FastaReader<BufReader<BufReader<File>>>), 15 | } 16 | 17 | #[derive(Parser, Debug)] 18 | #[clap(name = "pgr-multi-filter")] 19 | #[clap(author, version)] 20 | #[clap(about = "using Cuckoo Filter for Matching Reads To A Reference Set of Sequences", long_about = None)] 21 | struct CmdOptions { 22 | ref_fasta_list: String, 23 | prefix: String, 24 | #[clap(long, short)] 25 | query_fastx_path: Option<String>, 26 | /// k-mer size 27 | #[clap(long, short, default_value_t = 32)] 28 | k: usize, 29 | /// count threshold 30 | #[clap(long, short, default_value_t = 4)] 31 | threshold: usize, 32 | } 33 | 34 | fn get_fastx_reader(filepath: String) -> Result<GZFastaReader, std::io::Error> { 35 | let file = File::open(&filepath)?; 36 | let mut reader = BufReader::new(file); 37 | let mut is_gzfile = false; 38 | { 39 | let r = reader.by_ref(); 40 | let mut buf = Vec::<u8>::new(); 41 | let _ = r.take(2).read_to_end(&mut buf); 42 | if buf == [0x1F_u8, 0x8B_u8] { 43 | log::info!("input file: {} detected as 
gz-compressed file", filepath); 44 | is_gzfile = true; 45 | } 46 | } 47 | drop(reader); 48 | 49 | let file = File::open(&filepath)?; 50 | let reader = BufReader::new(file); 51 | let gz_buf = BufReader::new(MultiGzDecoder::new(reader)); 52 | 53 | let file = File::open(&filepath)?; 54 | let reader = BufReader::new(file); 55 | let std_buf = BufReader::new(reader); 56 | 57 | if is_gzfile { 58 | drop(std_buf); 59 | Ok(GZFastaReader::GZFile( 60 | FastaReader::new(gz_buf, &filepath, 256, false).unwrap(), 61 | )) 62 | } else { 63 | drop(gz_buf); 64 | Ok(GZFastaReader::RegularFile( 65 | FastaReader::new(std_buf, &filepath, 256, false).unwrap(), 66 | )) 67 | } 68 | } 69 | 70 | fn main() -> Result<(), std::io::Error> { 71 | CmdOptions::command().version(VERSION_STRING).get_matches(); 72 | let args = CmdOptions::parse(); 73 | let mut filters = FxHashMap::::default(); 74 | 75 | let add_seqs = |filter: &mut KmerFilter, 76 | seq_iter: &mut dyn Iterator>| { 77 | seq_iter.into_iter().for_each(|r| { 78 | if let Ok(r) = r { 79 | filter.add_seq(&r.seq); 80 | let rc_seq = reverse_complement(&r.seq); 81 | filter.add_seq(&rc_seq); 82 | }; 83 | }); 84 | }; 85 | 86 | let inputs = BufReader::new(File::open(args.ref_fasta_list)?); 87 | inputs 88 | .lines() 89 | .into_iter() 90 | .try_for_each(|line| -> Result<(), std::io::Error> { 91 | match line { 92 | Ok(line) => { 93 | let fields = line.split("\t").into_iter().collect::>(); 94 | if fields.len() != 2 { 95 | return Err(Error::new(ErrorKind::Other, "can't read the input file")); 96 | } 97 | let fileanme = fields[0]; 98 | let suffix = fields[1]; 99 | let mut filter = KmerFilter::with_capacity(args.k, 1_usize << 24); 100 | match get_fastx_reader(fileanme.to_string())? { 101 | GZFastaReader::GZFile(reader) => { 102 | add_seqs(&mut filter, &mut reader.into_iter()) 103 | } 104 | 105 | GZFastaReader::RegularFile(reader) => { 106 | add_seqs(&mut filter, &mut reader.into_iter()) 107 | } 108 | }; 109 | filters.insert(suffix.to_string(), filter); 110 | 111 | Ok(()) 112 | } 113 | Err(e) => Err(e), 114 | } 115 | })?; 116 | 117 | let check_seqs = |seq_iter: &mut dyn Iterator>| { 118 | let mut seq_data = Vec::::new(); 119 | for r in seq_iter { 120 | if let Ok(r) = r { 121 | seq_data.push(r); 122 | } 123 | } 124 | 125 | filters.iter().for_each(|(suffix, filter)| { 126 | let mut writer = BufWriter::new( 127 | File::create(args.prefix.clone() + "_" + &suffix.clone()[..] + ".fa") 128 | .expect("file creating error"), 129 | ); 130 | 131 | (&seq_data) 132 | .into_par_iter() 133 | .filter(|&r| { 134 | let c = filter.check_seq(&r.seq); 135 | c >= args.threshold 136 | }) 137 | .collect::>() 138 | .iter() 139 | .for_each(|r| { 140 | write!(writer, ">{}\n", String::from_utf8_lossy(&r.id)).expect("writing error"); 141 | write!(writer, "{}\n", String::from_utf8_lossy(&r.seq[..])).expect("writing error"); 142 | }); 143 | }); 144 | 145 | }; 146 | 147 | if args.query_fastx_path.is_some() { 148 | match get_fastx_reader(args.query_fastx_path.unwrap())? 
{ 149 | GZFastaReader::GZFile(reader) => check_seqs(&mut reader.into_iter()), 150 | GZFastaReader::RegularFile(reader) => check_seqs(&mut reader.into_iter()), 151 | } 152 | } else { 153 | let reader = FastqStreamReader::new(128); 154 | check_seqs(&mut reader.into_iter()); 155 | } 156 | 157 | Ok(()) 158 | } 159 | -------------------------------------------------------------------------------- /pgr-bin/src/_bin/pgr-probe-match.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 2 | use clap::{self, CommandFactory, Parser}; 3 | use flate2::bufread::MultiGzDecoder; 4 | use pgr_db::fasta_io::{reverse_complement, FastaReader, FastqStreamReader, SeqRec}; 5 | use rayon::prelude::*; 6 | use rustc_hash::FxHashMap; 7 | use std::fs::File; 8 | use std::io::{self, BufRead, BufReader, Read}; 9 | 10 | #[derive(Parser, Debug)] 11 | #[clap(name = "pgr-probe-match")] 12 | #[clap(author, version)] 13 | #[clap(about = "matching reads against a set of variant/tag probe sequences", long_about = None)] 14 | struct CmdOptions { 15 | probe_file_path: String, 16 | #[clap(long, short)] 17 | query_fastx_path: Option<String>, 18 | } 19 | enum GZFastaReader { 20 | GZFile(FastaReader<BufReader<MultiGzDecoder<BufReader<File>>>>), 21 | RegularFile(FastaReader<BufReader<BufReader<File>>>), 22 | } 23 | 24 | #[derive(Clone)] 25 | struct ProbeInfo { 26 | vname: String, 27 | vprobe: Vec<u8>, 28 | vprobe_r: Vec<u8>, 29 | t1name: String, 30 | t1probe: Vec<u8>, 31 | t1probe_r: Vec<u8>, 32 | t2name: String, 33 | t2probe: Vec<u8>, 34 | t2probe_r: Vec<u8>, 35 | } 36 | 37 | fn get_fastx_reader(filepath: String) -> Result<GZFastaReader, std::io::Error> { 38 | let file = File::open(&filepath)?; 39 | let mut reader = BufReader::new(file); 40 | let mut is_gzfile = false; 41 | { 42 | let r = reader.by_ref(); 43 | let mut buf = Vec::<u8>::new(); 44 | let _ = r.take(2).read_to_end(&mut buf); 45 | if buf == [0x1F_u8, 0x8B_u8] { 46 | log::info!("input file: {} detected as gz-compressed file", filepath); 47 | is_gzfile = true; 48 | } 49 | } 50 | drop(reader); 51 | 52 | let file = File::open(&filepath)?; 53 | let reader = BufReader::new(file); 54 | let gz_buf = BufReader::new(MultiGzDecoder::new(reader)); 55 | 56 | let file = File::open(&filepath)?; 57 | let reader = BufReader::new(file); 58 | let std_buf = BufReader::new(reader); 59 | 60 | if is_gzfile { 61 | drop(std_buf); 62 | Ok(GZFastaReader::GZFile( 63 | FastaReader::new(gz_buf, &filepath, 256, false).unwrap(), 64 | )) 65 | } else { 66 | drop(gz_buf); 67 | Ok(GZFastaReader::RegularFile( 68 | FastaReader::new(std_buf, &filepath, 256, false).unwrap(), 69 | )) 70 | } 71 | } 72 | 73 | fn main() -> Result<(), std::io::Error> { 74 | CmdOptions::command().version(VERSION_STRING).get_matches(); 75 | let args = CmdOptions::parse(); 76 | let probe_reader = BufReader::new(File::open(args.probe_file_path)?); 77 | let mut all_probes = FxHashMap::<String, ProbeInfo>::default(); 78 | probe_reader 79 | .lines() 80 | .into_iter() 81 | .for_each(|line| match line { 82 | Ok(line) => { 83 | let line = line.trim_end(); 84 | let mut fields = line.split("\t"); 85 | let vname = fields.next().expect("error parsing").to_string(); 86 | let tmp = fields.next().expect("error parsing"); 87 | let vprobe = tmp.as_bytes().to_vec(); 88 | let vprobe_r = reverse_complement(&vprobe); 89 | let t1name = fields.next().expect("error parsing").to_string(); 90 | let tmp = fields.next().expect("error parsing"); 91 | let t1probe = tmp.as_bytes().to_vec(); 92 | let t1probe_r = reverse_complement(&t1probe); 93 | let t2name = fields.next().expect("error parsing").to_string(); 94
| let tmp = fields.next().expect("error parsing"); 95 | let t2probe = tmp.as_bytes().to_vec(); 96 | let t2probe_r = reverse_complement(&t2probe); 97 | let probeset = ProbeInfo { 98 | vname: vname.clone(), 99 | vprobe, 100 | vprobe_r, 101 | t1name, 102 | t1probe, 103 | t1probe_r, 104 | t2name, 105 | t2probe, 106 | t2probe_r, 107 | }; 108 | all_probes.insert(vname, probeset); 109 | } 110 | _ => {} 111 | }); 112 | 113 | let match_probe = |seq: &Vec<u8>, probe: &Vec<u8>| -> bool { 114 | let plen = probe.len(); 115 | let mut flag = false; 116 | for i in 0..seq.len() - plen { 117 | if seq[i..i + plen] == probe[..] { 118 | flag = true; 119 | break; 120 | } 121 | } 122 | flag 123 | }; 124 | 125 | let check_seqs = |seq_iter: &mut dyn Iterator<Item = io::Result<SeqRec>>| { 126 | let mut seq_data = Vec::<SeqRec>::new(); 127 | for r in seq_iter { 128 | if let Ok(r) = r { 129 | seq_data.push(r); 130 | } 131 | } 132 | 133 | all_probes.into_iter().for_each(|(_vname, probe_info)| { 134 | let mut count = (0_usize, 0_usize, 0_usize); 135 | (&seq_data) 136 | .into_par_iter() 137 | .filter(|&r| { 138 | match_probe(&r.seq, &probe_info.vprobe) 139 | || match_probe(&r.seq, &probe_info.vprobe_r) 140 | }) 141 | .collect::<Vec<&SeqRec>>() 142 | .into_iter() 143 | .for_each(|r| { 144 | count.0 += 1; 145 | if match_probe(&r.seq, &probe_info.t1probe) 146 | || match_probe(&r.seq, &probe_info.t1probe_r) 147 | { 148 | count.1 += 1; 149 | } 150 | if match_probe(&r.seq, &probe_info.t2probe) 151 | || match_probe(&r.seq, &probe_info.t2probe_r) 152 | { 153 | count.2 += 1; 154 | } 155 | }); 156 | println!( 157 | "{} {} {} {} {} {}", 158 | probe_info.vname, count.0, probe_info.t1name, count.1, probe_info.t2name, count.2 159 | ); 160 | }); 161 | }; 162 | 163 | if args.query_fastx_path.is_some() { 164 | match get_fastx_reader(args.query_fastx_path.unwrap())? 
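// Tally, for each probe set, how many query reads contain the variant probe
// (on either strand), and of those, how many also carry tag probe 1 or tag
// probe 2; the counts are printed one line per probe set.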
{ 165 | GZFastaReader::GZFile(reader) => check_seqs(&mut reader.into_iter()), 166 | GZFastaReader::RegularFile(reader) => check_seqs(&mut reader.into_iter()), 167 | } 168 | } else { 169 | let reader = FastqStreamReader::new(128); 170 | check_seqs(&mut reader.into_iter()); 171 | } 172 | 173 | Ok(()) 174 | } 175 | -------------------------------------------------------------------------------- /pgr-bin/src/_bin/pgr-shmmr-pair-count.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 2 | 3 | //use std::path::PathBuf; 4 | use clap::{self, CommandFactory, Parser}; 5 | 6 | use rayon::prelude::*; 7 | use rustc_hash::FxHashMap; 8 | use std::collections::HashMap; 9 | use std::collections::HashSet; 10 | use std::fs::File; 11 | use std::io::{BufRead, BufReader, BufWriter, Write}; 12 | 13 | use pgr_db::seq_db; 14 | 15 | #[derive(Parser, Debug)] 16 | #[clap(name = "pgr-shmmr-pair-count")] 17 | #[clap(author, version)] 18 | #[clap(about = "count shimmer pairs in a shimmer database", long_about = None)] 19 | struct CmdOptions { 20 | prefix: String, 21 | output_path: String, 22 | //max_unique_count 23 | #[clap(long, short, default_value_t = 1)] 24 | max_unique_count: usize, 25 | } 26 | 27 | fn main() -> Result<(), std::io::Error> { 28 | CmdOptions::command().version(VERSION_STRING).get_matches(); 29 | let args = CmdOptions::parse(); 30 | let (_shmmr_spec, shmmr_pair_to_frags) = 31 | seq_db::read_mdb_file(args.prefix.clone() + ".mdb").unwrap(); 32 | let mut seq_index = HashMap::<(String, Option<String>), (u32, u32)>::new(); 33 | let mut seq_info = HashMap::<u32, (String, Option<String>, u32)>::new(); 34 | let midx_file = BufReader::new(File::open(args.prefix.clone() + ".midx")?); 35 | 36 | let mut sources = HashSet::<String>::new(); 37 | midx_file 38 | .lines() 39 | .into_iter() 40 | .try_for_each(|line| -> Result<(), std::io::Error> { 41 | let line = line.unwrap(); 42 | let mut line = line.as_str().split("\t"); 43 | let sid = line.next().unwrap().parse::<u32>().unwrap(); 44 | let len = line.next().unwrap().parse::<u32>().unwrap(); 45 | let ctg_name = line.next().unwrap().to_string(); 46 | let source = line.next().unwrap().to_string(); 47 | sources.insert(source.clone()); 48 | seq_index.insert((ctg_name.clone(), Some(source.clone())), (sid, len)); 49 | seq_info.insert(sid, (ctg_name, Some(source), len)); 50 | Ok(()) 51 | })?; 52 | 53 | let source_to_id = sources 54 | .iter() 55 | .enumerate() 56 | .map(|v| (v.1.clone(), v.0 as u32)) 57 | .collect::<HashMap<String, u32>>(); 58 | 59 | let mut sid_to_source_id_lookup = vec![0_u32; seq_info.len()]; 60 | 61 | seq_info.iter().for_each(|(k, v)| { 62 | sid_to_source_id_lookup[*k as usize] = *source_to_id.get(v.1.as_ref().unwrap()).unwrap(); 63 | }); 64 | let mut out_file = BufWriter::new(File::create(args.output_path)?); 65 | let out_vec = shmmr_pair_to_frags 66 | .par_iter() 67 | .map(|(k, v)| { 68 | let mut count = FxHashMap::<u32, usize>::default(); 69 | v.iter().for_each(|v| { 70 | let sid = (*v).1; 71 | let source_id = *sid_to_source_id_lookup.get(sid as usize).unwrap(); 72 | *count.entry(source_id).or_insert(0) += 1; 73 | }); 74 | let v = count 75 | .into_iter() 76 | .filter(|(_k, v)| { 77 | let muc = args.max_unique_count; 78 | if *v > muc { 79 | false 80 | } else { 81 | true 82 | } 83 | }) 84 | .count(); 85 | (k.0, k.1, v) 86 | }) 87 | .collect::<Vec<(u64, u64, usize)>>(); 88 | 89 | out_vec 90 | .iter() 91 | .try_for_each(|v| -> Result<(), std::io::Error> { 92 | writeln!(&mut out_file, "{} {} {}", v.0, v.1, v.2)?; 93 | Ok(()) 94 | })?; 95 | 96 | Ok(()) 97 | } 98 | 
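// A hypothetical invocation (paths are illustrative): given `sample.mdb` and
// `sample.midx` built with the same prefix, `pgr-shmmr-pair-count sample
// counts.txt --max-unique-count 2` writes one `shmmr0 shmmr1 source_count`
// line per shimmer pair, counting only sources with at most 2 hits each.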
-------------------------------------------------------------------------------- /pgr-bin/src/_bin/pgr-test.rs: -------------------------------------------------------------------------------- 1 | use flate2::bufread::MultiGzDecoder; 2 | use pgr_db::agc_io::AGCFile; 3 | use pgr_db::fasta_io::FastaReader; 4 | use std::collections::HashMap; 5 | use std::fs::File; 6 | use std::io::{BufRead, BufReader, Read}; 7 | 8 | use pgr_db::seq_db::{self, query_fragment, read_mdb_file}; 9 | 10 | pub fn load_seqs() -> HashMap> { 11 | let mut seqs = HashMap::>::new(); 12 | //let filepath = "test/test_data/test_seqs.fa"; 13 | let filepath = "/wd/peregrine-r-ext/phasing_test/PanMHCgraph/HPRCy1.MHC.fa"; 14 | let file = File::open(filepath.to_string()).unwrap(); 15 | let mut reader = BufReader::new(file); 16 | let mut is_gzfile = false; 17 | { 18 | let r = reader.by_ref(); 19 | let mut buf = Vec::::new(); 20 | let _ = r.take(2).read_to_end(&mut buf); 21 | if buf == [0x1F_u8, 0x8B_u8] { 22 | log::info!("input file detected as gz-compressed file",); 23 | is_gzfile = true; 24 | } 25 | } 26 | drop(reader); 27 | 28 | let file = File::open(&filepath).unwrap(); 29 | let mut reader = BufReader::new(file); 30 | let gz_buf = &mut BufReader::new(MultiGzDecoder::new(&mut reader)); 31 | 32 | let file = File::open(&filepath).unwrap(); 33 | let reader = BufReader::new(file); 34 | let std_buf = &mut BufReader::new(reader); 35 | 36 | let fastx_buf: &mut dyn BufRead = if is_gzfile { 37 | drop(std_buf); 38 | gz_buf 39 | } else { 40 | drop(gz_buf); 41 | std_buf 42 | }; 43 | 44 | let mut fastx_reader = 45 | FastaReader::new(fastx_buf, &filepath.to_string(), 1 << 14, true).unwrap(); 46 | while let Some(rec) = fastx_reader.next_rec() { 47 | let rec = rec.unwrap(); 48 | let seqname = String::from_utf8_lossy(&rec.id).into_owned(); 49 | seqs.insert(seqname, rec.seq.clone()); 50 | } 51 | seqs 52 | } 53 | 54 | fn _load_seq_test() { 55 | let seqs = load_seqs(); 56 | let mut sdb = seq_db::CompactSeqDB::new(seq_db::SHMMRSPEC); 57 | let _shmmr_spec = &pgr_db::seq_db::SHMMRSPEC; 58 | let _ = sdb.load_seqs_from_fastx( 59 | "/wd/peregrine-r-ext/phasing_test/PanMHCgraph/HPRCy1.MHC.fa".to_string(), 60 | ); 61 | //println!("test"); 62 | for seq in sdb.seqs.iter() { 63 | println!("S {} {} {}", seq.name, seq.id, seq.len); 64 | //println!(); 65 | //println!("{}", seq.name); 66 | let reconstruct_seq = sdb.get_seq(&seq); 67 | let orig_seq = seqs.get(&seq.name).unwrap(); 68 | if reconstruct_seq != *orig_seq { 69 | //println!("{}", seq.name); 70 | //println!("{:?}", reconstruct_seq); 71 | //println!("{:?}", orig_seq); 72 | for i in 0..reconstruct_seq.len() { 73 | if orig_seq[i] != reconstruct_seq[i] { 74 | println!("{} {} {} X", i, orig_seq[i], reconstruct_seq[i]); 75 | } else { 76 | println!("{} {} {} ", i, orig_seq[i], reconstruct_seq[i]); 77 | } 78 | } 79 | } else { 80 | println!("{} matched", seq.name); 81 | }; 82 | assert_eq!(reconstruct_seq, *orig_seq); 83 | } 84 | for (shmmr_pair, frg_ids) in sdb.frag_map.into_iter() { 85 | for ids in frg_ids { 86 | println!( 87 | "M {:016X} {:016X} {} {} {} {} {}", 88 | shmmr_pair.0, shmmr_pair.1, ids.0, ids.1, ids.2, ids.3, ids.4 89 | ); 90 | } 91 | } 92 | } 93 | 94 | fn _load_index_from_fastx() -> Result<(), std::io::Error> { 95 | let mut sdb = seq_db::CompactSeqDB::new(seq_db::SHMMRSPEC); 96 | let filelist = File::open("./filelist").unwrap(); 97 | 98 | BufReader::new(filelist).lines().into_iter().for_each(|fp| { 99 | let fp = fp.unwrap(); 100 | let _ = sdb.load_index_from_fastx(fp); 101 | }); 102 | 103 | 
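// Persist the shimmer map index to disk, then dump the sequence (S) and
// shimmer-pair match (M) records so the index contents can be inspected.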
seq_db::write_shmr_map_file(&sdb.shmmr_spec, &sdb.frag_map, "test.db".to_string())?; 104 | 105 | for seq in sdb.seqs.iter() { 106 | println!("S {} {} {}", seq.name, seq.id, seq.len); 107 | } 108 | for (shmmr_pair, frg_ids) in sdb.frag_map.into_iter() { 109 | for ids in frg_ids { 110 | println!( 111 | "M {:016X} {:016X} {} {} {} {} {}", 112 | shmmr_pair.0, shmmr_pair.1, ids.0, ids.1, ids.2, ids.3, ids.4 113 | ); 114 | } 115 | } 116 | Ok(()) 117 | } 118 | 119 | fn load_index_from_agcfile() -> Result<(), std::io::Error> { 120 | let mut sdb = seq_db::CompactSeqDB::new(seq_db::SHMMRSPEC); 121 | let filelist = File::open("./filelist").unwrap(); 122 | 123 | BufReader::new(filelist).lines().into_iter().try_for_each( 124 | |fp| -> Result<(), std::io::Error> { 125 | let fp = fp.unwrap(); 126 | let agcfile = AGCFile::new(fp)?; 127 | let _ = sdb.load_index_from_agcfile(agcfile); 128 | Ok(()) 129 | }, 130 | )?; 131 | 132 | //seq_db::write_shmr_map_file(&sdb.frag_map, "test.db".to_string()); 133 | sdb.write_shmr_map_index("test".to_string())?; 134 | Ok(()) 135 | } 136 | 137 | fn _load_index_mdb() -> Result<(), std::io::Error> { 138 | let agcfile = AGCFile::new(String::from("grch38.agc"))?; 139 | for sample in agcfile.samples.iter() { 140 | for contig in sample.contigs.iter() { 141 | let (_n, _t) = contig; 142 | //println!("{}:{}:{}", sample.name, n, t); 143 | } 144 | } 145 | let seq_mhc = agcfile.get_sub_seq("GCA_000001405.15_GRCh38_no_alt_analysis_set".to_string(), 146 | "chr6 AC:CM000668.2 gi:568336018 LN:170805979 rl:Chromosome M5:5691468a67c7e7a7b5f2a3a683792c29 AS:GRCh38".to_string(), 147 | 28510120, 33480577); 148 | // println!("MHC seq len: {}", MHCseq.len()); 149 | let (_shmmr_spec, new_map) = read_mdb_file("test.db".to_string()).unwrap(); 150 | let shmmr_spec = &pgr_db::seq_db::SHMMRSPEC; 151 | let r_frags = query_fragment(&new_map, &seq_mhc, shmmr_spec); 152 | let mut out = vec![]; 153 | for res in r_frags { 154 | for v in res.2 { 155 | //println!("Q {:?} {:?} {:?}", res.0, res.1, v); 156 | out.push((v, res.1, res.0)) 157 | } 158 | } 159 | out.sort(); 160 | for (v0, v1, _) in out { 161 | println!( 162 | "Q {} {} {} {} {} {} {} {}", 163 | v0.0, v0.1, v0.2, v0.3, v0.4, v1.0, v1.1, v1.2 164 | ); 165 | } 166 | Ok(()) 167 | } 168 | 169 | fn main() -> Result<(), std::io::Error> { 170 | //load_seq_test(); 171 | //load_index_from_fastx(); 172 | load_index_from_agcfile()?; 173 | //load_index_mdb(); 174 | Ok(()) 175 | } 176 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-annotate-bed-file.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &str = env!("VERSION_STRING"); 2 | use clap::{self, CommandFactory, Parser}; 3 | use flate2::bufread::MultiGzDecoder; 4 | use iset::IntervalMap; 5 | //use rayon::prelude::*; 6 | use rustc_hash::{FxHashMap, FxHashSet}; 7 | use std::fs::File; 8 | use std::io::{BufRead, BufReader, BufWriter, Write}; 9 | use std::path::Path; 10 | 11 | /// Align long contigs and identify potential SV regions with respect to the reference fasta file 12 | #[derive(Parser, Debug)] 13 | #[clap(name = "pgr-annotate-bed-file")] 14 | #[clap(author, version)] 15 | #[clap(about, long_about = None)] 16 | struct CmdOptions { 17 | /// path to the the a bed file 18 | bed_path: String, 19 | /// path to the annotation file (gzipped) 20 | annotation_path: String, 21 | /// the prefix of the output files 22 | output_path: String, 23 | /// type 24 | #[clap(long, default_value = "transcript")] 25 
| feature: String, 26 | /// number of threads used in parallel (more memory usage), default to "0" using all CPUs available or the number set by RAYON_NUM_THREADS 27 | #[clap(long, default_value_t = 0)] 28 | number_of_thread: usize, 29 | } 30 | fn main() -> Result<(), std::io::Error> { 31 | CmdOptions::command().version(VERSION_STRING).get_matches(); 32 | let args = CmdOptions::parse(); 33 | 34 | rayon::ThreadPoolBuilder::new() 35 | .num_threads(args.number_of_thread) 36 | .build_global() 37 | .unwrap(); 38 | 39 | let mut reader = BufReader::new(File::open(Path::new(&args.annotation_path)).unwrap()); 40 | 41 | let annotation_reader = BufReader::new(MultiGzDecoder::new(&mut reader)); 42 | let mut annotation_interval = FxHashMap::>::default(); 43 | // we support https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.ncbiRefSeq.gtf.gz for now 44 | annotation_reader.lines().for_each(|line| { 45 | if let Ok(line) = line { 46 | let err_msg = format!("faile to parse on {}", line); 47 | let fields = line.split('\t').collect::>(); 48 | let chr = fields[0].to_string(); 49 | let f_type = fields[2].to_string(); 50 | let fs = fields[3].parse::().expect(&err_msg); 51 | let fe = fields[4].parse::().expect(&err_msg) + 1; 52 | let strand = fields[6].chars().next().expect(&err_msg); 53 | let attribute = fields[8].to_string(); 54 | if f_type == args.feature { 55 | let e = annotation_interval 56 | .entry(chr) 57 | .or_insert(IntervalMap::::default()); 58 | e.insert(fs..fe, (strand, attribute)); 59 | } 60 | } 61 | }); 62 | 63 | let mut out_bed = BufWriter::new(File::create(Path::new(&args.output_path)).unwrap()); 64 | 65 | let bed_reader = BufReader::new(File::open(Path::new(&args.bed_path)).unwrap()); 66 | bed_reader.lines().for_each(|line| { 67 | if let Ok(line) = line { 68 | if line.starts_with('#') { 69 | return; 70 | }; 71 | let err_msg = format!("faile to parse on {}", line); 72 | let fields = line.split('\t').collect::>(); 73 | let chr = fields[0].to_string(); 74 | let bgn = fields[1].parse::().expect(&err_msg); 75 | let end = fields[2].parse::().expect(&err_msg); 76 | let annotation = fields[3].to_string(); 77 | if let Some(i_map) = annotation_interval.get(&chr) { 78 | // TODO, we only pick the first overlap for now 79 | let mut annotations = FxHashSet::::default(); 80 | for (_strand, attributes) in i_map.values(bgn..end) { 81 | // TODO: need a proper parser 82 | let attributes = attributes.trim_end_matches(';').to_string(); 83 | let a_fields = attributes.split(';').collect::>(); 84 | let gn = a_fields.last().unwrap().to_string(); 85 | let gn = gn.split(' ').collect::>(); 86 | let gn = gn.last().unwrap().to_string(); 87 | let gn = gn.trim_matches('"'); 88 | annotations.insert(gn.to_string()); 89 | }; 90 | if annotations.is_empty() { return }; 91 | let gn = annotations.into_iter().collect::>().join("/"); 92 | 93 | 94 | writeln!( 95 | out_bed, 96 | "{}\t{}\t{}\t{}>{}", 97 | chr, bgn, end, annotation, gn 98 | ) 99 | .expect("fail to write the vcf file"); 100 | }; 101 | } 102 | }); 103 | 104 | Ok(()) 105 | } 106 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-annotate-vcf-file.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &str = env!("VERSION_STRING"); 2 | use clap::{self, CommandFactory, Parser}; 3 | use flate2::bufread::MultiGzDecoder; 4 | use iset::IntervalMap; 5 | //use rayon::prelude::*; 6 | use rustc_hash::{FxHashMap, FxHashSet}; 7 | use std::fs::File; 8 | use 
std::io::{BufRead, BufReader, BufWriter, Write}; 9 | use std::path::Path; 10 | 11 | /// Align long contigs and identify potential SV regions with respect to the reference fasta file 12 | #[derive(Parser, Debug)] 13 | #[clap(name = "pgr-annotate-vcf-file")] 14 | #[clap(author, version)] 15 | #[clap(about, long_about = None)] 16 | struct CmdOptions { 17 | /// path to the the a vcf file 18 | vcf_path: String, 19 | /// path to the annotation file (gzipped) 20 | annotation_path: String, 21 | /// the prefix of the output files 22 | output_path: String, 23 | /// number of threads used in parallel (more memory usage), default to "0" using all CPUs available or the number set by RAYON_NUM_THREADS 24 | #[clap(long, default_value_t = 0)] 25 | number_of_thread: usize, 26 | } 27 | fn main() -> Result<(), std::io::Error> { 28 | CmdOptions::command().version(VERSION_STRING).get_matches(); 29 | let args = CmdOptions::parse(); 30 | 31 | rayon::ThreadPoolBuilder::new() 32 | .num_threads(args.number_of_thread) 33 | .build_global() 34 | .unwrap(); 35 | 36 | let mut reader = BufReader::new(File::open(Path::new(&args.annotation_path)).unwrap()); 37 | 38 | let annotation_reader = BufReader::new(MultiGzDecoder::new(&mut reader)); 39 | let mut annotation_interval = FxHashMap::>::default(); 40 | // we support https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.ncbiRefSeq.gtf.gz for now 41 | annotation_reader.lines().for_each(|line| { 42 | if let Ok(line) = line { 43 | let err_msg = format!("faile to parse on {}", line); 44 | let fields = line.split('\t').collect::>(); 45 | let chr = fields[0].to_string(); 46 | let f_type = fields[2].to_string(); 47 | let fs = fields[3].parse::().expect(&err_msg); 48 | let fe = fields[4].parse::().expect(&err_msg); 49 | let strand = fields[6].chars().next().expect(&err_msg); 50 | let attribute = fields[8].to_string(); 51 | if f_type == "transcript" { 52 | let e = annotation_interval 53 | .entry(chr) 54 | .or_insert(IntervalMap::::default()); 55 | e.insert(fs..fe, (strand, attribute)); 56 | } 57 | } 58 | }); 59 | 60 | let mut out_vcf = BufWriter::new(File::create(Path::new(&args.output_path)).unwrap()); 61 | writeln!(out_vcf, "##fileformat=VCFv4.2").expect("fail to write the vcf file"); 62 | writeln!( 63 | out_vcf, 64 | r#"##INFO="# 65 | ) 66 | .expect("fail to write the vcf"); 67 | writeln!( 68 | out_vcf, 69 | r#"##FORMAT="# 70 | ) 71 | .expect("fail to write the vcf file"); 72 | writeln!( 73 | out_vcf, 74 | "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE" 75 | ) 76 | .expect("fail to write the vcf file"); 77 | 78 | let vcf_reader = BufReader::new(File::open(Path::new(&args.vcf_path)).unwrap()); 79 | vcf_reader.lines().for_each(|line| { 80 | if let Ok(line) = line { 81 | if line.starts_with('#') { 82 | return; 83 | }; 84 | let err_msg = format!("faile to parse on {}", line); 85 | let fields = line.split('\t').collect::>(); 86 | let chr = fields[0].to_string(); 87 | let pos = fields[1].parse::().expect(&err_msg); 88 | if let Some(i_map) = annotation_interval.get(&chr) { 89 | // TODO, we only pick the first overlap for now 90 | let mut annotations = FxHashSet::::default(); 91 | for (_rng, (_strand, attributes)) in i_map.overlap(pos) { 92 | // TODO: need a proper parser 93 | let attributes = attributes.trim_end_matches(';').to_string(); 94 | let a_fields = attributes.split(';').collect::>(); 95 | let gn = a_fields.last().unwrap().to_string(); 96 | let gn = gn.split(' ').collect::>(); 97 | let gn = gn.last().unwrap().to_string(); 98 | let gn = 
gn.trim_matches('"'); 99 | annotations.insert(gn.to_string()); 100 | }; 101 | if annotations.is_empty() { return }; 102 | let gn = annotations.into_iter().collect::>().join("/"); 103 | 104 | let tvs = fields[3]; 105 | let qvs = fields[4]; 106 | let gt = fields[9]; 107 | writeln!( 108 | out_vcf, 109 | "{}\t{}\t.\t{}\t{}\t60\tPASS\tGN={}\tGT\t{}", 110 | chr, pos, tvs, qvs, gn, gt, 111 | ) 112 | .expect("fail to write the vcf file"); 113 | }; 114 | } 115 | }); 116 | 117 | Ok(()) 118 | } 119 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-fetch-seqs.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &str = env!("VERSION_STRING"); 2 | use clap::{self, CommandFactory, Parser}; 3 | use pgr_db::ext::SeqIndexDB; 4 | use pgr_db::fasta_io; 5 | use std::fs::File; 6 | use std::io::{self, BufRead, BufReader, BufWriter, Write}; 7 | use std::path::Path; 8 | 9 | /// List or fetch sequences from a PGR-TK database 10 | #[derive(Parser, Debug)] 11 | #[clap(name = "pgr-fetch-seqs")] 12 | #[clap(author, version)] 13 | #[clap(about, long_about = None)] 14 | struct CmdOptions { 15 | /// the prefix to a PGR-TK sequence database 16 | pgr_db_prefix: String, 17 | 18 | /// using the frg format for the sequence database (default to the AGC backend database if not specified) 19 | #[clap(long, default_value_t = false)] 20 | frg_file: bool, 21 | 22 | /// the regions file path 23 | #[clap(short, long, default_value=None)] 24 | region_file: Option, 25 | 26 | /// output file name 27 | #[clap(short, long, default_value=None)] 28 | output_file: Option, 29 | 30 | /// list all sequence source, contig names in the database 31 | #[clap(long, default_value_t = false)] 32 | list: bool, 33 | } 34 | 35 | fn main() -> Result<(), std::io::Error> { 36 | CmdOptions::command().version(VERSION_STRING).get_matches(); 37 | let args = CmdOptions::parse(); 38 | 39 | let mut seq_index_db = SeqIndexDB::new(); 40 | 41 | #[cfg(feature = "with_agc")] 42 | if args.frg_file { 43 | let _ = seq_index_db.load_from_frg_index(args.pgr_db_prefix); 44 | } else { 45 | let _ = seq_index_db.load_from_agc_index(args.pgr_db_prefix); 46 | } 47 | #[cfg(not(feature = "with_agc"))] 48 | if args.frg_file { 49 | let _ = seq_index_db.load_from_frg_index(args.pgr_db_prefix); 50 | } else { 51 | panic!("This command is compiled with only frg file support, please specify `--frg-file"); 52 | } 53 | 54 | if args.list { 55 | let mut out = if args.output_file.is_some() { 56 | let f = File::open(args.output_file.unwrap()).expect("can't open the ouptfile"); 57 | Box::new(f) as Box 58 | } else { 59 | Box::new(io::stdout()) 60 | }; 61 | seq_index_db 62 | .seq_info 63 | .unwrap() 64 | .into_iter() 65 | .for_each(|(sid, (ctg, src, length))| { 66 | writeln!( 67 | out, 68 | "{}\t{}\t{}\t{}", 69 | sid, 70 | src.unwrap_or_else(|| "None".to_string()), 71 | ctg, 72 | length 73 | ) 74 | .expect("can't write output file") 75 | }); 76 | return Ok(()); 77 | } 78 | 79 | let region_file = args.region_file.expect("region file not specified"); 80 | let region_file = 81 | BufReader::new(File::open(Path::new(®ion_file)).expect("can't open the region file")); 82 | 83 | let mut out = if args.output_file.is_some() { 84 | let f = BufWriter::new( 85 | File::create(args.output_file.unwrap()).expect("can't open the ouptfile"), 86 | ); 87 | Box::new(f) as Box 88 | } else { 89 | Box::new(io::stdout()) 90 | }; 91 | 92 | region_file.lines().for_each(|line| { 93 | let line = line.expect("fail to 
get a line in the region file"); 94 | let fields = line.split('\t').collect::>(); 95 | let label = fields[0].to_string(); 96 | let src = fields[1].to_string(); 97 | let ctg = fields[2].to_string(); 98 | let bgn: usize = fields[3].parse().expect("can't parse bgn"); 99 | let end: usize = fields[4].parse().expect("can't parse end"); 100 | let reversed: bool = fields[5].parse::().expect("can't parse strand") == 1; 101 | let mut seq = seq_index_db 102 | .get_sub_seq(src, ctg, bgn, end) 103 | .expect("fail to fetch sequence"); 104 | if reversed { 105 | seq = fasta_io::reverse_complement(&seq); 106 | } 107 | 108 | writeln!(out, ">{}", label).expect("fail to write the sequences"); 109 | writeln!(out, "{}", String::from_utf8_lossy(&seq[..])) 110 | .expect("fail to write the sequences"); 111 | }); 112 | 113 | Ok(()) 114 | } 115 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-make-frgdb.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &str = env!("VERSION_STRING"); 2 | 3 | //use std::path::PathBuf; 4 | use clap::{self, CommandFactory, Parser}; 5 | 6 | use pgr_db::ext::SeqIndexDB; 7 | use std::fs::File; 8 | use std::io::{BufRead, BufReader}; 9 | use std::path::Path; 10 | 11 | /// Create PGR-TK fragment minimizer database with frg format backend 12 | #[derive(Parser, Debug)] 13 | #[clap(name = "pgr-make-frgdb")] 14 | #[clap(author, version)] 15 | #[clap(about, long_about = None)] 16 | struct CmdOptions { 17 | /// the path to the file contains the paths to the fastx files to load 18 | filepath: String, 19 | prefix: String, 20 | /// minimizer window size 21 | #[clap(long, short, default_value_t = 80)] 22 | w: u32, 23 | /// minimizer k-mer size 24 | #[clap(long, short, default_value_t = 56)] 25 | k: u32, 26 | /// sparse minimizer (shimmer) reduction factor 27 | #[clap(long, short, default_value_t = 4)] 28 | r: u32, 29 | /// min span for neighboring minimiers 30 | #[clap(long, short, default_value_t = 64)] 31 | min_span: u32, 32 | } 33 | 34 | fn main() { 35 | CmdOptions::command().version(VERSION_STRING).get_matches(); 36 | let args = CmdOptions::parse(); 37 | // TODO: to log file 38 | //println!("read data from files in {:?}", args.filepath); 39 | //println!("output prefix {:?}", args.prefix); 40 | let _shmmr_spec = pgr_db::shmmrutils::ShmmrSpec { 41 | w: args.w, 42 | k: args.k, 43 | r: args.r, 44 | min_span: args.min_span, 45 | sketch: false, 46 | }; 47 | let mut sdb = SeqIndexDB::new(); 48 | let input_files = BufReader::new( 49 | File::open(Path::new(&args.filepath)) 50 | .expect("can't open the input file that contains the paths to the fastx files"), 51 | ); 52 | input_files.lines().enumerate().for_each(|(fid, filename)| { 53 | let filepath = filename 54 | .expect("can't get fastx file name") 55 | .trim() 56 | .to_string(); 57 | if fid == 0 { 58 | sdb.load_from_fastx(filepath.clone(), args.w, args.k, args.r, args.min_span, true) 59 | .unwrap_or_else(|_| panic!("fail to read the fastx file: {}", filepath)); 60 | } else { 61 | sdb.append_from_fastx(filepath.clone(), true) 62 | .unwrap_or_else(|_| panic!("fail to read the fastx file: {}", filepath)); 63 | } 64 | }); 65 | 66 | sdb.write_frag_and_index_files(args.prefix); 67 | } 68 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-mdb.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &str = env!("VERSION_STRING"); 2 | 3 | //use 
std::path::PathBuf; 4 | use clap::{self, CommandFactory, Parser}; 5 | 6 | #[cfg(feature = "with_agc")] 7 | use pgr_db::agc_io::AGCFile; 8 | 9 | #[cfg(feature = "with_agc")] 10 | use pgr_db::shmmrutils::ShmmrSpec; 11 | 12 | #[cfg(feature = "with_agc")] 13 | use std::fs::File; 14 | 15 | #[cfg(feature = "with_agc")] 16 | use std::io::{BufRead, BufReader}; 17 | 18 | #[cfg(feature = "with_agc")] 19 | use pgr_db::seq_db; 20 | 21 | /// Create pgr minimizer database with AGC backend 22 | #[derive(Parser, Debug)] 23 | #[clap(name = "pgr-mdb")] 24 | #[clap(author, version)] 25 | #[clap(about, long_about = None)] 26 | struct CmdOptions { 27 | filepath: String, 28 | prefix: String, 29 | /// minimizer window size 30 | #[clap(long, short, default_value_t = 80)] 31 | w: u32, 32 | /// minimizer k-mer size 33 | #[clap(long, short, default_value_t = 56)] 34 | k: u32, 35 | /// sparse minimizer (shimmer) reduction factor 36 | #[clap(long, short, default_value_t = 4)] 37 | r: u32, 38 | /// min span for neighboring minimiers 39 | #[clap(long, short, default_value_t = 64)] 40 | min_span: u32, 41 | /// using sketch k-mer than minimizer 42 | #[clap(short, long)] 43 | sketch: bool, 44 | /// set to use agc prefecting feature (more memory usage but faster, useful for agcfile with many small contigs) 45 | #[clap(short, long)] 46 | prefetching: bool, 47 | /// number of parallel agc reader threads (more memory usage) 48 | #[clap(long, short, default_value_t = 4)] 49 | number_of_readers: usize, 50 | } 51 | 52 | #[cfg(feature = "with_agc")] 53 | fn load_write_index_from_agcfile( 54 | path: String, 55 | prefix: String, 56 | shmmr_spec: &ShmmrSpec, 57 | prefetching: bool, 58 | number_of_readers: usize, 59 | ) -> Result<(), std::io::Error> { 60 | let mut sdb = seq_db::CompactSeqDB::new(shmmr_spec.clone()); 61 | let filelist = File::open(path)?; 62 | 63 | BufReader::new(filelist) 64 | .lines() 65 | .try_for_each(|fp| -> Result<(), std::io::Error> { 66 | let fp = fp.unwrap(); 67 | //println!("load file {}", fp); 68 | let mut agcfile: AGCFile = AGCFile::new(fp)?; 69 | agcfile.set_iter_thread(number_of_readers); 70 | agcfile.set_prefetching(prefetching); 71 | //println!("start to load index"); 72 | let _ = sdb.load_index_from_agcfile(agcfile); 73 | Ok(()) 74 | })?; 75 | 76 | //seq_db::write_shmr_map_file(&sdb.frag_map, "test.db".to_string()); 77 | sdb.write_shmmr_map_index(prefix)?; 78 | Ok(()) 79 | } 80 | 81 | fn main() { 82 | CmdOptions::command().version(VERSION_STRING).get_matches(); 83 | 84 | #[cfg(feature = "with_agc")] 85 | let args = CmdOptions::parse(); 86 | // TODO: to log file 87 | //println!("read data from files in {:?}", args.filepath); 88 | //println!("output prefix {:?}", args.prefix); 89 | 90 | #[cfg(feature = "with_agc")] 91 | let shmmr_spec = pgr_db::shmmrutils::ShmmrSpec { 92 | w: args.w, 93 | k: args.k, 94 | r: args.r, 95 | min_span: args.min_span, 96 | sketch: args.sketch, 97 | }; 98 | 99 | #[cfg(feature = "with_agc")] 100 | load_write_index_from_agcfile( 101 | args.filepath, 102 | args.prefix.clone(), 103 | &shmmr_spec, 104 | args.prefetching, 105 | args.number_of_readers, 106 | ) 107 | .unwrap(); 108 | 109 | #[cfg(not(feature = "with_agc"))] 110 | panic!("the command is not compiled with `with_agc` feature") 111 | } 112 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-merge-svcnd-bed.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &str = env!("VERSION_STRING"); 2 | use clap::{self, 
CommandFactory, Parser}; 3 | // use rayon::prelude::*; 4 | use rustc_hash::FxHashMap; 5 | use std::fs::File; 6 | use std::io::{BufRead, BufReader, BufWriter, Write}; 7 | use std::path::Path; 8 | 9 | /// Merge svcnd from multiple *.svcnd.bed files into one and compute the merged regions 10 | /// It is useful to identify unique bed regions to one specific haplotype 11 | #[derive(Parser, Debug)] 12 | #[clap(name = "pgr-merge-svcnd-bed")] 13 | #[clap(author, version)] 14 | #[clap(about, long_about = None)] 15 | struct CmdOptions { 16 | /// path to the file contain the input bed files, each line should be "labelinput file path" 17 | input_files: String, 18 | /// the path of the output files 19 | output_path: String, 20 | /// number of threads used in parallel (more memory usage), default to "0" using all CPUs available or the number set by RAYON_NUM_THREADS 21 | #[clap(long, default_value_t = 0)] 22 | number_of_thread: usize, 23 | } 24 | 25 | type Interval = ((u32, u32), (String, String)); 26 | fn main() { 27 | CmdOptions::command().version(VERSION_STRING).get_matches(); 28 | let args = CmdOptions::parse(); 29 | 30 | rayon::ThreadPoolBuilder::new() 31 | .num_threads(args.number_of_thread) 32 | .build_global() 33 | .unwrap(); 34 | 35 | let input_files = BufReader::new(File::open(Path::new(&args.input_files)).unwrap()); 36 | 37 | let input_files = input_files 38 | .lines() 39 | .flat_map(|line| { 40 | if let Ok(line) = line { 41 | let rec = line.trim().split('\t').collect::>(); 42 | assert!(rec.len() >= 2); 43 | Some((rec[0].to_string(), rec[1].to_string())) 44 | } else { 45 | None 46 | } 47 | }) 48 | .collect::>(); 49 | 50 | let mut interval_collection = 51 | FxHashMap::>::default(); 52 | input_files.iter().for_each(|(label, path)| { 53 | let bed_reader = BufReader::new(File::open(Path::new(path)).unwrap()); 54 | bed_reader.lines().for_each(|line| { 55 | if let Ok(line) = line { 56 | if line.starts_with('#') { 57 | return; 58 | }; 59 | let err_msg = format!("fail to parse on {}", line); 60 | let fields = line.split('\t').collect::>(); 61 | let chr = fields[0].to_string(); 62 | let bgn = fields[1].parse::().expect(&err_msg); 63 | let end = fields[2].parse::().expect(&err_msg); 64 | let annotation = fields[3].to_string(); 65 | let e = interval_collection.entry(chr).or_insert_with(Vec::new); 66 | e.push(((bgn, end), (label.clone(), annotation))); 67 | } 68 | }); 69 | }); 70 | 71 | let group_intervals = |intervals: &mut Vec| -> Vec<(u32, u32, Vec)> { 72 | let mut interval_groups = Vec::<(u32, u32, Vec)>::new(); 73 | if intervals.is_empty() { 74 | return interval_groups; 75 | } 76 | 77 | intervals.sort(); 78 | let (mut current_bgn, mut current_end) = intervals.first().unwrap().0; 79 | 80 | let mut current_groups = Vec::::new(); 81 | intervals.iter().for_each(|(interval, payload)| { 82 | if current_end < interval.0 { 83 | interval_groups.push((current_bgn, current_end, current_groups.clone())); 84 | current_groups.clear(); 85 | current_groups.push((*interval, payload.clone())); 86 | (current_bgn, current_end) = *interval; 87 | } else { 88 | current_groups.push((*interval, payload.clone())); 89 | if current_end < interval.1 { 90 | current_end = interval.1; 91 | } 92 | } 93 | }); 94 | if !current_groups.is_empty() { 95 | interval_groups.push((current_bgn, current_end, current_groups.clone())); 96 | } 97 | interval_groups 98 | }; 99 | 100 | let mut out_bed = BufWriter::new(File::create(Path::new(&args.output_path)).unwrap()); 101 | let mut keys = interval_collection.keys().cloned().collect::>(); 102 | 
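// Sort the chromosome names so the merged interval groups are emitted in a
// deterministic order across runs.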
keys.sort(); 103 | keys.into_iter().for_each(|key| { 104 | let intervals = interval_collection.get_mut(&key).unwrap(); 105 | let interval_groups = group_intervals(intervals); 106 | interval_groups.into_iter().for_each(|intervals| { 107 | if intervals.2.is_empty() { 108 | return; 109 | } 110 | let itvl_group_bgn = intervals.0; 111 | let itvl_group_end = intervals.1; 112 | if itvl_group_bgn > itvl_group_end { 113 | return; 114 | }; 115 | 116 | let mut label_count = FxHashMap::::default(); 117 | let mut total_interval_counts = 0u32; 118 | intervals.2.iter().for_each(|(_interval, payload)| { 119 | let e = label_count.entry(payload.0.clone()).or_default(); 120 | *e += 1; 121 | total_interval_counts += 1; 122 | }); 123 | 124 | writeln!( 125 | out_bed, 126 | "{}\t{}\t{}\tmerged:{}:{}", 127 | key, 128 | itvl_group_bgn, 129 | itvl_group_end, 130 | label_count.len(), 131 | total_interval_counts 132 | ) 133 | .expect("unable to write the output file"); 134 | 135 | intervals.2.iter().for_each(|(interval, payload)| { 136 | let number_haplotype = label_count.len(); 137 | let e = label_count.entry(payload.0.clone()).or_default(); 138 | writeln!( 139 | out_bed, 140 | "{}\t{}\t{}\t{}:{}:{}-{}:{}:{}", 141 | key, 142 | interval.0, 143 | interval.1, 144 | payload.0, 145 | payload.1, 146 | itvl_group_bgn, 147 | itvl_group_end, 148 | number_haplotype, 149 | *e, 150 | ) 151 | .expect("unable to write the output file"); 152 | }); 153 | }); 154 | }); 155 | } 156 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-pbundle-aln.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &str = env!("VERSION_STRING"); 2 | use clap::{self, CommandFactory, Parser}; 3 | use rustc_hash::FxHashMap; 4 | use serde::*; 5 | use serde_json::json; 6 | use std::io::{BufRead, BufReader, BufWriter, Write}; 7 | use std::path::Path; 8 | use std::{fs::File, path}; 9 | 10 | /// Generate alignment between sequences using bundle decomposition from a principal bundle bed file 11 | #[derive(Parser, Debug)] 12 | #[clap(name = "pgr-pbundle-aln")] 13 | #[clap(author, version)] 14 | #[clap(about, long_about = None)] 15 | struct CmdOptions { 16 | /// the path to the principal bundle bed file 17 | bed_file_path: String, 18 | /// a file contain two lines of the contig ids that should be aligned to each other 19 | aln_spec: String, 20 | /// the prefix of the output file 21 | output_prefix: String, 22 | } 23 | 24 | #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] 25 | struct BundleSegment { 26 | bgn: u32, 27 | end: u32, 28 | bundle_id: u32, 29 | bundle_v_count: u32, 30 | bundle_dir: u32, 31 | bundle_v_bgn: u32, 32 | bundle_v_end: u32, 33 | } 34 | 35 | #[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)] 36 | enum AlnType { 37 | Match, 38 | Insertion, 39 | Deletion, 40 | } 41 | type AlnPathElement = (usize, usize, AlnType, u32, u32, usize, usize); 42 | type AlnPath = Vec; 43 | 44 | fn align_bundles( 45 | q_bundles: &[BundleSegment], 46 | t_bundles: &[BundleSegment], 47 | ) -> (f32, usize, usize, AlnPath) { 48 | let q_count = q_bundles.len(); 49 | let t_count = t_bundles.len(); 50 | let mut s_map = FxHashMap::<(usize, usize), i64>::default(); 51 | let mut t_map = FxHashMap::<(usize, usize), AlnType>::default(); 52 | 53 | let mut get_aln_direction_with_best_score = 54 | |q_idx: usize, t_idx: usize, s_map: &FxHashMap<(usize, usize), i64>| -> (AlnType, i64) { 55 | let mut best = (AlnType::Match, 
i64::MIN); 56 | let q_len = (q_bundles[q_idx].end as i64 - q_bundles[q_idx].bgn as i64).abs(); 57 | let t_len = (t_bundles[t_idx].end as i64 - t_bundles[t_idx].bgn as i64).abs(); 58 | let min_len = if q_len > t_len { t_len } else { q_len }; 59 | let q_b_seg = q_bundles[q_idx]; 60 | let t_b_seg = t_bundles[t_idx]; 61 | if q_idx == 0 62 | && t_idx == 0 63 | && (q_b_seg.bundle_id == t_b_seg.bundle_id) 64 | && (q_b_seg.bundle_dir == t_b_seg.bundle_dir) 65 | { 66 | best = (AlnType::Match, 2 * min_len) 67 | }; 68 | if q_idx > 0 69 | && t_idx > 0 70 | && q_b_seg.bundle_id == t_b_seg.bundle_id 71 | && (q_b_seg.bundle_dir == t_b_seg.bundle_dir) 72 | { 73 | best = ( 74 | AlnType::Match, 75 | 2 * min_len + s_map.get(&(q_idx - 1, t_idx - 1)).unwrap(), 76 | ) 77 | }; 78 | if t_idx > 0 { 79 | let score = -2 * q_len + s_map.get(&(q_idx, t_idx - 1)).unwrap(); 80 | if score > best.1 { 81 | best = (AlnType::Deletion, score) 82 | }; 83 | }; 84 | if q_idx > 0 { 85 | let score = -2 * t_len + s_map.get(&(q_idx - 1, t_idx)).unwrap(); 86 | if score > best.1 { 87 | best = (AlnType::Insertion, score) 88 | } 89 | } 90 | t_map.insert((q_idx, t_idx), best.0); 91 | best 92 | }; 93 | 94 | //let mut best_score = 0; 95 | //let mut best_q_idx = 0; 96 | //let mut best_t_idx = 0; 97 | let mut aln_path = AlnPath::new(); 98 | 99 | (0..t_count) 100 | .flat_map(|t_idx| (0..q_count).map(move |q_idx| (q_idx, t_idx))) 101 | .for_each(|(q_idx, t_idx)| { 102 | //println!("{} {}", q_idx, t_idx); 103 | let (_, score) = get_aln_direction_with_best_score(q_idx, t_idx, &s_map); 104 | s_map.insert((q_idx, t_idx), score); 105 | /* 106 | if score > best_score { 107 | best_score = score; 108 | best_q_idx = q_idx; 109 | best_t_idx = t_idx; 110 | } 111 | */ 112 | }); 113 | let mut q_idx = q_count - 1; 114 | let mut t_idx = t_count - 1; 115 | let mut diff_len = 0_usize; 116 | let mut max_len = 1_usize; 117 | while let Some(aln_type) = t_map.get(&(q_idx, t_idx)) { 118 | let qq_idx = q_idx; 119 | let tt_idx = t_idx; 120 | let (diff_len_delta, max_len_delta) = match aln_type { 121 | AlnType::Match => { 122 | let q_len = (q_bundles[q_idx].end as i64 - q_bundles[q_idx].bgn as i64).abs(); 123 | let t_len = (t_bundles[t_idx].end as i64 - t_bundles[t_idx].bgn as i64).abs(); 124 | let diff_len_delta = (q_len - t_len).unsigned_abs() as usize; 125 | let max_len_delata = if q_len > t_len { 126 | q_len as usize 127 | } else { 128 | t_len as usize 129 | }; 130 | q_idx -= 1; 131 | t_idx -= 1; 132 | (diff_len_delta, max_len_delata) 133 | } 134 | AlnType::Insertion => { 135 | let q_len = (q_bundles[q_idx].end as i64 - q_bundles[q_idx].bgn as i64).abs(); 136 | q_idx -= 1; 137 | (q_len as usize, q_len as usize) 138 | } 139 | AlnType::Deletion => { 140 | let t_len = (t_bundles[t_idx].end as i64 - t_bundles[t_idx].bgn as i64).abs(); 141 | t_idx -= 1; 142 | (t_len as usize, t_len as usize) 143 | } 144 | }; 145 | diff_len += diff_len_delta; 146 | max_len += max_len_delta; 147 | aln_path.push(( 148 | qq_idx, 149 | tt_idx, 150 | *aln_type, 151 | q_bundles[qq_idx].bundle_id, 152 | t_bundles[tt_idx].bundle_id, 153 | diff_len_delta, 154 | max_len_delta, 155 | )); 156 | } 157 | aln_path.reverse(); 158 | ( 159 | diff_len as f32 / max_len as f32, 160 | diff_len, 161 | max_len, 162 | aln_path, 163 | ) 164 | } 165 | 166 | fn main() -> std::result::Result<(), std::io::Error> { 167 | CmdOptions::command().version(VERSION_STRING).get_matches(); 168 | let args = CmdOptions::parse(); 169 | let bed_file_path = path::Path::new(&args.bed_file_path); 170 | let bed_file = 
BufReader::new(File::open(bed_file_path).expect("can't open the bed file"));
171 | let mut ctg_data = FxHashMap::<String, Vec<BundleSegment>>::default();
172 | let bed_file_parse_err_msg = "bed file parsing error";
173 | bed_file.lines().for_each(|line| {
174 | let line = line.unwrap().trim().to_string();
175 | if line.is_empty() {
176 | return;
177 | }
178 | if &line[0..1] == "#" {
179 | return;
180 | }
181 | let bed_fields = line.split('\t').collect::<Vec<&str>>();
182 | let ctg: String = bed_fields[0].to_string();
183 | let bgn: u32 = bed_fields[1].parse().expect(bed_file_parse_err_msg);
184 | let end: u32 = bed_fields[2].parse().expect(bed_file_parse_err_msg);
185 | let pbundle_fields = bed_fields[3].split(':').collect::<Vec<&str>>();
186 | let bundle_id: u32 = pbundle_fields[0].parse().expect(bed_file_parse_err_msg);
187 | let bundle_v_count: u32 = pbundle_fields[1].parse().expect(bed_file_parse_err_msg);
188 | let bundle_dir: u32 = pbundle_fields[2].parse().expect(bed_file_parse_err_msg);
189 | let bundle_v_bgn: u32 = pbundle_fields[3].parse().expect(bed_file_parse_err_msg);
190 | let bundle_v_end: u32 = pbundle_fields[4].parse().expect(bed_file_parse_err_msg);
191 |
192 | let e = ctg_data.entry(ctg).or_default();
193 | let b_seg = BundleSegment {
194 | bgn,
195 | end,
196 | bundle_id,
197 | bundle_v_count,
198 | bundle_dir,
199 | bundle_v_bgn,
200 | bundle_v_end,
201 | };
202 | e.push(b_seg);
203 | });
204 |
205 | let aln_spec = path::Path::new(&args.aln_spec);
206 | let spec_file = BufReader::new(File::open(aln_spec).expect("can't open the aln_spec file"));
207 | let mut ctg_of_interests = Vec::<String>::new();
208 | spec_file.lines().for_each(|line| {
209 | let line = line.unwrap().trim().to_string();
210 | ctg_of_interests.push(line);
211 | });
212 |
213 | let ctg_data = ctg_of_interests
214 | .into_iter()
215 | .map(|k| {
216 | let v = ctg_data
217 | .get(&k)
218 | .unwrap_or_else(|| panic!("ctg name not found: {}", k));
219 | (k, v)
220 | })
221 | .collect::<Vec<_>>();
222 |
223 | let n_ctg = ctg_data.len();
224 |
225 | let mut alignment_paths = Vec::<_>::new();
226 | let ctg_idx0 = 0;
227 | (1..n_ctg).for_each(|ctg_idx1| {
228 | // the first sequence is the "target"
229 | let (target_ctg, target_bundles) = &ctg_data[ctg_idx0];
230 | let (query_ctg, query_bundles) = &ctg_data[ctg_idx1];
231 | let (_dist0, _diff_len0, _max_len0, aln_path) =
232 | align_bundles(query_bundles, target_bundles);
233 |
234 | let aln_path = aln_path
235 | .into_iter()
236 | .map(
237 | |(
238 | qq_idx,
239 | tt_idx,
240 | aln_type,
241 | _q_bundle_id,
242 | _t_bundle_id,
243 | _diff_len_delta,
244 | _max_len_delta,
245 | )| {
246 | let target_data = target_bundles.get(tt_idx).unwrap();
247 | let query_data = query_bundles.get(qq_idx).unwrap();
248 | (qq_idx, tt_idx, aln_type, target_data, query_data)
249 | },
250 | )
251 | .collect::<Vec<_>>();
252 | alignment_paths.push((target_ctg, query_ctg, aln_path))
253 | });
254 |
255 | let out_path = Path::new(&args.output_prefix).with_extension("bln.json");
256 | let mut out_file =
257 | BufWriter::new(File::create(out_path).expect("can't create the bundle alignment file"));
258 |
259 | let out_json = json!(alignment_paths);
260 | out_file.write_all(out_json.to_string().as_bytes())?;
261 |
262 | Ok(())
263 | }
264 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-pbundle-bed2sorted.rs: --------------------------------------------------------------------------------
1 | const VERSION_STRING: &str = env!("VERSION_STRING");
2 | use clap::{self, CommandFactory, Parser};
3 | use
rustc_hash::FxHashMap;
4 | use std::io::{BufRead, BufReader, BufWriter, Write};
5 | use std::path::Path;
6 | use std::{fs::File, path};
7 |
8 | /// Generate annotation file with a sorting order from the principal bundle decomposition
9 | #[derive(Parser, Debug)]
10 | #[clap(name = "pgr-pbundle-bed2sorted")]
11 | #[clap(author, version)]
12 | #[clap(about, long_about = None)]
13 | struct CmdOptions {
14 | /// the path to the principal bundle bed file
15 | bed_file_path: String,
16 | /// the prefix of the output file
17 | output_prefix: String,
18 | }
19 |
20 | #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug)]
21 | struct BundleSegment {
22 | bgn: u32,
23 | end: u32,
24 | bundle_id: u32,
25 | bundle_v_count: u32,
26 | bundle_dir: u32,
27 | bundle_v_bgn: u32,
28 | bundle_v_end: u32,
29 | }
30 |
31 | fn main() -> Result<(), std::io::Error> {
32 | CmdOptions::command().version(VERSION_STRING).get_matches();
33 | let args = CmdOptions::parse();
34 | let bed_file_path = path::Path::new(&args.bed_file_path);
35 | let bed_file = BufReader::new(File::open(bed_file_path)?);
36 | let mut ctg_data = FxHashMap::<String, Vec<BundleSegment>>::default();
37 | let bed_file_parse_err_msg = "bed file parsing error";
38 | let mut node_length = FxHashMap::<(u32, u32), Vec<_>>::default();
39 | bed_file.lines().for_each(|line| {
40 | let line = line.unwrap().trim().to_string();
41 | if line.is_empty() {
42 | return;
43 | }
44 | if &line[0..1] == "#" {
45 | return;
46 | }
47 | let bed_fields = line.split('\t').collect::<Vec<&str>>();
48 | let ctg: String = bed_fields[0].to_string();
49 | let bgn: u32 = bed_fields[1].parse().expect(bed_file_parse_err_msg);
50 | let end: u32 = bed_fields[2].parse().expect(bed_file_parse_err_msg);
51 | let pbundle_fields = bed_fields[3].split(':').collect::<Vec<&str>>();
52 | let bundle_id: u32 = pbundle_fields[0].parse().expect(bed_file_parse_err_msg);
53 | let bundle_v_count: u32 = pbundle_fields[1].parse().expect(bed_file_parse_err_msg);
54 | let bundle_dir: u32 = pbundle_fields[2].parse().expect(bed_file_parse_err_msg);
55 | let bundle_v_bgn: u32 = pbundle_fields[3].parse().expect(bed_file_parse_err_msg);
56 | let bundle_v_end: u32 = pbundle_fields[4].parse().expect(bed_file_parse_err_msg);
57 |
58 | let e = ctg_data.entry(ctg).or_default();
59 | let b_seg = BundleSegment {
60 | bgn,
61 | end,
62 | bundle_id,
63 | bundle_v_count,
64 | bundle_dir,
65 | bundle_v_bgn,
66 | bundle_v_end,
67 | };
68 | e.push(b_seg);
69 | if (bundle_v_bgn as i64 - bundle_v_end as i64).abs() as f32 > (bundle_v_count as f32) * 0.5
70 | {
71 | let e = node_length.entry((bundle_id, bundle_dir)).or_default();
72 | e.push((end as i64 - bgn as i64).unsigned_abs());
73 | }
74 | });
75 |
76 | let mut node_length = node_length
77 | .into_iter()
78 | .map(|(n, v)| {
79 | let c = v.len() as f64;
80 | let sum = v.into_iter().sum::<u64>() as f64;
81 | (sum / c, n)
82 | })
83 | .collect::<Vec<_>>();
84 | node_length.sort_by(|a, b| b.partial_cmp(a).unwrap());
85 |
86 | let mut ctg_data = ctg_data
87 | .into_iter()
88 | .map(|(ctg, mut bundle_segs)| {
89 | bundle_segs.sort();
90 | let mut node_count = FxHashMap::<(u32, u32), u32>::default();
91 | bundle_segs.iter().for_each(|vv| {
92 | let node = (vv.bundle_id, vv.bundle_dir);
93 | if (vv.bundle_v_bgn as i64 - vv.bundle_v_end as i64).abs() as f32
94 | > (vv.bundle_v_count as f32) * 0.5
95 | {
96 | let e = node_count.entry(node).or_insert(0);
97 | *e += 1;
98 | }
99 | });
100 | let mut sort_key = vec![];
101 | node_length.iter().for_each(|&(_, n)| {
102 | sort_key.push(*node_count.get(&n).unwrap_or(&0));
103 | });
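// `node_length` was sorted above by decreasing mean bundle span, so each
// contig's `sort_key` records, in that order, how often each of the longest
// bundles occurs in the contig; the sort-and-reverse below therefore groups
// contigs sharing the same dominant bundles together in the output.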
104 |
105 | (sort_key, ctg, bundle_segs)
106 | })
107 | .collect::<Vec<_>>();
108 |
109 | ctg_data.sort();
110 | ctg_data.reverse();
111 |
112 | let out_path = Path::new(&args.output_prefix).with_extension("ord");
113 | let mut out_file = BufWriter::new(File::create(out_path)?);
114 |
115 | ctg_data.into_iter().for_each(|(sort_key, ctg, _)| {
116 | let sort_key = sort_key
117 | .into_iter()
118 | .map(|k| format!("{}", k))
119 | .collect::<Vec<String>>();
120 | let sort_key = sort_key.join(",");
121 | writeln!(
122 | out_file,
123 | "{}\t{}",
124 |
125 | ctg, sort_key
126 | )
127 | .expect("writing error");
128 | });
129 |
130 | Ok(())
131 | }
132 | -------------------------------------------------------------------------------- /pgr-bin/utility_scripts/get_cytoband_to_json.py: --------------------------------------------------------------------------------
1 | import os
2 | import json
3 |
4 | if __name__ == "__main__":
5 | os.system("wget https://s3.amazonaws.com/igv.org.genomes/hg38/annotations/cytoBandIdeo.txt.gz")
6 | os.system("gunzip -f cytoBandIdeo.txt.gz")
7 |
8 | cytobands = {}
9 | with open("cytoBandIdeo.txt") as f:
10 | for row in f:
11 | row = row.strip().split("\t")
12 | cytobands.setdefault(row[0], [])
13 | cytobands[row[0]].append( (int(row[1]), int(row[2]), row[3], row[4]) )
14 |
15 | out = open("cytoBandIdeo.json","w")
16 | json.dump({"cytobands": cytobands}, out)
17 | out.close() -------------------------------------------------------------------------------- /pgr-db/Cargo.toml: --------------------------------------------------------------------------------
1 | [package]
2 | name = "pgr-db"
3 | version = "0.6.0"
4 | edition = "2021"
5 | authors = ["Jason Chin "]
6 | build = "build.rs"
7 |
8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
9 | [build-dependencies]
10 | bindgen = "0.60.1"
11 |
12 | [dependencies]
13 | rustc-hash = "1.1.0"
14 | #flate2 = { version = "1.0.17", features = ["zlib-ng-compat"], default-features = false }
15 | flate2 = "1.0.17"
16 | log = { version = "0.4.19", features = ["std", "max_level_debug", "release_max_level_warn"]}
17 | simple_logger = "4.2.0"
18 | rayon = "1.5.2"
19 | libc = "0.2"
20 | byteorder = "1.3.4"
21 | petgraph = "0.6.1"
22 | cuckoofilter = "0.5"
23 | bgzip = "0.2.1"
24 | serde = { version = "1.0.137", features = ["derive", "rc"] }
25 | serde_json = "1.0.81"
26 | regex = "1"
27 | bincode = { version = "2.0.0-rc.1", features = ["alloc"] }
28 | memmap2 = "0.5.10"
29 | wavefront-aln = {git = "https://github.com/cschin/wavefront-aln.git"}
30 |
31 | [features]
32 | default = ["with_agc"]
33 | with_agc = []
34 | -------------------------------------------------------------------------------- /pgr-db/build.rs: --------------------------------------------------------------------------------
1 | extern crate bindgen;
2 | use std::env::consts::{ARCH, OS};
3 |
4 | #[cfg(debug_assertions)]
5 | const BUILD_TYPE: &str = "debug";
6 | #[cfg(not(debug_assertions))]
7 | const BUILD_TYPE: &str = "release";
8 |
9 | #[cfg(feature = "with_agc")]
10 | use std::fs::{read_dir, remove_dir_all};
11 |
12 | #[cfg(feature = "with_agc")]
13 | use std::path::PathBuf;
14 |
15 | use std::{env, process::Command};
16 |
17 | #[cfg(feature = "with_agc")]
18 | fn build_agc() -> Option<()> {
19 | let mut agc_dir = read_dir("../agc").ok()?;
20 | if !agc_dir.any(|f| f.unwrap().file_name() == "makefile") {
21 | return None;
22 | }
23 |
24 | let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
25 |
26 | let agc_path =
out_path.join("agc"); 27 | 28 | let _ = remove_dir_all(agc_path.as_path()); 29 | 30 | // copy the AGC dir to OUT_PATH and build it there... clunky, but 31 | // don't want to pull in the entire 100MB WFA repo, since git2 32 | // doesn't seem to support shallow clones, and build scripts 33 | // should only modify things inside OUT_PATH. since the WFA folder 34 | // is just a couple MB, this is fine for now. 35 | let _cp_agc = Command::new("cp") 36 | .arg("-r") 37 | .arg("../agc") 38 | .arg(&out_path) 39 | .output() 40 | .unwrap(); 41 | 42 | let output = Command::new("make") 43 | .arg("-f") 44 | .arg("makefile.release") 45 | .arg("clean") 46 | .arg("libagc") 47 | .current_dir(&agc_path) 48 | .output() 49 | .unwrap(); 50 | if output.status.success() { 51 | Some(()) 52 | } else { 53 | panic!("make error: {}", String::from_utf8_lossy(&output.stderr)); 54 | } 55 | } 56 | 57 | // fn wfa() { 58 | // let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); 59 | // let _cp_agc = Command::new("cp") 60 | // .arg("../WFA2-lib/lib/libwfa.a") 61 | // .arg(&out_path) 62 | // .output() 63 | // .unwrap(); 64 | // // The directory of the WFA libraries, added to the search path. 65 | // println!("cargo:rustc-link-search={}", out_path.display()); 66 | // // Link the `wfa-lib` library. 67 | // println!("cargo:rustc-link-lib=wfa"); 68 | // // Also link `omp`. 69 | // println!("cargo:rustc-link-lib=omp5"); 70 | // } 71 | 72 | fn main() { 73 | //wfa(); 74 | 75 | #[cfg(feature = "with_agc")] 76 | if build_agc().is_none() { 77 | panic!("Error building AGC C library"); 78 | } else { 79 | let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); 80 | let agc_path = out_path.join("agc"); 81 | 82 | // shared library. 83 | println!("cargo:rustc-link-lib=agc"); 84 | println!("cargo:rustc-link-search={}", agc_path.display()); 85 | println!("cargo:rustc-link-lib=zstd"); 86 | println!("cargo:rustc-link-search={}/libs", agc_path.display()); 87 | println!("cargo:rustc-link-lib=stdc++"); 88 | println!("cargo:rustc-link-search=/usr/lib/gcc/x86_64-linux-gnu/11/"); 89 | 90 | // Tell cargo to invalidate the built crate whenever the wrapper changes 91 | println!("cargo:rerun-if-changed=wrapper.h"); 92 | 93 | // The bindgen::Builder is the main entry point 94 | // to bindgen, and lets you build up options for 95 | // the resulting bindings. 96 | let bindings = bindgen::Builder::default() 97 | // The input header we would like to generate 98 | // bindings for. 99 | .header("wrapper.h") 100 | // Tell cargo to invalidate the built crate whenever any of the 101 | // included header files changed. 102 | .parse_callbacks(Box::new(bindgen::CargoCallbacks)) 103 | // Finish the builder and generate the bindings. 104 | .generate() 105 | // Unwrap the Result and panic on failure. 106 | .expect("Unable to generate bindings"); 107 | 108 | // Write the bindings to the $OUT_DIR/bindings.rs file. 
109 | bindings 110 | .write_to_file(out_path.join("bindings.rs")) 111 | .expect("Couldn't write bindings!"); 112 | } 113 | // from https://vallentin.dev/2019/06/06/versioning 114 | let branch_name = get_branch_name(); 115 | if branch_name != *"bioconda" { 116 | let version_string = format!( 117 | "{} {} ({}:{}{}, {} build, {} [{}] [{}])", 118 | env!("CARGO_PKG_NAME"), 119 | env!("CARGO_PKG_VERSION"), 120 | get_branch_name(), 121 | get_commit_hash(), 122 | if is_working_tree_clean() { "" } else { "+" }, 123 | BUILD_TYPE, 124 | OS, 125 | ARCH, 126 | get_rustc_version() 127 | ); 128 | 129 | println!("cargo:rustc-env=VERSION_STRING={}", version_string); 130 | } else { 131 | let version_string = format!( 132 | "{} {} (bioconda {} build ({}:{}{}), {} [{}] [{}])", 133 | env!("CARGO_PKG_NAME"), 134 | env!("CARGO_PKG_VERSION"), 135 | BUILD_TYPE, 136 | get_branch_name(), 137 | get_commit_hash(), 138 | if is_working_tree_clean() { "" } else { "+" }, 139 | OS, 140 | ARCH, 141 | get_rustc_version() 142 | ); 143 | println!("cargo:rustc-env=VERSION_STRING={}", version_string); 144 | } 145 | } 146 | 147 | fn get_rustc_version() -> String { 148 | let output = Command::new("rustc") 149 | .arg("--version") 150 | .current_dir(env!("CARGO_MANIFEST_DIR")) 151 | .output() 152 | .unwrap(); 153 | 154 | assert!(output.status.success()); 155 | 156 | String::from_utf8_lossy(&output.stdout) 157 | .trim_end() 158 | .to_string() 159 | } 160 | 161 | fn get_commit_hash() -> String { 162 | let output = Command::new("git") 163 | .arg("log") 164 | .arg("-1") 165 | .arg("--pretty=format:%h") // Abbreviated commit hash 166 | // .arg("--pretty=format:%H") // Full commit hash 167 | .current_dir(env!("CARGO_MANIFEST_DIR")) 168 | .output() 169 | .unwrap(); 170 | 171 | // assert!(output.status.success()); 172 | if output.status.success() { 173 | String::from_utf8_lossy(&output.stdout).to_string() 174 | } else { 175 | String::from("bioconda") 176 | } 177 | } 178 | 179 | fn get_branch_name() -> String { 180 | let output = Command::new("git") 181 | .arg("rev-parse") 182 | .arg("--abbrev-ref") 183 | .arg("HEAD") 184 | .current_dir(env!("CARGO_MANIFEST_DIR")) 185 | .output() 186 | .unwrap(); 187 | 188 | //assert!(output.status.success()); 189 | if output.status.success() { 190 | String::from_utf8_lossy(&output.stdout) 191 | .trim_end() 192 | .to_string() 193 | } else { 194 | String::from("bioconda") 195 | } 196 | } 197 | 198 | fn is_working_tree_clean() -> bool { 199 | let status = Command::new("git") 200 | .arg("diff") 201 | .arg("--quiet") 202 | .arg("--exit-code") 203 | .current_dir(env!("CARGO_MANIFEST_DIR")) 204 | .status() 205 | .unwrap(); 206 | 207 | if status.success() { 208 | status.code().unwrap() == 0 209 | } else { 210 | true 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /pgr-db/src/bindings.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_upper_case_globals)] 2 | #![allow(non_camel_case_types)] 3 | #![allow(non_snake_case)] 4 | #[cfg(feature = "with_agc")] 5 | include!(concat!(env!("OUT_DIR"), "/bindings.rs")); 6 | -------------------------------------------------------------------------------- /pgr-db/src/gff_db.rs: -------------------------------------------------------------------------------- 1 | use bgzip::BGZFReader; 2 | use rustc_hash::FxHashMap; 3 | use serde::{Deserialize, Serialize}; 4 | use serde_json; 5 | use std::char; 6 | use std::fmt; 7 | use std::fs::File; 8 | use std::io::BufRead; 9 | use 
std::path::Path;
10 | use std::rc::Rc;
11 |
12 | #[derive(Debug, Clone, Deserialize, Serialize)]
13 | pub struct GFFRecord {
14 | pub seqid: String,
15 | pub source: String,
16 | #[serde(rename = "type")]
17 | pub type_name: String,
18 | pub bgn: u32,
19 | pub end: u32,
20 | pub score: Option<f32>,
21 | pub strand: char,
22 | pub phase: Option<u8>,
23 | pub attributes: FxHashMap<String, String>,
24 | }
25 |
26 | impl GFFRecord {
27 | pub fn from_line(line: &str) -> GFFRecord {
28 | let fields = line
29 | .trim_end()
30 | .split('\t')
31 | .into_iter()
32 | .map(|s| s.to_string())
33 | .collect::<Vec<String>>();
34 | GFFRecord::from_fields(&fields)
35 | }
36 |
37 | pub fn from_fields(fields: &[String]) -> GFFRecord {
38 | let seqid = fields[0].clone();
39 | let source = fields[1].clone();
40 | let type_ = fields[2].clone();
41 | let bgn = fields[3]
42 | .parse::<u32>()
43 | .expect("can't parse the start coordinate");
44 | let end = fields[4]
45 | .parse::<u32>()
46 | .expect("can't parse the end coordinate");
47 | let score = match fields[5].as_str() {
48 | "." => None,
49 | s => Some(s.parse::<f32>().expect("can't parse score")),
50 | };
51 |
52 | let strand = fields[6][0..1].chars().next().unwrap();
53 | let phase = match fields[7].as_str() {
54 | "." => None,
55 | s => Some(s.parse::<u8>().unwrap_or_else(|_| {
56 | panic!("fail to parse the phase field {}", fields[7].as_str())
57 | })),
58 | };
59 | let attributes = fields[8]
60 | .split(';')
61 | .into_iter()
62 | .map(|s| {
63 | let kv = s
64 | .split('=')
65 | .into_iter()
66 | .map(|s| s.to_string())
67 | .collect::<Vec<String>>();
68 | if kv.len() != 2 {
69 | panic!("error parsing attributes")
70 | };
71 | (kv[0].clone(), kv[1].clone())
72 | })
73 | .collect::<FxHashMap<String, String>>();
74 |
75 | Self {
76 | seqid,
77 | source,
78 | type_name: type_,
79 | bgn,
80 | end,
81 | score,
82 | strand,
83 | phase,
84 | attributes,
85 | }
86 | }
87 | }
88 |
89 | impl fmt::Display for GFFRecord {
90 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
91 | let mut out = vec![];
92 | out.push(format!(
93 | "{}\t{}\t{}\t{}\t{}",
94 | self.seqid, self.source, self.type_name, self.bgn, self.end
95 | ));
96 |
97 | out.push(if self.score.is_none() {
98 | ".".to_string()
99 | } else {
100 | format!("{}", self.score.unwrap())
101 | });
102 |
103 | out.push(format!("{}", self.strand));
104 |
105 | out.push(if self.phase.is_none() {
106 | ".".to_string()
107 | } else {
108 | format!("{}", self.phase.unwrap())
109 | });
110 |
111 | out.push(
112 | self.attributes
113 | .iter()
114 | .map(|(k, v)| format!("{}={}", k, v))
115 | .collect::<Vec<String>>()
116 | .join(";"),
117 | );
118 |
119 | write!(f, "{}", out.join("\t"))
120 | }
121 | }
122 |
123 | type IdToGffRec = FxHashMap<String, Rc<GFFRecord>>;
124 | type IdToChildren = FxHashMap<String, Vec<Rc<GFFRecord>>>;
125 | type NameToGffRec = FxHashMap<String, Rc<GFFRecord>>;
126 |
127 | #[derive(Debug, Clone, Deserialize, Serialize)]
128 | pub struct GFFDB {
129 | pub header: Vec<String>,
130 | pub records: Vec<Rc<GFFRecord>>,
131 | pub id_to_rec: IdToGffRec,
132 | pub name_to_rec: NameToGffRec,
133 | pub children: IdToChildren,
134 | }
135 |
136 | impl GFFDB {
137 | pub fn from_bgzip_file(filepath: &Path) -> std::io::Result<GFFDB> {
138 | let file = BGZFReader::new(File::open(filepath)?);
139 | let mut header = Vec::<String>::new();
140 | let mut records = Vec::<Rc<GFFRecord>>::new();
141 | let mut id_to_rec = IdToGffRec::default();
142 | let mut name_to_rec = NameToGffRec::default();
143 | let mut children = IdToChildren::default();
144 | file.lines().into_iter().for_each(|line| {
145 | let line = line.unwrap();
146 | if &line[0..1] != "#" {
147 | let rec = Rc::new(GFFRecord::from_line(&line));
148 |
records.push(rec.clone());
149 |
150 | if rec.attributes.contains_key("ID") {
151 | let id = rec.attributes.get("ID").unwrap();
152 | id_to_rec.insert(id.clone(), rec.clone());
153 | }
154 | if rec.attributes.contains_key("Name") {
155 | let name = rec.attributes.get("Name").unwrap();
156 | name_to_rec.insert(name.clone(), rec.clone());
157 | }
158 | if rec.attributes.contains_key("Parent") {
159 | let parent_id = rec.attributes.get("Parent").unwrap();
160 | children
161 | .entry(parent_id.clone())
162 | .or_insert_with(Vec::new)
163 | .push(rec.clone());
164 | }
165 | } else {
166 | header.push(line);
167 | }
168 | });
169 | Ok(GFFDB {
170 | header,
171 | records,
172 | id_to_rec,
173 | name_to_rec,
174 | children,
175 | })
176 | }
177 |
178 | pub fn from_list_of_fields(list_of_fields: &[Vec<String>]) -> GFFDB {
179 | let header = Vec::<String>::new();
180 | let mut records = Vec::<Rc<GFFRecord>>::new();
181 | let mut id_to_rec = IdToGffRec::default();
182 | let mut name_to_rec = NameToGffRec::default();
183 | let mut children = IdToChildren::default();
184 |
185 | list_of_fields.iter().for_each(|fields| {
186 | let rec = Rc::new(GFFRecord::from_fields(fields));
187 | records.push(rec.clone());
188 |
189 | if rec.attributes.contains_key("ID") {
190 | let id = rec.attributes.get("ID").unwrap();
191 | id_to_rec.insert(id.clone(), rec.clone());
192 | }
193 | if rec.attributes.contains_key("Name") {
194 | let name = rec.attributes.get("Name").unwrap();
195 | name_to_rec.insert(name.clone(), rec.clone());
196 | }
197 | if rec.attributes.contains_key("Parent") {
198 | let parent_id = rec.attributes.get("Parent").unwrap();
199 | children
200 | .entry(parent_id.clone())
201 | .or_insert_with(Vec::new)
202 | .push(rec.clone());
203 | }
204 | });
205 |
206 | GFFDB {
207 | header,
208 | records,
209 | id_to_rec,
210 | name_to_rec,
211 | children,
212 | }
213 | }
214 |
215 | pub fn get_all_offspring(
216 | &self,
217 | id_or_name: &String,
218 | recursive: bool,
219 | ) -> Option<Vec<Rc<GFFRecord>>> {
220 | let mut all_offspring = Vec::<Rc<GFFRecord>>::new();
221 |
222 | let id = if self.id_to_rec.contains_key(id_or_name) {
223 | Some(id_or_name)
224 | } else if self.name_to_rec.contains_key(id_or_name) {
225 | let r = self.name_to_rec.get(id_or_name).unwrap();
226 | r.attributes.get("ID")
227 | } else {
228 | None
229 | };
230 |
231 | id?;
232 |
233 | let id = id.unwrap();
234 | match self.children.get(id) {
235 | Some(children) => {
236 | children.iter().for_each(|r| {
237 | if recursive && r.attributes.contains_key("ID") {
238 | let id = r.attributes.get("ID").unwrap();
239 | if let Some(more_offsprings) = self.get_all_offspring(id, recursive) {
240 | more_offsprings.iter().for_each(|r| {
241 | all_offspring.push(r.clone());
242 | });
243 | }
244 | }
245 | all_offspring.push(r.clone());
246 | });
247 | Some(all_offspring)
248 | }
249 | None => None,
250 | }
251 | }
252 |
253 | pub fn dump_json(&self) {
254 | println!("{}", serde_json::to_string(&self).unwrap());
255 | }
256 |
257 | pub fn load_json(s: &str) -> serde_json::Result<GFFDB> {
258 | let gffdb: GFFDB = serde_json::from_str(s)?;
259 | Ok(gffdb)
260 | }
261 | }
262 |
263 | #[derive(Debug, Clone, Deserialize, Serialize)]
264 | pub struct QueryOut {
265 | parent: Rc<GFFRecord>,
266 | offspring: Vec<Rc<GFFRecord>>,
267 | }
268 |
269 | #[cfg(test)]
270 | mod test {
271 | use super::*;
272 |
273 | #[test]
274 | fn test_gff_to_db() {
275 | let res = super::GFFDB::from_bgzip_file(&Path::new("./test/test_data/test.gff3.gz"));
276 | let gdb = res.unwrap();
277 | println!("{}", gdb.header.join("\n"));
278 | let r =
gdb.name_to_rec.get(&"FLG".to_string()).unwrap();
279 | let parent = r.clone();
280 | let mut offspring = Vec::<Rc<GFFRecord>>::new();
281 | println!("{}", r);
282 | gdb.get_all_offspring(&"FLG".to_string(), true)
283 | .unwrap()
284 | .into_iter()
285 | .for_each(|r| {
286 | println!("{}", r);
287 | offspring.push(r.clone());
288 | });
289 | let qr = QueryOut { parent, offspring };
290 | println!("{}", serde_json::to_string(&qr).unwrap());
291 | }
292 | }
293 | -------------------------------------------------------------------------------- /pgr-db/src/kmer_filter.rs: --------------------------------------------------------------------------------
1 | use cuckoofilter::CuckooFilter;
2 | use rustc_hash::FxHashSet;
3 | use std::collections::hash_map::DefaultHasher;
4 |
5 | pub struct KmerFilter {
6 | filter: CuckooFilter<DefaultHasher>,
7 | kmer_size: usize,
8 |
9 | }
10 |
11 | impl KmerFilter {
12 | pub fn new(kmer_size: usize) -> Self {
13 | let filter = CuckooFilter::new();
14 | KmerFilter { filter, kmer_size }
15 | }
16 |
17 | pub fn with_capacity(kmer_size: usize, capacity: usize) -> Self {
18 | let filter = CuckooFilter::with_capacity(capacity);
19 | KmerFilter { filter, kmer_size }
20 | }
21 | }
22 |
23 | impl KmerFilter {
24 | pub fn add_seq(&mut self, seq: &Vec<u8>) {
25 | (0..seq.len() - self.kmer_size).for_each(|pos| {
26 | self.filter.test_and_add(&seq[pos..pos + self.kmer_size]).unwrap();
27 | })
28 | }
29 |
30 | pub fn check_seq(&self, seq: &Vec<u8>) -> usize {
31 | let mut count = 0_usize;
32 | (0..seq.len() - self.kmer_size).for_each(|pos| {
33 | if self.filter.contains(&seq[pos..pos + self.kmer_size]) {
34 | count += 1
35 | };
36 | });
37 | count
38 | }
39 |
40 | pub fn add_seq_mmers(&mut self, seq: &Vec<u8>) {
41 | let k = self.kmer_size as u32;
42 | let w = k >> 1;
43 | let shmmrs = crate::shmmrutils::sequence_to_shmmrs1(0, seq, w, k, 1, 0, false);
44 | shmmrs.into_iter().for_each(|mmer| {
45 | self.filter.test_and_add(&mmer.x).unwrap();
46 | })
47 | }
48 |
49 | pub fn check_seq_mmers(&self, seq: &Vec<u8>) -> (usize, usize) {
50 | let mut count = 0_usize;
51 | let k = self.kmer_size as u32;
52 | let w = k >> 1;
53 | let shmmrs = crate::shmmrutils::sequence_to_shmmrs1(0, seq, w, k, 1, 0, false);
54 | shmmrs.iter().for_each(|mmer| {
55 | if self.filter.contains(&mmer.x) {
56 | count += 1
57 | };
58 | });
59 | (shmmrs.len(), count)
60 | }
61 | }
62 |
63 | pub struct MinimizerFilter {
64 | filter: FxHashSet<u64>,
65 | kmer_size: usize,
66 |
67 | }
68 |
69 | impl MinimizerFilter {
70 | pub fn new(kmer_size: usize) -> Self {
71 | let filter = FxHashSet::default();
72 | MinimizerFilter { filter, kmer_size }
73 | }
74 | }
75 |
76 | impl MinimizerFilter {
77 |
78 | pub fn add_seq_mmers(&mut self, seq: &Vec<u8>) {
79 | let k = self.kmer_size as u32;
80 | let w = k >> 1;
81 | let shmmrs = crate::shmmrutils::sequence_to_shmmrs1(0, seq, w, k, 1, 0, false);
82 | shmmrs.into_iter().for_each(|mmer| {
83 | self.filter.insert(mmer.x);
84 | })
85 | }
86 |
87 | pub fn check_seq_mmers(&self, seq: &Vec<u8>) -> (usize, usize) {
88 | let mut count = 0_usize;
89 | let k = self.kmer_size as u32;
90 | let w = k >> 1;
91 | let shmmrs = crate::shmmrutils::sequence_to_shmmrs1(0, seq, w, k, 1, 0, false);
92 | shmmrs.iter().for_each(|mmer| {
93 | if self.filter.contains(&mmer.x) {
94 | count += 1
95 | };
96 | });
97 | (shmmrs.len(), count)
98 | }
99 | }
100 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/consensus_test2.fa: --------------------------------------------------------------------------------
1 | >ref 2 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCCCGCAGGTCCGCAGGGGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGCTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 3 | >0 4 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCCCGCAGGTCCGCAGGGGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGCTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 5 | >6 6 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGCGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 7 | >13 8 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCCCGCAGGTCCGCAGGGGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCCAGCACTTGCTTTCTCCCCCAAAAATGAGCTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 9 | >16 10 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 11 | >22 12 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCCCGCAGGTCCGCAGGGGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGCTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 13 | >25 14 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 15 | >31 16 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCCCGCAGGTCCGCAGGGGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGCTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 17 | >33 18 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 19 | >50 20 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 21 | >52 22 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCCCGCAGGTCCGCAGGGGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGCTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 23 | >56 24 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 25 | >59 26 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 27 | >63 28 | 
ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCCCGCAGGTCCGCAGGGGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGCTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 29 | >77 30 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 31 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/gen_agc.sh: -------------------------------------------------------------------------------- 1 | ~/pgr/agc/agc create test_agc_ref.fa test_agc_seqs.fa > test.agc 2 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/gen_frag_db.py: -------------------------------------------------------------------------------- 1 | import pgrtk 2 | sdb = pgrtk.SeqIndexDB() 3 | sdb.load_from_fastx("test_seqs.fa") 4 | sdb.write_frag_and_index_files("test_seqs_frag") 5 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/seq0: -------------------------------------------------------------------------------- 1 | TCCATTCCCACCAGCAGTGTGTGAAAGTCTGGTACTGGTTCAGCCTGCCGTACTTTAATGATTATTGGTGTCACTCTTTCAAGTAACTTGTTGGTAATAAGAAGTCAATTA 2 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/seq1: -------------------------------------------------------------------------------- 1 | TCCATTCCCACCAGCAGTGTGTGAAGGTTCAGCCTGCCGTACTTTAATGATTATTGGTGACACTCTTTCAAGTAACTTGTTGGTAATATTTATCTAAGAAGTCAATTA 2 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/test.agc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/pgr-db/test/test_data/test.agc -------------------------------------------------------------------------------- /pgr-db/test/test_data/test.gff3.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/pgr-db/test/test_data/test.gff3.gz -------------------------------------------------------------------------------- /pgr-db/test/test_data/test_agc_ref.fa: -------------------------------------------------------------------------------- 1 | >NA21309#1#JAHEPC010000026.1:3279880-3319873 2 | 
CTCAGGGCCCTGACGGGCGTCTTGCCATGCTGCTCCTGGGCCTGCTGCTGCTGCTGCCCCTGCTGGCTGGCGCCCGCCTGCTGTGGAACTGGTGGAAGCTCCGGAGCCTCCACCTCCTGCCTCTTGCCCCGGGCTTCTTGCACCTGCTGCAGCCCGACCTCCCCATCTATCTGCTTGGCCTGACTCAGAAATTCGGGCCCATCTACAGGCTCCACCTTGGGCTGCAAGGTGAGAGGCTGATCTCGCTCTGGCCCTCACCATAGGAGGGGGCGGAGGTGACGGAGAGGGTCCTCTCTCCGCTGACGCTGCTTTGGCTGTCTCCCAGATGTGGTGGTGCTGAACTCCAAGAGGACCATTGAGGAAGCCATGGTCAAAAAGTGGGCAGACTTTGCTGGCAGACCTGAGCCACTTACCTGTAAGGGCTGGGGGCATTTTTTCTTTCTTAAACAAATTTTTTTTTTGTTAGAGATGGGGTCTTGCTATGTTGCCCAGGCTGGTCTTGAATTCCTGGTCTCAAGTGATCCTCCCACCTCGGCCTCAAGTGGGAGCCACCTTCGGGGGCTTCCCCAATCCTCCAGGTCACTGGAAGCTCTTGGGGGGCATATCTTCAGGAGAAGAAGCAGGTGTTGAGGAGGCAGAAGAAGGTCAGGCCCTCGGCTTCCTTGGTCAGTTCCCACCCTCCAGCCCCCAGCTCCTCCTGCAGACAAGCTGGTGTCTAAGAACTACCCGGACCTGTCGTTGGTCTCTGCTCTGGAAAGCCCACAAGAAGCTCACCCGCTCAGCCCTGCTGCTGGGCATCCGTGACTCCATGGAGCCAGTGGTGGAGCAGCTGACCCAGGAGTTCTGTGAGGTAAGGCTGGGCTCCTGAGGCCACCTCGGGTCAGCCTCGCCTCTCACAGTAGCCCCCGCCCTGCCCGCTGCACAGCGGCCTGCTGAACTCACACTGTTTCTCCACAGCGCATGAGAGCCCAGCCCGGCACCCCTGTGGCCATTGAGGAGGAATTCTCTCTCCTCACCTGCAGCATCAACTGTTACCTCACCTTCGGAGACAAGATCAAGGTGCCTCACAGCCCCTCAGGCCCACCCCCAGCCCCTCCCTGAGCCTCTCCTTGTCCTGAACTGAAAGTACTCCCTCCTTTCCTGGCAGGAGGACAACTTAATGCCTGCCTATTACAAATGTATCCAGGAGGTGTTAAAAACCTGGAGCCACTGGTCCATCCAAATTGTGGACGTGATTCCCTTTCTCAGGGTGAGGACCTGGAGCCTAGACACCCCTGGATTGTGGGGGAGAGGCTGGGGTGGAGGGAGAGGCTCCTTCCCACAGCTGCATTCTCATGCTTCCTGCCGCAGTTCTTCCCCAATCCAGGTCTCCGGAGGCTGAAGCAGGCCATAGAGAAGAGGGACCACAACGAGGAGAAGCAGCTGAGGCAGCACAAGGTGGGGACTGTGTGTGGACGGCCTCCCCTCGGCCCACAGCCAGTGATGCTACCGGCCTCAGCATTGCTATGAGGCGGGTTCTTTTGCATACCCCAGTTATGGGCCTGTTGCCACTCTGTACTCCTCTCCCCAGGCCAGCCGCTCAGCCCGCTCCTTTCACCCTCTGCAGGAGAGCCTGGTGGCAGGCCAGTGGAGGGACATGATGGACTACATGCTCCAAGGGGTGGCGCAGCCGAGCATGGAAGAGGGCTCCGGACAGCTCCTGGAAGGGCACTTGCACATGGCTGCAGTGGACCTCCTGATCGGTGGCACTGAGACCACAGCAAACACCCTCTCCTGGGCCGTGGTTTTTTTTGCTTCACCACCCTGAGGTGCGTCCTGCGGACAAGCAAAAGGCTCCTTCCCAGCAACCTGGCCAGGGCGGTGGGCACCCTCACTCAGCTCTGAGCACTGTGCGGCTGGGGCTGTGCTTGCCTCACCGGCACTCAGGCTCACTGGGTTGCTGAGGGAGCGGCTGGAGGCTGGGCAGCTGTGGGCTGCTGGGGCAGGACTCCACCCGATCATTCCCCAGATTCAGCAGCGACTGCAGGAGGAGCTAGACCACGAACTGGGCCCTGGTGCCTCCAGCTCCCGGGTCCCCTACAAGGACCGTGCACGGCTGCCCTTGCTCAATGCCACCATCGCCGAGGTGCTGCGCCTGCGGCCCGTTGTGCCCTTAGCCTTGCCCCACCGCACCACACGGCCCAGCAGGTGACTCCCGAGGGTTGGGGATGAGTGAGGAAAGCCCGAGCCCAGGGAGGTCCTGGCCAGCCTCTAACTCCAGCCCCCTTCAGCATCTCCGGCTACGACATCCCTGAGGGCACAGTCATCATTCCGAACCTCCAAGGCGCCCACCTGGATGAGACGGTCTGGGAGAGGCCACATGAGTTCTGGCCTGGTATGTGGGGGGCCGGGGGCCTGCCGTGAAAATGTGGTGGAGGCTGGTCCCCGCTGCCGCTGAACGCCTCCCCACCCACCTGTCCACCCGCCCGCAGATCGCTTCCTGGAGCCAGGCAAGAACTCCAGAGCTCTGGCCTTCGGCTGCGGTGCCCGCGTGTGCCTGGGCGAGCCGCTGGCGCGCCTGGAGCTCTTCGTGGTGCTGACCCGACTGCTGCAGGCCTTCACGCTGCTGCCCTCCGGGGACGCCCTGCCCTCCCTGCAGCCCCTGCCCCACTGCAGTGTCATCCTCAAGATGCAGCCTTTCCAAGTGCGGCTGCAGCCCCGGGGGATGGGGGCCCACAGCCCGGGCCAGAGCCAGTGATGGGGCAGGACCGATGCCAGCCGGGTACCTCAGTTTCTCCTTTATTGCTCCTGTACGAACCCCTCCCCTCCCCCCTGTAAACACAGTGCTGCGAGATCGCTGGCAGAGAAGGCTTCCTCCAGCGGCTGGGTGGTGAAGGACCCTGGCTCTTCTCTCGGGGCGACCCCTCAGTGCTCGGCAGTCATACTGGGGTGCGAGAGAGGTGGGCAGCAGCTCAGCCTCCCCCCGCTGGGGAGCGAAAGTTTCTTGGTCTCAGCTTCATTTCCGTGAAGGGCACCGAGAACTCGAAGCCCTTCCAGTGGTACCAGCTCACTCCCTGGGAAAGGGGTTGTCAAGAGAGAGTCAAAGCCGGATGTCCCATCTGCTCCTCCCGTTCCCCTTAAGGAGGTGGCTCCCAGCACTCAACCAACCTCCCCGCAGAGCTCCCTTCCTGACCCTCTGCCGCAGAGGATTGAGGCTTAATCCTGAGCTGGTCCTTTCCAGCCAATAAATCAACTCCAGCTCCCTCTGCGAGGCTGGCATGATTGTTCCATTTCACCCAGCCGCTCAGTCCCTTGCCTGTTACACTGTGGGGCTGAAACCTAGGCAGGCCGAGCCCCAGCCACCCCAGCTCTGAGCCGCCTCCCCACCCCTCACCTGATGGTCCAC 3 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/test_rev.fa: 
-------------------------------------------------------------------------------- 1 | >NA21309#1#JAHEPC010000026.1:3279880-3319873 2 | CTCAGGGCCCTGACGGGCGTCTTGCCATGCTGCTCCTGGGCCTGCTGCTGCTGCTGCCCCTGCTGGCTGGCGCCCGCCTGCTGTGGAACTGGTGGAAGCTCCGGAGCCTCCACCTCCTGCCTCTTGCCCCGGGCTTCTTGCACCTGCTGCAGCCCGACCTCCCCATCTATCTGCTTGGCCTGACTCAGAAATTCGGGCCCATCTACAGGCTCCACCTTGGGCTGCAAGGTGAGAGGCTGATCTCGCTCTGGCCCTCACCATAGGAGGGGGCGGAGGTGACGGAGAGGGTCCTCTCTCCGCTGACGCTGCTTTGGCTGTCTCCCAGATGTGGTGGTGCTGAACTCCAAGAGGACCATTGAGGAAGCCATGGTCAAAAAGTGGGCAGACTTTGCTGGCAGACCTGAGCCACTTACCTGTAAGGGCTGGGGGCATTTTTTCTTTCTTAAACAAATTTTTTTTTTGTTAGAGATGGGGTCTTGCTATGTTGCCCAGGCTGGTCTTGAATTCCTGGTCTCAAGTGATCCTCCCACCTCGGCCTCAAGTGGGAGCCACCTTCGGGGGCTTCCCCAATCCTCCAGGTCACTGGAAGCTCTTGGGGGGCATATCTTCAGGAGAAGAAGCAGGTGTTGAGGAGGCAGAAGAAGGTCAGGCCCTCGGCTTCCTTGGTCAGTTCCCACCCTCCAGCCCCCAGCTCCTCCTGCAGACAAGCTGGTGTCTAAGAACTACCCGGACCTGTCGTTGGTCTCTGCTCTGGAAAGCCCACAAGAAGCTCACCCGCTCAGCCCTGCTGCTGGGCATCCGTGACTCCATGGAGCCAGTGGTGGAGCAGCTGACCCAGGAGTTCTGTGAGGTAAGGCTGGGCTCCTGAGGCCACCTCGGGTCAGCCTCGCCTCTCACAGTAGCCCCCGCCCTGCCCGCTGCACAGCGGCCTGCTGAACTCACACTGTTTCTCCACAGCGCATGAGAGCCCAGCCCGGCACCCCTGTGGCCATTGAGGAGGAATTCTCTCTCCTCACCTGCAGCATCAACTGTTACCTCACCTTCGGAGACAAGATCAAGGTGCCTCACAGCCCCTCAGGCCCACCCCCAGCCCCTCCCTGAGCCTCTCCTTGTCCTGAACTGAAAGTACTCCCTCCTTTCCTGGCAGGAGGACAACTTAATGCCTGCCTATTACAAATGTATCCAGGAGGTGTTAAAAACCTGGAGCCACTGGTCCATCCAAATTGTGGACGTGATTCCCTTTCTCAGGGTGAGGACCTGGAGCCTAGACACCCCTGGATTGTGGGGGAGAGGCTGGGGTGGAGGGAGAGGCTCCTTCCCACAGCTGCATTCTCATGCTTCCTGCCGCAGTTCTTCCCCAATCCAGGTCTCCGGAGGCTGAAGCAGGCCATAGAGAAGAGGGACCACAACGAGGAGAAGCAGCTGAGGCAGCACAAGGTGGGGACTGTGTGTGGACGGCCTCCCCTCGGCCCACAGCCAGTGATGCTACCGGCCTCAGCATTGCTATGAGGCGGGTTCTTTTGCATACCCCAGTTATGGGCCTGTTGCCACTCTGTACTCCTCTCCCCAGGCCAGCCGCTCAGCCCGCTCCTTTCACCCTCTGCAGGAGAGCCTGGTGGCAGGCCAGTGGAGGGACATGATGGACTACATGCTCCAAGGGGTGGCGCAGCCGAGCATGGAAGAGGGCTCCGGACAGCTCCTGGAAGGGCACTTGCACATGGCTGCAGTGGACCTCCTGATCGGTGGCACTGAGACCACAGCAAACACCCTCTCCTGGGCCGTGGTTTTTTTTGCTTCACCACCCTGAGGTGCGTCCTGCGGACAAGCAAAAGGCTCCTTCCCAGCAACCTGGCCAGGGCGGTGGGCACCCTCACTCAGCTCTGAGCACTGTGCGGCTGGGGCTGTGCTTGCCTCACCGGCACTCAGGCTCACTGGGTTGCTGAGGGAGCGGCTGGAGGCTGGGCAGCTGTGGGCTGCTGGGGCAGGACTCCACCCGATCATTCCCCAGATTCAGCAGCGACTGCAGGAGGAGCTAGACCACGAACTGGGCCCTGGTGCCTCCAGCTCCCGGGTCCCCTACAAGGACCGTGCACGGCTGCCCTTGCTCAATGCCACCATCGCCGAGGTGCTGCGCCTGCGGCCCGTTGTGCCCTTAGCCTTGCCCCACCGCACCACACGGCCCAGCAGGTGACTCCCGAGGGTTGGGGATGAGTGAGGAAAGCCCGAGCCCAGGGAGGTCCTGGCCAGCCTCTAACTCCAGCCCCCTTCAGCATCTCCGGCTACGACATCCCTGAGGGCACAGTCATCATTCCGAACCTCCAAGGCGCCCACCTGGATGAGACGGTCTGGGAGAGGCCACATGAGTTCTGGCCTGGTATGTGGGGGGCCGGGGGCCTGCCGTGAAAATGTGGTGGAGGCTGGTCCCCGCTGCCGCTGAACGCCTCCCCACCCACCTGTCCACCCGCCCGCAGATCGCTTCCTGGAGCCAGGCAAGAACTCCAGAGCTCTGGCCTTCGGCTGCGGTGCCCGCGTGTGCCTGGGCGAGCCGCTGGCGCGCCTGGAGCTCTTCGTGGTGCTGACCCGACTGCTGCAGGCCTTCACGCTGCTGCCCTCCGGGGACGCCCTGCCCTCCCTGCAGCCCCTGCCCCACTGCAGTGTCATCCTCAAGATGCAGCCTTTCCAAGTGCGGCTGCAGCCCCGGGGGATGGGGGCCCACAGCCCGGGCCAGAGCCAGTGATGGGGCAGGACCGATGCCAGCCGGGTACCTCAGTTTCTCCTTTATTGCTCCTGTACGAACCCCTCCCCTCCCCCCTGTAAACACAGTGCTGCGAGATCGCTGGCAGAGAAGGCTTCCTCCAGCGGCTGGGTGGTGAAGGACCCTGGCTCTTCTCTCGGGGCGACCCCTCAGTGCTCGGCAGTCATACTGGGGTGCGAGAGAGGTGGGCAGCAGCTCAGCCTCCCCCCGCTGGGGAGCGAAAGTTTCTTGGTCTCAGCTTCATTTCCGTGAAGGGCACCGAGAACTCGAAGCCCTTCCAGTGGTACCAGCTCACTCCCTGGGAAAGGGGTTGTCAAGAGAGAGTCAAAGCCGGATGTCCCATCTGCTCCTCCCGTTCCCCTTAAGGAGGTGGCTCCCAGCACTCAACCAACCTCCCCGCAGAGCTCCCTTCCTGACCCTCTGCCGCAGAGGATTGAGGCTTAATCCTGAGCTGGTCCTTTCCAGCCAATAAATCAACTCCAGCTCCCTCTGCGAGGCTGGCATGATTGTTCCATTTCACCCAGCCGCTCAGTCCCTTGCCTGTTACACTGTGGGGCTGAAACCTAGGCAGGCCGAGCCCCAGCCACCCCAGCTCTGAGCCGCCTCCCCACCCCTCACCTGATGGTCCAC 3 | 
>NA21309#1#JAHEPC010000026.1:3279880-3319873_RC 4 | GTGGACCATCAGGTGAGGGGTGGGGAGGCGGCTCAGAGCTGGGGTGGCTGGGGCTCGGCCTGCCTAGGTTTCAGCCCCACAGTGTAACAGGCAAGGGACTGAGCGGCTGGGTGAAATGGAACAATCATGCCAGCCTCGCAGAGGGAGCTGGAGTTGATTTATTGGCTGGAAAGGACCAGCTCAGGATTAAGCCTCAATCCTCTGCGGCAGAGGGTCAGGAAGGGAGCTCTGCGGGGAGGTTGGTTGAGTGCTGGGAGCCACCTCCTTAAGGGGAACGGGAGGAGCAGATGGGACATCCGGCTTTGACTCTCTCTTGACAACCCCTTTCCCAGGGAGTGAGCTGGTACCACTGGAAGGGCTTCGAGTTCTCGGTGCCCTTCACGGAAATGAAGCTGAGACCAAGAAACTTTCGCTCCCCAGCGGGGGGAGGCTGAGCTGCTGCCCACCTCTCTCGCACCCCAGTATGACTGCCGAGCACTGAGGGGTCGCCCCGAGAGAAGAGCCAGGGTCCTTCACCACCCAGCCGCTGGAGGAAGCCTTCTCTGCCAGCGATCTCGCAGCACTGTGTTTACAGGGGGGAGGGGAGGGGTTCGTACAGGAGCAATAAAGGAGAAACTGAGGTACCCGGCTGGCATCGGTCCTGCCCCATCACTGGCTCTGGCCCGGGCTGTGGGCCCCCATCCCCCGGGGCTGCAGCCGCACTTGGAAAGGCTGCATCTTGAGGATGACACTGCAGTGGGGCAGGGGCTGCAGGGAGGGCAGGGCGTCCCCGGAGGGCAGCAGCGTGAAGGCCTGCAGCAGTCGGGTCAGCACCACGAAGAGCTCCAGGCGCGCCAGCGGCTCGCCCAGGCACACGCGGGCACCGCAGCCGAAGGCCAGAGCTCTGGAGTTCTTGCCTGGCTCCAGGAAGCGATCTGCGGGCGGGTGGACAGGTGGGTGGGGAGGCGTTCAGCGGCAGCGGGGACCAGCCTCCACCACATTTTCACGGCAGGCCCCCGGCCCCCCACATACCAGGCCAGAACTCATGTGGCCTCTCCCAGACCGTCTCATCCAGGTGGGCGCCTTGGAGGTTCGGAATGATGACTGTGCCCTCAGGGATGTCGTAGCCGGAGATGCTGAAGGGGGCTGGAGTTAGAGGCTGGCCAGGACCTCCCTGGGCTCGGGCTTTCCTCACTCATCCCCAACCCTCGGGAGTCACCTGCTGGGCCGTGTGGTGCGGTGGGGCAAGGCTAAGGGCACAACGGGCCGCAGGCGCAGCACCTCGGCGATGGTGGCATTGAGCAAGGGCAGCCGTGCACGGTCCTTGTAGGGGACCCGGGAGCTGGAGGCACCAGGGCCCAGTTCGTGGTCTAGCTCCTCCTGCAGTCGCTGCTGAATCTGGGGAATGATCGGGTGGAGTCCTGCCCCAGCAGCCCACAGCTGCCCAGCCTCCAGCCGCTCCCTCAGCAACCCAGTGAGCCTGAGTGCCGGTGAGGCAAGCACAGCCCCAGCCGCACAGTGCTCAGAGCTGAGTGAGGGTGCCCACCGCCCTGGCCAGGTTGCTGGGAAGGAGCCTTTTGCTTGTCCGCAGGACGCACCTCAGGGTGGTGAAGCAAAAAAAACCACGGCCCAGGAGAGGGTGTTTGCTGTGGTCTCAGTGCCACCGATCAGGAGGTCCACTGCAGCCATGTGCAAGTGCCCTTCCAGGAGCTGTCCGGAGCCCTCTTCCATGCTCGGCTGCGCCACCCCTTGGAGCATGTAGTCCATCATGTCCCTCCACTGGCCTGCCACCAGGCTCTCCTGCAGAGGGTGAAAGGAGCGGGCTGAGCGGCTGGCCTGGGGAGAGGAGTACAGAGTGGCAACAGGCCCATAACTGGGGTATGCAAAAGAACCCGCCTCATAGCAATGCTGAGGCCGGTAGCATCACTGGCTGTGGGCCGAGGGGAGGCCGTCCACACACAGTCCCCACCTTGTGCTGCCTCAGCTGCTTCTCCTCGTTGTGGTCCCTCTTCTCTATGGCCTGCTTCAGCCTCCGGAGACCTGGATTGGGGAAGAACTGCGGCAGGAAGCATGAGAATGCAGCTGTGGGAAGGAGCCTCTCCCTCCACCCCAGCCTCTCCCCCACAATCCAGGGGTGTCTAGGCTCCAGGTCCTCACCCTGAGAAAGGGAATCACGTCCACAATTTGGATGGACCAGTGGCTCCAGGTTTTTAACACCTCCTGGATACATTTGTAATAGGCAGGCATTAAGTTGTCCTCCTGCCAGGAAAGGAGGGAGTACTTTCAGTTCAGGACAAGGAGAGGCTCAGGGAGGGGCTGGGGGTGGGCCTGAGGGGCTGTGAGGCACCTTGATCTTGTCTCCGAAGGTGAGGTAACAGTTGATGCTGCAGGTGAGGAGAGAGAATTCCTCCTCAATGGCCACAGGGGTGCCGGGCTGGGCTCTCATGCGCTGTGGAGAAACAGTGTGAGTTCAGCAGGCCGCTGTGCAGCGGGCAGGGCGGGGGCTACTGTGAGAGGCGAGGCTGACCCGAGGTGGCCTCAGGAGCCCAGCCTTACCTCACAGAACTCCTGGGTCAGCTGCTCCACCACTGGCTCCATGGAGTCACGGATGCCCAGCAGCAGGGCTGAGCGGGTGAGCTTCTTGTGGGCTTTCCAGAGCAGAGACCAACGACAGGTCCGGGTAGTTCTTAGACACCAGCTTGTCTGCAGGAGGAGCTGGGGGCTGGAGGGTGGGAACTGACCAAGGAAGCCGAGGGCCTGACCTTCTTCTGCCTCCTCAACACCTGCTTCTTCTCCTGAAGATATGCCCCCCAAGAGCTTCCAGTGACCTGGAGGATTGGGGAAGCCCCCGAAGGTGGCTCCCACTTGAGGCCGAGGTGGGAGGATCACTTGAGACCAGGAATTCAAGACCAGCCTGGGCAACATAGCAAGACCCCATCTCTAACAAAAAAAAAATTTGTTTAAGAAAGAAAAAATGCCCCCAGCCCTTACAGGTAAGTGGCTCAGGTCTGCCAGCAAAGTCTGCCCACTTTTTGACCATGGCTTCCTCAATGGTCCTCTTGGAGTTCAGCACCACCACATCTGGGAGACAGCCAAAGCAGCGTCAGCGGAGAGAGGACCCTCTCCGTCACCTCCGCCCCCTCCTATGGTGAGGGCCAGAGCGAGATCAGCCTCTCACCTTGCAGCCCAAGGTGGAGCCTGTAGATGGGCCCGAATTTCTGAGTCAGGCCAAGCAGATAGATGGGGAGGTCGGGCTGCAGCAGGTGCAAGAAGCCCGGGGCAAGAGGCAGGAGGTGGAGGCTCCGGAGCTTCCACCAGTTCCACAGCAGGCGGGCGCCAGCCAGCAGGGGCAGCAGCAGCAGCAGGCCCAGGAGCAGCATGGCAAGACGCCCGTCAGGGCCCTGAG 5 | -------------------------------------------------------------------------------- 
/pgr-db/test/test_data/test_seqs2.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/pgr-db/test/test_data/test_seqs2.fa.gz -------------------------------------------------------------------------------- /pgr-db/test/test_data/test_seqs_frag.frg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/pgr-db/test/test_data/test_seqs_frag.frg -------------------------------------------------------------------------------- /pgr-db/test/test_data/test_seqs_frag.mdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/pgr-db/test/test_data/test_seqs_frag.mdb -------------------------------------------------------------------------------- /pgr-db/test/test_data/test_seqs_frag.midx: -------------------------------------------------------------------------------- 1 | 0 3385 NA21309#1#JAHEPC010000026.1:3279880-3319873 test_seqs.fa 2 | 1 3384 NA21309#2#JAHEPB010000021.1:3182493-3222484 test_seqs.fa 3 | 2 3385 NA20129#1#JAHEPE010000077.1:3268654-3307814 test_seqs.fa 4 | 3 3384 NA20129#2#JAHEPD010000054.1:24048449-24086959 test_seqs.fa 5 | 4 3385 NA19240#2#JAHEOL010000047.1:3346340-3411873 test_seqs.fa 6 | 5 3385 NA18906#1#JAHEOO010000017.1:29298030-29336539 test_seqs.fa 7 | 6 3385 HG03579#1#JAGYVU010000035.1:17853932-17892393 test_seqs.fa 8 | 7 3385 HG03540#1#JAGYVY010000082.1:17876501-17941376 test_seqs.fa 9 | 8 3385 HG03516#1#JAGYYT010000073.1:24039705-24078215 test_seqs.fa 10 | 9 3385 HG03516#2#JAGYYS010000003.1:32115852-32155015 test_seqs.fa 11 | 10 3385 HG03492#1#JAHEPI010000049.1:16807354-16852770 test_seqs.fa 12 | 11 3385 HG03486#2#JAHEOP010000002.1:3274978-3314140 test_seqs.fa 13 | 12 3385 HG03453#1#JAGYVW010000148.1:2385113-2424164 test_seqs.fa 14 | 13 3385 HG03098#1#JAHEPM010000086.1:23575752-23614804 test_seqs.fa 15 | 14 3385 HG02886#1#JAHAOU010000006.1:23436768-23475277 test_seqs.fa 16 | 15 3385 HG02818#2#JAHEOR010000019.1:17832149-17870658 test_seqs.fa 17 | 16 3384 HG02723#1#JAHEOU010000100.1:4894384-4934376 test_seqs.fa 18 | 17 3385 HG02723#2#JAHEOT010000107.1:24171657-24210709 test_seqs.fa 19 | 18 3385 HG02717#1#JAHAOS010000073.1:5257988-5297982 test_seqs.fa 20 | 19 3385 HG02717#2#JAHAOR010000061.1:24170153-24235031 test_seqs.fa 21 | 20 3385 HG02630#2#JAHAOP010000058.1:24157264-24195773 test_seqs.fa 22 | 21 3385 HG02622#1#JAHAOO010000042.1:28532698-28597579 test_seqs.fa 23 | 22 3385 HG02572#1#JAHAOW010000052.1:1097698-1136749 test_seqs.fa 24 | 23 3385 HG02559#1#JAGYVK010000047.1:32002843-32048367 test_seqs.fa 25 | 24 3385 HG02559#2#JAGYVJ010000064.1:31959199-32005559 test_seqs.fa 26 | 25 3385 HG02486#1#JAGYVM010000005.1:27346251-27384764 test_seqs.fa 27 | 26 3385 HG02257#1#JAGYVI010000022.1:28399996-28444874 test_seqs.fa 28 | 27 3385 HG02257#2#JAGYVH010000080.1:5254630-5293788 test_seqs.fa 29 | 28 3385 HG02148#1#JAHAMG010000076.1:24056708-24095759 test_seqs.fa 30 | 29 3385 HG02145#1#JAHKSG010000017.1:7890663-7936079 test_seqs.fa 31 | 30 3385 HG02109#1#JAHEPG010000124.1:3230658-3268212 test_seqs.fa 32 | 31 3385 HG02055#1#JAHEPK010000074.1:3335267-3375260 test_seqs.fa 33 | 32 3384 HG01978#1#JAGYVS010000035.1:32002155-32041308 test_seqs.fa 34 | 33 3385 HG01978#2#JAGYVR010000046.1:2481050-2558664 test_seqs.fa 35 | 34 3385 
HG01952#1#JAHAME010000044.1:28390439-28428944 test_seqs.fa 36 | 35 3384 HG01952#2#JAHAMD010000016.1:32003252-32042411 test_seqs.fa 37 | 36 3385 HG01928#1#JAGYVQ010000020.1:28403872-28448747 test_seqs.fa 38 | 37 3385 HG01928#2#JAGYVP010000017.1:31935974-31981618 test_seqs.fa 39 | 38 3385 HG01891#1#JAGYVO010000024.1:26903616-26942667 test_seqs.fa 40 | 39 3385 HG01361#1#JAGYYX010000108.1:7857845-7896895 test_seqs.fa 41 | 40 3386 HG01361#2#JAGYYW010000059.1:32011724-32051719 test_seqs.fa 42 | 41 3385 HG01358#1#JAGYZB010000008.1:7968591-8013467 test_seqs.fa 43 | 42 3385 HG01358#2#JAGYZA010000082.1:7871132-7910180 test_seqs.fa 44 | 43 3385 HG01258#1#JAGYYV010000066.1:28088097-28127147 test_seqs.fa 45 | 44 3385 HG01258#2#JAGYYU010000011.1:27163808-27202317 test_seqs.fa 46 | 45 3385 HG01243#1#JAHEOY010000117.1:3280591-3319753 test_seqs.fa 47 | 46 3385 HG01243#2#JAHEOX010000097.1:27566385-27611263 test_seqs.fa 48 | 47 3386 HG01175#2#JAHALZ010000032.1:24298713-24337223 test_seqs.fa 49 | 48 3385 HG01123#1#JAGYYZ010000057.1:26932659-26977537 test_seqs.fa 50 | 49 3385 HG01123#2#JAGYYY010000050.1:31983405-32028932 test_seqs.fa 51 | 50 3385 HG01109#1#JAHEPA010000084.1:27909663-27954539 test_seqs.fa 52 | 51 3385 HG01071#2#JAHBCE010000076.1:7804427-7869306 test_seqs.fa 53 | 52 3385 HG00741#1#JAHALY010000025.1:24278226-24317278 test_seqs.fa 54 | 53 3385 HG00741#2#JAHALX010000077.1:26562165-26600674 test_seqs.fa 55 | 54 3384 HG00735#1#JAHBCH010000013.1:32057725-32097711 test_seqs.fa 56 | 55 3385 HG00735#2#JAHBCG010000038.1:3345449-3385443 test_seqs.fa 57 | 56 3385 HG00733#1#JAHEPQ010000070.1:31971892-32017417 test_seqs.fa 58 | 57 3385 HG00673#1#JAHBBZ010000030.1:31864344-31910704 test_seqs.fa 59 | 58 3385 HG005#1#JAHEPO010000054.1:5156418-5202777 test_seqs.fa 60 | 59 3385 HG00621#2#JAHBCC010000005.1:31951291-31996816 test_seqs.fa 61 | 60 3385 HG005#2#JAHEPN010000064.1:6876299-6921824 test_seqs.fa 62 | 61 3382 HG002#1#JAHKSE010000066.1:5280272-5325794 test_seqs.fa 63 | 62 3385 HG002#2#JAHKSD010000045.1:27105329-27150207 test_seqs.fa 64 | 63 3385 GRCH38_chr6:32000466-32046826 test_seqs.fa 65 | 64 3385 chm13_chr6:31853672-31899197 test_seqs.fa 66 | 65 3385 RC_TEST test_seqs.fa 67 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/test_seqs_frag.sdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/pgr-db/test/test_data/test_seqs_frag.sdx -------------------------------------------------------------------------------- /pgr-db/wrapper.h: -------------------------------------------------------------------------------- 1 | #include "../agc/src/lib-cxx/agc-api.h" 2 | -------------------------------------------------------------------------------- /pgr-tk-workstation/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:latest 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | 5 | RUN apt-get update \ 6 | && apt-get install -y --no-install-recommends \ 7 | curl \ 8 | git \ 9 | graphviz \ 10 | graphviz-dev \ 11 | gawk \ 12 | minimap2 \ 13 | samtools \ 14 | time \ 15 | wget \ 16 | pdf2svg \ 17 | awscli \ 18 | vim \ 19 | build-essential \ 20 | zlib1g \ 21 | zlib1g-dev \ 22 | libomp5 \ 23 | && rm -rf /var/lib/apt/lists/* 24 | 25 | 26 | RUN . 
/opt/conda/bin/activate && \
27 | conda install -y python=3.11 jupyterlab numpy networkx==2.4 matplotlib bokeh && conda clean -ya
28 |
29 | #RUN conda install -y --channel=conda-forge \
30 | # matplotlib \
31 | # numpy \
32 | # && conda clean -ya
33 |
34 | RUN pip3 install -U --no-cache-dir \
35 | networkx==2.8.2 \
36 | papermill==2.3.4 \
37 | pydot==1.4.2 \
38 | scikit-learn
39 |
40 | RUN echo deb http://http.us.debian.org/debian/ testing non-free contrib main > /etc/apt/sources.list
41 | RUN apt-get update
42 | RUN apt-get install -y libc6 libstdc++6
43 | RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/bin/../lib/libstdc++.so.6
44 |
45 | COPY pgrtk-0.6.0-cp311-cp311-linux_x86_64.whl /tmp
46 | RUN pip install /tmp/pgrtk-0.6.0-cp311-cp311-linux_x86_64.whl
47 | RUN rm /tmp/pgrtk-0.6.0-cp311-cp311-linux_x86_64.whl
48 | RUN mkdir -p /opt/bin/ /wd/
49 | COPY jupyterlab.sh /opt/bin/
50 | CMD /bin/bash /opt/bin/jupyterlab.sh
51 | -------------------------------------------------------------------------------- /pgr-tk-workstation/Readme.md: --------------------------------------------------------------------------------
1 |
2 | ## Introduction
3 |
4 | This directory contains the Dockerfile to build a docker image including
5 | pgr-tk and a Jupyter Lab server that can run the example notebooks for
6 | pangenome analysis.
7 |
8 | This tutorial assumes the user is familiar with a typical Linux environment and docker.
9 |
10 | ## Build the Docker Image
11 |
12 | After building the python wheel for version `0.x.y` (check the `../target/wheels`
13 | directory for the proper version `0.x.y`), copy it here and build the image.
14 | The image uses python3.8.
15 |
16 | ```
17 | cp ../target/wheels/pgrtk-0.x.y-cp38-cp38-linux_x86_64.whl .
18 | docker build -t pgr-tk-ws:v0.x.y .
19 | ```
20 |
21 | You can also use a prebuilt docker image by
22 |
23 | ```
24 | docker pull cschin/pgr-tk-ws:v0.x.y
25 | ```
26 | (check `https://hub.docker.com/r/cschin/pgr-tk-ws` for the latest version.)
27 |
28 |
29 | ## Set up environment
30 |
31 | In a directory where you have write permission,
32 |
33 | ```
34 | mkdir -p workdir
35 | cd workdir
36 | wget https://giab-data.s3.amazonaws.com/PGR-TK-Files/pgr-tk-HGRP-y1-evaluation-set-v0.tar
37 | wget https://giab-data.s3.amazonaws.com/PGR-TK-Files/pgr-tk-example-code.zip
38 | ```
39 |
40 | Untar the data file
41 | ```
42 | tar xvf pgr-tk-HGRP-y1-evaluation-set-v0.tar
43 | ```
44 |
45 | The tar ball contains the following data files
46 |
47 | ```
48 | data/
49 | data/pgr-tk-HGRP-y1-evaluation-set-v0.agc # HPRC year 1 47 genomes (94 haplotypes) + hg38 + hg19 + chm13 sequences in AGC format
50 | data/pgr-tk-HGRP-y1-evaluation-set-v0.mdb # the SHIMMER index into the sequences
51 | data/pgr-tk-HGRP-y1-evaluation-set-v0.midx # auxiliary index file for sequence names
52 | data/pgr-tk-HGRP-y1-evaluation-set-v0_input # file used to generate the index
53 | data/AMY1A_gfa_view.png # AMY1A GFA example
54 | ```
55 |
56 | Unzip the example notebooks
57 |
58 | ```
59 | mkdir -p code && pushd code
60 | unzip ../pgr-tk-example-code.zip
61 | popd
62 | ```
63 |
64 | Execute the Jupyter Lab server through docker
65 |
66 | ```
67 | docker run -v $PWD:/wd/ -p 8888:8888 pgr-tk-ws:v0.x.y
68 | ```
69 |
70 | or use the pre-built docker image.
71 |
72 | Then follow the instructions from the Jupyter Lab output to connect to
73 | the server from a browser.
74 |
75 | For analyzing the whole 97-haplotype human assembly set, it is suggested
76 | to have at least 64G RAM.
You may use a remote server with enough memory and
77 | connect to the server directly or through ssh tunneling.
78 |
79 |
80 |
81 |
82 |
83 |
84 |
-------------------------------------------------------------------------------- /pgr-tk-workstation/build.sh: --------------------------------------------------------------------------------
1 | cp ../target/wheels/pgrtk-0.6.0-cp311-cp311-linux_x86_64.whl .
2 | docker build -t cschin/pgr-tk-ws:v0.6.0 .
3 | -------------------------------------------------------------------------------- /pgr-tk-workstation/jupyterlab.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | mkdir -p /wd/results/
3 | mkdir -p /wd/code/
4 | mkdir -p /wd/data/
5 | ln -sf /wd/* /
6 | . /opt/conda/etc/profile.d/conda.sh
7 | jupyter-lab --ip="*" --allow-root --no-browser --port 8888 --NotebookApp.disable_check_xsrf=True /wd
8 | -------------------------------------------------------------------------------- /pgr-tk/Cargo.toml: --------------------------------------------------------------------------------
1 | [package]
2 | name = "pgrtk"
3 | version = "0.6.0"
4 | authors = ["Jason Chin "]
5 | edition = "2021"
6 |
7 |
8 | [lib]
9 | name = "pgrtk"
10 | crate-type = ["rlib","cdylib"]
11 |
12 | [dependencies]
13 | pyo3 = { version = "0.18.3", features = ["extension-module"] }
14 |
15 | pgr-db = { path = "../pgr-db/", default-features = false }
16 | rustc-hash = "1.1.0"
17 | rayon = "1.5.2"
18 | memmap2 = "0.5.10"
19 |
20 | [features]
21 | with_agc = []
22 | default = ["pgr-db/with_agc", "with_agc"]
23 | -------------------------------------------------------------------------------- /pgr-tk/Readme.md: --------------------------------------------------------------------------------
1 | A lightweight interface to the pgr-db crate -------------------------------------------------------------------------------- /pgr-tk/build.rs: --------------------------------------------------------------------------------
1 | // from https://vallentin.dev/2019/06/06/versioning
2 |
3 | use std::env::consts::{ARCH, OS};
4 | use std::process::Command;
5 |
6 | #[cfg(debug_assertions)]
7 | const BUILD_TYPE: &str = "debug";
8 | #[cfg(not(debug_assertions))]
9 | const BUILD_TYPE: &str = "release";
10 |
11 | fn main() {
12 | let branch_name = get_branch_name();
13 | if branch_name != String::from("bioconda") {
14 | let version_string = format!(
15 | "{} {} ({}:{}{}, {} build, {} [{}] [{}])",
16 | env!("CARGO_PKG_NAME"),
17 | env!("CARGO_PKG_VERSION"),
18 | get_branch_name(),
19 | get_commit_hash(),
20 | if is_working_tree_clean() { "" } else { "+" },
21 | BUILD_TYPE,
22 | OS,
23 | ARCH,
24 | get_rustc_version()
25 | );
26 |
27 | println!("cargo:rustc-env=VERSION_STRING={}", version_string);
28 | } else {
29 | let version_string = format!(
30 | "{} {} (bioconda {} build, {} [{}] [{}])",
31 | env!("CARGO_PKG_NAME"),
32 | env!("CARGO_PKG_VERSION"),
33 | BUILD_TYPE,
34 | OS,
35 | ARCH,
36 | get_rustc_version()
37 | );
38 | println!("cargo:rustc-env=VERSION_STRING={}", version_string);
39 | }
40 | }
41 |
42 | fn get_rustc_version() -> String {
43 | let output = Command::new("rustc")
44 | .arg("--version")
45 | .current_dir(env!("CARGO_MANIFEST_DIR"))
46 | .output()
47 | .unwrap();
48 |
49 | assert!(output.status.success());
50 |
51 | String::from_utf8_lossy(&output.stdout)
52 | .trim_end()
53 | .to_string()
54 | }
55 |
56 | fn get_commit_hash() -> String {
57 | let output = Command::new("git")
58 | .arg("log")
59 | .arg("-1")
60 |
.arg("--pretty=format:%h") // Abbreviated commit hash 61 | // .arg("--pretty=format:%H") // Full commit hash 62 | .current_dir(env!("CARGO_MANIFEST_DIR")) 63 | .output() 64 | .unwrap(); 65 | 66 | // assert!(output.status.success()); 67 | if output.status.success() { 68 | String::from_utf8_lossy(&output.stdout).to_string() 69 | } else { 70 | String::from("bioconda") 71 | } 72 | } 73 | 74 | fn get_branch_name() -> String { 75 | let output = Command::new("git") 76 | .arg("rev-parse") 77 | .arg("--abbrev-ref") 78 | .arg("HEAD") 79 | .current_dir(env!("CARGO_MANIFEST_DIR")) 80 | .output() 81 | .unwrap(); 82 | 83 | //assert!(output.status.success()); 84 | if output.status.success() { 85 | String::from_utf8_lossy(&output.stdout) 86 | .trim_end() 87 | .to_string() 88 | } else { 89 | String::from("bioconda") 90 | } 91 | } 92 | 93 | fn is_working_tree_clean() -> bool { 94 | let status = Command::new("git") 95 | .arg("diff") 96 | .arg("--quiet") 97 | .arg("--exit-code") 98 | .current_dir(env!("CARGO_MANIFEST_DIR")) 99 | .status() 100 | .unwrap(); 101 | 102 | if status.success() { 103 | status.code().unwrap() == 0 104 | } else { 105 | true 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /pgr-tk/build.sh: -------------------------------------------------------------------------------- 1 | maturin build --release --skip-auditwheel -i python3.8 2 | -------------------------------------------------------------------------------- /pgr-tk/examples/generate_gfa_for_CMRG.py: -------------------------------------------------------------------------------- 1 | import pgrlite 2 | import os 3 | import networkx as nx 4 | from networkx.drawing import nx_pydot 5 | from collections import Counter 6 | 7 | def generate_gfa(cmrg_regions, gene_name, pg_db, out_dir): 8 | gene_seq = cmrg_regions[gene_name][1] 9 | aln_range0 = pgrlite.query_sdb(pg_db, gene_seq, merge_range_tol=len(gene_seq) * 0.25) 10 | print("The number of hits for {} is {}".format((gene_name), len(aln_range0))) 11 | count = 0 12 | for sid, rgns in aln_range0.items(): 13 | count += len(rgns) 14 | 15 | print("The total aligned regions {} is {}".format(gene_name, count)) 16 | seq_info = pg_db.seq_info.copy() 17 | with open(os.path.join(out_dir, f"{gene_name}_hit.txt"), "w") as f: 18 | print("#source", "ctg", "len", "n_hit", sep="\t", file = f) 19 | for k in aln_range0: 20 | if len(aln_range0[k]) >= 1: 21 | ctg, src, len_ = seq_info[k] 22 | print(src, ctg, len_, len(aln_range0[k]), sep="\t", file = f) 23 | 24 | 25 | rgn_lengths = [] 26 | with open(os.path.join(out_dir, f"{gene_name}_hit_range.txt"), "w") as f: 27 | print("#sourc", "ctg", "len", "t_rgn_start", "t_rgn_end", "t_rgn_len", sep="\t", file = f) 28 | 29 | for k in list(aln_range0.keys()): 30 | b, e = aln_range0[k][0][0:2] 31 | if e-b < len(gene_seq) * 0.25: 32 | continue 33 | ctg, src, len_ = seq_info[k] 34 | print(src, ctg, len_, b, e, e-b, sep="\t", file = f ) 35 | rgn_lengths.append(e-b) 36 | 37 | with open(os.path.join(out_dir, f"{gene_name}_ht_copy_count.txt"), "w") as f: 38 | n_copy = {} 39 | for k in list(aln_range0.keys()): 40 | b, e = aln_range0[k][0][0:2] 41 | if e-b < len(gene_seq) * 0.25: 42 | continue 43 | n_copy[k] = len(aln_range0[k]) 44 | copy_count = Counter(n_copy.values()) 45 | for nc, nh in copy_count.items(): 46 | print("{}\tnumber_of_copy: {}\tnumber_of_haplotype_contig: {}".format(gene_name, nc, nh), file = f) 47 | 48 | seq_list = [] 49 | i = 0 50 | for k in list(aln_range0.keys()): 51 | ctg_name, source, _ = seq_info[k] 52 
| seq_id = k 53 | rgns = aln_range0[k].copy() 54 | rgns = pgrlite.merge_regions(rgns, tol=int(len(gene_seq)*0.25)) 55 | 56 | for rgn in rgns: 57 | b, e, length, orientation, aln = rgn 58 | if length < len(gene_seq)*0.25: 59 | continue 60 | seq = pg_db.get_sub_seq(source, ctg_name, b, e) 61 | if orientation == 1: 62 | seq = pgrlite.rc_byte_seq(seq) 63 | seq_list.append((i, "{}_{}_{}_{}".format(ctg_name, b, e, orientation), seq)) 64 | i += 1 65 | 66 | with open( os.path.join(out_dir, f"{gene_name}.fa"), "w") as f: 67 | for sid, name, seq in seq_list: 68 | print(">{} {}".format(name, sid), file = f) 69 | print(pgrlite.u8_to_string(seq), file = f) 70 | 71 | new_sdb = pgrlite.SeqIndexDB() 72 | new_sdb.load_from_seq_list(seq_list, w=48, k=48, r=1, min_span=24) 73 | new_sdb.generate_smp_gfa(0, os.path.join(out_dir, f"{gene_name}_48_48_1_24.gfa")) 74 | new_sdb.write_midx_to_text_file(os.path.join(out_dir, f"{gene_name}_48_48_1_24.midx")) 75 | 76 | links = new_sdb.get_smp_adj_list(0) 77 | link_count = Counter([(_[1],_[2]) for _ in links]) 78 | G = nx.DiGraph() 79 | for sid, v, w in links: 80 | if sid == 0: 81 | continue 82 | 83 | penwidth = link_count[(v,w)] * 0.01 84 | weight = penwidth 85 | G.add_edge(tuple(v[:2]), tuple(w[:2]), weight= weight, penwidth=penwidth) 86 | 87 | nx_pydot.write_dot(G, os.path.join(out_dir, "{}_48_48_1_24.dot".format(gene_name))) 88 | nx.write_gexf(G, os.path.join(out_dir, "{}_48_48_1_24.gexf".format(gene_name))) 89 | 90 | 91 | new_sdb.load_from_seq_list(seq_list, w=48, k=48, r=8, min_span=24) 92 | new_sdb.generate_smp_gfa(0, os.path.join(out_dir, f"{gene_name}_48_48_8_24.gfa")) 93 | new_sdb.write_midx_to_text_file(os.path.join(out_dir, f"{gene_name}_48_48_8_24.midx")) 94 | 95 | links = new_sdb.get_smp_adj_list(0) 96 | link_count = Counter([(_[1],_[2]) for _ in links]) 97 | G = nx.DiGraph() 98 | for sid, v, w in links: 99 | if sid == 0: 100 | continue 101 | 102 | penwidth = link_count[(v,w)] * 0.01 103 | weight = penwidth 104 | G.add_edge(tuple(v[:2]), tuple(w[:2]), weight= weight, penwidth=penwidth) 105 | 106 | nx_pydot.write_dot(G, os.path.join(out_dir, "{}_48_48_8_24.dot".format(gene_name))) 107 | nx.write_gexf(G, os.path.join(out_dir, "{}_48_48_8_24.gexf".format(gene_name))) 108 | 109 | new_sdb.generate_smp_gfa(0, os.path.join(out_dir, f"{gene_name}_48_48_8_24.gfa")) 110 | new_sdb.write_midx_to_text_file(os.path.join(out_dir, f"{gene_name}_48_48_8_24.midx")) 111 | 112 | 113 | if __name__ == "__main__": 114 | ref_db =pgrlite.AGCFile("/data/HPRC-y1-rebuild-04252022/hg19.agc") 115 | pb_db = pgrlite.SeqIndexDB() 116 | pb_db.load_from_agc_index("/data/HPRC-y1-rebuild-04252022") 117 | CMRG_coordinates = {} 118 | padding = 20000 119 | with open("/data/HG002_GRCh37_CMRG_coordinates.bed") as f: 120 | for r in f: 121 | r = r.strip().split("\t") 122 | CMRG_coordinates[r[3]]=("chr{}".format(r[0]), int(r[1])-padding, int(r[2])+padding) 123 | 124 | CMRG_hg19_seq = {} 125 | for g, c in CMRG_coordinates.items(): 126 | seq = ref_db.get_sub_seq('hg19.fasta', c[0], c[1], c[2]) 127 | CMRG_hg19_seq[g] = (c, seq) 128 | 129 | for g_name in CMRG_hg19_seq: 130 | print("analyzing {}".format(g_name)) 131 | generate_gfa(CMRG_hg19_seq, g_name, pb_db, "/scratch/GFA_files") 132 | -------------------------------------------------------------------------------- /pgr-tk/examples/get_variants.py: -------------------------------------------------------------------------------- 1 | import pgrtk 2 | import os, sys 3 | 4 | 5 | def filter_aln(aln_segs): 6 | """ 7 | ensure both target / query are strictly 
increasing 8 | """ 9 | 10 | last_ts = aln_segs[0][1][0] 11 | last_te = aln_segs[0][1][1] 12 | 13 | last_qs = aln_segs[0][0][0] 14 | last_qe = aln_segs[0][0][1] 15 | 16 | 17 | rtn = [ ((last_ts, last_te), (last_qs, last_qe)) ] 18 | 19 | for seg in aln_segs: 20 | 21 | if seg[1][1] < seg[1][0]: continue 22 | if seg[0][-1] != seg[1][-1]: continue 23 | 24 | if seg[1][0] >= last_te: 25 | 26 | last_ts = last_te 27 | last_te = seg[1][1] 28 | 29 | last_qs = last_qe 30 | last_qe = seg[0][1] 31 | 32 | if last_ts == last_te: 33 | continue 34 | 35 | 36 | rtn.append( ((last_ts, last_te), (last_qs, last_qe)) ) 37 | return rtn 38 | 39 | 40 | 41 | def filter_aln_rev(aln_segs): 42 | """ 43 | ensure both target / query are strictly increasing 44 | """ 45 | aln_segs = aln_segs.copy() 46 | aln_segs.reverse() 47 | last_ts = aln_segs[0][1][0] 48 | last_te = aln_segs[0][1][1] 49 | 50 | last_qs = aln_segs[0][0][0] 51 | last_qe = aln_segs[0][0][1] 52 | 53 | 54 | rtn = [ ((last_ts, last_te), (last_qs, last_qe)) ] 55 | 56 | for seg in aln_segs: 57 | 58 | if seg[1][1] < seg[1][0]: continue 59 | if seg[0][-1] == seg[1][-1]: continue 60 | 61 | if seg[1][0] >= last_te: 62 | 63 | last_ts = last_te 64 | last_te = seg[1][1] 65 | 66 | last_qe = last_qs 67 | last_qs = seg[0][0] 68 | 69 | 70 | if last_ts == last_te: 71 | continue 72 | 73 | 74 | rtn.append( ((last_ts, last_te), (last_qs, last_qe)) ) 75 | return rtn 76 | 77 | def seq_align_to_sdb(seq_db, seq1): 78 | 79 | query_res = pgrtk.query_sdb(seq_db, seq1, 80 | merge_range_tol=0, 81 | gap_penalty_factor=0.001, 82 | max_query_count=1, 83 | max_target_count=1) 84 | 85 | _, kmer_size, _, _, _ = seq_db.get_shmmr_spec() 86 | rtn = [] 87 | 88 | for sid, alns in query_res.items(): 89 | # print("#sid, hits:", sid, len(alns)) 90 | 91 | ref_seq = seq_db.get_seq_by_id(sid) 92 | 93 | for aln in alns: 94 | ts, te, tl, orientation = aln[:-1] 95 | # print(ts, te, tl, orientation) 96 | aln = aln[-1] 97 | if orientation == 0 : 98 | filter_alignments = filter_aln(aln) 99 | else: 100 | filter_alignments = filter_aln_rev(aln) 101 | # print("# anchors: ", len(aln), len(filter_aln(aln)), len(filter_aln_rev(aln))) 102 | 103 | for seg in filter_alignments: 104 | 105 | last_ts, last_te = seg[0][:2] 106 | last_qs, last_qe = seg[1][:2] 107 | 108 | last_ts -= kmer_size 109 | #last_te -= kmer_size 110 | 111 | s0str = pgrtk.u8_to_string(ref_seq[last_ts: last_te]) 112 | if orientation == 0: 113 | last_qs -= kmer_size 114 | s1str = pgrtk.u8_to_string(seq1[last_qs:last_qe]) 115 | else: 116 | last_qs -= kmer_size 117 | s1str = pgrtk.rc(pgrtk.u8_to_string(seq1[last_qs:last_qe])) 118 | 119 | if s0str[:16] != s1str[:16] or s0str[-16:] != s1str[-16:]: 120 | print("XXXX1 {} :\n{}\n{}\n".format(orientation, s0str[:56], s1str[:56])) 121 | print("XXXX2 {} :\n{}\n{}\n".format(orientation, s0str[-56:], s1str[-56:])) 122 | diff = None 123 | elif min(len(s0str),len(s1str)) == 0 or abs(len(s0str)-len(s1str)) > 256: 124 | diff = None 125 | else: 126 | diff = pgrtk.get_variant_segments(s0str, s1str, max_wf_length=min(64, len(s0str), len(s1str)), max_diff_percent=1) 127 | 128 | if diff is not None: 129 | if len(diff[0]) > 0: 130 | for d in diff[0]: 131 | rtn.append( ((sid, last_ts, last_te), (last_qs, last_qe), 132 | (d[0] + last_ts, d[1] + last_qs, d[2], d[3], d[4]), orientation) ) 133 | else: 134 | rtn.append( ((sid, last_ts, last_te), (last_qs, last_qe), 'ALL', orientation ) ) 135 | elif diff is None: 136 | rtn.append( ((sid, last_ts, last_te), (last_qs, last_qe), 'NULL', orientation ) ) 137 | return rtn 138 | 139 | 
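# A note on the records appended to `rtn` by seq_align_to_sdb() above: each one is
#   ((target_sid, target_bgn, target_end), (query_bgn, query_end), payload, orientation)
# where payload is a variant tuple from pgrtk.get_variant_segments() shifted to
# absolute target/query coordinates, the string 'ALL' when the whole segment matches
# without variants, or 'NULL' when the segment could not be compared (picked up
# below in main() as a structural-variant candidate).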
def main(sdb_prefix, query_seq_fasta_path, out_prefix = "out"): 140 | target_sdb = pgrtk.SeqIndexDB() 141 | target_sdb.load_from_frg_index(sdb_prefix) 142 | query_sdb = pgrtk.SeqIndexDB() 143 | query_sdb.load_from_fastx(query_seq_fasta_path) 144 | 145 | target_sinfo = target_sdb.seq_info.copy() 146 | sinfo = query_sdb.seq_info.copy() 147 | variant_file = open(out_prefix+".variants", "w") 148 | sv_candidate_file = open(out_prefix+".sv_candidate", "w") 149 | all_match_file = open(out_prefix+".all_match", "w") 150 | for sid in sinfo: 151 | ctg, src, length = sinfo[sid] 152 | query_seq = query_sdb.get_seq_by_id(sid) 153 | variants = seq_align_to_sdb(target_sdb, query_seq) 154 | for variant in variants: 155 | t_sid, ts, te = variant[0] 156 | qs, qe = variant[1] 157 | t_ctg, _, _ = target_sinfo[t_sid] 158 | rec = variant[2] 159 | if rec in ['ALL', 'NULL']: 160 | print(t_ctg, ts, te, ctg, qs, qe, variant[2], variant[3], sep="\t", file=all_match_file) 161 | else: 162 | print(t_ctg, ts, te, ctg, qs, qe, rec[0], variant[3], sep="\t", file=all_match_file) 163 | print(t_ctg, rec[0], rec[2], rec[3], rec[4], ctg, sep="\t", file=variant_file) 164 | if rec == "NULL": 165 | print(t_ctg, variant[0][1], variant[0][2], ctg, variant[1][0], variant[1][1], sep="\t", file=sv_candidate_file) 166 | variant_file.close() 167 | sv_candidate_file.close() 168 | 169 | 170 | 171 | 172 | 173 | if __name__ == "__main__": 174 | 175 | sdb_prefix = sys.argv[1] 176 | query_seq_fasta_path = sys.argv[2] 177 | prefix = sys.argv[3] 178 | main(sdb_prefix, query_seq_fasta_path, prefix) 179 | -------------------------------------------------------------------------------- /pgr-web/frontend/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pgr-web" 3 | version = "0.6.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | dioxus = { version = "0.4.0", features = [] } 10 | reqwest = { version = "0.11", features = ["json"] } 11 | ws_stream_wasm = "0.7.4" 12 | serde = { version = "1.0.80", features = ["derive"] } 13 | serde_derive = "^1.0.59" 14 | serde_json = "1.0.59" 15 | rustc-hash = "1.1.0" 16 | dioxus-web = "0.4.0" 17 | wasm-logger = "0.2.0" 18 | log = "0.4.17" 19 | serde_qs = "0.12.0" 20 | serde_with = "3.0.0" 21 | url = "2.4.0" 22 | getrandom = { version = "0.2", features = ["js"] } 23 | pharos = "0.5" 24 | wasm-bindgen = "0.2.86" 25 | sledgehammer_bindgen = "0.2.2" 26 | futures-util = "0.3.28" 27 | futures-lite = "1.13.0" 28 | itertools = "0.10.5" 29 | 30 | [dependencies.web-sys] 31 | version = "0.3.59" 32 | features = ["console", 33 | "Document", 34 | "DomTokenList", 35 | "Element", 36 | "HtmlSelectElement", 37 | "HtmlOptionsCollection"] 38 | -------------------------------------------------------------------------------- /pgr-web/frontend/Trunk.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | target = "index.html" 3 | dist = "../dist" 4 | 5 | [watch] 6 | watch = ["index.html", "src/main.rs"] 7 | 8 | [serve] 9 | # The address to serve on. 10 | address = "127.0.0.1" 11 | # The port to serve on. 12 | port = 8080 13 | # Open a browser tab once the initial build is complete. 14 | open = false 15 | # Disable auto-reload of the web app. 
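# Set to true to turn off the automatic page reload Trunk triggers on rebuilds; false keeps live reload on.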
16 | no_autoreload = false 17 | -------------------------------------------------------------------------------- /pgr-web/frontend/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | PGR-TK WS 5 | 6 | 7 | 8 | 9 |
10 | 11 | 12 | -------------------------------------------------------------------------------- /pgr-web/frontend/src/data/ROIs.json: -------------------------------------------------------------------------------- 1 | { 2 | "AMY1A": { 3 | "source": "hg19_tagged.fa", 4 | "ctg": "chr1_hg19", 5 | "bgn": 104198140, 6 | "end": 104207173, 7 | "padding": 150000, 8 | "merge_range_tol": 120000, 9 | "w": 48, 10 | "k": 56, 11 | "r": 1, 12 | "min_span": 12, 13 | "sketch": false, 14 | "min_cov": 2, 15 | "min_branch_size": 8, 16 | "bundle_length_cutoff": 500, 17 | "bundle_merge_distance": 10000 18 | }, 19 | "TBC1D3": { 20 | "source": "hg38_tagged.fa", 21 | "ctg": "chr17_hg38", 22 | "bgn": 37885486, 23 | "end": 38325932, 24 | "padding": 100000, 25 | "merge_range_tol": 120000, 26 | "w": 48, 27 | "k": 56, 28 | "r": 1, 29 | "min_span": 12, 30 | "sketch": false, 31 | "min_cov": 2, 32 | "min_branch_size": 8, 33 | "bundle_length_cutoff": 500, 34 | "bundle_merge_distance": 10000 35 | }, 36 | "LPA": { 37 | "source": "hg38_tagged.fa", 38 | "ctg": "chr6_hg38", 39 | "bgn": 160531482, 40 | "end": 160664275, 41 | "padding": 40000, 42 | "merge_range_tol": 100000, 43 | "w": 64, 44 | "k": 56, 45 | "r": 1, 46 | "min_span": 12, 47 | "sketch": false, 48 | "min_cov": 2, 49 | "min_branch_size": 8, 50 | "bundle_length_cutoff": 500, 51 | "bundle_merge_distance": 10000 52 | }, 53 | "HLA Class II": { 54 | "source": "hg38_tagged.fa", 55 | "ctg": "chr6_hg38", 56 | "bgn": 32163513, 57 | "end": 32992088, 58 | "padding": 25000, 59 | "merge_range_tol": 2000000, 60 | "w": 128, 61 | "k": 56, 62 | "r": 12, 63 | "min_span": 64, 64 | "sketch": false, 65 | "min_cov": 2, 66 | "min_branch_size": 8, 67 | "bundle_length_cutoff": 500, 68 | "bundle_merge_distance": 10000 69 | }, 70 | "ChrY_Repeats": { 71 | "source": "hg38_tagged.fa", 72 | "ctg": "chrY_hg38", 73 | "bgn": 23129355, 74 | "end": 24907040, 75 | "padding": 1500000, 76 | "merge_range_tol": 2000000, 77 | "w": 128, 78 | "k": 56, 79 | "r": 12, 80 | "min_span": 64, 81 | "sketch": false, 82 | "min_cov": 2, 83 | "min_branch_size": 8, 84 | "bundle_length_cutoff": 500, 85 | "bundle_merge_distance": 10000 86 | }, 87 | "FLG": { 88 | "source": "hg38_tagged.fa", 89 | "ctg": "chr1_hg38", 90 | "bgn": 152301265, 91 | "end": 152328339, 92 | "padding": 5000, 93 | "merge_range_tol": 100, 94 | "w": 48, 95 | "k": 56, 96 | "r": 1, 97 | "min_span": 8, 98 | "sketch": false, 99 | "min_cov": 2, 100 | "min_branch_size": 8, 101 | "bundle_length_cutoff": 500, 102 | "bundle_merge_distance": 10000 103 | }, 104 | "KIR": { 105 | "source": "hg38_tagged.fa", 106 | "ctg": "chr19_hg38", 107 | "bgn": 54687267, 108 | "end": 54907736, 109 | "padding": 5000, 110 | "merge_range_tol": 5000, 111 | "w": 48, 112 | "k": 56, 113 | "r": 5, 114 | "min_span": 16, 115 | "sketch": false, 116 | "min_cov": 2, 117 | "min_branch_size": 8, 118 | "bundle_length_cutoff": 500, 119 | "bundle_merge_distance": 10000 120 | } 121 | } -------------------------------------------------------------------------------- /pgr-web/pgr-server/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pgr-server" 3 | version = "0.6.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | axum = { version="0.5.13", features = ["ws"] } 10 | tokio = { version = "1.0", features = ["full"] } 11 | pgr-db = { path = "../../pgr-db/", default-features = false} 12 | rustc-hash = "1.1.0" 13 | rayon = "1.5.2" 14 | 
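# serde / serde_json below handle the ROI records and the JSON API payloads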
serde_json = "1.0.83" 15 | serde = { version = "1.0.117", features = ["derive", "rc"] } 16 | tower-http = { version = "0.3.0", features = ["cors", "trace", "fs"] } 17 | tower = "0.4.13" 18 | tracing = "0.1" 19 | tracing-subscriber = { version = "0.3", features = ["env-filter"] } 20 | svg = "0.16" 21 | clap = { version = "4.2.7", features = ["derive"] } 22 | serde_qs = "0.12.0" 23 | serde_with = "3.0.0" 24 | 25 | [features] 26 | default = ["with_agc"] 27 | with_agc = ["pgr-db/with_agc"] 28 | -------------------------------------------------------------------------------- /pgr-web/pgr-server/src/ROIs.json: -------------------------------------------------------------------------------- 1 | { 2 | "AMY1A": { 3 | "source": "hg19_tagged.fa", 4 | "ctg": "chr1_hg19", 5 | "bgn": 104198140, 6 | "end": 104207173, 7 | "padding": 150000, 8 | "merge_range_tol": 120000, 9 | "w": 48, 10 | "k": 56, 11 | "r": 1, 12 | "min_span": 12, 13 | "sketch": false, 14 | "min_cov": 2, 15 | "min_branch_size": 8, 16 | "bundle_length_cutoff": 500, 17 | "bundle_merge_distance": 10000 18 | }, 19 | "TBC1D3": { 20 | "source": "hg38_tagged.fa", 21 | "ctg": "chr17_hg38", 22 | "bgn": 37885486, 23 | "end": 38325932, 24 | "padding": 100000, 25 | "merge_range_tol": 120000, 26 | "w": 48, 27 | "k": 56, 28 | "r": 1, 29 | "min_span": 12, 30 | "sketch": false, 31 | "min_cov": 2, 32 | "min_branch_size": 8, 33 | "bundle_length_cutoff": 500, 34 | "bundle_merge_distance": 10000 35 | }, 36 | "LPA": { 37 | "source": "hg38_tagged.fa", 38 | "ctg": "chr6_hg38", 39 | "bgn": 160531482, 40 | "end": 160664275, 41 | "padding": 40000, 42 | "merge_range_tol": 100000, 43 | "w": 64, 44 | "k": 56, 45 | "r": 1, 46 | "min_span": 12, 47 | "sketch": false, 48 | "min_cov": 2, 49 | "min_branch_size": 8, 50 | "bundle_length_cutoff": 500, 51 | "bundle_merge_distance": 10000 52 | }, 53 | "HLA Class II": { 54 | "source": "hg38_tagged.fa", 55 | "ctg": "chr6_hg38", 56 | "bgn": 32163513, 57 | "end": 32992088, 58 | "padding": 25000, 59 | "merge_range_tol": 2000000, 60 | "w": 128, 61 | "k": 56, 62 | "r": 12, 63 | "min_span": 64, 64 | "sketch": false, 65 | "min_cov": 2, 66 | "min_branch_size": 8, 67 | "bundle_length_cutoff": 500, 68 | "bundle_merge_distance": 10000 69 | }, 70 | "ChrY_Repeats": { 71 | "source": "hg38_tagged.fa", 72 | "ctg": "chrY_hg38", 73 | "bgn": 23129355, 74 | "end": 24907040, 75 | "padding": 1500000, 76 | "merge_range_tol": 2000000, 77 | "w": 128, 78 | "k": 56, 79 | "r": 12, 80 | "min_span": 64, 81 | "sketch": false, 82 | "min_cov": 2, 83 | "min_branch_size": 8, 84 | "bundle_length_cutoff": 500, 85 | "bundle_merge_distance": 10000 86 | }, 87 | "FLG": { 88 | "source": "hg38_tagged.fa", 89 | "ctg": "chr1_hg38", 90 | "bgn": 152301265, 91 | "end": 152328339, 92 | "padding": 5000, 93 | "merge_range_tol": 100, 94 | "w": 48, 95 | "k": 56, 96 | "r": 1, 97 | "min_span": 8, 98 | "sketch": false, 99 | "min_cov": 2, 100 | "min_branch_size": 8, 101 | "bundle_length_cutoff": 500, 102 | "bundle_merge_distance": 10000 103 | }, 104 | "KIR": { 105 | "source": "hg38_tagged.fa", 106 | "ctg": "chr19_hg38", 107 | "bgn": 54687267, 108 | "end": 54907736, 109 | "padding": 5000, 110 | "merge_range_tol": 5000, 111 | "w": 48, 112 | "k": 56, 113 | "r": 5, 114 | "min_span": 16, 115 | "sketch": false, 116 | "min_cov": 2, 117 | "min_branch_size": 8, 118 | "bundle_length_cutoff": 500, 119 | "bundle_merge_distance": 10000 120 | } 121 | } -------------------------------------------------------------------------------- /pgr-web/pgr-server/src/main.rs: 
-------------------------------------------------------------------------------- 1 | pub mod bundle_processing; 2 | 3 | use axum::{ 4 | body::{boxed, Body}, 5 | extract::ws::{WebSocket, WebSocketUpgrade}, 6 | extract::Query, 7 | http::{Response, StatusCode}, 8 | response, 9 | response::Html, 10 | routing::{get, post}, 11 | Json, Router, 12 | }; 13 | use bundle_processing::*; 14 | use clap::{self, Parser}; 15 | use pgr_db::ext::*; 16 | use rustc_hash::FxHashMap; 17 | use std::net::SocketAddr; 18 | use std::{ 19 | net::{IpAddr, Ipv6Addr}, 20 | path::PathBuf, 21 | str::FromStr, 22 | sync::Arc, 23 | }; 24 | use tokio::fs; 25 | use tower::{ServiceBuilder, ServiceExt}; 26 | use tower_http::cors::Any; 27 | use tower_http::cors::CorsLayer; 28 | use tower_http::services::ServeDir; 29 | use tower_http::trace::TraceLayer; 30 | use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; 31 | 32 | #[derive(Parser, Debug)] 33 | #[clap(name = "pgr-server", about = "Experimental Server")] 34 | struct Opt { 35 | /// set the listen addr 36 | #[clap(short = 'a', long = "addr", default_value = "::1")] 37 | addr: String, 38 | 39 | /// set the listen port 40 | #[clap(short = 'p', long = "port", default_value = "5000")] 41 | port: u16, 42 | 43 | /// set the directory where static files are to be found 44 | #[clap(long = "static-dir", default_value = "./dist")] 45 | static_dir: String, 46 | 47 | /// set data_path_prefix 48 | #[clap( 49 | short = 'd', 50 | long = "data-path-prefix", 51 | default_value = "./pgr-tk-HGRP-y1-evaluation-set-v0" 52 | )] 53 | data_path_prefix: String, 54 | 55 | /// load the index from a frag-file (.frg) database instead of an AGC index 56 | #[clap(short = 'f', long = "frg-file")] 57 | frg_file: bool, 58 | } 59 | 60 | #[tokio::main] 61 | async fn main() { 62 | let opt = Opt::parse(); 63 | 64 | tracing_subscriber::registry() 65 | .with(tracing_subscriber::EnvFilter::new( 66 | std::env::var("RUST_LOG") 67 | .unwrap_or_else(|_| "example_tracing_aka_logging=debug,tower_http=debug".into()), 68 | )) 69 | .with(tracing_subscriber::fmt::layer()) 70 | .init(); 71 | 72 | let mut seq_db = SeqIndexDB::new(); 73 | 74 | if opt.frg_file { 75 | let _ = seq_db.load_from_frg_index(opt.data_path_prefix); 76 | } else { 77 | #[cfg(feature = "with_agc")] 78 | let _ = seq_db.load_from_agc_index(opt.data_path_prefix); 79 | 80 | #[cfg(not(feature = "with_agc"))] 81 | panic!("This command is compiled with only frg file support, please specify `--frg-file`"); 82 | } 83 | 84 | let seq_db = Arc::new(seq_db); 85 | // build our application with a route 86 | let app = Router::new() 87 | .route( 88 | "/api/get_number_of_ctgs", 89 | get({ 90 | let seq_db = seq_db.clone(); 91 | move || get_number_of_ctgs(seq_db) 92 | }), 93 | ) 94 | .route( 95 | "/api/post_query_for_json_data", 96 | post({ 97 | let seq_db = seq_db.clone(); 98 | move |params| post_query_for_json_data(params, seq_db) 99 | }), 100 | ) 101 | .route( 102 | "/api/get_html_by_query", 103 | get({ 104 | let seq_db = seq_db.clone(); 105 | move |params| get_html_by_query(params, seq_db) 106 | }), 107 | ) 108 | .route("/ws", get(ws_handler)) 109 | .layer( 110 | CorsLayer::new() 111 | .allow_origin(Any) 112 | //.allow_origin("http://127.0.0.1:8080".parse::().unwrap()) 113 | .allow_methods(Any) 114 | .allow_headers(Any), 115 | ) 116 | .layer(ServiceBuilder::new().layer(TraceLayer::new_for_http())) 117 | .fallback(get(|req| async move { 118 | match ServeDir::new(&opt.static_dir).oneshot(req).await { 119 | Ok(res) => { 120 | let status = res.status(); 121 | match status { 122 | StatusCode::NOT_FOUND => { 123 |
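// asset not found under static-dir: fall back to index.html so client-side routing can handle the path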
let index_path = PathBuf::from(&opt.static_dir).join("index.html"); 124 | let index_content = match fs::read_to_string(index_path).await { 125 | Err(_) => { 126 | return Response::builder() 127 | .status(StatusCode::NOT_FOUND) 128 | .body(boxed(Body::from("index file not found"))) 129 | .unwrap() 130 | } 131 | Ok(index_content) => index_content, 132 | }; 133 | 134 | Response::builder() 135 | .status(StatusCode::OK) 136 | .body(boxed(Body::from(index_content))) 137 | .unwrap() 138 | } 139 | _ => res.map(boxed), 140 | } 141 | } 142 | Err(_err) => Response::builder() 143 | .status(StatusCode::INTERNAL_SERVER_ERROR) 144 | .body(boxed(Body::from(format!("internal errors")))) 145 | .expect("error response"), 146 | } 147 | })); 148 | 149 | // run it 150 | let addr = SocketAddr::from(( 151 | IpAddr::from_str(opt.addr.as_str()).unwrap_or(IpAddr::V6(Ipv6Addr::LOCALHOST)), 152 | opt.port, 153 | )); 154 | println!("listening on {}", addr); 155 | axum::Server::bind(&addr) 156 | .serve(app.into_make_service()) 157 | .await 158 | .unwrap(); 159 | } 160 | 161 | /* 162 | async fn handler(seq_db: Arc) -> impl IntoResponse { 163 | let n_ctg = 0; 164 | let mut headers = HeaderMap::new(); 165 | headers.insert(header::CONTENT_TYPE, "text/plain".parse().unwrap()); 166 | headers.insert(header::URI, "http://127.0.0.1:3000".parse().unwrap()); 167 | let rtn = format!("Hello, World! {}", n_ctg); 168 | (headers, rtn) 169 | } 170 | */ 171 | 172 | async fn get_number_of_ctgs(seq_db: Arc) -> Json { 173 | let n_ctg = seq_db.seq_index.as_ref().unwrap().len(); 174 | Json(n_ctg) 175 | } 176 | 177 | async fn post_query_for_json_data( 178 | Json(seq_query_spec): Json>, 179 | seq_db: Arc, 180 | ) -> Json> { 181 | if seq_query_spec.is_none() { 182 | return Json(None); 183 | }; 184 | 185 | let seq_query_spec = seq_query_spec.unwrap(); 186 | println!("{:?}", seq_query_spec); 187 | Json(get_target_and_principal_bundle_decomposition( 188 | &seq_query_spec, 189 | seq_db, 190 | )) 191 | } 192 | 193 | async fn get_html_by_query( 194 | Query(seq_query_spec): Query, 195 | seq_db: Arc, 196 | ) -> Html { 197 | //if seq_query_spec.is_none() { 198 | // return Html("No Query Yet".into()); 199 | //}; 200 | 201 | //let seq_query_spec = seq_query_spec.unwrap(); 202 | println!("{:?}", seq_query_spec); 203 | 204 | let data = get_target_and_principal_bundle_decomposition(&seq_query_spec, seq_db); 205 | let output = pb_data_to_html_string(&data.unwrap()); 206 | 207 | Html(output) 208 | } 209 | 210 | async fn ws_handler(ws: WebSocketUpgrade) -> response::Response { 211 | ws.on_upgrade(ws_handle_socket) 212 | } 213 | 214 | use std::cell::OnceCell; 215 | // Todo... 
read ROIs.json into a String at runtime rather than embedding it with a static 216 | static ROI_JSON: &str = include_str!("ROIs.json"); 217 | 218 | async fn ws_handle_socket(mut socket: WebSocket) { 219 | let ROI: OnceCell> = OnceCell::new(); 220 | let _ = ROI.set(serde_json::from_str(ROI_JSON).unwrap()); 221 | 222 | while let Some(msg) = socket.recv().await { 223 | let msg = if let Ok(msg) = msg { 224 | println!("WS msg: {:?}", msg); 225 | if let axum::extract::ws::Message::Text(msg) = msg { 226 | if !msg.is_empty() { 227 | let roi = ROI.get().unwrap(); 228 | let keys = roi.keys(); 229 | let mut keys = keys.filter(|&s| (*s).starts_with(&msg)).collect::>(); 230 | keys.sort(); 231 | let json = serde_json::to_string( 232 | &keys 233 | .iter() 234 | .map(|&k| ((*k).clone(), roi.get(k).unwrap().clone())) 235 | .collect::>(), 236 | ) 237 | .unwrap(); 238 | axum::extract::ws::Message::Text(json) 239 | } else { 240 | axum::extract::ws::Message::Text("{}".to_string()) 241 | } 242 | } else { 243 | axum::extract::ws::Message::Text("{}".to_string()) 244 | } 245 | } else { 246 | // client disconnected 247 | return; 248 | }; 249 | 250 | if socket.send(msg).await.is_err() { 251 | // client disconnected 252 | return; 253 | } 254 | } 255 | } 256 | -------------------------------------------------------------------------------- /pgr-web/prod.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | 5 | pushd frontend 6 | trunk build --release 7 | popd 8 | 9 | cargo run --bin pgr-server --release -- --addr 0.0.0.0 --port 3000 --static-dir ./dist --data-path-prefix /wd/pgr-tk-demo-data/data/pgr-tk-HGRP-y1-evaluation-set-v0 10 | -------------------------------------------------------------------------------- /pgr-web/prod_no_agc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | 5 | pushd frontend 6 | trunk build --release 7 | popd 8 | 9 | cargo run --bin pgr-server --no-default-features --release -- --addr 0.0.0.0 --port 3000 --static-dir ./dist --frg-file --data-path-prefix $HOME/Sandbox/pgr-tk-data/HGRP-y1-evaluation-set_fragdb 10 | -------------------------------------------------------------------------------- /pgr-web/scripts/ROIs_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "AMY1A": { 3 | "source": "hg19_tagged.fa", 4 | "ctg": "chr1_hg19", 5 | "bgn": 104198140, 6 | "end": 104207173, 7 | "padding": 150000, 8 | "merge_range_tol": 120000, 9 | "w": 48, 10 | "k": 56, 11 | "r": 1, 12 | "min_span": 12, 13 | "sketch": false, 14 | "min_cov": 2, 15 | "min_branch_size": 8, 16 | "bundle_length_cutoff": 500, 17 | "bundle_merge_distance": 10000 18 | }, 19 | "TBC1D3": { 20 | "source": "hg38_tagged.fa", 21 | "ctg": "chr17_hg38", 22 | "bgn": 37885486, 23 | "end": 38325932, 24 | "padding": 100000, 25 | "merge_range_tol": 120000, 26 | "w": 48, 27 | "k": 56, 28 | "r": 1, 29 | "min_span": 12, 30 | "sketch": false, 31 | "min_cov": 2, 32 | "min_branch_size": 8, 33 | "bundle_length_cutoff": 500, 34 | "bundle_merge_distance": 10000 35 | }, 36 | "LPA": { 37 | "source": "hg38_tagged.fa", 38 | "ctg": "chr6_hg38", 39 | "bgn": 160531482, 40 | "end": 160664275, 41 | "padding": 40000, 42 | "merge_range_tol": 100000, 43 | "w": 64, 44 | "k": 56, 45 | "r": 1, 46 | "min_span": 12, 47 | "sketch": false, 48 | "min_cov": 2, 49 | "min_branch_size": 8, 50 | "bundle_length_cutoff": 500, 51 | "bundle_merge_distance":
10000 52 | }, 53 | "HLA Class II": { 54 | "source": "hg38_tagged.fa", 55 | "ctg": "chr6_hg38", 56 | "bgn": 32163513, 57 | "end": 32992088, 58 | "padding": 25000, 59 | "merge_range_tol": 2000000, 60 | "w": 128, 61 | "k": 56, 62 | "r": 12, 63 | "min_span": 64, 64 | "sketch": false, 65 | "min_cov": 2, 66 | "min_branch_size": 8, 67 | "bundle_length_cutoff": 500, 68 | "bundle_merge_distance": 10000 69 | }, 70 | "ChrY_Repeats": { 71 | "source": "hg38_tagged.fa", 72 | "ctg": "chrY_hg38", 73 | "bgn": 23129355, 74 | "end": 24907040, 75 | "padding": 1500000, 76 | "merge_range_tol": 2000000, 77 | "w": 128, 78 | "k": 56, 79 | "r": 12, 80 | "min_span": 64, 81 | "sketch": false, 82 | "min_cov": 2, 83 | "min_branch_size": 8, 84 | "bundle_length_cutoff": 500, 85 | "bundle_merge_distance": 10000 86 | }, 87 | "FLG": { 88 | "source": "hg38_tagged.fa", 89 | "ctg": "chr1_hg38", 90 | "bgn": 152301265, 91 | "end": 152328339, 92 | "padding": 5000, 93 | "merge_range_tol": 100, 94 | "w": 48, 95 | "k": 56, 96 | "r": 1, 97 | "min_span": 8, 98 | "sketch": false, 99 | "min_cov": 2, 100 | "min_branch_size": 8, 101 | "bundle_length_cutoff": 500, 102 | "bundle_merge_distance": 10000 103 | }, 104 | "KIR": { 105 | "source": "hg38_tagged.fa", 106 | "ctg": "chr19_hg38", 107 | "bgn": 54687267, 108 | "end": 54907736, 109 | "padding": 5000, 110 | "merge_range_tol": 5000, 111 | "w": 48, 112 | "k": 56, 113 | "r": 5, 114 | "min_span": 16, 115 | "sketch": false, 116 | "min_cov": 2, 117 | "min_branch_size": 8, 118 | "bundle_length_cutoff": 500, 119 | "bundle_merge_distance": 10000 120 | } 121 | } -------------------------------------------------------------------------------- /pgr-web/scripts/generare_ROIs.sh: -------------------------------------------------------------------------------- 1 | python3 get_ROIs.py > ../pgr-server/src/ROIs.json 2 | -------------------------------------------------------------------------------- /pgr-web/scripts/get_ROIs.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | 4 | 5 | #gene2query = {} 6 | gene2query = json.loads(open("ROIs_examples.json").read()) 7 | ## we need the gene coordinate file from https://s3.amazonaws.com/igv.org.genomes/hg38/ncbiRefSeq.sorted.txt.gz 8 | with gzip.open("ncbiRefSeq.sorted.txt.gz") as f: 9 | for row in f: 10 | row = row.decode("utf-8") 11 | row = row.strip().split("\t") 12 | g = row[12] 13 | ch = row[2] 14 | if len(ch.split("_")) > 1: 15 | continue 16 | strand = row[3] 17 | bgn = int(row[4]) 18 | end = int(row[5]) 19 | if g not in gene2query: 20 | gene2query[g] = { 21 | "source": "hg38_tagged.fa", 22 | "ctg": f"{ch}_hg38", 23 | "bgn": bgn, 24 | "end": end, 25 | "padding": 10000, 26 | "merge_range_tol": 120000, 27 | "w": 48, 28 | "k": 56, 29 | "r": 1, 30 | "min_span": 12, 31 | "sketch": False, 32 | "min_cov": 2, 33 | "min_branch_size": 8, 34 | "bundle_length_cutoff": 500, 35 | "bundle_merge_distance": 10000 36 | } 37 | 38 | 39 | print(json.dumps(gene2query)) 40 | 41 | --------------------------------------------------------------------------------
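For reference, here is a minimal sketch of exercising the pgr-server JSON API once `prod.sh` (or `prod_no_agc.sh`) has the server listening on port 3000. The routes are the ones registered in `pgr-server/src/main.rs` above; posting an ROI record from the JSON files above as the query body assumes those fields deserialize into the server's `SequenceQuerySpec`, which is an assumption rather than something verified here:

```
import json
import requests  # any HTTP client works; requests is assumed to be installed

BASE = "http://127.0.0.1:3000"

# number of contigs in the loaded index
print(requests.get(BASE + "/api/get_number_of_ctgs").json())

# post one region-of-interest record as the query spec
roi = json.load(open("scripts/ROIs_examples.json"))["AMY1A"]
resp = requests.post(BASE + "/api/post_query_for_json_data", json=roi)
print(resp.status_code)
```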