├── .github
│   └── workflows
│       ├── create_artifect_for_release.yml
│       └── test_and_build.yml
├── .gitignore
├── .gitmodules
├── Cargo.toml
├── LICENSE
├── LICENSE-GeneDX
├── README.md
├── aws_image_builder
│   └── pgr-tk.yaml
├── build.sh
├── build_no_agc.sh
├── build_no_agc_apple_silicon.sh
├── docker
│   ├── Dockerfile
│   ├── Dockerfile.build_env-22.04
│   └── github-build.pub
├── docker_exec_env
│   ├── Dockerfile
│   └── build.sh
├── docs_src
│   └── alnmap_formap.md
├── images
│   ├── AMY1A_example.png
│   └── PGR_TK_Sketch_MAPG_construction.png
├── justfile
├── pgr-bin
│   ├── Cargo.toml
│   ├── build.rs
│   ├── file_format_documents
│   │   ├── ctgsv.bed.md
│   │   ├── for_pgr-alnmap
│   │   │   ├── alnmap.md
│   │   │   ├── ctgmap.bed.md
│   │   │   ├── ctgmap.json.md
│   │   │   └── svcnd.bed.md
│   │   ├── gfa_format.md
│   │   ├── input_file_formats_for_pgr-pbundle-bed.md
│   │   ├── output_files_for_pgr-pbundle-decomp.md
│   │   └── principal_bundle_bed_file.md
│   ├── src
│   │   ├── _bin
│   │   │   ├── README.txt
│   │   │   ├── pgr-fasta-smp-count.rs
│   │   │   ├── pgr-filter.rs
│   │   │   ├── pgr-multifilter.rs
│   │   │   ├── pgr-probe-match.rs
│   │   │   ├── pgr-shmmr-pair-count.rs
│   │   │   └── pgr-test.rs
│   │   └── bin
│   │       ├── pgr-alnmap.rs
│   │       ├── pgr-annotate-bed-file.rs
│   │       ├── pgr-annotate-vcf-file.rs
│   │       ├── pgr-compare-cov.rs
│   │       ├── pgr-compare-cov2.rs
│   │       ├── pgr-fetch-seqs.rs
│   │       ├── pgr-generate-chr-aln-plot.rs
│   │       ├── pgr-generate-diploid-vcf.rs
│   │       ├── pgr-generate-sv-analysis.rs
│   │       ├── pgr-make-frgdb.rs
│   │       ├── pgr-map-coordinate.rs
│   │       ├── pgr-mdb.rs
│   │       ├── pgr-merge-svcnd-bed.rs
│   │       ├── pgr-pbundle-aln.rs
│   │       ├── pgr-pbundle-bed2dist.rs
│   │       ├── pgr-pbundle-bed2offset.rs
│   │       ├── pgr-pbundle-bed2sorted.rs
│   │       ├── pgr-pbundle-bed2svg.rs
│   │       ├── pgr-pbundle-decomp.rs
│   │       ├── pgr-pbundle-shmmr2dist.rs
│   │       ├── pgr-query.rs
│   │       └── pgr-shmmr-count.rs
│   └── utility_scripts
│       └── get_cytoband_to_json.py
├── pgr-db
│   ├── Cargo.toml
│   ├── build.rs
│   ├── src
│   │   ├── agc_io.rs
│   │   ├── aln.rs
│   │   ├── bindings.rs
│   │   ├── ec.rs
│   │   ├── ext.rs
│   │   ├── fasta_io.rs
│   │   ├── frag_file_io.rs
│   │   ├── gff_db.rs
│   │   ├── graph_utils.rs
│   │   ├── kmer_filter.rs
│   │   ├── lib.rs
│   │   ├── seq_db.rs
│   │   ├── seqs2variants.rs
│   │   └── shmmrutils.rs
│   ├── test
│   │   └── test_data
│   │       ├── consensus_test.fa
│   │       ├── consensus_test2.fa
│   │       ├── consensus_test3.fa
│   │       ├── consensus_test4.fa
│   │       ├── consensus_test5.fa
│   │       ├── gen_agc.sh
│   │       ├── gen_frag_db.py
│   │       ├── seq0
│   │       ├── seq1
│   │       ├── test.agc
│   │       ├── test.gff3.gz
│   │       ├── test_agc_ref.fa
│   │       ├── test_agc_seqs.fa
│   │       ├── test_hits
│   │       ├── test_rev.fa
│   │       ├── test_seqs.fa
│   │       ├── test_seqs2.fa.gz
│   │       ├── test_seqs_frag.frg
│   │       ├── test_seqs_frag.mdb
│   │       ├── test_seqs_frag.midx
│   │       └── test_seqs_frag.sdx
│   └── wrapper.h
├── pgr-tk-workstation
│   ├── Dockerfile
│   ├── Readme.md
│   ├── build.sh
│   └── jupyterlab.sh
├── pgr-tk
│   ├── Cargo.toml
│   ├── Readme.md
│   ├── build.rs
│   ├── build.sh
│   ├── examples
│   │   ├── generate_gfa_for_CMRG.py
│   │   └── get_variants.py
│   ├── pgrtk
│   │   └── __init__.py
│   └── src
│       └── lib.rs
└── pgr-web
    ├── frontend
    │   ├── Cargo.toml
    │   ├── Trunk.toml
    │   ├── index.html
    │   └── src
    │       ├── data
    │       │   └── ROIs.json
    │       └── main.rs
    ├── pgr-server
    │   ├── Cargo.toml
    │   └── src
    │       ├── ROIs.json
    │       ├── bundle_processing.rs
    │       └── main.rs
    ├── prod.sh
    ├── prod_no_agc.sh
    └── scripts
        ├── ROIs_examples.json
        ├── generare_ROIs.sh
        └── get_ROIs.py

--------------------------------------------------------------------------------
/.github/workflows/create_artifect_for_release.yml:
--------------------------------------------------------------------------------
name: create_artifect_for_release

on:
  workflow_dispatch:

env:
  CARGO_TERM_COLOR: always

jobs:
  build:
    runs-on: ubuntu-latest
    container: docker.io/cschin/pgr-tk-build-env
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: recursive
      - name: setup
        run: |
          rustup default stable
      - name: Run tests
        run: cargo test --verbose --workspace --exclude pgrtk
      - name: Build
        run: bash build.sh
      - uses: actions/upload-artifact@v3
        with:
          name: build-artifact
          path: |
            target/release/pgr-mdb
            target/release/pgr-fetch-seqs
            target/release/pgr-make-frgdb
            target/release/pgr-pbundle-bed2dist
            target/release/pgr-pbundle-bed2offset
            target/release/pgr-pbundle-bed2sorted
            target/release/pgr-pbundle-bed2svg
            target/release/pgr-pbundle-decomp
            target/release/pgr-query
            target/wheels/*.whl

--------------------------------------------------------------------------------
/.github/workflows/test_and_build.yml:
--------------------------------------------------------------------------------
name: test_and_build

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

env:
  CARGO_TERM_COLOR: always

jobs:
  build:
    runs-on: ubuntu-latest
    container: docker.io/cschin/pgr-tk-build-env
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - name: setup
        run: /opt/cargo/bin/rustup default stable
      - name: Run tests
        run: /opt/cargo/bin/cargo test --verbose --workspace --exclude pgrtk
      - name: Build
        run: bash build.sh
      - uses: actions/upload-artifact@v4
        with:
          name: build-artifact
          path: |
            target/release/pgr-mdb
            target/release/pgr-fetch-seqs
            target/release/pgr-make-frgdb
            target/release/pgr-pbundle-bed2dist
            target/release/pgr-pbundle-bed2offset
            target/release/pgr-pbundle-bed2sorted
            target/release/pgr-pbundle-bed2svg
            target/release/pgr-pbundle-decomp
            target/release/pgr-query
            target/wheels/*.whl

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/target
*/target
*/Cargo.lock
*/libagc.so
*/test/test_data/test_shmmr.db
libagc.so
*.pyc
Cargo.lock
pgr-tk/pgrtk/*.so
pgr-tk-workstation/*.whl
docker/github-build
.vscode/settings.json
.gitignore
pgr-web/dist/*

--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "agc"]
	path = agc
	url = https://github.com/cschin/agc.git

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[workspace]
members = ["pgr-db", "pgr-bin", 'pgr-tk', 'pgr-web/pgr-server', "pgr-web/frontend"]
resolver = "2"

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Pangenome Research ToolKit

2023-2024 (c) Jason Chin

MIT License

Copyright (c) 2024 GeneDx

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/LICENSE-GeneDX:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 GeneDx

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PGR-tk: A PanGenomic Research Tool Kit

[![test_and_build](https://github.com/cschin/pgr-tk/actions/workflows/test_and_build.yml/badge.svg)](https://github.com/cschin/pgr-tk/actions/workflows/test_and_build.yml)

This repository is a project to provide Python and Rust libraries to facilitate pangenomics analysis. Several algorithms and data structures used for the Peregrine Genome Assembler are useful for pangenomics analysis as well. This repo takes those algorithms and data structures and combines them with other handy third-party tools to expose them as a Python library (with Rust code for the computing parts that need performance).

## What is PGR-tk?

Research Preprint:

[Multiscale Analysis of Pangenome Enables Improved Representation of Genomic Diversity For Repetitive And Clinically Relevant Genes](https://www.biorxiv.org/content/10.1101/2022.08.05.502980v2)

PGR-TK provides pangenome assembly management, querying, and Minimizer Anchored Pangenome (MAP) graph generation.

![Pangenome Data Management and Minimizer Anchored Pangenome Graph Generation](/images/PGR_TK_Sketch_MAPG_construction.png)

With the MAP graph, we can use the "principal bundle decomposition" to study complicated structural variants and genome rearrangements in human populations.

![AMY1A Example](/images/AMY1A_example.png)


## Documentation, Usage and Examples

Command Line Tools:

PGR-TK provides the following tools to

- create the PGR-TK sequence and index database
  - `pgr-mdb`: create a pgr minimizer database with the AGC backend
  - `pgr-make-frgdb`: create a PGR-TK fragment minimizer database with the frg format backend
- query the database to fetch sequences
  - `pgr-query`: query a PGR-TK pangenome sequence database, output the hit summary and generate fasta files from the target sequences
- generate the MAP-graph in GFA format and the principal bundle decomposition bed file
  - `pgr-pbundle-decomp`: generate the principal bundle decomposition through the MAP graph from a fasta file
- generate SVG from the principal bundle decomposition bed file
  - `pgr-pbundle-bed2svg`: generate SVG from a principal bundle bed file
- auxiliary tools
  - `pgr-pbundle-bed2sorted`: generate an annotation file with a sorting order from the principal bundle decomposition
  - `pgr-pbundle-bed2dist`: generate alignment scores between sequences using bundle decomposition from a principal bundle bed file

For each command, `command --help` provides the detailed usage information.

The API documentation is at https://genedx.github.io/pgr-tk/

A collection of Jupyter Notebooks is at https://github.com/genedx/pgr-tk-notebooks/

## Built Binaries

Check https://github.com/genedx/pgr-tk/releases


## Build

See `docker/Dockerfile.build_env-20.04` for a build environment under Ubuntu 20.04.
With the proper build environment, just run `bash build.sh` to build everything.

For example, on macOS with Docker installed, you can clone the repository and build a Linux binary
within an Ubuntu 20.04 Linux distribution as follows:

1. Build the Docker image for a build environment:

```
git clone --recursive git@github.com:cschin/pgr-tk.git # clone the repo
cd pgr-tk/docker
ln -s Dockerfile.build_env-20.04 Dockerfile
docker build -t pgr-tk-build .
```

2. In the root directory of the repo `pgr-tk`:

Execute
```
docker run -it --rm -v $PWD:/wd/pgr-tk pgr-tk-build /bin/bash
```

3. Build `pgr-tk` inside the Docker container from the image `pgr-tk-build`:

```
cd /wd/pgr-tk
bash build.sh
```

The built Python wheels will be in `target/wheels`, which can be installed for an Ubuntu 20.04 Python 3.8 distribution. You can install them in the `pgr-tk-build` image as well to test them out.


### Build Singularity image

If you have built pgr-tk in a Docker container, you can use the following steps to build a Singularity image based on your Docker container.

**Step 1: Commit Docker container to image**

```bash
docker commit <container_id> <image_name>:<tag>
```

**Step 2: Push Docker image to Docker Hub**

```bash
docker login # if not already logged in
docker push <image_name>:<tag>
```

**Step 3: Build Singularity image**

```bash
singularity build ./pgr-tk.v0.5.1.sif docker://<username>/<image_name>:<tag>
```

This will generate a .sif file in the current directory.

**Step 4: Execute**

```bash
singularity exec --fakeroot -B <path>:/<mount_point> ./pgr-tk.v0.5.1.sif pgr-mdb test.input test_idx
```

Replace `<path>` with the actual path you wish to bind to the container.

The `--fakeroot` option allows you to build and run images as a "fake" root user.

## Install stable version v0.3.6 with Bioconda

If you have a conda install, you can try this to build a conda environment to use pgr-tk v0.3.6 (on Linux only):

```
conda create -n pgr-tk python=3.8
conda activate pgr-tk
conda install -c bioconda -c conda-forge python_abi libstdcxx-ng=12 libclang13 pgr-tk=0.3.6
```

## Troubleshooting

`Segmentation fault (core dumped)`

Usually, the issue arises because AGC encounters a version incompatibility when called by pgr-tk. The version of AGC that has been well-tested is [453c0afd](https://github.com/cschin/agc/tree/453c0afdc54b4aa00fa8e97a63f196931fdb81c4). To address this error, consider the following potential solutions:

1. Compile pgr-tk using Docker or Singularity instead of directly on your computer. Ensure that the Docker container is based on Ubuntu 20.04.

2. When cloning the pgr-tk repository, make sure to use the `--recursive` option. This will clone the AGC dependency as well.

--------------------------------------------------------------------------------
/aws_image_builder/pgr-tk.yaml:
--------------------------------------------------------------------------------
# Document Start
# This provides an AWS Image Builder component with an Ubuntu 22.04 base image
name: "pgr-tk"
description: "Image with PGR-TK pre-installed with pangenomic data"
schemaVersion: 1.0
phases:
  - name: build
    steps:
      - name: InstallBuildingToolChain
        action: ExecuteBash
        inputs:
          commands:
            - apt-get update
            - DEBIAN_FRONTEND=noninteractive
            - TZ=Etc/UTC
            - apt-get install -y build-essential git ssh curl clang-14 cmake libssl-dev libssl3 pkg-config python3-pip
            - mkdir -p /opt
            - export RUSTUP_HOME=/opt/rustup
            - export CARGO_HOME=/opt/cargo
            - bash -c "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
            - source /opt/cargo/env && rustup default stable
            - source /opt/cargo/env && cargo install --locked maturin
            - GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no" cd /opt/ && git clone --recursive https://github.com/cschin/pgr-tk.git
            - source /opt/cargo/env && cd /opt/pgr-tk/pgr-bin && cargo install --path .
            - source /opt/cargo/env && cd /opt/pgr-tk && bash build.sh
            - pip install numpy
            - pip install /opt/pgr-tk/target/wheels/pgrtk-*-*-*-linux_x86_64.whl
            - chown ubuntu:ubuntu -R ${CARGO_HOME}
            - echo source /opt/cargo/env >> /home/ubuntu/.bashrc

# Document End

--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
rustup default stable

## if necessary, you can install libclang / clang using Anaconda
## and set LIBCLANG_PATH to point to the libclang for the cbindgen dependency clang-sys
# export LIBCLANG_PATH=$HOME/miniconda3/lib

## if necessary, install maturin with `cargo install --locked maturin`
# cargo install --locked maturin

cargo build -p pgr-db --release
cargo build -p pgr-bin --release
cargo install --path pgr-bin

pushd pgr-tk/
maturin build --release
maturin build --release --skip-auditwheel
popd

--------------------------------------------------------------------------------
/build_no_agc.sh:
--------------------------------------------------------------------------------
#pushd WFA2-lib
#make all
#popd

rustup default stable
cargo build -p pgr-db --release --no-default-features
cargo build -p pgr-bin --release --no-default-features
cargo install --path pgr-bin --no-default-features

pushd pgr-tk/
maturin build --release --no-default-features
maturin build --release --skip-auditwheel --no-default-features
popd

--------------------------------------------------------------------------------
/build_no_agc_apple_silicon.sh:
--------------------------------------------------------------------------------
#pushd WFA2-lib
#make all
#popd

#rustup default stable
rustup default stable-aarch64-apple-darwin
cargo build -p pgr-db --release --no-default-features
cargo build -p pgr-bin --release --no-default-features
cargo install --path pgr-bin --no-default-features

pushd pgr-tk/
maturin build --release --no-default-features
maturin build --release --skip-auditwheel --no-default-features
popd

--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
Dockerfile.build_env-22.04

--------------------------------------------------------------------------------
/docker/Dockerfile.build_env-22.04:
--------------------------------------------------------------------------------
FROM ubuntu:22.04
RUN apt-get update
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
RUN apt-get install -y build-essential git ssh curl clang-14 cmake libssl-dev libssl3 pkg-config libzstd-dev zstd
RUN mkdir -p /opt
ENV RUSTUP_HOME=/opt/rustup
ENV CARGO_HOME=/opt/cargo
RUN RUSTUP_HOME=${RUSTUP_HOME} CARGO_HOME=${CARGO_HOME} bash -c "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
RUN . /opt/cargo/env && rustup default stable
RUN . /opt/cargo/env && cargo install --locked maturin
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
RUN . /opt/cargo/env && rustup toolchain list
RUN apt-get install -y zlib1g-dev zlib1g libdeflate-dev

--------------------------------------------------------------------------------
/docker/github-build.pub:
--------------------------------------------------------------------------------
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCRJ3R0ssNmojNFh2C72gptRMNA5+2eXEG9V8vJzPByXV55iaM4e+AqPzbPXuCRyQUh9tTp1aHx61OJOIxVKonZyAC/mL0o2VKWgBMoRmOs/WQSlY083uenUe9lkTYhCnnRWcLyxJXd6NfysJQ8odmD7ZcNq/2yA4RLcQ02OH8xZsEGxUxsC+LUaOjIMvHFP9yyoYEhY8CNDwiRCBoPJtNM48826uFP8aTvAe0OnalouA200QVQpDqHaxNvGmUg6GmwmLN2yU1DIPdXpCkCER63Uhz+kYPL6ZBDWmFc9ipr0MzaWFuLrISSmb1sAhrS/TLsBN90WK7+32bYbU4ArHPjVzR6SXobKVqf6Vd3WNbqW51b6LIuAGG85CU4hRKK51E016p47VTVwt4mFLGdQSXwRFXoXkEpNDjz2U3+gdXt+7VnsxlwUSw1lv9qZ6F8t798BVaHmurkeQKA7K0xlGYl0EBVVEnkKljl6m5Xsz7CBc7YDq6S8/YJDbCUbEPC9Nc= cschin@umf01

--------------------------------------------------------------------------------
/docker_exec_env/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:24.04
RUN apt-get update
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
RUN apt-get install -y build-essential git ssh curl clang-14 cmake libssl-dev libssl3 pkg-config libzstd-dev zstd
RUN mkdir -p /opt
ENV RUSTUP_HOME=/opt/rustup
ENV CARGO_HOME=/opt/cargo
RUN RUSTUP_HOME=${RUSTUP_HOME} CARGO_HOME=${CARGO_HOME} bash -c "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
RUN . /opt/cargo/env && rustup default stable
RUN . /opt/cargo/env && cargo install --locked maturin
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
RUN . /opt/cargo/env && rustup toolchain list
RUN apt-get install -y zlib1g-dev zlib1g libdeflate-dev
RUN cd /opt/ && git clone --recursive https://github.com/cschin/pgr-tk.git
RUN cd /opt/pgr-tk/ && . /opt/cargo/env && cargo install maturin && bash build.sh
RUN apt-get install -y python3-pip python3.12-venv
RUN cd /opt/pgr-tk/ && python3 -m venv /opt/pgr-tk-py/ && /opt/pgr-tk-py/bin/pip install target/wheels/pgrtk-0.6.0-cp312-cp312-linux_x86_64.whl numpy
RUN cd /opt/pgr-tk/ && . /opt/cargo/env && cargo clean
ENV PATH="/opt/cargo/bin:$PATH"

--------------------------------------------------------------------------------
/docker_exec_env/build.sh:
--------------------------------------------------------------------------------
docker build -t cschin/pgrtk_env:latest .

--------------------------------------------------------------------------------
/docs_src/alnmap_formap.md:
--------------------------------------------------------------------------------

The command line tool `pgr-alnmap` can generate an alignment map between a set of assembled contigs
and a reference file. It creates a number of files that are useful for downstream analysis.
This document describes the format of the generated files.

## `*.alnmap` files

A `*.alnmap` file contains the blocks mapped from the assembly contigs (also called the "query") to
the reference file (also called the "target") by the PGR-TK-WGS whole genome alignment code. (We will
describe the algorithm behind it in a separate document.) The file contains chains of alignments. Each
chain is a set of blocks.

Each chain of align blocks has an integer id, which is the first field of the line (also called a record.)
The second field is the record type:

- "B": the record represents the beginning of a chain of align blocks.
- "E": the record represents the end of a chain of align blocks.
- "M": the record represents a full alignment (no variant) block.
- "M_D": the record represents a full alignment (no variant) block; however, two or more query blocks are mapped to the same target block.
- "M_O": the record represents a full alignment (no variant) block; however, two or more query blocks are mapped to the same target block from overlapped alignment chains.
- "V": the record represents variants between the query and the target; the variant information is appended.
- "V_D": the record represents variants between the query and the target; the variant information is appended. There are other query blocks mapping to the same target block.
- "V_O": the record represents variants between the query and the target; the variant information is appended. There are other query blocks mapping to the same target block from overlapped alignment chains.
- "S": the record represents potential structural variants between the query and the target block.
- "S_D": the record represents potential structural variants between the query and the target block. There are other query blocks mapping to the same target block.
- "S_O": the record represents potential structural variants between the query and the target block. There are other query blocks mapping to the same target block from overlapped alignment chains.

All records share the following nine common fields, separated by tabs:

`aligned_chain_id, block_type, target_name, target_start, target_end, query_name, query_start, query_end, query_strand`


The following command generates the uniquely mapped blocks:

```
cat grch38_to_chm13.alnmap | awk '$2 == "V" || $2 =="M" || $2 == "S" ' | cut -f1-9 | sort -k3,3 -k4,4n -u > grch38_to_chm13_unique_blocks.alnmap

```

and this one the duplicated mapped blocks:

```
cat grch38_to_chm13.alnmap | awk '$2 == "V_D" || $2 =="M_D" || $2 == "S_D" ' | cut -f1-9 | sort -k3,3 -k4,4n -u > grch38_to_chm13_dup_blocks.alnmap
```

For the "V", "V_D" and "V_O" records, six additional fields are appended:
(`variant_position_in_the_target_block`, `variant_position_in_the_query_block`, `variant_position_in_the_target_sequence`, `variant_type`, `ref_seq`, `variant_seq`)

For the "B" and "E" records, two additional fields are appended: `query_sequence_length`, `the_alignment_orientation_of_the_contig`.

For the "S", "S_D" and "S_O" records, two additional fields are appended: `the_alignment_orientation_of_the_contig`, `sv_candidate_type`.
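
As an illustration, here is a minimal Python sketch that collects the variant ("V", "V_D", "V_O") records using the field layout described above. The column unpacking and the numeric conversions are assumptions based on this document, not a shipped API:

```python
# Minimal sketch: collect variant records from a *.alnmap file.
# Assumes the nine common fields described above, followed by the six
# appended variant fields for "V", "V_D", and "V_O" records.
import csv

def read_variants(alnmap_path):
    variants = []
    with open(alnmap_path) as f:
        for row in csv.reader(f, delimiter="\t"):
            if row[1] not in ("V", "V_D", "V_O"):
                continue
            target_name, target_start, target_end = row[2], int(row[3]), int(row[4])
            query_name, query_strand = row[5], row[8]
            # the six appended variant fields
            t_block_pos, q_block_pos = row[9], row[10]
            t_seq_pos = int(row[11])  # assumed to be an integer coordinate
            variant_type, ref_seq, variant_seq = row[12], row[13], row[14]
            variants.append((target_name, t_seq_pos, variant_type,
                             ref_seq, variant_seq, query_name, query_strand))
    return variants

if __name__ == "__main__":
    for v in read_variants("grch38_to_chm13.alnmap")[:10]:
        print(*v, sep="\t")
```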

--------------------------------------------------------------------------------
/images/AMY1A_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/images/AMY1A_example.png

--------------------------------------------------------------------------------
/images/PGR_TK_Sketch_MAPG_construction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/images/PGR_TK_Sketch_MAPG_construction.png

--------------------------------------------------------------------------------
/justfile:
--------------------------------------------------------------------------------
build_no_agc:
    #!/usr/bin/env bash
    #pushd WFA2-lib
    #make all
    #popd

    rustup default stable
    cargo build -p pgr-db --release --no-default-features
    cargo build -p pgr-bin --release --no-default-features
    cargo install --path pgr-bin --no-default-features

    pushd pgr-tk/
    maturin build --release --no-default-features
    maturin build --release --skip-auditwheel --no-default-features
    popd

install_bin_no_agc:
    cargo install --path pgr-bin --no-default-features

install_bin:
    cargo install --path pgr-bin/

build:
    #!/usr/bin/env bash
    rustup default stable

    ## if necessary, you can install libclang / clang using Anaconda
    ## and set LIBCLANG_PATH to point to the libclang for the cbindgen dependency clang-sys
    # export LIBCLANG_PATH=$HOME/miniconda3/lib

    ## if necessary, install maturin with `cargo install --locked maturin`
    # cargo install --locked maturin

    cargo build -p pgr-db --release
    cargo build -p pgr-bin --release
    cargo install --path pgr-bin

    pushd pgr-tk/
    maturin build --release
    maturin build --release --skip-auditwheel
    popd

--------------------------------------------------------------------------------
/pgr-bin/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "pgr-bin"
version = "0.6.0"
edition = "2021"
authors = ["Jason Chin "]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
pgr-db = {path = "../pgr-db", default-features = false }
flate2 = { version = "1.0.17", features = ["zlib-ng-compat"], default-features = false }
log = { version = "^0.4.5", features = ["std"] }
clap = { version = "4.0.30", features = ["derive"] }
rustc-hash = "1.1.0"
rayon = "1.5.2"
regex = "1"
svg = "0.16"
kodama = "0.2.3"
memmap2 = "0.5.10"
bincode = { version = "2.0.0-rc.1", features = ["alloc"] }
serde_json = "1.0.96"
serde = "1.0.163"
iset = "0.2.2"

[features]
default = ["with_agc"]
with_agc = ["pgr-db/with_agc"]

--------------------------------------------------------------------------------
/pgr-bin/build.rs:
--------------------------------------------------------------------------------
// from https://vallentin.dev/2019/06/06/versioning

use std::env::consts::{ARCH, OS};
use std::process::Command;

#[cfg(debug_assertions)]
const BUILD_TYPE: &str = "debug";
#[cfg(not(debug_assertions))]
const BUILD_TYPE: &'static str = "release";
"release"; 10 | 11 | fn main() { 12 | let branch_name = get_branch_name(); 13 | if branch_name != *"bioconda" { 14 | let version_string = format!( 15 | "{} {} ({}:{}{}, {} build, {} [{}] [{}])", 16 | env!("CARGO_PKG_NAME"), 17 | env!("CARGO_PKG_VERSION"), 18 | get_branch_name(), 19 | get_commit_hash(), 20 | if is_working_tree_clean() { "" } else { "+" }, 21 | BUILD_TYPE, 22 | OS, 23 | ARCH, 24 | get_rustc_version() 25 | ); 26 | 27 | println!("cargo:rustc-env=VERSION_STRING={}", version_string); 28 | } else { 29 | let version_string = format!( 30 | "{} {} (bioconda {} build, {} [{}] [{}])", 31 | env!("CARGO_PKG_NAME"), 32 | env!("CARGO_PKG_VERSION"), 33 | BUILD_TYPE, 34 | OS, 35 | ARCH, 36 | get_rustc_version() 37 | ); 38 | println!("cargo:rustc-env=VERSION_STRING={}", version_string); 39 | } 40 | } 41 | 42 | fn get_rustc_version() -> String { 43 | let output = Command::new("rustc") 44 | .arg("--version") 45 | .current_dir(env!("CARGO_MANIFEST_DIR")) 46 | .output() 47 | .unwrap(); 48 | 49 | assert!(output.status.success()); 50 | 51 | String::from_utf8_lossy(&output.stdout) 52 | .trim_end() 53 | .to_string() 54 | } 55 | 56 | fn get_commit_hash() -> String { 57 | let output = Command::new("git") 58 | .arg("log") 59 | .arg("-1") 60 | .arg("--pretty=format:%h") // Abbreviated commit hash 61 | // .arg("--pretty=format:%H") // Full commit hash 62 | .current_dir(env!("CARGO_MANIFEST_DIR")) 63 | .output() 64 | .unwrap(); 65 | 66 | // assert!(output.status.success()); 67 | if output.status.success() { 68 | String::from_utf8_lossy(&output.stdout).to_string() 69 | } else { 70 | String::from("bioconda") 71 | } 72 | } 73 | 74 | fn get_branch_name() -> String { 75 | let output = Command::new("git") 76 | .arg("rev-parse") 77 | .arg("--abbrev-ref") 78 | .arg("HEAD") 79 | .current_dir(env!("CARGO_MANIFEST_DIR")) 80 | .output() 81 | .unwrap(); 82 | 83 | //assert!(output.status.success()); 84 | if output.status.success() { 85 | String::from_utf8_lossy(&output.stdout) 86 | .trim_end() 87 | .to_string() 88 | } else { 89 | String::from("bioconda") 90 | } 91 | } 92 | 93 | fn is_working_tree_clean() -> bool { 94 | let status = Command::new("git") 95 | .arg("diff") 96 | .arg("--quiet") 97 | .arg("--exit-code") 98 | .current_dir(env!("CARGO_MANIFEST_DIR")) 99 | .status() 100 | .unwrap(); 101 | 102 | if status.success() { 103 | status.code().unwrap() == 0 104 | } else { 105 | true 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /pgr-bin/file_format_documents/ctgsv.bed.md: -------------------------------------------------------------------------------- 1 | # ctgsv.bed File Format 2 | 3 | The "ctgsv.bed" file is a tab-separated values (TSV) file that contains information about query contig alignments and potential structural variations. Each line in the file represents a region of interest in the query sequence. 4 | 5 | ## File Structure 6 | 7 | Each line in the file consists of four columns: 8 | 9 | 1. Query Name 10 | 2. Start Position 11 | 3. End Position 12 | 4. Annotation 13 | 14 | ## Column Descriptions 15 | 16 | 1. **Query Name**: The name or identifier of the query contig. 17 | 18 | 2. **Start Position**: The starting position of the region in the query contig (0-based). 19 | 20 | 3. **End Position**: The ending position of the region in the query contig (exclusive). 21 | 22 | 4. **Annotation**: A string containing information about the region, with fields separated by colons. 
   The annotation format is as follows:

   `<type>:<target_name>:<target_start>-<target_end>:<orientation>:<ctg_orientation>:<additional_info>`

   - `<type>`: Indicates the type of region:
     - `QG`: Query Gap
     - `QD`: Query Duplicate
     - `QO`: Query Overlap

   - `<target_name>`: The name of the target sequence this region aligns to. (In the examples below it is written as `<previous_anchor>><target_name>`, recording the preceding aligned target as well.)

   - `<target_start>-<target_end>`: The start and end positions in the target sequence.

   - `<orientation>`: The orientation of the alignment (0 for forward, 1 for reverse).

   - `<ctg_orientation>`: The overall orientation of the contig (0 for forward, 1 for reverse).

   - `<additional_info>`: Any additional information (may vary depending on the type).

## Example

```
contig1 0 1000 QG:BGN>chr1:500-1500:0:0:32000
contig1 1000 2000 QD:chr1>chr2:1000-2000:1:0:32000
contig1 2000 3000 QO:chr2>chr3:2000-3000:0:0:32000
contig1 3000 32000 QG:chr3>END
```

In this example:
- The first line shows a query gap at the beginning of contig1, aligning to chr1.
- The second line indicates a duplicated region in contig1, aligning to chr2 in reverse orientation.
- The third line shows an overlapping region in contig1, aligning to chr3.
- The last line represents the end of the contig, with a gap from the last alignment to the end of the sequence.

This file format provides a comprehensive view of how query contigs align to the target sequences, highlighting potential structural variations, duplications, and gaps in the query assembly.
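
The colon-separated annotation can be unpacked programmatically. Below is a small Python sketch based on the layout above; the `<previous_anchor>><target_name>` convention is an assumption drawn from the examples, and splitting naively on ":" is safe only because none of the fields here contain extra colons:

```python
# Sketch: unpack the 4th column of a ctgsv.bed record.
def parse_ctgsv_annotation(annotation):
    parts = annotation.split(":")
    region_type, target = parts[0], parts[1]
    prev_anchor, _, target_name = target.partition(">")
    record = {"type": region_type, "prev_anchor": prev_anchor,
              "target_name": target_name}
    if len(parts) >= 6:  # "QG:chr3>END"-style records carry no coordinates
        start, _, end = parts[2].partition("-")
        record.update(
            target_start=int(start), target_end=int(end),
            orientation=int(parts[3]), ctg_orientation=int(parts[4]),
            additional_info=parts[5],
        )
    return record

print(parse_ctgsv_annotation("QD:chr1>chr2:1000-2000:1:0:32000"))
```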

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/for_pgr-alnmap/alnmap.md:
--------------------------------------------------------------------------------
# alnmap Output File Documentation

The alnmap output file contains detailed information about the alignment between a reference sequence and query sequences. Each line in the file represents a different type of record, with fields separated by tabs.

## Record Types

1. **Begin Record (B)**
   Format: `<aln_idx>\tB\t<target_name>\t<target_start>\t<target_end>\t<query_name>\t<query_start>\t<query_end>\t<orientation>\t<query_length>\t<contig_orientation>\t<target_duplicate>\t<target_overlap>\t<query_duplicate>\t<query_overlap>`

2. **End Record (E)**
   Format: `<aln_idx>\tE\t<target_name>\t<target_start>\t<target_end>\t<query_name>\t<query_start>\t<query_end>\t<orientation>\t<query_length>\t<contig_orientation>`

3. **Match Record (M, M_D, or M_O)**
   Format: `<aln_idx>\t<match_type>\t<target_name>\t<target_start>\t<target_end>\t<query_name>\t<query_start>\t<query_end>\t<orientation>`

4. **SV Candidate Record (S, S_D, or S_O)**
   Format: `<aln_idx>\t<sv_type>\t<target_name>\t<target_start>\t<target_end>\t<query_name>\t<query_start>\t<query_end>\t<orientation>\t<contig_orientation>\t<diff_type>`

5. **Variant Record (V, V_D, or V_O)**
   Format: `<aln_idx>\t<variant_type>\t<target_name>\t<target_start>\t<target_end>\t<query_name>\t<query_start>\t<query_end>\t<orientation>\t<target_diff>\t<query_diff>\t<target_coord>\t<variant_type>\t<target_variant_seq>\t<query_variant_seq>`

## Field Descriptions

- `aln_idx`: Alignment index (6-digit zero-padded number)
- `target_name`: Name of the reference sequence
- `target_start`: Start position in the reference sequence
- `target_end`: End position in the reference sequence
- `query_name`: Name of the query sequence
- `query_start`: Start position in the query sequence
- `query_end`: End position in the query sequence
- `orientation`: Orientation of the alignment (0 for forward, 1 for reverse)
- `query_length`: Length of the query sequence
- `contig_orientation`: Orientation of the contig
- `target_duplicate`: Whether the target region is duplicated (1) or not (0)
- `target_overlap`: Whether the target region overlaps (1) or not (0)
- `query_duplicate`: Whether the query region is duplicated (1) or not (0)
- `query_overlap`: Whether the query region overlaps (1) or not (0)
- `match_type`: Type of match (M, M_D, or M_O)
- `sv_type`: Type of structural variant candidate (S, S_D, or S_O)
- `diff_type`: Type of difference (A: FailAln, E: FailEndMatch, S: FailShortSeq, L: FailLengthDiff, U: Unknown)
- `variant_type`: Type of variant (V, V_D, or V_O)
- `target_diff`: Difference in the target sequence
- `query_diff`: Difference in the query sequence
- `target_coord`: Coordinate in the target sequence
- `variant_type`: Type of variant (single character)
- `target_variant_seq`: Variant sequence in the target
- `query_variant_seq`: Variant sequence in the query

## Notes

- Records with `_D` suffix indicate duplicated regions
- Records with `_O` suffix indicate overlapping regions
- The order of records in the file follows the alignment process

This documentation should help users understand the structure and content of the alnmap output file generated by the provided code.

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/for_pgr-alnmap/ctgmap.bed.md:
--------------------------------------------------------------------------------
# *.ctgmap.bed File Format

The *.ctgmap.bed file is a tab-separated values (TSV) file that describes the alignment of query contigs to a reference genome. It follows a modified BED format with additional fields to provide detailed information about the alignments.

## File Structure

Each line in the file represents a single alignment between a query contig and the reference genome. The fields are separated by tabs and are ordered as follows:

1. Reference sequence name
2. Start position on the reference (0-based)
3. End position on the reference (exclusive)
4. Additional information (colon-separated)

## Fields

1. **Reference sequence name**: The name of the reference sequence (chromosome or scaffold).

2. **Start position**: The start position of the alignment on the reference sequence (0-based).

3. **End position**: The end position of the alignment on the reference sequence (exclusive).

4. **Additional information**: A colon-separated string containing the following fields:
   a. Query sequence name
   b. Start position on the query sequence
   c. End position on the query sequence
   d. Query contig length
   e. Orientation (0 for forward, 1 for reverse)
   f. Contig orientation (0 for forward, 1 for reverse)
   g. Target duplication flag (0 for unique, 1 for duplicated)
   h. Target overlap flag (0 for non-overlapping, 1 for overlapping)
   i. Query duplication flag (0 for unique, 1 for duplicated)
   j. Query overlap flag (0 for non-overlapping, 1 for overlapping)

## Example

```
chr1 1000 2000 contig1:500:1500:3000:0:0:0:0:0:0
```

This example line can be interpreted as follows:
- The alignment is on reference sequence "chr1" from position 1000 to 2000.
- The query contig name is "contig1".
- The alignment covers positions 500 to 1500 on the query contig.
- The total length of the query contig is 3000 base pairs.
- The alignment is in the forward orientation (0) for both the reference and the query.
- The alignment is unique and non-overlapping on both the reference and the query (all flags are 0).

## Usage

This file format is useful for visualizing and analyzing the alignment of query contigs to a reference genome. It can be used to identify structural variations, assess the quality of genome assemblies, and compare different genome versions or assemblies.
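
To make the column order concrete, here is a short Python sketch that reads a `*.ctgmap.bed` file and reports, for each query contig, the fraction of its length covered by alignments. The input file name is a placeholder, and the simple sum ignores overlaps between alignments:

```python
# Sketch: per-contig aligned fraction from a *.ctgmap.bed file.
from collections import defaultdict

def aligned_fraction(path):
    covered = defaultdict(int)
    ctg_len = {}
    with open(path) as f:
        for line in f:
            ref_name, ref_start, ref_end, info = line.rstrip("\n").split("\t")
            fields = info.split(":")
            q_name = fields[0]
            q_start, q_end, q_len = int(fields[1]), int(fields[2]), int(fields[3])
            covered[q_name] += q_end - q_start  # ignores overlapping alignments
            ctg_len[q_name] = q_len
    return {name: covered[name] / ctg_len[name] for name in ctg_len}

for name, frac in sorted(aligned_fraction("asm_to_ref.ctgmap.bed").items()):
    print(f"{name}\t{frac:.3f}")
```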

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/for_pgr-alnmap/ctgmap.json.md:
--------------------------------------------------------------------------------
# ctgmap.json Documentation

The `ctgmap.json` file contains information about contig mappings between a reference genome and query contigs. It is generated by the `pgr-alnmap` tool and provides detailed alignment information in a JSON format.

## File Structure

The `ctgmap.json` file contains a single JSON object with the following structure:

```json
{
  "records": [...],
  "query_length": [...],
  "target_length": [...]
}
```

### Records

The `records` field is an array of objects, where each object represents a contig mapping. Each mapping object has the following structure:

```json
{
  "t_name": "string",
  "ts": number,
  "te": number,
  "q_name": "string",
  "qs": number,
  "qe": number,
  "ctg_len": number,
  "orientation": number,
  "ctg_orientation": number,
  "t_dup": boolean,
  "t_ovlp": boolean,
  "q_dup": boolean,
  "q_ovlp": boolean
}
```

- `t_name`: Name of the target (reference) sequence
- `ts`: Start position of the alignment on the target sequence
- `te`: End position of the alignment on the target sequence
- `q_name`: Name of the query contig
- `qs`: Start position of the alignment on the query contig
- `qe`: End position of the alignment on the query contig
- `ctg_len`: Length of the query contig
- `orientation`: Orientation of the alignment (0 for forward, 1 for reverse)
- `ctg_orientation`: Orientation of the entire contig alignment
- `t_dup`: Boolean indicating if the target region is duplicated
- `t_ovlp`: Boolean indicating if the target region overlaps with other alignments
- `q_dup`: Boolean indicating if the query region is duplicated
- `q_ovlp`: Boolean indicating if the query region overlaps with other alignments

### Query Length

The `query_length` field is an array of tuples containing information about query contig lengths:

```json
[
  [id, "name", length],
  ...
]
```

- `id`: Numeric identifier for the query contig
- `name`: Name of the query contig
- `length`: Length of the query contig in base pairs

### Target Length

The `target_length` field is an array of tuples containing information about target (reference) sequence lengths:

```json
[
  [id, "name", length],
  ...
]
```

- `id`: Numeric identifier for the target sequence
- `name`: Name of the target sequence
- `length`: Length of the target sequence in base pairs

## Usage

The `ctgmap.json` file can be used for various downstream analyses, including:

1. Visualizing contig alignments
2. Identifying potential structural variations
3. Assessing the quality of genome assemblies
4. Comparing different assemblies or genome versions

To work with the `ctgmap.json` file, you can use any JSON parser in your preferred programming language to load and process the data.

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/for_pgr-alnmap/svcnd.bed.md:
--------------------------------------------------------------------------------
The "svcnd.bed" file is a BED (Browser Extensible Data) format file that contains information about structural variant candidates (SVCs) and alignment regions. Each line in the file represents a feature and has four tab-separated fields:

1. Chromosome/Contig name (target sequence name)
2. Start position (0-based)
3. End position (1-based)
4. Feature annotation

The feature annotation field contains detailed information about the SV candidate or alignment region. It can have different formats depending on the type of feature:

1. SV Candidates:
   Format: `<SVC_TYPE>:<QUERY_NAME>:<QUERY_START>-<QUERY_END>:<ORIENTATION>:<CTG_ORIENTATION>:<DIFF_TYPE>`

   - SVC_TYPE: Can be "SVC" (regular SV candidate), "SVC_D" (SV in duplicated region), or "SVC_O" (SV in overlapped region)
   - QUERY_NAME: Name of the query sequence
   - QUERY_START and QUERY_END: Start and end positions in the query sequence
   - ORIENTATION: Alignment orientation (0 for forward, 1 for reverse)
   - CTG_ORIENTATION: Contig orientation
   - DIFF_TYPE: Type of difference ('A' for alignment failure, 'E' for end mismatch, 'S' for short sequence, 'L' for length difference)

2. Target Alignment Regions:
   Format: `<TYPE>:<PREV_CTG>><NEXT_CTG>:<QUERY_START>:<QUERY_END>:<CTG_LEN>:<ORIENTATION>:<CTG_ORIENTATION>`

   - TYPE: Can be "TG" (gap), "TD" (duplication), or "TO" (overlap)
   - PREV_CTG and NEXT_CTG: Names of the previous and next contigs
   - QUERY_START and QUERY_END: Start and end positions in the query sequence
   - CTG_LEN: Length of the contig
   - ORIENTATION: Alignment orientation
   - CTG_ORIENTATION: Contig orientation

The "svcnd.bed" file combines information about structural variant candidates and alignment regions, providing a comprehensive view of potential genomic variations and how query sequences align to the target reference. This format allows for easy visualization in genome browsers and can be used for further analysis of structural variations and alignment characteristics.
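
For orientation, the sketch below tallies SV-candidate records per target sequence and per `DIFF_TYPE`, following the annotation layout just described; the input file name is a placeholder:

```python
# Sketch: count SV candidates per target and difference type in a svcnd.bed file.
from collections import Counter

counts = Counter()
with open("sample.svcnd.bed") as f:
    for line in f:
        target, start, end, annotation = line.rstrip("\n").split("\t")
        fields = annotation.split(":")
        if fields[0] in ("SVC", "SVC_D", "SVC_O"):
            diff_type = fields[-1]  # 'A', 'E', 'S', or 'L'
            counts[(target, fields[0], diff_type)] += 1

for (target, svc_type, diff_type), n in sorted(counts.items()):
    print(target, svc_type, diff_type, n, sep="\t")
```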

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/gfa_format.md:
--------------------------------------------------------------------------------
# GFA Output Format in PGR-TK

## Overview

PGR-TK generates Graph Fragment Assembly (GFA) files to represent pangenome assembly graphs. These graphs are based on the Minimizer Anchored Pangenome (MAP) concept, which uses shimmer pairs (minimizers) to anchor paths through the genome.

Two main types of GFA files are produced:
- **MAP Graph GFA** (.mapg.gfa) - representing the full minimizer-anchored paths graph
- **Principal MAP Graph GFA** (.pmapg.gfa) - representing a simplified version with principal bundles

## File Format Specification

GFA files in PGR-TK follow the standard GFA specification with customizations for MAP graphs:

### Header Line (H)
```
H  VN:Z:1.0  CM:Z:Sparse Genome Graph Generated By pgr-tk
```
- `VN:Z:1.0` - Version number of the format
- `CM:Z:...` - Comment describing the file

### Segments (S lines)
Segments represent nodes in the graph:

```
S  <segment_id>  *  LN:i:<length>  SN:Z:<shimmer_pair>  [BN:i:<bundle_id>  BP:i:<bundle_pos>]
```

Fields:
- `<segment_id>` - Numerical identifier for the segment
- `*` - Placeholder for sequence (not stored in this representation)
- `LN:i:<length>` - Length of the segment (average length plus k-mer size)
- `SN:Z:<shimmer_pair>` - Hexadecimal representation of the shimmer pair (hash0_hash1)

Optional fields for principal bundle GFA:
- `BN:i:<bundle_id>` - Principal bundle identifier
- `BP:i:<bundle_pos>` - Position within the bundle

### Links (L lines)
Links represent edges connecting segments:

```
L  <segment_id1>  <orientation1>  <segment_id2>  <orientation2>  <cigar>  SC:i:<count>
```

Fields:
- `<segment_id1>`, `<segment_id2>` - IDs of connected segments
- `<orientation1>`, `<orientation2>` - Orientation (`+` or `-`)
- `<cigar>` - CIGAR string (typically `kM` where k is the k-mer size)
- `SC:i:<count>` - Number of sequences supporting this connection

## MAP Graph Index Format (.mapg.idx)

The MAP Graph index (.mapg.idx) file complements the GFA representation with a tab-delimited text format. Each line begins with a single character that identifies the record type:

### K Record (SHIMMER Parameters)
```
K\t<w>\t<k>\t<r>\t<min_span>\t<sketch>
```

Fields:
- `w` - Window size parameter (integer)
- `k` - K-mer size parameter (integer)
- `r` - Reduction factor parameter (integer)
- `min_span` - Minimum span length parameter (integer)
- `sketch` - Boolean flag (true/false) indicating if sketching was used

### C Record (Contig Information)
```
C\t<id>\t<contig_name>\t<source>\t<length>
```

Fields:
- `id` - Unique identifier for the contig (integer)
- `contig_name` - Name of the contig
- `source` - Source sample name or "NA" if not available
- `length` - Length of the contig (integer)

### F Record (Fragment Information)
```
F\t<shimmer_pair_id>\t<seq_id>\t<direction>\t<start>\t<end>\t<fragment_id>
```

Fields:
- `shimmer_pair_id` - SHIMMER pair identifier in hex format (e.g., "0123456789ABCDEF_0123456789ABCDEF")
- `seq_id` - Sequence identifier (integer)
- `direction` - Direction/orientation of the fragment (integer)
- `start` - Start position of the fragment (integer)
- `end` - End position of the fragment (integer)
- `fragment_id` - Fragment identifier (integer)

The index file allows for efficient loading and querying of the graph structure without having to parse the entire GFA file, which can be much larger.
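
As a usage illustration, this Python sketch parses the three record types of a `.mapg.idx` file into simple structures, following the tab-separated layouts above. The field names mirror this document, and the file name is a placeholder:

```python
# Sketch: read K/C/F records from a .mapg.idx file.
def read_mapg_idx(path):
    params, contigs, fragments = {}, {}, []
    with open(path) as f:
        for line in f:
            rec = line.rstrip("\n").split("\t")
            if rec[0] == "K":
                w, k, r, min_span, sketch = rec[1:6]
                params = dict(w=int(w), k=int(k), r=int(r),
                              min_span=int(min_span), sketch=(sketch == "true"))
            elif rec[0] == "C":
                cid, name, source, length = rec[1:5]
                contigs[int(cid)] = (name, source, int(length))
            elif rec[0] == "F":
                pair_id, seq_id, direction, start, end, frag_id = rec[1:7]
                fragments.append((pair_id, int(seq_id), int(direction),
                                  int(start), int(end), int(frag_id)))
    return params, contigs, fragments

params, contigs, fragments = read_mapg_idx("sample.mapg.idx")
print(params, len(contigs), "contigs,", len(fragments), "fragments")
```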

## Use Cases

- Representing complex genomic regions across multiple samples
- Analyzing structural variations and sequence relationships
- Visualizing pangenome structure
- Identifying shared genomic regions (principal bundles)

The GFA format enables visualization and algorithmic analysis of the graph structure, facilitating exploration of sequence relationships across multiple genomes, particularly in repetitive and structurally complex regions.

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/input_file_formats_for_pgr-pbundle-bed.md:
--------------------------------------------------------------------------------
# Input File Formats for pgr-pbundle-bed2svg

## 1. Principal Bundle BED File (Required)

This is the main input file containing bundle information.

- Format: Tab-separated values
- Each line represents a bundle segment
- Fields:
  1. Contig name
  2. Start position
  3. End position
  4. Bundle information (colon-separated):
     - Bundle ID
     - Bundle vertex count (not used)
     - Bundle direction (0 or 1)
     - Bundle vertex start (not used)
     - Bundle vertex end (not used)

Example:
```
ctg1 1000 2000 1:5:0:1000:2000
ctg1 3000 4000 2:3:1:3000:4000
```

## 2. Annotation File (Optional)

Provides additional annotation text for each contig.

- Format: Tab-separated values
- Fields:
  1. Contig name
  2. Annotation text

Example:
```
ctg1 Annotation for contig 1
ctg2 Annotation for contig 2
```

## 3. Annotation Region BED File (Optional)

Defines regions for annotation tracks.

- Format: Tab-separated values
- Fields:
  1. Contig name
  2. Start position
  3. End position
  4. Title
  5. Color

Example:
```
ctg1 5000 6000 Region_A #FF0000
ctg1 7000 8000 Region_B #00FF00
```


## 4. Offset File (Optional)

Provides offset values for each contig.

- Format: Tab-separated values
- Fields:
  1. Contig name
  2. Offset value (integer)

Example:
```
ctg1 1000
ctg2 -500
```


## 5. Dendrogram File (Optional)

Describes the hierarchical clustering of contigs.

- Format: Tab-separated values
- Three types of lines:
  1. Leaf nodes (L):
     - L [node_id] [contig_name]
  2. Internal nodes (I):
     - I [node_id] [child_node0] [child_node1] [node_size] [node_height]
  3. Node positions (P):
     - P [node_id] [node_position] [node_height] [node_size]

Example:
```
L 1 ctg1
L 2 ctg2
I 3 1 2 2 0.5
P 1 0.0 0.0 1
P 2 1.0 0.0 1
P 3 0.5 0.5 2
```
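
The dendrogram file can be loaded into a small tree structure for plotting or ordering. The following Python sketch follows the L/I/P record definitions above; the file name is illustrative:

```python
# Sketch: parse a dendrogram file with L (leaf), I (internal), and P (position) records.
def read_dendrogram(path):
    leaves, internals, positions = {}, {}, {}
    with open(path) as f:
        for line in f:
            rec = line.split()
            if rec[0] == "L":
                leaves[int(rec[1])] = rec[2]  # node_id -> contig_name
            elif rec[0] == "I":
                # node_id -> (child0, child1, node_size, node_height)
                internals[int(rec[1])] = (int(rec[2]), int(rec[3]),
                                          int(rec[4]), float(rec[5]))
            elif rec[0] == "P":
                # node_id -> (position, height, size)
                positions[int(rec[1])] = (float(rec[2]), float(rec[3]), int(rec[4]))
    return leaves, internals, positions

leaves, internals, positions = read_dendrogram("sample.dendrogram")
print(len(leaves), "leaves,", len(internals), "internal nodes")
```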

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/output_files_for_pgr-pbundle-decomp.md:
--------------------------------------------------------------------------------
# Output Files for pgr-pbundle-decomp

The `pgr-pbundle-decomp` tool generates several output files when decomposing principal bundles from sequence data. Each file serves a specific purpose in the analysis pipeline. Below is a comprehensive description of each output file format:

## 1. Principal Bundle BED File (`[prefix].bed`)

This file contains the principal bundle decomposition results in BED format, which identifies regions in the genome that share similar sequence patterns. For more detailed information, see [Principal Bundle BED File Format](principal_bundle_bed_file.md).

### Format:
```
<contig_name>\t<start>\t<end>\t<bundle_id>:<bundle_size>:<direction>:<start_pos>:<end_pos>:<repeat_flag>
```

### Fields:
- `contig_name`: Name of the contig or chromosome
- `start`: 0-based start position of the bundle on the contig (inclusive)
- `end`: 0-based end position of the bundle on the contig (exclusive)
- `bundle_id`: Unique identifier for the bundle
- `bundle_size`: Number of sequences contained in the bundle
- `direction`: Direction of the bundle (0 for forward, 1 for reverse)
- `start_pos`: Start position within the principal bundle coordinate system
- `end_pos`: End position within the principal bundle coordinate system
- `repeat_flag`: Classification flag - 'R' for repeat regions, 'U' for unique regions

The first line of the file contains a comment with the command used to run the tool.
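
Here is a compact Python sketch that groups the bundle segments of such a BED file by contig, using the nine components described above. It is an illustration of the format with a placeholder file name, not part of the tool suite:

```python
# Sketch: group principal-bundle BED records by contig.
from collections import defaultdict

def read_bundle_bed(path):
    per_ctg = defaultdict(list)
    with open(path) as f:
        for line in f:
            if line.startswith("#"):  # first line holds the generating command
                continue
            ctg, start, end, info = line.rstrip("\n").split("\t")
            bundle_id, bundle_size, direction, bstart, bend, repeat_flag = info.split(":")
            per_ctg[ctg].append((int(start), int(end), int(bundle_id),
                                 int(direction), repeat_flag == "R"))
    return per_ctg

bundles = read_bundle_bed("sample.bed")
for ctg, segments in bundles.items():
    n_repeat = sum(1 for s in segments if s[4])
    print(f"{ctg}: {len(segments)} segments, {n_repeat} in repeat bundles")
```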

## 2. Contig Summary File (`[prefix].ctg.summary.tsv`)

This tab-separated file provides detailed statistical information about bundle distribution across each contig, useful for quantitative analysis. This file is often used in conjunction with the principal bundle BED file for comprehensive analysis.

### Header:
```
#ctg length repeat_bundle_count repeat_bundle_sum repeat_bundle_percentage repeat_bundle_mean repeat_bundle_min repeat_bundle_max non_repeat_bundle_count non_repeat_bundle_sum non_repeat_bundle_percentage non_repeat_bundle_mean non_repeat_bundle_min non_repeat_bundle_max total_bundle_count total_bundle_coverage_percentage
```


### Fields:
- `ctg`: Contig name
- `length`: Total length of the contig in base pairs
- `repeat_bundle_count`: Number of repeat bundles identified
- `repeat_bundle_sum`: Total base pairs covered by repeat bundles
- `repeat_bundle_percentage`: Percentage of contig covered by repeat bundles
- `repeat_bundle_mean`: Mean length of repeat bundles
- `repeat_bundle_min`: Minimum length of repeat bundles
- `repeat_bundle_max`: Maximum length of repeat bundles
- `non_repeat_bundle_count`: Number of non-repeat (unique) bundles
- `non_repeat_bundle_sum`: Total base pairs covered by non-repeat bundles
- `non_repeat_bundle_percentage`: Percentage of contig covered by non-repeat bundles
- `non_repeat_bundle_mean`: Mean length of non-repeat bundles
- `non_repeat_bundle_min`: Minimum length of non-repeat bundles
- `non_repeat_bundle_max`: Maximum length of non-repeat bundles
- `total_bundle_count`: Total number of bundles (repeat + non-repeat)
- `total_bundle_coverage_percentage`: Percentage of contig covered by all bundles

### Example:
```
#ctg length repeat_bundle_count repeat_bundle_sum repeat_bundle_percentage repeat_bundle_mean repeat_bundle_min repeat_bundle_max non_repeat_bundle_count non_repeat_bundle_sum non_repeat_bundle_percentage non_repeat_bundle_mean non_repeat_bundle_min non_repeat_bundle_max total_bundle_count total_bundle_coverage_percentage
chr1 248956422 2156 42568912 17.1 19744 500 125680 5842 192458700 77.3 32943 200 258462 7998 94.4
```

## 3. MAP Graph GFA File (.mapg.gfa)

This file contains the Minimizer Anchor Profile (MAP) graph in GFA (Graphical Fragment Assembly) format. For detailed information about the GFA format used in PGR-TK, see [GFA Output Format in PGR-TK](gfa_format.md).

## 4. MAP Graph Index File (.mapg.idx)

This file contains the index for the MAP graph, enabling efficient querying and traversal. The index format is described in detail in the [GFA Output Format in PGR-TK](gfa_format.md) document.

## 5. Principal MAP Graph GFA File (.pmapg.gfa)

This file contains the principal MAP graph in GFA format, which is a simplified version focusing on the principal bundles. See [GFA Output Format in PGR-TK](gfa_format.md) for format details.

## 6. Principal Bundle Data File (`[prefix].pdb`)

This binary file contains the complete principal bundle data, essential for downstream analysis tools in the PGR-TK suite. It includes:

- SHIMMER parameters used for minimizer generation (k-mer size, window size, reduction factor)
- Bundle information with coordinate mappings between reference and bundle spaces
- Fragment boundary information
- Bundle connectivity data
- Sequence mapping metadata

## Related Documentation

- [Principal Bundle BED File Format](principal_bundle_bed_file.md) - Detailed explanation of the principal bundle BED file format
- [Input File Formats for pgr-pbundle-bed2svg](input_file_formats_for_pgr-pbundle-bed.md) - Documentation for the input files required by pgr-pbundle-bed2svg
- [GFA Output Format in PGR-TK](gfa_format.md) - Information about the GFA format used in PGR-TK
- [Contig SV BED Format](ctgsv.bed.md) - Documentation for the contig SV BED format, another important file format in the PGR-TK ecosystem

--------------------------------------------------------------------------------
/pgr-bin/file_format_documents/principal_bundle_bed_file.md:
--------------------------------------------------------------------------------
# Principal Bundle BED File Format

## Overview

This document describes the format of principal bundle BED files generated by the `pgr-pbundle-decomp` tool. These files contain decomposition information about genomic regions organized into principal bundles, which represent similar sequence regions across different contigs or assemblies.

## File Format

The principal bundle BED file is a tab-separated text file where each line represents a bundle region in a contig.

### Header

The file begins with a comment line containing the command used to generate the file:
```
# cmd: <command_used_to_generate_the_file>
```

### Data Columns

Each subsequent line contains the following tab-separated fields:

1. **Contig Name** (string)
   - The name of the contig or sequence

2. **Start Position** (integer)
   - The 0-based start position of the bundle region on the contig

3. **End Position** (integer)
   - The end position of the bundle region on the contig

4. **Bundle Information** (string)
   - A colon-separated string containing the following components:

   a. **Bundle ID** (integer)
      - The unique identifier for the bundle

   b. **Bundle Size** (integer)
      - The total size of the bundle in base pairs

   c. **Direction** (0 or 1)
      - 0: Forward direction
      - 1: Reverse direction (reverse complement)

   d.
**Start Position in Bundle** (integer) 45 | - The start position of this region within the bundle coordinates 46 | 47 | e. **End Position in Bundle** (integer) 48 | - The end position of this region within the bundle coordinates 49 | 50 | f. **Repeat Status** (R or U) 51 | - R: Repeat region 52 | - U: Unique (non-repeat) region 53 | 54 | ## Example 55 | 56 | ``` 57 | contig1 1000 2000 42:5000:0:100:200:U 58 | ``` 59 | 60 | This line indicates: 61 | - Region is on `contig1` 62 | - Region spans from position 1000 to 2000 on the contig 63 | - Region belongs to bundle with ID 42 64 | - The total bundle size is 5000 bp 65 | - Region is in the forward direction (0) 66 | - Within the bundle, the region spans from position 100 to 200 67 | - The region is unique (U), not a repeat 68 | 69 | ## Usage 70 | 71 | Principal bundle BED files are typically used for: 72 | - Analyzing sequence similarity across different assemblies 73 | - Identifying structural variants 74 | - Visualizing genome alignments 75 | - Mapping coordinates between different assemblies 76 | 77 | For related tools that work with these files, see: 78 | - `pgr-pbundle-bed2svg`: Generates SVG visualizations from bundle BED files 79 | - `pgr-pbundle-bed2dist`: Calculates distances between bundle regions 80 | - `pgr-pbundle-bed2sorted`: Sorts bundle BED files for more efficient processing 81 | - `pgr-pbundle-bed2offset`: Computes coordinate offsets between bundle regions -------------------------------------------------------------------------------- /pgr-bin/src/_bin/README.txt: -------------------------------------------------------------------------------- 1 | experimental binaries 2 | -------------------------------------------------------------------------------- /pgr-bin/src/_bin/pgr-fasta-smp-count.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 2 | 3 | //use std::path::PathBuf; 4 | use clap::{self, CommandFactory, Parser}; 5 | 6 | use pgr_db::shmmrutils::ShmmrSpec; 7 | use std::fs::File; 8 | use std::io::{BufWriter, Write}; 9 | 10 | use pgr_db::seq_db; 11 | 12 | #[derive(Parser, Debug)] 13 | #[clap(name = "pgr-seq-smp-count")] 14 | #[clap(author, version)] 15 | #[clap(about = "count shimmer pairs from fasta files", long_about = None)] 16 | struct CmdOptions { 17 | #[clap(long, short)] 18 | in_fasta: String, 19 | #[clap(long, short)] 20 | output_path: String, 21 | //max_unique_count 22 | #[clap(long, short, default_value_t = 4)] 23 | min_count: usize, 24 | #[clap(long, short, default_value_t = 31)] 25 | w: u32, 26 | #[clap(long, short, default_value_t = 31)] 27 | k: u32, 28 | #[clap(long, short, default_value_t = 1)] 29 | r: u32, 30 | #[clap(long, default_value_t = 0)] 31 | min_span: u32, 32 | } 33 | 34 | fn main() -> Result<(), std::io::Error> { 35 | CmdOptions::command().version(VERSION_STRING).get_matches(); 36 | let args = CmdOptions::parse(); 37 | let filepath = args.in_fasta; 38 | let spec = ShmmrSpec { 39 | w: args.w, 40 | k: args.k, 41 | r: args.r, 42 | min_span: args.min_span, 43 | sketch: false, 44 | }; 45 | let mut sdb = seq_db::CompactSeqDB::new(spec.clone()); 46 | sdb.load_seqs_from_fastx(filepath)?; 47 | let mut out_buf = BufWriter::new(File::create(args.output_path)?); 48 | sdb.frag_map 49 | .into_iter() 50 | .try_for_each(|(k, v)| -> Result<(), std::io::Error> { 51 | let c = v.len(); 52 | if c >= args.min_count { 53 | out_buf.write_fmt(format_args!("{:016x} {:016x} {}\n", k.0, k.1, c))?; 54 | }; 55 | Ok(()) 56 | 
})?; 57 | Ok(()) 58 | } 59 | -------------------------------------------------------------------------------- /pgr-bin/src/_bin/pgr-filter.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 2 | use clap::{self, CommandFactory, Parser}; 3 | use flate2::bufread::MultiGzDecoder; 4 | use pgr_db::fasta_io::{FastaReader, FastaStreamReader, FastqStreamReader, SeqRec}; 5 | use pgr_db::kmer_filter::MinimizerFilter; 6 | use rayon::prelude::*; 7 | use std::fs::File; 8 | use std::io::{self, BufReader, Read}; 9 | 10 | enum GZFastaReader { 11 | GZFile(FastaReader<BufReader<MultiGzDecoder<BufReader<File>>>>), 12 | RegularFile(FastaReader<BufReader<BufReader<File>>>), 13 | } 14 | 15 | #[derive(Parser, Debug)] 16 | #[clap(name = "pgr-filter")] 17 | #[clap(author, version)] 18 | #[clap(about = "using a minimizer filter for matching reads to a reference set of sequences", long_about = None)] 19 | struct CmdOptions { 20 | ref_fasta_path: String, 21 | #[clap(long, short)] 22 | query_fastx_path: Option<String>, 23 | /// k-mer size 24 | #[clap(long, short, default_value_t = 32)] 25 | k: usize, 26 | /// count threshold 27 | #[clap(long, short, default_value_t = 0.8)] 28 | threshold: f32, 29 | #[clap(long)] 30 | fasta_stdin: bool, 31 | } 32 | 33 | fn get_fastx_reader(filepath: String) -> Result<GZFastaReader, std::io::Error> { 34 | let file = File::open(&filepath)?; 35 | let mut reader = BufReader::new(file); 36 | let mut is_gzfile = false; 37 | { 38 | let r = reader.by_ref(); 39 | let mut buf = Vec::<u8>::new(); 40 | let _ = r.take(2).read_to_end(&mut buf); 41 | if buf == [0x1F_u8, 0x8B_u8] { 42 | log::info!("input file: {} detected as gz-compressed file", filepath); 43 | is_gzfile = true; 44 | } 45 | } 46 | drop(reader); 47 | 48 | let file = File::open(&filepath)?; 49 | let reader = BufReader::new(file); 50 | let gz_buf = BufReader::new(MultiGzDecoder::new(reader)); 51 | 52 | let file = File::open(&filepath)?; 53 | let reader = BufReader::new(file); 54 | let std_buf = BufReader::new(reader); 55 | 56 | if is_gzfile { 57 | drop(std_buf); 58 | Ok(GZFastaReader::GZFile( 59 | FastaReader::new(gz_buf, &filepath, 256, false).unwrap(), 60 | )) 61 | } else { 62 | drop(gz_buf); 63 | Ok(GZFastaReader::RegularFile( 64 | FastaReader::new(std_buf, &filepath, 256, false).unwrap(), 65 | )) 66 | } 67 | } 68 | 69 | fn main() -> Result<(), std::io::Error> { 70 | CmdOptions::command().version(VERSION_STRING).get_matches(); 71 | let args = CmdOptions::parse(); 72 | //let mut filter = KmerFilter::with_capacity(args.k, 1_usize << 24); 73 | let mut filter = MinimizerFilter::new(args.k); 74 | let mut add_seqs = |seq_iter: &mut dyn Iterator<Item = io::Result<SeqRec>>| { 75 | seq_iter.into_iter().for_each(|r| { 76 | if let Ok(r) = r { 77 | filter.add_seq_mmers(&r.seq); 78 | }; 79 | }); 80 | }; 81 | 82 | match get_fastx_reader(args.ref_fasta_path)? 
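// The reference set is streamed once to build the minimizer filter; both
// match arms below call the same `add_seqs` closure and differ only in
// whether the reader decompresses gzip input.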
{ 83 | GZFastaReader::GZFile(reader) => add_seqs(&mut reader.into_iter()), 84 | 85 | GZFastaReader::RegularFile(reader) => add_seqs(&mut reader.into_iter()), 86 | }; 87 | 88 | let check_seqs = |seq_iter: &mut dyn Iterator<Item = io::Result<SeqRec>>| { 89 | let mut seq_data = Vec::<SeqRec>::new(); 90 | for r in seq_iter { 91 | if let Ok(r) = r { 92 | seq_data.push(r); 93 | }; 94 | if seq_data.len() == 64 { 95 | seq_data 96 | .par_iter() 97 | .map(|r| { 98 | let (total, c) = filter.check_seq_mmers(&r.seq); 99 | (r.clone(), total, c) 100 | }) 101 | .collect::<Vec<(SeqRec, usize, usize)>>() 102 | .iter() 103 | .for_each(|(r, total, c)| { 104 | if *total > 0 { 105 | if (*c as f32) / (*total as f32) > args.threshold { 106 | println!(">{} {} {}", String::from_utf8_lossy(&r.id), total, c); 107 | println!("{}", String::from_utf8_lossy(&r.seq[..])); 108 | } 109 | } 110 | }); 111 | seq_data.clear(); 112 | } 113 | } 114 | 115 | seq_data 116 | .into_par_iter() 117 | .map(|r| { 118 | let (total, c) = filter.check_seq_mmers(&r.seq); 119 | (r, total, c) 120 | }) 121 | .collect::<Vec<(SeqRec, usize, usize)>>() 122 | .iter() 123 | .for_each(|(r, total, c)| { 124 | if *total > 0 { 125 | if (*c as f32) / (*total as f32) > args.threshold { 126 | println!(">{} {} {}", String::from_utf8_lossy(&r.id), total, c); 127 | println!("{}", String::from_utf8_lossy(&r.seq[..])); 128 | } 129 | } 130 | }); 131 | }; 132 | 133 | if args.query_fastx_path.is_some() { 134 | match get_fastx_reader(args.query_fastx_path.unwrap())? { 135 | GZFastaReader::GZFile(reader) => check_seqs(&mut reader.into_iter()), 136 | GZFastaReader::RegularFile(reader) => check_seqs(&mut reader.into_iter()), 137 | } 138 | } else { 139 | if args.fasta_stdin { 140 | let reader = FastaStreamReader::new(256); 141 | check_seqs(&mut reader.into_iter()); 142 | } else { 143 | let reader = FastqStreamReader::new(256); 144 | check_seqs(&mut reader.into_iter()); 145 | } 146 | } 147 | 148 | Ok(()) 149 | } 150 | -------------------------------------------------------------------------------- /pgr-bin/src/_bin/pgr-multifilter.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 2 | use clap::{self, CommandFactory, Parser}; 3 | use flate2::bufread::MultiGzDecoder; 4 | use pgr_db::kmer_filter::KmerFilter; 5 | use pgr_db::fasta_io::{reverse_complement, FastaReader, FastqStreamReader, SeqRec}; 6 | use rayon::prelude::*; 7 | use rustc_hash::FxHashMap; 8 | use std::fs::File; 9 | use std::io::{self, BufRead, BufReader, BufWriter, Error, ErrorKind, Read, Write}; 10 | 11 | 12 | enum GZFastaReader { 13 | GZFile(FastaReader<BufReader<MultiGzDecoder<BufReader<File>>>>), 14 | RegularFile(FastaReader<BufReader<BufReader<File>>>), 15 | } 16 | 17 | #[derive(Parser, Debug)] 18 | #[clap(name = "pgr-multi-filter")] 19 | #[clap(author, version)] 20 | #[clap(about = "using Cuckoo Filter for Matching Reads To A Reference Set of Sequences", long_about = None)] 21 | struct CmdOptions { 22 | ref_fasta_list: String, 23 | prefix: String, 24 | #[clap(long, short)] 25 | query_fastx_path: Option<String>, 26 | /// k-mer size 27 | #[clap(long, short, default_value_t = 32)] 28 | k: usize, 29 | /// count threshold 30 | #[clap(long, short, default_value_t = 4)] 31 | threshold: usize, 32 | } 33 | 34 | fn get_fastx_reader(filepath: String) -> Result<GZFastaReader, std::io::Error> { 35 | let file = File::open(&filepath)?; 36 | let mut reader = BufReader::new(file); 37 | let mut is_gzfile = false; 38 | { 39 | let r = reader.by_ref(); 40 | let mut buf = Vec::<u8>::new(); 41 | let _ = r.take(2).read_to_end(&mut buf); 42 | if buf == [0x1F_u8, 0x8B_u8] { 43 | log::info!("input file: {} detected as 
gz-compressed file", filepath); 44 | is_gzfile = true; 45 | } 46 | } 47 | drop(reader); 48 | 49 | let file = File::open(&filepath)?; 50 | let reader = BufReader::new(file); 51 | let gz_buf = BufReader::new(MultiGzDecoder::new(reader)); 52 | 53 | let file = File::open(&filepath)?; 54 | let reader = BufReader::new(file); 55 | let std_buf = BufReader::new(reader); 56 | 57 | if is_gzfile { 58 | drop(std_buf); 59 | Ok(GZFastaReader::GZFile( 60 | FastaReader::new(gz_buf, &filepath, 256, false).unwrap(), 61 | )) 62 | } else { 63 | drop(gz_buf); 64 | Ok(GZFastaReader::RegularFile( 65 | FastaReader::new(std_buf, &filepath, 256, false).unwrap(), 66 | )) 67 | } 68 | } 69 | 70 | fn main() -> Result<(), std::io::Error> { 71 | CmdOptions::command().version(VERSION_STRING).get_matches(); 72 | let args = CmdOptions::parse(); 73 | let mut filters = FxHashMap::::default(); 74 | 75 | let add_seqs = |filter: &mut KmerFilter, 76 | seq_iter: &mut dyn Iterator>| { 77 | seq_iter.into_iter().for_each(|r| { 78 | if let Ok(r) = r { 79 | filter.add_seq(&r.seq); 80 | let rc_seq = reverse_complement(&r.seq); 81 | filter.add_seq(&rc_seq); 82 | }; 83 | }); 84 | }; 85 | 86 | let inputs = BufReader::new(File::open(args.ref_fasta_list)?); 87 | inputs 88 | .lines() 89 | .into_iter() 90 | .try_for_each(|line| -> Result<(), std::io::Error> { 91 | match line { 92 | Ok(line) => { 93 | let fields = line.split("\t").into_iter().collect::>(); 94 | if fields.len() != 2 { 95 | return Err(Error::new(ErrorKind::Other, "can't read the input file")); 96 | } 97 | let fileanme = fields[0]; 98 | let suffix = fields[1]; 99 | let mut filter = KmerFilter::with_capacity(args.k, 1_usize << 24); 100 | match get_fastx_reader(fileanme.to_string())? { 101 | GZFastaReader::GZFile(reader) => { 102 | add_seqs(&mut filter, &mut reader.into_iter()) 103 | } 104 | 105 | GZFastaReader::RegularFile(reader) => { 106 | add_seqs(&mut filter, &mut reader.into_iter()) 107 | } 108 | }; 109 | filters.insert(suffix.to_string(), filter); 110 | 111 | Ok(()) 112 | } 113 | Err(e) => Err(e), 114 | } 115 | })?; 116 | 117 | let check_seqs = |seq_iter: &mut dyn Iterator>| { 118 | let mut seq_data = Vec::::new(); 119 | for r in seq_iter { 120 | if let Ok(r) = r { 121 | seq_data.push(r); 122 | } 123 | } 124 | 125 | filters.iter().for_each(|(suffix, filter)| { 126 | let mut writer = BufWriter::new( 127 | File::create(args.prefix.clone() + "_" + &suffix.clone()[..] + ".fa") 128 | .expect("file creating error"), 129 | ); 130 | 131 | (&seq_data) 132 | .into_par_iter() 133 | .filter(|&r| { 134 | let c = filter.check_seq(&r.seq); 135 | c >= args.threshold 136 | }) 137 | .collect::>() 138 | .iter() 139 | .for_each(|r| { 140 | write!(writer, ">{}\n", String::from_utf8_lossy(&r.id)).expect("writing error"); 141 | write!(writer, "{}\n", String::from_utf8_lossy(&r.seq[..])).expect("writing error"); 142 | }); 143 | }); 144 | 145 | }; 146 | 147 | if args.query_fastx_path.is_some() { 148 | match get_fastx_reader(args.query_fastx_path.unwrap())? 
{ 149 | GZFastaReader::GZFile(reader) => check_seqs(&mut reader.into_iter()), 150 | GZFastaReader::RegularFile(reader) => check_seqs(&mut reader.into_iter()), 151 | } 152 | } else { 153 | let reader = FastqStreamReader::new(128); 154 | check_seqs(&mut reader.into_iter()); 155 | } 156 | 157 | Ok(()) 158 | } 159 | -------------------------------------------------------------------------------- /pgr-bin/src/_bin/pgr-probe-match.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 2 | use clap::{self, CommandFactory, Parser}; 3 | use flate2::bufread::MultiGzDecoder; 4 | use pgr_db::fasta_io::{reverse_complement, FastaReader, FastqStreamReader, SeqRec}; 5 | use rayon::prelude::*; 6 | use rustc_hash::FxHashMap; 7 | use std::fs::File; 8 | use std::io::{self, BufRead, BufReader, Read}; 9 | 10 | #[derive(Parser, Debug)] 11 | #[clap(name = "pgr-probe-match")] 12 | #[clap(author, version)] 13 | #[clap(about = "matching reads against a set of variant/tag probe sequences", long_about = None)] 14 | struct CmdOptions { 15 | probe_file_path: String, 16 | #[clap(long, short)] 17 | query_fastx_path: Option<String>, 18 | } 19 | enum GZFastaReader { 20 | GZFile(FastaReader<BufReader<MultiGzDecoder<BufReader<File>>>>), 21 | RegularFile(FastaReader<BufReader<BufReader<File>>>), 22 | } 23 | 24 | #[derive(Clone)] 25 | struct ProbeInfo { 26 | vname: String, 27 | vprobe: Vec<u8>, 28 | vprobe_r: Vec<u8>, 29 | t1name: String, 30 | t1probe: Vec<u8>, 31 | t1probe_r: Vec<u8>, 32 | t2name: String, 33 | t2probe: Vec<u8>, 34 | t2probe_r: Vec<u8>, 35 | } 36 | 37 | fn get_fastx_reader(filepath: String) -> Result<GZFastaReader, std::io::Error> { 38 | let file = File::open(&filepath)?; 39 | let mut reader = BufReader::new(file); 40 | let mut is_gzfile = false; 41 | { 42 | let r = reader.by_ref(); 43 | let mut buf = Vec::<u8>::new(); 44 | let _ = r.take(2).read_to_end(&mut buf); 45 | if buf == [0x1F_u8, 0x8B_u8] { 46 | log::info!("input file: {} detected as gz-compressed file", filepath); 47 | is_gzfile = true; 48 | } 49 | } 50 | drop(reader); 51 | 52 | let file = File::open(&filepath)?; 53 | let reader = BufReader::new(file); 54 | let gz_buf = BufReader::new(MultiGzDecoder::new(reader)); 55 | 56 | let file = File::open(&filepath)?; 57 | let reader = BufReader::new(file); 58 | let std_buf = BufReader::new(reader); 59 | 60 | if is_gzfile { 61 | drop(std_buf); 62 | Ok(GZFastaReader::GZFile( 63 | FastaReader::new(gz_buf, &filepath, 256, false).unwrap(), 64 | )) 65 | } else { 66 | drop(gz_buf); 67 | Ok(GZFastaReader::RegularFile( 68 | FastaReader::new(std_buf, &filepath, 256, false).unwrap(), 69 | )) 70 | } 71 | } 72 | 73 | fn main() -> Result<(), std::io::Error> { 74 | CmdOptions::command().version(VERSION_STRING).get_matches(); 75 | let args = CmdOptions::parse(); 76 | let probe_reader = BufReader::new(File::open(args.probe_file_path)?); 77 | let mut all_probes = FxHashMap::<String, ProbeInfo>::default(); 78 | probe_reader 79 | .lines() 80 | .into_iter() 81 | .for_each(|line| match line { 82 | Ok(line) => { 83 | let line = line.trim_end(); 84 | let mut fields = line.split("\t"); 85 | let vname = fields.next().expect("error parsing").to_string(); 86 | let tmp = fields.next().expect("error parsing"); 87 | let vprobe = tmp.as_bytes().to_vec(); 88 | let vprobe_r = reverse_complement(&vprobe); 89 | let t1name = fields.next().expect("error parsing").to_string(); 90 | let tmp = fields.next().expect("error parsing"); 91 | let t1probe = tmp.as_bytes().to_vec(); 92 | let t1probe_r = reverse_complement(&t1probe); 93 | let t2name = fields.next().expect("error parsing").to_string(); 94
| let tmp = fields.next().expect("error parsing"); 95 | let t2probe = tmp.as_bytes().to_vec(); 96 | let t2probe_r = reverse_complement(&t2probe); 97 | let probeset = ProbeInfo { 98 | vname: vname.clone(), 99 | vprobe, 100 | vprobe_r, 101 | t1name, 102 | t1probe, 103 | t1probe_r, 104 | t2name, 105 | t2probe, 106 | t2probe_r, 107 | }; 108 | all_probes.insert(vname, probeset); 109 | } 110 | _ => {} 111 | }); 112 | 113 | let match_probe = |seq: &Vec<u8>, probe: &Vec<u8>| -> bool { 114 | let plen = probe.len(); 115 | let mut flag = false; 116 | for i in 0..seq.len() - plen { 117 | if seq[i..i + plen] == probe[..] { 118 | flag = true; 119 | break; 120 | } 121 | } 122 | flag 123 | }; 124 | 125 | let check_seqs = |seq_iter: &mut dyn Iterator<Item = io::Result<SeqRec>>| { 126 | let mut seq_data = Vec::<SeqRec>::new(); 127 | for r in seq_iter { 128 | if let Ok(r) = r { 129 | seq_data.push(r); 130 | } 131 | } 132 | 133 | all_probes.into_iter().for_each(|(_vname, probe_info)| { 134 | let mut count = (0_usize, 0_usize, 0_usize); 135 | (&seq_data) 136 | .into_par_iter() 137 | .filter(|&r| { 138 | match_probe(&r.seq, &probe_info.vprobe) 139 | || match_probe(&r.seq, &probe_info.vprobe_r) 140 | }) 141 | .collect::<Vec<&SeqRec>>() 142 | .into_iter() 143 | .for_each(|r| { 144 | count.0 += 1; 145 | if match_probe(&r.seq, &probe_info.t1probe) 146 | || match_probe(&r.seq, &probe_info.t1probe_r) 147 | { 148 | count.1 += 1; 149 | } 150 | if match_probe(&r.seq, &probe_info.t2probe) 151 | || match_probe(&r.seq, &probe_info.t2probe_r) 152 | { 153 | count.2 += 1; 154 | } 155 | }); 156 | println!( 157 | "{} {} {} {} {} {}", 158 | probe_info.vname, count.0, probe_info.t1name, count.1, probe_info.t2name, count.2 159 | ); 160 | }); 161 | }; 162 | 163 | if args.query_fastx_path.is_some() { 164 | match get_fastx_reader(args.query_fastx_path.unwrap())? 
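// Tally, for each probe set, how many query reads contain the variant probe
// (on either strand), and of those, how many also carry tag probe 1 or tag
// probe 2; the counts are printed one line per probe set.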
{ 165 | GZFastaReader::GZFile(reader) => check_seqs(&mut reader.into_iter()), 166 | GZFastaReader::RegularFile(reader) => check_seqs(&mut reader.into_iter()), 167 | } 168 | } else { 169 | let reader = FastqStreamReader::new(128); 170 | check_seqs(&mut reader.into_iter()); 171 | } 172 | 173 | Ok(()) 174 | } 175 | -------------------------------------------------------------------------------- /pgr-bin/src/_bin/pgr-shmmr-pair-count.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &'static str = env!("VERSION_STRING"); 2 | 3 | //use std::path::PathBuf; 4 | use clap::{self, CommandFactory, Parser}; 5 | 6 | use rayon::prelude::*; 7 | use rustc_hash::FxHashMap; 8 | use std::collections::HashMap; 9 | use std::collections::HashSet; 10 | use std::fs::File; 11 | use std::io::{BufRead, BufReader, BufWriter, Write}; 12 | 13 | use pgr_db::seq_db; 14 | 15 | #[derive(Parser, Debug)] 16 | #[clap(name = "pgr-shmmr-pair-count")] 17 | #[clap(author, version)] 18 | #[clap(about = "count shimmer pairs in a shimmer database", long_about = None)] 19 | struct CmdOptions { 20 | prefix: String, 21 | output_path: String, 22 | //max_unique_count 23 | #[clap(long, short, default_value_t = 1)] 24 | max_unique_count: usize, 25 | } 26 | 27 | fn main() -> Result<(), std::io::Error> { 28 | CmdOptions::command().version(VERSION_STRING).get_matches(); 29 | let args = CmdOptions::parse(); 30 | let (_shmmr_spec, shmmr_pair_to_frags) = 31 | seq_db::read_mdb_file(args.prefix.clone() + ".mdb").unwrap(); 32 | let mut seq_index = HashMap::<(String, Option<String>), (u32, u32)>::new(); 33 | let mut seq_info = HashMap::<u32, (String, Option<String>, u32)>::new(); 34 | let midx_file = BufReader::new(File::open(args.prefix.clone() + ".midx")?); 35 | 36 | let mut sources = HashSet::<String>::new(); 37 | midx_file 38 | .lines() 39 | .into_iter() 40 | .try_for_each(|line| -> Result<(), std::io::Error> { 41 | let line = line.unwrap(); 42 | let mut line = line.as_str().split("\t"); 43 | let sid = line.next().unwrap().parse::<u32>().unwrap(); 44 | let len = line.next().unwrap().parse::<u32>().unwrap(); 45 | let ctg_name = line.next().unwrap().to_string(); 46 | let source = line.next().unwrap().to_string(); 47 | sources.insert(source.clone()); 48 | seq_index.insert((ctg_name.clone(), Some(source.clone())), (sid, len)); 49 | seq_info.insert(sid, (ctg_name, Some(source), len)); 50 | Ok(()) 51 | })?; 52 | 53 | let source_to_id = sources 54 | .iter() 55 | .enumerate() 56 | .map(|v| (v.1.clone(), v.0 as u32)) 57 | .collect::<HashMap<String, u32>>(); 58 | 59 | let mut sid_to_source_id_lookup = vec![0_u32; seq_info.len()]; 60 | 61 | seq_info.iter().for_each(|(k, v)| { 62 | sid_to_source_id_lookup[*k as usize] = *source_to_id.get(v.1.as_ref().unwrap()).unwrap(); 63 | }); 64 | let mut out_file = BufWriter::new(File::create(args.output_path)?); 65 | let out_vec = shmmr_pair_to_frags 66 | .par_iter() 67 | .map(|(k, v)| { 68 | let mut count = FxHashMap::<u32, usize>::default(); 69 | v.iter().for_each(|v| { 70 | let sid = (*v).1; 71 | let source_id = *sid_to_source_id_lookup.get(sid as usize).unwrap(); 72 | *count.entry(source_id).or_insert(0) += 1; 73 | }); 74 | let v = count 75 | .into_iter() 76 | .filter(|(_k, v)| { 77 | let muc = args.max_unique_count; 78 | if *v > muc { 79 | false 80 | } else { 81 | true 82 | } 83 | }) 84 | .count(); 85 | (k.0, k.1, v) 86 | }) 87 | .collect::<Vec<(u64, u64, usize)>>(); 88 | 89 | out_vec 90 | .iter() 91 | .try_for_each(|v| -> Result<(), std::io::Error> { 92 | writeln!(&mut out_file, "{} {} {}", v.0, v.1, v.2)?; 93 | Ok(()) 94 | })?; 95 | 96 | Ok(()) 97 | } 98 | 
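// A hypothetical invocation (paths are illustrative): given `sample.mdb` and
// `sample.midx` built with the same prefix, `pgr-shmmr-pair-count sample
// counts.txt --max-unique-count 2` writes one `shmmr0 shmmr1 source_count`
// line per shimmer pair, counting only sources with at most 2 hits each.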
-------------------------------------------------------------------------------- /pgr-bin/src/_bin/pgr-test.rs: -------------------------------------------------------------------------------- 1 | use flate2::bufread::MultiGzDecoder; 2 | use pgr_db::agc_io::AGCFile; 3 | use pgr_db::fasta_io::FastaReader; 4 | use std::collections::HashMap; 5 | use std::fs::File; 6 | use std::io::{BufRead, BufReader, Read}; 7 | 8 | use pgr_db::seq_db::{self, query_fragment, read_mdb_file}; 9 | 10 | pub fn load_seqs() -> HashMap> { 11 | let mut seqs = HashMap::>::new(); 12 | //let filepath = "test/test_data/test_seqs.fa"; 13 | let filepath = "/wd/peregrine-r-ext/phasing_test/PanMHCgraph/HPRCy1.MHC.fa"; 14 | let file = File::open(filepath.to_string()).unwrap(); 15 | let mut reader = BufReader::new(file); 16 | let mut is_gzfile = false; 17 | { 18 | let r = reader.by_ref(); 19 | let mut buf = Vec::::new(); 20 | let _ = r.take(2).read_to_end(&mut buf); 21 | if buf == [0x1F_u8, 0x8B_u8] { 22 | log::info!("input file detected as gz-compressed file",); 23 | is_gzfile = true; 24 | } 25 | } 26 | drop(reader); 27 | 28 | let file = File::open(&filepath).unwrap(); 29 | let mut reader = BufReader::new(file); 30 | let gz_buf = &mut BufReader::new(MultiGzDecoder::new(&mut reader)); 31 | 32 | let file = File::open(&filepath).unwrap(); 33 | let reader = BufReader::new(file); 34 | let std_buf = &mut BufReader::new(reader); 35 | 36 | let fastx_buf: &mut dyn BufRead = if is_gzfile { 37 | drop(std_buf); 38 | gz_buf 39 | } else { 40 | drop(gz_buf); 41 | std_buf 42 | }; 43 | 44 | let mut fastx_reader = 45 | FastaReader::new(fastx_buf, &filepath.to_string(), 1 << 14, true).unwrap(); 46 | while let Some(rec) = fastx_reader.next_rec() { 47 | let rec = rec.unwrap(); 48 | let seqname = String::from_utf8_lossy(&rec.id).into_owned(); 49 | seqs.insert(seqname, rec.seq.clone()); 50 | } 51 | seqs 52 | } 53 | 54 | fn _load_seq_test() { 55 | let seqs = load_seqs(); 56 | let mut sdb = seq_db::CompactSeqDB::new(seq_db::SHMMRSPEC); 57 | let _shmmr_spec = &pgr_db::seq_db::SHMMRSPEC; 58 | let _ = sdb.load_seqs_from_fastx( 59 | "/wd/peregrine-r-ext/phasing_test/PanMHCgraph/HPRCy1.MHC.fa".to_string(), 60 | ); 61 | //println!("test"); 62 | for seq in sdb.seqs.iter() { 63 | println!("S {} {} {}", seq.name, seq.id, seq.len); 64 | //println!(); 65 | //println!("{}", seq.name); 66 | let reconstruct_seq = sdb.get_seq(&seq); 67 | let orig_seq = seqs.get(&seq.name).unwrap(); 68 | if reconstruct_seq != *orig_seq { 69 | //println!("{}", seq.name); 70 | //println!("{:?}", reconstruct_seq); 71 | //println!("{:?}", orig_seq); 72 | for i in 0..reconstruct_seq.len() { 73 | if orig_seq[i] != reconstruct_seq[i] { 74 | println!("{} {} {} X", i, orig_seq[i], reconstruct_seq[i]); 75 | } else { 76 | println!("{} {} {} ", i, orig_seq[i], reconstruct_seq[i]); 77 | } 78 | } 79 | } else { 80 | println!("{} matched", seq.name); 81 | }; 82 | assert_eq!(reconstruct_seq, *orig_seq); 83 | } 84 | for (shmmr_pair, frg_ids) in sdb.frag_map.into_iter() { 85 | for ids in frg_ids { 86 | println!( 87 | "M {:016X} {:016X} {} {} {} {} {}", 88 | shmmr_pair.0, shmmr_pair.1, ids.0, ids.1, ids.2, ids.3, ids.4 89 | ); 90 | } 91 | } 92 | } 93 | 94 | fn _load_index_from_fastx() -> Result<(), std::io::Error> { 95 | let mut sdb = seq_db::CompactSeqDB::new(seq_db::SHMMRSPEC); 96 | let filelist = File::open("./filelist").unwrap(); 97 | 98 | BufReader::new(filelist).lines().into_iter().for_each(|fp| { 99 | let fp = fp.unwrap(); 100 | let _ = sdb.load_index_from_fastx(fp); 101 | }); 102 | 103 | 
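// Persist the shimmer map index to disk, then dump the sequence (S) and
// shimmer-pair match (M) records so the index contents can be inspected.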
seq_db::write_shmr_map_file(&sdb.shmmr_spec, &sdb.frag_map, "test.db".to_string())?; 104 | 105 | for seq in sdb.seqs.iter() { 106 | println!("S {} {} {}", seq.name, seq.id, seq.len); 107 | } 108 | for (shmmr_pair, frg_ids) in sdb.frag_map.into_iter() { 109 | for ids in frg_ids { 110 | println!( 111 | "M {:016X} {:016X} {} {} {} {} {}", 112 | shmmr_pair.0, shmmr_pair.1, ids.0, ids.1, ids.2, ids.3, ids.4 113 | ); 114 | } 115 | } 116 | Ok(()) 117 | } 118 | 119 | fn load_index_from_agcfile() -> Result<(), std::io::Error> { 120 | let mut sdb = seq_db::CompactSeqDB::new(seq_db::SHMMRSPEC); 121 | let filelist = File::open("./filelist").unwrap(); 122 | 123 | BufReader::new(filelist).lines().into_iter().try_for_each( 124 | |fp| -> Result<(), std::io::Error> { 125 | let fp = fp.unwrap(); 126 | let agcfile = AGCFile::new(fp)?; 127 | let _ = sdb.load_index_from_agcfile(agcfile); 128 | Ok(()) 129 | }, 130 | )?; 131 | 132 | //seq_db::write_shmr_map_file(&sdb.frag_map, "test.db".to_string()); 133 | sdb.write_shmr_map_index("test".to_string())?; 134 | Ok(()) 135 | } 136 | 137 | fn _load_index_mdb() -> Result<(), std::io::Error> { 138 | let agcfile = AGCFile::new(String::from("grch38.agc"))?; 139 | for sample in agcfile.samples.iter() { 140 | for contig in sample.contigs.iter() { 141 | let (_n, _t) = contig; 142 | //println!("{}:{}:{}", sample.name, n, t); 143 | } 144 | } 145 | let seq_mhc = agcfile.get_sub_seq("GCA_000001405.15_GRCh38_no_alt_analysis_set".to_string(), 146 | "chr6 AC:CM000668.2 gi:568336018 LN:170805979 rl:Chromosome M5:5691468a67c7e7a7b5f2a3a683792c29 AS:GRCh38".to_string(), 147 | 28510120, 33480577); 148 | // println!("MHC seq len: {}", MHCseq.len()); 149 | let (_shmmr_spec, new_map) = read_mdb_file("test.db".to_string()).unwrap(); 150 | let shmmr_spec = &pgr_db::seq_db::SHMMRSPEC; 151 | let r_frags = query_fragment(&new_map, &seq_mhc, shmmr_spec); 152 | let mut out = vec![]; 153 | for res in r_frags { 154 | for v in res.2 { 155 | //println!("Q {:?} {:?} {:?}", res.0, res.1, v); 156 | out.push((v, res.1, res.0)) 157 | } 158 | } 159 | out.sort(); 160 | for (v0, v1, _) in out { 161 | println!( 162 | "Q {} {} {} {} {} {} {} {}", 163 | v0.0, v0.1, v0.2, v0.3, v0.4, v1.0, v1.1, v1.2 164 | ); 165 | } 166 | Ok(()) 167 | } 168 | 169 | fn main() -> Result<(), std::io::Error> { 170 | //load_seq_test(); 171 | //load_index_from_fastx(); 172 | load_index_from_agcfile()?; 173 | //load_index_mdb(); 174 | Ok(()) 175 | } 176 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-annotate-bed-file.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &str = env!("VERSION_STRING"); 2 | use clap::{self, CommandFactory, Parser}; 3 | use flate2::bufread::MultiGzDecoder; 4 | use iset::IntervalMap; 5 | //use rayon::prelude::*; 6 | use rustc_hash::{FxHashMap, FxHashSet}; 7 | use std::fs::File; 8 | use std::io::{BufRead, BufReader, BufWriter, Write}; 9 | use std::path::Path; 10 | 11 | /// Align long contigs and identify potential SV regions with respect to the reference fasta file 12 | #[derive(Parser, Debug)] 13 | #[clap(name = "pgr-annotate-bed-file")] 14 | #[clap(author, version)] 15 | #[clap(about, long_about = None)] 16 | struct CmdOptions { 17 | /// path to the the a bed file 18 | bed_path: String, 19 | /// path to the annotation file (gzipped) 20 | annotation_path: String, 21 | /// the prefix of the output files 22 | output_path: String, 23 | /// type 24 | #[clap(long, default_value = "transcript")] 25 
| feature: String, 26 | /// number of threads used in parallel (more memory usage), default to "0" using all CPUs available or the number set by RAYON_NUM_THREADS 27 | #[clap(long, default_value_t = 0)] 28 | number_of_thread: usize, 29 | } 30 | fn main() -> Result<(), std::io::Error> { 31 | CmdOptions::command().version(VERSION_STRING).get_matches(); 32 | let args = CmdOptions::parse(); 33 | 34 | rayon::ThreadPoolBuilder::new() 35 | .num_threads(args.number_of_thread) 36 | .build_global() 37 | .unwrap(); 38 | 39 | let mut reader = BufReader::new(File::open(Path::new(&args.annotation_path)).unwrap()); 40 | 41 | let annotation_reader = BufReader::new(MultiGzDecoder::new(&mut reader)); 42 | let mut annotation_interval = FxHashMap::>::default(); 43 | // we support https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.ncbiRefSeq.gtf.gz for now 44 | annotation_reader.lines().for_each(|line| { 45 | if let Ok(line) = line { 46 | let err_msg = format!("faile to parse on {}", line); 47 | let fields = line.split('\t').collect::>(); 48 | let chr = fields[0].to_string(); 49 | let f_type = fields[2].to_string(); 50 | let fs = fields[3].parse::().expect(&err_msg); 51 | let fe = fields[4].parse::().expect(&err_msg) + 1; 52 | let strand = fields[6].chars().next().expect(&err_msg); 53 | let attribute = fields[8].to_string(); 54 | if f_type == args.feature { 55 | let e = annotation_interval 56 | .entry(chr) 57 | .or_insert(IntervalMap::::default()); 58 | e.insert(fs..fe, (strand, attribute)); 59 | } 60 | } 61 | }); 62 | 63 | let mut out_bed = BufWriter::new(File::create(Path::new(&args.output_path)).unwrap()); 64 | 65 | let bed_reader = BufReader::new(File::open(Path::new(&args.bed_path)).unwrap()); 66 | bed_reader.lines().for_each(|line| { 67 | if let Ok(line) = line { 68 | if line.starts_with('#') { 69 | return; 70 | }; 71 | let err_msg = format!("faile to parse on {}", line); 72 | let fields = line.split('\t').collect::>(); 73 | let chr = fields[0].to_string(); 74 | let bgn = fields[1].parse::().expect(&err_msg); 75 | let end = fields[2].parse::().expect(&err_msg); 76 | let annotation = fields[3].to_string(); 77 | if let Some(i_map) = annotation_interval.get(&chr) { 78 | // TODO, we only pick the first overlap for now 79 | let mut annotations = FxHashSet::::default(); 80 | for (_strand, attributes) in i_map.values(bgn..end) { 81 | // TODO: need a proper parser 82 | let attributes = attributes.trim_end_matches(';').to_string(); 83 | let a_fields = attributes.split(';').collect::>(); 84 | let gn = a_fields.last().unwrap().to_string(); 85 | let gn = gn.split(' ').collect::>(); 86 | let gn = gn.last().unwrap().to_string(); 87 | let gn = gn.trim_matches('"'); 88 | annotations.insert(gn.to_string()); 89 | }; 90 | if annotations.is_empty() { return }; 91 | let gn = annotations.into_iter().collect::>().join("/"); 92 | 93 | 94 | writeln!( 95 | out_bed, 96 | "{}\t{}\t{}\t{}>{}", 97 | chr, bgn, end, annotation, gn 98 | ) 99 | .expect("fail to write the vcf file"); 100 | }; 101 | } 102 | }); 103 | 104 | Ok(()) 105 | } 106 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-annotate-vcf-file.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &str = env!("VERSION_STRING"); 2 | use clap::{self, CommandFactory, Parser}; 3 | use flate2::bufread::MultiGzDecoder; 4 | use iset::IntervalMap; 5 | //use rayon::prelude::*; 6 | use rustc_hash::{FxHashMap, FxHashSet}; 7 | use std::fs::File; 8 | use 
std::io::{BufRead, BufReader, BufWriter, Write}; 9 | use std::path::Path; 10 | 11 | /// Align long contigs and identify potential SV regions with respect to the reference fasta file 12 | #[derive(Parser, Debug)] 13 | #[clap(name = "pgr-annotate-vcf-file")] 14 | #[clap(author, version)] 15 | #[clap(about, long_about = None)] 16 | struct CmdOptions { 17 | /// path to the the a vcf file 18 | vcf_path: String, 19 | /// path to the annotation file (gzipped) 20 | annotation_path: String, 21 | /// the prefix of the output files 22 | output_path: String, 23 | /// number of threads used in parallel (more memory usage), default to "0" using all CPUs available or the number set by RAYON_NUM_THREADS 24 | #[clap(long, default_value_t = 0)] 25 | number_of_thread: usize, 26 | } 27 | fn main() -> Result<(), std::io::Error> { 28 | CmdOptions::command().version(VERSION_STRING).get_matches(); 29 | let args = CmdOptions::parse(); 30 | 31 | rayon::ThreadPoolBuilder::new() 32 | .num_threads(args.number_of_thread) 33 | .build_global() 34 | .unwrap(); 35 | 36 | let mut reader = BufReader::new(File::open(Path::new(&args.annotation_path)).unwrap()); 37 | 38 | let annotation_reader = BufReader::new(MultiGzDecoder::new(&mut reader)); 39 | let mut annotation_interval = FxHashMap::>::default(); 40 | // we support https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.ncbiRefSeq.gtf.gz for now 41 | annotation_reader.lines().for_each(|line| { 42 | if let Ok(line) = line { 43 | let err_msg = format!("faile to parse on {}", line); 44 | let fields = line.split('\t').collect::>(); 45 | let chr = fields[0].to_string(); 46 | let f_type = fields[2].to_string(); 47 | let fs = fields[3].parse::().expect(&err_msg); 48 | let fe = fields[4].parse::().expect(&err_msg); 49 | let strand = fields[6].chars().next().expect(&err_msg); 50 | let attribute = fields[8].to_string(); 51 | if f_type == "transcript" { 52 | let e = annotation_interval 53 | .entry(chr) 54 | .or_insert(IntervalMap::::default()); 55 | e.insert(fs..fe, (strand, attribute)); 56 | } 57 | } 58 | }); 59 | 60 | let mut out_vcf = BufWriter::new(File::create(Path::new(&args.output_path)).unwrap()); 61 | writeln!(out_vcf, "##fileformat=VCFv4.2").expect("fail to write the vcf file"); 62 | writeln!( 63 | out_vcf, 64 | r#"##INFO="# 65 | ) 66 | .expect("fail to write the vcf"); 67 | writeln!( 68 | out_vcf, 69 | r#"##FORMAT="# 70 | ) 71 | .expect("fail to write the vcf file"); 72 | writeln!( 73 | out_vcf, 74 | "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE" 75 | ) 76 | .expect("fail to write the vcf file"); 77 | 78 | let vcf_reader = BufReader::new(File::open(Path::new(&args.vcf_path)).unwrap()); 79 | vcf_reader.lines().for_each(|line| { 80 | if let Ok(line) = line { 81 | if line.starts_with('#') { 82 | return; 83 | }; 84 | let err_msg = format!("faile to parse on {}", line); 85 | let fields = line.split('\t').collect::>(); 86 | let chr = fields[0].to_string(); 87 | let pos = fields[1].parse::().expect(&err_msg); 88 | if let Some(i_map) = annotation_interval.get(&chr) { 89 | // TODO, we only pick the first overlap for now 90 | let mut annotations = FxHashSet::::default(); 91 | for (_rng, (_strand, attributes)) in i_map.overlap(pos) { 92 | // TODO: need a proper parser 93 | let attributes = attributes.trim_end_matches(';').to_string(); 94 | let a_fields = attributes.split(';').collect::>(); 95 | let gn = a_fields.last().unwrap().to_string(); 96 | let gn = gn.split(' ').collect::>(); 97 | let gn = gn.last().unwrap().to_string(); 98 | let gn = 
gn.trim_matches('"'); 99 | annotations.insert(gn.to_string()); 100 | }; 101 | if annotations.is_empty() { return }; 102 | let gn = annotations.into_iter().collect::>().join("/"); 103 | 104 | let tvs = fields[3]; 105 | let qvs = fields[4]; 106 | let gt = fields[9]; 107 | writeln!( 108 | out_vcf, 109 | "{}\t{}\t.\t{}\t{}\t60\tPASS\tGN={}\tGT\t{}", 110 | chr, pos, tvs, qvs, gn, gt, 111 | ) 112 | .expect("fail to write the vcf file"); 113 | }; 114 | } 115 | }); 116 | 117 | Ok(()) 118 | } 119 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-fetch-seqs.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &str = env!("VERSION_STRING"); 2 | use clap::{self, CommandFactory, Parser}; 3 | use pgr_db::ext::SeqIndexDB; 4 | use pgr_db::fasta_io; 5 | use std::fs::File; 6 | use std::io::{self, BufRead, BufReader, BufWriter, Write}; 7 | use std::path::Path; 8 | 9 | /// List or fetch sequences from a PGR-TK database 10 | #[derive(Parser, Debug)] 11 | #[clap(name = "pgr-fetch-seqs")] 12 | #[clap(author, version)] 13 | #[clap(about, long_about = None)] 14 | struct CmdOptions { 15 | /// the prefix to a PGR-TK sequence database 16 | pgr_db_prefix: String, 17 | 18 | /// using the frg format for the sequence database (default to the AGC backend database if not specified) 19 | #[clap(long, default_value_t = false)] 20 | frg_file: bool, 21 | 22 | /// the regions file path 23 | #[clap(short, long, default_value=None)] 24 | region_file: Option, 25 | 26 | /// output file name 27 | #[clap(short, long, default_value=None)] 28 | output_file: Option, 29 | 30 | /// list all sequence source, contig names in the database 31 | #[clap(long, default_value_t = false)] 32 | list: bool, 33 | } 34 | 35 | fn main() -> Result<(), std::io::Error> { 36 | CmdOptions::command().version(VERSION_STRING).get_matches(); 37 | let args = CmdOptions::parse(); 38 | 39 | let mut seq_index_db = SeqIndexDB::new(); 40 | 41 | #[cfg(feature = "with_agc")] 42 | if args.frg_file { 43 | let _ = seq_index_db.load_from_frg_index(args.pgr_db_prefix); 44 | } else { 45 | let _ = seq_index_db.load_from_agc_index(args.pgr_db_prefix); 46 | } 47 | #[cfg(not(feature = "with_agc"))] 48 | if args.frg_file { 49 | let _ = seq_index_db.load_from_frg_index(args.pgr_db_prefix); 50 | } else { 51 | panic!("This command is compiled with only frg file support, please specify `--frg-file"); 52 | } 53 | 54 | if args.list { 55 | let mut out = if args.output_file.is_some() { 56 | let f = File::open(args.output_file.unwrap()).expect("can't open the ouptfile"); 57 | Box::new(f) as Box 58 | } else { 59 | Box::new(io::stdout()) 60 | }; 61 | seq_index_db 62 | .seq_info 63 | .unwrap() 64 | .into_iter() 65 | .for_each(|(sid, (ctg, src, length))| { 66 | writeln!( 67 | out, 68 | "{}\t{}\t{}\t{}", 69 | sid, 70 | src.unwrap_or_else(|| "None".to_string()), 71 | ctg, 72 | length 73 | ) 74 | .expect("can't write output file") 75 | }); 76 | return Ok(()); 77 | } 78 | 79 | let region_file = args.region_file.expect("region file not specified"); 80 | let region_file = 81 | BufReader::new(File::open(Path::new(®ion_file)).expect("can't open the region file")); 82 | 83 | let mut out = if args.output_file.is_some() { 84 | let f = BufWriter::new( 85 | File::create(args.output_file.unwrap()).expect("can't open the ouptfile"), 86 | ); 87 | Box::new(f) as Box 88 | } else { 89 | Box::new(io::stdout()) 90 | }; 91 | 92 | region_file.lines().for_each(|line| { 93 | let line = line.expect("fail to 
get a line in the region file"); 94 | let fields = line.split('\t').collect::>(); 95 | let label = fields[0].to_string(); 96 | let src = fields[1].to_string(); 97 | let ctg = fields[2].to_string(); 98 | let bgn: usize = fields[3].parse().expect("can't parse bgn"); 99 | let end: usize = fields[4].parse().expect("can't parse end"); 100 | let reversed: bool = fields[5].parse::().expect("can't parse strand") == 1; 101 | let mut seq = seq_index_db 102 | .get_sub_seq(src, ctg, bgn, end) 103 | .expect("fail to fetch sequence"); 104 | if reversed { 105 | seq = fasta_io::reverse_complement(&seq); 106 | } 107 | 108 | writeln!(out, ">{}", label).expect("fail to write the sequences"); 109 | writeln!(out, "{}", String::from_utf8_lossy(&seq[..])) 110 | .expect("fail to write the sequences"); 111 | }); 112 | 113 | Ok(()) 114 | } 115 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-make-frgdb.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &str = env!("VERSION_STRING"); 2 | 3 | //use std::path::PathBuf; 4 | use clap::{self, CommandFactory, Parser}; 5 | 6 | use pgr_db::ext::SeqIndexDB; 7 | use std::fs::File; 8 | use std::io::{BufRead, BufReader}; 9 | use std::path::Path; 10 | 11 | /// Create PGR-TK fragment minimizer database with frg format backend 12 | #[derive(Parser, Debug)] 13 | #[clap(name = "pgr-make-frgdb")] 14 | #[clap(author, version)] 15 | #[clap(about, long_about = None)] 16 | struct CmdOptions { 17 | /// the path to the file contains the paths to the fastx files to load 18 | filepath: String, 19 | prefix: String, 20 | /// minimizer window size 21 | #[clap(long, short, default_value_t = 80)] 22 | w: u32, 23 | /// minimizer k-mer size 24 | #[clap(long, short, default_value_t = 56)] 25 | k: u32, 26 | /// sparse minimizer (shimmer) reduction factor 27 | #[clap(long, short, default_value_t = 4)] 28 | r: u32, 29 | /// min span for neighboring minimiers 30 | #[clap(long, short, default_value_t = 64)] 31 | min_span: u32, 32 | } 33 | 34 | fn main() { 35 | CmdOptions::command().version(VERSION_STRING).get_matches(); 36 | let args = CmdOptions::parse(); 37 | // TODO: to log file 38 | //println!("read data from files in {:?}", args.filepath); 39 | //println!("output prefix {:?}", args.prefix); 40 | let _shmmr_spec = pgr_db::shmmrutils::ShmmrSpec { 41 | w: args.w, 42 | k: args.k, 43 | r: args.r, 44 | min_span: args.min_span, 45 | sketch: false, 46 | }; 47 | let mut sdb = SeqIndexDB::new(); 48 | let input_files = BufReader::new( 49 | File::open(Path::new(&args.filepath)) 50 | .expect("can't open the input file that contains the paths to the fastx files"), 51 | ); 52 | input_files.lines().enumerate().for_each(|(fid, filename)| { 53 | let filepath = filename 54 | .expect("can't get fastx file name") 55 | .trim() 56 | .to_string(); 57 | if fid == 0 { 58 | sdb.load_from_fastx(filepath.clone(), args.w, args.k, args.r, args.min_span, true) 59 | .unwrap_or_else(|_| panic!("fail to read the fastx file: {}", filepath)); 60 | } else { 61 | sdb.append_from_fastx(filepath.clone(), true) 62 | .unwrap_or_else(|_| panic!("fail to read the fastx file: {}", filepath)); 63 | } 64 | }); 65 | 66 | sdb.write_frag_and_index_files(args.prefix); 67 | } 68 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-mdb.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &str = env!("VERSION_STRING"); 2 | 3 | //use 
std::path::PathBuf; 4 | use clap::{self, CommandFactory, Parser}; 5 | 6 | #[cfg(feature = "with_agc")] 7 | use pgr_db::agc_io::AGCFile; 8 | 9 | #[cfg(feature = "with_agc")] 10 | use pgr_db::shmmrutils::ShmmrSpec; 11 | 12 | #[cfg(feature = "with_agc")] 13 | use std::fs::File; 14 | 15 | #[cfg(feature = "with_agc")] 16 | use std::io::{BufRead, BufReader}; 17 | 18 | #[cfg(feature = "with_agc")] 19 | use pgr_db::seq_db; 20 | 21 | /// Create pgr minimizer database with AGC backend 22 | #[derive(Parser, Debug)] 23 | #[clap(name = "pgr-mdb")] 24 | #[clap(author, version)] 25 | #[clap(about, long_about = None)] 26 | struct CmdOptions { 27 | filepath: String, 28 | prefix: String, 29 | /// minimizer window size 30 | #[clap(long, short, default_value_t = 80)] 31 | w: u32, 32 | /// minimizer k-mer size 33 | #[clap(long, short, default_value_t = 56)] 34 | k: u32, 35 | /// sparse minimizer (shimmer) reduction factor 36 | #[clap(long, short, default_value_t = 4)] 37 | r: u32, 38 | /// min span for neighboring minimiers 39 | #[clap(long, short, default_value_t = 64)] 40 | min_span: u32, 41 | /// using sketch k-mer than minimizer 42 | #[clap(short, long)] 43 | sketch: bool, 44 | /// set to use agc prefecting feature (more memory usage but faster, useful for agcfile with many small contigs) 45 | #[clap(short, long)] 46 | prefetching: bool, 47 | /// number of parallel agc reader threads (more memory usage) 48 | #[clap(long, short, default_value_t = 4)] 49 | number_of_readers: usize, 50 | } 51 | 52 | #[cfg(feature = "with_agc")] 53 | fn load_write_index_from_agcfile( 54 | path: String, 55 | prefix: String, 56 | shmmr_spec: &ShmmrSpec, 57 | prefetching: bool, 58 | number_of_readers: usize, 59 | ) -> Result<(), std::io::Error> { 60 | let mut sdb = seq_db::CompactSeqDB::new(shmmr_spec.clone()); 61 | let filelist = File::open(path)?; 62 | 63 | BufReader::new(filelist) 64 | .lines() 65 | .try_for_each(|fp| -> Result<(), std::io::Error> { 66 | let fp = fp.unwrap(); 67 | //println!("load file {}", fp); 68 | let mut agcfile: AGCFile = AGCFile::new(fp)?; 69 | agcfile.set_iter_thread(number_of_readers); 70 | agcfile.set_prefetching(prefetching); 71 | //println!("start to load index"); 72 | let _ = sdb.load_index_from_agcfile(agcfile); 73 | Ok(()) 74 | })?; 75 | 76 | //seq_db::write_shmr_map_file(&sdb.frag_map, "test.db".to_string()); 77 | sdb.write_shmmr_map_index(prefix)?; 78 | Ok(()) 79 | } 80 | 81 | fn main() { 82 | CmdOptions::command().version(VERSION_STRING).get_matches(); 83 | 84 | #[cfg(feature = "with_agc")] 85 | let args = CmdOptions::parse(); 86 | // TODO: to log file 87 | //println!("read data from files in {:?}", args.filepath); 88 | //println!("output prefix {:?}", args.prefix); 89 | 90 | #[cfg(feature = "with_agc")] 91 | let shmmr_spec = pgr_db::shmmrutils::ShmmrSpec { 92 | w: args.w, 93 | k: args.k, 94 | r: args.r, 95 | min_span: args.min_span, 96 | sketch: args.sketch, 97 | }; 98 | 99 | #[cfg(feature = "with_agc")] 100 | load_write_index_from_agcfile( 101 | args.filepath, 102 | args.prefix.clone(), 103 | &shmmr_spec, 104 | args.prefetching, 105 | args.number_of_readers, 106 | ) 107 | .unwrap(); 108 | 109 | #[cfg(not(feature = "with_agc"))] 110 | panic!("the command is not compiled with `with_agc` feature") 111 | } 112 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-merge-svcnd-bed.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &str = env!("VERSION_STRING"); 2 | use clap::{self, 
CommandFactory, Parser}; 3 | // use rayon::prelude::*; 4 | use rustc_hash::FxHashMap; 5 | use std::fs::File; 6 | use std::io::{BufRead, BufReader, BufWriter, Write}; 7 | use std::path::Path; 8 | 9 | /// Merge svcnd from multiple *.svcnd.bed files into one and compute the merged regions 10 | /// It is useful to identify unique bed regions to one specific haplotype 11 | #[derive(Parser, Debug)] 12 | #[clap(name = "pgr-merge-svcnd-bed")] 13 | #[clap(author, version)] 14 | #[clap(about, long_about = None)] 15 | struct CmdOptions { 16 | /// path to the file contain the input bed files, each line should be "labelinput file path" 17 | input_files: String, 18 | /// the path of the output files 19 | output_path: String, 20 | /// number of threads used in parallel (more memory usage), default to "0" using all CPUs available or the number set by RAYON_NUM_THREADS 21 | #[clap(long, default_value_t = 0)] 22 | number_of_thread: usize, 23 | } 24 | 25 | type Interval = ((u32, u32), (String, String)); 26 | fn main() { 27 | CmdOptions::command().version(VERSION_STRING).get_matches(); 28 | let args = CmdOptions::parse(); 29 | 30 | rayon::ThreadPoolBuilder::new() 31 | .num_threads(args.number_of_thread) 32 | .build_global() 33 | .unwrap(); 34 | 35 | let input_files = BufReader::new(File::open(Path::new(&args.input_files)).unwrap()); 36 | 37 | let input_files = input_files 38 | .lines() 39 | .flat_map(|line| { 40 | if let Ok(line) = line { 41 | let rec = line.trim().split('\t').collect::>(); 42 | assert!(rec.len() >= 2); 43 | Some((rec[0].to_string(), rec[1].to_string())) 44 | } else { 45 | None 46 | } 47 | }) 48 | .collect::>(); 49 | 50 | let mut interval_collection = 51 | FxHashMap::>::default(); 52 | input_files.iter().for_each(|(label, path)| { 53 | let bed_reader = BufReader::new(File::open(Path::new(path)).unwrap()); 54 | bed_reader.lines().for_each(|line| { 55 | if let Ok(line) = line { 56 | if line.starts_with('#') { 57 | return; 58 | }; 59 | let err_msg = format!("fail to parse on {}", line); 60 | let fields = line.split('\t').collect::>(); 61 | let chr = fields[0].to_string(); 62 | let bgn = fields[1].parse::().expect(&err_msg); 63 | let end = fields[2].parse::().expect(&err_msg); 64 | let annotation = fields[3].to_string(); 65 | let e = interval_collection.entry(chr).or_insert_with(Vec::new); 66 | e.push(((bgn, end), (label.clone(), annotation))); 67 | } 68 | }); 69 | }); 70 | 71 | let group_intervals = |intervals: &mut Vec| -> Vec<(u32, u32, Vec)> { 72 | let mut interval_groups = Vec::<(u32, u32, Vec)>::new(); 73 | if intervals.is_empty() { 74 | return interval_groups; 75 | } 76 | 77 | intervals.sort(); 78 | let (mut current_bgn, mut current_end) = intervals.first().unwrap().0; 79 | 80 | let mut current_groups = Vec::::new(); 81 | intervals.iter().for_each(|(interval, payload)| { 82 | if current_end < interval.0 { 83 | interval_groups.push((current_bgn, current_end, current_groups.clone())); 84 | current_groups.clear(); 85 | current_groups.push((*interval, payload.clone())); 86 | (current_bgn, current_end) = *interval; 87 | } else { 88 | current_groups.push((*interval, payload.clone())); 89 | if current_end < interval.1 { 90 | current_end = interval.1; 91 | } 92 | } 93 | }); 94 | if !current_groups.is_empty() { 95 | interval_groups.push((current_bgn, current_end, current_groups.clone())); 96 | } 97 | interval_groups 98 | }; 99 | 100 | let mut out_bed = BufWriter::new(File::create(Path::new(&args.output_path)).unwrap()); 101 | let mut keys = interval_collection.keys().cloned().collect::>(); 102 | 
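// Sort the chromosome names so the merged interval groups are emitted in a
// deterministic order across runs.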
keys.sort(); 103 | keys.into_iter().for_each(|key| { 104 | let intervals = interval_collection.get_mut(&key).unwrap(); 105 | let interval_groups = group_intervals(intervals); 106 | interval_groups.into_iter().for_each(|intervals| { 107 | if intervals.2.is_empty() { 108 | return; 109 | } 110 | let itvl_group_bgn = intervals.0; 111 | let itvl_group_end = intervals.1; 112 | if itvl_group_bgn > itvl_group_end { 113 | return; 114 | }; 115 | 116 | let mut label_count = FxHashMap::::default(); 117 | let mut total_interval_counts = 0u32; 118 | intervals.2.iter().for_each(|(_interval, payload)| { 119 | let e = label_count.entry(payload.0.clone()).or_default(); 120 | *e += 1; 121 | total_interval_counts += 1; 122 | }); 123 | 124 | writeln!( 125 | out_bed, 126 | "{}\t{}\t{}\tmerged:{}:{}", 127 | key, 128 | itvl_group_bgn, 129 | itvl_group_end, 130 | label_count.len(), 131 | total_interval_counts 132 | ) 133 | .expect("unable to write the output file"); 134 | 135 | intervals.2.iter().for_each(|(interval, payload)| { 136 | let number_haplotype = label_count.len(); 137 | let e = label_count.entry(payload.0.clone()).or_default(); 138 | writeln!( 139 | out_bed, 140 | "{}\t{}\t{}\t{}:{}:{}-{}:{}:{}", 141 | key, 142 | interval.0, 143 | interval.1, 144 | payload.0, 145 | payload.1, 146 | itvl_group_bgn, 147 | itvl_group_end, 148 | number_haplotype, 149 | *e, 150 | ) 151 | .expect("unable to write the output file"); 152 | }); 153 | }); 154 | }); 155 | } 156 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-pbundle-aln.rs: -------------------------------------------------------------------------------- 1 | const VERSION_STRING: &str = env!("VERSION_STRING"); 2 | use clap::{self, CommandFactory, Parser}; 3 | use rustc_hash::FxHashMap; 4 | use serde::*; 5 | use serde_json::json; 6 | use std::io::{BufRead, BufReader, BufWriter, Write}; 7 | use std::path::Path; 8 | use std::{fs::File, path}; 9 | 10 | /// Generate alignment between sequences using bundle decomposition from a principal bundle bed file 11 | #[derive(Parser, Debug)] 12 | #[clap(name = "pgr-pbundle-aln")] 13 | #[clap(author, version)] 14 | #[clap(about, long_about = None)] 15 | struct CmdOptions { 16 | /// the path to the principal bundle bed file 17 | bed_file_path: String, 18 | /// a file contain two lines of the contig ids that should be aligned to each other 19 | aln_spec: String, 20 | /// the prefix of the output file 21 | output_prefix: String, 22 | } 23 | 24 | #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] 25 | struct BundleSegment { 26 | bgn: u32, 27 | end: u32, 28 | bundle_id: u32, 29 | bundle_v_count: u32, 30 | bundle_dir: u32, 31 | bundle_v_bgn: u32, 32 | bundle_v_end: u32, 33 | } 34 | 35 | #[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)] 36 | enum AlnType { 37 | Match, 38 | Insertion, 39 | Deletion, 40 | } 41 | type AlnPathElement = (usize, usize, AlnType, u32, u32, usize, usize); 42 | type AlnPath = Vec; 43 | 44 | fn align_bundles( 45 | q_bundles: &[BundleSegment], 46 | t_bundles: &[BundleSegment], 47 | ) -> (f32, usize, usize, AlnPath) { 48 | let q_count = q_bundles.len(); 49 | let t_count = t_bundles.len(); 50 | let mut s_map = FxHashMap::<(usize, usize), i64>::default(); 51 | let mut t_map = FxHashMap::<(usize, usize), AlnType>::default(); 52 | 53 | let mut get_aln_direction_with_best_score = 54 | |q_idx: usize, t_idx: usize, s_map: &FxHashMap<(usize, usize), i64>| -> (AlnType, i64) { 55 | let mut best = (AlnType::Match, 
i64::MIN); 56 | let q_len = (q_bundles[q_idx].end as i64 - q_bundles[q_idx].bgn as i64).abs(); 57 | let t_len = (t_bundles[t_idx].end as i64 - t_bundles[t_idx].bgn as i64).abs(); 58 | let min_len = if q_len > t_len { t_len } else { q_len }; 59 | let q_b_seg = q_bundles[q_idx]; 60 | let t_b_seg = t_bundles[t_idx]; 61 | if q_idx == 0 62 | && t_idx == 0 63 | && (q_b_seg.bundle_id == t_b_seg.bundle_id) 64 | && (q_b_seg.bundle_dir == t_b_seg.bundle_dir) 65 | { 66 | best = (AlnType::Match, 2 * min_len) 67 | }; 68 | if q_idx > 0 69 | && t_idx > 0 70 | && q_b_seg.bundle_id == t_b_seg.bundle_id 71 | && (q_b_seg.bundle_dir == t_b_seg.bundle_dir) 72 | { 73 | best = ( 74 | AlnType::Match, 75 | 2 * min_len + s_map.get(&(q_idx - 1, t_idx - 1)).unwrap(), 76 | ) 77 | }; 78 | if t_idx > 0 { 79 | let score = -2 * q_len + s_map.get(&(q_idx, t_idx - 1)).unwrap(); 80 | if score > best.1 { 81 | best = (AlnType::Deletion, score) 82 | }; 83 | }; 84 | if q_idx > 0 { 85 | let score = -2 * t_len + s_map.get(&(q_idx - 1, t_idx)).unwrap(); 86 | if score > best.1 { 87 | best = (AlnType::Insertion, score) 88 | } 89 | } 90 | t_map.insert((q_idx, t_idx), best.0); 91 | best 92 | }; 93 | 94 | //let mut best_score = 0; 95 | //let mut best_q_idx = 0; 96 | //let mut best_t_idx = 0; 97 | let mut aln_path = AlnPath::new(); 98 | 99 | (0..t_count) 100 | .flat_map(|t_idx| (0..q_count).map(move |q_idx| (q_idx, t_idx))) 101 | .for_each(|(q_idx, t_idx)| { 102 | //println!("{} {}", q_idx, t_idx); 103 | let (_, score) = get_aln_direction_with_best_score(q_idx, t_idx, &s_map); 104 | s_map.insert((q_idx, t_idx), score); 105 | /* 106 | if score > best_score { 107 | best_score = score; 108 | best_q_idx = q_idx; 109 | best_t_idx = t_idx; 110 | } 111 | */ 112 | }); 113 | let mut q_idx = q_count - 1; 114 | let mut t_idx = t_count - 1; 115 | let mut diff_len = 0_usize; 116 | let mut max_len = 1_usize; 117 | while let Some(aln_type) = t_map.get(&(q_idx, t_idx)) { 118 | let qq_idx = q_idx; 119 | let tt_idx = t_idx; 120 | let (diff_len_delta, max_len_delta) = match aln_type { 121 | AlnType::Match => { 122 | let q_len = (q_bundles[q_idx].end as i64 - q_bundles[q_idx].bgn as i64).abs(); 123 | let t_len = (t_bundles[t_idx].end as i64 - t_bundles[t_idx].bgn as i64).abs(); 124 | let diff_len_delta = (q_len - t_len).unsigned_abs() as usize; 125 | let max_len_delata = if q_len > t_len { 126 | q_len as usize 127 | } else { 128 | t_len as usize 129 | }; 130 | q_idx -= 1; 131 | t_idx -= 1; 132 | (diff_len_delta, max_len_delata) 133 | } 134 | AlnType::Insertion => { 135 | let q_len = (q_bundles[q_idx].end as i64 - q_bundles[q_idx].bgn as i64).abs(); 136 | q_idx -= 1; 137 | (q_len as usize, q_len as usize) 138 | } 139 | AlnType::Deletion => { 140 | let t_len = (t_bundles[t_idx].end as i64 - t_bundles[t_idx].bgn as i64).abs(); 141 | t_idx -= 1; 142 | (t_len as usize, t_len as usize) 143 | } 144 | }; 145 | diff_len += diff_len_delta; 146 | max_len += max_len_delta; 147 | aln_path.push(( 148 | qq_idx, 149 | tt_idx, 150 | *aln_type, 151 | q_bundles[qq_idx].bundle_id, 152 | t_bundles[tt_idx].bundle_id, 153 | diff_len_delta, 154 | max_len_delta, 155 | )); 156 | } 157 | aln_path.reverse(); 158 | ( 159 | diff_len as f32 / max_len as f32, 160 | diff_len, 161 | max_len, 162 | aln_path, 163 | ) 164 | } 165 | 166 | fn main() -> std::result::Result<(), std::io::Error> { 167 | CmdOptions::command().version(VERSION_STRING).get_matches(); 168 | let args = CmdOptions::parse(); 169 | let bed_file_path = path::Path::new(&args.bed_file_path); 170 | let bed_file = 
BufReader::new(File::open(bed_file_path).expect("can't open the bed file"));
171 | let mut ctg_data = FxHashMap::<String, Vec<BundleSegment>>::default();
172 | let bed_file_parse_err_msg = "bed file parsing error";
173 | bed_file.lines().for_each(|line| {
174 | let line = line.unwrap().trim().to_string();
175 | if line.is_empty() {
176 | return;
177 | }
178 | if &line[0..1] == "#" {
179 | return;
180 | }
181 | let bed_fields = line.split('\t').collect::<Vec<&str>>();
182 | let ctg: String = bed_fields[0].to_string();
183 | let bgn: u32 = bed_fields[1].parse().expect(bed_file_parse_err_msg);
184 | let end: u32 = bed_fields[2].parse().expect(bed_file_parse_err_msg);
185 | let pbundle_fields = bed_fields[3].split(':').collect::<Vec<&str>>();
186 | let bundle_id: u32 = pbundle_fields[0].parse().expect(bed_file_parse_err_msg);
187 | let bundle_v_count: u32 = pbundle_fields[1].parse().expect(bed_file_parse_err_msg);
188 | let bundle_dir: u32 = pbundle_fields[2].parse().expect(bed_file_parse_err_msg);
189 | let bundle_v_bgn: u32 = pbundle_fields[3].parse().expect(bed_file_parse_err_msg);
190 | let bundle_v_end: u32 = pbundle_fields[4].parse().expect(bed_file_parse_err_msg);
191 |
192 | let e = ctg_data.entry(ctg).or_default();
193 | let b_seg = BundleSegment {
194 | bgn,
195 | end,
196 | bundle_id,
197 | bundle_v_count,
198 | bundle_dir,
199 | bundle_v_bgn,
200 | bundle_v_end,
201 | };
202 | e.push(b_seg);
203 | });
204 |
205 | let aln_spec = path::Path::new(&args.aln_spec);
206 | let spec_file = BufReader::new(File::open(aln_spec).expect("can't open the aln_spec file"));
207 | let mut ctg_of_interests = Vec::<String>::new();
208 | spec_file.lines().for_each(|line| {
209 | let line = line.unwrap().trim().to_string();
210 | ctg_of_interests.push(line);
211 | });
212 |
213 | let ctg_data = ctg_of_interests
214 | .into_iter()
215 | .map(|k| {
216 | let v = ctg_data
217 | .get(&k)
218 | .unwrap_or_else(|| panic!("ctg name not found: {}", k));
219 | (k, v)
220 | })
221 | .collect::<Vec<_>>();
222 |
223 | let n_ctg = ctg_data.len();
224 |
225 | let mut alignment_paths = Vec::<_>::new();
226 | let ctg_idx0 = 0;
227 | (1..n_ctg).for_each(|ctg_idx1| {
228 | // the first sequence is the "target"
229 | let (target_ctg, target_bundles) = &ctg_data[ctg_idx0];
230 | let (query_ctg, query_bundles) = &ctg_data[ctg_idx1];
231 | let (_dist0, _diff_len0, _max_len0, aln_path) =
232 | align_bundles(query_bundles, target_bundles);
233 |
234 | let aln_path = aln_path
235 | .into_iter()
236 | .map(
237 | |(
238 | qq_idx,
239 | tt_idx,
240 | aln_type,
241 | _q_bundle_id,
242 | _t_bundle_id,
243 | _diff_len_delta,
244 | _max_len_delta,
245 | )| {
246 | let target_data = target_bundles.get(tt_idx).unwrap();
247 | let query_data = query_bundles.get(qq_idx).unwrap();
248 | (qq_idx, tt_idx, aln_type, target_data, query_data)
249 | },
250 | )
251 | .collect::<Vec<_>>();
252 | alignment_paths.push((target_ctg, query_ctg, aln_path))
253 | });
254 |
255 | let out_path = Path::new(&args.output_prefix).with_extension("bln.json");
256 | let mut out_file =
257 | BufWriter::new(File::create(out_path).expect("can't create the bundle alignment file"));
258 |
259 | let out_json = json!(alignment_paths);
260 | out_file.write_all(out_json.to_string().as_bytes())?;
261 |
262 | Ok(())
263 | }
264 | -------------------------------------------------------------------------------- /pgr-bin/src/bin/pgr-pbundle-bed2sorted.rs: --------------------------------------------------------------------------------
1 | const VERSION_STRING: &str = env!("VERSION_STRING");
2 | use clap::{self, CommandFactory, Parser};
3 | use
rustc_hash::FxHashMap;
4 | use std::io::{BufRead, BufReader, BufWriter, Write};
5 | use std::path::Path;
6 | use std::{fs::File, path};
7 |
8 | /// Generate annotation file with a sorting order from the principal bundle decomposition
9 | #[derive(Parser, Debug)]
10 | #[clap(name = "pgr-pbundle-bed2sorted")]
11 | #[clap(author, version)]
12 | #[clap(about, long_about = None)]
13 | struct CmdOptions {
14 | /// the path to the principal bundle bed file
15 | bed_file_path: String,
16 | /// the prefix of the output file
17 | output_prefix: String,
18 | }
19 |
20 | #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug)]
21 | struct BundleSegment {
22 | bgn: u32,
23 | end: u32,
24 | bundle_id: u32,
25 | bundle_v_count: u32,
26 | bundle_dir: u32,
27 | bundle_v_bgn: u32,
28 | bundle_v_end: u32,
29 | }
30 |
31 | fn main() -> Result<(), std::io::Error> {
32 | CmdOptions::command().version(VERSION_STRING).get_matches();
33 | let args = CmdOptions::parse();
34 | let bed_file_path = path::Path::new(&args.bed_file_path);
35 | let bed_file = BufReader::new(File::open(bed_file_path)?);
36 | let mut ctg_data = FxHashMap::<String, Vec<BundleSegment>>::default();
37 | let bed_file_parse_err_msg = "bed file parsing error";
38 | let mut node_length = FxHashMap::<(u32, u32), Vec<_>>::default();
39 | bed_file.lines().for_each(|line| {
40 | let line = line.unwrap().trim().to_string();
41 | if line.is_empty() {
42 | return;
43 | }
44 | if &line[0..1] == "#" {
45 | return;
46 | }
47 | let bed_fields = line.split('\t').collect::<Vec<&str>>();
48 | let ctg: String = bed_fields[0].to_string();
49 | let bgn: u32 = bed_fields[1].parse().expect(bed_file_parse_err_msg);
50 | let end: u32 = bed_fields[2].parse().expect(bed_file_parse_err_msg);
51 | let pbundle_fields = bed_fields[3].split(':').collect::<Vec<&str>>();
52 | let bundle_id: u32 = pbundle_fields[0].parse().expect(bed_file_parse_err_msg);
53 | let bundle_v_count: u32 = pbundle_fields[1].parse().expect(bed_file_parse_err_msg);
54 | let bundle_dir: u32 = pbundle_fields[2].parse().expect(bed_file_parse_err_msg);
55 | let bundle_v_bgn: u32 = pbundle_fields[3].parse().expect(bed_file_parse_err_msg);
56 | let bundle_v_end: u32 = pbundle_fields[4].parse().expect(bed_file_parse_err_msg);
57 |
58 | let e = ctg_data.entry(ctg).or_default();
59 | let b_seg = BundleSegment {
60 | bgn,
61 | end,
62 | bundle_id,
63 | bundle_v_count,
64 | bundle_dir,
65 | bundle_v_bgn,
66 | bundle_v_end,
67 | };
68 | e.push(b_seg);
69 | if (bundle_v_bgn as i64 - bundle_v_end as i64).abs() as f32 > (bundle_v_count as f32) * 0.5
70 | {
71 | let e = node_length.entry((bundle_id, bundle_dir)).or_default();
72 | e.push((end as i64 - bgn as i64).unsigned_abs());
73 | }
74 | });
75 |
76 | let mut node_length = node_length
77 | .into_iter()
78 | .map(|(n, v)| {
79 | let c = v.len() as f64;
80 | let sum = v.into_iter().sum::<u64>() as f64;
81 | (sum / c, n)
82 | })
83 | .collect::<Vec<_>>();
84 | node_length.sort_by(|a, b| b.partial_cmp(a).unwrap());
85 |
86 | let mut ctg_data = ctg_data
87 | .into_iter()
88 | .map(|(ctg, mut bundle_segs)| {
89 | bundle_segs.sort();
90 | let mut node_count = FxHashMap::<(u32, u32), u32>::default();
91 | bundle_segs.iter().for_each(|vv| {
92 | let node = (vv.bundle_id, vv.bundle_dir);
93 | if (vv.bundle_v_bgn as i64 - vv.bundle_v_end as i64).abs() as f32
94 | > (vv.bundle_v_count as f32) * 0.5
95 | {
96 | let e = node_count.entry(node).or_insert(0);
97 | *e += 1;
98 | }
99 | });
100 | let mut sort_key = vec![];
101 | node_length.iter().for_each(|&(_, n)| {
102 | sort_key.push(*node_count.get(&n).unwrap_or(&0));
103 | });
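// `node_length` was sorted above by decreasing mean bundle span, so each
// contig's `sort_key` records, in that order, how often each of the longest
// bundles occurs in the contig; the sort-and-reverse below therefore groups
// contigs sharing the same dominant bundles together in the output.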
104 |
105 | (sort_key, ctg, bundle_segs)
106 | })
107 | .collect::<Vec<_>>();
108 |
109 | ctg_data.sort();
110 | ctg_data.reverse();
111 |
112 | let out_path = Path::new(&args.output_prefix).with_extension("ord");
113 | let mut out_file = BufWriter::new(File::create(out_path)?);
114 |
115 | ctg_data.into_iter().for_each(|(sort_key, ctg, _)| {
116 | let sort_key = sort_key
117 | .into_iter()
118 | .map(|k| format!("{}", k))
119 | .collect::<Vec<String>>();
120 | let sort_key = sort_key.join(",");
121 | writeln!(
122 | out_file,
123 | "{}\t{}",
124 |
125 | ctg, sort_key
126 | )
127 | .expect("writing error");
128 | });
129 |
130 | Ok(())
131 | }
132 | -------------------------------------------------------------------------------- /pgr-bin/utility_scripts/get_cytoband_to_json.py: --------------------------------------------------------------------------------
1 | import os
2 | import json
3 |
4 | if __name__ == "__main__":
5 | os.system("wget https://s3.amazonaws.com/igv.org.genomes/hg38/annotations/cytoBandIdeo.txt.gz")
6 | os.system("gunzip -f cytoBandIdeo.txt.gz")
7 |
8 | cytobands = {}
9 | with open("cytoBandIdeo.txt") as f:
10 | for row in f:
11 | row = row.strip().split("\t")
12 | cytobands.setdefault(row[0], [])
13 | cytobands[row[0]].append( (int(row[1]), int(row[2]), row[3], row[4]) )
14 |
15 | out = open("cytoBandIdeo.json","w")
16 | json.dump({"cytobands": cytobands}, out)
17 | out.close() -------------------------------------------------------------------------------- /pgr-db/Cargo.toml: --------------------------------------------------------------------------------
1 | [package]
2 | name = "pgr-db"
3 | version = "0.6.0"
4 | edition = "2021"
5 | authors = ["Jason Chin "]
6 | build = "build.rs"
7 |
8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
9 | [build-dependencies]
10 | bindgen = "0.60.1"
11 |
12 | [dependencies]
13 | rustc-hash = "1.1.0"
14 | #flate2 = { version = "1.0.17", features = ["zlib-ng-compat"], default-features = false }
15 | flate2 = "1.0.17"
16 | log = { version = "0.4.19", features = ["std", "max_level_debug", "release_max_level_warn"]}
17 | simple_logger = "4.2.0"
18 | rayon = "1.5.2"
19 | libc = "0.2"
20 | byteorder = "1.3.4"
21 | petgraph = "0.6.1"
22 | cuckoofilter = "0.5"
23 | bgzip = "0.2.1"
24 | serde = { version = "1.0.137", features = ["derive", "rc"] }
25 | serde_json = "1.0.81"
26 | regex = "1"
27 | bincode = { version = "2.0.0-rc.1", features = ["alloc"] }
28 | memmap2 = "0.5.10"
29 | wavefront-aln = {git = "https://github.com/cschin/wavefront-aln.git"}
30 |
31 | [features]
32 | default = ["with_agc"]
33 | with_agc = []
34 | -------------------------------------------------------------------------------- /pgr-db/build.rs: --------------------------------------------------------------------------------
1 | extern crate bindgen;
2 | use std::env::consts::{ARCH, OS};
3 |
4 | #[cfg(debug_assertions)]
5 | const BUILD_TYPE: &str = "debug";
6 | #[cfg(not(debug_assertions))]
7 | const BUILD_TYPE: &str = "release";
8 |
9 | #[cfg(feature = "with_agc")]
10 | use std::fs::{read_dir, remove_dir_all};
11 |
12 | #[cfg(feature = "with_agc")]
13 | use std::path::PathBuf;
14 |
15 | use std::{env, process::Command};
16 |
17 | #[cfg(feature = "with_agc")]
18 | fn build_agc() -> Option<()> {
19 | let mut agc_dir = read_dir("../agc").ok()?;
20 | if !agc_dir.any(|f| f.unwrap().file_name() == "makefile") {
21 | return None;
22 | }
23 |
24 | let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
25 |
26 | let agc_path =
out_path.join("agc"); 27 | 28 | let _ = remove_dir_all(agc_path.as_path()); 29 | 30 | // copy the AGC dir to OUT_PATH and build it there... clunky, but 31 | // don't want to pull in the entire 100MB WFA repo, since git2 32 | // doesn't seem to support shallow clones, and build scripts 33 | // should only modify things inside OUT_PATH. since the WFA folder 34 | // is just a couple MB, this is fine for now. 35 | let _cp_agc = Command::new("cp") 36 | .arg("-r") 37 | .arg("../agc") 38 | .arg(&out_path) 39 | .output() 40 | .unwrap(); 41 | 42 | let output = Command::new("make") 43 | .arg("-f") 44 | .arg("makefile.release") 45 | .arg("clean") 46 | .arg("libagc") 47 | .current_dir(&agc_path) 48 | .output() 49 | .unwrap(); 50 | if output.status.success() { 51 | Some(()) 52 | } else { 53 | panic!("make error: {}", String::from_utf8_lossy(&output.stderr)); 54 | } 55 | } 56 | 57 | // fn wfa() { 58 | // let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); 59 | // let _cp_agc = Command::new("cp") 60 | // .arg("../WFA2-lib/lib/libwfa.a") 61 | // .arg(&out_path) 62 | // .output() 63 | // .unwrap(); 64 | // // The directory of the WFA libraries, added to the search path. 65 | // println!("cargo:rustc-link-search={}", out_path.display()); 66 | // // Link the `wfa-lib` library. 67 | // println!("cargo:rustc-link-lib=wfa"); 68 | // // Also link `omp`. 69 | // println!("cargo:rustc-link-lib=omp5"); 70 | // } 71 | 72 | fn main() { 73 | //wfa(); 74 | 75 | #[cfg(feature = "with_agc")] 76 | if build_agc().is_none() { 77 | panic!("Error building AGC C library"); 78 | } else { 79 | let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); 80 | let agc_path = out_path.join("agc"); 81 | 82 | // shared library. 83 | println!("cargo:rustc-link-lib=agc"); 84 | println!("cargo:rustc-link-search={}", agc_path.display()); 85 | println!("cargo:rustc-link-lib=zstd"); 86 | println!("cargo:rustc-link-search={}/libs", agc_path.display()); 87 | println!("cargo:rustc-link-lib=stdc++"); 88 | println!("cargo:rustc-link-search=/usr/lib/gcc/x86_64-linux-gnu/11/"); 89 | 90 | // Tell cargo to invalidate the built crate whenever the wrapper changes 91 | println!("cargo:rerun-if-changed=wrapper.h"); 92 | 93 | // The bindgen::Builder is the main entry point 94 | // to bindgen, and lets you build up options for 95 | // the resulting bindings. 96 | let bindings = bindgen::Builder::default() 97 | // The input header we would like to generate 98 | // bindings for. 99 | .header("wrapper.h") 100 | // Tell cargo to invalidate the built crate whenever any of the 101 | // included header files changed. 102 | .parse_callbacks(Box::new(bindgen::CargoCallbacks)) 103 | // Finish the builder and generate the bindings. 104 | .generate() 105 | // Unwrap the Result and panic on failure. 106 | .expect("Unable to generate bindings"); 107 | 108 | // Write the bindings to the $OUT_DIR/bindings.rs file. 
109 | bindings 110 | .write_to_file(out_path.join("bindings.rs")) 111 | .expect("Couldn't write bindings!"); 112 | } 113 | // from https://vallentin.dev/2019/06/06/versioning 114 | let branch_name = get_branch_name(); 115 | if branch_name != *"bioconda" { 116 | let version_string = format!( 117 | "{} {} ({}:{}{}, {} build, {} [{}] [{}])", 118 | env!("CARGO_PKG_NAME"), 119 | env!("CARGO_PKG_VERSION"), 120 | get_branch_name(), 121 | get_commit_hash(), 122 | if is_working_tree_clean() { "" } else { "+" }, 123 | BUILD_TYPE, 124 | OS, 125 | ARCH, 126 | get_rustc_version() 127 | ); 128 | 129 | println!("cargo:rustc-env=VERSION_STRING={}", version_string); 130 | } else { 131 | let version_string = format!( 132 | "{} {} (bioconda {} build ({}:{}{}), {} [{}] [{}])", 133 | env!("CARGO_PKG_NAME"), 134 | env!("CARGO_PKG_VERSION"), 135 | BUILD_TYPE, 136 | get_branch_name(), 137 | get_commit_hash(), 138 | if is_working_tree_clean() { "" } else { "+" }, 139 | OS, 140 | ARCH, 141 | get_rustc_version() 142 | ); 143 | println!("cargo:rustc-env=VERSION_STRING={}", version_string); 144 | } 145 | } 146 | 147 | fn get_rustc_version() -> String { 148 | let output = Command::new("rustc") 149 | .arg("--version") 150 | .current_dir(env!("CARGO_MANIFEST_DIR")) 151 | .output() 152 | .unwrap(); 153 | 154 | assert!(output.status.success()); 155 | 156 | String::from_utf8_lossy(&output.stdout) 157 | .trim_end() 158 | .to_string() 159 | } 160 | 161 | fn get_commit_hash() -> String { 162 | let output = Command::new("git") 163 | .arg("log") 164 | .arg("-1") 165 | .arg("--pretty=format:%h") // Abbreviated commit hash 166 | // .arg("--pretty=format:%H") // Full commit hash 167 | .current_dir(env!("CARGO_MANIFEST_DIR")) 168 | .output() 169 | .unwrap(); 170 | 171 | // assert!(output.status.success()); 172 | if output.status.success() { 173 | String::from_utf8_lossy(&output.stdout).to_string() 174 | } else { 175 | String::from("bioconda") 176 | } 177 | } 178 | 179 | fn get_branch_name() -> String { 180 | let output = Command::new("git") 181 | .arg("rev-parse") 182 | .arg("--abbrev-ref") 183 | .arg("HEAD") 184 | .current_dir(env!("CARGO_MANIFEST_DIR")) 185 | .output() 186 | .unwrap(); 187 | 188 | //assert!(output.status.success()); 189 | if output.status.success() { 190 | String::from_utf8_lossy(&output.stdout) 191 | .trim_end() 192 | .to_string() 193 | } else { 194 | String::from("bioconda") 195 | } 196 | } 197 | 198 | fn is_working_tree_clean() -> bool { 199 | let status = Command::new("git") 200 | .arg("diff") 201 | .arg("--quiet") 202 | .arg("--exit-code") 203 | .current_dir(env!("CARGO_MANIFEST_DIR")) 204 | .status() 205 | .unwrap(); 206 | 207 | if status.success() { 208 | status.code().unwrap() == 0 209 | } else { 210 | true 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /pgr-db/src/bindings.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_upper_case_globals)] 2 | #![allow(non_camel_case_types)] 3 | #![allow(non_snake_case)] 4 | #[cfg(feature = "with_agc")] 5 | include!(concat!(env!("OUT_DIR"), "/bindings.rs")); 6 | -------------------------------------------------------------------------------- /pgr-db/src/gff_db.rs: -------------------------------------------------------------------------------- 1 | use bgzip::BGZFReader; 2 | use rustc_hash::FxHashMap; 3 | use serde::{Deserialize, Serialize}; 4 | use serde_json; 5 | use std::char; 6 | use std::fmt; 7 | use std::fs::File; 8 | use std::io::BufRead; 9 | use 
std::path::Path;
10 | use std::rc::Rc;
11 |
12 | #[derive(Debug, Clone, Deserialize, Serialize)]
13 | pub struct GFFRecord {
14 | pub seqid: String,
15 | pub source: String,
16 | #[serde(rename = "type")]
17 | pub type_name: String,
18 | pub bgn: u32,
19 | pub end: u32,
20 | pub score: Option<f32>,
21 | pub strand: char,
22 | pub phase: Option<u8>,
23 | pub attributes: FxHashMap<String, String>,
24 | }
25 |
26 | impl GFFRecord {
27 | pub fn from_line(line: &str) -> GFFRecord {
28 | let fields = line
29 | .trim_end()
30 | .split('\t')
31 | .into_iter()
32 | .map(|s| s.to_string())
33 | .collect::<Vec<String>>();
34 | GFFRecord::from_fields(&fields)
35 | }
36 |
37 | pub fn from_fields(fields: &[String]) -> GFFRecord {
38 | let seqid = fields[0].clone();
39 | let source = fields[1].clone();
40 | let type_ = fields[2].clone();
41 | let bgn = fields[3]
42 | .parse::<u32>()
43 | .expect("can't parse the start coordinate");
44 | let end = fields[4]
45 | .parse::<u32>()
46 | .expect("can't parse the end coordinate");
47 | let score = match fields[5].as_str() {
48 | "." => None,
49 | s => Some(s.parse::<f32>().expect("can't parse score")),
50 | };
51 |
52 | let strand = fields[6][0..1].chars().next().unwrap();
53 | let phase = match fields[7].as_str() {
54 | "." => None,
55 | s => Some(s.parse::<u8>().unwrap_or_else(|_| {
56 | panic!("fail to parse the phase field {}", fields[7].as_str())
57 | })),
58 | };
59 | let attributes = fields[8]
60 | .split(';')
61 | .into_iter()
62 | .map(|s| {
63 | let kv = s
64 | .split('=')
65 | .into_iter()
66 | .map(|s| s.to_string())
67 | .collect::<Vec<String>>();
68 | if kv.len() != 2 {
69 | panic!("error parsing attributes")
70 | };
71 | (kv[0].clone(), kv[1].clone())
72 | })
73 | .collect::<FxHashMap<String, String>>();
74 |
75 | Self {
76 | seqid,
77 | source,
78 | type_name: type_,
79 | bgn,
80 | end,
81 | score,
82 | strand,
83 | phase,
84 | attributes,
85 | }
86 | }
87 | }
88 |
89 | impl fmt::Display for GFFRecord {
90 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
91 | let mut out = vec![];
92 | out.push(format!(
93 | "{}\t{}\t{}\t{}\t{}",
94 | self.seqid, self.source, self.type_name, self.bgn, self.end
95 | ));
96 |
97 | out.push(if self.score.is_none() {
98 | ".".to_string()
99 | } else {
100 | format!("{}", self.score.unwrap())
101 | });
102 |
103 | out.push(format!("{}", self.strand));
104 |
105 | out.push(if self.phase.is_none() {
106 | ".".to_string()
107 | } else {
108 | format!("{}", self.phase.unwrap())
109 | });
110 |
111 | out.push(
112 | self.attributes
113 | .iter()
114 | .map(|(k, v)| format!("{}={}", k, v))
115 | .collect::<Vec<String>>()
116 | .join(";"),
117 | );
118 |
119 | write!(f, "{}", out.join("\t"))
120 | }
121 | }
122 |
123 | type IdToGffRec = FxHashMap<String, Rc<GFFRecord>>;
124 | type IdToChildren = FxHashMap<String, Vec<Rc<GFFRecord>>>;
125 | type NameToGffRec = FxHashMap<String, Rc<GFFRecord>>;
126 |
127 | #[derive(Debug, Clone, Deserialize, Serialize)]
128 | pub struct GFFDB {
129 | pub header: Vec<String>,
130 | pub records: Vec<Rc<GFFRecord>>,
131 | pub id_to_rec: IdToGffRec,
132 | pub name_to_rec: NameToGffRec,
133 | pub children: IdToChildren,
134 | }
135 |
136 | impl GFFDB {
137 | pub fn from_bgzip_file(filepath: &Path) -> std::io::Result<GFFDB> {
138 | let file = BGZFReader::new(File::open(filepath)?);
139 | let mut header = Vec::<String>::new();
140 | let mut records = Vec::<Rc<GFFRecord>>::new();
141 | let mut id_to_rec = IdToGffRec::default();
142 | let mut name_to_rec = NameToGffRec::default();
143 | let mut children = IdToChildren::default();
144 | file.lines().into_iter().for_each(|line| {
145 | let line = line.unwrap();
146 | if &line[0..1] != "#" {
147 | let rec = Rc::new(GFFRecord::from_line(&line));
148 |
records.push(rec.clone());
149 |
150 | if rec.attributes.contains_key("ID") {
151 | let id = rec.attributes.get("ID").unwrap();
152 | id_to_rec.insert(id.clone(), rec.clone());
153 | }
154 | if rec.attributes.contains_key("Name") {
155 | let name = rec.attributes.get("Name").unwrap();
156 | name_to_rec.insert(name.clone(), rec.clone());
157 | }
158 | if rec.attributes.contains_key("Parent") {
159 | let parent_id = rec.attributes.get("Parent").unwrap();
160 | children
161 | .entry(parent_id.clone())
162 | .or_insert_with(Vec::new)
163 | .push(rec.clone());
164 | }
165 | } else {
166 | header.push(line);
167 | }
168 | });
169 | Ok(GFFDB {
170 | header,
171 | records,
172 | id_to_rec,
173 | name_to_rec,
174 | children,
175 | })
176 | }
177 |
178 | pub fn from_list_of_fields(list_of_fields: &[Vec<String>]) -> GFFDB {
179 | let header = Vec::<String>::new();
180 | let mut records = Vec::<Rc<GFFRecord>>::new();
181 | let mut id_to_rec = IdToGffRec::default();
182 | let mut name_to_rec = NameToGffRec::default();
183 | let mut children = IdToChildren::default();
184 |
185 | list_of_fields.iter().for_each(|fields| {
186 | let rec = Rc::new(GFFRecord::from_fields(fields));
187 | records.push(rec.clone());
188 |
189 | if rec.attributes.contains_key("ID") {
190 | let id = rec.attributes.get("ID").unwrap();
191 | id_to_rec.insert(id.clone(), rec.clone());
192 | }
193 | if rec.attributes.contains_key("Name") {
194 | let name = rec.attributes.get("Name").unwrap();
195 | name_to_rec.insert(name.clone(), rec.clone());
196 | }
197 | if rec.attributes.contains_key("Parent") {
198 | let parent_id = rec.attributes.get("Parent").unwrap();
199 | children
200 | .entry(parent_id.clone())
201 | .or_insert_with(Vec::new)
202 | .push(rec.clone());
203 | }
204 | });
205 |
206 | GFFDB {
207 | header,
208 | records,
209 | id_to_rec,
210 | name_to_rec,
211 | children,
212 | }
213 | }
214 |
215 | pub fn get_all_offspring(
216 | &self,
217 | id_or_name: &String,
218 | recursive: bool,
219 | ) -> Option<Vec<Rc<GFFRecord>>> {
220 | let mut all_offspring = Vec::<Rc<GFFRecord>>::new();
221 |
222 | let id = if self.id_to_rec.contains_key(id_or_name) {
223 | Some(id_or_name)
224 | } else if self.name_to_rec.contains_key(id_or_name) {
225 | let r = self.name_to_rec.get(id_or_name).unwrap();
226 | r.attributes.get("ID")
227 | } else {
228 | None
229 | };
230 |
231 | id?;
232 |
233 | let id = id.unwrap();
234 | match self.children.get(id) {
235 | Some(children) => {
236 | children.iter().for_each(|r| {
237 | if recursive && r.attributes.contains_key("ID") {
238 | let id = r.attributes.get("ID").unwrap();
239 | if let Some(more_offsprings) = self.get_all_offspring(id, recursive) {
240 | more_offsprings.iter().for_each(|r| {
241 | all_offspring.push(r.clone());
242 | });
243 | }
244 | }
245 | all_offspring.push(r.clone());
246 | });
247 | Some(all_offspring)
248 | }
249 | None => None,
250 | }
251 | }
252 |
253 | pub fn dump_json(&self) {
254 | println!("{}", serde_json::to_string(&self).unwrap());
255 | }
256 |
257 | pub fn load_json(s: &str) -> serde_json::Result<GFFDB> {
258 | let gffdb: GFFDB = serde_json::from_str(s)?;
259 | Ok(gffdb)
260 | }
261 | }
262 |
263 | #[derive(Debug, Clone, Deserialize, Serialize)]
264 | pub struct QueryOut {
265 | parent: Rc<GFFRecord>,
266 | offspring: Vec<Rc<GFFRecord>>,
267 | }
268 |
269 | #[cfg(test)]
270 | mod test {
271 | use super::*;
272 |
273 | #[test]
274 | fn test_gff_to_db() {
275 | let res = super::GFFDB::from_bgzip_file(&Path::new("./test/test_data/test.gff3.gz"));
276 | let gdb = res.unwrap();
277 | println!("{}", gdb.header.join("\n"));
278 | let r =
gdb.name_to_rec.get(&"FLG".to_string()).unwrap();
279 | let parent = r.clone();
280 | let mut offspring = Vec::<Rc<GFFRecord>>::new();
281 | println!("{}", r);
282 | gdb.get_all_offspring(&"FLG".to_string(), true)
283 | .unwrap()
284 | .into_iter()
285 | .for_each(|r| {
286 | println!("{}", r);
287 | offspring.push(r.clone());
288 | });
289 | let qr = QueryOut { parent, offspring };
290 | println!("{}", serde_json::to_string(&qr).unwrap());
291 | }
292 | }
293 | -------------------------------------------------------------------------------- /pgr-db/src/kmer_filter.rs: --------------------------------------------------------------------------------
1 | use cuckoofilter::CuckooFilter;
2 | use rustc_hash::FxHashSet;
3 | use std::collections::hash_map::DefaultHasher;
4 |
5 | pub struct KmerFilter {
6 | filter: CuckooFilter<DefaultHasher>,
7 | kmer_size: usize,
8 |
9 | }
10 |
11 | impl KmerFilter {
12 | pub fn new(kmer_size: usize) -> Self {
13 | let filter = CuckooFilter::new();
14 | KmerFilter { filter, kmer_size }
15 | }
16 |
17 | pub fn with_capacity(kmer_size: usize, capacity: usize) -> Self {
18 | let filter = CuckooFilter::with_capacity(capacity);
19 | KmerFilter { filter, kmer_size }
20 | }
21 | }
22 |
23 | impl KmerFilter {
24 | pub fn add_seq(&mut self, seq: &Vec<u8>) {
25 | (0..seq.len() - self.kmer_size).for_each(|pos| {
26 | self.filter.test_and_add(&seq[pos..pos + self.kmer_size]).unwrap();
27 | })
28 | }
29 |
30 | pub fn check_seq(&self, seq: &Vec<u8>) -> usize {
31 | let mut count = 0_usize;
32 | (0..seq.len() - self.kmer_size).for_each(|pos| {
33 | if self.filter.contains(&seq[pos..pos + self.kmer_size]) {
34 | count += 1
35 | };
36 | });
37 | count
38 | }
39 |
40 | pub fn add_seq_mmers(&mut self, seq: &Vec<u8>) {
41 | let k = self.kmer_size as u32;
42 | let w = k >> 1;
43 | let shmmrs = crate::shmmrutils::sequence_to_shmmrs1(0, seq, w, k, 1, 0, false);
44 | shmmrs.into_iter().for_each(|mmer| {
45 | self.filter.test_and_add(&mmer.x).unwrap();
46 | })
47 | }
48 |
49 | pub fn check_seq_mmers(&self, seq: &Vec<u8>) -> (usize, usize) {
50 | let mut count = 0_usize;
51 | let k = self.kmer_size as u32;
52 | let w = k >> 1;
53 | let shmmrs = crate::shmmrutils::sequence_to_shmmrs1(0, seq, w, k, 1, 0, false);
54 | shmmrs.iter().for_each(|mmer| {
55 | if self.filter.contains(&mmer.x) {
56 | count += 1
57 | };
58 | });
59 | (shmmrs.len(), count)
60 | }
61 | }
62 |
63 | pub struct MinimizerFilter {
64 | filter: FxHashSet<u64>,
65 | kmer_size: usize,
66 |
67 | }
68 |
69 | impl MinimizerFilter {
70 | pub fn new(kmer_size: usize) -> Self {
71 | let filter = FxHashSet::default();
72 | MinimizerFilter { filter, kmer_size }
73 | }
74 | }
75 |
76 | impl MinimizerFilter {
77 |
78 | pub fn add_seq_mmers(&mut self, seq: &Vec<u8>) {
79 | let k = self.kmer_size as u32;
80 | let w = k >> 1;
81 | let shmmrs = crate::shmmrutils::sequence_to_shmmrs1(0, seq, w, k, 1, 0, false);
82 | shmmrs.into_iter().for_each(|mmer| {
83 | self.filter.insert(mmer.x);
84 | })
85 | }
86 |
87 | pub fn check_seq_mmers(&self, seq: &Vec<u8>) -> (usize, usize) {
88 | let mut count = 0_usize;
89 | let k = self.kmer_size as u32;
90 | let w = k >> 1;
91 | let shmmrs = crate::shmmrutils::sequence_to_shmmrs1(0, seq, w, k, 1, 0, false);
92 | shmmrs.iter().for_each(|mmer| {
93 | if self.filter.contains(&mmer.x) {
94 | count += 1
95 | };
96 | });
97 | (shmmrs.len(), count)
98 | }
99 | }
100 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/consensus_test2.fa: --------------------------------------------------------------------------------
1 | >ref 2 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCCCGCAGGTCCGCAGGGGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGCTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 3 | >0 4 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCCCGCAGGTCCGCAGGGGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGCTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 5 | >6 6 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGCGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 7 | >13 8 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCCCGCAGGTCCGCAGGGGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCCAGCACTTGCTTTCTCCCCCAAAAATGAGCTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 9 | >16 10 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 11 | >22 12 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCCCGCAGGTCCGCAGGGGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGCTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 13 | >25 14 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 15 | >31 16 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCCCGCAGGTCCGCAGGGGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGCTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 17 | >33 18 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 19 | >50 20 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 21 | >52 22 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCCCGCAGGTCCGCAGGGGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGCTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 23 | >56 24 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 25 | >59 26 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 27 | >63 28 | 
ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCCCGCAGGTCCGCAGGGGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGCTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 29 | >77 30 | ACTGCCAGGAGCGCCTCACCCCTCACCTCCAGTCTCCTGGGCCATTGCCATGAGGTTGTTGTGGGCAACACCGAGCAGGTCCACAGGCGCCTTGGTCAGTGTCAGGGCATAGGCCGTGATGGCAGCTGCGTGGGCACCCAGGAGCCCAGCACTTGCTTTCTCCCCCAAAAATGAGTTTGCCTTTGAGATGGAGGCTTCCTGGAAGAAAACGGGAGGAGGGTCTTGGGCCTGGACCCCTGGGTTCCTGAG 31 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/gen_agc.sh: -------------------------------------------------------------------------------- 1 | ~/pgr/agc/agc create test_agc_ref.fa test_agc_seqs.fa > test.agc 2 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/gen_frag_db.py: -------------------------------------------------------------------------------- 1 | import pgrtk 2 | sdb = pgrtk.SeqIndexDB() 3 | sdb.load_from_fastx("test_seqs.fa") 4 | sdb.write_frag_and_index_files("test_seqs_frag") 5 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/seq0: -------------------------------------------------------------------------------- 1 | TCCATTCCCACCAGCAGTGTGTGAAAGTCTGGTACTGGTTCAGCCTGCCGTACTTTAATGATTATTGGTGTCACTCTTTCAAGTAACTTGTTGGTAATAAGAAGTCAATTA 2 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/seq1: -------------------------------------------------------------------------------- 1 | TCCATTCCCACCAGCAGTGTGTGAAGGTTCAGCCTGCCGTACTTTAATGATTATTGGTGACACTCTTTCAAGTAACTTGTTGGTAATATTTATCTAAGAAGTCAATTA 2 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/test.agc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/pgr-db/test/test_data/test.agc -------------------------------------------------------------------------------- /pgr-db/test/test_data/test.gff3.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/pgr-db/test/test_data/test.gff3.gz -------------------------------------------------------------------------------- /pgr-db/test/test_data/test_agc_ref.fa: -------------------------------------------------------------------------------- 1 | >NA21309#1#JAHEPC010000026.1:3279880-3319873 2 | 
CTCAGGGCCCTGACGGGCGTCTTGCCATGCTGCTCCTGGGCCTGCTGCTGCTGCTGCCCCTGCTGGCTGGCGCCCGCCTGCTGTGGAACTGGTGGAAGCTCCGGAGCCTCCACCTCCTGCCTCTTGCCCCGGGCTTCTTGCACCTGCTGCAGCCCGACCTCCCCATCTATCTGCTTGGCCTGACTCAGAAATTCGGGCCCATCTACAGGCTCCACCTTGGGCTGCAAGGTGAGAGGCTGATCTCGCTCTGGCCCTCACCATAGGAGGGGGCGGAGGTGACGGAGAGGGTCCTCTCTCCGCTGACGCTGCTTTGGCTGTCTCCCAGATGTGGTGGTGCTGAACTCCAAGAGGACCATTGAGGAAGCCATGGTCAAAAAGTGGGCAGACTTTGCTGGCAGACCTGAGCCACTTACCTGTAAGGGCTGGGGGCATTTTTTCTTTCTTAAACAAATTTTTTTTTTGTTAGAGATGGGGTCTTGCTATGTTGCCCAGGCTGGTCTTGAATTCCTGGTCTCAAGTGATCCTCCCACCTCGGCCTCAAGTGGGAGCCACCTTCGGGGGCTTCCCCAATCCTCCAGGTCACTGGAAGCTCTTGGGGGGCATATCTTCAGGAGAAGAAGCAGGTGTTGAGGAGGCAGAAGAAGGTCAGGCCCTCGGCTTCCTTGGTCAGTTCCCACCCTCCAGCCCCCAGCTCCTCCTGCAGACAAGCTGGTGTCTAAGAACTACCCGGACCTGTCGTTGGTCTCTGCTCTGGAAAGCCCACAAGAAGCTCACCCGCTCAGCCCTGCTGCTGGGCATCCGTGACTCCATGGAGCCAGTGGTGGAGCAGCTGACCCAGGAGTTCTGTGAGGTAAGGCTGGGCTCCTGAGGCCACCTCGGGTCAGCCTCGCCTCTCACAGTAGCCCCCGCCCTGCCCGCTGCACAGCGGCCTGCTGAACTCACACTGTTTCTCCACAGCGCATGAGAGCCCAGCCCGGCACCCCTGTGGCCATTGAGGAGGAATTCTCTCTCCTCACCTGCAGCATCAACTGTTACCTCACCTTCGGAGACAAGATCAAGGTGCCTCACAGCCCCTCAGGCCCACCCCCAGCCCCTCCCTGAGCCTCTCCTTGTCCTGAACTGAAAGTACTCCCTCCTTTCCTGGCAGGAGGACAACTTAATGCCTGCCTATTACAAATGTATCCAGGAGGTGTTAAAAACCTGGAGCCACTGGTCCATCCAAATTGTGGACGTGATTCCCTTTCTCAGGGTGAGGACCTGGAGCCTAGACACCCCTGGATTGTGGGGGAGAGGCTGGGGTGGAGGGAGAGGCTCCTTCCCACAGCTGCATTCTCATGCTTCCTGCCGCAGTTCTTCCCCAATCCAGGTCTCCGGAGGCTGAAGCAGGCCATAGAGAAGAGGGACCACAACGAGGAGAAGCAGCTGAGGCAGCACAAGGTGGGGACTGTGTGTGGACGGCCTCCCCTCGGCCCACAGCCAGTGATGCTACCGGCCTCAGCATTGCTATGAGGCGGGTTCTTTTGCATACCCCAGTTATGGGCCTGTTGCCACTCTGTACTCCTCTCCCCAGGCCAGCCGCTCAGCCCGCTCCTTTCACCCTCTGCAGGAGAGCCTGGTGGCAGGCCAGTGGAGGGACATGATGGACTACATGCTCCAAGGGGTGGCGCAGCCGAGCATGGAAGAGGGCTCCGGACAGCTCCTGGAAGGGCACTTGCACATGGCTGCAGTGGACCTCCTGATCGGTGGCACTGAGACCACAGCAAACACCCTCTCCTGGGCCGTGGTTTTTTTTGCTTCACCACCCTGAGGTGCGTCCTGCGGACAAGCAAAAGGCTCCTTCCCAGCAACCTGGCCAGGGCGGTGGGCACCCTCACTCAGCTCTGAGCACTGTGCGGCTGGGGCTGTGCTTGCCTCACCGGCACTCAGGCTCACTGGGTTGCTGAGGGAGCGGCTGGAGGCTGGGCAGCTGTGGGCTGCTGGGGCAGGACTCCACCCGATCATTCCCCAGATTCAGCAGCGACTGCAGGAGGAGCTAGACCACGAACTGGGCCCTGGTGCCTCCAGCTCCCGGGTCCCCTACAAGGACCGTGCACGGCTGCCCTTGCTCAATGCCACCATCGCCGAGGTGCTGCGCCTGCGGCCCGTTGTGCCCTTAGCCTTGCCCCACCGCACCACACGGCCCAGCAGGTGACTCCCGAGGGTTGGGGATGAGTGAGGAAAGCCCGAGCCCAGGGAGGTCCTGGCCAGCCTCTAACTCCAGCCCCCTTCAGCATCTCCGGCTACGACATCCCTGAGGGCACAGTCATCATTCCGAACCTCCAAGGCGCCCACCTGGATGAGACGGTCTGGGAGAGGCCACATGAGTTCTGGCCTGGTATGTGGGGGGCCGGGGGCCTGCCGTGAAAATGTGGTGGAGGCTGGTCCCCGCTGCCGCTGAACGCCTCCCCACCCACCTGTCCACCCGCCCGCAGATCGCTTCCTGGAGCCAGGCAAGAACTCCAGAGCTCTGGCCTTCGGCTGCGGTGCCCGCGTGTGCCTGGGCGAGCCGCTGGCGCGCCTGGAGCTCTTCGTGGTGCTGACCCGACTGCTGCAGGCCTTCACGCTGCTGCCCTCCGGGGACGCCCTGCCCTCCCTGCAGCCCCTGCCCCACTGCAGTGTCATCCTCAAGATGCAGCCTTTCCAAGTGCGGCTGCAGCCCCGGGGGATGGGGGCCCACAGCCCGGGCCAGAGCCAGTGATGGGGCAGGACCGATGCCAGCCGGGTACCTCAGTTTCTCCTTTATTGCTCCTGTACGAACCCCTCCCCTCCCCCCTGTAAACACAGTGCTGCGAGATCGCTGGCAGAGAAGGCTTCCTCCAGCGGCTGGGTGGTGAAGGACCCTGGCTCTTCTCTCGGGGCGACCCCTCAGTGCTCGGCAGTCATACTGGGGTGCGAGAGAGGTGGGCAGCAGCTCAGCCTCCCCCCGCTGGGGAGCGAAAGTTTCTTGGTCTCAGCTTCATTTCCGTGAAGGGCACCGAGAACTCGAAGCCCTTCCAGTGGTACCAGCTCACTCCCTGGGAAAGGGGTTGTCAAGAGAGAGTCAAAGCCGGATGTCCCATCTGCTCCTCCCGTTCCCCTTAAGGAGGTGGCTCCCAGCACTCAACCAACCTCCCCGCAGAGCTCCCTTCCTGACCCTCTGCCGCAGAGGATTGAGGCTTAATCCTGAGCTGGTCCTTTCCAGCCAATAAATCAACTCCAGCTCCCTCTGCGAGGCTGGCATGATTGTTCCATTTCACCCAGCCGCTCAGTCCCTTGCCTGTTACACTGTGGGGCTGAAACCTAGGCAGGCCGAGCCCCAGCCACCCCAGCTCTGAGCCGCCTCCCCACCCCTCACCTGATGGTCCAC 3 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/test_rev.fa: 
-------------------------------------------------------------------------------- 1 | >NA21309#1#JAHEPC010000026.1:3279880-3319873 2 | CTCAGGGCCCTGACGGGCGTCTTGCCATGCTGCTCCTGGGCCTGCTGCTGCTGCTGCCCCTGCTGGCTGGCGCCCGCCTGCTGTGGAACTGGTGGAAGCTCCGGAGCCTCCACCTCCTGCCTCTTGCCCCGGGCTTCTTGCACCTGCTGCAGCCCGACCTCCCCATCTATCTGCTTGGCCTGACTCAGAAATTCGGGCCCATCTACAGGCTCCACCTTGGGCTGCAAGGTGAGAGGCTGATCTCGCTCTGGCCCTCACCATAGGAGGGGGCGGAGGTGACGGAGAGGGTCCTCTCTCCGCTGACGCTGCTTTGGCTGTCTCCCAGATGTGGTGGTGCTGAACTCCAAGAGGACCATTGAGGAAGCCATGGTCAAAAAGTGGGCAGACTTTGCTGGCAGACCTGAGCCACTTACCTGTAAGGGCTGGGGGCATTTTTTCTTTCTTAAACAAATTTTTTTTTTGTTAGAGATGGGGTCTTGCTATGTTGCCCAGGCTGGTCTTGAATTCCTGGTCTCAAGTGATCCTCCCACCTCGGCCTCAAGTGGGAGCCACCTTCGGGGGCTTCCCCAATCCTCCAGGTCACTGGAAGCTCTTGGGGGGCATATCTTCAGGAGAAGAAGCAGGTGTTGAGGAGGCAGAAGAAGGTCAGGCCCTCGGCTTCCTTGGTCAGTTCCCACCCTCCAGCCCCCAGCTCCTCCTGCAGACAAGCTGGTGTCTAAGAACTACCCGGACCTGTCGTTGGTCTCTGCTCTGGAAAGCCCACAAGAAGCTCACCCGCTCAGCCCTGCTGCTGGGCATCCGTGACTCCATGGAGCCAGTGGTGGAGCAGCTGACCCAGGAGTTCTGTGAGGTAAGGCTGGGCTCCTGAGGCCACCTCGGGTCAGCCTCGCCTCTCACAGTAGCCCCCGCCCTGCCCGCTGCACAGCGGCCTGCTGAACTCACACTGTTTCTCCACAGCGCATGAGAGCCCAGCCCGGCACCCCTGTGGCCATTGAGGAGGAATTCTCTCTCCTCACCTGCAGCATCAACTGTTACCTCACCTTCGGAGACAAGATCAAGGTGCCTCACAGCCCCTCAGGCCCACCCCCAGCCCCTCCCTGAGCCTCTCCTTGTCCTGAACTGAAAGTACTCCCTCCTTTCCTGGCAGGAGGACAACTTAATGCCTGCCTATTACAAATGTATCCAGGAGGTGTTAAAAACCTGGAGCCACTGGTCCATCCAAATTGTGGACGTGATTCCCTTTCTCAGGGTGAGGACCTGGAGCCTAGACACCCCTGGATTGTGGGGGAGAGGCTGGGGTGGAGGGAGAGGCTCCTTCCCACAGCTGCATTCTCATGCTTCCTGCCGCAGTTCTTCCCCAATCCAGGTCTCCGGAGGCTGAAGCAGGCCATAGAGAAGAGGGACCACAACGAGGAGAAGCAGCTGAGGCAGCACAAGGTGGGGACTGTGTGTGGACGGCCTCCCCTCGGCCCACAGCCAGTGATGCTACCGGCCTCAGCATTGCTATGAGGCGGGTTCTTTTGCATACCCCAGTTATGGGCCTGTTGCCACTCTGTACTCCTCTCCCCAGGCCAGCCGCTCAGCCCGCTCCTTTCACCCTCTGCAGGAGAGCCTGGTGGCAGGCCAGTGGAGGGACATGATGGACTACATGCTCCAAGGGGTGGCGCAGCCGAGCATGGAAGAGGGCTCCGGACAGCTCCTGGAAGGGCACTTGCACATGGCTGCAGTGGACCTCCTGATCGGTGGCACTGAGACCACAGCAAACACCCTCTCCTGGGCCGTGGTTTTTTTTGCTTCACCACCCTGAGGTGCGTCCTGCGGACAAGCAAAAGGCTCCTTCCCAGCAACCTGGCCAGGGCGGTGGGCACCCTCACTCAGCTCTGAGCACTGTGCGGCTGGGGCTGTGCTTGCCTCACCGGCACTCAGGCTCACTGGGTTGCTGAGGGAGCGGCTGGAGGCTGGGCAGCTGTGGGCTGCTGGGGCAGGACTCCACCCGATCATTCCCCAGATTCAGCAGCGACTGCAGGAGGAGCTAGACCACGAACTGGGCCCTGGTGCCTCCAGCTCCCGGGTCCCCTACAAGGACCGTGCACGGCTGCCCTTGCTCAATGCCACCATCGCCGAGGTGCTGCGCCTGCGGCCCGTTGTGCCCTTAGCCTTGCCCCACCGCACCACACGGCCCAGCAGGTGACTCCCGAGGGTTGGGGATGAGTGAGGAAAGCCCGAGCCCAGGGAGGTCCTGGCCAGCCTCTAACTCCAGCCCCCTTCAGCATCTCCGGCTACGACATCCCTGAGGGCACAGTCATCATTCCGAACCTCCAAGGCGCCCACCTGGATGAGACGGTCTGGGAGAGGCCACATGAGTTCTGGCCTGGTATGTGGGGGGCCGGGGGCCTGCCGTGAAAATGTGGTGGAGGCTGGTCCCCGCTGCCGCTGAACGCCTCCCCACCCACCTGTCCACCCGCCCGCAGATCGCTTCCTGGAGCCAGGCAAGAACTCCAGAGCTCTGGCCTTCGGCTGCGGTGCCCGCGTGTGCCTGGGCGAGCCGCTGGCGCGCCTGGAGCTCTTCGTGGTGCTGACCCGACTGCTGCAGGCCTTCACGCTGCTGCCCTCCGGGGACGCCCTGCCCTCCCTGCAGCCCCTGCCCCACTGCAGTGTCATCCTCAAGATGCAGCCTTTCCAAGTGCGGCTGCAGCCCCGGGGGATGGGGGCCCACAGCCCGGGCCAGAGCCAGTGATGGGGCAGGACCGATGCCAGCCGGGTACCTCAGTTTCTCCTTTATTGCTCCTGTACGAACCCCTCCCCTCCCCCCTGTAAACACAGTGCTGCGAGATCGCTGGCAGAGAAGGCTTCCTCCAGCGGCTGGGTGGTGAAGGACCCTGGCTCTTCTCTCGGGGCGACCCCTCAGTGCTCGGCAGTCATACTGGGGTGCGAGAGAGGTGGGCAGCAGCTCAGCCTCCCCCCGCTGGGGAGCGAAAGTTTCTTGGTCTCAGCTTCATTTCCGTGAAGGGCACCGAGAACTCGAAGCCCTTCCAGTGGTACCAGCTCACTCCCTGGGAAAGGGGTTGTCAAGAGAGAGTCAAAGCCGGATGTCCCATCTGCTCCTCCCGTTCCCCTTAAGGAGGTGGCTCCCAGCACTCAACCAACCTCCCCGCAGAGCTCCCTTCCTGACCCTCTGCCGCAGAGGATTGAGGCTTAATCCTGAGCTGGTCCTTTCCAGCCAATAAATCAACTCCAGCTCCCTCTGCGAGGCTGGCATGATTGTTCCATTTCACCCAGCCGCTCAGTCCCTTGCCTGTTACACTGTGGGGCTGAAACCTAGGCAGGCCGAGCCCCAGCCACCCCAGCTCTGAGCCGCCTCCCCACCCCTCACCTGATGGTCCAC 3 | 
>NA21309#1#JAHEPC010000026.1:3279880-3319873_RC 4 | GTGGACCATCAGGTGAGGGGTGGGGAGGCGGCTCAGAGCTGGGGTGGCTGGGGCTCGGCCTGCCTAGGTTTCAGCCCCACAGTGTAACAGGCAAGGGACTGAGCGGCTGGGTGAAATGGAACAATCATGCCAGCCTCGCAGAGGGAGCTGGAGTTGATTTATTGGCTGGAAAGGACCAGCTCAGGATTAAGCCTCAATCCTCTGCGGCAGAGGGTCAGGAAGGGAGCTCTGCGGGGAGGTTGGTTGAGTGCTGGGAGCCACCTCCTTAAGGGGAACGGGAGGAGCAGATGGGACATCCGGCTTTGACTCTCTCTTGACAACCCCTTTCCCAGGGAGTGAGCTGGTACCACTGGAAGGGCTTCGAGTTCTCGGTGCCCTTCACGGAAATGAAGCTGAGACCAAGAAACTTTCGCTCCCCAGCGGGGGGAGGCTGAGCTGCTGCCCACCTCTCTCGCACCCCAGTATGACTGCCGAGCACTGAGGGGTCGCCCCGAGAGAAGAGCCAGGGTCCTTCACCACCCAGCCGCTGGAGGAAGCCTTCTCTGCCAGCGATCTCGCAGCACTGTGTTTACAGGGGGGAGGGGAGGGGTTCGTACAGGAGCAATAAAGGAGAAACTGAGGTACCCGGCTGGCATCGGTCCTGCCCCATCACTGGCTCTGGCCCGGGCTGTGGGCCCCCATCCCCCGGGGCTGCAGCCGCACTTGGAAAGGCTGCATCTTGAGGATGACACTGCAGTGGGGCAGGGGCTGCAGGGAGGGCAGGGCGTCCCCGGAGGGCAGCAGCGTGAAGGCCTGCAGCAGTCGGGTCAGCACCACGAAGAGCTCCAGGCGCGCCAGCGGCTCGCCCAGGCACACGCGGGCACCGCAGCCGAAGGCCAGAGCTCTGGAGTTCTTGCCTGGCTCCAGGAAGCGATCTGCGGGCGGGTGGACAGGTGGGTGGGGAGGCGTTCAGCGGCAGCGGGGACCAGCCTCCACCACATTTTCACGGCAGGCCCCCGGCCCCCCACATACCAGGCCAGAACTCATGTGGCCTCTCCCAGACCGTCTCATCCAGGTGGGCGCCTTGGAGGTTCGGAATGATGACTGTGCCCTCAGGGATGTCGTAGCCGGAGATGCTGAAGGGGGCTGGAGTTAGAGGCTGGCCAGGACCTCCCTGGGCTCGGGCTTTCCTCACTCATCCCCAACCCTCGGGAGTCACCTGCTGGGCCGTGTGGTGCGGTGGGGCAAGGCTAAGGGCACAACGGGCCGCAGGCGCAGCACCTCGGCGATGGTGGCATTGAGCAAGGGCAGCCGTGCACGGTCCTTGTAGGGGACCCGGGAGCTGGAGGCACCAGGGCCCAGTTCGTGGTCTAGCTCCTCCTGCAGTCGCTGCTGAATCTGGGGAATGATCGGGTGGAGTCCTGCCCCAGCAGCCCACAGCTGCCCAGCCTCCAGCCGCTCCCTCAGCAACCCAGTGAGCCTGAGTGCCGGTGAGGCAAGCACAGCCCCAGCCGCACAGTGCTCAGAGCTGAGTGAGGGTGCCCACCGCCCTGGCCAGGTTGCTGGGAAGGAGCCTTTTGCTTGTCCGCAGGACGCACCTCAGGGTGGTGAAGCAAAAAAAACCACGGCCCAGGAGAGGGTGTTTGCTGTGGTCTCAGTGCCACCGATCAGGAGGTCCACTGCAGCCATGTGCAAGTGCCCTTCCAGGAGCTGTCCGGAGCCCTCTTCCATGCTCGGCTGCGCCACCCCTTGGAGCATGTAGTCCATCATGTCCCTCCACTGGCCTGCCACCAGGCTCTCCTGCAGAGGGTGAAAGGAGCGGGCTGAGCGGCTGGCCTGGGGAGAGGAGTACAGAGTGGCAACAGGCCCATAACTGGGGTATGCAAAAGAACCCGCCTCATAGCAATGCTGAGGCCGGTAGCATCACTGGCTGTGGGCCGAGGGGAGGCCGTCCACACACAGTCCCCACCTTGTGCTGCCTCAGCTGCTTCTCCTCGTTGTGGTCCCTCTTCTCTATGGCCTGCTTCAGCCTCCGGAGACCTGGATTGGGGAAGAACTGCGGCAGGAAGCATGAGAATGCAGCTGTGGGAAGGAGCCTCTCCCTCCACCCCAGCCTCTCCCCCACAATCCAGGGGTGTCTAGGCTCCAGGTCCTCACCCTGAGAAAGGGAATCACGTCCACAATTTGGATGGACCAGTGGCTCCAGGTTTTTAACACCTCCTGGATACATTTGTAATAGGCAGGCATTAAGTTGTCCTCCTGCCAGGAAAGGAGGGAGTACTTTCAGTTCAGGACAAGGAGAGGCTCAGGGAGGGGCTGGGGGTGGGCCTGAGGGGCTGTGAGGCACCTTGATCTTGTCTCCGAAGGTGAGGTAACAGTTGATGCTGCAGGTGAGGAGAGAGAATTCCTCCTCAATGGCCACAGGGGTGCCGGGCTGGGCTCTCATGCGCTGTGGAGAAACAGTGTGAGTTCAGCAGGCCGCTGTGCAGCGGGCAGGGCGGGGGCTACTGTGAGAGGCGAGGCTGACCCGAGGTGGCCTCAGGAGCCCAGCCTTACCTCACAGAACTCCTGGGTCAGCTGCTCCACCACTGGCTCCATGGAGTCACGGATGCCCAGCAGCAGGGCTGAGCGGGTGAGCTTCTTGTGGGCTTTCCAGAGCAGAGACCAACGACAGGTCCGGGTAGTTCTTAGACACCAGCTTGTCTGCAGGAGGAGCTGGGGGCTGGAGGGTGGGAACTGACCAAGGAAGCCGAGGGCCTGACCTTCTTCTGCCTCCTCAACACCTGCTTCTTCTCCTGAAGATATGCCCCCCAAGAGCTTCCAGTGACCTGGAGGATTGGGGAAGCCCCCGAAGGTGGCTCCCACTTGAGGCCGAGGTGGGAGGATCACTTGAGACCAGGAATTCAAGACCAGCCTGGGCAACATAGCAAGACCCCATCTCTAACAAAAAAAAAATTTGTTTAAGAAAGAAAAAATGCCCCCAGCCCTTACAGGTAAGTGGCTCAGGTCTGCCAGCAAAGTCTGCCCACTTTTTGACCATGGCTTCCTCAATGGTCCTCTTGGAGTTCAGCACCACCACATCTGGGAGACAGCCAAAGCAGCGTCAGCGGAGAGAGGACCCTCTCCGTCACCTCCGCCCCCTCCTATGGTGAGGGCCAGAGCGAGATCAGCCTCTCACCTTGCAGCCCAAGGTGGAGCCTGTAGATGGGCCCGAATTTCTGAGTCAGGCCAAGCAGATAGATGGGGAGGTCGGGCTGCAGCAGGTGCAAGAAGCCCGGGGCAAGAGGCAGGAGGTGGAGGCTCCGGAGCTTCCACCAGTTCCACAGCAGGCGGGCGCCAGCCAGCAGGGGCAGCAGCAGCAGCAGGCCCAGGAGCAGCATGGCAAGACGCCCGTCAGGGCCCTGAG 5 | -------------------------------------------------------------------------------- 
/pgr-db/test/test_data/test_seqs2.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/pgr-db/test/test_data/test_seqs2.fa.gz -------------------------------------------------------------------------------- /pgr-db/test/test_data/test_seqs_frag.frg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/pgr-db/test/test_data/test_seqs_frag.frg -------------------------------------------------------------------------------- /pgr-db/test/test_data/test_seqs_frag.mdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/pgr-db/test/test_data/test_seqs_frag.mdb -------------------------------------------------------------------------------- /pgr-db/test/test_data/test_seqs_frag.midx: -------------------------------------------------------------------------------- 1 | 0 3385 NA21309#1#JAHEPC010000026.1:3279880-3319873 test_seqs.fa 2 | 1 3384 NA21309#2#JAHEPB010000021.1:3182493-3222484 test_seqs.fa 3 | 2 3385 NA20129#1#JAHEPE010000077.1:3268654-3307814 test_seqs.fa 4 | 3 3384 NA20129#2#JAHEPD010000054.1:24048449-24086959 test_seqs.fa 5 | 4 3385 NA19240#2#JAHEOL010000047.1:3346340-3411873 test_seqs.fa 6 | 5 3385 NA18906#1#JAHEOO010000017.1:29298030-29336539 test_seqs.fa 7 | 6 3385 HG03579#1#JAGYVU010000035.1:17853932-17892393 test_seqs.fa 8 | 7 3385 HG03540#1#JAGYVY010000082.1:17876501-17941376 test_seqs.fa 9 | 8 3385 HG03516#1#JAGYYT010000073.1:24039705-24078215 test_seqs.fa 10 | 9 3385 HG03516#2#JAGYYS010000003.1:32115852-32155015 test_seqs.fa 11 | 10 3385 HG03492#1#JAHEPI010000049.1:16807354-16852770 test_seqs.fa 12 | 11 3385 HG03486#2#JAHEOP010000002.1:3274978-3314140 test_seqs.fa 13 | 12 3385 HG03453#1#JAGYVW010000148.1:2385113-2424164 test_seqs.fa 14 | 13 3385 HG03098#1#JAHEPM010000086.1:23575752-23614804 test_seqs.fa 15 | 14 3385 HG02886#1#JAHAOU010000006.1:23436768-23475277 test_seqs.fa 16 | 15 3385 HG02818#2#JAHEOR010000019.1:17832149-17870658 test_seqs.fa 17 | 16 3384 HG02723#1#JAHEOU010000100.1:4894384-4934376 test_seqs.fa 18 | 17 3385 HG02723#2#JAHEOT010000107.1:24171657-24210709 test_seqs.fa 19 | 18 3385 HG02717#1#JAHAOS010000073.1:5257988-5297982 test_seqs.fa 20 | 19 3385 HG02717#2#JAHAOR010000061.1:24170153-24235031 test_seqs.fa 21 | 20 3385 HG02630#2#JAHAOP010000058.1:24157264-24195773 test_seqs.fa 22 | 21 3385 HG02622#1#JAHAOO010000042.1:28532698-28597579 test_seqs.fa 23 | 22 3385 HG02572#1#JAHAOW010000052.1:1097698-1136749 test_seqs.fa 24 | 23 3385 HG02559#1#JAGYVK010000047.1:32002843-32048367 test_seqs.fa 25 | 24 3385 HG02559#2#JAGYVJ010000064.1:31959199-32005559 test_seqs.fa 26 | 25 3385 HG02486#1#JAGYVM010000005.1:27346251-27384764 test_seqs.fa 27 | 26 3385 HG02257#1#JAGYVI010000022.1:28399996-28444874 test_seqs.fa 28 | 27 3385 HG02257#2#JAGYVH010000080.1:5254630-5293788 test_seqs.fa 29 | 28 3385 HG02148#1#JAHAMG010000076.1:24056708-24095759 test_seqs.fa 30 | 29 3385 HG02145#1#JAHKSG010000017.1:7890663-7936079 test_seqs.fa 31 | 30 3385 HG02109#1#JAHEPG010000124.1:3230658-3268212 test_seqs.fa 32 | 31 3385 HG02055#1#JAHEPK010000074.1:3335267-3375260 test_seqs.fa 33 | 32 3384 HG01978#1#JAGYVS010000035.1:32002155-32041308 test_seqs.fa 34 | 33 3385 HG01978#2#JAGYVR010000046.1:2481050-2558664 test_seqs.fa 35 | 34 3385 
HG01952#1#JAHAME010000044.1:28390439-28428944 test_seqs.fa 36 | 35 3384 HG01952#2#JAHAMD010000016.1:32003252-32042411 test_seqs.fa 37 | 36 3385 HG01928#1#JAGYVQ010000020.1:28403872-28448747 test_seqs.fa 38 | 37 3385 HG01928#2#JAGYVP010000017.1:31935974-31981618 test_seqs.fa 39 | 38 3385 HG01891#1#JAGYVO010000024.1:26903616-26942667 test_seqs.fa 40 | 39 3385 HG01361#1#JAGYYX010000108.1:7857845-7896895 test_seqs.fa 41 | 40 3386 HG01361#2#JAGYYW010000059.1:32011724-32051719 test_seqs.fa 42 | 41 3385 HG01358#1#JAGYZB010000008.1:7968591-8013467 test_seqs.fa 43 | 42 3385 HG01358#2#JAGYZA010000082.1:7871132-7910180 test_seqs.fa 44 | 43 3385 HG01258#1#JAGYYV010000066.1:28088097-28127147 test_seqs.fa 45 | 44 3385 HG01258#2#JAGYYU010000011.1:27163808-27202317 test_seqs.fa 46 | 45 3385 HG01243#1#JAHEOY010000117.1:3280591-3319753 test_seqs.fa 47 | 46 3385 HG01243#2#JAHEOX010000097.1:27566385-27611263 test_seqs.fa 48 | 47 3386 HG01175#2#JAHALZ010000032.1:24298713-24337223 test_seqs.fa 49 | 48 3385 HG01123#1#JAGYYZ010000057.1:26932659-26977537 test_seqs.fa 50 | 49 3385 HG01123#2#JAGYYY010000050.1:31983405-32028932 test_seqs.fa 51 | 50 3385 HG01109#1#JAHEPA010000084.1:27909663-27954539 test_seqs.fa 52 | 51 3385 HG01071#2#JAHBCE010000076.1:7804427-7869306 test_seqs.fa 53 | 52 3385 HG00741#1#JAHALY010000025.1:24278226-24317278 test_seqs.fa 54 | 53 3385 HG00741#2#JAHALX010000077.1:26562165-26600674 test_seqs.fa 55 | 54 3384 HG00735#1#JAHBCH010000013.1:32057725-32097711 test_seqs.fa 56 | 55 3385 HG00735#2#JAHBCG010000038.1:3345449-3385443 test_seqs.fa 57 | 56 3385 HG00733#1#JAHEPQ010000070.1:31971892-32017417 test_seqs.fa 58 | 57 3385 HG00673#1#JAHBBZ010000030.1:31864344-31910704 test_seqs.fa 59 | 58 3385 HG005#1#JAHEPO010000054.1:5156418-5202777 test_seqs.fa 60 | 59 3385 HG00621#2#JAHBCC010000005.1:31951291-31996816 test_seqs.fa 61 | 60 3385 HG005#2#JAHEPN010000064.1:6876299-6921824 test_seqs.fa 62 | 61 3382 HG002#1#JAHKSE010000066.1:5280272-5325794 test_seqs.fa 63 | 62 3385 HG002#2#JAHKSD010000045.1:27105329-27150207 test_seqs.fa 64 | 63 3385 GRCH38_chr6:32000466-32046826 test_seqs.fa 65 | 64 3385 chm13_chr6:31853672-31899197 test_seqs.fa 66 | 65 3385 RC_TEST test_seqs.fa 67 | -------------------------------------------------------------------------------- /pgr-db/test/test_data/test_seqs_frag.sdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschin/pgr-tk/af629426abff01f7d27c08de504534a93611b7c2/pgr-db/test/test_data/test_seqs_frag.sdx -------------------------------------------------------------------------------- /pgr-db/wrapper.h: -------------------------------------------------------------------------------- 1 | #include "../agc/src/lib-cxx/agc-api.h" 2 | -------------------------------------------------------------------------------- /pgr-tk-workstation/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:latest 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | 5 | RUN apt-get update \ 6 | && apt-get install -y --no-install-recommends \ 7 | curl \ 8 | git \ 9 | graphviz \ 10 | graphviz-dev \ 11 | gawk \ 12 | minimap2 \ 13 | samtools \ 14 | time \ 15 | wget \ 16 | pdf2svg \ 17 | awscli \ 18 | vim \ 19 | build-essential \ 20 | zlib1g \ 21 | zlib1g-dev \ 22 | libomp5 \ 23 | && rm -rf /var/lib/apt/lists/* 24 | 25 | 26 | RUN . 
/opt/conda/bin/activate && \
27 | conda install -y python=3.11 jupyterlab numpy networkx==2.4 matplotlib bokeh && conda clean -ya
28 |
29 | #RUN conda install -y --channel=conda-forge \
30 | # matplotlib \
31 | # numpy \
32 | # && conda clean -ya
33 |
34 | RUN pip3 install -U --no-cache-dir \
35 | networkx==2.8.2 \
36 | papermill==2.3.4 \
37 | pydot==1.4.2 \
38 | scikit-learn
39 |
40 | RUN echo deb http://http.us.debian.org/debian/ testing non-free contrib main > /etc/apt/sources.list
41 | RUN apt-get update
42 | RUN apt-get install -y libc6 libstdc++6
43 | RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/bin/../lib/libstdc++.so.6
44 |
45 | COPY pgrtk-0.6.0-cp311-cp311-linux_x86_64.whl /tmp
46 | RUN pip install /tmp/pgrtk-0.6.0-cp311-cp311-linux_x86_64.whl
47 | RUN rm /tmp/pgrtk-0.6.0-cp311-cp311-linux_x86_64.whl
48 | RUN mkdir -p /opt/bin/ /wd/
49 | COPY jupyterlab.sh /opt/bin/
50 | CMD /bin/bash /opt/bin/jupyterlab.sh
51 | -------------------------------------------------------------------------------- /pgr-tk-workstation/Readme.md: --------------------------------------------------------------------------------
1 |
2 | ## Introduction
3 |
4 | This directory contains the Dockerfile to build a docker image including
5 | pgr-tk and a Jupyter Lab server that can run the example notebooks for
6 | pangenome analysis.
7 |
8 | This tutorial assumes the user is familiar with a typical Linux environment and docker.
9 |
10 | ## Build the Docker Image
11 |
12 | After building the python wheel for version `0.x.y` (check the `../target/wheels`
13 | directory for the proper version `0.x.y`), copy it here and build the image.
14 | The image uses python3.8.
15 |
16 | ```
17 | cp ../target/wheels/pgrtk-0.x.y-cp38-cp38-linux_x86_64.whl .
18 | docker build -t pgr-tk-ws:v0.x.y .
19 | ```
20 |
21 | You can also use a prebuilt docker image by
22 |
23 | ```
24 | docker pull cschin/pgr-tk-ws:v0.x.y
25 | ```
26 | (check `https://hub.docker.com/r/cschin/pgr-tk-ws` for the latest version.)
27 |
28 |
29 | ## Set up environment
30 |
31 | In a directory where you have write permission,
32 |
33 | ```
34 | mkdir -p workdir
35 | cd workdir
36 | wget https://giab-data.s3.amazonaws.com/PGR-TK-Files/pgr-tk-HGRP-y1-evaluation-set-v0.tar
37 | wget https://giab-data.s3.amazonaws.com/PGR-TK-Files/pgr-tk-example-code.zip
38 | ```
39 |
40 | Untar the data file
41 | ```
42 | tar xvf pgr-tk-HGRP-y1-evaluation-set-v0.tar
43 | ```
44 |
45 | The tar ball contains the following data files
46 |
47 | ```
48 | data/
49 | data/pgr-tk-HGRP-y1-evaluation-set-v0.agc # HPRC year 1 47 genomes (94 haplotypes) + hg38 + hg19 + chm13 sequences in AGC format
50 | data/pgr-tk-HGRP-y1-evaluation-set-v0.mdb # the SHIMMER index into the sequences
51 | data/pgr-tk-HGRP-y1-evaluation-set-v0.midx # auxiliary index file for sequence names
52 | data/pgr-tk-HGRP-y1-evaluation-set-v0_input # file used to generate the index
53 | data/AMY1A_gfa_view.png # AMY1A GFA example
54 | ```
55 |
56 | Unzip the example notebooks
57 |
58 | ```
59 | mkdir -p code && pushd code
60 | unzip ../pgr-tk-example-code.zip
61 | popd
62 | ```
63 |
64 | Execute the Jupyter Lab server through docker
65 |
66 | ```
67 | docker run -v $PWD:/wd/ -p 8888:8888 pgr-tk-ws:v0.x.y
68 | ```
69 |
70 | or use the pre-built docker image.
71 |
72 | Then follow the instructions from the Jupyter Lab output to connect to
73 | the server from a browser.
74 |
75 | For analyzing the whole 97-haplotype human assembly set, it is suggested
76 | to have at least 64G RAM.
You may use a remote server with enough memory and
77 | connect to the server directly or through ssh tunneling.
78 |
79 |
80 |
81 |
82 |
83 |
84 |
-------------------------------------------------------------------------------- /pgr-tk-workstation/build.sh: --------------------------------------------------------------------------------
1 | cp ../target/wheels/pgrtk-0.6.0-cp311-cp311-linux_x86_64.whl .
2 | docker build -t cschin/pgr-tk-ws:v0.6.0 .
3 | -------------------------------------------------------------------------------- /pgr-tk-workstation/jupyterlab.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | mkdir -p /wd/results/
3 | mkdir -p /wd/code/
4 | mkdir -p /wd/data/
5 | ln -sf /wd/* /
6 | . /opt/conda/etc/profile.d/conda.sh
7 | jupyter-lab --ip="*" --allow-root --no-browser --port 8888 --NotebookApp.disable_check_xsrf=True /wd
8 | -------------------------------------------------------------------------------- /pgr-tk/Cargo.toml: --------------------------------------------------------------------------------
1 | [package]
2 | name = "pgrtk"
3 | version = "0.6.0"
4 | authors = ["Jason Chin "]
5 | edition = "2021"
6 |
7 |
8 | [lib]
9 | name = "pgrtk"
10 | crate-type = ["rlib","cdylib"]
11 |
12 | [dependencies]
13 | pyo3 = { version = "0.18.3", features = ["extension-module"] }
14 |
15 | pgr-db = { path = "../pgr-db/", default-features = false }
16 | rustc-hash = "1.1.0"
17 | rayon = "1.5.2"
18 | memmap2 = "0.5.10"
19 |
20 | [features]
21 | with_agc = []
22 | default = ["pgr-db/with_agc", "with_agc"]
23 | -------------------------------------------------------------------------------- /pgr-tk/Readme.md: --------------------------------------------------------------------------------
1 | A lightweight interface to the pgr-db crate -------------------------------------------------------------------------------- /pgr-tk/build.rs: --------------------------------------------------------------------------------
1 | // from https://vallentin.dev/2019/06/06/versioning
2 |
3 | use std::env::consts::{ARCH, OS};
4 | use std::process::Command;
5 |
6 | #[cfg(debug_assertions)]
7 | const BUILD_TYPE: &str = "debug";
8 | #[cfg(not(debug_assertions))]
9 | const BUILD_TYPE: &str = "release";
10 |
11 | fn main() {
12 | let branch_name = get_branch_name();
13 | if branch_name != String::from("bioconda") {
14 | let version_string = format!(
15 | "{} {} ({}:{}{}, {} build, {} [{}] [{}])",
16 | env!("CARGO_PKG_NAME"),
17 | env!("CARGO_PKG_VERSION"),
18 | get_branch_name(),
19 | get_commit_hash(),
20 | if is_working_tree_clean() { "" } else { "+" },
21 | BUILD_TYPE,
22 | OS,
23 | ARCH,
24 | get_rustc_version()
25 | );
26 |
27 | println!("cargo:rustc-env=VERSION_STRING={}", version_string);
28 | } else {
29 | let version_string = format!(
30 | "{} {} (bioconda {} build, {} [{}] [{}])",
31 | env!("CARGO_PKG_NAME"),
32 | env!("CARGO_PKG_VERSION"),
33 | BUILD_TYPE,
34 | OS,
35 | ARCH,
36 | get_rustc_version()
37 | );
38 | println!("cargo:rustc-env=VERSION_STRING={}", version_string);
39 | }
40 | }
41 |
42 | fn get_rustc_version() -> String {
43 | let output = Command::new("rustc")
44 | .arg("--version")
45 | .current_dir(env!("CARGO_MANIFEST_DIR"))
46 | .output()
47 | .unwrap();
48 |
49 | assert!(output.status.success());
50 |
51 | String::from_utf8_lossy(&output.stdout)
52 | .trim_end()
53 | .to_string()
54 | }
55 |
56 | fn get_commit_hash() -> String {
57 | let output = Command::new("git")
58 | .arg("log")
59 | .arg("-1")
60 |
.arg("--pretty=format:%h") // Abbreviated commit hash 61 | // .arg("--pretty=format:%H") // Full commit hash 62 | .current_dir(env!("CARGO_MANIFEST_DIR")) 63 | .output() 64 | .unwrap(); 65 | 66 | // assert!(output.status.success()); 67 | if output.status.success() { 68 | String::from_utf8_lossy(&output.stdout).to_string() 69 | } else { 70 | String::from("bioconda") 71 | } 72 | } 73 | 74 | fn get_branch_name() -> String { 75 | let output = Command::new("git") 76 | .arg("rev-parse") 77 | .arg("--abbrev-ref") 78 | .arg("HEAD") 79 | .current_dir(env!("CARGO_MANIFEST_DIR")) 80 | .output() 81 | .unwrap(); 82 | 83 | //assert!(output.status.success()); 84 | if output.status.success() { 85 | String::from_utf8_lossy(&output.stdout) 86 | .trim_end() 87 | .to_string() 88 | } else { 89 | String::from("bioconda") 90 | } 91 | } 92 | 93 | fn is_working_tree_clean() -> bool { 94 | let status = Command::new("git") 95 | .arg("diff") 96 | .arg("--quiet") 97 | .arg("--exit-code") 98 | .current_dir(env!("CARGO_MANIFEST_DIR")) 99 | .status() 100 | .unwrap(); 101 | 102 | if status.success() { 103 | status.code().unwrap() == 0 104 | } else { 105 | true 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /pgr-tk/build.sh: -------------------------------------------------------------------------------- 1 | maturin build --release --skip-auditwheel -i python3.8 2 | -------------------------------------------------------------------------------- /pgr-tk/examples/generate_gfa_for_CMRG.py: -------------------------------------------------------------------------------- 1 | import pgrlite 2 | import os 3 | import networkx as nx 4 | from networkx.drawing import nx_pydot 5 | from collections import Counter 6 | 7 | def generate_gfa(cmrg_regions, gene_name, pg_db, out_dir): 8 | gene_seq = cmrg_regions[gene_name][1] 9 | aln_range0 = pgrlite.query_sdb(pg_db, gene_seq, merge_range_tol=len(gene_seq) * 0.25) 10 | print("The number of hits for {} is {}".format((gene_name), len(aln_range0))) 11 | count = 0 12 | for sid, rgns in aln_range0.items(): 13 | count += len(rgns) 14 | 15 | print("The total aligned regions {} is {}".format(gene_name, count)) 16 | seq_info = pg_db.seq_info.copy() 17 | with open(os.path.join(out_dir, f"{gene_name}_hit.txt"), "w") as f: 18 | print("#source", "ctg", "len", "n_hit", sep="\t", file = f) 19 | for k in aln_range0: 20 | if len(aln_range0[k]) >= 1: 21 | ctg, src, len_ = seq_info[k] 22 | print(src, ctg, len_, len(aln_range0[k]), sep="\t", file = f) 23 | 24 | 25 | rgn_lengths = [] 26 | with open(os.path.join(out_dir, f"{gene_name}_hit_range.txt"), "w") as f: 27 | print("#sourc", "ctg", "len", "t_rgn_start", "t_rgn_end", "t_rgn_len", sep="\t", file = f) 28 | 29 | for k in list(aln_range0.keys()): 30 | b, e = aln_range0[k][0][0:2] 31 | if e-b < len(gene_seq) * 0.25: 32 | continue 33 | ctg, src, len_ = seq_info[k] 34 | print(src, ctg, len_, b, e, e-b, sep="\t", file = f ) 35 | rgn_lengths.append(e-b) 36 | 37 | with open(os.path.join(out_dir, f"{gene_name}_ht_copy_count.txt"), "w") as f: 38 | n_copy = {} 39 | for k in list(aln_range0.keys()): 40 | b, e = aln_range0[k][0][0:2] 41 | if e-b < len(gene_seq) * 0.25: 42 | continue 43 | n_copy[k] = len(aln_range0[k]) 44 | copy_count = Counter(n_copy.values()) 45 | for nc, nh in copy_count.items(): 46 | print("{}\tnumber_of_copy: {}\tnumber_of_haplotype_contig: {}".format(gene_name, nc, nh), file = f) 47 | 48 | seq_list = [] 49 | i = 0 50 | for k in list(aln_range0.keys()): 51 | ctg_name, source, _ = seq_info[k] 52 
| seq_id = k 53 | rgns = aln_range0[k].copy() 54 | rgns = pgrlite.merge_regions(rgns, tol=int(len(gene_seq)*0.25)) 55 | 56 | for rgn in rgns: 57 | b, e, length, orientation, aln = rgn 58 | if length < len(gene_seq)*0.25: 59 | continue 60 | seq = pg_db.get_sub_seq(source, ctg_name, b, e) 61 | if orientation == 1: 62 | seq = pgrlite.rc_byte_seq(seq) 63 | seq_list.append((i, "{}_{}_{}_{}".format(ctg_name, b, e, orientation), seq)) 64 | i += 1 65 | 66 | with open( os.path.join(out_dir, f"{gene_name}.fa"), "w") as f: 67 | for sid, name, seq in seq_list: 68 | print(">{} {}".format(name, sid), file = f) 69 | print(pgrlite.u8_to_string(seq), file = f) 70 | 71 | new_sdb = pgrlite.SeqIndexDB() 72 | new_sdb.load_from_seq_list(seq_list, w=48, k=48, r=1, min_span=24) 73 | new_sdb.generate_smp_gfa(0, os.path.join(out_dir, f"{gene_name}_48_48_1_24.gfa")) 74 | new_sdb.write_midx_to_text_file(os.path.join(out_dir, f"{gene_name}_48_48_1_24.midx")) 75 | 76 | links = new_sdb.get_smp_adj_list(0) 77 | link_count = Counter([(_[1],_[2]) for _ in links]) 78 | G = nx.DiGraph() 79 | for sid, v, w in links: 80 | if sid == 0: 81 | continue 82 | 83 | penwidth = link_count[(v,w)] * 0.01 84 | weight = penwidth 85 | G.add_edge(tuple(v[:2]), tuple(w[:2]), weight= weight, penwidth=penwidth) 86 | 87 | nx_pydot.write_dot(G, os.path.join(out_dir, "{}_48_48_1_24.dot".format(gene_name))) 88 | nx.write_gexf(G, os.path.join(out_dir, "{}_48_48_1_24.gexf".format(gene_name))) 89 | 90 | 91 | new_sdb.load_from_seq_list(seq_list, w=48, k=48, r=8, min_span=24) 92 | new_sdb.generate_smp_gfa(0, os.path.join(out_dir, f"{gene_name}_48_48_8_24.gfa")) 93 | new_sdb.write_midx_to_text_file(os.path.join(out_dir, f"{gene_name}_48_48_8_24.midx")) 94 | 95 | links = new_sdb.get_smp_adj_list(0) 96 | link_count = Counter([(_[1],_[2]) for _ in links]) 97 | G = nx.DiGraph() 98 | for sid, v, w in links: 99 | if sid == 0: 100 | continue 101 | 102 | penwidth = link_count[(v,w)] * 0.01 103 | weight = penwidth 104 | G.add_edge(tuple(v[:2]), tuple(w[:2]), weight= weight, penwidth=penwidth) 105 | 106 | nx_pydot.write_dot(G, os.path.join(out_dir, "{}_48_48_8_24.dot".format(gene_name))) 107 | nx.write_gexf(G, os.path.join(out_dir, "{}_48_48_8_24.gexf".format(gene_name))) 108 | 109 | new_sdb.generate_smp_gfa(0, os.path.join(out_dir, f"{gene_name}_48_48_8_24.gfa")) 110 | new_sdb.write_midx_to_text_file(os.path.join(out_dir, f"{gene_name}_48_48_8_24.midx")) 111 | 112 | 113 | if __name__ == "__main__": 114 | ref_db =pgrlite.AGCFile("/data/HPRC-y1-rebuild-04252022/hg19.agc") 115 | pb_db = pgrlite.SeqIndexDB() 116 | pb_db.load_from_agc_index("/data/HPRC-y1-rebuild-04252022") 117 | CMRG_coordinates = {} 118 | padding = 20000 119 | with open("/data/HG002_GRCh37_CMRG_coordinates.bed") as f: 120 | for r in f: 121 | r = r.strip().split("\t") 122 | CMRG_coordinates[r[3]]=("chr{}".format(r[0]), int(r[1])-padding, int(r[2])+padding) 123 | 124 | CMRG_hg19_seq = {} 125 | for g, c in CMRG_coordinates.items(): 126 | seq = ref_db.get_sub_seq('hg19.fasta', c[0], c[1], c[2]) 127 | CMRG_hg19_seq[g] = (c, seq) 128 | 129 | for g_name in CMRG_hg19_seq: 130 | print("analyzing {}".format(g_name)) 131 | generate_gfa(CMRG_hg19_seq, g_name, pb_db, "/scratch/GFA_files") 132 | -------------------------------------------------------------------------------- /pgr-tk/examples/get_variants.py: -------------------------------------------------------------------------------- 1 | import pgrtk 2 | import os, sys 3 | 4 | 5 | def filter_aln(aln_segs): 6 | """ 7 | ensure both target / query are strictly 
increasing 8 | """ 9 | 10 | last_ts = aln_segs[0][1][0] 11 | last_te = aln_segs[0][1][1] 12 | 13 | last_qs = aln_segs[0][0][0] 14 | last_qe = aln_segs[0][0][1] 15 | 16 | 17 | rtn = [ ((last_ts, last_te), (last_qs, last_qe)) ] 18 | 19 | for seg in aln_segs: 20 | 21 | if seg[1][1] < seg[1][0]: continue 22 | if seg[0][-1] != seg[1][-1]: continue 23 | 24 | if seg[1][0] >= last_te: 25 | 26 | last_ts = last_te 27 | last_te = seg[1][1] 28 | 29 | last_qs = last_qe 30 | last_qe = seg[0][1] 31 | 32 | if last_ts == last_te: 33 | continue 34 | 35 | 36 | rtn.append( ((last_ts, last_te), (last_qs, last_qe)) ) 37 | return rtn 38 | 39 | 40 | 41 | def filter_aln_rev(aln_segs): 42 | """ 43 | ensure both target / query are strictly increasing 44 | """ 45 | aln_segs = aln_segs.copy() 46 | aln_segs.reverse() 47 | last_ts = aln_segs[0][1][0] 48 | last_te = aln_segs[0][1][1] 49 | 50 | last_qs = aln_segs[0][0][0] 51 | last_qe = aln_segs[0][0][1] 52 | 53 | 54 | rtn = [ ((last_ts, last_te), (last_qs, last_qe)) ] 55 | 56 | for seg in aln_segs: 57 | 58 | if seg[1][1] < seg[1][0]: continue 59 | if seg[0][-1] == seg[1][-1]: continue 60 | 61 | if seg[1][0] >= last_te: 62 | 63 | last_ts = last_te 64 | last_te = seg[1][1] 65 | 66 | last_qe = last_qs 67 | last_qs = seg[0][0] 68 | 69 | 70 | if last_ts == last_te: 71 | continue 72 | 73 | 74 | rtn.append( ((last_ts, last_te), (last_qs, last_qe)) ) 75 | return rtn 76 | 77 | def seq_align_to_sdb(seq_db, seq1): 78 | 79 | query_res = pgrtk.query_sdb(seq_db, seq1, 80 | merge_range_tol=0, 81 | gap_penalty_factor=0.001, 82 | max_query_count=1, 83 | max_target_count=1) 84 | 85 | _, kmer_size, _, _, _ = seq_db.get_shmmr_spec() 86 | rtn = [] 87 | 88 | for sid, alns in query_res.items(): 89 | # print("#sid, hits:", sid, len(alns)) 90 | 91 | ref_seq = seq_db.get_seq_by_id(sid) 92 | 93 | for aln in alns: 94 | ts, te, tl, orientation = aln[:-1] 95 | # print(ts, te, tl, orientation) 96 | aln = aln[-1] 97 | if orientation == 0 : 98 | filter_alignments = filter_aln(aln) 99 | else: 100 | filter_alignments = filter_aln_rev(aln) 101 | # print("# anchors: ", len(aln), len(filter_aln(aln)), len(filter_aln_rev(aln))) 102 | 103 | for seg in filter_alignments: 104 | 105 | last_ts, last_te = seg[0][:2] 106 | last_qs, last_qe = seg[1][:2] 107 | 108 | last_ts -= kmer_size 109 | #last_te -= kmer_size 110 | 111 | s0str = pgrtk.u8_to_string(ref_seq[last_ts: last_te]) 112 | if orientation == 0: 113 | last_qs -= kmer_size 114 | s1str = pgrtk.u8_to_string(seq1[last_qs:last_qe]) 115 | else: 116 | last_qs -= kmer_size 117 | s1str = pgrtk.rc(pgrtk.u8_to_string(seq1[last_qs:last_qe])) 118 | 119 | if s0str[:16] != s1str[:16] or s0str[-16:] != s1str[-16:]: 120 | print("XXXX1 {} :\n{}\n{}\n".format(orientation, s0str[:56], s1str[:56])) 121 | print("XXXX2 {} :\n{}\n{}\n".format(orientation, s0str[-56:], s1str[-56:])) 122 | diff = None 123 | elif min(len(s0str),len(s1str)) == 0 or abs(len(s0str)-len(s1str)) > 256: 124 | diff = None 125 | else: 126 | diff = pgrtk.get_variant_segments(s0str, s1str, max_wf_length=min(64, len(s0str), len(s1str)), max_diff_percent=1) 127 | 128 | if diff is not None: 129 | if len(diff[0]) > 0: 130 | for d in diff[0]: 131 | rtn.append( ((sid, last_ts, last_te), (last_qs, last_qe), 132 | (d[0] + last_ts, d[1] + last_qs, d[2], d[3], d[4]), orientation) ) 133 | else: 134 | rtn.append( ((sid, last_ts, last_te), (last_qs, last_qe), 'ALL', orientation ) ) 135 | elif diff is None: 136 | rtn.append( ((sid, last_ts, last_te), (last_qs, last_qe), 'NULL', orientation ) ) 137 | return rtn 138 | 139 | 
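# A note on the records appended to `rtn` by seq_align_to_sdb() above: each one is
#   ((target_sid, target_bgn, target_end), (query_bgn, query_end), payload, orientation)
# where payload is a variant tuple from pgrtk.get_variant_segments() shifted to
# absolute target/query coordinates, the string 'ALL' when the whole segment matches
# without variants, or 'NULL' when the segment could not be compared (picked up
# below in main() as a structural-variant candidate).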
def main(sdb_prefix, query_seq_fasta_path, out_prefix = "out"): 140 | target_sdb = pgrtk.SeqIndexDB() 141 | target_sdb.load_from_frg_index(sdb_prefix) 142 | query_sdb = pgrtk.SeqIndexDB() 143 | query_sdb.load_from_fastx(query_seq_fasta_path) 144 | 145 | target_sinfo = target_sdb.seq_info.copy() 146 | sinfo = query_sdb.seq_info.copy() 147 | variant_file = open(out_prefix+".variants", "w") 148 | sv_candidate_file = open(out_prefix+".sv_candidate", "w") 149 | all_match_file = open(out_prefix+".all_match", "w") 150 | for sid in sinfo: 151 | ctg, src, length = sinfo[sid] 152 | query_seq = query_sdb.get_seq_by_id(sid) 153 | variants = seq_align_to_sdb(target_sdb, query_seq) 154 | for variant in variants: 155 | t_sid, ts, te = variant[0] 156 | qs, qe = variant[1] 157 | t_ctg, _, _ = target_sinfo[t_sid] 158 | rec = variant[2] 159 | if rec in ['ALL', 'NULL']: 160 | print(t_ctg, ts, te, ctg, qs, qe, variant[2], variant[3], sep="\t", file=all_match_file) 161 | else: 162 | print(t_ctg, ts, te, ctg, qs, qe, rec[0], variant[3], sep="\t", file=all_match_file) 163 | print(t_ctg, rec[0], rec[2], rec[3], rec[4], ctg, sep="\t", file=variant_file) 164 | if rec == "NULL": 165 | print(t_ctg, variant[0][1], variant[0][2], ctg, variant[1][0], variant[1][1], sep="\t", file=sv_candidate_file) 166 | variant_file.close() 167 | sv_candidate_file.close() 168 | 169 | 170 | 171 | 172 | 173 | if __name__ == "__main__": 174 | 175 | sdb_prefix = sys.argv[1] 176 | query_seq_fasta_path = sys.argv[2] 177 | prefix = sys.argv[3] 178 | main(sdb_prefix, query_seq_fasta_path, prefix) 179 | -------------------------------------------------------------------------------- /pgr-web/frontend/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pgr-web" 3 | version = "0.6.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | dioxus = { version = "0.4.0", features = [] } 10 | reqwest = { version = "0.11", features = ["json"] } 11 | ws_stream_wasm = "0.7.4" 12 | serde = { version = "1.0.80", features = ["derive"] } 13 | serde_derive = "^1.0.59" 14 | serde_json = "1.0.59" 15 | rustc-hash = "1.1.0" 16 | dioxus-web = "0.4.0" 17 | wasm-logger = "0.2.0" 18 | log = "0.4.17" 19 | serde_qs = "0.12.0" 20 | serde_with = "3.0.0" 21 | url = "2.4.0" 22 | getrandom = { version = "0.2", features = ["js"] } 23 | pharos = "0.5" 24 | wasm-bindgen = "0.2.86" 25 | sledgehammer_bindgen = "0.2.2" 26 | futures-util = "0.3.28" 27 | futures-lite = "1.13.0" 28 | itertools = "0.10.5" 29 | 30 | [dependencies.web-sys] 31 | version = "0.3.59" 32 | features = ["console", 33 | "Document", 34 | "DomTokenList", 35 | "Element", 36 | "HtmlSelectElement", 37 | "HtmlOptionsCollection"] 38 | -------------------------------------------------------------------------------- /pgr-web/frontend/Trunk.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | target = "index.html" 3 | dist = "../dist" 4 | 5 | [watch] 6 | watch = ["index.html", "src/main.rs"] 7 | 8 | [serve] 9 | # The address to serve on. 10 | address = "127.0.0.1" 11 | # The port to serve on. 12 | port = 8080 13 | # Open a browser tab once the initial build is complete. 14 | open = false 15 | # Disable auto-reload of the web app. 
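# Set to true to turn off the automatic page reload Trunk triggers on rebuilds; false keeps live reload on.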
16 | no_autoreload = false 17 | -------------------------------------------------------------------------------- /pgr-web/frontend/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | PGR-TK WS 5 | 6 | 7 | 8 | 9 |
10 | 11 | 12 | -------------------------------------------------------------------------------- /pgr-web/frontend/src/data/ROIs.json: -------------------------------------------------------------------------------- 1 | { 2 | "AMY1A": { 3 | "source": "hg19_tagged.fa", 4 | "ctg": "chr1_hg19", 5 | "bgn": 104198140, 6 | "end": 104207173, 7 | "padding": 150000, 8 | "merge_range_tol": 120000, 9 | "w": 48, 10 | "k": 56, 11 | "r": 1, 12 | "min_span": 12, 13 | "sketch": false, 14 | "min_cov": 2, 15 | "min_branch_size": 8, 16 | "bundle_length_cutoff": 500, 17 | "bundle_merge_distance": 10000 18 | }, 19 | "TBC1D3": { 20 | "source": "hg38_tagged.fa", 21 | "ctg": "chr17_hg38", 22 | "bgn": 37885486, 23 | "end": 38325932, 24 | "padding": 100000, 25 | "merge_range_tol": 120000, 26 | "w": 48, 27 | "k": 56, 28 | "r": 1, 29 | "min_span": 12, 30 | "sketch": false, 31 | "min_cov": 2, 32 | "min_branch_size": 8, 33 | "bundle_length_cutoff": 500, 34 | "bundle_merge_distance": 10000 35 | }, 36 | "LPA": { 37 | "source": "hg38_tagged.fa", 38 | "ctg": "chr6_hg38", 39 | "bgn": 160531482, 40 | "end": 160664275, 41 | "padding": 40000, 42 | "merge_range_tol": 100000, 43 | "w": 64, 44 | "k": 56, 45 | "r": 1, 46 | "min_span": 12, 47 | "sketch": false, 48 | "min_cov": 2, 49 | "min_branch_size": 8, 50 | "bundle_length_cutoff": 500, 51 | "bundle_merge_distance": 10000 52 | }, 53 | "HLA Class II": { 54 | "source": "hg38_tagged.fa", 55 | "ctg": "chr6_hg38", 56 | "bgn": 32163513, 57 | "end": 32992088, 58 | "padding": 25000, 59 | "merge_range_tol": 2000000, 60 | "w": 128, 61 | "k": 56, 62 | "r": 12, 63 | "min_span": 64, 64 | "sketch": false, 65 | "min_cov": 2, 66 | "min_branch_size": 8, 67 | "bundle_length_cutoff": 500, 68 | "bundle_merge_distance": 10000 69 | }, 70 | "ChrY_Repeats": { 71 | "source": "hg38_tagged.fa", 72 | "ctg": "chrY_hg38", 73 | "bgn": 23129355, 74 | "end": 24907040, 75 | "padding": 1500000, 76 | "merge_range_tol": 2000000, 77 | "w": 128, 78 | "k": 56, 79 | "r": 12, 80 | "min_span": 64, 81 | "sketch": false, 82 | "min_cov": 2, 83 | "min_branch_size": 8, 84 | "bundle_length_cutoff": 500, 85 | "bundle_merge_distance": 10000 86 | }, 87 | "FLG": { 88 | "source": "hg38_tagged.fa", 89 | "ctg": "chr1_hg38", 90 | "bgn": 152301265, 91 | "end": 152328339, 92 | "padding": 5000, 93 | "merge_range_tol": 100, 94 | "w": 48, 95 | "k": 56, 96 | "r": 1, 97 | "min_span": 8, 98 | "sketch": false, 99 | "min_cov": 2, 100 | "min_branch_size": 8, 101 | "bundle_length_cutoff": 500, 102 | "bundle_merge_distance": 10000 103 | }, 104 | "KIR": { 105 | "source": "hg38_tagged.fa", 106 | "ctg": "chr19_hg38", 107 | "bgn": 54687267, 108 | "end": 54907736, 109 | "padding": 5000, 110 | "merge_range_tol": 5000, 111 | "w": 48, 112 | "k": 56, 113 | "r": 5, 114 | "min_span": 16, 115 | "sketch": false, 116 | "min_cov": 2, 117 | "min_branch_size": 8, 118 | "bundle_length_cutoff": 500, 119 | "bundle_merge_distance": 10000 120 | } 121 | } -------------------------------------------------------------------------------- /pgr-web/pgr-server/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pgr-server" 3 | version = "0.6.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | axum = { version="0.5.13", features = ["ws"] } 10 | tokio = { version = "1.0", features = ["full"] } 11 | pgr-db = { path = "../../pgr-db/", default-features = false} 12 | rustc-hash = "1.1.0" 13 | rayon = "1.5.2" 14 | 
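# serde / serde_json below handle the ROI records and the JSON API payloads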
serde_json = "1.0.83" 15 | serde = { version = "1.0.117", features = ["derive", "rc"] } 16 | tower-http = { version = "0.3.0", features = ["cors", "trace", "fs"] } 17 | tower = "0.4.13" 18 | tracing = "0.1" 19 | tracing-subscriber = { version = "0.3", features = ["env-filter"] } 20 | svg = "0.16" 21 | clap = { version = "4.2.7", features = ["derive"] } 22 | serde_qs = "0.12.0" 23 | serde_with = "3.0.0" 24 | 25 | [features] 26 | default = ["with_agc"] 27 | with_agc = ["pgr-db/with_agc"] 28 | -------------------------------------------------------------------------------- /pgr-web/pgr-server/src/ROIs.json: -------------------------------------------------------------------------------- 1 | { 2 | "AMY1A": { 3 | "source": "hg19_tagged.fa", 4 | "ctg": "chr1_hg19", 5 | "bgn": 104198140, 6 | "end": 104207173, 7 | "padding": 150000, 8 | "merge_range_tol": 120000, 9 | "w": 48, 10 | "k": 56, 11 | "r": 1, 12 | "min_span": 12, 13 | "sketch": false, 14 | "min_cov": 2, 15 | "min_branch_size": 8, 16 | "bundle_length_cutoff": 500, 17 | "bundle_merge_distance": 10000 18 | }, 19 | "TBC1D3": { 20 | "source": "hg38_tagged.fa", 21 | "ctg": "chr17_hg38", 22 | "bgn": 37885486, 23 | "end": 38325932, 24 | "padding": 100000, 25 | "merge_range_tol": 120000, 26 | "w": 48, 27 | "k": 56, 28 | "r": 1, 29 | "min_span": 12, 30 | "sketch": false, 31 | "min_cov": 2, 32 | "min_branch_size": 8, 33 | "bundle_length_cutoff": 500, 34 | "bundle_merge_distance": 10000 35 | }, 36 | "LPA": { 37 | "source": "hg38_tagged.fa", 38 | "ctg": "chr6_hg38", 39 | "bgn": 160531482, 40 | "end": 160664275, 41 | "padding": 40000, 42 | "merge_range_tol": 100000, 43 | "w": 64, 44 | "k": 56, 45 | "r": 1, 46 | "min_span": 12, 47 | "sketch": false, 48 | "min_cov": 2, 49 | "min_branch_size": 8, 50 | "bundle_length_cutoff": 500, 51 | "bundle_merge_distance": 10000 52 | }, 53 | "HLA Class II": { 54 | "source": "hg38_tagged.fa", 55 | "ctg": "chr6_hg38", 56 | "bgn": 32163513, 57 | "end": 32992088, 58 | "padding": 25000, 59 | "merge_range_tol": 2000000, 60 | "w": 128, 61 | "k": 56, 62 | "r": 12, 63 | "min_span": 64, 64 | "sketch": false, 65 | "min_cov": 2, 66 | "min_branch_size": 8, 67 | "bundle_length_cutoff": 500, 68 | "bundle_merge_distance": 10000 69 | }, 70 | "ChrY_Repeats": { 71 | "source": "hg38_tagged.fa", 72 | "ctg": "chrY_hg38", 73 | "bgn": 23129355, 74 | "end": 24907040, 75 | "padding": 1500000, 76 | "merge_range_tol": 2000000, 77 | "w": 128, 78 | "k": 56, 79 | "r": 12, 80 | "min_span": 64, 81 | "sketch": false, 82 | "min_cov": 2, 83 | "min_branch_size": 8, 84 | "bundle_length_cutoff": 500, 85 | "bundle_merge_distance": 10000 86 | }, 87 | "FLG": { 88 | "source": "hg38_tagged.fa", 89 | "ctg": "chr1_hg38", 90 | "bgn": 152301265, 91 | "end": 152328339, 92 | "padding": 5000, 93 | "merge_range_tol": 100, 94 | "w": 48, 95 | "k": 56, 96 | "r": 1, 97 | "min_span": 8, 98 | "sketch": false, 99 | "min_cov": 2, 100 | "min_branch_size": 8, 101 | "bundle_length_cutoff": 500, 102 | "bundle_merge_distance": 10000 103 | }, 104 | "KIR": { 105 | "source": "hg38_tagged.fa", 106 | "ctg": "chr19_hg38", 107 | "bgn": 54687267, 108 | "end": 54907736, 109 | "padding": 5000, 110 | "merge_range_tol": 5000, 111 | "w": 48, 112 | "k": 56, 113 | "r": 5, 114 | "min_span": 16, 115 | "sketch": false, 116 | "min_cov": 2, 117 | "min_branch_size": 8, 118 | "bundle_length_cutoff": 500, 119 | "bundle_merge_distance": 10000 120 | } 121 | } -------------------------------------------------------------------------------- /pgr-web/pgr-server/src/main.rs: 
-------------------------------------------------------------------------------- 1 | pub mod bundle_processing; 2 | 3 | use axum::{ 4 | body::{boxed, Body}, 5 | extract::ws::{WebSocket, WebSocketUpgrade}, 6 | extract::Query, 7 | http::{Response, StatusCode}, 8 | response, 9 | response::Html, 10 | routing::{get, post}, 11 | Json, Router, 12 | }; 13 | use bundle_processing::*; 14 | use clap::{self, Parser}; 15 | use pgr_db::ext::*; 16 | use rustc_hash::FxHashMap; 17 | use std::net::SocketAddr; 18 | use std::{ 19 | net::{IpAddr, Ipv6Addr}, 20 | path::PathBuf, 21 | str::FromStr, 22 | sync::Arc, 23 | }; 24 | use tokio::fs; 25 | use tower::{ServiceBuilder, ServiceExt}; 26 | use tower_http::cors::Any; 27 | use tower_http::cors::CorsLayer; 28 | use tower_http::services::ServeDir; 29 | use tower_http::trace::TraceLayer; 30 | use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; 31 | 32 | #[derive(Parser, Debug)] 33 | #[clap(name = "pgr-server", about = "Experimental Server")] 34 | struct Opt { 35 | /// set the listen addr 36 | #[clap(short = 'a', long = "addr", default_value = "::1")] 37 | addr: String, 38 | 39 | /// set the listen port 40 | #[clap(short = 'p', long = "port", default_value = "5000")] 41 | port: u16, 42 | 43 | /// set the directory where static files are to be found 44 | #[clap(long = "static-dir", default_value = "./dist")] 45 | static_dir: String, 46 | 47 | /// set data_path_prefix 48 | #[clap( 49 | short = 'd', 50 | long = "data-path-prefix", 51 | default_value = "./pgr-tk-HGRP-y1-evaluation-set-v0" 52 | )] 53 | data_path_prefix: String, 54 | 55 | /// load the index from a frag-file (.frg) database instead of an AGC index 56 | #[clap(short = 'f', long = "frg-file")] 57 | frg_file: bool, 58 | } 59 | 60 | #[tokio::main] 61 | async fn main() { 62 | let opt = Opt::parse(); 63 | 64 | tracing_subscriber::registry() 65 | .with(tracing_subscriber::EnvFilter::new( 66 | std::env::var("RUST_LOG") 67 | .unwrap_or_else(|_| "example_tracing_aka_logging=debug,tower_http=debug".into()), 68 | )) 69 | .with(tracing_subscriber::fmt::layer()) 70 | .init(); 71 | 72 | let mut seq_db = SeqIndexDB::new(); 73 | 74 | if opt.frg_file { 75 | let _ = seq_db.load_from_frg_index(opt.data_path_prefix); 76 | } else { 77 | #[cfg(feature = "with_agc")] 78 | let _ = seq_db.load_from_agc_index(opt.data_path_prefix); 79 | 80 | #[cfg(not(feature = "with_agc"))] 81 | panic!("This command is compiled with only frg file support, please specify `--frg-file`"); 82 | } 83 | 84 | let seq_db = Arc::new(seq_db); 85 | // build our application with a route 86 | let app = Router::new() 87 | .route( 88 | "/api/get_number_of_ctgs", 89 | get({ 90 | let seq_db = seq_db.clone(); 91 | move || get_number_of_ctgs(seq_db) 92 | }), 93 | ) 94 | .route( 95 | "/api/post_query_for_json_data", 96 | post({ 97 | let seq_db = seq_db.clone(); 98 | move |params| post_query_for_json_data(params, seq_db) 99 | }), 100 | ) 101 | .route( 102 | "/api/get_html_by_query", 103 | get({ 104 | let seq_db = seq_db.clone(); 105 | move |params| get_html_by_query(params, seq_db) 106 | }), 107 | ) 108 | .route("/ws", get(ws_handler)) 109 | .layer( 110 | CorsLayer::new() 111 | .allow_origin(Any) 112 | //.allow_origin("http://127.0.0.1:8080".parse::().unwrap()) 113 | .allow_methods(Any) 114 | .allow_headers(Any), 115 | ) 116 | .layer(ServiceBuilder::new().layer(TraceLayer::new_for_http())) 117 | .fallback(get(|req| async move { 118 | match ServeDir::new(&opt.static_dir).oneshot(req).await { 119 | Ok(res) => { 120 | let status = res.status(); 121 | match status { 122 | StatusCode::NOT_FOUND => { 123 |
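// asset not found under static-dir: fall back to index.html so client-side routing can handle the path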
let index_path = PathBuf::from(&opt.static_dir).join("index.html"); 124 | let index_content = match fs::read_to_string(index_path).await { 125 | Err(_) => { 126 | return Response::builder() 127 | .status(StatusCode::NOT_FOUND) 128 | .body(boxed(Body::from("index file not found"))) 129 | .unwrap() 130 | } 131 | Ok(index_content) => index_content, 132 | }; 133 | 134 | Response::builder() 135 | .status(StatusCode::OK) 136 | .body(boxed(Body::from(index_content))) 137 | .unwrap() 138 | } 139 | _ => res.map(boxed), 140 | } 141 | } 142 | Err(_err) => Response::builder() 143 | .status(StatusCode::INTERNAL_SERVER_ERROR) 144 | .body(boxed(Body::from(format!("internal errors")))) 145 | .expect("error response"), 146 | } 147 | })); 148 | 149 | // run it 150 | let addr = SocketAddr::from(( 151 | IpAddr::from_str(opt.addr.as_str()).unwrap_or(IpAddr::V6(Ipv6Addr::LOCALHOST)), 152 | opt.port, 153 | )); 154 | println!("listening on {}", addr); 155 | axum::Server::bind(&addr) 156 | .serve(app.into_make_service()) 157 | .await 158 | .unwrap(); 159 | } 160 | 161 | /* 162 | async fn handler(seq_db: Arc) -> impl IntoResponse { 163 | let n_ctg = 0; 164 | let mut headers = HeaderMap::new(); 165 | headers.insert(header::CONTENT_TYPE, "text/plain".parse().unwrap()); 166 | headers.insert(header::URI, "http://127.0.0.1:3000".parse().unwrap()); 167 | let rtn = format!("Hello, World! {}", n_ctg); 168 | (headers, rtn) 169 | } 170 | */ 171 | 172 | async fn get_number_of_ctgs(seq_db: Arc) -> Json { 173 | let n_ctg = seq_db.seq_index.as_ref().unwrap().len(); 174 | Json(n_ctg) 175 | } 176 | 177 | async fn post_query_for_json_data( 178 | Json(seq_query_spec): Json>, 179 | seq_db: Arc, 180 | ) -> Json> { 181 | if seq_query_spec.is_none() { 182 | return Json(None); 183 | }; 184 | 185 | let seq_query_spec = seq_query_spec.unwrap(); 186 | println!("{:?}", seq_query_spec); 187 | Json(get_target_and_principal_bundle_decomposition( 188 | &seq_query_spec, 189 | seq_db, 190 | )) 191 | } 192 | 193 | async fn get_html_by_query( 194 | Query(seq_query_spec): Query, 195 | seq_db: Arc, 196 | ) -> Html { 197 | //if seq_query_spec.is_none() { 198 | // return Html("No Query Yet".into()); 199 | //}; 200 | 201 | //let seq_query_spec = seq_query_spec.unwrap(); 202 | println!("{:?}", seq_query_spec); 203 | 204 | let data = get_target_and_principal_bundle_decomposition(&seq_query_spec, seq_db); 205 | let output = pb_data_to_html_string(&data.unwrap()); 206 | 207 | Html(output) 208 | } 209 | 210 | async fn ws_handler(ws: WebSocketUpgrade) -> response::Response { 211 | ws.on_upgrade(ws_handle_socket) 212 | } 213 | 214 | use std::cell::OnceCell; 215 | // Todo... 
read ROIs.json into a String at runtime rather than embedding it with a static 216 | static ROI_JSON: &str = include_str!("ROIs.json"); 217 | 218 | async fn ws_handle_socket(mut socket: WebSocket) { 219 | let ROI: OnceCell> = OnceCell::new(); 220 | let _ = ROI.set(serde_json::from_str(ROI_JSON).unwrap()); 221 | 222 | while let Some(msg) = socket.recv().await { 223 | let msg = if let Ok(msg) = msg { 224 | println!("WS msg: {:?}", msg); 225 | if let axum::extract::ws::Message::Text(msg) = msg { 226 | if !msg.is_empty() { 227 | let roi = ROI.get().unwrap(); 228 | let keys = roi.keys(); 229 | let mut keys = keys.filter(|&s| (*s).starts_with(&msg)).collect::>(); 230 | keys.sort(); 231 | let json = serde_json::to_string( 232 | &keys 233 | .iter() 234 | .map(|&k| ((*k).clone(), roi.get(k).unwrap().clone())) 235 | .collect::>(), 236 | ) 237 | .unwrap(); 238 | axum::extract::ws::Message::Text(json) 239 | } else { 240 | axum::extract::ws::Message::Text("{}".to_string()) 241 | } 242 | } else { 243 | axum::extract::ws::Message::Text("{}".to_string()) 244 | } 245 | } else { 246 | // client disconnected 247 | return; 248 | }; 249 | 250 | if socket.send(msg).await.is_err() { 251 | // client disconnected 252 | return; 253 | } 254 | } 255 | } 256 | -------------------------------------------------------------------------------- /pgr-web/prod.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | 5 | pushd frontend 6 | trunk build --release 7 | popd 8 | 9 | cargo run --bin pgr-server --release -- --addr 0.0.0.0 --port 3000 --static-dir ./dist --data-path-prefix /wd/pgr-tk-demo-data/data/pgr-tk-HGRP-y1-evaluation-set-v0 10 | -------------------------------------------------------------------------------- /pgr-web/prod_no_agc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | 5 | pushd frontend 6 | trunk build --release 7 | popd 8 | 9 | cargo run --bin pgr-server --no-default-features --release -- --addr 0.0.0.0 --port 3000 --static-dir ./dist --frg-file --data-path-prefix $HOME/Sandbox/pgr-tk-data/HGRP-y1-evaluation-set_fragdb 10 | -------------------------------------------------------------------------------- /pgr-web/scripts/ROIs_examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "AMY1A": { 3 | "source": "hg19_tagged.fa", 4 | "ctg": "chr1_hg19", 5 | "bgn": 104198140, 6 | "end": 104207173, 7 | "padding": 150000, 8 | "merge_range_tol": 120000, 9 | "w": 48, 10 | "k": 56, 11 | "r": 1, 12 | "min_span": 12, 13 | "sketch": false, 14 | "min_cov": 2, 15 | "min_branch_size": 8, 16 | "bundle_length_cutoff": 500, 17 | "bundle_merge_distance": 10000 18 | }, 19 | "TBC1D3": { 20 | "source": "hg38_tagged.fa", 21 | "ctg": "chr17_hg38", 22 | "bgn": 37885486, 23 | "end": 38325932, 24 | "padding": 100000, 25 | "merge_range_tol": 120000, 26 | "w": 48, 27 | "k": 56, 28 | "r": 1, 29 | "min_span": 12, 30 | "sketch": false, 31 | "min_cov": 2, 32 | "min_branch_size": 8, 33 | "bundle_length_cutoff": 500, 34 | "bundle_merge_distance": 10000 35 | }, 36 | "LPA": { 37 | "source": "hg38_tagged.fa", 38 | "ctg": "chr6_hg38", 39 | "bgn": 160531482, 40 | "end": 160664275, 41 | "padding": 40000, 42 | "merge_range_tol": 100000, 43 | "w": 64, 44 | "k": 56, 45 | "r": 1, 46 | "min_span": 12, 47 | "sketch": false, 48 | "min_cov": 2, 49 | "min_branch_size": 8, 50 | "bundle_length_cutoff": 500, 51 | "bundle_merge_distance":
10000 52 | }, 53 | "HLA Class II": { 54 | "source": "hg38_tagged.fa", 55 | "ctg": "chr6_hg38", 56 | "bgn": 32163513, 57 | "end": 32992088, 58 | "padding": 25000, 59 | "merge_range_tol": 2000000, 60 | "w": 128, 61 | "k": 56, 62 | "r": 12, 63 | "min_span": 64, 64 | "sketch": false, 65 | "min_cov": 2, 66 | "min_branch_size": 8, 67 | "bundle_length_cutoff": 500, 68 | "bundle_merge_distance": 10000 69 | }, 70 | "ChrY_Repeats": { 71 | "source": "hg38_tagged.fa", 72 | "ctg": "chrY_hg38", 73 | "bgn": 23129355, 74 | "end": 24907040, 75 | "padding": 1500000, 76 | "merge_range_tol": 2000000, 77 | "w": 128, 78 | "k": 56, 79 | "r": 12, 80 | "min_span": 64, 81 | "sketch": false, 82 | "min_cov": 2, 83 | "min_branch_size": 8, 84 | "bundle_length_cutoff": 500, 85 | "bundle_merge_distance": 10000 86 | }, 87 | "FLG": { 88 | "source": "hg38_tagged.fa", 89 | "ctg": "chr1_hg38", 90 | "bgn": 152301265, 91 | "end": 152328339, 92 | "padding": 5000, 93 | "merge_range_tol": 100, 94 | "w": 48, 95 | "k": 56, 96 | "r": 1, 97 | "min_span": 8, 98 | "sketch": false, 99 | "min_cov": 2, 100 | "min_branch_size": 8, 101 | "bundle_length_cutoff": 500, 102 | "bundle_merge_distance": 10000 103 | }, 104 | "KIR": { 105 | "source": "hg38_tagged.fa", 106 | "ctg": "chr19_hg38", 107 | "bgn": 54687267, 108 | "end": 54907736, 109 | "padding": 5000, 110 | "merge_range_tol": 5000, 111 | "w": 48, 112 | "k": 56, 113 | "r": 5, 114 | "min_span": 16, 115 | "sketch": false, 116 | "min_cov": 2, 117 | "min_branch_size": 8, 118 | "bundle_length_cutoff": 500, 119 | "bundle_merge_distance": 10000 120 | } 121 | } -------------------------------------------------------------------------------- /pgr-web/scripts/generare_ROIs.sh: -------------------------------------------------------------------------------- 1 | python3 get_ROIs.py > ../pgr-server/src/ROIs.json 2 | -------------------------------------------------------------------------------- /pgr-web/scripts/get_ROIs.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | 4 | 5 | #gene2query = {} 6 | gene2query = json.loads(open("ROIs_examples.json").read()) 7 | ## we need the gene coordinate file from https://s3.amazonaws.com/igv.org.genomes/hg38/ncbiRefSeq.sorted.txt.gz 8 | with gzip.open("ncbiRefSeq.sorted.txt.gz") as f: 9 | for row in f: 10 | row = row.decode("utf-8") 11 | row = row.strip().split("\t") 12 | g = row[12] 13 | ch = row[2] 14 | if len(ch.split("_")) > 1: 15 | continue 16 | strand = row[3] 17 | bgn = int(row[4]) 18 | end = int(row[5]) 19 | if g not in gene2query: 20 | gene2query[g] = { 21 | "source": "hg38_tagged.fa", 22 | "ctg": f"{ch}_hg38", 23 | "bgn": bgn, 24 | "end": end, 25 | "padding": 10000, 26 | "merge_range_tol": 120000, 27 | "w": 48, 28 | "k": 56, 29 | "r": 1, 30 | "min_span": 12, 31 | "sketch": False, 32 | "min_cov": 2, 33 | "min_branch_size": 8, 34 | "bundle_length_cutoff": 500, 35 | "bundle_merge_distance": 10000 36 | } 37 | 38 | 39 | print(json.dumps(gene2query)) 40 | 41 | --------------------------------------------------------------------------------
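For reference, here is a minimal sketch of exercising the pgr-server JSON API once `prod.sh` (or `prod_no_agc.sh`) has the server listening on port 3000. The routes are the ones registered in `pgr-server/src/main.rs` above; posting an ROI record from the JSON files above as the query body assumes those fields deserialize into the server's `SequenceQuerySpec`, which is an assumption rather than something verified here:

```
import json
import requests  # any HTTP client works; requests is assumed to be installed

BASE = "http://127.0.0.1:3000"

# number of contigs in the loaded index
print(requests.get(BASE + "/api/get_number_of_ctgs").json())

# post one region-of-interest record as the query spec
roi = json.load(open("scripts/ROIs_examples.json"))["AMY1A"]
resp = requests.post(BASE + "/api/post_query_for_json_data", json=roi)
print(resp.status_code)
```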