├── .gitmodules ├── LICENSE ├── README.md ├── SECURITY.md ├── _config.yml ├── applications ├── AutoDock-Vina │ ├── Dockerfile │ ├── README.md │ └── data_download_script.sh ├── Autodock │ ├── Dockerfile │ ├── README.md │ └── data_download_script.sh ├── ProtGPT2 │ ├── Dockerfile │ ├── README.md │ ├── env.yml │ ├── model_script.sh │ └── protgpt2.py ├── ProteinMPNN │ ├── Dockerfile │ ├── ProteinMPNN.patch │ ├── README.md │ └── setup_proteinmpnn.sh ├── RFdiffusion │ ├── Dockerfile │ ├── README.md │ ├── RFdiffusion.patch │ └── setup_rfdiffusion.sh ├── boltz │ ├── Dockerfile │ ├── README.md │ └── entrypoint.sh ├── esm │ ├── Dockerfile.base │ ├── Dockerfile.esm │ ├── Dockerfile.esmfold │ ├── README.md │ ├── build_docker_images.sh │ ├── env.yml │ ├── esm_change_all.patch │ └── esm_openfold_change_py37.patch ├── esm3 │ ├── Dockerfile │ ├── README.md │ ├── env.yml │ ├── esm3_changes.patch │ └── scripts │ │ ├── ESM3_chain_of_thought.py │ │ ├── ESM3_folding_task.py │ │ ├── ESM3_function_prediction_task.py │ │ ├── ESM3_inversefold_task.py │ │ ├── ESM3_logits_embedding_task.py │ │ ├── ESM3_prompt_sequence.py │ │ └── ESMC_logits_embedding_task.py ├── gromacs │ ├── Dockerfile │ ├── README.md │ ├── entrypoint.sh │ ├── grms_input │ │ ├── mdtut_ions.mdp │ │ ├── mdtut_md.mdp │ │ ├── mdtut_minim.mdp │ │ ├── mdtut_npt.mdp │ │ ├── mdtut_nvt.mdp │ │ └── run_commands.sh │ └── run_commands.sh ├── moflow │ ├── Dockerfile │ ├── README.md │ ├── env.yml │ └── mflow_change_all.patch └── relion │ ├── Dockerfile │ ├── README.md │ ├── entrypoint.sh │ └── relion_env_patch.patch ├── benchmarking ├── AWS-Intel-blog-v2.1-2024 │ ├── README.md │ ├── long_db │ ├── proteome.py │ ├── run_pipe_bwa.sh │ ├── short_db │ └── test_pipe_bwa.py └── aws │ └── README.md ├── images ├── Open-Omics-Acceleration-Framework v2.0.JPG ├── Open-Omics-Acceleration-Framework v2.0.jpg ├── Open-Omics-Acceleration-Framework v3.0.jpg ├── Open-Omics-Acceleration-Framework-v2.0.JPG ├── Open-Omics-Acceleration-Framework-v3.0.jpg ├── alphafold2-protein-folding.jpg ├── deepvariant-fq2vcf.jpg ├── open-omics-acceleration-framework-v2.0.JPG ├── open-omics-acceleration-framework.JPG └── scrnaseq-analysis.jpg └── pipelines ├── alphafold2-based-protein-folding ├── Dockerfile_Inf ├── Dockerfile_Pre ├── README.md ├── entrypoint_inf.sh └── entrypoint_pre.sh ├── deepvariant-based-germline-variant-calling-fq2vcf ├── Dockerfile_bams2vcf ├── Dockerfile_fq2bams ├── README.md ├── bams2vcf.py ├── docs.txt ├── environment.yml ├── fq2bams.py ├── libmimalloc.so.2.0 ├── merge_vcf.sh ├── run_bams2vcf.py ├── run_fq2bams.py └── trash │ ├── config │ ├── extra_scripts │ ├── config │ ├── merge_vcf.sh │ ├── run_pipeline_ec2_part2.sh │ └── run_pipeline_part2.sh │ ├── run_pipeline.sh │ ├── run_pipeline_part1.sh │ ├── scripts │ ├── aws │ │ ├── basic_setup.sh │ │ ├── build_deepvariant_docker_image.sh │ │ ├── build_tools.sh │ │ ├── config │ │ ├── create_reference_index.sh │ │ ├── deepvariant_ec2_setup.sh │ │ ├── deepvariant_setup.sh │ │ ├── pcluster_compute_node_setup.sh │ │ ├── pcluster_example_config │ │ ├── pcluster_reference_index.sh │ │ ├── run_pipeline_ec2.sh │ │ ├── run_pipeline_ec2_part1.sh │ │ └── run_pipeline_pcluster.sh │ └── cluster │ │ ├── config │ │ ├── create_reference_index.sh │ │ ├── load_deepvariant.sh │ │ ├── run_pipeline_cluster.sh │ │ └── setup.sh │ ├── setup_env.sh │ └── test_pipeline_final.py ├── fq2sortedbam ├── Dockerfile ├── README.md ├── README.md.old ├── basic_setup_ubuntu.sh ├── config.yaml ├── doc.txt ├── environment.yml ├── fq2sortedbam.py ├── hwconfig.py ├── 
install.sh ├── print_config.sh ├── run_bwa.sh └── run_fq2sortedbam.py └── single-cell-RNA-seq-analysis ├── Dockerfile ├── Dockerfile.python ├── LICENSE ├── README.md ├── _t_sne.py ├── environment.yml └── notebooks ├── 1.3_million_single_cell_analysis.ipynb ├── fastpp.py ├── full_single_cell_analysis.py ├── sc_nbrs.py ├── sc_pp_hvg.py └── sc_pp_simple.py /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib/tal"] 2 | path = lib/tal 3 | url = https://github.com/IntelLabs/Trans-Omics-Acceleration-Library.git 4 | [submodule "applications/bwa-mem2"] 5 | path = applications/bwa-mem2 6 | url = https://github.com/bwa-mem2/bwa-mem2.git 7 | [submodule "applications/mm2-fast"] 8 | path = applications/mm2-fast 9 | url = https://github.com/bwa-mem2/mm2-fast.git 10 | [submodule "applications/alphafold"] 11 | path = applications/alphafold 12 | url = https://github.com/IntelLabs/open-omics-alphafold.git 13 | [submodule "applications/samtools"] 14 | path = applications/samtools 15 | url = https://github.com/samtools/samtools.git 16 | [submodule "applications/htslib"] 17 | path = applications/htslib 18 | url = https://github.com/samtools/htslib.git 19 | [submodule "applications/deepvariant"] 20 | path = applications/deepvariant 21 | url = https://github.com/IntelLabs/deepvariant.git 22 | [submodule "applications/bcftools"] 23 | path = applications/bcftools 24 | url = https://github.com/samtools/bcftools.git 25 | [submodule "applications/bwa-meth"] 26 | path = applications/bwa-meth 27 | url = https://github.com/brentp/bwa-meth.git 28 | [submodule "applications/STAR"] 29 | path = applications/STAR 30 | url = https://github.com/alexdobin/STAR.git 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Intel Labs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Reporting Security Issues 2 | 3 | The Bootstrap team and community take security issues in Bootstrap seriously. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions. 
4 | 5 | To report a security issue, email [security@getbootstrap.com](mailto:security@getbootstrap.com) and include the word "SECURITY" in the subject line. 6 | 7 | We'll endeavor to respond quickly, and will keep you updated throughout the process. 8 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: minima 2 | -------------------------------------------------------------------------------- /applications/AutoDock-Vina/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM condaforge/miniforge3:4.10.2-0 2 | ENV DEBIAN_FRONTEND=noninteractive 3 | RUN apt-get update && apt-get install -y --no-install-recommends \ 4 | build-essential \ 5 | libboost-all-dev \ 6 | swig \ 7 | vim \ 8 | gcc-8 \ 9 | g++-8 \ 10 | numactl \ 11 | time && \ 12 | apt-get clean && \ 13 | rm -rf /var/lib/apt/lists/* 14 | ENV CC=gcc-8 15 | ENV CXX=g++-8 16 | WORKDIR /opt 17 | RUN git clone https://github.com/ccsb-scripps/AutoDock-Vina.git 18 | WORKDIR /opt/AutoDock-Vina 19 | RUN git checkout v1.2.2 20 | WORKDIR /opt/AutoDock-Vina/build/linux/release 21 | RUN make -j$(nproc) 22 | ENV SERVICE_NAME="autodock-vina-service" 23 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 24 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 25 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /opt 26 | USER $SERVICE_NAME 27 | ENV PATH="/opt/AutoDock-Vina/build/linux/release:$PATH" 28 | WORKDIR /input 29 | HEALTHCHECK NONE 30 | CMD ["vina","--help"] 31 | 32 | -------------------------------------------------------------------------------- /applications/AutoDock-Vina/README.md: -------------------------------------------------------------------------------- 1 | ## Open-Omics-Autodock-Vina 2 | Open-Omics-Autodock-Vina is a fast, efficient molecular docking software used to predict ligand-protein binding poses and affinities. It features a refined scoring function, parallel execution on multicore CPUs and user-friendly configuration. 3 | 4 | ## Docker Setup Instructions 5 | 6 | 7 | ### 1. Build the Docker Image 8 | To build the Docker image with the tag `docker_vina`, use the following commands based on your machine's proxy requirements: 9 | * For machine without a proxy: 10 | ```bash 11 | docker build -t docker_vina . 12 | ``` 13 | * For machine with a proxy: 14 | ```bash 15 | docker build --build-arg http_proxy= --build-arg https_proxy= --build-arg no_proxy= -t docker_vina . 16 | ``` 17 | 18 | 19 | ### 2. Choose and Download Protein Complex Data 20 | Select any protein complex from the available dataset of **140** protein-ligand complexes(https://zenodo.org/records/4031961) which you can download from (https://zenodo.org/records/4031961/files/data.zip?download=1). This guide uses the **5wlo** protein as an example. 
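For orientation before the steps below: the Dockerfile above sets `WORKDIR /input`, so the receptor and ligand paths in the run command of step 3 are resolved inside the mounted input folder, and results are written to the mounted output folder. A minimal sketch of that mapping (the `5wlo` folders themselves are created in the steps that follow):

```bash
# Illustrative volume mapping used by the run command in step 3:
#   $INPUT_VINA  (./5wlo)        -> /input   (container working directory; receptor/ligand paths are relative to it)
#   $OUTPUT_VINA (./5wlo_output) -> /output  (rand-1_out.pdbqt is written here)
# Quick sanity check that the image runs:
docker run --rm docker_vina:latest vina --help
```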
21 | 22 | 1) Run the below commands to make data download script executable, download the complete dataset and extract the data for `5wlo`: 23 | 24 | ```bash 25 | chmod +x data_download_script.sh 26 | bash data_download_script.sh 5wlo 27 | ``` 28 | **Note: You can replace 5wlo with any other complex name from the complete dataset available in `data_original/data` directory.** 29 | 30 | 2) Create an output directory to store results specific to `5wlo`: 31 | ```bash 32 | mkdir -p 5wlo_output 33 | ``` 34 | 35 | 3) Set the environment variables for the `5wlo` protein as follows: 36 | ```bash 37 | export INPUT_VINA=$PWD/5wlo 38 | export OUTPUT_VINA=$PWD/5wlo_output 39 | ``` 40 | 41 | 4) Add the necessary permissions to output folder for Docker to write to it: 42 | ```bash 43 | sudo chmod -R a+w $OUTPUT_VINA 44 | ``` 45 | 46 | ### 3. Run the Docker Container 47 | Verify that the Docker image was built successfully by listing Docker images: 48 | ```bash 49 | docker images | grep docker_vina 50 | ``` 51 | If the image is listed, run AutoDock Vina with the following command: 52 | ```bash 53 | docker run -it -v $INPUT_VINA:/input -v $OUTPUT_VINA:/output docker_vina:latest vina --receptor protein.pdbqt --ligand rand-1.pdbqt --out /output/rand-1_out.pdbqt --center_x 16.459 --center_y -19.946 --center_z -5.850 --size_x 18 --size_y 18 --size_z 18 --seed 1234 --exhaustiveness 64 54 | ``` 55 | This command will process your receptor and ligand files and place the results in the specified output directory. 56 | ### 4. Expected Output 57 | After running the above command, you should find the output file (`rand-1_out.pdbqt`) in the output directory, such as `5wlo_output` for this example. 58 | 59 | --- 60 | The original README content of AutoDock-Vina follows: 61 | 62 | ## AutoDock Vina: Docking and virtual screening program 63 | 64 | **AutoDock Vina** is one of the **fastest** and **most widely used** **open-source** docking engines. It is a turnkey computational docking program that is based on a simple scoring function and rapid gradient-optimization conformational search. It was originally designed and implemented by Dr. Oleg Trott in the Molecular Graphics Lab, and it is now being maintained and develop by the Forli Lab at The Scripps Research Institute. 65 | 66 | * AutoDock4.2 and Vina scoring functions 67 | * Support of simultaneous docking of multiple ligands and batch mode for virtual screening 68 | * Support of macrocycle molecules 69 | * Hydrated docking protocol 70 | * Can write and load external AutoDock maps 71 | * Python bindings for Python 3 72 | 73 | ## Documentation 74 | 75 | The installation instructions, documentation and tutorials can be found on [readthedocs.org](https://autodock-vina.readthedocs.io/en/latest/). 76 | 77 | ## Citations 78 | * [J. Eberhardt, D. Santos-Martins, A. F. Tillack, and S. Forli. (2021). AutoDock Vina 1.2.0: New Docking Methods, Expanded Force Field, and Python Bindings. Journal of Chemical Information and Modeling.](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00203) 79 | * [O. Trott and A. J. Olson. (2010). AutoDock Vina: improving the speed and accuracy of docking with a new scoring function, efficient optimization, and multithreading. 
Journal of computational chemistry, 31(2), 455-461.](https://onlinelibrary.wiley.com/doi/10.1002/jcc.21334) 80 | -------------------------------------------------------------------------------- /applications/AutoDock-Vina/data_download_script.sh: -------------------------------------------------------------------------------- 1 | url="https://zenodo.org/records/4031961/files/data.zip?download=1" 2 | download_dir="./data_original" 3 | target_folder="$1" 4 | if [ ! -d "$download_dir/data" ]; then 5 | echo "Downloading data.zip..." 6 | mkdir -p "$download_dir" 7 | wget -O "$download_dir/data.zip" "$url" 8 | 9 | echo "Unzipping data.zip..." 10 | unzip "$download_dir/data.zip" -d "$download_dir" 11 | rm -f "$download_dir/data.zip" 12 | 13 | echo "Data downloaded and extracted to $download_dir/data" 14 | else 15 | echo "Data already exists in $download_dir/data. Skipping download and extraction." 16 | fi 17 | if [ -d "$target_folder" ]; then 18 | echo "The folder '$target_folder' already exists in the current directory. Skipping copy." 19 | else 20 | if [ -d "$download_dir/data/$target_folder" ]; then 21 | cp -r "$download_dir/data/$target_folder" ./ 22 | echo "$target_folder folder successfully copied to the current directory." 23 | else 24 | echo "$target_folder folder not found inside '$download_dir/data'." 25 | fi 26 | fi 27 | echo "'$target_folder' folder is now available in the current directory." 28 | -------------------------------------------------------------------------------- /applications/Autodock/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM condaforge/miniforge3:4.10.2-0 2 | ENV DEBIAN_FRONTEND=noninteractive 3 | RUN apt-get update && apt-get install -y --no-install-recommends \ 4 | vim \ 5 | git \ 6 | build-essential \ 7 | ocl-icd-opencl-dev \ 8 | clinfo && \ 9 | apt-get clean && \ 10 | rm -rf /var/lib/apt/lists/* 11 | RUN conda install -c conda-forge \ 12 | python=3.10 \ 13 | requests=2.28.2 \ 14 | mkl=2023.1 \ 15 | dpcpp_linux-64=2023.1 \ 16 | dpcpp-cpp-rt=2023.1 \ 17 | mkl-devel=2023.1 && \ 18 | conda clean --all -f -y 19 | ENV LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH}" 20 | WORKDIR /opt 21 | ENV SERVICE_NAME="autodock-service" 22 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 23 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME && \ 24 | mkdir -p /opt/AutoDock && \ 25 | chown -R $SERVICE_NAME:$SERVICE_NAME /opt/AutoDock 26 | USER $SERVICE_NAME 27 | WORKDIR /opt/AutoDock 28 | RUN git clone https://github.com/emascarenhas/AutoDock-GPU.git . && \ 29 | git checkout v1.4 30 | RUN make DEVICE=CPU NUMWI=64 && \ 31 | rm -rf .git build_temp 32 | ENV PATH="/opt/AutoDock/bin:${PATH}" 33 | HEALTHCHECK NONE 34 | WORKDIR /input 35 | CMD ["autodock_cpu_64wi","--help"] 36 | 37 | -------------------------------------------------------------------------------- /applications/Autodock/data_download_script.sh: -------------------------------------------------------------------------------- 1 | url="https://zenodo.org/records/4031961/files/data.zip?download=1" 2 | download_dir="./data_original" 3 | target_folder="$1" 4 | if [ ! -d "$download_dir/data" ]; then 5 | echo "Downloading data.zip..." 6 | mkdir -p "$download_dir" 7 | wget -O "$download_dir/data.zip" "$url" 8 | 9 | echo "Unzipping data.zip..." 
10 | unzip "$download_dir/data.zip" -d "$download_dir" 11 | rm -f "$download_dir/data.zip" 12 | 13 | echo "Data downloaded and extracted to $download_dir/data" 14 | else 15 | echo "Data already exists in $download_dir/data. Skipping download and extraction." 16 | fi 17 | if [ -d "$target_folder" ]; then 18 | echo "The folder '$target_folder' already exists in the current directory. Skipping copy." 19 | else 20 | if [ -d "$download_dir/data/$target_folder" ]; then 21 | cp -r "$download_dir/data/$target_folder" ./ 22 | echo "$target_folder folder successfully copied to the current directory." 23 | else 24 | echo "$target_folder folder not found inside '$download_dir/data'." 25 | fi 26 | fi 27 | echo "'$target_folder' folder is now available in the current directory." 28 | -------------------------------------------------------------------------------- /applications/ProtGPT2/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Ubuntu image as a base 2 | ARG FROM_IMAGE=ubuntu:24.04 3 | 4 | # Stage 1: Set up Conda environment 5 | ARG BASE_IMAGE=condaforge/miniforge3:24.3.0-0 6 | FROM ${BASE_IMAGE} as conda_setup 7 | 8 | ENV DEBIAN_FRONTEND=noninteractive 9 | 10 | # Stage 2: Set up the main build environment 11 | FROM ${FROM_IMAGE} as builder 12 | 13 | ENV DEBIAN_FRONTEND=noninteractive 14 | 15 | ARG http_proxy 16 | ENV http_proxy=${http_proxy} 17 | 18 | ARG https_proxy 19 | ENV https_proxy=${https_proxy} 20 | 21 | ARG no_proxy 22 | ENV no_proxy=${no_proxy} 23 | 24 | # Install necessary build tools and clean up 25 | RUN apt-get update && apt-get install -y --no-install-recommends \ 26 | ca-certificates git build-essential vim numactl autoconf automake make && \ 27 | apt-get clean && \ 28 | rm -rf /var/lib/apt/lists/* 29 | 30 | WORKDIR /app 31 | # Non-root user setup 32 | ENV SERVICE_NAME="protgpt2-service" 33 | 34 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 35 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 36 | 37 | # Copy Conda installation from the conda_setup stage 38 | COPY --from=conda_setup /opt/conda /opt/conda 39 | 40 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 41 | RUN tar -xzf Source_code_with_submodules.tar.gz 42 | 43 | ENV PATH="/opt/conda/bin:$PATH" 44 | 45 | # Copy environment.yml and create Conda environment 46 | RUN cp /app/Open-Omics-Acceleration-Framework/applications/ProtGPT2/env.yml . 47 | RUN cp /app/Open-Omics-Acceleration-Framework/applications/ProtGPT2/protgpt2.py . 48 | RUN cp /app/Open-Omics-Acceleration-Framework/applications/ProtGPT2/model_script.sh . 
49 | RUN rm -rf Open-Omics-Acceleration-Framework 50 | RUN rm -rf Source_code_with_submodules.tar.gz 51 | RUN conda env create -f env.yml 52 | 53 | RUN git clone --branch 5.3.0 https://github.com/jemalloc/jemalloc.git 54 | WORKDIR /app/jemalloc 55 | RUN bash autogen.sh --prefix=/opt/conda/envs/protgpt2/ && make install 56 | WORKDIR /app 57 | RUN rm -rf jemalloc 58 | 59 | # Set up environment activation and PATH 60 | ENV PATH="/opt/conda/envs/protgpt2/bin:$PATH" 61 | 62 | # Swith to Non-root user 63 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /app 64 | USER $SERVICE_NAME 65 | 66 | HEALTHCHECK NONE 67 | 68 | ENV LD_PRELOAD "/opt/conda/envs/protgpt2/lib/libjemalloc.so:$LD_PRELOAD"  69 | ENV MALLOC_CONF "oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" 70 | CMD python 71 | -------------------------------------------------------------------------------- /applications/ProtGPT2/env.yml: -------------------------------------------------------------------------------- 1 | name: protgpt2 2 | channels: 3 | - conda-forge 4 | - pytorch 5 | dependencies: 6 | - cpuonly 7 | - python=3.11 8 | - pip=24.0 9 | - pytorch=2.2.0 10 | - pip: 11 | - transformers==4.38.0 12 | - intel-extension-for-pytorch==2.2.0 13 | - numpy==1.26.0 14 | 15 | -------------------------------------------------------------------------------- /applications/ProtGPT2/model_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Directory where the files will be downloaded 4 | model_dir="./model_dir" 5 | 6 | # Array of URLs to download 7 | URLS=( 8 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/.gitattributes" 9 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/config.json" 10 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/merges.txt" 11 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/ppl-plddt.png" 12 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/pytorch_model.bin" 13 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/special_tokens_map.json" 14 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/tokenizer.json" 15 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/vocab.json" 16 | 17 | ) 18 | 19 | # Create the directory if it doesn't exist 20 | mkdir -p "$model_dir" 21 | 22 | # Change to the download directory 23 | cd "$model_dir" || exit 24 | 25 | # Loop through each URL and download the file if it doesn't exist 26 | for url in "${URLS[@]}"; do 27 | filename=$(basename "$url") 28 | if [ -f "$filename" ]; then 29 | echo "$filename already exists. Skipping download." 30 | else 31 | echo "Downloading $filename..." 32 | wget "$url" 33 | fi 34 | done 35 | 36 | echo "Download process completed." 
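# Illustrative next step (not executed by this script; flags follow protgpt2.py's argument parser):
# once the weights are downloaded, the generation script can be pointed at this directory, e.g.
#   python protgpt2.py --model_dir ./model_dir --max_length 100 --iterations 5 --output_file protgpt2_generated_sequences.txt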
37 | 38 | -------------------------------------------------------------------------------- /applications/ProtGPT2/protgpt2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | import os 4 | import argparse 5 | import time 6 | import torch 7 | import random 8 | import numpy as np 9 | import intel_extension_for_pytorch as ipex 10 | from transformers import pipeline 11 | 12 | def make_deterministic(seed=42): 13 | torch.manual_seed(seed) 14 | np.random.seed(seed) 15 | random.seed(seed) 16 | 17 | def main(): 18 | # Setting up argument parser 19 | parser = argparse.ArgumentParser(description='Generate protein sequences using ProtGPT2 with IPEX optimization.') 20 | parser.add_argument('--max_length', type=int, default=100, help='Maximum length of generated sequence') 21 | parser.add_argument('--do_sample', type=bool, default=True, help='Whether to sample the output or not') 22 | parser.add_argument('--top_k', type=int, default=950, help='The number of highest probability vocabulary tokens to keep for top-k-filtering') 23 | parser.add_argument('--repetition_penalty', type=float, default=1.2, help='The parameter for repetition penalty. 1.0 means no penalty') 24 | parser.add_argument('--num_return_sequences', type=int, default=1, help='The number of sequences to return') 25 | parser.add_argument('--eos_token_id', type=int, default=0, help='The id of the end of sequence token') 26 | parser.add_argument('--dtype', type=str, choices=['float32', 'bfloat16'], default='float32', help='Data type for model optimization') 27 | parser.add_argument('--iterations', type=int, default=5, help='Number of iterations to run') 28 | parser.add_argument('--model_dir', type=str, default="None", help='Directory to load the protgpt2 model') 29 | parser.add_argument('--output_file', type=str, default='protgpt2_generated_sequences.txt', help='File to save the generated sequences') 30 | args = parser.parse_args() 31 | 32 | #make_deterministic() 33 | # Setting dtype 34 | dtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16 35 | model_dir = args.model_dir 36 | if args.model_dir=="None": 37 | protgpt2 = pipeline('text-generation', model="nferruz/ProtGPT2", torch_dtype=dtype) 38 | else: 39 | protgpt2 = pipeline('text-generation', model=model_dir, torch_dtype=dtype) 40 | # Generate sequences using ProtGPT2 with IPEX optimization 41 | protgpt2.model = ipex.optimize(protgpt2.model, dtype=dtype) 42 | tic = time.time() 43 | for i in range(args.iterations): 44 | print("Iteration:", i) 45 | t0 = time.time() 46 | sequences = protgpt2( 47 | "<|endoftext|>", 48 | max_length=args.max_length, 49 | do_sample=args.do_sample, 50 | top_k=args.top_k, 51 | repetition_penalty=args.repetition_penalty, 52 | num_return_sequences=args.num_return_sequences, 53 | eos_token_id=args.eos_token_id 54 | ) 55 | t1 = time.time() 56 | print('Time taken for', i, 'iteration:', t1 - t0, 'seconds') 57 | toc = time.time() 58 | print('Time taken for', args.iterations, 'iterations:', toc - tic, 'seconds') 59 | print('Average time per iteration:', (toc - tic) / args.iterations, 'seconds') 60 | 61 | # Printing the sequences and saveing them to the output folder. 
62 | with open(args.output_file, 'w') as f: 63 | for seq in sequences: 64 | f.write(seq['generated_text'] + "\n") 65 | print(f'Output saved to {args.output_file}') 66 | 67 | if __name__ == "__main__": 68 | main() 69 | 70 | -------------------------------------------------------------------------------- /applications/ProteinMPNN/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Ubuntu image as a base 2 | ARG FROM_IMAGE=ubuntu:24.04 3 | 4 | # Stage 1: Set up Conda environment 5 | ARG BASE_IMAGE=condaforge/miniforge3:24.3.0-0 6 | FROM ${BASE_IMAGE} as conda_setup 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | 9 | # Stage 2: Set up the main build environment 10 | FROM ${FROM_IMAGE} as builder 11 | ENV DEBIAN_FRONTEND=noninteractive 12 | 13 | # Install necessary build tools and clean up 14 | RUN apt-get update && apt-get install -y --no-install-recommends \ 15 | git autoconf build-essential wget vim ca-certificates numactl && \ 16 | rm -rf /var/lib/apt/lists/* && \ 17 | apt-get autoremove -y && \ 18 | apt-get clean 19 | 20 | # Non-root user setup 21 | ENV SERVICE_NAME="proteinmpnn-service" 22 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 23 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 24 | 25 | WORKDIR / 26 | RUN git clone https://github.com/dauparas/ProteinMPNN.git 27 | WORKDIR /ProteinMPNN 28 | RUN git checkout 8907e6671bfbfc92303b5f79c4b5e6ce47cdef57 29 | 30 | # Apply the patch file 31 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 32 | RUN tar -xzf Source_code_with_submodules.tar.gz 33 | RUN cp /ProteinMPNN/Open-Omics-Acceleration-Framework/applications/ProteinMPNN/ProteinMPNN.patch . 
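# Once the patch below is applied, the extracted release sources and tarball are no longer needed and are removed.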
34 | RUN git apply ProteinMPNN.patch 35 | RUN rm -rf Open-Omics-Acceleration-Framework 36 | RUN rm -rf Source_code_with_submodules.tar.gz 37 | ENV PATH="/opt/conda/bin:$PATH" 38 | 39 | # Copy Conda installation from the conda_setup stage 40 | COPY --from=conda_setup /opt/conda /opt/conda 41 | 42 | RUN conda create -n p_mpnn python=3.11 pip=24.0 43 | 44 | # Install PyTorch, Torchvision, Torchaudio 45 | RUN conda install -n p_mpnn -y pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 -c pytorch 46 | # Install Intel PyTorch extension 47 | RUN /opt/conda/envs/p_mpnn/bin/python -m pip install intel-extension-for-pytorch==2.3.100 oneccl-bind-pt==2.3.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ 48 | RUN /opt/conda/envs/p_mpnn/bin/python -m pip install numpy==1.26.0 49 | 50 | # Clone jemalloc source for building 51 | WORKDIR /ProteinMPNN 52 | RUN git clone --branch 5.3.0 https://github.com/jemalloc/jemalloc.git 53 | WORKDIR /ProteinMPNN/jemalloc 54 | RUN bash autogen.sh --prefix=/opt/conda/envs/p_mpnn/ && make install 55 | WORKDIR /ProteinMPNN 56 | RUN rm -rf jemalloc 57 | 58 | # Set up environment activation and PATH 59 | ENV PATH="/opt/conda/envs/p_mpnn/bin:/opt/conda/bin:$PATH" 60 | 61 | # Ensure all scripts inside ProteinMPNN/examples are executable 62 | RUN chmod +x /ProteinMPNN/examples/*.py 63 | RUN mkdir /outputs 64 | 65 | # Change ownership of the directory 66 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /ProteinMPNN /outputs 67 | 68 | # Switch to non-root user 69 | USER $SERVICE_NAME 70 | 71 | # Healthcheck disabled 72 | HEALTHCHECK NONE 73 | 74 | # Set environment variables for jemalloc 75 | ENV LD_PRELOAD "/opt/conda/envs/p_mpnn/lib/libjemalloc.so:$LD_PRELOAD" 76 | ENV MALLOC_CONF "oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" 77 | # Set the default command to run the application 78 | CMD source ~/.bashrc && ["/bin/bash", "python"] 79 | -------------------------------------------------------------------------------- /applications/ProteinMPNN/setup_proteinmpnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | trap 'echo "Error on line $LINENO"; exit 1;' ERR 5 | 6 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 7 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 8 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 9 | 10 | CONDA_INSTALL_DIR=$(realpath ./miniforge3) 11 | 12 | # Parse command line arguments 13 | while (( "$#" )); do 14 | case "$1" in 15 | -p) 16 | CONDA_INSTALL_DIR=$2 17 | CONDA_INSTALL_DIR=$(realpath "$CONDA_INSTALL_DIR") 18 | shift 2 19 | ;; 20 | -*|--*=) 21 | echo "Error: Unsupported flag $1" >&2 22 | exit 1 23 | ;; 24 | *) 25 | echo "Error: Unsupported argument $1" >&2 26 | exit 1 27 | ;; 28 | esac 29 | done 30 | 31 | # Check if Miniforge3 exists 32 | if [ ! -d "$CONDA_INSTALL_DIR" ]; then 33 | echo "Miniforge3 is not installed. Installing..." 34 | command -v wget >/dev/null 2>&1 || { echo "wget is required but not installed. Exiting."; exit 1; } 35 | wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh 36 | bash Miniforge3-Linux-x86_64.sh -b -p "$CONDA_INSTALL_DIR" 37 | echo "Miniforge3 installation complete." 38 | else 39 | echo "Miniforge3 is already installed at: $CONDA_INSTALL_DIR" 40 | fi 41 | 42 | export PATH="$CONDA_INSTALL_DIR/bin:$PATH" 43 | 44 | # Clone the ProteinMPNN repository 45 | if [ ! 
-d "ProteinMPNN" ]; then 46 | git clone https://github.com/dauparas/ProteinMPNN.git 47 | else 48 | echo "ProteinMPNN repository already exists, skipping git clone." 49 | fi 50 | 51 | cd ProteinMPNN 52 | git checkout 8907e6671bfbfc92303b5f79c4b5e6ce47cdef57 53 | PATCH_FILE="$ABS_DIRECTORY/ProteinMPNN.patch" 54 | if [ -f "$PATCH_FILE" ]; then 55 | if git apply --reverse --check "$PATCH_FILE" > /dev/null 2>&1; then 56 | echo "Patch has already been applied. Skipping patch step." 57 | else 58 | git apply "$PATCH_FILE" 59 | echo "Patch applied successfully." 60 | fi 61 | else 62 | echo "Error: Patch file not found at $PATCH_FILE" >&2 63 | exit 1 64 | fi 65 | 66 | # Create and activate the Conda environment 67 | #source "$CONDA_INSTALL_DIR/bin/activate" 68 | if conda env list | grep -q "^p_mpnn"; then 69 | echo "Environment exists. Moving ahead without create the env. If the setup crashes, please remove manually." 70 | else 71 | echo "Creating conda env p_mpnn.." 72 | conda create -n p_mpnn -y python=3.11 pip=24.0 73 | fi 74 | 75 | source $CONDA_INSTALL_DIR/bin/activate p_mpnn 76 | #conda activate p_mpnn 77 | 78 | conda install -n p_mpnn -y pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 -c pytorch 79 | pip install intel-extension-for-pytorch==2.3.100 80 | pip install numpy==1.26.0 81 | 82 | echo "setup complete!" 83 | echo "Note:" 84 | echo "Conda (Miniforge3) is installed at $CONDA_INSTALL_DIR" 85 | echo "To manually activate conda env, do: source $CONDA_INSTALL_DIR/bin/activate SE3nv" 86 | -------------------------------------------------------------------------------- /applications/RFdiffusion/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Ubuntu image as a base 2 | ARG FROM_IMAGE=ubuntu:24.04 3 | 4 | # Stage 1: Set up Conda environment 5 | ARG BASE_IMAGE=condaforge/miniforge3:24.3.0-0 6 | FROM ${BASE_IMAGE} as conda_setup 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | 9 | # Stage 2: Set up the main build environment 10 | FROM ${FROM_IMAGE} as builder 11 | ENV DEBIAN_FRONTEND=noninteractive 12 | 13 | # Install necessary build tools and clean up 14 | RUN apt-get update && apt-get install -y --no-install-recommends \ 15 | git build-essential wget vim ca-certificates numactl autoconf automake make && \ 16 | rm -rf /var/lib/apt/lists/* && \ 17 | apt-get autoremove -y && \ 18 | apt-get clean 19 | 20 | # Non-root user setup 21 | ENV SERVICE_NAME="rfdiffusion-service" 22 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 23 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 24 | 25 | # Copy Conda installation from the conda_setup stage 26 | COPY --from=conda_setup /opt/conda /opt/conda 27 | 28 | WORKDIR /app 29 | RUN git clone https://github.com/RosettaCommons/RFdiffusion.git 30 | 31 | WORKDIR /app/RFdiffusion 32 | # adding the git commit id 33 | RUN git checkout 820bfdfaded8c260b962dc40a3171eae316b6ce0 34 | 35 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 36 | RUN tar -xzf Source_code_with_submodules.tar.gz 37 | RUN cp /app/RFdiffusion/Open-Omics-Acceleration-Framework/applications/RFdiffusion/RFdiffusion.patch . 
38 | RUN git apply RFdiffusion.patch 39 | RUN rm -rf Open-Omics-Acceleration-Framework 40 | RUN rm -rf Source_code_with_submodules.tar.gz 41 | 42 | WORKDIR /app/RFdiffusion/models 43 | RUN wget https://files.ipd.uw.edu/pub/RFdiffusion/6f5902ac237024bdd0c176cb93063dc4/Base_ckpt.pt && \ 44 | wget https://files.ipd.uw.edu/pub/RFdiffusion/e29311f6f1bf1af907f9ef9f44b8328b/Complex_base_ckpt.pt && \ 45 | wget https://files.ipd.uw.edu/pub/RFdiffusion/60f09a193fb5e5ccdc4980417708dbab/Complex_Fold_base_ckpt.pt && \ 46 | wget https://files.ipd.uw.edu/pub/RFdiffusion/74f51cfb8b440f50d70878e05361d8f0/InpaintSeq_ckpt.pt && \ 47 | wget https://files.ipd.uw.edu/pub/RFdiffusion/76d00716416567174cdb7ca96e208296/InpaintSeq_Fold_ckpt.pt && \ 48 | wget https://files.ipd.uw.edu/pub/RFdiffusion/5532d2e1f3a4738decd58b19d633b3c3/ActiveSite_ckpt.pt && \ 49 | wget https://files.ipd.uw.edu/pub/RFdiffusion/12fc204edeae5b57713c5ad7dcb97d39/Base_epoch8_ckpt.pt && \ 50 | wget https://files.ipd.uw.edu/pub/RFdiffusion/f572d396fae9206628714fb2ce00f72e/Complex_beta_ckpt.pt && \ 51 | wget https://files.ipd.uw.edu/pub/RFdiffusion/1befcb9b28e2f778f53d47f18b7597fa/RF_structure_prediction_weights.pt 52 | 53 | WORKDIR /app/RFdiffusion 54 | 55 | RUN /opt/conda/bin/conda env create -f env/SE3nv.yml 56 | 57 | RUN git clone --branch 5.3.0 https://github.com/jemalloc/jemalloc.git 58 | WORKDIR /app/RFdiffusion/jemalloc 59 | RUN bash autogen.sh --prefix=/opt/conda/envs/SE3nv/ && make install 60 | WORKDIR /app/RFdiffusion 61 | RUN rm -rf jemalloc 62 | 63 | # Set up environment activation and PATH 64 | ENV PATH="/opt/conda/envs/SE3nv/bin:/opt/conda/bin:$PATH" 65 | 66 | # Install dependencies 67 | WORKDIR /app/RFdiffusion/env/SE3Transformer 68 | RUN pip install --no-cache-dir -r requirements.txt 69 | RUN python setup.py install 70 | 71 | WORKDIR /app/RFdiffusion 72 | RUN pip install -e . 73 | 74 | RUN tar -xvf examples/ppi_scaffolds_subset.tar.gz -C examples/ 75 | 76 | WORKDIR /app/RFdiffusion/scripts 77 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /app 78 | 79 | # Switch to non-root user 80 | USER $SERVICE_NAME 81 | HEALTHCHECK NONE 82 | 83 | ENV LD_PRELOAD "/opt/conda/envs/SE3nv/lib/libjemalloc.so:$LD_PRELOAD" 84 | ENV MALLOC_CONF "oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" 85 | CMD python 86 | -------------------------------------------------------------------------------- /applications/RFdiffusion/setup_rfdiffusion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 5 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 6 | #echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 7 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 8 | # Default Conda installation directory 9 | CONDA_INSTALL_DIR=$(realpath ./miniforge3) 10 | 11 | # Parse command line arguments 12 | while (( "$#" )); do 13 | case "$1" in 14 | -p) 15 | CONDA_INSTALL_DIR=$2 16 | CONDA_INSTALL_DIR=$(realpath "$CONDA_INSTALL_DIR") 17 | shift 2 18 | ;; 19 | -*|--*=) # Unsupported flags 20 | echo "Error: Unsupported flag $1" >&2 21 | exit 1 22 | ;; 23 | *) # Preserve positional arguments 24 | echo "Error: Unsupported argument $1" >&2 25 | exit 1 26 | ;; 27 | esac 28 | done 29 | # Check if Miniforge3 exists and install if not found 30 | if [ ! -d "$CONDA_INSTALL_DIR" ]; then 31 | echo "Miniforge3 is not installed. Installing..." 
32 | wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh 33 | bash Miniforge3-Linux-x86_64.sh -b -p "$CONDA_INSTALL_DIR" 34 | echo "Miniforge3 installation complete." 35 | else 36 | echo "Miniforge3 is already installed at: $CONDA_INSTALL_DIR" 37 | fi 38 | # Export Conda binary path 39 | export PATH="$CONDA_INSTALL_DIR/bin:$PATH" 40 | # Clone the RFdiffusion repository if it doesn't exist 41 | if [ ! -d "RFdiffusion" ]; then 42 | git clone https://github.com/RosettaCommons/RFdiffusion.git 43 | else 44 | echo "RFdiffusion repository already exists, skipping git clone." 45 | fi 46 | 47 | echo "$CONDA_INSTALL_DIR" 48 | # Apply patch (assuming patch file is RFdiffusion.patch and it should be applied in RFdiffusion directory) 49 | cd RFdiffusion 50 | git checkout 820bfdfaded8c260b962dc40a3171eae316b6ce0 51 | git log -1 52 | PATCH_FILE="$ABS_DIRECTORY/RFdiffusion.patch" 53 | echo $PATCH_FILE 54 | if [ -f "$PATCH_FILE" ]; then 55 | # Check if the patch is already applied 56 | if git apply --reverse --check "$PATCH_FILE" > /dev/null 2>&1; then 57 | echo "Patch has already been applied. Skipping patch step." 58 | else 59 | git apply "$PATCH_FILE" 60 | echo "Patch applied successfully." 61 | fi 62 | else 63 | echo "Error: Patch file not found at $PATCH_FILE" >&2 64 | exit 1 65 | fi 66 | mkdir -p models 67 | cd models/ 68 | wget https://files.ipd.uw.edu/pub/RFdiffusion/6f5902ac237024bdd0c176cb93063dc4/Base_ckpt.pt 69 | wget https://files.ipd.uw.edu/pub/RFdiffusion/e29311f6f1bf1af907f9ef9f44b8328b/Complex_base_ckpt.pt 70 | wget https://files.ipd.uw.edu/pub/RFdiffusion/60f09a193fb5e5ccdc4980417708dbab/Complex_Fold_base_ckpt.pt 71 | wget https://files.ipd.uw.edu/pub/RFdiffusion/74f51cfb8b440f50d70878e05361d8f0/InpaintSeq_ckpt.pt 72 | wget https://files.ipd.uw.edu/pub/RFdiffusion/76d00716416567174cdb7ca96e208296/InpaintSeq_Fold_ckpt.pt 73 | wget https://files.ipd.uw.edu/pub/RFdiffusion/5532d2e1f3a4738decd58b19d633b3c3/ActiveSite_ckpt.pt 74 | wget https://files.ipd.uw.edu/pub/RFdiffusion/12fc204edeae5b57713c5ad7dcb97d39/Base_epoch8_ckpt.pt 75 | # Optional: 76 | wget https://files.ipd.uw.edu/pub/RFdiffusion/f572d396fae9206628714fb2ce00f72e/Complex_beta_ckpt.pt 77 | # original structure prediction weights 78 | wget https://files.ipd.uw.edu/pub/RFdiffusion/1befcb9b28e2f778f53d47f18b7597fa/RF_structure_prediction_weights.pt 79 | cd ../ 80 | # Create and activate the Conda environment using the YAML file, disabling plugins to avoid errors 81 | #CONDA_NO_PLUGINS=true 82 | if conda env list | grep -q "^SE3nv"; then 83 | echo "Environment exists. Moving ahead without create the env. If the setup crashes, please remove manually." 84 | else 85 | echo "Creating conda env SE3nv.." 86 | conda env create -f env/SE3nv.yml 87 | fi 88 | source $CONDA_INSTALL_DIR/bin/activate SE3nv 89 | #conda init 90 | #conda activate SE3nv 91 | 92 | # Install SE3Transformer requirements 93 | cd env/SE3Transformer 94 | pip install --no-cache-dir -r requirements.txt 95 | python setup.py install 96 | 97 | # Install the rfdiffusion module 98 | cd ../.. # Change into the root directory of the repository 99 | pip install -e . 
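# Illustrative smoke test (not run by this script; see the RFdiffusion README for the full set of inference options):
# with the SE3nv environment active, an unconditional design can be generated from the repository root, e.g.
#   ./scripts/run_inference.py 'contigmap.contigs=[150-150]' inference.output_prefix=test_outputs/test inference.num_designs=1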
100 | 101 | echo "" 102 | echo "Note:" 103 | echo "Conda (Miniforge3) is installed at $CONDA_INSTALL_DIR" 104 | echo "To manually activate conda env, do: source $CONDA_INSTALL_DIR/bin/activate SE3nv" 105 | -------------------------------------------------------------------------------- /applications/boltz/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Ubuntu image as a base 2 | ARG FROM_IMAGE=ubuntu:24.04 3 | 4 | # Stage 1: Set up Conda environment 5 | ARG BASE_IMAGE=condaforge/miniforge3:24.3.0-0 6 | FROM ${BASE_IMAGE} as conda_setup 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | 9 | # Stage 2: Set up the main build environment 10 | FROM ${FROM_IMAGE} as builder 11 | ENV DEBIAN_FRONTEND=noninteractive 12 | 13 | # Install necessary build tools and clean up 14 | RUN apt-get update && apt-get install -y --no-install-recommends \ 15 | git build-essential wget vim ca-certificates numactl autoconf automake make && \ 16 | rm -rf /var/lib/apt/lists/* && \ 17 | apt-get autoremove -y && \ 18 | apt-get clean 19 | 20 | # Build arguments for host UID/GID 21 | ARG USER_ID=2000 22 | ARG GROUP_ID=2000 23 | 24 | ENV SERVICE_NAME="boltz-service" 25 | 26 | # Create a user and group with same UID and GID as host 27 | RUN groupadd --gid ${GROUP_ID} $SERVICE_NAME && \ 28 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid ${USER_ID} $SERVICE_NAME 29 | 30 | # Copy Conda installation from the conda_setup stage 31 | COPY --from=conda_setup /opt/conda /opt/conda 32 | ENV PATH="/opt/conda/bin:$PATH" 33 | ENV LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH:-}" 34 | RUN echo "source activate" >> ~/.bashrc 35 | 36 | RUN git clone --branch 5.3.0 https://github.com/jemalloc/jemalloc.git 37 | WORKDIR /jemalloc 38 | RUN bash autogen.sh --prefix=/opt/conda/ && make install 39 | WORKDIR / 40 | RUN rm -rf jemalloc 41 | 42 | WORKDIR /app 43 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /app 44 | RUN git clone --branch v0.4.1 https://github.com/jwohlwend/boltz.git 45 | 46 | WORKDIR /app/boltz 47 | RUN pip install -e . 48 | 49 | # Switch to non-root user 50 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /app/boltz 51 | COPY ./entrypoint.sh entrypoint.sh 52 | RUN chmod +x entrypoint.sh 53 | 54 | USER $SERVICE_NAME 55 | 56 | # bin bash 57 | # Clone Boltz 1 repository (replace with the actual repo URL) 58 | ENTRYPOINT ["/app/boltz/entrypoint.sh"] 59 | 60 | # Default command 61 | CMD ["default"] -------------------------------------------------------------------------------- /applications/boltz/README.md: -------------------------------------------------------------------------------- 1 | ## 🔍 Running Inference with Boltz Docker 2 | 3 | Follow the steps below to run inference using the Boltz Docker container: 4 | 5 | --- 6 | 7 | ### 🐳 1. Build the Docker Image 8 | 9 | From the root of the project directory, build the Docker image: 10 | 11 | ```bash 12 | docker build -t boltz1 . 13 | ``` 14 | 15 | --- 16 | 17 | ### 📁 2. Create and Set Output Directory Permissions 18 | 19 | Create an output folder and give it proper write permissions: 20 | 21 | ```bash 22 | mkdir -p 23 | chmod a+w 24 | 25 | export OUTPUT=$PWD/ 26 | export MODELS=$PWD/ 27 | export INPUT=$PWD/ 28 | ``` 29 | 30 | > ⚠️ Docker needs write permissions in the `` and `` folder. 
`` is the folder contaning the input `.yaml` or `.fasta` file 31 | 32 | Example 33 | 34 | ```bash 35 | mkdir -p ./output ./model 36 | chmod a+w ./output ./model 37 | 38 | export OUTPUT=$PWD/output 39 | export MODELS=$PWD/model 40 | export INPUT=$PWD/examples/ 41 | ``` 42 | 43 | --- 44 | 45 | ### 🚀 3. Run Inference 46 | 47 | In order to do inferencing few things needs to be done 48 | Mount the volumes for input folder and output folder. Pass the mounted volumes to boltz as arguments. So the docker run command looks like 49 | 50 | ```bash 51 | docker run -it \ 52 | --shm-size=100g \ 53 | -v $INPUT:/app/boltz/input \ 54 | -v $MODELS:/home/boltz-service/.boltz/ \ 55 | -v $OUTPUT:/app/boltz/output \ 56 | boltz1 57 | ``` 58 | 59 | > 📝 The `--shm-size=100g` flag avoids shared memory issues during data loading with PyTorch. 60 | 61 | --- 62 | 63 | ### ✅ Output 64 | 65 | Results will be written to the folder. 66 | 67 | Boltz currently accepts three input formats: 68 | 69 | 1. Fasta file, for most use cases 70 | 71 | 2. A comprehensive YAML schema, for more complex use cases 72 | 73 | 3. A directory containing files of the above formats, for batched processing 74 | 75 | ## For more information checkout [boltz](https://github.com/jwohlwend/boltz) 76 | 77 | ## License 78 | 79 | Our model and code are released under MIT License, and can be freely used for both academic and commercial purposes. 80 | 81 | 82 | ## Cite 83 | 84 | If you use this code or the models in your research, please cite the following paper: 85 | 86 | ```bibtex 87 | @article{wohlwend2024boltz1, 88 | author = {Wohlwend, Jeremy and Corso, Gabriele and Passaro, Saro and Reveiz, Mateo and Leidal, Ken and Swiderski, Wojtek and Portnoi, Tally and Chinn, Itamar and Silterra, Jacob and Jaakkola, Tommi and Barzilay, Regina}, 89 | title = {Boltz-1: Democratizing Biomolecular Interaction Modeling}, 90 | year = {2024}, 91 | doi = {10.1101/2024.11.19.624167}, 92 | journal = {bioRxiv} 93 | } 94 | ``` 95 | 96 | In addition if you use the automatic MSA generation, please cite: 97 | 98 | ```bibtex 99 | @article{mirdita2022colabfold, 100 | title={ColabFold: making protein folding accessible to all}, 101 | author={Mirdita, Milot and Sch{\"u}tze, Konstantin and Moriwaki, Yoshitaka and Heo, Lim and Ovchinnikov, Sergey and Steinegger, Martin}, 102 | journal={Nature methods}, 103 | year={2022}, 104 | } 105 | ``` 106 | -------------------------------------------------------------------------------- /applications/boltz/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | INPUT_DIR="/app/boltz/input" 4 | OUTPUT_DIR="/app/boltz/output" 5 | CUDA_VISIBLE_DEVICES="" 6 | 7 | FOUND=0 8 | 9 | for INPUT_FILE in "$INPUT_DIR"/*; do 10 | # Only process .yaml or .fasta files 11 | if [[ "$INPUT_FILE" == *.yaml || "$INPUT_FILE" == *.fasta ]]; then 12 | echo "📂 Processing: $INPUT_FILE" 13 | LD_PRELOAD=/opt/conda/lib/libjemalloc.so:$LD_PRELOAD \ 14 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" \ 15 | boltz predict "$INPUT_FILE" --out_dir "$OUTPUT_DIR" --accelerator "cpu" 16 | FOUND=1 17 | fi 18 | done 19 | 20 | if [[ $FOUND -eq 0 ]]; then 21 | echo "❌ No .yaml or .fasta files found in $INPUT_DIR" 22 | exit 1 23 | fi 24 | -------------------------------------------------------------------------------- /applications/esm/Dockerfile.base: -------------------------------------------------------------------------------- 1 | ARG 
FROM_IMAGE=ubuntu:24.04 2 | 3 | # Stage 2: Set up the main build environment 4 | FROM ${FROM_IMAGE} as builder 5 | 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | 8 | ARG http_proxy 9 | ENV http_proxy=${http_proxy} 10 | 11 | ARG https_proxy 12 | ENV https_proxy=${https_proxy} 13 | 14 | ARG no_proxy 15 | ENV no_proxy=${no_proxy} 16 | 17 | 18 | # Install necessary build tools and clean up 19 | RUN apt-get update && apt-get install -y --no-install-recommends \ 20 | git build-essential wget vim ca-certificates autoconf automake make numactl && \ 21 | rm -rf /var/lib/apt/lists/* && \ 22 | apt-get autoremove -y && \ 23 | apt-get clean 24 | ENV SERVICE_NAME="esm-base-service" 25 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 26 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 27 | 28 | WORKDIR /app 29 | 30 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /app 31 | USER $SERVICE_NAME 32 | 33 | RUN wget --no-check-certificate "https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-$(uname)-$(uname -m).sh" 34 | RUN bash Miniforge3-$(uname)-$(uname -m).sh -b -p "${HOME}/conda" 35 | 36 | WORKDIR /app 37 | 38 | RUN git clone --recursive https://github.com/facebookresearch/esm.git 39 | WORKDIR /app/esm 40 | RUN git checkout -b esm 2b369911bb5b4b0dda914521b9475cad1656b2ac 41 | 42 | 43 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 44 | RUN tar -xzf Source_code_with_submodules.tar.gz 45 | 46 | RUN mv /app/esm/Open-Omics-Acceleration-Framework/applications/esm /app/esm/omics_setup && \ 47 | git apply /app/esm/omics_setup/esm_change_all.patch && \ 48 | rm -rf /app/esm/Open-Omics-Acceleration-Framework && \ 49 | rm -rf /app/esm/Source_code_with_submodules.tar.gz 50 | 51 | RUN mkdir -p /home/esm-base-service/.cache/torch/hub/ && \ 52 | rm -rf /home/esm-base-service/.cache/torch/hub/checkpoints && \ 53 | ln -s /checkpoints /home/esm-base-service/.cache/torch/hub/checkpoints 54 | 55 | HEALTHCHECK NONE 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /applications/esm/Dockerfile.esm: -------------------------------------------------------------------------------- 1 | # Accept the base image name as an argument 2 | ARG BASE_IMAGE 3 | 4 | # Extend from the base image passed as an argument 5 | FROM ${BASE_IMAGE} 6 | 7 | USER $SERVICE_NAME 8 | # Create Conda Env for ESM 9 | WORKDIR /app/esm/ 10 | RUN ${HOME}/conda/bin/mamba env create -f /app/esm/omics_setup/env.yml 11 | 12 | WORKDIR /app/esm 13 | RUN bash -c "source ${HOME}/conda/etc/profile.d/conda.sh && \ 14 | source ${HOME}/conda/etc/profile.d/mamba.sh && \ 15 | mamba activate esm_py11 && \ 16 | pip install . 
&&\ 17 | pip install torch==2.4.0+cpu torchvision==0.19.0+cpu torchaudio==2.4.0+cpu --index-url https://download.pytorch.org/whl/cpu" 18 | 19 | RUN echo "#!/bin/bash" >> /app/init.sh && \ 20 | echo "source ${HOME}/conda/etc/profile.d/conda.sh" >> /app/init.sh && \ 21 | echo "source ${HOME}/conda/etc/profile.d/mamba.sh" >> /app/init.sh && \ 22 | echo "mamba activate esm_py11" >> /app/init.sh && \ 23 | chmod +x /app/init.sh && \ 24 | echo "source /app/init.sh" >> ~/.bashrc 25 | 26 | WORKDIR /app/esm/examples/lm-design/ 27 | RUN wget https://dl.fbaipublicfiles.com/fair-esm/examples/lm_design/linear_projection_model.pt 28 | 29 | 30 | WORKDIR /app/esm/ 31 | HEALTHCHECK NONE 32 | 33 | RUN echo '#!/bin/bash' > /app/entrypoint.sh && \ 34 | echo 'if [ -z "$1" ]; then' >> /app/entrypoint.sh && \ 35 | echo ' exec /bin/bash' >> /app/entrypoint.sh && \ 36 | echo 'else' >> /app/entrypoint.sh && \ 37 | echo ' source /app/init.sh' >> /app/entrypoint.sh && \ 38 | echo ' exec "$@"' >> /app/entrypoint.sh && \ 39 | echo 'fi' >> /app/entrypoint.sh && \ 40 | chmod +x /app/entrypoint.sh 41 | 42 | ENTRYPOINT ["/app/entrypoint.sh"] 43 | CMD [] 44 | -------------------------------------------------------------------------------- /applications/esm/Dockerfile.esmfold: -------------------------------------------------------------------------------- 1 | # Accept the base image name as an argument 2 | ARG BASE_IMAGE 3 | 4 | # Extend from the base image passed as an argument 5 | FROM ${BASE_IMAGE} 6 | 7 | USER $SERVICE_NAME 8 | # Clone Openfold 9 | WORKDIR /app/esm 10 | RUN git clone https://github.com/aqlaboratory/openfold.git 11 | WORKDIR /app/esm/openfold 12 | RUN git checkout -b esm_openfold 4b41059694619831a7db195b7e0988fc4ff3a307 && \ 13 | git apply /app/esm/omics_setup/esm_openfold_change_py37.patch 14 | 15 | WORKDIR /app/esm 16 | RUN ${HOME}/conda/bin/mamba env create -f environment.yml 17 | RUN bash -c "source ${HOME}/conda/etc/profile.d/conda.sh && \ 18 | source ${HOME}/conda/etc/profile.d/mamba.sh && \ 19 | mamba activate esmfold && \ 20 | pip install . 
" 21 | 22 | WORKDIR /app/esm/openfold 23 | RUN bash -c "source ${HOME}/conda/etc/profile.d/conda.sh && \ 24 | source ${HOME}/conda/etc/profile.d/mamba.sh && \ 25 | mamba activate esmfold && \ 26 | python setup.py install " 27 | 28 | WORKDIR /app/esm 29 | RUN git clone --branch 5.3.0 https://github.com/jemalloc/jemalloc.git 30 | WORKDIR /app/esm/jemalloc 31 | RUN bash autogen.sh --prefix=${HOME}/conda/envs/esmfold/ && make install && \ 32 | rm -rf /app/esm/jemalloc 33 | 34 | RUN echo "#!/bin/bash" >> /app/init.sh && \ 35 | echo "source ${HOME}/conda/etc/profile.d/conda.sh" >> /app/init.sh && \ 36 | echo "source ${HOME}/conda/etc/profile.d/mamba.sh" >> /app/init.sh && \ 37 | echo "mamba activate esmfold" >> /app/init.sh && \ 38 | chmod +x /app/init.sh && \ 39 | echo "source /app/init.sh" >> ~/.bashrc 40 | 41 | WORKDIR /app/esm/ 42 | HEALTHCHECK NONE 43 | 44 | ENV LD_PRELOAD "/home/esm-base-service/conda/envs/esmfold/lib/libjemalloc.so:$LD_PRELOAD" 45 | ENV MALLOC_CONF "oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" 46 | 47 | RUN echo '#!/bin/bash' > /app/entrypoint.sh && \ 48 | echo 'if [ -z "$1" ]; then' >> /app/entrypoint.sh && \ 49 | echo ' exec /bin/bash' >> /app/entrypoint.sh && \ 50 | echo 'else' >> /app/entrypoint.sh && \ 51 | echo ' source /app/init.sh' >> /app/entrypoint.sh && \ 52 | echo ' exec "$@"' >> /app/entrypoint.sh && \ 53 | echo 'fi' >> /app/entrypoint.sh && \ 54 | chmod +x /app/entrypoint.sh 55 | 56 | 57 | ENTRYPOINT ["/app/entrypoint.sh"] 58 | CMD [] 59 | 60 | -------------------------------------------------------------------------------- /applications/esm/build_docker_images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Constants 4 | BASE_IMAGE=esm_base_image 5 | ESM_IMAGE=esm_image 6 | ESMFOLD_IMAGE=esmfold_image 7 | 8 | # Function to check if a command exists 9 | command_exists() { 10 | command -v "$1" >/dev/null 2>&1 11 | } 12 | 13 | # Function to check if a Docker image exists 14 | image_exists() { 15 | local image_name="$1" 16 | $runtime images -q "$image_name" | grep -q . 17 | } 18 | 19 | # Parse optional proxy args from command-line 20 | for arg in "$@"; do 21 | case $arg in 22 | --http_proxy=*) 23 | http_proxy="${arg#*=}" 24 | ;; 25 | --https_proxy=*) 26 | https_proxy="${arg#*=}" 27 | ;; 28 | --no_proxy=*) 29 | no_proxy="${arg#*=}" 30 | ;; 31 | *) 32 | echo "Unknown option: $arg" 33 | exit 1 34 | ;; 35 | esac 36 | done 37 | 38 | runtime=docker 39 | if ! command_exists "$runtime"; then 40 | echo "$runtime is not installed on your system. Please install it first." 41 | exit 1 42 | fi 43 | 44 | build_esm=false 45 | build_esmfold=false 46 | 47 | # Prompt for tasks to build 48 | echo "Which images do you want to build?" 49 | echo "1 esm" 50 | echo "2 esm_fold" 51 | echo "3 Both esm and esm_fold" 52 | read -r task_option 53 | 54 | case $task_option in 55 | 1) build_esm=true ;; 56 | 2) build_esmfold=true ;; 57 | 3) build_esm=true; build_esmfold=true ;; 58 | *) echo "Invalid option selected. 
Please choose 1, 2, or 3."; exit 1 ;; 59 | esac 60 | 61 | echo "build_esm = ${build_esm}" 62 | echo "build_esmfold = ${build_esmfold}" 63 | 64 | # Function to build Docker image 65 | build_image() { 66 | local image_name="$1" 67 | local dockerfile="$2" 68 | local args=(--build-arg BASE_IMAGE=$BASE_IMAGE) 69 | 70 | [[ -n "$http_proxy" ]] && args+=(--build-arg http_proxy=$http_proxy) 71 | [[ -n "$https_proxy" ]] && args+=(--build-arg https_proxy=$https_proxy) 72 | [[ -n "$no_proxy" ]] && args+=(--build-arg no_proxy=$no_proxy) 73 | 74 | $runtime build "${args[@]}" -f "$dockerfile" -t "$image_name" . 75 | } 76 | 77 | # Build base image 78 | if ! image_exists "$BASE_IMAGE"; then 79 | echo "Building base image..." 80 | build_image "$BASE_IMAGE" "Dockerfile.base" 81 | fi 82 | 83 | # Build esm image 84 | if $build_esm && ! image_exists "$ESM_IMAGE"; then 85 | echo "Building image for esm..." 86 | build_image "$ESM_IMAGE" "Dockerfile.esm" 87 | fi 88 | 89 | # Build esm_fold image 90 | if $build_esmfold && ! image_exists "$ESMFOLD_IMAGE"; then 91 | echo "Building image for esm_fold..." 92 | build_image "$ESMFOLD_IMAGE" "Dockerfile.esmfold" 93 | fi 94 | 95 | echo "Build process completed." 96 | -------------------------------------------------------------------------------- /applications/esm/env.yml: -------------------------------------------------------------------------------- 1 | name: esm_py11 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.11 6 | - pip 7 | - pip: 8 | - numpy==1.26.4 9 | - hydra-core==1.3.2 10 | - nltk==3.8.1 11 | - py3Dmol==2.3.0 12 | - biotite==0.41.2 13 | - torch-geometric==2.5.3 14 | - dm-tree==0.1.8 15 | 16 | -------------------------------------------------------------------------------- /applications/esm3/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Intel Corporation 2 | # SPDX-License-Identifier: MIT License 3 | 4 | ARG FROM_IMAGE=ubuntu:25.10 5 | 6 | FROM ${FROM_IMAGE} AS builder 7 | 8 | ENV DEBIAN_FRONTEND=noninteractive 9 | 10 | RUN apt-get update && apt-get install -y --no-install-recommends \ 11 | git build-essential wget vim ca-certificates autoconf automake make numactl && \ 12 | rm -rf /var/lib/apt/lists/* && \ 13 | apt-get autoremove -y && \ 14 | apt-get clean 15 | ENV SERVICE_NAME="esm3-base-service" 16 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 17 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 18 | 19 | WORKDIR /app 20 | 21 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /app 22 | USER $SERVICE_NAME 23 | 24 | RUN wget --no-check-certificate "https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-$(uname)-$(uname -m).sh" 25 | RUN bash Miniforge3-$(uname)-$(uname -m).sh -b -p "${HOME}/conda" 26 | 27 | WORKDIR /app 28 | COPY . 
/app 29 | RUN git clone https://github.com/evolutionaryscale/esm.git 30 | WORKDIR /app/esm 31 | RUN git checkout -b esm d40007ea16850da4fbf60244a9d50c2a94cbef3d 32 | RUN cp /app/esm3_changes.patch /app/esm/esm_changes.patch 33 | RUN git apply /app/esm/esm_changes.patch 34 | 35 | WORKDIR /app/esm/ 36 | RUN ${HOME}/conda/bin/conda update -y --all 37 | RUN ${HOME}/conda/bin/mamba update -y --all 38 | RUN ${HOME}/conda/bin/mamba env create -f /app/env.yml 39 | ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu 40 | 41 | WORKDIR /app/esm 42 | RUN bash -c "source ${HOME}/conda/etc/profile.d/conda.sh && \ 43 | source ${HOME}/conda/etc/profile.d/mamba.sh && \ 44 | mamba activate esm3 && \ 45 | pip install --upgrade pip setuptools wheel && \ 46 | pip install ." 47 | 48 | 49 | RUN echo "#!/bin/bash" > /app/init.sh && \ 50 | echo "source ${HOME}/conda/etc/profile.d/conda.sh" >> /app/init.sh && \ 51 | echo "source ${HOME}/conda/etc/profile.d/mamba.sh" >> /app/init.sh && \ 52 | echo "mamba activate esm3" >> /app/init.sh && \ 53 | chmod +x /app/init.sh && \ 54 | echo "source /app/init.sh" >> ~/.bashrc 55 | 56 | WORKDIR /app/esm 57 | 58 | RUN echo '#!/bin/bash' > /app/entrypoint.sh && \ 59 | echo 'if [ -z "$1" ]; then' >> /app/entrypoint.sh && \ 60 | echo ' exec /bin/bash' >> /app/entrypoint.sh && \ 61 | echo 'else' >> /app/entrypoint.sh && \ 62 | echo ' source /app/init.sh' >> /app/entrypoint.sh && \ 63 | echo ' exec "$@"' >> /app/entrypoint.sh && \ 64 | echo 'fi' >> /app/entrypoint.sh && \ 65 | chmod +x /app/entrypoint.sh 66 | 67 | ENTRYPOINT ["/app/entrypoint.sh"] 68 | CMD [] 69 | HEALTHCHECK NONE 70 | 71 | WORKDIR /app 72 | ENV HF_HOME=/home/esm3-base-service/.cache/torch 73 | 74 | RUN mkdir -p /home/esm3-base-service/.cache/torch && \ 75 | ln -s /models /home/esm3-base-service/.cache/torch/hub 76 | -------------------------------------------------------------------------------- /applications/esm3/env.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Intel Corporation 2 | # SPDX-License-Identifier: MIT License 3 | 4 | name: esm3 5 | channels: 6 | - conda-forge 7 | dependencies: 8 | - python=3.10.16 9 | - mamba 10 | - pip 11 | -------------------------------------------------------------------------------- /applications/esm3/esm3_changes.patch: -------------------------------------------------------------------------------- 1 | diff --git a/esm/models/esm3.py b/esm/models/esm3.py 2 | index cbe02dd..b6df418 100644 3 | --- a/esm/models/esm3.py 4 | +++ b/esm/models/esm3.py 5 | @@ -227,7 +227,10 @@ class ESM3(nn.Module, ESM3InferenceClient): 6 | 7 | @classmethod 8 | def from_pretrained( 9 | - cls, model_name: str = ESM3_OPEN_SMALL, device: torch.device | None = None 10 | + cls, 11 | + model_name: str = ESM3_OPEN_SMALL, 12 | + device: torch.device | None = None, 13 | + bf16: bool = False # Add bf16 argument 14 | ) -> ESM3: 15 | from esm.pretrained import load_local_model 16 | 17 | @@ -236,7 +239,14 @@ class ESM3(nn.Module, ESM3InferenceClient): 18 | raise ValueError(f"Model name {model_name} is not a valid ESM3 model name.") 19 | if device is None: 20 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 21 | + # Determine dtype based on bf16 flag 22 | + if bf16: 23 | + dtype = torch.bfloat16 24 | + else: 25 | + dtype = torch.float32 26 | + print("Selected data type:", dtype) 27 | model = load_local_model(model_name, device=device) 28 | + model = model.to(dtype) 29 | if device.type != "cpu": 30 | model = 
model.to(torch.bfloat16) 31 | assert isinstance(model, ESM3) 32 | diff --git a/esm/models/esmc.py b/esm/models/esmc.py 33 | index 0807a21..e93085e 100644 34 | --- a/esm/models/esmc.py 35 | +++ b/esm/models/esmc.py 36 | @@ -77,13 +77,23 @@ class ESMC(nn.Module, ESMCInferenceClient): 37 | 38 | @classmethod 39 | def from_pretrained( 40 | - cls, model_name: str = ESMC_600M, device: torch.device | None = None 41 | + cls, model_name: str = ESMC_600M, device: torch.device | None = None, 42 | + bf16: bool = False, # Add bf16 argument 43 | ) -> ESMC: 44 | from esm.pretrained import load_local_model 45 | 46 | if device is None: 47 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 48 | + # Determine dtype based on bf16 flag 49 | + if bf16: 50 | + dtype = torch.bfloat16 51 | + else: 52 | + dtype = torch.float32 53 | + 54 | + print("Selected data type ESMC :", dtype) 55 | + print("model_name ESMC",model_name) 56 | model = load_local_model(model_name, device=device) 57 | + model = model.to(dtype) 58 | if device.type != "cpu": 59 | model = model.to(torch.bfloat16) 60 | assert isinstance(model, ESMC) 61 | diff --git a/esm/utils/structure/affine3d.py b/esm/utils/structure/affine3d.py 62 | index 382abcd..db201d6 100644 63 | --- a/esm/utils/structure/affine3d.py 64 | +++ b/esm/utils/structure/affine3d.py 65 | @@ -124,7 +124,7 @@ class RotationMatrix(Rotation): 66 | with fp32_autocast_context(self.device.type): 67 | if self._rots.shape[-3] == 1: 68 | # This is a slight speedup over einsum for batched rotations 69 | - return p @ self._rots.transpose(-1, -2).squeeze(-3) 70 | + return p.float() @ self._rots.float().transpose(-1, -2).squeeze(-3) 71 | else: 72 | # einsum way faster than bmm! 73 | return torch.einsum("...ij,...j", self._rots, p) 74 | index 0cc7bd9..1ee7ecb 100644 75 | --- a/pyproject.toml 76 | +++ b/pyproject.toml 77 | @@ -24,7 +24,7 @@ dependencies = [ 78 | "torch>=2.2.0", 79 | "torchvision", 80 | "torchtext", 81 | - "transformers<4.48.2", 82 | + "transformers<4.52.2", 83 | "ipython", 84 | "einops", 85 | "biotite==0.41.2", 86 | -------------------------------------------------------------------------------- /applications/esm3/scripts/ESM3_chain_of_thought.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Intel Corporation 2 | # SPDX-License-Identifier: MIT License 3 | 4 | import os 5 | import torch 6 | import argparse 7 | import time 8 | import pandas as pd 9 | from esm.models.esm3 import ESM3 10 | from esm.sdk.api import ESM3InferenceClient, ESMProtein, ESMProteinTensor 11 | from esm.sdk.api import GenerationConfig 12 | from esm.utils.types import FunctionAnnotation 13 | 14 | def get_sample_protein_from_csv(csv_file: str, sequence_length: int = None) -> ESMProtein: 15 | df = pd.read_csv(csv_file, sep="\t") 16 | if not {"Label", "Start", "End"}.issubset(df.columns): 17 | raise ValueError("CSV file must contain 'Label', 'Start', and 'End' columns") 18 | 19 | function_annotations = [ 20 | FunctionAnnotation(label=row["Label"], start=int(row["Start"]), end=int(row["End"])) 21 | for _, row in df.iterrows() 22 | ] 23 | 24 | # Determine sequence length 25 | sequence_length = sequence_length or max(df["End"], default=100) 26 | 27 | protein = ESMProtein(sequence="_" * sequence_length) 28 | protein.function_annotations = function_annotations 29 | return protein 30 | 31 | def chain_of_thought(client: ESM3InferenceClient, csv_path: str, output_dir: str, args): 32 | cot_protein = get_sample_protein_from_csv(csv_path, 
args.sequence_length) 33 | enable_autocast = args.bf16 34 | device_type = "cpu" 35 | 36 | with torch.amp.autocast(device_type=device_type, enabled=enable_autocast): 37 | cot_protein.sequence = "_" * len(cot_protein.sequence) 38 | cot_protein.coordinates = None 39 | cot_protein.sasa = None 40 | cot_protein_tensor = client.encode(cot_protein) 41 | 42 | # Generate different properties using command-line args 43 | for cot_track in ["secondary_structure", "structure", "sequence"]: 44 | cot_protein_tensor = client.generate( 45 | cot_protein_tensor, 46 | GenerationConfig( 47 | track=cot_track, 48 | schedule=args.schedule, 49 | num_steps=args.num_steps, 50 | strategy=args.strategy, 51 | temperature=args.temperature, 52 | top_p=args.top_p, 53 | condition_on_coordinates_only=args.condition_on_coordinates_only 54 | ), 55 | ) 56 | 57 | assert isinstance(cot_protein_tensor, ESMProteinTensor) 58 | cot_protein = client.decode(cot_protein_tensor) 59 | assert isinstance(cot_protein, ESMProtein) 60 | 61 | csv_name = os.path.splitext(os.path.basename(csv_path))[0] 62 | output_pdb_path = os.path.join(output_dir, f"{csv_name}.pdb") 63 | cot_protein.to_pdb(output_pdb_path) 64 | print(f"Saved output to {output_pdb_path}") 65 | 66 | def main(args): 67 | 68 | if not os.path.exists(args.csv_file) or not args.csv_file.endswith(".csv"): 69 | print(f"Invalid CSV file: {args.csv_file}") 70 | return 71 | os.makedirs(args.output_dir, exist_ok=True) 72 | 73 | client = ESM3InferenceClient() if os.getenv("ESM_API_KEY") else ESM3.from_pretrained( 74 | "esm3_sm_open_v1", bf16=args.bf16 75 | ) 76 | 77 | infer_time = time.time() 78 | chain_of_thought(client, args.csv_file, args.output_dir, args) 79 | if args.timing: 80 | print(f"Inference time = {time.time() - infer_time} seconds") 81 | 82 | if __name__ == "__main__": 83 | parser = argparse.ArgumentParser(description="Run ESM3 protein inverse folding on a CSV file.") 84 | parser.add_argument("csv_file", type=str, help="Path to input CSV file.") 85 | parser.add_argument("output_dir", type=str, help="Directory to save the output PDB file.") 86 | parser.add_argument("--bf16", action="store_true", help="Enable bf16 inference.") 87 | parser.add_argument("--timing", action="store_true", help="Enable timing for inference.") 88 | parser.add_argument("--schedule", type=str, choices=["cosine", "linear"], default="cosine", help="Schedule type.") 89 | parser.add_argument("--strategy", type=str, choices=["random", "entropy"], default="entropy", help="Unmasking strategy.") 90 | parser.add_argument("--num_steps", type=int, default=1, help="Number of steps for generation.") 91 | parser.add_argument("--temperature", type=float, default=1.0, help="Sampling temperature.") 92 | parser.add_argument("--top_p", type=float, default=1.0, help="Top-p sampling value.") 93 | parser.add_argument("--condition_on_coordinates_only", action="store_true", help="Condition only on coordinates.") 94 | parser.add_argument("--sequence_length", type=int, help="Custom sequence length (optional).") 95 | 96 | args = parser.parse_args() 97 | os.makedirs(args.output_dir, exist_ok=True) 98 | if args.timing: 99 | start_time = time.time() 100 | main(args) 101 | if args.timing: 102 | print(f"Complete run time = {time.time() - start_time:.2f} seconds") 103 | -------------------------------------------------------------------------------- /applications/esm3/scripts/ESM3_function_prediction_task.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Intel Corporation 2 | # 
SPDX-License-Identifier: MIT License 3 | 4 | import os 5 | import torch 6 | import argparse 7 | from esm.models.esm3 import ESM3 8 | from esm.sdk.api import ESM3InferenceClient, ESMProtein 9 | from esm.sdk.api import GenerationConfig 10 | import time 11 | import csv 12 | from esm.utils.structure.protein_complex import ProteinComplex 13 | 14 | def function_protein(client: ESM3InferenceClient, protein,pdb_file: str, output_dir: str, args): 15 | """Runs protein function prediction and saves the output as a CSV file.""" 16 | print(f"Processing {pdb_file}...") 17 | 18 | 19 | with torch.amp.autocast(device_type="cpu", enabled=args.bf16): 20 | protein.function_annotations = None 21 | 22 | # Inline GenerationConfig inside client.generate() 23 | protein_with_function = client.generate( 24 | protein, 25 | GenerationConfig( 26 | track="function", 27 | schedule=args.schedule, 28 | strategy=args.strategy, 29 | num_steps=args.num_steps, 30 | temperature=args.temperature, 31 | temperature_annealing=args.temperature_annealing, 32 | top_p=args.top_p, 33 | condition_on_coordinates_only=args.condition_on_coordinates_only, 34 | ), 35 | ) 36 | 37 | assert isinstance(protein_with_function, ESMProtein), f"Unexpected output: {protein_with_function}" 38 | output_csv = os.path.join(output_dir, f"{os.path.basename(pdb_file).replace('.pdb', '')}.csv") 39 | 40 | with open(output_csv, "w", newline="") as f: 41 | writer = csv.writer(f, delimiter="\t") 42 | writer.writerow(["Label", "Start", "End"]) 43 | for annotation in protein_with_function.function_annotations: 44 | writer.writerow([annotation.label, annotation.start, annotation.end]) 45 | 46 | print(f"Function annotations saved as {output_csv}") 47 | def processing_pdb(client: ESM3InferenceClient, pdb_file: str, output_dir: str, args): 48 | """Runs protein folding and saves the output as a PDB file in the specified directory.""" 49 | print(f"Processing {pdb_file}...") 50 | 51 | if args.protein_complex: 52 | protein = ProteinComplex.from_pdb(pdb_file) 53 | protein = ESMProtein.from_protein_complex(protein) 54 | function_protein(client,protein,pdb_file,output_dir,args) 55 | else: 56 | protein = ESMProtein.from_pdb(pdb_file) 57 | function_protein(client,protein,pdb_file,output_dir,args) 58 | 59 | def main(args): 60 | if not os.path.exists(args.pdb_file) or not args.pdb_file.endswith(".pdb"): 61 | print(f"Invalid PDB file: {args.pdb_file}") 62 | return 63 | os.makedirs(args.output_dir, exist_ok=True) 64 | 65 | client = ESM3InferenceClient() if os.getenv("ESM_API_KEY") else ESM3.from_pretrained("esm3_sm_open_v1", bf16=args.bf16) 66 | 67 | infer_time = time.time() 68 | processing_pdb(client, args.pdb_file, args.output_dir, args) 69 | if args.timing: 70 | print(f"Inference time = {time.time() - infer_time} seconds") 71 | 72 | if __name__ == "__main__": 73 | parser = argparse.ArgumentParser(description="Run ESM3 protein function annotation on a single PDB file.") 74 | parser.add_argument("pdb_file", type=str, help="Path to the input PDB file.") 75 | parser.add_argument("output_dir", type=str, help="Directory to save the output CSV file.") 76 | parser.add_argument("--bf16", action="store_true", help="Enable bf16 inference.") 77 | parser.add_argument("--timing", action="store_true", help="Enable timing for inference.") 78 | parser.add_argument("--schedule", type=str, choices=["cosine", "linear"], default="cosine", help="Schedule type (cosine or linear).") 79 | parser.add_argument("--strategy", type=str, choices=["random", "entropy"], default="entropy", help="Unmasking 
strategy (random or entropy).") 80 | parser.add_argument("--num_steps", type=int, default=1, help="Number of steps for generation.") 81 | parser.add_argument("--temperature", type=float, default=1.0, help="Sampling temperature.") 82 | parser.add_argument("--temperature_annealing", action="store_true", help="Enable temperature annealing.") 83 | parser.add_argument("--top_p", type=float, default=1.0, help="Top-p sampling value.") 84 | parser.add_argument("--condition_on_coordinates_only", action="store_true", help="Condition only on coordinates.") 85 | parser.add_argument("--protein_complex", action="store_true", help="Enable prediction for protein complexes (multi-chain structure) using a multi-chain FASTA file input.") 86 | 87 | args = parser.parse_args() 88 | 89 | os.makedirs(args.output_dir, exist_ok=True) 90 | if args.timing: 91 | start_time = time.time() 92 | main(args) 93 | if args.timing: 94 | print(f"Complete run time = {time.time() - start_time:.2f} seconds") 95 | -------------------------------------------------------------------------------- /applications/gromacs/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | SHELL ["/bin/bash", "-c"] 4 | ENV DEBIAN_FRONTEND=noninteractive 5 | ENV TZ=Asia/Kolkata 6 | ENV SERVICE_NAME="gromacs-service" 7 | 8 | RUN set -euo pipefail && \ 9 | apt-get update && apt-get install -y --no-install-recommends \ 10 | wget \ 11 | gnupg \ 12 | software-properties-common \ 13 | time && \ 14 | wget -qO- https://apt.kitware.com/keys/kitware-archive-latest.asc | gpg --dearmor > /usr/share/keyrings/kitware-archive-keyring.gpg && \ 15 | echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ focal main" > /etc/apt/sources.list.d/kitware.list && \ 16 | apt-get update && apt-get install -y --no-install-recommends \ 17 | cmake \ 18 | git \ 19 | tar \ 20 | vim \ 21 | g++ \ 22 | gcc \ 23 | make \ 24 | curl \ 25 | build-essential \ 26 | tzdata && \ 27 | apt-get clean && rm -rf /var/lib/apt/lists/* 28 | 29 | RUN wget -q https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O Miniforge3.sh && \ 30 | bash Miniforge3.sh -b -p /opt/conda && \ 31 | rm -f Miniforge3.sh 32 | 33 | RUN chown -R 1001:1001 /opt/conda 34 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 35 | useradd -m -g $SERVICE_NAME --shell /bin/bash --uid 1001 $SERVICE_NAME && \ 36 | mkdir -p /grmcs /input /output && \ 37 | chown -R $SERVICE_NAME:$SERVICE_NAME /grmcs /input /output 38 | 39 | ENV PATH="/opt/conda/bin:$PATH" 40 | 41 | RUN conda init bash && \ 42 | conda create -y -n grms_env python=3.12 \ 43 | sphinx pygments \ 44 | mkl=2023.2 mkl-devel=2023.2 \ 45 | dpcpp_linux-64=2023.2 dpcpp-cpp-rt=2023.2 \ 46 | gxx_linux-64=12 && \ 47 | conda clean -afy && \ 48 | echo "source /opt/conda/bin/activate grms_env" >> /etc/bash.bashrc && \ 49 | echo "conda activate grms_env" >> /etc/bash.bashrc 50 | 51 | RUN source /opt/conda/bin/activate grms_env && conda activate grms_env 52 | 53 | ENV CC=icx \ 54 | CXX=icpx \ 55 | MKLROOT=/opt/conda/envs/grms_env \ 56 | CFLAGS="-fopenmp -I/opt/conda/envs/grms_env/include" \ 57 | CXXFLAGS="-fopenmp -stdlib=libstdc++ -I/opt/conda/envs/grms_env/include" \ 58 | LDFLAGS="-L/opt/conda/envs/grms_env/lib -lmkl_rt -liomp5 -lpthread -lm -ldl" \ 59 | LD_LIBRARY_PATH="/opt/conda/envs/grms_env/lib:$LD_LIBRARY_PATH" 60 | 61 | WORKDIR /grmcs 62 | 63 | RUN source /opt/conda/bin/activate grms_env && \ 64 | wget -q 
https://ftp.gromacs.org/gromacs/gromacs-2024.2.tar.gz && \ 65 | tar -xzvf gromacs-2024.2.tar.gz && \ 66 | rm -f gromacs-2024.2.tar.gz 67 | 68 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /grmcs/gromacs-2024.2 69 | WORKDIR /grmcs/gromacs-2024.2 70 | RUN mkdir -p build && chmod -R 775 build 71 | 72 | WORKDIR /grmcs/gromacs-2024.2/build 73 | 74 | RUN source /opt/conda/bin/activate grms_env && \ 75 | which icx && icx --version && \ 76 | which icpx && icpx --version 77 | 78 | RUN source /opt/conda/bin/activate grms_env && \ 79 | cmake .. -DGMX_BUILD_OWN_FFTW=OFF \ 80 | -DREGRESSIONTEST_DOWNLOAD=OFF \ 81 | -DCMAKE_C_COMPILER=$CC \ 82 | -DCMAKE_CXX_COMPILER=$CXX \ 83 | -DGMX_FFT_LIBRARY=mkl \ 84 | -DMKL_INCLUDE_DIR=$MKLROOT/include \ 85 | -DMKL_LIBRARIES=$MKLROOT/lib/libmkl_rt.so \ 86 | -DCMAKE_INSTALL_PREFIX=/grmcs/gmx_mkl_prefix && \ 87 | make -j10 VERBOSE=1 && \ 88 | make install 89 | 90 | WORKDIR /grmcs 91 | RUN rm -rf gromacs-2024.2 92 | ENV PATH="/grmcs/gmx_mkl_prefix/bin:$PATH" 93 | RUN echo "source /grmcs/gmx_mkl_prefix/bin/GMXRC" >> /etc/bash.bashrc 94 | 95 | WORKDIR /input 96 | COPY run_commands.sh /input/ 97 | RUN chmod +x /input/run_commands.sh 98 | 99 | COPY entrypoint.sh /entrypoint.sh 100 | RUN chmod +x /entrypoint.sh 101 | 102 | USER $SERVICE_NAME 103 | WORKDIR /input 104 | ENTRYPOINT ["/entrypoint.sh"] 105 | HEALTHCHECK NONE 106 | 107 | -------------------------------------------------------------------------------- /applications/gromacs/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /opt/conda/bin/activate grms_env 4 | source /grmcs/gmx_mkl_prefix/bin/GMXRC 5 | 6 | if [[ "$1" == "/bin/bash" || "$1" == "bash" ]]; then 7 | exec /bin/bash 8 | fi 9 | 10 | if [ "$#" -eq 0 ]; then 11 | echo "Error: No PDB file provided. Please provide a PDB file as an argument." 12 | echo " Run full workflow: docker run -v \$(INPUT_GMX_CPU):/input -v \$(OUTPUT_GMX_CPU):/output -it grms_dock " 13 | echo " Run specific GROMACS command: docker run -v \$(INPUT_GMX_CPU):/input -v \$(OUTPUT_GMX_CPU):/output -it grms_dock gmx " 14 | exit 1 15 | elif [ "$1" == "gmx" ]; then 16 | shift 17 | exec gmx "$@" 18 | else 19 | pdb_file="$1" 20 | 21 | if [ ! -f "/input/$pdb_file" ]; then 22 | echo "Error: File '/input/$pdb_file' not found." 
23 | exit 1 24 | fi 25 | 26 | echo "Running full workflow with PDB file: /input/$pdb_file" 27 | exec /input/run_commands.sh "/input/$pdb_file" 28 | fi 29 | 30 | -------------------------------------------------------------------------------- /applications/gromacs/grms_input/mdtut_ions.mdp: -------------------------------------------------------------------------------- 1 | ; ions.mdp - used as input into grompp to generate ions.tpr 2 | ; Parameters describing what to do, when to stop and what to save 3 | integrator = steep ; Algorithm (steep = steepest descent minimization) 4 | emtol = 1000.0 ; Stop minimization when the maximum force < 1000.0 kJ/mol/nm 5 | emstep = 0.01 ; Energy step size 6 | nsteps = 50000 ; Maximum number of (minimization) steps to perform 7 | 8 | ; Parameters describing how to find the neighbors of each atom and how to calculate the interactions 9 | nstlist = 1 ; Frequency to update the neighbor list and long range forces 10 | cutoff-scheme = Verlet 11 | ns_type = grid ; Method to determine neighbor list (simple, grid) 12 | coulombtype = PME ; Treatment of long range electrostatic interactions 13 | rcoulomb = 1.0 ; Short-range electrostatic cut-off 14 | rvdw = 1.0 ; Short-range Van der Waals cut-off 15 | pbc = xyz ; Periodic Boundary Conditions (yes/no) 16 | -------------------------------------------------------------------------------- /applications/gromacs/grms_input/mdtut_md.mdp: -------------------------------------------------------------------------------- 1 | title = Lysozyme NPT equilibration 2 | ; Run parameters 3 | integrator = md ; leap-frog integrator 4 | nsteps = 500000 ; 2 * 500000 = 1000 ps (1 ns) 5 | dt = 0.002 ; 2 fs 6 | ; Output control 7 | nstxout = 0 ; suppress bulky .trr file by specifying 8 | nstvout = 0 ; 0 for output frequency of nstxout, 9 | nstfout = 0 ; nstvout, and nstfout 10 | nstenergy = 5000 ; save energies every 10.0 ps 11 | nstlog = 5000 ; update log file every 10.0 ps 12 | nstxout-compressed = 50000 ; save compressed coordinates every 10.0 ps 13 | compressed-x-grps = System ; save the whole system 14 | ; Bond parameters 15 | continuation = yes ; Restarting after NPT 16 | constraint_algorithm = lincs ; holonomic constraints 17 | constraints = h-bonds ; bonds involving H are constrained 18 | lincs_iter = 1 ; accuracy of LINCS 19 | lincs_order = 4 ; also related to accuracy 20 | ; Neighborsearching 21 | cutoff-scheme = Verlet ; Buffered neighbor searching 22 | ns_type = grid ; search neighboring grid cells 23 | nstlist = 10 ; 20 fs, largely irrelevant with Verlet scheme 24 | rcoulomb = 1.0 ; short-range electrostatic cutoff (in nm) 25 | rvdw = 1.0 ; short-range van der Waals cutoff (in nm) 26 | ; Electrostatics 27 | coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics 28 | pme_order = 4 ; cubic interpolation 29 | fourierspacing = 0.16 ; grid spacing for FFT 30 | ; Temperature coupling is on 31 | tcoupl = V-rescale ; modified Berendsen thermostat 32 | tc-grps = Protein Non-Protein ; two coupling groups - more accurate 33 | tau_t = 0.1 0.1 ; time constant, in ps 34 | ref_t = 300 300 ; reference temperature, one for each group, in K 35 | ; Pressure coupling is on 36 | pcoupl = Parrinello-Rahman ; Pressure coupling on in NPT 37 | pcoupltype = isotropic ; uniform scaling of box vectors 38 | tau_p = 2.0 ; time constant, in ps 39 | ref_p = 1.0 ; reference pressure, in bar 40 | compressibility = 4.5e-5 ; isothermal compressibility of water, bar^-1 41 | ; Periodic boundary conditions 42 | pbc = xyz ; 3-D PBC 43 | ; Dispersion 
correction 44 | DispCorr = EnerPres ; account for cut-off vdW scheme 45 | ; Velocity generation 46 | gen_vel = no ; Velocity generation is off 47 | -------------------------------------------------------------------------------- /applications/gromacs/grms_input/mdtut_minim.mdp: -------------------------------------------------------------------------------- 1 | ; minim.mdp - used as input into grompp to generate em.tpr 2 | integrator = steep ; Algorithm (steep = steepest descent minimization) 3 | emtol = 1000.0 ; Stop minimization when the maximum force < 1000.0 kJ/mol/nm 4 | emstep = 0.01 ; Energy step size 5 | nsteps = 50000 ; Maximum number of (minimization) steps to perform 6 | 7 | ; Parameters describing how to find the neighbors of each atom and how to calculate the interactions 8 | nstlist = 1 ; Frequency to update the neighbor list and long range forces 9 | cutoff-scheme = Verlet 10 | ns_type = grid ; Method to determine neighbor list (simple, grid) 11 | coulombtype = PME ; Treatment of long range electrostatic interactions 12 | rcoulomb = 1.0 ; Short-range electrostatic cut-off 13 | rvdw = 1.0 ; Short-range Van der Waals cut-off 14 | pbc = xyz ; Periodic Boundary Conditions (yes/no) 15 | -------------------------------------------------------------------------------- /applications/gromacs/grms_input/mdtut_npt.mdp: -------------------------------------------------------------------------------- 1 | title = Lysozyme NPT equilibration 2 | define = -DPOSRES ; position restrain the protein 3 | ; Run parameters 4 | integrator = md ; leap-frog integrator 5 | nsteps = 500000 ; 2 * 500000 = 1000 ps 6 | dt = 0.002 ; 2 fs 7 | ; Output control 8 | nstxout = 500 ; save coordinates every 1.0 ps 9 | nstvout = 500 ; save velocities every 1.0 ps 10 | nstenergy = 500 ; save energies every 1.0 ps 11 | nstlog = 500 ; update log file every 1.0 ps 12 | ; Bond parameters 13 | continuation = yes ; Restarting after NVT 14 | constraint_algorithm = lincs ; holonomic constraints 15 | constraints = h-bonds ; bonds involving H are constrained 16 | lincs_iter = 1 ; accuracy of LINCS 17 | lincs_order = 4 ; also related to accuracy 18 | ; Nonbonded settings 19 | cutoff-scheme = Verlet ; Buffered neighbor searching 20 | ns_type = grid ; search neighboring grid cells 21 | nstlist = 10 ; 20 fs, largely irrelevant with Verlet scheme 22 | rcoulomb = 1.0 ; short-range electrostatic cutoff (in nm) 23 | rvdw = 1.0 ; short-range van der Waals cutoff (in nm) 24 | DispCorr = EnerPres ; account for cut-off vdW scheme 25 | ; Electrostatics 26 | coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics 27 | pme_order = 4 ; cubic interpolation 28 | fourierspacing = 0.16 ; grid spacing for FFT 29 | ; Temperature coupling is on 30 | tcoupl = V-rescale ; modified Berendsen thermostat 31 | tc-grps = Protein Non-Protein ; two coupling groups - more accurate 32 | tau_t = 0.1 0.1 ; time constant, in ps 33 | ref_t = 300 300 ; reference temperature, one for each group, in K 34 | ; Pressure coupling is on 35 | pcoupl = Parrinello-Rahman ; Pressure coupling on in NPT 36 | pcoupltype = isotropic ; uniform scaling of box vectors 37 | tau_p = 2.0 ; time constant, in ps 38 | ref_p = 1.0 ; reference pressure, in bar 39 | compressibility = 4.5e-5 ; isothermal compressibility of water, bar^-1 40 | refcoord_scaling = com 41 | ; Periodic boundary conditions 42 | pbc = xyz ; 3-D PBC 43 | ; Velocity generation 44 | gen_vel = no ; Velocity generation is off 45 | 
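The NPT parameter file above is not run on its own; like the other `.mdp` files in this directory it is first pre-processed by `gmx grompp` and then executed with `gmx mdrun` (see `run_commands.sh` below). A minimal sketch of that step, reusing the file names from this tutorial workflow:

```bash
# Combine the NPT settings with the NVT output coordinates, restraint reference, checkpoint and topology
# into a portable run input (.tpr), then run the equilibration.
gmx grompp -f mdtut_npt.mdp -c nvt.gro -r nvt.gro -t nvt.cpt -p topol.top -o npt.tpr
gmx mdrun -ntmpi "$(nproc)" -v -deffnm npt   # -deffnm sets the common base name for all npt.* outputs
```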
-------------------------------------------------------------------------------- /applications/gromacs/grms_input/mdtut_nvt.mdp: -------------------------------------------------------------------------------- 1 | title = Lysozyme NVT equilibration 2 | define = -DPOSRES ; position restrain the protein 3 | ; Run parameters 4 | integrator = md ; leap-frog integrator 5 | nsteps = 500000 ; 2 * 500000 = 1000 ps 6 | dt = 0.002 ; 2 fs 7 | ; Output control 8 | nstxout = 500 ; save coordinates every 1.0 ps 9 | nstvout = 500 ; save velocities every 1.0 ps 10 | nstenergy = 500 ; save energies every 1.0 ps 11 | nstlog = 500 ; update log file every 1.0 ps 12 | ; Bond parameters 13 | continuation = no ; first dynamics run 14 | constraint_algorithm = lincs ; holonomic constraints 15 | constraints = h-bonds ; bonds involving H are constrained 16 | lincs_iter = 1 ; accuracy of LINCS 17 | lincs_order = 4 ; also related to accuracy 18 | ; Neighborsearching 19 | cutoff-scheme = Verlet 20 | ns_type = grid ; search neighboring grid cells 21 | nstlist = 10 ; 20 fs, largely irrelevant with Verlet 22 | rcoulomb = 1.0 ; short-range electrostatic cutoff (in nm) 23 | rvdw = 1.0 ; short-range van der Waals cutoff (in nm) 24 | ; Electrostatics 25 | coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics 26 | pme_order = 4 ; cubic interpolation 27 | fourierspacing = 0.16 ; grid spacing for FFT 28 | ; Temperature coupling is on 29 | tcoupl = V-rescale ; modified Berendsen thermostat 30 | tc-grps = Protein Non-Protein ; two coupling groups - more accurate 31 | tau_t = 0.1 0.1 ; time constant, in ps 32 | ref_t = 300 300 ; reference temperature, one for each group, in K 33 | ; Pressure coupling is off 34 | pcoupl = no ; no pressure coupling in NVT 35 | ; Periodic boundary conditions 36 | pbc = xyz ; 3-D PBC 37 | ; Dispersion correction 38 | DispCorr = EnerPres ; account for cut-off vdW scheme 39 | ; Velocity generation 40 | gen_vel = yes ; assign velocities from Maxwell distribution 41 | gen_temp = 300 ; temperature for Maxwell distribution 42 | gen_seed = -1 ; generate a random seed 43 | -------------------------------------------------------------------------------- /applications/gromacs/grms_input/run_commands.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /opt/conda/bin/activate grms_env 3 | source /grmcs/gmx_mkl_prefix/bin/GMXRC 4 | 5 | file="$1" 6 | num_cpus=$(nproc) 7 | current_ntmpi=$num_cpus 8 | timestamp=$(date +"%Y%m%d_%H%M%S") 9 | 10 | mkdir -p /output/output_$timestamp 11 | 12 | exit_on_error() { 13 | echo "Error: $1" 14 | exit 1 15 | } 16 | 17 | echo "Preprocessing protein structure..." 18 | grep -v HOH "${file}" > prot_clean.pdb || exit_on_error "Failed to clean PDB file" 19 | 20 | echo "Running pdb2gmx..." 21 | gmx pdb2gmx -f prot_clean.pdb -o prot_pros.gro -water spce -ff amber99sb -ignh || exit_on_error "pdb2gmx failed" 22 | 23 | echo "Defining box dimensions..." 24 | gmx editconf -f prot_pros.gro -o prot_box.gro -c -d 1.0 -bt cubic || exit_on_error "editconf failed" 25 | 26 | echo "Adding solvent..." 27 | gmx solvate -cp prot_box.gro -cs spc216.gro -o prot_solv.gro -p topol.top || exit_on_error "solvate failed" 28 | 29 | echo "Preparing for ion addition..." 30 | gmx grompp -f mdtut_ions.mdp -c prot_solv.gro -p topol.top -o ions.tpr -maxwarn 2 || exit_on_error "grompp for ions failed" 31 | 32 | echo "Adding ions..." 
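# Note: the piped "13" below answers genion's interactive prompt for which group to replace with ions;
# with the topology generated above this index typically corresponds to the SOL (solvent) group.
# If the group numbering differs on your system, run "gmx genion -s ions.tpr" interactively once to confirm it.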
33 | echo "13" | gmx genion -s ions.tpr -o prot_solv_ions.gro -p topol.top -pname NA -nname CL -neutral || exit_on_error "genion failed" 34 | 35 | echo "Preparing energy minimization..." 36 | gmx grompp -f mdtut_minim.mdp -c prot_solv_ions.gro -p topol.top -o em.tpr -maxwarn 1 || exit_on_error "grompp for minimization failed" 37 | 38 | while [ $current_ntmpi -ge 1 ]; do 39 | echo "Running EM with ntmpi=$current_ntmpi..." 40 | time /usr/bin/time -v gmx mdrun -ntmpi $current_ntmpi -v -deffnm em 41 | mdrun_exit_code=$? 42 | if [ $mdrun_exit_code -eq 0 ]; then 43 | echo "EM stage completed successfully with ntmpi=$current_ntmpi" 44 | break 45 | fi 46 | if [ -f em.log ] && grep -q "Fatal error: There is no domain decomposition" em.log || grep -q "Fatal error:" em.log; then 47 | echo "Domain decomposition failed for ntmpi=$current_ntmpi. Reducing MPI ranks..." 48 | grep "Fatal error:" em.log 49 | rm -f em.log 50 | current_ntmpi=$((current_ntmpi / 2)) 51 | else 52 | exit_on_error "Unexpected failure in EM stage. Check em.log for details." 53 | fi 54 | done 55 | 56 | echo "Preparing NVT equilibration..." 57 | gmx grompp -f mdtut_nvt.mdp -c em.gro -r em.gro -p topol.top -o nvt.tpr || exit_on_error "grompp for NVT failed" 58 | 59 | echo "Running NVT mdrun..." 60 | gmx mdrun -ntmpi $current_ntmpi -v -deffnm nvt || exit_on_error "NVT mdrun failed" 61 | 62 | echo "Preparing NPT equilibration..." 63 | gmx grompp -f mdtut_npt.mdp -c nvt.gro -r nvt.gro -t nvt.cpt -p topol.top -o npt.tpr || exit_on_error "grompp for NPT failed" 64 | 65 | echo "Running NPT mdrun..." 66 | gmx mdrun -ntmpi $current_ntmpi -v -deffnm npt || exit_on_error "NPT mdrun failed" 67 | 68 | echo "Preparing MD simulation..." 69 | gmx grompp -f mdtut_md.mdp -c npt.gro -t npt.cpt -p topol.top -o md01.tpr || exit_on_error "grompp for MD failed" 70 | 71 | echo "Running MD mdrun..." 72 | gmx mdrun -ntmpi $current_ntmpi -v -deffnm md01 || exit_on_error "MD mdrun failed" 73 | 74 | echo "Simulation completed successfully with ntmpi=$current_ntmpi" 75 | 76 | echo "Moving output files..." 77 | mv /input/md01* /output/output_$timestamp/ 2>/dev/null || echo "Warning: No md01* files found." 78 | 79 | echo "Pipeline execution completed successfully. Results saved in /output/output_$timestamp" 80 | 81 | -------------------------------------------------------------------------------- /applications/gromacs/run_commands.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /opt/conda/bin/activate grms_env 3 | source /grmcs/gmx_mkl_prefix/bin/GMXRC 4 | 5 | file="$1" 6 | num_cpus=$(nproc) 7 | current_ntmpi=$num_cpus 8 | timestamp=$(date +"%Y%m%d_%H%M%S") 9 | 10 | mkdir -p /output/output_$timestamp 11 | 12 | exit_on_error() { 13 | echo "Error: $1" 14 | exit 1 15 | } 16 | 17 | echo "Preprocessing protein structure..." 18 | grep -v HOH "${file}" > prot_clean.pdb || exit_on_error "Failed to clean PDB file" 19 | 20 | echo "Running pdb2gmx..." 21 | gmx pdb2gmx -f prot_clean.pdb -o prot_pros.gro -water spce -ff amber99sb -ignh || exit_on_error "pdb2gmx failed" 22 | 23 | echo "Defining box dimensions..." 24 | gmx editconf -f prot_pros.gro -o prot_box.gro -c -d 1.0 -bt cubic || exit_on_error "editconf failed" 25 | 26 | echo "Adding solvent..." 27 | gmx solvate -cp prot_box.gro -cs spc216.gro -o prot_solv.gro -p topol.top || exit_on_error "solvate failed" 28 | 29 | echo "Preparing for ion addition..." 
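# grompp combines the .mdp settings, coordinates, and topology into a binary .tpr run input;
# -maxwarn 2 tolerates the expected warnings at this stage (typically the non-zero net charge that genion neutralizes in the next step).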
30 | gmx grompp -f mdtut_ions.mdp -c prot_solv.gro -p topol.top -o ions.tpr -maxwarn 2 || exit_on_error "grompp for ions failed" 31 | 32 | echo "Adding ions..." 33 | echo "13" | gmx genion -s ions.tpr -o prot_solv_ions.gro -p topol.top -pname NA -nname CL -neutral || exit_on_error "genion failed" 34 | 35 | echo "Preparing energy minimization..." 36 | gmx grompp -f mdtut_minim.mdp -c prot_solv_ions.gro -p topol.top -o em.tpr -maxwarn 1 || exit_on_error "grompp for minimization failed" 37 | 38 | while [ $current_ntmpi -ge 1 ]; do 39 | echo "Running EM with ntmpi=$current_ntmpi..." 40 | time /usr/bin/time -v gmx mdrun -ntmpi $current_ntmpi -v -deffnm em 41 | mdrun_exit_code=$? 42 | if [ $mdrun_exit_code -eq 0 ]; then 43 | echo "EM stage completed successfully with ntmpi=$current_ntmpi" 44 | break 45 | fi 46 | if [ -f em.log ] && grep -q "Fatal error: There is no domain decomposition" em.log || grep -q "Fatal error:" em.log; then 47 | echo "Domain decomposition failed for ntmpi=$current_ntmpi. Reducing MPI ranks..." 48 | grep "Fatal error:" em.log 49 | rm -f em.log 50 | current_ntmpi=$((current_ntmpi / 2)) 51 | else 52 | exit_on_error "Unexpected failure in EM stage. Check em.log for details." 53 | fi 54 | done 55 | 56 | echo "Preparing NVT equilibration..." 57 | gmx grompp -f mdtut_nvt.mdp -c em.gro -r em.gro -p topol.top -o nvt.tpr || exit_on_error "grompp for NVT failed" 58 | 59 | echo "Running NVT mdrun..." 60 | gmx mdrun -ntmpi $current_ntmpi -v -deffnm nvt || exit_on_error "NVT mdrun failed" 61 | 62 | echo "Preparing NPT equilibration..." 63 | gmx grompp -f mdtut_npt.mdp -c nvt.gro -r nvt.gro -t nvt.cpt -p topol.top -o npt.tpr || exit_on_error "grompp for NPT failed" 64 | 65 | echo "Running NPT mdrun..." 66 | gmx mdrun -ntmpi $current_ntmpi -v -deffnm npt || exit_on_error "NPT mdrun failed" 67 | 68 | echo "Preparing MD simulation..." 69 | gmx grompp -f mdtut_md.mdp -c npt.gro -t npt.cpt -p topol.top -o md01.tpr || exit_on_error "grompp for MD failed" 70 | 71 | echo "Running MD mdrun..." 72 | gmx mdrun -ntmpi $current_ntmpi -v -deffnm md01 || exit_on_error "MD mdrun failed" 73 | 74 | echo "Simulation completed successfully with ntmpi=$current_ntmpi" 75 | 76 | echo "Moving output files..." 77 | mv /input/md01* /output/output_$timestamp/ 2>/dev/null || echo "Warning: No md01* files found." 78 | 79 | echo "Pipeline execution completed successfully. 
Results saved in /output/output_$timestamp" 80 | 81 | -------------------------------------------------------------------------------- /applications/moflow/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base image specification 2 | ARG FROM_IMAGE=ubuntu:24.04 3 | FROM ${FROM_IMAGE} as builder 4 | 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | # Install necessary build tools and clean up 8 | RUN apt-get update && apt-get install -y --no-install-recommends \ 9 | git build-essential wget vim ca-certificates autoconf automake make numactl unzip && \ 10 | rm -rf /var/lib/apt/lists/* && \ 11 | apt-get autoremove -y && \ 12 | apt-get clean 13 | 14 | # Create a user and group for running the service 15 | ENV SERVICE_NAME="moflow-base-service" 16 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 17 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 18 | 19 | WORKDIR /app 20 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /app 21 | USER $SERVICE_NAME 22 | 23 | # Install Miniforge (Conda) 24 | RUN wget --no-check-certificate -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-$(uname)-$(uname -m).sh" 25 | RUN bash Miniforge3.sh -b -p "${HOME}/conda" && rm Miniforge3.sh 26 | 27 | # Clone MoFlow repository and apply patch 28 | RUN git clone --recursive https://github.com/calvin-zcx/moflow.git moflow 29 | WORKDIR /app/moflow 30 | RUN git checkout 3611c637260272b3d34a298f221623cb59e01091 31 | 32 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 33 | RUN tar -xzf Source_code_with_submodules.tar.gz 34 | 35 | RUN mv /app/moflow/Open-Omics-Acceleration-Framework/applications/moflow /app/moflow/omics_setup && \ 36 | git apply /app/moflow/omics_setup/mflow_change_all.patch && \ 37 | rm -rf /app/moflow/Open-Omics-Acceleration-Framework && \ 38 | rm -rf /app/moflow/Source_code_with_submodules.tar.gz 39 | 40 | # Set up Conda environment 41 | RUN ${HOME}/conda/bin/mamba env create -f /app/moflow/omics_setup/env.yml 42 | 43 | # Install MoFlow package 44 | WORKDIR /app/moflow 45 | RUN bash -c "source ${HOME}/conda/etc/profile.d/conda.sh && \ 46 | source ${HOME}/conda/etc/profile.d/mamba.sh && \ 47 | mamba activate moflow && \ 48 | pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cpu" 49 | 50 | # Initialization script for Conda environment 51 | RUN echo "#!/bin/bash" > /app/init.sh && \ 52 | echo "source ${HOME}/conda/etc/profile.d/conda.sh" >> /app/init.sh && \ 53 | echo "source ${HOME}/conda/etc/profile.d/mamba.sh" >> /app/init.sh && \ 54 | echo "mamba activate moflow" >> /app/init.sh && \ 55 | chmod +x /app/init.sh && \ 56 | echo "source /app/init.sh" >> ~/.bashrc 57 | 58 | WORKDIR /app/moflow 59 | RUN echo '#!/bin/bash' > /app/entrypoint.sh && \ 60 | echo 'if [ -z "$1" ]; then' >> /app/entrypoint.sh && \ 61 | echo ' exec /bin/bash' >> /app/entrypoint.sh && \ 62 | echo 'else' >> /app/entrypoint.sh && \ 63 | echo ' source /app/init.sh' >> /app/entrypoint.sh && \ 64 | echo ' exec "$@"' >> /app/entrypoint.sh && \ 65 | echo 'fi' >> /app/entrypoint.sh && \ 66 | chmod +x /app/entrypoint.sh 67 | 68 | ENTRYPOINT ["/app/entrypoint.sh"] 69 | CMD [] 70 | HEALTHCHECK NONE 71 | -------------------------------------------------------------------------------- /applications/moflow/env.yml: -------------------------------------------------------------------------------- 1 | name: 
moflow 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - conda-forge::python=3.8.5 7 | - conda-forge::pandas==1.1.2 8 | - conda-forge::matplotlib==3.3.2 9 | - conda-forge::rdkit==2020.03.6 10 | - conda-forge::orderedset==2.0.3 11 | - conda-forge::tabulate==0.8.7 12 | - conda-forge::networkx==2.5 13 | - conda-forge::scipy==1.5.0 14 | - conda-forge::seaborn==0.11.0 15 | - pip: 16 | - cairosvg==2.4.2 17 | - tqdm==4.50.0 18 | - gdown==5.2.0 19 | - numpy==1.19.2 20 | - scikit-learn==1.3.2 -------------------------------------------------------------------------------- /applications/relion/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | LABEL description="RELION 5.0 built with Intel oneAPI and conda" 4 | 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | ENV TZ=Asia/Kolkata 7 | ENV SERVICE_NAME="relion-service" 8 | 9 | # Create non-root user 10 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 11 | useradd -m -g $SERVICE_NAME --shell /bin/bash --uid 1001 $SERVICE_NAME 12 | 13 | # Install system packages and Python build dependencies 14 | RUN apt-get update && \ 15 | apt-get install -y --no-install-recommends \ 16 | git \ 17 | cmake \ 18 | vim \ 19 | make \ 20 | tar \ 21 | curl \ 22 | wget \ 23 | gnupg \ 24 | time \ 25 | ca-certificates \ 26 | tzdata \ 27 | libtiff-dev \ 28 | libx11-dev \ 29 | libpng-dev \ 30 | python3-dev \ 31 | libffi-dev \ 32 | libssl-dev \ 33 | pkg-config \ 34 | gfortran \ 35 | libstdc++-11-dev \ 36 | libgl1 \ 37 | libgl1-mesa-glx \ 38 | libxrender1 \ 39 | build-essential && \ 40 | apt-get clean && \ 41 | rm -rf /var/lib/apt/lists/* 42 | 43 | # Prepare directory and switch to non-root 44 | WORKDIR /opt 45 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /opt 46 | USER $SERVICE_NAME 47 | 48 | # Clone and patch RELION 49 | RUN git clone --branch ver5.0 https://github.com/3dem/relion.git relion_5.0 50 | WORKDIR /opt/relion_5.0 51 | RUN git pull 52 | COPY relion_env_patch.patch /opt/relion_5.0 53 | RUN git apply relion_env_patch.patch 54 | 55 | # Install Intel oneAPI HPC Toolkit 56 | WORKDIR /opt 57 | RUN wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b7f71cf2-8157-4393-abae-8cea815509f7/intel-oneapi-hpc-toolkit-2025.0.1.47_offline.sh && \ 58 | chmod +x intel-oneapi-hpc-toolkit-2025.0.1.47_offline.sh && \ 59 | ./intel-oneapi-hpc-toolkit-2025.0.1.47_offline.sh -a --silent --cli --eula accept && \ 60 | rm intel-oneapi-hpc-toolkit-2025.0.1.47_offline.sh 61 | 62 | # Conda setup 63 | ENV CONDA_DIR=/opt/conda 64 | ENV HOME=/home/relion-service 65 | ENV XDG_CACHE_HOME=$HOME/.cache 66 | ENV TMPDIR=/home/relion-service/tmp 67 | ENV PATH=$CONDA_DIR/bin:$PATH 68 | ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu 69 | 70 | RUN mkdir -p $XDG_CACHE_HOME $TMPDIR && \ 71 | chmod -R 777 $XDG_CACHE_HOME $TMPDIR 72 | 73 | # Install Miniforge 74 | RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O /tmp/miniforge.sh && \ 75 | bash /tmp/miniforge.sh -b -u -p $CONDA_DIR && \ 76 | rm /tmp/miniforge.sh && \ 77 | conda init bash 78 | 79 | # Create conda env without .[vis] first 80 | RUN conda env create -f /opt/relion_5.0/environment.yml && \ 81 | conda clean -afy 82 | 83 | # Install .[vis] manually after env is created 84 | RUN bash -c "source $CONDA_DIR/etc/profile.d/conda.sh && \ 85 | conda activate relion-5.0 && \ 86 | pip install --verbose /opt/relion_5.0[vis]" 87 | 88 | # Build RELION 89 | WORKDIR /opt/relion_5.0 90 | SHELL ["/bin/bash", 
"-c"] 91 | RUN mkdir -p /opt/relion_5.0/torch_home 92 | RUN mkdir -p /opt/relion_5.0/build_cpu 93 | WORKDIR /opt/relion_5.0/build_cpu 94 | 95 | RUN source /home/relion-service/intel/oneapi/2025.0/oneapi-vars.sh --force && \ 96 | cmake -DCMAKE_C_COMPILER=icx \ 97 | -DCMAKE_CXX_COMPILER=icpx \ 98 | -DMPI_C_COMPILER=mpiicx \ 99 | -DMPI_CXX_COMPILER=mpiicpx \ 100 | -DCUDA=OFF \ 101 | -DALTCPU=ON \ 102 | -DMKLFFT=ON \ 103 | -DGUI=OFF \ 104 | -DFETCH_WEIGHTS=OFF \ 105 | -DCMAKE_BUILD_TYPE=Release \ 106 | -DCMAKE_C_FLAGS="-g -O3 -qopenmp-simd -xCORE-AVX512 -qopt-zmm-usage=high" \ 107 | -DCMAKE_CXX_FLAGS="-DTBB_SUPPRESS_DEPRECATED_MESSAGES -g -O3 -qopenmp-simd -xCORE-AVX512 -qopt-zmm-usage=high" \ 108 | -DCMAKE_EXE_LINKER_FLAGS="-static-intel -static-libgcc -qopenmp-link=static -Wno-unused-command-line-argument" \ 109 | -DTORCH_HOME_PATH=/opt/relion_5.0/torch_home \ 110 | -DPYTHON_EXE_PATH=/opt/conda/envs/relion-5.0/bin/python \ 111 | -DCMAKE_INSTALL_PREFIX=/opt/relion_5.0_cpu_benchmark_prefix .. && \ 112 | make -j$(nproc) && make install 113 | WORKDIR /opt/relion_5.0 114 | RUN rm -rf /opt/relion_5.0/build_cpu 115 | RUN echo 'source /home/relion-service/intel/oneapi/setvars.sh --force' >> /home/relion-service/.bashrc 116 | ENV PATH="/opt/relion_5.0_cpu_benchmark_prefix/bin:/home/relion-service/intel/oneapi/compiler/2025.0/bin:/home/relion-service/intel/oneapi/mpi/2025.0/bin:/home/relion-service/intel/oneapi/compiler/2025.0/linux/bin/intel64:$PATH" 117 | ENV LD_LIBRARY_PATH="/home/relion-service/intel/oneapi/compiler/2025.0/lib:/home/relion-service/intel/oneapi/mkl/2025.0/lib/intel64:/home/relion-service/intel/oneapi/2025.0/lib:$LD_LIBRARY_PATH" 118 | RUN mkdir -p /opt/relion_5.0/relion_benchmark 119 | COPY entrypoint.sh ./entrypoint_temp.sh 120 | RUN bash -c "cp ./entrypoint_temp.sh ./entrypoint.sh && chmod u+x ./entrypoint.sh && rm ./entrypoint_temp.sh" 121 | 122 | ENTRYPOINT ["/opt/relion_5.0/entrypoint.sh"] 123 | 124 | HEALTHCHECK NONE 125 | 126 | -------------------------------------------------------------------------------- /applications/relion/README.md: -------------------------------------------------------------------------------- 1 | # Open-Omics-Relion 2 | **Open-Omics-Relion** is a Dockerized RELION 5.0 setup for running benchmark workloads using Intel oneAPI and Intel MPI. It supports **2D classification**, **3D classification**, and **auto-refinement** modes using official test data, designed for **reproducibility**, **performance testing**, and **ease of deployment**. 3 | 4 | --- 5 | 6 | ## Step 1: Download Benchmark Dataset 7 | ```zsh 8 | wget ftp://ftp.mrc-lmb.cam.ac.uk/pub/scheres/relion_benchmark.tar.gz 9 | tar -xzvf relion_benchmark.tar.gz 10 | ``` 11 | This will extract a folder named `relion_benchmark`. 12 | ## Step 2: Build the Docker Image 13 | Build the docker image with `Dockerfile`, run: 14 | ```zsh 15 | sudo docker build -t relion_nru . 
16 | ``` 17 | Verify the image was built: 18 | ```zsh 19 | sudo docker images | grep -i relion_nru 20 | ``` 21 | ## Step 3: Change Ownership of Benchmark Data 22 | To avoid permission issues when mounting the directory (RELION runs as a non-root user UID 1001): 23 | ```zsh 24 | cd relion_benchmark/ 25 | sudo chown 1001:1001 $(pwd) 26 | ``` 27 | 28 | ## Step 4: Run RELION Benchmark 29 | Launch the container with required options for shared memory and MPI support: 30 | ```zsh 31 | sudo docker run --rm --net=host --ipc=host --pid=host --ulimit stack=67108864 --shm-size=2g --cap-add=SYS_PTRACE -e I_MPI_DEBUG=5 -e I_MPI_SHM_LMT=shm -e I_MPI_FABRICS=shm:tcp -it -v $(pwd):/opt/relion_5.0/relion_benchmark relion_nru:latest 32 | ``` 33 | **Notes** 34 | - Modify entrypoint.sh if you wish to customize CPU/thread usage. 35 | - The container is designed to work on systems with Intel MPI and oneAPI properly set up inside. 36 | - Ensure Docker has access to sufficient system shared memory (e.g. via /dev/shm or --shm-size if needed). 37 | - The container uses a non-root user (UID 1001). Make sure mounted volumes are writable by this user. 38 | ### Available Run Modes 39 | You can specify different modes as an argument to the Docker command. Each mode will automatically create an output folder inside the `relion_benchmark` directory. 40 | 41 | | Mode | Command Example (append to `docker run`) | Description | Output Folder | 42 | |--------------|--------------------------------------------------------------|------------------------------|--------------| 43 | | *(default)* | `relion_nru:latest` | Run 3D classification | `3D/` | 44 | | `3d` | `relion_nru:latest 3d` | Run 3D classification | `3D/` | 45 | | `2d` | `relion_nru:latest 2d` | Run 2D classification | `2D/` | 46 | | `autorefine` | `relion_nru:latest autorefine` | Run 3D auto-refinement | `3D_AUTO/` | 47 | | `custom` | `relion_nru:latest relion_refine_mpi [your-flags]` | Run any custom RELION command | User-defined (`--o`) | 48 | 49 | --- 50 | 51 | **The Original README for Relion starts here:** 52 | 53 | 54 | RELION 5.0-beta 55 | =============== 56 | 57 | RELION (for REgularised LIkelihood OptimisatioN) is a stand-alone computer 58 | program for Maximum A Posteriori refinement of (multiple) 3D reconstructions 59 | or 2D class averages in cryo-electron microscopy. It is developed in the 60 | research group of Sjors Scheres at the MRC Laboratory of Molecular Biology. 61 | 62 | If RELION is useful in your work, please cite our papers. 63 | 64 | Comprehensive documentation of RELION and tutorials are stored [here](https://relion.readthedocs.io/). 65 | 66 | ## Installation 67 | 68 | See our [installation instructions](https://relion.readthedocs.io/en/release-5.0/Installation.html). 69 | 70 | You will have to set up a Python environment to use Python modules (e.g. Blush, ModelAngelo and DynaMight). 71 | Thus, please read the above instructions carefully even if you are familiar with earlier versions. 72 | 73 | ## Class Ranker 74 | 75 | The default model for the class ranker has been trained and tested in Python 3.9.12 with Pytorch 1.10.0 and Numpy 1.20.0. 76 | If you wish to retrain the class ranker model with your own data, please refer to [this repo](https://github.com/3dem/relion-classranker). 
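The **Notes** section above suggests editing `entrypoint.sh` to customize CPU/thread usage. As a minimal sketch (variable names taken from the `entrypoint.sh` shipped with this Dockerfile; tune the values to your machine), the relevant knobs are:

```zsh
# entrypoint.sh excerpt: resource settings applied to every run mode
NUMMPI=$((16+1))   # MPI ranks passed to mpirun; the +1 accounts for RELION's leader rank
NUMTHR=8           # threads per MPI rank (--j)
POOLSIZE=4         # particles pooled per thread (--pool)
NUMITER=25         # iterations used by the 2D/3D classification modes (--iter)
```

As a rule of thumb, keep NUMMPI x NUMTHR at or below the number of physical cores. Because `entrypoint.sh` is copied into the image at build time, rebuild with `sudo docker build -t relion_nru .` after changing it.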
77 | -------------------------------------------------------------------------------- /applications/relion/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ~/intel/oneapi/2025.0/oneapi-vars.sh 3 | APPEXE=/opt/relion_5.0_cpu_benchmark_prefix/bin/relion_refine_mpi 4 | DEFAULT_DATA_DIR=/opt/relion_5.0/relion_benchmark 5 | cd "$DEFAULT_DATA_DIR" || exit 1 6 | 7 | NUMMPI=$((16+1)) 8 | NUMTHR=8 9 | POOLSIZE=4 10 | NUMITER=25 11 | 12 | DEFAULT_MODE="${1:-3d}" 13 | MODE="${DEFAULT_MODE#-}" 14 | shift 15 | 16 | case "$MODE" in 17 | 2d) 18 | echo "➡️ Running 2D Classification" 19 | mkdir -p 2D 20 | exec mpirun -n $NUMMPI $APPEXE --i Particles/shiny_2sets.star --dont_combine_weights_via_disc --ctf --tau2_fudge 2 --particle_diameter 360 --K 200 --zero_mask --oversampling 1 --psi_step 6 --offset_range 5 --offset_step 2 --norm --scale --random_seed 0 --pad 2 --o 2D/2D --pool $POOLSIZE --j $NUMTHR --iter $NUMITER --cpu 21 | ;; 22 | 3d) 23 | echo "➡️ Running 3D Classification with SYCL" 24 | mkdir -p 3D 25 | exec mpirun -n $NUMMPI $APPEXE --i Particles/shiny_2sets.star --ref emd_2660.map:mrc --firstiter_cc --ini_high 60 --dont_combine_weights_via_disc --ctf --tau2_fudge 4 --particle_diameter 360 --K 6 --flatten_solvent --zero_mask --oversampling 1 --healpix_order 2 --offset_range 5 --offset_step 2 --sym C1 --norm --scale --pad 2 --random_seed 0 --o 3D/3D --pool $POOLSIZE --j $NUMTHR --iter $NUMITER --cpu 26 | ;; 27 | autorefine) 28 | echo "➡️ Running 3D AutoRefine" 29 | mkdir -p 3D_AUTO 30 | exec mpirun -n $NUMMPI $APPEXE --i Particles/shiny_2sets.star --ref emd_2660.map:mrc --firstiter_cc --ini_high 60 --dont_combine_weights_via_disc --ctf --particle_diameter 360 --flatten_solvent --zero_mask --oversampling 1 --healpix_order 2 --offset_range 5 --offset_step 2 --sym C1 --norm --scale --auto_refine --split_random_halves --auto_local_healpix_order 4 --low_resol_join_halves 40 --random_seed 1 --pad 2 --o 3D_AUTO/3D_AUTO --pool $POOLSIZE --j $NUMTHR --cpu 31 | ;; 32 | *) 33 | echo "➡️ Running custom RELION command: $DEFAULT_MODE $@" 34 | exec $DEFAULT_MODE "$@" 35 | ;; 36 | esac 37 | 38 | -------------------------------------------------------------------------------- /applications/relion/relion_env_patch.patch: -------------------------------------------------------------------------------- 1 | diff --git a/environment.yml b/environment.yml 2 | index ab0acb75..98d1d008 100644 3 | --- a/environment.yml 4 | +++ b/environment.yml 5 | @@ -6,8 +6,9 @@ dependencies: 6 | - python=3.10 7 | - setuptools=59.5.0 8 | - pip: 9 | - - torch==2.0.1 10 | - - torchvision==0.15.2 11 | + - cython 12 | + - torch==2.0.1+cpu 13 | + - torchvision==0.15.2+cpu 14 | - tqdm==4.65.0 15 | - mrcfile==1.4.3 16 | - starfile>=0.5.6 17 | @@ -32,4 +33,3 @@ dependencies: 18 | - git+https://github.com/3dem/DynaMight 19 | - git+https://github.com/3dem/topaz 20 | - git+https://github.com/3dem/model-angelo 21 | - - ".[vis]" 22 | -------------------------------------------------------------------------------- /benchmarking/AWS-Intel-blog-v2.1-2024/long_db: -------------------------------------------------------------------------------- 1 | celegans_4020.fa 2 | celegans_4040.fa 3 | celegans_4060.fa 4 | celegans_4080.fa 5 | celegans_4100.fa 6 | celegans_4120.fa 7 | celegans_4140.fa 8 | celegans_4160.fa 9 | celegans_4180.fa 10 | celegans_4200.fa 11 | celegans_4220.fa 12 | celegans_4240.fa 13 | celegans_4260.fa 14 | celegans_4280.fa 15 | celegans_4300.fa 16 | celegans_4320.fa 17 | 
celegans_4340.fa 18 | celegans_4360.fa 19 | -------------------------------------------------------------------------------- /benchmarking/AWS-Intel-blog-v2.1-2024/proteome.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import re 4 | import numpy as np 5 | 6 | 7 | with open("./uniprotkb_proteome.fasta", "r") as f: 8 | lines = f.readlines() 9 | i = -1 10 | protein_list = [] 11 | proteome_list = [] 12 | for line in lines: 13 | if ">sp|" in line: 14 | if i >= 0: 15 | proteome_list.append(protein_list) 16 | protein_list = [] 17 | i = i + 1 18 | protein_list.append(line) 19 | else: 20 | protein_list.append(line) 21 | proteome_list.append(protein_list) 22 | 23 | sorted_list = sorted(proteome_list, key=lambda x: len(''.join(x[1:])), reverse=False) 24 | i = 0 25 | total_len = 0 26 | small_db = open("short_db", "r") 27 | small_list = [line.rstrip() for line in small_db.readlines()] 28 | small_db.close() 29 | #lines = small_db.readlines() 30 | #print(lines) 31 | 32 | long_db = open("long_db", "r") 33 | long_list = [line.rstrip() for line in long_db.readlines()] 34 | long_db.close() 35 | os.makedirs(os.path.expanduser("~/celegans_samples"), exist_ok=True) 36 | os.makedirs(os.path.expanduser("~/celegans_samples_long"), exist_ok=True) 37 | for pl_list in sorted_list: 38 | total_len = total_len + len(''.join(pl_list[1:])) 39 | print(i, len(''.join(pl_list[1:]))) 40 | if "celegans_"+str(i)+".fa" in small_list: 41 | with open(os.path.expanduser("~/celegans_samples/celegans_" + str(i) + ".fa"), "w") as f: 42 | f.writelines(pl_list) 43 | 44 | if "celegans_"+str(i)+".fa" in long_list: 45 | with open(os.path.expanduser("~/celegans_samples_long/celegans_" + str(i) + ".fa"), "w") as f: 46 | f.writelines(pl_list) 47 | 48 | 49 | 50 | i = i + 1 51 | 52 | print(total_len/i) 53 | -------------------------------------------------------------------------------- /benchmarking/AWS-Intel-blog-v2.1-2024/run_pipe_bwa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #docker pull google/deepvariant:1.5.0 3 | lscpu > compute 4 | num_cpus_per_node=$(cat compute | grep -E '^CPU\(s\)' | awk '{print $2}') 5 | 6 | INPUT=~/HG001/ 7 | OUTPUT=~/HG001/OUTPUT/ 8 | echo $OUTPUT 9 | mkdir -p $OUTPUT 10 | #python test_pipe_bwa.py --input $INPUT --output $OUTPUT --index GCA_000001405.15_GRCh38_no_alt_analysis_set.fna --read HG001.novaseq.pcr-free.30x.R1.fastq.gz HG001.novaseq.pcr-free.30x.R2.fastq.gz --cpus 108 --threads 108 --shards 112 11 | python test_pipe_bwa.py --input $INPUT --output $OUTPUT --index Homo_sapiens_assembly38.fasta --read HG001.novaseq.pcr-free.30x.R1.fastq.gz HG001.novaseq.pcr-free.30x.R2.fastq.gz --cpus $num_cpus_per_node --threads $num_cpus_per_node --shards $num_cpus_per_node # 2>&1 | tee ${OUTPUT}log.txt 12 | 13 | echo "Output files are inside "$OUTPUT" folder" 14 | -------------------------------------------------------------------------------- /benchmarking/AWS-Intel-blog-v2.1-2024/short_db: -------------------------------------------------------------------------------- 1 | celegans_0.fa 2 | celegans_100.fa 3 | celegans_1000.fa 4 | celegans_1050.fa 5 | celegans_1100.fa 6 | celegans_1150.fa 7 | celegans_1200.fa 8 | celegans_1250.fa 9 | celegans_1300.fa 10 | celegans_1350.fa 11 | celegans_1400.fa 12 | celegans_1450.fa 13 | celegans_150.fa 14 | celegans_1500.fa 15 | celegans_1550.fa 16 | celegans_1600.fa 17 | celegans_1650.fa 18 | celegans_1700.fa 19 | celegans_1800.fa 20 | celegans_1850.fa 21 | celegans_1900.fa 22 | celegans_1950.fa 23 | celegans_200.fa 24 | celegans_2000.fa 25 | celegans_2050.fa 26 | celegans_2100.fa 27 |
celegans_2150.fa 28 | celegans_2200.fa 29 | celegans_2250.fa 30 | celegans_2300.fa 31 | celegans_2350.fa 32 | celegans_2400.fa 33 | celegans_2450.fa 34 | celegans_250.fa 35 | celegans_2500.fa 36 | celegans_2550.fa 37 | celegans_2600.fa 38 | celegans_2650.fa 39 | celegans_2700.fa 40 | celegans_2750.fa 41 | celegans_2800.fa 42 | celegans_2850.fa 43 | celegans_2900.fa 44 | celegans_2950.fa 45 | celegans_300.fa 46 | celegans_3000.fa 47 | celegans_3050.fa 48 | celegans_3100.fa 49 | celegans_3150.fa 50 | celegans_3200.fa 51 | celegans_3250.fa 52 | celegans_3300.fa 53 | celegans_3350.fa 54 | celegans_3400.fa 55 | celegans_3450.fa 56 | celegans_350.fa 57 | celegans_3500.fa 58 | celegans_3550.fa 59 | celegans_3600.fa 60 | celegans_3650.fa 61 | celegans_3700.fa 62 | celegans_3800.fa 63 | celegans_3850.fa 64 | celegans_3950.fa 65 | celegans_400.fa 66 | celegans_450.fa 67 | celegans_50.fa 68 | celegans_500.fa 69 | celegans_550.fa 70 | celegans_600.fa 71 | celegans_650.fa 72 | celegans_700.fa 73 | celegans_750.fa 74 | celegans_800.fa 75 | celegans_850.fa 76 | celegans_900.fa 77 | celegans_950.fa 78 | -------------------------------------------------------------------------------- /benchmarking/AWS-Intel-blog-v2.1-2024/test_pipe_bwa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from subprocess import Popen, PIPE, run 3 | import subprocess 4 | import time 5 | import os 6 | import sys 7 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter 8 | 9 | def main(argv): 10 | parser=ArgumentParser() 11 | parser.add_argument('--input',help="Input data directory") 12 | parser.add_argument('--output',help="Output data directory") 13 | parser.add_argument("-i", "--index", help="name of index file") 14 | parser.add_argument("-r", "--reads", nargs='+',help="name of reads file seperated by space") 15 | parser.add_argument("-c", "--cpus",default=72,help="Number of cpus. default=72") 16 | parser.add_argument("-t", "--threads",default=72,help="Number of threads used in samtool operations. default=72") 17 | #parser.add_argument("-t", "--threads",default=72,help="Number of threads used in samtool operations. 
default=72") 18 | parser.add_argument('-in', '--istart',action='store_true',help="It Will start indexing") 19 | parser.add_argument('-sindex',action='store_true',help="It Will start creating .fai file") 20 | parser.add_argument('--shards',default=1,help="Number of shards for deepvariant") 21 | args = vars(parser.parse_args()) 22 | ifile=args["index"] 23 | rfile1=args["reads"][0] 24 | rfile2=args["reads"][1] 25 | cpus=args["cpus"] 26 | threads=args["threads"] 27 | index=args["istart"] 28 | sindex=args["sindex"] 29 | nproc=args["shards"] 30 | folder=args["input"] 31 | output=args["output"] 32 | 33 | t0=time.time() 34 | file_size = os.path.getsize(folder+rfile1) 35 | print("\nSize of FASTQ file:",file_size) 36 | 37 | if index==True : 38 | print("Indexing Starts") 39 | begin = time.time() 40 | a=run('../../applications/bwa-0.7.17/bwa index '+folder+ifile,capture_output=True,shell=True) 41 | end=time.time() 42 | file_size = os.path.getsize(folder+rfile1) 43 | print("\nIndex time:",end-begin) 44 | print("\nSize of FASTQ file:",file_size) 45 | 46 | 47 | print("bwa starts") 48 | begin1 = time.time() 49 | print('../../applications/bwa-0.7.17/bwa mem -t '+cpus+' '+folder+ifile+' '+folder+rfile1+' '+folder+rfile2+' > '+output+'aln.sam') 50 | a=run('../../applications/bwa-0.7.17/bwa mem -t '+cpus+' '+folder+ifile+' '+folder+rfile1+' '+folder+rfile2+' > '+output+'aln.sam',capture_output=True, shell=True) 51 | end1=time.time() 52 | #file_size = os.path.getsize(output+'aln.sam') 53 | print("\nFASTQ to SAM time:",end1-begin1) 54 | print("\nSize of SAM file:",file_size) 55 | 56 | print("sam to sort-bam starts") 57 | begin2=time.time() 58 | print(output+'aln.bam') 59 | a=run('../../applications/samtools/samtools sort --threads '+threads+' -T /tmp/aln.sorted -o '+output+'aln.bam '+output+'aln.sam',capture_output=True,shell=True) 60 | end2=time.time() 61 | file_size = os.path.getsize(output+'aln.bam') 62 | print("\nSAM to sort-BAM time:",end2-begin2) 63 | print("\nSize of sort-BAM file",file_size) 64 | 65 | begin3=time.time() 66 | print("Indexing of ref and read starts") 67 | if sindex==True : 68 | a=run('../../applications/samtools/samtools faidx '+folder+ifile,capture_output=True,shell=True) 69 | 70 | print('../../applications/samtools/samtools index -M -@ '+threads+' '+output+'aln.bam') 71 | a=run('../../applications/samtools/samtools index -M -@ '+threads+' '+output+'aln.bam',capture_output=True,shell=True) 72 | 73 | end3=time.time() 74 | print("\nIndex creation time",end3-begin3) 75 | 76 | begin5=time.time() 77 | #original 78 | command='sudo docker run -v '+folder+':"/input" -v '+output+':"/output" google/deepvariant:1.5.0 /opt/deepvariant/bin/run_deepvariant --model_type=WGS --ref=/input/'+ifile+' --reads=/output/aln.bam --output_vcf=/output/output.vcf.gz --output_gvcf=/output/output.g.vcf.gz --intermediate_results_dir /output/intermediate_results_dir --num_shards='+nproc+' --dry_run=false' 79 | #updated 80 | #command='podman run -v '+folder+':"/input" -v '+output+':"/output" localhost/deepvariant:latest /opt/deepvariant/bin/run_deepvariant --model_type=WGS --ref=/input/'+ifile+' --reads=/output/aln.sorted.new.bam --output_vcf=/output/output.vcf.gz --intermediate_results_dir /output/intermediate_results_dir --num_shards='+nproc+' --pcl_opt --dry_run=false' 81 | print(command) 82 | a=run( command+" 2>&1 >> "+output+"log_deepvariant.txt", shell=True) 83 | #pid=subprocess.call(command,shell=True) 84 | #os.system(command) 85 | end5=time.time() 86 | print("\nDeepVariant runtime",end5-begin5) 87 | 
print("Pipeline runtime",end5-t0) 88 | 89 | if __name__ == "__main__": 90 | main(sys.argv[1:]) 91 | -------------------------------------------------------------------------------- /images/Open-Omics-Acceleration-Framework v2.0.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/Open-Omics-Acceleration-Framework v2.0.JPG -------------------------------------------------------------------------------- /images/Open-Omics-Acceleration-Framework v2.0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/Open-Omics-Acceleration-Framework v2.0.jpg -------------------------------------------------------------------------------- /images/Open-Omics-Acceleration-Framework v3.0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/Open-Omics-Acceleration-Framework v3.0.jpg -------------------------------------------------------------------------------- /images/Open-Omics-Acceleration-Framework-v2.0.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/Open-Omics-Acceleration-Framework-v2.0.JPG -------------------------------------------------------------------------------- /images/Open-Omics-Acceleration-Framework-v3.0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/Open-Omics-Acceleration-Framework-v3.0.jpg -------------------------------------------------------------------------------- /images/alphafold2-protein-folding.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/alphafold2-protein-folding.jpg -------------------------------------------------------------------------------- /images/deepvariant-fq2vcf.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/deepvariant-fq2vcf.jpg -------------------------------------------------------------------------------- /images/open-omics-acceleration-framework-v2.0.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/open-omics-acceleration-framework-v2.0.JPG -------------------------------------------------------------------------------- /images/open-omics-acceleration-framework.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/open-omics-acceleration-framework.JPG -------------------------------------------------------------------------------- 
/images/scrnaseq-analysis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/scrnaseq-analysis.jpg -------------------------------------------------------------------------------- /pipelines/alphafold2-based-protein-folding/Dockerfile_Inf: -------------------------------------------------------------------------------- 1 | ARG FROM_IMAGE=ubuntu:22.04 2 | # Install Base miniconda image 3 | ARG BASE_IMAGE=condaforge/miniforge3:23.1.0-3 4 | FROM ${BASE_IMAGE} as conda_setup 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | # Non-root user setup 8 | ENV SERVICE_NAME="alphafold2-inf-service" 9 | 10 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 11 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 12 | 13 | # Install Anaconda and PIP dependency 14 | RUN conda update -n base conda 15 | RUN conda install python==3.11 16 | RUN conda install -y -c conda-forge gcc_linux-64==12.1.0 gxx_linux-64==12.1.0 17 | RUN conda install -y -c conda-forge openmm==8.0.0 pdbfixer==1.9 18 | RUN conda install -y bioconda::kalign2==2.04 19 | RUN conda install -y -c conda-forge mkl==2024.2.0 mkl-devel==2024.2.0 20 | RUN python -m pip install onednn-cpu-iomp==2023.2.0 21 | RUN python -m pip install torch==2.1.0 pybind11==2.11.1 22 | RUN python -m pip install absl-py==2.0.0 biopython==1.81 chex==0.1.84 dm-haiku==0.0.10 dm-tree==0.1.8 immutabledict==3.0.0 ml-collections==0.1.1 numpy==1.26.1 scipy==1.11.3 tensorflow==2.14.0 pandas==2.1.1 psutil==5.9.6 tqdm==4.65.0 joblib==1.3.2 pragzip==0.6.0 23 | RUN python -m pip install jax==0.4.21 jaxlib==0.4.21 24 | RUN python -m pip install intel-extension-for-pytorch==2.1.0 intel-openmp==2024.2.0 25 | RUN conda install -y -c conda-forge autoconf==2.71 26 | RUN conda install -y -c conda-forge make==4.3 27 | 28 | 29 | FROM ${FROM_IMAGE} as builder 30 | ENV DEBIAN_FRONTEND=noninteractive 31 | 32 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 33 | git build-essential cmake wget tzdata gcc curl gnupg gnupg2 gnupg1 sudo kalign autoconf numactl time vim tar ca-certificates -y\ 34 | && rm -rf /var/lib/apt/lists/* \ 35 | && apt-get autoremove -y \ 36 | && apt-get clean 37 | RUN apt update 38 | 39 | COPY --from=conda_setup /opt/conda /opt/conda 40 | ENV PATH "/opt/conda/bin:$PATH" 41 | RUN echo "source activate" >> ~/.bashrc 42 | CMD source ~/.bashrc 43 | 44 | WORKDIR / 45 | # RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 46 | # RUN tar -xzf Source_code_with_submodules.tar.gz 47 | RUN git clone --recursive https://github.com/IntelLabs/Open-Omics-Acceleration-Framework.git 48 | WORKDIR /Open-Omics-Acceleration-Framework/applications/ 49 | 50 | RUN git clone --branch 5.3.0 https://github.com/jemalloc/jemalloc.git 51 | WORKDIR /Open-Omics-Acceleration-Framework/applications/jemalloc 52 | RUN bash autogen.sh --prefix=/opt/conda/ && make install 53 | WORKDIR /Open-Omics-Acceleration-Framework/applications 54 | RUN rm -rf jemalloc 55 | 56 | ENV PATH="/usr/bin:$PATH" 57 | ENV PATH "/opt/conda/bin:$PATH" 58 | ENV LD_LIBRARY_PATH "/opt/conda/lib:$LD_LIBRARY_PATH" 59 | 60 | 61 | 62 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/ 63 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/tpp-pytorch-extension 64 | RUN CC=gcc && CXX=g++ && python setup.py install 
\ 65 | && python -c "from tpp_pytorch_extension.alphafold.Alpha_Attention import GatingAttentionOpti_forward" 66 | 67 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/ 68 | RUN wget -q -P ./alphafold/common/ https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt --no-check-certificate 69 | 70 | 71 | # Swith to Non-root user 72 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /Open-Omics-Acceleration-Framework 73 | USER $SERVICE_NAME 74 | 75 | HEALTHCHECK NONE 76 | 77 | ENV PATH "/opt/conda/bin:$PATH" 78 | ENV LD_LIBRARY_PATH "/opt/conda/lib:$LD_LIBRARY_PATH" 79 | 80 | COPY ./entrypoint_inf.sh / 81 | RUN chmod +x /entrypoint_inf.sh 82 | 83 | ENTRYPOINT ["/entrypoint_inf.sh"] 84 | 85 | # Default command 86 | CMD ["default"] -------------------------------------------------------------------------------- /pipelines/alphafold2-based-protein-folding/Dockerfile_Pre: -------------------------------------------------------------------------------- 1 | ARG FROM_IMAGE=ubuntu:22.04 2 | # Install Base miniconda image 3 | ARG BASE_IMAGE=condaforge/miniforge3:24.3.0-0 4 | FROM ${BASE_IMAGE} as conda_setup 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | # Non-root user setup 8 | ENV SERVICE_NAME="alphafold2-pre-service" 9 | 10 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 11 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 12 | 13 | # Install Anaconda and PIP dependency 14 | RUN conda update -n base conda 15 | RUN conda install python==3.11 16 | RUN conda install -y -c conda-forge mkl==2024.2.0 dpcpp_linux-64==2024.2.0 dpcpp-cpp-rt==2024.2.0 mkl-devel==2024.2.0 17 | RUN conda install -y -c conda-forge openmm==8.0.0 pdbfixer==1.9 18 | RUN conda install -y -c bioconda hmmer=3.3.2 hhsuite==3.3.0 kalign2==2.04 19 | RUN python -m pip install onednn-cpu-iomp==2023.2.0 20 | RUN python -m pip install torch==2.1.0 pybind11==2.11.1 21 | RUN python -m pip install absl-py==2.0.0 biopython==1.81 chex==0.1.84 dm-haiku==0.0.10 dm-tree==0.1.8 immutabledict==3.0.0 ml-collections==0.1.1 numpy==1.26.1 scipy==1.11.3 tensorflow==2.14.0 pandas==2.1.1 psutil==5.9.6 tqdm==4.65.0 joblib==1.3.2 pragzip==0.6.0 22 | RUN python -m pip install jax==0.4.21 jaxlib==0.4.21 23 | RUN python -m pip install intel-extension-for-pytorch==2.1.0 intel-openmp==2024.2.0 24 | 25 | 26 | FROM ${FROM_IMAGE} as builder 27 | ENV DEBIAN_FRONTEND=noninteractive 28 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 29 | git build-essential cmake wget tzdata gcc curl gnupg gnupg2 gnupg1 sudo kalign autoconf numactl time vim tar ca-certificates -y \ 30 | && rm -rf /var/lib/apt/lists/* \ 31 | && apt-get autoremove -y \ 32 | && apt-get clean \ 33 | && apt update 34 | 35 | 36 | COPY --from=conda_setup /opt/conda /opt/conda 37 | ENV PATH "/opt/conda/bin:$PATH" 38 | RUN echo "source /opt/conda/bin/activate " > ~/.bashrc 39 | CMD source ~/.bashrc 40 | 41 | WORKDIR / 42 | # RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 43 | # RUN tar -xzf Source_code_with_submodules.tar.gz 44 | RUN git clone --recursive https://github.com/IntelLabs/Open-Omics-Acceleration-Framework.git 45 | 46 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold 47 | ENV LD_LIBRARY_PATH "/opt/conda/lib:$LD_LIBRARY_PATH" 48 | ENV PATH "/opt/conda/bin:$PATH" 49 | # Compile HHsuite from source. 
50 | # 51 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold 52 | RUN git clone --recursive https://github.com/IntelLabs/hh-suite.git 53 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/hh-suite 54 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/hh-suite/build 55 | RUN cmake -DCMAKE_INSTALL_PREFIX=`pwd`/release -DCMAKE_CXX_COMPILER="icpx" -DCMAKE_CXX_FLAGS_RELEASE="-O3 -mavx512bw" .. \ 56 | && make -j 4 && make install \ 57 | && ./release/bin/hhblits -h 58 | 59 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold 60 | # Compile Hmmer from source. 61 | RUN git clone --recursive https://github.com/IntelLabs/hmmer.git 62 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/hmmer 63 | RUN cp easel_makefile.in easel/Makefile.in 64 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/hmmer/easel 65 | RUN autoconf && ./configure --prefix=`pwd` 66 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/hmmer 67 | RUN autoconf && CC=icx CFLAGS="-O3 -mavx512bw -fPIC" ./configure --prefix=`pwd`/release \ 68 | && make -j 4 && make install \ 69 | && ./release/bin/jackhmmer -h 70 | 71 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold 72 | # Swith to Non-root user 73 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /Open-Omics-Acceleration-Framework 74 | USER $SERVICE_NAME 75 | 76 | HEALTHCHECK NONE 77 | 78 | RUN echo "source /opt/conda/bin/activate " > ~/.bashrc 79 | CMD source ~/.bashrc 80 | ENV PATH "/opt/conda/bin:$PATH" 81 | ENV LD_LIBRARY_PATH "/opt/conda/lib:$LD_LIBRARY_PATH" 82 | 83 | COPY ./entrypoint_pre.sh / 84 | RUN chmod +x /entrypoint_pre.sh 85 | 86 | ENTRYPOINT ["/entrypoint_pre.sh"] 87 | 88 | # Default command 89 | CMD ["default"] -------------------------------------------------------------------------------- /pipelines/alphafold2-based-protein-folding/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline overview 2 | Given one or more protein sequences, this workflow performs preprocessing (database search and multiple sequence alignment using Open Omics [HMMER](https://github.com/IntelLabs/hmmer) and [HH-suite](https://github.com/IntelLabs/hh-suite)) and structure prediction through AlphaFold2's Evoformer model ([Open Omics AlphaFold2](https://github.com/IntelLabs/open-omics-alphafold)) to output the structure(s) of the protein sequences. The following block diagram illustrates the pipeline. 3 | 4 |

5 | <!-- Pipeline block diagram image omitted in this text export (likely images/alphafold2-protein-folding.jpg in this repository). --> 6 |

7 | 8 | # Build a docker image 9 | 10 | ### The current docker image requires a single-socket or dual-socket CPU with 1 or 2 NUMA domains, because it runs multiple inference instances in parallel. It can easily be modified to run on other types of machines. 11 | 12 | ```bash 13 | cd ~/Open-Omics-Acceleration-Framework/pipelines/alphafold2-based-protein-folding 14 | docker build -t alphafold:pre -f Dockerfile_Pre . # Build a docker image named alphafold:pre for the pre-processing step 15 | docker build -t alphafold:inf -f Dockerfile_Inf . # Build a docker image named alphafold:inf for the inference step 16 | 17 | ``` 18 | # Preparation 19 | 1. Follow the instructions from the https://github.com/deepmind/alphafold repo and download the databases for AlphaFold2. 20 | 2. Create a samples directory that contains FASTA files for the input proteins. 21 | 3. Create an output directory where model output will be written. 22 | 4. Create a log directory where logs will be written. 23 | # Run a docker container 24 | ```bash 25 | export DATA_DIR= 26 | export SAMPLES_DIR= 27 | export OUTPUT_DIR= 28 | export LOG_DIR= 29 | 30 | 31 | # Run pre-processing step for monomer 32 | docker run -it --cap-add SYS_NICE -v $DATA_DIR:/data \ 33 | -v $SAMPLES_DIR:/samples \ 34 | -v $OUTPUT_DIR:/output \ 35 | -v $LOG_DIR:/Open-Omics-Acceleration-Framework/applications/alphafold/logs \ 36 | alphafold:pre 37 | 38 | # Run pre-processing step for multimer 39 | docker run -it --cap-add SYS_NICE -v $DATA_DIR:/data \ 40 | -v $SAMPLES_DIR:/samples \ 41 | -v $OUTPUT_DIR:/output \ 42 | -v $LOG_DIR:/Open-Omics-Acceleration-Framework/applications/alphafold/logs \ 43 | alphafold:pre multimer 44 | 45 | # Run inference step for monomer with relaxation 46 | docker run -it --cap-add SYS_NICE -v $DATA_DIR:/data \ 47 | -v $SAMPLES_DIR:/samples \ 48 | -v $OUTPUT_DIR:/output \ 49 | -v $LOG_DIR:/Open-Omics-Acceleration-Framework/applications/alphafold/logs \ 50 | alphafold:inf monomer relax 51 | 52 | # Run inference step for multimer with relaxation 53 | docker run -it --cap-add SYS_NICE -v $DATA_DIR:/data \ 54 | -v $SAMPLES_DIR:/samples \ 55 | -v $OUTPUT_DIR:/output \ 56 | -v $LOG_DIR:/Open-Omics-Acceleration-Framework/applications/alphafold/logs \ 57 | alphafold:inf multimer relax 58 | ``` 59 | 60 | # Running bare metal 61 | 62 | To run the optimized AlphaFold2 without docker (bare metal): 63 | 1. Clone the open-omics-alphafold submodule present in the applications directory of this repo. 64 | 2. Follow the README instructions of the submodule for creating the conda environment and running inference.
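For convenience, the docker run invocations documented above can be chained into a small wrapper script. The sketch below covers the monomer preset; the four host paths are placeholders (assumptions), not defaults of the pipeline.

```bash
#!/bin/bash
# Hedged convenience wrapper chaining the documented pre-processing, inference and
# relaxation runs for the monomer preset. Adjust the placeholder paths to your setup.
set -e
export DATA_DIR=/path/to/alphafold_databases
export SAMPLES_DIR=/path/to/fasta_samples
export OUTPUT_DIR=/path/to/output
export LOG_DIR=/path/to/logs

run() {
  docker run -it --cap-add SYS_NICE \
    -v "$DATA_DIR":/data \
    -v "$SAMPLES_DIR":/samples \
    -v "$OUTPUT_DIR":/output \
    -v "$LOG_DIR":/Open-Omics-Acceleration-Framework/applications/alphafold/logs \
    "$@"
}

run alphafold:pre                 # database search + MSA pre-processing
run alphafold:inf monomer relax   # Evoformer inference followed by relaxation
```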
65 | -------------------------------------------------------------------------------- /pipelines/alphafold2-based-protein-folding/entrypoint_inf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "$1" = "multimer" ]; then 4 | echo "Running command for multimer" 5 | mkdir weights && mkdir weights/extracted && python extract_params.py --input /data/params/params_model_1_multimer_v3.npz --output_dir ./weights/extracted/model_1_multimer_v3 \ 6 | && python extract_params.py --input /data/params/params_model_2_multimer_v3.npz --output_dir ./weights/extracted/model_2_multimer_v3 \ 7 | && python extract_params.py --input /data/params/params_model_3_multimer_v3.npz --output_dir ./weights/extracted/model_3_multimer_v3 \ 8 | && python extract_params.py --input /data/params/params_model_4_multimer_v3.npz --output_dir ./weights/extracted/model_4_multimer_v3 \ 9 | && python extract_params.py --input /data/params/params_model_5_multimer_v3.npz --output_dir ./weights/extracted/model_5_multimer_v3 \ 10 | && LD_PRELOAD=/opt/conda/lib/libjemalloc.so:$LD_PRELOAD \ 11 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" \ 12 | python run_multiprocess_infer_multimer.py --root_condaenv=/opt/conda --root_home=/Open-Omics-Acceleration-Framework/applications/alphafold --input_dir=/samples --output_dir=/output --model_names="model_1_multimer_v3,model_2_multimer_v3,model_3_multimer_v3,model_4_multimer_v3,model_5_multimer_v3" --num_multimer_predictions_per_model=5 13 | if [ "$2" = "relax" ]; then 14 | echo "Running command for relaxation" 15 | python run_multiprocess_relax.py --root_home=/Open-Omics-Acceleration-Framework/applications/alphafold --input_dir=/samples --output_dir=/output --model_names="model_1_multimer_v3,model_2_multimer_v3,model_3_multimer_v3,model_4_multimer_v3,model_5_multimer_v3" --model_preset=multimer --num_multimer_predictions_per_model=5 16 | fi 17 | 18 | elif [ "$1" = "monomer" ]; then 19 | echo "Running command for monomer" 20 | mkdir weights && mkdir weights/extracted && python extract_params.py --input /data/params/params_model_1.npz --output_dir ./weights/extracted/model_1 \ 21 | && python extract_params.py --input /data/params/params_model_2.npz --output_dir ./weights/extracted/model_2 \ 22 | && python extract_params.py --input /data/params/params_model_3.npz --output_dir ./weights/extracted/model_3 \ 23 | && python extract_params.py --input /data/params/params_model_4.npz --output_dir ./weights/extracted/model_4 \ 24 | && python extract_params.py --input /data/params/params_model_5.npz --output_dir ./weights/extracted/model_5 \ 25 | && LD_PRELOAD=/opt/conda/lib/libjemalloc.so:$LD_PRELOAD \ 26 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" \ 27 | python run_multiprocess_infer.py --root_condaenv=/opt/conda --root_home=/Open-Omics-Acceleration-Framework/applications/alphafold --input_dir=/samples --output_dir=/output --model_names="model_1,model_2,model_3,model_4,model_5" 28 | 29 | if [ "$2" = "relax" ]; then 30 | echo "Running command for relaxation" 31 | python run_multiprocess_relax.py --root_home=/Open-Omics-Acceleration-Framework/applications/alphafold --input_dir=/samples --output_dir=/output --model_names="model_1,model_2,model_3,model_4,model_5" --model_preset=monomer 32 | fi 33 | 34 | else 35 | echo "Running command for monomer" 36 | mkdir weights && mkdir weights/extracted && python 
extract_params.py --input /data/params/params_model_1.npz --output_dir ./weights/extracted/model_1 \ 37 | && python extract_params.py --input /data/params/params_model_2.npz --output_dir ./weights/extracted/model_2 \ 38 | && python extract_params.py --input /data/params/params_model_3.npz --output_dir ./weights/extracted/model_3 \ 39 | && python extract_params.py --input /data/params/params_model_4.npz --output_dir ./weights/extracted/model_4 \ 40 | && python extract_params.py --input /data/params/params_model_5.npz --output_dir ./weights/extracted/model_5 \ 41 | && LD_PRELOAD=/opt/conda/lib/libjemalloc.so:$LD_PRELOAD \ 42 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" \ 43 | python run_multiprocess_infer.py --root_condaenv=/opt/conda --root_home=/Open-Omics-Acceleration-Framework/applications/alphafold --input_dir=/samples --output_dir=/output --model_names="model_1,model_2,model_3,model_4,model_5" 44 | fi 45 | -------------------------------------------------------------------------------- /pipelines/alphafold2-based-protein-folding/entrypoint_pre.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "$1" = "multimer" ]; then 4 | echo "Running command for multimer" 5 | mkdir weights && mkdir weights/extracted && python extract_params.py --input /data/params/params_model_1_multimer_v3.npz --output_dir ./weights/extracted/model_1_multimer_v3 \ 6 | && python extract_params.py --input /data/params/params_model_2_multimer_v3.npz --output_dir ./weights/extracted/model_2_multimer_v3 \ 7 | && python extract_params.py --input /data/params/params_model_3_multimer_v3.npz --output_dir ./weights/extracted/model_3_multimer_v3 \ 8 | && python extract_params.py --input /data/params/params_model_4_multimer_v3.npz --output_dir ./weights/extracted/model_4_multimer_v3 \ 9 | && python extract_params.py --input /data/params/params_model_5_multimer_v3.npz --output_dir ./weights/extracted/model_5_multimer_v3 \ 10 | && python run_multiprocess_pre_multimer.py --root_home=/Open-Omics-Acceleration-Framework/applications/alphafold --data_dir=/data --input_dir=/samples --output_dir=/output 11 | else 12 | echo "Running command for monomer" 13 | mkdir weights && mkdir weights/extracted && python extract_params.py --input /data/params/params_model_1.npz --output_dir ./weights/extracted/model_1 \ 14 | && python run_multiprocess_pre.py --root_home=/Open-Omics-Acceleration-Framework/applications/alphafold --data_dir=/data --input_dir=/samples --output_dir=/output --model_name=model_1 15 | fi -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/Dockerfile_fq2bams: -------------------------------------------------------------------------------- 1 | # Install Base miniconda image 2 | ARG FROM_IMAGE=amazonlinux:2023 3 | #ARG BASE_IMAGE=continuumio/miniconda3 4 | ARG BASE_IMAGE=condaforge/miniforge3 5 | FROM ${BASE_IMAGE} as conda_setup 6 | 7 | WORKDIR / 8 | RUN git clone --recursive https://github.com/IntelLabs/Open-Omics-Acceleration-Framework.git 9 | WORKDIR /Open-Omics-Acceleration-Framework 10 | RUN git checkout 060a0c76ad4ded6d6de709b0466b8bdafdc6053d 11 | WORKDIR /Open-Omics-Acceleration-Framework/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/ 12 | 13 | RUN conda env create --name dv_env -f environment.yml 14 | 15 | 16 | FROM ${FROM_IMAGE} as builder 17 | RUN yum update -y && \ 18 | yum install 
--allowerasing -y git gcc make cmake3 tar gnupg2 autoconf numactl time vim && \ 19 | yum clean all && \ 20 | rm -rf /var/cache/yum 21 | RUN yum install -y procps 22 | RUN yum groupinstall -y 'Development Tools' 23 | RUN yum -y update 24 | RUN yum -y install make zlib-devel ncurses-devel 25 | RUN yum -y install bzip2-devel xz-devel 26 | RUN yum -y install yum-utils 27 | 28 | COPY --from=conda_setup /opt/conda /opt/conda 29 | ENV PATH "/opt/conda/envs/dv_env/bin:/opt/conda/bin:$PATH" 30 | RUN echo "source activate dv_env" >> ~/.bashrc 31 | RUN source ~/.bashrc 32 | 33 | WORKDIR / 34 | RUN git clone --recursive https://github.com/IntelLabs/Open-Omics-Acceleration-Framework.git 35 | WORKDIR /Open-Omics-Acceleration-Framework 36 | RUN git checkout 060a0c76ad4ded6d6de709b0466b8bdafdc6053d 37 | 38 | WORKDIR /Open-Omics-Acceleration-Framework/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/ 39 | 40 | 41 | # compile bwa-mem2 42 | RUN echo "Build bwa-mem2" 43 | WORKDIR /Open-Omics-Acceleration-Framework/applications/bwa-mem2 44 | RUN make multi 45 | 46 | 47 | # compile htslib 48 | WORKDIR /Open-Omics-Acceleration-Framework/applications/htslib 49 | RUN autoreconf -i # Build the configure script and install files it uses 50 | RUN ./configure # Optional but recommended, for choosing extra functionality 51 | RUN make 52 | #make install #uncomment this for installation 53 | 54 | # compile samtools 55 | WORKDIR /Open-Omics-Acceleration-Framework/applications/samtools 56 | RUN autoheader 57 | RUN autoconf -Wno-syntax 58 | RUN chmod 775 configure 59 | RUN ./configure # Needed for choosing optional functionality 60 | RUN make 61 | 62 | RUN mkdir /input 63 | RUN mkdir /output 64 | WORKDIR /Open-Omics-Acceleration-Framework/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/ 65 | CMD ["/bin/bash"] 66 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/bams2vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import json 5 | from subprocess import Popen, PIPE, run 6 | import subprocess 7 | import time 8 | import os 9 | import sys 10 | import threading 11 | import tempfile 12 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter 13 | from mpi4py import MPI 14 | import bisect 15 | import heapq as hq 16 | import numpy as np 17 | from multiprocessing import Pool 18 | from operator import itemgetter 19 | import pickle 20 | BINDIR="../.." 
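# Added commentary (not part of the original script): bams2vcf.py is launched as one
# MPI rank per genome bin. With bins_per_rank = 1, each rank r reads the per-bin
# alignment aln<"%05d" % r>.bam plus the shared bin_region.pkl produced by the
# fq2bams step, runs DeepVariant restricted to its region via --regions, and writes
# <"%05d" % r>/output.vcf.gz; rank 0 then concatenates the per-bin VCFs with
# merge_vcf.sh. For example, with 4 ranks the inputs are aln00000.bam .. aln00003.bam
# covering bin_region[0] .. bin_region[3].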
21 | 22 | def allexit(comm, flg): 23 | comm.barrier() 24 | flg = comm.bcast(flg, root=0) 25 | if flg: os.sys.exit(1) 26 | 27 | 28 | #def main(argv): 29 | def main(args): 30 | ifile=args["refindex"] 31 | cpus=args["cpus"] 32 | threads=args["threads"] 33 | nproc=args["shards"] 34 | inputdir=args["input"] + "/" 35 | output=args["output"] + "/" 36 | refdir=args["refdir"] + "/" 37 | #tempdir=output 38 | tempdir = args["tempdir"] 39 | if tempdir == "": tempdir = output 40 | else: tempdir = tempdir + "/" 41 | 42 | outfile = args["outfile"] 43 | 44 | comm = MPI.COMM_WORLD 45 | rank = comm.Get_rank() 46 | nranks = comm.Get_size() 47 | bin_region=None 48 | 49 | global ncpus 50 | ncpus = int(cpus) 51 | i = 0 52 | binstr = "%05d"%(i*nranks+rank) 53 | t0 = time.time() 54 | if not os.path.isfile(os.path.join(output, 'bin_region.pkl')): 55 | print("[Info] Missing intermediate .pkl files from fq2bam part of the pipeline.") 56 | os.sys.exit(1) 57 | 58 | if not os.path.isfile(os.path.join(inputdir, 'aln' + binstr + '.bam')): 59 | print("[Info] Missing intermediate .bam files from fq2bam part of the pipeline.") 60 | os.sys.exit(1) 61 | 62 | if (ifile == "" or ifile == None) or not os.path.isfile(os.path.join(refdir, ifile)): 63 | print("[Info] Missing reference file.") 64 | os.sys.exit(1) 65 | 66 | 67 | with open(os.path.join(inputdir, 'bin_region.pkl'), 'rb') as f: 68 | bin_region = pickle.load(f) 69 | print(bin_region) 70 | 71 | command='mkdir -p '+os.path.join(output,binstr)+ \ 72 | '; /opt/deepvariant/bin/run_deepvariant --model_type=WGS --ref=' + \ 73 | os.path.join(refdir, ifile) + \ 74 | ' --reads='+ os.path.join(inputdir, 'aln' + binstr + '.bam') + ' ' + \ 75 | ' --output_vcf=' + os.path.join(output, binstr, 'output.vcf.gz ') + ' ' + \ 76 | ' --intermediate_results_dir '+ \ 77 | os.path.join(output, 'intermediate_results_dir'+ binstr) + \ 78 | ' --num_shards='+ str(nproc)+ \ 79 | ' --dry_run=false --regions "' + bin_region[i*nranks+rank]+'"' 80 | 81 | print("Deepvariant commandline: ") 82 | print(command) 83 | 84 | a = run('echo "'+command+'" > '+os.path.join(output, "logs", 'dvlog'+binstr+'.txt'), shell=True) 85 | a = run(command + " 2>&1 >> " + os.path.join(output, "logs", 'dvlog'+binstr+'.txt'), shell=True) 86 | assert a.returncode == 0,"[Info] Deepvariant execution failed." 
87 | comm.barrier() 88 | 89 | bins_per_rank = 1 90 | flg = 0 91 | if rank == 0: 92 | cmd = 'bash merge_vcf.sh '+output +' '+str(nranks)+' '+str(bins_per_rank) + ' ' + outfile + " > " + output + "/logs/mergelog.txt" 93 | a = run(cmd, capture_output = True, shell = True) 94 | #assert a.returncode == 0,"VCF merge failed" 95 | if a.returncode != 0: 96 | flg = 1 97 | print("[Info] VCF merge failed.") 98 | end5 = time.time() 99 | print("\nDeepVariant runtime",end5-t0) 100 | #print("\nTime for the whole pipeline",end5-start0) 101 | for i in range(nranks): 102 | r = "%05d"%(i) 103 | p = os.path.join(output, r) 104 | #print('path: ', p) 105 | os.system('rm -rf ' + p) 106 | 107 | if rank == nranks - 1: 108 | print('[Info] Cleaning up....') 109 | for i in range(nranks): 110 | r = "%05d"%(i) 111 | #print(r) 112 | #os.system('ls -lh ' + r) 113 | if not args['keep_input']: 114 | os.system('rm -rf ' + os.path.join(inputdir, "bin_region.pkl")) 115 | os.system('rm -rf ' + os.path.join(inputdir, "aln"+r+".bam")) 116 | os.system('rm -rf ' + os.path.join(inputdir, "aln"+r+".bam.bai")) 117 | 118 | os.system('rm -rf '+ os.path.join(output, 'intermediate_results_dir' + r)) 119 | print('[Info] Cleaning up done.') 120 | 121 | 122 | allexit(comm, flg) ## all ranks exit if failure in rank 0 above 123 | 124 | if __name__ == "__main__": 125 | args = json.loads(sys.argv[1]) 126 | main(args) 127 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/docs.txt: -------------------------------------------------------------------------------- 1 | python run_fq2bams.py --ref /refdir/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna --reads /input/HG001.novaseq.pcr-free.30x.R1_29M.fastq.gz --output /out/ --params '-R "@RG\\tID:RG1\\tSM:RGSN1"' --keep_intermediate_sam 2 | 3 | python run_bams2vcf.py --ref /refdir/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna --output /o 4 | ut/out2.vcf --input /out/ 5 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/environment.yml: -------------------------------------------------------------------------------- 1 | name: new_env 2 | channels: 3 | - conda-forge 4 | - anaconda 5 | dependencies: 6 | - _libgcc_mutex=0.1=main 7 | - _openmp_mutex=5.1=1_gnu 8 | - blas=1.0=mkl 9 | - ca-certificates=2023.01.10=h06a4308_0 10 | - certifi=2022.12.7=py39h06a4308_0 11 | - intel-openmp=2021.4.0=h06a4308_3561 12 | - ld_impl_linux-64=2.38=h1181459_1 13 | - libffi=3.4.2=h6a678d5_6 14 | - libgcc-ng=11.2.0=h1234567_1 15 | - libgfortran-ng=7.5.0=ha8ba4b0_17 16 | - libgfortran4=7.5.0=ha8ba4b0_17 17 | - libgomp=11.2.0=h1234567_1 18 | - libstdcxx-ng=11.2.0=h1234567_1 19 | - mkl=2021.4.0=h06a4308_640 20 | - mkl-service=2.4.0=py39h7f8727e_0 21 | - mkl_fft=1.3.1=py39hd3c417c_0 22 | - mkl_random=1.2.2=py39h51133e4_0 23 | - mpi=1.0=mpich 24 | - mpi4py=3.1.4=py39hfc96bbd_0 25 | - mpich=3.3.2=hc856adb_0 26 | - ncurses=6.4=h6a678d5_0 27 | - numpy=1.23.5=py39h14f4228_0 28 | - numpy-base=1.23.5=py39h31eccc5_0 29 | - openssl=1.1.1t=h7f8727e_0 30 | - pip=23.0.1=py39h06a4308_0 31 | - python=3.9.16=h7a1cb2a_2 32 | - readline=8.2=h5eee18b_0 33 | - setuptools=65.6.3=py39h06a4308_0 34 | - six=1.16.0=pyhd3eb1b0_1 35 | - sqlite=3.41.1=h5eee18b_0 36 | - tk=8.6.12=h1ccaba5_0 37 | - tzdata=2023c=h04d1e81_0 38 | - wheel=0.38.4=py39h06a4308_0 39 | - xz=5.2.10=h5eee18b_1 40 | - zlib=1.2.13=h5eee18b_0 41 | - pip: 42 | - pragzip==0.5.0 43 | - yappi==1.4.0 44 | 45 
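The environment.yml above provides the MPI-enabled Python environment (mpich/mpi4py) that the fq2vcf drivers run in, and docs.txt earlier lists the two documented entry points. A hedged end-to-end sketch, assuming the /refdir, /input and /out mount points used in docs.txt:

```bash
# Create the conda environment (named dv_env, as in Dockerfile_fq2bams) and activate it.
conda env create --name dv_env -f environment.yml
conda activate dv_env

# Stage 1: FASTQ -> sorted, indexed per-bin BAMs (plus bin_region.pkl).
python run_fq2bams.py \
  --ref /refdir/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
  --reads /input/HG001.novaseq.pcr-free.30x.R1_29M.fastq.gz \
  --output /out/ \
  --params '-R "@RG\\tID:RG1\\tSM:RGSN1"' \
  --keep_intermediate_sam

# Stage 2: per-bin BAMs -> merged VCF via DeepVariant.
python run_bams2vcf.py \
  --ref /refdir/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
  --output /out/out2.vcf \
  --input /out/
```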
| -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/libmimalloc.so.2.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/libmimalloc.so.2.0 -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/merge_vcf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 4 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 5 | #echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 6 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 7 | #echo "Value of ABS_DIRECTORY: ${ABS_DIRECTORY}" 8 | 9 | echo $1 $2 $3 $4 10 | #path ranks bins 11 | total=$(( ($2 * $3) )) 12 | for (( j=0 ; j < $total ; j++ )) 13 | do 14 | printf -v padded_number "%05d" $j 15 | echo $padded_number 16 | ls ${1}/${padded_number}/output.vcf.gz -v >> ${1}/a.txt 17 | 18 | done 19 | vcf_list=`cat ${1}/a.txt` 20 | 21 | ${ABS_DIRECTORY}/../../applications/bcftools/bcftools concat $vcf_list > ${1}/${4}.vcf.gz 22 | 23 | rm ${1}/a.txt 24 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/config: -------------------------------------------------------------------------------- 1 | export LD_PRELOAD=/Open-Omics-Acceleration-Framework/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/libmimalloc.so.2.0:$LD_PRELOAD 2 | export INPUT_DIR=/reads/ 3 | export OUTPUT_DIR=/output/ 4 | export REF_DIR=/ref/ 5 | REF=GCA_000001405.15_GRCh38_no_alt_analysis_set.fna 6 | R1=HG001_R1.fastq.gz 7 | R2=HG001_R2.fastq.gz 8 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/extra_scripts/config: -------------------------------------------------------------------------------- 1 | export INPUT_DIR=/output/ 2 | export OUTPUT_DIR=/output/ 3 | export REF_DIR=/ref/ 4 | export REF=GCA_000001405.15_GRCh38_no_alt_analysis_set.fna 5 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/extra_scripts/merge_vcf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 4 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 5 | #echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 6 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 7 | #echo "Value of ABS_DIRECTORY: ${ABS_DIRECTORY}" 8 | 9 | echo $1 $2 $3 10 | #path ranks bins 11 | total=$(( ($2 * $3) )) 12 | for (( j=0 ; j < $total ; j++ )) 13 | do 14 | printf -v padded_number "%05d" $j 15 | echo $padded_number 16 | ls ${1}/${padded_number}/output.vcf.gz -v >> ${1}/a.txt 17 | 18 | done 19 | vcf_list=`cat ${1}/a.txt` 20 | 21 | bcftools concat $vcf_list > ${1}/output.vcf.gz 22 | 23 | rm ${1}/a.txt 24 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/extra_scripts/run_pipeline_ec2_part2.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash 2 | set -e 3 | source config 4 | 5 | #cd ../.. 6 | 7 | #source miniconda3/bin/activate dv_env 8 | 9 | echo "localhost" > hostfile 10 | 11 | num_nodes=`cat hostfile | wc -l` 12 | 13 | first_ip=`head -n 1 hostfile` 14 | 15 | #ssh ${first_ip} lscpu > compute_config 16 | lscpu > compute_config 17 | 18 | 19 | num_cpus_per_node=$(cat compute_config | grep -E '^CPU\(s\)' | awk '{print $2}') 20 | num_cpus_all_node=`expr ${num_cpus_per_node} \* ${num_nodes}` 21 | threads_per_core=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') 22 | echo "Total number of CPUs across all nodes: $num_cpus_all_node" 23 | 24 | 25 | num_physical_cores_all_nodes=`expr ${num_cpus_all_node} / ${threads_per_core}` 26 | 27 | num_physical_cores_per_nodes=`expr ${num_cpus_per_node} / ${threads_per_core}` 28 | 29 | 30 | while [ $num_physical_cores_per_nodes -ge 20 ] 31 | do 32 | num_physical_cores_per_nodes=`expr $num_physical_cores_per_nodes / 2` 33 | done 34 | 35 | num_physical_cores_per_rank=$num_physical_cores_per_nodes 36 | 37 | total_num_ranks=`expr ${num_physical_cores_all_nodes} / ${num_physical_cores_per_rank}` 38 | 39 | ranks_per_node=`expr ${total_num_ranks} / ${num_nodes}` 40 | 41 | sh run_pipeline_part2.sh ${total_num_ranks} ${ranks_per_node} ${REF} ${R1} ${R2} "sudo docker" 42 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/extra_scripts/run_pipeline_part2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 5 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 6 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPTINPUT_DIR_PATH}")" 7 | 8 | INDIR=$INPUT_DIR 9 | OUTDIR=$OUTPUT_DIR 10 | REFDIR=$REF_DIR 11 | #* ranks: Number of mpi process that we want the pipeline to run on 12 | #* threads/shards: parameters to different tools in the pipeline, calculated as below 13 | ppn=$2 14 | 15 | 16 | Sockets=$(cat compute_config | grep -E '^Socket\(s\)' | awk '{print $2}') #2 17 | Cores=$(cat compute_config | grep -E '^Core\(s\)' | awk '{print $4}') #56 18 | Thread=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') #2 19 | 20 | a=$(( $(( ${Cores}*${Thread}*${Sockets} / $ppn )) - 2*${Thread} )) #24 (Four threads are removed for IO) 21 | b=$(( $(( ${Cores}*${Sockets} )) / $ppn )) #14 22 | 23 | if [ $a -lt 1 ] 24 | then 25 | echo 'Number of cpus are less to run the pipeline.' 26 | exit 0 27 | fi 28 | 29 | N=$1 30 | PPN=$2 31 | CPUS=$a 32 | THREADS=$a 33 | SHARDS=$b 34 | REF=$(basename "$3") #Change to your reference file 35 | READ1=$(basename "$4") #Change your read files 36 | READ2=$(basename "$5") 37 | BINDING=socket 38 | Container=docker 39 | 40 | if [ $# -gt 5 ] 41 | then 42 | Container="$6" 43 | fi 44 | 45 | echo "Output directory: $OUTDIR" 46 | mkdir -p ${OUTDIR} 47 | #It is assumed that if reference file is .gz then it is converted using create_reference_index.sh or pcluster_reference_index.sh script. 48 | file_ext=${REF##*.} 49 | 50 | if [ "${file_ext}" = "gz" ] 51 | then 52 | REF=$(basename "$REF" .gz ) 53 | if ! [ -f $REFDIR/${REF} ]; then 54 | echo "File $REFDIR/${REF} does not exist." 55 | exit 0 56 | fi 57 | fi 58 | 59 | echo Starting run with $N ranks, $CPUS threads,$THREADS threads, $SHARDS shards, $PPN ppn. 60 | # -in -sindex are required only once for indexing. 61 | # Todo : Make index creation parameterized. 
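# Added commentary (illustrative, not part of the original script): with the example
# hardware noted in the inline comments above (2 sockets x 56 cores x 2 threads) and
# ppn=8 ranks per node, the arithmetic gives
#   CPUS = THREADS = a = (56*2*2)/8 - 2*2 = 28 - 4 = 24   (2*Thread CPUs reserved for IO)
#   SHARDS          = b = (56*2)/8        = 14
# The mpiexec launch below then starts N ranks, pinned per $BINDING, each running the
# DeepVariant stage (test_deep.py) with those per-rank resources. If a < 1 the script
# aborts earlier with "Number of cpus are less to run the pipeline."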
62 | mpiexec -bootstrap ssh -bind-to $BINDING -map-by $BINDING --hostfile hostfile -n $N -ppn $PPN python -u test_deep.py --inputdir $INDIR --output $OUTDIR $TEMPDIR --refdir $REFDIR --index $REF --cpus $CPUS --threads $THREADS --shards $SHARDS 2>&1 | tee ${OUTDIR}/log.txt 63 | 64 | #/opt/deepvariant/bin/run_deepvariant --model_type=WGS --ref=/refdir/'+ifile+' --reads=/tempdir/aln'+binstr+'.bam --output_vcf=/output/output.vcf.gz --intermediate_results_dir /tempdir/intermediate_results_dir'+binstr+' --num_shards='+nproc+' --dry_run=false --regions "'+bin_region[i*nranks+rank]+'"' 65 | 66 | #echo "Pipeline finished. Output vcf can be found at: $OUTPUT_DIR/output.vcf.gz" 67 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/run_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 4 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 5 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPTINPUT_DIR_PATH}")" 6 | 7 | INDIR=$INPUT_DIR 8 | OUTDIR=$OUTPUT_DIR 9 | REFDIR=$REF_DIR 10 | #* ranks: Number of mpi process that we want the pipeline to run on 11 | #* threads/shards: parameters to different tools in the pipeline, calculated as below 12 | ppn=$2 13 | 14 | 15 | Sockets=$(cat compute_config | grep -E '^Socket\(s\)' | awk '{print $2}') #2 16 | Cores=$(cat compute_config | grep -E '^Core\(s\)' | awk '{print $4}') #56 17 | Thread=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') #2 18 | 19 | a=$(( $(( ${Cores}*${Thread}*${Sockets} / $ppn )) - 2*${Thread} )) #24 (Four threads are removed for IO) 20 | b=$(( $(( ${Cores}*${Sockets} )) / $ppn )) #14 21 | 22 | if [ $a -lt 1 ] 23 | then 24 | echo 'Number of cpus are less to run the pipeline.' 25 | exit 0 26 | fi 27 | 28 | N=$1 29 | PPN=$2 30 | CPUS=$a 31 | THREADS=$a 32 | SHARDS=$b 33 | REF=$(basename "$3") #Change to your reference file 34 | READ1=$(basename "$4") #Change your read files 35 | READ2=$(basename "$5") 36 | BINDING=socket 37 | Container=docker 38 | 39 | if [ $# -gt 5 ] 40 | then 41 | Container="$6" 42 | fi 43 | 44 | echo "Output directory: $OUTDIR" 45 | mkdir -p ${OUTDIR} 46 | #It is assumed that if reference file is .gz then it is converted using create_reference_index.sh or pcluster_reference_index.sh script. 47 | file_ext=${REF##*.} 48 | 49 | if [ "${file_ext}" = "gz" ] 50 | then 51 | REF=$(basename "$REF" .gz ) 52 | if ! [ -f $REFDIR/${REF} ]; then 53 | echo "File $REFDIR/${REF} does not exist." 54 | exit 0 55 | fi 56 | fi 57 | 58 | echo Starting run with $N ranks, $CPUS threads,$THREADS threads, $SHARDS shards, $PPN ppn. 59 | # -in -sindex are required only once for indexing. 60 | # Todo : Make index creation parameterized. 61 | mpiexec -bootstrap ssh -bind-to $BINDING -map-by $BINDING --hostfile hostfile -n $N -ppn $PPN python -u test_pipeline_final.py --input $INDIR --output $OUTDIR $TEMPDIR --refdir $REFDIR --index $REF --read $READ1 $READ2 --cpus $CPUS --threads $THREADS --shards $SHARDS --container_tool "$Container" 2>&1 | tee ${OUTDIR}/log.txt 62 | 63 | echo "Pipeline finished. 
Output vcf can be found at: $OUTPUT_DIR/output.vcf.gz" 64 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/run_pipeline_part1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 5 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 6 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPTINPUT_DIR_PATH}")" 7 | 8 | INDIR=$INPUT_DIR 9 | OUTDIR=$OUTPUT_DIR 10 | REFDIR=$REF_DIR 11 | #* ranks: Number of mpi process that we want the pipeline to run on 12 | #* threads/shards: parameters to different tools in the pipeline, calculated as below 13 | ppn=$2 14 | 15 | 16 | Sockets=$(cat compute_config | grep -E '^Socket\(s\)' | awk '{print $2}') #2 17 | Cores=$(cat compute_config | grep -E '^Core\(s\)' | awk '{print $4}') #56 18 | Thread=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') #2 19 | 20 | a=$(( $(( ${Cores}*${Thread}*${Sockets} / $ppn )) - 2*${Thread} )) #24 (Four threads are removed for IO) 21 | b=$(( $(( ${Cores}*${Sockets} )) / $ppn )) #14 22 | 23 | if [ $a -lt 1 ] 24 | then 25 | echo 'Number of cpus are less to run the pipeline.' 26 | exit 0 27 | fi 28 | 29 | N=$1 30 | PPN=$2 31 | CPUS=$a 32 | THREADS=$a 33 | SHARDS=$b 34 | REF=$(basename "$3") #Change to your reference file 35 | READ1="$4" #Change your read files 36 | READ2="$5" 37 | BINDING=socket 38 | Container=docker 39 | 40 | if [ $# -gt 5 ] 41 | then 42 | Container="$6" 43 | fi 44 | 45 | echo "Output directory: $OUTDIR" 46 | mkdir -p ${OUTDIR} 47 | #It is assumed that if reference file is .gz then it is converted using create_reference_index.sh or pcluster_reference_index.sh script. 48 | file_ext=${REF##*.} 49 | 50 | if [ "${file_ext}" = "gz" ] 51 | then 52 | REF=$(basename "$REF" .gz ) 53 | if ! [ -f $REFDIR/${REF} ]; then 54 | echo "File $REFDIR/${REF} does not exist." 55 | exit 0 56 | fi 57 | fi 58 | 59 | echo Starting run with $N ranks, $CPUS threads,$THREADS threads, $SHARDS shards, $PPN ppn. 60 | # -in -sindex are required only once for indexing. 61 | # Todo : Make index creation parameterized. 62 | mpiexec -bootstrap ssh -bind-to $BINDING -map-by $BINDING --hostfile hostfile -n $N -ppn $PPN python -u test_pipeline_part1.py --input $INDIR --output $OUTDIR $TEMPDIR --refdir $REFDIR --index $REF --read $READ1 $READ2 --cpus $CPUS --threads $THREADS --shards $SHARDS --container_tool "$Container" 2>&1 | tee ${OUTDIR}/log_part1.txt 63 | 64 | #echo "Pipeline finished. 
Output vcf can be found at: $OUTPUT_DIR/output.vcf.gz" 65 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/basic_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | # All basic dev tools for Ubuntu 22.04 4 | 5 | sudo apt update 6 | 7 | #sudo apt -y upgrade 8 | 9 | sudo apt -y install make 10 | 11 | sudo apt -y install autoconf 12 | 13 | sudo apt -y install numactl 14 | 15 | sudo apt -y install build-essential 16 | 17 | sudo apt -y install zlib1g-dev 18 | 19 | sudo apt -y install libncurses5-dev 20 | 21 | sudo apt -y update 22 | 23 | #sudo apt -y upgrade 24 | 25 | sudo apt -y install libbz2-dev 26 | 27 | sudo apt -y install liblzma-dev 28 | 29 | sudo apt-get -qq -y update 30 | sudo apt-get -qq -y install wget 31 | 32 | # All dependencies for bcftools Docker 33 | echo "Installing Docker" 34 | sudo apt-get -qq -y install apt-transport-https ca-certificates curl gnupg-agent software-properties-common 35 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 36 | 37 | 38 | sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" 39 | 40 | sudo apt-get -qq -y update 41 | sudo apt-get -qq -y install docker-ce 42 | sudo systemctl start docker 43 | 44 | sudo docker --version 45 | 46 | echo "Running Docker installation hello world!! test" 47 | sudo docker run hello-world 48 | 49 | #echo "Creating and activating a conda environment" 50 | #source setup_env.sh deepvaraint_env 51 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/build_deepvariant_docker_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | 5 | 6 | #Pre-req: Docker installation 7 | # Check if Docker is installed 8 | if [ "$(command -v docker)" ]; then 9 | # Docker is installed 10 | echo "Docker is installed." 11 | 12 | # Check Docker version 13 | docker_version=$("docker" --version | awk '{print $3}') 14 | echo "Docker version: $docker_version" 15 | else 16 | # Docker is not installed 17 | echo "Docker is not installed on this system." 18 | exit 1 19 | fi 20 | 21 | 22 | # Build docker 23 | 24 | # This will save deepvariant images 25 | cd ../../../../applications/deepvariant 26 | sudo docker build -t deepvariant . 27 | 28 | # check the built and print the image ID 29 | 30 | sudo docker images | grep "deepvariant:latest" 31 | 32 | #save image(~7 GB) to tar file if you are using multiple nodes. 33 | 34 | echo "Saving deepvariant:latest image as deepvariant.tar..." 
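# Added commentary: the ~7 GB deepvariant.tar is only needed for multi-node runs;
# compute nodes import it with "sudo docker load -i deepvariant.tar" (see
# pcluster_compute_node_setup.sh). For a single-node EC2 run the locally built
# deepvariant:latest image is used directly and the save step can be skipped.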
35 | cd - # Move to pipelines/deepvariant 36 | sudo docker save -o deepvariant.tar deepvariant:latest 37 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/build_tools.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | # Prerequisites: conda env activated 4 | 5 | 6 | # Clone the repo: https://github.com/IntelLabs/Open-Omics-Acceleration-Framework.git 7 | 8 | # git clone --recursive https://github.com/IntelLabs/Open-Omics-Acceleration-Framework.git 9 | 10 | cd ../../../../../Open-Omics-Acceleration-Framework 11 | WDIR=`pwd` 12 | 13 | cd ${WDIR}/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/ 14 | 15 | ls 16 | 17 | 18 | # Pre-req: conda env 19 | source setup_env.sh dv_env 20 | 21 | 22 | # compile bwa-mem2 23 | echo "Build bwa-mem2" 24 | cd ${WDIR}/applications/bwa-mem2 25 | make multi 26 | if [ -e "${WDIR}/applications/bwa-mem2/bwa-mem2" ]; then 27 | echo "bwa-mem2 build successful" 28 | else 29 | echo "Error!! bwa-mem2 build failed" 30 | fi 31 | 32 | #make install #uncomment this for installation 33 | 34 | # compile htslib 35 | cd ${WDIR}/applications/htslib 36 | autoreconf -i # Build the configure script and install files it uses 37 | ./configure # Optional but recommended, for choosing extra functionality 38 | make 39 | #make install #uncomment this for installation 40 | 41 | # compile bcftools 42 | cd ${WDIR}/applications/bcftools 43 | # The following is optional: 44 | # autoheader && autoconf && ./configure --enable-libgsl --enable-perl-filters 45 | make 46 | #make install #uncomment this for installation 47 | 48 | # compile samtools 49 | cd ${WDIR}/applications/samtools 50 | autoheader 51 | autoconf -Wno-syntax 52 | chmod 775 configure 53 | ./configure # Needed for choosing optional functionality 54 | make 55 | #make install #uncomment this for installation 56 | 57 | 58 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/config: -------------------------------------------------------------------------------- 1 | export LD_PRELOAD=/Open-Omics-Acceleration-Framework/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/libmimalloc.so.2.0:$LD_PRELOAD 2 | export INPUT_DIR=/path-to-read-datasets/ 3 | export OUTPUT_DIR=/path-to-output-directory/ 4 | export REF_DIR=/path-to-ref-directory/ 5 | REF=ref.fasta 6 | R1=R1.fastq.gz 7 | R2=R2.fastq.gz 8 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/create_reference_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 4 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 5 | #echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 6 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 7 | 8 | cd $ABS_DIRECTORY 9 | source config 10 | filename=${REF} 11 | file_ext=${filename##*.} 12 | file_name_without_extension=$(basename "$filename" .gz ) 13 | 14 | 15 | if [ ${file_ext} == 'gz' ] 16 | then 17 | echo "Refecence file is decompressing..." 
18 | gzip -d ${REF_DIR}/${filename} 19 | REF=${file_name_without_extension} 20 | fi 21 | ref=${REF_DIR}/${REF} 22 | mkdir -p ${OUTPUT_DIR} 23 | 24 | echo "Checking the index files for $ref" 25 | ls ${ref}* 26 | 27 | # mem2 index 28 | echo "Creating FM-index for the reference sequence ${ref}" 29 | cd ../../../../applications/bwa-mem2 30 | ./bwa-mem2 index $ref &> ${OUTPUT_DIR}/bwa_mem2_index_log 31 | cd - &> /dev/null 32 | 33 | 34 | # samtool idfai index 35 | echo "Creating fai index for the reference sequence ${ref}" 36 | cd ../../../../applications/samtools 37 | ./samtools faidx $ref &> ${OUTPUT_DIR}/samtools_fai_log 38 | cd - &> /dev/null 39 | 40 | 41 | echo "The list of all index files created." 42 | ls ${ref}* 43 | if [ -z $1 ] 44 | then 45 | echo "Index files are created." 46 | else 47 | echo "Index files are created release instance by typing: 'scancel $1' " 48 | fi 49 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/deepvariant_ec2_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo "Step 1: Basic installation.." 5 | bash basic_setup.sh 6 | 7 | echo "Step 2: Building applications.." 8 | bash build_tools.sh 9 | 10 | echo "Step 3: Building Deepvaraint image.." 11 | bash build_deepvariant_docker_image.sh 12 | 13 | echo "Setup done!!" 14 | 15 | echo "Next step, modify the \"config\" file according to the reference sequence and the read datasets, and run \"bash create_reference_index.sh \"" 16 | 17 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/deepvariant_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo "Step 1: Basic installation.." 5 | bash basic_setup.sh 6 | 7 | echo "Step 2: Building applications.." 8 | bash build_tools.sh 9 | 10 | echo "Step 3: Building and saving Deepvaraint image.." 11 | bash build_deepvariant_docker_image.sh 12 | 13 | echo "Setup done!!" 14 | 15 | echo "Next step, modify the \"config\" file according to the reference sequence and the read datasets, and run \"bash create_reference_index.sh \"" 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/pcluster_compute_node_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | WDIR=`pwd` 4 | num_compute_nodes=$1 5 | allocation_time="02:00:00" 6 | 7 | if [ -z $2 ] 8 | then 9 | echo "Allocating compute nodes by default for 2 hours" 10 | else 11 | echo "Allocating compute nodes for $2 hours" 12 | allocation_time=$2 13 | fi 14 | 15 | # Allocate compute nodes 16 | salloc --nodes=${num_compute_nodes} --ntasks-per-node=1 --wait-all-nodes=1 --time=${allocation_time} --no-shell &> tmp_salloc && grep "Granted job allocation" tmp_salloc | cut -d" " -f5 &> tmp_jobid 17 | 18 | jid=`cat tmp_jobid | head -n 1` 19 | 20 | rm tmp_salloc tmp_jobid 21 | 22 | srun --jobid=$jid hostname > ../../hostfile 23 | 24 | echo "Cluster alloccation done!!" 
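# Added commentary: at this point hostfile lists the allocated compute nodes; the loop
# below ssh-es into each one to run basic_setup.sh (build tools + docker) and to load
# the previously saved deepvariant.tar image. Note the per-node setup commands are
# backgrounded with "&", so setup proceeds in parallel across nodes.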
25 | cat ../../hostfile 26 | 27 | for i in `cat ../../hostfile` 28 | do 29 | echo $i 30 | ssh $i "bash ${WDIR}/basic_setup.sh && sudo docker load -i ${WDIR}/deepvariant.tar && sudo docker images && echo \"setup done for $i. Press enter to continue..\" " & 31 | done 32 | 33 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/pcluster_example_config: -------------------------------------------------------------------------------- 1 | # This example configuration file is created for parallel cluster version 2.11.9 2 | # This file resides in the ~/.parallelcluster/config on local machine 3 | [cluster default] 4 | key_name = 5 | vpc_settings = public 6 | ebs_settings = myebs 7 | compute_instance_type = c7i.48xlarge #change as per your need 8 | master_instance_type = c6i.4xlarge #change as per your need 9 | master_root_volume_size = 200 #change as per your need 10 | maintain_initial_size = false 11 | initial_queue_size = 0 12 | max_queue_size = 32 #maximum number of compute nodes in the cluster 13 | placement_group = DYNAMIC 14 | placement = cluster 15 | scaling_settings = custom 16 | tags = {"name": "xyz"} 17 | base_os = ubuntu2004 #ubuntu gives best performance 18 | scheduler = slurm 19 | enable_efa = compute 20 | enable_intel_hpc_platform = false 21 | 22 | [scaling custom] 23 | scaledown_idletime=10 24 | 25 | [vpc public] 26 | vpc_id = vpc-xxxxxxxxxx #get vpc id from your aws region 27 | master_subnet_id = subnet-xxxxxxxxxxxxx #get subnet id from your aws region 28 | ssh_from = 172.17.0.1/1 29 | 30 | [ebs myebs] 31 | shared_dir = /sharedgp 32 | volume_size = 1024 33 | volume_type = io2 34 | volume_iops = 64000 35 | 36 | [aliases] 37 | ssh = ssh {CFN_USER}@{MASTER_IP} {ARGS} 38 | 39 | [aws] 40 | aws_region_name = us-west-2 #change as per your need 41 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/pcluster_reference_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set –e 3 | 4 | WDIR=`pwd` 5 | num_compute_nodes=1 6 | allocation_time="03:00:00" 7 | 8 | if [ -z $2 ] 9 | then 10 | echo "Allocating compute nodes by default for 2 hours" 11 | else 12 | echo "Allocating compute nodes for $2 hours" 13 | allocation_time=$2 14 | fi 15 | 16 | # Allocate compute nodes 17 | salloc --nodes=${num_compute_nodes} --ntasks-per-node=1 --wait-all-nodes=1 --time=${allocation_time} --no-shell &> tmp_salloc && grep "Granted job allocation" tmp_salloc | cut -d" " -f5 &> tmp_jobid 18 | 19 | jid=`cat tmp_jobid | head -n 1` 20 | 21 | rm tmp_salloc tmp_jobid 22 | 23 | srun --jobid=$jid hostname > ../../hostfile 24 | 25 | echo "Cluster allocation done!!" 26 | cat ../../hostfile 27 | 28 | for i in `cat ../../hostfile` 29 | do 30 | echo $i 31 | ssh $i "bash ${WDIR}/basic_setup.sh && bash ${WDIR}/create_reference_index.sh ${jid}" & 32 | done 33 | 34 | 35 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/run_pipeline_ec2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | source config 4 | 5 | cd ../.. 
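# Activate the pipeline conda env, record this host as the only hostfile entry, and derive
# the MPI rank layout (total ranks, ranks per node, cores per rank) from the local lscpu
# output before invoking run_pipeline.sh.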
6 | 7 | source miniconda3/bin/activate dv_env 8 | 9 | hostname > hostfile 10 | 11 | num_nodes=`cat hostfile | wc -l` 12 | 13 | first_ip=`head -n 1 hostfile` 14 | 15 | #ssh ${first_ip} lscpu > compute_config 16 | lscpu > compute_config 17 | 18 | 19 | num_cpus_per_node=$(cat compute_config | grep -E '^CPU\(s\)' | awk '{print $2}') 20 | num_cpus_all_node=`expr ${num_cpus_per_node} \* ${num_nodes}` 21 | threads_per_core=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') 22 | echo "Total number of CPUs across all nodes: $num_cpus_all_node" 23 | 24 | 25 | num_physical_cores_all_nodes=`expr ${num_cpus_all_node} / ${threads_per_core}` 26 | 27 | num_physical_cores_per_nodes=`expr ${num_cpus_per_node} / ${threads_per_core}` 28 | 29 | 30 | while [ $num_physical_cores_per_nodes -ge 20 ] 31 | do 32 | num_physical_cores_per_nodes=`expr $num_physical_cores_per_nodes / 2` 33 | done 34 | 35 | num_physical_cores_per_rank=$num_physical_cores_per_nodes 36 | 37 | total_num_ranks=`expr ${num_physical_cores_all_nodes} / ${num_physical_cores_per_rank}` 38 | 39 | ranks_per_node=`expr ${total_num_ranks} / ${num_nodes}` 40 | 41 | sh run_pipeline.sh ${total_num_ranks} ${ranks_per_node} ${REF} ${R1} ${R2} "sudo docker" 42 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/run_pipeline_ec2_part1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | source config 4 | 5 | cd ../.. 6 | 7 | #source miniconda3/bin/activate dv_env 8 | 9 | echo "localhost" > hostfile 10 | 11 | num_nodes=`cat hostfile | wc -l` 12 | 13 | first_ip=`head -n 1 hostfile` 14 | 15 | #ssh ${first_ip} lscpu > compute_config 16 | lscpu > compute_config 17 | 18 | 19 | num_cpus_per_node=$(cat compute_config | grep -E '^CPU\(s\)' | awk '{print $2}') 20 | num_cpus_all_node=`expr ${num_cpus_per_node} \* ${num_nodes}` 21 | threads_per_core=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') 22 | echo "Total number of CPUs across all nodes: $num_cpus_all_node" 23 | 24 | 25 | num_physical_cores_all_nodes=`expr ${num_cpus_all_node} / ${threads_per_core}` 26 | 27 | num_physical_cores_per_nodes=`expr ${num_cpus_per_node} / ${threads_per_core}` 28 | 29 | 30 | while [ $num_physical_cores_per_nodes -ge 20 ] 31 | do 32 | num_physical_cores_per_nodes=`expr $num_physical_cores_per_nodes / 2` 33 | done 34 | 35 | num_physical_cores_per_rank=$num_physical_cores_per_nodes 36 | 37 | total_num_ranks=`expr ${num_physical_cores_all_nodes} / ${num_physical_cores_per_rank}` 38 | 39 | ranks_per_node=`expr ${total_num_ranks} / ${num_nodes}` 40 | 41 | sh run_pipeline_part1.sh ${total_num_ranks} ${ranks_per_node} ${REF} ${R1} ${R2} "sudo docker" 42 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/run_pipeline_pcluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | source config 4 | 5 | cd ../.. 
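# Same rank-layout derivation as the single-node EC2 variant, except that lscpu is sampled
# over ssh from the first host in the hostfile produced by pcluster_compute_node_setup.sh.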
6 | 7 | source miniconda3/bin/activate dv_env 8 | 9 | 10 | num_nodes=`cat hostfile | wc -l` 11 | 12 | first_ip=`head -n 1 hostfile` 13 | 14 | ssh ${first_ip} lscpu > compute_config 15 | #lscpu > compute_config 16 | 17 | 18 | num_cpus_per_node=$(cat compute_config | grep -E '^CPU\(s\)' | awk '{print $2}') 19 | num_cpus_all_node=`expr ${num_cpus_per_node} \* ${num_nodes}` 20 | threads_per_core=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') 21 | echo "Total number of CPUs across all nodes: $num_cpus_all_node" 22 | 23 | 24 | num_physical_cores_all_nodes=`expr ${num_cpus_all_node} / ${threads_per_core}` 25 | 26 | num_physical_cores_per_nodes=`expr ${num_cpus_per_node} / ${threads_per_core}` 27 | 28 | 29 | while [ $num_physical_cores_per_nodes -ge 20 ] 30 | do 31 | num_physical_cores_per_nodes=`expr $num_physical_cores_per_nodes / 2` 32 | done 33 | 34 | num_physical_cores_per_rank=$num_physical_cores_per_nodes 35 | 36 | total_num_ranks=`expr ${num_physical_cores_all_nodes} / ${num_physical_cores_per_rank}` 37 | 38 | ranks_per_node=`expr ${total_num_ranks} / ${num_nodes}` 39 | 40 | sh run_pipeline.sh ${total_num_ranks} ${ranks_per_node} ${REF} ${R1} ${R2} "sudo docker" 41 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/cluster/config: -------------------------------------------------------------------------------- 1 | export LD_PRELOAD=/Open-Omics-Acceleration-Framework/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/libmimalloc.so.2.0:$LD_PRELOAD 2 | export INPUT_DIR=/path-to-read-datasets/ 3 | export OUTPUT_DIR=/path-to-output-directory/ 4 | export REF_DIR=/path-to-ref-directory/ 5 | REF=ref.fasta 6 | R1=R1.fastq.gz 7 | R2=R2.fastq.gz 8 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/cluster/create_reference_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 4 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 5 | #echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 6 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 7 | 8 | cd $ABS_DIRECTORY 9 | source config 10 | filename=${REF} 11 | file_ext=${filename##*.} 12 | file_name_without_extension=$(basename "$filename" .gz ) 13 | 14 | 15 | if [ ${file_ext} == 'gz' ] 16 | then 17 | echo "Refecence file is decompressing..." 18 | gzip -d ${REF_DIR}/${filename} 19 | REF=${file_name_without_extension} 20 | fi 21 | 22 | 23 | ref=${REF_DIR}/${REF} 24 | mkdir -p ${OUTPUT_DIR} 25 | echo "Checking the index files for $ref" 26 | ls ${ref}* 27 | 28 | # mem2 index 29 | echo "Creating FM-index for the reference sequence ${ref}" 30 | cd ../../../../applications/bwa-mem2 31 | ./bwa-mem2 index $ref &> ${OUTPUT_DIR}/bwa_mem2_index_log 32 | cd - &> /dev/null 33 | 34 | 35 | # samtool idfai index 36 | echo "Creating fai index for the reference sequence ${ref}" 37 | cd ../../../../applications/samtools 38 | ./samtools faidx $ref &> ${OUTPUT_DIR}/samtools_fai_log 39 | cd - &> /dev/null 40 | 41 | 42 | echo "The list of all index files created." 
43 | ls ${ref}* 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/cluster/load_deepvariant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 4 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 5 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 6 | 7 | Container=docker 8 | 9 | [[ $# -gt 0 ]] && Container="$1" 10 | 11 | for i in `cat hostfile` 12 | do 13 | echo $i 14 | ssh $i "${Container} load -i ${ABS_DIRECTORY}/../../deepvariant.tar" & 15 | done 16 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/cluster/run_pipeline_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source config 5 | 6 | Container=docker 7 | 8 | if [ $# -gt 0 ] 9 | then 10 | Container="$1" 11 | fi 12 | 13 | mv hostfile ../../ 14 | cd ../.. 15 | 16 | source miniconda3/bin/activate dv_env 17 | 18 | num_nodes=`cat hostfile | wc -l` 19 | 20 | first_ip=`head -n 1 hostfile` 21 | 22 | ssh ${first_ip} lscpu > compute_config 23 | #lscpu > compute_config 24 | 25 | 26 | num_cpus_per_node=$(cat compute_config | grep -E '^CPU\(s\)' | awk '{print $2}') 27 | num_cpus_all_node=`expr ${num_cpus_per_node} \* ${num_nodes}` 28 | threads_per_core=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') 29 | echo "Total number of CPUs across all nodes: $num_cpus_all_node" 30 | 31 | 32 | num_physical_cores_all_nodes=`expr ${num_cpus_all_node} / ${threads_per_core}` 33 | 34 | num_physical_cores_per_nodes=`expr ${num_cpus_per_node} / ${threads_per_core}` 35 | 36 | 37 | while [ $num_physical_cores_per_nodes -ge 20 ] 38 | do 39 | num_physical_cores_per_nodes=`expr $num_physical_cores_per_nodes / 2` 40 | done 41 | 42 | num_physical_cores_per_rank=$num_physical_cores_per_nodes 43 | 44 | total_num_ranks=`expr ${num_physical_cores_all_nodes} / ${num_physical_cores_per_rank}` 45 | 46 | ranks_per_node=`expr ${total_num_ranks} / ${num_nodes}` 47 | 48 | sh run_pipeline.sh ${total_num_ranks} ${ranks_per_node} ${REF} ${R1} ${R2} ${Container} 49 | 50 | echo "Pipeline finished. Output vcf can be found at: $OUTPUT_DIR/output.vcf.gz" 51 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/cluster/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #ENV=/nfs/work04/ashish/envs/new_env/bin/activate 5 | #source $ENV 6 | 7 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 8 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 9 | #echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 10 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 11 | #echo "Value of ABS_DIRECTORY: ${ABS_DIRECTORY}" 12 | 13 | export LD_PRELOAD=$LD_PRELOAD:"${ABS_DIRECTORY}/../../libmimalloc.so.2.0" 14 | #echo $LD_PRELOAD 15 | 16 | Container=docker 17 | 18 | if [ $# -gt 0 ] 19 | then 20 | Container="$1" 21 | fi 22 | 23 | # This will save deepvariant images 24 | cd ${ABS_DIRECTORY}/../../../../applications/deepvariant 25 | $Container build -t deepvariant . 
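# If the build host is behind a proxy, pass http_proxy/https_proxy as --build-arg values,
# as in the commented example on the next line.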
26 | # docker build --build-arg http_proxy="http://proxy-us.abc.com:123" --build-arg https_proxy="http://proxy-us.abc.com:123" --build-arg no_proxy="127.0.0.1,localhost" -t deepvariant . 27 | 28 | 29 | #save image(~7 GB) to tar file if you are using multiple nodes. 30 | cd ${ABS_DIRECTORY} 31 | $Container save -o ${ABS_DIRECTORY}/../../deepvariant.tar deepvariant:latest 32 | 33 | 34 | cd ${ABS_DIRECTORY}/../../../../applications/bwa-mem2 35 | #make CXX=icpc multi 36 | make 37 | #make install #uncomment this for installation 38 | 39 | cd ${ABS_DIRECTORY}/../../../../applications/htslib 40 | autoreconf -i # Build the configure script and install files it uses 41 | ./configure # Optional but recommended, for choosing extra functionality 42 | make 43 | #make install #uncomment this for installation 44 | 45 | cd ${ABS_DIRECTORY}/../../../../applications/bcftools 46 | # The following is optional: 47 | # autoheader && autoconf && ./configure --enable-libgsl --enable-perl-filters 48 | make 49 | #make install #uncomment this for installation 50 | 51 | 52 | cd ${ABS_DIRECTORY}/../../../../applications/samtools 53 | autoheader 54 | autoconf -Wno-syntax 55 | chmod 775 configure 56 | ./configure # Needed for choosing optional functionality 57 | make 58 | #make install #uncomment this for installation 59 | cd ${ABS_DIRECTORY} 60 | 61 | bash load_deepvariant.sh $Container 62 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/setup_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set –e 3 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 4 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 5 | #echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 6 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 7 | #echo "Value of ABS_DIRECTORY: ${ABS_DIRECTORY}" 8 | mkdir -p ./miniconda3 9 | ln -s ${ABS_DIRECTORY}/miniconda3 ~/miniconda3 10 | 11 | echo "Downloading and setting up miniconda..." 12 | wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh 13 | bash ./Miniconda3-py39_23.3.1-0-Linux-x86_64.sh -b -u -p ~/miniconda3 14 | echo "Downloading and setting up miniconda...DONE" 15 | 16 | echo "Seeting up conda env named with given argument" 17 | miniconda3/bin/conda env create --name $1 -f environment.yml 18 | echo "Seeting up conda env named new_env...DONE" 19 | 20 | echo "Activating conda env..." 21 | source miniconda3/bin/activate $1 22 | 23 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | FROM python:3.10 3 | 4 | RUN apt-get update 5 | RUN apt-get install -y build-essential 6 | RUN apt-get install -y git 7 | RUN apt-get install -y vim 8 | RUN apt -y install autoconf 9 | RUN apt -y install numactl 10 | RUN apt -y install zlib1g-dev 11 | RUN apt -y install libncurses5-dev 12 | RUN apt -y install libbz2-dev 13 | RUN apt -y install liblzma-dev 14 | RUN apt-get -qq -y install wget gcc-11 g++-11 15 | RUN unlink /usr/bin/gcc && unlink /usr/bin/g++ 16 | RUN ln -s /usr/bin/gcc-11 /usr/bin/gcc && ln -s /usr/bin/g++-11 /usr/bin/g++ 17 | 18 | WORKDIR /app 19 | RUN git clone --recursive https://github.com/IntelLabs/Open-Omics-Acceleration-Framework.git 20 | #COPY /scratch/users/mvasimud/workspace/Open-Omics-Acceleration-Framework . 
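# install.sh builds the pipeline tools (bwa-mem2, mm2-fast, STAR, htslib, samtools, bwa-meth)
# and the fq2bam conda env inside the image; the ENV lines below put them on PATH.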
21 | WORKDIR Open-Omics-Acceleration-Framework/pipelines/fq2sortedbam 22 | 23 | #RUN bash install.sh onprem 24 | RUN bash install.sh onprem 25 | ENV PATH="/app/Open-Omics-Acceleration-Framework/applications/samtools:$PATH" 26 | ENV PATH="/app/Open-Omics-Acceleration-Framework/applications/bwa-mem2:$PATH" 27 | ENV PATH="/app/Open-Omics-Acceleration-Framework/applications/bwa-meth:$PATH" 28 | ENV PATH="/app/Open-Omics-Acceleration-Framework/applications/mm2-fast:$PATH" 29 | ENV PATH="/app/Open-Omics-Acceleration-Framework/applications/STAR/source:$PATH" 30 | ENV PATH="/app/Open-Omics-Acceleration-Framework/applications/samtools/samtools:$PATH" 31 | 32 | ENV PATH="/app/Open-Omics-Acceleration-Framework/pipelines/fq2sortedbam/miniforge3/envs/fq2bam/bin/:/app/Open-Omics-Acceleration-Framework/pipelines/fq2sortedbam/miniforge3/bin/:$PATH" 33 | ENV LD_PRELOAD="/app/Open-Omics-Acceleration-Framework/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/libmimalloc.so.2.0" 34 | 35 | RUN mkdir /input 36 | RUN mkdir /out 37 | RUN mkdir /refdir 38 | RUN mkdir /tempdir 39 | 40 | CMD ["/bin/bash"] 41 | SHELL ["source", "~/miniforge3/bin/activate", "fq2bam", "/bin/bash", "-c"] 42 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/README.md: -------------------------------------------------------------------------------- 1 | ## fq2SortedBAM: OpenOmics' Genomics Secondary Analysis Pipeline 2 | ### Overview: 3 | The pipeline takes input fastq files and produces a sorted BAM file through the following stages: 4 | 1. Sequence alignment: bwa-mem2 for short reads, mm2-fast (accelerated Minimap2) for long reads (PacBio, ONT) 5 | 2. SAM sorting (using SAMTools) 6 | 7 | ### Modes: 8 | fq2SortedBAM supports 4 different modes: 9 | 1. ```sortedbam```: takes fastq read files and a reference genome as input and outputs a single sorted BAM file 10 | 2. ```flatmode```: takes fastq read files and a reference genome as input and outputs multiple unsorted SAM files (one per rank) 11 | 3. ```fqprocessonly```: custom mode, not for general use 12 | 4. ```multifq```: custom mode, not for general use 13 | 14 | ## Sequence alignment tools in the pipeline 15 | Both sortedbam and flatmode support bwa-mem2 for short-read alignment and mm2-fast (an accelerated version of minimap2) for long-read alignment. 16 | The aligner is selected with the _--read_type_ command-line option: 17 | 1. [default] selects bwa-mem2 when _--read_type=short_ 18 | 2. selects mm2-fast when _--read_type=long_ (note: mm2-fast runs with the '-a' command-line option by default, producing SAM output) 19 | 3. Both alignment tools support all of the original tools' command-line options; these parameters can be passed through the _--params_ command-line option of fq2SortedBAM 20 | 21 | 22 | ## Use Docker 23 | ### Docker build: 24 | ``` 25 | wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 26 | tar -xzf Source_code_with_submodules.tar.gz 27 | cp Open-Omics-Acceleration-Framework/pipelines/fq2sortedbam/Dockerfile . 28 | cp Open-Omics-Acceleration-Framework/pipelines/fq2sortedbam/config.yaml . 29 | ``` 30 | ```bash 31 | docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t fq2bam . 32 | docker save fq2bam:latest > fq2bam.tar ## optional; only needed to copy the image to other machines
33 | ``` 34 | 35 | 36 | ### Docker run: 37 | ``` 38 | docker load -i fq2bam.tar ## optional: skip if the image was built on this machine or is already loaded 39 | docker run -v \<readsdir\>:/readsdir -v \<outdir\>:/outdir -v \<refdir\>:/refdir fq2bam:latest python run_fq2sortedbam.py --ref /refdir/\<ref\> --reads /readsdir/\<read1\> /readsdir/\<read2\> --output /outdir/ 40 | ``` 41 | Note: 42 | \<readsdir\>: Location of the local directory containing the read files read1 & read2 43 | \<refdir\>: Location of the local directory containing the reference sequence file ref 44 | \<outdir\>: Location of the local directory for the output SAM/BAM files 45 | 46 | 47 | ## Use Source Code 48 | ### Installation: 49 | ``` 50 | wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 51 | tar -xzf Source_code_with_submodules.tar.gz 52 | cd Open-Omics-Acceleration-Framework/pipelines/fq2sortedbam/ 53 | bash install.sh onprem ## onprem mode: manually install the dependencies listed in basic_setup_ubuntu.sh first, as that script needs sudo access 54 | ``` 55 | 56 | ### Run: 57 | ``` 58 | python run_fq2sortedbam.py --ref \<ref\> --reads \<read1\> \<read2\> --output \<outdir\> 59 | ``` 60 | 61 | ## General Notes: 62 | 1. The individual pipeline tools are present in the applications folder 63 | 2. To understand the various parameters of these tools, refer to their ```man``` pages 64 | 3. You can set the parameters of these tools using the ```--params``` command-line option 65 | 4. See all the parameters of fq2sortedbam using the "-h" option 66 | 5. fq2sortedbam supports the following aligners: 67 | a. DNA short-read alignment using bwa-mem2 68 | b. DNA long-read alignment using mm2-fast (minimap2) 69 | c. RNA short-read alignment using the STAR aligner 70 | d. bwa-meth-based alignment 71 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/basic_setup_ubuntu.sh: -------------------------------------------------------------------------------- 1 | # All basic dev tools for Ubuntu 22.04 2 | 3 | sudo apt update 4 | 5 | #sudo apt -y install make 6 | sudo apt -y install build-essential 7 | sudo apt -y install autoconf 8 | sudo apt -y install numactl 9 | sudo apt -y install zlib1g-dev 10 | sudo apt -y install libncurses5-dev 11 | sudo apt -y install libbz2-dev 12 | sudo apt -y install liblzma-dev 13 | sudo apt -y install git 14 | sudo apt-get -qq -y install wget 15 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/config.yaml: -------------------------------------------------------------------------------- 1 | bwa: 2 | dindex: 'False' 3 | params: +R "@RG\tID:RG1\tSM:RGSN1" 4 | rindex: 'False' 5 | dataset: 6 | index: GCA_000001405.15_GRCh38_no_alt_analysis_set.fna 7 | input: /input/ 8 | outfile: short.se.sam 9 | output: /out/ 10 | read1: HG001.novaseq.pcr-free.30x.R1.fastq.gz 11 | read2: HG001.novaseq.pcr-free.30x.R2.fastq.gz 12 | read_type: short 13 | refdir: /refdir/ 14 | tempdir: "" 15 | fqprocess: 16 | bam_size: '5' 17 | barcode_orientation: FIRST_BP_RC 18 | output_format: FASTQ 19 | prefix: multiome-practice-may15_arcgtf 20 | read3: '' 21 | read_structure: 16C 22 | readi1: '' 23 | sample_id: '' 24 | suffix: trimmed_adapters.fastq.gz 25 | whitelist: whitelist.txt 26 | mm2: 27 | params: ' -ax map-hifi ' 28 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/doc.txt: -------------------------------------------------------------------------------- 1 | # Pipeline Usage Guide 2 | 3 | This document explains the command-line arguments for running the pipeline.
Below is a detailed description of each argument and its usage. 4 | 5 | --- 6 | 7 | ## Arguments 8 | 9 | ### General Options 10 | - `--ref REF` 11 | - **Description**: Reference genome path. For BWA, this pipeline expects the index to be present at this location. If the index is not available, it can be generated using the `--rindex` option. 12 | 13 | - `--reads READS [READS ...]` 14 | - **Description**: Input reads. The pipeline expects both reads to be at the same location. 15 | 16 | - `--tempdir TEMPDIR` 17 | - **Description**: Directory for storing intermediate data. 18 | 19 | - `--output OUTPUT` 20 | - **Description**: Prefix location for the output file(s) name. 21 | 22 | ### Processing Options 23 | - `--simd SIMD` 24 | - **Description**: Defaults to `avx512` mode. Use `sse` for BWA SSE mode. 25 | 26 | - `--read3 READ3` 27 | - **Description**: Name of `R3` files (for `fqprocess`), separated by spaces. 28 | 29 | - `--readi1 READI1` 30 | - **Description**: Name of `I1` files (for `fqprocess`), separated by spaces. 31 | 32 | - `--prefix PREFIX` 33 | - **Description**: Prefix for processed `R1` and `R3` files for `bwa-mem2`. 34 | 35 | - `--suffix SUFFIX` 36 | - **Description**: Suffix for processed `R1` and `R3` files for `bwa-mem2`. 37 | 38 | - `--whitelist WHITELIST` 39 | - **Description**: 10x whitelist file. 40 | 41 | - `--read_structure READ_STRUCTURE` 42 | - **Description**: Read structure. 43 | 44 | - `--barcode_orientation BARCODE_ORIENTATION` 45 | - **Description**: Barcode orientation. 46 | 47 | - `--sample_id SAMPLE_ID` 48 | - **Description**: Sample ID. 49 | 50 | - `--output_format OUTPUT_FORMAT` 51 | - **Description**: Output format. 52 | 53 | ### File Size and Execution Mode 54 | - `-b BAM_SIZE`, `--bam_size BAM_SIZE` 55 | - **Description**: BAM file size in GB. 56 | 57 | - `--mode MODE` 58 | - **Description**: Execution mode options: 59 | - `flatmode`: Just BWA without sorting, creates SAM files equal to the number of ranks created. 60 | - `sortedbam`: BWA + SAM sorting steps, creating a single BAM file as output. 61 | - `fqprocessonly`: Custom processing of fastq files. 62 | - `multifq`: Custom multifastq processing. 63 | 64 | ### Read Type Options 65 | - `--read_type READ_TYPE` 66 | - **Description**: Specifies the type of reads for alignment: 67 | - `short`: BWA-MEM2 alignment with short reads (default). 68 | - `long`: MM2-fast alignment with long reads. 69 | 70 | ### Indexing Options 71 | - `--rindex` 72 | - **Description**: Enables BWA-MEM2 index generation. Use this option if the index is not present. 73 | 74 | - `--dindex` 75 | - **Description**: Creates a reference genome FAI index. Use this option if the reference FAI is not present. 76 | 77 | ### Additional Options 78 | - `--profile` 79 | - **Description**: Enables profiling. 80 | 81 | - `--not_keep_unmapped` 82 | - **Description**: Rejects unmapped reads at the end of the sorted BAM file. If not specified, unmapped reads are retained. 83 | 84 | - `--keep_sam` 85 | - **Description**: Retains intermediate SAM files generated by the alignment tool for each rank. SAM files are named as `aln{rank:04d}.sam`. 86 | 87 | - `--params PARAMS` 88 | - **Description**: Allows supplying various parameters to BWA-MEM2 (except the `-t` parameter for threads). Example: 89 | ``` 90 | --params '-R "@RG\tID:RG1\tSM:RGSN1"' 91 | ``` 92 | This enables read grouping. 93 | 94 | - `--sso` 95 | - **Description**: Executes the pipeline on a single socket only. By default, all sockets are used. 
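As a worked illustration of the `--read_type` and `--params` options described above, here are two example invocations. This is only a sketch: the reference and read paths are placeholders, and the long-read preset is one possible choice.

```bash
# Short reads (default): bwa-mem2 alignment, forwarding a read-group string via --params.
python run_fq2sortedbam.py \
    --ref /refdir/GRCh38.fna \
    --reads /readsdir/sample_R1.fastq.gz /readsdir/sample_R2.fastq.gz \
    --output /outdir/ \
    --mode sortedbam \
    --read_type short \
    --params '-R "@RG\tID:RG1\tSM:RGSN1"'

# Long reads (e.g. PacBio HiFi): mm2-fast alignment, forwarding a minimap2 preset via --params.
python run_fq2sortedbam.py \
    --ref /refdir/GRCh38.fna \
    --reads /readsdir/sample.hifi.fastq.gz \
    --output /outdir/ \
    --mode sortedbam \
    --read_type long \
    --params '-x map-hifi'
```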
96 | 97 | ### Resource Allocation 98 | - `--th TH` 99 | - **Description**: Sets the threshold for the minimum number of cores allocated to each rank. 100 | 101 | - `-N N` 102 | - **Description**: Manually sets the number of ranks. When using this, set `PPN` and `cpus` options accordingly. 103 | 104 | - `-PPN PPN` 105 | - **Description**: Manually sets the number of processes per node (PPN). When using this, set `N` and `cpus` options accordingly. 106 | 107 | - `--cpus CPUS` 108 | - **Description**: Manually sets the number of CPUs. When using this, set `N` and `PPN` options accordingly. 109 | 110 | --- 111 | 112 | ## Example Usage 113 | ```bash 114 | python pipeline.py \ 115 | --ref /path/to/reference.fasta \ 116 | --reads read1.fastq read2.fastq \ 117 | --tempdir /path/to/tempdir \ 118 | --output /path/to/output \ 119 | --simd avx512 \ 120 | --mode sortedbam \ 121 | --read_type short 122 | ``` 123 | 124 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - anaconda 4 | dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _openmp_mutex=5.1=1_gnu 7 | - blas=1.0=mkl 8 | - ca-certificates=2023.01.10=h06a4308_0 9 | - certifi=2022.12.7=py39h06a4308_0 10 | - intel-openmp=2021.4.0=h06a4308_3561 11 | - ld_impl_linux-64=2.38=h1181459_1 12 | - libffi=3.4.2=h6a678d5_6 13 | - libgcc-ng=11.2.0=h1234567_1 14 | - libgfortran-ng=7.5.0=ha8ba4b0_17 15 | - libgfortran4=7.5.0=ha8ba4b0_17 16 | - libgomp=11.2.0=h1234567_1 17 | - libstdcxx-ng=11.2.0=h1234567_1 18 | - mkl=2021.4.0=h06a4308_640 19 | - mkl-service=2.4.0=py39h7f8727e_0 20 | - mkl_fft=1.3.1=py39hd3c417c_0 21 | - mkl_random=1.2.2=py39h51133e4_0 22 | - mpi=1.0=mpich 23 | - mpi4py=3.1.4=py39hfc96bbd_0 24 | - mpich=3.3.2=hc856adb_0 25 | - ncurses=6.4=h6a678d5_0 26 | - numpy=1.23.5=py39h14f4228_0 27 | - numpy-base=1.23.5=py39h31eccc5_0 28 | - openssl=1.1.1t=h7f8727e_0 29 | - pip=23.0.1=py39h06a4308_0 30 | - python=3.9.16=h7a1cb2a_2 31 | - readline=8.2=h5eee18b_0 32 | - setuptools=65.6.3=py39h06a4308_0 33 | - six=1.16.0=pyhd3eb1b0_1 34 | - sqlite=3.41.1=h5eee18b_0 35 | - tk=8.6.12=h1ccaba5_0 36 | - tzdata=2023c=h04d1e81_0 37 | - wheel=0.38.4=py39h06a4308_0 38 | - xz=5.2.10=h5eee18b_1 39 | - zlib=1.2.13=h5eee18b_0 40 | - pyyaml=5.4.1 41 | - pip: 42 | - pragzip==0.5.0 43 | - yappi==1.4.0 44 | 45 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/hwconfig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys, os 3 | from subprocess import Popen, PIPE, run 4 | 5 | def main(): 6 | #print("args: ", sys.argv) 7 | #print(len(sys.argv)) 8 | assert len(sys.argv) == 3, " " 9 | run('lscpu > lscpu.txt', capture_output=True, shell=True) 10 | #inf = sys.argv[1] 11 | sso = sys.argv[1] 12 | num_nodes = int(sys.argv[2]) 13 | 14 | dt={} 15 | with open('lscpu.txt', 'r') as f: 16 | l = f.readline() 17 | while l: 18 | try: 19 | a,b = l.strip('\n').split(':') 20 | #aa, bb = a.split(' ') 21 | #print(a, b) 22 | dt[a] = b 23 | except: 24 | pass 25 | 26 | l = f.readline() 27 | 28 | ncpus = int(dt['CPU(s)']) 29 | nsocks = int(dt['Socket(s)']) 30 | nthreads = int(dt['Thread(s) per core']) 31 | ncores = int(dt['Core(s) per socket']) 32 | nnuma = int(dt['NUMA node(s)']) 33 | 34 | if sso == 'sso': 35 | nsocks = 1 36 | 37 | th = 16 ## max cores per rank 38 | #num_nodes = 1 39 | 40 | 
num_physical_cores_all_nodes = num_nodes * nsocks * ncores 41 | num_physical_cores_per_node = nsocks * ncores 42 | num_physical_cores_per_rank = nsocks * ncores 43 | 44 | while num_physical_cores_per_rank > th: 45 | num_physical_cores_per_rank /= 2 46 | 47 | num_physical_cores_per_rank = int(num_physical_cores_per_rank) 48 | assert num_physical_cores_per_rank > 8, 'cores per rank should be > 8' 49 | 50 | N = int(num_physical_cores_all_nodes / num_physical_cores_per_rank) 51 | PPN = int(num_physical_cores_per_node / num_physical_cores_per_rank) 52 | CPUS = int(ncores * nthreads * nsocks / PPN - 2*nthreads) 53 | THREADS = CPUS 54 | print(f"N={int(N)}") 55 | print(f"PPN={int(PPN)}") 56 | print(f"CPUS={int(CPUS)}") 57 | print(f"THREADS={int(THREADS)}") 58 | 59 | threads_per_rank = num_physical_cores_per_rank * nthreads 60 | bits = pow(2, num_physical_cores_per_rank) - 1 61 | allbits = 0 62 | mask="[" 63 | for r in range(N): 64 | allbits = allbits | (bits << r*num_physical_cores_per_rank) 65 | allbits = allbits | (allbits << nsocks * ncores) 66 | #print("{:x}".format(allbits)) 67 | if mask == "[": 68 | mask = mask + hex(allbits) 69 | else: 70 | mask = mask+","+ hex(allbits) 71 | allbits=0 72 | #print("{:x}".format(mask)) 73 | mask=mask + "]" 74 | print("I_MPI_PIN_DOMAIN={}".format(mask)) 75 | 76 | 77 | if __name__ == '__main__': 78 | #print("> args: ", sys.argv) 79 | #print(">", len(sys.argv)) 80 | 81 | main() 82 | 83 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #SCRIPT_PATH="${BASH_SOURCE:-$0}" 5 | #ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 6 | ##echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 7 | #ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 8 | ##echo "Value of ABS_DIRECTORY: ${ABS_DIRECTORY}" 9 | 10 | if [ "$#" -ne "1" ] 11 | then 12 | echo "pls. provide args: cloud/onprem" 13 | fi 14 | 15 | if [ "$1" == "cloud" ] 16 | then 17 | echo "Installing pre-requisite tools.." 18 | bash basic_setup_ubuntu.sh 19 | echo "Done" 20 | fi 21 | 22 | echo "Downloading and setting up miniconda..." 23 | [[ ! -e "Miniforge3-24.3.0-0-Linux-x86_64.sh" ]] && wget https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh 24 | bash ./Miniforge3-24.3.0-0-Linux-x86_64.sh -u -b -p ./miniforge3 25 | 26 | echo "Setting up conda env named with given argument" 27 | ./miniforge3/bin/conda env create --name fq2bam -f environment.yml 28 | echo "Setting up conda env named new_env...DONE" 29 | 30 | echo "Activating conda env..." 31 | source ./miniforge3/bin/activate fq2bam 32 | 33 | #echo "Downloading and setting up miniconda..." 34 | #if [ ! -e "Miniconda3-py39_23.3.1-0-Linux-x86_64.sh" ] 35 | #then 36 | # wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh 37 | #fi 38 | # 39 | #bash ./Miniconda3-py39_23.3.1-0-Linux-x86_64.sh -b -p ./miniconda3 40 | #echo "Downloading and setting up miniconda...DONE" 41 | # 42 | #echo "Seeting up conda env named with given argument" 43 | #miniconda3/bin/conda env create --name distbwa -f environment.yml 44 | #echo "Seeting up conda env named new_env...DONE" 45 | # 46 | #echo "Activating conda env..." 
47 | #source miniconda3/bin/activate distbwa 48 | #echo "localhost" > hostfile 49 | 50 | ## build tools 51 | cd ../../ 52 | #WDIR=../../ 53 | WDIR=`pwd` 54 | cd - 55 | EXEDIR=`pwd` 56 | 57 | # compile bwa-mem2 58 | echo "Build bwa-mem2" 59 | cd ${WDIR}/applications/bwa-mem2 60 | make clean 61 | make -j multi 62 | bwainstall="SUCESS" 63 | if [ -e "${WDIR}/applications/bwa-mem2/bwa-mem2" ]; then 64 | echo "bwa-mem2 build successful" 65 | else 66 | bwainstall="FAILED" 67 | echo "Error!! bwa-mem2 build failed" 68 | fi 69 | 70 | 71 | cd ${WDIR}/applications/mm2-fast 72 | make clean 73 | make -j 74 | mm2install="SUCESS" 75 | if [ -e "${WDIR}/applications/mm2-fast/minimap2" ]; then 76 | echo "mm2-fast build successful" 77 | else 78 | mm2install="FAILED" 79 | echo "Error!! mm2-fast build failed" 80 | exit 81 | fi 82 | 83 | cd ${WDIR}/applications/STAR/source/ 84 | make clean 85 | make -j STAR 86 | echo $WDIR 87 | if [ -e "${WDIR}/applications/STAR/source/STAR" ]; then 88 | echo "STAR build successful" 89 | else 90 | starinstall="FAILED" 91 | echo "Error!! STAR build failed" 92 | exit 93 | fi 94 | 95 | #make install #uncomment this for installation 96 | 97 | # compile htslib 98 | cd ${WDIR}/applications/htslib 99 | autoreconf -i # Build the configure script and install files it uses 100 | ./configure # Optional but recommended, for choosing extra functionality 101 | make 102 | #make install #uncomment this for installation 103 | 104 | # compile bcftools 105 | ## cd ${WDIR}/applications/bcftools 106 | ## # The following is optional: 107 | ## # autoheader && autoconf && ./configure --enable-libgsl --enable-perl-filters 108 | ## make 109 | ## #make install #uncomment this for installation 110 | 111 | # compile samtools 112 | cd ${WDIR}/applications/samtools 113 | autoheader 114 | autoconf -Wno-syntax 115 | chmod 775 configure 116 | ./configure # Needed for choosing optional functionality 117 | make 118 | saminstall="SUCESS" 119 | if [ -e "${WDIR}/applications/samtools/samtools" ]; then 120 | echo "SAMTools build successful" 121 | else 122 | saminstall="FAILED" 123 | echo "Error!! SAMTools build failed" 124 | fi 125 | 126 | 127 | cd ${WDIR}/applications/bwa-meth/ 128 | wget https://pypi.python.org/packages/source/t/toolshed/toolshed-0.4.0.tar.gz 129 | tar xzvf toolshed-0.4.0.tar.gz 130 | cd toolshed-0.4.0 131 | python setup.py install 132 | cd - 133 | python setup.py install 134 | 135 | #cd $EXEDIR 136 | 137 | #[[ ! -d warp-tools ]] && git clone --recursive https://github.com/broadinstitute/warp-tools.git -b develop 138 | #cd warp-tools/tools/fastqpreprocessing/ 139 | #./fetch_and_make_dep_libs.sh && make 140 | ## make -j 141 | 142 | #if [ "$?" == "0" ] 143 | #then 144 | # echo "fqprocess installed successfully" 145 | #else 146 | # echo "fqprocess installation failed" 147 | #fi 148 | 149 | echo "bwa compilation is "$bwainstall 150 | echo "mm2-fast compilation is "$mm2install 151 | echo "STAR compilation is "$starinstall 152 | echo "samtools compilation is "$saminstall 153 | 154 | echo "Compelete installation done." 
155 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/print_config.sh: -------------------------------------------------------------------------------- 1 | num_nodes=1 2 | lscpu > compute_config 3 | 4 | 5 | num_cpus_per_node=$(cat compute_config | grep -E '^CPU\(s\)' | awk '{print $2}') 6 | num_socket=$(cat compute_config | grep -E '^Socket'| awk '{print $2}') 7 | num_numa=$(cat compute_config | grep '^NUMA node(s)' | awk '{print $3}') 8 | num_cpus_all_node=`expr ${num_cpus_per_node} \* ${num_nodes}` 9 | threads_per_core=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') 10 | echo "#############################################" 11 | echo "Number of sockets: "$num_socket 12 | echo "Number of NUMA domains: "$num_numa 13 | echo "Number of CPUs: $num_cpus_all_node" 14 | 15 | num_physical_cores_all_nodes=`expr ${num_cpus_all_node} / ${threads_per_core}` 16 | num_physical_cores_per_nodes=`expr ${num_cpus_per_node} / ${threads_per_core}` 17 | num_physical_cores_per_socket=`expr ${num_physical_cores_all_nodes} / ${num_socket}` 18 | num_physical_cores_per_numa=`expr ${num_physical_cores_all_nodes} / ${num_numa}` 19 | echo "Number physical cores: "$num_physical_cores_per_nodes 20 | echo "Number physical cores per socket: "$num_physical_cores_per_socket 21 | echo "Number physical cores per numa: "$num_physical_cores_per_numa 22 | 23 | th=`expr ${num_physical_cores_per_numa} / 2` #${num_physical_cores_per_numa} ##20 24 | if [ $th -le 10 ] 25 | then 26 | th=${num_physical_cores_per_numa} 27 | fi 28 | 29 | while [ $num_physical_cores_per_nodes -gt $th ] 30 | do 31 | num_physical_cores_per_nodes=`expr $num_physical_cores_per_nodes / 2` 32 | done 33 | 34 | num_physical_cores_per_rank=$num_physical_cores_per_nodes 35 | total_num_ranks=`expr ${num_physical_cores_all_nodes} / ${num_physical_cores_per_rank}` 36 | 37 | ranks_per_node=`expr ${total_num_ranks} / ${num_nodes}` 38 | echo "Number of MPI ranks: "${total_num_ranks} 39 | echo "Number of cores per MPI rank: "$num_physical_cores_per_nodes 40 | echo "#############################################" 41 | #echo "Note: Each MPI rank runs a bwa-mem2 process on its input fastq files produced by fqprocess. Please ensure that the number of files created due to bam_size parameter to fqprocess (in config file) creates number of fastq files equal to ${total_num_ranks}" 42 | echo "Please set bam_size such that fastqprocess creates ${total_num_ranks} splits of input fastq files" 43 | echo "#############################################" 44 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/run_bwa.sh: -------------------------------------------------------------------------------- 1 | #************************************************************************************* 2 | # The MIT License 3 | # 4 | # Intel OpenOmics - fq2sortedbam pipeline 5 | # Copyright (C) 2023 Intel Corporation. 
6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining 8 | # a copy of this software and associated documentation files (the 9 | # "Software"), to deal in the Software without restriction, including 10 | # without limitation the rights to use, copy, modify, merge, publish, 11 | # distribute, sublicense, and/or sell copies of the Software, and to 12 | # permit persons to whom the Software is furnished to do so, subject to 13 | # the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be 16 | # included in all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 22 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 23 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 24 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | # SOFTWARE. 26 | # 27 | #Authors: Vasimuddin Md ; Babu Pillai ; 28 | #*****************************************************************************************/ 29 | 30 | #!/usr/bin/bash 31 | set -e 32 | 33 | trap 'echo "Error occurred"' ERR 34 | 35 | source ./miniforge3/bin/activate fq2bam 36 | echo "Your conda env @: "$CONDA_PREFIX 37 | 38 | config="" 39 | sso="None" 40 | if [ "$#" == "2" ] || [ "$#" == "3" ] 41 | then 42 | mode=$1 43 | config=$2 44 | [[ "$#" == "3" ]] && sso=$3 45 | else 46 | echo " [sso (for single socket only execution)]" 47 | exit 48 | fi 49 | 50 | #Note: "##### Note: Currently, this code only supports single node. " 51 | #Note: "##### I've deliberately disabled distributed runs for now. " 52 | #Note: "##### Contact: " 53 | #echo "" 54 | 55 | num_nodes=1 56 | echo "run mode: "$mode 57 | echo "config: "$config 58 | #echo "se/pe: $se_mode" 59 | [[ "$sso" == "sso" ]] && echo "single socket only" 60 | 61 | #echo "localhost" > hostfile 62 | hostname > hostfile 63 | #semode="" 64 | CONFIG="" 65 | #read_type="--read_type short" 66 | #[[ "$se_mode" == "se" ]] && semode="--se_mode" 67 | [[ "$config" != "" ]] && CONFIG="-y $config" 68 | runmode="--mode $mode" 69 | 70 | #lscpu > lscpu.txt 71 | chmod +x hwconfig.py 72 | #ls -lh hwconfig.py 73 | python hwconfig.py $sso $num_nodes > hwconfig 74 | #python hwconfig.py "sso" > hwconfig 75 | 76 | source hwconfig 77 | echo "[Info] Running $N ranks, each with $THREADS threads ..." 
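# The mpiexec line below launches $N ranks of dist_bwa.py ($PPN per node), binding each
# rank to a socket, and tees all output to logs/log.txt.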
78 | #rm lscpu.txt 79 | rm hwconfig 80 | 81 | BINDING=socket 82 | mkdir -p logs 83 | 84 | exec=dist_bwa.py 85 | #echo $I_MPI_PIN_DOMAIN 86 | #-genv I_MPI_PIN_DOMAIN=$I_MPI_PIN_DOMAIN 87 | 88 | #echo $N 89 | #echo $PPN 90 | #echo $exec 91 | #echo $CONFIG 92 | mpiexec -bootstrap ssh -n $N -ppn $PPN -bind-to $BINDING -map-by $BINDING --hostfile hostfile python -u $exec --cpus $CPUS --threads $THREADS ${runmode} ${CONFIG} --keep_unmapped 2>&1 | tee logs/log.txt 93 | echo "[Info] The output log file is at logs/log.txt" 94 | -------------------------------------------------------------------------------- /pipelines/single-cell-RNA-seq-analysis/Dockerfile: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intel Labs 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | # Authors: Narendra Chaudhary ; Sanchit Misra 24 | 25 | # Install Base miniconda image 26 | ARG FROM_IMAGE=ubuntu:22.04 27 | 28 | # Install Base miniforge image 29 | ARG BASE_IMAGE=condaforge/miniforge3:23.1.0-3 30 | FROM ${BASE_IMAGE} as conda_setup 31 | 32 | RUN conda update -n base conda 33 | COPY ./environment.yml ./ 34 | 35 | RUN conda env create --name=single_cell -f ./environment.yml 36 | COPY ./_t_sne.py /opt/conda/lib/python3.8/site-packages/daal4py/sklearn/manifold/_t_sne.py 37 | 38 | 39 | FROM ${FROM_IMAGE} as builder 40 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 41 | git build-essential gcc curl gnupg gnupg2 gnupg1 sudo wget tar ca-certificates -y \ 42 | && rm -rf /var/lib/apt/lists/* \ 43 | && apt-get autoremove -y \ 44 | && apt-get clean \ 45 | && apt update 46 | 47 | 48 | COPY --from=conda_setup /opt/conda /opt/conda 49 | ENV PATH "/opt/conda/envs/single_cell/bin:$PATH" 50 | RUN echo "source /opt/conda/bin/activate single_cell" >> ~/.bashrc 51 | CMD source ~/.bashrc 52 | 53 | # Non-root user setup 54 | ENV SERVICE_NAME="scrna" 55 | 56 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 57 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 58 | 59 | 60 | RUN pip uninstall -y umap-learn 61 | WORKDIR / 62 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 63 | RUN tar -xzf Source_code_with_submodules.tar.gz 64 | 65 | #SHELL ["/bin/bash", "-c", "source activate single_cell"] 66 | WORKDIR /Open-Omics-Acceleration-Framework/lib/tal/applications/UMAP_fast/umap_extend/ 67 | RUN python setup.py install 68 | 69 | WORKDIR ../umap/ 70 | RUN python setup.py install 71 | 72 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /Open-Omics-Acceleration-Framework /opt 73 | # Switch to non-root user 74 | USER $SERVICE_NAME 75 | 76 | 77 | WORKDIR /Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/notebooks 78 | 79 | 80 | CMD jupyter notebook \ 81 | --no-browser \ 82 | --allow-root \ 83 | --port=8888 \ 84 | --ip=0.0.0.0 \ 85 | --notebook-dir=/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/notebooks \ 86 | --NotebookApp.password="" \ 87 | --NotebookApp.token="" \ 88 | --NotebookApp.password_required=False 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /pipelines/single-cell-RNA-seq-analysis/Dockerfile.python: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intel Labs 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # Authors: Narendra Chaudhary ; Sanchit Misra 24 | 25 | # Install Base miniconda image 26 | ARG FROM_IMAGE=ubuntu:22.04 27 | 28 | # Install Base miniforge image 29 | ARG BASE_IMAGE=condaforge/miniforge3:23.1.0-3 30 | FROM ${BASE_IMAGE} as conda_setup 31 | 32 | RUN conda update -n base conda 33 | COPY ./environment.yml ./ 34 | 35 | RUN conda env create --name=single_cell -f ./environment.yml 36 | COPY ./_t_sne.py /opt/conda/lib/python3.8/site-packages/daal4py/sklearn/manifold/_t_sne.py 37 | 38 | 39 | FROM ${FROM_IMAGE} as builder 40 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 41 | git build-essential gcc curl gnupg gnupg2 gnupg1 sudo wget tar ca-certificates -y \ 42 | && rm -rf /var/lib/apt/lists/* \ 43 | && apt-get autoremove -y \ 44 | && apt-get clean \ 45 | && apt update 46 | 47 | 48 | COPY --from=conda_setup /opt/conda /opt/conda 49 | ENV PATH "/opt/conda/envs/single_cell/bin:$PATH" 50 | RUN echo "source /opt/conda/bin/activate single_cell" >> ~/.bashrc 51 | CMD source ~/.bashrc 52 | 53 | # Non-root user setup 54 | ENV SERVICE_NAME="scrna" 55 | 56 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 57 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 58 | 59 | 60 | RUN pip uninstall -y umap-learn 61 | WORKDIR / 62 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 63 | RUN tar -xzf Source_code_with_submodules.tar.gz 64 | 65 | #SHELL ["/bin/bash", "-c", "source activate single_cell"] 66 | WORKDIR /Open-Omics-Acceleration-Framework/lib/tal/applications/UMAP_fast/umap_extend/ 67 | RUN python setup.py install 68 | 69 | WORKDIR ../umap/ 70 | RUN python setup.py install 71 | 72 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /Open-Omics-Acceleration-Framework /opt 73 | # Switch to non-root user 74 | USER $SERVICE_NAME 75 | 76 | 77 | WORKDIR /Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/notebooks 78 | #WORKDIR /workspace 79 | CMD python -m sklearnex full_single_cell_analysis.py 80 | 81 | # build with "docker build -f Dockerfile.python -t scanpy ." 82 | # mkdir -p ~/output 83 | # docker run -v ~/output:/workspace/figures -v ~/Open-Omics-Acceleration-Framework/pipelines/single_cell_pipeline/data:/data scanpy 84 | -------------------------------------------------------------------------------- /pipelines/single-cell-RNA-seq-analysis/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Intel Labs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /pipelines/single-cell-RNA-seq-analysis/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline overview 2 | 3 | Given a cell-by-gene matrix, this [scanpy](https://github.com/scverse/scanpy)-based pipeline performs data preprocessing (filtering, linear regression and normalization), dimensionality reduction (PCA), and clustering (Louvain/Leiden/k-means) to group the cells into cell types, and then visualizes the resulting clusters (UMAP/t-SNE). The following block diagram illustrates the pipeline. 4 | 5 | <!-- Block diagram image: single-cell RNA-seq analysis pipeline --> 6 | 7 |
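For a concrete picture of these stages, the snippet below is a minimal, illustrative scanpy sketch of the same flow (preprocessing, PCA, neighbors, clustering, UMAP). It is not the tuned code from the notebooks; the input path and all parameter values are placeholders to adjust for your data.

```python
import scanpy as sc

# Load the cell-by-gene matrix (AnnData .h5ad file).
adata = sc.read_h5ad("data/1M_brain_cells_10X.sparse.h5ad")

# Preprocessing: basic filtering, normalization, HVG selection, scaling.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=4000)
adata = adata[:, adata.var.highly_variable].copy()
# (The full pipeline also regresses out per-cell count effects, e.g. with sc.pp.regress_out.)
sc.pp.scale(adata, max_value=10)

# Dimensionality reduction, clustering, and a 2-D embedding for visualization.
sc.tl.pca(adata, n_comps=50)
sc.pp.neighbors(adata, n_neighbors=15)
sc.tl.leiden(adata)      # or sc.tl.louvain / k-means on the PCA space
sc.tl.umap(adata)        # or sc.tl.tsne(adata)
sc.pl.umap(adata, color="leiden", save="_clusters.png")
```

The notebooks in this pipeline implement these same stages, using the Intel-optimized UMAP build and the scikit-learn-intelex patch that the setup steps below install.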
8 | 9 | 10 | # Download entire repository 11 | ```bash 12 | cd ~ 13 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 14 | RUN tar -xzf Source_code_with_submodules.tar.gz 15 | cd ~/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis 16 | ``` 17 | 18 | # Instructions to Run 19 | We can run this pipeline in three ways: 1. Docker container (i. interactive, ii. non-interactive), 2. Using anaconda environment file, 3. Creating anaconda environment manually. 20 | 21 | ## (Option 1): Docker instructions for interactive and non-interactive mode (Recommended on Cloud Instance) 22 | 23 | 24 | ### Run with jupyter notebook (interactive) 25 | ```bash 26 | cd ~/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/ 27 | docker build -t scanpy . # Create a docker image named scanpy 28 | 29 | # Download dataset 30 | wget -P ~/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/data https://rapids-single-cell-examples.s3.us-east-2.amazonaws.com/1M_brain_cells_10X.sparse.h5ad 31 | 32 | docker run -it -p 8888:8888 -v ~/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/data:/data scanpy # run docker container with the data folder as volume 33 | 34 | ``` 35 | 36 | ### Run with non-interactive mode 37 | 38 | ```bash 39 | export DATA_DIR= 40 | export OUTPUT_DIR= 41 | mkdir -p $OUTPUT_DIR 42 | cd ~/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/ 43 | 44 | docker build -f Dockerfile.python -t scanpy_python . # Create a docker image named scanpy_python 45 | 46 | # Download dataset 47 | wget -P $DATA_DIR https://rapids-single-cell-examples.s3.us-east-2.amazonaws.com/1M_brain_cells_10X.sparse.h5ad 48 | 49 | docker run -v $OUTPUT_DIR:/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/notebooks/figures -v $DATA_DIR:/data -it scanpy_python 50 | ``` 51 | 52 | 53 | 54 | 55 | ## (Option 2): Create an Anaconda environment from file 56 | ```bash 57 | conda env create --name=single_cell -f environment.yml 58 | conda activate single_cell 59 | ``` 60 | 61 | ### Replace the _t_sne.py file to anaconda environment's daal4py package 62 | ```bash 63 | cp _t_sne.py ~/anaconda3/envs/single_cell/lib/python3.8/site-packages/daal4py/sklearn/manifold/ 64 | ``` 65 | 66 | ### Install umap_extend and umap 67 | ```bash 68 | 69 | pip uninstall umap-learn 70 | cd ~/Open-Omics-Acceleration-Framework/lib/tal/applications/UMAP_fast/umap_extend 71 | python setup.py install # Uncomment AVX512 lines in setup.py before doing this step on avx512 machines 72 | 73 | 74 | cd ~/Open-Omics-Acceleration-Framework/lib/tal/applications/UMAP_fast/umap 75 | python setup.py install # do python setup.py install if moving environment using conda-pack 76 | ``` 77 | 78 | 79 | ### Example Dataset 80 | The dataset was made publicly available by 10X Genomics. 
Use the following command to download the count matrix for this dataset and store it in the data folder: 81 | ```bash 82 | wget -P ~/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/data https://rapids-single-cell-examples.s3.us-east-2.amazonaws.com/1M_brain_cells_10X.sparse.h5ad 83 | ``` 84 | 85 | ### Setup and run 86 | ```bash 87 | export NUMEXPR_MAX_THREADS=56 # equal to number of threads on a single socket 88 | export NUMBA_NUM_THREADS=56 # Remember to delete __pycache__ folder from local directory and umap/umap/ directory if increasing number of threads 89 | 90 | # also update sc.settings.n_jobs=56 to set number of threads inside 1M_brain_cpu_analysis.py 91 | 92 | cd ~/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/notebooks/ 93 | 94 | # Or the jupyter notebook with sklearn patch in it. 95 | # from sklearnex import patch_sklearn 96 | # patch_sklearn() 97 | 98 | jupyter notebook 99 | ``` 100 | 101 | 102 | ## (Alternatively, Option - 3) You can also create Anaconda environment Manually 103 | ```bash 104 | conda create --name single_cell python=3.8.0 105 | conda activate single_cell 106 | ``` 107 | 108 | ### Necessary scanpy tools 109 | ```bash 110 | conda install -y seaborn=0.12.2 scikit-learn=1.0.2 statsmodels=0.13.2 numba=0.53.1 pytables=3.7.0 matplotlib-base=3.6.2 pandas=1.5.2 111 | conda install -y -c conda-forge mkl-service=2.4.0 112 | conda install -y -c conda-forge python-igraph=0.10.3 leidenalg=0.9.1 113 | conda install -y -c conda-forge cython=0.29.33 jinja2=3.1.2 clang-tools=15.0.7 114 | conda install -y -c katanagraph/label/dev -c conda-forge katana-python 115 | ``` 116 | 117 | ### Install scanpy 118 | ```bash 119 | pip install scanpy==1.8.1 120 | ``` 121 | 122 | ### Install scikit-learn intel extension (PIP version) 123 | ```bash 124 | pip install scikit-learn-intelex==2023.0.1 125 | ``` 126 | ### Install other packages 127 | ```bash 128 | pip install pybind11 129 | pip install jupyterlab 130 | pip install wget 131 | ``` 132 | 133 | ### Replace the _t_sne.py file to anaconda environment's daal4py package 134 | ```bash 135 | cp _t_sne.py ~/anaconda3/envs/single_cell/lib/python3.8/site-packages/daal4py/sklearn/manifold/ 136 | ``` 137 | 138 | ### Install umap_extend and umap 139 | ```bash 140 | 141 | pip uninstall umap-learn 142 | cd ~/Open-Omics-Acceleration-Framework/lib/tal/applications/UMAP_fast/umap_extend 143 | python setup.py install # Uncomment AVX512 lines in setup.py before doing this step on avx512 machines 144 | 145 | 146 | cd ~/Open-Omics-Acceleration-Framework/lib/tal/applications/UMAP_fast/umap 147 | python setup.py install # do python setup.py install if moving environment using conda-pack 148 | ``` 149 | -------------------------------------------------------------------------------- /pipelines/single-cell-RNA-seq-analysis/environment.yml: -------------------------------------------------------------------------------- 1 | name: single_cell 2 | channels: 3 | - conda-forge 4 | - defaults 5 | - katanagraph/label/dev 6 | dependencies: 7 | - defaults::python=3.8.0 8 | - conda-forge::gcc_linux-64==12.1.0 9 | - conda-forge::gxx_linux-64==12.1.0 10 | - defaults::seaborn=0.12.2 11 | - defaults::scikit-learn=1.0.2 12 | - defaults::statsmodels=0.13.2 13 | - defaults::numba=0.53.1 14 | - defaults::pytables=3.7.0 15 | - defaults::pip=22.3.1 16 | - defaults::pandas=1.5.2 17 | - defaults::matplotlib-base=3.6.2 18 | - conda-forge::mkl-service=2.4.0 19 | - conda-forge::python-igraph=0.10.3 20 | - conda-forge::leidenalg=0.9.1 21 
| - conda-forge::cython=0.29.33 22 | - conda-forge::jinja2=3.1.2 23 | - conda-forge::clang-tools=15.0.7 24 | - katanagraph/label/dev::katana-python 25 | - pip: 26 | - scanpy==1.8.1 27 | - anndata==0.8.0 28 | - scikit-learn-intelex==2023.0.1 29 | - pybind11 30 | - jupyter 31 | - wget 32 | --------------------------------------------------------------------------------