├── .gitmodules ├── LICENSE ├── README.md ├── SECURITY.md ├── _config.yml ├── applications ├── AutoDock-Vina │ ├── Dockerfile │ ├── README.md │ └── data_download_script.sh ├── Autodock │ ├── Dockerfile │ ├── README.md │ └── data_download_script.sh ├── ProtGPT2 │ ├── Dockerfile │ ├── README.md │ ├── env.yml │ ├── model_script.sh │ └── protgpt2.py ├── ProteinMPNN │ ├── Dockerfile │ ├── ProteinMPNN.patch │ ├── README.md │ └── setup_proteinmpnn.sh ├── RFdiffusion │ ├── Dockerfile │ ├── README.md │ ├── RFdiffusion.patch │ └── setup_rfdiffusion.sh ├── boltz │ ├── Dockerfile │ ├── README.md │ └── entrypoint.sh ├── esm │ ├── Dockerfile.base │ ├── Dockerfile.esm │ ├── Dockerfile.esmfold │ ├── README.md │ ├── build_docker_images.sh │ ├── env.yml │ ├── esm_change_all.patch │ └── esm_openfold_change_py37.patch ├── esm3 │ ├── Dockerfile │ ├── README.md │ ├── env.yml │ ├── esm3_changes.patch │ └── scripts │ │ ├── ESM3_chain_of_thought.py │ │ ├── ESM3_folding_task.py │ │ ├── ESM3_function_prediction_task.py │ │ ├── ESM3_inversefold_task.py │ │ ├── ESM3_logits_embedding_task.py │ │ ├── ESM3_prompt_sequence.py │ │ └── ESMC_logits_embedding_task.py ├── gromacs │ ├── Dockerfile │ ├── README.md │ ├── entrypoint.sh │ ├── grms_input │ │ ├── mdtut_ions.mdp │ │ ├── mdtut_md.mdp │ │ ├── mdtut_minim.mdp │ │ ├── mdtut_npt.mdp │ │ ├── mdtut_nvt.mdp │ │ └── run_commands.sh │ └── run_commands.sh ├── moflow │ ├── Dockerfile │ ├── README.md │ ├── env.yml │ └── mflow_change_all.patch └── relion │ ├── Dockerfile │ ├── README.md │ ├── entrypoint.sh │ └── relion_env_patch.patch ├── benchmarking ├── AWS-Intel-blog-v2.1-2024 │ ├── README.md │ ├── long_db │ ├── proteome.py │ ├── run_pipe_bwa.sh │ ├── short_db │ └── test_pipe_bwa.py └── aws │ └── README.md ├── images ├── Open-Omics-Acceleration-Framework v2.0.JPG ├── Open-Omics-Acceleration-Framework v2.0.jpg ├── Open-Omics-Acceleration-Framework v3.0.jpg ├── Open-Omics-Acceleration-Framework-v2.0.JPG ├── Open-Omics-Acceleration-Framework-v3.0.jpg ├── alphafold2-protein-folding.jpg ├── deepvariant-fq2vcf.jpg ├── open-omics-acceleration-framework-v2.0.JPG ├── open-omics-acceleration-framework.JPG └── scrnaseq-analysis.jpg └── pipelines ├── alphafold2-based-protein-folding ├── Dockerfile_Inf ├── Dockerfile_Pre ├── README.md ├── entrypoint_inf.sh └── entrypoint_pre.sh ├── deepvariant-based-germline-variant-calling-fq2vcf ├── Dockerfile_bams2vcf ├── Dockerfile_fq2bams ├── README.md ├── bams2vcf.py ├── docs.txt ├── environment.yml ├── fq2bams.py ├── libmimalloc.so.2.0 ├── merge_vcf.sh ├── run_bams2vcf.py ├── run_fq2bams.py └── trash │ ├── config │ ├── extra_scripts │ ├── config │ ├── merge_vcf.sh │ ├── run_pipeline_ec2_part2.sh │ └── run_pipeline_part2.sh │ ├── run_pipeline.sh │ ├── run_pipeline_part1.sh │ ├── scripts │ ├── aws │ │ ├── basic_setup.sh │ │ ├── build_deepvariant_docker_image.sh │ │ ├── build_tools.sh │ │ ├── config │ │ ├── create_reference_index.sh │ │ ├── deepvariant_ec2_setup.sh │ │ ├── deepvariant_setup.sh │ │ ├── pcluster_compute_node_setup.sh │ │ ├── pcluster_example_config │ │ ├── pcluster_reference_index.sh │ │ ├── run_pipeline_ec2.sh │ │ ├── run_pipeline_ec2_part1.sh │ │ └── run_pipeline_pcluster.sh │ └── cluster │ │ ├── config │ │ ├── create_reference_index.sh │ │ ├── load_deepvariant.sh │ │ ├── run_pipeline_cluster.sh │ │ └── setup.sh │ ├── setup_env.sh │ └── test_pipeline_final.py ├── fq2sortedbam ├── Dockerfile ├── README.md ├── README.md.old ├── basic_setup_ubuntu.sh ├── config.yaml ├── doc.txt ├── environment.yml ├── fq2sortedbam.py ├── hwconfig.py ├── 
install.sh ├── print_config.sh ├── run_bwa.sh └── run_fq2sortedbam.py └── single-cell-RNA-seq-analysis ├── Dockerfile ├── Dockerfile.python ├── LICENSE ├── README.md ├── _t_sne.py ├── environment.yml └── notebooks ├── 1.3_million_single_cell_analysis.ipynb ├── fastpp.py ├── full_single_cell_analysis.py ├── sc_nbrs.py ├── sc_pp_hvg.py └── sc_pp_simple.py /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib/tal"] 2 | path = lib/tal 3 | url = https://github.com/IntelLabs/Trans-Omics-Acceleration-Library.git 4 | [submodule "applications/bwa-mem2"] 5 | path = applications/bwa-mem2 6 | url = https://github.com/bwa-mem2/bwa-mem2.git 7 | [submodule "applications/mm2-fast"] 8 | path = applications/mm2-fast 9 | url = https://github.com/bwa-mem2/mm2-fast.git 10 | [submodule "applications/alphafold"] 11 | path = applications/alphafold 12 | url = https://github.com/IntelLabs/open-omics-alphafold.git 13 | [submodule "applications/samtools"] 14 | path = applications/samtools 15 | url = https://github.com/samtools/samtools.git 16 | [submodule "applications/htslib"] 17 | path = applications/htslib 18 | url = https://github.com/samtools/htslib.git 19 | [submodule "applications/deepvariant"] 20 | path = applications/deepvariant 21 | url = https://github.com/IntelLabs/deepvariant.git 22 | [submodule "applications/bcftools"] 23 | path = applications/bcftools 24 | url = https://github.com/samtools/bcftools.git 25 | [submodule "applications/bwa-meth"] 26 | path = applications/bwa-meth 27 | url = https://github.com/brentp/bwa-meth.git 28 | [submodule "applications/STAR"] 29 | path = applications/STAR 30 | url = https://github.com/alexdobin/STAR.git 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Intel Labs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Reporting Security Issues 2 | 3 | The Bootstrap team and community take security issues in Bootstrap seriously. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions. 
4 | 5 | To report a security issue, email [security@getbootstrap.com](mailto:security@getbootstrap.com) and include the word "SECURITY" in the subject line. 6 | 7 | We'll endeavor to respond quickly, and will keep you updated throughout the process. 8 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: minima 2 | -------------------------------------------------------------------------------- /applications/AutoDock-Vina/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM condaforge/miniforge3:4.10.2-0 2 | ENV DEBIAN_FRONTEND=noninteractive 3 | RUN apt-get update && apt-get install -y --no-install-recommends \ 4 | build-essential \ 5 | libboost-all-dev \ 6 | swig \ 7 | vim \ 8 | gcc-8 \ 9 | g++-8 \ 10 | numactl \ 11 | time && \ 12 | apt-get clean && \ 13 | rm -rf /var/lib/apt/lists/* 14 | ENV CC=gcc-8 15 | ENV CXX=g++-8 16 | WORKDIR /opt 17 | RUN git clone https://github.com/ccsb-scripps/AutoDock-Vina.git 18 | WORKDIR /opt/AutoDock-Vina 19 | RUN git checkout v1.2.2 20 | WORKDIR /opt/AutoDock-Vina/build/linux/release 21 | RUN make -j$(nproc) 22 | ENV SERVICE_NAME="autodock-vina-service" 23 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 24 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 25 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /opt 26 | USER $SERVICE_NAME 27 | ENV PATH="/opt/AutoDock-Vina/build/linux/release:$PATH" 28 | WORKDIR /input 29 | HEALTHCHECK NONE 30 | CMD ["vina","--help"] 31 | 32 | -------------------------------------------------------------------------------- /applications/AutoDock-Vina/README.md: -------------------------------------------------------------------------------- 1 | ## Open-Omics-Autodock-Vina 2 | Open-Omics-Autodock-Vina is a fast, efficient molecular docking software used to predict ligand-protein binding poses and affinities. It features a refined scoring function, parallel execution on multicore CPUs and user-friendly configuration. 3 | 4 | ## Docker Setup Instructions 5 | 6 | 7 | ### 1. Build the Docker Image 8 | To build the Docker image with the tag `docker_vina`, use the following commands based on your machine's proxy requirements: 9 | * For machine without a proxy: 10 | ```bash 11 | docker build -t docker_vina . 12 | ``` 13 | * For machine with a proxy: 14 | ```bash 15 | docker build --build-arg http_proxy= --build-arg https_proxy= --build-arg no_proxy= -t docker_vina . 16 | ``` 17 | 18 | 19 | ### 2. Choose and Download Protein Complex Data 20 | Select any protein complex from the available dataset of **140** protein-ligand complexes(https://zenodo.org/records/4031961) which you can download from (https://zenodo.org/records/4031961/files/data.zip?download=1). This guide uses the **5wlo** protein as an example. 
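For orientation before the steps below: the Dockerfile above sets `WORKDIR /input`, so the receptor and ligand paths in the run command of step 3 are resolved inside the mounted input folder, and results are written to the mounted output folder. A minimal sketch of that mapping (the `5wlo` folders themselves are created in the steps that follow):

```bash
# Illustrative volume mapping used by the run command in step 3:
#   $INPUT_VINA  (./5wlo)        -> /input   (container working directory; receptor/ligand paths are relative to it)
#   $OUTPUT_VINA (./5wlo_output) -> /output  (rand-1_out.pdbqt is written here)
# Quick sanity check that the image runs:
docker run --rm docker_vina:latest vina --help
```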
21 | 22 | 1) Run the below commands to make data download script executable, download the complete dataset and extract the data for `5wlo`: 23 | 24 | ```bash 25 | chmod +x data_download_script.sh 26 | bash data_download_script.sh 5wlo 27 | ``` 28 | **Note: You can replace 5wlo with any other complex name from the complete dataset available in `data_original/data` directory.** 29 | 30 | 2) Create an output directory to store results specific to `5wlo`: 31 | ```bash 32 | mkdir -p 5wlo_output 33 | ``` 34 | 35 | 3) Set the environment variables for the `5wlo` protein as follows: 36 | ```bash 37 | export INPUT_VINA=$PWD/5wlo 38 | export OUTPUT_VINA=$PWD/5wlo_output 39 | ``` 40 | 41 | 4) Add the necessary permissions to output folder for Docker to write to it: 42 | ```bash 43 | sudo chmod -R a+w $OUTPUT_VINA 44 | ``` 45 | 46 | ### 3. Run the Docker Container 47 | Verify that the Docker image was built successfully by listing Docker images: 48 | ```bash 49 | docker images | grep docker_vina 50 | ``` 51 | If the image is listed, run AutoDock Vina with the following command: 52 | ```bash 53 | docker run -it -v $INPUT_VINA:/input -v $OUTPUT_VINA:/output docker_vina:latest vina --receptor protein.pdbqt --ligand rand-1.pdbqt --out /output/rand-1_out.pdbqt --center_x 16.459 --center_y -19.946 --center_z -5.850 --size_x 18 --size_y 18 --size_z 18 --seed 1234 --exhaustiveness 64 54 | ``` 55 | This command will process your receptor and ligand files and place the results in the specified output directory. 56 | ### 4. Expected Output 57 | After running the above command, you should find the output file (`rand-1_out.pdbqt`) in the output directory, such as `5wlo_output` for this example. 58 | 59 | --- 60 | The original README content of AutoDock-Vina follows: 61 | 62 | ## AutoDock Vina: Docking and virtual screening program 63 | 64 | **AutoDock Vina** is one of the **fastest** and **most widely used** **open-source** docking engines. It is a turnkey computational docking program that is based on a simple scoring function and rapid gradient-optimization conformational search. It was originally designed and implemented by Dr. Oleg Trott in the Molecular Graphics Lab, and it is now being maintained and develop by the Forli Lab at The Scripps Research Institute. 65 | 66 | * AutoDock4.2 and Vina scoring functions 67 | * Support of simultaneous docking of multiple ligands and batch mode for virtual screening 68 | * Support of macrocycle molecules 69 | * Hydrated docking protocol 70 | * Can write and load external AutoDock maps 71 | * Python bindings for Python 3 72 | 73 | ## Documentation 74 | 75 | The installation instructions, documentation and tutorials can be found on [readthedocs.org](https://autodock-vina.readthedocs.io/en/latest/). 76 | 77 | ## Citations 78 | * [J. Eberhardt, D. Santos-Martins, A. F. Tillack, and S. Forli. (2021). AutoDock Vina 1.2.0: New Docking Methods, Expanded Force Field, and Python Bindings. Journal of Chemical Information and Modeling.](https://pubs.acs.org/doi/10.1021/acs.jcim.1c00203) 79 | * [O. Trott and A. J. Olson. (2010). AutoDock Vina: improving the speed and accuracy of docking with a new scoring function, efficient optimization, and multithreading. 
Journal of computational chemistry, 31(2), 455-461.](https://onlinelibrary.wiley.com/doi/10.1002/jcc.21334) 80 | -------------------------------------------------------------------------------- /applications/AutoDock-Vina/data_download_script.sh: -------------------------------------------------------------------------------- 1 | url="https://zenodo.org/records/4031961/files/data.zip?download=1" 2 | download_dir="./data_original" 3 | target_folder="$1" 4 | if [ ! -d "$download_dir/data" ]; then 5 | echo "Downloading data.zip..." 6 | mkdir -p "$download_dir" 7 | wget -O "$download_dir/data.zip" "$url" 8 | 9 | echo "Unzipping data.zip..." 10 | unzip "$download_dir/data.zip" -d "$download_dir" 11 | rm -f "$download_dir/data.zip" 12 | 13 | echo "Data downloaded and extracted to $download_dir/data" 14 | else 15 | echo "Data already exists in $download_dir/data. Skipping download and extraction." 16 | fi 17 | if [ -d "$target_folder" ]; then 18 | echo "The folder '$target_folder' already exists in the current directory. Skipping copy." 19 | else 20 | if [ -d "$download_dir/data/$target_folder" ]; then 21 | cp -r "$download_dir/data/$target_folder" ./ 22 | echo "$target_folder folder successfully copied to the current directory." 23 | else 24 | echo "$target_folder folder not found inside '$download_dir/data'." 25 | fi 26 | fi 27 | echo "'$target_folder' folder is now available in the current directory." 28 | -------------------------------------------------------------------------------- /applications/Autodock/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM condaforge/miniforge3:4.10.2-0 2 | ENV DEBIAN_FRONTEND=noninteractive 3 | RUN apt-get update && apt-get install -y --no-install-recommends \ 4 | vim \ 5 | git \ 6 | build-essential \ 7 | ocl-icd-opencl-dev \ 8 | clinfo && \ 9 | apt-get clean && \ 10 | rm -rf /var/lib/apt/lists/* 11 | RUN conda install -c conda-forge \ 12 | python=3.10 \ 13 | requests=2.28.2 \ 14 | mkl=2023.1 \ 15 | dpcpp_linux-64=2023.1 \ 16 | dpcpp-cpp-rt=2023.1 \ 17 | mkl-devel=2023.1 && \ 18 | conda clean --all -f -y 19 | ENV LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH}" 20 | WORKDIR /opt 21 | ENV SERVICE_NAME="autodock-service" 22 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 23 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME && \ 24 | mkdir -p /opt/AutoDock && \ 25 | chown -R $SERVICE_NAME:$SERVICE_NAME /opt/AutoDock 26 | USER $SERVICE_NAME 27 | WORKDIR /opt/AutoDock 28 | RUN git clone https://github.com/emascarenhas/AutoDock-GPU.git . && \ 29 | git checkout v1.4 30 | RUN make DEVICE=CPU NUMWI=64 && \ 31 | rm -rf .git build_temp 32 | ENV PATH="/opt/AutoDock/bin:${PATH}" 33 | HEALTHCHECK NONE 34 | WORKDIR /input 35 | CMD ["autodock_cpu_64wi","--help"] 36 | 37 | -------------------------------------------------------------------------------- /applications/Autodock/data_download_script.sh: -------------------------------------------------------------------------------- 1 | url="https://zenodo.org/records/4031961/files/data.zip?download=1" 2 | download_dir="./data_original" 3 | target_folder="$1" 4 | if [ ! -d "$download_dir/data" ]; then 5 | echo "Downloading data.zip..." 6 | mkdir -p "$download_dir" 7 | wget -O "$download_dir/data.zip" "$url" 8 | 9 | echo "Unzipping data.zip..." 
10 | unzip "$download_dir/data.zip" -d "$download_dir" 11 | rm -f "$download_dir/data.zip" 12 | 13 | echo "Data downloaded and extracted to $download_dir/data" 14 | else 15 | echo "Data already exists in $download_dir/data. Skipping download and extraction." 16 | fi 17 | if [ -d "$target_folder" ]; then 18 | echo "The folder '$target_folder' already exists in the current directory. Skipping copy." 19 | else 20 | if [ -d "$download_dir/data/$target_folder" ]; then 21 | cp -r "$download_dir/data/$target_folder" ./ 22 | echo "$target_folder folder successfully copied to the current directory." 23 | else 24 | echo "$target_folder folder not found inside '$download_dir/data'." 25 | fi 26 | fi 27 | echo "'$target_folder' folder is now available in the current directory." 28 | -------------------------------------------------------------------------------- /applications/ProtGPT2/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Ubuntu image as a base 2 | ARG FROM_IMAGE=ubuntu:24.04 3 | 4 | # Stage 1: Set up Conda environment 5 | ARG BASE_IMAGE=condaforge/miniforge3:24.3.0-0 6 | FROM ${BASE_IMAGE} as conda_setup 7 | 8 | ENV DEBIAN_FRONTEND=noninteractive 9 | 10 | # Stage 2: Set up the main build environment 11 | FROM ${FROM_IMAGE} as builder 12 | 13 | ENV DEBIAN_FRONTEND=noninteractive 14 | 15 | ARG http_proxy 16 | ENV http_proxy=${http_proxy} 17 | 18 | ARG https_proxy 19 | ENV https_proxy=${https_proxy} 20 | 21 | ARG no_proxy 22 | ENV no_proxy=${no_proxy} 23 | 24 | # Install necessary build tools and clean up 25 | RUN apt-get update && apt-get install -y --no-install-recommends \ 26 | ca-certificates git build-essential vim numactl autoconf automake make && \ 27 | apt-get clean && \ 28 | rm -rf /var/lib/apt/lists/* 29 | 30 | WORKDIR /app 31 | # Non-root user setup 32 | ENV SERVICE_NAME="protgpt2-service" 33 | 34 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 35 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 36 | 37 | # Copy Conda installation from the conda_setup stage 38 | COPY --from=conda_setup /opt/conda /opt/conda 39 | 40 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 41 | RUN tar -xzf Source_code_with_submodules.tar.gz 42 | 43 | ENV PATH="/opt/conda/bin:$PATH" 44 | 45 | # Copy environment.yml and create Conda environment 46 | RUN cp /app/Open-Omics-Acceleration-Framework/applications/ProtGPT2/env.yml . 47 | RUN cp /app/Open-Omics-Acceleration-Framework/applications/ProtGPT2/protgpt2.py . 48 | RUN cp /app/Open-Omics-Acceleration-Framework/applications/ProtGPT2/model_script.sh . 
49 | RUN rm -rf Open-Omics-Acceleration-Framework 50 | RUN rm -rf Source_code_with_submodules.tar.gz 51 | RUN conda env create -f env.yml 52 | 53 | RUN git clone --branch 5.3.0 https://github.com/jemalloc/jemalloc.git 54 | WORKDIR /app/jemalloc 55 | RUN bash autogen.sh --prefix=/opt/conda/envs/protgpt2/ && make install 56 | WORKDIR /app 57 | RUN rm -rf jemalloc 58 | 59 | # Set up environment activation and PATH 60 | ENV PATH="/opt/conda/envs/protgpt2/bin:$PATH" 61 | 62 | # Swith to Non-root user 63 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /app 64 | USER $SERVICE_NAME 65 | 66 | HEALTHCHECK NONE 67 | 68 | ENV LD_PRELOAD "/opt/conda/envs/protgpt2/lib/libjemalloc.so:$LD_PRELOAD"  69 | ENV MALLOC_CONF "oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" 70 | CMD python 71 | -------------------------------------------------------------------------------- /applications/ProtGPT2/env.yml: -------------------------------------------------------------------------------- 1 | name: protgpt2 2 | channels: 3 | - conda-forge 4 | - pytorch 5 | dependencies: 6 | - cpuonly 7 | - python=3.11 8 | - pip=24.0 9 | - pytorch=2.2.0 10 | - pip: 11 | - transformers==4.38.0 12 | - intel-extension-for-pytorch==2.2.0 13 | - numpy==1.26.0 14 | 15 | -------------------------------------------------------------------------------- /applications/ProtGPT2/model_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Directory where the files will be downloaded 4 | model_dir="./model_dir" 5 | 6 | # Array of URLs to download 7 | URLS=( 8 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/.gitattributes" 9 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/config.json" 10 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/merges.txt" 11 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/ppl-plddt.png" 12 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/pytorch_model.bin" 13 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/special_tokens_map.json" 14 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/tokenizer.json" 15 | "https://huggingface.co/nferruz/ProtGPT2/resolve/main/vocab.json" 16 | 17 | ) 18 | 19 | # Create the directory if it doesn't exist 20 | mkdir -p "$model_dir" 21 | 22 | # Change to the download directory 23 | cd "$model_dir" || exit 24 | 25 | # Loop through each URL and download the file if it doesn't exist 26 | for url in "${URLS[@]}"; do 27 | filename=$(basename "$url") 28 | if [ -f "$filename" ]; then 29 | echo "$filename already exists. Skipping download." 30 | else 31 | echo "Downloading $filename..." 32 | wget "$url" 33 | fi 34 | done 35 | 36 | echo "Download process completed." 
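# Illustrative next step (not executed by this script; flags follow protgpt2.py's argument parser):
# once the weights are downloaded, the generation script can be pointed at this directory, e.g.
#   python protgpt2.py --model_dir ./model_dir --max_length 100 --iterations 5 --output_file protgpt2_generated_sequences.txt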
37 | 38 | -------------------------------------------------------------------------------- /applications/ProtGPT2/protgpt2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | import os 4 | import argparse 5 | import time 6 | import torch 7 | import random 8 | import numpy as np 9 | import intel_extension_for_pytorch as ipex 10 | from transformers import pipeline 11 | 12 | def make_deterministic(seed=42): 13 | torch.manual_seed(seed) 14 | np.random.seed(seed) 15 | random.seed(seed) 16 | 17 | def main(): 18 | # Setting up argument parser 19 | parser = argparse.ArgumentParser(description='Generate protein sequences using ProtGPT2 with IPEX optimization.') 20 | parser.add_argument('--max_length', type=int, default=100, help='Maximum length of generated sequence') 21 | parser.add_argument('--do_sample', type=bool, default=True, help='Whether to sample the output or not') 22 | parser.add_argument('--top_k', type=int, default=950, help='The number of highest probability vocabulary tokens to keep for top-k-filtering') 23 | parser.add_argument('--repetition_penalty', type=float, default=1.2, help='The parameter for repetition penalty. 1.0 means no penalty') 24 | parser.add_argument('--num_return_sequences', type=int, default=1, help='The number of sequences to return') 25 | parser.add_argument('--eos_token_id', type=int, default=0, help='The id of the end of sequence token') 26 | parser.add_argument('--dtype', type=str, choices=['float32', 'bfloat16'], default='float32', help='Data type for model optimization') 27 | parser.add_argument('--iterations', type=int, default=5, help='Number of iterations to run') 28 | parser.add_argument('--model_dir', type=str, default="None", help='Directory to load the protgpt2 model') 29 | parser.add_argument('--output_file', type=str, default='protgpt2_generated_sequences.txt', help='File to save the generated sequences') 30 | args = parser.parse_args() 31 | 32 | #make_deterministic() 33 | # Setting dtype 34 | dtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16 35 | model_dir = args.model_dir 36 | if args.model_dir=="None": 37 | protgpt2 = pipeline('text-generation', model="nferruz/ProtGPT2", torch_dtype=dtype) 38 | else: 39 | protgpt2 = pipeline('text-generation', model=model_dir, torch_dtype=dtype) 40 | # Generate sequences using ProtGPT2 with IPEX optimization 41 | protgpt2.model = ipex.optimize(protgpt2.model, dtype=dtype) 42 | tic = time.time() 43 | for i in range(args.iterations): 44 | print("Iteration:", i) 45 | t0 = time.time() 46 | sequences = protgpt2( 47 | "<|endoftext|>", 48 | max_length=args.max_length, 49 | do_sample=args.do_sample, 50 | top_k=args.top_k, 51 | repetition_penalty=args.repetition_penalty, 52 | num_return_sequences=args.num_return_sequences, 53 | eos_token_id=args.eos_token_id 54 | ) 55 | t1 = time.time() 56 | print('Time taken for', i, 'iteration:', t1 - t0, 'seconds') 57 | toc = time.time() 58 | print('Time taken for', args.iterations, 'iterations:', toc - tic, 'seconds') 59 | print('Average time per iteration:', (toc - tic) / args.iterations, 'seconds') 60 | 61 | # Printing the sequences and saveing them to the output folder. 
62 | with open(args.output_file, 'w') as f: 63 | for seq in sequences: 64 | f.write(seq['generated_text'] + "\n") 65 | print(f'Output saved to {args.output_file}') 66 | 67 | if __name__ == "__main__": 68 | main() 69 | 70 | -------------------------------------------------------------------------------- /applications/ProteinMPNN/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Ubuntu image as a base 2 | ARG FROM_IMAGE=ubuntu:24.04 3 | 4 | # Stage 1: Set up Conda environment 5 | ARG BASE_IMAGE=condaforge/miniforge3:24.3.0-0 6 | FROM ${BASE_IMAGE} as conda_setup 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | 9 | # Stage 2: Set up the main build environment 10 | FROM ${FROM_IMAGE} as builder 11 | ENV DEBIAN_FRONTEND=noninteractive 12 | 13 | # Install necessary build tools and clean up 14 | RUN apt-get update && apt-get install -y --no-install-recommends \ 15 | git autoconf build-essential wget vim ca-certificates numactl && \ 16 | rm -rf /var/lib/apt/lists/* && \ 17 | apt-get autoremove -y && \ 18 | apt-get clean 19 | 20 | # Non-root user setup 21 | ENV SERVICE_NAME="proteinmpnn-service" 22 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 23 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 24 | 25 | WORKDIR / 26 | RUN git clone https://github.com/dauparas/ProteinMPNN.git 27 | WORKDIR /ProteinMPNN 28 | RUN git checkout 8907e6671bfbfc92303b5f79c4b5e6ce47cdef57 29 | 30 | # Apply the patch file 31 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 32 | RUN tar -xzf Source_code_with_submodules.tar.gz 33 | RUN cp /ProteinMPNN/Open-Omics-Acceleration-Framework/applications/ProteinMPNN/ProteinMPNN.patch . 
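# Once the patch below is applied, the extracted release sources and tarball are no longer needed and are removed.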
34 | RUN git apply ProteinMPNN.patch 35 | RUN rm -rf Open-Omics-Acceleration-Framework 36 | RUN rm -rf Source_code_with_submodules.tar.gz 37 | ENV PATH="/opt/conda/bin:$PATH" 38 | 39 | # Copy Conda installation from the conda_setup stage 40 | COPY --from=conda_setup /opt/conda /opt/conda 41 | 42 | RUN conda create -n p_mpnn python=3.11 pip=24.0 43 | 44 | # Install PyTorch, Torchvision, Torchaudio 45 | RUN conda install -n p_mpnn -y pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 -c pytorch 46 | # Install Intel PyTorch extension 47 | RUN /opt/conda/envs/p_mpnn/bin/python -m pip install intel-extension-for-pytorch==2.3.100 oneccl-bind-pt==2.3.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ 48 | RUN /opt/conda/envs/p_mpnn/bin/python -m pip install numpy==1.26.0 49 | 50 | # Clone jemalloc source for building 51 | WORKDIR /ProteinMPNN 52 | RUN git clone --branch 5.3.0 https://github.com/jemalloc/jemalloc.git 53 | WORKDIR /ProteinMPNN/jemalloc 54 | RUN bash autogen.sh --prefix=/opt/conda/envs/p_mpnn/ && make install 55 | WORKDIR /ProteinMPNN 56 | RUN rm -rf jemalloc 57 | 58 | # Set up environment activation and PATH 59 | ENV PATH="/opt/conda/envs/p_mpnn/bin:/opt/conda/bin:$PATH" 60 | 61 | # Ensure all scripts inside ProteinMPNN/examples are executable 62 | RUN chmod +x /ProteinMPNN/examples/*.py 63 | RUN mkdir /outputs 64 | 65 | # Change ownership of the directory 66 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /ProteinMPNN /outputs 67 | 68 | # Switch to non-root user 69 | USER $SERVICE_NAME 70 | 71 | # Healthcheck disabled 72 | HEALTHCHECK NONE 73 | 74 | # Set environment variables for jemalloc 75 | ENV LD_PRELOAD "/opt/conda/envs/p_mpnn/lib/libjemalloc.so:$LD_PRELOAD" 76 | ENV MALLOC_CONF "oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" 77 | # Set the default command to run the application 78 | CMD source ~/.bashrc && ["/bin/bash", "python"] 79 | -------------------------------------------------------------------------------- /applications/ProteinMPNN/setup_proteinmpnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | trap 'echo "Error on line $LINENO"; exit 1;' ERR 5 | 6 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 7 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 8 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 9 | 10 | CONDA_INSTALL_DIR=$(realpath ./miniforge3) 11 | 12 | # Parse command line arguments 13 | while (( "$#" )); do 14 | case "$1" in 15 | -p) 16 | CONDA_INSTALL_DIR=$2 17 | CONDA_INSTALL_DIR=$(realpath "$CONDA_INSTALL_DIR") 18 | shift 2 19 | ;; 20 | -*|--*=) 21 | echo "Error: Unsupported flag $1" >&2 22 | exit 1 23 | ;; 24 | *) 25 | echo "Error: Unsupported argument $1" >&2 26 | exit 1 27 | ;; 28 | esac 29 | done 30 | 31 | # Check if Miniforge3 exists 32 | if [ ! -d "$CONDA_INSTALL_DIR" ]; then 33 | echo "Miniforge3 is not installed. Installing..." 34 | command -v wget >/dev/null 2>&1 || { echo "wget is required but not installed. Exiting."; exit 1; } 35 | wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh 36 | bash Miniforge3-Linux-x86_64.sh -b -p "$CONDA_INSTALL_DIR" 37 | echo "Miniforge3 installation complete." 38 | else 39 | echo "Miniforge3 is already installed at: $CONDA_INSTALL_DIR" 40 | fi 41 | 42 | export PATH="$CONDA_INSTALL_DIR/bin:$PATH" 43 | 44 | # Clone the ProteinMPNN repository 45 | if [ ! 
-d "ProteinMPNN" ]; then 46 | git clone https://github.com/dauparas/ProteinMPNN.git 47 | else 48 | echo "ProteinMPNN repository already exists, skipping git clone." 49 | fi 50 | 51 | cd ProteinMPNN 52 | git checkout 8907e6671bfbfc92303b5f79c4b5e6ce47cdef57 53 | PATCH_FILE="$ABS_DIRECTORY/ProteinMPNN.patch" 54 | if [ -f "$PATCH_FILE" ]; then 55 | if git apply --reverse --check "$PATCH_FILE" > /dev/null 2>&1; then 56 | echo "Patch has already been applied. Skipping patch step." 57 | else 58 | git apply "$PATCH_FILE" 59 | echo "Patch applied successfully." 60 | fi 61 | else 62 | echo "Error: Patch file not found at $PATCH_FILE" >&2 63 | exit 1 64 | fi 65 | 66 | # Create and activate the Conda environment 67 | #source "$CONDA_INSTALL_DIR/bin/activate" 68 | if conda env list | grep -q "^p_mpnn"; then 69 | echo "Environment exists. Moving ahead without create the env. If the setup crashes, please remove manually." 70 | else 71 | echo "Creating conda env p_mpnn.." 72 | conda create -n p_mpnn -y python=3.11 pip=24.0 73 | fi 74 | 75 | source $CONDA_INSTALL_DIR/bin/activate p_mpnn 76 | #conda activate p_mpnn 77 | 78 | conda install -n p_mpnn -y pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 -c pytorch 79 | pip install intel-extension-for-pytorch==2.3.100 80 | pip install numpy==1.26.0 81 | 82 | echo "setup complete!" 83 | echo "Note:" 84 | echo "Conda (Miniforge3) is installed at $CONDA_INSTALL_DIR" 85 | echo "To manually activate conda env, do: source $CONDA_INSTALL_DIR/bin/activate SE3nv" 86 | -------------------------------------------------------------------------------- /applications/RFdiffusion/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Ubuntu image as a base 2 | ARG FROM_IMAGE=ubuntu:24.04 3 | 4 | # Stage 1: Set up Conda environment 5 | ARG BASE_IMAGE=condaforge/miniforge3:24.3.0-0 6 | FROM ${BASE_IMAGE} as conda_setup 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | 9 | # Stage 2: Set up the main build environment 10 | FROM ${FROM_IMAGE} as builder 11 | ENV DEBIAN_FRONTEND=noninteractive 12 | 13 | # Install necessary build tools and clean up 14 | RUN apt-get update && apt-get install -y --no-install-recommends \ 15 | git build-essential wget vim ca-certificates numactl autoconf automake make && \ 16 | rm -rf /var/lib/apt/lists/* && \ 17 | apt-get autoremove -y && \ 18 | apt-get clean 19 | 20 | # Non-root user setup 21 | ENV SERVICE_NAME="rfdiffusion-service" 22 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 23 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 24 | 25 | # Copy Conda installation from the conda_setup stage 26 | COPY --from=conda_setup /opt/conda /opt/conda 27 | 28 | WORKDIR /app 29 | RUN git clone https://github.com/RosettaCommons/RFdiffusion.git 30 | 31 | WORKDIR /app/RFdiffusion 32 | # adding the git commit id 33 | RUN git checkout 820bfdfaded8c260b962dc40a3171eae316b6ce0 34 | 35 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 36 | RUN tar -xzf Source_code_with_submodules.tar.gz 37 | RUN cp /app/RFdiffusion/Open-Omics-Acceleration-Framework/applications/RFdiffusion/RFdiffusion.patch . 
38 | RUN git apply RFdiffusion.patch 39 | RUN rm -rf Open-Omics-Acceleration-Framework 40 | RUN rm -rf Source_code_with_submodules.tar.gz 41 | 42 | WORKDIR /app/RFdiffusion/models 43 | RUN wget https://files.ipd.uw.edu/pub/RFdiffusion/6f5902ac237024bdd0c176cb93063dc4/Base_ckpt.pt && \ 44 | wget https://files.ipd.uw.edu/pub/RFdiffusion/e29311f6f1bf1af907f9ef9f44b8328b/Complex_base_ckpt.pt && \ 45 | wget https://files.ipd.uw.edu/pub/RFdiffusion/60f09a193fb5e5ccdc4980417708dbab/Complex_Fold_base_ckpt.pt && \ 46 | wget https://files.ipd.uw.edu/pub/RFdiffusion/74f51cfb8b440f50d70878e05361d8f0/InpaintSeq_ckpt.pt && \ 47 | wget https://files.ipd.uw.edu/pub/RFdiffusion/76d00716416567174cdb7ca96e208296/InpaintSeq_Fold_ckpt.pt && \ 48 | wget https://files.ipd.uw.edu/pub/RFdiffusion/5532d2e1f3a4738decd58b19d633b3c3/ActiveSite_ckpt.pt && \ 49 | wget https://files.ipd.uw.edu/pub/RFdiffusion/12fc204edeae5b57713c5ad7dcb97d39/Base_epoch8_ckpt.pt && \ 50 | wget https://files.ipd.uw.edu/pub/RFdiffusion/f572d396fae9206628714fb2ce00f72e/Complex_beta_ckpt.pt && \ 51 | wget https://files.ipd.uw.edu/pub/RFdiffusion/1befcb9b28e2f778f53d47f18b7597fa/RF_structure_prediction_weights.pt 52 | 53 | WORKDIR /app/RFdiffusion 54 | 55 | RUN /opt/conda/bin/conda env create -f env/SE3nv.yml 56 | 57 | RUN git clone --branch 5.3.0 https://github.com/jemalloc/jemalloc.git 58 | WORKDIR /app/RFdiffusion/jemalloc 59 | RUN bash autogen.sh --prefix=/opt/conda/envs/SE3nv/ && make install 60 | WORKDIR /app/RFdiffusion 61 | RUN rm -rf jemalloc 62 | 63 | # Set up environment activation and PATH 64 | ENV PATH="/opt/conda/envs/SE3nv/bin:/opt/conda/bin:$PATH" 65 | 66 | # Install dependencies 67 | WORKDIR /app/RFdiffusion/env/SE3Transformer 68 | RUN pip install --no-cache-dir -r requirements.txt 69 | RUN python setup.py install 70 | 71 | WORKDIR /app/RFdiffusion 72 | RUN pip install -e . 73 | 74 | RUN tar -xvf examples/ppi_scaffolds_subset.tar.gz -C examples/ 75 | 76 | WORKDIR /app/RFdiffusion/scripts 77 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /app 78 | 79 | # Switch to non-root user 80 | USER $SERVICE_NAME 81 | HEALTHCHECK NONE 82 | 83 | ENV LD_PRELOAD "/opt/conda/envs/SE3nv/lib/libjemalloc.so:$LD_PRELOAD" 84 | ENV MALLOC_CONF "oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" 85 | CMD python 86 | -------------------------------------------------------------------------------- /applications/RFdiffusion/setup_rfdiffusion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 5 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 6 | #echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 7 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 8 | # Default Conda installation directory 9 | CONDA_INSTALL_DIR=$(realpath ./miniforge3) 10 | 11 | # Parse command line arguments 12 | while (( "$#" )); do 13 | case "$1" in 14 | -p) 15 | CONDA_INSTALL_DIR=$2 16 | CONDA_INSTALL_DIR=$(realpath "$CONDA_INSTALL_DIR") 17 | shift 2 18 | ;; 19 | -*|--*=) # Unsupported flags 20 | echo "Error: Unsupported flag $1" >&2 21 | exit 1 22 | ;; 23 | *) # Preserve positional arguments 24 | echo "Error: Unsupported argument $1" >&2 25 | exit 1 26 | ;; 27 | esac 28 | done 29 | # Check if Miniforge3 exists and install if not found 30 | if [ ! -d "$CONDA_INSTALL_DIR" ]; then 31 | echo "Miniforge3 is not installed. Installing..." 
32 | wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh 33 | bash Miniforge3-Linux-x86_64.sh -b -p "$CONDA_INSTALL_DIR" 34 | echo "Miniforge3 installation complete." 35 | else 36 | echo "Miniforge3 is already installed at: $CONDA_INSTALL_DIR" 37 | fi 38 | # Export Conda binary path 39 | export PATH="$CONDA_INSTALL_DIR/bin:$PATH" 40 | # Clone the RFdiffusion repository if it doesn't exist 41 | if [ ! -d "RFdiffusion" ]; then 42 | git clone https://github.com/RosettaCommons/RFdiffusion.git 43 | else 44 | echo "RFdiffusion repository already exists, skipping git clone." 45 | fi 46 | 47 | echo "$CONDA_INSTALL_DIR" 48 | # Apply patch (assuming patch file is RFdiffusion.patch and it should be applied in RFdiffusion directory) 49 | cd RFdiffusion 50 | git checkout 820bfdfaded8c260b962dc40a3171eae316b6ce0 51 | git log -1 52 | PATCH_FILE="$ABS_DIRECTORY/RFdiffusion.patch" 53 | echo $PATCH_FILE 54 | if [ -f "$PATCH_FILE" ]; then 55 | # Check if the patch is already applied 56 | if git apply --reverse --check "$PATCH_FILE" > /dev/null 2>&1; then 57 | echo "Patch has already been applied. Skipping patch step." 58 | else 59 | git apply "$PATCH_FILE" 60 | echo "Patch applied successfully." 61 | fi 62 | else 63 | echo "Error: Patch file not found at $PATCH_FILE" >&2 64 | exit 1 65 | fi 66 | mkdir -p models 67 | cd models/ 68 | wget https://files.ipd.uw.edu/pub/RFdiffusion/6f5902ac237024bdd0c176cb93063dc4/Base_ckpt.pt 69 | wget https://files.ipd.uw.edu/pub/RFdiffusion/e29311f6f1bf1af907f9ef9f44b8328b/Complex_base_ckpt.pt 70 | wget https://files.ipd.uw.edu/pub/RFdiffusion/60f09a193fb5e5ccdc4980417708dbab/Complex_Fold_base_ckpt.pt 71 | wget https://files.ipd.uw.edu/pub/RFdiffusion/74f51cfb8b440f50d70878e05361d8f0/InpaintSeq_ckpt.pt 72 | wget https://files.ipd.uw.edu/pub/RFdiffusion/76d00716416567174cdb7ca96e208296/InpaintSeq_Fold_ckpt.pt 73 | wget https://files.ipd.uw.edu/pub/RFdiffusion/5532d2e1f3a4738decd58b19d633b3c3/ActiveSite_ckpt.pt 74 | wget https://files.ipd.uw.edu/pub/RFdiffusion/12fc204edeae5b57713c5ad7dcb97d39/Base_epoch8_ckpt.pt 75 | # Optional: 76 | wget https://files.ipd.uw.edu/pub/RFdiffusion/f572d396fae9206628714fb2ce00f72e/Complex_beta_ckpt.pt 77 | # original structure prediction weights 78 | wget https://files.ipd.uw.edu/pub/RFdiffusion/1befcb9b28e2f778f53d47f18b7597fa/RF_structure_prediction_weights.pt 79 | cd ../ 80 | # Create and activate the Conda environment using the YAML file, disabling plugins to avoid errors 81 | #CONDA_NO_PLUGINS=true 82 | if conda env list | grep -q "^SE3nv"; then 83 | echo "Environment exists. Moving ahead without create the env. If the setup crashes, please remove manually." 84 | else 85 | echo "Creating conda env SE3nv.." 86 | conda env create -f env/SE3nv.yml 87 | fi 88 | source $CONDA_INSTALL_DIR/bin/activate SE3nv 89 | #conda init 90 | #conda activate SE3nv 91 | 92 | # Install SE3Transformer requirements 93 | cd env/SE3Transformer 94 | pip install --no-cache-dir -r requirements.txt 95 | python setup.py install 96 | 97 | # Install the rfdiffusion module 98 | cd ../.. # Change into the root directory of the repository 99 | pip install -e . 
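# Illustrative smoke test (not run by this script; see the RFdiffusion README for the full set of inference options):
# with the SE3nv environment active, an unconditional design can be generated from the repository root, e.g.
#   ./scripts/run_inference.py 'contigmap.contigs=[150-150]' inference.output_prefix=test_outputs/test inference.num_designs=1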
100 | 101 | echo "" 102 | echo "Note:" 103 | echo "Conda (Miniforge3) is installed at $CONDA_INSTALL_DIR" 104 | echo "To manually activate conda env, do: source $CONDA_INSTALL_DIR/bin/activate SE3nv" 105 | -------------------------------------------------------------------------------- /applications/boltz/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Ubuntu image as a base 2 | ARG FROM_IMAGE=ubuntu:24.04 3 | 4 | # Stage 1: Set up Conda environment 5 | ARG BASE_IMAGE=condaforge/miniforge3:24.3.0-0 6 | FROM ${BASE_IMAGE} as conda_setup 7 | ENV DEBIAN_FRONTEND=noninteractive 8 | 9 | # Stage 2: Set up the main build environment 10 | FROM ${FROM_IMAGE} as builder 11 | ENV DEBIAN_FRONTEND=noninteractive 12 | 13 | # Install necessary build tools and clean up 14 | RUN apt-get update && apt-get install -y --no-install-recommends \ 15 | git build-essential wget vim ca-certificates numactl autoconf automake make && \ 16 | rm -rf /var/lib/apt/lists/* && \ 17 | apt-get autoremove -y && \ 18 | apt-get clean 19 | 20 | # Build arguments for host UID/GID 21 | ARG USER_ID=2000 22 | ARG GROUP_ID=2000 23 | 24 | ENV SERVICE_NAME="boltz-service" 25 | 26 | # Create a user and group with same UID and GID as host 27 | RUN groupadd --gid ${GROUP_ID} $SERVICE_NAME && \ 28 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid ${USER_ID} $SERVICE_NAME 29 | 30 | # Copy Conda installation from the conda_setup stage 31 | COPY --from=conda_setup /opt/conda /opt/conda 32 | ENV PATH="/opt/conda/bin:$PATH" 33 | ENV LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH:-}" 34 | RUN echo "source activate" >> ~/.bashrc 35 | 36 | RUN git clone --branch 5.3.0 https://github.com/jemalloc/jemalloc.git 37 | WORKDIR /jemalloc 38 | RUN bash autogen.sh --prefix=/opt/conda/ && make install 39 | WORKDIR / 40 | RUN rm -rf jemalloc 41 | 42 | WORKDIR /app 43 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /app 44 | RUN git clone --branch v0.4.1 https://github.com/jwohlwend/boltz.git 45 | 46 | WORKDIR /app/boltz 47 | RUN pip install -e . 48 | 49 | # Switch to non-root user 50 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /app/boltz 51 | COPY ./entrypoint.sh entrypoint.sh 52 | RUN chmod +x entrypoint.sh 53 | 54 | USER $SERVICE_NAME 55 | 56 | # bin bash 57 | # Clone Boltz 1 repository (replace with the actual repo URL) 58 | ENTRYPOINT ["/app/boltz/entrypoint.sh"] 59 | 60 | # Default command 61 | CMD ["default"] -------------------------------------------------------------------------------- /applications/boltz/README.md: -------------------------------------------------------------------------------- 1 | ## 🔍 Running Inference with Boltz Docker 2 | 3 | Follow the steps below to run inference using the Boltz Docker container: 4 | 5 | --- 6 | 7 | ### 🐳 1. Build the Docker Image 8 | 9 | From the root of the project directory, build the Docker image: 10 | 11 | ```bash 12 | docker build -t boltz1 . 13 | ``` 14 | 15 | --- 16 | 17 | ### 📁 2. Create and Set Output Directory Permissions 18 | 19 | Create an output folder and give it proper write permissions: 20 | 21 | ```bash 22 | mkdir -p 23 | chmod a+w 24 | 25 | export OUTPUT=$PWD/ 26 | export MODELS=$PWD/ 27 | export INPUT=$PWD/ 28 | ``` 29 | 30 | > ⚠️ Docker needs write permissions in the `` and `` folder. 
`` is the folder contaning the input `.yaml` or `.fasta` file 31 | 32 | Example 33 | 34 | ```bash 35 | mkdir -p ./output ./model 36 | chmod a+w ./output ./model 37 | 38 | export OUTPUT=$PWD/output 39 | export MODELS=$PWD/model 40 | export INPUT=$PWD/examples/ 41 | ``` 42 | 43 | --- 44 | 45 | ### 🚀 3. Run Inference 46 | 47 | In order to do inferencing few things needs to be done 48 | Mount the volumes for input folder and output folder. Pass the mounted volumes to boltz as arguments. So the docker run command looks like 49 | 50 | ```bash 51 | docker run -it \ 52 | --shm-size=100g \ 53 | -v $INPUT:/app/boltz/input \ 54 | -v $MODELS:/home/boltz-service/.boltz/ \ 55 | -v $OUTPUT:/app/boltz/output \ 56 | boltz1 57 | ``` 58 | 59 | > 📝 The `--shm-size=100g` flag avoids shared memory issues during data loading with PyTorch. 60 | 61 | --- 62 | 63 | ### ✅ Output 64 | 65 | Results will be written to the folder. 66 | 67 | Boltz currently accepts three input formats: 68 | 69 | 1. Fasta file, for most use cases 70 | 71 | 2. A comprehensive YAML schema, for more complex use cases 72 | 73 | 3. A directory containing files of the above formats, for batched processing 74 | 75 | ## For more information checkout [boltz](https://github.com/jwohlwend/boltz) 76 | 77 | ## License 78 | 79 | Our model and code are released under MIT License, and can be freely used for both academic and commercial purposes. 80 | 81 | 82 | ## Cite 83 | 84 | If you use this code or the models in your research, please cite the following paper: 85 | 86 | ```bibtex 87 | @article{wohlwend2024boltz1, 88 | author = {Wohlwend, Jeremy and Corso, Gabriele and Passaro, Saro and Reveiz, Mateo and Leidal, Ken and Swiderski, Wojtek and Portnoi, Tally and Chinn, Itamar and Silterra, Jacob and Jaakkola, Tommi and Barzilay, Regina}, 89 | title = {Boltz-1: Democratizing Biomolecular Interaction Modeling}, 90 | year = {2024}, 91 | doi = {10.1101/2024.11.19.624167}, 92 | journal = {bioRxiv} 93 | } 94 | ``` 95 | 96 | In addition if you use the automatic MSA generation, please cite: 97 | 98 | ```bibtex 99 | @article{mirdita2022colabfold, 100 | title={ColabFold: making protein folding accessible to all}, 101 | author={Mirdita, Milot and Sch{\"u}tze, Konstantin and Moriwaki, Yoshitaka and Heo, Lim and Ovchinnikov, Sergey and Steinegger, Martin}, 102 | journal={Nature methods}, 103 | year={2022}, 104 | } 105 | ``` 106 | -------------------------------------------------------------------------------- /applications/boltz/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | INPUT_DIR="/app/boltz/input" 4 | OUTPUT_DIR="/app/boltz/output" 5 | CUDA_VISIBLE_DEVICES="" 6 | 7 | FOUND=0 8 | 9 | for INPUT_FILE in "$INPUT_DIR"/*; do 10 | # Only process .yaml or .fasta files 11 | if [[ "$INPUT_FILE" == *.yaml || "$INPUT_FILE" == *.fasta ]]; then 12 | echo "📂 Processing: $INPUT_FILE" 13 | LD_PRELOAD=/opt/conda/lib/libjemalloc.so:$LD_PRELOAD \ 14 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" \ 15 | boltz predict "$INPUT_FILE" --out_dir "$OUTPUT_DIR" --accelerator "cpu" 16 | FOUND=1 17 | fi 18 | done 19 | 20 | if [[ $FOUND -eq 0 ]]; then 21 | echo "❌ No .yaml or .fasta files found in $INPUT_DIR" 22 | exit 1 23 | fi 24 | -------------------------------------------------------------------------------- /applications/esm/Dockerfile.base: -------------------------------------------------------------------------------- 1 | ARG 
FROM_IMAGE=ubuntu:24.04 2 | 3 | # Stage 2: Set up the main build environment 4 | FROM ${FROM_IMAGE} as builder 5 | 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | 8 | ARG http_proxy 9 | ENV http_proxy=${http_proxy} 10 | 11 | ARG https_proxy 12 | ENV https_proxy=${https_proxy} 13 | 14 | ARG no_proxy 15 | ENV no_proxy=${no_proxy} 16 | 17 | 18 | # Install necessary build tools and clean up 19 | RUN apt-get update && apt-get install -y --no-install-recommends \ 20 | git build-essential wget vim ca-certificates autoconf automake make numactl && \ 21 | rm -rf /var/lib/apt/lists/* && \ 22 | apt-get autoremove -y && \ 23 | apt-get clean 24 | ENV SERVICE_NAME="esm-base-service" 25 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 26 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 27 | 28 | WORKDIR /app 29 | 30 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /app 31 | USER $SERVICE_NAME 32 | 33 | RUN wget --no-check-certificate "https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-$(uname)-$(uname -m).sh" 34 | RUN bash Miniforge3-$(uname)-$(uname -m).sh -b -p "${HOME}/conda" 35 | 36 | WORKDIR /app 37 | 38 | RUN git clone --recursive https://github.com/facebookresearch/esm.git 39 | WORKDIR /app/esm 40 | RUN git checkout -b esm 2b369911bb5b4b0dda914521b9475cad1656b2ac 41 | 42 | 43 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 44 | RUN tar -xzf Source_code_with_submodules.tar.gz 45 | 46 | RUN mv /app/esm/Open-Omics-Acceleration-Framework/applications/esm /app/esm/omics_setup && \ 47 | git apply /app/esm/omics_setup/esm_change_all.patch && \ 48 | rm -rf /app/esm/Open-Omics-Acceleration-Framework && \ 49 | rm -rf /app/esm/Source_code_with_submodules.tar.gz 50 | 51 | RUN mkdir -p /home/esm-base-service/.cache/torch/hub/ && \ 52 | rm -rf /home/esm-base-service/.cache/torch/hub/checkpoints && \ 53 | ln -s /checkpoints /home/esm-base-service/.cache/torch/hub/checkpoints 54 | 55 | HEALTHCHECK NONE 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /applications/esm/Dockerfile.esm: -------------------------------------------------------------------------------- 1 | # Accept the base image name as an argument 2 | ARG BASE_IMAGE 3 | 4 | # Extend from the base image passed as an argument 5 | FROM ${BASE_IMAGE} 6 | 7 | USER $SERVICE_NAME 8 | # Create Conda Env for ESM 9 | WORKDIR /app/esm/ 10 | RUN ${HOME}/conda/bin/mamba env create -f /app/esm/omics_setup/env.yml 11 | 12 | WORKDIR /app/esm 13 | RUN bash -c "source ${HOME}/conda/etc/profile.d/conda.sh && \ 14 | source ${HOME}/conda/etc/profile.d/mamba.sh && \ 15 | mamba activate esm_py11 && \ 16 | pip install . 
&&\ 17 | pip install torch==2.4.0+cpu torchvision==0.19.0+cpu torchaudio==2.4.0+cpu --index-url https://download.pytorch.org/whl/cpu" 18 | 19 | RUN echo "#!/bin/bash" >> /app/init.sh && \ 20 | echo "source ${HOME}/conda/etc/profile.d/conda.sh" >> /app/init.sh && \ 21 | echo "source ${HOME}/conda/etc/profile.d/mamba.sh" >> /app/init.sh && \ 22 | echo "mamba activate esm_py11" >> /app/init.sh && \ 23 | chmod +x /app/init.sh && \ 24 | echo "source /app/init.sh" >> ~/.bashrc 25 | 26 | WORKDIR /app/esm/examples/lm-design/ 27 | RUN wget https://dl.fbaipublicfiles.com/fair-esm/examples/lm_design/linear_projection_model.pt 28 | 29 | 30 | WORKDIR /app/esm/ 31 | HEALTHCHECK NONE 32 | 33 | RUN echo '#!/bin/bash' > /app/entrypoint.sh && \ 34 | echo 'if [ -z "$1" ]; then' >> /app/entrypoint.sh && \ 35 | echo ' exec /bin/bash' >> /app/entrypoint.sh && \ 36 | echo 'else' >> /app/entrypoint.sh && \ 37 | echo ' source /app/init.sh' >> /app/entrypoint.sh && \ 38 | echo ' exec "$@"' >> /app/entrypoint.sh && \ 39 | echo 'fi' >> /app/entrypoint.sh && \ 40 | chmod +x /app/entrypoint.sh 41 | 42 | ENTRYPOINT ["/app/entrypoint.sh"] 43 | CMD [] 44 | -------------------------------------------------------------------------------- /applications/esm/Dockerfile.esmfold: -------------------------------------------------------------------------------- 1 | # Accept the base image name as an argument 2 | ARG BASE_IMAGE 3 | 4 | # Extend from the base image passed as an argument 5 | FROM ${BASE_IMAGE} 6 | 7 | USER $SERVICE_NAME 8 | # Clone Openfold 9 | WORKDIR /app/esm 10 | RUN git clone https://github.com/aqlaboratory/openfold.git 11 | WORKDIR /app/esm/openfold 12 | RUN git checkout -b esm_openfold 4b41059694619831a7db195b7e0988fc4ff3a307 && \ 13 | git apply /app/esm/omics_setup/esm_openfold_change_py37.patch 14 | 15 | WORKDIR /app/esm 16 | RUN ${HOME}/conda/bin/mamba env create -f environment.yml 17 | RUN bash -c "source ${HOME}/conda/etc/profile.d/conda.sh && \ 18 | source ${HOME}/conda/etc/profile.d/mamba.sh && \ 19 | mamba activate esmfold && \ 20 | pip install . 
" 21 | 22 | WORKDIR /app/esm/openfold 23 | RUN bash -c "source ${HOME}/conda/etc/profile.d/conda.sh && \ 24 | source ${HOME}/conda/etc/profile.d/mamba.sh && \ 25 | mamba activate esmfold && \ 26 | python setup.py install " 27 | 28 | WORKDIR /app/esm 29 | RUN git clone --branch 5.3.0 https://github.com/jemalloc/jemalloc.git 30 | WORKDIR /app/esm/jemalloc 31 | RUN bash autogen.sh --prefix=${HOME}/conda/envs/esmfold/ && make install && \ 32 | rm -rf /app/esm/jemalloc 33 | 34 | RUN echo "#!/bin/bash" >> /app/init.sh && \ 35 | echo "source ${HOME}/conda/etc/profile.d/conda.sh" >> /app/init.sh && \ 36 | echo "source ${HOME}/conda/etc/profile.d/mamba.sh" >> /app/init.sh && \ 37 | echo "mamba activate esmfold" >> /app/init.sh && \ 38 | chmod +x /app/init.sh && \ 39 | echo "source /app/init.sh" >> ~/.bashrc 40 | 41 | WORKDIR /app/esm/ 42 | HEALTHCHECK NONE 43 | 44 | ENV LD_PRELOAD "/home/esm-base-service/conda/envs/esmfold/lib/libjemalloc.so:$LD_PRELOAD" 45 | ENV MALLOC_CONF "oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" 46 | 47 | RUN echo '#!/bin/bash' > /app/entrypoint.sh && \ 48 | echo 'if [ -z "$1" ]; then' >> /app/entrypoint.sh && \ 49 | echo ' exec /bin/bash' >> /app/entrypoint.sh && \ 50 | echo 'else' >> /app/entrypoint.sh && \ 51 | echo ' source /app/init.sh' >> /app/entrypoint.sh && \ 52 | echo ' exec "$@"' >> /app/entrypoint.sh && \ 53 | echo 'fi' >> /app/entrypoint.sh && \ 54 | chmod +x /app/entrypoint.sh 55 | 56 | 57 | ENTRYPOINT ["/app/entrypoint.sh"] 58 | CMD [] 59 | 60 | -------------------------------------------------------------------------------- /applications/esm/build_docker_images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Constants 4 | BASE_IMAGE=esm_base_image 5 | ESM_IMAGE=esm_image 6 | ESMFOLD_IMAGE=esmfold_image 7 | 8 | # Function to check if a command exists 9 | command_exists() { 10 | command -v "$1" >/dev/null 2>&1 11 | } 12 | 13 | # Function to check if a Docker image exists 14 | image_exists() { 15 | local image_name="$1" 16 | $runtime images -q "$image_name" | grep -q . 17 | } 18 | 19 | # Parse optional proxy args from command-line 20 | for arg in "$@"; do 21 | case $arg in 22 | --http_proxy=*) 23 | http_proxy="${arg#*=}" 24 | ;; 25 | --https_proxy=*) 26 | https_proxy="${arg#*=}" 27 | ;; 28 | --no_proxy=*) 29 | no_proxy="${arg#*=}" 30 | ;; 31 | *) 32 | echo "Unknown option: $arg" 33 | exit 1 34 | ;; 35 | esac 36 | done 37 | 38 | runtime=docker 39 | if ! command_exists "$runtime"; then 40 | echo "$runtime is not installed on your system. Please install it first." 41 | exit 1 42 | fi 43 | 44 | build_esm=false 45 | build_esmfold=false 46 | 47 | # Prompt for tasks to build 48 | echo "Which images do you want to build?" 49 | echo "1 esm" 50 | echo "2 esm_fold" 51 | echo "3 Both esm and esm_fold" 52 | read -r task_option 53 | 54 | case $task_option in 55 | 1) build_esm=true ;; 56 | 2) build_esmfold=true ;; 57 | 3) build_esm=true; build_esmfold=true ;; 58 | *) echo "Invalid option selected. 
Please choose 1, 2, or 3."; exit 1 ;; 59 | esac 60 | 61 | echo "build_esm = ${build_esm}" 62 | echo "build_esmfold = ${build_esmfold}" 63 | 64 | # Function to build Docker image 65 | build_image() { 66 | local image_name="$1" 67 | local dockerfile="$2" 68 | local args=(--build-arg BASE_IMAGE=$BASE_IMAGE) 69 | 70 | [[ -n "$http_proxy" ]] && args+=(--build-arg http_proxy=$http_proxy) 71 | [[ -n "$https_proxy" ]] && args+=(--build-arg https_proxy=$https_proxy) 72 | [[ -n "$no_proxy" ]] && args+=(--build-arg no_proxy=$no_proxy) 73 | 74 | $runtime build "${args[@]}" -f "$dockerfile" -t "$image_name" . 75 | } 76 | 77 | # Build base image 78 | if ! image_exists "$BASE_IMAGE"; then 79 | echo "Building base image..." 80 | build_image "$BASE_IMAGE" "Dockerfile.base" 81 | fi 82 | 83 | # Build esm image 84 | if $build_esm && ! image_exists "$ESM_IMAGE"; then 85 | echo "Building image for esm..." 86 | build_image "$ESM_IMAGE" "Dockerfile.esm" 87 | fi 88 | 89 | # Build esm_fold image 90 | if $build_esmfold && ! image_exists "$ESMFOLD_IMAGE"; then 91 | echo "Building image for esm_fold..." 92 | build_image "$ESMFOLD_IMAGE" "Dockerfile.esmfold" 93 | fi 94 | 95 | echo "Build process completed." 96 | -------------------------------------------------------------------------------- /applications/esm/env.yml: -------------------------------------------------------------------------------- 1 | name: esm_py11 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.11 6 | - pip 7 | - pip: 8 | - numpy==1.26.4 9 | - hydra-core==1.3.2 10 | - nltk==3.8.1 11 | - py3Dmol==2.3.0 12 | - biotite==0.41.2 13 | - torch-geometric==2.5.3 14 | - dm-tree==0.1.8 15 | 16 | -------------------------------------------------------------------------------- /applications/esm3/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Intel Corporation 2 | # SPDX-License-Identifier: MIT License 3 | 4 | ARG FROM_IMAGE=ubuntu:25.10 5 | 6 | FROM ${FROM_IMAGE} AS builder 7 | 8 | ENV DEBIAN_FRONTEND=noninteractive 9 | 10 | RUN apt-get update && apt-get install -y --no-install-recommends \ 11 | git build-essential wget vim ca-certificates autoconf automake make numactl && \ 12 | rm -rf /var/lib/apt/lists/* && \ 13 | apt-get autoremove -y && \ 14 | apt-get clean 15 | ENV SERVICE_NAME="esm3-base-service" 16 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 17 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 18 | 19 | WORKDIR /app 20 | 21 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /app 22 | USER $SERVICE_NAME 23 | 24 | RUN wget --no-check-certificate "https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-$(uname)-$(uname -m).sh" 25 | RUN bash Miniforge3-$(uname)-$(uname -m).sh -b -p "${HOME}/conda" 26 | 27 | WORKDIR /app 28 | COPY . 
/app 29 | RUN git clone https://github.com/evolutionaryscale/esm.git 30 | WORKDIR /app/esm 31 | RUN git checkout -b esm d40007ea16850da4fbf60244a9d50c2a94cbef3d 32 | RUN cp /app/esm3_changes.patch /app/esm/esm_changes.patch 33 | RUN git apply /app/esm/esm_changes.patch 34 | 35 | WORKDIR /app/esm/ 36 | RUN ${HOME}/conda/bin/conda update -y --all 37 | RUN ${HOME}/conda/bin/mamba update -y --all 38 | RUN ${HOME}/conda/bin/mamba env create -f /app/env.yml 39 | ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu 40 | 41 | WORKDIR /app/esm 42 | RUN bash -c "source ${HOME}/conda/etc/profile.d/conda.sh && \ 43 | source ${HOME}/conda/etc/profile.d/mamba.sh && \ 44 | mamba activate esm3 && \ 45 | pip install --upgrade pip setuptools wheel && \ 46 | pip install ." 47 | 48 | 49 | RUN echo "#!/bin/bash" > /app/init.sh && \ 50 | echo "source ${HOME}/conda/etc/profile.d/conda.sh" >> /app/init.sh && \ 51 | echo "source ${HOME}/conda/etc/profile.d/mamba.sh" >> /app/init.sh && \ 52 | echo "mamba activate esm3" >> /app/init.sh && \ 53 | chmod +x /app/init.sh && \ 54 | echo "source /app/init.sh" >> ~/.bashrc 55 | 56 | WORKDIR /app/esm 57 | 58 | RUN echo '#!/bin/bash' > /app/entrypoint.sh && \ 59 | echo 'if [ -z "$1" ]; then' >> /app/entrypoint.sh && \ 60 | echo ' exec /bin/bash' >> /app/entrypoint.sh && \ 61 | echo 'else' >> /app/entrypoint.sh && \ 62 | echo ' source /app/init.sh' >> /app/entrypoint.sh && \ 63 | echo ' exec "$@"' >> /app/entrypoint.sh && \ 64 | echo 'fi' >> /app/entrypoint.sh && \ 65 | chmod +x /app/entrypoint.sh 66 | 67 | ENTRYPOINT ["/app/entrypoint.sh"] 68 | CMD [] 69 | HEALTHCHECK NONE 70 | 71 | WORKDIR /app 72 | ENV HF_HOME=/home/esm3-base-service/.cache/torch 73 | 74 | RUN mkdir -p /home/esm3-base-service/.cache/torch && \ 75 | ln -s /models /home/esm3-base-service/.cache/torch/hub 76 | -------------------------------------------------------------------------------- /applications/esm3/env.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Intel Corporation 2 | # SPDX-License-Identifier: MIT License 3 | 4 | name: esm3 5 | channels: 6 | - conda-forge 7 | dependencies: 8 | - python=3.10.16 9 | - mamba 10 | - pip 11 | -------------------------------------------------------------------------------- /applications/esm3/esm3_changes.patch: -------------------------------------------------------------------------------- 1 | diff --git a/esm/models/esm3.py b/esm/models/esm3.py 2 | index cbe02dd..b6df418 100644 3 | --- a/esm/models/esm3.py 4 | +++ b/esm/models/esm3.py 5 | @@ -227,7 +227,10 @@ class ESM3(nn.Module, ESM3InferenceClient): 6 | 7 | @classmethod 8 | def from_pretrained( 9 | - cls, model_name: str = ESM3_OPEN_SMALL, device: torch.device | None = None 10 | + cls, 11 | + model_name: str = ESM3_OPEN_SMALL, 12 | + device: torch.device | None = None, 13 | + bf16: bool = False # Add bf16 argument 14 | ) -> ESM3: 15 | from esm.pretrained import load_local_model 16 | 17 | @@ -236,7 +239,14 @@ class ESM3(nn.Module, ESM3InferenceClient): 18 | raise ValueError(f"Model name {model_name} is not a valid ESM3 model name.") 19 | if device is None: 20 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 21 | + # Determine dtype based on bf16 flag 22 | + if bf16: 23 | + dtype = torch.bfloat16 24 | + else: 25 | + dtype = torch.float32 26 | + print("Selected data type:", dtype) 27 | model = load_local_model(model_name, device=device) 28 | + model = model.to(dtype) 29 | if device.type != "cpu": 30 | model = 
model.to(torch.bfloat16) 31 | assert isinstance(model, ESM3) 32 | diff --git a/esm/models/esmc.py b/esm/models/esmc.py 33 | index 0807a21..e93085e 100644 34 | --- a/esm/models/esmc.py 35 | +++ b/esm/models/esmc.py 36 | @@ -77,13 +77,23 @@ class ESMC(nn.Module, ESMCInferenceClient): 37 | 38 | @classmethod 39 | def from_pretrained( 40 | - cls, model_name: str = ESMC_600M, device: torch.device | None = None 41 | + cls, model_name: str = ESMC_600M, device: torch.device | None = None, 42 | + bf16: bool = False, # Add bf16 argument 43 | ) -> ESMC: 44 | from esm.pretrained import load_local_model 45 | 46 | if device is None: 47 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 48 | + # Determine dtype based on bf16 flag 49 | + if bf16: 50 | + dtype = torch.bfloat16 51 | + else: 52 | + dtype = torch.float32 53 | + 54 | + print("Selected data type ESMC :", dtype) 55 | + print("model_name ESMC",model_name) 56 | model = load_local_model(model_name, device=device) 57 | + model = model.to(dtype) 58 | if device.type != "cpu": 59 | model = model.to(torch.bfloat16) 60 | assert isinstance(model, ESMC) 61 | diff --git a/esm/utils/structure/affine3d.py b/esm/utils/structure/affine3d.py 62 | index 382abcd..db201d6 100644 63 | --- a/esm/utils/structure/affine3d.py 64 | +++ b/esm/utils/structure/affine3d.py 65 | @@ -124,7 +124,7 @@ class RotationMatrix(Rotation): 66 | with fp32_autocast_context(self.device.type): 67 | if self._rots.shape[-3] == 1: 68 | # This is a slight speedup over einsum for batched rotations 69 | - return p @ self._rots.transpose(-1, -2).squeeze(-3) 70 | + return p.float() @ self._rots.float().transpose(-1, -2).squeeze(-3) 71 | else: 72 | # einsum way faster than bmm! 73 | return torch.einsum("...ij,...j", self._rots, p) 74 | index 0cc7bd9..1ee7ecb 100644 75 | --- a/pyproject.toml 76 | +++ b/pyproject.toml 77 | @@ -24,7 +24,7 @@ dependencies = [ 78 | "torch>=2.2.0", 79 | "torchvision", 80 | "torchtext", 81 | - "transformers<4.48.2", 82 | + "transformers<4.52.2", 83 | "ipython", 84 | "einops", 85 | "biotite==0.41.2", 86 | -------------------------------------------------------------------------------- /applications/esm3/scripts/ESM3_chain_of_thought.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Intel Corporation 2 | # SPDX-License-Identifier: MIT License 3 | 4 | import os 5 | import torch 6 | import argparse 7 | import time 8 | import pandas as pd 9 | from esm.models.esm3 import ESM3 10 | from esm.sdk.api import ESM3InferenceClient, ESMProtein, ESMProteinTensor 11 | from esm.sdk.api import GenerationConfig 12 | from esm.utils.types import FunctionAnnotation 13 | 14 | def get_sample_protein_from_csv(csv_file: str, sequence_length: int = None) -> ESMProtein: 15 | df = pd.read_csv(csv_file, sep="\t") 16 | if not {"Label", "Start", "End"}.issubset(df.columns): 17 | raise ValueError("CSV file must contain 'Label', 'Start', and 'End' columns") 18 | 19 | function_annotations = [ 20 | FunctionAnnotation(label=row["Label"], start=int(row["Start"]), end=int(row["End"])) 21 | for _, row in df.iterrows() 22 | ] 23 | 24 | # Determine sequence length 25 | sequence_length = sequence_length or max(df["End"], default=100) 26 | 27 | protein = ESMProtein(sequence="_" * sequence_length) 28 | protein.function_annotations = function_annotations 29 | return protein 30 | 31 | def chain_of_thought(client: ESM3InferenceClient, csv_path: str, output_dir: str, args): 32 | cot_protein = get_sample_protein_from_csv(csv_path, 
args.sequence_length) 33 | enable_autocast = args.bf16 34 | device_type = "cpu" 35 | 36 | with torch.amp.autocast(device_type=device_type, enabled=enable_autocast): 37 | cot_protein.sequence = "_" * len(cot_protein.sequence) 38 | cot_protein.coordinates = None 39 | cot_protein.sasa = None 40 | cot_protein_tensor = client.encode(cot_protein) 41 | 42 | # Generate different properties using command-line args 43 | for cot_track in ["secondary_structure", "structure", "sequence"]: 44 | cot_protein_tensor = client.generate( 45 | cot_protein_tensor, 46 | GenerationConfig( 47 | track=cot_track, 48 | schedule=args.schedule, 49 | num_steps=args.num_steps, 50 | strategy=args.strategy, 51 | temperature=args.temperature, 52 | top_p=args.top_p, 53 | condition_on_coordinates_only=args.condition_on_coordinates_only 54 | ), 55 | ) 56 | 57 | assert isinstance(cot_protein_tensor, ESMProteinTensor) 58 | cot_protein = client.decode(cot_protein_tensor) 59 | assert isinstance(cot_protein, ESMProtein) 60 | 61 | csv_name = os.path.splitext(os.path.basename(csv_path))[0] 62 | output_pdb_path = os.path.join(output_dir, f"{csv_name}.pdb") 63 | cot_protein.to_pdb(output_pdb_path) 64 | print(f"Saved output to {output_pdb_path}") 65 | 66 | def main(args): 67 | 68 | if not os.path.exists(args.csv_file) or not args.csv_file.endswith(".csv"): 69 | print(f"Invalid CSV file: {args.csv_file}") 70 | return 71 | os.makedirs(args.output_dir, exist_ok=True) 72 | 73 | client = ESM3InferenceClient() if os.getenv("ESM_API_KEY") else ESM3.from_pretrained( 74 | "esm3_sm_open_v1", bf16=args.bf16 75 | ) 76 | 77 | infer_time = time.time() 78 | chain_of_thought(client, args.csv_file, args.output_dir, args) 79 | if args.timing: 80 | print(f"Inference time = {time.time() - infer_time} seconds") 81 | 82 | if __name__ == "__main__": 83 | parser = argparse.ArgumentParser(description="Run ESM3 protein inverse folding on a CSV file.") 84 | parser.add_argument("csv_file", type=str, help="Path to input CSV file.") 85 | parser.add_argument("output_dir", type=str, help="Directory to save the output PDB file.") 86 | parser.add_argument("--bf16", action="store_true", help="Enable bf16 inference.") 87 | parser.add_argument("--timing", action="store_true", help="Enable timing for inference.") 88 | parser.add_argument("--schedule", type=str, choices=["cosine", "linear"], default="cosine", help="Schedule type.") 89 | parser.add_argument("--strategy", type=str, choices=["random", "entropy"], default="entropy", help="Unmasking strategy.") 90 | parser.add_argument("--num_steps", type=int, default=1, help="Number of steps for generation.") 91 | parser.add_argument("--temperature", type=float, default=1.0, help="Sampling temperature.") 92 | parser.add_argument("--top_p", type=float, default=1.0, help="Top-p sampling value.") 93 | parser.add_argument("--condition_on_coordinates_only", action="store_true", help="Condition only on coordinates.") 94 | parser.add_argument("--sequence_length", type=int, help="Custom sequence length (optional).") 95 | 96 | args = parser.parse_args() 97 | os.makedirs(args.output_dir, exist_ok=True) 98 | if args.timing: 99 | start_time = time.time() 100 | main(args) 101 | if args.timing: 102 | print(f"Complete run time = {time.time() - start_time:.2f} seconds") 103 | -------------------------------------------------------------------------------- /applications/esm3/scripts/ESM3_function_prediction_task.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Intel Corporation 2 | # 
SPDX-License-Identifier: MIT License 3 | 4 | import os 5 | import torch 6 | import argparse 7 | from esm.models.esm3 import ESM3 8 | from esm.sdk.api import ESM3InferenceClient, ESMProtein 9 | from esm.sdk.api import GenerationConfig 10 | import time 11 | import csv 12 | from esm.utils.structure.protein_complex import ProteinComplex 13 | 14 | def function_protein(client: ESM3InferenceClient, protein,pdb_file: str, output_dir: str, args): 15 | """Runs protein function prediction and saves the output as a CSV file.""" 16 | print(f"Processing {pdb_file}...") 17 | 18 | 19 | with torch.amp.autocast(device_type="cpu", enabled=args.bf16): 20 | protein.function_annotations = None 21 | 22 | # Inline GenerationConfig inside client.generate() 23 | protein_with_function = client.generate( 24 | protein, 25 | GenerationConfig( 26 | track="function", 27 | schedule=args.schedule, 28 | strategy=args.strategy, 29 | num_steps=args.num_steps, 30 | temperature=args.temperature, 31 | temperature_annealing=args.temperature_annealing, 32 | top_p=args.top_p, 33 | condition_on_coordinates_only=args.condition_on_coordinates_only, 34 | ), 35 | ) 36 | 37 | assert isinstance(protein_with_function, ESMProtein), f"Unexpected output: {protein_with_function}" 38 | output_csv = os.path.join(output_dir, f"{os.path.basename(pdb_file).replace('.pdb', '')}.csv") 39 | 40 | with open(output_csv, "w", newline="") as f: 41 | writer = csv.writer(f, delimiter="\t") 42 | writer.writerow(["Label", "Start", "End"]) 43 | for annotation in protein_with_function.function_annotations: 44 | writer.writerow([annotation.label, annotation.start, annotation.end]) 45 | 46 | print(f"Function annotations saved as {output_csv}") 47 | def processing_pdb(client: ESM3InferenceClient, pdb_file: str, output_dir: str, args): 48 | """Runs protein folding and saves the output as a PDB file in the specified directory.""" 49 | print(f"Processing {pdb_file}...") 50 | 51 | if args.protein_complex: 52 | protein = ProteinComplex.from_pdb(pdb_file) 53 | protein = ESMProtein.from_protein_complex(protein) 54 | function_protein(client,protein,pdb_file,output_dir,args) 55 | else: 56 | protein = ESMProtein.from_pdb(pdb_file) 57 | function_protein(client,protein,pdb_file,output_dir,args) 58 | 59 | def main(args): 60 | if not os.path.exists(args.pdb_file) or not args.pdb_file.endswith(".pdb"): 61 | print(f"Invalid PDB file: {args.pdb_file}") 62 | return 63 | os.makedirs(args.output_dir, exist_ok=True) 64 | 65 | client = ESM3InferenceClient() if os.getenv("ESM_API_KEY") else ESM3.from_pretrained("esm3_sm_open_v1", bf16=args.bf16) 66 | 67 | infer_time = time.time() 68 | processing_pdb(client, args.pdb_file, args.output_dir, args) 69 | if args.timing: 70 | print(f"Inference time = {time.time() - infer_time} seconds") 71 | 72 | if __name__ == "__main__": 73 | parser = argparse.ArgumentParser(description="Run ESM3 protein function annotation on a single PDB file.") 74 | parser.add_argument("pdb_file", type=str, help="Path to the input PDB file.") 75 | parser.add_argument("output_dir", type=str, help="Directory to save the output CSV file.") 76 | parser.add_argument("--bf16", action="store_true", help="Enable bf16 inference.") 77 | parser.add_argument("--timing", action="store_true", help="Enable timing for inference.") 78 | parser.add_argument("--schedule", type=str, choices=["cosine", "linear"], default="cosine", help="Schedule type (cosine or linear).") 79 | parser.add_argument("--strategy", type=str, choices=["random", "entropy"], default="entropy", help="Unmasking 
strategy (random or entropy).") 80 | parser.add_argument("--num_steps", type=int, default=1, help="Number of steps for generation.") 81 | parser.add_argument("--temperature", type=float, default=1.0, help="Sampling temperature.") 82 | parser.add_argument("--temperature_annealing", action="store_true", help="Enable temperature annealing.") 83 | parser.add_argument("--top_p", type=float, default=1.0, help="Top-p sampling value.") 84 | parser.add_argument("--condition_on_coordinates_only", action="store_true", help="Condition only on coordinates.") 85 | parser.add_argument("--protein_complex", action="store_true", help="Enable prediction for protein complexes (multi-chain structure) using a multi-chain FASTA file input.") 86 | 87 | args = parser.parse_args() 88 | 89 | os.makedirs(args.output_dir, exist_ok=True) 90 | if args.timing: 91 | start_time = time.time() 92 | main(args) 93 | if args.timing: 94 | print(f"Complete run time = {time.time() - start_time:.2f} seconds") 95 | -------------------------------------------------------------------------------- /applications/gromacs/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | SHELL ["/bin/bash", "-c"] 4 | ENV DEBIAN_FRONTEND=noninteractive 5 | ENV TZ=Asia/Kolkata 6 | ENV SERVICE_NAME="gromacs-service" 7 | 8 | RUN set -euo pipefail && \ 9 | apt-get update && apt-get install -y --no-install-recommends \ 10 | wget \ 11 | gnupg \ 12 | software-properties-common \ 13 | time && \ 14 | wget -qO- https://apt.kitware.com/keys/kitware-archive-latest.asc | gpg --dearmor > /usr/share/keyrings/kitware-archive-keyring.gpg && \ 15 | echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ focal main" > /etc/apt/sources.list.d/kitware.list && \ 16 | apt-get update && apt-get install -y --no-install-recommends \ 17 | cmake \ 18 | git \ 19 | tar \ 20 | vim \ 21 | g++ \ 22 | gcc \ 23 | make \ 24 | curl \ 25 | build-essential \ 26 | tzdata && \ 27 | apt-get clean && rm -rf /var/lib/apt/lists/* 28 | 29 | RUN wget -q https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O Miniforge3.sh && \ 30 | bash Miniforge3.sh -b -p /opt/conda && \ 31 | rm -f Miniforge3.sh 32 | 33 | RUN chown -R 1001:1001 /opt/conda 34 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 35 | useradd -m -g $SERVICE_NAME --shell /bin/bash --uid 1001 $SERVICE_NAME && \ 36 | mkdir -p /grmcs /input /output && \ 37 | chown -R $SERVICE_NAME:$SERVICE_NAME /grmcs /input /output 38 | 39 | ENV PATH="/opt/conda/bin:$PATH" 40 | 41 | RUN conda init bash && \ 42 | conda create -y -n grms_env python=3.12 \ 43 | sphinx pygments \ 44 | mkl=2023.2 mkl-devel=2023.2 \ 45 | dpcpp_linux-64=2023.2 dpcpp-cpp-rt=2023.2 \ 46 | gxx_linux-64=12 && \ 47 | conda clean -afy && \ 48 | echo "source /opt/conda/bin/activate grms_env" >> /etc/bash.bashrc && \ 49 | echo "conda activate grms_env" >> /etc/bash.bashrc 50 | 51 | RUN source /opt/conda/bin/activate grms_env && conda activate grms_env 52 | 53 | ENV CC=icx \ 54 | CXX=icpx \ 55 | MKLROOT=/opt/conda/envs/grms_env \ 56 | CFLAGS="-fopenmp -I/opt/conda/envs/grms_env/include" \ 57 | CXXFLAGS="-fopenmp -stdlib=libstdc++ -I/opt/conda/envs/grms_env/include" \ 58 | LDFLAGS="-L/opt/conda/envs/grms_env/lib -lmkl_rt -liomp5 -lpthread -lm -ldl" \ 59 | LD_LIBRARY_PATH="/opt/conda/envs/grms_env/lib:$LD_LIBRARY_PATH" 60 | 61 | WORKDIR /grmcs 62 | 63 | RUN source /opt/conda/bin/activate grms_env && \ 64 | wget -q 
https://ftp.gromacs.org/gromacs/gromacs-2024.2.tar.gz && \ 65 | tar -xzvf gromacs-2024.2.tar.gz && \ 66 | rm -f gromacs-2024.2.tar.gz 67 | 68 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /grmcs/gromacs-2024.2 69 | WORKDIR /grmcs/gromacs-2024.2 70 | RUN mkdir -p build && chmod -R 775 build 71 | 72 | WORKDIR /grmcs/gromacs-2024.2/build 73 | 74 | RUN source /opt/conda/bin/activate grms_env && \ 75 | which icx && icx --version && \ 76 | which icpx && icpx --version 77 | 78 | RUN source /opt/conda/bin/activate grms_env && \ 79 | cmake .. -DGMX_BUILD_OWN_FFTW=OFF \ 80 | -DREGRESSIONTEST_DOWNLOAD=OFF \ 81 | -DCMAKE_C_COMPILER=$CC \ 82 | -DCMAKE_CXX_COMPILER=$CXX \ 83 | -DGMX_FFT_LIBRARY=mkl \ 84 | -DMKL_INCLUDE_DIR=$MKLROOT/include \ 85 | -DMKL_LIBRARIES=$MKLROOT/lib/libmkl_rt.so \ 86 | -DCMAKE_INSTALL_PREFIX=/grmcs/gmx_mkl_prefix && \ 87 | make -j10 VERBOSE=1 && \ 88 | make install 89 | 90 | WORKDIR /grmcs 91 | RUN rm -rf gromacs-2024.2 92 | ENV PATH="/grmcs/gmx_mkl_prefix/bin:$PATH" 93 | RUN echo "source /grmcs/gmx_mkl_prefix/bin/GMXRC" >> /etc/bash.bashrc 94 | 95 | WORKDIR /input 96 | COPY run_commands.sh /input/ 97 | RUN chmod +x /input/run_commands.sh 98 | 99 | COPY entrypoint.sh /entrypoint.sh 100 | RUN chmod +x /entrypoint.sh 101 | 102 | USER $SERVICE_NAME 103 | WORKDIR /input 104 | ENTRYPOINT ["/entrypoint.sh"] 105 | HEALTHCHECK NONE 106 | 107 | -------------------------------------------------------------------------------- /applications/gromacs/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /opt/conda/bin/activate grms_env 4 | source /grmcs/gmx_mkl_prefix/bin/GMXRC 5 | 6 | if [[ "$1" == "/bin/bash" || "$1" == "bash" ]]; then 7 | exec /bin/bash 8 | fi 9 | 10 | if [ "$#" -eq 0 ]; then 11 | echo "Error: No PDB file provided. Please provide a PDB file as an argument." 12 | echo " Run full workflow: docker run -v \$(INPUT_GMX_CPU):/input -v \$(OUTPUT_GMX_CPU):/output -it grms_dock " 13 | echo " Run specific GROMACS command: docker run -v \$(INPUT_GMX_CPU):/input -v \$(OUTPUT_GMX_CPU):/output -it grms_dock gmx " 14 | exit 1 15 | elif [ "$1" == "gmx" ]; then 16 | shift 17 | exec gmx "$@" 18 | else 19 | pdb_file="$1" 20 | 21 | if [ ! -f "/input/$pdb_file" ]; then 22 | echo "Error: File '/input/$pdb_file' not found." 
23 | exit 1 24 | fi 25 | 26 | echo "Running full workflow with PDB file: /input/$pdb_file" 27 | exec /input/run_commands.sh "/input/$pdb_file" 28 | fi 29 | 30 | -------------------------------------------------------------------------------- /applications/gromacs/grms_input/mdtut_ions.mdp: -------------------------------------------------------------------------------- 1 | ; ions.mdp - used as input into grompp to generate ions.tpr 2 | ; Parameters describing what to do, when to stop and what to save 3 | integrator = steep ; Algorithm (steep = steepest descent minimization) 4 | emtol = 1000.0 ; Stop minimization when the maximum force < 1000.0 kJ/mol/nm 5 | emstep = 0.01 ; Energy step size 6 | nsteps = 50000 ; Maximum number of (minimization) steps to perform 7 | 8 | ; Parameters describing how to find the neighbors of each atom and how to calculate the interactions 9 | nstlist = 1 ; Frequency to update the neighbor list and long range forces 10 | cutoff-scheme = Verlet 11 | ns_type = grid ; Method to determine neighbor list (simple, grid) 12 | coulombtype = PME ; Treatment of long range electrostatic interactions 13 | rcoulomb = 1.0 ; Short-range electrostatic cut-off 14 | rvdw = 1.0 ; Short-range Van der Waals cut-off 15 | pbc = xyz ; Periodic Boundary Conditions (yes/no) 16 | -------------------------------------------------------------------------------- /applications/gromacs/grms_input/mdtut_md.mdp: -------------------------------------------------------------------------------- 1 | title = Lysozyme NPT equilibration 2 | ; Run parameters 3 | integrator = md ; leap-frog integrator 4 | nsteps = 500000 ; 2 * 500000 = 1000 ps (1 ns) 5 | dt = 0.002 ; 2 fs 6 | ; Output control 7 | nstxout = 0 ; suppress bulky .trr file by specifying 8 | nstvout = 0 ; 0 for output frequency of nstxout, 9 | nstfout = 0 ; nstvout, and nstfout 10 | nstenergy = 5000 ; save energies every 10.0 ps 11 | nstlog = 5000 ; update log file every 10.0 ps 12 | nstxout-compressed = 50000 ; save compressed coordinates every 10.0 ps 13 | compressed-x-grps = System ; save the whole system 14 | ; Bond parameters 15 | continuation = yes ; Restarting after NPT 16 | constraint_algorithm = lincs ; holonomic constraints 17 | constraints = h-bonds ; bonds involving H are constrained 18 | lincs_iter = 1 ; accuracy of LINCS 19 | lincs_order = 4 ; also related to accuracy 20 | ; Neighborsearching 21 | cutoff-scheme = Verlet ; Buffered neighbor searching 22 | ns_type = grid ; search neighboring grid cells 23 | nstlist = 10 ; 20 fs, largely irrelevant with Verlet scheme 24 | rcoulomb = 1.0 ; short-range electrostatic cutoff (in nm) 25 | rvdw = 1.0 ; short-range van der Waals cutoff (in nm) 26 | ; Electrostatics 27 | coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics 28 | pme_order = 4 ; cubic interpolation 29 | fourierspacing = 0.16 ; grid spacing for FFT 30 | ; Temperature coupling is on 31 | tcoupl = V-rescale ; modified Berendsen thermostat 32 | tc-grps = Protein Non-Protein ; two coupling groups - more accurate 33 | tau_t = 0.1 0.1 ; time constant, in ps 34 | ref_t = 300 300 ; reference temperature, one for each group, in K 35 | ; Pressure coupling is on 36 | pcoupl = Parrinello-Rahman ; Pressure coupling on in NPT 37 | pcoupltype = isotropic ; uniform scaling of box vectors 38 | tau_p = 2.0 ; time constant, in ps 39 | ref_p = 1.0 ; reference pressure, in bar 40 | compressibility = 4.5e-5 ; isothermal compressibility of water, bar^-1 41 | ; Periodic boundary conditions 42 | pbc = xyz ; 3-D PBC 43 | ; Dispersion 
correction 44 | DispCorr = EnerPres ; account for cut-off vdW scheme 45 | ; Velocity generation 46 | gen_vel = no ; Velocity generation is off 47 | -------------------------------------------------------------------------------- /applications/gromacs/grms_input/mdtut_minim.mdp: -------------------------------------------------------------------------------- 1 | ; minim.mdp - used as input into grompp to generate em.tpr 2 | integrator = steep ; Algorithm (steep = steepest descent minimization) 3 | emtol = 1000.0 ; Stop minimization when the maximum force < 1000.0 kJ/mol/nm 4 | emstep = 0.01 ; Energy step size 5 | nsteps = 50000 ; Maximum number of (minimization) steps to perform 6 | 7 | ; Parameters describing how to find the neighbors of each atom and how to calculate the interactions 8 | nstlist = 1 ; Frequency to update the neighbor list and long range forces 9 | cutoff-scheme = Verlet 10 | ns_type = grid ; Method to determine neighbor list (simple, grid) 11 | coulombtype = PME ; Treatment of long range electrostatic interactions 12 | rcoulomb = 1.0 ; Short-range electrostatic cut-off 13 | rvdw = 1.0 ; Short-range Van der Waals cut-off 14 | pbc = xyz ; Periodic Boundary Conditions (yes/no) 15 | -------------------------------------------------------------------------------- /applications/gromacs/grms_input/mdtut_npt.mdp: -------------------------------------------------------------------------------- 1 | title = Lysozyme NPT equilibration 2 | define = -DPOSRES ; position restrain the protein 3 | ; Run parameters 4 | integrator = md ; leap-frog integrator 5 | nsteps = 500000 ; 2 * 500000 = 1000 ps 6 | dt = 0.002 ; 2 fs 7 | ; Output control 8 | nstxout = 500 ; save coordinates every 1.0 ps 9 | nstvout = 500 ; save velocities every 1.0 ps 10 | nstenergy = 500 ; save energies every 1.0 ps 11 | nstlog = 500 ; update log file every 1.0 ps 12 | ; Bond parameters 13 | continuation = yes ; Restarting after NVT 14 | constraint_algorithm = lincs ; holonomic constraints 15 | constraints = h-bonds ; bonds involving H are constrained 16 | lincs_iter = 1 ; accuracy of LINCS 17 | lincs_order = 4 ; also related to accuracy 18 | ; Nonbonded settings 19 | cutoff-scheme = Verlet ; Buffered neighbor searching 20 | ns_type = grid ; search neighboring grid cells 21 | nstlist = 10 ; 20 fs, largely irrelevant with Verlet scheme 22 | rcoulomb = 1.0 ; short-range electrostatic cutoff (in nm) 23 | rvdw = 1.0 ; short-range van der Waals cutoff (in nm) 24 | DispCorr = EnerPres ; account for cut-off vdW scheme 25 | ; Electrostatics 26 | coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics 27 | pme_order = 4 ; cubic interpolation 28 | fourierspacing = 0.16 ; grid spacing for FFT 29 | ; Temperature coupling is on 30 | tcoupl = V-rescale ; modified Berendsen thermostat 31 | tc-grps = Protein Non-Protein ; two coupling groups - more accurate 32 | tau_t = 0.1 0.1 ; time constant, in ps 33 | ref_t = 300 300 ; reference temperature, one for each group, in K 34 | ; Pressure coupling is on 35 | pcoupl = Parrinello-Rahman ; Pressure coupling on in NPT 36 | pcoupltype = isotropic ; uniform scaling of box vectors 37 | tau_p = 2.0 ; time constant, in ps 38 | ref_p = 1.0 ; reference pressure, in bar 39 | compressibility = 4.5e-5 ; isothermal compressibility of water, bar^-1 40 | refcoord_scaling = com 41 | ; Periodic boundary conditions 42 | pbc = xyz ; 3-D PBC 43 | ; Velocity generation 44 | gen_vel = no ; Velocity generation is off 45 | 
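The NPT parameter file above is not run on its own; like the other `.mdp` files in this directory it is first pre-processed by `gmx grompp` and then executed with `gmx mdrun` (see `run_commands.sh` below). A minimal sketch of that step, reusing the file names from this tutorial workflow:

```bash
# Combine the NPT settings with the NVT output coordinates, restraint reference, checkpoint and topology
# into a portable run input (.tpr), then run the equilibration.
gmx grompp -f mdtut_npt.mdp -c nvt.gro -r nvt.gro -t nvt.cpt -p topol.top -o npt.tpr
gmx mdrun -ntmpi "$(nproc)" -v -deffnm npt   # -deffnm sets the common base name for all npt.* outputs
```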
-------------------------------------------------------------------------------- /applications/gromacs/grms_input/mdtut_nvt.mdp: -------------------------------------------------------------------------------- 1 | title = Lysozyme NVT equilibration 2 | define = -DPOSRES ; position restrain the protein 3 | ; Run parameters 4 | integrator = md ; leap-frog integrator 5 | nsteps = 500000 ; 2 * 500000 = 1000 ps 6 | dt = 0.002 ; 2 fs 7 | ; Output control 8 | nstxout = 500 ; save coordinates every 1.0 ps 9 | nstvout = 500 ; save velocities every 1.0 ps 10 | nstenergy = 500 ; save energies every 1.0 ps 11 | nstlog = 500 ; update log file every 1.0 ps 12 | ; Bond parameters 13 | continuation = no ; first dynamics run 14 | constraint_algorithm = lincs ; holonomic constraints 15 | constraints = h-bonds ; bonds involving H are constrained 16 | lincs_iter = 1 ; accuracy of LINCS 17 | lincs_order = 4 ; also related to accuracy 18 | ; Neighborsearching 19 | cutoff-scheme = Verlet 20 | ns_type = grid ; search neighboring grid cells 21 | nstlist = 10 ; 20 fs, largely irrelevant with Verlet 22 | rcoulomb = 1.0 ; short-range electrostatic cutoff (in nm) 23 | rvdw = 1.0 ; short-range van der Waals cutoff (in nm) 24 | ; Electrostatics 25 | coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics 26 | pme_order = 4 ; cubic interpolation 27 | fourierspacing = 0.16 ; grid spacing for FFT 28 | ; Temperature coupling is on 29 | tcoupl = V-rescale ; modified Berendsen thermostat 30 | tc-grps = Protein Non-Protein ; two coupling groups - more accurate 31 | tau_t = 0.1 0.1 ; time constant, in ps 32 | ref_t = 300 300 ; reference temperature, one for each group, in K 33 | ; Pressure coupling is off 34 | pcoupl = no ; no pressure coupling in NVT 35 | ; Periodic boundary conditions 36 | pbc = xyz ; 3-D PBC 37 | ; Dispersion correction 38 | DispCorr = EnerPres ; account for cut-off vdW scheme 39 | ; Velocity generation 40 | gen_vel = yes ; assign velocities from Maxwell distribution 41 | gen_temp = 300 ; temperature for Maxwell distribution 42 | gen_seed = -1 ; generate a random seed 43 | -------------------------------------------------------------------------------- /applications/gromacs/grms_input/run_commands.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /opt/conda/bin/activate grms_env 3 | source /grmcs/gmx_mkl_prefix/bin/GMXRC 4 | 5 | file="$1" 6 | num_cpus=$(nproc) 7 | current_ntmpi=$num_cpus 8 | timestamp=$(date +"%Y%m%d_%H%M%S") 9 | 10 | mkdir -p /output/output_$timestamp 11 | 12 | exit_on_error() { 13 | echo "Error: $1" 14 | exit 1 15 | } 16 | 17 | echo "Preprocessing protein structure..." 18 | grep -v HOH "${file}" > prot_clean.pdb || exit_on_error "Failed to clean PDB file" 19 | 20 | echo "Running pdb2gmx..." 21 | gmx pdb2gmx -f prot_clean.pdb -o prot_pros.gro -water spce -ff amber99sb -ignh || exit_on_error "pdb2gmx failed" 22 | 23 | echo "Defining box dimensions..." 24 | gmx editconf -f prot_pros.gro -o prot_box.gro -c -d 1.0 -bt cubic || exit_on_error "editconf failed" 25 | 26 | echo "Adding solvent..." 27 | gmx solvate -cp prot_box.gro -cs spc216.gro -o prot_solv.gro -p topol.top || exit_on_error "solvate failed" 28 | 29 | echo "Preparing for ion addition..." 30 | gmx grompp -f mdtut_ions.mdp -c prot_solv.gro -p topol.top -o ions.tpr -maxwarn 2 || exit_on_error "grompp for ions failed" 31 | 32 | echo "Adding ions..." 
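# Note: the piped "13" below answers genion's interactive prompt for which group to replace with ions;
# with the topology generated above this index typically corresponds to the SOL (solvent) group.
# If the group numbering differs on your system, run "gmx genion -s ions.tpr" interactively once to confirm it.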
33 | echo "13" | gmx genion -s ions.tpr -o prot_solv_ions.gro -p topol.top -pname NA -nname CL -neutral || exit_on_error "genion failed" 34 | 35 | echo "Preparing energy minimization..." 36 | gmx grompp -f mdtut_minim.mdp -c prot_solv_ions.gro -p topol.top -o em.tpr -maxwarn 1 || exit_on_error "grompp for minimization failed" 37 | 38 | while [ $current_ntmpi -ge 1 ]; do 39 | echo "Running EM with ntmpi=$current_ntmpi..." 40 | time /usr/bin/time -v gmx mdrun -ntmpi $current_ntmpi -v -deffnm em 41 | mdrun_exit_code=$? 42 | if [ $mdrun_exit_code -eq 0 ]; then 43 | echo "EM stage completed successfully with ntmpi=$current_ntmpi" 44 | break 45 | fi 46 | if [ -f em.log ] && grep -q "Fatal error: There is no domain decomposition" em.log || grep -q "Fatal error:" em.log; then 47 | echo "Domain decomposition failed for ntmpi=$current_ntmpi. Reducing MPI ranks..." 48 | grep "Fatal error:" em.log 49 | rm -f em.log 50 | current_ntmpi=$((current_ntmpi / 2)) 51 | else 52 | exit_on_error "Unexpected failure in EM stage. Check em.log for details." 53 | fi 54 | done 55 | 56 | echo "Preparing NVT equilibration..." 57 | gmx grompp -f mdtut_nvt.mdp -c em.gro -r em.gro -p topol.top -o nvt.tpr || exit_on_error "grompp for NVT failed" 58 | 59 | echo "Running NVT mdrun..." 60 | gmx mdrun -ntmpi $current_ntmpi -v -deffnm nvt || exit_on_error "NVT mdrun failed" 61 | 62 | echo "Preparing NPT equilibration..." 63 | gmx grompp -f mdtut_npt.mdp -c nvt.gro -r nvt.gro -t nvt.cpt -p topol.top -o npt.tpr || exit_on_error "grompp for NPT failed" 64 | 65 | echo "Running NPT mdrun..." 66 | gmx mdrun -ntmpi $current_ntmpi -v -deffnm npt || exit_on_error "NPT mdrun failed" 67 | 68 | echo "Preparing MD simulation..." 69 | gmx grompp -f mdtut_md.mdp -c npt.gro -t npt.cpt -p topol.top -o md01.tpr || exit_on_error "grompp for MD failed" 70 | 71 | echo "Running MD mdrun..." 72 | gmx mdrun -ntmpi $current_ntmpi -v -deffnm md01 || exit_on_error "MD mdrun failed" 73 | 74 | echo "Simulation completed successfully with ntmpi=$current_ntmpi" 75 | 76 | echo "Moving output files..." 77 | mv /input/md01* /output/output_$timestamp/ 2>/dev/null || echo "Warning: No md01* files found." 78 | 79 | echo "Pipeline execution completed successfully. Results saved in /output/output_$timestamp" 80 | 81 | -------------------------------------------------------------------------------- /applications/gromacs/run_commands.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /opt/conda/bin/activate grms_env 3 | source /grmcs/gmx_mkl_prefix/bin/GMXRC 4 | 5 | file="$1" 6 | num_cpus=$(nproc) 7 | current_ntmpi=$num_cpus 8 | timestamp=$(date +"%Y%m%d_%H%M%S") 9 | 10 | mkdir -p /output/output_$timestamp 11 | 12 | exit_on_error() { 13 | echo "Error: $1" 14 | exit 1 15 | } 16 | 17 | echo "Preprocessing protein structure..." 18 | grep -v HOH "${file}" > prot_clean.pdb || exit_on_error "Failed to clean PDB file" 19 | 20 | echo "Running pdb2gmx..." 21 | gmx pdb2gmx -f prot_clean.pdb -o prot_pros.gro -water spce -ff amber99sb -ignh || exit_on_error "pdb2gmx failed" 22 | 23 | echo "Defining box dimensions..." 24 | gmx editconf -f prot_pros.gro -o prot_box.gro -c -d 1.0 -bt cubic || exit_on_error "editconf failed" 25 | 26 | echo "Adding solvent..." 27 | gmx solvate -cp prot_box.gro -cs spc216.gro -o prot_solv.gro -p topol.top || exit_on_error "solvate failed" 28 | 29 | echo "Preparing for ion addition..." 
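# grompp combines the .mdp settings, coordinates, and topology into a binary .tpr run input;
# -maxwarn 2 tolerates the expected warnings at this stage (typically the non-zero net charge that genion neutralizes in the next step).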
30 | gmx grompp -f mdtut_ions.mdp -c prot_solv.gro -p topol.top -o ions.tpr -maxwarn 2 || exit_on_error "grompp for ions failed" 31 | 32 | echo "Adding ions..." 33 | echo "13" | gmx genion -s ions.tpr -o prot_solv_ions.gro -p topol.top -pname NA -nname CL -neutral || exit_on_error "genion failed" 34 | 35 | echo "Preparing energy minimization..." 36 | gmx grompp -f mdtut_minim.mdp -c prot_solv_ions.gro -p topol.top -o em.tpr -maxwarn 1 || exit_on_error "grompp for minimization failed" 37 | 38 | while [ $current_ntmpi -ge 1 ]; do 39 | echo "Running EM with ntmpi=$current_ntmpi..." 40 | time /usr/bin/time -v gmx mdrun -ntmpi $current_ntmpi -v -deffnm em 41 | mdrun_exit_code=$? 42 | if [ $mdrun_exit_code -eq 0 ]; then 43 | echo "EM stage completed successfully with ntmpi=$current_ntmpi" 44 | break 45 | fi 46 | if [ -f em.log ] && grep -q "Fatal error: There is no domain decomposition" em.log || grep -q "Fatal error:" em.log; then 47 | echo "Domain decomposition failed for ntmpi=$current_ntmpi. Reducing MPI ranks..." 48 | grep "Fatal error:" em.log 49 | rm -f em.log 50 | current_ntmpi=$((current_ntmpi / 2)) 51 | else 52 | exit_on_error "Unexpected failure in EM stage. Check em.log for details." 53 | fi 54 | done 55 | 56 | echo "Preparing NVT equilibration..." 57 | gmx grompp -f mdtut_nvt.mdp -c em.gro -r em.gro -p topol.top -o nvt.tpr || exit_on_error "grompp for NVT failed" 58 | 59 | echo "Running NVT mdrun..." 60 | gmx mdrun -ntmpi $current_ntmpi -v -deffnm nvt || exit_on_error "NVT mdrun failed" 61 | 62 | echo "Preparing NPT equilibration..." 63 | gmx grompp -f mdtut_npt.mdp -c nvt.gro -r nvt.gro -t nvt.cpt -p topol.top -o npt.tpr || exit_on_error "grompp for NPT failed" 64 | 65 | echo "Running NPT mdrun..." 66 | gmx mdrun -ntmpi $current_ntmpi -v -deffnm npt || exit_on_error "NPT mdrun failed" 67 | 68 | echo "Preparing MD simulation..." 69 | gmx grompp -f mdtut_md.mdp -c npt.gro -t npt.cpt -p topol.top -o md01.tpr || exit_on_error "grompp for MD failed" 70 | 71 | echo "Running MD mdrun..." 72 | gmx mdrun -ntmpi $current_ntmpi -v -deffnm md01 || exit_on_error "MD mdrun failed" 73 | 74 | echo "Simulation completed successfully with ntmpi=$current_ntmpi" 75 | 76 | echo "Moving output files..." 77 | mv /input/md01* /output/output_$timestamp/ 2>/dev/null || echo "Warning: No md01* files found." 78 | 79 | echo "Pipeline execution completed successfully. 
Results saved in /output/output_$timestamp" 80 | 81 | -------------------------------------------------------------------------------- /applications/moflow/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base image specification 2 | ARG FROM_IMAGE=ubuntu:24.04 3 | FROM ${FROM_IMAGE} as builder 4 | 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | # Install necessary build tools and clean up 8 | RUN apt-get update && apt-get install -y --no-install-recommends \ 9 | git build-essential wget vim ca-certificates autoconf automake make numactl unzip && \ 10 | rm -rf /var/lib/apt/lists/* && \ 11 | apt-get autoremove -y && \ 12 | apt-get clean 13 | 14 | # Create a user and group for running the service 15 | ENV SERVICE_NAME="moflow-base-service" 16 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 17 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 18 | 19 | WORKDIR /app 20 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /app 21 | USER $SERVICE_NAME 22 | 23 | # Install Miniforge (Conda) 24 | RUN wget --no-check-certificate -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-$(uname)-$(uname -m).sh" 25 | RUN bash Miniforge3.sh -b -p "${HOME}/conda" && rm Miniforge3.sh 26 | 27 | # Clone MoFlow repository and apply patch 28 | RUN git clone --recursive https://github.com/calvin-zcx/moflow.git moflow 29 | WORKDIR /app/moflow 30 | RUN git checkout 3611c637260272b3d34a298f221623cb59e01091 31 | 32 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 33 | RUN tar -xzf Source_code_with_submodules.tar.gz 34 | 35 | RUN mv /app/moflow/Open-Omics-Acceleration-Framework/applications/moflow /app/moflow/omics_setup && \ 36 | git apply /app/moflow/omics_setup/mflow_change_all.patch && \ 37 | rm -rf /app/moflow/Open-Omics-Acceleration-Framework && \ 38 | rm -rf /app/moflow/Source_code_with_submodules.tar.gz 39 | 40 | # Set up Conda environment 41 | RUN ${HOME}/conda/bin/mamba env create -f /app/moflow/omics_setup/env.yml 42 | 43 | # Install MoFlow package 44 | WORKDIR /app/moflow 45 | RUN bash -c "source ${HOME}/conda/etc/profile.d/conda.sh && \ 46 | source ${HOME}/conda/etc/profile.d/mamba.sh && \ 47 | mamba activate moflow && \ 48 | pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cpu" 49 | 50 | # Initialization script for Conda environment 51 | RUN echo "#!/bin/bash" > /app/init.sh && \ 52 | echo "source ${HOME}/conda/etc/profile.d/conda.sh" >> /app/init.sh && \ 53 | echo "source ${HOME}/conda/etc/profile.d/mamba.sh" >> /app/init.sh && \ 54 | echo "mamba activate moflow" >> /app/init.sh && \ 55 | chmod +x /app/init.sh && \ 56 | echo "source /app/init.sh" >> ~/.bashrc 57 | 58 | WORKDIR /app/moflow 59 | RUN echo '#!/bin/bash' > /app/entrypoint.sh && \ 60 | echo 'if [ -z "$1" ]; then' >> /app/entrypoint.sh && \ 61 | echo ' exec /bin/bash' >> /app/entrypoint.sh && \ 62 | echo 'else' >> /app/entrypoint.sh && \ 63 | echo ' source /app/init.sh' >> /app/entrypoint.sh && \ 64 | echo ' exec "$@"' >> /app/entrypoint.sh && \ 65 | echo 'fi' >> /app/entrypoint.sh && \ 66 | chmod +x /app/entrypoint.sh 67 | 68 | ENTRYPOINT ["/app/entrypoint.sh"] 69 | CMD [] 70 | HEALTHCHECK NONE 71 | -------------------------------------------------------------------------------- /applications/moflow/env.yml: -------------------------------------------------------------------------------- 1 | name: 
moflow 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - conda-forge::python=3.8.5 7 | - conda-forge::pandas==1.1.2 8 | - conda-forge::matplotlib==3.3.2 9 | - conda-forge::rdkit==2020.03.6 10 | - conda-forge::orderedset==2.0.3 11 | - conda-forge::tabulate==0.8.7 12 | - conda-forge::networkx==2.5 13 | - conda-forge::scipy==1.5.0 14 | - conda-forge::seaborn==0.11.0 15 | - pip: 16 | - cairosvg==2.4.2 17 | - tqdm==4.50.0 18 | - gdown==5.2.0 19 | - numpy==1.19.2 20 | - scikit-learn==1.3.2 -------------------------------------------------------------------------------- /applications/relion/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | LABEL description="RELION 5.0 built with Intel oneAPI and conda" 4 | 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | ENV TZ=Asia/Kolkata 7 | ENV SERVICE_NAME="relion-service" 8 | 9 | # Create non-root user 10 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 11 | useradd -m -g $SERVICE_NAME --shell /bin/bash --uid 1001 $SERVICE_NAME 12 | 13 | # Install system packages and Python build dependencies 14 | RUN apt-get update && \ 15 | apt-get install -y --no-install-recommends \ 16 | git \ 17 | cmake \ 18 | vim \ 19 | make \ 20 | tar \ 21 | curl \ 22 | wget \ 23 | gnupg \ 24 | time \ 25 | ca-certificates \ 26 | tzdata \ 27 | libtiff-dev \ 28 | libx11-dev \ 29 | libpng-dev \ 30 | python3-dev \ 31 | libffi-dev \ 32 | libssl-dev \ 33 | pkg-config \ 34 | gfortran \ 35 | libstdc++-11-dev \ 36 | libgl1 \ 37 | libgl1-mesa-glx \ 38 | libxrender1 \ 39 | build-essential && \ 40 | apt-get clean && \ 41 | rm -rf /var/lib/apt/lists/* 42 | 43 | # Prepare directory and switch to non-root 44 | WORKDIR /opt 45 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /opt 46 | USER $SERVICE_NAME 47 | 48 | # Clone and patch RELION 49 | RUN git clone --branch ver5.0 https://github.com/3dem/relion.git relion_5.0 50 | WORKDIR /opt/relion_5.0 51 | RUN git pull 52 | COPY relion_env_patch.patch /opt/relion_5.0 53 | RUN git apply relion_env_patch.patch 54 | 55 | # Install Intel oneAPI HPC Toolkit 56 | WORKDIR /opt 57 | RUN wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b7f71cf2-8157-4393-abae-8cea815509f7/intel-oneapi-hpc-toolkit-2025.0.1.47_offline.sh && \ 58 | chmod +x intel-oneapi-hpc-toolkit-2025.0.1.47_offline.sh && \ 59 | ./intel-oneapi-hpc-toolkit-2025.0.1.47_offline.sh -a --silent --cli --eula accept && \ 60 | rm intel-oneapi-hpc-toolkit-2025.0.1.47_offline.sh 61 | 62 | # Conda setup 63 | ENV CONDA_DIR=/opt/conda 64 | ENV HOME=/home/relion-service 65 | ENV XDG_CACHE_HOME=$HOME/.cache 66 | ENV TMPDIR=/home/relion-service/tmp 67 | ENV PATH=$CONDA_DIR/bin:$PATH 68 | ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu 69 | 70 | RUN mkdir -p $XDG_CACHE_HOME $TMPDIR && \ 71 | chmod -R 777 $XDG_CACHE_HOME $TMPDIR 72 | 73 | # Install Miniforge 74 | RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O /tmp/miniforge.sh && \ 75 | bash /tmp/miniforge.sh -b -u -p $CONDA_DIR && \ 76 | rm /tmp/miniforge.sh && \ 77 | conda init bash 78 | 79 | # Create conda env without .[vis] first 80 | RUN conda env create -f /opt/relion_5.0/environment.yml && \ 81 | conda clean -afy 82 | 83 | # Install .[vis] manually after env is created 84 | RUN bash -c "source $CONDA_DIR/etc/profile.d/conda.sh && \ 85 | conda activate relion-5.0 && \ 86 | pip install --verbose /opt/relion_5.0[vis]" 87 | 88 | # Build RELION 89 | WORKDIR /opt/relion_5.0 90 | SHELL ["/bin/bash", 
"-c"] 91 | RUN mkdir -p /opt/relion_5.0/torch_home 92 | RUN mkdir -p /opt/relion_5.0/build_cpu 93 | WORKDIR /opt/relion_5.0/build_cpu 94 | 95 | RUN source /home/relion-service/intel/oneapi/2025.0/oneapi-vars.sh --force && \ 96 | cmake -DCMAKE_C_COMPILER=icx \ 97 | -DCMAKE_CXX_COMPILER=icpx \ 98 | -DMPI_C_COMPILER=mpiicx \ 99 | -DMPI_CXX_COMPILER=mpiicpx \ 100 | -DCUDA=OFF \ 101 | -DALTCPU=ON \ 102 | -DMKLFFT=ON \ 103 | -DGUI=OFF \ 104 | -DFETCH_WEIGHTS=OFF \ 105 | -DCMAKE_BUILD_TYPE=Release \ 106 | -DCMAKE_C_FLAGS="-g -O3 -qopenmp-simd -xCORE-AVX512 -qopt-zmm-usage=high" \ 107 | -DCMAKE_CXX_FLAGS="-DTBB_SUPPRESS_DEPRECATED_MESSAGES -g -O3 -qopenmp-simd -xCORE-AVX512 -qopt-zmm-usage=high" \ 108 | -DCMAKE_EXE_LINKER_FLAGS="-static-intel -static-libgcc -qopenmp-link=static -Wno-unused-command-line-argument" \ 109 | -DTORCH_HOME_PATH=/opt/relion_5.0/torch_home \ 110 | -DPYTHON_EXE_PATH=/opt/conda/envs/relion-5.0/bin/python \ 111 | -DCMAKE_INSTALL_PREFIX=/opt/relion_5.0_cpu_benchmark_prefix .. && \ 112 | make -j$(nproc) && make install 113 | WORKDIR /opt/relion_5.0 114 | RUN rm -rf /opt/relion_5.0/build_cpu 115 | RUN echo 'source /home/relion-service/intel/oneapi/setvars.sh --force' >> /home/relion-service/.bashrc 116 | ENV PATH="/opt/relion_5.0_cpu_benchmark_prefix/bin:/home/relion-service/intel/oneapi/compiler/2025.0/bin:/home/relion-service/intel/oneapi/mpi/2025.0/bin:/home/relion-service/intel/oneapi/compiler/2025.0/linux/bin/intel64:$PATH" 117 | ENV LD_LIBRARY_PATH="/home/relion-service/intel/oneapi/compiler/2025.0/lib:/home/relion-service/intel/oneapi/mkl/2025.0/lib/intel64:/home/relion-service/intel/oneapi/2025.0/lib:$LD_LIBRARY_PATH" 118 | RUN mkdir -p /opt/relion_5.0/relion_benchmark 119 | COPY entrypoint.sh ./entrypoint_temp.sh 120 | RUN bash -c "cp ./entrypoint_temp.sh ./entrypoint.sh && chmod u+x ./entrypoint.sh && rm ./entrypoint_temp.sh" 121 | 122 | ENTRYPOINT ["/opt/relion_5.0/entrypoint.sh"] 123 | 124 | HEALTHCHECK NONE 125 | 126 | -------------------------------------------------------------------------------- /applications/relion/README.md: -------------------------------------------------------------------------------- 1 | # Open-Omics-Relion 2 | **Open-Omics-Relion** is a Dockerized RELION 5.0 setup for running benchmark workloads using Intel oneAPI and Intel MPI. It supports **2D classification**, **3D classification**, and **auto-refinement** modes using official test data, designed for **reproducibility**, **performance testing**, and **ease of deployment**. 3 | 4 | --- 5 | 6 | ## Step 1: Download Benchmark Dataset 7 | ```zsh 8 | wget ftp://ftp.mrc-lmb.cam.ac.uk/pub/scheres/relion_benchmark.tar.gz 9 | tar -xzvf relion_benchmark.tar.gz 10 | ``` 11 | This will extract a folder named `relion_benchmark`. 12 | ## Step 2: Build the Docker Image 13 | Build the docker image with `Dockerfile`, run: 14 | ```zsh 15 | sudo docker build -t relion_nru . 
16 | ``` 17 | Verify the image was built: 18 | ```zsh 19 | sudo docker images | grep -i relion_nru 20 | ``` 21 | ## Step 3: Change Ownership of Benchmark Data 22 | To avoid permission issues when mounting the directory (RELION runs as a non-root user UID 1001): 23 | ```zsh 24 | cd relion_benchmark/ 25 | sudo chown 1001:1001 $(pwd) 26 | ``` 27 | 28 | ## Step 4: Run RELION Benchmark 29 | Launch the container with required options for shared memory and MPI support: 30 | ```zsh 31 | sudo docker run --rm --net=host --ipc=host --pid=host --ulimit stack=67108864 --shm-size=2g --cap-add=SYS_PTRACE -e I_MPI_DEBUG=5 -e I_MPI_SHM_LMT=shm -e I_MPI_FABRICS=shm:tcp -it -v $(pwd):/opt/relion_5.0/relion_benchmark relion_nru:latest 32 | ``` 33 | **Notes** 34 | - Modify entrypoint.sh if you wish to customize CPU/thread usage. 35 | - The container is designed to work on systems with Intel MPI and oneAPI properly set up inside. 36 | - Ensure Docker has access to sufficient system shared memory (e.g. via /dev/shm or --shm-size if needed). 37 | - The container uses a non-root user (UID 1001). Make sure mounted volumes are writable by this user. 38 | ### Available Run Modes 39 | You can specify different modes as an argument to the Docker command. Each mode will automatically create an output folder inside the `relion_benchmark` directory. 40 | 41 | | Mode | Command Example (append to `docker run`) | Description | Output Folder | 42 | |--------------|--------------------------------------------------------------|------------------------------|--------------| 43 | | *(default)* | `relion_nru:latest` | Run 3D classification | `3D/` | 44 | | `3d` | `relion_nru:latest 3d` | Run 3D classification | `3D/` | 45 | | `2d` | `relion_nru:latest 2d` | Run 2D classification | `2D/` | 46 | | `autorefine` | `relion_nru:latest autorefine` | Run 3D auto-refinement | `3D_AUTO/` | 47 | | `custom` | `relion_nru:latest relion_refine_mpi [your-flags]` | Run any custom RELION command | User-defined (`--o`) | 48 | 49 | --- 50 | 51 | **The Original README for Relion starts here:** 52 | 53 | 54 | RELION 5.0-beta 55 | =============== 56 | 57 | RELION (for REgularised LIkelihood OptimisatioN) is a stand-alone computer 58 | program for Maximum A Posteriori refinement of (multiple) 3D reconstructions 59 | or 2D class averages in cryo-electron microscopy. It is developed in the 60 | research group of Sjors Scheres at the MRC Laboratory of Molecular Biology. 61 | 62 | If RELION is useful in your work, please cite our papers. 63 | 64 | Comprehensive documentation of RELION and tutorials are stored [here](https://relion.readthedocs.io/). 65 | 66 | ## Installation 67 | 68 | See our [installation instructions](https://relion.readthedocs.io/en/release-5.0/Installation.html). 69 | 70 | You will have to set up a Python environment to use Python modules (e.g. Blush, ModelAngelo and DynaMight). 71 | Thus, please read the above instructions carefully even if you are familiar with earlier versions. 72 | 73 | ## Class Ranker 74 | 75 | The default model for the class ranker has been trained and tested in Python 3.9.12 with Pytorch 1.10.0 and Numpy 1.20.0. 76 | If you wish to retrain the class ranker model with your own data, please refer to [this repo](https://github.com/3dem/relion-classranker). 
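The **Notes** section above suggests editing `entrypoint.sh` to customize CPU/thread usage. As a minimal sketch (variable names taken from the `entrypoint.sh` shipped with this Dockerfile; tune the values to your machine), the relevant knobs are:

```zsh
# entrypoint.sh excerpt: resource settings applied to every run mode
NUMMPI=$((16+1))   # MPI ranks passed to mpirun; the +1 accounts for RELION's leader rank
NUMTHR=8           # threads per MPI rank (--j)
POOLSIZE=4         # particles pooled per thread (--pool)
NUMITER=25         # iterations used by the 2D/3D classification modes (--iter)
```

As a rule of thumb, keep NUMMPI x NUMTHR at or below the number of physical cores. Because `entrypoint.sh` is copied into the image at build time, rebuild with `sudo docker build -t relion_nru .` after changing it.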
77 | -------------------------------------------------------------------------------- /applications/relion/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ~/intel/oneapi/2025.0/oneapi-vars.sh 3 | APPEXE=/opt/relion_5.0_cpu_benchmark_prefix/bin/relion_refine_mpi 4 | DEFAULT_DATA_DIR=/opt/relion_5.0/relion_benchmark 5 | cd "$DEFAULT_DATA_DIR" || exit 1 6 | 7 | NUMMPI=$((16+1)) 8 | NUMTHR=8 9 | POOLSIZE=4 10 | NUMITER=25 11 | 12 | DEFAULT_MODE="${1:-3d}" 13 | MODE="${DEFAULT_MODE#-}" 14 | shift 15 | 16 | case "$MODE" in 17 | 2d) 18 | echo "➡️ Running 2D Classification" 19 | mkdir -p 2D 20 | exec mpirun -n $NUMMPI $APPEXE --i Particles/shiny_2sets.star --dont_combine_weights_via_disc --ctf --tau2_fudge 2 --particle_diameter 360 --K 200 --zero_mask --oversampling 1 --psi_step 6 --offset_range 5 --offset_step 2 --norm --scale --random_seed 0 --pad 2 --o 2D/2D --pool $POOLSIZE --j $NUMTHR --iter $NUMITER --cpu 21 | ;; 22 | 3d) 23 | echo "➡️ Running 3D Classification with SYCL" 24 | mkdir -p 3D 25 | exec mpirun -n $NUMMPI $APPEXE --i Particles/shiny_2sets.star --ref emd_2660.map:mrc --firstiter_cc --ini_high 60 --dont_combine_weights_via_disc --ctf --tau2_fudge 4 --particle_diameter 360 --K 6 --flatten_solvent --zero_mask --oversampling 1 --healpix_order 2 --offset_range 5 --offset_step 2 --sym C1 --norm --scale --pad 2 --random_seed 0 --o 3D/3D --pool $POOLSIZE --j $NUMTHR --iter $NUMITER --cpu 26 | ;; 27 | autorefine) 28 | echo "➡️ Running 3D AutoRefine" 29 | mkdir -p 3D_AUTO 30 | exec mpirun -n $NUMMPI $APPEXE --i Particles/shiny_2sets.star --ref emd_2660.map:mrc --firstiter_cc --ini_high 60 --dont_combine_weights_via_disc --ctf --particle_diameter 360 --flatten_solvent --zero_mask --oversampling 1 --healpix_order 2 --offset_range 5 --offset_step 2 --sym C1 --norm --scale --auto_refine --split_random_halves --auto_local_healpix_order 4 --low_resol_join_halves 40 --random_seed 1 --pad 2 --o 3D_AUTO/3D_AUTO --pool $POOLSIZE --j $NUMTHR --cpu 31 | ;; 32 | *) 33 | echo "➡️ Running custom RELION command: $DEFAULT_MODE $@" 34 | exec $DEFAULT_MODE "$@" 35 | ;; 36 | esac 37 | 38 | -------------------------------------------------------------------------------- /applications/relion/relion_env_patch.patch: -------------------------------------------------------------------------------- 1 | diff --git a/environment.yml b/environment.yml 2 | index ab0acb75..98d1d008 100644 3 | --- a/environment.yml 4 | +++ b/environment.yml 5 | @@ -6,8 +6,9 @@ dependencies: 6 | - python=3.10 7 | - setuptools=59.5.0 8 | - pip: 9 | - - torch==2.0.1 10 | - - torchvision==0.15.2 11 | + - cython 12 | + - torch==2.0.1+cpu 13 | + - torchvision==0.15.2+cpu 14 | - tqdm==4.65.0 15 | - mrcfile==1.4.3 16 | - starfile>=0.5.6 17 | @@ -32,4 +33,3 @@ dependencies: 18 | - git+https://github.com/3dem/DynaMight 19 | - git+https://github.com/3dem/topaz 20 | - git+https://github.com/3dem/model-angelo 21 | - - ".[vis]" 22 | -------------------------------------------------------------------------------- /benchmarking/AWS-Intel-blog-v2.1-2024/long_db: -------------------------------------------------------------------------------- 1 | celegans_4020.fa 2 | celegans_4040.fa 3 | celegans_4060.fa 4 | celegans_4080.fa 5 | celegans_4100.fa 6 | celegans_4120.fa 7 | celegans_4140.fa 8 | celegans_4160.fa 9 | celegans_4180.fa 10 | celegans_4200.fa 11 | celegans_4220.fa 12 | celegans_4240.fa 13 | celegans_4260.fa 14 | celegans_4280.fa 15 | celegans_4300.fa 16 | celegans_4320.fa 17 | 
celegans_4340.fa 18 | celegans_4360.fa 19 | -------------------------------------------------------------------------------- /benchmarking/AWS-Intel-blog-v2.1-2024/proteome.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import re 4 | import numpy as np 5 | 6 | 7 | with open("./uniprotkb_proteome.fasta", "r") as f: 8 | lines = f.readlines() 9 | i = -1 10 | protein_list = [] 11 | proteome_list = [] 12 | for line in lines: 13 | if ">sp|" in line: 14 | if i >= 0: 15 | proteome_list.append(protein_list) 16 | protein_list = [] 17 | i = i + 1 18 | protein_list.append(line) 19 | else: 20 | protein_list.append(line) 21 | proteome_list.append(protein_list) 22 | 23 | sorted_list = sorted(proteome_list, key=lambda x: len(''.join(x[1:])), reverse=False) 24 | i = 0 25 | total_len = 0 26 | small_db = open("short_db", "r") 27 | small_list = [line.rstrip() for line in small_db.readlines()] 28 | small_db.close() 29 | #lines = small_db.readlines() 30 | #print(lines) 31 | 32 | long_db = open("long_db", "r") 33 | long_list = [line.rstrip() for line in long_db.readlines()] 34 | long_db.close() 35 | os.makedirs(os.path.expanduser("~/celegans_samples"), exist_ok=True) 36 | os.makedirs(os.path.expanduser("~/celegans_samples_long"), exist_ok=True) 37 | for pl_list in sorted_list: 38 | total_len = total_len + len(''.join(pl_list[1:])) 39 | print(i, len(''.join(pl_list[1:]))) 40 | if "celegans_"+str(i)+".fa" in small_list: 41 | with open(os.path.expanduser("~/celegans_samples/celegans_" + str(i) + ".fa"), "w") as f: 42 | f.writelines(pl_list) 43 | 44 | if "celegans_"+str(i)+".fa" in long_list: 45 | with open(os.path.expanduser("~/celegans_samples_long/celegans_" + str(i) + ".fa"), "w") as f: 46 | f.writelines(pl_list) 47 | 48 | 49 | 50 | i = i + 1 51 | 52 | print(total_len/i) 53 | -------------------------------------------------------------------------------- /benchmarking/AWS-Intel-blog-v2.1-2024/run_pipe_bwa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #docker pull google/deepvariant:1.5.0 3 | lscpu > compute 4 | num_cpus_per_node=$(cat compute | grep -E '^CPU\(s\)' | awk '{print $2}') 5 | 6 | INPUT=~/HG001/ 7 | OUTPUT=~/HG001/OUTPUT/ 8 | echo $OUTPUT 9 | mkdir -p $OUTPUT 10 | #python test_pipe_bwa.py --input $INPUT --output $OUTPUT --index GCA_000001405.15_GRCh38_no_alt_analysis_set.fna --read HG001.novaseq.pcr-free.30x.R1.fastq.gz HG001.novaseq.pcr-free.30x.R2.fastq.gz --cpus 108 --threads 108 --shards 112 11 | python test_pipe_bwa.py --input $INPUT --output $OUTPUT --index Homo_sapiens_assembly38.fasta --read HG001.novaseq.pcr-free.30x.R1.fastq.gz HG001.novaseq.pcr-free.30x.R2.fastq.gz --cpus $num_cpus_per_node --threads $num_cpus_per_node --shards $num_cpus_per_node # 2>&1 | tee ${OUTPUT}log.txt 12 | 13 | echo "Output files are inside "$OUTPUT" folder" 14 | -------------------------------------------------------------------------------- /benchmarking/AWS-Intel-blog-v2.1-2024/short_db: -------------------------------------------------------------------------------- 1 | celegans_0.fa 2 | celegans_100.fa 3 | celegans_1000.fa 4 | celegans_1050.fa 5 | celegans_1100.fa 6 | celegans_1150.fa 7 | celegans_1200.fa 8 | celegans_1250.fa 9 | celegans_1300.fa 10 | celegans_1350.fa 11 | celegans_1400.fa 12 | celegans_1450.fa 13 | celegans_150.fa 14 | celegans_1500.fa 15 | celegans_1550.fa 16 | celegans_1600.fa 17 | celegans_1650.fa 18 | celegans_1700.fa 19 | celegans_1800.fa 20 | celegans_1850.fa 21 | celegans_1900.fa 22 | celegans_1950.fa 23 | celegans_200.fa 24 | celegans_2000.fa 25 | celegans_2050.fa 26 | celegans_2100.fa 27 |
celegans_2150.fa 28 | celegans_2200.fa 29 | celegans_2250.fa 30 | celegans_2300.fa 31 | celegans_2350.fa 32 | celegans_2400.fa 33 | celegans_2450.fa 34 | celegans_250.fa 35 | celegans_2500.fa 36 | celegans_2550.fa 37 | celegans_2600.fa 38 | celegans_2650.fa 39 | celegans_2700.fa 40 | celegans_2750.fa 41 | celegans_2800.fa 42 | celegans_2850.fa 43 | celegans_2900.fa 44 | celegans_2950.fa 45 | celegans_300.fa 46 | celegans_3000.fa 47 | celegans_3050.fa 48 | celegans_3100.fa 49 | celegans_3150.fa 50 | celegans_3200.fa 51 | celegans_3250.fa 52 | celegans_3300.fa 53 | celegans_3350.fa 54 | celegans_3400.fa 55 | celegans_3450.fa 56 | celegans_350.fa 57 | celegans_3500.fa 58 | celegans_3550.fa 59 | celegans_3600.fa 60 | celegans_3650.fa 61 | celegans_3700.fa 62 | celegans_3800.fa 63 | celegans_3850.fa 64 | celegans_3950.fa 65 | celegans_400.fa 66 | celegans_450.fa 67 | celegans_50.fa 68 | celegans_500.fa 69 | celegans_550.fa 70 | celegans_600.fa 71 | celegans_650.fa 72 | celegans_700.fa 73 | celegans_750.fa 74 | celegans_800.fa 75 | celegans_850.fa 76 | celegans_900.fa 77 | celegans_950.fa 78 | -------------------------------------------------------------------------------- /benchmarking/AWS-Intel-blog-v2.1-2024/test_pipe_bwa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from subprocess import Popen, PIPE, run 3 | import subprocess 4 | import time 5 | import os 6 | import sys 7 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter 8 | 9 | def main(argv): 10 | parser=ArgumentParser() 11 | parser.add_argument('--input',help="Input data directory") 12 | parser.add_argument('--output',help="Output data directory") 13 | parser.add_argument("-i", "--index", help="name of index file") 14 | parser.add_argument("-r", "--reads", nargs='+',help="name of reads file seperated by space") 15 | parser.add_argument("-c", "--cpus",default=72,help="Number of cpus. default=72") 16 | parser.add_argument("-t", "--threads",default=72,help="Number of threads used in samtool operations. default=72") 17 | #parser.add_argument("-t", "--threads",default=72,help="Number of threads used in samtool operations. 
default=72") 18 | parser.add_argument('-in', '--istart',action='store_true',help="It Will start indexing") 19 | parser.add_argument('-sindex',action='store_true',help="It Will start creating .fai file") 20 | parser.add_argument('--shards',default=1,help="Number of shards for deepvariant") 21 | args = vars(parser.parse_args()) 22 | ifile=args["index"] 23 | rfile1=args["reads"][0] 24 | rfile2=args["reads"][1] 25 | cpus=args["cpus"] 26 | threads=args["threads"] 27 | index=args["istart"] 28 | sindex=args["sindex"] 29 | nproc=args["shards"] 30 | folder=args["input"] 31 | output=args["output"] 32 | 33 | t0=time.time() 34 | file_size = os.path.getsize(folder+rfile1) 35 | print("\nSize of FASTQ file:",file_size) 36 | 37 | if index==True : 38 | print("Indexing Starts") 39 | begin = time.time() 40 | a=run('../../applications/bwa-0.7.17/bwa index '+folder+ifile,capture_output=True,shell=True) 41 | end=time.time() 42 | file_size = os.path.getsize(folder+rfile1) 43 | print("\nIndex time:",end-begin) 44 | print("\nSize of FASTQ file:",file_size) 45 | 46 | 47 | print("bwa starts") 48 | begin1 = time.time() 49 | print('../../applications/bwa-0.7.17/bwa mem -t '+cpus+' '+folder+ifile+' '+folder+rfile1+' '+folder+rfile2+' > '+output+'aln.sam') 50 | a=run('../../applications/bwa-0.7.17/bwa mem -t '+cpus+' '+folder+ifile+' '+folder+rfile1+' '+folder+rfile2+' > '+output+'aln.sam',capture_output=True, shell=True) 51 | end1=time.time() 52 | #file_size = os.path.getsize(output+'aln.sam') 53 | print("\nFASTQ to SAM time:",end1-begin1) 54 | print("\nSize of SAM file:",file_size) 55 | 56 | print("sam to sort-bam starts") 57 | begin2=time.time() 58 | print(output+'aln.bam') 59 | a=run('../../applications/samtools/samtools sort --threads '+threads+' -T /tmp/aln.sorted -o '+output+'aln.bam '+output+'aln.sam',capture_output=True,shell=True) 60 | end2=time.time() 61 | file_size = os.path.getsize(output+'aln.bam') 62 | print("\nSAM to sort-BAM time:",end2-begin2) 63 | print("\nSize of sort-BAM file",file_size) 64 | 65 | begin3=time.time() 66 | print("Indexing of ref and read starts") 67 | if sindex==True : 68 | a=run('../../applications/samtools/samtools faidx '+folder+ifile,capture_output=True,shell=True) 69 | 70 | print('../../applications/samtools/samtools index -M -@ '+threads+' '+output+'aln.bam') 71 | a=run('../../applications/samtools/samtools index -M -@ '+threads+' '+output+'aln.bam',capture_output=True,shell=True) 72 | 73 | end3=time.time() 74 | print("\nIndex creation time",end3-begin3) 75 | 76 | begin5=time.time() 77 | #original 78 | command='sudo docker run -v '+folder+':"/input" -v '+output+':"/output" google/deepvariant:1.5.0 /opt/deepvariant/bin/run_deepvariant --model_type=WGS --ref=/input/'+ifile+' --reads=/output/aln.bam --output_vcf=/output/output.vcf.gz --output_gvcf=/output/output.g.vcf.gz --intermediate_results_dir /output/intermediate_results_dir --num_shards='+nproc+' --dry_run=false' 79 | #updated 80 | #command='podman run -v '+folder+':"/input" -v '+output+':"/output" localhost/deepvariant:latest /opt/deepvariant/bin/run_deepvariant --model_type=WGS --ref=/input/'+ifile+' --reads=/output/aln.sorted.new.bam --output_vcf=/output/output.vcf.gz --intermediate_results_dir /output/intermediate_results_dir --num_shards='+nproc+' --pcl_opt --dry_run=false' 81 | print(command) 82 | a=run( command+" 2>&1 >> "+output+"log_deepvariant.txt", shell=True) 83 | #pid=subprocess.call(command,shell=True) 84 | #os.system(command) 85 | end5=time.time() 86 | print("\nDeepVariant runtime",end5-begin5) 87 | 
print("Pipeline runtime",end5-t0) 88 | 89 | if __name__ == "__main__": 90 | main(sys.argv[1:]) 91 | -------------------------------------------------------------------------------- /images/Open-Omics-Acceleration-Framework v2.0.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/Open-Omics-Acceleration-Framework v2.0.JPG -------------------------------------------------------------------------------- /images/Open-Omics-Acceleration-Framework v2.0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/Open-Omics-Acceleration-Framework v2.0.jpg -------------------------------------------------------------------------------- /images/Open-Omics-Acceleration-Framework v3.0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/Open-Omics-Acceleration-Framework v3.0.jpg -------------------------------------------------------------------------------- /images/Open-Omics-Acceleration-Framework-v2.0.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/Open-Omics-Acceleration-Framework-v2.0.JPG -------------------------------------------------------------------------------- /images/Open-Omics-Acceleration-Framework-v3.0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/Open-Omics-Acceleration-Framework-v3.0.jpg -------------------------------------------------------------------------------- /images/alphafold2-protein-folding.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/alphafold2-protein-folding.jpg -------------------------------------------------------------------------------- /images/deepvariant-fq2vcf.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/deepvariant-fq2vcf.jpg -------------------------------------------------------------------------------- /images/open-omics-acceleration-framework-v2.0.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/open-omics-acceleration-framework-v2.0.JPG -------------------------------------------------------------------------------- /images/open-omics-acceleration-framework.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/open-omics-acceleration-framework.JPG -------------------------------------------------------------------------------- 
/images/scrnaseq-analysis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/images/scrnaseq-analysis.jpg -------------------------------------------------------------------------------- /pipelines/alphafold2-based-protein-folding/Dockerfile_Inf: -------------------------------------------------------------------------------- 1 | ARG FROM_IMAGE=ubuntu:22.04 2 | # Install Base miniconda image 3 | ARG BASE_IMAGE=condaforge/miniforge3:23.1.0-3 4 | FROM ${BASE_IMAGE} as conda_setup 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | # Non-root user setup 8 | ENV SERVICE_NAME="alphafold2-inf-service" 9 | 10 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 11 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 12 | 13 | # Install Anaconda and PIP dependency 14 | RUN conda update -n base conda 15 | RUN conda install python==3.11 16 | RUN conda install -y -c conda-forge gcc_linux-64==12.1.0 gxx_linux-64==12.1.0 17 | RUN conda install -y -c conda-forge openmm==8.0.0 pdbfixer==1.9 18 | RUN conda install -y bioconda::kalign2==2.04 19 | RUN conda install -y -c conda-forge mkl==2024.2.0 mkl-devel==2024.2.0 20 | RUN python -m pip install onednn-cpu-iomp==2023.2.0 21 | RUN python -m pip install torch==2.1.0 pybind11==2.11.1 22 | RUN python -m pip install absl-py==2.0.0 biopython==1.81 chex==0.1.84 dm-haiku==0.0.10 dm-tree==0.1.8 immutabledict==3.0.0 ml-collections==0.1.1 numpy==1.26.1 scipy==1.11.3 tensorflow==2.14.0 pandas==2.1.1 psutil==5.9.6 tqdm==4.65.0 joblib==1.3.2 pragzip==0.6.0 23 | RUN python -m pip install jax==0.4.21 jaxlib==0.4.21 24 | RUN python -m pip install intel-extension-for-pytorch==2.1.0 intel-openmp==2024.2.0 25 | RUN conda install -y -c conda-forge autoconf==2.71 26 | RUN conda install -y -c conda-forge make==4.3 27 | 28 | 29 | FROM ${FROM_IMAGE} as builder 30 | ENV DEBIAN_FRONTEND=noninteractive 31 | 32 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 33 | git build-essential cmake wget tzdata gcc curl gnupg gnupg2 gnupg1 sudo kalign autoconf numactl time vim tar ca-certificates -y\ 34 | && rm -rf /var/lib/apt/lists/* \ 35 | && apt-get autoremove -y \ 36 | && apt-get clean 37 | RUN apt update 38 | 39 | COPY --from=conda_setup /opt/conda /opt/conda 40 | ENV PATH "/opt/conda/bin:$PATH" 41 | RUN echo "source activate" >> ~/.bashrc 42 | CMD source ~/.bashrc 43 | 44 | WORKDIR / 45 | # RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 46 | # RUN tar -xzf Source_code_with_submodules.tar.gz 47 | RUN git clone --recursive https://github.com/IntelLabs/Open-Omics-Acceleration-Framework.git 48 | WORKDIR /Open-Omics-Acceleration-Framework/applications/ 49 | 50 | RUN git clone --branch 5.3.0 https://github.com/jemalloc/jemalloc.git 51 | WORKDIR /Open-Omics-Acceleration-Framework/applications/jemalloc 52 | RUN bash autogen.sh --prefix=/opt/conda/ && make install 53 | WORKDIR /Open-Omics-Acceleration-Framework/applications 54 | RUN rm -rf jemalloc 55 | 56 | ENV PATH="/usr/bin:$PATH" 57 | ENV PATH "/opt/conda/bin:$PATH" 58 | ENV LD_LIBRARY_PATH "/opt/conda/lib:$LD_LIBRARY_PATH" 59 | 60 | 61 | 62 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/ 63 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/tpp-pytorch-extension 64 | RUN CC=gcc && CXX=g++ && python setup.py install 
\ 65 | && python -c "from tpp_pytorch_extension.alphafold.Alpha_Attention import GatingAttentionOpti_forward" 66 | 67 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/ 68 | RUN wget -q -P ./alphafold/common/ https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt --no-check-certificate 69 | 70 | 71 | # Swith to Non-root user 72 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /Open-Omics-Acceleration-Framework 73 | USER $SERVICE_NAME 74 | 75 | HEALTHCHECK NONE 76 | 77 | ENV PATH "/opt/conda/bin:$PATH" 78 | ENV LD_LIBRARY_PATH "/opt/conda/lib:$LD_LIBRARY_PATH" 79 | 80 | COPY ./entrypoint_inf.sh / 81 | RUN chmod +x /entrypoint_inf.sh 82 | 83 | ENTRYPOINT ["/entrypoint_inf.sh"] 84 | 85 | # Default command 86 | CMD ["default"] -------------------------------------------------------------------------------- /pipelines/alphafold2-based-protein-folding/Dockerfile_Pre: -------------------------------------------------------------------------------- 1 | ARG FROM_IMAGE=ubuntu:22.04 2 | # Install Base miniconda image 3 | ARG BASE_IMAGE=condaforge/miniforge3:24.3.0-0 4 | FROM ${BASE_IMAGE} as conda_setup 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | # Non-root user setup 8 | ENV SERVICE_NAME="alphafold2-pre-service" 9 | 10 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 11 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 12 | 13 | # Install Anaconda and PIP dependency 14 | RUN conda update -n base conda 15 | RUN conda install python==3.11 16 | RUN conda install -y -c conda-forge mkl==2024.2.0 dpcpp_linux-64==2024.2.0 dpcpp-cpp-rt==2024.2.0 mkl-devel==2024.2.0 17 | RUN conda install -y -c conda-forge openmm==8.0.0 pdbfixer==1.9 18 | RUN conda install -y -c bioconda hmmer=3.3.2 hhsuite==3.3.0 kalign2==2.04 19 | RUN python -m pip install onednn-cpu-iomp==2023.2.0 20 | RUN python -m pip install torch==2.1.0 pybind11==2.11.1 21 | RUN python -m pip install absl-py==2.0.0 biopython==1.81 chex==0.1.84 dm-haiku==0.0.10 dm-tree==0.1.8 immutabledict==3.0.0 ml-collections==0.1.1 numpy==1.26.1 scipy==1.11.3 tensorflow==2.14.0 pandas==2.1.1 psutil==5.9.6 tqdm==4.65.0 joblib==1.3.2 pragzip==0.6.0 22 | RUN python -m pip install jax==0.4.21 jaxlib==0.4.21 23 | RUN python -m pip install intel-extension-for-pytorch==2.1.0 intel-openmp==2024.2.0 24 | 25 | 26 | FROM ${FROM_IMAGE} as builder 27 | ENV DEBIAN_FRONTEND=noninteractive 28 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 29 | git build-essential cmake wget tzdata gcc curl gnupg gnupg2 gnupg1 sudo kalign autoconf numactl time vim tar ca-certificates -y \ 30 | && rm -rf /var/lib/apt/lists/* \ 31 | && apt-get autoremove -y \ 32 | && apt-get clean \ 33 | && apt update 34 | 35 | 36 | COPY --from=conda_setup /opt/conda /opt/conda 37 | ENV PATH "/opt/conda/bin:$PATH" 38 | RUN echo "source /opt/conda/bin/activate " > ~/.bashrc 39 | CMD source ~/.bashrc 40 | 41 | WORKDIR / 42 | # RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 43 | # RUN tar -xzf Source_code_with_submodules.tar.gz 44 | RUN git clone --recursive https://github.com/IntelLabs/Open-Omics-Acceleration-Framework.git 45 | 46 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold 47 | ENV LD_LIBRARY_PATH "/opt/conda/lib:$LD_LIBRARY_PATH" 48 | ENV PATH "/opt/conda/bin:$PATH" 49 | # Compile HHsuite from source. 
50 | # 51 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold 52 | RUN git clone --recursive https://github.com/IntelLabs/hh-suite.git 53 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/hh-suite 54 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/hh-suite/build 55 | RUN cmake -DCMAKE_INSTALL_PREFIX=`pwd`/release -DCMAKE_CXX_COMPILER="icpx" -DCMAKE_CXX_FLAGS_RELEASE="-O3 -mavx512bw" .. \ 56 | && make -j 4 && make install \ 57 | && ./release/bin/hhblits -h 58 | 59 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold 60 | # Compile Hmmer from source. 61 | RUN git clone --recursive https://github.com/IntelLabs/hmmer.git 62 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/hmmer 63 | RUN cp easel_makefile.in easel/Makefile.in 64 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/hmmer/easel 65 | RUN autoconf && ./configure --prefix=`pwd` 66 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold/hmmer 67 | RUN autoconf && CC=icx CFLAGS="-O3 -mavx512bw -fPIC" ./configure --prefix=`pwd`/release \ 68 | && make -j 4 && make install \ 69 | && ./release/bin/jackhmmer -h 70 | 71 | WORKDIR /Open-Omics-Acceleration-Framework/applications/alphafold 72 | # Swith to Non-root user 73 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /Open-Omics-Acceleration-Framework 74 | USER $SERVICE_NAME 75 | 76 | HEALTHCHECK NONE 77 | 78 | RUN echo "source /opt/conda/bin/activate " > ~/.bashrc 79 | CMD source ~/.bashrc 80 | ENV PATH "/opt/conda/bin:$PATH" 81 | ENV LD_LIBRARY_PATH "/opt/conda/lib:$LD_LIBRARY_PATH" 82 | 83 | COPY ./entrypoint_pre.sh / 84 | RUN chmod +x /entrypoint_pre.sh 85 | 86 | ENTRYPOINT ["/entrypoint_pre.sh"] 87 | 88 | # Default command 89 | CMD ["default"] -------------------------------------------------------------------------------- /pipelines/alphafold2-based-protein-folding/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline overview 2 | Given one or more protein sequences, this workflow performs preprocessing (database search and multiple sequence alignment using Open Omics [HMMER](https://github.com/IntelLabs/hmmer) and [HH-suite](https://github.com/IntelLabs/hh-suite)) and structure prediction through AlphaFold2's Evoformer model ([Open Omics AlphaFold2](https://github.com/IntelLabs/open-omics-alphafold)) to output the structure(s) of the protein sequences. The following block diagram illustrates the pipeline. 3 | 4 |

5 | <!-- Pipeline block diagram image omitted in this text export (likely images/alphafold2-protein-folding.jpg in this repository). --> 6 |

7 | 8 | # Build a docker image 9 | 10 | ### The current docker image requires a single-socket or dual-socket CPU with 1 or 2 NUMA domains, because it runs multiple inference instances in parallel. It can easily be modified to run on other types of machines. 11 | 12 | ```bash 13 | cd ~/Open-Omics-Acceleration-Framework/pipelines/alphafold2-based-protein-folding 14 | docker build -t alphafold:pre -f Dockerfile_Pre . # Build a docker image named alphafold:pre for the pre-processing step 15 | docker build -t alphafold:inf -f Dockerfile_Inf . # Build a docker image named alphafold:inf for the inference step 16 | 17 | ``` 18 | # Preparation 19 | 1. Follow the instructions from the https://github.com/deepmind/alphafold repo and download the databases for AlphaFold2. 20 | 2. Create a samples directory that contains FASTA files for the input proteins. 21 | 3. Create an output directory where model output will be written. 22 | 4. Create a log directory where logs will be written. 23 | # Run a docker container 24 | ```bash 25 | export DATA_DIR= 26 | export SAMPLES_DIR= 27 | export OUTPUT_DIR= 28 | export LOG_DIR= 29 | 30 | 31 | # Run pre-processing step for monomer 32 | docker run -it --cap-add SYS_NICE -v $DATA_DIR:/data \ 33 | -v $SAMPLES_DIR:/samples \ 34 | -v $OUTPUT_DIR:/output \ 35 | -v $LOG_DIR:/Open-Omics-Acceleration-Framework/applications/alphafold/logs \ 36 | alphafold:pre 37 | 38 | # Run pre-processing step for multimer 39 | docker run -it --cap-add SYS_NICE -v $DATA_DIR:/data \ 40 | -v $SAMPLES_DIR:/samples \ 41 | -v $OUTPUT_DIR:/output \ 42 | -v $LOG_DIR:/Open-Omics-Acceleration-Framework/applications/alphafold/logs \ 43 | alphafold:pre multimer 44 | 45 | # Run inference step for monomer with relaxation 46 | docker run -it --cap-add SYS_NICE -v $DATA_DIR:/data \ 47 | -v $SAMPLES_DIR:/samples \ 48 | -v $OUTPUT_DIR:/output \ 49 | -v $LOG_DIR:/Open-Omics-Acceleration-Framework/applications/alphafold/logs \ 50 | alphafold:inf monomer relax 51 | 52 | # Run inference step for multimer with relaxation 53 | docker run -it --cap-add SYS_NICE -v $DATA_DIR:/data \ 54 | -v $SAMPLES_DIR:/samples \ 55 | -v $OUTPUT_DIR:/output \ 56 | -v $LOG_DIR:/Open-Omics-Acceleration-Framework/applications/alphafold/logs \ 57 | alphafold:inf multimer relax 58 | ``` 59 | 60 | # Running bare metal 61 | 62 | To run the optimized AlphaFold2 without docker (bare metal): 63 | 1. Clone the open-omics-alphafold submodule present in the applications directory of this repo. 64 | 2. Follow the README instructions of the submodule for creating the conda environment and running inference.
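For convenience, the docker run invocations documented above can be chained into a small wrapper script. The sketch below covers the monomer preset; the four host paths are placeholders (assumptions), not defaults of the pipeline.

```bash
#!/bin/bash
# Hedged convenience wrapper chaining the documented pre-processing, inference and
# relaxation runs for the monomer preset. Adjust the placeholder paths to your setup.
set -e
export DATA_DIR=/path/to/alphafold_databases
export SAMPLES_DIR=/path/to/fasta_samples
export OUTPUT_DIR=/path/to/output
export LOG_DIR=/path/to/logs

run() {
  docker run -it --cap-add SYS_NICE \
    -v "$DATA_DIR":/data \
    -v "$SAMPLES_DIR":/samples \
    -v "$OUTPUT_DIR":/output \
    -v "$LOG_DIR":/Open-Omics-Acceleration-Framework/applications/alphafold/logs \
    "$@"
}

run alphafold:pre                 # database search + MSA pre-processing
run alphafold:inf monomer relax   # Evoformer inference followed by relaxation
```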
65 | -------------------------------------------------------------------------------- /pipelines/alphafold2-based-protein-folding/entrypoint_inf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "$1" = "multimer" ]; then 4 | echo "Running command for multimer" 5 | mkdir weights && mkdir weights/extracted && python extract_params.py --input /data/params/params_model_1_multimer_v3.npz --output_dir ./weights/extracted/model_1_multimer_v3 \ 6 | && python extract_params.py --input /data/params/params_model_2_multimer_v3.npz --output_dir ./weights/extracted/model_2_multimer_v3 \ 7 | && python extract_params.py --input /data/params/params_model_3_multimer_v3.npz --output_dir ./weights/extracted/model_3_multimer_v3 \ 8 | && python extract_params.py --input /data/params/params_model_4_multimer_v3.npz --output_dir ./weights/extracted/model_4_multimer_v3 \ 9 | && python extract_params.py --input /data/params/params_model_5_multimer_v3.npz --output_dir ./weights/extracted/model_5_multimer_v3 \ 10 | && LD_PRELOAD=/opt/conda/lib/libjemalloc.so:$LD_PRELOAD \ 11 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" \ 12 | python run_multiprocess_infer_multimer.py --root_condaenv=/opt/conda --root_home=/Open-Omics-Acceleration-Framework/applications/alphafold --input_dir=/samples --output_dir=/output --model_names="model_1_multimer_v3,model_2_multimer_v3,model_3_multimer_v3,model_4_multimer_v3,model_5_multimer_v3" --num_multimer_predictions_per_model=5 13 | if [ "$2" = "relax" ]; then 14 | echo "Running command for relaxation" 15 | python run_multiprocess_relax.py --root_home=/Open-Omics-Acceleration-Framework/applications/alphafold --input_dir=/samples --output_dir=/output --model_names="model_1_multimer_v3,model_2_multimer_v3,model_3_multimer_v3,model_4_multimer_v3,model_5_multimer_v3" --model_preset=multimer --num_multimer_predictions_per_model=5 16 | fi 17 | 18 | elif [ "$1" = "monomer" ]; then 19 | echo "Running command for monomer" 20 | mkdir weights && mkdir weights/extracted && python extract_params.py --input /data/params/params_model_1.npz --output_dir ./weights/extracted/model_1 \ 21 | && python extract_params.py --input /data/params/params_model_2.npz --output_dir ./weights/extracted/model_2 \ 22 | && python extract_params.py --input /data/params/params_model_3.npz --output_dir ./weights/extracted/model_3 \ 23 | && python extract_params.py --input /data/params/params_model_4.npz --output_dir ./weights/extracted/model_4 \ 24 | && python extract_params.py --input /data/params/params_model_5.npz --output_dir ./weights/extracted/model_5 \ 25 | && LD_PRELOAD=/opt/conda/lib/libjemalloc.so:$LD_PRELOAD \ 26 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" \ 27 | python run_multiprocess_infer.py --root_condaenv=/opt/conda --root_home=/Open-Omics-Acceleration-Framework/applications/alphafold --input_dir=/samples --output_dir=/output --model_names="model_1,model_2,model_3,model_4,model_5" 28 | 29 | if [ "$2" = "relax" ]; then 30 | echo "Running command for relaxation" 31 | python run_multiprocess_relax.py --root_home=/Open-Omics-Acceleration-Framework/applications/alphafold --input_dir=/samples --output_dir=/output --model_names="model_1,model_2,model_3,model_4,model_5" --model_preset=monomer 32 | fi 33 | 34 | else 35 | echo "Running command for monomer" 36 | mkdir weights && mkdir weights/extracted && python 
extract_params.py --input /data/params/params_model_1.npz --output_dir ./weights/extracted/model_1 \ 37 | && python extract_params.py --input /data/params/params_model_2.npz --output_dir ./weights/extracted/model_2 \ 38 | && python extract_params.py --input /data/params/params_model_3.npz --output_dir ./weights/extracted/model_3 \ 39 | && python extract_params.py --input /data/params/params_model_4.npz --output_dir ./weights/extracted/model_4 \ 40 | && python extract_params.py --input /data/params/params_model_5.npz --output_dir ./weights/extracted/model_5 \ 41 | && LD_PRELOAD=/opt/conda/lib/libjemalloc.so:$LD_PRELOAD \ 42 | MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" \ 43 | python run_multiprocess_infer.py --root_condaenv=/opt/conda --root_home=/Open-Omics-Acceleration-Framework/applications/alphafold --input_dir=/samples --output_dir=/output --model_names="model_1,model_2,model_3,model_4,model_5" 44 | fi 45 | -------------------------------------------------------------------------------- /pipelines/alphafold2-based-protein-folding/entrypoint_pre.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "$1" = "multimer" ]; then 4 | echo "Running command for multimer" 5 | mkdir weights && mkdir weights/extracted && python extract_params.py --input /data/params/params_model_1_multimer_v3.npz --output_dir ./weights/extracted/model_1_multimer_v3 \ 6 | && python extract_params.py --input /data/params/params_model_2_multimer_v3.npz --output_dir ./weights/extracted/model_2_multimer_v3 \ 7 | && python extract_params.py --input /data/params/params_model_3_multimer_v3.npz --output_dir ./weights/extracted/model_3_multimer_v3 \ 8 | && python extract_params.py --input /data/params/params_model_4_multimer_v3.npz --output_dir ./weights/extracted/model_4_multimer_v3 \ 9 | && python extract_params.py --input /data/params/params_model_5_multimer_v3.npz --output_dir ./weights/extracted/model_5_multimer_v3 \ 10 | && python run_multiprocess_pre_multimer.py --root_home=/Open-Omics-Acceleration-Framework/applications/alphafold --data_dir=/data --input_dir=/samples --output_dir=/output 11 | else 12 | echo "Running command for monomer" 13 | mkdir weights && mkdir weights/extracted && python extract_params.py --input /data/params/params_model_1.npz --output_dir ./weights/extracted/model_1 \ 14 | && python run_multiprocess_pre.py --root_home=/Open-Omics-Acceleration-Framework/applications/alphafold --data_dir=/data --input_dir=/samples --output_dir=/output --model_name=model_1 15 | fi -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/Dockerfile_fq2bams: -------------------------------------------------------------------------------- 1 | # Install Base miniconda image 2 | ARG FROM_IMAGE=amazonlinux:2023 3 | #ARG BASE_IMAGE=continuumio/miniconda3 4 | ARG BASE_IMAGE=condaforge/miniforge3 5 | FROM ${BASE_IMAGE} as conda_setup 6 | 7 | WORKDIR / 8 | RUN git clone --recursive https://github.com/IntelLabs/Open-Omics-Acceleration-Framework.git 9 | WORKDIR /Open-Omics-Acceleration-Framework 10 | RUN git checkout 060a0c76ad4ded6d6de709b0466b8bdafdc6053d 11 | WORKDIR /Open-Omics-Acceleration-Framework/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/ 12 | 13 | RUN conda env create --name dv_env -f environment.yml 14 | 15 | 16 | FROM ${FROM_IMAGE} as builder 17 | RUN yum update -y && \ 18 | yum install 
--allowerasing -y git gcc make cmake3 tar gnupg2 autoconf numactl time vim && \ 19 | yum clean all && \ 20 | rm -rf /var/cache/yum 21 | RUN yum install -y procps 22 | RUN yum groupinstall -y 'Development Tools' 23 | RUN yum -y update 24 | RUN yum -y install make zlib-devel ncurses-devel 25 | RUN yum -y install bzip2-devel xz-devel 26 | RUN yum -y install yum-utils 27 | 28 | COPY --from=conda_setup /opt/conda /opt/conda 29 | ENV PATH "/opt/conda/envs/dv_env/bin:/opt/conda/bin:$PATH" 30 | RUN echo "source activate dv_env" >> ~/.bashrc 31 | RUN source ~/.bashrc 32 | 33 | WORKDIR / 34 | RUN git clone --recursive https://github.com/IntelLabs/Open-Omics-Acceleration-Framework.git 35 | WORKDIR /Open-Omics-Acceleration-Framework 36 | RUN git checkout 060a0c76ad4ded6d6de709b0466b8bdafdc6053d 37 | 38 | WORKDIR /Open-Omics-Acceleration-Framework/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/ 39 | 40 | 41 | # compile bwa-mem2 42 | RUN echo "Build bwa-mem2" 43 | WORKDIR /Open-Omics-Acceleration-Framework/applications/bwa-mem2 44 | RUN make multi 45 | 46 | 47 | # compile htslib 48 | WORKDIR /Open-Omics-Acceleration-Framework/applications/htslib 49 | RUN autoreconf -i # Build the configure script and install files it uses 50 | RUN ./configure # Optional but recommended, for choosing extra functionality 51 | RUN make 52 | #make install #uncomment this for installation 53 | 54 | # compile samtools 55 | WORKDIR /Open-Omics-Acceleration-Framework/applications/samtools 56 | RUN autoheader 57 | RUN autoconf -Wno-syntax 58 | RUN chmod 775 configure 59 | RUN ./configure # Needed for choosing optional functionality 60 | RUN make 61 | 62 | RUN mkdir /input 63 | RUN mkdir /output 64 | WORKDIR /Open-Omics-Acceleration-Framework/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/ 65 | CMD ["/bin/bash"] 66 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/bams2vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import json 5 | from subprocess import Popen, PIPE, run 6 | import subprocess 7 | import time 8 | import os 9 | import sys 10 | import threading 11 | import tempfile 12 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter 13 | from mpi4py import MPI 14 | import bisect 15 | import heapq as hq 16 | import numpy as np 17 | from multiprocessing import Pool 18 | from operator import itemgetter 19 | import pickle 20 | BINDIR="../.." 
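# Added commentary (not part of the original script): bams2vcf.py is launched as one
# MPI rank per genome bin. With bins_per_rank = 1, each rank r reads the per-bin
# alignment aln<"%05d" % r>.bam plus the shared bin_region.pkl produced by the
# fq2bams step, runs DeepVariant restricted to its region via --regions, and writes
# <"%05d" % r>/output.vcf.gz; rank 0 then concatenates the per-bin VCFs with
# merge_vcf.sh. For example, with 4 ranks the inputs are aln00000.bam .. aln00003.bam
# covering bin_region[0] .. bin_region[3].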
21 | 22 | def allexit(comm, flg): 23 | comm.barrier() 24 | flg = comm.bcast(flg, root=0) 25 | if flg: os.sys.exit(1) 26 | 27 | 28 | #def main(argv): 29 | def main(args): 30 | ifile=args["refindex"] 31 | cpus=args["cpus"] 32 | threads=args["threads"] 33 | nproc=args["shards"] 34 | inputdir=args["input"] + "/" 35 | output=args["output"] + "/" 36 | refdir=args["refdir"] + "/" 37 | #tempdir=output 38 | tempdir = args["tempdir"] 39 | if tempdir == "": tempdir = output 40 | else: tempdir = tempdir + "/" 41 | 42 | outfile = args["outfile"] 43 | 44 | comm = MPI.COMM_WORLD 45 | rank = comm.Get_rank() 46 | nranks = comm.Get_size() 47 | bin_region=None 48 | 49 | global ncpus 50 | ncpus = int(cpus) 51 | i = 0 52 | binstr = "%05d"%(i*nranks+rank) 53 | t0 = time.time() 54 | if not os.path.isfile(os.path.join(output, 'bin_region.pkl')): 55 | print("[Info] Missing intermediate .pkl files from fq2bam part of the pipeline.") 56 | os.sys.exit(1) 57 | 58 | if not os.path.isfile(os.path.join(inputdir, 'aln' + binstr + '.bam')): 59 | print("[Info] Missing intermediate .bam files from fq2bam part of the pipeline.") 60 | os.sys.exit(1) 61 | 62 | if (ifile == "" or ifile == None) or not os.path.isfile(os.path.join(refdir, ifile)): 63 | print("[Info] Missing reference file.") 64 | os.sys.exit(1) 65 | 66 | 67 | with open(os.path.join(inputdir, 'bin_region.pkl'), 'rb') as f: 68 | bin_region = pickle.load(f) 69 | print(bin_region) 70 | 71 | command='mkdir -p '+os.path.join(output,binstr)+ \ 72 | '; /opt/deepvariant/bin/run_deepvariant --model_type=WGS --ref=' + \ 73 | os.path.join(refdir, ifile) + \ 74 | ' --reads='+ os.path.join(inputdir, 'aln' + binstr + '.bam') + ' ' + \ 75 | ' --output_vcf=' + os.path.join(output, binstr, 'output.vcf.gz ') + ' ' + \ 76 | ' --intermediate_results_dir '+ \ 77 | os.path.join(output, 'intermediate_results_dir'+ binstr) + \ 78 | ' --num_shards='+ str(nproc)+ \ 79 | ' --dry_run=false --regions "' + bin_region[i*nranks+rank]+'"' 80 | 81 | print("Deepvariant commandline: ") 82 | print(command) 83 | 84 | a = run('echo "'+command+'" > '+os.path.join(output, "logs", 'dvlog'+binstr+'.txt'), shell=True) 85 | a = run(command + " 2>&1 >> " + os.path.join(output, "logs", 'dvlog'+binstr+'.txt'), shell=True) 86 | assert a.returncode == 0,"[Info] Deepvariant execution failed." 
87 | comm.barrier() 88 | 89 | bins_per_rank = 1 90 | flg = 0 91 | if rank == 0: 92 | cmd = 'bash merge_vcf.sh '+output +' '+str(nranks)+' '+str(bins_per_rank) + ' ' + outfile + " > " + output + "/logs/mergelog.txt" 93 | a = run(cmd, capture_output = True, shell = True) 94 | #assert a.returncode == 0,"VCF merge failed" 95 | if a.returncode != 0: 96 | flg = 1 97 | print("[Info] VCF merge failed.") 98 | end5 = time.time() 99 | print("\nDeepVariant runtime",end5-t0) 100 | #print("\nTime for the whole pipeline",end5-start0) 101 | for i in range(nranks): 102 | r = "%05d"%(i) 103 | p = os.path.join(output, r) 104 | #print('path: ', p) 105 | os.system('rm -rf ' + p) 106 | 107 | if rank == nranks - 1: 108 | print('[Info] Cleaning up....') 109 | for i in range(nranks): 110 | r = "%05d"%(i) 111 | #print(r) 112 | #os.system('ls -lh ' + r) 113 | if not args['keep_input']: 114 | os.system('rm -rf ' + os.path.join(inputdir, "bin_region.pkl")) 115 | os.system('rm -rf ' + os.path.join(inputdir, "aln"+r+".bam")) 116 | os.system('rm -rf ' + os.path.join(inputdir, "aln"+r+".bam.bai")) 117 | 118 | os.system('rm -rf '+ os.path.join(output, 'intermediate_results_dir' + r)) 119 | print('[Info] Cleaning up done.') 120 | 121 | 122 | allexit(comm, flg) ## all ranks exit if failure in rank 0 above 123 | 124 | if __name__ == "__main__": 125 | args = json.loads(sys.argv[1]) 126 | main(args) 127 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/docs.txt: -------------------------------------------------------------------------------- 1 | python run_fq2bams.py --ref /refdir/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna --reads /input/HG001.novaseq.pcr-free.30x.R1_29M.fastq.gz --output /out/ --params '-R "@RG\\tID:RG1\\tSM:RGSN1"' --keep_intermediate_sam 2 | 3 | python run_bams2vcf.py --ref /refdir/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna --output /o 4 | ut/out2.vcf --input /out/ 5 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/environment.yml: -------------------------------------------------------------------------------- 1 | name: new_env 2 | channels: 3 | - conda-forge 4 | - anaconda 5 | dependencies: 6 | - _libgcc_mutex=0.1=main 7 | - _openmp_mutex=5.1=1_gnu 8 | - blas=1.0=mkl 9 | - ca-certificates=2023.01.10=h06a4308_0 10 | - certifi=2022.12.7=py39h06a4308_0 11 | - intel-openmp=2021.4.0=h06a4308_3561 12 | - ld_impl_linux-64=2.38=h1181459_1 13 | - libffi=3.4.2=h6a678d5_6 14 | - libgcc-ng=11.2.0=h1234567_1 15 | - libgfortran-ng=7.5.0=ha8ba4b0_17 16 | - libgfortran4=7.5.0=ha8ba4b0_17 17 | - libgomp=11.2.0=h1234567_1 18 | - libstdcxx-ng=11.2.0=h1234567_1 19 | - mkl=2021.4.0=h06a4308_640 20 | - mkl-service=2.4.0=py39h7f8727e_0 21 | - mkl_fft=1.3.1=py39hd3c417c_0 22 | - mkl_random=1.2.2=py39h51133e4_0 23 | - mpi=1.0=mpich 24 | - mpi4py=3.1.4=py39hfc96bbd_0 25 | - mpich=3.3.2=hc856adb_0 26 | - ncurses=6.4=h6a678d5_0 27 | - numpy=1.23.5=py39h14f4228_0 28 | - numpy-base=1.23.5=py39h31eccc5_0 29 | - openssl=1.1.1t=h7f8727e_0 30 | - pip=23.0.1=py39h06a4308_0 31 | - python=3.9.16=h7a1cb2a_2 32 | - readline=8.2=h5eee18b_0 33 | - setuptools=65.6.3=py39h06a4308_0 34 | - six=1.16.0=pyhd3eb1b0_1 35 | - sqlite=3.41.1=h5eee18b_0 36 | - tk=8.6.12=h1ccaba5_0 37 | - tzdata=2023c=h04d1e81_0 38 | - wheel=0.38.4=py39h06a4308_0 39 | - xz=5.2.10=h5eee18b_1 40 | - zlib=1.2.13=h5eee18b_0 41 | - pip: 42 | - pragzip==0.5.0 43 | - yappi==1.4.0 44 | 45 
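The environment.yml above provides the MPI-enabled Python environment (mpich/mpi4py) that the fq2vcf drivers run in, and docs.txt earlier lists the two documented entry points. A hedged end-to-end sketch, assuming the /refdir, /input and /out mount points used in docs.txt:

```bash
# Create the conda environment (named dv_env, as in Dockerfile_fq2bams) and activate it.
conda env create --name dv_env -f environment.yml
conda activate dv_env

# Stage 1: FASTQ -> sorted, indexed per-bin BAMs (plus bin_region.pkl).
python run_fq2bams.py \
  --ref /refdir/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
  --reads /input/HG001.novaseq.pcr-free.30x.R1_29M.fastq.gz \
  --output /out/ \
  --params '-R "@RG\\tID:RG1\\tSM:RGSN1"' \
  --keep_intermediate_sam

# Stage 2: per-bin BAMs -> merged VCF via DeepVariant.
python run_bams2vcf.py \
  --ref /refdir/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
  --output /out/out2.vcf \
  --input /out/
```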
| -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/libmimalloc.so.2.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/Open-Omics-Acceleration-Framework/db29aec9c3e2eb27c36dd91824dfc346e5deae89/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/libmimalloc.so.2.0 -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/merge_vcf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 4 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 5 | #echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 6 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 7 | #echo "Value of ABS_DIRECTORY: ${ABS_DIRECTORY}" 8 | 9 | echo $1 $2 $3 $4 10 | #path ranks bins 11 | total=$(( ($2 * $3) )) 12 | for (( j=0 ; j < $total ; j++ )) 13 | do 14 | printf -v padded_number "%05d" $j 15 | echo $padded_number 16 | ls ${1}/${padded_number}/output.vcf.gz -v >> ${1}/a.txt 17 | 18 | done 19 | vcf_list=`cat ${1}/a.txt` 20 | 21 | ${ABS_DIRECTORY}/../../applications/bcftools/bcftools concat $vcf_list > ${1}/${4}.vcf.gz 22 | 23 | rm ${1}/a.txt 24 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/config: -------------------------------------------------------------------------------- 1 | export LD_PRELOAD=/Open-Omics-Acceleration-Framework/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/libmimalloc.so.2.0:$LD_PRELOAD 2 | export INPUT_DIR=/reads/ 3 | export OUTPUT_DIR=/output/ 4 | export REF_DIR=/ref/ 5 | REF=GCA_000001405.15_GRCh38_no_alt_analysis_set.fna 6 | R1=HG001_R1.fastq.gz 7 | R2=HG001_R2.fastq.gz 8 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/extra_scripts/config: -------------------------------------------------------------------------------- 1 | export INPUT_DIR=/output/ 2 | export OUTPUT_DIR=/output/ 3 | export REF_DIR=/ref/ 4 | export REF=GCA_000001405.15_GRCh38_no_alt_analysis_set.fna 5 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/extra_scripts/merge_vcf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 4 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 5 | #echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 6 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 7 | #echo "Value of ABS_DIRECTORY: ${ABS_DIRECTORY}" 8 | 9 | echo $1 $2 $3 10 | #path ranks bins 11 | total=$(( ($2 * $3) )) 12 | for (( j=0 ; j < $total ; j++ )) 13 | do 14 | printf -v padded_number "%05d" $j 15 | echo $padded_number 16 | ls ${1}/${padded_number}/output.vcf.gz -v >> ${1}/a.txt 17 | 18 | done 19 | vcf_list=`cat ${1}/a.txt` 20 | 21 | bcftools concat $vcf_list > ${1}/output.vcf.gz 22 | 23 | rm ${1}/a.txt 24 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/extra_scripts/run_pipeline_ec2_part2.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash 2 | set -e 3 | source config 4 | 5 | #cd ../.. 6 | 7 | #source miniconda3/bin/activate dv_env 8 | 9 | echo "localhost" > hostfile 10 | 11 | num_nodes=`cat hostfile | wc -l` 12 | 13 | first_ip=`head -n 1 hostfile` 14 | 15 | #ssh ${first_ip} lscpu > compute_config 16 | lscpu > compute_config 17 | 18 | 19 | num_cpus_per_node=$(cat compute_config | grep -E '^CPU\(s\)' | awk '{print $2}') 20 | num_cpus_all_node=`expr ${num_cpus_per_node} \* ${num_nodes}` 21 | threads_per_core=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') 22 | echo "Total number of CPUs across all nodes: $num_cpus_all_node" 23 | 24 | 25 | num_physical_cores_all_nodes=`expr ${num_cpus_all_node} / ${threads_per_core}` 26 | 27 | num_physical_cores_per_nodes=`expr ${num_cpus_per_node} / ${threads_per_core}` 28 | 29 | 30 | while [ $num_physical_cores_per_nodes -ge 20 ] 31 | do 32 | num_physical_cores_per_nodes=`expr $num_physical_cores_per_nodes / 2` 33 | done 34 | 35 | num_physical_cores_per_rank=$num_physical_cores_per_nodes 36 | 37 | total_num_ranks=`expr ${num_physical_cores_all_nodes} / ${num_physical_cores_per_rank}` 38 | 39 | ranks_per_node=`expr ${total_num_ranks} / ${num_nodes}` 40 | 41 | sh run_pipeline_part2.sh ${total_num_ranks} ${ranks_per_node} ${REF} ${R1} ${R2} "sudo docker" 42 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/extra_scripts/run_pipeline_part2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 5 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 6 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPTINPUT_DIR_PATH}")" 7 | 8 | INDIR=$INPUT_DIR 9 | OUTDIR=$OUTPUT_DIR 10 | REFDIR=$REF_DIR 11 | #* ranks: Number of mpi process that we want the pipeline to run on 12 | #* threads/shards: parameters to different tools in the pipeline, calculated as below 13 | ppn=$2 14 | 15 | 16 | Sockets=$(cat compute_config | grep -E '^Socket\(s\)' | awk '{print $2}') #2 17 | Cores=$(cat compute_config | grep -E '^Core\(s\)' | awk '{print $4}') #56 18 | Thread=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') #2 19 | 20 | a=$(( $(( ${Cores}*${Thread}*${Sockets} / $ppn )) - 2*${Thread} )) #24 (Four threads are removed for IO) 21 | b=$(( $(( ${Cores}*${Sockets} )) / $ppn )) #14 22 | 23 | if [ $a -lt 1 ] 24 | then 25 | echo 'Number of cpus are less to run the pipeline.' 26 | exit 0 27 | fi 28 | 29 | N=$1 30 | PPN=$2 31 | CPUS=$a 32 | THREADS=$a 33 | SHARDS=$b 34 | REF=$(basename "$3") #Change to your reference file 35 | READ1=$(basename "$4") #Change your read files 36 | READ2=$(basename "$5") 37 | BINDING=socket 38 | Container=docker 39 | 40 | if [ $# -gt 5 ] 41 | then 42 | Container="$6" 43 | fi 44 | 45 | echo "Output directory: $OUTDIR" 46 | mkdir -p ${OUTDIR} 47 | #It is assumed that if reference file is .gz then it is converted using create_reference_index.sh or pcluster_reference_index.sh script. 48 | file_ext=${REF##*.} 49 | 50 | if [ "${file_ext}" = "gz" ] 51 | then 52 | REF=$(basename "$REF" .gz ) 53 | if ! [ -f $REFDIR/${REF} ]; then 54 | echo "File $REFDIR/${REF} does not exist." 55 | exit 0 56 | fi 57 | fi 58 | 59 | echo Starting run with $N ranks, $CPUS threads,$THREADS threads, $SHARDS shards, $PPN ppn. 60 | # -in -sindex are required only once for indexing. 61 | # Todo : Make index creation parameterized. 
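# Added commentary (illustrative, not part of the original script): with the example
# hardware noted in the inline comments above (2 sockets x 56 cores x 2 threads) and
# ppn=8 ranks per node, the arithmetic gives
#   CPUS = THREADS = a = (56*2*2)/8 - 2*2 = 28 - 4 = 24   (2*Thread CPUs reserved for IO)
#   SHARDS          = b = (56*2)/8        = 14
# The mpiexec launch below then starts N ranks, pinned per $BINDING, each running the
# DeepVariant stage (test_deep.py) with those per-rank resources. If a < 1 the script
# aborts earlier with "Number of cpus are less to run the pipeline."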
62 | mpiexec -bootstrap ssh -bind-to $BINDING -map-by $BINDING --hostfile hostfile -n $N -ppn $PPN python -u test_deep.py --inputdir $INDIR --output $OUTDIR $TEMPDIR --refdir $REFDIR --index $REF --cpus $CPUS --threads $THREADS --shards $SHARDS 2>&1 | tee ${OUTDIR}/log.txt 63 | 64 | #/opt/deepvariant/bin/run_deepvariant --model_type=WGS --ref=/refdir/'+ifile+' --reads=/tempdir/aln'+binstr+'.bam --output_vcf=/output/output.vcf.gz --intermediate_results_dir /tempdir/intermediate_results_dir'+binstr+' --num_shards='+nproc+' --dry_run=false --regions "'+bin_region[i*nranks+rank]+'"' 65 | 66 | #echo "Pipeline finished. Output vcf can be found at: $OUTPUT_DIR/output.vcf.gz" 67 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/run_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 4 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 5 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPTINPUT_DIR_PATH}")" 6 | 7 | INDIR=$INPUT_DIR 8 | OUTDIR=$OUTPUT_DIR 9 | REFDIR=$REF_DIR 10 | #* ranks: Number of mpi process that we want the pipeline to run on 11 | #* threads/shards: parameters to different tools in the pipeline, calculated as below 12 | ppn=$2 13 | 14 | 15 | Sockets=$(cat compute_config | grep -E '^Socket\(s\)' | awk '{print $2}') #2 16 | Cores=$(cat compute_config | grep -E '^Core\(s\)' | awk '{print $4}') #56 17 | Thread=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') #2 18 | 19 | a=$(( $(( ${Cores}*${Thread}*${Sockets} / $ppn )) - 2*${Thread} )) #24 (Four threads are removed for IO) 20 | b=$(( $(( ${Cores}*${Sockets} )) / $ppn )) #14 21 | 22 | if [ $a -lt 1 ] 23 | then 24 | echo 'Number of cpus are less to run the pipeline.' 25 | exit 0 26 | fi 27 | 28 | N=$1 29 | PPN=$2 30 | CPUS=$a 31 | THREADS=$a 32 | SHARDS=$b 33 | REF=$(basename "$3") #Change to your reference file 34 | READ1=$(basename "$4") #Change your read files 35 | READ2=$(basename "$5") 36 | BINDING=socket 37 | Container=docker 38 | 39 | if [ $# -gt 5 ] 40 | then 41 | Container="$6" 42 | fi 43 | 44 | echo "Output directory: $OUTDIR" 45 | mkdir -p ${OUTDIR} 46 | #It is assumed that if reference file is .gz then it is converted using create_reference_index.sh or pcluster_reference_index.sh script. 47 | file_ext=${REF##*.} 48 | 49 | if [ "${file_ext}" = "gz" ] 50 | then 51 | REF=$(basename "$REF" .gz ) 52 | if ! [ -f $REFDIR/${REF} ]; then 53 | echo "File $REFDIR/${REF} does not exist." 54 | exit 0 55 | fi 56 | fi 57 | 58 | echo Starting run with $N ranks, $CPUS threads,$THREADS threads, $SHARDS shards, $PPN ppn. 59 | # -in -sindex are required only once for indexing. 60 | # Todo : Make index creation parameterized. 61 | mpiexec -bootstrap ssh -bind-to $BINDING -map-by $BINDING --hostfile hostfile -n $N -ppn $PPN python -u test_pipeline_final.py --input $INDIR --output $OUTDIR $TEMPDIR --refdir $REFDIR --index $REF --read $READ1 $READ2 --cpus $CPUS --threads $THREADS --shards $SHARDS --container_tool "$Container" 2>&1 | tee ${OUTDIR}/log.txt 62 | 63 | echo "Pipeline finished. 
Output vcf can be found at: $OUTPUT_DIR/output.vcf.gz" 64 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/run_pipeline_part1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 5 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 6 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPTINPUT_DIR_PATH}")" 7 | 8 | INDIR=$INPUT_DIR 9 | OUTDIR=$OUTPUT_DIR 10 | REFDIR=$REF_DIR 11 | #* ranks: Number of mpi process that we want the pipeline to run on 12 | #* threads/shards: parameters to different tools in the pipeline, calculated as below 13 | ppn=$2 14 | 15 | 16 | Sockets=$(cat compute_config | grep -E '^Socket\(s\)' | awk '{print $2}') #2 17 | Cores=$(cat compute_config | grep -E '^Core\(s\)' | awk '{print $4}') #56 18 | Thread=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') #2 19 | 20 | a=$(( $(( ${Cores}*${Thread}*${Sockets} / $ppn )) - 2*${Thread} )) #24 (Four threads are removed for IO) 21 | b=$(( $(( ${Cores}*${Sockets} )) / $ppn )) #14 22 | 23 | if [ $a -lt 1 ] 24 | then 25 | echo 'Number of cpus are less to run the pipeline.' 26 | exit 0 27 | fi 28 | 29 | N=$1 30 | PPN=$2 31 | CPUS=$a 32 | THREADS=$a 33 | SHARDS=$b 34 | REF=$(basename "$3") #Change to your reference file 35 | READ1="$4" #Change your read files 36 | READ2="$5" 37 | BINDING=socket 38 | Container=docker 39 | 40 | if [ $# -gt 5 ] 41 | then 42 | Container="$6" 43 | fi 44 | 45 | echo "Output directory: $OUTDIR" 46 | mkdir -p ${OUTDIR} 47 | #It is assumed that if reference file is .gz then it is converted using create_reference_index.sh or pcluster_reference_index.sh script. 48 | file_ext=${REF##*.} 49 | 50 | if [ "${file_ext}" = "gz" ] 51 | then 52 | REF=$(basename "$REF" .gz ) 53 | if ! [ -f $REFDIR/${REF} ]; then 54 | echo "File $REFDIR/${REF} does not exist." 55 | exit 0 56 | fi 57 | fi 58 | 59 | echo Starting run with $N ranks, $CPUS threads,$THREADS threads, $SHARDS shards, $PPN ppn. 60 | # -in -sindex are required only once for indexing. 61 | # Todo : Make index creation parameterized. 62 | mpiexec -bootstrap ssh -bind-to $BINDING -map-by $BINDING --hostfile hostfile -n $N -ppn $PPN python -u test_pipeline_part1.py --input $INDIR --output $OUTDIR $TEMPDIR --refdir $REFDIR --index $REF --read $READ1 $READ2 --cpus $CPUS --threads $THREADS --shards $SHARDS --container_tool "$Container" 2>&1 | tee ${OUTDIR}/log_part1.txt 63 | 64 | #echo "Pipeline finished. 
Output vcf can be found at: $OUTPUT_DIR/output.vcf.gz" 65 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/basic_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | # All basic dev tools for Ubuntu 22.04 4 | 5 | sudo apt update 6 | 7 | #sudo apt -y upgrade 8 | 9 | sudo apt -y install make 10 | 11 | sudo apt -y install autoconf 12 | 13 | sudo apt -y install numactl 14 | 15 | sudo apt -y install build-essential 16 | 17 | sudo apt -y install zlib1g-dev 18 | 19 | sudo apt -y install libncurses5-dev 20 | 21 | sudo apt -y update 22 | 23 | #sudo apt -y upgrade 24 | 25 | sudo apt -y install libbz2-dev 26 | 27 | sudo apt -y install liblzma-dev 28 | 29 | sudo apt-get -qq -y update 30 | sudo apt-get -qq -y install wget 31 | 32 | # All dependencies for bcftools Docker 33 | echo "Installing Docker" 34 | sudo apt-get -qq -y install apt-transport-https ca-certificates curl gnupg-agent software-properties-common 35 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 36 | 37 | 38 | sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" 39 | 40 | sudo apt-get -qq -y update 41 | sudo apt-get -qq -y install docker-ce 42 | sudo systemctl start docker 43 | 44 | sudo docker --version 45 | 46 | echo "Running Docker installation hello world!! test" 47 | sudo docker run hello-world 48 | 49 | #echo "Creating and activating a conda environment" 50 | #source setup_env.sh deepvaraint_env 51 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/build_deepvariant_docker_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | 5 | 6 | #Pre-req: Docker installation 7 | # Check if Docker is installed 8 | if [ "$(command -v docker)" ]; then 9 | # Docker is installed 10 | echo "Docker is installed." 11 | 12 | # Check Docker version 13 | docker_version=$("docker" --version | awk '{print $3}') 14 | echo "Docker version: $docker_version" 15 | else 16 | # Docker is not installed 17 | echo "Docker is not installed on this system." 18 | exit 1 19 | fi 20 | 21 | 22 | # Build docker 23 | 24 | # This will save deepvariant images 25 | cd ../../../../applications/deepvariant 26 | sudo docker build -t deepvariant . 27 | 28 | # check the built and print the image ID 29 | 30 | sudo docker images | grep "deepvariant:latest" 31 | 32 | #save image(~7 GB) to tar file if you are using multiple nodes. 33 | 34 | echo "Saving deepvariant:latest image as deepvariant.tar..." 
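# Added commentary: the ~7 GB deepvariant.tar is only needed for multi-node runs;
# compute nodes import it with "sudo docker load -i deepvariant.tar" (see
# pcluster_compute_node_setup.sh). For a single-node EC2 run the locally built
# deepvariant:latest image is used directly and the save step can be skipped.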
35 | cd - # Move to pipelines/deepvariant 36 | sudo docker save -o deepvariant.tar deepvariant:latest 37 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/build_tools.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | # Prerequisites: conda env activated 4 | 5 | 6 | # Clone the repo: https://github.com/IntelLabs/Open-Omics-Acceleration-Framework.git 7 | 8 | # git clone --recursive https://github.com/IntelLabs/Open-Omics-Acceleration-Framework.git 9 | 10 | cd ../../../../../Open-Omics-Acceleration-Framework 11 | WDIR=`pwd` 12 | 13 | cd ${WDIR}/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/ 14 | 15 | ls 16 | 17 | 18 | # Pre-req: conda env 19 | source setup_env.sh dv_env 20 | 21 | 22 | # compile bwa-mem2 23 | echo "Build bwa-mem2" 24 | cd ${WDIR}/applications/bwa-mem2 25 | make multi 26 | if [ -e "${WDIR}/applications/bwa-mem2/bwa-mem2" ]; then 27 | echo "bwa-mem2 build successful" 28 | else 29 | echo "Error!! bwa-mem2 build failed" 30 | fi 31 | 32 | #make install #uncomment this for installation 33 | 34 | # compile htslib 35 | cd ${WDIR}/applications/htslib 36 | autoreconf -i # Build the configure script and install files it uses 37 | ./configure # Optional but recommended, for choosing extra functionality 38 | make 39 | #make install #uncomment this for installation 40 | 41 | # compile bcftools 42 | cd ${WDIR}/applications/bcftools 43 | # The following is optional: 44 | # autoheader && autoconf && ./configure --enable-libgsl --enable-perl-filters 45 | make 46 | #make install #uncomment this for installation 47 | 48 | # compile samtools 49 | cd ${WDIR}/applications/samtools 50 | autoheader 51 | autoconf -Wno-syntax 52 | chmod 775 configure 53 | ./configure # Needed for choosing optional functionality 54 | make 55 | #make install #uncomment this for installation 56 | 57 | 58 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/config: -------------------------------------------------------------------------------- 1 | export LD_PRELOAD=/Open-Omics-Acceleration-Framework/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/libmimalloc.so.2.0:$LD_PRELOAD 2 | export INPUT_DIR=/path-to-read-datasets/ 3 | export OUTPUT_DIR=/path-to-output-directory/ 4 | export REF_DIR=/path-to-ref-directory/ 5 | REF=ref.fasta 6 | R1=R1.fastq.gz 7 | R2=R2.fastq.gz 8 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/create_reference_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 4 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 5 | #echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 6 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 7 | 8 | cd $ABS_DIRECTORY 9 | source config 10 | filename=${REF} 11 | file_ext=${filename##*.} 12 | file_name_without_extension=$(basename "$filename" .gz ) 13 | 14 | 15 | if [ ${file_ext} == 'gz' ] 16 | then 17 | echo "Refecence file is decompressing..." 
18 | gzip -d ${REF_DIR}/${filename} 19 | REF=${file_name_without_extension} 20 | fi 21 | ref=${REF_DIR}/${REF} 22 | mkdir -p ${OUTPUT_DIR} 23 | 24 | echo "Checking the index files for $ref" 25 | ls ${ref}* 26 | 27 | # mem2 index 28 | echo "Creating FM-index for the reference sequence ${ref}" 29 | cd ../../../../applications/bwa-mem2 30 | ./bwa-mem2 index $ref &> ${OUTPUT_DIR}/bwa_mem2_index_log 31 | cd - &> /dev/null 32 | 33 | 34 | # samtool idfai index 35 | echo "Creating fai index for the reference sequence ${ref}" 36 | cd ../../../../applications/samtools 37 | ./samtools faidx $ref &> ${OUTPUT_DIR}/samtools_fai_log 38 | cd - &> /dev/null 39 | 40 | 41 | echo "The list of all index files created." 42 | ls ${ref}* 43 | if [ -z $1 ] 44 | then 45 | echo "Index files are created." 46 | else 47 | echo "Index files are created release instance by typing: 'scancel $1' " 48 | fi 49 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/deepvariant_ec2_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo "Step 1: Basic installation.." 5 | bash basic_setup.sh 6 | 7 | echo "Step 2: Building applications.." 8 | bash build_tools.sh 9 | 10 | echo "Step 3: Building Deepvaraint image.." 11 | bash build_deepvariant_docker_image.sh 12 | 13 | echo "Setup done!!" 14 | 15 | echo "Next step, modify the \"config\" file according to the reference sequence and the read datasets, and run \"bash create_reference_index.sh \"" 16 | 17 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/deepvariant_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo "Step 1: Basic installation.." 5 | bash basic_setup.sh 6 | 7 | echo "Step 2: Building applications.." 8 | bash build_tools.sh 9 | 10 | echo "Step 3: Building and saving Deepvaraint image.." 11 | bash build_deepvariant_docker_image.sh 12 | 13 | echo "Setup done!!" 14 | 15 | echo "Next step, modify the \"config\" file according to the reference sequence and the read datasets, and run \"bash create_reference_index.sh \"" 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/pcluster_compute_node_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | WDIR=`pwd` 4 | num_compute_nodes=$1 5 | allocation_time="02:00:00" 6 | 7 | if [ -z $2 ] 8 | then 9 | echo "Allocating compute nodes by default for 2 hours" 10 | else 11 | echo "Allocating compute nodes for $2 hours" 12 | allocation_time=$2 13 | fi 14 | 15 | # Allocate compute nodes 16 | salloc --nodes=${num_compute_nodes} --ntasks-per-node=1 --wait-all-nodes=1 --time=${allocation_time} --no-shell &> tmp_salloc && grep "Granted job allocation" tmp_salloc | cut -d" " -f5 &> tmp_jobid 17 | 18 | jid=`cat tmp_jobid | head -n 1` 19 | 20 | rm tmp_salloc tmp_jobid 21 | 22 | srun --jobid=$jid hostname > ../../hostfile 23 | 24 | echo "Cluster alloccation done!!" 
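# Added commentary: at this point hostfile lists the allocated compute nodes; the loop
# below ssh-es into each one to run basic_setup.sh (build tools + docker) and to load
# the previously saved deepvariant.tar image. Note the per-node setup commands are
# backgrounded with "&", so setup proceeds in parallel across nodes.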
25 | cat ../../hostfile 26 | 27 | for i in `cat ../../hostfile` 28 | do 29 | echo $i 30 | ssh $i "bash ${WDIR}/basic_setup.sh && sudo docker load -i ${WDIR}/deepvariant.tar && sudo docker images && echo \"setup done for $i. Press enter to continue..\" " & 31 | done 32 | 33 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/pcluster_example_config: -------------------------------------------------------------------------------- 1 | # This example configuration file is created for parallel cluster version 2.11.9 2 | # This file resides in the ~/.parallelcluster/config on local machine 3 | [cluster default] 4 | key_name = 5 | vpc_settings = public 6 | ebs_settings = myebs 7 | compute_instance_type = c7i.48xlarge #change as per your need 8 | master_instance_type = c6i.4xlarge #change as per your need 9 | master_root_volume_size = 200 #change as per your need 10 | maintain_initial_size = false 11 | initial_queue_size = 0 12 | max_queue_size = 32 #maximum number of compute nodes in the cluster 13 | placement_group = DYNAMIC 14 | placement = cluster 15 | scaling_settings = custom 16 | tags = {"name": "xyz"} 17 | base_os = ubuntu2004 #ubuntu gives best performance 18 | scheduler = slurm 19 | enable_efa = compute 20 | enable_intel_hpc_platform = false 21 | 22 | [scaling custom] 23 | scaledown_idletime=10 24 | 25 | [vpc public] 26 | vpc_id = vpc-xxxxxxxxxx #get vpc id from your aws region 27 | master_subnet_id = subnet-xxxxxxxxxxxxx #get subnet id from your aws region 28 | ssh_from = 172.17.0.1/1 29 | 30 | [ebs myebs] 31 | shared_dir = /sharedgp 32 | volume_size = 1024 33 | volume_type = io2 34 | volume_iops = 64000 35 | 36 | [aliases] 37 | ssh = ssh {CFN_USER}@{MASTER_IP} {ARGS} 38 | 39 | [aws] 40 | aws_region_name = us-west-2 #change as per your need 41 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/pcluster_reference_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set –e 3 | 4 | WDIR=`pwd` 5 | num_compute_nodes=1 6 | allocation_time="03:00:00" 7 | 8 | if [ -z $2 ] 9 | then 10 | echo "Allocating compute nodes by default for 2 hours" 11 | else 12 | echo "Allocating compute nodes for $2 hours" 13 | allocation_time=$2 14 | fi 15 | 16 | # Allocate compute nodes 17 | salloc --nodes=${num_compute_nodes} --ntasks-per-node=1 --wait-all-nodes=1 --time=${allocation_time} --no-shell &> tmp_salloc && grep "Granted job allocation" tmp_salloc | cut -d" " -f5 &> tmp_jobid 18 | 19 | jid=`cat tmp_jobid | head -n 1` 20 | 21 | rm tmp_salloc tmp_jobid 22 | 23 | srun --jobid=$jid hostname > ../../hostfile 24 | 25 | echo "Cluster allocation done!!" 26 | cat ../../hostfile 27 | 28 | for i in `cat ../../hostfile` 29 | do 30 | echo $i 31 | ssh $i "bash ${WDIR}/basic_setup.sh && bash ${WDIR}/create_reference_index.sh ${jid}" & 32 | done 33 | 34 | 35 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/run_pipeline_ec2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | source config 4 | 5 | cd ../.. 
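# Activate the pipeline conda env, record this host as the only hostfile entry, and derive
# the MPI rank layout (total ranks, ranks per node, cores per rank) from the local lscpu
# output before invoking run_pipeline.sh.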
6 | 7 | source miniconda3/bin/activate dv_env 8 | 9 | hostname > hostfile 10 | 11 | num_nodes=`cat hostfile | wc -l` 12 | 13 | first_ip=`head -n 1 hostfile` 14 | 15 | #ssh ${first_ip} lscpu > compute_config 16 | lscpu > compute_config 17 | 18 | 19 | num_cpus_per_node=$(cat compute_config | grep -E '^CPU\(s\)' | awk '{print $2}') 20 | num_cpus_all_node=`expr ${num_cpus_per_node} \* ${num_nodes}` 21 | threads_per_core=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') 22 | echo "Total number of CPUs across all nodes: $num_cpus_all_node" 23 | 24 | 25 | num_physical_cores_all_nodes=`expr ${num_cpus_all_node} / ${threads_per_core}` 26 | 27 | num_physical_cores_per_nodes=`expr ${num_cpus_per_node} / ${threads_per_core}` 28 | 29 | 30 | while [ $num_physical_cores_per_nodes -ge 20 ] 31 | do 32 | num_physical_cores_per_nodes=`expr $num_physical_cores_per_nodes / 2` 33 | done 34 | 35 | num_physical_cores_per_rank=$num_physical_cores_per_nodes 36 | 37 | total_num_ranks=`expr ${num_physical_cores_all_nodes} / ${num_physical_cores_per_rank}` 38 | 39 | ranks_per_node=`expr ${total_num_ranks} / ${num_nodes}` 40 | 41 | sh run_pipeline.sh ${total_num_ranks} ${ranks_per_node} ${REF} ${R1} ${R2} "sudo docker" 42 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/run_pipeline_ec2_part1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | source config 4 | 5 | cd ../.. 6 | 7 | #source miniconda3/bin/activate dv_env 8 | 9 | echo "localhost" > hostfile 10 | 11 | num_nodes=`cat hostfile | wc -l` 12 | 13 | first_ip=`head -n 1 hostfile` 14 | 15 | #ssh ${first_ip} lscpu > compute_config 16 | lscpu > compute_config 17 | 18 | 19 | num_cpus_per_node=$(cat compute_config | grep -E '^CPU\(s\)' | awk '{print $2}') 20 | num_cpus_all_node=`expr ${num_cpus_per_node} \* ${num_nodes}` 21 | threads_per_core=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') 22 | echo "Total number of CPUs across all nodes: $num_cpus_all_node" 23 | 24 | 25 | num_physical_cores_all_nodes=`expr ${num_cpus_all_node} / ${threads_per_core}` 26 | 27 | num_physical_cores_per_nodes=`expr ${num_cpus_per_node} / ${threads_per_core}` 28 | 29 | 30 | while [ $num_physical_cores_per_nodes -ge 20 ] 31 | do 32 | num_physical_cores_per_nodes=`expr $num_physical_cores_per_nodes / 2` 33 | done 34 | 35 | num_physical_cores_per_rank=$num_physical_cores_per_nodes 36 | 37 | total_num_ranks=`expr ${num_physical_cores_all_nodes} / ${num_physical_cores_per_rank}` 38 | 39 | ranks_per_node=`expr ${total_num_ranks} / ${num_nodes}` 40 | 41 | sh run_pipeline_part1.sh ${total_num_ranks} ${ranks_per_node} ${REF} ${R1} ${R2} "sudo docker" 42 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/aws/run_pipeline_pcluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | source config 4 | 5 | cd ../.. 
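# Same rank-layout derivation as the single-node EC2 variant, except that lscpu is sampled
# over ssh from the first host in the hostfile produced by pcluster_compute_node_setup.sh.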
6 | 7 | source miniconda3/bin/activate dv_env 8 | 9 | 10 | num_nodes=`cat hostfile | wc -l` 11 | 12 | first_ip=`head -n 1 hostfile` 13 | 14 | ssh ${first_ip} lscpu > compute_config 15 | #lscpu > compute_config 16 | 17 | 18 | num_cpus_per_node=$(cat compute_config | grep -E '^CPU\(s\)' | awk '{print $2}') 19 | num_cpus_all_node=`expr ${num_cpus_per_node} \* ${num_nodes}` 20 | threads_per_core=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') 21 | echo "Total number of CPUs across all nodes: $num_cpus_all_node" 22 | 23 | 24 | num_physical_cores_all_nodes=`expr ${num_cpus_all_node} / ${threads_per_core}` 25 | 26 | num_physical_cores_per_nodes=`expr ${num_cpus_per_node} / ${threads_per_core}` 27 | 28 | 29 | while [ $num_physical_cores_per_nodes -ge 20 ] 30 | do 31 | num_physical_cores_per_nodes=`expr $num_physical_cores_per_nodes / 2` 32 | done 33 | 34 | num_physical_cores_per_rank=$num_physical_cores_per_nodes 35 | 36 | total_num_ranks=`expr ${num_physical_cores_all_nodes} / ${num_physical_cores_per_rank}` 37 | 38 | ranks_per_node=`expr ${total_num_ranks} / ${num_nodes}` 39 | 40 | sh run_pipeline.sh ${total_num_ranks} ${ranks_per_node} ${REF} ${R1} ${R2} "sudo docker" 41 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/cluster/config: -------------------------------------------------------------------------------- 1 | export LD_PRELOAD=/Open-Omics-Acceleration-Framework/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/libmimalloc.so.2.0:$LD_PRELOAD 2 | export INPUT_DIR=/path-to-read-datasets/ 3 | export OUTPUT_DIR=/path-to-output-directory/ 4 | export REF_DIR=/path-to-ref-directory/ 5 | REF=ref.fasta 6 | R1=R1.fastq.gz 7 | R2=R2.fastq.gz 8 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/cluster/create_reference_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 4 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 5 | #echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 6 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 7 | 8 | cd $ABS_DIRECTORY 9 | source config 10 | filename=${REF} 11 | file_ext=${filename##*.} 12 | file_name_without_extension=$(basename "$filename" .gz ) 13 | 14 | 15 | if [ ${file_ext} == 'gz' ] 16 | then 17 | echo "Refecence file is decompressing..." 18 | gzip -d ${REF_DIR}/${filename} 19 | REF=${file_name_without_extension} 20 | fi 21 | 22 | 23 | ref=${REF_DIR}/${REF} 24 | mkdir -p ${OUTPUT_DIR} 25 | echo "Checking the index files for $ref" 26 | ls ${ref}* 27 | 28 | # mem2 index 29 | echo "Creating FM-index for the reference sequence ${ref}" 30 | cd ../../../../applications/bwa-mem2 31 | ./bwa-mem2 index $ref &> ${OUTPUT_DIR}/bwa_mem2_index_log 32 | cd - &> /dev/null 33 | 34 | 35 | # samtool idfai index 36 | echo "Creating fai index for the reference sequence ${ref}" 37 | cd ../../../../applications/samtools 38 | ./samtools faidx $ref &> ${OUTPUT_DIR}/samtools_fai_log 39 | cd - &> /dev/null 40 | 41 | 42 | echo "The list of all index files created." 
43 | ls ${ref}* 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/cluster/load_deepvariant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 4 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 5 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 6 | 7 | Container=docker 8 | 9 | [[ $# -gt 0 ]] && Container="$1" 10 | 11 | for i in `cat hostfile` 12 | do 13 | echo $i 14 | ssh $i "${Container} load -i ${ABS_DIRECTORY}/../../deepvariant.tar" & 15 | done 16 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/cluster/run_pipeline_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source config 5 | 6 | Container=docker 7 | 8 | if [ $# -gt 0 ] 9 | then 10 | Container="$1" 11 | fi 12 | 13 | mv hostfile ../../ 14 | cd ../.. 15 | 16 | source miniconda3/bin/activate dv_env 17 | 18 | num_nodes=`cat hostfile | wc -l` 19 | 20 | first_ip=`head -n 1 hostfile` 21 | 22 | ssh ${first_ip} lscpu > compute_config 23 | #lscpu > compute_config 24 | 25 | 26 | num_cpus_per_node=$(cat compute_config | grep -E '^CPU\(s\)' | awk '{print $2}') 27 | num_cpus_all_node=`expr ${num_cpus_per_node} \* ${num_nodes}` 28 | threads_per_core=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') 29 | echo "Total number of CPUs across all nodes: $num_cpus_all_node" 30 | 31 | 32 | num_physical_cores_all_nodes=`expr ${num_cpus_all_node} / ${threads_per_core}` 33 | 34 | num_physical_cores_per_nodes=`expr ${num_cpus_per_node} / ${threads_per_core}` 35 | 36 | 37 | while [ $num_physical_cores_per_nodes -ge 20 ] 38 | do 39 | num_physical_cores_per_nodes=`expr $num_physical_cores_per_nodes / 2` 40 | done 41 | 42 | num_physical_cores_per_rank=$num_physical_cores_per_nodes 43 | 44 | total_num_ranks=`expr ${num_physical_cores_all_nodes} / ${num_physical_cores_per_rank}` 45 | 46 | ranks_per_node=`expr ${total_num_ranks} / ${num_nodes}` 47 | 48 | sh run_pipeline.sh ${total_num_ranks} ${ranks_per_node} ${REF} ${R1} ${R2} ${Container} 49 | 50 | echo "Pipeline finished. Output vcf can be found at: $OUTPUT_DIR/output.vcf.gz" 51 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/scripts/cluster/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #ENV=/nfs/work04/ashish/envs/new_env/bin/activate 5 | #source $ENV 6 | 7 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 8 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 9 | #echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 10 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 11 | #echo "Value of ABS_DIRECTORY: ${ABS_DIRECTORY}" 12 | 13 | export LD_PRELOAD=$LD_PRELOAD:"${ABS_DIRECTORY}/../../libmimalloc.so.2.0" 14 | #echo $LD_PRELOAD 15 | 16 | Container=docker 17 | 18 | if [ $# -gt 0 ] 19 | then 20 | Container="$1" 21 | fi 22 | 23 | # This will save deepvariant images 24 | cd ${ABS_DIRECTORY}/../../../../applications/deepvariant 25 | $Container build -t deepvariant . 
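# If the build host is behind a proxy, pass http_proxy/https_proxy as --build-arg values,
# as in the commented example on the next line.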
26 | # docker build --build-arg http_proxy="http://proxy-us.abc.com:123" --build-arg https_proxy="http://proxy-us.abc.com:123" --build-arg no_proxy="127.0.0.1,localhost" -t deepvariant . 27 | 28 | 29 | #save image(~7 GB) to tar file if you are using multiple nodes. 30 | cd ${ABS_DIRECTORY} 31 | $Container save -o ${ABS_DIRECTORY}/../../deepvariant.tar deepvariant:latest 32 | 33 | 34 | cd ${ABS_DIRECTORY}/../../../../applications/bwa-mem2 35 | #make CXX=icpc multi 36 | make 37 | #make install #uncomment this for installation 38 | 39 | cd ${ABS_DIRECTORY}/../../../../applications/htslib 40 | autoreconf -i # Build the configure script and install files it uses 41 | ./configure # Optional but recommended, for choosing extra functionality 42 | make 43 | #make install #uncomment this for installation 44 | 45 | cd ${ABS_DIRECTORY}/../../../../applications/bcftools 46 | # The following is optional: 47 | # autoheader && autoconf && ./configure --enable-libgsl --enable-perl-filters 48 | make 49 | #make install #uncomment this for installation 50 | 51 | 52 | cd ${ABS_DIRECTORY}/../../../../applications/samtools 53 | autoheader 54 | autoconf -Wno-syntax 55 | chmod 775 configure 56 | ./configure # Needed for choosing optional functionality 57 | make 58 | #make install #uncomment this for installation 59 | cd ${ABS_DIRECTORY} 60 | 61 | bash load_deepvariant.sh $Container 62 | -------------------------------------------------------------------------------- /pipelines/deepvariant-based-germline-variant-calling-fq2vcf/trash/setup_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set –e 3 | SCRIPT_PATH="${BASH_SOURCE:-$0}" 4 | ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 5 | #echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 6 | ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 7 | #echo "Value of ABS_DIRECTORY: ${ABS_DIRECTORY}" 8 | mkdir -p ./miniconda3 9 | ln -s ${ABS_DIRECTORY}/miniconda3 ~/miniconda3 10 | 11 | echo "Downloading and setting up miniconda..." 12 | wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh 13 | bash ./Miniconda3-py39_23.3.1-0-Linux-x86_64.sh -b -u -p ~/miniconda3 14 | echo "Downloading and setting up miniconda...DONE" 15 | 16 | echo "Seeting up conda env named with given argument" 17 | miniconda3/bin/conda env create --name $1 -f environment.yml 18 | echo "Seeting up conda env named new_env...DONE" 19 | 20 | echo "Activating conda env..." 21 | source miniconda3/bin/activate $1 22 | 23 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | FROM python:3.10 3 | 4 | RUN apt-get update 5 | RUN apt-get install -y build-essential 6 | RUN apt-get install -y git 7 | RUN apt-get install -y vim 8 | RUN apt -y install autoconf 9 | RUN apt -y install numactl 10 | RUN apt -y install zlib1g-dev 11 | RUN apt -y install libncurses5-dev 12 | RUN apt -y install libbz2-dev 13 | RUN apt -y install liblzma-dev 14 | RUN apt-get -qq -y install wget gcc-11 g++-11 15 | RUN unlink /usr/bin/gcc && unlink /usr/bin/g++ 16 | RUN ln -s /usr/bin/gcc-11 /usr/bin/gcc && ln -s /usr/bin/g++-11 /usr/bin/g++ 17 | 18 | WORKDIR /app 19 | RUN git clone --recursive https://github.com/IntelLabs/Open-Omics-Acceleration-Framework.git 20 | #COPY /scratch/users/mvasimud/workspace/Open-Omics-Acceleration-Framework . 
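# install.sh builds the pipeline tools (bwa-mem2, mm2-fast, STAR, htslib, samtools, bwa-meth)
# and the fq2bam conda env inside the image; the ENV lines below put them on PATH.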
21 | WORKDIR Open-Omics-Acceleration-Framework/pipelines/fq2sortedbam 22 | 23 | #RUN bash install.sh onprem 24 | RUN bash install.sh onprem 25 | ENV PATH="/app/Open-Omics-Acceleration-Framework/applications/samtools:$PATH" 26 | ENV PATH="/app/Open-Omics-Acceleration-Framework/applications/bwa-mem2:$PATH" 27 | ENV PATH="/app/Open-Omics-Acceleration-Framework/applications/bwa-meth:$PATH" 28 | ENV PATH="/app/Open-Omics-Acceleration-Framework/applications/mm2-fast:$PATH" 29 | ENV PATH="/app/Open-Omics-Acceleration-Framework/applications/STAR/source:$PATH" 30 | ENV PATH="/app/Open-Omics-Acceleration-Framework/applications/samtools/samtools:$PATH" 31 | 32 | ENV PATH="/app/Open-Omics-Acceleration-Framework/pipelines/fq2sortedbam/miniforge3/envs/fq2bam/bin/:/app/Open-Omics-Acceleration-Framework/pipelines/fq2sortedbam/miniforge3/bin/:$PATH" 33 | ENV LD_PRELOAD="/app/Open-Omics-Acceleration-Framework/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/libmimalloc.so.2.0" 34 | 35 | RUN mkdir /input 36 | RUN mkdir /out 37 | RUN mkdir /refdir 38 | RUN mkdir /tempdir 39 | 40 | CMD ["/bin/bash"] 41 | SHELL ["source", "~/miniforge3/bin/activate", "fq2bam", "/bin/bash", "-c"] 42 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/README.md: -------------------------------------------------------------------------------- 1 | ## fq2SortedBAM: OpenOmics' Genomics Secondary Analysis Pipeline 2 | ### Overview: 3 | The pipeline takes input fastq files and produces a sorted BAM file through the following stages: 4 | 1. Sequence alignment: bwa-mem2 for short reads, mm2-fast (accelerated Minimap2) for long reads (PacBio, ONT) 5 | 2. SAM sorting (using SAMTools) 6 | 7 | ### Modes: 8 | fq2SortedBAM supports 4 different modes: 9 | 1. ```sortedbam```: takes fastq read files and a reference genome as input and outputs a single sorted BAM file 10 | 2. ```flatmode```: takes fastq read files and a reference genome as input and outputs multiple unsorted SAM files (one per rank) 11 | 3. ```fqprocessonly```: custom mode, not for general use 12 | 4. ```multifq```: custom mode, not for general use 13 | 14 | ## Sequence alignment tools in the pipeline 15 | Both sortedbam and flatmode support bwa-mem2 for short-read alignment and mm2-fast (an accelerated version of minimap2) for long-read alignment. 16 | The aligner is selected with the _--read_type_ command-line option: 17 | 1. [default] selects bwa-mem2 when _--read_type=short_ 18 | 2. selects mm2-fast when _--read_type=long_ (note: mm2-fast runs with the '-a' command-line option by default, producing SAM output) 19 | 3. Both alignment tools support all of the original tools' command-line options; these parameters can be passed through the _--params_ command-line option of fq2SortedBAM 20 | 21 | 22 | ## Use Docker 23 | ### Docker build: 24 | ``` 25 | wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 26 | tar -xzf Source_code_with_submodules.tar.gz 27 | cp Open-Omics-Acceleration-Framework/pipelines/fq2sortedbam/Dockerfile . 28 | cp Open-Omics-Acceleration-Framework/pipelines/fq2sortedbam/config.yaml . 29 | ``` 30 | ```bash 31 | docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t fq2bam . 32 | docker save fq2bam:latest > fq2bam.tar ## optional; only needed to copy the image to other machines
33 | ``` 34 | 35 | 36 | ### Docker run: 37 | ``` 38 | docker load -i fq2bam.tar ## optional: skip if the image was built on this machine or is already loaded 39 | docker run -v \<readsdir\>:/readsdir -v \<outdir\>:/outdir -v \<refdir\>:/refdir fq2bam:latest python run_fq2sortedbam.py --ref /refdir/\<ref\> --reads /readsdir/\<read1\> /readsdir/\<read2\> --output /outdir/ 40 | ``` 41 | Note: 42 | \<readsdir\>: Location of the local directory containing the read files read1 & read2 43 | \<refdir\>: Location of the local directory containing the reference sequence file ref 44 | \<outdir\>: Location of the local directory for the output SAM/BAM files 45 | 46 | 47 | ## Use Source Code 48 | ### Installation: 49 | ``` 50 | wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 51 | tar -xzf Source_code_with_submodules.tar.gz 52 | cd Open-Omics-Acceleration-Framework/pipelines/fq2sortedbam/ 53 | bash install.sh onprem ## onprem mode: manually install the dependencies listed in basic_setup_ubuntu.sh first, as that script needs sudo access 54 | ``` 55 | 56 | ### Run: 57 | ``` 58 | python run_fq2sortedbam.py --ref \<ref\> --reads \<read1\> \<read2\> --output \<outdir\> 59 | ``` 60 | 61 | ## General Notes: 62 | 1. The individual pipeline tools are present in the applications folder 63 | 2. To understand the various parameters of these tools, refer to their ```man``` pages 64 | 3. You can set the parameters of these tools using the ```--params``` command-line option 65 | 4. See all the parameters of fq2sortedbam using the "-h" option 66 | 5. fq2sortedbam supports the following aligners: 67 | a. DNA short-read alignment using bwa-mem2 68 | b. DNA long-read alignment using mm2-fast (minimap2) 69 | c. RNA short-read alignment using the STAR aligner 70 | d. bwa-meth-based alignment 71 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/basic_setup_ubuntu.sh: -------------------------------------------------------------------------------- 1 | # All basic dev tools for Ubuntu 22.04 2 | 3 | sudo apt update 4 | 5 | #sudo apt -y install make 6 | sudo apt -y install build-essential 7 | sudo apt -y install autoconf 8 | sudo apt -y install numactl 9 | sudo apt -y install zlib1g-dev 10 | sudo apt -y install libncurses5-dev 11 | sudo apt -y install libbz2-dev 12 | sudo apt -y install liblzma-dev 13 | sudo apt -y install git 14 | sudo apt-get -qq -y install wget 15 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/config.yaml: -------------------------------------------------------------------------------- 1 | bwa: 2 | dindex: 'False' 3 | params: +R "@RG\tID:RG1\tSM:RGSN1" 4 | rindex: 'False' 5 | dataset: 6 | index: GCA_000001405.15_GRCh38_no_alt_analysis_set.fna 7 | input: /input/ 8 | outfile: short.se.sam 9 | output: /out/ 10 | read1: HG001.novaseq.pcr-free.30x.R1.fastq.gz 11 | read2: HG001.novaseq.pcr-free.30x.R2.fastq.gz 12 | read_type: short 13 | refdir: /refdir/ 14 | tempdir: "" 15 | fqprocess: 16 | bam_size: '5' 17 | barcode_orientation: FIRST_BP_RC 18 | output_format: FASTQ 19 | prefix: multiome-practice-may15_arcgtf 20 | read3: '' 21 | read_structure: 16C 22 | readi1: '' 23 | sample_id: '' 24 | suffix: trimmed_adapters.fastq.gz 25 | whitelist: whitelist.txt 26 | mm2: 27 | params: ' -ax map-hifi ' 28 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/doc.txt: -------------------------------------------------------------------------------- 1 | # Pipeline Usage Guide 2 | 3 | This document explains the command-line arguments for running the pipeline.
Below is a detailed description of each argument and its usage. 4 | 5 | --- 6 | 7 | ## Arguments 8 | 9 | ### General Options 10 | - `--ref REF` 11 | - **Description**: Reference genome path. For BWA, this pipeline expects the index to be present at this location. If the index is not available, it can be generated using the `--rindex` option. 12 | 13 | - `--reads READS [READS ...]` 14 | - **Description**: Input reads. The pipeline expects both reads to be at the same location. 15 | 16 | - `--tempdir TEMPDIR` 17 | - **Description**: Directory for storing intermediate data. 18 | 19 | - `--output OUTPUT` 20 | - **Description**: Prefix location for the output file(s) name. 21 | 22 | ### Processing Options 23 | - `--simd SIMD` 24 | - **Description**: Defaults to `avx512` mode. Use `sse` for BWA SSE mode. 25 | 26 | - `--read3 READ3` 27 | - **Description**: Name of `R3` files (for `fqprocess`), separated by spaces. 28 | 29 | - `--readi1 READI1` 30 | - **Description**: Name of `I1` files (for `fqprocess`), separated by spaces. 31 | 32 | - `--prefix PREFIX` 33 | - **Description**: Prefix for processed `R1` and `R3` files for `bwa-mem2`. 34 | 35 | - `--suffix SUFFIX` 36 | - **Description**: Suffix for processed `R1` and `R3` files for `bwa-mem2`. 37 | 38 | - `--whitelist WHITELIST` 39 | - **Description**: 10x whitelist file. 40 | 41 | - `--read_structure READ_STRUCTURE` 42 | - **Description**: Read structure. 43 | 44 | - `--barcode_orientation BARCODE_ORIENTATION` 45 | - **Description**: Barcode orientation. 46 | 47 | - `--sample_id SAMPLE_ID` 48 | - **Description**: Sample ID. 49 | 50 | - `--output_format OUTPUT_FORMAT` 51 | - **Description**: Output format. 52 | 53 | ### File Size and Execution Mode 54 | - `-b BAM_SIZE`, `--bam_size BAM_SIZE` 55 | - **Description**: BAM file size in GB. 56 | 57 | - `--mode MODE` 58 | - **Description**: Execution mode options: 59 | - `flatmode`: Just BWA without sorting, creates SAM files equal to the number of ranks created. 60 | - `sortedbam`: BWA + SAM sorting steps, creating a single BAM file as output. 61 | - `fqprocessonly`: Custom processing of fastq files. 62 | - `multifq`: Custom multifastq processing. 63 | 64 | ### Read Type Options 65 | - `--read_type READ_TYPE` 66 | - **Description**: Specifies the type of reads for alignment: 67 | - `short`: BWA-MEM2 alignment with short reads (default). 68 | - `long`: MM2-fast alignment with long reads. 69 | 70 | ### Indexing Options 71 | - `--rindex` 72 | - **Description**: Enables BWA-MEM2 index generation. Use this option if the index is not present. 73 | 74 | - `--dindex` 75 | - **Description**: Creates a reference genome FAI index. Use this option if the reference FAI is not present. 76 | 77 | ### Additional Options 78 | - `--profile` 79 | - **Description**: Enables profiling. 80 | 81 | - `--not_keep_unmapped` 82 | - **Description**: Rejects unmapped reads at the end of the sorted BAM file. If not specified, unmapped reads are retained. 83 | 84 | - `--keep_sam` 85 | - **Description**: Retains intermediate SAM files generated by the alignment tool for each rank. SAM files are named as `aln{rank:04d}.sam`. 86 | 87 | - `--params PARAMS` 88 | - **Description**: Allows supplying various parameters to BWA-MEM2 (except the `-t` parameter for threads). Example: 89 | ``` 90 | --params '-R "@RG\tID:RG1\tSM:RGSN1"' 91 | ``` 92 | This enables read grouping. 93 | 94 | - `--sso` 95 | - **Description**: Executes the pipeline on a single socket only. By default, all sockets are used. 
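As a worked illustration of the `--read_type` and `--params` options described above, here are two example invocations. This is only a sketch: the reference and read paths are placeholders, and the long-read preset is one possible choice.

```bash
# Short reads (default): bwa-mem2 alignment, forwarding a read-group string via --params.
python run_fq2sortedbam.py \
    --ref /refdir/GRCh38.fna \
    --reads /readsdir/sample_R1.fastq.gz /readsdir/sample_R2.fastq.gz \
    --output /outdir/ \
    --mode sortedbam \
    --read_type short \
    --params '-R "@RG\tID:RG1\tSM:RGSN1"'

# Long reads (e.g. PacBio HiFi): mm2-fast alignment, forwarding a minimap2 preset via --params.
python run_fq2sortedbam.py \
    --ref /refdir/GRCh38.fna \
    --reads /readsdir/sample.hifi.fastq.gz \
    --output /outdir/ \
    --mode sortedbam \
    --read_type long \
    --params '-x map-hifi'
```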
96 | 97 | ### Resource Allocation 98 | - `--th TH` 99 | - **Description**: Sets the threshold for the minimum number of cores allocated to each rank. 100 | 101 | - `-N N` 102 | - **Description**: Manually sets the number of ranks. When using this, set `PPN` and `cpus` options accordingly. 103 | 104 | - `-PPN PPN` 105 | - **Description**: Manually sets the number of processes per node (PPN). When using this, set `N` and `cpus` options accordingly. 106 | 107 | - `--cpus CPUS` 108 | - **Description**: Manually sets the number of CPUs. When using this, set `N` and `PPN` options accordingly. 109 | 110 | --- 111 | 112 | ## Example Usage 113 | ```bash 114 | python pipeline.py \ 115 | --ref /path/to/reference.fasta \ 116 | --reads read1.fastq read2.fastq \ 117 | --tempdir /path/to/tempdir \ 118 | --output /path/to/output \ 119 | --simd avx512 \ 120 | --mode sortedbam \ 121 | --read_type short 122 | ``` 123 | 124 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - anaconda 4 | dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _openmp_mutex=5.1=1_gnu 7 | - blas=1.0=mkl 8 | - ca-certificates=2023.01.10=h06a4308_0 9 | - certifi=2022.12.7=py39h06a4308_0 10 | - intel-openmp=2021.4.0=h06a4308_3561 11 | - ld_impl_linux-64=2.38=h1181459_1 12 | - libffi=3.4.2=h6a678d5_6 13 | - libgcc-ng=11.2.0=h1234567_1 14 | - libgfortran-ng=7.5.0=ha8ba4b0_17 15 | - libgfortran4=7.5.0=ha8ba4b0_17 16 | - libgomp=11.2.0=h1234567_1 17 | - libstdcxx-ng=11.2.0=h1234567_1 18 | - mkl=2021.4.0=h06a4308_640 19 | - mkl-service=2.4.0=py39h7f8727e_0 20 | - mkl_fft=1.3.1=py39hd3c417c_0 21 | - mkl_random=1.2.2=py39h51133e4_0 22 | - mpi=1.0=mpich 23 | - mpi4py=3.1.4=py39hfc96bbd_0 24 | - mpich=3.3.2=hc856adb_0 25 | - ncurses=6.4=h6a678d5_0 26 | - numpy=1.23.5=py39h14f4228_0 27 | - numpy-base=1.23.5=py39h31eccc5_0 28 | - openssl=1.1.1t=h7f8727e_0 29 | - pip=23.0.1=py39h06a4308_0 30 | - python=3.9.16=h7a1cb2a_2 31 | - readline=8.2=h5eee18b_0 32 | - setuptools=65.6.3=py39h06a4308_0 33 | - six=1.16.0=pyhd3eb1b0_1 34 | - sqlite=3.41.1=h5eee18b_0 35 | - tk=8.6.12=h1ccaba5_0 36 | - tzdata=2023c=h04d1e81_0 37 | - wheel=0.38.4=py39h06a4308_0 38 | - xz=5.2.10=h5eee18b_1 39 | - zlib=1.2.13=h5eee18b_0 40 | - pyyaml=5.4.1 41 | - pip: 42 | - pragzip==0.5.0 43 | - yappi==1.4.0 44 | 45 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/hwconfig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys, os 3 | from subprocess import Popen, PIPE, run 4 | 5 | def main(): 6 | #print("args: ", sys.argv) 7 | #print(len(sys.argv)) 8 | assert len(sys.argv) == 3, " " 9 | run('lscpu > lscpu.txt', capture_output=True, shell=True) 10 | #inf = sys.argv[1] 11 | sso = sys.argv[1] 12 | num_nodes = int(sys.argv[2]) 13 | 14 | dt={} 15 | with open('lscpu.txt', 'r') as f: 16 | l = f.readline() 17 | while l: 18 | try: 19 | a,b = l.strip('\n').split(':') 20 | #aa, bb = a.split(' ') 21 | #print(a, b) 22 | dt[a] = b 23 | except: 24 | pass 25 | 26 | l = f.readline() 27 | 28 | ncpus = int(dt['CPU(s)']) 29 | nsocks = int(dt['Socket(s)']) 30 | nthreads = int(dt['Thread(s) per core']) 31 | ncores = int(dt['Core(s) per socket']) 32 | nnuma = int(dt['NUMA node(s)']) 33 | 34 | if sso == 'sso': 35 | nsocks = 1 36 | 37 | th = 16 ## max cores per rank 38 | #num_nodes = 1 39 | 40 | 
num_physical_cores_all_nodes = num_nodes * nsocks * ncores 41 | num_physical_cores_per_node = nsocks * ncores 42 | num_physical_cores_per_rank = nsocks * ncores 43 | 44 | while num_physical_cores_per_rank > th: 45 | num_physical_cores_per_rank /= 2 46 | 47 | num_physical_cores_per_rank = int(num_physical_cores_per_rank) 48 | assert num_physical_cores_per_rank > 8, 'cores per rank should be > 8' 49 | 50 | N = int(num_physical_cores_all_nodes / num_physical_cores_per_rank) 51 | PPN = int(num_physical_cores_per_node / num_physical_cores_per_rank) 52 | CPUS = int(ncores * nthreads * nsocks / PPN - 2*nthreads) 53 | THREADS = CPUS 54 | print(f"N={int(N)}") 55 | print(f"PPN={int(PPN)}") 56 | print(f"CPUS={int(CPUS)}") 57 | print(f"THREADS={int(THREADS)}") 58 | 59 | threads_per_rank = num_physical_cores_per_rank * nthreads 60 | bits = pow(2, num_physical_cores_per_rank) - 1 61 | allbits = 0 62 | mask="[" 63 | for r in range(N): 64 | allbits = allbits | (bits << r*num_physical_cores_per_rank) 65 | allbits = allbits | (allbits << nsocks * ncores) 66 | #print("{:x}".format(allbits)) 67 | if mask == "[": 68 | mask = mask + hex(allbits) 69 | else: 70 | mask = mask+","+ hex(allbits) 71 | allbits=0 72 | #print("{:x}".format(mask)) 73 | mask=mask + "]" 74 | print("I_MPI_PIN_DOMAIN={}".format(mask)) 75 | 76 | 77 | if __name__ == '__main__': 78 | #print("> args: ", sys.argv) 79 | #print(">", len(sys.argv)) 80 | 81 | main() 82 | 83 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #SCRIPT_PATH="${BASH_SOURCE:-$0}" 5 | #ABS_SCRIPT_PATH="$(realpath "${SCRIPT_PATH}")" 6 | ##echo "Value of ABS_SCRIPT_PATH: ${ABS_SCRIPT_PATH}" 7 | #ABS_DIRECTORY="$(dirname "${ABS_SCRIPT_PATH}")" 8 | ##echo "Value of ABS_DIRECTORY: ${ABS_DIRECTORY}" 9 | 10 | if [ "$#" -ne "1" ] 11 | then 12 | echo "pls. provide args: cloud/onprem" 13 | fi 14 | 15 | if [ "$1" == "cloud" ] 16 | then 17 | echo "Installing pre-requisite tools.." 18 | bash basic_setup_ubuntu.sh 19 | echo "Done" 20 | fi 21 | 22 | echo "Downloading and setting up miniconda..." 23 | [[ ! -e "Miniforge3-24.3.0-0-Linux-x86_64.sh" ]] && wget https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh 24 | bash ./Miniforge3-24.3.0-0-Linux-x86_64.sh -u -b -p ./miniforge3 25 | 26 | echo "Setting up conda env named with given argument" 27 | ./miniforge3/bin/conda env create --name fq2bam -f environment.yml 28 | echo "Setting up conda env named new_env...DONE" 29 | 30 | echo "Activating conda env..." 31 | source ./miniforge3/bin/activate fq2bam 32 | 33 | #echo "Downloading and setting up miniconda..." 34 | #if [ ! -e "Miniconda3-py39_23.3.1-0-Linux-x86_64.sh" ] 35 | #then 36 | # wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh 37 | #fi 38 | # 39 | #bash ./Miniconda3-py39_23.3.1-0-Linux-x86_64.sh -b -p ./miniconda3 40 | #echo "Downloading and setting up miniconda...DONE" 41 | # 42 | #echo "Seeting up conda env named with given argument" 43 | #miniconda3/bin/conda env create --name distbwa -f environment.yml 44 | #echo "Seeting up conda env named new_env...DONE" 45 | # 46 | #echo "Activating conda env..." 
47 | #source miniconda3/bin/activate distbwa 48 | #echo "localhost" > hostfile 49 | 50 | ## build tools 51 | cd ../../ 52 | #WDIR=../../ 53 | WDIR=`pwd` 54 | cd - 55 | EXEDIR=`pwd` 56 | 57 | # compile bwa-mem2 58 | echo "Build bwa-mem2" 59 | cd ${WDIR}/applications/bwa-mem2 60 | make clean 61 | make -j multi 62 | bwainstall="SUCESS" 63 | if [ -e "${WDIR}/applications/bwa-mem2/bwa-mem2" ]; then 64 | echo "bwa-mem2 build successful" 65 | else 66 | bwainstall="FAILED" 67 | echo "Error!! bwa-mem2 build failed" 68 | fi 69 | 70 | 71 | cd ${WDIR}/applications/mm2-fast 72 | make clean 73 | make -j 74 | mm2install="SUCESS" 75 | if [ -e "${WDIR}/applications/mm2-fast/minimap2" ]; then 76 | echo "mm2-fast build successful" 77 | else 78 | mm2install="FAILED" 79 | echo "Error!! mm2-fast build failed" 80 | exit 81 | fi 82 | 83 | cd ${WDIR}/applications/STAR/source/ 84 | make clean 85 | make -j STAR 86 | echo $WDIR 87 | if [ -e "${WDIR}/applications/STAR/source/STAR" ]; then 88 | echo "STAR build successful" 89 | else 90 | starinstall="FAILED" 91 | echo "Error!! STAR build failed" 92 | exit 93 | fi 94 | 95 | #make install #uncomment this for installation 96 | 97 | # compile htslib 98 | cd ${WDIR}/applications/htslib 99 | autoreconf -i # Build the configure script and install files it uses 100 | ./configure # Optional but recommended, for choosing extra functionality 101 | make 102 | #make install #uncomment this for installation 103 | 104 | # compile bcftools 105 | ## cd ${WDIR}/applications/bcftools 106 | ## # The following is optional: 107 | ## # autoheader && autoconf && ./configure --enable-libgsl --enable-perl-filters 108 | ## make 109 | ## #make install #uncomment this for installation 110 | 111 | # compile samtools 112 | cd ${WDIR}/applications/samtools 113 | autoheader 114 | autoconf -Wno-syntax 115 | chmod 775 configure 116 | ./configure # Needed for choosing optional functionality 117 | make 118 | saminstall="SUCESS" 119 | if [ -e "${WDIR}/applications/samtools/samtools" ]; then 120 | echo "SAMTools build successful" 121 | else 122 | saminstall="FAILED" 123 | echo "Error!! SAMTools build failed" 124 | fi 125 | 126 | 127 | cd ${WDIR}/applications/bwa-meth/ 128 | wget https://pypi.python.org/packages/source/t/toolshed/toolshed-0.4.0.tar.gz 129 | tar xzvf toolshed-0.4.0.tar.gz 130 | cd toolshed-0.4.0 131 | python setup.py install 132 | cd - 133 | python setup.py install 134 | 135 | #cd $EXEDIR 136 | 137 | #[[ ! -d warp-tools ]] && git clone --recursive https://github.com/broadinstitute/warp-tools.git -b develop 138 | #cd warp-tools/tools/fastqpreprocessing/ 139 | #./fetch_and_make_dep_libs.sh && make 140 | ## make -j 141 | 142 | #if [ "$?" == "0" ] 143 | #then 144 | # echo "fqprocess installed successfully" 145 | #else 146 | # echo "fqprocess installation failed" 147 | #fi 148 | 149 | echo "bwa compilation is "$bwainstall 150 | echo "mm2-fast compilation is "$mm2install 151 | echo "STAR compilation is "$starinstall 152 | echo "samtools compilation is "$saminstall 153 | 154 | echo "Compelete installation done." 
155 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/print_config.sh: -------------------------------------------------------------------------------- 1 | num_nodes=1 2 | lscpu > compute_config 3 | 4 | 5 | num_cpus_per_node=$(cat compute_config | grep -E '^CPU\(s\)' | awk '{print $2}') 6 | num_socket=$(cat compute_config | grep -E '^Socket'| awk '{print $2}') 7 | num_numa=$(cat compute_config | grep '^NUMA node(s)' | awk '{print $3}') 8 | num_cpus_all_node=`expr ${num_cpus_per_node} \* ${num_nodes}` 9 | threads_per_core=$(cat compute_config | grep -E '^Thread' | awk '{print $4}') 10 | echo "#############################################" 11 | echo "Number of sockets: "$num_socket 12 | echo "Number of NUMA domains: "$num_numa 13 | echo "Number of CPUs: $num_cpus_all_node" 14 | 15 | num_physical_cores_all_nodes=`expr ${num_cpus_all_node} / ${threads_per_core}` 16 | num_physical_cores_per_nodes=`expr ${num_cpus_per_node} / ${threads_per_core}` 17 | num_physical_cores_per_socket=`expr ${num_physical_cores_all_nodes} / ${num_socket}` 18 | num_physical_cores_per_numa=`expr ${num_physical_cores_all_nodes} / ${num_numa}` 19 | echo "Number physical cores: "$num_physical_cores_per_nodes 20 | echo "Number physical cores per socket: "$num_physical_cores_per_socket 21 | echo "Number physical cores per numa: "$num_physical_cores_per_numa 22 | 23 | th=`expr ${num_physical_cores_per_numa} / 2` #${num_physical_cores_per_numa} ##20 24 | if [ $th -le 10 ] 25 | then 26 | th=${num_physical_cores_per_numa} 27 | fi 28 | 29 | while [ $num_physical_cores_per_nodes -gt $th ] 30 | do 31 | num_physical_cores_per_nodes=`expr $num_physical_cores_per_nodes / 2` 32 | done 33 | 34 | num_physical_cores_per_rank=$num_physical_cores_per_nodes 35 | total_num_ranks=`expr ${num_physical_cores_all_nodes} / ${num_physical_cores_per_rank}` 36 | 37 | ranks_per_node=`expr ${total_num_ranks} / ${num_nodes}` 38 | echo "Number of MPI ranks: "${total_num_ranks} 39 | echo "Number of cores per MPI rank: "$num_physical_cores_per_nodes 40 | echo "#############################################" 41 | #echo "Note: Each MPI rank runs a bwa-mem2 process on its input fastq files produced by fqprocess. Please ensure that the number of files created due to bam_size parameter to fqprocess (in config file) creates number of fastq files equal to ${total_num_ranks}" 42 | echo "Please set bam_size such that fastqprocess creates ${total_num_ranks} splits of input fastq files" 43 | echo "#############################################" 44 | -------------------------------------------------------------------------------- /pipelines/fq2sortedbam/run_bwa.sh: -------------------------------------------------------------------------------- 1 | #************************************************************************************* 2 | # The MIT License 3 | # 4 | # Intel OpenOmics - fq2sortedbam pipeline 5 | # Copyright (C) 2023 Intel Corporation. 
6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining 8 | # a copy of this software and associated documentation files (the 9 | # "Software"), to deal in the Software without restriction, including 10 | # without limitation the rights to use, copy, modify, merge, publish, 11 | # distribute, sublicense, and/or sell copies of the Software, and to 12 | # permit persons to whom the Software is furnished to do so, subject to 13 | # the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be 16 | # included in all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 22 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 23 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 24 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | # SOFTWARE. 26 | # 27 | #Authors: Vasimuddin Md ; Babu Pillai ; 28 | #*****************************************************************************************/ 29 | 30 | #!/usr/bin/bash 31 | set -e 32 | 33 | trap 'echo "Error occurred"' ERR 34 | 35 | source ./miniforge3/bin/activate fq2bam 36 | echo "Your conda env @: "$CONDA_PREFIX 37 | 38 | config="" 39 | sso="None" 40 | if [ "$#" == "2" ] || [ "$#" == "3" ] 41 | then 42 | mode=$1 43 | config=$2 44 | [[ "$#" == "3" ]] && sso=$3 45 | else 46 | echo " [sso (for single socket only execution)]" 47 | exit 48 | fi 49 | 50 | #Note: "##### Note: Currently, this code only supports single node. " 51 | #Note: "##### I've deliberately disabled distributed runs for now. " 52 | #Note: "##### Contact: " 53 | #echo "" 54 | 55 | num_nodes=1 56 | echo "run mode: "$mode 57 | echo "config: "$config 58 | #echo "se/pe: $se_mode" 59 | [[ "$sso" == "sso" ]] && echo "single socket only" 60 | 61 | #echo "localhost" > hostfile 62 | hostname > hostfile 63 | #semode="" 64 | CONFIG="" 65 | #read_type="--read_type short" 66 | #[[ "$se_mode" == "se" ]] && semode="--se_mode" 67 | [[ "$config" != "" ]] && CONFIG="-y $config" 68 | runmode="--mode $mode" 69 | 70 | #lscpu > lscpu.txt 71 | chmod +x hwconfig.py 72 | #ls -lh hwconfig.py 73 | python hwconfig.py $sso $num_nodes > hwconfig 74 | #python hwconfig.py "sso" > hwconfig 75 | 76 | source hwconfig 77 | echo "[Info] Running $N ranks, each with $THREADS threads ..." 
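# The mpiexec line below launches $N ranks of dist_bwa.py ($PPN per node), binding each
# rank to a socket, and tees all output to logs/log.txt.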
78 | #rm lscpu.txt 79 | rm hwconfig 80 | 81 | BINDING=socket 82 | mkdir -p logs 83 | 84 | exec=dist_bwa.py 85 | #echo $I_MPI_PIN_DOMAIN 86 | #-genv I_MPI_PIN_DOMAIN=$I_MPI_PIN_DOMAIN 87 | 88 | #echo $N 89 | #echo $PPN 90 | #echo $exec 91 | #echo $CONFIG 92 | mpiexec -bootstrap ssh -n $N -ppn $PPN -bind-to $BINDING -map-by $BINDING --hostfile hostfile python -u $exec --cpus $CPUS --threads $THREADS ${runmode} ${CONFIG} --keep_unmapped 2>&1 | tee logs/log.txt 93 | echo "[Info] The output log file is at logs/log.txt" 94 | -------------------------------------------------------------------------------- /pipelines/single-cell-RNA-seq-analysis/Dockerfile: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intel Labs 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | # Authors: Narendra Chaudhary ; Sanchit Misra 24 | 25 | # Install Base miniconda image 26 | ARG FROM_IMAGE=ubuntu:22.04 27 | 28 | # Install Base miniforge image 29 | ARG BASE_IMAGE=condaforge/miniforge3:23.1.0-3 30 | FROM ${BASE_IMAGE} as conda_setup 31 | 32 | RUN conda update -n base conda 33 | COPY ./environment.yml ./ 34 | 35 | RUN conda env create --name=single_cell -f ./environment.yml 36 | COPY ./_t_sne.py /opt/conda/lib/python3.8/site-packages/daal4py/sklearn/manifold/_t_sne.py 37 | 38 | 39 | FROM ${FROM_IMAGE} as builder 40 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 41 | git build-essential gcc curl gnupg gnupg2 gnupg1 sudo wget tar ca-certificates -y \ 42 | && rm -rf /var/lib/apt/lists/* \ 43 | && apt-get autoremove -y \ 44 | && apt-get clean \ 45 | && apt update 46 | 47 | 48 | COPY --from=conda_setup /opt/conda /opt/conda 49 | ENV PATH "/opt/conda/envs/single_cell/bin:$PATH" 50 | RUN echo "source /opt/conda/bin/activate single_cell" >> ~/.bashrc 51 | CMD source ~/.bashrc 52 | 53 | # Non-root user setup 54 | ENV SERVICE_NAME="scrna" 55 | 56 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 57 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 58 | 59 | 60 | RUN pip uninstall -y umap-learn 61 | WORKDIR / 62 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 63 | RUN tar -xzf Source_code_with_submodules.tar.gz 64 | 65 | #SHELL ["/bin/bash", "-c", "source activate single_cell"] 66 | WORKDIR /Open-Omics-Acceleration-Framework/lib/tal/applications/UMAP_fast/umap_extend/ 67 | RUN python setup.py install 68 | 69 | WORKDIR ../umap/ 70 | RUN python setup.py install 71 | 72 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /Open-Omics-Acceleration-Framework /opt 73 | # Switch to non-root user 74 | USER $SERVICE_NAME 75 | 76 | 77 | WORKDIR /Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/notebooks 78 | 79 | 80 | CMD jupyter notebook \ 81 | --no-browser \ 82 | --allow-root \ 83 | --port=8888 \ 84 | --ip=0.0.0.0 \ 85 | --notebook-dir=/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/notebooks \ 86 | --NotebookApp.password="" \ 87 | --NotebookApp.token="" \ 88 | --NotebookApp.password_required=False 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /pipelines/single-cell-RNA-seq-analysis/Dockerfile.python: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intel Labs 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # Authors: Narendra Chaudhary ; Sanchit Misra 24 | 25 | # Install Base miniconda image 26 | ARG FROM_IMAGE=ubuntu:22.04 27 | 28 | # Install Base miniforge image 29 | ARG BASE_IMAGE=condaforge/miniforge3:23.1.0-3 30 | FROM ${BASE_IMAGE} as conda_setup 31 | 32 | RUN conda update -n base conda 33 | COPY ./environment.yml ./ 34 | 35 | RUN conda env create --name=single_cell -f ./environment.yml 36 | COPY ./_t_sne.py /opt/conda/lib/python3.8/site-packages/daal4py/sklearn/manifold/_t_sne.py 37 | 38 | 39 | FROM ${FROM_IMAGE} as builder 40 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 41 | git build-essential gcc curl gnupg gnupg2 gnupg1 sudo wget tar ca-certificates -y \ 42 | && rm -rf /var/lib/apt/lists/* \ 43 | && apt-get autoremove -y \ 44 | && apt-get clean \ 45 | && apt update 46 | 47 | 48 | COPY --from=conda_setup /opt/conda /opt/conda 49 | ENV PATH "/opt/conda/envs/single_cell/bin:$PATH" 50 | RUN echo "source /opt/conda/bin/activate single_cell" >> ~/.bashrc 51 | CMD source ~/.bashrc 52 | 53 | # Non-root user setup 54 | ENV SERVICE_NAME="scrna" 55 | 56 | RUN groupadd --gid 1001 $SERVICE_NAME && \ 57 | useradd -m -g $SERVICE_NAME --shell /bin/false --uid 1001 $SERVICE_NAME 58 | 59 | 60 | RUN pip uninstall -y umap-learn 61 | WORKDIR / 62 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 63 | RUN tar -xzf Source_code_with_submodules.tar.gz 64 | 65 | #SHELL ["/bin/bash", "-c", "source activate single_cell"] 66 | WORKDIR /Open-Omics-Acceleration-Framework/lib/tal/applications/UMAP_fast/umap_extend/ 67 | RUN python setup.py install 68 | 69 | WORKDIR ../umap/ 70 | RUN python setup.py install 71 | 72 | RUN chown -R $SERVICE_NAME:$SERVICE_NAME /Open-Omics-Acceleration-Framework /opt 73 | # Switch to non-root user 74 | USER $SERVICE_NAME 75 | 76 | 77 | WORKDIR /Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/notebooks 78 | #WORKDIR /workspace 79 | CMD python -m sklearnex full_single_cell_analysis.py 80 | 81 | # build with "docker build -f Dockerfile.python -t scanpy ." 82 | # mkdir -p ~/output 83 | # docker run -v ~/output:/workspace/figures -v ~/Open-Omics-Acceleration-Framework/pipelines/single_cell_pipeline/data:/data scanpy 84 | -------------------------------------------------------------------------------- /pipelines/single-cell-RNA-seq-analysis/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Intel Labs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /pipelines/single-cell-RNA-seq-analysis/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline overview 2 | 3 | Given a cell-by-gene matrix, this [scanpy](https://github.com/scverse/scanpy)-based pipeline performs data preprocessing (filtering, linear regression and normalization), dimensionality reduction (PCA), and clustering (Louvain/Leiden/k-means) to group the cells into cell types, and then visualizes the resulting clusters (UMAP/t-SNE). The following block diagram illustrates the pipeline. 4 | 5 | <!-- Block diagram image: single-cell RNA-seq analysis pipeline --> 6 | 7 |
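For a concrete picture of these stages, the snippet below is a minimal, illustrative scanpy sketch of the same flow (preprocessing, PCA, neighbors, clustering, UMAP). It is not the tuned code from the notebooks; the input path and all parameter values are placeholders to adjust for your data.

```python
import scanpy as sc

# Load the cell-by-gene matrix (AnnData .h5ad file).
adata = sc.read_h5ad("data/1M_brain_cells_10X.sparse.h5ad")

# Preprocessing: basic filtering, normalization, HVG selection, scaling.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=4000)
adata = adata[:, adata.var.highly_variable].copy()
# (The full pipeline also regresses out per-cell count effects, e.g. with sc.pp.regress_out.)
sc.pp.scale(adata, max_value=10)

# Dimensionality reduction, clustering, and a 2-D embedding for visualization.
sc.tl.pca(adata, n_comps=50)
sc.pp.neighbors(adata, n_neighbors=15)
sc.tl.leiden(adata)      # or sc.tl.louvain / k-means on the PCA space
sc.tl.umap(adata)        # or sc.tl.tsne(adata)
sc.pl.umap(adata, color="leiden", save="_clusters.png")
```

The notebooks in this pipeline implement these same stages, using the Intel-optimized UMAP build and the scikit-learn-intelex patch that the setup steps below install.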
8 | 9 | 10 | # Download entire repository 11 | ```bash 12 | cd ~ 13 | RUN wget https://github.com/IntelLabs/Open-Omics-Acceleration-Framework/releases/download/3.0/Source_code_with_submodules.tar.gz 14 | RUN tar -xzf Source_code_with_submodules.tar.gz 15 | cd ~/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis 16 | ``` 17 | 18 | # Instructions to Run 19 | We can run this pipeline in three ways: 1. Docker container (i. interactive, ii. non-interactive), 2. Using anaconda environment file, 3. Creating anaconda environment manually. 20 | 21 | ## (Option 1): Docker instructions for interactive and non-interactive mode (Recommended on Cloud Instance) 22 | 23 | 24 | ### Run with jupyter notebook (interactive) 25 | ```bash 26 | cd ~/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/ 27 | docker build -t scanpy . # Create a docker image named scanpy 28 | 29 | # Download dataset 30 | wget -P ~/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/data https://rapids-single-cell-examples.s3.us-east-2.amazonaws.com/1M_brain_cells_10X.sparse.h5ad 31 | 32 | docker run -it -p 8888:8888 -v ~/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/data:/data scanpy # run docker container with the data folder as volume 33 | 34 | ``` 35 | 36 | ### Run with non-interactive mode 37 | 38 | ```bash 39 | export DATA_DIR= 40 | export OUTPUT_DIR= 41 | mkdir -p $OUTPUT_DIR 42 | cd ~/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/ 43 | 44 | docker build -f Dockerfile.python -t scanpy_python . # Create a docker image named scanpy_python 45 | 46 | # Download dataset 47 | wget -P $DATA_DIR https://rapids-single-cell-examples.s3.us-east-2.amazonaws.com/1M_brain_cells_10X.sparse.h5ad 48 | 49 | docker run -v $OUTPUT_DIR:/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/notebooks/figures -v $DATA_DIR:/data -it scanpy_python 50 | ``` 51 | 52 | 53 | 54 | 55 | ## (Option 2): Create an Anaconda environment from file 56 | ```bash 57 | conda env create --name=single_cell -f environment.yml 58 | conda activate single_cell 59 | ``` 60 | 61 | ### Replace the _t_sne.py file to anaconda environment's daal4py package 62 | ```bash 63 | cp _t_sne.py ~/anaconda3/envs/single_cell/lib/python3.8/site-packages/daal4py/sklearn/manifold/ 64 | ``` 65 | 66 | ### Install umap_extend and umap 67 | ```bash 68 | 69 | pip uninstall umap-learn 70 | cd ~/Open-Omics-Acceleration-Framework/lib/tal/applications/UMAP_fast/umap_extend 71 | python setup.py install # Uncomment AVX512 lines in setup.py before doing this step on avx512 machines 72 | 73 | 74 | cd ~/Open-Omics-Acceleration-Framework/lib/tal/applications/UMAP_fast/umap 75 | python setup.py install # do python setup.py install if moving environment using conda-pack 76 | ``` 77 | 78 | 79 | ### Example Dataset 80 | The dataset was made publicly available by 10X Genomics. 
Use the following command to download the count matrix for this dataset and store it in the data folder: 81 | ```bash 82 | wget -P ~/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/data https://rapids-single-cell-examples.s3.us-east-2.amazonaws.com/1M_brain_cells_10X.sparse.h5ad 83 | ``` 84 | 85 | ### Setup and run 86 | ```bash 87 | export NUMEXPR_MAX_THREADS=56 # equal to number of threads on a single socket 88 | export NUMBA_NUM_THREADS=56 # Remember to delete __pycache__ folder from local directory and umap/umap/ directory if increasing number of threads 89 | 90 | # also update sc.settings.n_jobs=56 to set number of threads inside 1M_brain_cpu_analysis.py 91 | 92 | cd ~/Open-Omics-Acceleration-Framework/pipelines/single-cell-RNA-seq-analysis/notebooks/ 93 | 94 | # Or the jupyter notebook with sklearn patch in it. 95 | # from sklearnex import patch_sklearn 96 | # patch_sklearn() 97 | 98 | jupyter notebook 99 | ``` 100 | 101 | 102 | ## (Alternatively, Option - 3) You can also create Anaconda environment Manually 103 | ```bash 104 | conda create --name single_cell python=3.8.0 105 | conda activate single_cell 106 | ``` 107 | 108 | ### Necessary scanpy tools 109 | ```bash 110 | conda install -y seaborn=0.12.2 scikit-learn=1.0.2 statsmodels=0.13.2 numba=0.53.1 pytables=3.7.0 matplotlib-base=3.6.2 pandas=1.5.2 111 | conda install -y -c conda-forge mkl-service=2.4.0 112 | conda install -y -c conda-forge python-igraph=0.10.3 leidenalg=0.9.1 113 | conda install -y -c conda-forge cython=0.29.33 jinja2=3.1.2 clang-tools=15.0.7 114 | conda install -y -c katanagraph/label/dev -c conda-forge katana-python 115 | ``` 116 | 117 | ### Install scanpy 118 | ```bash 119 | pip install scanpy==1.8.1 120 | ``` 121 | 122 | ### Install scikit-learn intel extension (PIP version) 123 | ```bash 124 | pip install scikit-learn-intelex==2023.0.1 125 | ``` 126 | ### Install other packages 127 | ```bash 128 | pip install pybind11 129 | pip install jupyterlab 130 | pip install wget 131 | ``` 132 | 133 | ### Replace the _t_sne.py file to anaconda environment's daal4py package 134 | ```bash 135 | cp _t_sne.py ~/anaconda3/envs/single_cell/lib/python3.8/site-packages/daal4py/sklearn/manifold/ 136 | ``` 137 | 138 | ### Install umap_extend and umap 139 | ```bash 140 | 141 | pip uninstall umap-learn 142 | cd ~/Open-Omics-Acceleration-Framework/lib/tal/applications/UMAP_fast/umap_extend 143 | python setup.py install # Uncomment AVX512 lines in setup.py before doing this step on avx512 machines 144 | 145 | 146 | cd ~/Open-Omics-Acceleration-Framework/lib/tal/applications/UMAP_fast/umap 147 | python setup.py install # do python setup.py install if moving environment using conda-pack 148 | ``` 149 | -------------------------------------------------------------------------------- /pipelines/single-cell-RNA-seq-analysis/environment.yml: -------------------------------------------------------------------------------- 1 | name: single_cell 2 | channels: 3 | - conda-forge 4 | - defaults 5 | - katanagraph/label/dev 6 | dependencies: 7 | - defaults::python=3.8.0 8 | - conda-forge::gcc_linux-64==12.1.0 9 | - conda-forge::gxx_linux-64==12.1.0 10 | - defaults::seaborn=0.12.2 11 | - defaults::scikit-learn=1.0.2 12 | - defaults::statsmodels=0.13.2 13 | - defaults::numba=0.53.1 14 | - defaults::pytables=3.7.0 15 | - defaults::pip=22.3.1 16 | - defaults::pandas=1.5.2 17 | - defaults::matplotlib-base=3.6.2 18 | - conda-forge::mkl-service=2.4.0 19 | - conda-forge::python-igraph=0.10.3 20 | - conda-forge::leidenalg=0.9.1 21 
| - conda-forge::cython=0.29.33 22 | - conda-forge::jinja2=3.1.2 23 | - conda-forge::clang-tools=15.0.7 24 | - katanagraph/label/dev::katana-python 25 | - pip: 26 | - scanpy==1.8.1 27 | - anndata==0.8.0 28 | - scikit-learn-intelex==2023.0.1 29 | - pybind11 30 | - jupyter 31 | - wget 32 | --------------------------------------------------------------------------------