├── docs
│   ├── input.md
│   └── images
│       └── pipeline_overview.png
├── .gitattributes
├── .dockerignore
├── .gcloudignore
├── src
│   ├── R
│   │   ├── TSSRanges.RData
│   │   ├── joint_cell_plotting_density.R
│   │   ├── atac_qc_plots.R
│   │   ├── cell_annotation_helper_functions.R
│   │   ├── barcode_rank_functions.R
│   │   └── rna_qc_plots.R
│   ├── python
│   │   ├── get_cellxgene_data.py
│   │   ├── qc_atac_count_duplicates_per_barcode.py
│   │   ├── pbc_stats.py
│   │   ├── flexible_import_entities_standard.py
│   │   ├── plot_insert_size_hist.py
│   │   ├── assign_multimappers.py
│   │   ├── barcode_revcomp_detect.py
│   │   ├── write_html.py
│   │   ├── bam_to_fragments.py
│   │   ├── filter_mito_reads.py
│   │   ├── qc_atac_compute_reads_in_peaks.py
│   │   ├── infer_barcodes.py
│   │   ├── generate_h5_rna.py
│   │   ├── rna_barcode_metadata.py
│   │   ├── trim_fastq.py
│   │   ├── match_barcodes.py
│   │   └── joint_cell_plotting.py
│   └── bash
│       └── monitor_script.sh
├── dockerfiles
│   ├── notes-for-bowtie
│   ├── share_task_html_report.dockerfile
│   ├── share_task_generate_h5.dockerfile
│   ├── terra_archr_and_seurat.dockerfile
│   ├── share_task_correct_fastq.dockerfile
│   ├── 10x_task_preprocess.dockerfile
│   ├── share_task_joint_qc.dockerfile
│   ├── share_task_trim_fastqs_atac.dockerfile
│   ├── share_task_qc_rna.dockerfile
│   ├── share_task_seurat.dockerfile
│   ├── share_task_preprocess.dockerfile
│   ├── share_task_merge_bams.dockerfile
│   ├── share_task_archr.dockerfile
│   ├── dorcs_task_find_dorcs.dockerfile
│   ├── share_task_bowtie2.dockerfile
│   ├── share_task_star.dockerfile
│   ├── share_task_cell_annotation.dockerfile
│   ├── share_task_filter_atac.dockerfile
│   └── share_task_qc_atac.dockerfile
├── example_input_json
│   ├── subwf_preprocess.json
│   └── inputs-short-share.json
├── .gitignore
├── tasks
│   ├── raise_exception.wdl
│   ├── share_task_log_atac.wdl
│   ├── 10x_create_barcode_mapping.wdl
│   ├── share_task_log_rna.wdl
│   ├── get_cellxgene_data.wdl
│   ├── share_task_correct_fastq.wdl
│   ├── share_task_trim_fastqs_atac.wdl
│   ├── share_task_generate_h5.wdl
│   ├── share_task_html_report.wdl
│   ├── dorcs_task_find_dorcs.wdl
│   ├── share_task_star.wdl
│   ├── share_task_cell_annotation.wdl
│   ├── share_task_qc_rna.wdl
│   ├── share_task_joint_qc.wdl
│   ├── 10x_task_preprocess.wdl
│   └── share_task_merge_bams.wdl
├── LICENSE
├── .dockstore.yml
├── workflows
│   ├── subwf-cell-annotation.wdl
│   ├── subwf-atac-archr.wdl
│   ├── subwf-rna-seurat.wdl
│   └── subwf-find-dorcs.wdl
├── .vimrc
└── README.md
/docs/input.md:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | .cache
3 | data
4 | input_examples
5 | tasks
6 | tests
7 | tmp
8 | 
--------------------------------------------------------------------------------
/.gcloudignore:
--------------------------------------------------------------------------------
1 | input_examples
2 | LICENSE
3 | README.md
4 | share-seq.wdl
5 | tasks
6 | tests
7 | workflows
8 | 
--------------------------------------------------------------------------------
/src/R/TSSRanges.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/epi-SHARE-seq-pipeline/HEAD/src/R/TSSRanges.RData
--------------------------------------------------------------------------------
/dockerfiles/notes-for-bowtie:
-------------------------------------------------------------------------------- 1 | https://community.arm.com/developer/tools-software/hpc/b/hpc-blog/posts/tuning-bowtie2-better-performance 2 | -------------------------------------------------------------------------------- /docs/images/pipeline_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/epi-SHARE-seq-pipeline/HEAD/docs/images/pipeline_overview.png -------------------------------------------------------------------------------- /example_input_json/subwf_preprocess.json: -------------------------------------------------------------------------------- 1 | { 2 | "wf_preprocess.atac_primers" : "P1.01,P1.02", 3 | "wf_preprocess.rna_primers" : "P1.17,P1.18", 4 | "wf_preprocess.read1" : "other-files-for-testing/Undetermined_S1_R1_001.fastq.gz", 5 | "wf_preprocess.read2" : "other-files-for-testing/Undetermined_S1_R4_001.fastq.gz", 6 | "wf_preprocess.index1" : "other-files-for-testing/Undetermined_S1_R2_001.fastq.gz", 7 | "wf_preprocess.index2" : "other-files-for-testing/Undetermined_S1_R3_001.fastq.gz" 8 | 9 | } 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | 12 | .DS_Store 13 | 14 | .ipynb_checkpoints* 15 | 16 | .dockstore.yml 17 | 18 | src/jupyter_nb/log/ 19 | src/jupyter_nb/prefix.rna.cell.annotation.plots.mm10/ 20 | build_docker.sh 21 | -------------------------------------------------------------------------------- /tasks/raise_exception.wdl: -------------------------------------------------------------------------------- 1 | # From https://github.com/ENCODE-DCC/chip-seq-pipeline2/blob/master/chip.wdl 2 | 3 | 4 | task raise_exception { 5 | input { 6 | String msg 7 | Array[String]? 
vals 8 | } 9 | command { 10 | echo -e "\n* Error: ${msg}\n" >&2 11 | echo -e "* Vals: ${sep=',' vals}\n" >&2 12 | exit 2 13 | } 14 | output { 15 | String error_msg = '${msg}' 16 | } 17 | runtime { 18 | maxRetries : 0 19 | cpu : 1 20 | memory : '2 GB' 21 | time : 1 22 | disks : 'local-disk 10 SSD' 23 | docker : 'encodedcc/chip-seq-pipeline:v2.2.1' 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /dockerfiles/share_task_html_report.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for BROAD GRO share-seq-pipeline 3 | # Based on Ubuntu 18.04.3 4 | ############################################################ 5 | 6 | # Set the base image to Ubuntu 18.04.3 7 | #FROM ubuntu:focal 8 | FROM ubuntu@sha256:d1d454df0f579c6be4d8161d227462d69e163a8ff9d20a847533989cf0c94d90 9 | 10 | LABEL maintainer="Neva Durand" 11 | 12 | # To prevent time zone prompt 13 | ENV DEBIAN_FRONTEND=noninteractive 14 | 15 | # Install softwares from apt repo 16 | RUN apt-get update && apt-get install -y \ 17 | python3 \ 18 | && rm -rf /var/lib/apt/lists/* 19 | 20 | # Make directory for all softwares 21 | RUN mkdir /software 22 | WORKDIR /software 23 | ENV PATH="/software:${PATH}" 24 | 25 | # Copy the external scripts inside 26 | COPY src/python/write_html.py /software 27 | -------------------------------------------------------------------------------- /dockerfiles/share_task_generate_h5.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for BROAD GRO share-seq-pipeline 3 | # Based on Debian slim 4 | ############################################################ 5 | 6 | FROM python@sha256:7ad180fdf785219c4a23124e53745fbd683bd6e23d0885e3554aff59eddbc377 7 | 8 | LABEL maintainer = "Eugenio Mattei" 9 | LABEL software = "Share-seq pipeline" 10 | LABEL software.version="1.0.0" 11 | LABEL software.organization="Broad Institute of MIT and Harvard" 12 | LABEL software.version.is-production="Yes" 13 | LABEL software.task="generate_h5" 14 | 15 | # Install python packages 16 | RUN pip install --no-cache-dir h5py scipy 17 | 18 | # Create and setup new user 19 | ENV USER=shareseq 20 | WORKDIR /home/$USER 21 | RUN groupadd -r $USER &&\ 22 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 23 | chown $USER:$USER /home/$USER 24 | 25 | # Copy scripts 26 | COPY --chown=$USER:$USER src/python/generate_h5_rna.py /usr/local/bin/ 27 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 28 | 29 | USER ${USER} 30 | -------------------------------------------------------------------------------- /dockerfiles/terra_archr_and_seurat.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for Terra to support ArchR 3 | # Based on Debian slim 4 | ############################################################ 5 | 6 | FROM us.gcr.io/broad-dsp-gcr-public/terra-jupyter-r:2.1.3 7 | 8 | LABEL maintainer = "Siddarth Wekhande" 9 | LABEL software = "ArchR on Terra" 10 | LABEL software.version="0.0.1" 11 | LABEL software.organization="Broad Institute of MIT and Harvard" 12 | LABEL software.version.is-production="No" 13 | LABEL software.task="archr" 14 | 15 | USER root 16 | 17 | RUN R --no-echo 
--no-restore --no-save -e "install.packages(c('hdf5r','remotes'))"
18 | 
19 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.1.1')"
20 | 
21 | RUN R --no-echo --no-restore --no-save -e "remotes::install_github('GreenleafLab/ArchR@v1.0.1', repos = BiocManager::repositories());ArchR::installExtraPackages()"
22 | 
23 | RUN R --no-echo --no-restore --no-save -e "remotes::install_github('immunogenomics/presto')"
24 | 
25 | ENV USER jupyter
26 | USER $USER
27 | 
28 | ENTRYPOINT ["/bin/bash"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2021 Broad Institute
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/src/python/get_cellxgene_data.py:
--------------------------------------------------------------------------------
1 | """
2 | This script takes a dataset id as input and downloads an h5ad file
3 | from the cellxgene server using the cellxgene_census API.
4 | """ 5 | 6 | import argparse 7 | import logging 8 | import cellxgene_census 9 | import scanpy as sc 10 | 11 | def parse_arguments(): 12 | parser = argparse.ArgumentParser(description="Download data from cellxgene server") 13 | parser.add_argument("--id", type=str, required=True, 14 | help="Cellxgene dataset id to download.") 15 | parser.add_argument("--out", type=str, required=True, 16 | help="Output filename", default="reference") 17 | 18 | return parser.parse_args() 19 | 20 | 21 | if __name__ == '__main__': 22 | # create log file 23 | logging.basicConfig(filename="get_cellxgene_data.log", level=logging.INFO) 24 | 25 | # get arguments 26 | args = parse_arguments() 27 | 28 | logging.info("Downloading data\n") 29 | cellxgene_census.download_source_h5ad( 30 | dataset_id=args.id, 31 | to_path=f"{args.out}.h5ad") 32 | 33 | adata = sc.read_h5ad(f"{args.out}.h5ad") 34 | 35 | # get counts 36 | if not adata.raw: 37 | adata.raw = adata.copy() 38 | 39 | adata.write_h5ad(f"{args.out}.h5ad") 40 | 41 | logging.info("All done!") 42 | -------------------------------------------------------------------------------- /dockerfiles/share_task_correct_fastq.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for BROAD GRO share-seq-pipeline 3 | # Based on Debian slim 4 | ############################################################ 5 | 6 | FROM python@sha256:7ad180fdf785219c4a23124e53745fbd683bd6e23d0885e3554aff59eddbc377 7 | 8 | LABEL maintainer = "Eugenio Mattei" 9 | LABEL software = "Share-seq pipeline" 10 | LABEL software.version="1.0.0" 11 | LABEL software.organization="Broad Institute of MIT and Harvard" 12 | LABEL software.version.is-production="Yes" 13 | LABEL software.task="correct_fastq" 14 | 15 | # To prevent time zone prompt 16 | ENV DEBIAN_FRONTEND=noninteractive 17 | 18 | # Install softwares from apt repo 19 | RUN apt-get update && apt-get install -y \ 20 | pigz && \ 21 | rm -rf /var/lib/apt/lists/* 22 | 23 | # Install python packages 24 | RUN pip install --no-cache-dir --break-system-packages xopen 25 | 26 | # Create and setup new user 27 | ENV USER=shareseq 28 | WORKDIR /home/$USER 29 | RUN groupadd -r $USER &&\ 30 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 31 | chown $USER:$USER /home/$USER 32 | 33 | # Copy scripts 34 | COPY --chown=$USER:$USER src/python/correct_fastq.py /usr/local/bin/ 35 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 36 | 37 | USER ${USER} 38 | -------------------------------------------------------------------------------- /dockerfiles/10x_task_preprocess.dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f 2 | 3 | LABEL maintainer = "Eugenio Mattei" 4 | LABEL software = "Share-seq pipeline" 5 | LABEL software.version="1.0.0" 6 | LABEL software.organization="Broad Institute of MIT and Harvard" 7 | LABEL software.version.is-production="Yes" 8 | LABEL software.task="10x preprocess" 9 | 10 | RUN apt-get update && apt-get install -y \ 11 | gcc \ 12 | git \ 13 | python3 \ 14 | python3-dev \ 15 | python3-pip \ 16 | zlib1g-dev \ 17 | wget &&\ 18 | rm -rf /var/lib/apt/lists/* 19 | 20 | # Install packages for python3 scripts (pysam, SAMstats) 21 | RUN python3 -m pip install --no-cache-dir --break-system-packages --ignore-installed numpy pandas pybind11 
--editable=git+https://github.com/GreenleafLab/matcha.git#egg=matcha 22 | 23 | # Create and setup new user 24 | ENV USER=shareseq 25 | WORKDIR /home/$USER 26 | 27 | RUN groupadd -r $USER &&\ 28 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 29 | chown $USER:$USER /home/$USER 30 | 31 | # Add folder with software to the path 32 | ENV PATH="/software:${PATH}" 33 | 34 | # Copy the compiled software from the builder 35 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 36 | COPY --chown=$USER:$USER src/python/barcode_revcomp_detect.py /usr/local/bin 37 | COPY --chown=$USER:$USER src/python/match_barcodes.py /usr/local/bin 38 | 39 | USER ${USER} 40 | -------------------------------------------------------------------------------- /example_input_json/inputs-short-share.json: -------------------------------------------------------------------------------- 1 | { 2 | "share.chemistry": "String", 3 | "share.read1_atac": "Array[File]", 4 | "share.read2_atac": "Array[File]", 5 | "share.read1_rna": "Array[File]", 6 | "share.read2_rna": "Array[File]", 7 | "share.genome_name_input": "String", 8 | "share.pipeline_type": "['full', 'count_only', 'no-align']", 9 | 10 | 11 | "share.pkr": "String? (optional, default = \"\")", 12 | "share.prefix": "String (optional, default = \"shareseq-project\")", 13 | "share.atac.align_multimappers": "Int? (optional)", 14 | "share.whitelist": "File? (optional)", 15 | "share.atac.barcode_tag_fragments": "String? (optional)", 16 | 17 | "share.trim_fastqs": "Boolean (optional, default = true)", 18 | 19 | 20 | "share.append_comment": "Boolean (optional, default = false)", 21 | "share.fastq_barcode": "Array[File] (optional, default = [])", 22 | "share.preprocess_tenx.barcode_dist": "Int? (optional, default = 2)", 23 | "share.preprocess_tenx.threshold_pct_barcode_matching": "Float? (optional, default = 0.6)", 24 | "share.whitelist_atac": "File? (optional)", 25 | "share.whitelist_rna": "File? (optional)", 26 | 27 | 28 | "share.atac.barcode_tag": "String? (optional, default = \"CB\")", 29 | 30 | "share.atac_genome_index_tar": "File? (optional)", 31 | "share.idx_tar_rna": "File? (optional)", 32 | "share.gtf": "File? (optional)", 33 | "share.tss_bed": "File? (optional)", 34 | "share.peak_set": "File? (optional)", 35 | "share.chrom_sizes": "File? 
(optional)" 36 | 37 | } 38 | 39 | -------------------------------------------------------------------------------- /dockerfiles/share_task_joint_qc.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for BROAD GRO share-seq-pipeline 3 | ############################################################ 4 | 5 | #FROM ubuntu@sha256:d1d454df0f579c6be4d8161d227462d69e163a8ff9d20a847533989cf0c94d90 6 | FROM python:3.8-buster@sha256:7e7f4c5508b85268a93b573566c8eb321a6fdb466e3b60c663a42300c73a7400 7 | 8 | LABEL maintainer="Mei Knudson" 9 | 10 | # To prevent time zone prompt 11 | ENV DEBIAN_FRONTEND=noninteractive 12 | ENV SAMTOOLS_VERSION 1.9 13 | 14 | # Install softwares from apt repo 15 | RUN apt-get update && apt-get install -y \ 16 | r-base &&\ 17 | rm -rf /var/lib/apt/lists/* 18 | 19 | # Install packages for python3 scripts 20 | RUN python3 -m pip install matplotlib numpy pandas plotnine 21 | 22 | # Install packages for R scripts 23 | RUN R -e "install.packages(c('ggplot2', 'remotes'))" 24 | RUN R -e "remotes::install_github('LKremer/ggpointdensity')" 25 | 26 | # Create and setup new user 27 | ENV USER=shareseq 28 | WORKDIR /home/$USER 29 | RUN groupadd -r $USER &&\ 30 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 31 | chown $USER:$USER /home/$USER 32 | 33 | ENV PYTHONPATH="/usr/local/python:$PYTHONPATH" 34 | ENV R_LIBS_USER=/usr/local/lib/R 35 | 36 | COPY --chown=$USER:$USER src/python/joint_cell_plotting.py /usr/local/bin 37 | COPY --chown=$USER:$USER src/R/joint_cell_plotting_density.R /usr/local/bin 38 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 39 | 40 | USER ${USER} 41 | -------------------------------------------------------------------------------- /dockerfiles/share_task_trim_fastqs_atac.dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f 2 | 3 | LABEL maintainer = "Eugenio Mattei" 4 | LABEL software = "Share-seq pipeline" 5 | LABEL software.version="1.0.0" 6 | LABEL software.organization="Broad Institute of MIT and Harvard" 7 | LABEL software.version.is-production="Yes" 8 | LABEL software.task="Trim ATAC fastqs" 9 | 10 | # Install softwares from apt repo 11 | RUN apt-get update && apt-get install -y \ 12 | autoconf \ 13 | automake \ 14 | binutils \ 15 | build-essential \ 16 | libcurl4-openssl-dev \ 17 | liblz4-dev \ 18 | liblzma-dev \ 19 | libncurses5-dev \ 20 | libbz2-dev \ 21 | pigz \ 22 | python3-dev \ 23 | python3-pip \ 24 | wget \ 25 | zlib1g-dev &&\ 26 | rm -rf /var/lib/apt/lists/* 27 | 28 | # Install python packages 29 | RUN pip install --no-cache-dir --break-system-packages dnaio Levenshtein 30 | # Install fastp 31 | RUN wget http://opengene.org/fastp/fastp.0.20.1 && mv fastp.0.20.1 fastp && chmod a+x ./fastp && mv ./fastp /usr/local/bin 32 | 33 | # Create and setup new user 34 | ENV USER=shareseq 35 | WORKDIR /home/$USER 36 | 37 | RUN groupadd -r $USER &&\ 38 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 39 | chown $USER:$USER /home/$USER 40 | 41 | # Add folder with software to the path 42 | ENV PATH="/software:${PATH}" 43 | 44 | # Copy the compiled software from the builder 45 | COPY --chown=$USER:$USER src/python/trim_fastq.py /usr/local/bin 46 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 47 | 
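# Drop root privileges: everything from here on runs as the unprivileged shareseq user created above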
48 | USER ${USER} 49 | -------------------------------------------------------------------------------- /.dockstore.yml: -------------------------------------------------------------------------------- 1 | version: 1.2 2 | workflows: 3 | - name: "SHARE-seq" 4 | subclass: WDL 5 | primaryDescriptorPath: /share-seq.wdl 6 | filters: # Only develop or master branches and localAligner/** tags 7 | branches: 8 | - main 9 | - IGVF-variant-jamboree 10 | tags: 11 | - /.*/ 12 | 13 | - name: "dorcs-find-dorcs" 14 | subclass: WDL 15 | primaryDescriptorPath: /workflows/subwf-find-dorcs.wdl 16 | filters: # Only develop or master branches and localAligner/** tags 17 | branches: 18 | - main 19 | tags: 20 | - /.*/ 21 | - name: "SHARE-seq-atac-archr" 22 | subclass: WDL 23 | primaryDescriptorPath: /workflows/subwf-atac-archr.wdl 24 | filters: # Only develop or master branches and localAligner/** tags 25 | branches: 26 | - main 27 | - dev 28 | tags: 29 | - /.*/ 30 | 31 | - name: "SHARE-seq-rna-seurat" 32 | subclass: WDL 33 | primaryDescriptorPath: /workflows/subwf-rna-seurat.wdl 34 | filters: # Only develop or master branches and localAligner/** tags 35 | branches: 36 | - main 37 | - dev 38 | tags: 39 | - /.*/ 40 | 41 | - name: "SHARE-seq-sample-demultiplexing" 42 | subclass: WDL 43 | primaryDescriptorPath: /workflows/subwf-preprocess.wdl 44 | filters: # Only develop or master branches and localAligner/** tags 45 | branches: 46 | - main 47 | tags: 48 | - /.*/ 49 | 50 | - name: "SHARE-seq-cell-annotation" 51 | subclass: WDL 52 | primaryDescriptorPath: /workflows/subwf-cell-annotation.wdl 53 | filters: # Only develop or master branches and localAligner/** tags 54 | branches: 55 | - main 56 | - dev 57 | - cell-annotation 58 | tags: 59 | - /.*/ 60 | -------------------------------------------------------------------------------- /src/python/qc_atac_count_duplicates_per_barcode.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Count the number of unique and duplicate fragments per barcode using the DS and DI tag from picard 4 | 5 | import argparse 6 | import pysam 7 | import sys 8 | 9 | from collections import defaultdict 10 | 11 | def count_duplicates(in_path, out_path, barcode_tag="CB"): 12 | """ 13 | """ 14 | # Dictionary holding the unique and duplicate count per barcode 15 | counter = defaultdict(lambda: [0,0]) 16 | input = pysam.AlignmentFile(in_path, "rb") 17 | for read in input: 18 | cell_barcode = read.get_tag(barcode_tag) 19 | if read.flag & 1024 == 1024: 20 | counter[cell_barcode][1] += 1 21 | else: 22 | counter[cell_barcode][0] += 1 23 | 24 | with open(out_path, "w") as out_file: 25 | print("barcode\treads_unique\treads_duplicate\tpct_duplicates", file=out_file) 26 | for barcode, counts_vector in counter.items(): 27 | print(f"{barcode}\t{counts_vector[0]}\t{counts_vector[1]}\t{round(counts_vector[1]/(counts_vector[0]+counts_vector[1])*100,1)}", file=out_file) 28 | 29 | if __name__ == '__main__': 30 | 31 | msg = "Add the description" 32 | parser = argparse.ArgumentParser(description = msg) 33 | 34 | # Adding optional argument 35 | parser.add_argument("bam_wdup", help = "Path to the coordinate-sorted bam file with duplicates marked but nor removed.") 36 | parser.add_argument("-o", "--output", help = "Path to the fragments output file.") 37 | parser.add_argument("--prefix", help = "Prefix for the metrics output file.") 38 | parser.add_argument("--bc_tag", help = "Specify the tag containing the cell barcode.", default="CB") 39 | 40 | # Read arguments from 
command line 41 | args = parser.parse_args() 42 | 43 | if args.prefix: 44 | prefix = args.prefix 45 | else: 46 | prefix = args.bam_wdup[:-4] 47 | 48 | if args.output: 49 | out_path = args.output 50 | else: 51 | out_path = f"{prefix}.duplicate.stats.tsv" 52 | 53 | bc_tag = args.bc_tag 54 | 55 | 56 | count_duplicates(args.bam_wdup, out_path, bc_tag) 57 | -------------------------------------------------------------------------------- /src/python/pbc_stats.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | # Author Kundaje lab 4 | # https://github.com/kundajelab/ENCODE_scatac/blob/master/workflow/scripts/pbc_stats.py 5 | # Input QNAME sorted 6 | 7 | 8 | def calc_pbc(in_sam, out_path): 9 | total_pairs = 0 10 | distinct_pairs = -1 11 | one_read_pairs = 0 12 | two_read_pairs = 0 13 | 14 | current_pair = None 15 | current_count = 0 16 | 17 | for al in in_sam: 18 | fields = al.strip().split('\t') 19 | flag = int(fields[1]) 20 | rname = fields[2] 21 | pos = int(fields[3]) 22 | pnext = int(fields[7]) 23 | 24 | if not (flag & 35 == 35): 25 | continue 26 | 27 | pair = (rname, pos, pnext) 28 | if pair == current_pair: 29 | total_pairs += 1 30 | current_count += 1 31 | else: 32 | total_pairs += current_count 33 | distinct_pairs += 1 34 | if current_count == 1: 35 | one_read_pairs += 1 36 | elif current_count == 2: 37 | two_read_pairs += 1 38 | 39 | current_pair = pair 40 | current_count = 1 41 | 42 | total_pairs += current_count 43 | distinct_pairs += 1 44 | if current_count == 1: 45 | one_read_pairs += 1 46 | elif current_count == 2: 47 | two_read_pairs += 1 48 | 49 | nrf = distinct_pairs / total_pairs 50 | pbc1 = one_read_pairs / distinct_pairs 51 | pbc2 = one_read_pairs / two_read_pairs 52 | 53 | stats_str = "\t".join(str(i) for i in [ 54 | total_pairs, 55 | distinct_pairs, 56 | one_read_pairs, 57 | two_read_pairs, 58 | nrf, 59 | pbc1, 60 | pbc2 61 | ]) 62 | descr_str = "\t".join([ 63 | "TotalReadPairs", 64 | "DistinctReadPairs", 65 | "OneReadPair", 66 | "TwoReadPairs", 67 | "NRF=Distinct/Total", 68 | "PBC1=OnePair/Distinct", 69 | "PBC2=OnePair/TwoPair" 70 | ]) 71 | with open(out_path, 'w') as f: 72 | f.write(f"{descr_str}\n{stats_str}\n") 73 | 74 | if __name__ == "__main__": 75 | qc_path = sys.argv[1] 76 | calc_pbc(sys.stdin, qc_path) 77 | 78 | -------------------------------------------------------------------------------- /tasks/share_task_log_atac.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # TASK 4 | # SHARE-atac-log 5 | # Gather information from log files 6 | 7 | 8 | task log_atac { 9 | meta { 10 | version: 'v0.1' 11 | author: 'Neva C. 
Durand (neva@broadinstitute.org) at Broad Institute of MIT and Harvard' 12 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: log atac task' 13 | } 14 | 15 | input { 16 | # This function takes as input the necessary log files and extracts 17 | # the quality metrics 18 | File alignment_log 19 | File dups_log 20 | } 21 | 22 | command <<< 23 | total_reads=$(awk 'NR==1{print $1}' ~{alignment_log}) 24 | echo $total_reads > total_reads.txt 25 | aligned_uniquely=$(awk 'NR==4{print $1}' ~{alignment_log}) 26 | echo $aligned_uniquely > aligned_uniquely.txt 27 | echo $(($total_reads - $aligned_uniquely)) > unaligned.txt 28 | awk 'NR>1{sum += $2}END{print sum/2}' ~{dups_log} > feature_reads.txt 29 | awk 'NR>1{sum += $3}END{print sum/2}' ~{dups_log} > duplicate_reads.txt 30 | awk 'NR>1{unique+= $2; dups+=$3}END{printf "%5.1f", 100*dups/(unique+dups)}' ~{dups_log} > pct_duplicate_reads.txt 31 | >>> 32 | output { 33 | Int atac_total_reads = read_int("total_reads.txt") 34 | Int atac_aligned_uniquely = read_int("aligned_uniquely.txt") 35 | Int atac_unaligned = read_int("unaligned.txt") 36 | Int atac_feature_reads = read_int("feature_reads.txt") 37 | Int atac_duplicate_reads = read_int("duplicate_reads.txt") 38 | Float atac_pct_dup = read_float("pct_duplicate_reads.txt") 39 | } 40 | 41 | runtime { 42 | docker: 'ubuntu:latest' 43 | } 44 | parameter_meta { 45 | alignment_log: { 46 | description: 'ATAC alignment log file', 47 | help: 'Log file from ATAC alignment step.', 48 | example: 'SS-PKR-30-96-ENTIRE-PLATE.atac.align.hg38.Log.out' 49 | } 50 | dups_log: { 51 | description: 'ATAC dups log file', 52 | help: 'Log file from ATAC rmdups step.', 53 | example: 'SS-PKR-12.atac.counts.mm10.filtered.cs.log' 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /tasks/10x_create_barcode_mapping.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # TASK 4 | # 10x_barcode_mapping 5 | 6 | task mapping_tenx_barcodes { 7 | meta { 8 | version: 'v0.1' 9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard' 10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: preprocess 10x ATAC data.' 11 | } 12 | 13 | input { 14 | # This task takes in input the 3 fastqs coming out from cellranger mkfastqs and preprocess them. 15 | File whitelist_atac # Barcode whitelist (chemistry specific) 16 | File whitelist_rna # Barcode whitelist (chemistry specific) 17 | 18 | Int? cpus = 16 19 | Float? disk_factor = 0.5 20 | Float? memory_factor = 0.15 21 | String? docker_image = "debian:bullseye-slim" 22 | } 23 | 24 | # Determine the size of the input 25 | Float input_file_size_gb = size(whitelist_rna, "G") + size(whitelist_atac, "G") 26 | 27 | # Determining memory size base on the size of the input files. 28 | Float mem_gb = 5.0 + memory_factor * input_file_size_gb 29 | 30 | # Determining disk size base on the size of the input files. 31 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb) 32 | 33 | # Determining disk type base on the size of disk. 
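    # (Local SSDs on GCP are provisioned in fixed 375 GB units, so anything larger falls back to a persistent SSD volume.)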
34 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL" 35 | 36 | String barcode_conversion_dict = "barcode_conversion_dict.csv" 37 | 38 | command <<< 39 | set -e 40 | 41 | if [ "$(zcat ~{whitelist_atac} | wc -l)" -eq "$(zcat ~{whitelist_rna} | wc -l)" ]; then 42 | zcat ~{whitelist_atac} | tr ACGTacgt TGCAtgca | rev | paste -d ',' - <(zcat ~{whitelist_rna}) > ~{barcode_conversion_dict} 43 | paste -d ',' <(zcat ~{whitelist_atac}) <(zcat ~{whitelist_rna}) >> ~{barcode_conversion_dict} 44 | fi 45 | >>> 46 | 47 | output { 48 | File? tenx_barcode_conversion_dict = barcode_conversion_dict 49 | } 50 | 51 | runtime { 52 | cpu: cpus 53 | docker: "${docker_image}" 54 | disks: "local-disk ${disk_gb} ${disk_type}" 55 | maxRetries: 1 56 | memory: "${mem_gb} GB" 57 | memory_retry_multiplier: 2 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /dockerfiles/share_task_qc_rna.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for BROAD GRO share-seq-pipeline 3 | # Based on Debian slim 4 | ############################################################ 5 | 6 | FROM r-base@sha256:fff003a52d076e963396876b83cfa88c4f40a8bc27e341339cd3cc0236c1db79 7 | 8 | LABEL maintainer = "Eugenio Mattei" 9 | LABEL software = "Share-seq pipeline" 10 | LABEL software.version="1.0.0" 11 | LABEL software.organization="Broad Institute of MIT and Harvard" 12 | LABEL software.version.is-production="Yes" 13 | LABEL software.task="qc_rna" 14 | 15 | ENV R_VERSION=4.1.2 16 | 17 | # To prevent time zone prompt 18 | ENV DEBIAN_FRONTEND=noninteractive 19 | ENV SAMTOOLS_VERSION 1.9 20 | 21 | # Install softwares from apt repo 22 | RUN apt-get update && apt-get install -y \ 23 | autoconf \ 24 | automake \ 25 | binutils \ 26 | build-essential \ 27 | git \ 28 | libcurl4-openssl-dev \ 29 | liblz4-dev \ 30 | liblzma-dev \ 31 | libncurses5-dev \ 32 | libbz2-dev \ 33 | python3 \ 34 | python3-dev \ 35 | python3-full \ 36 | python3-pip \ 37 | wget \ 38 | zlib1g-dev &&\ 39 | rm -rf /var/lib/apt/lists/* 40 | 41 | # Install samtools 1.9 42 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \ 43 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \ 44 | cd samtools && make && make install && cd ../ && rm -rf samtools* htslib* 45 | 46 | # Install python packages 47 | RUN pip install --no-cache-dir --break-system-packages pysam 48 | 49 | # Create and setup new user 50 | ENV USER=shareseq 51 | WORKDIR /home/$USER 52 | RUN groupadd -r $USER &&\ 53 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 54 | chown $USER:$USER /home/$USER 55 | 56 | ENV R_LIBS_USER=/usr/local/lib/R 57 | 58 | # Copy scripts 59 | COPY --chown=$USER:$USER src/python/rna_barcode_metadata.py /usr/local/bin/ 60 | COPY --chown=$USER:$USER src/R/barcode_rank_functions.R /usr/local/bin/ 61 | COPY --chown=$USER:$USER src/R/rna_qc_plots.R /usr/local/bin/ 62 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 63 | 64 | USER ${USER} 65 | -------------------------------------------------------------------------------- /dockerfiles/share_task_seurat.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for BROAD GRO share-seq-pipeline 3 | # Based 
on Debian slim 4 | ############################################################ 5 | 6 | FROM r-base@sha256:fff003a52d076e963396876b83cfa88c4f40a8bc27e341339cd3cc0236c1db79 as builder 7 | 8 | LABEL maintainer = "Siddarth Wekhande" 9 | LABEL software = "Share-seq pipeline" 10 | LABEL software.version="0.0.1" 11 | LABEL software.organization="Broad Institute of MIT and Harvard" 12 | LABEL software.version.is-production="No" 13 | LABEL software.task="seurat" 14 | 15 | RUN echo "options(repos = 'https://cloud.r-project.org')" > $(R --no-echo --no-save -e "cat(Sys.getenv('R_HOME'))")/etc/Rprofile.site 16 | 17 | ENV R_LIBS_USER=/usr/local/lib/R 18 | ENV RETICULATE_MINICONDA_ENABLED=FALSE 19 | 20 | RUN apt-get update -qq && \ 21 | apt-get install -y -qq --no-install-recommends\ 22 | binutils \ 23 | gtk-doc-tools \ 24 | libcairo2-dev \ 25 | libcurl4-openssl-dev \ 26 | libfreetype-dev \ 27 | libfribidi-dev \ 28 | libgsl-dev \ 29 | libharfbuzz-dev \ 30 | libhdf5-dev \ 31 | libjpeg-dev \ 32 | libmpfr-dev \ 33 | libpng-dev \ 34 | libssl-dev \ 35 | libtiff5-dev \ 36 | libxml2-dev \ 37 | libxt-dev \ 38 | libgeos-dev \ 39 | meson \ 40 | pkg-config \ 41 | python3 \ 42 | python3-pip && \ 43 | rm -rf /var/lib/apt/lists/* 44 | 45 | ENV USER=shareseq 46 | WORKDIR /home/$USER 47 | 48 | RUN groupadd -r $USER &&\ 49 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 50 | chown $USER:$USER /home/$USER 51 | 52 | RUN R --no-echo --no-restore --no-save -e "install.packages(c('hdf5r','remotes','IRkernel','logr','BiocManager'))" 53 | 54 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.3.0')" 55 | 56 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install(c('rhdf5'), update=F, ask=F)" 57 | 58 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 59 | 60 | 61 | RUN python3 -m pip install --break-system-packages jupyter papermill 62 | 63 | COPY src/jupyter_nb/seurat_notebook.ipynb /usr/local/bin/ 64 | 65 | RUN R -e "IRkernel::installspec()" 66 | -------------------------------------------------------------------------------- /dockerfiles/share_task_preprocess.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for BROAD GRO share-seq-pipeline 3 | # Based on Ubuntu 18.04.3 4 | ############################################################ 5 | 6 | # Set the base image to Ubuntu 18.04.3 7 | #FROM ubuntu:focal 8 | FROM ubuntu@sha256:d1d454df0f579c6be4d8161d227462d69e163a8ff9d20a847533989cf0c94d90 9 | 10 | MAINTAINER Neva Durand 11 | 12 | # To prevent time zone prompt 13 | ENV DEBIAN_FRONTEND=noninteractive 14 | 15 | # Install softwares from apt repo 16 | RUN apt-get update && apt-get install -y \ 17 | libncurses5-dev libcurl4-openssl-dev zlib1g-dev liblzma-dev libbz2-dev \ 18 | python3 python3-setuptools python3-pip \ 19 | git wget xmlstarlet \ 20 | openjdk-8-jre \ 21 | && rm -rf /var/lib/apt/lists/* 22 | 23 | # Make directory for all softwares 24 | RUN mkdir /software 25 | WORKDIR /software 26 | ENV PATH="/software:${PATH}" 27 | 28 | # Install samtools 1.9 29 | RUN git clone --branch 1.9 --single-branch https://github.com/samtools/samtools.git && \ 30 | git clone --branch 1.9 --single-branch https://github.com/samtools/htslib.git && \ 31 | cd samtools && make && make install && cd ../ && rm -rf samtools* htslib* 32 | 33 | # Install system/math python packages (python3) 34 | RUN pip3 install 
--no-cache-dir python-Levenshtein==0.12.2 pysam requests oauth2client 35 | 36 | # Install Picard 2.26.11 37 | RUN wget https://github.com/broadinstitute/picard/releases/download/2.26.11/picard.jar && chmod +x picard.jar 38 | 39 | # Install gsutil 40 | # Downloading gcloud package 41 | RUN wget https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz 42 | 43 | # Installing the package 44 | RUN mkdir -p /usr/local/gcloud \ 45 | && gunzip google-cloud-sdk.tar.gz \ 46 | && tar -C /usr/local/gcloud -xvf google-cloud-sdk.tar \ 47 | && /usr/local/gcloud/google-cloud-sdk/install.sh \ 48 | && rm google-cloud-sdk.tar 49 | 50 | # Adding the package path to local 51 | ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin 52 | 53 | # Copy the external scripts inside 54 | COPY src/python/bam_to_raw_fastq.py /software 55 | COPY src/python/flexible_import_entities_standard.py /software 56 | COPY src/python/write_terra_tables.py /software 57 | COPY src/bash/monitor_script.sh /software 58 | -------------------------------------------------------------------------------- /src/R/joint_cell_plotting_density.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(ggpointdensity) 3 | 4 | args <- commandArgs() 5 | pkr <- args[6] 6 | barcode_metadata_file <- args[7] 7 | plot_file <- args[8] 8 | 9 | options(scipen=999) 10 | 11 | barcode_metadata <- read.csv(barcode_metadata_file) 12 | passing_df <- barcode_metadata[barcode_metadata$QC %in% c("RNA only", "ATAC only", "both"),] 13 | 14 | # get max x and y coords to set plot limits 15 | round_to_power_10 <- function(x){ 16 | return(10^ceiling(log10(x))) 17 | } 18 | max_x <- max(passing_df$frags) 19 | max_y <- max(passing_df$umis) 20 | xy_lim <- round_to_power_10(max(max_x, max_y)) 21 | 22 | # palette from https://rdrr.io/github/GreenleafLab/ArchR/src/R/ColorPalettes.R 23 | sambaNight <- c("6"='#1873CC',"2"='#1798E5',"8"='#00BFFF',"5"='#4AC596',"1"='#00CC00',"4"='#A2E700',"9"='#FFFF00',"7"='#FFD200',"3"='#FFA500') 24 | 25 | if (sum(barcode_metadata$QC=="both") > 0) { 26 | png(plot_file, width=8.75, height=6, units="in", res=300) 27 | 28 | density_plot <- ggplot(passing_df, aes(x=frags, y=umis)) + 29 | geom_pointdensity(size=0.7) + 30 | scale_color_gradientn(colors=sambaNight) + 31 | labs(title=paste0("Joint Cell Calling (", pkr, "): Density Plot", sep=""), 32 | x="ATAC Unique Fragments per Barcode", 33 | y="RNA UMIs per Barcode") + 34 | theme_light() + 35 | theme(plot.margin=margin(t=9, r=36.5, b=25, l=9, unit="pt"), 36 | plot.title=element_text(size=12.5, hjust=0.5), 37 | axis.title=element_text(size=11), 38 | axis.text=element_text(size=8.5), 39 | legend.title=element_text(size=8), 40 | legend.text=element_text(size=6), 41 | panel.grid.minor=element_blank()) + 42 | scale_x_continuous(trans="log10", 43 | limits=c(10,xy_lim)) + 44 | scale_y_continuous(trans="log10", 45 | limits=c(10,xy_lim)) 46 | print(density_plot) 47 | dev.off() 48 | } 49 | -------------------------------------------------------------------------------- /src/python/flexible_import_entities_standard.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import requests 3 | 4 | from oauth2client.client import GoogleCredentials 5 | 6 | # function to get authorization bearer token for requests 7 | def get_access_token(): 8 | """Get access token.""" 9 | 10 | scopes = ["https://www.googleapis.com/auth/userinfo.profile", "https://www.googleapis.com/auth/userinfo.email"] 11 | 
credentials = GoogleCredentials.get_application_default() 12 | credentials = credentials.create_scoped(scopes) 13 | return credentials.get_access_token().access_token 14 | 15 | def call_flexible_import_entities(workspace_name, project, tsv): 16 | """Post entities to Terra workspace using flexibleImportEntities.""" 17 | 18 | # rawls request URL for batchUpsert 19 | uri = f"https://api.firecloud.org/api/workspaces/{project}/{workspace_name}/flexibleImportEntities?async=false&deleteEmptyValues=false" 20 | # Get access token and and add to headers for requests. 21 | # -H "accept: */*" -H "Authorization: Bearer [token] -H "Content-Type: application/json" 22 | headers = {"Authorization": "Bearer " + get_access_token(), "accept": "*/*"} 23 | 24 | # Create file dictionary to be passed to request 25 | files = {'entities': open(tsv ,'rb')} 26 | 27 | # capture response from API and parse out status code 28 | response = requests.post(uri, headers=headers, files=files) 29 | status_code = response.status_code 30 | 31 | if status_code != 200: # entities upsert fail 32 | print(f"ERROR: Code {status_code} returned.") 33 | print(response.text) 34 | print(response.raise_for_status()) 35 | 36 | # entities upsert success 37 | print(f"Successfully uploaded entities." + "\n") 38 | 39 | if __name__ == '__main__': 40 | parser = argparse.ArgumentParser(description='') 41 | parser.add_argument('-w', '--workspace_name', required=True, help='name of workspace in which to make changes') 42 | parser.add_argument('-p', '--project', required=True, help='billing project (namespace) of workspace in which to make changes') 43 | parser.add_argument('-t', '--tsv', required=True, help='.tsv file formatted in load format to Terra UI') 44 | 45 | args = parser.parse_args() 46 | 47 | # call import API (firecloud) 48 | call_flexible_import_entities(args.workspace_name, args.project, args.tsv) -------------------------------------------------------------------------------- /tasks/share_task_log_rna.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # TASK 4 | # SHARE-rna-log 5 | # Gather information from log files 6 | 7 | 8 | task log_rna { 9 | meta { 10 | version: 'v0.1' 11 | author: 'Neva C. 
Durand (neva@broadinstitute.org) at Broad Institute of MIT and Harvard' 12 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: log rna task' 13 | } 14 | 15 | input { 16 | # This function takes as input the necessary log files and extracts 17 | # the quality metrics 18 | File alignment_log 19 | File dups_log 20 | } 21 | 22 | command <<< 23 | total_reads=$(awk -F"|" '$1~/input reads/{print $2}' ~{alignment_log}) 24 | echo $total_reads > total_reads.txt 25 | aligned_uniquely=$(awk -F"|" '$1~/Uniquely mapped reads number/{print $2}' ~{alignment_log}) 26 | echo $aligned_uniquely > aligned_uniquely.txt 27 | aligned_multimap=$(awk -F"|" '$1~/Number of reads mapped to multiple loci/{print $2}' ~{alignment_log}) 28 | echo $aligned_multimap > aligned_multimap.txt 29 | echo $(($total_reads - $aligned_uniquely - $aligned_multimap)) > unaligned.txt 30 | awk -F":" '$1~/total reads/{print $2}' ~{dups_log} > feature_reads.txt 31 | awk -F":" '$1~/duplicate reads/{print $2}' ~{dups_log} > duplicate_reads.txt 32 | >>> 33 | output { 34 | Int rna_total_reads = read_int("total_reads.txt") 35 | Int rna_aligned_uniquely = read_int("aligned_uniquely.txt") 36 | Int rna_aligned_multimap = read_int("aligned_multimap.txt") 37 | Int rna_unaligned = read_int("unaligned.txt") 38 | Int rna_feature_reads = read_int("feature_reads.txt") 39 | Int rna_duplicate_reads = read_int("duplicate_reads.txt") 40 | } 41 | 42 | runtime { 43 | docker: 'ubuntu:latest' 44 | } 45 | parameter_meta { 46 | alignment_log: { 47 | description: 'RNA alignment log file', 48 | help: 'Log file from RNA alignment step.', 49 | example: 'SS-PKR-30-96-ENTIRE-PLATE.rna.align.hg38.Log.out' 50 | } 51 | 52 | dups_log: { 53 | description: 'Group UMI dups log file', 54 | help: 'Log file from group UMI task', 55 | example: 'SS-PKR-30-96-ENTIRE-PLATE.rm_dup_barcode.log.txt' 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/python/plot_insert_size_hist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | This script takes in the Picard CollectInsertSizeMetrics histogram txt file output, 5 | and generates the histogram as a png. 
6 | """ 7 | 8 | import argparse 9 | import pandas as pd 10 | from plotnine import * 11 | 12 | def parse_arguments(): 13 | parser = argparse.ArgumentParser(description="Plot insert size histogram") 14 | parser.add_argument("histogram_file", help="Histogram txt file name") 15 | parser.add_argument("pkr", help="PKR ID") 16 | parser.add_argument("out_file", help="Name of output png file") 17 | 18 | return parser.parse_args() 19 | 20 | def get_hist_vals(histogram_file): 21 | """Get dataframe of histogram values""" 22 | with open(histogram_file, "r") as f: 23 | begin_vals = False 24 | insert_size = [] 25 | count = [] 26 | for line in f: 27 | vals = line.rstrip().split(sep="\t") 28 | if begin_vals and len(vals) == 2: # last line is blank 29 | insert_size.append(int(vals[0])) 30 | count.append(int(vals[1])) 31 | elif vals[0] == "insert_size": # desired values occur after line beginning with "insert_size" 32 | begin_vals = True 33 | 34 | df = pd.DataFrame(list(zip(insert_size, count)), columns=["insert_size","count"]) 35 | 36 | return(df) 37 | 38 | def label_func(breaks): 39 | return ["{:.0e}".format(x) for x in breaks] 40 | 41 | def plot_hist(df, pkr, out_file): 42 | plot = (ggplot(df, aes(x="insert_size", y="count")) + 43 | geom_line(color="red") + 44 | geom_area(fill="red") + 45 | labs(title = f"Insert Size Histogram ({pkr})", 46 | x = "Insert size", 47 | y = "Count") + 48 | scale_y_continuous(labels = label_func) + 49 | theme_classic()) 50 | 51 | plot.save(filename = out_file, dpi=1000) 52 | 53 | def main(): 54 | print("Starting histogram plotting script") 55 | args = parse_arguments() 56 | histogram_file = getattr(args, "histogram_file") 57 | pkr = getattr(args, "pkr") 58 | out_file = getattr(args, "out_file") 59 | 60 | df = get_hist_vals(histogram_file) 61 | 62 | plot_hist(df, pkr, out_file) 63 | print("Finished plotting") 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /workflows/subwf-cell-annotation.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # Import the tasks called by the pipeline 4 | import "../tasks/share_task_cell_annotation.wdl" as share_task_cell_annotation 5 | 6 | workflow wf_cell_annotation { 7 | meta { 8 | version: 'v0.1' 9 | author: 'Zhijian Li' 10 | affiliation: 'Broad Institute of MIT and Harvard' 11 | email: 'lizhijia@broadinstitute.org' 12 | description: 'SHARE-Seq pipeline: cell type annotation using RNA-seq data.' 13 | } 14 | 15 | input { 16 | # Sample name 17 | String? prefix="prefix" 18 | 19 | # Reference genome 20 | String genome 21 | 22 | # Reference data for cell annotation 23 | String reference_data_id 24 | String reference_data_name 25 | String reference_label 26 | 27 | # Set true if the reference data uses gene id as feature name. 28 | # This is usually true for data downloaded from cellxgene server 29 | String? gene_id_to_symbol = "TRUE" 30 | 31 | # Query data 32 | File query_data 33 | 34 | # Docker images 35 | String? docker_image="lzj1769/cell_annotation" 36 | 37 | # Runtime parameters 38 | Float? memory_factor = 5 39 | Float? 
disk_factor = 10 40 | } 41 | 42 | call share_task_cell_annotation.cell_annotation as cell_annotation{ 43 | input: 44 | reference_data_id = reference_data_id, 45 | reference_data_name = reference_data_name, 46 | reference_label = reference_label, 47 | query_data = query_data, 48 | genome = genome, 49 | gene_id_to_symbol = gene_id_to_symbol, 50 | prefix = prefix, 51 | docker_image = docker_image, 52 | disk_factor = disk_factor, 53 | memory_factor = memory_factor 54 | } 55 | 56 | output { 57 | File share_cell_annotation_reference_h5ad = cell_annotation.reference_h5ad 58 | File share_cell_annotation_notebook_log = cell_annotation.notebook_log 59 | File share_cell_annotation_monitor_log = cell_annotation.monitor_log 60 | File share_cell_annotation_prediction = cell_annotation.prediction 61 | File share_cell_annotation_prediction_labels = cell_annotation.prediction_labels 62 | File share_cell_annotation_prediction_scores = cell_annotation.prediction_scores 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/python/assign_multimappers.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | """ 5 | From https://github.com/ENCODE-DCC/atac-seq-pipeline/blob/master/src/assign_multimappers.py 6 | Script to take multimappers and randomly assign 7 | REQUIRES A QJNAME SORTED FILE! 8 | """ 9 | 10 | def parse_args(): 11 | ''' 12 | Gives options 13 | ''' 14 | parser = argparse.ArgumentParser(description='Saves reads below a alignment threshold and discards all others') 15 | parser.add_argument('-k', help='Alignment number cutoff') 16 | parser.add_argument('--paired-end', dest='paired_ended', action='store_true', help='Data is paired-end') 17 | args = parser.parse_args() 18 | alignment_cutoff = int(args.k) 19 | paired_ended = args.paired_ended 20 | 21 | return alignment_cutoff, paired_ended 22 | 23 | 24 | if __name__ == "__main__": 25 | ''' 26 | Runs the filtering step of choosing multimapped reads 27 | ''' 28 | 29 | [alignment_cutoff, paired_ended] = parse_args() 30 | 31 | if paired_ended: 32 | alignment_cutoff = int(alignment_cutoff) * 2 33 | 34 | # Store each line in sam file as a list of reads, 35 | # where each read is a list of elements to easily 36 | # modify or grab things 37 | current_reads = [] 38 | current_qname = '' 39 | 40 | for line in sys.stdin: 41 | 42 | read_elems = line.strip().split('\t') 43 | 44 | if read_elems[0].startswith('@'): 45 | sys.stdout.write(line) 46 | continue 47 | 48 | # Keep taking lines that have the same qname 49 | if read_elems[0] == current_qname: 50 | # Add line to current reads 51 | current_reads.append(line) 52 | pass 53 | else: 54 | # Discard if there are more than the alignment cutoff 55 | if len(current_reads) > alignment_cutoff: 56 | current_reads = [line] 57 | current_qname = read_elems[0] 58 | elif len(current_reads) > 0: 59 | # Just output all reads, which are then filtered with samtools 60 | for read in current_reads: 61 | sys.stdout.write(str(read)) 62 | 63 | # And then discard 64 | current_reads = [line] 65 | current_qname = read_elems[0] 66 | else: 67 | # First read in file 68 | current_reads.append(line) 69 | current_qname = read_elems[0] 70 | -------------------------------------------------------------------------------- /src/python/barcode_revcomp_detect.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import sys 3 | 4 | REV_COMP = str.maketrans("ATGC", "TACG") 5 | def 
reverse_complement(seq): 6 | return str.translate(seq, REV_COMP)[::-1] 7 | 8 | def get_open_fn(path): 9 | with open(path, "rb") as f: 10 | is_gzipped = (f.read(2) == b'\x1f\x8b') 11 | return gzip.open if is_gzipped else open 12 | 13 | def read_barcodes(path): 14 | open_fn = get_open_fn(path) 15 | with open_fn(path, 'rt') as file: 16 | bc = [b.strip() for b in file] 17 | bcrc = [reverse_complement(b) for b in bc] 18 | return set(bc), set(bcrc) 19 | 20 | def bc_detect(fastq, whitelist, out, qc, offset, num_reads=100000, thresh=0.45): 21 | bc, bcrc = read_barcodes(whitelist) 22 | 23 | bc_match = 0 24 | bcrc_match = 0 25 | num_lines = num_reads * 4 26 | with gzip.open(fastq, 'rt') as f: 27 | for lnum, line in enumerate(f): 28 | if lnum >= num_lines: 29 | break 30 | if lnum % 4 != 1: 31 | continue 32 | seq = line.strip()[offset:] 33 | if seq in bc: 34 | bc_match += 1 35 | if seq in bcrc: 36 | bcrc_match += 1 37 | 38 | bc_match_prop = bc_match / num_reads 39 | bcrc_match_prop = bcrc_match / num_reads 40 | valid = (bc_match_prop >= thresh) or (bcrc_match_prop >= thresh) 41 | fc_chosen = (bc_match_prop >= bcrc_match_prop) 42 | 43 | with open(qc, 'w') as f: 44 | f.write(f"Direct match proportion: {bc_match_prop}\n") 45 | f.write(f"Reverse-complement match proportion: {bcrc_match_prop}\n") 46 | f.write(f"Reverse-complement chosen: {not fc_chosen}\n") 47 | 48 | if not valid: 49 | raise ValueError(f"Insufficient barcode match rate: {bc_match_prop}, {bcrc_match_prop}") 50 | with open(out, 'w') as f: 51 | if fc_chosen: 52 | f.write(f"{0}\n") 53 | else: 54 | f.write(f"{1}\n") 55 | 56 | try: 57 | fastq = sys.argv[1] 58 | modality = sys.argv[2] 59 | whitelist = sys.argv[3] 60 | 61 | qc = sys.argv[4] 62 | out = sys.argv[5] 63 | thres = sys.argv[6] 64 | 65 | if modality == "10x": 66 | offset = 0 67 | bc_detect(fastq, whitelist, out, qc, offset, 100000, float(thres)) 68 | elif modality == "10x_multiome": 69 | offset = 8 70 | bc_detect(fastq, whitelist, out, qc, offset, 100000, float(thres)) 71 | 72 | except NameError: 73 | pass 74 | -------------------------------------------------------------------------------- /tasks/get_cellxgene_data.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task get_cellxgene_data { 4 | meta { 5 | version: 'v0.1' 6 | author: 'Zhijian Li' 7 | affiliation: 'Broad Institute of MIT and Harvard' 8 | email: 'lizhijia@broadinstitute.org' 9 | description: 'SHARE-Seq pipeline: get data from cellxgene server.' 10 | } 11 | 12 | input { 13 | # Reference data id and name 14 | String reference_data_id 15 | String reference_data_name 16 | 17 | # Docker image 18 | String? docker_image 19 | } 20 | 21 | # Determining memory size base on the size of the input files. 22 | Float mem_gb = 32.0 23 | 24 | # Determining disk size base on the size of the input files. 25 | Int disk_gb = 100 26 | 27 | # Determining disk type base on the size of disk. 
28 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL" 29 | 30 | String reference_h5ad = "${reference_data_name}" 31 | String monitor_log = "monitoring.log" 32 | String running_log = "get_cellxgene_data.log" 33 | 34 | command { 35 | set -e 36 | 37 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 & 38 | 39 | # Download data from cellxgene 40 | python3 $(which get_cellxgene_data.py) ${reference_data_id} ${reference_data_name} 41 | 42 | } 43 | 44 | output { 45 | File reference_h5ad = reference_h5ad 46 | File monitor_log = monitor_log 47 | File running_log = running_log 48 | } 49 | 50 | runtime { 51 | memory : "${mem_gb} GB" 52 | memory_retry_multiplier: 2 53 | disks: "local-disk ${disk_gb} ${disk_type}" 54 | docker : "${docker_image}" 55 | maxRetries:1 56 | } 57 | 58 | parameter_meta { 59 | reference_data_id: { 60 | description: 'Reference dataset id', 61 | help: 'The dataset id from cellxgene server.', 62 | examples: ['3bbb6cf9-72b9-41be-b568-656de6eb18b5'] 63 | } 64 | 65 | reference_data_name: { 66 | description: 'Reference dataset name', 67 | help: 'String used to name the reference data.', 68 | examples: ['reference.h5ad'] 69 | } 70 | 71 | docker_image: { 72 | description: 'Docker image.', 73 | help: 'Docker image for preprocessing step.', 74 | example: ['put link to gcr or dockerhub'] 75 | } 76 | 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /tasks/share_task_correct_fastq.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # TASK 4 | # SHARE-correct-fastq 5 | 6 | task share_correct_fastq { 7 | meta { 8 | version: 'v0.1' 9 | author: 'Mei Knudson (mknudson@broadinstitute.org) at Broad Institute of MIT and Harvard' 10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Correct FASTQs task' 11 | } 12 | 13 | input { 14 | File fastq_R1 15 | File fastq_R2 16 | File whitelist 17 | String sample_type 18 | String? pkr 19 | String? prefix 20 | 21 | Int? cpus = 16 22 | Float? disk_factor = 8.0 23 | Float? memory_factor = 0.08 24 | String? docker_image = "us.gcr.io/buenrostro-share-seq/share_task_correct_fastq:v1.0.0" 25 | } 26 | 27 | # Determine the size of the input 28 | Float input_file_size_gb = size(fastq_R1, "G") + size(fastq_R2, "G") 29 | 30 | # Determining memory size base on the size of the input files. 31 | Float mem_gb = 16.0 + memory_factor * input_file_size_gb 32 | 33 | # Determining disk size base on the size of the input files. 34 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb) 35 | 36 | # Determining disk type base on the size of disk. 
37 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL" 38 | 39 | String corrected_fastq_R1 = basename(fastq_R1, ".fastq.gz") + "_corrected.fastq" 40 | String corrected_fastq_R2 = basename(fastq_R2, ".fastq.gz") + "_corrected.fastq" 41 | String monitor_log = "correct_fastqs_monitor.log" 42 | 43 | command <<< 44 | set -e 45 | 46 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 & 47 | 48 | # Perform barcode error correction on FASTQs 49 | python3 $(which correct_fastq.py) \ 50 | ~{fastq_R1} \ 51 | ~{fastq_R2} \ 52 | ~{corrected_fastq_R1} \ 53 | ~{corrected_fastq_R2} \ 54 | ~{whitelist} \ 55 | ~{sample_type} \ 56 | ~{prefix} \ 57 | ~{pkr} 58 | 59 | pigz -p ~{cpus} *.fastq 60 | >>> 61 | 62 | output { 63 | File corrected_fastq_R1 = "~{corrected_fastq_R1}.gz" 64 | File corrected_fastq_R2 = "~{corrected_fastq_R2}.gz" 65 | File barcode_qc = "~{prefix}_barcode_qc.txt" 66 | File monitor_log = "~{monitor_log}" 67 | } 68 | 69 | runtime { 70 | cpu : cpus 71 | memory : "~{mem_gb} GB" 72 | disks: "local-disk ~{disk_gb} ~{disk_type}" 73 | docker : "~{docker_image}" 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /.vimrc: -------------------------------------------------------------------------------- 1 | " Vim syn file 2 | " Language: Workflow Description Language 3 | " Maintainer: Scott Frazer 4 | " Latest Revision: 21 July 2015 5 | " 6 | if exists("b:current_syn") 7 | finish 8 | endif 9 | 10 | " command { ... } section 11 | syntax region wdlCommandSection start="command\s*{" end="\v\}" contains=wdlCommand,wdlCommandParameter,wdlKeyword,wdlCommandDelimiter 12 | syntax region wdlCommandSection2 start="command\s*<<<" end="\v>>>" contains=wdlCommand,wdlCommandParameter,wdlKeyword,wdlCommandDelimiter 13 | syntax keyword wdlCommandKeyword command contained containedin=wdlCommandSection 14 | syntax match wdlCommand "\zs.\{-}\ze\${" contained containedin=wdlCommandSection 15 | syntax region wdlCommandParameter start=/\v\$\{/ end=/\v\}/ oneline contained containedin=wdlCommandSection contains=wdlType,wdlString,wdlCommandParameterName 16 | syntax match wdlCommandParameterName /\v\zs\w+\ze([\\?\\*\\+]?\})/ contained containedin=wdlCommandParameter 17 | 18 | " Keywords 19 | syntax keyword wdlKeyword workflow task call nextgroup=wdlTaskName 20 | syntax keyword wdlKeyword output scatter if then else runtime 21 | syntax keyword wdlType Boolean Int Float String File Uri nextgroup=wdlIdentifier 22 | syntax keyword wdlImport import 23 | 24 | " Compound Types 25 | syntax region wdlType start=/\(Map\|Array\)\[/ end=/\]/ contains=wdlType nextgroup=wdlIdentifier 26 | 27 | " Identifiers 28 | syntax match wdlIdentifier /\v\s*\w+/ contained 29 | syntax match wdlTaskName /\v\s*\w+/ contained 30 | 31 | " Strings 32 | syntax region wdlString start=/"/ skip=/\\"/ end=/"/ oneline contains=wdlInterpolationWrapper 33 | syntax region wdlInterpolationWrapper start="\v\$\{" end="\v\}" contained containedin=wdlString contains=wdlInterpolatedString 34 | syntax match wdlInterpolatedString "\v\w+" contained containedin=wdlInterpolationWrapper 35 | 36 | " Comments 37 | syntax match wdlComment "\v#.*$" 38 | 39 | highlight link wdlCommandParameter Comment 40 | highlight link wdlKeyword Keyword 41 | highlight link wdlCommandKeyword Keyword 42 | highlight link wdlCommand Punctuation 43 | highlight link wdlTaskName Identifier 44 | 45 | highlight link wdlCommandParameterName Identifier 46 | highlight link wdlIdentifier Identifier 47 | highlight link wdlType Type 48 | highlight 
link wdlString String 49 | highlight link wdlImport Include 50 | highlight link wdlInterpolationWrapper Include 51 | highlight link wdlInterpolatedString Include 52 | highlight link wdlComment Comment 53 | 54 | setlocal commentstring=//\ %s 55 | " @-@ adds the literal @ to iskeyword for @IBAction and similar 56 | setlocal tabstop=2 57 | setlocal softtabstop=2 58 | setlocal shiftwidth=2 59 | 60 | au BufRead,BufNewFile *.wdl set filetype=wdl 61 | -------------------------------------------------------------------------------- /dockerfiles/share_task_merge_bams.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for BROAD GRO share-seq-pipeline 3 | # Based on Debian slim 4 | ############################################################ 5 | 6 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f as builder 7 | 8 | ENV SAMBAMBA_VERSION 0.6.6 9 | ENV PICARD_VERSION 2.27.5 10 | 11 | # To prevent time zone prompt 12 | ENV DEBIAN_FRONTEND=noninteractive 13 | 14 | # Install softwares from apt repo 15 | RUN apt-get update && apt-get install -y \ 16 | autoconf \ 17 | build-essential \ 18 | git \ 19 | libcurl4-openssl-dev \ 20 | liblz4-dev \ 21 | liblzma-dev \ 22 | libncurses5-dev \ 23 | libbz2-dev \ 24 | python3 \ 25 | unzip \ 26 | wget \ 27 | zlib1g-dev && \ 28 | rm -rf /var/lib/apt/lists/* 29 | 30 | 31 | # Make directory for all softwares 32 | RUN mkdir /software 33 | WORKDIR /software 34 | ENV PATH="/software:${PATH}" 35 | 36 | # Install sambamba 0.6.6 37 | RUN wget https://github.com/lomereiter/sambamba/releases/download/v${SAMBAMBA_VERSION}/sambamba_v${SAMBAMBA_VERSION}_linux.tar.bz2 && \ 38 | tar -xvjf sambamba_v${SAMBAMBA_VERSION}_linux.tar.bz2 && \ 39 | mv sambamba_v${SAMBAMBA_VERSION} /usr/local/bin/sambamba && \ 40 | rm -rf sambamba_* 41 | 42 | # Install Picard 2.20.7 43 | RUN wget https://github.com/broadinstitute/picard/releases/download/${PICARD_VERSION}/picard.jar && chmod +x picard.jar && mv picard.jar /usr/local/bin 44 | 45 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f 46 | 47 | LABEL maintainer = "Eugenio Mattei" 48 | LABEL software = "Share-seq pipeline" 49 | LABEL software.version="1.0.0" 50 | LABEL software.organization="Broad Institute of MIT and Harvard" 51 | LABEL software.version.is-production="Yes" 52 | LABEL software.task="merge" 53 | 54 | RUN apt-get update && apt-get install -y \ 55 | openjdk-17-jre && \ 56 | rm -rf /var/lib/apt/lists/* 57 | 58 | # Create and setup new user 59 | ENV USER=shareseq 60 | WORKDIR /home/$USER 61 | 62 | RUN groupadd -r $USER &&\ 63 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 64 | chown $USER:$USER /home/$USER 65 | 66 | # Add folder with software to the path 67 | ENV PATH="/software:${PATH}" 68 | 69 | # Copy the compiled software from the builder 70 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/ 71 | COPY --from=builder --chown=$USER:$USER /lib/x86_64-linux-gnu/* /lib/x86_64-linux-gnu/ 72 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 73 | 74 | 75 | USER ${USER} 76 | 77 | 78 | -------------------------------------------------------------------------------- /dockerfiles/share_task_archr.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for 
BROAD GRO share-seq-pipeline 3 | # Based on Debian slim 4 | ############################################################ 5 | 6 | FROM r-base@sha256:fff003a52d076e963396876b83cfa88c4f40a8bc27e341339cd3cc0236c1db79 as builder 7 | 8 | LABEL maintainer = "Siddarth Wekhande" 9 | LABEL software = "Share-seq pipeline" 10 | LABEL software.version="1.0.0" 11 | LABEL software.organization="Broad Institute of MIT and Harvard" 12 | LABEL software.version.is-production="Yes" 13 | LABEL software.task="archr" 14 | 15 | RUN echo "options(repos = 'https://cloud.r-project.org')" > $(R --no-echo --no-save -e "cat(Sys.getenv('R_HOME'))")/etc/Rprofile.site 16 | 17 | ENV R_LIBS_USER=/usr/local/lib/R 18 | ENV RETICULATE_MINICONDA_ENABLED=FALSE 19 | 20 | RUN apt-get update -qq && \ 21 | apt-get install -y -qq --no-install-recommends\ 22 | binutils \ 23 | gtk-doc-tools \ 24 | libcairo2-dev \ 25 | libcurl4-openssl-dev \ 26 | libfreetype-dev \ 27 | libfribidi-dev \ 28 | libgsl-dev \ 29 | libharfbuzz-dev \ 30 | libhdf5-dev \ 31 | libjpeg-dev \ 32 | libmpfr-dev \ 33 | libpng-dev \ 34 | libssl-dev \ 35 | libtiff5-dev \ 36 | libxml2-dev \ 37 | libxt-dev \ 38 | libmagick++-dev \ 39 | libgeos-dev \ 40 | meson \ 41 | python3 \ 42 | python3-pip && \ 43 | rm -rf /var/lib/apt/lists/* 44 | 45 | ENV USER=shareseq 46 | WORKDIR /home/$USER 47 | 48 | RUN groupadd -r $USER &&\ 49 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 50 | chown $USER:$USER /home/$USER 51 | 52 | RUN R --no-echo --no-restore --no-save -e "install.packages(c('devtools','hdf5r','IRkernel','BiocManager','Cairo','magick'))" 53 | 54 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install(c('GenomeInfoDbData','GenomicRanges','Rsamtools'), update=F, ask=F)" 55 | 56 | RUN R --no-echo --no-restore --no-save -e "devtools::install_github('GreenleafLab/ArchR@v1.0.1', repos = BiocManager::repositories());ArchR::installExtraPackages()" 57 | 58 | RUN R --no-echo --no-restore --no-save -e "devtools::install_github('immunogenomics/presto')" 59 | 60 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.3.0')" 61 | 62 | 63 | RUN R --no-echo --no-restore --no-save -e "install.packages(c('logr','hexbin', 'ggpointdensity'))" 64 | 65 | RUN python3 -m pip install --break-system-packages jupyter papermill 66 | 67 | COPY src/jupyter_nb/archr_notebook.ipynb /usr/local/bin/ 68 | 69 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 70 | 71 | 72 | RUN R -e "IRkernel::installspec()" 73 | -------------------------------------------------------------------------------- /workflows/subwf-atac-archr.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # Import the tasks called by the pipeline 4 | import "../tasks/share_task_archr.wdl" as share_task_archr 5 | 6 | 7 | workflow wf_atac { 8 | meta { 9 | version: 'v0.1' 10 | author: 'Eugenio Mattei (emattei@broadinstitute.org) and Sai Ma @ Broad Institute of MIT and Harvard' 11 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Sub-workflow to process the ATAC portion of SHARE-seq libraries.' 12 | } 13 | 14 | input { 15 | # ATAC Sub-worflow inputs 16 | File atac_fragments_filtered 17 | String genome_name 18 | String peak_set 19 | Int? cpus = 4 20 | String? docker 21 | String? prefix 22 | Int? min_tss = 4 23 | Int? min_frags = 1000 24 | Float? archr_disk_factor = 8.0 25 | Float? 
archr_memory_factor = 4.0 26 | } 27 | 28 | call share_task_archr.archr as archr{ 29 | input: 30 | atac_frag = atac_fragments_filtered, 31 | genome = genome_name, 32 | peak_set = peak_set, 33 | min_tss = min_tss, 34 | min_frags = min_frags, 35 | doublet_k = 10, 36 | doublet_knn_method = "UMAP", 37 | lsi_method = 1, 38 | docker_image = docker, 39 | prefix = prefix, 40 | disk_factor = archr_disk_factor, 41 | memory_factor = archr_memory_factor 42 | } 43 | 44 | output { 45 | File share_atac_archr_notebook_output = archr.notebook_output 46 | File share_atac_archr_notebook_log = archr.notebook_log 47 | 48 | File? share_atac_archr_raw_tss_enrichment = archr.archr_raw_tss_by_uniq_frags_plot 49 | File? share_atac_archr_filtered_tss_enrichment = archr.archr_filtered_tss_by_uniq_frags_plot 50 | File? share_atac_archr_raw_fragment_size_plot = archr.archr_raw_frag_size_dist_plot 51 | File? share_atac_archr_filtered_fragment_size_plot = archr.archr_filtered_frag_size_dist_plot 52 | 53 | File? share_atac_archr_umap_doublets = archr.archr_umap_doublets 54 | File? share_atac_archr_umap_cluster_plot = archr.archr_umap_cluster_plot 55 | File? share_atac_archr_umap_num_frags_plot = archr.archr_umap_num_frags_plot 56 | File? share_atac_archr_umap_tss_score_plot = archr.archr_umap_tss_score_plot 57 | File? share_atac_archr_umap_frip_plot = archr.archr_umap_frip_plot 58 | 59 | File? share_atac_archr_gene_heatmap_plot = archr.archr_heatmap_plot 60 | File? share_atac_archr_arrow = archr.archr_arrow 61 | File? share_atac_archr_obj = archr.archr_raw_obj 62 | File? share_atac_archr_plots_zip = archr.plots_zip 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /dockerfiles/dorcs_task_find_dorcs.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for BROAD GRO share-seq-pipeline 3 | # Based on Debian slim 4 | ############################################################ 5 | 6 | FROM r-base@sha256:fff003a52d076e963396876b83cfa88c4f40a8bc27e341339cd3cc0236c1db79 as builder 7 | 8 | RUN echo "options(repos = 'https://cloud.r-project.org')" > $(R --no-echo --no-save -e "cat(Sys.getenv('R_HOME'))")/etc/Rprofile.site 9 | 10 | ENV R_LIBS_USER=/usr/local/lib/R 11 | ENV RETICULATE_MINICONDA_ENABLED=FALSE 12 | 13 | RUN apt-get update -qq && \ 14 | apt-get install -y --no-install-recommends \ 15 | binutils \ 16 | gtk-doc-tools \ 17 | libcairo2-dev \ 18 | libcurl4-openssl-dev \ 19 | libfreetype-dev \ 20 | libfribidi-dev \ 21 | libgsl-dev \ 22 | libharfbuzz-dev \ 23 | libhdf5-dev \ 24 | libjpeg-dev \ 25 | libmpfr-dev \ 26 | libpng-dev \ 27 | libssl-dev \ 28 | libtiff5-dev \ 29 | libxml2-dev \ 30 | libxt-dev \ 31 | libmagick++-dev \ 32 | libgeos-dev \ 33 | meson \ 34 | python3 \ 35 | python3-pip && \ 36 | rm -rf /var/lib/apt/lists/* 37 | 38 | RUN R --no-echo --no-restore --no-save -e "install.packages(c('dplyr','patchwork','ggplot2','ggrepel','reshape2','circlize','networkD3','GGally','igraph','network','foreach','iterators','hdf5r','ggrastr','BiocManager','remotes','pbmcapply','doSNOW','Rmpfr', 'glue','magrittr','pillar','RcppArmadillo','reticulate','rlang','yaml','rpart','IRkernel','data.table', 'tidyft','qlcMatrix','logr'))" 39 | 40 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.1.1')" 41 | 42 | RUN R --no-echo --no-restore --no-save -e 
"BiocManager::install(c('Biostrings','rtracklayer','GenomicRanges','motifmatchr','ComplexHeatmap','chromVAR'), update=T, ask=F)" 43 | 44 | RUN R --no-echo --no-restore --no-save -e "remotes::install_github('caleblareau/BuenColors')" 45 | 46 | ENV USER=shareseq 47 | WORKDIR /home/$USER 48 | RUN groupadd -r $USER &&\ 49 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 50 | chown $USER:$USER /home/$USER 51 | 52 | RUN python3 -m pip install --break-system-packages jupyter papermill 53 | 54 | RUN chown $USER:$USER /usr/local/lib/R 55 | 56 | COPY --chown=$USER:$USER src/jupyter_nb/dorcs_jplot_notebook.ipynb /usr/local/bin/ 57 | 58 | #COPY --chown=$USER:$USER src/jupyter_nb/dorcs_notebook_rds.ipynb /usr/local/bin/ 59 | 60 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 61 | 62 | 63 | RUN mkdir -p /home/R/ 64 | 65 | COPY --chown=$USER:$USER src/R/DORCS_helper_functions_optimized.R src/R/TSSRanges.RData /home/R/ 66 | 67 | USER ${USER} 68 | 69 | RUN R -e "IRkernel::installspec()" 70 | -------------------------------------------------------------------------------- /dockerfiles/share_task_bowtie2.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for BROAD GRO share-seq-pipeline 3 | # Based on Debian slim 4 | ############################################################ 5 | 6 | FROM debian:buster-slim as builder 7 | 8 | ENV BOWTIE2_VERSION 2.4.3 9 | ENV SAMTOOLS_VERSION 1.9 10 | 11 | # To prevent time zone prompt 12 | ENV DEBIAN_FRONTEND=noninteractive 13 | 14 | # Install softwares from apt repo 15 | RUN apt-get update && apt-get install -y \ 16 | build-essential \ 17 | cpanminus \ 18 | git \ 19 | liblz4-dev \ 20 | liblzma-dev \ 21 | libncurses5-dev \ 22 | libbz2-dev \ 23 | unzip \ 24 | wget \ 25 | zlib1g-dev &&\ 26 | rm -rf /var/lib/apt/lists/* 27 | 28 | 29 | # Make directory for all softwares 30 | RUN mkdir /software 31 | WORKDIR /software 32 | ENV PATH="/software:${PATH}" 33 | 34 | RUN cpanm Sys::Hostname 35 | 36 | # Install Bowtie2 2.3.4.3 37 | RUN wget https://sourceforge.net/projects/bowtie-bio/files/bowtie2/${BOWTIE2_VERSION}/bowtie2-${BOWTIE2_VERSION}-source.zip && \ 38 | unzip bowtie2-${BOWTIE2_VERSION}-source.zip && cd bowtie2-${BOWTIE2_VERSION} && make static-libs && make STATIC_BUILD=1 && \ 39 | cp bowtie2* .. && \ 40 | cd .. 
&& rm -rf bowtie2-${BOWTIE2_VERSION}* 41 | 42 | # Install samtools 1.9 43 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \ 44 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \ 45 | cd samtools && make && make install && cd ../ && rm -rf samtools* htslib* 46 | 47 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f 48 | 49 | LABEL maintainer = "Eugenio Mattei" 50 | LABEL software = "Share-seq pipeline" 51 | LABEL software.version="1.0.0" 52 | LABEL software.organization="Broad Institute of MIT and Harvard" 53 | LABEL software.version.is-production="Yes" 54 | LABEL software.task="Bowtie2" 55 | 56 | RUN apt-get update && apt-get install -y \ 57 | cpanminus && \ 58 | rm -rf /var/lib/apt/lists/* 59 | 60 | # Create and setup new user 61 | ENV USER=shareseq 62 | WORKDIR /home/$USER 63 | 64 | RUN groupadd -r $USER &&\ 65 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 66 | chown $USER:$USER /home/$USER 67 | 68 | # Add folder with software to the path 69 | ENV PATH="/software:${PATH}" 70 | 71 | # Copy the compiled software from the builder 72 | COPY --from=builder --chown=$USER:$USER /software/bowtie2* /software/ 73 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/ 74 | COPY --from=builder --chown=$USER:$USER /lib/x86_64-linux-gnu/* /lib/x86_64-linux-gnu/ 75 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 76 | 77 | 78 | 79 | 80 | USER $USER 81 | -------------------------------------------------------------------------------- /src/python/write_html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Write output HTML file from list of images and text 4 | 5 | @author Neva Durand (c) 2021 6 | """ 7 | import argparse 8 | import base64 9 | import io 10 | import os.path 11 | 12 | def main(output_file_name, image_file_list, log_file_list, input_file_name=None): 13 | """ 14 | Write to the input file 15 | Image file list is list of png images 16 | Log file list is list of text log files to link to 17 | 18 | Separates images by br tag and encodes directly in utf-8 19 | Log files separated by their title and encoded via pre tag 20 | """ 21 | # Open output file, write input if exists 22 | output_file = io.open(output_file_name, 'w', encoding='utf8') 23 | output_file.write('Results summary') 24 | if input_file_name is not None: 25 | with open(input_file_name) as input_file: 26 | output_file.write(input_file.read()) 27 | 28 | with open(image_file_list) as fname: 29 | images = fname.read().splitlines() 30 | 31 | # loop through images in image list and encode 32 | output_file.write('
') 33 | for image in images: 34 | data = open(image, 'rb').read() # read bytes from file 35 | data_base64 = base64.b64encode(data) # encode to base64 (bytes) 36 | data_base64 = data_base64.decode('utf-8') # convert bytes to string 37 | output_file.write('<img src="data:image/png;base64,' + data_base64 + '" alt="' + os.path.basename(image) + '"><br>') # embed in html 38 | 39 | with open(log_file_list) as fname: 40 | logs = fname.read().splitlines() 41 | 42 | # loop through log files in log list and write 43 | for log in logs: 44 | output_file.write(log) 45 | output_file.write("
") 46 | output_file.write('') 47 | output_file.close() 48 | 49 | if __name__ == '__main__': 50 | parser = argparse.ArgumentParser( 51 | formatter_class=argparse.RawDescriptionHelpFormatter, 52 | description=__doc__.split('\n\n\n')[0]) 53 | group = parser.add_argument_group() 54 | group.add_argument('output_file_name', 55 | help='html file to write to') 56 | group.add_argument('image_file_list', 57 | help='file containing list of image files to paste in HTML file') 58 | group.add_argument('log_file_list', 59 | help='file containing list of text log files to append to end of HTML file') 60 | group.add_argument('--input_file_name', 61 | help='optional file with html text to add at top of file', nargs='?') 62 | args = parser.parse_args() 63 | main(args.output_file_name, args.image_file_list, args.log_file_list, args.input_file_name) 64 | 65 | -------------------------------------------------------------------------------- /src/python/bam_to_fragments.py: -------------------------------------------------------------------------------- 1 | # From Kundaje lab 2 | # https://github.com/kundajelab/ENCODE_scatac/blob/master/workflow/scripts/bam_to_fragments.py 3 | 4 | import argparse 5 | import pysam 6 | import sys 7 | 8 | def bam_to_frag(in_path, out_path, barcode_tag="CB", shift_plus=4, shift_minus=-4): 9 | """ 10 | Convert coordinate-sorted BAM file to a fragment file format, while adding Tn5 coordinate adjustment 11 | BAM should be pre-filtered for PCR duplicates, secondary alignments, and unpaired reads 12 | Output fragment file is sorted by chr, start, end, barcode 13 | """ 14 | 15 | input = pysam.AlignmentFile(in_path, "rb") 16 | with open(out_path, "w") as out_file: 17 | buf = [] 18 | curr_pos = None 19 | for read in input: 20 | if read.flag & 16 == 16: 21 | continue # ignore reverse (coordinate-wise second) read in pair 22 | 23 | chromosome = read.reference_name 24 | start = read.reference_start + shift_plus 25 | end = read.reference_start + read.template_length + shift_minus 26 | cell_barcode = read.get_tag(barcode_tag) 27 | # assert(read.next_reference_start >= read.reference_start) #### 28 | data = (chromosome, start, end, cell_barcode, 1) 29 | pos = (chromosome, start) 30 | 31 | if pos == curr_pos: 32 | buf.append(data) 33 | else: 34 | buf.sort() 35 | for i in buf: 36 | print(*i, sep="\t", file=out_file) 37 | buf.clear() 38 | buf.append(data) 39 | curr_pos = pos 40 | 41 | if __name__ == '__main__': 42 | 43 | msg = "Add the description" 44 | parser = argparse.ArgumentParser(description = msg) 45 | 46 | # Adding optional argument 47 | parser.add_argument("bam", help = "Path to the coordinate-sorted bam file.") 48 | parser.add_argument("-o", "--output", help = "Path to the fragments output file.") 49 | parser.add_argument("--prefix", help = "Prefix for the metrics output file.") 50 | parser.add_argument("--shift_plus", help = "Tn5 coordinate adjustment for the plus strand.", type = int, default = 4) 51 | parser.add_argument("--shift_minus", help = "Tn5 coordinate adjustment for the minus strand.", type = int, default = -4) 52 | parser.add_argument("--bc_tag", help = "Specify the tag containing the cell barcode.", default="CB") 53 | 54 | # Read arguments from command line 55 | args = parser.parse_args() 56 | 57 | if args.prefix: 58 | prefix = args.prefix 59 | else: 60 | prefix = args.bam[:-4] 61 | 62 | if args.output: 63 | out_path = args.output 64 | else: 65 | out_path = f"{prefix}.fragments.tsv" 66 | 67 | bc_tag = args.bc_tag 68 | 69 | 70 | bam_to_frag(args.bam, out_path, bc_tag, 
shift_plus=args.shift_plus, shift_minus=args.shift_minus) 71 | -------------------------------------------------------------------------------- /dockerfiles/share_task_star.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for BROAD GRO share-seq-pipeline 3 | # Based on Debian slim 4 | ############################################################ 5 | 6 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f as builder 7 | 8 | ENV STAR_VERSION 2.7.10a_alpha_220818 9 | ENV SAMTOOLS_VERSION 1.9 10 | 11 | # To prevent time zone prompt 12 | ENV DEBIAN_FRONTEND=noninteractive 13 | 14 | # Install softwares from apt repo 15 | RUN apt-get update && apt-get install -y \ 16 | build-essential \ 17 | git \ 18 | liblz4-dev \ 19 | liblzma-dev \ 20 | libncurses5-dev \ 21 | libbz2-dev \ 22 | unzip \ 23 | wget \ 24 | zlib1g-dev &&\ 25 | rm -rf /var/lib/apt/lists/* 26 | 27 | 28 | # Make directory for all softwares 29 | RUN mkdir /software 30 | WORKDIR /software 31 | ENV PATH="/software:${PATH}" 32 | 33 | # Install STAR 2.7.10a 34 | RUN wget https://github.com/alexdobin/STAR/releases/download/2.7.10a_alpha_220818/STAR_2.7.10a_alpha_220818_Linux_x86_64_static.zip && unzip STAR_2.7.10a_alpha_220818_Linux_x86_64_static.zip && mv STAR /usr/local/bin/ 35 | #RUN wget https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.tar.gz && tar -xzf ${STAR_VERSION}.tar.gz 36 | #RUN cd STAR-${STAR_VERSION}/source && make STAR && rm ../../${STAR_VERSION}.tar.gz && mv /software/STAR-${STAR_VERSION}/bin/Linux_x86_64/* /usr/local/bin/ 37 | 38 | # Install samtools 1.9 39 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \ 40 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \ 41 | cd samtools && make && make install && cd ../ && rm -rf samtools* htslib* 42 | 43 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f 44 | 45 | LABEL maintainer = "Eugenio Mattei" 46 | LABEL software = "Share-seq pipeline" 47 | LABEL software.version="1.0.0" 48 | LABEL software.organization="Broad Institute of MIT and Harvard" 49 | LABEL software.version.is-production="Yes" 50 | LABEL software.task="STAR" 51 | 52 | RUN apt-get update && apt-get install -y \ 53 | libgc-dev &&\ 54 | rm -rf /var/lib/apt/lists/* 55 | 56 | # Create and setup new user 57 | ENV USER=shareseq 58 | WORKDIR /home/$USER 59 | 60 | RUN groupadd -r $USER &&\ 61 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 62 | chown $USER:$USER /home/$USER 63 | 64 | # Add folder with software to the path 65 | ENV PATH="/software:${PATH}" 66 | 67 | # Copy the compiled software from the builder 68 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/ 69 | COPY --from=builder --chown=$USER:$USER /usr/lib/x86_64-linux-gnu/libgomp.so.1 /lib/x86_64-linux-gnu/libncurses.so.6 /lib/x86_64-linux-gnu/ 70 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 71 | 72 | USER $USER 73 | -------------------------------------------------------------------------------- /workflows/subwf-rna-seurat.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # Import the tasks called by the pipeline 4 | import "../tasks/share_task_seurat.wdl" as share_task_seurat 5 | 6 | workflow wf_rna { 7 | 
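    # A hypothetical input JSON for this sub-workflow (key names follow the input block below;
    # the values and bucket path are placeholders, not pipeline defaults):
    # {
    #   "wf_rna.prefix": "myexperiment",
    #   "wf_rna.genome_name": "hg38",
    #   "wf_rna.h5_matrix": "gs://my-bucket/myexperiment.hg38.rna.h5"
    # }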
meta { 8 | version: 'v0.1' 9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) and Sai Ma @ Broad Institute of MIT and Harvard' 10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Sub-workflow to process the RNA portion of SHARE-seq libraries.' 11 | } 12 | 13 | input { 14 | # RNA Seurat inputs 15 | 16 | String prefix 17 | String genome_name 18 | String? docker 19 | File h5_matrix 20 | 21 | #Seurat filtering parameters 22 | Int? min_features 23 | Float? percent_mt 24 | Int? min_cells 25 | 26 | # Seurat UMAP parameters 27 | Int? umap_dim 28 | Float? umap_resolution 29 | 30 | #Seurat runtime parameters 31 | Float? disk_factor 32 | Float? memory_factor 33 | } 34 | 35 | call share_task_seurat.seurat as seurat{ 36 | input: 37 | rna_matrix = h5_matrix, 38 | genome_name = genome_name, 39 | min_features = min_features, 40 | percent_mt = percent_mt, 41 | min_cells = min_cells, 42 | umap_dim = umap_dim, 43 | umap_resolution = umap_resolution, 44 | prefix = prefix, 45 | docker_image = docker, 46 | disk_factor = disk_factor, 47 | memory_factor = memory_factor 48 | } 49 | 50 | output { 51 | File share_rna_seurat_notebook_output = seurat.notebook_output 52 | File share_rna_seurat_notebook_log = seurat.notebook_log 53 | File? share_rna_seurat_raw_violin_plot = seurat.seurat_raw_violin_plot 54 | File? share_rna_seurat_filtered_violin_plot = seurat.seurat_filtered_violin_plot 55 | File? share_rna_seurat_raw_qc_scatter_plot = seurat.seurat_raw_qc_scatter_plot 56 | File? share_rna_seurat_filtered_qc_scatter_plot = seurat.seurat_filtered_qc_scatter_plot 57 | File? share_rna_seurat_variable_genes_plot = seurat.seurat_variable_genes_plot 58 | File? share_rna_seurat_PCA_dim_loadings_plot = seurat.seurat_PCA_dim_loadings_plot 59 | File? share_rna_seurat_PCA_plot = seurat.seurat_PCA_plot 60 | File? share_rna_seurat_heatmap_plot = seurat.seurat_heatmap_plot 61 | File? share_rna_seurat_jackstraw_plot = seurat.seurat_jackstraw_plot 62 | File? share_rna_seurat_elbow_plot = seurat.seurat_elbow_plot 63 | File? share_rna_seurat_umap_cluster_plot = seurat.seurat_umap_cluster_plot 64 | File? share_rna_seurat_umap_rna_count_plot = seurat.seurat_umap_rna_count_plot 65 | File? share_rna_seurat_umap_gene_count_plot = seurat.seurat_umap_gene_count_plot 66 | File? share_rna_seurat_umap_mito_plot = seurat.seurat_umap_mito_plot 67 | File? share_rna_seurat_obj = seurat.seurat_filtered_obj 68 | File? share_rna_plots_zip = seurat.plots_zip 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/R/atac_qc_plots.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ### Takes ATAC barcode metadata tsv file, and outputs barcode rank plots as a png file. 
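### A hypothetical invocation (file names and the fragment cutoff are placeholders):
###   Rscript atac_qc_plots.R sample.atac.barcode.metadata.tsv 100 sample.atac.fragment.rank.plot.png
### Because commandArgs() is called without trailingOnly=TRUE, the three positional
### arguments land in args[6], args[7], and args[8] below.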
4 | 5 | ## Import helper functions 6 | source("/usr/local/bin/barcode_rank_functions.R") 7 | 8 | ## Get arguments, read input 9 | args <- commandArgs() 10 | 11 | barcode_metadata_file <- args[6] 12 | fragment_cutoff <- as.integer(args[7]) 13 | fragment_rank_plot_file <- args[8] 14 | 15 | barcode_metadata <- read.table(barcode_metadata_file, header=T) 16 | 17 | ## Get plot inputs 18 | 19 | # Impose fragment cutoff, sort in decreasing order, assign rank 20 | # 1 fragment = 2 reads 21 | fragment <- barcode_metadata$reads_unique / 2 22 | fragment_filtered <- fragment[fragment >= fragment_cutoff] 23 | fragment_filtered_sort <- sort(fragment_filtered, decreasing=T) 24 | fragment_rank <- 1:length(fragment_filtered_sort) 25 | 26 | # Find elbow/knee of fragment barcode rank plot and top-ranked fragment barcode rank plot 27 | fragment_points <- get_elbow_knee_points(x=fragment_rank, y=log10(fragment_filtered_sort)) 28 | # For each valid plot, make factor for coloring plot points 29 | if (length(fragment_points) > 0) { # Elbow found in first plot 30 | fragment_plot1 <- TRUE 31 | is_top_ranked_fragment <- factor(ifelse(fragment_rank <= fragment_points[1], 1, 0)) 32 | if (length(fragment_points) > 2) { # Elbow/knee found in second plot 33 | fragment_plot2 <- TRUE 34 | fragment_top_rank <- fragment_rank[1:fragment_points[1]] 35 | fragment_top_fragment <- fragment_filtered_sort[1:fragment_points[1]] 36 | is_top_top_ranked_fragment <- factor(ifelse(fragment_top_rank <= fragment_points[3], 1, 0)) 37 | } else { 38 | fragment_plot2 <- FALSE 39 | } 40 | } else { 41 | fragment_plot1 <- FALSE 42 | } 43 | 44 | ## Generate plots 45 | 46 | options(scipen=999) 47 | 48 | # Make fragment barcode rank plots 49 | png(fragment_rank_plot_file, width=8, height=8, units='in', res=300) 50 | par(mfrow = c(2,1)) 51 | 52 | # Plot 1 (all barcodes passing fragment filter vs log10(fragments)) 53 | if (fragment_plot1) { 54 | plot(x=fragment_rank, 55 | y=fragment_filtered_sort, 56 | log="y", 57 | xlab=paste0(" Barcode rank (", length(fragment_rank)-fragment_points[1], " low quality cells)"), 58 | ylab="Fragments per barcode (log10 scale)", 59 | main="ATAC Fragments per Barcode", 60 | col=c("dimgrey","darkblue")[is_top_ranked_fragment], 61 | pch=16, 62 | ylim=c(1,100000)) 63 | abline(v=fragment_points[1], h=10^(fragment_points[2])) 64 | text(fragment_points[1], 10^(fragment_points[2]), 65 | paste0("(", fragment_points[1], ", ", 10^(fragment_points[2]), ")"), 66 | adj=c(-0.1,-0.5)) 67 | } 68 | 69 | # Plot 2 (top ranked barcodes vs log10(fragments)) 70 | if (fragment_plot2) { 71 | plot(x=fragment_top_rank, 72 | y=fragment_top_fragment, 73 | log="y", 74 | xlab="Barcode rank", 75 | ylab="Fragments per barcode (log10 scale)", 76 | main="ATAC Fragments per Top-Ranked Barcode", 77 | col=c("dimgrey","darkblue")[is_top_top_ranked_fragment], 78 | pch=16, 79 | ylim=c(1,100000)) 80 | abline(v=fragment_points[3], h=10^(fragment_points[4])) 81 | text(fragment_points[3], 10^(fragment_points[4]), 82 | paste("(", fragment_points[3], ", ", 10^(fragment_points[4]), ")", sep=""), 83 | adj=c(-0.1,-0.5)) 84 | } 85 | dev.off() 86 | 87 | -------------------------------------------------------------------------------- /dockerfiles/share_task_cell_annotation.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for BROAD GRO share-seq-pipeline 3 | # Based on Debian slim 4 | ############################################################ 5 | 6 
| FROM ubuntu@sha256:2fdb1cf4995abb74c035e5f520c0f3a46f12b3377a59e86ecca66d8606ad64f9 7 | 8 | LABEL maintainer = "Zhijian Li" 9 | LABEL software = "Share-seq pipeline" 10 | LABEL software.version="0.0.1" 11 | LABEL software.organization="Broad Institute of MIT and Harvard" 12 | LABEL software.version.is-production="No" 13 | LABEL software.task="cell-annotation" 14 | 15 | # To prevent time zone prompt 16 | ENV DEBIAN_FRONTEND=noninteractive 17 | ENV RETICULATE_MINICONDA_ENABLED=FALSE 18 | 19 | ## Create new user 20 | ENV USER=shareseq 21 | WORKDIR /home/$USER 22 | RUN groupadd -r $USER && \ 23 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 24 | chown $USER:$USER /home/$USER 25 | 26 | # Install libraries 27 | RUN apt-get update 28 | RUN apt-get install -y --no-install-recommends \ 29 | gcc \ 30 | g++ \ 31 | gfortran \ 32 | patch \ 33 | build-essential \ 34 | binutils \ 35 | gtk-doc-tools \ 36 | libcairo2-dev \ 37 | libcurl4-openssl-dev \ 38 | libfreetype6-dev \ 39 | libfribidi-dev \ 40 | libgsl-dev \ 41 | libharfbuzz-dev \ 42 | libhdf5-dev \ 43 | libjpeg-dev \ 44 | libmpfr-dev \ 45 | libpng-dev \ 46 | libssl-dev \ 47 | libtiff5-dev \ 48 | libxml2-dev \ 49 | libxt-dev \ 50 | libgeos-dev \ 51 | meson \ 52 | libblas-dev \ 53 | liblapack-dev \ 54 | libbz2-dev 55 | 56 | # Install python and R 57 | RUN apt-get install -y --no-install-recommends \ 58 | python3 python3-pip python3-dev python3-venv r-base 59 | 60 | RUN rm -rf /var/lib/apt/lists/* 61 | 62 | RUN echo "options(repos = 'https://cloud.r-project.org')" > $(R --no-echo --no-save -e "cat(Sys.getenv('R_HOME'))")/etc/Rprofile.site 63 | ENV R_LIBS_USER=/usr/local/lib/R 64 | 65 | RUN R --no-echo --no-restore --no-save -e "install.packages('hdf5r')" 66 | RUN R --no-echo --no-restore --no-save -e "install.packages('remotes')" 67 | RUN R --no-echo --no-restore --no-save -e "install.packages('IRkernel')" 68 | RUN R --no-echo --no-restore --no-save -e "install.packages('logr')" 69 | RUN R --no-echo --no-restore --no-save -e "install.packages('BiocManager')" 70 | RUN R --no-echo --no-restore --no-save -e "install.packages('glue')" 71 | RUN R --no-echo --no-restore --no-save -e "install.packages('Matrix')" 72 | RUN R --no-echo --no-restore --no-save -e "install.packages('SeuratObject')" 73 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.3.0')" 74 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install('rhdf5', update=F, ask=F)" 75 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install('EnsDb.Mmusculus.v79', update=F, ask=F)" 76 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install('EnsDb.Hsapiens.v86', update=F, ask=F)" 77 | RUN R --no-echo --no-restore --no-save -e "install.packages('optparse')" 78 | 79 | RUN python3 -m pip install anndata cellxgene-census 80 | 81 | COPY src/bash/monitor_script.sh /usr/local/bin 82 | COPY src/python/get_cellxgene_data.py /usr/local/bin 83 | COPY src/R/cell_annotation.R /usr/local/bin/ 84 | COPY src/R/cell_annotation_helper_functions.R /usr/local/bin/ 85 | 86 | -------------------------------------------------------------------------------- /tasks/share_task_trim_fastqs_atac.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # TASK 4 | # trim_fastqs_atac 5 | 6 | task share_trim_fastqs_atac { 7 | meta { 8 | version: 'v0.1' 9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard' 10 | description: 'Broad 
Institute of MIT and Harvard SHARE-Seq pipeline: trim ATAC FASTQs.' 11 | } 12 | 13 | input { 14 | File fastq_R1 # Pair 1 reads 15 | File fastq_R2 # Pair 2 reads 16 | String chemistry 17 | 18 | Int? cpus = 16 19 | Float? disk_factor = 8.0 20 | Float? memory_factor = 0.15 21 | String? docker_image = "us.gcr.io/buenrostro-share-seq/share_task_trim_fastqs_atac:v1.0.0" 22 | } 23 | 24 | # Determine the size of the input 25 | Float input_file_size_gb = size(fastq_R1, "G") + size(fastq_R2, "G") 26 | 27 | # Determining memory size base on the size of the input files. 28 | Float mem_gb = 16.0 + memory_factor * input_file_size_gb 29 | 30 | # Determining disk size base on the size of the input files. 31 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb) 32 | 33 | # Determining disk type base on the size of disk. 34 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL" 35 | 36 | # Read trimming outfiles 37 | String fastq_R1_trimmed = basename(fastq_R1, ".fastq.gz") + "_trimmed.fastq" 38 | String fastq_R2_trimmed = basename(fastq_R2, ".fastq.gz") + "_trimmed.fastq" 39 | String trimming_log_json = basename(fastq_R1, "R1.fastq.gz") + ".atac.preprocess.trimming.log.json" 40 | String trimming_log_html = basename(fastq_R1, "R1.fastq.gz") + ".atac.preprocess.trimming.log.html" 41 | String trimming_stats = basename(fastq_R1, "R1.fastq.gz") + ".atac.preprocess.trimming.adapter.stats.txt" 42 | String monitor_log = 'trim_fastqs_atac_monitor.log' 43 | 44 | command <<< 45 | set -e 46 | 47 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 & 48 | 49 | # Use trim_fastq script for SHARE ATAC trimming 50 | if [ '~{chemistry}' == 'shareseq' ]; then 51 | python3 $(which trim_fastq.py) ~{fastq_R1} ~{fastq_R2} ~{fastq_R1_trimmed} ~{fastq_R2_trimmed} ~{trimming_stats} 52 | 53 | # Use fastp for 10X ATAC trimming 54 | else 55 | fastp -i ~{fastq_R1} -I ~{fastq_R2} -o ~{fastq_R1_trimmed} -O ~{fastq_R2_trimmed} -h ~{trimming_log_html} -j ~{trimming_log_json} -G -Q -L -w ~{cpus} 2> ~{trimming_stats} 56 | 57 | fi 58 | 59 | pigz -p ~{cpus} *.fastq 60 | >>> 61 | 62 | output { 63 | File fastq_R1_trimmed = fastq_R1_trimmed + ".gz" 64 | File fastq_R2_trimmed = fastq_R2_trimmed + ".gz" 65 | File? tenx_trimming_log_json = trimming_log_json 66 | File? tenx_trimming_log_html = trimming_log_html 67 | File trimming_stats = trimming_stats 68 | File trim_fastqs_atac_monitor = monitor_log 69 | } 70 | 71 | runtime { 72 | cpu: cpus 73 | docker: "${docker_image}" 74 | disks: "local-disk ${disk_gb} ${disk_type}" 75 | memory: "${mem_gb} GB" 76 | } 77 | 78 | parameter_meta { 79 | fastq_R1: { 80 | description: 'Pairs 1 fastq', 81 | help: 'Pairs 1 fastq', 82 | } 83 | fastq_R2: { 84 | description: 'Pairs 2 fastq', 85 | help: 'Pairs 2 fastq', 86 | } 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /tasks/share_task_generate_h5.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # TASK 4 | # SHARE-rna-generate-h5 5 | 6 | 7 | task generate_h5 { 8 | meta { 9 | version: 'v0.1' 10 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard' 11 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: RNA gene x cell matrix' 12 | } 13 | 14 | input { 15 | # This task computes the the gene x cell matrix. 16 | File tar 17 | String genome_name 18 | String? pkr 19 | String prefix 20 | String? gene_naming 21 | 22 | Float? disk_factor = 8.0 23 | Float? 
memory_factor = 2.0 24 | String? docker_image = "us.gcr.io/buenrostro-share-seq/share_task_generate_h5:v1.0.0" 25 | } 26 | 27 | # Determine the size of the input 28 | Float input_file_size_gb = size(tar, "G") 29 | 30 | # Determining memory size based on the size of the input files. 31 | Float mem_gb = 10.0 + memory_factor * input_file_size_gb 32 | 33 | # Determining disk size based on the size of the input files. 34 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb) 35 | 36 | # Determining disk type based on the size of disk. 37 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL" 38 | 39 | String ensembl_option = if "${gene_naming}"=="ensembl" then "--ensembl" else "" 40 | String h5 = "${default="share-seq" prefix}.${genome_name}.rna.h5" 41 | String monitor_log = "monitor.log" 42 | 43 | command <<< 44 | set -e 45 | 46 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 & 47 | 48 | # Untar 49 | tar xzvf ~{tar} 50 | 51 | # Generate h5 file 52 | python3 $(which generate_h5_rna.py) \ 53 | ./matrix.mtx.gz \ 54 | ./features.tsv.gz \ 55 | ./barcodes.tsv.gz \ 56 | ~{h5} \ 57 | ~{pkr} \ 58 | ~{ensembl_option} 59 | >>> 60 | 61 | output { 62 | File h5_matrix = "${h5}" 63 | } 64 | 65 | runtime { 66 | memory : "${mem_gb} GB" 67 | disks: "local-disk ${disk_gb} ${disk_type}" 68 | docker : "${docker_image}" 69 | } 70 | 71 | parameter_meta { 72 | tar: { 73 | description: 'STARsolo output tar.gz file', 74 | help: 'tar.gz file containing raw matrix, features, and barcodes file from STARsolo.', 75 | example: 'raw.tar.gz' 76 | } 77 | genome_name: { 78 | description: 'Reference name', 79 | help: 'The name genome reference used to align.', 80 | example: ['hg38', 'mm10', 'hg19', 'mm9'] 81 | } 82 | prefix: { 83 | description: 'Prefix for output files', 84 | help: 'Prefix that will be used to name the output files.', 85 | example: 'MyExperiment' 86 | } 87 | gene_naming: { 88 | description: 'Gene naming convention', 89 | help: 'Convention for gene naming in h5 matrix; either "gene_name" (default) or "ensembl".', 90 | example: ['gene_name', 'ensembl'] 91 | } 92 | docker_image: { 93 | description: 'Docker image.', 94 | help: 'Docker image for preprocessing step. Dependencies: python3 -m pip install h5py scipy', 95 | example: ['put link to gcr or dockerhub'] 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /tasks/share_task_html_report.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # TASK 4 | # SHARE-html-report 5 | # Gather information from log files 6 | 7 | 8 | task html_report { 9 | meta { 10 | version: 'v0.1' 11 | author: 'Neva C. Durand (neva@broadinstitute.org) at Broad Institute of MIT and Harvard' 12 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: create html report task' 13 | } 14 | 15 | input { 16 | # This function takes as input the files to append to the report 17 | # and the metrics and writes out an html file 18 | 19 | String? prefix 20 | 21 | # Stats for ATAC and RNA, will go at top of html 22 | Int? atac_total_reads 23 | Int? atac_aligned_uniquely 24 | Int? atac_unaligned 25 | Int? atac_feature_reads 26 | Int? atac_duplicate_reads 27 | Float? atac_percent_duplicates 28 | Int? rna_total_reads 29 | Int? rna_aligned_uniquely 30 | Int? rna_aligned_multimap 31 | Int? rna_unaligned 32 | Int? rna_feature_reads 33 | Int? rna_duplicate_reads 34 | 35 | ## JPEG files to be encoded and appended to html 36 | Array[File?] 
image_files 37 | 38 | ## Raw text logs to append to end of html 39 | Array[String?] log_files 40 | 41 | } 42 | 43 | String output_file = "${default="share-seq" prefix}.html" 44 | # need to select from valid files since some are optional 45 | Array[File] valid_image_files = select_all(image_files) 46 | Array[String] valid_log_files = select_all(log_files) 47 | 48 | command <<< 49 | 50 | echo "~{sep="\n" valid_image_files}" > image_list.txt 51 | echo "~{sep="\n" valid_log_files}" > log_list.txt 52 | 53 | echo "

<h2>Summary Statistics</h2>" > output.txt 53 | echo "" >> output.txt 54 | echo "" >> output.txt 55 | echo "" >> output.txt 56 | echo "" >> output.txt 57 | echo "" >> output.txt 58 | echo "" >> output.txt 59 | echo "" >> output.txt 60 | echo "" >> output.txt 61 | echo "" >> output.txt 62 | echo "" >> output.txt 63 | echo "" >> output.txt 64 | echo "" >> output.txt 65 | percent=$(( ~{default=0 rna_duplicate_reads}*100/~{default=1 rna_feature_reads} )) 66 | echo "<table><tr><th colspan=2>ATAC</th></tr><tr><td>Total reads</td><td>" ~{atac_total_reads} "</td></tr><tr><td>Aligned uniquely</td><td>" ~{atac_aligned_uniquely} "</td></tr><tr><td>Unaligned</td><td>" ~{atac_unaligned} "</td></tr><tr><td>Unique Reads</td><td>" ~{atac_feature_reads} "</td></tr><tr><td>Duplicate Reads</td><td>" ~{atac_duplicate_reads} "</td></tr><tr><td>Percent Duplicates</td><td>" ~{atac_percent_duplicates} "</td></tr><tr><th colspan=2>RNA</th></tr><tr><td>Total reads</td><td>" ~{rna_total_reads} "</td></tr><tr><td>Aligned uniquely</td><td>" ~{rna_aligned_uniquely} "</td></tr><tr><td>Aligned multimap</td><td>" ~{rna_aligned_multimap} "</td></tr><tr><td>Unaligned</td><td>" ~{rna_unaligned} "</td></tr><tr><td>Filtered (feature) Reads</td><td>" ~{rna_feature_reads} "</td></tr><tr><td>Duplicate Reads</td><td>" ~{rna_duplicate_reads} "</td></tr><tr><td>Percent Duplicates</td><td>" $percent "</td></tr></table>
" >> output.txt 67 | PYTHONIOENCODING=utf-8 python3 /software/write_html.py ~{output_file} image_list.txt log_list.txt --input_file_name output.txt 68 | >>> 69 | output { 70 | File html_report_file = "~{output_file}" 71 | } 72 | 73 | runtime { 74 | docker: 'us.gcr.io/buenrostro-share-seq/share_task_html_report:v1.0.0' 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/python/filter_mito_reads.py: -------------------------------------------------------------------------------- 1 | # From Kundaje lab 2 | # https://github.com/kundajelab/ENCODE_scatac/blob/master/workflow/scripts/filter_mito.py 3 | # 4 | # Modified by: Eugenio Mattei 5 | # Affiliation: The Broad InstituteOf MIT and Harvard 6 | # 7 | # Changelog: 8 | # 2023/01/20: Now it returns the statistics per barcode 9 | # 10 | 11 | import argparse 12 | import pysam 13 | from collections import defaultdict 14 | 15 | 16 | 17 | def filter_mito(in_path, out_path, barcode_tag, cutoff, prefix, threads=1): 18 | """ 19 | Removes mitochondrial alignments from BAM 20 | Calculates number of mapped mitochondrial and non-mitochondrial reads (not alignments) 21 | Assumes mitochondrial chromosome is "chrM" 22 | """ 23 | 24 | infile = pysam.AlignmentFile(in_path, "rb", threads=threads) 25 | outfile = pysam.AlignmentFile(out_path, "wb", template=infile, threads=threads) 26 | outfile_bulk_metrics = f"{prefix}.mito.bulk-metrics.tsv" 27 | outfile_barcode_metrics = f"{prefix}.mito.bc-metrics.tsv" 28 | 29 | number_mito = 0 30 | number_non_mito = 0 31 | 32 | # Initializing the dictionary setting the counts for non-mito and mito. 33 | barcode_metrics = defaultdict(lambda: [0,0]) 34 | 35 | for read in infile.fetch(until_eof=True,multiple_iterators=True): 36 | if read.reference_name == "chrM": 37 | if read.flag & 260 == 0: # Alignment is mapped and is primary 38 | number_mito += 1 39 | barcode_metrics[read.get_tag(barcode_tag)][1] += 1 40 | 41 | else: 42 | if read.flag & 260 == 0: 43 | number_non_mito += 1 44 | barcode_metrics[read.get_tag(barcode_tag)][0] += 1 45 | #outfile.write(read) 46 | 47 | # Write the summary metrics 48 | with open(outfile_bulk_metrics, "w") as fh: 49 | print("raw_reads_nonmito\traw_reads_mito", file = fh) 50 | print(f"{number_non_mito}\t{number_mito}", file = fh) 51 | 52 | # Write the metrics per barcode 53 | with open(outfile_barcode_metrics, "w") as fh: 54 | # Print header 55 | print("barcode\traw_reads_nonmito\traw_reads_mito", file = fh) 56 | for barcode,counts in barcode_metrics.items(): 57 | print(f"{barcode}\t{counts[0]}\t{counts[1]}", file = fh) 58 | 59 | # Write a filtered bam 60 | for read in infile: 61 | if read.flag & 260 == 0 and read.reference_name != "chrM" and barcode_metrics[read.get_tag(barcode_tag)][0] > cutoff*2: 62 | outfile.write(read) 63 | 64 | outfile.close() 65 | return 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | 71 | msg = "Add the description" 72 | parser = argparse.ArgumentParser(description = msg) 73 | 74 | # Adding optional argument 75 | parser.add_argument("bam", help = "Path to the coordinate-sorted bam file.") 76 | parser.add_argument("-o", "--output", help = "Path to the mitochondrial-free bam file.") 77 | parser.add_argument("-p", help = "Number of threads to use.", type=int, default=1) 78 | parser.add_argument("--prefix", help = "Prefix for the metrics output file.") 79 | parser.add_argument("--cutoff", help = "Remove barcodes with a number of fragments less than the cutoff.", type=int, default=1) 80 | parser.add_argument("--bc_tag", help = 
"Specify the tag containing the cell barcode.", default="CB") 81 | 82 | # Read arguments from command line 83 | args = parser.parse_args() 84 | 85 | if args.prefix: 86 | prefix = args.prefix 87 | else: 88 | prefix = args.bam[:-4] 89 | 90 | if args.output: 91 | out_path = args.output 92 | else: 93 | out_path = f"{prefix}.no_mito.bam" 94 | 95 | bc_tag = args.bc_tag 96 | 97 | filter_mito(args.bam, out_path, bc_tag, args.cutoff, prefix, threads=args.p) 98 | -------------------------------------------------------------------------------- /src/R/cell_annotation_helper_functions.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ## --------------------------- 4 | ## Helper functions for cell annotation 5 | ## Author: Zhijian Li 6 | ## Date Created: 2023-05-29 7 | ## Email: lzj1769@gmail.com 8 | ## --------------------------- 9 | library(reticulate) 10 | use_python("/usr/bin/python3") 11 | 12 | read_h5ad <- function( 13 | filename, 14 | backed = NULL 15 | ) { 16 | python_anndata <- reticulate::import("anndata", convert = FALSE) 17 | filename <- normalizePath(filename, mustWork = FALSE) 18 | py_to_r_ifneedbe(python_anndata$read_h5ad( 19 | filename = filename, 20 | backed = backed 21 | )) 22 | } 23 | 24 | py_to_r_ifneedbe <- function(x) { 25 | if (inherits(x, "python.builtin.object")) { 26 | py_to_r(x) 27 | } else { 28 | x 29 | } 30 | } 31 | 32 | #' @name r-py-conversion 33 | #' @export 34 | py_to_r.pandas.core.indexes.base.Index <- function(x) { 35 | python_builtins <- reticulate::import_builtins() 36 | out <- python_builtins$list(x) 37 | attr(out, "name") <- py_to_r_ifneedbe(x$name) 38 | out 39 | } 40 | 41 | #' Convert between Python and R objects 42 | #' 43 | #' @param x A Python object. 44 | #' @param name A name 45 | #' @param value A value 46 | #' 47 | #' @return An \R object, as converted from the Python object. 
48 | #' 49 | #' @name r-py-conversion 50 | #' @export 51 | `[[<-.collections.abc.MutableMapping` <- function(x, name, value) { 52 | if (!is.null(value)) { 53 | reticulate::py_set_item(x, name, value) 54 | } else if (name %in% x$keys()) { 55 | reticulate::py_del_item(x, name) 56 | } 57 | } 58 | 59 | #' @name r-py-conversion 60 | #' @export 61 | `[[.collections.abc.Mapping` <- function(x, name) { 62 | if (name %in% x$keys()) { 63 | py_to_r_ifneedbe(reticulate::py_get_item(x, name)) 64 | } else { 65 | NULL 66 | } 67 | } 68 | 69 | #' @name r-py-conversion 70 | #' @export 71 | `[<-.collections.abc.MutableMapping` <- `[[<-.collections.abc.MutableMapping` 72 | # 73 | #' @name r-py-conversion 74 | #' @export 75 | `[.collections.abc.Mapping` <- `[[.collections.abc.Mapping` 76 | # 77 | #' @name r-py-conversion 78 | #' @export 79 | `names.collections.abc.Mapping` <- function(x) { 80 | python_builtins <- reticulate::import_builtins() 81 | python_builtins$list(x$keys()) 82 | } 83 | 84 | #' @name r-py-conversion 85 | #' @export 86 | `py_to_r.collections.abc.Set` <- function(x) { 87 | python_builtins <- reticulate::import_builtins() 88 | python_builtins$list(x) 89 | } 90 | 91 | #' @name r-py-conversion 92 | #' @export 93 | py_to_r.pandas.core.indexes.base.Index <- function(x) { 94 | python_builtins <- reticulate::import_builtins() 95 | out <- python_builtins$list(x) 96 | attr(out, "name") <- py_to_r_ifneedbe(x$name) 97 | out 98 | } 99 | 100 | #' @name r-py-conversion 101 | #' @export 102 | py_to_r.collections.abc.KeysView <- function(x) { 103 | python_builtins <- reticulate::import_builtins() 104 | python_builtins$list(x) 105 | } 106 | 107 | #' @name r-py-conversion 108 | #' @export 109 | `py_to_r.collections.abc.Mapping` <- function(x) { 110 | python_builtins <- reticulate::import_builtins() 111 | 112 | x_list <- python_builtins$dict(x) 113 | 114 | # convert members of x_list if need be 115 | for (i in seq_along(x_list)) { 116 | if (inherits(x_list[[i]], "python.builtin.object")) { 117 | x_list[[i]] <- py_to_r_ifneedbe(x_list[[i]]) 118 | } 119 | } 120 | 121 | x_list 122 | } 123 | 124 | 125 | #' @importFrom Matrix sparseMatrix 126 | py_to_r.scipy.sparse.csc.csc_matrix <- function(x) { 127 | Matrix::sparseMatrix( 128 | i = as.integer(py_to_r_ifneedbe(x$indices))+1, 129 | p = as.integer(py_to_r_ifneedbe(x$indptr)), 130 | x = as.vector(py_to_r_ifneedbe(x$data)), 131 | dims = as.integer(dim(x)) 132 | ) 133 | } 134 | -------------------------------------------------------------------------------- /workflows/subwf-find-dorcs.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | 4 | # Import the tasks called by the pipeline 5 | import "../tasks/dorcs_task_find_dorcs.wdl" as find_dorcs 6 | 7 | workflow wf_dorcs { 8 | 9 | meta { 10 | version: 'v0.1' 11 | author: 'Siddarth Wekhande (swekhand@broadinstitute.org)' 12 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Sub-workflow to find DORCs from SHARE-seq data.' 13 | } 14 | 15 | input { 16 | File? rna_matrix 17 | File? atac_fragments 18 | File peak_file 19 | 20 | String genome 21 | Int n_cores = 4 22 | String save_plots_to_dir = "TRUE" 23 | String? 
output_filename 24 | 25 | Int minFeature_RNA = 200 26 | Int maxFeature_RNA = 2500 27 | Float percentMT_RNA = 5 28 | Int minCells_RNA = 3 29 | 30 | Int dorcGeneCutOff = 10 31 | Float fripCutOff = 0.3 32 | Float corrPVal = 0.05 33 | Int topNGene = 20 34 | Int windowPadSize = 50000 35 | 36 | Int numNearestNeighbor = 30 37 | Float numBackgroundPairs = 100000 38 | Float chunkSize = 50000 39 | 40 | String? prefix 41 | Int mem_gb = 64 42 | Int disk_gb = 100 43 | String? docker 44 | } 45 | 46 | File rna_matrix_ = select_first([rna_matrix]) 47 | File atac_fragments_ = select_first([atac_fragments]) 48 | 49 | if ( !defined(rna_matrix) || !defined(atac_fragments) ){ 50 | call raise_exception as missing_input { 51 | input: 52 | msg = "The genes-by-cell matrix or the dna fragments file are missing." 53 | } 54 | } 55 | 56 | call find_dorcs.find_dorcs as find_dorcs{ 57 | input: 58 | rna_matrix = rna_matrix_, 59 | atac_fragments = atac_fragments_, 60 | peak_file = peak_file, 61 | genome = genome, 62 | n_cores = n_cores, 63 | save_plots_to_dir = save_plots_to_dir, 64 | output_filename = output_filename, 65 | minFeature_RNA = minFeature_RNA, 66 | maxFeature_RNA = maxFeature_RNA, 67 | percentMT_RNA = percentMT_RNA, 68 | minCells_RNA = minCells_RNA, 69 | dorcGeneCutOff = dorcGeneCutOff, 70 | fripCutOff = fripCutOff, 71 | corrPVal = corrPVal, 72 | topNGene = topNGene, 73 | windowPadSize = windowPadSize, 74 | numNearestNeighbor = numNearestNeighbor, 75 | numBackgroundPairs = numBackgroundPairs, 76 | chunkSize = chunkSize, 77 | mem_gb = mem_gb, 78 | disk_gb = disk_gb, 79 | docker_image = docker, 80 | prefix = prefix 81 | } 82 | 83 | output { 84 | File dorcs_notebook_output = find_dorcs.notebook_output 85 | File dorcs_notebook_log = find_dorcs.notebook_log 86 | File? seurat_violin_plot = find_dorcs.seurat_violin_plot 87 | File? j_plot = find_dorcs.j_plot 88 | File? plots_zip = find_dorcs.plots_zip 89 | File? dorcs_genes_summary = find_dorcs.dorcs_genes_summary 90 | File? dorcs_regions_summary = find_dorcs.dorcs_regions_summary 91 | } 92 | 93 | } 94 | 95 | # Task to report errors to user. 96 | # From https://github.com/ENCODE-DCC/chip-seq-pipeline2/blob/master/chip.wdl 97 | task raise_exception { 98 | input { 99 | String msg 100 | Array[String]? 
vals 101 | } 102 | command { 103 | echo -e "\n* Error: ${msg}\n" >&2 104 | echo -e "* Vals: ${sep=',' vals}\n" >&2 105 | exit 2 106 | } 107 | output { 108 | String error_msg = '${msg}' 109 | } 110 | runtime { 111 | maxRetries : 0 112 | cpu : 1 113 | memory : '2 GB' 114 | time : 1 115 | disks : 'local-disk 10 SSD' 116 | docker : 'encodedcc/chip-seq-pipeline:v2.2.1' 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /dockerfiles/share_task_filter_atac.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for BROAD GRO share-seq-pipeline 3 | # Based on Debian slim 4 | ############################################################ 5 | 6 | FROM debian:buster-slim as builder 7 | 8 | ENV BEDTOOLS_VERSION v2.29.0 9 | ENV PICARD_VERSION 2.27.5 10 | ENV SAMTOOLS_VERSION 1.16 11 | ENV SAMBAMBA_VERSION 0.6.6 12 | 13 | # To prevent time zone prompt 14 | ENV DEBIAN_FRONTEND=noninteractive 15 | 16 | # Install softwares from apt repo 17 | RUN apt-get update && apt-get install -y \ 18 | autoconf \ 19 | automake \ 20 | build-essential \ 21 | git \ 22 | libcurl4-openssl-dev \ 23 | liblz4-dev \ 24 | liblzma-dev \ 25 | libncurses5-dev \ 26 | libncursesw5-dev \ 27 | libbz2-dev \ 28 | perl \ 29 | python \ 30 | unzip \ 31 | xz-utils \ 32 | wget \ 33 | zlib1g-dev &&\ 34 | rm -rf /var/lib/apt/lists/* 35 | 36 | # Make directory for all softwares 37 | RUN mkdir /software 38 | WORKDIR /software 39 | ENV PATH="/software:${PATH}" 40 | 41 | # Install bedtools 2.29.0 42 | RUN git clone --branch ${BEDTOOLS_VERSION} --single-branch https://github.com/arq5x/bedtools2.git && \ 43 | cd bedtools2 && make && make install && cd ../ && rm -rf bedtools2* 44 | 45 | # Install sambamba 0.6.6 46 | RUN wget https://github.com/lomereiter/sambamba/releases/download/v${SAMBAMBA_VERSION}/sambamba_v${SAMBAMBA_VERSION}_linux.tar.bz2 && \ 47 | tar -xvjf sambamba_v${SAMBAMBA_VERSION}_linux.tar.bz2 && \ 48 | mv sambamba_v${SAMBAMBA_VERSION} /usr/local/bin/sambamba && \ 49 | rm -rf sambamba_* 50 | 51 | # Install samtools 1.16 52 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \ 53 | cd htslib && git submodule update --init --recursive && autoreconf -i && make && make install && cd ../ && \ 54 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \ 55 | cd samtools && make && make install && cd ../ && rm -rf samtools* && rm -rf htslib* 56 | 57 | 58 | # Install Picard 2.20.7 59 | RUN wget https://github.com/broadinstitute/picard/releases/download/${PICARD_VERSION}/picard.jar && chmod +x picard.jar && mv picard.jar /usr/local/bin 60 | 61 | 62 | 63 | FROM debian:buster-slim 64 | 65 | LABEL maintainer = "Eugenio Mattei" 66 | LABEL software = "Share-seq pipeline" 67 | LABEL software.version="1.0.0" 68 | LABEL software.organization="Broad Institute of MIT and Harvard" 69 | LABEL software.version.is-production="Yes" 70 | LABEL software.task="filter" 71 | 72 | RUN apt-get update && apt-get install -y \ 73 | gcc \ 74 | libcurl4-openssl-dev \ 75 | libbz2-dev \ 76 | liblzma-dev \ 77 | python3 \ 78 | python3-dev \ 79 | python3-pip \ 80 | openjdk-11-jre \ 81 | zlib1g-dev &&\ 82 | rm -rf /var/lib/apt/lists/* 83 | 84 | # Install packages for python3 scripts 85 | RUN python3 -m pip install --upgrade pip 86 | RUN python3 -m pip install --no-cache-dir --ignore-installed pysam 87 | 88 | # Create 
and setup new user 89 | ENV USER=shareseq 90 | WORKDIR /home/$USER 91 | 92 | RUN groupadd -r $USER &&\ 93 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 94 | chown $USER:$USER /home/$USER 95 | 96 | # Add folder with software to the path 97 | ENV PATH="/software:${PATH}" 98 | 99 | # Copy the compiled software from the builder 100 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/ 101 | COPY --from=builder --chown=$USER:$USER /lib/x86_64-linux-gnu/* /lib/x86_64-linux-gnu/ 102 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 103 | COPY --chown=$USER:$USER src/python/filter_mito_reads.py /usr/local/bin 104 | COPY --chown=$USER:$USER src/python/bam_to_fragments.py /usr/local/bin 105 | COPY --chown=$USER:$USER src/python/assign_multimappers.py /usr/local/bin 106 | 107 | 108 | USER ${USER} 109 | -------------------------------------------------------------------------------- /dockerfiles/share_task_qc_atac.dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | # Dockerfile for BROAD GRO share-seq-pipeline 3 | # Based on Debian slim 4 | ############################################################ 5 | 6 | FROM debian:buster-slim as builder 7 | 8 | ENV SAMTOOLS_VERSION 1.9 9 | ENV BEDTOOLS_VERSION v2.29.0 10 | ENV PICARD_VERSION 2.27.5 11 | 12 | # To prevent time zone prompt 13 | ENV DEBIAN_FRONTEND=noninteractive 14 | 15 | # Install softwares from apt repo 16 | RUN apt-get update && apt-get install -y \ 17 | autoconf \ 18 | build-essential \ 19 | git \ 20 | libcurl4-openssl-dev \ 21 | liblz4-dev \ 22 | liblzma-dev \ 23 | libncurses5-dev \ 24 | libbz2-dev \ 25 | python \ 26 | unzip \ 27 | wget \ 28 | zlib1g-dev &&\ 29 | rm -rf /var/lib/apt/lists/* 30 | 31 | 32 | # Make directory for all softwares 33 | RUN mkdir /software 34 | WORKDIR /software 35 | ENV PATH="/software:${PATH}" 36 | 37 | # Install bedtools 2.29.0 38 | RUN git clone --branch ${BEDTOOLS_VERSION} --single-branch https://github.com/arq5x/bedtools2.git && \ 39 | cd bedtools2 && make && make install && cd ../ && rm -rf bedtools2* 40 | 41 | # Install samtools 1.9 42 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \ 43 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \ 44 | cd samtools && make && make install && cd ../ && rm -rf samtools* && \ 45 | cd htslib && autoreconf -i && make && make install && cd ../ && rm -rf htslib* 46 | 47 | # Install Picard 2.20.7 48 | RUN wget https://github.com/broadinstitute/picard/releases/download/${PICARD_VERSION}/picard.jar && chmod +x picard.jar && mv picard.jar /usr/local/bin 49 | 50 | 51 | 52 | FROM debian:buster-slim 53 | 54 | LABEL maintainer = "Eugenio Mattei" 55 | LABEL software = "Share-seq pipeline" 56 | LABEL software.version="1.0.0" 57 | LABEL software.organization="Broad Institute of MIT and Harvard" 58 | LABEL software.version.is-production="Yes" 59 | LABEL software.task="qc-atac" 60 | 61 | RUN apt-get update && apt-get install -y \ 62 | gcc \ 63 | git \ 64 | python3 \ 65 | python3-dev \ 66 | python3-pip \ 67 | openjdk-11-jre \ 68 | r-base \ 69 | zlib1g-dev &&\ 70 | rm -rf /var/lib/apt/lists/* 71 | 72 | # Install packages for python3 scripts (pysam, SAMstats) 73 | RUN python3 -m pip install --upgrade pip 74 | RUN python3 -m pip install --no-cache-dir --ignore-installed numpy matplotlib pandas 
plotnine pysam --editable=git+https://github.com/kundajelab/SAMstats@75e60f1e67c6d5d066371a0b53729e4b1f6f76c5#egg=SAMstats 75 | 76 | # Create and setup new user 77 | ENV USER=shareseq 78 | WORKDIR /home/$USER 79 | 80 | RUN groupadd -r $USER &&\ 81 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\ 82 | chown $USER:$USER /home/$USER 83 | 84 | # Add folder with software to the path 85 | ENV PATH="/software:${PATH}" 86 | 87 | # Copy the compiled software from the builder 88 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/ 89 | COPY --from=builder --chown=$USER:$USER /lib/x86_64-linux-gnu/* /lib/x86_64-linux-gnu/ 90 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 91 | COPY --chown=$USER:$USER src/python/pbc_stats.py /usr/local/bin 92 | COPY --chown=$USER:$USER src/python/qc_atac_compute_tss_enrichment.py /usr/local/bin 93 | COPY --chown=$USER:$USER src/python/qc_atac_count_duplicates_per_barcode.py /usr/local/bin 94 | COPY --chown=$USER:$USER src/python/qc_atac_compute_reads_in_peaks.py /usr/local/bin 95 | COPY --chown=$USER:$USER src/python/plot_insert_size_hist.py /usr/local/bin 96 | COPY --chown=$USER:$USER src/R/barcode_rank_functions.R /usr/local/bin 97 | COPY --chown=$USER:$USER src/R/atac_qc_plots.R /usr/local/bin 98 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin 99 | 100 | 101 | USER ${USER} 102 | 103 | 104 | -------------------------------------------------------------------------------- /tasks/dorcs_task_find_dorcs.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task find_dorcs { 4 | meta { 5 | version: 'v0.1' 6 | author: 'Siddarth Wekhande (swekhand@broadinstitute.org) at Broad Institute of MIT and Harvard' 7 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: find DORCs task' 8 | } 9 | 10 | input { 11 | #This task takes in the RNA and ATAC files and finds the DORCs based on the cut-off criteria provided 12 | 13 | #DORCs parameters 14 | File rna_matrix 15 | File atac_fragments 16 | File? peak_file 17 | String genome 18 | Int n_cores = 4 19 | String save_plots_to_dir = "TRUE" 20 | String prefix = "prefix" 21 | 22 | #RNA QC parameters 23 | Int minFeature_RNA = 200 24 | Int maxFeature_RNA = 2500 25 | Float percentMT_RNA = 5 26 | Int minCells_RNA = 3 27 | 28 | #ATAC QC parameter 29 | Float fripCutOff = 0.3 30 | Float chunkSize = 50000 31 | 32 | #Background correlation parameters 33 | Int numNearestNeighbor = 100 34 | Float numBackgroundPairs = 100000 35 | 36 | #DORC genes parameter 37 | # Regulatory region around TSS. 
Default is +/- 50Kb 38 | Int windowPadSize = 50000 39 | Int dorcGeneCutOff = 10 40 | Float corrPVal = 0.05 41 | Int topNGene = 20 42 | 43 | String output_filename = "${prefix}.dorcs.notebook.${genome}.ipynb" 44 | String docker_image = "us.gcr.io/buenrostro-share-seq/dorcs_task_find_dorcs:v1.0.0" 45 | #String docker_image = "swekhande/shareseq-prod:share-task-dorcs" 46 | Int mem_gb = 64 47 | Int disk_gb = 100 48 | } 49 | 50 | #Output filepaths 51 | 52 | String violin_plot = '${prefix}.dorcs.plots.${genome}/${prefix}.dorcs.rna_violin_plot.${genome}.png' 53 | String jplot = '${prefix}.dorcs.plots.${genome}/${prefix}.dorcs.jplot.${genome}.png' 54 | String dorc_genes_summ = '${prefix}.dorcs.dorc_genes_summary.${genome}.csv' 55 | String all_regions_summ = '${prefix}.dorcs.all_regions_summary.${genome}.csv' 56 | String plots_zip_dir = '${prefix}.dorcs.plots.${genome}.zip' 57 | #String papermill_log_filename = 'papermill.logfile.txt' 58 | String log_filename = "log/${prefix}.dorcs.logfile.${genome}.txt" 59 | 60 | command { 61 | gzip -dc ${atac_fragments} > tmp_fragments.bedpe 62 | 63 | papermill $(which dorcs_jplot_notebook.ipynb) ${output_filename} \ 64 | -p rnaCountMatrix ${rna_matrix} \ 65 | -p atacFragFile tmp_fragments.bedpe \ 66 | -p peakFile ${peak_file} \ 67 | -p savePlotsToDir ${save_plots_to_dir} \ 68 | -p nCores ${n_cores} \ 69 | -p genome ${genome} \ 70 | -p minFeature_RNA ${minFeature_RNA} \ 71 | -p maxFeature_RNA ${maxFeature_RNA} \ 72 | -p percentMT_RNA ${percentMT_RNA} \ 73 | -p minCells_RNA ${minCells_RNA} \ 74 | -p dorcGeneCutOff ${dorcGeneCutOff} \ 75 | -p fripCutOff ${fripCutOff} \ 76 | -p corrPVal ${corrPVal} \ 77 | -p topNGene ${topNGene} \ 78 | -p windowPadSize ${windowPadSize} \ 79 | -p numNearestNeighbor ${numNearestNeighbor} \ 80 | -p numBackgroundPairs ${numBackgroundPairs} \ 81 | -p chunkSize ${chunkSize} \ 82 | -p prefix ${prefix} 83 | } 84 | 85 | output { 86 | File notebook_output = output_filename 87 | File notebook_log = log_filename 88 | #File papermill_log = papermill_log_filename 89 | 90 | File? seurat_violin_plot = violin_plot 91 | File? j_plot = jplot 92 | File? plots_zip = plots_zip_dir 93 | 94 | File? dorcs_genes_summary = dorc_genes_summ 95 | File? dorcs_regions_summary = all_regions_summ 96 | 97 | 98 | } 99 | 100 | runtime { 101 | cpu : 4 102 | memory : mem_gb+'G' 103 | docker : docker_image 104 | disks : 'local-disk ${disk_gb} LOCAL' 105 | maxRetries : 0 106 | } 107 | } 108 | 109 | 110 | -------------------------------------------------------------------------------- /tasks/share_task_star.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # TASK 4 | # SHARE-atac-STAR 5 | 6 | task share_rna_align { 7 | meta { 8 | version: 'v0.1' 9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard' 10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: align RNA task' 11 | } 12 | 13 | input { 14 | # This function takes in input the pre-processed fastq and align it to the genome 15 | # using STAR. 16 | 17 | Array[File] fastq_R1 18 | Array[File]? fastq_R2 19 | File? genome_index_tar 20 | String genome_name 21 | String? 
prefix 22 | String docker_image = "docker.io/nchernia/share_task_star:1" 23 | Int cpus = 16 24 | } 25 | #Float input_file_size_gb = size(input[0], "G") 26 | Int samtools_cpus = 6 27 | Int samtools_mem_gb = 8 28 | Int mem_gb = 64 29 | Int disk_gb = 850 30 | #Int disk_gb = round(20.0 + 4 * input_file_size_gb) 31 | 32 | # Define the output names 33 | String sorted_bam = "${default="share-seq" prefix}.rna.align.${genome_name}.sorted.bam" 34 | String sorted_bai = "${default="share-seq" prefix}.rna.align.${genome_name}.sorted.bam.bai" 35 | String alignment_log = "${default="share-seq" prefix}.rna.align.${genome_name}.log" 36 | 37 | command { 38 | set -e 39 | # Untar the genome 40 | tar xvzf ${genome_index_tar} --no-same-owner -C ./ 41 | 42 | mkdir out 43 | 44 | $(which STAR) \ 45 | --runThreadN ${cpus} \ 46 | --chimOutType WithinBAM \ 47 | --genomeDir ./ \ 48 | --readFilesIn ${sep=',' fastq_R1} ${sep=',' fastq_R2} \ 49 | --outFileNamePrefix out/${default="share-seq" prefix}.rna.align.${genome_name}. \ 50 | --outFilterMultimapNmax 20 \ 51 | --outFilterScoreMinOverLread 0.3 \ 52 | --outFilterMatchNminOverLread 0.3 \ 53 | --outSAMattributes NH HI AS nM MD \ 54 | --limitOutSJcollapsed 2000000 \ 55 | --outSAMtype BAM Unsorted \ 56 | --limitIObufferSize 400000000 400000000 \ 57 | --outReadsUnmapped Fastx \ 58 | --readFilesCommand zcat 59 | 60 | $(which samtools) sort \ 61 | -@ ${samtools_cpus} \ 62 | -m ${samtools_mem_gb}G \ 63 | -o out/${sorted_bam} \ 64 | out/${default="share-seq" prefix}.rna.align.${genome_name}.Aligned.out.bam 65 | 66 | $(which samtools) index \ 67 | -@ ${cpus} \ 68 | out/${sorted_bam} 69 | } 70 | 71 | output { 72 | File rna_alignment = "out/${sorted_bam}" 73 | File rna_alignment_index = "out/${sorted_bai}" 74 | File rna_alignment_log = glob('out/*.Log.final.out')[0] 75 | } 76 | 77 | runtime { 78 | cpu : cpus 79 | memory : mem_gb+'G' 80 | disks : 'local-disk ${disk_gb} SSD' 81 | maxRetries: 0 82 | docker: docker_image 83 | } 84 | 85 | parameter_meta { 86 | fastq_R1: { 87 | description: 'Read1 fastq', 88 | help: 'Processed fastq for read1.', 89 | example: 'processed.atac.R1.fq.gz' 90 | } 91 | genome_index_tar: { 92 | description: 'STAR indexes', 93 | help: 'Index files for STAR to use during alignment in tar.gz.', 94 | example: [''] 95 | } 96 | genome_name: { 97 | description: 'Reference name', 98 | help: 'The name of the reference genome used by the aligner.', 99 | example: ['hg38', 'mm10', 'both'] 100 | } 101 | prefix: { 102 | description: 'Prefix for output files', 103 | help: 'Prefix that will be used to name the output files', 104 | example: 'MyExperiment' 105 | } 106 | cpus: { 107 | description: 'Number of cpus', 108 | help: 'Set the number of cpus useb by bowtie2', 109 | example: '4' 110 | } 111 | docker_image: { 112 | description: 'Docker image.', 113 | help: 'Docker image for preprocessing step. 
Dependencies: STAR',
114 |             example: ['put link to gcr or dockerhub']
115 |         }
116 |     }
117 | }
118 | 
--------------------------------------------------------------------------------
/src/python/qc_atac_compute_reads_in_peaks.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | # Author: Eugenio Mattei, Broad Institute of MIT and Harvard
4 | # modified from Jason Buenrostro's tool
5 | 
6 | import argparse
7 | import os
8 | import pysam
9 | from collections import Counter
10 | from collections import defaultdict
11 | import numpy as np
12 | 
13 | #import os
14 | #import sys
15 | import matplotlib
16 | matplotlib.use('Agg')
17 | import matplotlib.pyplot as plt
18 | #from multiprocessing import Pool
19 | 
20 | 
21 | ##### DEFINE FUNCTIONS #####
22 | def count_fragments_in_peaks(tabix_filename,
23 |                              peaks_list,
24 |                              mapq_threshold = 30):
25 |     """
26 |     This function counts the per-barcode number of reads in the peak regions.
27 | 
28 |     Parameters
29 |     ----------
30 |     tabix_filename : str
31 |         Path to the tabix file containing the fragments.
32 |         File needs to be coordinate-sorted and indexed.
33 |     peaks_list : array
34 |         Array containing the list of peaks to be included.
35 |         Each member of the array contains the following four elements:
36 |         Chr, Start, End, Strand
37 |     mapq_threshold : int
38 |         Keep only the reads with a MAPQ score greater than or equal to this value.
39 |         (Not currently applied: fragment files carry no mapping quality.)
40 |         default: 30
41 | 
42 | 
43 |     Returns
44 |     -------
45 | 
46 |     Two dictionaries (reads in peaks, fragments in peaks)
47 |         Key: Barcode
48 |         Value: Set of read-end / fragment identifiers falling in peaks.
49 |     """
50 |     # To count the number of fragments in peaks
51 |     reads_in_peaks_counter = defaultdict(set)
52 |     fragments_in_peaks_counter = defaultdict(set)
53 | 
54 |     tabixfile = pysam.TabixFile(tabix_filename)
55 | 
56 |     for peak in peaks_list:
57 |         peak_chr = str(peak[0])
58 |         peak_start = int(peak[1])
59 |         peak_end = int(peak[2])
60 | 
61 |         # Find all the fragments overlapping the peak.
62 |         for fragment in tabixfile.fetch(peak_chr, peak_start, peak_end):
63 |             fragment_fields = fragment.split("\t")
64 | 
65 |             fragment_contig = fragment_fields[0]
66 |             fragment_start = int(fragment_fields[1])
67 |             fragment_end = int(fragment_fields[2])
68 |             barcode = fragment_fields[3]
69 | 
70 |             fragment_id = "-".join(fragment_fields)
71 |             fragments_in_peaks_counter[barcode].add(fragment_id)
72 | 
73 |             # Increment the counter for the specific barcode.
74 |             if fragment_start >= peak_start and fragment_start <= peak_end-1:
75 |                 reads_in_peaks_counter[barcode].add(fragment_id+"start")
76 | 
77 |             if fragment_end >= peak_start and fragment_end <= peak_end-1:
78 |                 reads_in_peaks_counter[barcode].add(fragment_id+"end")
79 | 
80 |     return reads_in_peaks_counter, fragments_in_peaks_counter
81 | 
82 | 
83 | if __name__ == '__main__':
84 | 
85 |     #args = _parse_sanitize_cmdline_arguments()
86 | 
87 |     msg = "Count per-barcode reads and fragments overlapping a set of peaks."
88 |     parser = argparse.ArgumentParser(description = msg)
89 | 
90 |     # Adding optional argument
91 |     parser.add_argument("tabix", help= "Fragments file in tabix format and indexed.")
92 |     parser.add_argument("--prefix", help = "Prefix for the metrics output file.")
93 |     parser.add_argument("--peaks", help= "Peaks bed file")
94 | 
95 |     # Read arguments from command line
96 |     args = parser.parse_args()
97 | 
98 |     if args.prefix:
99 |         prefix = args.prefix
100 |     else:
101 |         prefix = args.tabix[:-4] # derive the prefix from the fragments file name
102 | 
103 |     # This step is fast enough that parallel processing is not needed.
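    # Illustrative sketch of the expected inputs (an assumption based on how the
    # columns are used above, not a format specification):
    #   peaks BED (only the first three columns are read):
    #       chr1    1000    1500    +
    #   tabix-indexed fragment file (chrom, start, end, barcode, ...):
    #       chr1    1020    1480    AAACGGCG
    #   Both ends of that fragment fall inside the peak, so barcode AAACGGCG gains
    #   two entries in reads_in_peaks and one entry in fragments_in_peaks.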
104 | #cpus = len(os.sched_getaffinity(0))/2 105 | # Using column chr, start, end and what user input contains the strand information. 106 | peaks_list = np.loadtxt(args.peaks, 'str', usecols = (0,1,2)) 107 | 108 | reads_in_peaks, fragments_in_peaks = count_fragments_in_peaks(args.tabix, 109 | peaks_list 110 | ) 111 | output_fnp = f"{prefix}.reads.in.peak.tsv" 112 | 113 | with open(output_fnp,"w") as out_file: 114 | print(f"barcode\treads_peaks\tfragment_peaks", file=out_file) 115 | for barcode,fragments_in_peak in fragments_in_peaks.items(): 116 | print(f"{barcode}\t{len(reads_in_peaks[barcode])}\t{len(fragments_in_peak)}", file=out_file) 117 | -------------------------------------------------------------------------------- /src/python/infer_barcodes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This script is used to infer molecular barcodes 4 | # from raw sequencing BCL data. 5 | # 6 | # It requires running Picard ExtractIlluminaBarcodes with BARCODE=N, 7 | # to extract all barcodes into *_barcode.txt.gz files first. 8 | 9 | import glob 10 | import gzip 11 | import sys 12 | 13 | [ 14 | _name, 15 | multiplex_params_file, 16 | candidate_molecular_barcodes_file, 17 | barcode_matches_file, 18 | ] = sys.argv 19 | 20 | YIELD_THRESHOLD = 0.1 21 | MIN_READ_COUNT = 1e6 22 | 23 | 24 | def parse_barcodes(file_path): 25 | with open(file_path) as f: 26 | barcodes = {} 27 | for row in f.readlines(): 28 | row = row.strip().split('\t') 29 | copa = row[0] 30 | barcode = ''.join(row[1:]) 31 | barcodes[barcode] = copa 32 | return barcodes 33 | 34 | 35 | copa_barcodes = parse_barcodes(multiplex_params_file) 36 | molecular_barcodes = parse_barcodes(candidate_molecular_barcodes_file) 37 | 38 | # count each unique barcode combination 39 | counts = {} 40 | for extracted in glob.glob('*_barcode.txt.gz'): 41 | with gzip.open(extracted, 'rt') as f: 42 | for row in f.readlines(): 43 | barcode = row.split('\t')[0] 44 | if barcode in counts: 45 | counts[barcode] += 1 46 | else: 47 | counts[barcode] = 1 48 | 49 | # add any missing barcodes from the list of CoPAs 50 | for barcode in copa_barcodes.keys(): 51 | if barcode not in counts: 52 | counts[barcode] = 0 53 | 54 | 55 | def distance(b1, b2): 56 | return sum(c1 != c2 for c1, c2 in zip(b1, b2)) 57 | 58 | 59 | COPA_UNDEFINED = 'UNDEFINED' 60 | 61 | # match barcodes to candidates 62 | results = {} 63 | molecular_barcode_len = len(next(iter(molecular_barcodes))) 64 | for barcode, count in counts.items(): 65 | molecular_barcode_matched = False 66 | molecular_barcode_match = barcode[:molecular_barcode_len] 67 | molecular_barcode_match_name = molecular_barcode_match 68 | 69 | if molecular_barcode_match in molecular_barcodes: 70 | molecular_barcode_matched = True 71 | molecular_barcode_match_name = molecular_barcodes[molecular_barcode_match] 72 | 73 | barcode_match = molecular_barcode_match 74 | copa = copa_barcodes[barcode_match] if barcode_match in copa_barcodes else COPA_UNDEFINED 75 | if barcode_match in results: 76 | results[barcode_match]['Count'] += count 77 | else: 78 | results[barcode_match] = { 79 | 'CoPA': copa, 80 | 'Molecular Barcode': molecular_barcode_match_name, 81 | 'Count': count, 82 | 'Matched': molecular_barcode_matched 83 | } 84 | 85 | # show barcodes that correspond to a CoPA or have a matched 86 | # barcode at the top of the output file, otherwise 87 | # sort by count 88 | results = sorted( 89 | results.values(), 90 | key=lambda r: (r['CoPA'], int(not r['Matched']), 
-r['Count']) 91 | ) 92 | 93 | # calculate % of average yield 94 | total_yield = 0 95 | copa_count = 0 96 | for r in results: 97 | if r['CoPA'] != COPA_UNDEFINED: 98 | total_yield += r['Count'] 99 | copa_count += 1 100 | avg_yield = total_yield / copa_count if copa_count else None 101 | for r in results: 102 | percent_avg_yield = '' 103 | if r['CoPA'] != COPA_UNDEFINED: 104 | percent_avg_yield = '{:.2f}%'.format( 105 | 100 * r['Count'] / avg_yield) if avg_yield else 0 106 | r['Percent of average'] = percent_avg_yield 107 | 108 | # report results as a TSV 109 | with open(barcode_matches_file, 'w') as f: 110 | header = ( 111 | 'CoPA', 'Molecular Barcode', 112 | 'Count', 'Percent of average', 113 | ) 114 | print('\t'.join(header), file=f) 115 | 116 | # print CoPA matches, barcode matches, and the top barcodes 117 | # with the highest read count 118 | for r in results: 119 | if ( 120 | r['CoPA'] != COPA_UNDEFINED or 121 | r['Matched'] or 122 | r['Count'] >= MIN_READ_COUNT 123 | ): 124 | print('\t'.join((str(r[col]) for col in header)), file=f) 125 | 126 | # fail the task (and the workflow) for low yield 127 | if not avg_yield: 128 | raise Exception('None of the candidate barcodes matched any CoPAs!') 129 | failed_copas = [] 130 | for r in results: 131 | if (r['CoPA'] != COPA_UNDEFINED and 132 | float(r['Percent of average'].replace('%', '')) < YIELD_THRESHOLD): 133 | failed_copas.append(r['CoPA']) 134 | failed_copas = ', '.join(failed_copas) 135 | if failed_copas: 136 | raise Exception( 137 | f'Found CoPA(s) with < {YIELD_THRESHOLD}% yield: {failed_copas}') -------------------------------------------------------------------------------- /tasks/share_task_cell_annotation.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task cell_annotation { 4 | meta { 5 | version: 'v0.1' 6 | author: 'Zhijian Li' 7 | affiliation: 'Broad Institute of MIT and Harvard' 8 | email: 'lizhijia@broadinstitute.org' 9 | description: 'SHARE-Seq pipeline: cell type annotation using RNA-seq data.' 10 | } 11 | 12 | input { 13 | # Sample or project name 14 | String? prefix = "prefix" 15 | 16 | # Reference genome 17 | String genome 18 | 19 | # Reference data name and id 20 | String reference_data_id 21 | String reference_data_name 22 | String reference_label 23 | 24 | # Query data 25 | File query_data 26 | 27 | String? gene_id_to_symbol 28 | 29 | # Docker image 30 | String? docker_image 31 | 32 | # Runtime parameter 33 | Float? memory_factor 34 | Float? disk_factor 35 | } 36 | 37 | # Determine the size of the input 38 | Float input_file_size_mb = size(query_data, "G") 39 | 40 | # Determining memory size base on the size of the input files. 41 | Float mem_gb = 64.0 + memory_factor * input_file_size_mb 42 | 43 | # Determining disk size base on the size of the input files. 44 | Int disk_gb = round(disk_factor * input_file_size_mb) 45 | 46 | # Determining disk type base on the size of disk. 
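    # (Assumption: "LOCAL" maps to a Google Cloud local SSD, which is provisioned in
    # 375 GB units, so requests larger than that fall back to a persistent "SSD" disk.)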
47 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL" 48 | 49 | #Output files 50 | String reference_h5ad = "${reference_data_name}.h5ad" 51 | String monitor_log = "cell_annotation_monitor.log" 52 | String notebook_log = "log/${prefix}.cell.annotation.logfile.${genome}.txt" 53 | String prediction = "${prefix}.cell.annotation.prediction.${genome}.csv" 54 | String prediction_labels = "${prefix}.cell.annotation.labels.${genome}.png" 55 | String prediction_scores = "${prefix}.cell.annotation.scores.${genome}.pdf" 56 | 57 | command { 58 | set -e 59 | 60 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 & 61 | 62 | # Download data from cellxgene 63 | python3 $(which get_cellxgene_data.py) \ 64 | --id ${reference_data_id} \ 65 | --out ${reference_data_name} 66 | 67 | 68 | # Perform cell annotation 69 | Rscript $(which cell_annotation.R) \ 70 | --prefix ${prefix} \ 71 | --reference_data_name ${reference_data_name} \ 72 | --reference_label ${reference_label} \ 73 | --query_data ${query_data} \ 74 | --genome ${genome} \ 75 | --gene_id_to_symbol ${gene_id_to_symbol} 76 | 77 | } 78 | 79 | output { 80 | File reference_h5ad = "${reference_h5ad}" 81 | File monitor_log = "${monitor_log}" 82 | File notebook_log = "${notebook_log}" 83 | File prediction = '${prediction}' 84 | File prediction_labels = '${prediction_labels}' 85 | File prediction_scores = '${prediction_scores}' 86 | } 87 | 88 | runtime { 89 | memory : "${mem_gb} GB" 90 | memory_retry_multiplier: 2 91 | disks: "local-disk ${disk_gb} ${disk_type}" 92 | docker : "${docker_image}" 93 | maxRetries:1 94 | } 95 | 96 | parameter_meta { 97 | reference_data_id: { 98 | description: 'Reference dataset id', 99 | help: 'The dataset id from cellxgene server.', 100 | examples: ['3bbb6cf9-72b9-41be-b568-656de6eb18b5'] 101 | } 102 | 103 | reference_data_name: { 104 | description: 'Reference data', 105 | help: 'This file will be used as reference', 106 | examples: ['reference.h5ad'] 107 | } 108 | 109 | query_data: { 110 | description: 'Query data', 111 | help: 'scRNA-seq data used as query', 112 | examples: ['put link to gcr'] 113 | } 114 | 115 | genome: { 116 | description: 'Reference name', 117 | help: 'Reference genome.', 118 | examples: ['hg38', 'mm10', 'hg19', 'mm9'] 119 | } 120 | 121 | prefix: { 122 | description: 'Project name', 123 | help: 'String used to name your project and associated file names', 124 | example: "shareseq" 125 | } 126 | 127 | docker_image: { 128 | description: 'Docker image.', 129 | help: 'Docker image for preprocessing step.', 130 | example: ['put link to gcr or dockerhub'] 131 | } 132 | 133 | disk_factor: { 134 | description: 'Disk factor', 135 | help: 'Multiply this value to input .h5 file size (MB) to determine disk space (GB)', 136 | example: 16.0 137 | } 138 | 139 | memory_factor: { 140 | description: 'Memory factor', 141 | help: 'Multiply this value to input .h5 file size (MB) and add to default 32GB memory to determine RAM (GB)', 142 | example: 1.0 143 | } 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/R/barcode_rank_functions.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ## Define functions needed for plotting barcode rank 4 | 5 | # Helper function to get vectors on which to call the elbow_knee_finder. 6 | # Takes in xy values of the curve, outputs appropriate xy vectors to be passed to elbow_knee_finder. 
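# (Returns either list(x, y) unchanged, or list(x, y) sliced to the window that
# brackets the dip in the second derivative; the details are described below.)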
7 | # 8 | # Function computes the second derivative of the curve, and uses the shape of the second 9 | # derivative curve to determine whether the curve has multiple "joints" (i.e. if knee should be found). 10 | # If the second derivative is uniformly positive or uniformly negative, the curve has a single "joint", 11 | # and so elbow_knee_finder can be called on the original input vectors. 12 | # Otherwise (multiple "joints"), find the zeroes of the second derivative to the left and right of the 13 | # absolute minimum of the second derivative. 14 | # These will be the endpoints of the elbow_knee_finder, so return the slices of the xy vectors 15 | # between these zeroes. 16 | get_vectors <- function(x, y){ 17 | smooth_spline <- smooth.spline(x, y, spar=1) 18 | second_deriv <- predict(smooth_spline, x, deriv=2) 19 | 20 | # Second derivative values can be noisy at beginning and end of graph; exclude first 10% and last 10% 21 | # of values when establishing uniformity of second derivative sign 22 | ten_percent <- round(length(second_deriv$x)*0.1) 23 | mid_second_deriv <- second_deriv$y[(ten_percent+1):(length(second_deriv$y)-ten_percent)] 24 | 25 | if (all(mid_second_deriv >= 0) | all(mid_second_deriv <= 0)){ 26 | print("Returning original vectors") 27 | return(list(x,y)) } 28 | else { 29 | # Find absolute minimum 30 | abs_min_idx <- second_deriv$x[which.min(second_deriv$y)] 31 | # Find last non-negative value before absolute minimum 32 | left_vect <- second_deriv$y[0:abs_min_idx] 33 | endpt_1_idx <- tail(which(left_vect >= 0), n=1) 34 | # Find first non-positive value after absolute minimum 35 | right_vect <- second_deriv$y[abs_min_idx:length(second_deriv$y)] 36 | endpt_2_idx <- abs_min_idx + which(right_vect >= 0)[1] - 1 37 | 38 | # Error cases: revert to elbow finder 39 | # Used when second derivative curve has both positive and negative values, 40 | # but doesn't match positive-negative-positive shape expected of a knee's second derivative 41 | if (length(endpt_1_idx)==0 | length(endpt_2_idx)==0){ 42 | print("Returning original vectors") 43 | return(list(x,y)) 44 | } else if (is.na(endpt_1_idx) | is.na(endpt_2_idx)){ 45 | print("Returning original vectors") 46 | return(list(x,y)) 47 | } else { 48 | print("Returning sliced vectors") 49 | return(list(x[endpt_1_idx:endpt_2_idx], y[endpt_1_idx:endpt_2_idx])) 50 | } 51 | } 52 | } 53 | 54 | # Function to find the elbow or knee of a plot. 55 | # Takes in set of xy coordinates of the plot and mode, returns point which is farthest 56 | # from the line formed by the endpoints. 57 | # Basic mode (default) is used when the plot is known to have only one "joint", 58 | # whereas advanced mode is used when it is not known whether the function needs to find an 59 | # elbow or a knee. 
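# Illustrative usage (not part of the pipeline): for a log-scaled barcode-rank curve,
#   umis <- sort(barcode_metadata$umis, decreasing = TRUE)
#   pt <- elbow_knee_finder(x = seq_along(umis), y = log10(umis), mode = "basic")
# 'pt' is c(x, y) of the point farthest from the straight line joining the curve's
# two endpoints, or NULL if the input vectors are empty.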
60 | elbow_knee_finder <- function(x, y, mode="basic") { 61 | # With advanced mode, use helper function to determine which vectors to perform calculation on 62 | if (mode == "advanced") { 63 | # smooth.spline() function used in get_vectors() requires at least 4 unique 64 | # x values; preempt this error 65 | if (length(unique(x)) < 4) { 66 | return(NULL) 67 | } else { 68 | xy_vects <- get_vectors(x, y) 69 | x <- xy_vects[[1]] 70 | y <- xy_vects[[2]] 71 | } 72 | } 73 | # Error case: return null if vectors have length 0 74 | if (length(x)==0 | length(y)==0) { 75 | return(NULL) 76 | } 77 | # Get endpoints (point with smallest x value, point with largest x value) 78 | endpts_df <- data.frame(x_coords=c(x[1], x[length(x)]), 79 | y_coords=c(y[1], y[length(y)])) 80 | # Fit line between endpoints 81 | fit <- lm(endpts_df$y_coords ~ endpts_df$x_coords) 82 | # For each point, get distance from line 83 | distances <- numeric(length(x)) 84 | for(i in 1:length(x)) { 85 | distances[i] <- abs(coef(fit)[2]*x[i] - y[i] + coef(fit)[1]) / sqrt(coef(fit)[2]^2 + 1^2) 86 | } 87 | 88 | # Get point farthest from line 89 | x_max_dist <- x[which.max(distances)] 90 | y_max_dist <- y[which.max(distances)] 91 | 92 | return(c(x_max_dist, y_max_dist)) 93 | } 94 | 95 | # Function to find the elbow/knee of a plot, and the elbow/knee of the points 96 | # before the first elbow/knee (i.e. elbow/knee of all barcodes, and elbow/knee 97 | # of top-ranked barcodes). 98 | # Takes in xy coordinates of the plot and returns vector of four coordinates: 99 | # xy coordinates of first elbow/knee, and xy coordinates of second elbow/knee. 100 | get_elbow_knee_points <- function(x, y) { 101 | point_1 <- elbow_knee_finder(x, y, mode="basic") 102 | if (!is.null(point_1)) { 103 | point_2 <- elbow_knee_finder(x[1:point_1[1]], y[1:point_1[1]], mode="advanced") 104 | } 105 | return(c(point_1, point_2)) 106 | } 107 | -------------------------------------------------------------------------------- /src/python/generate_h5_rna.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf8 3 | 4 | """ 5 | This script takes in the STARsolo barcodes tsv file, features tsv file, 6 | and raw count matrix mtx file, and generates an h5 file containing the 7 | genes x barcodes count matrix. 
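
The output h5 file contains a single group named "group" holding the datasets
expected by the downstream Seurat import: "barcodes", "data", "gene_names",
"genes", "indices", "indptr", and "shape" (a compressed sparse column layout).
Note that .mtx entries are 1-indexed; indices are shifted to zero-based before
the CSC matrix is built.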
8 | """ 9 | 10 | import argparse 11 | from collections import defaultdict 12 | import gzip 13 | import h5py 14 | import logging 15 | from scipy.sparse import csc_matrix 16 | 17 | def parse_arguments(): 18 | parser = argparse.ArgumentParser(description="Generate an h5 count matrix of genes x barcodes") 19 | parser.add_argument("matrix_file", help="Filename for STARsolo raw matrix mtx file") 20 | parser.add_argument("features_file", help="Filename for STARsolo features tsv file") 21 | parser.add_argument("barcodes_file", help="Filename for STARsolo barcodes tsv file") 22 | parser.add_argument("output_file", help="Filename for output h5 file") 23 | parser.add_argument("pkr", help="Experiment prefix", nargs = '?') 24 | parser.add_argument("--ensembl", help="Flag for outputting genes using ENSEMBL ID, rather than gene name", action="store_true") 25 | 26 | return parser.parse_args() 27 | 28 | def get_split_lines(file_name, delimiter, skip=0): 29 | """Read file contents and yield generator with line entries""" 30 | opener = gzip.open if file_name.endswith('.gz') else open 31 | 32 | with opener(file_name, "rt") as f: 33 | for i in range(skip): 34 | next(f) 35 | for line in f: 36 | yield line.rstrip().split(sep=delimiter) 37 | 38 | def rename_duplicates(duplicate_list): 39 | """Rename duplicate entries as entry, entry.1, entry.2, etc.""" 40 | seen = defaultdict(int) 41 | renamed_list = [] 42 | 43 | for entry in duplicate_list: 44 | renamed_list.append(f"{entry}.{seen[entry]}" if entry in seen else entry) 45 | seen[entry] += 1 46 | 47 | return renamed_list 48 | 49 | def build_count_matrix(matrix): 50 | """Convert contents of mtx file to csc matrix""" 51 | # first line of matrix contains dimensions 52 | dimensions = next(matrix) 53 | n_rows = int(dimensions[0]) 54 | n_cols = int(dimensions[1]) 55 | 56 | gene_indices = [] 57 | barcode_indices = [] 58 | counts = [] 59 | 60 | for line in matrix: 61 | # subtract 1 from indices to convert to zero-based indexing 62 | gene_indices.append(int(line[0])-1) 63 | barcode_indices.append(int(line[1])-1) 64 | counts.append(int(line[2])) 65 | 66 | count_matrix = csc_matrix((counts, (gene_indices,barcode_indices)), shape=(n_rows,n_cols)) 67 | 68 | return count_matrix 69 | 70 | def write_h5(output_file, count_matrix, barcode_list, gene_list): 71 | h5_file = h5py.File(output_file, "w") 72 | 73 | # create datasets expected for Seurat import 74 | g = h5_file.create_group("group") 75 | g.create_dataset("barcodes", data=barcode_list) 76 | g.create_dataset("data", data=count_matrix.data) 77 | g.create_dataset("gene_names", data=gene_list) 78 | g.create_dataset("genes", data=gene_list) 79 | g.create_dataset("indices", data=count_matrix.indices) 80 | g.create_dataset("indptr", data=count_matrix.indptr) 81 | g.create_dataset("shape", data=count_matrix.shape) 82 | 83 | h5_file.close() 84 | 85 | def main(): 86 | # create log file 87 | logging.basicConfig(filename="generate_h5_rna.log", level=logging.INFO) 88 | 89 | # get arguments 90 | args = parse_arguments() 91 | matrix_file = getattr(args, "matrix_file") 92 | features_file = getattr(args, "features_file") 93 | barcodes_file = getattr(args, "barcodes_file") 94 | pkr = getattr(args, "pkr", None) 95 | output_file = getattr(args, "output_file") 96 | ensembl = getattr(args, "ensembl") 97 | 98 | # read input files 99 | logging.info("Reading input files\n") 100 | 101 | # get indices and counts from matrix file; skip first two lines of matrix file (header) 102 | matrix = get_split_lines(matrix_file, delimiter=" ", skip=2) 103 | 104 | 
# get genes from features file
105 |     features = get_split_lines(features_file, delimiter="\t")
106 |     if ensembl:
107 |         gene_list = [line[0] for line in features]
108 |     else:
109 |         gene_list_duplicated = [line[1] for line in features]
110 |         # append .1, .2, etc. for duplicated genes
111 |         gene_list = rename_duplicates(gene_list_duplicated)
112 | 
113 |     # get barcodes from barcodes file, reformat as R1R2R3_PKR
114 |     barcodes = get_split_lines(barcodes_file, delimiter="\t")
115 |     barcode_list = [line[0] for line in barcodes]
116 |     if pkr is None:
117 |         formatted_barcode_list = barcode_list
118 |     else:
119 |         formatted_barcode_list = [barcode + "_" + pkr for barcode in barcode_list]
120 | 
121 |     # generate count matrix
122 |     logging.info("Generating count matrix\n")
123 |     count_matrix = build_count_matrix(matrix)
124 | 
125 |     # write h5 file
126 |     logging.info(f"Writing to {output_file}\n")
127 |     write_h5(output_file, count_matrix, formatted_barcode_list, gene_list)
128 |     logging.info("Finished writing h5 file\n")
129 | 
130 | if __name__ == "__main__":
131 |     main()
132 | 
--------------------------------------------------------------------------------
/src/python/rna_barcode_metadata.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """
4 | This script takes in a bam file and outputs a txt file containing the number of
5 | total reads, duplicate reads, UMIs, genes, and percent mitochondrial reads for each barcode.
6 | """
7 | 
8 | import argparse
9 | import logging
10 | import pysam
11 | from collections import defaultdict
12 | 
13 | logging.basicConfig(filename='barcode_metadata.log', encoding='utf-8', level=logging.DEBUG)
14 | logging.debug('Creating the barcode metadata for RNA from bam.')
15 | 
16 | def parse_arguments():
17 |     parser = argparse.ArgumentParser(description="Get total reads, duplicate reads, UMIs, genes, and percent mitochondrial reads for each barcode from bam file")
18 |     parser.add_argument("bam_file", help="Filename for input bam file")
19 |     parser.add_argument("bai_file", help="Filename for bam index file")
20 |     parser.add_argument("barcode_metadata_file", help="Filename for output barcode metadata txt file")
21 |     parser.add_argument("pkr", help="PKR id for shareseq", default = None, nargs='?')
22 |     parser.add_argument("--barcode_tag", help="BAM tag holding the cell barcode", default="CB")
23 | 
24 |     return parser.parse_args()
25 | 
26 | def get_metrics(bam, barcode_tag="CB", pkr=None):
27 |     """
28 |     Get barcode metrics from bam file; all counts are only for reads overlapping genes.
29 | Reported metrics are total counts, UMIs (one UMI counted per unique UMI-gene mapping), 30 | duplicate counts, genes, percent mitochondrial reads 31 | """ 32 | total_counts = defaultdict(int) 33 | genes = defaultdict(set) 34 | umi_gene = defaultdict(set) 35 | mitochondrial_counts = defaultdict(int) 36 | barcodes = set() 37 | formatted_barcodes = {} 38 | 39 | for read in bam: 40 | try: 41 | # get barcode; skip read if not present 42 | barcode = read.get_tag(barcode_tag) 43 | if barcode == "-": 44 | #logging.warning(f"Skipping {read.qname} because the {barcode_tag} tag is empty") slowing down 45 | continue 46 | 47 | # get gene id; skip read if not present 48 | gene_id = read.get_tag("GX") 49 | if gene_id == "-": 50 | #logging.warning(f"Skipping {read.qname} because the GX tag is empty") 51 | continue 52 | 53 | # get UMI; skip read if not present 54 | umi = read.get_tag("UB") 55 | if umi == "-": 56 | #logging.warning(f"Skipping {read.qname} because the UB tag is empty") 57 | continue 58 | 59 | barcodes.add(barcode) 60 | 61 | total_counts[barcode] += 1 62 | 63 | genes[barcode].add(gene_id) 64 | 65 | umi_gene[barcode].add(umi + gene_id) 66 | 67 | if read.reference_name == "chrM": 68 | mitochondrial_counts[barcode] += 1 69 | except KeyError: 70 | logging.error(f"Skipping {read.qname} because one of the tags {barcode_tag},GX, or UB is missing.") 71 | 72 | # count unique genes per barcode 73 | genes_per_barcode = {barcode:len(gene_set) for (barcode, gene_set) in genes.items()} 74 | 75 | # count unique umi-gene mappings per barcode 76 | umis_per_barcode = {barcode:len(umi_gene_set) for (barcode, umi_gene_set) in umi_gene.items()} 77 | 78 | # create list with barcodes and associated metrics 79 | barcode_metadata = [] 80 | for barcode in barcodes: 81 | total_val = str(total_counts[barcode]) 82 | umi_val = str(umis_per_barcode.get(barcode, 0)) 83 | duplicate_val = str(total_counts[barcode] - umis_per_barcode.get(barcode, 0)) 84 | gene_val = str(genes_per_barcode.get(barcode, 0)) 85 | mitochondrial_val = str(round(mitochondrial_counts.get(barcode, 0) / total_counts[barcode] * 100, 2)) 86 | out_barcode = barcode + "_" + pkr if pkr else barcode 87 | 88 | metrics = [out_barcode, total_val, duplicate_val, umi_val, gene_val, mitochondrial_val] 89 | 90 | barcode_metadata.append(metrics) 91 | 92 | return barcode_metadata 93 | 94 | def write_metadata_file(barcode_metadata, output_file): 95 | fields = ["barcode", "total_counts", "duplicate_counts", "umis", "genes", "percent_mitochondrial"] 96 | 97 | with open(output_file, "w") as f: 98 | # write header 99 | f.write("\t".join(fields) + "\n") 100 | # write rows 101 | for metrics_list in barcode_metadata: 102 | f.write("\t".join(metrics_list[:]) + "\n") 103 | 104 | def main(): 105 | # get arguments 106 | args = parse_arguments() 107 | bam_file = getattr(args, "bam_file") 108 | bai_file = getattr(args, "bai_file") 109 | 110 | pkr = getattr(args, "pkr") 111 | barcode_tag = getattr(args, "barcode_tag") 112 | 113 | barcode_metadata_file = getattr(args, "barcode_metadata_file") 114 | 115 | # load bam file 116 | bam = pysam.AlignmentFile(bam_file, "rb", index_filename=bai_file) 117 | 118 | # get metrics for each barcode 119 | barcode_metadata = get_metrics(bam, barcode_tag, pkr) 120 | 121 | # write txt file 122 | write_metadata_file(barcode_metadata, barcode_metadata_file) 123 | 124 | if __name__ == "__main__": 125 | 126 | main() 127 | -------------------------------------------------------------------------------- /src/bash/monitor_script.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | declare -a TEMP=$(mktemp temp_monitoring.XXXXXXXX) 4 | 5 | if [[ -z "${BACKEND}" ]]; then 6 | backend="" 7 | else 8 | backend=${BACKEND} 9 | fi 10 | 11 | function get_disk_info() { 12 | # df command and cromwell root field 13 | if [ "$backend" = "aws" ]; then 14 | df | grep '/$' 15 | else 16 | df | grep cromwell_root 17 | fi 18 | } 19 | 20 | function get_disk_usage() { 21 | # get disk usage field 22 | get_disk_info | awk '{ print $5 }' 23 | } 24 | 25 | function get_mem_info() { 26 | # /proc/meminfo 27 | cat /proc/meminfo 28 | } 29 | 30 | function get_mem_available() { 31 | # mem unused from /proc/meminfo 32 | get_mem_info | grep MemAvailable | awk 'BEGIN { FS=" " } ; { print $2 }' 33 | } 34 | 35 | function get_mem_total() { 36 | # mem total from /proc/meminfo 37 | get_mem_info | grep MemTotal | awk 'BEGIN { FS=" " } ; { print $2 }' 38 | } 39 | 40 | function get_mem_usage() { 41 | # memTotal and memAvailable 42 | local -r mem_total=$(get_mem_total) 43 | local -r mem_available=$(get_mem_available) 44 | 45 | # usage = 100 * mem_used / mem_total 46 | local -r mem_used=$(($mem_total-$mem_available)) 47 | echo "$mem_used" "$mem_total" "%"| awk '{ print 100*($1/$2)$3 }' 48 | } 49 | 50 | function get_cpu_info() { 51 | # cpu info from /proc/stat 52 | cat /proc/stat | grep "cpu " 53 | } 54 | 55 | function get_cpu_total() { 56 | # get the total cpu usage since a given time (including idle and iowait) 57 | # user+nice+system+idle+iowait+irq+softirq+steal 58 | get_cpu_info | awk 'BEGIN { FS=" " } ; { print $2+$3+$4+$5+$6+$7+$8+$9 }' 59 | } 60 | 61 | function get_cpu_used() { 62 | # get the cpu usage since a given time (w/o idle or iowait) 63 | # user+nice+system+irq+softirq+steal 64 | get_cpu_info | awk 'BEGIN { FS=" " } ; { print $2+$3+$4+$7+$8+$9 }' 65 | } 66 | 67 | function get_cpu_usage() { 68 | # get the cpu usage since a given time (w/o idle or iowait) 69 | # user+nice+system+irq+softirq+steal 70 | local -r cpu_used_cur=$(get_cpu_used) 71 | 72 | # get the total cpu usage since a given time (including idle and iowait) 73 | # user+nice+system+idle+iowait+irq+softirq+steal 74 | local -r cpu_total_cur=$(get_cpu_total) 75 | 76 | # read in previous cpu usage values 77 | read -r -a cpu_prev < ${TEMP} 78 | local -r cpu_used_prev=${cpu_prev[0]} 79 | local -r cpu_total_prev=${cpu_prev[1]} 80 | 81 | # save current values as prev values for next iteration 82 | cpu_prev[0]=$cpu_used_cur 83 | cpu_prev[1]=$cpu_total_cur 84 | echo "${cpu_prev[@]}" > ${TEMP} 85 | 86 | # usage = 100 * (cpu_used_cur - cpu_used_prev) / (cpu_total_cur-cpu_total_prev) 87 | echo "$cpu_used_cur" "$cpu_used_prev" "$cpu_total_cur" "$cpu_total_prev" "%"| awk 'BEGIN {FS=" "} ; { print 100*(($1-$2)/($3-$4))$5 }' 88 | 89 | } 90 | 91 | function print_usage() { 92 | echo [$(date)] 93 | echo \* CPU usage: "$(get_cpu_usage)" 94 | echo \* Memory usage: "$(get_mem_usage)" 95 | echo \* Disk usage: $(get_disk_usage) 96 | } 97 | 98 | function print_summary() { 99 | # display header information 100 | echo ================================== 101 | echo =========== MONITORING =========== 102 | echo ================================== 103 | 104 | # summary info 105 | echo --- General Information --- 106 | # number of cores 107 | echo \#CPU: $(nproc) 108 | # multiply by 10^-6 to convert KB to GB 109 | echo Total Memory: $(echo $(get_mem_total) 1000000 | awk '{ print $1/$2 }')G 110 | 111 | if [ "$backend" = "aws" ]; then 112 | echo Total Disk space: $(df 
-h | grep '/$' | awk '{ print $2 }') 113 | else 114 | echo Total Disk space: $(df -h | grep cromwell_root | awk '{ print $2}') 115 | fi 116 | } 117 | 118 | function main() { 119 | # disk, mem and cpu general statisitcs 120 | print_summary 121 | 122 | # create variable to store cpu being used (cpu_prev[0]) and total cpu total (cpu_prev[1]) 123 | # save variable to a temp file to allow passing in values to a function 124 | declare -a cpu_prev 125 | cpu_prev[0]=$(get_cpu_used) 126 | cpu_prev[1]=$(get_cpu_total) 127 | # save global values to temp file to allow passing in values to a function 128 | echo "${cpu_prev[@]}" > ${TEMP} 129 | 130 | # sleep b/w getting usage and intially storing the cpu_previous usage values 131 | # this is b/c cpu usage values are time dependent 132 | # to calculate cpu usage, values must be determined from 2 diff time stamps 133 | if [ -z "$MONITOR_SCRIPT_SLEEP" ]; then 134 | MONITOR_SCRIPT_SLEEP=30 135 | fi 136 | # get usage of disk, cpu and mem every MONITOR_SCRIPT_SLEEP sec 137 | echo 138 | echo --- Runtime Information --- 139 | 140 | sleep "$MONITOR_SCRIPT_SLEEP"; 141 | while true; do print_usage; sleep "$MONITOR_SCRIPT_SLEEP"; done 142 | } 143 | 144 | main 145 | -------------------------------------------------------------------------------- /tasks/share_task_qc_rna.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # TASK 4 | # SHARE-qc-rna 5 | 6 | task qc_rna { 7 | meta { 8 | version: 'v0.1' 9 | author: 'Mei Knudson (mknudson@broadinstitute.org) at Broad Institute of MIT and Harvard' 10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: QC RNA task' 11 | } 12 | 13 | input { 14 | # This function takes in input the sorted bam file produced by STARsolo 15 | File bam 16 | Int? umi_cutoff = 100 17 | Int? gene_cutoff = 100 18 | String genome_name 19 | String? barcode_tag = "CB" 20 | String? pkr 21 | String? prefix 22 | 23 | Int? cpus = 16 24 | Float? disk_factor = 1.0 25 | Float? memory_factor = 1.5 26 | String docker_image = "us.gcr.io/buenrostro-share-seq/share_task_qc_rna:v1.0.0" 27 | } 28 | 29 | # Determine the size of the input 30 | Float input_file_size_gb = size(bam, "G") 31 | 32 | # Determining memory size based on the size of the input files. 33 | Float mem_gb = 5.0 + memory_factor * input_file_size_gb 34 | 35 | # Determining disk size based on the size of the input files. 36 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb) 37 | 38 | # Determining disk type based on the size of disk. 
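    # Worked example with the defaults above: a 20 GB bam gives
    # mem_gb = 5.0 + 1.5 * 20 = 35 GB and disk_gb = round(40.0 + 1.0 * 20) = 60 GB,
    # which selects "LOCAL" below; inputs above ~335 GB push disk_gb past 375 and
    # switch the disk type to "SSD".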
39 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL" 40 | 41 | String assay = "RNA" 42 | String bai = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.bam.bai" 43 | String barcode_metadata = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.barcode.metadata.tsv" 44 | String duplicates_log = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.duplicates.log.txt" 45 | String umi_barcode_rank_plot = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.umi.barcode.rank.plot.png" 46 | String gene_barcode_rank_plot = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.gene.barcode.rank.plot.png" 47 | String gene_umi_scatter_plot = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.gene.umi.scatter.plot.png" 48 | String monitor_log = "monitor.log" 49 | 50 | command <<< 51 | set -e 52 | 53 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 & 54 | 55 | # Index bam file 56 | samtools index -@ ~{cpus} ~{bam} ~{bai} 57 | 58 | # Extract barcode metadata (total counts, unique counts, duplicate counts, genes, percent mitochondrial) from bam file 59 | python3 $(which rna_barcode_metadata.py) ~{bam} \ 60 | ~{bai} \ 61 | ~{barcode_metadata} \ 62 | ~{pkr} ~{"--barcode_tag " + barcode_tag} 63 | 64 | awk '{total+=$2; duplicate+=$3; unique+=$4} END {print "total reads:", total; print "unique reads:", unique; print "duplicate reads:", duplicate}' ~{barcode_metadata} > ~{duplicates_log} 65 | 66 | # Make QC plots 67 | Rscript $(which rna_qc_plots.R) ~{barcode_metadata} ~{umi_cutoff} ~{gene_cutoff} ~{umi_barcode_rank_plot} ~{gene_barcode_rank_plot} ~{gene_umi_scatter_plot} 68 | >>> 69 | 70 | output { 71 | File rna_barcode_metadata = "~{barcode_metadata}" 72 | File rna_duplicates_log = "~{duplicates_log}" 73 | File rna_barcode_metadata_log = "barcode_metadata.log" 74 | File? rna_umi_barcode_rank_plot = "~{umi_barcode_rank_plot}" 75 | File? rna_gene_barcode_rank_plot = "~{gene_barcode_rank_plot}" 76 | File? rna_gene_umi_scatter_plot = "~{gene_umi_scatter_plot}" 77 | } 78 | 79 | runtime { 80 | cpu : cpus 81 | memory : "~{mem_gb} GB" 82 | disks: "local-disk ~{disk_gb} ~{disk_type}" 83 | docker : "${docker_image}" 84 | } 85 | 86 | parameter_meta { 87 | bam: { 88 | description: 'Alignment bam file', 89 | help: 'Aligned reads in bam format.', 90 | example: 'hg38.aligned.bam' 91 | } 92 | umi_cutoff: { 93 | description: 'UMI cutoff', 94 | help: 'Cutoff for number of UMIs required when making UMI barcode rank plot.', 95 | example: 10 96 | } 97 | gene_cutoff: { 98 | description: 'Gene cutoff', 99 | help: 'Cutoff for number of genes required when making gene barcode rank plot.', 100 | example: 10 101 | } 102 | pkr: { 103 | description: 'Experiment pkr', 104 | help: 'Id of the sample pkr (share-seq specific).', 105 | examples: ['SS-PKR-000'] 106 | } 107 | genome_name: { 108 | description: 'Reference name', 109 | help: 'The name genome reference used to align.', 110 | example: ['hg38', 'mm10', 'hg19', 'mm9'] 111 | } 112 | prefix: { 113 | description: 'Prefix for output files', 114 | help: 'Prefix that will be used to name the output files', 115 | example: 'MyExperiment' 116 | } 117 | docker_image: { 118 | description: 'Docker image.', 119 | help: 'Docker image for preprocessing step. 
Dependencies: samtools', 120 | example: ['put link to gcr or dockerhub'] 121 | } 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /tasks/share_task_joint_qc.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # TASK 4 | # SHARE-joint-qc-plotting 5 | 6 | 7 | task joint_qc_plotting { 8 | meta { 9 | version: 'v0.1' 10 | author: 'Mei Knudson (mknudson@broadinstitute.org) at Broad Institute of MIT and Harvard' 11 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Joint QC plot' 12 | } 13 | 14 | input { 15 | # This task generates a plot of barcodes QC'd jointly by RNA and ATAC metrics, as well as a 16 | # density plot of all barcodes passing at least one filter. 17 | File? atac_barcode_metadata 18 | File? rna_barcode_metadata 19 | Int remove_low_yielding_cells = 10 20 | Int min_umis = 100 21 | Int min_genes = 200 22 | Int min_tss = 4 23 | Int min_frags = 100 24 | 25 | Float? disk_factor = 8.0 26 | Float? memory_factor = 2.0 27 | 28 | String? prefix 29 | String genome_name 30 | 31 | String docker_image = "us.gcr.io/buenrostro-share-seq/share_task_joint_qc:v1.0.0" 32 | } 33 | 34 | # Determine the size of the input 35 | Float input_file_size_gb = size(atac_barcode_metadata, "G") + size(rna_barcode_metadata, "G") 36 | 37 | # Determine memory size based on the size of the input files 38 | Float mem_gb = 5.0 + memory_factor * input_file_size_gb 39 | 40 | # Determine disk size based on the size of the input files 41 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb) 42 | 43 | # Determining disk type base on the size of disk. 44 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL" 45 | 46 | String joint_qc_plot = '${default="share-seq" prefix}.${genome_name}.joint.qc.plot.png' 47 | String joint_density_plot = '${default="share-seq" prefix}.${genome_name}.joint.density.plot.png' 48 | String joint_barcode_metadata = '${default="share-seq" prefix}.joint.barcode.metadata.${genome_name}.csv' 49 | 50 | command { 51 | set -e 52 | 53 | bash $(which monitor_script.sh) > monitoring.log & 54 | 55 | # Make joint qc plot 56 | python3 $(which joint_cell_plotting.py) ${rna_barcode_metadata} ${atac_barcode_metadata} ${remove_low_yielding_cells} ${min_umis} ${min_genes} ${min_tss} ${min_frags} ${joint_qc_plot} ${joint_barcode_metadata} ${default="share-seq" prefix} 57 | 58 | # Make joint density plot 59 | Rscript $(which joint_cell_plotting_density.R) ${default="share-seq" prefix} ${joint_barcode_metadata} ${joint_density_plot} 60 | } 61 | 62 | output { 63 | File joint_calling_monitor = "monitoring.log" 64 | File joint_calling_log = "joint_cell_plotting.log" 65 | File? joint_qc_plot = "${joint_qc_plot}" 66 | File? 
joint_density_plot = "${joint_density_plot}" 67 | File joint_barcode_metadata = "${joint_barcode_metadata}" 68 | } 69 | 70 | runtime { 71 | memory : "${mem_gb} GB" 72 | disks: "local-disk ${disk_gb} ${disk_type}" 73 | docker : "${docker_image}" 74 | } 75 | 76 | parameter_meta { 77 | atac_barcode_metadata: { 78 | description: 'File containing ATAC barcode metrics.', 79 | help: 'tsv file with ATAC barcode (R1,R2,R3,PKR), fragments, TSS enrichment.', 80 | example: 'qc.atac.barcode.metadata.tsv' 81 | } 82 | rna_barcode_metadata: { 83 | description: 'File containing RNA barcode metrics.', 84 | help: 'tsv file with RNA barcode (R1,R2,R3,PKR), UMIs, genes.', 85 | example: 'qc.rna.barcode.metadata.tsv' 86 | } 87 | remove_low_yielding_cells: { 88 | description: 'UMI and fragments cutoff for plotting.', 89 | help: 'Minimum number of UMIs/fragments required for barcode to be plotted.', 90 | example: 10 91 | } 92 | min_umis: { 93 | description: 'UMI cutoff for RNA QC.', 94 | help: 'Minimum number of UMIs required for barcode to pass RNA QC.', 95 | example: 100 96 | } 97 | min_genes: { 98 | description: 'Gene cutoff for RNA QC.', 99 | help: 'Minimum number of genes required for barcode to pass RNA QC.', 100 | example: 200 101 | } 102 | min_tss: { 103 | description: 'TSS cutoff for ATAC QC.', 104 | help: 'Minimum TSS score required for barcode to pass ATAC QC.', 105 | example: 4 106 | } 107 | min_frags: { 108 | description: 'Fragments cutoff for ATAC QC.', 109 | help: 'Minimum number of fragments required for barcode to pass ATAC QC.', 110 | example: 100 111 | } 112 | prefix: { 113 | description: 'Prefix for output files', 114 | help: 'Prefix that will be used to name the output files', 115 | examples: 'MyExperiment' 116 | } 117 | genome_name: { 118 | description: 'Reference name', 119 | help: 'The name genome reference used to align.', 120 | example: ['hg38', 'mm10', 'hg19', 'mm9'] 121 | } 122 | docker_image: { 123 | description: 'Docker image.', 124 | help: 'Docker image for preprocessing step.', 125 | example: ['put link to gcr or dockerhub'] 126 | } 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /tasks/10x_task_preprocess.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # TASK 4 | # 10x_task_preprocess 5 | 6 | task preprocess_tenx { 7 | meta { 8 | version: 'v0.1' 9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard' 10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: preprocess 10x ATAC data.' 11 | } 12 | 13 | input { 14 | # This task takes in input the 3 fastqs coming out from cellranger mkfastqs and preprocess them. 15 | File fastq_R1 # Pair 1 reads 16 | File fastq_R3 # Pair 2 reads 17 | File fastq_R2 # Barcode fastq 18 | File? whitelist # Barcode whitelist (chemistry specific) 19 | Int? barcode_dist = 2 20 | Float? threshold_pct_barcode_matching = 0.60 21 | String chemistry 22 | String? prefix 23 | Int? cpus = 16 24 | Float? disk_factor = 8.0 25 | Float? memory_factor = 0.15 26 | String docker_image = "us.gcr.io/buenrostro-share-seq/10x_task_preprocess:v1.0.0" 27 | } 28 | 29 | # Determine the size of the input 30 | Float input_file_size_gb = size(fastq_R1, "G") + size(fastq_R2, "G") + size(fastq_R3, "G") 31 | 32 | # Determining memory size base on the size of the input files. 33 | Float mem_gb = 5.0 + memory_factor * input_file_size_gb 34 | 35 | # Determining disk size base on the size of the input files. 
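    # (The relatively large default disk_factor of 8.0 is presumably meant to cover the
    # description-stripped copies of all three fastqs plus the corrected R1/R2 written by
    # the command below; with a 10 GB input set it requests round(40.0 + 8.0 * 10) = 120 GB.)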
36 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb) 37 | 38 | # Determining disk type base on the size of disk. 39 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL" 40 | 41 | # auto-detect barcode complementation outfiles 42 | String barcode_complementation_qc = "${default="10x" prefix}.atac.preprocess.complementation.qc.txt" 43 | String barcode_complementation_out = "${default="10x" prefix}.atac.preprocess.complementation.out.txt" 44 | 45 | # barcode correction and filtering outfiles 46 | String barcode_correction_qc = "${default="10x" prefix}.atac.preprocess.barcode.correction.qc.txt" 47 | String cleaned_fastq_R1 = "${default="10x" prefix}.atac.preprocess.cleaned.R1.fastq.gz" 48 | String cleaned_fastq_R2 = "${default="10x" prefix}.atac.preprocess.cleaned.R2.fastq.gz" 49 | 50 | # read trimming outfiles 51 | String final_fastq_R1 = "${default="10x" prefix}.atac.preprocess.cleaned.trimmed.R1.fastq.gz" 52 | String final_fastq_R2 = "${default="10x" prefix}.atac.preprocess.cleaned.trimmed.R2.fastq.gz" 53 | String trimming_log_json = "${default="10x" prefix}.atac.preprocess.trimming.log.json" 54 | String trimming_log_html = "${default="10x" prefix}.atac.preprocess.trimming.log.html" 55 | String trimming_stats = "${default="10x" prefix}.atac.preprocess.trimming.adapter.stats.txt" 56 | 57 | String barcode_conversion_dict = "barcode_conversion_dict.csv" 58 | 59 | String monitor_log = 'monitor_10x_preprocessing.log.txt' 60 | 61 | command <<< 62 | set -e 63 | 64 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 & 65 | 66 | # Strip read description 67 | zcat ~{fastq_R1} | sed 's/ .*//' | gzip > stripped_R1.fastq.gz 68 | zcat ~{fastq_R3} | sed 's/ .*//' | gzip > stripped_R2.fastq.gz 69 | zcat ~{fastq_R2} | sed 's/ .*//' | gzip > stripped_barcode.fastq.gz 70 | 71 | if [[ '~{whitelist}' == *.gz ]]; then 72 | gunzip -c ~{whitelist} > whitelist.txt 73 | else 74 | ln -s ~{whitelist} whitelist.txt 75 | fi 76 | 77 | # auto-detect barcode complementation 78 | # python3 barcode_revcomp_detect.py barcode_fastq chemistry whitelist qc_out out threshold 79 | 80 | python3 $(which barcode_revcomp_detect.py) stripped_barcode.fastq.gz ~{chemistry} whitelist.txt ~{barcode_complementation_qc} ~{barcode_complementation_out} ~{threshold_pct_barcode_matching} 81 | 82 | # barcode correction and filtering 83 | # python3 match_barcodes.py 84 | 85 | python3 $(which match_barcodes.py) stripped_R1.fastq.gz stripped_R2.fastq.gz stripped_barcode.fastq.gz ~{chemistry} ~{barcode_dist} ~{barcode_complementation_out} whitelist.txt ~{cleaned_fastq_R1} ~{cleaned_fastq_R2} ~{barcode_correction_qc} ~{cpus} 86 | 87 | # Cleaned old files 88 | rm stripped_R1.fastq.gz stripped_R2.fastq.gz stripped_barcode.fastq.gz 89 | >>> 90 | 91 | output { 92 | File fastq_R1_preprocessed = cleaned_fastq_R1 93 | File fastq_R2_preprocessed = cleaned_fastq_R2 94 | File tenx_barcode_complementation_qc = barcode_complementation_qc 95 | File tenx_barcode_correction_qc = barcode_correction_qc 96 | File? 
tenx_barcode_conversion_dict = barcode_conversion_dict 97 | #File tenx_trimming_log_json = trimming_log_json 98 | #File trimming_log_html = trimming_log_html 99 | #File tenx_trimming_stats = trimming_stats 100 | } 101 | 102 | runtime { 103 | cpu: cpus 104 | docker: "${docker_image}" 105 | disks: "local-disk ${disk_gb} ${disk_type}" 106 | memory: "${mem_gb} GB" 107 | } 108 | 109 | parameter_meta { 110 | fastq_R1: { 111 | description: 'Pairs 1 fastq', 112 | help: 'Pairs 1 fastq', 113 | } 114 | fastq_R2: { 115 | description: 'Barcode fastq', 116 | help: 'Barcode fastq', 117 | } 118 | fastq_R3: { 119 | description: 'Pairs 2 fastq', 120 | help: 'Pairs 2 fastq', 121 | } 122 | } 123 | 124 | } 125 | -------------------------------------------------------------------------------- /src/R/rna_qc_plots.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ### Takes RNA barcode metadata tsv file, and outputs QC plots as png files. 4 | ### QC plots include barcode rank by number of UMIs (all barcodes and top-ranked barcodes), 5 | ### barcode rank by number of genes (all barcodes and top-ranked barcodes), 6 | ### and genes vs UMIs scatter plot. 7 | 8 | ## Import helper functions 9 | source("/usr/local/bin/barcode_rank_functions.R") 10 | 11 | ## Get arguments, read input 12 | args <- commandArgs() 13 | 14 | barcode_metadata_file <- args[6] 15 | umi_cutoff <- as.integer(args[7]) 16 | gene_cutoff <- as.integer(args[8]) 17 | umi_rank_plot_file <- args[9] 18 | gene_rank_plot_file <- args[10] 19 | gene_umi_plot_file <- args[11] 20 | 21 | barcode_metadata <- read.table(barcode_metadata_file, header=T) 22 | 23 | ## Get plot inputs 24 | 25 | # Impose UMI cutoff, sort in decreasing order, assign rank 26 | umi_filtered <- barcode_metadata$umis[barcode_metadata$umis >= umi_cutoff] 27 | umi_filtered_sort <- sort(umi_filtered, decreasing=T) 28 | umi_rank <- 1:length(umi_filtered_sort) 29 | 30 | # Find elbow/knee of UMI barcode rank plot and top-ranked UMI barcode rank plot 31 | umi_points <- get_elbow_knee_points(x=umi_rank, y=log10(umi_filtered_sort)) 32 | # For each valid plot, make factor for coloring plot points 33 | if (length(umi_points) > 0) { # Elbow found in first plot 34 | umi_plot1 <- TRUE 35 | is_top_ranked_umi <- factor(ifelse(umi_rank <= umi_points[1], 1, 0)) 36 | if (length(umi_points) > 2) { # Elbow/knee found in second plot 37 | umi_plot2 <- TRUE 38 | umi_top_rank <- umi_rank[1:umi_points[1]] 39 | umi_top_umi <- umi_filtered_sort[1:umi_points[1]] 40 | is_top_top_ranked_umi <- factor(ifelse(umi_top_rank <= umi_points[3], 1, 0)) 41 | } else { 42 | umi_plot2 <- FALSE 43 | } 44 | } else { 45 | umi_plot1 <- FALSE 46 | } 47 | 48 | # Impose gene cutoff, sort in decreasing order, assign rank 49 | gene_filtered <- barcode_metadata$genes[barcode_metadata$genes >= gene_cutoff] 50 | gene_filtered_sort <- sort(gene_filtered, decreasing=T) 51 | gene_rank <- 1:length(gene_filtered_sort) 52 | 53 | # Find elbow/knee of gene barcode rank plot and top-ranked gene barcode rank plot 54 | gene_points <- get_elbow_knee_points(x=gene_rank, y=log10(gene_filtered_sort)) 55 | # For each valid plot, make factor for coloring plot points 56 | if (length(gene_points) > 0) { # Elbow found in first plot 57 | gene_plot1 <- TRUE 58 | is_top_ranked_gene <- factor(ifelse(gene_rank <= gene_points[1], 1, 0)) 59 | if (length(gene_points) > 2) { # Elbow/knee found in second plot 60 | gene_plot2 <- TRUE 61 | gene_top_rank <- gene_rank[1:gene_points[1]] 62 | gene_top_gene <- 
gene_filtered_sort[1:gene_points[1]] 63 | is_top_top_ranked_gene <- factor(ifelse(gene_top_rank <= gene_points[3], 1, 0)) 64 | } else { 65 | gene_plot2 <- FALSE 66 | } 67 | } else { 68 | gene_plot1 <- FALSE 69 | } 70 | 71 | ## Generate plots 72 | 73 | options(scipen=999) 74 | 75 | # Make UMI barcode rank plots 76 | png(umi_rank_plot_file, width=8, height=8, units='in', res=300) 77 | par(mfrow = c(2,1)) 78 | 79 | # Plot 1 (all barcodes passing UMI filter vs log10(UMIs)) 80 | if (umi_plot1) { 81 | plot(x=umi_rank, 82 | y=umi_filtered_sort, 83 | log="y", 84 | xlab=paste0(" Barcode rank (", length(umi_rank)-umi_points[1], " low quality cells)"), 85 | ylab="log10(UMIs)", 86 | main="RNA UMIs per Barcode", 87 | col=c("dimgrey","darkblue")[is_top_ranked_umi], 88 | pch=16, 89 | ylim=c(1,100000)) 90 | abline(v=umi_points[1], h=10^(umi_points[2])) 91 | text(umi_points[1], 10^(umi_points[2]), 92 | paste0("(", umi_points[1], ", ", 10^(umi_points[2]), ")"), 93 | adj=c(-0.1,-0.5)) 94 | } 95 | 96 | # Plot 2 (top ranked barcodes vs log10(UMIs)) 97 | if (umi_plot2) { 98 | plot(x=umi_top_rank, 99 | y=umi_top_umi, 100 | log="y", 101 | xlab="Barcode rank", 102 | ylab="log10(UMIs)", 103 | main="RNA UMIs per Top-Ranked Barcode", 104 | col=c("dimgrey","darkblue")[is_top_top_ranked_umi], 105 | pch=16, 106 | ylim=c(1,100000)) 107 | abline(v=umi_points[3], h=10^(umi_points[4])) 108 | text(umi_points[3], 10^(umi_points[4]), 109 | paste("(", umi_points[3], ", ", 10^(umi_points[4]), ")", sep=""), 110 | adj=c(-0.1,-0.5)) 111 | } 112 | dev.off() 113 | 114 | 115 | # Make gene barcode rank plots 116 | png(gene_rank_plot_file, width=8, height=8, units='in', res=300) 117 | par(mfrow = c(2,1)) 118 | 119 | # Plot 1 (all barcodes passing gene filter vs log10(genes)) 120 | if (gene_plot1) { 121 | plot(x=gene_rank, 122 | y=gene_filtered_sort, 123 | log="y", 124 | xlab=paste0(" Barcode rank (", length(gene_rank)-gene_points[1], " low quality cells)"), 125 | ylab="log10(genes)", 126 | main="RNA Genes per Barcode", 127 | col=c("dimgrey","darkblue")[is_top_ranked_gene], 128 | pch=16, 129 | ylim=c(1,10000)) 130 | abline(v=gene_points[1], h=10^(gene_points[2])) 131 | text(gene_points[1], 10^(gene_points[2]), 132 | paste0("(", gene_points[1], ", ", 10^(gene_points[2]), ")"), 133 | adj=c(-0.1,-0.5)) 134 | } 135 | 136 | # Plot 2 (top ranked barcodes vs log10(genes)) 137 | if (gene_plot2) { 138 | plot(x=gene_top_rank, 139 | y=gene_top_gene, 140 | log="y", 141 | xlab="Barcode rank", 142 | ylab="log10(genes)", 143 | main="RNA Genes per Top-Ranked Barcode", 144 | col=c("dimgrey","darkblue")[is_top_top_ranked_gene], 145 | pch=16, 146 | ylim=c(1,10000)) 147 | abline(v=gene_points[3], h=10^(gene_points[4])) 148 | text(gene_points[3], 10^(gene_points[4]), 149 | paste("(", gene_points[3], ", ", 10^(gene_points[4]), ")", sep=""), 150 | adj=c(-0.1,-0.5)) 151 | } 152 | dev.off() 153 | 154 | # Make genes vs UMIs scatter plot 155 | png(gene_umi_plot_file, width=8, height=8, units='in', res=300) 156 | 157 | plot(x=barcode_metadata$umis, 158 | y=barcode_metadata$genes, 159 | xlab="UMIs", 160 | ylab="Genes", 161 | main="RNA Genes vs UMIs", 162 | col="darkblue", 163 | pch=16) 164 | 165 | dev.off() 166 | -------------------------------------------------------------------------------- /tasks/share_task_merge_bams.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | # TASK 4 | # SHARE-atac-merge_bams 5 | 6 | task share_atac_merge_bams { 7 | meta { 8 | version: 'v0.1' 9 | author: 'Eugenio Mattei 
(emattei@broadinstitute.org) at Broad Institute of MIT and Harvard' 10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: merge the individual bams together' 11 | } 12 | 13 | input { 14 | # This task takes in input the preprocessed ATAC fastqs and align them to the genome. 15 | Array[File] bams 16 | Array[File] logs 17 | String genome_name 18 | String prefix = "sample-share" 19 | Int? multimappers # = 5 20 | Int? cpus = 16 21 | Float? disk_factor = 8.0 22 | Float? memory_factor = 0.15 23 | String? docker_image = "us.gcr.io/buenrostro-share-seq/share_task_merge_bams:v1.0.0" 24 | } 25 | 26 | # Determine the size of the input 27 | Float input_file_size_gb = size(bams, "G") 28 | 29 | # Determining memory size base on the size of the input files. 30 | Float mem_gb = 16.0 + memory_factor * input_file_size_gb 31 | 32 | # Determining disk size base on the size of the input files. 33 | Int disk_gb = round(20.0 + disk_factor * input_file_size_gb) 34 | 35 | # Determining disk type base on the size of disk. 36 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL" 37 | 38 | # Determining memory for samtools. 39 | Float samtools_memory_gb = 0.8 * mem_gb # Samtools has overheads so reducing the memory to 80% of the total. 40 | 41 | # Number of threads to beable to use 4GB of memory per thread seems to be the fastest way 42 | Int samtools_threads_ = floor(samtools_memory_gb / 4) 43 | Int samtools_threads = if samtools_threads_ == 0 then 1 else samtools_threads_ 44 | 45 | Int sambamba_threads = floor(cpus/2) 46 | 47 | # Now that we know how many threads we can use to assure 4GB of memory per thread 48 | # we assign any remaining memory to the threads. 49 | Int samtools_memory_per_thread_ = floor(samtools_memory_gb * 1024 / samtools_threads) # Computing the memory per thread for samtools in MB. 
50 | Int samtools_memory_per_thread = if samtools_memory_per_thread_ < 768 then 768 else samtools_memory_per_thread_ 51 | 52 | # Tim parameters 53 | Int machine_mem_mb = 18150 54 | Int cpu = 1 55 | Int compression_level = 5 56 | # default to 500GiB of space 57 | Int disk = 500 58 | Int command_mem_mb = machine_mem_mb - 500 59 | 60 | # Define tmp file name 61 | String unsorted_bam = "${prefix}.atac.merge.${genome_name}.bam" 62 | 63 | # Define the output names 64 | String merged_bam = "${prefix}.atac.merged.k${multimappers}.${genome_name}.sorted.bam" 65 | String merged_bai = "${prefix}.atac.merged.k${multimappers}.${genome_name}.sorted.bam.bai" 66 | String alignment_log = "${prefix}.atac.merged.k${multimappers}.${genome_name}.log" 67 | 68 | String monitor_log = "atac_merge_monitor.log" 69 | 70 | command <<< 71 | set -e 72 | 73 | bash $(which monitor_script.sh) 2>&1 & 74 | 75 | #sambamba merge -t ~{cpus} ~{unsorted_bam} ~{sep=" " bams} 76 | 77 | #sambamba sort -t ~{cpus} -m ~{command_mem_mb}M -o ~{merged_bam} ~{unsorted_bam} 78 | 79 | #sambamba index -t ~{cpus} ~{merged_bam} 80 | 81 | # Trying picard 82 | 83 | java -Dsamjdk.compression_level=~{compression_level} -Xms~{command_mem_mb}m -Xmx~{command_mem_mb}m -jar /usr/local/bin/picard.jar \ 84 | MergeSamFiles \ 85 | USE_THREADING=true \ 86 | SORT_ORDER="coordinate" \ 87 | INPUT=~{sep=' INPUT=' bams} \ 88 | OUTPUT=~{merged_bam} 89 | 90 | sambamba index -t ~{cpus} ~{merged_bam} 91 | 92 | sed 's/^[[:space:]]*//g' ~{sep=" " logs} | cut -f 1 -d ' ' | awk '{ sum[FNR%15]+=$1 } END {n_total=length(sum);for (idx=1; idx <= n_total; idx++){print sum[idx]}}' > ~{alignment_log} 93 | 94 | >>> 95 | 96 | output { 97 | File atac_merged_alignment = merged_bam 98 | File atac_merged_alignment_index = merged_bai 99 | File atac_merged_alignment_log = alignment_log 100 | } 101 | 102 | runtime { 103 | cpu: cpu 104 | docker: "${docker_image}" 105 | disks: "local-disk ${disk} HDD" 106 | disk: disk + " GB" # TES 107 | #disks: "local-disk ${disk_gb} ${disk_type}" 108 | maxRetries:1 109 | memory: "${machine_mem_mb} MiB" 110 | #memory: "${mem_gb} GB" 111 | memory_retry_multiplier: 2 112 | } 113 | 114 | parameter_meta { 115 | bams: { 116 | description: 'Individuals bams from the scatter alignment task', 117 | help: 'Individuals bams from the scatter alignment task', 118 | example: 'align.raw.L1.bam', 119 | } 120 | cpus: { 121 | description: 'Number of cpus.', 122 | help: 'Set the number of cpus used by bowtie2', 123 | default: 16 124 | } 125 | disk_factor: { 126 | description: 'Multiplication factor to determine disk required for task align.', 127 | help: 'This factor will be multiplied to the size of FASTQs to determine required disk of instance (GCP/AWS) or job (HPCs).', 128 | default: 8.0 129 | } 130 | memory_factor: { 131 | description: 'Multiplication factor to determine memory required for task align.', 132 | help: 'This factor will be multiplied to the size of FASTQs to determine required memory of instance (GCP/AWS) or job (HPCs).', 133 | default: 0.15 134 | } 135 | prefix: { 136 | description: 'Prefix for output files.', 137 | help: 'Prefix that will be used to name the output files', 138 | examples: 'my-experiment' 139 | } 140 | docker_image: { 141 | description: 'Docker image.', 142 | help: 'Docker image for the alignment step.', 143 | example: ["us.gcr.io/buenrostro-share-seq/share_task_bowtie2"] 144 | } 145 | } 146 | 147 | 148 | } 149 | -------------------------------------------------------------------------------- /src/python/trim_fastq.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Trim fastq 5 | Removes dovetail (overlap) between R1 and R2 6 | """ 7 | 8 | import argparse 9 | import Levenshtein 10 | import xopen 11 | from collections import deque 12 | 13 | def parse_arguments(): 14 | parser = argparse.ArgumentParser(description="Trim dovetail (overlap) between read1 and read2") 15 | parser.add_argument("input_read1_fastq_file", help="Filename for untrimmed input read 1 FASTQ file") 16 | parser.add_argument("input_read2_fastq_file", help="Filename for untrimmed input read 2 FASTQ file") 17 | parser.add_argument("output_read1_fastq_file", help="Filename for corrected output read 1 FASTQ file") 18 | parser.add_argument("output_read2_fastq_file", help="Filename for corrected output read 2 FASTQ file") 19 | parser.add_argument("trimming_stats_file", help="Filename for txt file containing trimming statistics") 20 | 21 | return parser.parse_args() 22 | 23 | REV_COMP = str.maketrans("ATGC", "TACG") 24 | def reverse_complement(seq): 25 | return str.translate(seq, REV_COMP)[::-1] 26 | 27 | def trim_fastqs(input_read1_fastq_file, input_read2_fastq_file, 28 | output_read1_fastq_file, output_read2_fastq_file, 29 | trimming_stats_file): 30 | """ 31 | Trim reads if overlapping, write reads to output FASTQ files. 32 | Produces file enumerating how many reads were processed and trimmed. 33 | """ 34 | # counters 35 | total = trimmed = 0 36 | 37 | read1_out_writer = xopen.xopen(output_read1_fastq_file, mode="w") 38 | read2_out_writer = xopen.xopen(output_read2_fastq_file, mode="w") 39 | 40 | buffer1 = deque() 41 | buffer2 = deque() 42 | buffer_counter = 0 43 | 44 | # process FASTQs together 45 | with xopen.xopen(input_read1_fastq_file, mode= "r", threads= 8) as read1_fh, xopen.xopen(input_read2_fastq_file, mode= "r", threads= 8) as read2_fh: 46 | for readline1, readline2 in zip(read1_fh, read2_fh): 47 | total += 2 48 | 49 | name1 = readline1.strip() 50 | name2 = readline2.strip() 51 | 52 | readline1 = next(read1_fh) 53 | readline2 = next(read2_fh) 54 | 55 | sequence1 = readline1.strip() 56 | sequence2 = readline2.strip() 57 | 58 | next(read1_fh) 59 | next(read2_fh) 60 | 61 | readline1 = next(read1_fh) 62 | readline2 = next(read2_fh) 63 | 64 | quality1 = readline1.strip() 65 | quality2 = readline2.strip() 66 | 67 | # trim adapters for ATAC 68 | where = trim(sequence1, sequence2) 69 | 70 | if where > -1: 71 | trimmed += 2 72 | 73 | # add trimmed read 1 to buffer 74 | trimmed_read1 = f"{name1}\n{sequence1[:where]}\n+\n{quality1[:where]}\n" 75 | buffer1.append(trimmed_read1) 76 | 77 | # add trimmed read 2 to buffer 78 | trimmed_read2 = f"{name2}\n{sequence2[:where]}\n+\n{quality2[:where]}\n" 79 | buffer2.append(trimmed_read2) 80 | 81 | else: 82 | # add original read 1 to buffer 83 | read1 = f"{name1}\n{sequence1}\n+\n{quality1}\n" 84 | buffer1.append(read1) 85 | 86 | # add original read 1 to buffer 87 | read2 = f"{name2}\n{sequence2}\n+\n{quality2}\n" 88 | buffer2.append(read2) 89 | 90 | buffer_counter += 1 91 | 92 | # write reads to trimmed FASTQ files 93 | if buffer_counter == 10000000: 94 | read1_out_writer.write("".join(buffer1)) 95 | buffer1.clear() 96 | read2_out_writer.write("".join(buffer2)) 97 | buffer2.clear() 98 | buffer_counter = 0 99 | 100 | # write out remaining reads 101 | if buffer_counter > 0: 102 | read1_out_writer.write("".join(buffer1)) 103 | buffer1.clear() 104 | read2_out_writer.write("".join(buffer2)) 105 | 
buffer2.clear() 106 | buffer_counter = 0 107 | 108 | # write trimming statistics output file 109 | with open(trimming_stats_file, "w") as f: 110 | fields = ["total_reads", "untrimmed_reads", "trimmed_reads", "%trimmed"] 111 | f.write("\t".join(fields) + "\n") 112 | f.write("%i\t%i\t%i\t%0.1f" % (total, total-trimmed, trimmed, trimmed/total*100 if total > 0 else 0)) 113 | 114 | def trim(seq1, seq2): 115 | """ 116 | Find overlap between read1 and read2 and return location 117 | """ 118 | query = reverse_complement(seq2[0:20]) 119 | idx = seq1.rfind(query) # look for perfect match 120 | if idx == -1: 121 | idx = fuzz_align(query,seq1) 122 | 123 | # found it, return everything through match 124 | if idx > -1: 125 | idx = idx+20 126 | else: 127 | idx = -1 128 | return idx 129 | 130 | def fuzz_align(s_seq, l_seq): 131 | """ 132 | Align allowing Levenshtein distance of 1 133 | This iteration should go from the right end of l_seq 134 | since we want to do a rfind 135 | """ 136 | for i, base in enumerate(l_seq): # loop through equal size windows 137 | l_subset = l_seq[i:i+len(s_seq)] 138 | dist = Levenshtein.distance(l_subset, s_seq, score_cutoff= 1) 139 | if dist <= 1: # find first then break 140 | return i 141 | return -1 142 | 143 | def main(): 144 | args = parse_arguments() 145 | input_read1_fastq_file = getattr(args, "input_read1_fastq_file") 146 | input_read2_fastq_file = getattr(args, "input_read2_fastq_file") 147 | output_read1_fastq_file = getattr(args, "output_read1_fastq_file") 148 | output_read2_fastq_file = getattr(args, "output_read2_fastq_file") 149 | trimming_stats_file = getattr(args, "trimming_stats_file") 150 | 151 | trim_fastqs(input_read1_fastq_file, input_read2_fastq_file, 152 | output_read1_fastq_file, output_read2_fastq_file, 153 | trimming_stats_file) 154 | 155 | 156 | if __name__ == "__main__": 157 | main() 158 | -------------------------------------------------------------------------------- /src/python/match_barcodes.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | 3 | import numpy as np 4 | # import pandas as pd 5 | 6 | import matcha 7 | import sys 8 | 9 | REV_COMP = str.maketrans("ATGC", "TACG") 10 | def reverse_complement(seq): 11 | return str.translate(seq, REV_COMP)[::-1] 12 | 13 | def get_open_fn(path): 14 | with open(path, "rb") as f: 15 | is_gzipped = (f.read(2) == b'\x1f\x8b') 16 | return gzip.open if is_gzipped else open 17 | 18 | def read_barcodes(path, revcomp): 19 | # if path.endswith(".tsv"): 20 | # bc = pd.read_csv(path, sep="\t")["sequence"] 21 | # else: 22 | open_fn = get_open_fn(path) 23 | with open_fn(path, 'rt') as file: 24 | bc = [b.strip() for b in file] 25 | if revcomp: 26 | valid = [reverse_complement(b) for b in bc] 27 | else: 28 | valid = bc 29 | 30 | return valid 31 | 32 | def match_one_bc(fastqs, whitelists, revcomp, max_barcode_dist, offsets, fastq1_out_path, fastq2_out_path, qc_path, threads): 33 | f = matcha.FastqReader(threads = threads) 34 | f.add_sequence("R1", fastqs["R1"], output_path=fastq1_out_path) 35 | f.add_sequence("R2", fastqs["R2"]) 36 | f.add_sequence("R3", fastqs["R3"], output_path=fastq2_out_path) 37 | 38 | with open(revcomp["R2"]) as rf: 39 | rc = (int(rf.read().strip()) == 1) 40 | 41 | barcode_sequences = read_barcodes(whitelists["R2"], rc) 42 | cell_barcode = matcha.HashMatcher( 43 | sequences = barcode_sequences, 44 | labels = barcode_sequences, 45 | max_mismatches=max_barcode_dist, 46 | subsequence_count=2 47 | ) 48 | f.add_barcode("cell", cell_barcode, "R2", 
match_start=offsets["R2"]) 49 | f.set_output_names("{read_name} CB:Z:{cell}") 50 | 51 | barcode_counts = np.zeros(max_barcode_dist + 2, int) 52 | 53 | total_reads = 0 54 | total_pass = 0 55 | 56 | # print("start read") #### 57 | chunk_size = 10000 58 | while f.read_chunk(chunk_size): 59 | pass_filter = (f.get_match_result("cell", "dist") <=max_barcode_dist) & \ 60 | (f.get_match_result("cell", "second_best_dist") > f.get_match_result("cell", "dist")) 61 | 62 | total_reads += len(pass_filter) 63 | total_pass += pass_filter.sum() 64 | values, counts = np.unique(f.get_match_result("cell", "dist"), return_counts=True) 65 | barcode_counts[np.minimum(values, max_barcode_dist + 1)] += counts 66 | 67 | f.write_chunk(pass_filter) 68 | 69 | with open(qc_path, "w") as stats_output: 70 | print(f"{total_pass}/{total_reads} reads passing, ({total_pass/total_reads*100:.2f}%)\n", file=stats_output) 71 | print("mismatches\treads", file=stats_output) 72 | for dist in range(max_barcode_dist + 2): 73 | print( 74 | dist if dist <= max_barcode_dist else f">{max_barcode_dist}", 75 | barcode_counts[dist], 76 | sep = "\t", 77 | file=stats_output 78 | ) 79 | 80 | 81 | # def match_two_bc(fastqs, whitelists, revcomp, max_barcode_dist, offsets, fastq1_out_path, fastq2_out_path, qc_path, threads): 82 | # f = matcha.FastqReader(threads = threads) 83 | # f.add_sequence("R1", fastqs["R1"], output_path=fastq1_out_path) 84 | # f.add_sequence("R2", fastqs["R2"], output_path=fastq2_out_path) 85 | # f.add_sequence("I1", fastqs["I1"]) 86 | # f.add_sequence("I2", fastqs["I2"]) 87 | 88 | # i5_sequences, i5_maybe_rc = read_barcodes(whitelists["I2"], revcomp["I2"]) 89 | # T7_sequences, T7_maybe_rc = read_barcodes(whitelists["I1"], revcomp["I1"]) 90 | 91 | # i5_barcode = matcha.HashMatcher( 92 | # sequences = i5_maybe_rc, 93 | # labels = i5_sequences, 94 | # max_mismatches=max_barcode_dist, 95 | # subsequence_count=2 96 | # ) 97 | 98 | # T7_barcode = matcha.HashMatcher( 99 | # sequences = T7_maybe_rc, 100 | # labels = T7_sequences, 101 | # max_mismatches=max_barcode_dist, 102 | # subsequence_count=2 103 | # ) 104 | 105 | # f.add_barcode("i5", i5_barcode, "I2", match_start=offsets["I2"]) 106 | # f.add_barcode("T7", T7_barcode, "I1", match_start=offsets["I1"]) 107 | 108 | # f.set_output_names("{read_name} CB:Z:{i5}{T7}") 109 | 110 | # barcode_counts = np.zeros((max_barcode_dist + 2, max_barcode_dist + 2), int) 111 | 112 | # total_reads = 0 113 | # total_pass = 0 114 | 115 | # chunk_size = 10000 116 | 117 | # dists = [None, None] 118 | # second_dists = [None, None] 119 | # while f.read_chunk(chunk_size): 120 | # dists[0] = f.get_match_result("i5", "dist") 121 | # second_dists[0] = f.get_match_result("i5", "second_best_dist") 122 | # dists[1] = f.get_match_result("T7", "dist") 123 | # second_dists[1] = f.get_match_result("T7", "second_best_dist") 124 | 125 | # pass_filter = (dists[0] < max_barcode_dist) & \ 126 | # (dists[1] < max_barcode_dist) & \ 127 | # (dists[0] + dists[1] < second_dists[0] + second_dists[1]) 128 | 129 | # total_reads += len(pass_filter) 130 | # total_pass += pass_filter.sum() 131 | 132 | # values, counts = np.unique(dists, axis = 1, return_counts=True) 133 | # indices = np.minimum(values, max_barcode_dist+1) 134 | # barcode_counts[(indices[0], indices[1])] += counts 135 | 136 | # f.write_chunk(pass_filter) 137 | 138 | # with open(qc_path, "w") as stats_output: 139 | # print(f"{total_pass}/{total_reads} reads passing, ({total_pass/total_reads*100:.2f}%)\n", file=stats_output) 140 | # 
print("mismatches_i5\tmismatches_T7\treads", file=stats_output) 141 | # for i5_dist in range(max_barcode_dist + 2): 142 | # for T7_dist in range(max_barcode_dist + 2): 143 | # print( 144 | # i5_dist if i5_dist <= max_barcode_dist else f">{max_barcode_dist}", 145 | # T7_dist if T7_dist <= max_barcode_dist else f">{max_barcode_dist}", 146 | # barcode_counts[i5_dist, T7_dist], 147 | # sep = "\t", 148 | # file=stats_output 149 | # ) 150 | 151 | modality = sys.argv[4] 152 | whitelist = sys.argv[7] 153 | fastq1_out_path = sys.argv[8] 154 | fastq2_out_path = sys.argv[9] 155 | qc_path = sys.argv[10] 156 | threads = int(sys.argv[11]) 157 | max_barcode_dist = int(sys.argv[5]) 158 | fastqs = { 159 | "R1": sys.argv[1], 160 | "R2": sys.argv[3], 161 | "R3": sys.argv[2], 162 | } 163 | revcomp = { 164 | "R2": sys.argv[6], 165 | } 166 | if modality == "10x": 167 | whitelists = { 168 | "R2": whitelist, 169 | } 170 | offsets = { 171 | "R2": 0, 172 | } 173 | match_one_bc(fastqs, whitelists, revcomp, max_barcode_dist, offsets, fastq1_out_path, fastq2_out_path, qc_path, threads) 174 | 175 | elif modality == "10x_multiome": 176 | whitelists = { 177 | "R2": whitelist, 178 | } 179 | offsets = { 180 | "R2": 8, 181 | } 182 | match_one_bc(fastqs, whitelists, revcomp, max_barcode_dist, offsets, fastq1_out_path, fastq2_out_path, qc_path, threads) 183 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Broad Institute of MIT and Harvard Single-Cell/Nucleus Multiomic Processing Pipeline 2 | 3 | Pipeline specifications can be found [here](https://docs.google.com/document/d/1J-NWpDLkEGLsLjVe6h6-Rx4nxzTdgy1TJZvuMnYiiyg/edit?usp=sharing). 4 | 5 | Pipeline main page on [dockstore](https://dockstore.org/workflows/github.com/broadinstitute/epi-SHARE-seq-pipeline/SHARE-seq:release?tab=info). 6 | 7 |

8 | Pipeline overview (figure: docs/images/pipeline_overview.png)
9 |
10 | 11 | ### Structure of this repo 12 | * The **tasks** directory contains the tasks called from the main workflow share-seq.wdl. Each task corresponds to a different step of the pipeline: *align*, *filter*, etc. 13 | * The **src** directory contains bash, Python, R, and notebook scripts called within the tasks. 14 | * The **dockerfiles** directory contains the Dockerfiles used to build the Docker images used by the pipeline. 15 | 16 | ## Introduction 17 | 18 | The **SHARE-seq** multiomic pipeline is based off the original Buenrostro SHARE-seq pipeline specifications (by Sai Ma) in [this github repo](https://github.com/masai1116/SHARE-seq-alignment). 19 | 20 | This **10X** single-cell multiomic pipeline is based off the ENCODE (phase-3) single-cell pipeline specifications (by Anshul Kundaje) in [this google doc](https://docs.google.com/document/u/2/d/e/2PACX-1vTlgtT4WeXbvRicybUHXnhZs8RKyB4EkTbcWooQ6qBxxQ_zIHpFEVHy38D5lC_s8_YDGfUTsyomJcs3/pub). 21 | 22 | ### Features 23 | 24 | * **Portability**: The pipeline can be run on different cloud platforms such as Google, AWS and DNAnexus, as well as on cluster engines such as SLURM, SGE and PBS. 25 | * **User-friendly HTML report**: In addition to the standard outputs, the pipeline generates an HTML report that consists of quality metrics including alignment statistics along with many useful plots. An example of the [HTML report](). # TODO: add an example html. 26 | * **Supported genomes**: The pipeline requires genome-specific data such as aligner indices, chromosome sizes, and blacklisted regions. We provide genome references for hg38, mm10, mm39. 27 | 28 | ## Installation 29 | 30 | 1) Install Caper (Python Wrapper/CLI for [Cromwell](https://github.com/broadinstitute/cromwell)). 31 | ```bash 32 | $ pip install caper 33 | ``` 34 | 35 | 2) **IMPORTANT**: Read Caper's [README](https://github.com/ENCODE-DCC/caper/blob/master/README.md) carefully to choose a backend for your system. Follow the instructions in the configuration file. 36 | ```bash 37 | # backend: local or your HPC type (e.g. slurm, sge, pbs, lsf). read Caper's README carefully. 38 | $ caper init [YOUR_BACKEND] 39 | 40 | # IMPORTANT: edit the conf file and follow commented instructions in there 41 | $ vi ~/.caper/default.conf 42 | ``` 43 | 44 | 3) Git clone this pipeline. 45 | ```bash 46 | $ cd 47 | $ git clone https://github.com/broadinstitute/epi-SHARE-seq-pipeline/ #TODO: This should point to the release 48 | ``` 49 | 50 | 4) Define test input JSON. 51 | ```bash 52 | INPUT_JSON="" #TODO: We need a test dataset available for everyone 53 | ``` 54 | 55 | 5) If you have Docker and want to run the pipelines locally on your laptop, `--max-concurrent-tasks 1` limits the number of concurrent tasks to test-run on a laptop. Uncomment if running on a workstation/HPC. 56 | ```bash 57 | # check if Docker works on your machine 58 | $ docker run ubuntu:latest echo hello 59 | 60 | # --max-concurrent-tasks 1 is for computers with limited resources 61 | $ caper run share-seq.wdl -i "${INPUT_JSON}" --docker --max-concurrent-tasks 1 62 | ``` 63 | 64 | 6) Otherwise, install Singularity on your system. Please follow [these instructions](https://neuro.debian.net/install_pkg.html?p=singularity-container) to install Singularity on a Debian-based OS. Or ask your system administrator to install Singularity on your HPC. 
65 | ```bash 66 | # check if Singularity works on your machine 67 | $ singularity exec docker://ubuntu:latest echo hello 68 | 69 | # on your local machine (--max-concurrent-tasks 1 is for computers with limited resources) 70 | $ caper run share-seq.wdl -i "${INPUT_JSON}" --singularity --max-concurrent-tasks 1 71 | 72 | # on HPC, make sure that Caper's conf ~/.caper/default.conf is correctly configured to work with your HPC 73 | # the following command will submit Caper as a leader job to SLURM with Singularity 74 | $ caper hpc submit share-seq.wdl -i "${INPUT_JSON}" --singularity --leader-job-name ANY_GOOD_LEADER_JOB_NAME 75 | 76 | # check job ID and status of your leader jobs 77 | $ caper hpc list 78 | 79 | # cancel the leader node to close all of its children jobs 80 | # If you directly use cluster command like scancel or qdel then 81 | # child jobs will not be terminated 82 | $ caper hpc abort [JOB_ID] 83 | ``` 84 | 85 | ## Input JSON file 86 | 87 | > **IMPORTANT**: DO NOT BLINDLY USE A TEMPLATE/EXAMPLE INPUT JSON. READ THROUGH THE FOLLOWING GUIDE TO MAKE A CORRECT INPUT JSON FILE. 88 | 89 | An input JSON file specifies all of the input parameters and files that are necessary for successfully running this pipeline. This includes a specification of the path to the genome reference files and the raw data FASTQ files. Please make sure to specify absolute paths rather than relative paths in your input JSON files. 90 | 91 | 1) [Input JSON file specification (short)](docs/input_short.md) 92 | 2) [Input JSON file specification (long)](docs/input.md) 93 | 94 | 95 | ## Running on Terra/Anvil (using Dockstore) 96 | 97 | Visit our pipeline repo on [Dockstore](https://dockstore.org/my-workflows/github.com/broadinstitute/epi-SHARE-seq-pipeline/SHARE-seq). Click on `Terra` or `Anvil`. Follow Terra's instructions to create a workspace on Terra and add Terra's billing bot to your Google Cloud account. 98 | 99 | Download this [test input JSON for Terra](we don't have one at the moment), upload it to Terra's UI, and then run the analysis. 100 | 101 | If you would like to use your own input JSON file, make sure that all files in the input JSON are on a Google Cloud Storage bucket (`gs://`). URLs will not work. 102 | 103 | ## How to organize outputs 104 | 105 | Install [Croo](https://github.com/ENCODE-DCC/croo#installation). Make sure that you have python3(> 3.4.1) installed on your system. Find a `metadata.json` on Caper's output directory. 106 | 107 | ```bash 108 | $ pip install croo 109 | $ croo [METADATA_JSON_FILE] 110 | ``` 111 | 112 | ## How to make a spreadsheet of QC metrics 113 | 114 | Install [qc2tsv](https://github.com/ENCODE-DCC/qc2tsv#installation). Make sure that you have Python 3 (>3.4.1) installed on your system. 115 | 116 | Once you have [organized the output with Croo](#how-to-organize-outputs), you will be able to find the pipeline's final output file `qc/qc.json` which contains all the QC metrics. Simply feed `qc2tsv` with multiple `qc.json` files. It can take various URIs such as local paths, `gs://`, and `s3://`. 117 | 118 | ```bash 119 | $ pip install qc2tsv 120 | $ qc2tsv /sample1/qc.json gs://sample2/qc.json s3://sample3/qc.json ... > spreadsheet.tsv 121 | ``` 122 | 123 | QC metrics for each experiment (`qc.json`) will be split into multiple rows (1 for overall experiment + 1 for each bio replicate) in a spreadsheet. 124 | 125 |
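For reference, the flattening step that `qc2tsv` performs can be sketched directly in Python. The snippet below is purely illustrative (it is not part of the pipeline and is not how `qc2tsv` is implemented), assumes the `qc.json` files are available locally, and, unlike `qc2tsv`, writes a single flattened row per file rather than one row per replicate.

```python
#!/usr/bin/env python3
# Illustrative sketch only: flatten one or more local qc.json files into a TSV.
# Not a replacement for qc2tsv; the real qc.json layout may contain more nesting
# than this handles gracefully.
import json
import sys

import pandas as pd

def flatten_qc_jsons(qc_json_paths, out_tsv="spreadsheet.tsv"):
    rows = []
    for path in qc_json_paths:
        with open(path) as handle:
            qc = json.load(handle)
        # json_normalize turns nested keys into dotted column names,
        # giving one row per qc.json file.
        row = pd.json_normalize(qc)
        row.insert(0, "qc_json", path)
        rows.append(row)
    pd.concat(rows, ignore_index=True).to_csv(out_tsv, sep="\t", index=False)

if __name__ == "__main__":
    flatten_qc_jsons(sys.argv[1:])
```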
126 | TODO:\ 127 | Sambamba\ 128 | add track generation \ 129 | 130 | Thank you to the **ENCODE DAC** for writing excellent documentation for their pipelines that we used as templates. 131 | -------------------------------------------------------------------------------- /src/python/joint_cell_plotting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | This script QCs barcodes via ATAC frags & TSS and RNA UMIs & genes, 5 | and plots all barcodes colored by joint QC status. It also generates the 6 | same plot with transparency added to show density. 7 | """ 8 | 9 | import argparse 10 | import logging 11 | import numpy as np 12 | import pandas as pd 13 | from plotnine import * 14 | 15 | def parse_arguments(): 16 | parser = argparse.ArgumentParser(description="Plot barcodes by RNA and ATAC QC status") 17 | parser.add_argument("rna_metrics_file", help="Filename for RNA metrics tsv file") 18 | parser.add_argument("atac_metrics_file", help="Filename for ATAC metrics tsv file") 19 | parser.add_argument("remove_low_yielding_cells", type=int, help="Minimum number of UMIs/fragments required for a cell to be plotted") 20 | parser.add_argument("min_umis", type=int, help="Cutoff for minimum number of UMIs") 21 | parser.add_argument("min_genes", type=int, help="Cutoff for minimum number of genes") 22 | parser.add_argument("min_tss", type=int, help="Cutoff for minimum TSS score") 23 | parser.add_argument("min_frags", type=int, help="Cutoff for minimum number of ATAC fragments") 24 | parser.add_argument("plot_file", help="Filename for plot png file") 25 | parser.add_argument("barcode_metadata_file", help="Filename for barcode metadata csv file") 26 | parser.add_argument("pkr", help="PKR name", nargs='?', default="") 27 | 28 | return parser.parse_args() 29 | 30 | def get_split_lines(file_name, delimiter, skip_header): 31 | with open(file_name, "r") as f: 32 | if skip_header: 33 | next(f) 34 | for line in f: 35 | yield line.rstrip().split(sep=delimiter) 36 | 37 | def merge_dicts(dict_1, dict_2): 38 | """Merge dictionaries by key; combine values into quadruple, fill with 0s if key not in both dicts""" 39 | keys = set(dict_1.keys() | dict_2.keys()) 40 | merged = {k: (dict_1.get(k, (0,0)) + dict_2.get(k, (0,0))) for k in keys} 41 | 42 | return(merged) 43 | 44 | def get_metrics(rna_metrics_file, atac_metrics_file, remove_low_yielding_cells): 45 | """Read files and aggregate metrics into Pandas dataframe""" 46 | rna_metrics_contents = get_split_lines(rna_metrics_file, delimiter="\t", skip_header=True) 47 | umis = [] 48 | genes = [] 49 | rna_barcodes = [] 50 | # remove cells that have fewer than 10 UMIs 51 | for line in rna_metrics_contents: 52 | if int(line[3]) >= remove_low_yielding_cells: 53 | umis.append(int(line[3])) 54 | genes.append(int(line[4])) 55 | rna_barcodes.append(line[0]) 56 | rna_metrics = dict(zip(rna_barcodes, zip(umis, genes))) 57 | 58 | atac_metrics_contents = get_split_lines(atac_metrics_file, delimiter="\t", skip_header=True) 59 | tss = [] 60 | frags = [] 61 | atac_barcodes = [] 62 | # remove cells that have fewer than 10 fragments 63 | for line in atac_metrics_contents: 64 | if int(line[6])/2 >= remove_low_yielding_cells: 65 | tss.append(float(line[4])) 66 | frags.append(int(line[6])/2) 67 | atac_barcodes.append(line[0]) 68 | atac_metrics = dict(zip(atac_barcodes, zip(tss, frags))) 69 | 70 | # merge metrics by barcodes 71 | metrics = merge_dicts(rna_metrics, atac_metrics) 72 | df = pd.DataFrame.from_dict(metrics, 
orient="index", columns=["umis","genes","tss","frags"]) 73 | 74 | return(df) 75 | 76 | def qc_cells(df, min_umis, min_genes, min_tss, min_frags): 77 | pass_umis = df["umis"] >= min_umis 78 | pass_genes = df["genes"] >= min_genes 79 | pass_tss = df["tss"] >= min_tss 80 | pass_frags = df["frags"] >= min_frags 81 | 82 | # add df column with QC outcome 83 | qc_conditions = [(pass_umis & pass_genes & pass_tss & pass_frags), 84 | (pass_umis & pass_genes), 85 | (pass_tss & pass_frags), 86 | (~(pass_umis & pass_genes) & (~(pass_tss & pass_frags)))] 87 | qc_choices = ["both", "RNA only", "ATAC only", "neither"] 88 | df["QC"] = np.select(qc_conditions, qc_choices) 89 | 90 | # get counts of each outcome type (used in plot legend) 91 | outcome_counts = df["QC"].value_counts() 92 | 93 | df["QC_count"] = [f"{outcome} ({outcome_counts[outcome]})" for outcome in df["QC"]] 94 | 95 | return(df) 96 | 97 | def round_to_power_10(x): 98 | return(10**np.ceil(np.log10(x))) 99 | 100 | def label_func(breaks): 101 | return [int(x) for x in breaks] 102 | 103 | def plot_cells(df, pkr, min_umis, min_genes, min_tss, min_frags, plot_file): 104 | # get max x and y coords to set plot limits 105 | max_x = max(df["frags"]) 106 | max_y = max(df["umis"]) 107 | xy_lim = round_to_power_10(max(max_x, max_y)) 108 | 109 | plot = (ggplot(df, aes("frags", "umis", color="QC_count")) 110 | + geom_point(size=0.5) 111 | + labs(title = f"Joint Cell Calling ({pkr})", 112 | caption = f"ATAC cutoffs: TSS ≥ {min_tss}, frags ≥ {min_frags}. RNA cutoffs: UMIs ≥ {min_umis}, genes ≥ {min_genes}", 113 | x = "ATAC Unique Fragments per Barcode", 114 | y = "RNA UMIs per Barcode", 115 | color = "QC") 116 | + theme_light() 117 | + theme(figure_size = (8,6), 118 | title = element_text(size=12), 119 | axis_title = element_text(size=10), 120 | axis_text = element_text(size=8), 121 | legend_box_margin = 0, 122 | legend_title = element_text(size=8), 123 | legend_text = element_text(size=6), 124 | legend_key = element_blank(), 125 | plot_caption=element_text(size=8, ha="center", margin={"r": 3.2, "t": -0.2, "units": "in"}), 126 | panel_grid_minor = element_blank()) 127 | + scale_x_log10(limits=(10,xy_lim), labels=label_func) 128 | + scale_y_log10(limits=(10,xy_lim), labels=label_func) 129 | ) 130 | 131 | plot.save(filename=plot_file, dpi=1000) 132 | 133 | def main(): 134 | # create log file 135 | logging.basicConfig(filename="joint_cell_plotting.log", level=logging.INFO) 136 | 137 | # get arguments 138 | args = parse_arguments() 139 | pkr = getattr(args, "pkr") 140 | rna_metrics_file = getattr(args, "rna_metrics_file") 141 | atac_metrics_file = getattr(args, "atac_metrics_file") 142 | remove_low_yielding_cells = getattr(args, "remove_low_yielding_cells") 143 | barcode_metadata_file = getattr(args, "barcode_metadata_file") 144 | min_umis = getattr(args, "min_umis") 145 | min_genes = getattr(args, "min_genes") 146 | min_tss = getattr(args, "min_tss") 147 | min_frags = getattr(args, "min_frags") 148 | plot_file = getattr(args, "plot_file") 149 | 150 | # read rna and atac files, get cell metrics 151 | logging.info("Getting metrics\n") 152 | metrics_df = get_metrics(rna_metrics_file, atac_metrics_file, remove_low_yielding_cells) 153 | 154 | # QC cells based on inputted cutoffs 155 | logging.info("QCing cells\n") 156 | metrics_df = qc_cells(metrics_df, min_umis, min_genes, min_tss, min_frags) 157 | 158 | # generate plot 159 | logging.info("Generating joint cell calling plot\n") 160 | plot_cells(metrics_df, pkr, min_umis, min_genes, min_tss, min_frags, plot_file) 
161 | 162 | # save dataframe 163 | logging.info("Saving dataframe as csv\n") 164 | metrics_df.to_csv(barcode_metadata_file) 165 | logging.info("All done!") 166 | 167 | 168 | if __name__ == "__main__": 169 | main() 170 | 171 | --------------------------------------------------------------------------------
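As a closing illustration for `joint_cell_plotting.py` above, the `qc_cells()` categorisation can be exercised on a toy dataframe. This is a hedged sketch, not part of the pipeline: the column names (`umis`, `genes`, `tss`, `frags`) come from `get_metrics()`, but the toy values, the cutoffs, and the assumption that the script is importable as a module (with its dependencies numpy, pandas, and plotnine installed) are all illustrative.

```python
# Minimal, illustrative check of the qc_cells() logic defined in joint_cell_plotting.py.
# Assumes joint_cell_plotting.py is on the Python path and its dependencies are installed;
# toy values and cutoffs are arbitrary.
import pandas as pd

from joint_cell_plotting import qc_cells

toy = pd.DataFrame(
    {
        "umis":  [500, 500,   5,   5],
        "genes": [300, 300,   2,   2],
        "tss":   [8.0, 1.0, 8.0, 1.0],
        "frags": [900,  10, 900,  10],
    },
    index=["bc1", "bc2", "bc3", "bc4"],
)

qc = qc_cells(toy, min_umis=100, min_genes=200, min_tss=4, min_frags=100)
print(qc[["QC", "QC_count"]])
# Expected: bc1 -> "both", bc2 -> "RNA only", bc3 -> "ATAC only", bc4 -> "neither"
```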