├── docs
│   ├── input.md
│   └── images
│       └── pipeline_overview.png
├── .gitattributes
├── .dockerignore
├── .gcloudignore
├── src
│   ├── R
│   │   ├── TSSRanges.RData
│   │   ├── joint_cell_plotting_density.R
│   │   ├── atac_qc_plots.R
│   │   ├── cell_annotation_helper_functions.R
│   │   ├── barcode_rank_functions.R
│   │   └── rna_qc_plots.R
│   ├── python
│   │   ├── get_cellxgene_data.py
│   │   ├── qc_atac_count_duplicates_per_barcode.py
│   │   ├── pbc_stats.py
│   │   ├── flexible_import_entities_standard.py
│   │   ├── plot_insert_size_hist.py
│   │   ├── assign_multimappers.py
│   │   ├── barcode_revcomp_detect.py
│   │   ├── write_html.py
│   │   ├── bam_to_fragments.py
│   │   ├── filter_mito_reads.py
│   │   ├── qc_atac_compute_reads_in_peaks.py
│   │   ├── infer_barcodes.py
│   │   ├── generate_h5_rna.py
│   │   ├── rna_barcode_metadata.py
│   │   ├── trim_fastq.py
│   │   ├── match_barcodes.py
│   │   └── joint_cell_plotting.py
│   └── bash
│       └── monitor_script.sh
├── dockerfiles
│   ├── notes-for-bowtie
│   ├── share_task_html_report.dockerfile
│   ├── share_task_generate_h5.dockerfile
│   ├── terra_archr_and_seurat.dockerfile
│   ├── share_task_correct_fastq.dockerfile
│   ├── 10x_task_preprocess.dockerfile
│   ├── share_task_joint_qc.dockerfile
│   ├── share_task_trim_fastqs_atac.dockerfile
│   ├── share_task_qc_rna.dockerfile
│   ├── share_task_seurat.dockerfile
│   ├── share_task_preprocess.dockerfile
│   ├── share_task_merge_bams.dockerfile
│   ├── share_task_archr.dockerfile
│   ├── dorcs_task_find_dorcs.dockerfile
│   ├── share_task_bowtie2.dockerfile
│   ├── share_task_star.dockerfile
│   ├── share_task_cell_annotation.dockerfile
│   ├── share_task_filter_atac.dockerfile
│   └── share_task_qc_atac.dockerfile
├── example_input_json
│   ├── subwf_preprocess.json
│   └── inputs-short-share.json
├── .gitignore
├── tasks
│   ├── raise_exception.wdl
│   ├── share_task_log_atac.wdl
│   ├── 10x_create_barcode_mapping.wdl
│   ├── share_task_log_rna.wdl
│   ├── get_cellxgene_data.wdl
│   ├── share_task_correct_fastq.wdl
│   ├── share_task_trim_fastqs_atac.wdl
│   ├── share_task_generate_h5.wdl
│   ├── share_task_html_report.wdl
│   ├── dorcs_task_find_dorcs.wdl
│   ├── share_task_star.wdl
│   ├── share_task_cell_annotation.wdl
│   ├── share_task_qc_rna.wdl
│   ├── share_task_joint_qc.wdl
│   ├── 10x_task_preprocess.wdl
│   └── share_task_merge_bams.wdl
├── LICENSE
├── .dockstore.yml
├── workflows
│   ├── subwf-cell-annotation.wdl
│   ├── subwf-atac-archr.wdl
│   ├── subwf-rna-seurat.wdl
│   └── subwf-find-dorcs.wdl
├── .vimrc
└── README.md
/docs/input.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | .cache
3 | data
4 | input_examples
5 | tasks
6 | tests
7 | tmp
8 |
--------------------------------------------------------------------------------
/.gcloudignore:
--------------------------------------------------------------------------------
1 | input_examples
2 | LICENSE
3 | README.md
4 | share-seq.wdl
5 | tasks
6 | tests
7 | workflows
8 |
--------------------------------------------------------------------------------
/src/R/TSSRanges.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/epi-SHARE-seq-pipeline/HEAD/src/R/TSSRanges.RData
--------------------------------------------------------------------------------
/dockerfiles/notes-for-bowtie:
--------------------------------------------------------------------------------
1 | https://community.arm.com/developer/tools-software/hpc/b/hpc-blog/posts/tuning-bowtie2-better-performance
2 |
--------------------------------------------------------------------------------
/docs/images/pipeline_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/epi-SHARE-seq-pipeline/HEAD/docs/images/pipeline_overview.png
--------------------------------------------------------------------------------
/example_input_json/subwf_preprocess.json:
--------------------------------------------------------------------------------
1 | {
2 | "wf_preprocess.atac_primers" : "P1.01,P1.02",
3 | "wf_preprocess.rna_primers" : "P1.17,P1.18",
4 | "wf_preprocess.read1" : "other-files-for-testing/Undetermined_S1_R1_001.fastq.gz",
5 | "wf_preprocess.read2" : "other-files-for-testing/Undetermined_S1_R4_001.fastq.gz",
6 | "wf_preprocess.index1" : "other-files-for-testing/Undetermined_S1_R2_001.fastq.gz",
7 | "wf_preprocess.index2" : "other-files-for-testing/Undetermined_S1_R3_001.fastq.gz"
8 |
9 | }
10 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Generated by Cargo
2 | # will have compiled files and executables
3 | /target/
4 |
5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
7 | Cargo.lock
8 |
9 | # These are backup files generated by rustfmt
10 | **/*.rs.bk
11 |
12 | .DS_Store
13 |
14 | .ipynb_checkpoints*
15 |
16 | .dockstore.yml
17 |
18 | src/jupyter_nb/log/
19 | src/jupyter_nb/prefix.rna.cell.annotation.plots.mm10/
20 | build_docker.sh
21 |
--------------------------------------------------------------------------------
/tasks/raise_exception.wdl:
--------------------------------------------------------------------------------
1 | # From https://github.com/ENCODE-DCC/chip-seq-pipeline2/blob/master/chip.wdl
2 |
3 |
4 | task raise_exception {
5 | input {
6 | String msg
7 | Array[String]? vals
8 | }
9 | command {
10 | echo -e "\n* Error: ${msg}\n" >&2
11 | echo -e "* Vals: ${sep=',' vals}\n" >&2
12 | exit 2
13 | }
14 | output {
15 | String error_msg = '${msg}'
16 | }
17 | runtime {
18 | maxRetries : 0
19 | cpu : 1
20 | memory : '2 GB'
21 | time : 1
22 | disks : 'local-disk 10 SSD'
23 | docker : 'encodedcc/chip-seq-pipeline:v2.2.1'
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_html_report.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Ubuntu 18.04.3
4 | ############################################################
5 |
6 | # Set the base image to Ubuntu 18.04.3
7 | #FROM ubuntu:focal
8 | FROM ubuntu@sha256:d1d454df0f579c6be4d8161d227462d69e163a8ff9d20a847533989cf0c94d90
9 |
10 | LABEL maintainer="Neva Durand"
11 |
12 | # To prevent time zone prompt
13 | ENV DEBIAN_FRONTEND=noninteractive
14 |
15 | # Install software from the apt repo
16 | RUN apt-get update && apt-get install -y \
17 | python3 \
18 | && rm -rf /var/lib/apt/lists/*
19 |
20 | # Make directory for all software
21 | RUN mkdir /software
22 | WORKDIR /software
23 | ENV PATH="/software:${PATH}"
24 |
25 | # Copy the external scripts inside
26 | COPY src/python/write_html.py /software
27 |
--------------------------------------------------------------------------------
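Because the dockerfiles COPY scripts from src/, they are presumably meant to be built with the repository root as the build context; a hypothetical build command (the image tag is illustrative, not from this repo) would be:

    docker build -f dockerfiles/share_task_html_report.dockerfile -t share_task_html_report .
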
/dockerfiles/share_task_generate_h5.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM python@sha256:7ad180fdf785219c4a23124e53745fbd683bd6e23d0885e3554aff59eddbc377
7 |
8 | LABEL maintainer = "Eugenio Mattei"
9 | LABEL software = "Share-seq pipeline"
10 | LABEL software.version="1.0.0"
11 | LABEL software.organization="Broad Institute of MIT and Harvard"
12 | LABEL software.version.is-production="Yes"
13 | LABEL software.task="generate_h5"
14 |
15 | # Install python packages
16 | RUN pip install --no-cache-dir h5py scipy
17 |
18 | # Create and setup new user
19 | ENV USER=shareseq
20 | WORKDIR /home/$USER
21 | RUN groupadd -r $USER &&\
22 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
23 | chown $USER:$USER /home/$USER
24 |
25 | # Copy scripts
26 | COPY --chown=$USER:$USER src/python/generate_h5_rna.py /usr/local/bin/
27 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
28 |
29 | USER ${USER}
30 |
--------------------------------------------------------------------------------
/dockerfiles/terra_archr_and_seurat.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for Terra to support ArchR
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM us.gcr.io/broad-dsp-gcr-public/terra-jupyter-r:2.1.3
7 |
8 | LABEL maintainer = "Siddarth Wekhande"
9 | LABEL software = "ArchR on Terra"
10 | LABEL software.version="0.0.1"
11 | LABEL software.organization="Broad Institute of MIT and Harvard"
12 | LABEL software.version.is-production="No"
13 | LABEL software.task="archr"
14 |
15 | USER root
16 |
17 | RUN R --no-echo --no-restore --no-save -e "install.packages(c('hdf5r','remotes'))"
18 |
19 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.1.1')"
20 |
21 | RUN R --no-echo --no-restore --no-save -e "remotes::install_github('GreenleafLab/ArchR@v1.0.1', repos = BiocManager::repositories());ArchR::installExtraPackages()"
22 |
23 | RUN R --no-echo --no-restore --no-save -e "remotes::install_github('immunogenomics/presto')"
24 |
25 | ENV USER jupyter
26 | USER $USER
27 |
28 | ENTRYPOINT ["/bin/bash"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Broad Institute
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/python/get_cellxgene_data.py:
--------------------------------------------------------------------------------
1 | """
2 | This script takes a dataset id as input and downloads an h5ad file
3 | from the cellxgene server using the cellxgene_census API.
4 | """
5 |
6 | import argparse
7 | import logging
8 | import cellxgene_census
9 | import scanpy as sc
10 |
11 | def parse_arguments():
12 | parser = argparse.ArgumentParser(description="Download data from cellxgene server")
13 | parser.add_argument("--id", type=str, required=True,
14 | help="Cellxgene dataset id to download.")
15 | parser.add_argument("--out", type=str, required=True,
16 | help="Output filename", default="reference")
17 |
18 | return parser.parse_args()
19 |
20 |
21 | if __name__ == '__main__':
22 | # create log file
23 | logging.basicConfig(filename="get_cellxgene_data.log", level=logging.INFO)
24 |
25 | # get arguments
26 | args = parse_arguments()
27 |
28 | logging.info("Downloading data\n")
29 | cellxgene_census.download_source_h5ad(
30 | dataset_id=args.id,
31 | to_path=f"{args.out}.h5ad")
32 |
33 | adata = sc.read_h5ad(f"{args.out}.h5ad")
34 |
35 | # get counts
36 | if not adata.raw:
37 | adata.raw = adata.copy()
38 |
39 | adata.write_h5ad(f"{args.out}.h5ad")
40 |
41 | logging.info("All done!")
42 |
--------------------------------------------------------------------------------
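Based on the argparse definition above, a typical invocation would look like the following (the dataset id is the example given in tasks/get_cellxgene_data.wdl; the output name is illustrative). The script writes <out>.h5ad plus a get_cellxgene_data.log file:

    python3 src/python/get_cellxgene_data.py --id 3bbb6cf9-72b9-41be-b568-656de6eb18b5 --out reference
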
/dockerfiles/share_task_correct_fastq.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM python@sha256:7ad180fdf785219c4a23124e53745fbd683bd6e23d0885e3554aff59eddbc377
7 |
8 | LABEL maintainer = "Eugenio Mattei"
9 | LABEL software = "Share-seq pipeline"
10 | LABEL software.version="1.0.0"
11 | LABEL software.organization="Broad Institute of MIT and Harvard"
12 | LABEL software.version.is-production="Yes"
13 | LABEL software.task="correct_fastq"
14 |
15 | # To prevent time zone prompt
16 | ENV DEBIAN_FRONTEND=noninteractive
17 |
18 | # Install software from the apt repo
19 | RUN apt-get update && apt-get install -y \
20 | pigz && \
21 | rm -rf /var/lib/apt/lists/*
22 |
23 | # Install python packages
24 | RUN pip install --no-cache-dir --break-system-packages xopen
25 |
26 | # Create and setup new user
27 | ENV USER=shareseq
28 | WORKDIR /home/$USER
29 | RUN groupadd -r $USER &&\
30 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
31 | chown $USER:$USER /home/$USER
32 |
33 | # Copy scripts
34 | COPY --chown=$USER:$USER src/python/correct_fastq.py /usr/local/bin/
35 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
36 |
37 | USER ${USER}
38 |
--------------------------------------------------------------------------------
/dockerfiles/10x_task_preprocess.dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f
2 |
3 | LABEL maintainer = "Eugenio Mattei"
4 | LABEL software = "Share-seq pipeline"
5 | LABEL software.version="1.0.0"
6 | LABEL software.organization="Broad Institute of MIT and Harvard"
7 | LABEL software.version.is-production="Yes"
8 | LABEL software.task="10x preprocess"
9 |
10 | RUN apt-get update && apt-get install -y \
11 | gcc \
12 | git \
13 | python3 \
14 | python3-dev \
15 | python3-pip \
16 | zlib1g-dev \
17 | wget &&\
18 | rm -rf /var/lib/apt/lists/*
19 |
20 | # Install python3 packages (numpy, pandas, pybind11, matcha)
21 | RUN python3 -m pip install --no-cache-dir --break-system-packages --ignore-installed numpy pandas pybind11 --editable=git+https://github.com/GreenleafLab/matcha.git#egg=matcha
22 |
23 | # Create and setup new user
24 | ENV USER=shareseq
25 | WORKDIR /home/$USER
26 |
27 | RUN groupadd -r $USER &&\
28 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
29 | chown $USER:$USER /home/$USER
30 |
31 | # Add folder with software to the path
32 | ENV PATH="/software:${PATH}"
33 |
34 | # Copy the helper scripts into the image
35 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
36 | COPY --chown=$USER:$USER src/python/barcode_revcomp_detect.py /usr/local/bin
37 | COPY --chown=$USER:$USER src/python/match_barcodes.py /usr/local/bin
38 |
39 | USER ${USER}
40 |
--------------------------------------------------------------------------------
/example_input_json/inputs-short-share.json:
--------------------------------------------------------------------------------
1 | {
2 | "share.chemistry": "String",
3 | "share.read1_atac": "Array[File]",
4 | "share.read2_atac": "Array[File]",
5 | "share.read1_rna": "Array[File]",
6 | "share.read2_rna": "Array[File]",
7 | "share.genome_name_input": "String",
8 | "share.pipeline_type": "['full', 'count_only', 'no-align']",
9 |
10 |
11 | "share.pkr": "String? (optional, default = \"\")",
12 | "share.prefix": "String (optional, default = \"shareseq-project\")",
13 | "share.atac.align_multimappers": "Int? (optional)",
14 | "share.whitelist": "File? (optional)",
15 | "share.atac.barcode_tag_fragments": "String? (optional)",
16 |
17 | "share.trim_fastqs": "Boolean (optional, default = true)",
18 |
19 |
20 | "share.append_comment": "Boolean (optional, default = false)",
21 | "share.fastq_barcode": "Array[File] (optional, default = [])",
22 | "share.preprocess_tenx.barcode_dist": "Int? (optional, default = 2)",
23 | "share.preprocess_tenx.threshold_pct_barcode_matching": "Float? (optional, default = 0.6)",
24 | "share.whitelist_atac": "File? (optional)",
25 | "share.whitelist_rna": "File? (optional)",
26 |
27 |
28 | "share.atac.barcode_tag": "String? (optional, default = \"CB\")",
29 |
30 | "share.atac_genome_index_tar": "File? (optional)",
31 | "share.idx_tar_rna": "File? (optional)",
32 | "share.gtf": "File? (optional)",
33 | "share.tss_bed": "File? (optional)",
34 | "share.peak_set": "File? (optional)",
35 | "share.chrom_sizes": "File? (optional)"
36 |
37 | }
38 |
39 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_joint_qc.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | ############################################################
4 |
5 | #FROM ubuntu@sha256:d1d454df0f579c6be4d8161d227462d69e163a8ff9d20a847533989cf0c94d90
6 | FROM python:3.8-buster@sha256:7e7f4c5508b85268a93b573566c8eb321a6fdb466e3b60c663a42300c73a7400
7 |
8 | LABEL maintainer="Mei Knudson"
9 |
10 | # To prevent time zone prompt
11 | ENV DEBIAN_FRONTEND=noninteractive
12 | ENV SAMTOOLS_VERSION 1.9
13 |
14 | # Install software from the apt repo
15 | RUN apt-get update && apt-get install -y \
16 | r-base &&\
17 | rm -rf /var/lib/apt/lists/*
18 |
19 | # Install packages for python3 scripts
20 | RUN python3 -m pip install matplotlib numpy pandas plotnine
21 |
22 | # Install packages for R scripts
23 | RUN R -e "install.packages(c('ggplot2', 'remotes'))"
24 | RUN R -e "remotes::install_github('LKremer/ggpointdensity')"
25 |
26 | # Create and setup new user
27 | ENV USER=shareseq
28 | WORKDIR /home/$USER
29 | RUN groupadd -r $USER &&\
30 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
31 | chown $USER:$USER /home/$USER
32 |
33 | ENV PYTHONPATH="/usr/local/python:$PYTHONPATH"
34 | ENV R_LIBS_USER=/usr/local/lib/R
35 |
36 | COPY --chown=$USER:$USER src/python/joint_cell_plotting.py /usr/local/bin
37 | COPY --chown=$USER:$USER src/R/joint_cell_plotting_density.R /usr/local/bin
38 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
39 |
40 | USER ${USER}
41 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_trim_fastqs_atac.dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f
2 |
3 | LABEL maintainer = "Eugenio Mattei"
4 | LABEL software = "Share-seq pipeline"
5 | LABEL software.version="1.0.0"
6 | LABEL software.organization="Broad Institute of MIT and Harvard"
7 | LABEL software.version.is-production="Yes"
8 | LABEL software.task="Trim ATAC fastqs"
9 |
10 | # Install software from the apt repo
11 | RUN apt-get update && apt-get install -y \
12 | autoconf \
13 | automake \
14 | binutils \
15 | build-essential \
16 | libcurl4-openssl-dev \
17 | liblz4-dev \
18 | liblzma-dev \
19 | libncurses5-dev \
20 | libbz2-dev \
21 | pigz \
22 | python3-dev \
23 | python3-pip \
24 | wget \
25 | zlib1g-dev &&\
26 | rm -rf /var/lib/apt/lists/*
27 |
28 | # Install python packages
29 | RUN pip install --no-cache-dir --break-system-packages dnaio Levenshtein
30 | # Install fastp
31 | RUN wget http://opengene.org/fastp/fastp.0.20.1 && mv fastp.0.20.1 fastp && chmod a+x ./fastp && mv ./fastp /usr/local/bin
32 |
33 | # Create and setup new user
34 | ENV USER=shareseq
35 | WORKDIR /home/$USER
36 |
37 | RUN groupadd -r $USER &&\
38 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
39 | chown $USER:$USER /home/$USER
40 |
41 | # Add folder with software to the path
42 | ENV PATH="/software:${PATH}"
43 |
44 | # Copy the helper scripts into the image
45 | COPY --chown=$USER:$USER src/python/trim_fastq.py /usr/local/bin
46 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
47 |
48 | USER ${USER}
49 |
--------------------------------------------------------------------------------
/.dockstore.yml:
--------------------------------------------------------------------------------
1 | version: 1.2
2 | workflows:
3 | - name: "SHARE-seq"
4 | subclass: WDL
5 | primaryDescriptorPath: /share-seq.wdl
6 |     filters: # Publish from the branches listed below and from all tags
7 | branches:
8 | - main
9 | - IGVF-variant-jamboree
10 | tags:
11 | - /.*/
12 |
13 | - name: "dorcs-find-dorcs"
14 | subclass: WDL
15 | primaryDescriptorPath: /workflows/subwf-find-dorcs.wdl
16 |     filters: # Publish from the branches listed below and from all tags
17 | branches:
18 | - main
19 | tags:
20 | - /.*/
21 | - name: "SHARE-seq-atac-archr"
22 | subclass: WDL
23 | primaryDescriptorPath: /workflows/subwf-atac-archr.wdl
24 |     filters: # Publish from the branches listed below and from all tags
25 | branches:
26 | - main
27 | - dev
28 | tags:
29 | - /.*/
30 |
31 | - name: "SHARE-seq-rna-seurat"
32 | subclass: WDL
33 | primaryDescriptorPath: /workflows/subwf-rna-seurat.wdl
34 |     filters: # Publish from the branches listed below and from all tags
35 | branches:
36 | - main
37 | - dev
38 | tags:
39 | - /.*/
40 |
41 | - name: "SHARE-seq-sample-demultiplexing"
42 | subclass: WDL
43 | primaryDescriptorPath: /workflows/subwf-preprocess.wdl
44 |     filters: # Publish from the branches listed below and from all tags
45 | branches:
46 | - main
47 | tags:
48 | - /.*/
49 |
50 | - name: "SHARE-seq-cell-annotation"
51 | subclass: WDL
52 | primaryDescriptorPath: /workflows/subwf-cell-annotation.wdl
53 |     filters: # Publish from the branches listed below and from all tags
54 | branches:
55 | - main
56 | - dev
57 | - cell-annotation
58 | tags:
59 | - /.*/
60 |
--------------------------------------------------------------------------------
/src/python/qc_atac_count_duplicates_per_barcode.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Count the number of unique and duplicate reads per barcode using the duplicate flag set by Picard MarkDuplicates
4 |
5 | import argparse
6 | import pysam
7 | import sys
8 |
9 | from collections import defaultdict
10 |
11 | def count_duplicates(in_path, out_path, barcode_tag="CB"):
12 | """
13 | """
14 | # Dictionary holding the unique and duplicate count per barcode
15 | counter = defaultdict(lambda: [0,0])
16 | input = pysam.AlignmentFile(in_path, "rb")
17 | for read in input:
18 | cell_barcode = read.get_tag(barcode_tag)
19 | if read.flag & 1024 == 1024:
20 | counter[cell_barcode][1] += 1
21 | else:
22 | counter[cell_barcode][0] += 1
23 |
24 | with open(out_path, "w") as out_file:
25 | print("barcode\treads_unique\treads_duplicate\tpct_duplicates", file=out_file)
26 | for barcode, counts_vector in counter.items():
27 | print(f"{barcode}\t{counts_vector[0]}\t{counts_vector[1]}\t{round(counts_vector[1]/(counts_vector[0]+counts_vector[1])*100,1)}", file=out_file)
28 |
29 | if __name__ == '__main__':
30 |
31 | msg = "Add the description"
32 | parser = argparse.ArgumentParser(description = msg)
33 |
34 | # Adding optional argument
35 | parser.add_argument("bam_wdup", help = "Path to the coordinate-sorted bam file with duplicates marked but nor removed.")
36 | parser.add_argument("-o", "--output", help = "Path to the fragments output file.")
37 | parser.add_argument("--prefix", help = "Prefix for the metrics output file.")
38 | parser.add_argument("--bc_tag", help = "Specify the tag containing the cell barcode.", default="CB")
39 |
40 | # Read arguments from command line
41 | args = parser.parse_args()
42 |
43 | if args.prefix:
44 | prefix = args.prefix
45 | else:
46 | prefix = args.bam_wdup[:-4]
47 |
48 | if args.output:
49 | out_path = args.output
50 | else:
51 | out_path = f"{prefix}.duplicate.stats.tsv"
52 |
53 | bc_tag = args.bc_tag
54 |
55 |
56 | count_duplicates(args.bam_wdup, out_path, bc_tag)
57 |
--------------------------------------------------------------------------------
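An illustrative invocation, derived from the argparse definition above (file names are hypothetical); the output TSV has one row per barcode with unique, duplicate, and percent-duplicate counts:

    python3 src/python/qc_atac_count_duplicates_per_barcode.py sample.atac.wdup.bam --bc_tag CB --prefix sample -o sample.duplicate.stats.tsv
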
/src/python/pbc_stats.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | # Author Kundaje lab
4 | # https://github.com/kundajelab/ENCODE_scatac/blob/master/workflow/scripts/pbc_stats.py
5 | # Input QNAME sorted
6 |
7 |
8 | def calc_pbc(in_sam, out_path):
9 | total_pairs = 0
10 |     distinct_pairs = -1  # starts at -1 to offset the increment made when the first alignment is processed
11 | one_read_pairs = 0
12 | two_read_pairs = 0
13 |
14 | current_pair = None
15 | current_count = 0
16 |
17 | for al in in_sam:
18 | fields = al.strip().split('\t')
19 | flag = int(fields[1])
20 | rname = fields[2]
21 | pos = int(fields[3])
22 | pnext = int(fields[7])
23 |
24 |         if not (flag & 35 == 35):  # keep only paired, properly paired reads whose mate maps to the reverse strand (flag bits 1 + 2 + 32)
25 | continue
26 |
27 | pair = (rname, pos, pnext)
28 | if pair == current_pair:
29 | total_pairs += 1
30 | current_count += 1
31 | else:
32 | total_pairs += current_count
33 | distinct_pairs += 1
34 | if current_count == 1:
35 | one_read_pairs += 1
36 | elif current_count == 2:
37 | two_read_pairs += 1
38 |
39 | current_pair = pair
40 | current_count = 1
41 |
42 | total_pairs += current_count
43 | distinct_pairs += 1
44 | if current_count == 1:
45 | one_read_pairs += 1
46 | elif current_count == 2:
47 | two_read_pairs += 1
48 |
49 | nrf = distinct_pairs / total_pairs
50 | pbc1 = one_read_pairs / distinct_pairs
51 | pbc2 = one_read_pairs / two_read_pairs
52 |
53 | stats_str = "\t".join(str(i) for i in [
54 | total_pairs,
55 | distinct_pairs,
56 | one_read_pairs,
57 | two_read_pairs,
58 | nrf,
59 | pbc1,
60 | pbc2
61 | ])
62 | descr_str = "\t".join([
63 | "TotalReadPairs",
64 | "DistinctReadPairs",
65 | "OneReadPair",
66 | "TwoReadPairs",
67 | "NRF=Distinct/Total",
68 | "PBC1=OnePair/Distinct",
69 | "PBC2=OnePair/TwoPair"
70 | ])
71 | with open(out_path, 'w') as f:
72 | f.write(f"{descr_str}\n{stats_str}\n")
73 |
74 | if __name__ == "__main__":
75 | qc_path = sys.argv[1]
76 | calc_pbc(sys.stdin, qc_path)
77 |
78 |
--------------------------------------------------------------------------------
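The script reads SAM records on stdin (QNAME-sorted, as noted in the header) and takes the output path as its only argument; one way it could be wired up with samtools (commands are illustrative, not taken from this repo):

    samtools sort -n sample.filtered.bam | samtools view - | python3 src/python/pbc_stats.py sample.pbc_stats.tsv
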
/tasks/share_task_log_atac.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-atac-log
5 | # Gather information from log files
6 |
7 |
8 | task log_atac {
9 | meta {
10 | version: 'v0.1'
11 | author: 'Neva C. Durand (neva@broadinstitute.org) at Broad Institute of MIT and Harvard'
12 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: log atac task'
13 | }
14 |
15 | input {
16 | # This function takes as input the necessary log files and extracts
17 | # the quality metrics
18 | File alignment_log
19 | File dups_log
20 | }
21 |
22 | command <<<
23 | total_reads=$(awk 'NR==1{print $1}' ~{alignment_log})
24 | echo $total_reads > total_reads.txt
25 | aligned_uniquely=$(awk 'NR==4{print $1}' ~{alignment_log})
26 | echo $aligned_uniquely > aligned_uniquely.txt
27 | echo $(($total_reads - $aligned_uniquely)) > unaligned.txt
28 | awk 'NR>1{sum += $2}END{print sum/2}' ~{dups_log} > feature_reads.txt
29 | awk 'NR>1{sum += $3}END{print sum/2}' ~{dups_log} > duplicate_reads.txt
30 | awk 'NR>1{unique+= $2; dups+=$3}END{printf "%5.1f", 100*dups/(unique+dups)}' ~{dups_log} > pct_duplicate_reads.txt
31 | >>>
32 | output {
33 | Int atac_total_reads = read_int("total_reads.txt")
34 | Int atac_aligned_uniquely = read_int("aligned_uniquely.txt")
35 | Int atac_unaligned = read_int("unaligned.txt")
36 | Int atac_feature_reads = read_int("feature_reads.txt")
37 | Int atac_duplicate_reads = read_int("duplicate_reads.txt")
38 | Float atac_pct_dup = read_float("pct_duplicate_reads.txt")
39 | }
40 |
41 | runtime {
42 | docker: 'ubuntu:latest'
43 | }
44 | parameter_meta {
45 | alignment_log: {
46 | description: 'ATAC alignment log file',
47 | help: 'Log file from ATAC alignment step.',
48 | example: 'SS-PKR-30-96-ENTIRE-PLATE.atac.align.hg38.Log.out'
49 | }
50 | dups_log: {
51 | description: 'ATAC dups log file',
52 | help: 'Log file from ATAC rmdups step.',
53 | example: 'SS-PKR-12.atac.counts.mm10.filtered.cs.log'
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/tasks/10x_create_barcode_mapping.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # 10x_barcode_mapping
5 |
6 | task mapping_tenx_barcodes {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: preprocess 10x ATAC data.'
11 | }
12 |
13 | input {
14 |         # This task takes the ATAC and RNA barcode whitelists and builds a barcode conversion dictionary.
15 | File whitelist_atac # Barcode whitelist (chemistry specific)
16 | File whitelist_rna # Barcode whitelist (chemistry specific)
17 |
18 | Int? cpus = 16
19 | Float? disk_factor = 0.5
20 | Float? memory_factor = 0.15
21 | String? docker_image = "debian:bullseye-slim"
22 | }
23 |
24 | # Determine the size of the input
25 | Float input_file_size_gb = size(whitelist_rna, "G") + size(whitelist_atac, "G")
26 |
27 |     # Determining memory size based on the size of the input files.
28 | Float mem_gb = 5.0 + memory_factor * input_file_size_gb
29 |
30 |     # Determining disk size based on the size of the input files.
31 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb)
32 |
33 |     # Determining disk type based on the size of the disk.
34 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
35 |
36 | String barcode_conversion_dict = "barcode_conversion_dict.csv"
37 |
38 | command <<<
39 | set -e
40 |
41 | if [ "$(zcat ~{whitelist_atac} | wc -l)" -eq "$(zcat ~{whitelist_rna} | wc -l)" ]; then
42 | zcat ~{whitelist_atac} | tr ACGTacgt TGCAtgca | rev | paste -d ',' - <(zcat ~{whitelist_rna}) > ~{barcode_conversion_dict}
43 | paste -d ',' <(zcat ~{whitelist_atac}) <(zcat ~{whitelist_rna}) >> ~{barcode_conversion_dict}
44 | fi
45 | >>>
46 |
47 | output {
48 | File? tenx_barcode_conversion_dict = barcode_conversion_dict
49 | }
50 |
51 | runtime {
52 | cpu: cpus
53 | docker: "${docker_image}"
54 | disks: "local-disk ${disk_gb} ${disk_type}"
55 | maxRetries: 1
56 | memory: "${mem_gb} GB"
57 | memory_retry_multiplier: 2
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
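The first branch of the command above reverse-complements each ATAC barcode and pairs it with the RNA barcode on the same line of the whitelist; the core transformation can be checked on its own (the example barcode is made up):

    echo "AAACCCAAGAAACACT" | tr ACGTacgt TGCAtgca | rev
    # prints AGTGTTTCTTGGGTTT, the reverse complement
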
/dockerfiles/share_task_qc_rna.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM r-base@sha256:fff003a52d076e963396876b83cfa88c4f40a8bc27e341339cd3cc0236c1db79
7 |
8 | LABEL maintainer = "Eugenio Mattei"
9 | LABEL software = "Share-seq pipeline"
10 | LABEL software.version="1.0.0"
11 | LABEL software.organization="Broad Institute of MIT and Harvard"
12 | LABEL software.version.is-production="Yes"
13 | LABEL software.task="qc_rna"
14 |
15 | ENV R_VERSION=4.1.2
16 |
17 | # To prevent time zone prompt
18 | ENV DEBIAN_FRONTEND=noninteractive
19 | ENV SAMTOOLS_VERSION 1.9
20 |
21 | # Install software from the apt repo
22 | RUN apt-get update && apt-get install -y \
23 | autoconf \
24 | automake \
25 | binutils \
26 | build-essential \
27 | git \
28 | libcurl4-openssl-dev \
29 | liblz4-dev \
30 | liblzma-dev \
31 | libncurses5-dev \
32 | libbz2-dev \
33 | python3 \
34 | python3-dev \
35 | python3-full \
36 | python3-pip \
37 | wget \
38 | zlib1g-dev &&\
39 | rm -rf /var/lib/apt/lists/*
40 |
41 | # Install samtools 1.9
42 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \
43 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \
44 | cd samtools && make && make install && cd ../ && rm -rf samtools* htslib*
45 |
46 | # Install python packages
47 | RUN pip install --no-cache-dir --break-system-packages pysam
48 |
49 | # Create and setup new user
50 | ENV USER=shareseq
51 | WORKDIR /home/$USER
52 | RUN groupadd -r $USER &&\
53 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
54 | chown $USER:$USER /home/$USER
55 |
56 | ENV R_LIBS_USER=/usr/local/lib/R
57 |
58 | # Copy scripts
59 | COPY --chown=$USER:$USER src/python/rna_barcode_metadata.py /usr/local/bin/
60 | COPY --chown=$USER:$USER src/R/barcode_rank_functions.R /usr/local/bin/
61 | COPY --chown=$USER:$USER src/R/rna_qc_plots.R /usr/local/bin/
62 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
63 |
64 | USER ${USER}
65 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_seurat.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM r-base@sha256:fff003a52d076e963396876b83cfa88c4f40a8bc27e341339cd3cc0236c1db79 as builder
7 |
8 | LABEL maintainer = "Siddarth Wekhande"
9 | LABEL software = "Share-seq pipeline"
10 | LABEL software.version="0.0.1"
11 | LABEL software.organization="Broad Institute of MIT and Harvard"
12 | LABEL software.version.is-production="No"
13 | LABEL software.task="seurat"
14 |
15 | RUN echo "options(repos = 'https://cloud.r-project.org')" > $(R --no-echo --no-save -e "cat(Sys.getenv('R_HOME'))")/etc/Rprofile.site
16 |
17 | ENV R_LIBS_USER=/usr/local/lib/R
18 | ENV RETICULATE_MINICONDA_ENABLED=FALSE
19 |
20 | RUN apt-get update -qq && \
21 | apt-get install -y -qq --no-install-recommends\
22 | binutils \
23 | gtk-doc-tools \
24 | libcairo2-dev \
25 | libcurl4-openssl-dev \
26 | libfreetype-dev \
27 | libfribidi-dev \
28 | libgsl-dev \
29 | libharfbuzz-dev \
30 | libhdf5-dev \
31 | libjpeg-dev \
32 | libmpfr-dev \
33 | libpng-dev \
34 | libssl-dev \
35 | libtiff5-dev \
36 | libxml2-dev \
37 | libxt-dev \
38 | libgeos-dev \
39 | meson \
40 | pkg-config \
41 | python3 \
42 | python3-pip && \
43 | rm -rf /var/lib/apt/lists/*
44 |
45 | ENV USER=shareseq
46 | WORKDIR /home/$USER
47 |
48 | RUN groupadd -r $USER &&\
49 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
50 | chown $USER:$USER /home/$USER
51 |
52 | RUN R --no-echo --no-restore --no-save -e "install.packages(c('hdf5r','remotes','IRkernel','logr','BiocManager'))"
53 |
54 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.3.0')"
55 |
56 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install(c('rhdf5'), update=F, ask=F)"
57 |
58 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
59 |
60 |
61 | RUN python3 -m pip install --break-system-packages jupyter papermill
62 |
63 | COPY src/jupyter_nb/seurat_notebook.ipynb /usr/local/bin/
64 |
65 | RUN R -e "IRkernel::installspec()"
66 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_preprocess.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Ubuntu 18.04.3
4 | ############################################################
5 |
6 | # Set the base image to Ubuntu 18.04.3
7 | #FROM ubuntu:focal
8 | FROM ubuntu@sha256:d1d454df0f579c6be4d8161d227462d69e163a8ff9d20a847533989cf0c94d90
9 |
10 | LABEL maintainer="Neva Durand"
11 |
12 | # To prevent time zone prompt
13 | ENV DEBIAN_FRONTEND=noninteractive
14 |
15 | # Install software from the apt repo
16 | RUN apt-get update && apt-get install -y \
17 | libncurses5-dev libcurl4-openssl-dev zlib1g-dev liblzma-dev libbz2-dev \
18 | python3 python3-setuptools python3-pip \
19 | git wget xmlstarlet \
20 | openjdk-8-jre \
21 | && rm -rf /var/lib/apt/lists/*
22 |
23 | # Make directory for all software
24 | RUN mkdir /software
25 | WORKDIR /software
26 | ENV PATH="/software:${PATH}"
27 |
28 | # Install samtools 1.9
29 | RUN git clone --branch 1.9 --single-branch https://github.com/samtools/samtools.git && \
30 | git clone --branch 1.9 --single-branch https://github.com/samtools/htslib.git && \
31 | cd samtools && make && make install && cd ../ && rm -rf samtools* htslib*
32 |
33 | # Install system/math python packages (python3)
34 | RUN pip3 install --no-cache-dir python-Levenshtein==0.12.2 pysam requests oauth2client
35 |
36 | # Install Picard 2.26.11
37 | RUN wget https://github.com/broadinstitute/picard/releases/download/2.26.11/picard.jar && chmod +x picard.jar
38 |
39 | # Install gsutil
40 | # Downloading gcloud package
41 | RUN wget https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz
42 |
43 | # Installing the package
44 | RUN mkdir -p /usr/local/gcloud \
45 | && gunzip google-cloud-sdk.tar.gz \
46 | && tar -C /usr/local/gcloud -xvf google-cloud-sdk.tar \
47 | && /usr/local/gcloud/google-cloud-sdk/install.sh \
48 | && rm google-cloud-sdk.tar
49 |
50 | # Adding the package path to local
51 | ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin
52 |
53 | # Copy the external scripts inside
54 | COPY src/python/bam_to_raw_fastq.py /software
55 | COPY src/python/flexible_import_entities_standard.py /software
56 | COPY src/python/write_terra_tables.py /software
57 | COPY src/bash/monitor_script.sh /software
58 |
--------------------------------------------------------------------------------
/src/R/joint_cell_plotting_density.R:
--------------------------------------------------------------------------------
1 | library(ggplot2)
2 | library(ggpointdensity)
3 |
4 | args <- commandArgs()
5 | pkr <- args[6]
6 | barcode_metadata_file <- args[7]
7 | plot_file <- args[8]
8 |
9 | options(scipen=999)
10 |
11 | barcode_metadata <- read.csv(barcode_metadata_file)
12 | passing_df <- barcode_metadata[barcode_metadata$QC %in% c("RNA only", "ATAC only", "both"),]
13 |
14 | # get max x and y coords to set plot limits
15 | round_to_power_10 <- function(x){
16 | return(10^ceiling(log10(x)))
17 | }
18 | max_x <- max(passing_df$frags)
19 | max_y <- max(passing_df$umis)
20 | xy_lim <- round_to_power_10(max(max_x, max_y))
21 |
22 | # palette from https://rdrr.io/github/GreenleafLab/ArchR/src/R/ColorPalettes.R
23 | sambaNight <- c("6"='#1873CC',"2"='#1798E5',"8"='#00BFFF',"5"='#4AC596',"1"='#00CC00',"4"='#A2E700',"9"='#FFFF00',"7"='#FFD200',"3"='#FFA500')
24 |
25 | if (sum(barcode_metadata$QC=="both") > 0) {
26 | png(plot_file, width=8.75, height=6, units="in", res=300)
27 |
28 | density_plot <- ggplot(passing_df, aes(x=frags, y=umis)) +
29 | geom_pointdensity(size=0.7) +
30 | scale_color_gradientn(colors=sambaNight) +
31 | labs(title=paste0("Joint Cell Calling (", pkr, "): Density Plot", sep=""),
32 | x="ATAC Unique Fragments per Barcode",
33 | y="RNA UMIs per Barcode") +
34 | theme_light() +
35 | theme(plot.margin=margin(t=9, r=36.5, b=25, l=9, unit="pt"),
36 | plot.title=element_text(size=12.5, hjust=0.5),
37 | axis.title=element_text(size=11),
38 | axis.text=element_text(size=8.5),
39 | legend.title=element_text(size=8),
40 | legend.text=element_text(size=6),
41 | panel.grid.minor=element_blank()) +
42 | scale_x_continuous(trans="log10",
43 | limits=c(10,xy_lim)) +
44 | scale_y_continuous(trans="log10",
45 | limits=c(10,xy_lim))
46 | print(density_plot)
47 | dev.off()
48 | }
49 |
--------------------------------------------------------------------------------
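Because the script indexes commandArgs() at positions 6-8, it expects exactly three trailing arguments when run with Rscript: the PKR label, the joint barcode metadata CSV, and the output PNG. An illustrative call (file names are hypothetical):

    Rscript src/R/joint_cell_plotting_density.R PKR1 sample_joint_barcode_metadata.csv sample_joint_density.png
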
/src/python/flexible_import_entities_standard.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import requests
3 |
4 | from oauth2client.client import GoogleCredentials
5 |
6 | # function to get authorization bearer token for requests
7 | def get_access_token():
8 | """Get access token."""
9 |
10 | scopes = ["https://www.googleapis.com/auth/userinfo.profile", "https://www.googleapis.com/auth/userinfo.email"]
11 | credentials = GoogleCredentials.get_application_default()
12 | credentials = credentials.create_scoped(scopes)
13 | return credentials.get_access_token().access_token
14 |
15 | def call_flexible_import_entities(workspace_name, project, tsv):
16 | """Post entities to Terra workspace using flexibleImportEntities."""
17 |
18 |     # FireCloud API request URL for flexibleImportEntities
19 | uri = f"https://api.firecloud.org/api/workspaces/{project}/{workspace_name}/flexibleImportEntities?async=false&deleteEmptyValues=false"
20 |     # Get access token and add to headers for requests.
21 | # -H "accept: */*" -H "Authorization: Bearer [token] -H "Content-Type: application/json"
22 | headers = {"Authorization": "Bearer " + get_access_token(), "accept": "*/*"}
23 |
24 | # Create file dictionary to be passed to request
25 | files = {'entities': open(tsv ,'rb')}
26 |
27 | # capture response from API and parse out status code
28 | response = requests.post(uri, headers=headers, files=files)
29 | status_code = response.status_code
30 |
31 | if status_code != 200: # entities upsert fail
32 | print(f"ERROR: Code {status_code} returned.")
33 | print(response.text)
34 | print(response.raise_for_status())
35 |
36 | # entities upsert success
37 | print(f"Successfully uploaded entities." + "\n")
38 |
39 | if __name__ == '__main__':
40 | parser = argparse.ArgumentParser(description='')
41 | parser.add_argument('-w', '--workspace_name', required=True, help='name of workspace in which to make changes')
42 | parser.add_argument('-p', '--project', required=True, help='billing project (namespace) of workspace in which to make changes')
43 | parser.add_argument('-t', '--tsv', required=True, help='.tsv file formatted in load format to Terra UI')
44 |
45 | args = parser.parse_args()
46 |
47 | # call import API (firecloud)
48 | call_flexible_import_entities(args.workspace_name, args.project, args.tsv)
--------------------------------------------------------------------------------
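An illustrative invocation, derived from the argparse definition above (the workspace, billing project, and TSV names are hypothetical); the script authenticates with Google application default credentials:

    python3 src/python/flexible_import_entities_standard.py -w my-workspace -p my-billing-project -t sample_entities.tsv
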
/tasks/share_task_log_rna.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-rna-log
5 | # Gather information from log files
6 |
7 |
8 | task log_rna {
9 | meta {
10 | version: 'v0.1'
11 | author: 'Neva C. Durand (neva@broadinstitute.org) at Broad Institute of MIT and Harvard'
12 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: log rna task'
13 | }
14 |
15 | input {
16 | # This function takes as input the necessary log files and extracts
17 | # the quality metrics
18 | File alignment_log
19 | File dups_log
20 | }
21 |
22 | command <<<
23 | total_reads=$(awk -F"|" '$1~/input reads/{print $2}' ~{alignment_log})
24 | echo $total_reads > total_reads.txt
25 | aligned_uniquely=$(awk -F"|" '$1~/Uniquely mapped reads number/{print $2}' ~{alignment_log})
26 | echo $aligned_uniquely > aligned_uniquely.txt
27 | aligned_multimap=$(awk -F"|" '$1~/Number of reads mapped to multiple loci/{print $2}' ~{alignment_log})
28 | echo $aligned_multimap > aligned_multimap.txt
29 | echo $(($total_reads - $aligned_uniquely - $aligned_multimap)) > unaligned.txt
30 | awk -F":" '$1~/total reads/{print $2}' ~{dups_log} > feature_reads.txt
31 | awk -F":" '$1~/duplicate reads/{print $2}' ~{dups_log} > duplicate_reads.txt
32 | >>>
33 | output {
34 | Int rna_total_reads = read_int("total_reads.txt")
35 | Int rna_aligned_uniquely = read_int("aligned_uniquely.txt")
36 | Int rna_aligned_multimap = read_int("aligned_multimap.txt")
37 | Int rna_unaligned = read_int("unaligned.txt")
38 | Int rna_feature_reads = read_int("feature_reads.txt")
39 | Int rna_duplicate_reads = read_int("duplicate_reads.txt")
40 | }
41 |
42 | runtime {
43 | docker: 'ubuntu:latest'
44 | }
45 | parameter_meta {
46 | alignment_log: {
47 | description: 'RNA alignment log file',
48 | help: 'Log file from RNA alignment step.',
49 | example: 'SS-PKR-30-96-ENTIRE-PLATE.rna.align.hg38.Log.out'
50 | }
51 |
52 | dups_log: {
53 | description: 'Group UMI dups log file',
54 | help: 'Log file from group UMI task',
55 | example: 'SS-PKR-30-96-ENTIRE-PLATE.rm_dup_barcode.log.txt'
56 | }
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/src/python/plot_insert_size_hist.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | This script takes in the Picard CollectInsertSizeMetrics histogram txt file output,
5 | and generates the histogram as a png.
6 | """
7 |
8 | import argparse
9 | import pandas as pd
10 | from plotnine import *
11 |
12 | def parse_arguments():
13 | parser = argparse.ArgumentParser(description="Plot insert size histogram")
14 | parser.add_argument("histogram_file", help="Histogram txt file name")
15 | parser.add_argument("pkr", help="PKR ID")
16 | parser.add_argument("out_file", help="Name of output png file")
17 |
18 | return parser.parse_args()
19 |
20 | def get_hist_vals(histogram_file):
21 | """Get dataframe of histogram values"""
22 | with open(histogram_file, "r") as f:
23 | begin_vals = False
24 | insert_size = []
25 | count = []
26 | for line in f:
27 | vals = line.rstrip().split(sep="\t")
28 | if begin_vals and len(vals) == 2: # last line is blank
29 | insert_size.append(int(vals[0]))
30 | count.append(int(vals[1]))
31 | elif vals[0] == "insert_size": # desired values occur after line beginning with "insert_size"
32 | begin_vals = True
33 |
34 | df = pd.DataFrame(list(zip(insert_size, count)), columns=["insert_size","count"])
35 |
36 | return(df)
37 |
38 | def label_func(breaks):
39 | return ["{:.0e}".format(x) for x in breaks]
40 |
41 | def plot_hist(df, pkr, out_file):
42 | plot = (ggplot(df, aes(x="insert_size", y="count")) +
43 | geom_line(color="red") +
44 | geom_area(fill="red") +
45 | labs(title = f"Insert Size Histogram ({pkr})",
46 | x = "Insert size",
47 | y = "Count") +
48 | scale_y_continuous(labels = label_func) +
49 | theme_classic())
50 |
51 | plot.save(filename = out_file, dpi=1000)
52 |
53 | def main():
54 | print("Starting histogram plotting script")
55 | args = parse_arguments()
56 | histogram_file = getattr(args, "histogram_file")
57 | pkr = getattr(args, "pkr")
58 | out_file = getattr(args, "out_file")
59 |
60 | df = get_hist_vals(histogram_file)
61 |
62 | plot_hist(df, pkr, out_file)
63 | print("Finished plotting")
64 |
65 | if __name__ == "__main__":
66 | main()
67 |
--------------------------------------------------------------------------------
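An illustrative invocation, following the positional arguments defined above (file names and PKR label are hypothetical):

    python3 src/python/plot_insert_size_hist.py sample.insert_size_metrics.txt PKR1 sample.insert_size_hist.png
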
/workflows/subwf-cell-annotation.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # Import the tasks called by the pipeline
4 | import "../tasks/share_task_cell_annotation.wdl" as share_task_cell_annotation
5 |
6 | workflow wf_cell_annotation {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Zhijian Li'
10 | affiliation: 'Broad Institute of MIT and Harvard'
11 | email: 'lizhijia@broadinstitute.org'
12 | description: 'SHARE-Seq pipeline: cell type annotation using RNA-seq data.'
13 | }
14 |
15 | input {
16 | # Sample name
17 | String? prefix="prefix"
18 |
19 | # Reference genome
20 | String genome
21 |
22 | # Reference data for cell annotation
23 | String reference_data_id
24 | String reference_data_name
25 | String reference_label
26 |
27 | # Set true if the reference data uses gene id as feature name.
28 | # This is usually true for data downloaded from cellxgene server
29 | String? gene_id_to_symbol = "TRUE"
30 |
31 | # Query data
32 | File query_data
33 |
34 | # Docker images
35 | String? docker_image="lzj1769/cell_annotation"
36 |
37 | # Runtime parameters
38 | Float? memory_factor = 5
39 | Float? disk_factor = 10
40 | }
41 |
42 | call share_task_cell_annotation.cell_annotation as cell_annotation{
43 | input:
44 | reference_data_id = reference_data_id,
45 | reference_data_name = reference_data_name,
46 | reference_label = reference_label,
47 | query_data = query_data,
48 | genome = genome,
49 | gene_id_to_symbol = gene_id_to_symbol,
50 | prefix = prefix,
51 | docker_image = docker_image,
52 | disk_factor = disk_factor,
53 | memory_factor = memory_factor
54 | }
55 |
56 | output {
57 | File share_cell_annotation_reference_h5ad = cell_annotation.reference_h5ad
58 | File share_cell_annotation_notebook_log = cell_annotation.notebook_log
59 | File share_cell_annotation_monitor_log = cell_annotation.monitor_log
60 | File share_cell_annotation_prediction = cell_annotation.prediction
61 | File share_cell_annotation_prediction_labels = cell_annotation.prediction_labels
62 | File share_cell_annotation_prediction_scores = cell_annotation.prediction_scores
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/src/python/assign_multimappers.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import argparse
3 |
4 | """
5 | From https://github.com/ENCODE-DCC/atac-seq-pipeline/blob/master/src/assign_multimappers.py
6 | Script to take multimappers and randomly assign
7 | REQUIRES A QNAME SORTED FILE!
8 | """
9 |
10 | def parse_args():
11 | '''
12 | Gives options
13 | '''
14 |     parser = argparse.ArgumentParser(description='Saves reads below an alignment threshold and discards all others')
15 | parser.add_argument('-k', help='Alignment number cutoff')
16 | parser.add_argument('--paired-end', dest='paired_ended', action='store_true', help='Data is paired-end')
17 | args = parser.parse_args()
18 | alignment_cutoff = int(args.k)
19 | paired_ended = args.paired_ended
20 |
21 | return alignment_cutoff, paired_ended
22 |
23 |
24 | if __name__ == "__main__":
25 | '''
26 | Runs the filtering step of choosing multimapped reads
27 | '''
28 |
29 | [alignment_cutoff, paired_ended] = parse_args()
30 |
31 | if paired_ended:
32 | alignment_cutoff = int(alignment_cutoff) * 2
33 |
34 | # Store each line in sam file as a list of reads,
35 | # where each read is a list of elements to easily
36 | # modify or grab things
37 | current_reads = []
38 | current_qname = ''
39 |
40 | for line in sys.stdin:
41 |
42 | read_elems = line.strip().split('\t')
43 |
44 | if read_elems[0].startswith('@'):
45 | sys.stdout.write(line)
46 | continue
47 |
48 | # Keep taking lines that have the same qname
49 | if read_elems[0] == current_qname:
50 | # Add line to current reads
51 | current_reads.append(line)
52 | pass
53 | else:
54 | # Discard if there are more than the alignment cutoff
55 | if len(current_reads) > alignment_cutoff:
56 | current_reads = [line]
57 | current_qname = read_elems[0]
58 | elif len(current_reads) > 0:
59 | # Just output all reads, which are then filtered with samtools
60 | for read in current_reads:
61 | sys.stdout.write(str(read))
62 |
63 | # And then discard
64 | current_reads = [line]
65 | current_qname = read_elems[0]
66 | else:
67 | # First read in file
68 | current_reads.append(line)
69 | current_qname = read_elems[0]
70 |
--------------------------------------------------------------------------------
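A sketch of how this filter could be placed in a stream, following the ENCODE ATAC convention it was taken from (the samtools commands are illustrative, not part of this repo):

    samtools view -h qname_sorted.bam \
      | python3 src/python/assign_multimappers.py -k 4 --paired-end \
      | samtools view -bS - > multimappers_assigned.bam
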
/src/python/barcode_revcomp_detect.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import sys
3 |
4 | REV_COMP = str.maketrans("ATGC", "TACG")
5 | def reverse_complement(seq):
6 | return str.translate(seq, REV_COMP)[::-1]
7 |
8 | def get_open_fn(path):
9 | with open(path, "rb") as f:
10 | is_gzipped = (f.read(2) == b'\x1f\x8b')
11 | return gzip.open if is_gzipped else open
12 |
13 | def read_barcodes(path):
14 | open_fn = get_open_fn(path)
15 | with open_fn(path, 'rt') as file:
16 | bc = [b.strip() for b in file]
17 | bcrc = [reverse_complement(b) for b in bc]
18 | return set(bc), set(bcrc)
19 |
20 | def bc_detect(fastq, whitelist, out, qc, offset, num_reads=100000, thresh=0.45):
21 | bc, bcrc = read_barcodes(whitelist)
22 |
23 | bc_match = 0
24 | bcrc_match = 0
25 | num_lines = num_reads * 4
26 | with gzip.open(fastq, 'rt') as f:
27 | for lnum, line in enumerate(f):
28 | if lnum >= num_lines:
29 | break
30 | if lnum % 4 != 1:
31 | continue
32 | seq = line.strip()[offset:]
33 | if seq in bc:
34 | bc_match += 1
35 | if seq in bcrc:
36 | bcrc_match += 1
37 |
38 | bc_match_prop = bc_match / num_reads
39 | bcrc_match_prop = bcrc_match / num_reads
40 | valid = (bc_match_prop >= thresh) or (bcrc_match_prop >= thresh)
41 | fc_chosen = (bc_match_prop >= bcrc_match_prop)
42 |
43 | with open(qc, 'w') as f:
44 | f.write(f"Direct match proportion: {bc_match_prop}\n")
45 | f.write(f"Reverse-complement match proportion: {bcrc_match_prop}\n")
46 | f.write(f"Reverse-complement chosen: {not fc_chosen}\n")
47 |
48 | if not valid:
49 | raise ValueError(f"Insufficient barcode match rate: {bc_match_prop}, {bcrc_match_prop}")
50 | with open(out, 'w') as f:
51 | if fc_chosen:
52 | f.write(f"{0}\n")
53 | else:
54 | f.write(f"{1}\n")
55 |
56 | try:
57 | fastq = sys.argv[1]
58 | modality = sys.argv[2]
59 | whitelist = sys.argv[3]
60 |
61 | qc = sys.argv[4]
62 | out = sys.argv[5]
63 | thres = sys.argv[6]
64 |
65 | if modality == "10x":
66 | offset = 0
67 | bc_detect(fastq, whitelist, out, qc, offset, 100000, float(thres))
68 | elif modality == "10x_multiome":
69 | offset = 8
70 | bc_detect(fastq, whitelist, out, qc, offset, 100000, float(thres))
71 |
72 | except NameError:
73 | pass
74 |
--------------------------------------------------------------------------------
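The script is driven by positional sys.argv values: FASTQ, modality (10x or 10x_multiome), whitelist, QC report path, result path, and match threshold. An illustrative call (file names are hypothetical):

    python3 src/python/barcode_revcomp_detect.py sample_R2.fastq.gz 10x atac_whitelist.txt.gz sample_barcode_qc.txt sample_revcomp_flag.txt 0.45
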
/tasks/get_cellxgene_data.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task get_cellxgene_data {
4 | meta {
5 | version: 'v0.1'
6 | author: 'Zhijian Li'
7 | affiliation: 'Broad Institute of MIT and Harvard'
8 | email: 'lizhijia@broadinstitute.org'
9 | description: 'SHARE-Seq pipeline: get data from cellxgene server.'
10 | }
11 |
12 | input {
13 | # Reference data id and name
14 | String reference_data_id
15 | String reference_data_name
16 |
17 | # Docker image
18 | String? docker_image
19 | }
20 |
21 |     # Determining memory size based on the size of the input files.
22 | Float mem_gb = 32.0
23 |
24 |     # Determining disk size based on the size of the input files.
25 | Int disk_gb = 100
26 |
27 |     # Determining disk type based on the size of the disk.
28 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
29 |
30 |     String reference_h5ad = "${reference_data_name}.h5ad"
31 | String monitor_log = "monitoring.log"
32 | String running_log = "get_cellxgene_data.log"
33 |
34 | command {
35 | set -e
36 |
37 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 &
38 |
39 | # Download data from cellxgene
40 |         python3 $(which get_cellxgene_data.py) --id ${reference_data_id} --out ${reference_data_name}
41 |
42 | }
43 |
44 | output {
45 | File reference_h5ad = reference_h5ad
46 | File monitor_log = monitor_log
47 | File running_log = running_log
48 | }
49 |
50 | runtime {
51 | memory : "${mem_gb} GB"
52 | memory_retry_multiplier: 2
53 | disks: "local-disk ${disk_gb} ${disk_type}"
54 | docker : "${docker_image}"
55 | maxRetries:1
56 | }
57 |
58 | parameter_meta {
59 | reference_data_id: {
60 | description: 'Reference dataset id',
61 | help: 'The dataset id from cellxgene server.',
62 | examples: ['3bbb6cf9-72b9-41be-b568-656de6eb18b5']
63 | }
64 |
65 | reference_data_name: {
66 | description: 'Reference dataset name',
67 | help: 'String used to name the reference data.',
68 | examples: ['reference.h5ad']
69 | }
70 |
71 | docker_image: {
72 | description: 'Docker image.',
73 | help: 'Docker image for preprocessing step.',
74 | example: ['put link to gcr or dockerhub']
75 | }
76 |
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/tasks/share_task_correct_fastq.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-correct-fastq
5 |
6 | task share_correct_fastq {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Mei Knudson (mknudson@broadinstitute.org) at Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Correct FASTQs task'
11 | }
12 |
13 | input {
14 | File fastq_R1
15 | File fastq_R2
16 | File whitelist
17 | String sample_type
18 | String? pkr
19 | String? prefix
20 |
21 | Int? cpus = 16
22 | Float? disk_factor = 8.0
23 | Float? memory_factor = 0.08
24 | String? docker_image = "us.gcr.io/buenrostro-share-seq/share_task_correct_fastq:v1.0.0"
25 | }
26 |
27 | # Determine the size of the input
28 | Float input_file_size_gb = size(fastq_R1, "G") + size(fastq_R2, "G")
29 |
30 |     # Determining memory size based on the size of the input files.
31 | Float mem_gb = 16.0 + memory_factor * input_file_size_gb
32 |
33 |     # Determining disk size based on the size of the input files.
34 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb)
35 |
36 |     # Determining disk type based on the size of the disk.
37 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
38 |
39 | String corrected_fastq_R1 = basename(fastq_R1, ".fastq.gz") + "_corrected.fastq"
40 | String corrected_fastq_R2 = basename(fastq_R2, ".fastq.gz") + "_corrected.fastq"
41 | String monitor_log = "correct_fastqs_monitor.log"
42 |
43 | command <<<
44 | set -e
45 |
46 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 &
47 |
48 | # Perform barcode error correction on FASTQs
49 | python3 $(which correct_fastq.py) \
50 | ~{fastq_R1} \
51 | ~{fastq_R2} \
52 | ~{corrected_fastq_R1} \
53 | ~{corrected_fastq_R2} \
54 | ~{whitelist} \
55 | ~{sample_type} \
56 | ~{prefix} \
57 | ~{pkr}
58 |
59 | pigz -p ~{cpus} *.fastq
60 | >>>
61 |
62 | output {
63 | File corrected_fastq_R1 = "~{corrected_fastq_R1}.gz"
64 | File corrected_fastq_R2 = "~{corrected_fastq_R2}.gz"
65 | File barcode_qc = "~{prefix}_barcode_qc.txt"
66 | File monitor_log = "~{monitor_log}"
67 | }
68 |
69 | runtime {
70 | cpu : cpus
71 | memory : "~{mem_gb} GB"
72 | disks: "local-disk ~{disk_gb} ~{disk_type}"
73 | docker : "~{docker_image}"
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/.vimrc:
--------------------------------------------------------------------------------
1 | " Vim syntax file
2 | " Language: Workflow Description Language
3 | " Maintainer: Scott Frazer
4 | " Latest Revision: 21 July 2015
5 | "
6 | if exists("b:current_syn")
7 | finish
8 | endif
9 |
10 | " command { ... } section
11 | syntax region wdlCommandSection start="command\s*{" end="\v\}" contains=wdlCommand,wdlCommandParameter,wdlKeyword,wdlCommandDelimiter
12 | syntax region wdlCommandSection2 start="command\s*<<<" end="\v>>>" contains=wdlCommand,wdlCommandParameter,wdlKeyword,wdlCommandDelimiter
13 | syntax keyword wdlCommandKeyword command contained containedin=wdlCommandSection
14 | syntax match wdlCommand "\zs.\{-}\ze\${" contained containedin=wdlCommandSection
15 | syntax region wdlCommandParameter start=/\v\$\{/ end=/\v\}/ oneline contained containedin=wdlCommandSection contains=wdlType,wdlString,wdlCommandParameterName
16 | syntax match wdlCommandParameterName /\v\zs\w+\ze([\\?\\*\\+]?\})/ contained containedin=wdlCommandParameter
17 |
18 | " Keywords
19 | syntax keyword wdlKeyword workflow task call nextgroup=wdlTaskName
20 | syntax keyword wdlKeyword output scatter if then else runtime
21 | syntax keyword wdlType Boolean Int Float String File Uri nextgroup=wdlIdentifier
22 | syntax keyword wdlImport import
23 |
24 | " Compound Types
25 | syntax region wdlType start=/\(Map\|Array\)\[/ end=/\]/ contains=wdlType nextgroup=wdlIdentifier
26 |
27 | " Identifiers
28 | syntax match wdlIdentifier /\v\s*\w+/ contained
29 | syntax match wdlTaskName /\v\s*\w+/ contained
30 |
31 | " Strings
32 | syntax region wdlString start=/"/ skip=/\\"/ end=/"/ oneline contains=wdlInterpolationWrapper
33 | syntax region wdlInterpolationWrapper start="\v\$\{" end="\v\}" contained containedin=wdlString contains=wdlInterpolatedString
34 | syntax match wdlInterpolatedString "\v\w+" contained containedin=wdlInterpolationWrapper
35 |
36 | " Comments
37 | syntax match wdlComment "\v#.*$"
38 |
39 | highlight link wdlCommandParameter Comment
40 | highlight link wdlKeyword Keyword
41 | highlight link wdlCommandKeyword Keyword
42 | highlight link wdlCommand Punctuation
43 | highlight link wdlTaskName Identifier
44 |
45 | highlight link wdlCommandParameterName Identifier
46 | highlight link wdlIdentifier Identifier
47 | highlight link wdlType Type
48 | highlight link wdlString String
49 | highlight link wdlImport Include
50 | highlight link wdlInterpolationWrapper Include
51 | highlight link wdlInterpolatedString Include
52 | highlight link wdlComment Comment
53 |
54 | setlocal commentstring=#\ %s
55 | " WDL comments start with #, matching the wdlComment pattern above
56 | setlocal tabstop=2
57 | setlocal softtabstop=2
58 | setlocal shiftwidth=2
59 |
60 | au BufRead,BufNewFile *.wdl set filetype=wdl
61 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_merge_bams.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f as builder
7 |
8 | ENV SAMBAMBA_VERSION 0.6.6
9 | ENV PICARD_VERSION 2.27.5
10 |
11 | # To prevent time zone prompt
12 | ENV DEBIAN_FRONTEND=noninteractive
13 |
14 | # Install software from apt repo
15 | RUN apt-get update && apt-get install -y \
16 | autoconf \
17 | build-essential \
18 | git \
19 | libcurl4-openssl-dev \
20 | liblz4-dev \
21 | liblzma-dev \
22 | libncurses5-dev \
23 | libbz2-dev \
24 | python3 \
25 | unzip \
26 | wget \
27 | zlib1g-dev && \
28 | rm -rf /var/lib/apt/lists/*
29 |
30 |
31 | # Make directory for all software
32 | RUN mkdir /software
33 | WORKDIR /software
34 | ENV PATH="/software:${PATH}"
35 |
36 | # Install sambamba 0.6.6
37 | RUN wget https://github.com/lomereiter/sambamba/releases/download/v${SAMBAMBA_VERSION}/sambamba_v${SAMBAMBA_VERSION}_linux.tar.bz2 && \
38 | tar -xvjf sambamba_v${SAMBAMBA_VERSION}_linux.tar.bz2 && \
39 | mv sambamba_v${SAMBAMBA_VERSION} /usr/local/bin/sambamba && \
40 | rm -rf sambamba_*
41 |
42 | # Install Picard 2.27.5
43 | RUN wget https://github.com/broadinstitute/picard/releases/download/${PICARD_VERSION}/picard.jar && chmod +x picard.jar && mv picard.jar /usr/local/bin
44 |
45 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f
46 |
47 | LABEL maintainer = "Eugenio Mattei"
48 | LABEL software = "Share-seq pipeline"
49 | LABEL software.version="1.0.0"
50 | LABEL software.organization="Broad Institute of MIT and Harvard"
51 | LABEL software.version.is-production="Yes"
52 | LABEL software.task="merge"
53 |
54 | RUN apt-get update && apt-get install -y \
55 | openjdk-17-jre && \
56 | rm -rf /var/lib/apt/lists/*
57 |
58 | # Create and setup new user
59 | ENV USER=shareseq
60 | WORKDIR /home/$USER
61 |
62 | RUN groupadd -r $USER &&\
63 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
64 | chown $USER:$USER /home/$USER
65 |
66 | # Add folder with software to the path
67 | ENV PATH="/software:${PATH}"
68 |
69 | # Copy the compiled software from the builder
70 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/
71 | COPY --from=builder --chown=$USER:$USER /lib/x86_64-linux-gnu/* /lib/x86_64-linux-gnu/
72 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
73 |
74 |
75 | USER ${USER}
76 |
77 |
78 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_archr.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM r-base@sha256:fff003a52d076e963396876b83cfa88c4f40a8bc27e341339cd3cc0236c1db79 as builder
7 |
8 | LABEL maintainer = "Siddarth Wekhande"
9 | LABEL software = "Share-seq pipeline"
10 | LABEL software.version="1.0.0"
11 | LABEL software.organization="Broad Institute of MIT and Harvard"
12 | LABEL software.version.is-production="Yes"
13 | LABEL software.task="archr"
14 |
15 | RUN echo "options(repos = 'https://cloud.r-project.org')" > $(R --no-echo --no-save -e "cat(Sys.getenv('R_HOME'))")/etc/Rprofile.site
16 |
17 | ENV R_LIBS_USER=/usr/local/lib/R
18 | ENV RETICULATE_MINICONDA_ENABLED=FALSE
19 |
20 | RUN apt-get update -qq && \
21 | apt-get install -y -qq --no-install-recommends\
22 | binutils \
23 | gtk-doc-tools \
24 | libcairo2-dev \
25 | libcurl4-openssl-dev \
26 | libfreetype-dev \
27 | libfribidi-dev \
28 | libgsl-dev \
29 | libharfbuzz-dev \
30 | libhdf5-dev \
31 | libjpeg-dev \
32 | libmpfr-dev \
33 | libpng-dev \
34 | libssl-dev \
35 | libtiff5-dev \
36 | libxml2-dev \
37 | libxt-dev \
38 | libmagick++-dev \
39 | libgeos-dev \
40 | meson \
41 | python3 \
42 | python3-pip && \
43 | rm -rf /var/lib/apt/lists/*
44 |
45 | ENV USER=shareseq
46 | WORKDIR /home/$USER
47 |
48 | RUN groupadd -r $USER &&\
49 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
50 | chown $USER:$USER /home/$USER
51 |
52 | RUN R --no-echo --no-restore --no-save -e "install.packages(c('devtools','hdf5r','IRkernel','BiocManager','Cairo','magick'))"
53 |
54 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install(c('GenomeInfoDbData','GenomicRanges','Rsamtools'), update=F, ask=F)"
55 |
56 | RUN R --no-echo --no-restore --no-save -e "devtools::install_github('GreenleafLab/ArchR@v1.0.1', repos = BiocManager::repositories());ArchR::installExtraPackages()"
57 |
58 | RUN R --no-echo --no-restore --no-save -e "devtools::install_github('immunogenomics/presto')"
59 |
60 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.3.0')"
61 |
62 |
63 | RUN R --no-echo --no-restore --no-save -e "install.packages(c('logr','hexbin', 'ggpointdensity'))"
64 |
65 | RUN python3 -m pip install --break-system-packages jupyter papermill
66 |
67 | COPY src/jupyter_nb/archr_notebook.ipynb /usr/local/bin/
68 |
69 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
70 |
71 |
72 | RUN R -e "IRkernel::installspec()"
73 |
--------------------------------------------------------------------------------
/workflows/subwf-atac-archr.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # Import the tasks called by the pipeline
4 | import "../tasks/share_task_archr.wdl" as share_task_archr
5 |
6 |
7 | workflow wf_atac {
8 | meta {
9 | version: 'v0.1'
10 | author: 'Eugenio Mattei (emattei@broadinstitute.org) and Sai Ma @ Broad Institute of MIT and Harvard'
11 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Sub-workflow to process the ATAC portion of SHARE-seq libraries.'
12 | }
13 |
14 | input {
15 |         # ATAC sub-workflow inputs
16 | File atac_fragments_filtered
17 | String genome_name
18 | String peak_set
19 | Int? cpus = 4
20 | String? docker
21 | String? prefix
22 | Int? min_tss = 4
23 | Int? min_frags = 1000
24 | Float? archr_disk_factor = 8.0
25 | Float? archr_memory_factor = 4.0
26 | }
27 |
28 | call share_task_archr.archr as archr{
29 | input:
30 | atac_frag = atac_fragments_filtered,
31 | genome = genome_name,
32 | peak_set = peak_set,
33 | min_tss = min_tss,
34 | min_frags = min_frags,
35 | doublet_k = 10,
36 | doublet_knn_method = "UMAP",
37 | lsi_method = 1,
38 | docker_image = docker,
39 | prefix = prefix,
40 | disk_factor = archr_disk_factor,
41 | memory_factor = archr_memory_factor
42 | }
43 |
44 | output {
45 | File share_atac_archr_notebook_output = archr.notebook_output
46 | File share_atac_archr_notebook_log = archr.notebook_log
47 |
48 | File? share_atac_archr_raw_tss_enrichment = archr.archr_raw_tss_by_uniq_frags_plot
49 | File? share_atac_archr_filtered_tss_enrichment = archr.archr_filtered_tss_by_uniq_frags_plot
50 | File? share_atac_archr_raw_fragment_size_plot = archr.archr_raw_frag_size_dist_plot
51 | File? share_atac_archr_filtered_fragment_size_plot = archr.archr_filtered_frag_size_dist_plot
52 |
53 | File? share_atac_archr_umap_doublets = archr.archr_umap_doublets
54 | File? share_atac_archr_umap_cluster_plot = archr.archr_umap_cluster_plot
55 | File? share_atac_archr_umap_num_frags_plot = archr.archr_umap_num_frags_plot
56 | File? share_atac_archr_umap_tss_score_plot = archr.archr_umap_tss_score_plot
57 | File? share_atac_archr_umap_frip_plot = archr.archr_umap_frip_plot
58 |
59 | File? share_atac_archr_gene_heatmap_plot = archr.archr_heatmap_plot
60 | File? share_atac_archr_arrow = archr.archr_arrow
61 | File? share_atac_archr_obj = archr.archr_raw_obj
62 | File? share_atac_archr_plots_zip = archr.plots_zip
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/dockerfiles/dorcs_task_find_dorcs.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM r-base@sha256:fff003a52d076e963396876b83cfa88c4f40a8bc27e341339cd3cc0236c1db79 as builder
7 |
8 | RUN echo "options(repos = 'https://cloud.r-project.org')" > $(R --no-echo --no-save -e "cat(Sys.getenv('R_HOME'))")/etc/Rprofile.site
9 |
10 | ENV R_LIBS_USER=/usr/local/lib/R
11 | ENV RETICULATE_MINICONDA_ENABLED=FALSE
12 |
13 | RUN apt-get update -qq && \
14 | apt-get install -y --no-install-recommends \
15 | binutils \
16 | gtk-doc-tools \
17 | libcairo2-dev \
18 | libcurl4-openssl-dev \
19 | libfreetype-dev \
20 | libfribidi-dev \
21 | libgsl-dev \
22 | libharfbuzz-dev \
23 | libhdf5-dev \
24 | libjpeg-dev \
25 | libmpfr-dev \
26 | libpng-dev \
27 | libssl-dev \
28 | libtiff5-dev \
29 | libxml2-dev \
30 | libxt-dev \
31 | libmagick++-dev \
32 | libgeos-dev \
33 | meson \
34 | python3 \
35 | python3-pip && \
36 | rm -rf /var/lib/apt/lists/*
37 |
38 | RUN R --no-echo --no-restore --no-save -e "install.packages(c('dplyr','patchwork','ggplot2','ggrepel','reshape2','circlize','networkD3','GGally','igraph','network','foreach','iterators','hdf5r','ggrastr','BiocManager','remotes','pbmcapply','doSNOW','Rmpfr', 'glue','magrittr','pillar','RcppArmadillo','reticulate','rlang','yaml','rpart','IRkernel','data.table', 'tidyft','qlcMatrix','logr'))"
39 |
40 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.1.1')"
41 |
42 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install(c('Biostrings','rtracklayer','GenomicRanges','motifmatchr','ComplexHeatmap','chromVAR'), update=T, ask=F)"
43 |
44 | RUN R --no-echo --no-restore --no-save -e "remotes::install_github('caleblareau/BuenColors')"
45 |
46 | ENV USER=shareseq
47 | WORKDIR /home/$USER
48 | RUN groupadd -r $USER &&\
49 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
50 | chown $USER:$USER /home/$USER
51 |
52 | RUN python3 -m pip install --break-system-packages jupyter papermill
53 |
54 | RUN chown $USER:$USER /usr/local/lib/R
55 |
56 | COPY --chown=$USER:$USER src/jupyter_nb/dorcs_jplot_notebook.ipynb /usr/local/bin/
57 |
58 | #COPY --chown=$USER:$USER src/jupyter_nb/dorcs_notebook_rds.ipynb /usr/local/bin/
59 |
60 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
61 |
62 |
63 | RUN mkdir -p /home/R/
64 |
65 | COPY --chown=$USER:$USER src/R/DORCS_helper_functions_optimized.R src/R/TSSRanges.RData /home/R/
66 |
67 | USER ${USER}
68 |
69 | RUN R -e "IRkernel::installspec()"
70 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_bowtie2.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM debian:buster-slim as builder
7 |
8 | ENV BOWTIE2_VERSION 2.4.3
9 | ENV SAMTOOLS_VERSION 1.9
10 |
11 | # To prevent time zone prompt
12 | ENV DEBIAN_FRONTEND=noninteractive
13 |
14 | # Install software from apt repo
15 | RUN apt-get update && apt-get install -y \
16 | build-essential \
17 | cpanminus \
18 | git \
19 | liblz4-dev \
20 | liblzma-dev \
21 | libncurses5-dev \
22 | libbz2-dev \
23 | unzip \
24 | wget \
25 | zlib1g-dev &&\
26 | rm -rf /var/lib/apt/lists/*
27 |
28 |
29 | # Make directory for all software
30 | RUN mkdir /software
31 | WORKDIR /software
32 | ENV PATH="/software:${PATH}"
33 |
34 | RUN cpanm Sys::Hostname
35 |
36 | # Install Bowtie2 2.4.3
37 | RUN wget https://sourceforge.net/projects/bowtie-bio/files/bowtie2/${BOWTIE2_VERSION}/bowtie2-${BOWTIE2_VERSION}-source.zip && \
38 | unzip bowtie2-${BOWTIE2_VERSION}-source.zip && cd bowtie2-${BOWTIE2_VERSION} && make static-libs && make STATIC_BUILD=1 && \
39 | cp bowtie2* .. && \
40 | cd .. && rm -rf bowtie2-${BOWTIE2_VERSION}*
41 |
42 | # Install samtools 1.9
43 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \
44 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \
45 | cd samtools && make && make install && cd ../ && rm -rf samtools* htslib*
46 |
47 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f
48 |
49 | LABEL maintainer = "Eugenio Mattei"
50 | LABEL software = "Share-seq pipeline"
51 | LABEL software.version="1.0.0"
52 | LABEL software.organization="Broad Institute of MIT and Harvard"
53 | LABEL software.version.is-production="Yes"
54 | LABEL software.task="Bowtie2"
55 |
56 | RUN apt-get update && apt-get install -y \
57 | cpanminus && \
58 | rm -rf /var/lib/apt/lists/*
59 |
60 | # Create and setup new user
61 | ENV USER=shareseq
62 | WORKDIR /home/$USER
63 |
64 | RUN groupadd -r $USER &&\
65 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
66 | chown $USER:$USER /home/$USER
67 |
68 | # Add folder with software to the path
69 | ENV PATH="/software:${PATH}"
70 |
71 | # Copy the compiled software from the builder
72 | COPY --from=builder --chown=$USER:$USER /software/bowtie2* /software/
73 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/
74 | COPY --from=builder --chown=$USER:$USER /lib/x86_64-linux-gnu/* /lib/x86_64-linux-gnu/
75 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
76 |
77 |
78 |
79 |
80 | USER $USER
81 |
--------------------------------------------------------------------------------
/src/python/write_html.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Write output HTML file from list of images and text
4 |
5 | @author Neva Durand (c) 2021
6 | """
7 | import argparse
8 | import base64
9 | import io
10 | import os.path
11 |
12 | def main(output_file_name, image_file_list, log_file_list, input_file_name=None):
13 | """
14 | Write to the input file
15 | Image file list is list of png images
16 | Log file list is list of text log files to link to
17 |
18 | Separates images by br tag and encodes directly in utf-8
19 | Log files separated by their title and encoded via pre tag
20 | """
21 | # Open output file, write input if exists
22 | output_file = io.open(output_file_name, 'w', encoding='utf8')
23 |     output_file.write('<html><body><h2>Results summary</h2>')
24 | if input_file_name is not None:
25 | with open(input_file_name) as input_file:
26 | output_file.write(input_file.read())
27 |
28 | with open(image_file_list) as fname:
29 | images = fname.read().splitlines()
30 |
31 | # loop through images in image list and encode
32 |     output_file.write('<br>')
33 | for image in images:
34 | data = open(image, 'rb').read() # read bytes from file
35 | data_base64 = base64.b64encode(data) # encode to base64 (bytes)
36 | data_base64 = data_base64.decode('utf-8') # convert bytes to string
37 |         output_file.write('<img src="data:image/png;base64,' + data_base64 + '"><br>') # embed in html
38 |
39 | with open(log_file_list) as fname:
40 | logs = fname.read().splitlines()
41 |
42 | # loop through log files in log list and write
43 | for log in logs:
44 | output_file.write(log)
45 |         output_file.write("<br>")
46 |     output_file.write('</body></html>')
47 | output_file.close()
48 |
49 | if __name__ == '__main__':
50 | parser = argparse.ArgumentParser(
51 | formatter_class=argparse.RawDescriptionHelpFormatter,
52 | description=__doc__.split('\n\n\n')[0])
53 | group = parser.add_argument_group()
54 | group.add_argument('output_file_name',
55 | help='html file to write to')
56 | group.add_argument('image_file_list',
57 | help='file containing list of image files to paste in HTML file')
58 | group.add_argument('log_file_list',
59 | help='file containing list of text log files to append to end of HTML file')
60 | group.add_argument('--input_file_name',
61 | help='optional file with html text to add at top of file', nargs='?')
62 | args = parser.parse_args()
63 | main(args.output_file_name, args.image_file_list, args.log_file_list, args.input_file_name)
64 |
65 |
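66 | # Usage sketch (hypothetical report name): image_list.txt and log_list.txt each hold one path per line,
67 | # matching how the html_report WDL task invokes this script:
68 | #   PYTHONIOENCODING=utf-8 python3 write_html.py report.html image_list.txt log_list.txt --input_file_name output.txt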
--------------------------------------------------------------------------------
/src/python/bam_to_fragments.py:
--------------------------------------------------------------------------------
1 | # From Kundaje lab
2 | # https://github.com/kundajelab/ENCODE_scatac/blob/master/workflow/scripts/bam_to_fragments.py
3 |
4 | import argparse
5 | import pysam
6 | import sys
7 |
8 | def bam_to_frag(in_path, out_path, barcode_tag="CB", shift_plus=4, shift_minus=-4):
9 | """
10 | Convert coordinate-sorted BAM file to a fragment file format, while adding Tn5 coordinate adjustment
11 | BAM should be pre-filtered for PCR duplicates, secondary alignments, and unpaired reads
12 | Output fragment file is sorted by chr, start, end, barcode
13 | """
14 |
15 | input = pysam.AlignmentFile(in_path, "rb")
16 | with open(out_path, "w") as out_file:
17 | buf = []
18 | curr_pos = None
19 | for read in input:
20 | if read.flag & 16 == 16:
21 | continue # ignore reverse (coordinate-wise second) read in pair
22 |
23 | chromosome = read.reference_name
24 | start = read.reference_start + shift_plus
25 | end = read.reference_start + read.template_length + shift_minus
26 | cell_barcode = read.get_tag(barcode_tag)
27 | # assert(read.next_reference_start >= read.reference_start) ####
28 | data = (chromosome, start, end, cell_barcode, 1)
29 | pos = (chromosome, start)
30 |
31 | if pos == curr_pos:
32 | buf.append(data)
33 | else:
34 | buf.sort()
35 | for i in buf:
36 | print(*i, sep="\t", file=out_file)
37 | buf.clear()
38 | buf.append(data)
39 |                 curr_pos = pos
40 |
41 |         # Flush any fragments still buffered at the last position so they are not dropped
42 |         buf.sort()
43 |         for i in buf:
44 |             print(*i, sep="\t", file=out_file)
45 |
41 | if __name__ == '__main__':
42 |
43 |     msg = "Convert a coordinate-sorted BAM file into a Tn5 shift-corrected fragment file."
44 | parser = argparse.ArgumentParser(description = msg)
45 |
46 |     # Define command-line arguments
47 | parser.add_argument("bam", help = "Path to the coordinate-sorted bam file.")
48 | parser.add_argument("-o", "--output", help = "Path to the fragments output file.")
49 | parser.add_argument("--prefix", help = "Prefix for the metrics output file.")
50 | parser.add_argument("--shift_plus", help = "Tn5 coordinate adjustment for the plus strand.", type = int, default = 4)
51 | parser.add_argument("--shift_minus", help = "Tn5 coordinate adjustment for the minus strand.", type = int, default = -4)
52 | parser.add_argument("--bc_tag", help = "Specify the tag containing the cell barcode.", default="CB")
53 |
54 | # Read arguments from command line
55 | args = parser.parse_args()
56 |
57 | if args.prefix:
58 | prefix = args.prefix
59 | else:
60 | prefix = args.bam[:-4]
61 |
62 | if args.output:
63 | out_path = args.output
64 | else:
65 | out_path = f"{prefix}.fragments.tsv"
66 |
67 | bc_tag = args.bc_tag
68 |
69 |
70 | bam_to_frag(args.bam, out_path, bc_tag, shift_plus=args.shift_plus, shift_minus=args.shift_minus)
71 |
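72 | # Usage sketch (hypothetical file names; the BAM should be coordinate-sorted and duplicate-filtered):
73 | #   python3 bam_to_fragments.py sample.filtered.sorted.bam -o sample.fragments.tsv --bc_tag CB --shift_plus 4 --shift_minus -4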
--------------------------------------------------------------------------------
/dockerfiles/share_task_star.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f as builder
7 |
8 | ENV STAR_VERSION 2.7.10a_alpha_220818
9 | ENV SAMTOOLS_VERSION 1.9
10 |
11 | # To prevent time zone prompt
12 | ENV DEBIAN_FRONTEND=noninteractive
13 |
14 | # Install software from apt repo
15 | RUN apt-get update && apt-get install -y \
16 | build-essential \
17 | git \
18 | liblz4-dev \
19 | liblzma-dev \
20 | libncurses5-dev \
21 | libbz2-dev \
22 | unzip \
23 | wget \
24 | zlib1g-dev &&\
25 | rm -rf /var/lib/apt/lists/*
26 |
27 |
28 | # Make directory for all software
29 | RUN mkdir /software
30 | WORKDIR /software
31 | ENV PATH="/software:${PATH}"
32 |
33 | # Install STAR 2.7.10a
34 | RUN wget https://github.com/alexdobin/STAR/releases/download/2.7.10a_alpha_220818/STAR_2.7.10a_alpha_220818_Linux_x86_64_static.zip && unzip STAR_2.7.10a_alpha_220818_Linux_x86_64_static.zip && mv STAR /usr/local/bin/
35 | #RUN wget https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.tar.gz && tar -xzf ${STAR_VERSION}.tar.gz
36 | #RUN cd STAR-${STAR_VERSION}/source && make STAR && rm ../../${STAR_VERSION}.tar.gz && mv /software/STAR-${STAR_VERSION}/bin/Linux_x86_64/* /usr/local/bin/
37 |
38 | # Install samtools 1.9
39 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \
40 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \
41 | cd samtools && make && make install && cd ../ && rm -rf samtools* htslib*
42 |
43 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f
44 |
45 | LABEL maintainer = "Eugenio Mattei"
46 | LABEL software = "Share-seq pipeline"
47 | LABEL software.version="1.0.0"
48 | LABEL software.organization="Broad Institute of MIT and Harvard"
49 | LABEL software.version.is-production="Yes"
50 | LABEL software.task="STAR"
51 |
52 | RUN apt-get update && apt-get install -y \
53 | libgc-dev &&\
54 | rm -rf /var/lib/apt/lists/*
55 |
56 | # Create and setup new user
57 | ENV USER=shareseq
58 | WORKDIR /home/$USER
59 |
60 | RUN groupadd -r $USER &&\
61 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
62 | chown $USER:$USER /home/$USER
63 |
64 | # Add folder with software to the path
65 | ENV PATH="/software:${PATH}"
66 |
67 | # Copy the compiled software from the builder
68 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/
69 | COPY --from=builder --chown=$USER:$USER /usr/lib/x86_64-linux-gnu/libgomp.so.1 /lib/x86_64-linux-gnu/libncurses.so.6 /lib/x86_64-linux-gnu/
70 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
71 |
72 | USER $USER
73 |
--------------------------------------------------------------------------------
/workflows/subwf-rna-seurat.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # Import the tasks called by the pipeline
4 | import "../tasks/share_task_seurat.wdl" as share_task_seurat
5 |
6 | workflow wf_rna {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) and Sai Ma @ Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Sub-workflow to process the RNA portion of SHARE-seq libraries.'
11 | }
12 |
13 | input {
14 | # RNA Seurat inputs
15 |
16 | String prefix
17 | String genome_name
18 | String? docker
19 | File h5_matrix
20 |
21 | #Seurat filtering parameters
22 | Int? min_features
23 | Float? percent_mt
24 | Int? min_cells
25 |
26 | # Seurat UMAP parameters
27 | Int? umap_dim
28 | Float? umap_resolution
29 |
30 | #Seurat runtime parameters
31 | Float? disk_factor
32 | Float? memory_factor
33 | }
34 |
35 | call share_task_seurat.seurat as seurat{
36 | input:
37 | rna_matrix = h5_matrix,
38 | genome_name = genome_name,
39 | min_features = min_features,
40 | percent_mt = percent_mt,
41 | min_cells = min_cells,
42 | umap_dim = umap_dim,
43 | umap_resolution = umap_resolution,
44 | prefix = prefix,
45 | docker_image = docker,
46 | disk_factor = disk_factor,
47 | memory_factor = memory_factor
48 | }
49 |
50 | output {
51 | File share_rna_seurat_notebook_output = seurat.notebook_output
52 | File share_rna_seurat_notebook_log = seurat.notebook_log
53 | File? share_rna_seurat_raw_violin_plot = seurat.seurat_raw_violin_plot
54 | File? share_rna_seurat_filtered_violin_plot = seurat.seurat_filtered_violin_plot
55 | File? share_rna_seurat_raw_qc_scatter_plot = seurat.seurat_raw_qc_scatter_plot
56 | File? share_rna_seurat_filtered_qc_scatter_plot = seurat.seurat_filtered_qc_scatter_plot
57 | File? share_rna_seurat_variable_genes_plot = seurat.seurat_variable_genes_plot
58 | File? share_rna_seurat_PCA_dim_loadings_plot = seurat.seurat_PCA_dim_loadings_plot
59 | File? share_rna_seurat_PCA_plot = seurat.seurat_PCA_plot
60 | File? share_rna_seurat_heatmap_plot = seurat.seurat_heatmap_plot
61 | File? share_rna_seurat_jackstraw_plot = seurat.seurat_jackstraw_plot
62 | File? share_rna_seurat_elbow_plot = seurat.seurat_elbow_plot
63 | File? share_rna_seurat_umap_cluster_plot = seurat.seurat_umap_cluster_plot
64 | File? share_rna_seurat_umap_rna_count_plot = seurat.seurat_umap_rna_count_plot
65 | File? share_rna_seurat_umap_gene_count_plot = seurat.seurat_umap_gene_count_plot
66 | File? share_rna_seurat_umap_mito_plot = seurat.seurat_umap_mito_plot
67 | File? share_rna_seurat_obj = seurat.seurat_filtered_obj
68 | File? share_rna_plots_zip = seurat.plots_zip
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/src/R/atac_qc_plots.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/Rscript
2 |
3 | ### Takes ATAC barcode metadata tsv file, and outputs barcode rank plots as a png file.
4 |
5 | ## Import helper functions
6 | source("/usr/local/bin/barcode_rank_functions.R")
7 |
8 | ## Get arguments, read input
9 | args <- commandArgs()
10 |
11 | barcode_metadata_file <- args[6]
12 | fragment_cutoff <- as.integer(args[7])
13 | fragment_rank_plot_file <- args[8]
14 |
15 | barcode_metadata <- read.table(barcode_metadata_file, header=T)
16 |
17 | ## Get plot inputs
18 |
19 | # Impose fragment cutoff, sort in decreasing order, assign rank
20 | # 1 fragment = 2 reads
21 | fragment <- barcode_metadata$reads_unique / 2
22 | fragment_filtered <- fragment[fragment >= fragment_cutoff]
23 | fragment_filtered_sort <- sort(fragment_filtered, decreasing=T)
24 | fragment_rank <- 1:length(fragment_filtered_sort)
25 |
26 | # Find elbow/knee of fragment barcode rank plot and top-ranked fragment barcode rank plot
27 | fragment_points <- get_elbow_knee_points(x=fragment_rank, y=log10(fragment_filtered_sort))
28 | # For each valid plot, make factor for coloring plot points
29 | if (length(fragment_points) > 0) { # Elbow found in first plot
30 | fragment_plot1 <- TRUE
31 | is_top_ranked_fragment <- factor(ifelse(fragment_rank <= fragment_points[1], 1, 0))
32 | if (length(fragment_points) > 2) { # Elbow/knee found in second plot
33 | fragment_plot2 <- TRUE
34 | fragment_top_rank <- fragment_rank[1:fragment_points[1]]
35 | fragment_top_fragment <- fragment_filtered_sort[1:fragment_points[1]]
36 | is_top_top_ranked_fragment <- factor(ifelse(fragment_top_rank <= fragment_points[3], 1, 0))
37 | } else {
38 | fragment_plot2 <- FALSE
39 | }
40 | } else {
41 |   fragment_plot1 <- fragment_plot2 <- FALSE # no elbow found; skip both plots
42 | }
43 |
44 | ## Generate plots
45 |
46 | options(scipen=999)
47 |
48 | # Make fragment barcode rank plots
49 | png(fragment_rank_plot_file, width=8, height=8, units='in', res=300)
50 | par(mfrow = c(2,1))
51 |
52 | # Plot 1 (all barcodes passing fragment filter vs log10(fragments))
53 | if (fragment_plot1) {
54 | plot(x=fragment_rank,
55 | y=fragment_filtered_sort,
56 | log="y",
57 | xlab=paste0(" Barcode rank (", length(fragment_rank)-fragment_points[1], " low quality cells)"),
58 | ylab="Fragments per barcode (log10 scale)",
59 | main="ATAC Fragments per Barcode",
60 | col=c("dimgrey","darkblue")[is_top_ranked_fragment],
61 | pch=16,
62 | ylim=c(1,100000))
63 | abline(v=fragment_points[1], h=10^(fragment_points[2]))
64 | text(fragment_points[1], 10^(fragment_points[2]),
65 | paste0("(", fragment_points[1], ", ", 10^(fragment_points[2]), ")"),
66 | adj=c(-0.1,-0.5))
67 | }
68 |
69 | # Plot 2 (top ranked barcodes vs log10(fragments))
70 | if (fragment_plot2) {
71 | plot(x=fragment_top_rank,
72 | y=fragment_top_fragment,
73 | log="y",
74 | xlab="Barcode rank",
75 | ylab="Fragments per barcode (log10 scale)",
76 | main="ATAC Fragments per Top-Ranked Barcode",
77 | col=c("dimgrey","darkblue")[is_top_top_ranked_fragment],
78 | pch=16,
79 | ylim=c(1,100000))
80 | abline(v=fragment_points[3], h=10^(fragment_points[4]))
81 | text(fragment_points[3], 10^(fragment_points[4]),
82 | paste("(", fragment_points[3], ", ", 10^(fragment_points[4]), ")", sep=""),
83 | adj=c(-0.1,-0.5))
84 | }
85 | dev.off()
86 |
87 |
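88 | ## Usage sketch (hypothetical arguments): with a plain Rscript invocation, commandArgs() places the
89 | ## first user argument at index 6, which is why the script reads args[6:8] above.
90 | ##   Rscript atac_qc_plots.R barcode_metadata.tsv 10 atac_fragment_rank_plots.png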
--------------------------------------------------------------------------------
/dockerfiles/share_task_cell_annotation.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM ubuntu@sha256:2fdb1cf4995abb74c035e5f520c0f3a46f12b3377a59e86ecca66d8606ad64f9
7 |
8 | LABEL maintainer = "Zhijian Li"
9 | LABEL software = "Share-seq pipeline"
10 | LABEL software.version="0.0.1"
11 | LABEL software.organization="Broad Institute of MIT and Harvard"
12 | LABEL software.version.is-production="No"
13 | LABEL software.task="cell-annotation"
14 |
15 | # To prevent time zone prompt
16 | ENV DEBIAN_FRONTEND=noninteractive
17 | ENV RETICULATE_MINICONDA_ENABLED=FALSE
18 |
19 | ## Create new user
20 | ENV USER=shareseq
21 | WORKDIR /home/$USER
22 | RUN groupadd -r $USER && \
23 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
24 | chown $USER:$USER /home/$USER
25 |
26 | # Install libraries
27 | RUN apt-get update
28 | RUN apt-get install -y --no-install-recommends \
29 | gcc \
30 | g++ \
31 | gfortran \
32 | patch \
33 | build-essential \
34 | binutils \
35 | gtk-doc-tools \
36 | libcairo2-dev \
37 | libcurl4-openssl-dev \
38 | libfreetype6-dev \
39 | libfribidi-dev \
40 | libgsl-dev \
41 | libharfbuzz-dev \
42 | libhdf5-dev \
43 | libjpeg-dev \
44 | libmpfr-dev \
45 | libpng-dev \
46 | libssl-dev \
47 | libtiff5-dev \
48 | libxml2-dev \
49 | libxt-dev \
50 | libgeos-dev \
51 | meson \
52 | libblas-dev \
53 | liblapack-dev \
54 | libbz2-dev
55 |
56 | # Install python and R
57 | RUN apt-get install -y --no-install-recommends \
58 | python3 python3-pip python3-dev python3-venv r-base
59 |
60 | RUN rm -rf /var/lib/apt/lists/*
61 |
62 | RUN echo "options(repos = 'https://cloud.r-project.org')" > $(R --no-echo --no-save -e "cat(Sys.getenv('R_HOME'))")/etc/Rprofile.site
63 | ENV R_LIBS_USER=/usr/local/lib/R
64 |
65 | RUN R --no-echo --no-restore --no-save -e "install.packages('hdf5r')"
66 | RUN R --no-echo --no-restore --no-save -e "install.packages('remotes')"
67 | RUN R --no-echo --no-restore --no-save -e "install.packages('IRkernel')"
68 | RUN R --no-echo --no-restore --no-save -e "install.packages('logr')"
69 | RUN R --no-echo --no-restore --no-save -e "install.packages('BiocManager')"
70 | RUN R --no-echo --no-restore --no-save -e "install.packages('glue')"
71 | RUN R --no-echo --no-restore --no-save -e "install.packages('Matrix')"
72 | RUN R --no-echo --no-restore --no-save -e "install.packages('SeuratObject')"
73 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.3.0')"
74 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install('rhdf5', update=F, ask=F)"
75 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install('EnsDb.Mmusculus.v79', update=F, ask=F)"
76 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install('EnsDb.Hsapiens.v86', update=F, ask=F)"
77 | RUN R --no-echo --no-restore --no-save -e "install.packages('optparse')"
78 |
79 | RUN python3 -m pip install anndata cellxgene-census
80 |
81 | COPY src/bash/monitor_script.sh /usr/local/bin
82 | COPY src/python/get_cellxgene_data.py /usr/local/bin
83 | COPY src/R/cell_annotation.R /usr/local/bin/
84 | COPY src/R/cell_annotation_helper_functions.R /usr/local/bin/
85 |
86 |
--------------------------------------------------------------------------------
/tasks/share_task_trim_fastqs_atac.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # trim_fastqs_atac
5 |
6 | task share_trim_fastqs_atac {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: trim ATAC FASTQs.'
11 | }
12 |
13 | input {
14 | File fastq_R1 # Pair 1 reads
15 | File fastq_R2 # Pair 2 reads
16 | String chemistry
17 |
18 | Int? cpus = 16
19 | Float? disk_factor = 8.0
20 | Float? memory_factor = 0.15
21 | String? docker_image = "us.gcr.io/buenrostro-share-seq/share_task_trim_fastqs_atac:v1.0.0"
22 | }
23 |
24 | # Determine the size of the input
25 | Float input_file_size_gb = size(fastq_R1, "G") + size(fastq_R2, "G")
26 |
27 |     # Determining memory size based on the size of the input files.
28 | Float mem_gb = 16.0 + memory_factor * input_file_size_gb
29 |
30 |     # Determining disk size based on the size of the input files.
31 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb)
32 |
33 |     # Determining disk type based on the size of the disk.
34 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
35 |
36 | # Read trimming outfiles
37 | String fastq_R1_trimmed = basename(fastq_R1, ".fastq.gz") + "_trimmed.fastq"
38 | String fastq_R2_trimmed = basename(fastq_R2, ".fastq.gz") + "_trimmed.fastq"
39 | String trimming_log_json = basename(fastq_R1, "R1.fastq.gz") + ".atac.preprocess.trimming.log.json"
40 | String trimming_log_html = basename(fastq_R1, "R1.fastq.gz") + ".atac.preprocess.trimming.log.html"
41 | String trimming_stats = basename(fastq_R1, "R1.fastq.gz") + ".atac.preprocess.trimming.adapter.stats.txt"
42 | String monitor_log = 'trim_fastqs_atac_monitor.log'
43 |
44 | command <<<
45 | set -e
46 |
47 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 &
48 |
49 | # Use trim_fastq script for SHARE ATAC trimming
50 | if [ '~{chemistry}' == 'shareseq' ]; then
51 | python3 $(which trim_fastq.py) ~{fastq_R1} ~{fastq_R2} ~{fastq_R1_trimmed} ~{fastq_R2_trimmed} ~{trimming_stats}
52 |
53 | # Use fastp for 10X ATAC trimming
54 | else
55 | fastp -i ~{fastq_R1} -I ~{fastq_R2} -o ~{fastq_R1_trimmed} -O ~{fastq_R2_trimmed} -h ~{trimming_log_html} -j ~{trimming_log_json} -G -Q -L -w ~{cpus} 2> ~{trimming_stats}
56 |
57 | fi
58 |
59 | pigz -p ~{cpus} *.fastq
60 | >>>
61 |
62 | output {
63 | File fastq_R1_trimmed = fastq_R1_trimmed + ".gz"
64 | File fastq_R2_trimmed = fastq_R2_trimmed + ".gz"
65 | File? tenx_trimming_log_json = trimming_log_json
66 | File? tenx_trimming_log_html = trimming_log_html
67 | File trimming_stats = trimming_stats
68 | File trim_fastqs_atac_monitor = monitor_log
69 | }
70 |
71 | runtime {
72 | cpu: cpus
73 | docker: "${docker_image}"
74 | disks: "local-disk ${disk_gb} ${disk_type}"
75 | memory: "${mem_gb} GB"
76 | }
77 |
78 | parameter_meta {
79 | fastq_R1: {
80 | description: 'Pairs 1 fastq',
81 | help: 'Pairs 1 fastq',
82 | }
83 | fastq_R2: {
84 | description: 'Pairs 2 fastq',
85 | help: 'Pairs 2 fastq',
86 | }
87 | }
88 |
89 | }
90 |
--------------------------------------------------------------------------------
/tasks/share_task_generate_h5.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-rna-generate-h5
5 |
6 |
7 | task generate_h5 {
8 | meta {
9 | version: 'v0.1'
10 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard'
11 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: RNA gene x cell matrix'
12 | }
13 |
14 | input {
15 |         # This task computes the gene x cell matrix.
16 | File tar
17 | String genome_name
18 | String? pkr
19 | String prefix
20 | String? gene_naming
21 |
22 | Float? disk_factor = 8.0
23 | Float? memory_factor = 2.0
24 | String? docker_image = "us.gcr.io/buenrostro-share-seq/share_task_generate_h5:v1.0.0"
25 | }
26 |
27 | # Determine the size of the input
28 | Float input_file_size_gb = size(tar, "G")
29 |
30 | # Determining memory size based on the size of the input files.
31 | Float mem_gb = 10.0 + memory_factor * input_file_size_gb
32 |
33 | # Determining disk size based on the size of the input files.
34 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb)
35 |
36 | # Determining disk type based on the size of disk.
37 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
38 |
39 | String ensembl_option = if "${gene_naming}"=="ensembl" then "--ensembl" else ""
40 | String h5 = "${default="share-seq" prefix}.${genome_name}.rna.h5"
41 | String monitor_log = "monitor.log"
42 |
43 | command <<<
44 | set -e
45 |
46 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 &
47 |
48 | # Untar
49 | tar xzvf ~{tar}
50 |
51 | # Generate h5 file
52 | python3 $(which generate_h5_rna.py) \
53 | ./matrix.mtx.gz \
54 | ./features.tsv.gz \
55 | ./barcodes.tsv.gz \
56 | ~{h5} \
57 | ~{pkr} \
58 | ~{ensembl_option}
59 | >>>
60 |
61 | output {
62 | File h5_matrix = "${h5}"
63 | }
64 |
65 | runtime {
66 | memory : "${mem_gb} GB"
67 | disks: "local-disk ${disk_gb} ${disk_type}"
68 | docker : "${docker_image}"
69 | }
70 |
71 | parameter_meta {
72 | tar: {
73 | description: 'STARsolo output tar.gz file',
74 | help: 'tar.gz file containing raw matrix, features, and barcodes file from STARsolo.',
75 | example: 'raw.tar.gz'
76 | }
77 | genome_name: {
78 | description: 'Reference name',
79 | help: 'The name genome reference used to align.',
80 | example: ['hg38', 'mm10', 'hg19', 'mm9']
81 | }
82 | prefix: {
83 | description: 'Prefix for output files',
84 | help: 'Prefix that will be used to name the output files.',
85 | example: 'MyExperiment'
86 | }
87 | gene_naming: {
88 | description: 'Gene naming convention',
89 | help: 'Convention for gene naming in h5 matrix; either "gene_name" (default) or "ensembl".',
90 | example: ['gene_name', 'ensembl']
91 | }
92 | docker_image: {
93 | description: 'Docker image.',
94 | help: 'Docker image for preprocessing step. Dependencies: python3 -m pip install h5py scipy',
95 | example: ['put link to gcr or dockerhub']
96 | }
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/tasks/share_task_html_report.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-html-report
5 | # Gather information from log files
6 |
7 |
8 | task html_report {
9 | meta {
10 | version: 'v0.1'
11 | author: 'Neva C. Durand (neva@broadinstitute.org) at Broad Institute of MIT and Harvard'
12 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: create html report task'
13 | }
14 |
15 | input {
16 | # This function takes as input the files to append to the report
17 | # and the metrics and writes out an html file
18 |
19 | String? prefix
20 |
21 | # Stats for ATAC and RNA, will go at top of html
22 | Int? atac_total_reads
23 | Int? atac_aligned_uniquely
24 | Int? atac_unaligned
25 | Int? atac_feature_reads
26 | Int? atac_duplicate_reads
27 | Float? atac_percent_duplicates
28 | Int? rna_total_reads
29 | Int? rna_aligned_uniquely
30 | Int? rna_aligned_multimap
31 | Int? rna_unaligned
32 | Int? rna_feature_reads
33 | Int? rna_duplicate_reads
34 |
35 | ## JPEG files to be encoded and appended to html
36 | Array[File?] image_files
37 |
38 | ## Raw text logs to append to end of html
39 | Array[String?] log_files
40 |
41 | }
42 |
43 | String output_file = "${default="share-seq" prefix}.html"
44 | # need to select from valid files since some are optional
45 | Array[File] valid_image_files = select_all(image_files)
46 | Array[String] valid_log_files = select_all(log_files)
47 |
48 | command <<<
49 |
50 | echo "~{sep="\n" valid_image_files}" > image_list.txt
51 | echo "~{sep="\n" valid_log_files}" > log_list.txt
52 |
53 |         echo "<h3>Summary Statistics</h3><table><tr><th>ATAC</th><th></th></tr><tr><td>Total reads</td><td>" ~{atac_total_reads} "</td></tr>" > output.txt
54 |         echo "<tr><td>Aligned uniquely</td><td>" ~{atac_aligned_uniquely} "</td></tr>" >> output.txt
55 |         echo "<tr><td>Unaligned</td><td>" ~{atac_unaligned} "</td></tr>" >> output.txt
56 |         echo "<tr><td>Unique Reads</td><td>" ~{atac_feature_reads} "</td></tr>" >> output.txt
57 |         echo "<tr><td>Duplicate Reads</td><td>" ~{atac_duplicate_reads} "</td></tr>" >> output.txt
58 |         echo "<tr><td>Percent Duplicates</td><td>" ~{atac_percent_duplicates} "</td></tr>" >> output.txt
59 |         echo "<tr><th>RNA</th><th></th></tr><tr><td>Total reads</td><td>" ~{rna_total_reads} "</td></tr>" >> output.txt
60 |         echo "<tr><td>Aligned uniquely</td><td>" ~{rna_aligned_uniquely} "</td></tr>" >> output.txt
61 |         echo "<tr><td>Aligned multimap</td><td>" ~{rna_aligned_multimap} "</td></tr>" >> output.txt
62 |         echo "<tr><td>Unaligned</td><td>" ~{rna_unaligned} "</td></tr>" >> output.txt
63 |         echo "<tr><td>Filtered (feature) Reads</td><td>" ~{rna_feature_reads} "</td></tr>" >> output.txt
64 |         echo "<tr><td>Duplicate Reads</td><td>" ~{rna_duplicate_reads} "</td></tr>" >> output.txt
65 |         percent=$(( ~{default=0 rna_duplicate_reads}*100/~{default=1 rna_feature_reads} ))
66 |         echo "<tr><td>Percent Duplicates</td><td>" $percent "</td></tr></table>" >> output.txt
67 | PYTHONIOENCODING=utf-8 python3 /software/write_html.py ~{output_file} image_list.txt log_list.txt --input_file_name output.txt
68 | >>>
69 | output {
70 | File html_report_file = "~{output_file}"
71 | }
72 |
73 | runtime {
74 | docker: 'us.gcr.io/buenrostro-share-seq/share_task_html_report:v1.0.0'
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/python/filter_mito_reads.py:
--------------------------------------------------------------------------------
1 | # From Kundaje lab
2 | # https://github.com/kundajelab/ENCODE_scatac/blob/master/workflow/scripts/filter_mito.py
3 | #
4 | # Modified by: Eugenio Mattei
5 | # Affiliation: The Broad Institute of MIT and Harvard
6 | #
7 | # Changelog:
8 | # 2023/01/20: Now it returns the statistics per barcode
9 | #
10 |
11 | import argparse
12 | import pysam
13 | from collections import defaultdict
14 |
15 |
16 |
17 | def filter_mito(in_path, out_path, barcode_tag, cutoff, prefix, threads=1):
18 | """
19 | Removes mitochondrial alignments from BAM
20 | Calculates number of mapped mitochondrial and non-mitochondrial reads (not alignments)
21 | Assumes mitochondrial chromosome is "chrM"
22 | """
23 |
24 | infile = pysam.AlignmentFile(in_path, "rb", threads=threads)
25 | outfile = pysam.AlignmentFile(out_path, "wb", template=infile, threads=threads)
26 | outfile_bulk_metrics = f"{prefix}.mito.bulk-metrics.tsv"
27 | outfile_barcode_metrics = f"{prefix}.mito.bc-metrics.tsv"
28 |
29 | number_mito = 0
30 | number_non_mito = 0
31 |
32 | # Initializing the dictionary setting the counts for non-mito and mito.
33 | barcode_metrics = defaultdict(lambda: [0,0])
34 |
35 | for read in infile.fetch(until_eof=True,multiple_iterators=True):
36 | if read.reference_name == "chrM":
37 | if read.flag & 260 == 0: # Alignment is mapped and is primary
38 | number_mito += 1
39 | barcode_metrics[read.get_tag(barcode_tag)][1] += 1
40 |
41 | else:
42 | if read.flag & 260 == 0:
43 | number_non_mito += 1
44 | barcode_metrics[read.get_tag(barcode_tag)][0] += 1
45 | #outfile.write(read)
46 |
47 | # Write the summary metrics
48 | with open(outfile_bulk_metrics, "w") as fh:
49 | print("raw_reads_nonmito\traw_reads_mito", file = fh)
50 | print(f"{number_non_mito}\t{number_mito}", file = fh)
51 |
52 | # Write the metrics per barcode
53 | with open(outfile_barcode_metrics, "w") as fh:
54 | # Print header
55 | print("barcode\traw_reads_nonmito\traw_reads_mito", file = fh)
56 | for barcode,counts in barcode_metrics.items():
57 | print(f"{barcode}\t{counts[0]}\t{counts[1]}", file = fh)
58 |
59 | # Write a filtered bam
60 | for read in infile:
61 | if read.flag & 260 == 0 and read.reference_name != "chrM" and barcode_metrics[read.get_tag(barcode_tag)][0] > cutoff*2:
62 | outfile.write(read)
63 |
64 | outfile.close()
65 | return
66 |
67 |
68 |
69 | if __name__ == '__main__':
70 |
71 |     msg = "Remove mitochondrial alignments from a BAM file and report bulk and per-barcode mitochondrial read counts."
72 | parser = argparse.ArgumentParser(description = msg)
73 |
74 |     # Define command-line arguments
75 | parser.add_argument("bam", help = "Path to the coordinate-sorted bam file.")
76 | parser.add_argument("-o", "--output", help = "Path to the mitochondrial-free bam file.")
77 | parser.add_argument("-p", help = "Number of threads to use.", type=int, default=1)
78 | parser.add_argument("--prefix", help = "Prefix for the metrics output file.")
79 | parser.add_argument("--cutoff", help = "Remove barcodes with a number of fragments less than the cutoff.", type=int, default=1)
80 | parser.add_argument("--bc_tag", help = "Specify the tag containing the cell barcode.", default="CB")
81 |
82 | # Read arguments from command line
83 | args = parser.parse_args()
84 |
85 | if args.prefix:
86 | prefix = args.prefix
87 | else:
88 | prefix = args.bam[:-4]
89 |
90 | if args.output:
91 | out_path = args.output
92 | else:
93 | out_path = f"{prefix}.no_mito.bam"
94 |
95 | bc_tag = args.bc_tag
96 |
97 | filter_mito(args.bam, out_path, bc_tag, args.cutoff, prefix, threads=args.p)
98 |
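99 | # Usage sketch (hypothetical file names). --cutoff is expressed in fragments, so reads are kept only for
100 | # barcodes with more than 2*cutoff non-mitochondrial reads:
101 | #   python3 filter_mito_reads.py sample.sorted.bam -o sample.no_mito.bam -p 4 --prefix sample --cutoff 1 --bc_tag CB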
--------------------------------------------------------------------------------
/src/R/cell_annotation_helper_functions.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/Rscript
2 |
3 | ## ---------------------------
4 | ## Helper functions for cell annotation
5 | ## Author: Zhijian Li
6 | ## Date Created: 2023-05-29
7 | ## Email: lzj1769@gmail.com
8 | ## ---------------------------
9 | library(reticulate)
10 | use_python("/usr/bin/python3")
11 |
12 | read_h5ad <- function(
13 | filename,
14 | backed = NULL
15 | ) {
16 | python_anndata <- reticulate::import("anndata", convert = FALSE)
17 | filename <- normalizePath(filename, mustWork = FALSE)
18 | py_to_r_ifneedbe(python_anndata$read_h5ad(
19 | filename = filename,
20 | backed = backed
21 | ))
22 | }
23 |
24 | py_to_r_ifneedbe <- function(x) {
25 | if (inherits(x, "python.builtin.object")) {
26 | py_to_r(x)
27 | } else {
28 | x
29 | }
30 | }
31 |
32 | #' @name r-py-conversion
33 | #' @export
34 | py_to_r.pandas.core.indexes.base.Index <- function(x) {
35 | python_builtins <- reticulate::import_builtins()
36 | out <- python_builtins$list(x)
37 | attr(out, "name") <- py_to_r_ifneedbe(x$name)
38 | out
39 | }
40 |
41 | #' Convert between Python and R objects
42 | #'
43 | #' @param x A Python object.
44 | #' @param name A name
45 | #' @param value A value
46 | #'
47 | #' @return An \R object, as converted from the Python object.
48 | #'
49 | #' @name r-py-conversion
50 | #' @export
51 | `[[<-.collections.abc.MutableMapping` <- function(x, name, value) {
52 | if (!is.null(value)) {
53 | reticulate::py_set_item(x, name, value)
54 | } else if (name %in% x$keys()) {
55 | reticulate::py_del_item(x, name)
56 | }
57 | }
58 |
59 | #' @name r-py-conversion
60 | #' @export
61 | `[[.collections.abc.Mapping` <- function(x, name) {
62 | if (name %in% x$keys()) {
63 | py_to_r_ifneedbe(reticulate::py_get_item(x, name))
64 | } else {
65 | NULL
66 | }
67 | }
68 |
69 | #' @name r-py-conversion
70 | #' @export
71 | `[<-.collections.abc.MutableMapping` <- `[[<-.collections.abc.MutableMapping`
72 | #
73 | #' @name r-py-conversion
74 | #' @export
75 | `[.collections.abc.Mapping` <- `[[.collections.abc.Mapping`
76 | #
77 | #' @name r-py-conversion
78 | #' @export
79 | `names.collections.abc.Mapping` <- function(x) {
80 | python_builtins <- reticulate::import_builtins()
81 | python_builtins$list(x$keys())
82 | }
83 |
84 | #' @name r-py-conversion
85 | #' @export
86 | `py_to_r.collections.abc.Set` <- function(x) {
87 | python_builtins <- reticulate::import_builtins()
88 | python_builtins$list(x)
89 | }
90 |
91 | #' @name r-py-conversion
92 | #' @export
93 | py_to_r.pandas.core.indexes.base.Index <- function(x) {
94 | python_builtins <- reticulate::import_builtins()
95 | out <- python_builtins$list(x)
96 | attr(out, "name") <- py_to_r_ifneedbe(x$name)
97 | out
98 | }
99 |
100 | #' @name r-py-conversion
101 | #' @export
102 | py_to_r.collections.abc.KeysView <- function(x) {
103 | python_builtins <- reticulate::import_builtins()
104 | python_builtins$list(x)
105 | }
106 |
107 | #' @name r-py-conversion
108 | #' @export
109 | `py_to_r.collections.abc.Mapping` <- function(x) {
110 | python_builtins <- reticulate::import_builtins()
111 |
112 | x_list <- python_builtins$dict(x)
113 |
114 | # convert members of x_list if need be
115 | for (i in seq_along(x_list)) {
116 | if (inherits(x_list[[i]], "python.builtin.object")) {
117 | x_list[[i]] <- py_to_r_ifneedbe(x_list[[i]])
118 | }
119 | }
120 |
121 | x_list
122 | }
123 |
124 |
125 | #' @importFrom Matrix sparseMatrix
126 | py_to_r.scipy.sparse.csc.csc_matrix <- function(x) {
127 | Matrix::sparseMatrix(
128 | i = as.integer(py_to_r_ifneedbe(x$indices))+1,
129 | p = as.integer(py_to_r_ifneedbe(x$indptr)),
130 | x = as.vector(py_to_r_ifneedbe(x$data)),
131 | dims = as.integer(dim(x))
132 | )
133 | }
134 |
--------------------------------------------------------------------------------
/workflows/subwf-find-dorcs.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 |
4 | # Import the tasks called by the pipeline
5 | import "../tasks/dorcs_task_find_dorcs.wdl" as find_dorcs
6 |
7 | workflow wf_dorcs {
8 |
9 | meta {
10 | version: 'v0.1'
11 | author: 'Siddarth Wekhande (swekhand@broadinstitute.org)'
12 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Sub-workflow to find DORCs from SHARE-seq data.'
13 | }
14 |
15 | input {
16 | File? rna_matrix
17 | File? atac_fragments
18 | File peak_file
19 |
20 | String genome
21 | Int n_cores = 4
22 | String save_plots_to_dir = "TRUE"
23 | String? output_filename
24 |
25 | Int minFeature_RNA = 200
26 | Int maxFeature_RNA = 2500
27 | Float percentMT_RNA = 5
28 | Int minCells_RNA = 3
29 |
30 | Int dorcGeneCutOff = 10
31 | Float fripCutOff = 0.3
32 | Float corrPVal = 0.05
33 | Int topNGene = 20
34 | Int windowPadSize = 50000
35 |
36 | Int numNearestNeighbor = 30
37 | Float numBackgroundPairs = 100000
38 | Float chunkSize = 50000
39 |
40 | String? prefix
41 | Int mem_gb = 64
42 | Int disk_gb = 100
43 | String? docker
44 | }
45 |
46 | File rna_matrix_ = select_first([rna_matrix])
47 | File atac_fragments_ = select_first([atac_fragments])
48 |
49 | if ( !defined(rna_matrix) || !defined(atac_fragments) ){
50 | call raise_exception as missing_input {
51 | input:
52 |             msg = "The genes-by-cell matrix or the DNA fragments file is missing."
53 | }
54 | }
55 |
56 | call find_dorcs.find_dorcs as find_dorcs{
57 | input:
58 | rna_matrix = rna_matrix_,
59 | atac_fragments = atac_fragments_,
60 | peak_file = peak_file,
61 | genome = genome,
62 | n_cores = n_cores,
63 | save_plots_to_dir = save_plots_to_dir,
64 | output_filename = output_filename,
65 | minFeature_RNA = minFeature_RNA,
66 | maxFeature_RNA = maxFeature_RNA,
67 | percentMT_RNA = percentMT_RNA,
68 | minCells_RNA = minCells_RNA,
69 | dorcGeneCutOff = dorcGeneCutOff,
70 | fripCutOff = fripCutOff,
71 | corrPVal = corrPVal,
72 | topNGene = topNGene,
73 | windowPadSize = windowPadSize,
74 | numNearestNeighbor = numNearestNeighbor,
75 | numBackgroundPairs = numBackgroundPairs,
76 | chunkSize = chunkSize,
77 | mem_gb = mem_gb,
78 | disk_gb = disk_gb,
79 | docker_image = docker,
80 | prefix = prefix
81 | }
82 |
83 | output {
84 | File dorcs_notebook_output = find_dorcs.notebook_output
85 | File dorcs_notebook_log = find_dorcs.notebook_log
86 | File? seurat_violin_plot = find_dorcs.seurat_violin_plot
87 | File? j_plot = find_dorcs.j_plot
88 | File? plots_zip = find_dorcs.plots_zip
89 | File? dorcs_genes_summary = find_dorcs.dorcs_genes_summary
90 | File? dorcs_regions_summary = find_dorcs.dorcs_regions_summary
91 | }
92 |
93 | }
94 |
95 | # Task to report errors to user.
96 | # From https://github.com/ENCODE-DCC/chip-seq-pipeline2/blob/master/chip.wdl
97 | task raise_exception {
98 | input {
99 | String msg
100 | Array[String]? vals
101 | }
102 | command {
103 | echo -e "\n* Error: ${msg}\n" >&2
104 | echo -e "* Vals: ${sep=',' vals}\n" >&2
105 | exit 2
106 | }
107 | output {
108 | String error_msg = '${msg}'
109 | }
110 | runtime {
111 | maxRetries : 0
112 | cpu : 1
113 | memory : '2 GB'
114 | time : 1
115 | disks : 'local-disk 10 SSD'
116 | docker : 'encodedcc/chip-seq-pipeline:v2.2.1'
117 | }
118 | }
119 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_filter_atac.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM debian:buster-slim as builder
7 |
8 | ENV BEDTOOLS_VERSION v2.29.0
9 | ENV PICARD_VERSION 2.27.5
10 | ENV SAMTOOLS_VERSION 1.16
11 | ENV SAMBAMBA_VERSION 0.6.6
12 |
13 | # To prevent time zone prompt
14 | ENV DEBIAN_FRONTEND=noninteractive
15 |
16 | # Install software from apt repo
17 | RUN apt-get update && apt-get install -y \
18 | autoconf \
19 | automake \
20 | build-essential \
21 | git \
22 | libcurl4-openssl-dev \
23 | liblz4-dev \
24 | liblzma-dev \
25 | libncurses5-dev \
26 | libncursesw5-dev \
27 | libbz2-dev \
28 | perl \
29 | python \
30 | unzip \
31 | xz-utils \
32 | wget \
33 | zlib1g-dev &&\
34 | rm -rf /var/lib/apt/lists/*
35 |
36 | # Make directory for all software
37 | RUN mkdir /software
38 | WORKDIR /software
39 | ENV PATH="/software:${PATH}"
40 |
41 | # Install bedtools 2.29.0
42 | RUN git clone --branch ${BEDTOOLS_VERSION} --single-branch https://github.com/arq5x/bedtools2.git && \
43 | cd bedtools2 && make && make install && cd ../ && rm -rf bedtools2*
44 |
45 | # Install sambamba 0.6.6
46 | RUN wget https://github.com/lomereiter/sambamba/releases/download/v${SAMBAMBA_VERSION}/sambamba_v${SAMBAMBA_VERSION}_linux.tar.bz2 && \
47 | tar -xvjf sambamba_v${SAMBAMBA_VERSION}_linux.tar.bz2 && \
48 | mv sambamba_v${SAMBAMBA_VERSION} /usr/local/bin/sambamba && \
49 | rm -rf sambamba_*
50 |
51 | # Install samtools 1.16
52 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \
53 | cd htslib && git submodule update --init --recursive && autoreconf -i && make && make install && cd ../ && \
54 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \
55 | cd samtools && make && make install && cd ../ && rm -rf samtools* && rm -rf htslib*
56 |
57 |
58 | # Install Picard 2.27.5
59 | RUN wget https://github.com/broadinstitute/picard/releases/download/${PICARD_VERSION}/picard.jar && chmod +x picard.jar && mv picard.jar /usr/local/bin
60 |
61 |
62 |
63 | FROM debian:buster-slim
64 |
65 | LABEL maintainer = "Eugenio Mattei"
66 | LABEL software = "Share-seq pipeline"
67 | LABEL software.version="1.0.0"
68 | LABEL software.organization="Broad Institute of MIT and Harvard"
69 | LABEL software.version.is-production="Yes"
70 | LABEL software.task="filter"
71 |
72 | RUN apt-get update && apt-get install -y \
73 | gcc \
74 | libcurl4-openssl-dev \
75 | libbz2-dev \
76 | liblzma-dev \
77 | python3 \
78 | python3-dev \
79 | python3-pip \
80 | openjdk-11-jre \
81 | zlib1g-dev &&\
82 | rm -rf /var/lib/apt/lists/*
83 |
84 | # Install packages for python3 scripts
85 | RUN python3 -m pip install --upgrade pip
86 | RUN python3 -m pip install --no-cache-dir --ignore-installed pysam
87 |
88 | # Create and setup new user
89 | ENV USER=shareseq
90 | WORKDIR /home/$USER
91 |
92 | RUN groupadd -r $USER &&\
93 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
94 | chown $USER:$USER /home/$USER
95 |
96 | # Add folder with software to the path
97 | ENV PATH="/software:${PATH}"
98 |
99 | # Copy the compiled software from the builder
100 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/
101 | COPY --from=builder --chown=$USER:$USER /lib/x86_64-linux-gnu/* /lib/x86_64-linux-gnu/
102 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
103 | COPY --chown=$USER:$USER src/python/filter_mito_reads.py /usr/local/bin
104 | COPY --chown=$USER:$USER src/python/bam_to_fragments.py /usr/local/bin
105 | COPY --chown=$USER:$USER src/python/assign_multimappers.py /usr/local/bin
106 |
107 |
108 | USER ${USER}
109 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_qc_atac.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM debian:buster-slim as builder
7 |
8 | ENV SAMTOOLS_VERSION 1.9
9 | ENV BEDTOOLS_VERSION v2.29.0
10 | ENV PICARD_VERSION 2.27.5
11 |
12 | # To prevent time zone prompt
13 | ENV DEBIAN_FRONTEND=noninteractive
14 |
15 | # Install software from apt repo
16 | RUN apt-get update && apt-get install -y \
17 | autoconf \
18 | build-essential \
19 | git \
20 | libcurl4-openssl-dev \
21 | liblz4-dev \
22 | liblzma-dev \
23 | libncurses5-dev \
24 | libbz2-dev \
25 | python \
26 | unzip \
27 | wget \
28 | zlib1g-dev &&\
29 | rm -rf /var/lib/apt/lists/*
30 |
31 |
33 | # Make directory for all software
33 | RUN mkdir /software
34 | WORKDIR /software
35 | ENV PATH="/software:${PATH}"
36 |
37 | # Install bedtools 2.29.0
38 | RUN git clone --branch ${BEDTOOLS_VERSION} --single-branch https://github.com/arq5x/bedtools2.git && \
39 | cd bedtools2 && make && make install && cd ../ && rm -rf bedtools2*
40 |
41 | # Install samtools 1.9
42 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \
43 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \
44 | cd samtools && make && make install && cd ../ && rm -rf samtools* && \
45 | cd htslib && autoreconf -i && make && make install && cd ../ && rm -rf htslib*
46 |
47 | # Install Picard 2.27.5
48 | RUN wget https://github.com/broadinstitute/picard/releases/download/${PICARD_VERSION}/picard.jar && chmod +x picard.jar && mv picard.jar /usr/local/bin
49 |
50 |
51 |
52 | FROM debian:buster-slim
53 |
54 | LABEL maintainer = "Eugenio Mattei"
55 | LABEL software = "Share-seq pipeline"
56 | LABEL software.version="1.0.0"
57 | LABEL software.organization="Broad Institute of MIT and Harvard"
58 | LABEL software.version.is-production="Yes"
59 | LABEL software.task="qc-atac"
60 |
61 | RUN apt-get update && apt-get install -y \
62 | gcc \
63 | git \
64 | python3 \
65 | python3-dev \
66 | python3-pip \
67 | openjdk-11-jre \
68 | r-base \
69 | zlib1g-dev &&\
70 | rm -rf /var/lib/apt/lists/*
71 |
72 | # Install packages for python3 scripts (pysam, SAMstats)
73 | RUN python3 -m pip install --upgrade pip
74 | RUN python3 -m pip install --no-cache-dir --ignore-installed numpy matplotlib pandas plotnine pysam --editable=git+https://github.com/kundajelab/SAMstats@75e60f1e67c6d5d066371a0b53729e4b1f6f76c5#egg=SAMstats
75 |
76 | # Create and setup new user
77 | ENV USER=shareseq
78 | WORKDIR /home/$USER
79 |
80 | RUN groupadd -r $USER &&\
81 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
82 | chown $USER:$USER /home/$USER
83 |
84 | # Add folder with software to the path
85 | ENV PATH="/software:${PATH}"
86 |
87 | # Copy the compiled software from the builder
88 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/
89 | COPY --from=builder --chown=$USER:$USER /lib/x86_64-linux-gnu/* /lib/x86_64-linux-gnu/
90 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
91 | COPY --chown=$USER:$USER src/python/pbc_stats.py /usr/local/bin
92 | COPY --chown=$USER:$USER src/python/qc_atac_compute_tss_enrichment.py /usr/local/bin
93 | COPY --chown=$USER:$USER src/python/qc_atac_count_duplicates_per_barcode.py /usr/local/bin
94 | COPY --chown=$USER:$USER src/python/qc_atac_compute_reads_in_peaks.py /usr/local/bin
95 | COPY --chown=$USER:$USER src/python/plot_insert_size_hist.py /usr/local/bin
96 | COPY --chown=$USER:$USER src/R/barcode_rank_functions.R /usr/local/bin
97 | COPY --chown=$USER:$USER src/R/atac_qc_plots.R /usr/local/bin
98 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
99 |
100 |
101 | USER ${USER}
102 |
103 |
104 |
--------------------------------------------------------------------------------
/tasks/dorcs_task_find_dorcs.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task find_dorcs {
4 | meta {
5 | version: 'v0.1'
6 | author: 'Siddarth Wekhande (swekhand@broadinstitute.org) at Broad Institute of MIT and Harvard'
7 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: find DORCs task'
8 | }
9 |
10 | input {
11 | #This task takes in the RNA and ATAC files and finds the DORCs based on the cut-off criteria provided
12 |
13 | #DORCs parameters
14 | File rna_matrix
15 | File atac_fragments
16 | File? peak_file
17 | String genome
18 | Int n_cores = 4
19 | String save_plots_to_dir = "TRUE"
20 | String prefix = "prefix"
21 |
22 | #RNA QC parameters
23 | Int minFeature_RNA = 200
24 | Int maxFeature_RNA = 2500
25 | Float percentMT_RNA = 5
26 | Int minCells_RNA = 3
27 |
28 | #ATAC QC parameter
29 | Float fripCutOff = 0.3
30 | Float chunkSize = 50000
31 |
32 | #Background correlation parameters
33 | Int numNearestNeighbor = 100
34 | Float numBackgroundPairs = 100000
35 |
36 | #DORC genes parameter
37 | # Regulatory region around TSS. Default is +/- 50Kb
38 | Int windowPadSize = 50000
39 | Int dorcGeneCutOff = 10
40 | Float corrPVal = 0.05
41 | Int topNGene = 20
42 |
43 | String output_filename = "${prefix}.dorcs.notebook.${genome}.ipynb"
44 | String docker_image = "us.gcr.io/buenrostro-share-seq/dorcs_task_find_dorcs:v1.0.0"
45 | #String docker_image = "swekhande/shareseq-prod:share-task-dorcs"
46 | Int mem_gb = 64
47 | Int disk_gb = 100
48 | }
49 |
50 | #Output filepaths
51 |
52 | String violin_plot = '${prefix}.dorcs.plots.${genome}/${prefix}.dorcs.rna_violin_plot.${genome}.png'
53 | String jplot = '${prefix}.dorcs.plots.${genome}/${prefix}.dorcs.jplot.${genome}.png'
54 | String dorc_genes_summ = '${prefix}.dorcs.dorc_genes_summary.${genome}.csv'
55 | String all_regions_summ = '${prefix}.dorcs.all_regions_summary.${genome}.csv'
56 | String plots_zip_dir = '${prefix}.dorcs.plots.${genome}.zip'
57 | #String papermill_log_filename = 'papermill.logfile.txt'
58 | String log_filename = "log/${prefix}.dorcs.logfile.${genome}.txt"
59 |
60 | command {
61 | gzip -dc ${atac_fragments} > tmp_fragments.bedpe
62 |
63 | papermill $(which dorcs_jplot_notebook.ipynb) ${output_filename} \
64 | -p rnaCountMatrix ${rna_matrix} \
65 | -p atacFragFile tmp_fragments.bedpe \
66 | -p peakFile ${peak_file} \
67 | -p savePlotsToDir ${save_plots_to_dir} \
68 | -p nCores ${n_cores} \
69 | -p genome ${genome} \
70 | -p minFeature_RNA ${minFeature_RNA} \
71 | -p maxFeature_RNA ${maxFeature_RNA} \
72 | -p percentMT_RNA ${percentMT_RNA} \
73 | -p minCells_RNA ${minCells_RNA} \
74 | -p dorcGeneCutOff ${dorcGeneCutOff} \
75 | -p fripCutOff ${fripCutOff} \
76 | -p corrPVal ${corrPVal} \
77 | -p topNGene ${topNGene} \
78 | -p windowPadSize ${windowPadSize} \
79 | -p numNearestNeighbor ${numNearestNeighbor} \
80 | -p numBackgroundPairs ${numBackgroundPairs} \
81 | -p chunkSize ${chunkSize} \
82 | -p prefix ${prefix}
83 | }
84 |
85 | output {
86 | File notebook_output = output_filename
87 | File notebook_log = log_filename
88 | #File papermill_log = papermill_log_filename
89 |
90 | File? seurat_violin_plot = violin_plot
91 | File? j_plot = jplot
92 | File? plots_zip = plots_zip_dir
93 |
94 | File? dorcs_genes_summary = dorc_genes_summ
95 | File? dorcs_regions_summary = all_regions_summ
96 |
97 |
98 | }
99 |
100 | runtime {
101 | cpu : 4
102 | memory : mem_gb+'G'
103 | docker : docker_image
104 | disks : 'local-disk ${disk_gb} LOCAL'
105 | maxRetries : 0
106 | }
107 | }
108 |
109 |
110 |
--------------------------------------------------------------------------------
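For orientation, a minimal sketch (assuming papermill is installed) of what the `papermill -p ...` invocation in the command block above does programmatically; the notebook names and parameter values here are illustrative placeholders, not the pipeline's actual defaults.

import papermill as pm

pm.execute_notebook(
    "dorcs_jplot_notebook.ipynb",        # template notebook shipped in the docker image
    "sample.dorcs.notebook.hg38.ipynb",  # executed copy, returned as the task output
    parameters=dict(
        rnaCountMatrix="rna.h5",
        atacFragFile="tmp_fragments.bedpe",
        peakFile="peaks.bed",
        genome="hg38",
        windowPadSize=50000,             # +/- 50 kb regulatory window around each TSS
        dorcGeneCutOff=10,
        corrPVal=0.05,
    ),
)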
/tasks/share_task_star.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-rna-STAR
5 |
6 | task share_rna_align {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: align RNA task'
11 | }
12 |
13 | input {
14 |         # This task takes as input the pre-processed fastq files and aligns them to the genome
15 | # using STAR.
16 |
17 | Array[File] fastq_R1
18 | Array[File]? fastq_R2
19 | File? genome_index_tar
20 | String genome_name
21 | String? prefix
22 | String docker_image = "docker.io/nchernia/share_task_star:1"
23 | Int cpus = 16
24 | }
25 | #Float input_file_size_gb = size(input[0], "G")
26 | Int samtools_cpus = 6
27 | Int samtools_mem_gb = 8
28 | Int mem_gb = 64
29 | Int disk_gb = 850
30 | #Int disk_gb = round(20.0 + 4 * input_file_size_gb)
31 |
32 | # Define the output names
33 | String sorted_bam = "${default="share-seq" prefix}.rna.align.${genome_name}.sorted.bam"
34 | String sorted_bai = "${default="share-seq" prefix}.rna.align.${genome_name}.sorted.bam.bai"
35 | String alignment_log = "${default="share-seq" prefix}.rna.align.${genome_name}.log"
36 |
37 | command {
38 | set -e
39 | # Untar the genome
40 | tar xvzf ${genome_index_tar} --no-same-owner -C ./
41 |
42 | mkdir out
43 |
44 | $(which STAR) \
45 | --runThreadN ${cpus} \
46 | --chimOutType WithinBAM \
47 | --genomeDir ./ \
48 | --readFilesIn ${sep=',' fastq_R1} ${sep=',' fastq_R2} \
49 | --outFileNamePrefix out/${default="share-seq" prefix}.rna.align.${genome_name}. \
50 | --outFilterMultimapNmax 20 \
51 | --outFilterScoreMinOverLread 0.3 \
52 | --outFilterMatchNminOverLread 0.3 \
53 | --outSAMattributes NH HI AS nM MD \
54 | --limitOutSJcollapsed 2000000 \
55 | --outSAMtype BAM Unsorted \
56 | --limitIObufferSize 400000000 400000000 \
57 | --outReadsUnmapped Fastx \
58 | --readFilesCommand zcat
59 |
60 | $(which samtools) sort \
61 | -@ ${samtools_cpus} \
62 | -m ${samtools_mem_gb}G \
63 | -o out/${sorted_bam} \
64 | out/${default="share-seq" prefix}.rna.align.${genome_name}.Aligned.out.bam
65 |
66 | $(which samtools) index \
67 | -@ ${cpus} \
68 | out/${sorted_bam}
69 | }
70 |
71 | output {
72 | File rna_alignment = "out/${sorted_bam}"
73 | File rna_alignment_index = "out/${sorted_bai}"
74 | File rna_alignment_log = glob('out/*.Log.final.out')[0]
75 | }
76 |
77 | runtime {
78 | cpu : cpus
79 | memory : mem_gb+'G'
80 | disks : 'local-disk ${disk_gb} SSD'
81 | maxRetries: 0
82 | docker: docker_image
83 | }
84 |
85 | parameter_meta {
86 | fastq_R1: {
87 | description: 'Read1 fastq',
88 | help: 'Processed fastq for read1.',
89 | example: 'processed.atac.R1.fq.gz'
90 | }
91 | genome_index_tar: {
92 | description: 'STAR indexes',
93 | help: 'Index files for STAR to use during alignment in tar.gz.',
94 | example: ['']
95 | }
96 | genome_name: {
97 | description: 'Reference name',
98 | help: 'The name of the reference genome used by the aligner.',
99 | example: ['hg38', 'mm10', 'both']
100 | }
101 | prefix: {
102 | description: 'Prefix for output files',
103 | help: 'Prefix that will be used to name the output files',
104 | example: 'MyExperiment'
105 | }
106 | cpus: {
107 | description: 'Number of cpus',
108 |                 help: 'Set the number of cpus used by STAR',
109 | example: '4'
110 | }
111 | docker_image: {
112 | description: 'Docker image.',
113 | help: 'Docker image for preprocessing step. Dependencies: STAR',
114 | example: ['put link to gcr or dockerhub']
115 | }
116 | }
117 | }
118 |
--------------------------------------------------------------------------------
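The sort-and-index step after STAR corresponds to the samtools calls in the command block above. As a minimal sketch, the same operations can be done with pysam's samtools wrappers (the file names below are placeholders):

import pysam

unsorted_bam = "sample.rna.align.hg38.Aligned.out.bam"
sorted_bam = "sample.rna.align.hg38.sorted.bam"

# Equivalent of `samtools sort -@ 6 -m 8G -o <sorted> <unsorted>`
pysam.sort("-@", "6", "-m", "8G", "-o", sorted_bam, unsorted_bam)

# Equivalent of `samtools index <sorted>`
pysam.index(sorted_bam)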
/src/python/qc_atac_compute_reads_in_peaks.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Author: Eugenio Mattei, Broad Institute of MIT and Harvard
4 | # modified from Jason Buenrostro's tool
5 |
6 | import argparse
7 | import os
8 | import pysam
9 | from collections import Counter
10 | from collections import defaultdict
11 | import numpy as np
12 |
13 | #import os
14 | #import sys
15 | import matplotlib
16 | matplotlib.use('Agg')
17 | import matplotlib.pyplot as plt
18 | #from multiprocessing import Pool
19 |
20 |
21 | ##### DEFINE FUNCTIONS #####
22 | def count_fragments_in_peaks(tabix_filename,
23 | peaks_list,
24 | mapq_threshold = 30):
25 | """
26 |     This function counts the per-barcode number of reads in peak regions.
27 |
28 | Parameters
29 | ----------
30 | tabix_filename : str
31 | Path to the tabix file containing the fragments.
32 | File needs to be coordinate-sorted and indexed.
33 | peaks_list : array
34 | Array containing the list of peaks to be included.
35 |         Each member of the array contains the following three elements:
36 |         Chr, Start, End
37 |     mapq_threshold : int
38 |         Keep only the reads with mapq score greater or equal.
39 |         default: 30
40 |         (The per-fragment barcode is taken from the fourth column of the
41 |         fragments file, so no BAM barcode tag is needed.)
42 |
43 | Returns
44 | -------
45 |
46 | Dictionary
47 | Key: Barcode
48 | Value: Number of fragments in peaks.
49 | """
50 | # To count the number of fragments in peaks
51 | reads_in_peaks_counter = defaultdict(set)
52 | fragments_in_peaks_counter = defaultdict(set)
53 |
54 | tabixfile = pysam.TabixFile(tabix_filename)
55 |
56 | for peak in peaks_list:
57 | peak_chr = str(peak[0])
58 | peak_start = int(peak[1])
59 | peak_end = int(peak[2])
60 |
61 |         # Find all the fragments overlapping the peak.
62 | for fragment in tabixfile.fetch(peak_chr, peak_start, peak_end):
63 | fragment_fields = fragment.split("\t")
64 |
65 | fragment_contig = fragment_fields[0]
66 | fragment_start = int(fragment_fields[1])
67 | fragment_end = int(fragment_fields[2])
68 | barcode = fragment_fields[3]
69 |
70 | fragment_id = "-".join(fragment_fields)
71 | fragments_in_peaks_counter[barcode].add(fragment_id)
72 |
73 | # Increment the counter for the specific barcode.
74 | if fragment_start >= peak_start and fragment_start <= peak_end-1:
75 | reads_in_peaks_counter[barcode].add(fragment_id+"start")
76 |
77 | if fragment_end >= peak_start and fragment_end <= peak_end-1:
78 | reads_in_peaks_counter[barcode].add(fragment_id+"end")
79 |
80 | return reads_in_peaks_counter, fragments_in_peaks_counter
81 |
82 |
83 | if __name__ == '__main__':
84 |
85 | #args = _parse_sanitize_cmdline_arguments()
86 |
87 | msg = "Add the description"
88 | parser = argparse.ArgumentParser(description = msg)
89 |
90 | # Adding optional argument
91 | parser.add_argument("tabix", help= "Fragments file in tabix format and indexed.")
92 | parser.add_argument("--prefix", help = "Prefix for the metrics output fil.")
93 | parser.add_argument("--peaks", help= "Peaks bed file")
94 |
95 | # Read arguments from command line
96 | args = parser.parse_args()
97 |
98 | if args.prefix:
99 | prefix = args.prefix
100 | else:
101 |         prefix = args.tabix[:-4]
102 |
103 | # It is extremely fast. Don't think we need parallel processing.
104 | #cpus = len(os.sched_getaffinity(0))/2
105 |     # Use the chr, start, and end columns of the peaks file; strand information is not needed here.
106 | peaks_list = np.loadtxt(args.peaks, 'str', usecols = (0,1,2))
107 |
108 | reads_in_peaks, fragments_in_peaks = count_fragments_in_peaks(args.tabix,
109 | peaks_list
110 | )
111 | output_fnp = f"{prefix}.reads.in.peak.tsv"
112 |
113 | with open(output_fnp,"w") as out_file:
114 | print(f"barcode\treads_peaks\tfragment_peaks", file=out_file)
115 | for barcode,fragments_in_peak in fragments_in_peaks.items():
116 | print(f"{barcode}\t{len(reads_in_peaks[barcode])}\t{len(fragments_in_peak)}", file=out_file)
117 |
--------------------------------------------------------------------------------
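The script above expects a coordinate-sorted, bgzip-compressed, tabix-indexed fragments file. A minimal sketch (assuming pysam is installed; file names and coordinates are placeholders) of preparing and querying such a file:

import pysam

# Compress and index a sorted BED-like fragments file (chr, start, end, barcode).
# preset="bed" tells tabix which columns hold the coordinates.
indexed = pysam.tabix_index("sample.fragments.sorted.tsv", preset="bed", force=True)

# Fragments overlapping any interval can then be fetched, as the script does per peak.
tbx = pysam.TabixFile(indexed)
for fragment in tbx.fetch("chr1", 10000, 10500):
    print(fragment.split("\t")[3])  # barcode column
tbx.close()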
/src/python/infer_barcodes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script is used to infer molecular barcodes
4 | # from raw sequencing BCL data.
5 | #
6 | # It requires running Picard ExtractIlluminaBarcodes with BARCODE=N,
7 | # to extract all barcodes into *_barcode.txt.gz files first.
8 |
9 | import glob
10 | import gzip
11 | import sys
12 |
13 | [
14 | _name,
15 | multiplex_params_file,
16 | candidate_molecular_barcodes_file,
17 | barcode_matches_file,
18 | ] = sys.argv
19 |
20 | YIELD_THRESHOLD = 0.1
21 | MIN_READ_COUNT = 1e6
22 |
23 |
24 | def parse_barcodes(file_path):
25 | with open(file_path) as f:
26 | barcodes = {}
27 | for row in f.readlines():
28 | row = row.strip().split('\t')
29 | copa = row[0]
30 | barcode = ''.join(row[1:])
31 | barcodes[barcode] = copa
32 | return barcodes
33 |
34 |
35 | copa_barcodes = parse_barcodes(multiplex_params_file)
36 | molecular_barcodes = parse_barcodes(candidate_molecular_barcodes_file)
37 |
38 | # count each unique barcode combination
39 | counts = {}
40 | for extracted in glob.glob('*_barcode.txt.gz'):
41 | with gzip.open(extracted, 'rt') as f:
42 | for row in f.readlines():
43 | barcode = row.split('\t')[0]
44 | if barcode in counts:
45 | counts[barcode] += 1
46 | else:
47 | counts[barcode] = 1
48 |
49 | # add any missing barcodes from the list of CoPAs
50 | for barcode in copa_barcodes.keys():
51 | if barcode not in counts:
52 | counts[barcode] = 0
53 |
54 |
55 | def distance(b1, b2):
56 | return sum(c1 != c2 for c1, c2 in zip(b1, b2))
57 |
58 |
59 | COPA_UNDEFINED = 'UNDEFINED'
60 |
61 | # match barcodes to candidates
62 | results = {}
63 | molecular_barcode_len = len(next(iter(molecular_barcodes)))
64 | for barcode, count in counts.items():
65 | molecular_barcode_matched = False
66 | molecular_barcode_match = barcode[:molecular_barcode_len]
67 | molecular_barcode_match_name = molecular_barcode_match
68 |
69 | if molecular_barcode_match in molecular_barcodes:
70 | molecular_barcode_matched = True
71 | molecular_barcode_match_name = molecular_barcodes[molecular_barcode_match]
72 |
73 | barcode_match = molecular_barcode_match
74 | copa = copa_barcodes[barcode_match] if barcode_match in copa_barcodes else COPA_UNDEFINED
75 | if barcode_match in results:
76 | results[barcode_match]['Count'] += count
77 | else:
78 | results[barcode_match] = {
79 | 'CoPA': copa,
80 | 'Molecular Barcode': molecular_barcode_match_name,
81 | 'Count': count,
82 | 'Matched': molecular_barcode_matched
83 | }
84 |
85 | # show barcodes that correspond to a CoPA or have a matched
86 | # barcode at the top of the output file, otherwise
87 | # sort by count
88 | results = sorted(
89 | results.values(),
90 | key=lambda r: (r['CoPA'], int(not r['Matched']), -r['Count'])
91 | )
92 |
93 | # calculate % of average yield
94 | total_yield = 0
95 | copa_count = 0
96 | for r in results:
97 | if r['CoPA'] != COPA_UNDEFINED:
98 | total_yield += r['Count']
99 | copa_count += 1
100 | avg_yield = total_yield / copa_count if copa_count else None
101 | for r in results:
102 | percent_avg_yield = ''
103 | if r['CoPA'] != COPA_UNDEFINED:
104 | percent_avg_yield = '{:.2f}%'.format(
105 | 100 * r['Count'] / avg_yield) if avg_yield else 0
106 | r['Percent of average'] = percent_avg_yield
107 |
108 | # report results as a TSV
109 | with open(barcode_matches_file, 'w') as f:
110 | header = (
111 | 'CoPA', 'Molecular Barcode',
112 | 'Count', 'Percent of average',
113 | )
114 | print('\t'.join(header), file=f)
115 |
116 | # print CoPA matches, barcode matches, and the top barcodes
117 | # with the highest read count
118 | for r in results:
119 | if (
120 | r['CoPA'] != COPA_UNDEFINED or
121 | r['Matched'] or
122 | r['Count'] >= MIN_READ_COUNT
123 | ):
124 | print('\t'.join((str(r[col]) for col in header)), file=f)
125 |
126 | # fail the task (and the workflow) for low yield
127 | if not avg_yield:
128 | raise Exception('None of the candidate barcodes matched any CoPAs!')
129 | failed_copas = []
130 | for r in results:
131 | if (r['CoPA'] != COPA_UNDEFINED and
132 | float(r['Percent of average'].replace('%', '')) < YIELD_THRESHOLD):
133 | failed_copas.append(r['CoPA'])
134 | failed_copas = ', '.join(failed_copas)
135 | if failed_copas:
136 | raise Exception(
137 | f'Found CoPA(s) with < {YIELD_THRESHOLD}% yield: {failed_copas}')
--------------------------------------------------------------------------------
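The `distance` helper above computes a Hamming distance between equal-length barcodes. A minimal, self-contained sketch of how such a helper is typically used to rescue near-miss barcodes (the candidate list and observed sequence are illustrative only):

def distance(b1, b2):
    # Number of mismatched positions between two equal-length barcodes.
    return sum(c1 != c2 for c1, c2 in zip(b1, b2))

candidates = {"ACGTACGT": "sample_A", "TTGGCCAA": "sample_B"}
observed = "ACGTACGA"  # one mismatch away from sample_A's barcode

matches = [name for bc, name in candidates.items() if distance(observed, bc) <= 1]
print(matches)  # ['sample_A']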
/tasks/share_task_cell_annotation.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task cell_annotation {
4 | meta {
5 | version: 'v0.1'
6 | author: 'Zhijian Li'
7 | affiliation: 'Broad Institute of MIT and Harvard'
8 | email: 'lizhijia@broadinstitute.org'
9 | description: 'SHARE-Seq pipeline: cell type annotation using RNA-seq data.'
10 | }
11 |
12 | input {
13 | # Sample or project name
14 | String? prefix = "prefix"
15 |
16 | # Reference genome
17 | String genome
18 |
19 | # Reference data name and id
20 | String reference_data_id
21 | String reference_data_name
22 | String reference_label
23 |
24 | # Query data
25 | File query_data
26 |
27 | String? gene_id_to_symbol
28 |
29 | # Docker image
30 | String? docker_image
31 |
32 | # Runtime parameter
33 | Float? memory_factor
34 | Float? disk_factor
35 | }
36 |
37 |     # Determine the size of the input
38 |     Float input_file_size_gb = size(query_data, "G")
39 | 
40 |     # Determining memory size based on the size of the input files.
41 |     Float mem_gb = 64.0 + memory_factor * input_file_size_gb
42 | 
43 |     # Determining disk size based on the size of the input files.
44 |     Int disk_gb = round(disk_factor * input_file_size_gb)
45 | 
46 |     # Determining disk type based on the size of the disk.
47 |     String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
48 |
49 | #Output files
50 | String reference_h5ad = "${reference_data_name}.h5ad"
51 | String monitor_log = "cell_annotation_monitor.log"
52 | String notebook_log = "log/${prefix}.cell.annotation.logfile.${genome}.txt"
53 | String prediction = "${prefix}.cell.annotation.prediction.${genome}.csv"
54 | String prediction_labels = "${prefix}.cell.annotation.labels.${genome}.png"
55 | String prediction_scores = "${prefix}.cell.annotation.scores.${genome}.pdf"
56 |
57 | command {
58 | set -e
59 |
60 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 &
61 |
62 | # Download data from cellxgene
63 | python3 $(which get_cellxgene_data.py) \
64 | --id ${reference_data_id} \
65 | --out ${reference_data_name}
66 |
67 |
68 | # Perform cell annotation
69 | Rscript $(which cell_annotation.R) \
70 | --prefix ${prefix} \
71 | --reference_data_name ${reference_data_name} \
72 | --reference_label ${reference_label} \
73 | --query_data ${query_data} \
74 | --genome ${genome} \
75 | --gene_id_to_symbol ${gene_id_to_symbol}
76 |
77 | }
78 |
79 | output {
80 | File reference_h5ad = "${reference_h5ad}"
81 | File monitor_log = "${monitor_log}"
82 | File notebook_log = "${notebook_log}"
83 | File prediction = '${prediction}'
84 | File prediction_labels = '${prediction_labels}'
85 | File prediction_scores = '${prediction_scores}'
86 | }
87 |
88 | runtime {
89 | memory : "${mem_gb} GB"
90 | memory_retry_multiplier: 2
91 | disks: "local-disk ${disk_gb} ${disk_type}"
92 | docker : "${docker_image}"
93 | maxRetries:1
94 | }
95 |
96 | parameter_meta {
97 | reference_data_id: {
98 | description: 'Reference dataset id',
99 | help: 'The dataset id from cellxgene server.',
100 | examples: ['3bbb6cf9-72b9-41be-b568-656de6eb18b5']
101 | }
102 |
103 | reference_data_name: {
104 | description: 'Reference data',
105 | help: 'This file will be used as reference',
106 | examples: ['reference.h5ad']
107 | }
108 |
109 | query_data: {
110 | description: 'Query data',
111 | help: 'scRNA-seq data used as query',
112 | examples: ['put link to gcr']
113 | }
114 |
115 | genome: {
116 | description: 'Reference name',
117 | help: 'Reference genome.',
118 | examples: ['hg38', 'mm10', 'hg19', 'mm9']
119 | }
120 |
121 | prefix: {
122 | description: 'Project name',
123 | help: 'String used to name your project and associated file names',
124 | example: "shareseq"
125 | }
126 |
127 | docker_image: {
128 | description: 'Docker image.',
129 | help: 'Docker image for preprocessing step.',
130 | example: ['put link to gcr or dockerhub']
131 | }
132 |
133 | disk_factor: {
134 | description: 'Disk factor',
135 |                 help: 'Multiply this value by the input .h5 file size (GB) to determine disk space (GB)',
136 | example: 16.0
137 | }
138 |
139 | memory_factor: {
140 | description: 'Memory factor',
141 |                 help: 'Multiply this value by the input .h5 file size (GB) and add to the default 64GB memory to determine RAM (GB)',
142 | example: 1.0
143 | }
144 | }
145 | }
146 |
--------------------------------------------------------------------------------
/src/R/barcode_rank_functions.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/Rscript
2 |
3 | ## Define functions needed for plotting barcode rank
4 |
5 | # Helper function to get vectors on which to call the elbow_knee_finder.
6 | # Takes in xy values of the curve, outputs appropriate xy vectors to be passed to elbow_knee_finder.
7 | #
8 | # Function computes the second derivative of the curve, and uses the shape of the second
9 | # derivative curve to determine whether the curve has multiple "joints" (i.e. if knee should be found).
10 | # If the second derivative is uniformly positive or uniformly negative, the curve has a single "joint",
11 | # and so elbow_knee_finder can be called on the original input vectors.
12 | # Otherwise (multiple "joints"), find the zeroes of the second derivative to the left and right of the
13 | # absolute minimum of the second derivative.
14 | # These will be the endpoints of the elbow_knee_finder, so return the slices of the xy vectors
15 | # between these zeroes.
16 | get_vectors <- function(x, y){
17 | smooth_spline <- smooth.spline(x, y, spar=1)
18 | second_deriv <- predict(smooth_spline, x, deriv=2)
19 |
20 | # Second derivative values can be noisy at beginning and end of graph; exclude first 10% and last 10%
21 | # of values when establishing uniformity of second derivative sign
22 | ten_percent <- round(length(second_deriv$x)*0.1)
23 | mid_second_deriv <- second_deriv$y[(ten_percent+1):(length(second_deriv$y)-ten_percent)]
24 |
25 | if (all(mid_second_deriv >= 0) | all(mid_second_deriv <= 0)){
26 | print("Returning original vectors")
27 | return(list(x,y)) }
28 | else {
29 | # Find absolute minimum
30 | abs_min_idx <- second_deriv$x[which.min(second_deriv$y)]
31 | # Find last non-negative value before absolute minimum
32 | left_vect <- second_deriv$y[0:abs_min_idx]
33 | endpt_1_idx <- tail(which(left_vect >= 0), n=1)
34 |     # Find first non-negative value after absolute minimum
35 | right_vect <- second_deriv$y[abs_min_idx:length(second_deriv$y)]
36 | endpt_2_idx <- abs_min_idx + which(right_vect >= 0)[1] - 1
37 |
38 | # Error cases: revert to elbow finder
39 | # Used when second derivative curve has both positive and negative values,
40 | # but doesn't match positive-negative-positive shape expected of a knee's second derivative
41 | if (length(endpt_1_idx)==0 | length(endpt_2_idx)==0){
42 | print("Returning original vectors")
43 | return(list(x,y))
44 | } else if (is.na(endpt_1_idx) | is.na(endpt_2_idx)){
45 | print("Returning original vectors")
46 | return(list(x,y))
47 | } else {
48 | print("Returning sliced vectors")
49 | return(list(x[endpt_1_idx:endpt_2_idx], y[endpt_1_idx:endpt_2_idx]))
50 | }
51 | }
52 | }
53 |
54 | # Function to find the elbow or knee of a plot.
55 | # Takes in set of xy coordinates of the plot and mode, returns point which is farthest
56 | # from the line formed by the endpoints.
57 | # Basic mode (default) is used when the plot is known to have only one "joint",
58 | # whereas advanced mode is used when it is not known whether the function needs to find an
59 | # elbow or a knee.
60 | elbow_knee_finder <- function(x, y, mode="basic") {
61 | # With advanced mode, use helper function to determine which vectors to perform calculation on
62 | if (mode == "advanced") {
63 | # smooth.spline() function used in get_vectors() requires at least 4 unique
64 | # x values; preempt this error
65 | if (length(unique(x)) < 4) {
66 | return(NULL)
67 | } else {
68 | xy_vects <- get_vectors(x, y)
69 | x <- xy_vects[[1]]
70 | y <- xy_vects[[2]]
71 | }
72 | }
73 | # Error case: return null if vectors have length 0
74 | if (length(x)==0 | length(y)==0) {
75 | return(NULL)
76 | }
77 | # Get endpoints (point with smallest x value, point with largest x value)
78 | endpts_df <- data.frame(x_coords=c(x[1], x[length(x)]),
79 | y_coords=c(y[1], y[length(y)]))
80 | # Fit line between endpoints
81 | fit <- lm(endpts_df$y_coords ~ endpts_df$x_coords)
82 | # For each point, get distance from line
83 | distances <- numeric(length(x))
84 | for(i in 1:length(x)) {
85 | distances[i] <- abs(coef(fit)[2]*x[i] - y[i] + coef(fit)[1]) / sqrt(coef(fit)[2]^2 + 1^2)
86 | }
87 |
88 | # Get point farthest from line
89 | x_max_dist <- x[which.max(distances)]
90 | y_max_dist <- y[which.max(distances)]
91 |
92 | return(c(x_max_dist, y_max_dist))
93 | }
94 |
95 | # Function to find the elbow/knee of a plot, and the elbow/knee of the points
96 | # before the first elbow/knee (i.e. elbow/knee of all barcodes, and elbow/knee
97 | # of top-ranked barcodes).
98 | # Takes in xy coordinates of the plot and returns vector of four coordinates:
99 | # xy coordinates of first elbow/knee, and xy coordinates of second elbow/knee.
100 | get_elbow_knee_points <- function(x, y) {
101 |   point_1 <- elbow_knee_finder(x, y, mode="basic")
102 |   point_2 <- NULL
103 |   if (!is.null(point_1)) {
104 |     point_2 <- elbow_knee_finder(x[1:point_1[1]], y[1:point_1[1]], mode="advanced")
105 |   }
106 |   return(c(point_1, point_2))
107 | }
108 | 
--------------------------------------------------------------------------------
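As a minimal numpy sketch of the geometric idea behind `elbow_knee_finder` above (for illustration only; the pipeline uses the R implementation): fit the line through the curve's endpoints and return the point with the largest perpendicular distance from it.

import numpy as np

def find_elbow(x, y):
    x, y = np.asarray(x, float), np.asarray(y, float)
    if len(x) == 0:
        return None
    # Line through the two endpoints: y = slope * x + intercept
    slope = (y[-1] - y[0]) / (x[-1] - x[0])
    intercept = y[0] - slope * x[0]
    # Perpendicular distance of every point from that line
    distances = np.abs(slope * x - y + intercept) / np.sqrt(slope ** 2 + 1)
    i = int(np.argmax(distances))
    return x[i], y[i]

# Toy barcode-rank style curve with a sharp drop around rank 50
ranks = np.arange(1, 101)
counts = np.where(ranks <= 50, 1000 - ranks, 100 - ranks)
print(find_elbow(ranks, counts))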
/src/python/generate_h5_rna.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf8
3 |
4 | """
5 | This script takes in the STARsolo barcodes tsv file, features tsv file,
6 | and raw count matrix mtx file, and generates an h5 file containing the
7 | genes x barcodes count matrix.
8 | """
9 |
10 | import argparse
11 | from collections import defaultdict
12 | import gzip
13 | import h5py
14 | import logging
15 | from scipy.sparse import csc_matrix
16 |
17 | def parse_arguments():
18 | parser = argparse.ArgumentParser(description="Generate an h5 count matrix of genes x barcodes")
19 | parser.add_argument("matrix_file", help="Filename for STARsolo raw matrix mtx file")
20 | parser.add_argument("features_file", help="Filename for STARsolo features tsv file")
21 | parser.add_argument("barcodes_file", help="Filename for STARsolo barcodes tsv file")
22 | parser.add_argument("output_file", help="Filename for output h5 file")
23 | parser.add_argument("pkr", help="Experiment prefix", nargs = '?')
24 | parser.add_argument("--ensembl", help="Flag for outputting genes using ENSEMBL ID, rather than gene name", action="store_true")
25 |
26 | return parser.parse_args()
27 |
28 | def get_split_lines(file_name, delimiter, skip=0):
29 | """Read file contents and yield generator with line entries"""
30 | opener = gzip.open if file_name.endswith('.gz') else open
31 |
32 | with opener(file_name, "rt") as f:
33 | for i in range(skip):
34 | next(f)
35 | for line in f:
36 | yield line.rstrip().split(sep=delimiter)
37 |
38 | def rename_duplicates(duplicate_list):
39 | """Rename duplicate entries as entry, entry.1, entry.2, etc."""
40 | seen = defaultdict(int)
41 | renamed_list = []
42 |
43 | for entry in duplicate_list:
44 | renamed_list.append(f"{entry}.{seen[entry]}" if entry in seen else entry)
45 | seen[entry] += 1
46 |
47 | return renamed_list
48 |
49 | def build_count_matrix(matrix):
50 | """Convert contents of mtx file to csc matrix"""
51 | # first line of matrix contains dimensions
52 | dimensions = next(matrix)
53 | n_rows = int(dimensions[0])
54 | n_cols = int(dimensions[1])
55 |
56 | gene_indices = []
57 | barcode_indices = []
58 | counts = []
59 |
60 | for line in matrix:
61 | # subtract 1 from indices to convert to zero-based indexing
62 | gene_indices.append(int(line[0])-1)
63 | barcode_indices.append(int(line[1])-1)
64 | counts.append(int(line[2]))
65 |
66 | count_matrix = csc_matrix((counts, (gene_indices,barcode_indices)), shape=(n_rows,n_cols))
67 |
68 | return count_matrix
69 |
70 | def write_h5(output_file, count_matrix, barcode_list, gene_list):
71 | h5_file = h5py.File(output_file, "w")
72 |
73 | # create datasets expected for Seurat import
74 | g = h5_file.create_group("group")
75 | g.create_dataset("barcodes", data=barcode_list)
76 | g.create_dataset("data", data=count_matrix.data)
77 | g.create_dataset("gene_names", data=gene_list)
78 | g.create_dataset("genes", data=gene_list)
79 | g.create_dataset("indices", data=count_matrix.indices)
80 | g.create_dataset("indptr", data=count_matrix.indptr)
81 | g.create_dataset("shape", data=count_matrix.shape)
82 |
83 | h5_file.close()
84 |
85 | def main():
86 | # create log file
87 | logging.basicConfig(filename="generate_h5_rna.log", level=logging.INFO)
88 |
89 | # get arguments
90 | args = parse_arguments()
91 | matrix_file = getattr(args, "matrix_file")
92 | features_file = getattr(args, "features_file")
93 | barcodes_file = getattr(args, "barcodes_file")
94 | pkr = getattr(args, "pkr", None)
95 | output_file = getattr(args, "output_file")
96 | ensembl = getattr(args, "ensembl")
97 |
98 | # read input files
99 | logging.info("Reading input files\n")
100 |
101 | # get indices and counts from matrix file; skip first two lines of matrix file (header)
102 | matrix = get_split_lines(matrix_file, delimiter=" ", skip=2)
103 |
104 | # get genes from features file
105 | features = get_split_lines(features_file, delimiter="\t")
106 | if ensembl:
107 | gene_list = [line[0] for line in features]
108 | else:
109 | gene_list_duplicated = [line[1] for line in features]
110 | # append .1, .2, etc. for duplicated genes
111 | gene_list = rename_duplicates(gene_list_duplicated)
112 |
113 | # get barcodes from barcodes file, reformat as R1R2R3_PKR
114 | barcodes = get_split_lines(barcodes_file, delimiter="\t")
115 | barcode_list = [line[0] for line in barcodes]
116 | if pkr is None:
117 | formatted_barcode_list = barcode_list
118 | else:
119 | formatted_barcode_list = [barcode + "_" + pkr for barcode in barcode_list]
120 |
121 | # generate count matrix
122 | logging.info("Generating count matrix\n")
123 | count_matrix = build_count_matrix(matrix)
124 |
125 | # write h5 file
126 | logging.info(f"Writing to {output_file}.h5\n")
127 | write_h5(output_file, count_matrix, formatted_barcode_list, gene_list)
128 | logging.info("Finished writing h5 file\n")
129 |
130 | if __name__ == "__main__":
131 | main()
132 |
--------------------------------------------------------------------------------
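A minimal sketch (assuming scipy is installed) of the index conversion done in `build_count_matrix` above: MatrixMarket entries are 1-based (gene, barcode, count) triplets, while scipy expects 0-based indices. The three entries are illustrative.

from scipy.sparse import csc_matrix

mtx_entries = [(1, 1, 5), (3, 2, 2), (2, 2, 7)]  # gene, barcode, count (1-based)
genes = [g - 1 for g, _, _ in mtx_entries]       # convert to 0-based row indices
barcodes = [b - 1 for _, b, _ in mtx_entries]    # convert to 0-based column indices
counts = [c for _, _, c in mtx_entries]

m = csc_matrix((counts, (genes, barcodes)), shape=(3, 2))
print(m.toarray())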
/src/python/rna_barcode_metadata.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | This script takes in a bam file, and outputs a txt file containing the number of
5 | total reads, duplicate reads, UMIs, genes, and percent mitochondrial reads for each barcode.
6 | """
7 |
8 | import argparse
9 | import logging
10 | import pysam
11 | from collections import defaultdict
12 |
13 | logging.basicConfig(filename='barcode_metadata.log', encoding='utf-8', level=logging.DEBUG)
14 | logging.debug('Creating the barcode metadata for RNA from bam.')
15 |
16 | def parse_arguments():
17 | parser = argparse.ArgumentParser(description="Get total reads, duplicate reads, UMIs, genes, and percent mitochondrial reads for each barcode from bam file")
18 | parser.add_argument("bam_file", help="Filename for input bam file")
19 | parser.add_argument("bai_file", help="Filename for bam index file")
20 | parser.add_argument("barcode_metadata_file", help="Filename for output barcode metadata txt file")
21 | parser.add_argument("pkr", help="PKR id for shareseq", default = None, nargs='?')
22 | parser.add_argument("--barcode_tag", help="PKR id for shareseq", default="CB")
23 |
24 | return parser.parse_args()
25 |
26 | def get_metrics(bam, barcode_tag="CB", pkr=None):
27 | """
28 | Get barcode metrics from bam file; all counts are only for reads overlapping genes.
29 | Reported metrics are total counts, UMIs (one UMI counted per unique UMI-gene mapping),
30 | duplicate counts, genes, percent mitochondrial reads
31 | """
32 | total_counts = defaultdict(int)
33 | genes = defaultdict(set)
34 | umi_gene = defaultdict(set)
35 | mitochondrial_counts = defaultdict(int)
36 | barcodes = set()
37 | formatted_barcodes = {}
38 |
39 | for read in bam:
40 | try:
41 | # get barcode; skip read if not present
42 | barcode = read.get_tag(barcode_tag)
43 | if barcode == "-":
44 | #logging.warning(f"Skipping {read.qname} because the {barcode_tag} tag is empty") slowing down
45 | continue
46 |
47 | # get gene id; skip read if not present
48 | gene_id = read.get_tag("GX")
49 | if gene_id == "-":
50 | #logging.warning(f"Skipping {read.qname} because the GX tag is empty")
51 | continue
52 |
53 | # get UMI; skip read if not present
54 | umi = read.get_tag("UB")
55 | if umi == "-":
56 | #logging.warning(f"Skipping {read.qname} because the UB tag is empty")
57 | continue
58 |
59 | barcodes.add(barcode)
60 |
61 | total_counts[barcode] += 1
62 |
63 | genes[barcode].add(gene_id)
64 |
65 | umi_gene[barcode].add(umi + gene_id)
66 |
67 | if read.reference_name == "chrM":
68 | mitochondrial_counts[barcode] += 1
69 | except KeyError:
70 |             logging.error(f"Skipping {read.qname} because one of the tags {barcode_tag}, GX, or UB is missing.")
71 |
72 | # count unique genes per barcode
73 | genes_per_barcode = {barcode:len(gene_set) for (barcode, gene_set) in genes.items()}
74 |
75 | # count unique umi-gene mappings per barcode
76 | umis_per_barcode = {barcode:len(umi_gene_set) for (barcode, umi_gene_set) in umi_gene.items()}
77 |
78 | # create list with barcodes and associated metrics
79 | barcode_metadata = []
80 | for barcode in barcodes:
81 | total_val = str(total_counts[barcode])
82 | umi_val = str(umis_per_barcode.get(barcode, 0))
83 | duplicate_val = str(total_counts[barcode] - umis_per_barcode.get(barcode, 0))
84 | gene_val = str(genes_per_barcode.get(barcode, 0))
85 | mitochondrial_val = str(round(mitochondrial_counts.get(barcode, 0) / total_counts[barcode] * 100, 2))
86 | out_barcode = barcode + "_" + pkr if pkr else barcode
87 |
88 | metrics = [out_barcode, total_val, duplicate_val, umi_val, gene_val, mitochondrial_val]
89 |
90 | barcode_metadata.append(metrics)
91 |
92 | return barcode_metadata
93 |
94 | def write_metadata_file(barcode_metadata, output_file):
95 | fields = ["barcode", "total_counts", "duplicate_counts", "umis", "genes", "percent_mitochondrial"]
96 |
97 | with open(output_file, "w") as f:
98 | # write header
99 | f.write("\t".join(fields) + "\n")
100 | # write rows
101 | for metrics_list in barcode_metadata:
102 | f.write("\t".join(metrics_list[:]) + "\n")
103 |
104 | def main():
105 | # get arguments
106 | args = parse_arguments()
107 | bam_file = getattr(args, "bam_file")
108 | bai_file = getattr(args, "bai_file")
109 |
110 | pkr = getattr(args, "pkr")
111 | barcode_tag = getattr(args, "barcode_tag")
112 |
113 | barcode_metadata_file = getattr(args, "barcode_metadata_file")
114 |
115 | # load bam file
116 | bam = pysam.AlignmentFile(bam_file, "rb", index_filename=bai_file)
117 |
118 | # get metrics for each barcode
119 | barcode_metadata = get_metrics(bam, barcode_tag, pkr)
120 |
121 | # write txt file
122 | write_metadata_file(barcode_metadata, barcode_metadata_file)
123 |
124 | if __name__ == "__main__":
125 |
126 | main()
127 |
--------------------------------------------------------------------------------
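A minimal sketch (assuming pandas is available) of consuming the tab-separated barcode metadata written by `write_metadata_file` above; the file name and thresholds are illustrative.

import pandas as pd

meta = pd.read_csv("sample.barcode.metadata.tsv", sep="\t")

# Keep barcodes with at least 100 UMIs and under 20% mitochondrial reads.
passing = meta[(meta["umis"] >= 100) & (meta["percent_mitochondrial"] < 20)]
print(len(passing), "barcodes pass")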
/src/bash/monitor_script.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | declare -a TEMP=$(mktemp temp_monitoring.XXXXXXXX)
4 |
5 | if [[ -z "${BACKEND}" ]]; then
6 | backend=""
7 | else
8 | backend=${BACKEND}
9 | fi
10 |
11 | function get_disk_info() {
12 | # df command and cromwell root field
13 | if [ "$backend" = "aws" ]; then
14 | df | grep '/$'
15 | else
16 | df | grep cromwell_root
17 | fi
18 | }
19 |
20 | function get_disk_usage() {
21 | # get disk usage field
22 | get_disk_info | awk '{ print $5 }'
23 | }
24 |
25 | function get_mem_info() {
26 | # /proc/meminfo
27 | cat /proc/meminfo
28 | }
29 |
30 | function get_mem_available() {
31 | # mem unused from /proc/meminfo
32 | get_mem_info | grep MemAvailable | awk 'BEGIN { FS=" " } ; { print $2 }'
33 | }
34 |
35 | function get_mem_total() {
36 | # mem total from /proc/meminfo
37 | get_mem_info | grep MemTotal | awk 'BEGIN { FS=" " } ; { print $2 }'
38 | }
39 |
40 | function get_mem_usage() {
41 | # memTotal and memAvailable
42 | local -r mem_total=$(get_mem_total)
43 | local -r mem_available=$(get_mem_available)
44 |
45 | # usage = 100 * mem_used / mem_total
46 | local -r mem_used=$(($mem_total-$mem_available))
47 | echo "$mem_used" "$mem_total" "%"| awk '{ print 100*($1/$2)$3 }'
48 | }
49 |
50 | function get_cpu_info() {
51 | # cpu info from /proc/stat
52 | cat /proc/stat | grep "cpu "
53 | }
54 |
55 | function get_cpu_total() {
56 | # get the total cpu usage since a given time (including idle and iowait)
57 | # user+nice+system+idle+iowait+irq+softirq+steal
58 | get_cpu_info | awk 'BEGIN { FS=" " } ; { print $2+$3+$4+$5+$6+$7+$8+$9 }'
59 | }
60 |
61 | function get_cpu_used() {
62 | # get the cpu usage since a given time (w/o idle or iowait)
63 | # user+nice+system+irq+softirq+steal
64 | get_cpu_info | awk 'BEGIN { FS=" " } ; { print $2+$3+$4+$7+$8+$9 }'
65 | }
66 |
67 | function get_cpu_usage() {
68 | # get the cpu usage since a given time (w/o idle or iowait)
69 | # user+nice+system+irq+softirq+steal
70 | local -r cpu_used_cur=$(get_cpu_used)
71 |
72 | # get the total cpu usage since a given time (including idle and iowait)
73 | # user+nice+system+idle+iowait+irq+softirq+steal
74 | local -r cpu_total_cur=$(get_cpu_total)
75 |
76 | # read in previous cpu usage values
77 | read -r -a cpu_prev < ${TEMP}
78 | local -r cpu_used_prev=${cpu_prev[0]}
79 | local -r cpu_total_prev=${cpu_prev[1]}
80 |
81 | # save current values as prev values for next iteration
82 | cpu_prev[0]=$cpu_used_cur
83 | cpu_prev[1]=$cpu_total_cur
84 | echo "${cpu_prev[@]}" > ${TEMP}
85 |
86 | # usage = 100 * (cpu_used_cur - cpu_used_prev) / (cpu_total_cur-cpu_total_prev)
87 | echo "$cpu_used_cur" "$cpu_used_prev" "$cpu_total_cur" "$cpu_total_prev" "%"| awk 'BEGIN {FS=" "} ; { print 100*(($1-$2)/($3-$4))$5 }'
88 |
89 | }
90 |
91 | function print_usage() {
92 | echo [$(date)]
93 | echo \* CPU usage: "$(get_cpu_usage)"
94 | echo \* Memory usage: "$(get_mem_usage)"
95 | echo \* Disk usage: $(get_disk_usage)
96 | }
97 |
98 | function print_summary() {
99 | # display header information
100 | echo ==================================
101 | echo =========== MONITORING ===========
102 | echo ==================================
103 |
104 | # summary info
105 | echo --- General Information ---
106 | # number of cores
107 | echo \#CPU: $(nproc)
108 | # multiply by 10^-6 to convert KB to GB
109 | echo Total Memory: $(echo $(get_mem_total) 1000000 | awk '{ print $1/$2 }')G
110 |
111 | if [ "$backend" = "aws" ]; then
112 | echo Total Disk space: $(df -h | grep '/$' | awk '{ print $2 }')
113 | else
114 | echo Total Disk space: $(df -h | grep cromwell_root | awk '{ print $2}')
115 | fi
116 | }
117 |
118 | function main() {
119 |     # disk, mem and cpu general statistics
120 | print_summary
121 |
122 | # create variable to store cpu being used (cpu_prev[0]) and total cpu total (cpu_prev[1])
123 | # save variable to a temp file to allow passing in values to a function
124 | declare -a cpu_prev
125 | cpu_prev[0]=$(get_cpu_used)
126 | cpu_prev[1]=$(get_cpu_total)
127 | # save global values to temp file to allow passing in values to a function
128 | echo "${cpu_prev[@]}" > ${TEMP}
129 |
130 |     # sleep b/w getting usage and initially storing the cpu_previous usage values
131 | # this is b/c cpu usage values are time dependent
132 | # to calculate cpu usage, values must be determined from 2 diff time stamps
133 | if [ -z "$MONITOR_SCRIPT_SLEEP" ]; then
134 | MONITOR_SCRIPT_SLEEP=30
135 | fi
136 | # get usage of disk, cpu and mem every MONITOR_SCRIPT_SLEEP sec
137 | echo
138 | echo --- Runtime Information ---
139 |
140 | sleep "$MONITOR_SCRIPT_SLEEP";
141 | while true; do print_usage; sleep "$MONITOR_SCRIPT_SLEEP"; done
142 | }
143 |
144 | main
145 |
--------------------------------------------------------------------------------
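A minimal Python sketch of the CPU-usage calculation in `get_cpu_usage` above (Linux only, since it reads /proc/stat): sample the aggregate cpu line twice and compare the used and total deltas.

import time

def read_cpu():
    with open("/proc/stat") as f:
        fields = [int(v) for v in f.readline().split()[1:]]
    # fields: user nice system idle iowait irq softirq steal ...
    total = sum(fields[:8])
    used = total - fields[3] - fields[4]  # exclude idle and iowait
    return used, total

used_prev, total_prev = read_cpu()
time.sleep(1)
used_cur, total_cur = read_cpu()

print(f"CPU usage: {100 * (used_cur - used_prev) / (total_cur - total_prev):.1f}%")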
/tasks/share_task_qc_rna.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-qc-rna
5 |
6 | task qc_rna {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Mei Knudson (mknudson@broadinstitute.org) at Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: QC RNA task'
11 | }
12 |
13 | input {
14 |         # This task takes as input the sorted bam file produced by STARsolo
15 | File bam
16 | Int? umi_cutoff = 100
17 | Int? gene_cutoff = 100
18 | String genome_name
19 | String? barcode_tag = "CB"
20 | String? pkr
21 | String? prefix
22 |
23 | Int? cpus = 16
24 | Float? disk_factor = 1.0
25 | Float? memory_factor = 1.5
26 | String docker_image = "us.gcr.io/buenrostro-share-seq/share_task_qc_rna:v1.0.0"
27 | }
28 |
29 | # Determine the size of the input
30 | Float input_file_size_gb = size(bam, "G")
31 |
32 | # Determining memory size based on the size of the input files.
33 | Float mem_gb = 5.0 + memory_factor * input_file_size_gb
34 |
35 | # Determining disk size based on the size of the input files.
36 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb)
37 |
38 | # Determining disk type based on the size of disk.
39 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
40 |
41 | String assay = "RNA"
42 | String bai = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.bam.bai"
43 | String barcode_metadata = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.barcode.metadata.tsv"
44 | String duplicates_log = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.duplicates.log.txt"
45 | String umi_barcode_rank_plot = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.umi.barcode.rank.plot.png"
46 | String gene_barcode_rank_plot = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.gene.barcode.rank.plot.png"
47 | String gene_umi_scatter_plot = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.gene.umi.scatter.plot.png"
48 | String monitor_log = "monitor.log"
49 |
50 | command <<<
51 | set -e
52 |
53 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 &
54 |
55 | # Index bam file
56 | samtools index -@ ~{cpus} ~{bam} ~{bai}
57 |
58 | # Extract barcode metadata (total counts, unique counts, duplicate counts, genes, percent mitochondrial) from bam file
59 | python3 $(which rna_barcode_metadata.py) ~{bam} \
60 | ~{bai} \
61 | ~{barcode_metadata} \
62 | ~{pkr} ~{"--barcode_tag " + barcode_tag}
63 |
64 | awk '{total+=$2; duplicate+=$3; unique+=$4} END {print "total reads:", total; print "unique reads:", unique; print "duplicate reads:", duplicate}' ~{barcode_metadata} > ~{duplicates_log}
65 |
66 | # Make QC plots
67 | Rscript $(which rna_qc_plots.R) ~{barcode_metadata} ~{umi_cutoff} ~{gene_cutoff} ~{umi_barcode_rank_plot} ~{gene_barcode_rank_plot} ~{gene_umi_scatter_plot}
68 | >>>
69 |
70 | output {
71 | File rna_barcode_metadata = "~{barcode_metadata}"
72 | File rna_duplicates_log = "~{duplicates_log}"
73 | File rna_barcode_metadata_log = "barcode_metadata.log"
74 | File? rna_umi_barcode_rank_plot = "~{umi_barcode_rank_plot}"
75 | File? rna_gene_barcode_rank_plot = "~{gene_barcode_rank_plot}"
76 | File? rna_gene_umi_scatter_plot = "~{gene_umi_scatter_plot}"
77 | }
78 |
79 | runtime {
80 | cpu : cpus
81 | memory : "~{mem_gb} GB"
82 | disks: "local-disk ~{disk_gb} ~{disk_type}"
83 | docker : "${docker_image}"
84 | }
85 |
86 | parameter_meta {
87 | bam: {
88 | description: 'Alignment bam file',
89 | help: 'Aligned reads in bam format.',
90 | example: 'hg38.aligned.bam'
91 | }
92 | umi_cutoff: {
93 | description: 'UMI cutoff',
94 | help: 'Cutoff for number of UMIs required when making UMI barcode rank plot.',
95 | example: 10
96 | }
97 | gene_cutoff: {
98 | description: 'Gene cutoff',
99 | help: 'Cutoff for number of genes required when making gene barcode rank plot.',
100 | example: 10
101 | }
102 | pkr: {
103 | description: 'Experiment pkr',
104 | help: 'Id of the sample pkr (share-seq specific).',
105 | examples: ['SS-PKR-000']
106 | }
107 | genome_name: {
108 | description: 'Reference name',
109 |                 help: 'The name of the genome reference used to align.',
110 | example: ['hg38', 'mm10', 'hg19', 'mm9']
111 | }
112 | prefix: {
113 | description: 'Prefix for output files',
114 | help: 'Prefix that will be used to name the output files',
115 | example: 'MyExperiment'
116 | }
117 | docker_image: {
118 | description: 'Docker image.',
119 | help: 'Docker image for preprocessing step. Dependencies: samtools',
120 | example: ['put link to gcr or dockerhub']
121 | }
122 | }
123 | }
124 |
--------------------------------------------------------------------------------
/tasks/share_task_joint_qc.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-joint-qc-plotting
5 |
6 |
7 | task joint_qc_plotting {
8 | meta {
9 | version: 'v0.1'
10 | author: 'Mei Knudson (mknudson@broadinstitute.org) at Broad Institute of MIT and Harvard'
11 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Joint QC plot'
12 | }
13 |
14 | input {
15 | # This task generates a plot of barcodes QC'd jointly by RNA and ATAC metrics, as well as a
16 | # density plot of all barcodes passing at least one filter.
17 | File? atac_barcode_metadata
18 | File? rna_barcode_metadata
19 | Int remove_low_yielding_cells = 10
20 | Int min_umis = 100
21 | Int min_genes = 200
22 | Int min_tss = 4
23 | Int min_frags = 100
24 |
25 | Float? disk_factor = 8.0
26 | Float? memory_factor = 2.0
27 |
28 | String? prefix
29 | String genome_name
30 |
31 | String docker_image = "us.gcr.io/buenrostro-share-seq/share_task_joint_qc:v1.0.0"
32 | }
33 |
34 | # Determine the size of the input
35 | Float input_file_size_gb = size(atac_barcode_metadata, "G") + size(rna_barcode_metadata, "G")
36 |
37 | # Determine memory size based on the size of the input files
38 | Float mem_gb = 5.0 + memory_factor * input_file_size_gb
39 |
40 | # Determine disk size based on the size of the input files
41 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb)
42 |
43 |     # Determining disk type based on the size of the disk.
44 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
45 |
46 | String joint_qc_plot = '${default="share-seq" prefix}.${genome_name}.joint.qc.plot.png'
47 | String joint_density_plot = '${default="share-seq" prefix}.${genome_name}.joint.density.plot.png'
48 | String joint_barcode_metadata = '${default="share-seq" prefix}.joint.barcode.metadata.${genome_name}.csv'
49 |
50 | command {
51 | set -e
52 |
53 | bash $(which monitor_script.sh) > monitoring.log &
54 |
55 | # Make joint qc plot
56 | python3 $(which joint_cell_plotting.py) ${rna_barcode_metadata} ${atac_barcode_metadata} ${remove_low_yielding_cells} ${min_umis} ${min_genes} ${min_tss} ${min_frags} ${joint_qc_plot} ${joint_barcode_metadata} ${default="share-seq" prefix}
57 |
58 | # Make joint density plot
59 | Rscript $(which joint_cell_plotting_density.R) ${default="share-seq" prefix} ${joint_barcode_metadata} ${joint_density_plot}
60 | }
61 |
62 | output {
63 | File joint_calling_monitor = "monitoring.log"
64 | File joint_calling_log = "joint_cell_plotting.log"
65 | File? joint_qc_plot = "${joint_qc_plot}"
66 | File? joint_density_plot = "${joint_density_plot}"
67 | File joint_barcode_metadata = "${joint_barcode_metadata}"
68 | }
69 |
70 | runtime {
71 | memory : "${mem_gb} GB"
72 | disks: "local-disk ${disk_gb} ${disk_type}"
73 | docker : "${docker_image}"
74 | }
75 |
76 | parameter_meta {
77 | atac_barcode_metadata: {
78 | description: 'File containing ATAC barcode metrics.',
79 | help: 'tsv file with ATAC barcode (R1,R2,R3,PKR), fragments, TSS enrichment.',
80 | example: 'qc.atac.barcode.metadata.tsv'
81 | }
82 | rna_barcode_metadata: {
83 | description: 'File containing RNA barcode metrics.',
84 | help: 'tsv file with RNA barcode (R1,R2,R3,PKR), UMIs, genes.',
85 | example: 'qc.rna.barcode.metadata.tsv'
86 | }
87 | remove_low_yielding_cells: {
88 | description: 'UMI and fragments cutoff for plotting.',
89 | help: 'Minimum number of UMIs/fragments required for barcode to be plotted.',
90 | example: 10
91 | }
92 | min_umis: {
93 | description: 'UMI cutoff for RNA QC.',
94 | help: 'Minimum number of UMIs required for barcode to pass RNA QC.',
95 | example: 100
96 | }
97 | min_genes: {
98 | description: 'Gene cutoff for RNA QC.',
99 | help: 'Minimum number of genes required for barcode to pass RNA QC.',
100 | example: 200
101 | }
102 | min_tss: {
103 | description: 'TSS cutoff for ATAC QC.',
104 | help: 'Minimum TSS score required for barcode to pass ATAC QC.',
105 | example: 4
106 | }
107 | min_frags: {
108 | description: 'Fragments cutoff for ATAC QC.',
109 | help: 'Minimum number of fragments required for barcode to pass ATAC QC.',
110 | example: 100
111 | }
112 | prefix: {
113 | description: 'Prefix for output files',
114 | help: 'Prefix that will be used to name the output files',
115 |             example: 'MyExperiment'
116 | }
117 | genome_name: {
118 | description: 'Reference name',
119 |             help: 'The name of the genome reference used for alignment.',
120 | example: ['hg38', 'mm10', 'hg19', 'mm9']
121 | }
122 | docker_image: {
123 | description: 'Docker image.',
124 |             help: 'Docker image for the joint QC plotting step.',
125 | example: ['put link to gcr or dockerhub']
126 | }
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
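The runtime block above is sized from the metadata file sizes: memory is 5 GB plus `memory_factor` times the input size, disk is 40 GB plus `disk_factor` times the input size, and anything over 375 GB switches from LOCAL to SSD. A minimal Python sketch of that arithmetic, using a hypothetical 1.5 GB of input, for readers who want to sanity-check the factors:

```python
# Illustrative only: mirrors the resource arithmetic declared in share_task_joint_qc.wdl.
def joint_qc_resources(input_size_gb, memory_factor=2.0, disk_factor=8.0):
    """Return (mem_gb, disk_gb, disk_type) the way the WDL task computes them."""
    mem_gb = 5.0 + memory_factor * input_size_gb
    disk_gb = round(40.0 + disk_factor * input_size_gb)
    disk_type = "SSD" if disk_gb > 375 else "LOCAL"
    return mem_gb, disk_gb, disk_type

print(joint_qc_resources(1.5))  # -> (8.0, 52, 'LOCAL') for 1.5 GB of barcode metadata
```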
/tasks/10x_task_preprocess.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # 10x_task_preprocess
5 |
6 | task preprocess_tenx {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: preprocess 10x ATAC data.'
11 | }
12 |
13 | input {
14 |         # This task takes as input the three FASTQs produced by cellranger mkfastq and preprocesses them.
15 | File fastq_R1 # Pair 1 reads
16 | File fastq_R3 # Pair 2 reads
17 | File fastq_R2 # Barcode fastq
18 | File? whitelist # Barcode whitelist (chemistry specific)
19 | Int? barcode_dist = 2
20 | Float? threshold_pct_barcode_matching = 0.60
21 | String chemistry
22 | String? prefix
23 | Int? cpus = 16
24 | Float? disk_factor = 8.0
25 | Float? memory_factor = 0.15
26 | String docker_image = "us.gcr.io/buenrostro-share-seq/10x_task_preprocess:v1.0.0"
27 | }
28 |
29 | # Determine the size of the input
30 | Float input_file_size_gb = size(fastq_R1, "G") + size(fastq_R2, "G") + size(fastq_R3, "G")
31 |
32 |     # Determining memory size based on the size of the input files.
33 | Float mem_gb = 5.0 + memory_factor * input_file_size_gb
34 |
35 |     # Determining disk size based on the size of the input files.
36 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb)
37 |
38 |     # Determining disk type based on the size of the disk.
39 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
40 |
41 | # auto-detect barcode complementation outfiles
42 | String barcode_complementation_qc = "${default="10x" prefix}.atac.preprocess.complementation.qc.txt"
43 | String barcode_complementation_out = "${default="10x" prefix}.atac.preprocess.complementation.out.txt"
44 |
45 | # barcode correction and filtering outfiles
46 | String barcode_correction_qc = "${default="10x" prefix}.atac.preprocess.barcode.correction.qc.txt"
47 | String cleaned_fastq_R1 = "${default="10x" prefix}.atac.preprocess.cleaned.R1.fastq.gz"
48 | String cleaned_fastq_R2 = "${default="10x" prefix}.atac.preprocess.cleaned.R2.fastq.gz"
49 |
50 | # read trimming outfiles
51 | String final_fastq_R1 = "${default="10x" prefix}.atac.preprocess.cleaned.trimmed.R1.fastq.gz"
52 | String final_fastq_R2 = "${default="10x" prefix}.atac.preprocess.cleaned.trimmed.R2.fastq.gz"
53 | String trimming_log_json = "${default="10x" prefix}.atac.preprocess.trimming.log.json"
54 | String trimming_log_html = "${default="10x" prefix}.atac.preprocess.trimming.log.html"
55 | String trimming_stats = "${default="10x" prefix}.atac.preprocess.trimming.adapter.stats.txt"
56 |
57 | String barcode_conversion_dict = "barcode_conversion_dict.csv"
58 |
59 | String monitor_log = 'monitor_10x_preprocessing.log.txt'
60 |
61 | command <<<
62 | set -e
63 |
64 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 &
65 |
66 | # Strip read description
67 | zcat ~{fastq_R1} | sed 's/ .*//' | gzip > stripped_R1.fastq.gz
68 | zcat ~{fastq_R3} | sed 's/ .*//' | gzip > stripped_R2.fastq.gz
69 | zcat ~{fastq_R2} | sed 's/ .*//' | gzip > stripped_barcode.fastq.gz
70 |
71 | if [[ '~{whitelist}' == *.gz ]]; then
72 | gunzip -c ~{whitelist} > whitelist.txt
73 | else
74 | ln -s ~{whitelist} whitelist.txt
75 | fi
76 |
77 | # auto-detect barcode complementation
78 | # python3 barcode_revcomp_detect.py barcode_fastq chemistry whitelist qc_out out threshold
79 |
80 | python3 $(which barcode_revcomp_detect.py) stripped_barcode.fastq.gz ~{chemistry} whitelist.txt ~{barcode_complementation_qc} ~{barcode_complementation_out} ~{threshold_pct_barcode_matching}
81 |
82 | # barcode correction and filtering
83 | # python3 match_barcodes.py
84 |
85 | python3 $(which match_barcodes.py) stripped_R1.fastq.gz stripped_R2.fastq.gz stripped_barcode.fastq.gz ~{chemistry} ~{barcode_dist} ~{barcode_complementation_out} whitelist.txt ~{cleaned_fastq_R1} ~{cleaned_fastq_R2} ~{barcode_correction_qc} ~{cpus}
86 |
87 |         # Clean up intermediate files
88 | rm stripped_R1.fastq.gz stripped_R2.fastq.gz stripped_barcode.fastq.gz
89 | >>>
90 |
91 | output {
92 | File fastq_R1_preprocessed = cleaned_fastq_R1
93 | File fastq_R2_preprocessed = cleaned_fastq_R2
94 | File tenx_barcode_complementation_qc = barcode_complementation_qc
95 | File tenx_barcode_correction_qc = barcode_correction_qc
96 | File? tenx_barcode_conversion_dict = barcode_conversion_dict
97 | #File tenx_trimming_log_json = trimming_log_json
98 | #File trimming_log_html = trimming_log_html
99 | #File tenx_trimming_stats = trimming_stats
100 | }
101 |
102 | runtime {
103 | cpu: cpus
104 | docker: "${docker_image}"
105 | disks: "local-disk ${disk_gb} ${disk_type}"
106 | memory: "${mem_gb} GB"
107 | }
108 |
109 | parameter_meta {
110 |         fastq_R1: {
111 |             description: 'Pair 1 FASTQ',
112 |             help: 'FASTQ file containing the pair 1 (genomic) reads.',
113 |         }
114 |         fastq_R2: {
115 |             description: 'Barcode FASTQ',
116 |             help: 'FASTQ file containing the cell barcode reads.',
117 |         }
118 |         fastq_R3: {
119 |             description: 'Pair 2 FASTQ',
120 |             help: 'FASTQ file containing the pair 2 (genomic) reads.',
121 |         }
122 | }
123 |
124 | }
125 |
--------------------------------------------------------------------------------
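The first step of the command above strips the read description (everything after the first space) from each FASTQ header with `sed 's/ .*//'` so that downstream barcode matching sees bare read names. A rough Python equivalent, shown only to illustrate what the one-liner does; the file names are placeholders:

```python
# Illustrative equivalent of: zcat in.fastq.gz | sed 's/ .*//' | gzip > out.fastq.gz
import gzip

def strip_read_descriptions(in_path, out_path):
    """Copy a gzipped FASTQ, keeping only the first space-delimited token of every line.

    Header lines lose their description; sequence and quality lines contain no
    spaces, so they pass through unchanged (just as with the sed call).
    """
    with gzip.open(in_path, "rt") as fin, gzip.open(out_path, "wt") as fout:
        for line in fin:
            fout.write(line.split(" ", 1)[0].rstrip("\n") + "\n")

# strip_read_descriptions("R1.fastq.gz", "stripped_R1.fastq.gz")  # hypothetical paths
```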
/src/R/rna_qc_plots.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/Rscript
2 |
3 | ### Takes an RNA barcode metadata TSV file and outputs QC plots as PNG files.
4 | ### QC plots include barcode rank by number of UMIs (all barcodes and top-ranked barcodes),
5 | ### barcode rank by number of genes (all barcodes and top-ranked barcodes),
6 | ### and genes vs UMIs scatter plot.
7 |
8 | ## Import helper functions
9 | source("/usr/local/bin/barcode_rank_functions.R")
10 |
11 | ## Get arguments, read input
12 | args <- commandArgs()
13 |
14 | barcode_metadata_file <- args[6]
15 | umi_cutoff <- as.integer(args[7])
16 | gene_cutoff <- as.integer(args[8])
17 | umi_rank_plot_file <- args[9]
18 | gene_rank_plot_file <- args[10]
19 | gene_umi_plot_file <- args[11]
20 |
21 | barcode_metadata <- read.table(barcode_metadata_file, header=T)
22 |
23 | ## Get plot inputs
24 |
25 | # Impose UMI cutoff, sort in decreasing order, assign rank
26 | umi_filtered <- barcode_metadata$umis[barcode_metadata$umis >= umi_cutoff]
27 | umi_filtered_sort <- sort(umi_filtered, decreasing=T)
28 | umi_rank <- 1:length(umi_filtered_sort)
29 |
30 | # Find elbow/knee of UMI barcode rank plot and top-ranked UMI barcode rank plot
31 | umi_points <- get_elbow_knee_points(x=umi_rank, y=log10(umi_filtered_sort))
32 | # For each valid plot, make factor for coloring plot points
33 | if (length(umi_points) > 0) { # Elbow found in first plot
34 | umi_plot1 <- TRUE
35 | is_top_ranked_umi <- factor(ifelse(umi_rank <= umi_points[1], 1, 0))
36 | if (length(umi_points) > 2) { # Elbow/knee found in second plot
37 | umi_plot2 <- TRUE
38 | umi_top_rank <- umi_rank[1:umi_points[1]]
39 | umi_top_umi <- umi_filtered_sort[1:umi_points[1]]
40 | is_top_top_ranked_umi <- factor(ifelse(umi_top_rank <= umi_points[3], 1, 0))
41 | } else {
42 | umi_plot2 <- FALSE
43 | }
44 | } else {
45 |   umi_plot1 <- umi_plot2 <- FALSE # set both flags so the later if (umi_plot2) check does not error when no elbow is found
46 | }
47 |
48 | # Impose gene cutoff, sort in decreasing order, assign rank
49 | gene_filtered <- barcode_metadata$genes[barcode_metadata$genes >= gene_cutoff]
50 | gene_filtered_sort <- sort(gene_filtered, decreasing=T)
51 | gene_rank <- 1:length(gene_filtered_sort)
52 |
53 | # Find elbow/knee of gene barcode rank plot and top-ranked gene barcode rank plot
54 | gene_points <- get_elbow_knee_points(x=gene_rank, y=log10(gene_filtered_sort))
55 | # For each valid plot, make factor for coloring plot points
56 | if (length(gene_points) > 0) { # Elbow found in first plot
57 | gene_plot1 <- TRUE
58 | is_top_ranked_gene <- factor(ifelse(gene_rank <= gene_points[1], 1, 0))
59 | if (length(gene_points) > 2) { # Elbow/knee found in second plot
60 | gene_plot2 <- TRUE
61 | gene_top_rank <- gene_rank[1:gene_points[1]]
62 | gene_top_gene <- gene_filtered_sort[1:gene_points[1]]
63 | is_top_top_ranked_gene <- factor(ifelse(gene_top_rank <= gene_points[3], 1, 0))
64 | } else {
65 | gene_plot2 <- FALSE
66 | }
67 | } else {
68 |   gene_plot1 <- gene_plot2 <- FALSE # set both flags so the later if (gene_plot2) check does not error when no elbow is found
69 | }
70 |
71 | ## Generate plots
72 |
73 | options(scipen=999)
74 |
75 | # Make UMI barcode rank plots
76 | png(umi_rank_plot_file, width=8, height=8, units='in', res=300)
77 | par(mfrow = c(2,1))
78 |
79 | # Plot 1 (all barcodes passing UMI filter vs log10(UMIs))
80 | if (umi_plot1) {
81 | plot(x=umi_rank,
82 | y=umi_filtered_sort,
83 | log="y",
84 | xlab=paste0(" Barcode rank (", length(umi_rank)-umi_points[1], " low quality cells)"),
85 | ylab="log10(UMIs)",
86 | main="RNA UMIs per Barcode",
87 | col=c("dimgrey","darkblue")[is_top_ranked_umi],
88 | pch=16,
89 | ylim=c(1,100000))
90 | abline(v=umi_points[1], h=10^(umi_points[2]))
91 | text(umi_points[1], 10^(umi_points[2]),
92 | paste0("(", umi_points[1], ", ", 10^(umi_points[2]), ")"),
93 | adj=c(-0.1,-0.5))
94 | }
95 |
96 | # Plot 2 (top ranked barcodes vs log10(UMIs))
97 | if (umi_plot2) {
98 | plot(x=umi_top_rank,
99 | y=umi_top_umi,
100 | log="y",
101 | xlab="Barcode rank",
102 | ylab="log10(UMIs)",
103 | main="RNA UMIs per Top-Ranked Barcode",
104 | col=c("dimgrey","darkblue")[is_top_top_ranked_umi],
105 | pch=16,
106 | ylim=c(1,100000))
107 | abline(v=umi_points[3], h=10^(umi_points[4]))
108 | text(umi_points[3], 10^(umi_points[4]),
109 | paste("(", umi_points[3], ", ", 10^(umi_points[4]), ")", sep=""),
110 | adj=c(-0.1,-0.5))
111 | }
112 | dev.off()
113 |
114 |
115 | # Make gene barcode rank plots
116 | png(gene_rank_plot_file, width=8, height=8, units='in', res=300)
117 | par(mfrow = c(2,1))
118 |
119 | # Plot 1 (all barcodes passing gene filter vs log10(genes))
120 | if (gene_plot1) {
121 | plot(x=gene_rank,
122 | y=gene_filtered_sort,
123 | log="y",
124 | xlab=paste0(" Barcode rank (", length(gene_rank)-gene_points[1], " low quality cells)"),
125 | ylab="log10(genes)",
126 | main="RNA Genes per Barcode",
127 | col=c("dimgrey","darkblue")[is_top_ranked_gene],
128 | pch=16,
129 | ylim=c(1,10000))
130 | abline(v=gene_points[1], h=10^(gene_points[2]))
131 | text(gene_points[1], 10^(gene_points[2]),
132 | paste0("(", gene_points[1], ", ", 10^(gene_points[2]), ")"),
133 | adj=c(-0.1,-0.5))
134 | }
135 |
136 | # Plot 2 (top ranked barcodes vs log10(genes))
137 | if (gene_plot2) {
138 | plot(x=gene_top_rank,
139 | y=gene_top_gene,
140 | log="y",
141 | xlab="Barcode rank",
142 | ylab="log10(genes)",
143 | main="RNA Genes per Top-Ranked Barcode",
144 | col=c("dimgrey","darkblue")[is_top_top_ranked_gene],
145 | pch=16,
146 | ylim=c(1,10000))
147 | abline(v=gene_points[3], h=10^(gene_points[4]))
148 | text(gene_points[3], 10^(gene_points[4]),
149 | paste("(", gene_points[3], ", ", 10^(gene_points[4]), ")", sep=""),
150 | adj=c(-0.1,-0.5))
151 | }
152 | dev.off()
153 |
154 | # Make genes vs UMIs scatter plot
155 | png(gene_umi_plot_file, width=8, height=8, units='in', res=300)
156 |
157 | plot(x=barcode_metadata$umis,
158 | y=barcode_metadata$genes,
159 | xlab="UMIs",
160 | ylab="Genes",
161 | main="RNA Genes vs UMIs",
162 | col="darkblue",
163 | pch=16)
164 |
165 | dev.off()
166 |
--------------------------------------------------------------------------------
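The script relies on `get_elbow_knee_points()` from `barcode_rank_functions.R`, which is sourced rather than shown here. As a rough illustration of what such a cutoff marks, here is a common elbow heuristic (the point farthest from the chord joining the ends of the rank vs. log10(count) curve) written in Python; this is an assumption for illustration only, not the pipeline's actual implementation:

```python
# Illustrative only: a generic barcode-rank elbow heuristic, NOT the pipeline's
# get_elbow_knee_points() implementation.
import numpy as np

def elbow_index(counts):
    """Return the index of an elbow on a rank vs. log10(count) curve."""
    y = np.log10(np.sort(np.asarray(counts, dtype=float))[::-1])  # sort descending, log10
    x = np.arange(1, len(y) + 1, dtype=float)                     # barcode rank
    chord = np.array([x[-1] - x[0], y[-1] - y[0]])                # vector from first to last point
    chord = chord / np.linalg.norm(chord)
    rel = np.stack([x - x[0], y - y[0]], axis=1)
    dist = np.abs(rel[:, 0] * chord[1] - rel[:, 1] * chord[0])    # perpendicular distance to chord
    return int(np.argmax(dist))

umis = [5000] * 200 + [50] * 5000          # toy data: 200 cells plus 5000 background barcodes
print(elbow_index(umis))                   # index near the cell/background transition
```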
/tasks/share_task_merge_bams.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-atac-merge_bams
5 |
6 | task share_atac_merge_bams {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: merge the individual bams together'
11 | }
12 |
13 | input {
14 |         # This task takes as input the individual aligned BAMs from the scatter and merges them into a single sorted BAM.
15 | Array[File] bams
16 | Array[File] logs
17 | String genome_name
18 | String prefix = "sample-share"
19 | Int? multimappers # = 5
20 | Int? cpus = 16
21 | Float? disk_factor = 8.0
22 | Float? memory_factor = 0.15
23 | String? docker_image = "us.gcr.io/buenrostro-share-seq/share_task_merge_bams:v1.0.0"
24 | }
25 |
26 | # Determine the size of the input
27 | Float input_file_size_gb = size(bams, "G")
28 |
29 |     # Determining memory size based on the size of the input files.
30 | Float mem_gb = 16.0 + memory_factor * input_file_size_gb
31 |
32 |     # Determining disk size based on the size of the input files.
33 | Int disk_gb = round(20.0 + disk_factor * input_file_size_gb)
34 |
35 |     # Determining disk type based on the size of the disk.
36 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
37 |
38 | # Determining memory for samtools.
39 | Float samtools_memory_gb = 0.8 * mem_gb # Samtools has overheads so reducing the memory to 80% of the total.
40 |
41 |     # Number of threads: giving each thread 4GB of memory seems to be the fastest configuration.
42 | Int samtools_threads_ = floor(samtools_memory_gb / 4)
43 | Int samtools_threads = if samtools_threads_ == 0 then 1 else samtools_threads_
44 |
45 | Int sambamba_threads = floor(cpus/2)
46 |
47 |     # Now that we know how many threads we can use while ensuring 4GB of memory per thread,
48 |     # we assign any remaining memory to the threads.
49 | Int samtools_memory_per_thread_ = floor(samtools_memory_gb * 1024 / samtools_threads) # Computing the memory per thread for samtools in MB.
50 | Int samtools_memory_per_thread = if samtools_memory_per_thread_ < 768 then 768 else samtools_memory_per_thread_
51 |
52 | # Tim parameters
53 | Int machine_mem_mb = 18150
54 | Int cpu = 1
55 | Int compression_level = 5
56 | # default to 500GiB of space
57 | Int disk = 500
58 | Int command_mem_mb = machine_mem_mb - 500
59 |
60 | # Define tmp file name
61 | String unsorted_bam = "${prefix}.atac.merge.${genome_name}.bam"
62 |
63 | # Define the output names
64 | String merged_bam = "${prefix}.atac.merged.k${multimappers}.${genome_name}.sorted.bam"
65 | String merged_bai = "${prefix}.atac.merged.k${multimappers}.${genome_name}.sorted.bam.bai"
66 | String alignment_log = "${prefix}.atac.merged.k${multimappers}.${genome_name}.log"
67 |
68 | String monitor_log = "atac_merge_monitor.log"
69 |
70 | command <<<
71 | set -e
72 |
73 | bash $(which monitor_script.sh) 2>&1 &
74 |
75 | #sambamba merge -t ~{cpus} ~{unsorted_bam} ~{sep=" " bams}
76 |
77 | #sambamba sort -t ~{cpus} -m ~{command_mem_mb}M -o ~{merged_bam} ~{unsorted_bam}
78 |
79 | #sambamba index -t ~{cpus} ~{merged_bam}
80 |
81 | # Trying picard
82 |
83 | java -Dsamjdk.compression_level=~{compression_level} -Xms~{command_mem_mb}m -Xmx~{command_mem_mb}m -jar /usr/local/bin/picard.jar \
84 | MergeSamFiles \
85 | USE_THREADING=true \
86 | SORT_ORDER="coordinate" \
87 | INPUT=~{sep=' INPUT=' bams} \
88 | OUTPUT=~{merged_bam}
89 |
90 | sambamba index -t ~{cpus} ~{merged_bam}
91 |
92 | sed 's/^[[:space:]]*//g' ~{sep=" " logs} | cut -f 1 -d ' ' | awk '{ sum[FNR%15]+=$1 } END {n_total=length(sum);for (idx=1; idx <= n_total; idx++){print sum[idx]}}' > ~{alignment_log}
93 |
94 | >>>
95 |
96 | output {
97 | File atac_merged_alignment = merged_bam
98 | File atac_merged_alignment_index = merged_bai
99 | File atac_merged_alignment_log = alignment_log
100 | }
101 |
102 | runtime {
103 | cpu: cpu
104 | docker: "${docker_image}"
105 | disks: "local-disk ${disk} HDD"
106 | disk: disk + " GB" # TES
107 | #disks: "local-disk ${disk_gb} ${disk_type}"
108 | maxRetries:1
109 | memory: "${machine_mem_mb} MiB"
110 | #memory: "${mem_gb} GB"
111 | memory_retry_multiplier: 2
112 | }
113 |
114 | parameter_meta {
115 | bams: {
116 |             description: 'Individual BAMs from the scatter alignment task.',
117 |             help: 'Individual BAMs from the scatter alignment task.',
118 | example: 'align.raw.L1.bam',
119 | }
120 | cpus: {
121 | description: 'Number of cpus.',
122 |             help: 'Set the number of CPUs available to the task.',
123 | default: 16
124 | }
125 | disk_factor: {
126 |             description: 'Multiplication factor to determine disk required for the merge task.',
127 |             help: 'This factor will be multiplied by the total size of the input BAMs to determine the required disk of instance (GCP/AWS) or job (HPCs).',
128 | default: 8.0
129 | }
130 | memory_factor: {
131 |             description: 'Multiplication factor to determine memory required for the merge task.',
132 |             help: 'This factor will be multiplied by the total size of the input BAMs to determine the required memory of instance (GCP/AWS) or job (HPCs).',
133 | default: 0.15
134 | }
135 | prefix: {
136 | description: 'Prefix for output files.',
137 | help: 'Prefix that will be used to name the output files',
138 |             example: 'my-experiment'
139 | }
140 | docker_image: {
141 | description: 'Docker image.',
142 |             help: 'Docker image for the merging step.',
143 |             example: ["us.gcr.io/buenrostro-share-seq/share_task_merge_bams"]
144 | }
145 | }
146 |
147 |
148 | }
149 |
--------------------------------------------------------------------------------
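The thread/memory bookkeeping declared above reserves 80% of the instance memory for samtools, targets roughly 4 GB per sort thread, and never lets the per-thread allowance drop below 768 MB. A small Python sketch of the same arithmetic (the 16 GB figure is just an example):

```python
# Illustrative only: mirrors the samtools thread/memory declarations in share_task_merge_bams.wdl.
import math

def samtools_threading(mem_gb):
    """Return (threads, memory_per_thread_mb) as the WDL computes them."""
    samtools_memory_gb = 0.8 * mem_gb                     # leave 20% headroom for samtools overhead
    threads = max(1, math.floor(samtools_memory_gb / 4))  # ~4 GB of memory per thread
    per_thread_mb = math.floor(samtools_memory_gb * 1024 / threads)
    return threads, max(768, per_thread_mb)               # never drop below 768 MB per thread

print(samtools_threading(16.0))   # -> (3, 4369) for a 16 GB instance
```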
/src/python/trim_fastq.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Trim fastq
5 | Removes dovetail (overlap) between R1 and R2
6 | """
7 |
8 | import argparse
9 | import Levenshtein
10 | import xopen
11 | from collections import deque
12 |
13 | def parse_arguments():
14 | parser = argparse.ArgumentParser(description="Trim dovetail (overlap) between read1 and read2")
15 | parser.add_argument("input_read1_fastq_file", help="Filename for untrimmed input read 1 FASTQ file")
16 | parser.add_argument("input_read2_fastq_file", help="Filename for untrimmed input read 2 FASTQ file")
17 | parser.add_argument("output_read1_fastq_file", help="Filename for corrected output read 1 FASTQ file")
18 | parser.add_argument("output_read2_fastq_file", help="Filename for corrected output read 2 FASTQ file")
19 | parser.add_argument("trimming_stats_file", help="Filename for txt file containing trimming statistics")
20 |
21 | return parser.parse_args()
22 |
23 | REV_COMP = str.maketrans("ATGC", "TACG")
24 | def reverse_complement(seq):
25 | return str.translate(seq, REV_COMP)[::-1]
26 |
27 | def trim_fastqs(input_read1_fastq_file, input_read2_fastq_file,
28 | output_read1_fastq_file, output_read2_fastq_file,
29 | trimming_stats_file):
30 | """
31 | Trim reads if overlapping, write reads to output FASTQ files.
32 | Produces file enumerating how many reads were processed and trimmed.
33 | """
34 | # counters
35 | total = trimmed = 0
36 |
37 | read1_out_writer = xopen.xopen(output_read1_fastq_file, mode="w")
38 | read2_out_writer = xopen.xopen(output_read2_fastq_file, mode="w")
39 |
40 | buffer1 = deque()
41 | buffer2 = deque()
42 | buffer_counter = 0
43 |
44 | # process FASTQs together
45 | with xopen.xopen(input_read1_fastq_file, mode= "r", threads= 8) as read1_fh, xopen.xopen(input_read2_fastq_file, mode= "r", threads= 8) as read2_fh:
46 | for readline1, readline2 in zip(read1_fh, read2_fh):
47 | total += 2
48 |
49 | name1 = readline1.strip()
50 | name2 = readline2.strip()
51 |
52 | readline1 = next(read1_fh)
53 | readline2 = next(read2_fh)
54 |
55 | sequence1 = readline1.strip()
56 | sequence2 = readline2.strip()
57 |
58 | next(read1_fh)
59 | next(read2_fh)
60 |
61 | readline1 = next(read1_fh)
62 | readline2 = next(read2_fh)
63 |
64 | quality1 = readline1.strip()
65 | quality2 = readline2.strip()
66 |
67 | # trim adapters for ATAC
68 | where = trim(sequence1, sequence2)
69 |
70 | if where > -1:
71 | trimmed += 2
72 |
73 | # add trimmed read 1 to buffer
74 | trimmed_read1 = f"{name1}\n{sequence1[:where]}\n+\n{quality1[:where]}\n"
75 | buffer1.append(trimmed_read1)
76 |
77 | # add trimmed read 2 to buffer
78 | trimmed_read2 = f"{name2}\n{sequence2[:where]}\n+\n{quality2[:where]}\n"
79 | buffer2.append(trimmed_read2)
80 |
81 | else:
82 | # add original read 1 to buffer
83 | read1 = f"{name1}\n{sequence1}\n+\n{quality1}\n"
84 | buffer1.append(read1)
85 |
86 |                 # add original read 2 to buffer
87 | read2 = f"{name2}\n{sequence2}\n+\n{quality2}\n"
88 | buffer2.append(read2)
89 |
90 | buffer_counter += 1
91 |
92 | # write reads to trimmed FASTQ files
93 | if buffer_counter == 10000000:
94 | read1_out_writer.write("".join(buffer1))
95 | buffer1.clear()
96 | read2_out_writer.write("".join(buffer2))
97 | buffer2.clear()
98 | buffer_counter = 0
99 |
100 | # write out remaining reads
101 | if buffer_counter > 0:
102 | read1_out_writer.write("".join(buffer1))
103 | buffer1.clear()
104 | read2_out_writer.write("".join(buffer2))
105 | buffer2.clear()
106 | buffer_counter = 0
107 |
108 | # write trimming statistics output file
109 | with open(trimming_stats_file, "w") as f:
110 | fields = ["total_reads", "untrimmed_reads", "trimmed_reads", "%trimmed"]
111 | f.write("\t".join(fields) + "\n")
112 | f.write("%i\t%i\t%i\t%0.1f" % (total, total-trimmed, trimmed, trimmed/total*100 if total > 0 else 0))
113 |
114 | def trim(seq1, seq2):
115 | """
116 | Find overlap between read1 and read2 and return location
117 | """
118 | query = reverse_complement(seq2[0:20])
119 | idx = seq1.rfind(query) # look for perfect match
120 | if idx == -1:
121 | idx = fuzz_align(query,seq1)
122 |
123 | # found it, return everything through match
124 | if idx > -1:
125 | idx = idx+20
126 | else:
127 | idx = -1
128 | return idx
129 |
130 | def fuzz_align(s_seq, l_seq):
131 | """
132 | Align allowing Levenshtein distance of 1
133 | This iteration should go from the right end of l_seq
134 | since we want to do a rfind
135 | """
136 | for i, base in enumerate(l_seq): # loop through equal size windows
137 | l_subset = l_seq[i:i+len(s_seq)]
138 | dist = Levenshtein.distance(l_subset, s_seq, score_cutoff= 1)
139 | if dist <= 1: # find first then break
140 | return i
141 | return -1
142 |
143 | def main():
144 | args = parse_arguments()
145 | input_read1_fastq_file = getattr(args, "input_read1_fastq_file")
146 | input_read2_fastq_file = getattr(args, "input_read2_fastq_file")
147 | output_read1_fastq_file = getattr(args, "output_read1_fastq_file")
148 | output_read2_fastq_file = getattr(args, "output_read2_fastq_file")
149 | trimming_stats_file = getattr(args, "trimming_stats_file")
150 |
151 | trim_fastqs(input_read1_fastq_file, input_read2_fastq_file,
152 | output_read1_fastq_file, output_read2_fastq_file,
153 | trimming_stats_file)
154 |
155 |
156 | if __name__ == "__main__":
157 | main()
158 |
--------------------------------------------------------------------------------
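To make the dovetail logic concrete: `trim()` reverse-complements the first 20 bp of read 2 and looks for it near the end of read 1; a hit means the reads overlap and anything past the match is adapter read-through. A self-contained toy example (the insert and adapter sequences are made up) using the perfect-match path only:

```python
# Stand-alone illustration of the dovetail trimming idea; all sequences below are invented.
REV_COMP = str.maketrans("ATGC", "TACG")
def reverse_complement(seq):
    return str.translate(seq, REV_COMP)[::-1]

insert  = "ACGTTAGCCATGGCATTACGGATCCAGTAC"          # 30 bp genomic insert
adapter = "CTGTCTCTTA"                               # 10 bp of hypothetical adapter

read1 = insert + adapter                             # read 1 runs through the insert into adapter
read2 = reverse_complement(insert) + "AGATGTGTAT"    # read 2: RC of the insert, then its own adapter

# Same idea as trim(): take the first 20 bp of read 2, reverse complement it,
# and look for it near the end of read 1.
query = reverse_complement(read2[0:20])
idx = read1.rfind(query)
where = idx + 20 if idx > -1 else -1

print(where)            # 30 -> everything past the insert is adapter read-through
print(read1[:where])    # the trimmed read 1 equals the original insert
assert read1[:where] == insert
```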
/src/python/match_barcodes.py:
--------------------------------------------------------------------------------
1 | import gzip
2 |
3 | import numpy as np
4 | # import pandas as pd
5 |
6 | import matcha
7 | import sys
8 |
9 | REV_COMP = str.maketrans("ATGC", "TACG")
10 | def reverse_complement(seq):
11 | return str.translate(seq, REV_COMP)[::-1]
12 |
13 | def get_open_fn(path):
14 | with open(path, "rb") as f:
15 | is_gzipped = (f.read(2) == b'\x1f\x8b')
16 | return gzip.open if is_gzipped else open
17 |
18 | def read_barcodes(path, revcomp):
19 | # if path.endswith(".tsv"):
20 | # bc = pd.read_csv(path, sep="\t")["sequence"]
21 | # else:
22 | open_fn = get_open_fn(path)
23 | with open_fn(path, 'rt') as file:
24 | bc = [b.strip() for b in file]
25 | if revcomp:
26 | valid = [reverse_complement(b) for b in bc]
27 | else:
28 | valid = bc
29 |
30 | return valid
31 |
32 | def match_one_bc(fastqs, whitelists, revcomp, max_barcode_dist, offsets, fastq1_out_path, fastq2_out_path, qc_path, threads):
33 | f = matcha.FastqReader(threads = threads)
34 | f.add_sequence("R1", fastqs["R1"], output_path=fastq1_out_path)
35 | f.add_sequence("R2", fastqs["R2"])
36 | f.add_sequence("R3", fastqs["R3"], output_path=fastq2_out_path)
37 |
38 | with open(revcomp["R2"]) as rf:
39 | rc = (int(rf.read().strip()) == 1)
40 |
41 | barcode_sequences = read_barcodes(whitelists["R2"], rc)
42 | cell_barcode = matcha.HashMatcher(
43 | sequences = barcode_sequences,
44 | labels = barcode_sequences,
45 | max_mismatches=max_barcode_dist,
46 | subsequence_count=2
47 | )
48 | f.add_barcode("cell", cell_barcode, "R2", match_start=offsets["R2"])
49 | f.set_output_names("{read_name} CB:Z:{cell}")
50 |
51 | barcode_counts = np.zeros(max_barcode_dist + 2, int)
52 |
53 | total_reads = 0
54 | total_pass = 0
55 |
56 | # print("start read") ####
57 | chunk_size = 10000
58 | while f.read_chunk(chunk_size):
59 | pass_filter = (f.get_match_result("cell", "dist") <=max_barcode_dist) & \
60 | (f.get_match_result("cell", "second_best_dist") > f.get_match_result("cell", "dist"))
61 |
62 | total_reads += len(pass_filter)
63 | total_pass += pass_filter.sum()
64 | values, counts = np.unique(f.get_match_result("cell", "dist"), return_counts=True)
65 | barcode_counts[np.minimum(values, max_barcode_dist + 1)] += counts
66 |
67 | f.write_chunk(pass_filter)
68 |
69 | with open(qc_path, "w") as stats_output:
70 | print(f"{total_pass}/{total_reads} reads passing, ({total_pass/total_reads*100:.2f}%)\n", file=stats_output)
71 | print("mismatches\treads", file=stats_output)
72 | for dist in range(max_barcode_dist + 2):
73 | print(
74 | dist if dist <= max_barcode_dist else f">{max_barcode_dist}",
75 | barcode_counts[dist],
76 | sep = "\t",
77 | file=stats_output
78 | )
79 |
80 |
81 | # def match_two_bc(fastqs, whitelists, revcomp, max_barcode_dist, offsets, fastq1_out_path, fastq2_out_path, qc_path, threads):
82 | # f = matcha.FastqReader(threads = threads)
83 | # f.add_sequence("R1", fastqs["R1"], output_path=fastq1_out_path)
84 | # f.add_sequence("R2", fastqs["R2"], output_path=fastq2_out_path)
85 | # f.add_sequence("I1", fastqs["I1"])
86 | # f.add_sequence("I2", fastqs["I2"])
87 |
88 | # i5_sequences, i5_maybe_rc = read_barcodes(whitelists["I2"], revcomp["I2"])
89 | # T7_sequences, T7_maybe_rc = read_barcodes(whitelists["I1"], revcomp["I1"])
90 |
91 | # i5_barcode = matcha.HashMatcher(
92 | # sequences = i5_maybe_rc,
93 | # labels = i5_sequences,
94 | # max_mismatches=max_barcode_dist,
95 | # subsequence_count=2
96 | # )
97 |
98 | # T7_barcode = matcha.HashMatcher(
99 | # sequences = T7_maybe_rc,
100 | # labels = T7_sequences,
101 | # max_mismatches=max_barcode_dist,
102 | # subsequence_count=2
103 | # )
104 |
105 | # f.add_barcode("i5", i5_barcode, "I2", match_start=offsets["I2"])
106 | # f.add_barcode("T7", T7_barcode, "I1", match_start=offsets["I1"])
107 |
108 | # f.set_output_names("{read_name} CB:Z:{i5}{T7}")
109 |
110 | # barcode_counts = np.zeros((max_barcode_dist + 2, max_barcode_dist + 2), int)
111 |
112 | # total_reads = 0
113 | # total_pass = 0
114 |
115 | # chunk_size = 10000
116 |
117 | # dists = [None, None]
118 | # second_dists = [None, None]
119 | # while f.read_chunk(chunk_size):
120 | # dists[0] = f.get_match_result("i5", "dist")
121 | # second_dists[0] = f.get_match_result("i5", "second_best_dist")
122 | # dists[1] = f.get_match_result("T7", "dist")
123 | # second_dists[1] = f.get_match_result("T7", "second_best_dist")
124 |
125 | # pass_filter = (dists[0] < max_barcode_dist) & \
126 | # (dists[1] < max_barcode_dist) & \
127 | # (dists[0] + dists[1] < second_dists[0] + second_dists[1])
128 |
129 | # total_reads += len(pass_filter)
130 | # total_pass += pass_filter.sum()
131 |
132 | # values, counts = np.unique(dists, axis = 1, return_counts=True)
133 | # indices = np.minimum(values, max_barcode_dist+1)
134 | # barcode_counts[(indices[0], indices[1])] += counts
135 |
136 | # f.write_chunk(pass_filter)
137 |
138 | # with open(qc_path, "w") as stats_output:
139 | # print(f"{total_pass}/{total_reads} reads passing, ({total_pass/total_reads*100:.2f}%)\n", file=stats_output)
140 | # print("mismatches_i5\tmismatches_T7\treads", file=stats_output)
141 | # for i5_dist in range(max_barcode_dist + 2):
142 | # for T7_dist in range(max_barcode_dist + 2):
143 | # print(
144 | # i5_dist if i5_dist <= max_barcode_dist else f">{max_barcode_dist}",
145 | # T7_dist if T7_dist <= max_barcode_dist else f">{max_barcode_dist}",
146 | # barcode_counts[i5_dist, T7_dist],
147 | # sep = "\t",
148 | # file=stats_output
149 | # )
150 |
151 | modality = sys.argv[4]               # chemistry, e.g. "10x" or "10x_multiome" (see the call in tasks/10x_task_preprocess.wdl)
152 | whitelist = sys.argv[7]               # barcode whitelist file
153 | fastq1_out_path = sys.argv[8]         # corrected read 1 FASTQ output
154 | fastq2_out_path = sys.argv[9]         # corrected read 2 FASTQ output
155 | qc_path = sys.argv[10]                # barcode correction QC stats output
156 | threads = int(sys.argv[11])           # number of threads for matcha
157 | max_barcode_dist = int(sys.argv[5])   # maximum mismatches allowed when matching a barcode
158 | fastqs = {
159 |     "R1": sys.argv[1],  # read 1 (genomic) FASTQ
160 |     "R2": sys.argv[3],  # barcode FASTQ (matched against the whitelist)
161 |     "R3": sys.argv[2],  # read 2 (genomic) FASTQ
162 | }
163 | revcomp = {
164 |     "R2": sys.argv[6],  # file holding 1 if the whitelist must be reverse complemented, 0 otherwise
165 | }
166 | if modality == "10x":
167 | whitelists = {
168 | "R2": whitelist,
169 | }
170 | offsets = {
171 | "R2": 0,
172 | }
173 | match_one_bc(fastqs, whitelists, revcomp, max_barcode_dist, offsets, fastq1_out_path, fastq2_out_path, qc_path, threads)
174 |
175 | elif modality == "10x_multiome":
176 | whitelists = {
177 | "R2": whitelist,
178 | }
179 | offsets = {
180 | "R2": 8,
181 | }
182 | match_one_bc(fastqs, whitelists, revcomp, max_barcode_dist, offsets, fastq1_out_path, fastq2_out_path, qc_path, threads)
183 |
--------------------------------------------------------------------------------
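A barcode is kept only when its best whitelist match is within `max_barcode_dist` mismatches and strictly better than the runner-up, which is what the `pass_filter` expression in `match_one_bc` encodes. A toy numpy illustration of that acceptance rule, with invented distances:

```python
# Illustrative only: reproduces the acceptance rule used in match_one_bc on hand-made match results.
import numpy as np

max_barcode_dist = 2
best_dist        = np.array([0, 1, 2, 2, 3])   # mismatches to the best whitelist barcode
second_best_dist = np.array([3, 1, 4, 3, 5])   # mismatches to the runner-up barcode

pass_filter = (best_dist <= max_barcode_dist) & (second_best_dist > best_dist)
print(pass_filter)   # [ True False  True  True False]
# read 2 is rejected because the match is ambiguous (two barcodes at distance 1),
# read 5 because even the best match has too many mismatches.
```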
/README.md:
--------------------------------------------------------------------------------
1 | # Broad Institute of MIT and Harvard Single-Cell/Nucleus Multiomic Processing Pipeline
2 |
3 | Pipeline specifications can be found [here](https://docs.google.com/document/d/1J-NWpDLkEGLsLjVe6h6-Rx4nxzTdgy1TJZvuMnYiiyg/edit?usp=sharing).
4 |
5 | Pipeline main page on [dockstore](https://dockstore.org/workflows/github.com/broadinstitute/epi-SHARE-seq-pipeline/SHARE-seq:release?tab=info).
6 |
7 |
8 |
9 |
10 |
11 | ### Structure of this repo
12 | * The **tasks** directory contains the tasks called from the main workflow share-seq.wdl. Each task corresponds to a different step of the pipeline: *align*, *filter*, etc.
13 | * The **src** directory contains bash, Python, R, and notebook scripts called within the tasks.
14 | * The **dockerfiles** directory contains the Dockerfiles used to build the Docker images used by the pipeline.
15 |
16 | ## Introduction
17 |
18 | The **SHARE-seq** multiomic pipeline is based on the original Buenrostro SHARE-seq pipeline specifications (by Sai Ma) in [this github repo](https://github.com/masai1116/SHARE-seq-alignment).
19 |
20 | This **10X** single-cell multiomic pipeline is based on the ENCODE (phase-3) single-cell pipeline specifications (by Anshul Kundaje) in [this google doc](https://docs.google.com/document/u/2/d/e/2PACX-1vTlgtT4WeXbvRicybUHXnhZs8RKyB4EkTbcWooQ6qBxxQ_zIHpFEVHy38D5lC_s8_YDGfUTsyomJcs3/pub).
21 |
22 | ### Features
23 |
24 | * **Portability**: The pipeline can be run on different cloud platforms such as Google, AWS and DNAnexus, as well as on cluster engines such as SLURM, SGE and PBS.
25 | * **User-friendly HTML report**: In addition to the standard outputs, the pipeline generates an HTML report that consists of quality metrics including alignment statistics along with many useful plots. An example of the [HTML report](). # TODO: add an example html.
26 | * **Supported genomes**: The pipeline requires genome-specific data such as aligner indices, chromosome sizes, and blacklisted regions. We provide genome references for hg38, mm10, mm39.
27 |
28 | ## Installation
29 |
30 | 1) Install Caper (Python Wrapper/CLI for [Cromwell](https://github.com/broadinstitute/cromwell)).
31 | ```bash
32 | $ pip install caper
33 | ```
34 |
35 | 2) **IMPORTANT**: Read Caper's [README](https://github.com/ENCODE-DCC/caper/blob/master/README.md) carefully to choose a backend for your system. Follow the instructions in the configuration file.
36 | ```bash
37 | # backend: local or your HPC type (e.g. slurm, sge, pbs, lsf). read Caper's README carefully.
38 | $ caper init [YOUR_BACKEND]
39 |
40 | # IMPORTANT: edit the conf file and follow commented instructions in there
41 | $ vi ~/.caper/default.conf
42 | ```
43 |
44 | 3) Git clone this pipeline.
45 | ```bash
46 | $ cd
47 | $ git clone https://github.com/broadinstitute/epi-SHARE-seq-pipeline/ #TODO: This should point to the release
48 | ```
49 |
50 | 4) Define test input JSON.
51 | ```bash
52 | INPUT_JSON="" #TODO: We need a test dataset available for everyone
53 | ```
54 |
55 | 5) If you have Docker and want to run the pipeline locally on your laptop, `--max-concurrent-tasks 1` limits the number of concurrent tasks for a test run on a laptop. Drop this flag if you are running on a workstation/HPC.
56 | ```bash
57 | # check if Docker works on your machine
58 | $ docker run ubuntu:latest echo hello
59 |
60 | # --max-concurrent-tasks 1 is for computers with limited resources
61 | $ caper run share-seq.wdl -i "${INPUT_JSON}" --docker --max-concurrent-tasks 1
62 | ```
63 |
64 | 6) Otherwise, install Singularity on your system. Please follow [these instructions](https://neuro.debian.net/install_pkg.html?p=singularity-container) to install Singularity on a Debian-based OS. Or ask your system administrator to install Singularity on your HPC.
65 | ```bash
66 | # check if Singularity works on your machine
67 | $ singularity exec docker://ubuntu:latest echo hello
68 |
69 | # on your local machine (--max-concurrent-tasks 1 is for computers with limited resources)
70 | $ caper run share-seq.wdl -i "${INPUT_JSON}" --singularity --max-concurrent-tasks 1
71 |
72 | # on HPC, make sure that Caper's conf ~/.caper/default.conf is correctly configured to work with your HPC
73 | # the following command will submit Caper as a leader job to SLURM with Singularity
74 | $ caper hpc submit share-seq.wdl -i "${INPUT_JSON}" --singularity --leader-job-name ANY_GOOD_LEADER_JOB_NAME
75 |
76 | # check job ID and status of your leader jobs
77 | $ caper hpc list
78 |
79 | # cancel the leader node to close all of its children jobs
80 | # If you directly use cluster command like scancel or qdel then
81 | # child jobs will not be terminated
82 | $ caper hpc abort [JOB_ID]
83 | ```
84 |
85 | ## Input JSON file
86 |
87 | > **IMPORTANT**: DO NOT BLINDLY USE A TEMPLATE/EXAMPLE INPUT JSON. READ THROUGH THE FOLLOWING GUIDE TO MAKE A CORRECT INPUT JSON FILE.
88 |
89 | An input JSON file specifies all of the input parameters and files that are necessary for successfully running this pipeline. This includes a specification of the path to the genome reference files and the raw data FASTQ files. Please make sure to specify absolute paths rather than relative paths in your input JSON files.
90 |
91 | 1) [Input JSON file specification (short)](docs/input_short.md)
92 | 2) [Input JSON file specification (long)](docs/input.md)
93 |
94 |
95 | ## Running on Terra/Anvil (using Dockstore)
96 |
97 | Visit our pipeline repo on [Dockstore](https://dockstore.org/my-workflows/github.com/broadinstitute/epi-SHARE-seq-pipeline/SHARE-seq). Click on `Terra` or `Anvil`. Follow Terra's instructions to create a workspace on Terra and add Terra's billing bot to your Google Cloud account.
98 |
99 | Download this [test input JSON for Terra](we don't have one at the moment), upload it to Terra's UI, and then run the analysis.
100 |
101 | If you would like to use your own input JSON file, make sure that all files in the input JSON are on a Google Cloud Storage bucket (`gs://`). URLs will not work.
102 |
103 | ## How to organize outputs
104 |
105 | Install [Croo](https://github.com/ENCODE-DCC/croo#installation). Make sure that you have Python 3 (>3.4.1) installed on your system. Find the `metadata.json` file in Caper's output directory.
106 |
107 | ```bash
108 | $ pip install croo
109 | $ croo [METADATA_JSON_FILE]
110 | ```
111 |
112 | ## How to make a spreadsheet of QC metrics
113 |
114 | Install [qc2tsv](https://github.com/ENCODE-DCC/qc2tsv#installation). Make sure that you have Python 3 (>3.4.1) installed on your system.
115 |
116 | Once you have [organized the output with Croo](#how-to-organize-outputs), you will be able to find the pipeline's final output file `qc/qc.json` which contains all the QC metrics. Simply feed `qc2tsv` with multiple `qc.json` files. It can take various URIs such as local paths, `gs://`, and `s3://`.
117 |
118 | ```bash
119 | $ pip install qc2tsv
120 | $ qc2tsv /sample1/qc.json gs://sample2/qc.json s3://sample3/qc.json ... > spreadsheet.tsv
121 | ```
122 |
123 | QC metrics for each experiment (`qc.json`) will be split into multiple rows (1 for overall experiment + 1 for each bio replicate) in a spreadsheet.
124 |
125 |
126 | TODO:\
127 | Sambamba\
128 | add track generation \
129 |
130 | Thank you to the **ENCODE DAC** for writing excellent documentation for their pipelines that we used as templates.
131 |
--------------------------------------------------------------------------------
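Since the README insists on absolute paths (or `gs://` URIs) in the input JSON, a throw-away check such as the following can catch relative paths before a run is submitted. This helper is not part of the pipeline, and the file name in the usage comment is hypothetical:

```python
# Quick, unofficial sanity check: flag input-JSON string values that look like
# relative local paths. Not part of the pipeline; shown only as an idea.
import json
import sys

def relative_path_like(value):
    """Heuristic: a string with a path separator that is neither absolute nor a URI."""
    return ("/" in value
            and not value.startswith(("/", "gs://", "s3://", "http://", "https://")))

def check_input_json(path):
    with open(path) as fh:
        inputs = json.load(fh)
    for key, value in inputs.items():
        if isinstance(value, str) and relative_path_like(value):
            print(f"WARNING: {key} looks like a relative path: {value}")

if __name__ == "__main__":
    check_input_json(sys.argv[1])   # e.g. python check_inputs.py my_input.json
```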
/src/python/joint_cell_plotting.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | This script QCs barcodes via ATAC frags & TSS and RNA UMIs & genes,
5 | and plots all barcodes colored by joint QC status. It also generates the
6 | same plot with transparency added to show density.
7 | """
8 |
9 | import argparse
10 | import logging
11 | import numpy as np
12 | import pandas as pd
13 | from plotnine import *
14 |
15 | def parse_arguments():
16 | parser = argparse.ArgumentParser(description="Plot barcodes by RNA and ATAC QC status")
17 | parser.add_argument("rna_metrics_file", help="Filename for RNA metrics tsv file")
18 | parser.add_argument("atac_metrics_file", help="Filename for ATAC metrics tsv file")
19 | parser.add_argument("remove_low_yielding_cells", type=int, help="Minimum number of UMIs/fragments required for a cell to be plotted")
20 | parser.add_argument("min_umis", type=int, help="Cutoff for minimum number of UMIs")
21 | parser.add_argument("min_genes", type=int, help="Cutoff for minimum number of genes")
22 | parser.add_argument("min_tss", type=int, help="Cutoff for minimum TSS score")
23 | parser.add_argument("min_frags", type=int, help="Cutoff for minimum number of ATAC fragments")
24 | parser.add_argument("plot_file", help="Filename for plot png file")
25 | parser.add_argument("barcode_metadata_file", help="Filename for barcode metadata csv file")
26 | parser.add_argument("pkr", help="PKR name", nargs='?', default="")
27 |
28 | return parser.parse_args()
29 |
30 | def get_split_lines(file_name, delimiter, skip_header):
31 | with open(file_name, "r") as f:
32 | if skip_header:
33 | next(f)
34 | for line in f:
35 | yield line.rstrip().split(sep=delimiter)
36 |
37 | def merge_dicts(dict_1, dict_2):
38 | """Merge dictionaries by key; combine values into quadruple, fill with 0s if key not in both dicts"""
39 | keys = set(dict_1.keys() | dict_2.keys())
40 | merged = {k: (dict_1.get(k, (0,0)) + dict_2.get(k, (0,0))) for k in keys}
41 |
42 | return(merged)
43 |
44 | def get_metrics(rna_metrics_file, atac_metrics_file, remove_low_yielding_cells):
45 | """Read files and aggregate metrics into Pandas dataframe"""
46 | rna_metrics_contents = get_split_lines(rna_metrics_file, delimiter="\t", skip_header=True)
47 | umis = []
48 | genes = []
49 | rna_barcodes = []
50 |     # remove cells with fewer than remove_low_yielding_cells UMIs
51 | for line in rna_metrics_contents:
52 | if int(line[3]) >= remove_low_yielding_cells:
53 | umis.append(int(line[3]))
54 | genes.append(int(line[4]))
55 | rna_barcodes.append(line[0])
56 | rna_metrics = dict(zip(rna_barcodes, zip(umis, genes)))
57 |
58 | atac_metrics_contents = get_split_lines(atac_metrics_file, delimiter="\t", skip_header=True)
59 | tss = []
60 | frags = []
61 | atac_barcodes = []
62 |     # remove cells with fewer than remove_low_yielding_cells fragments
63 | for line in atac_metrics_contents:
64 | if int(line[6])/2 >= remove_low_yielding_cells:
65 | tss.append(float(line[4]))
66 | frags.append(int(line[6])/2)
67 | atac_barcodes.append(line[0])
68 | atac_metrics = dict(zip(atac_barcodes, zip(tss, frags)))
69 |
70 | # merge metrics by barcodes
71 | metrics = merge_dicts(rna_metrics, atac_metrics)
72 | df = pd.DataFrame.from_dict(metrics, orient="index", columns=["umis","genes","tss","frags"])
73 |
74 | return(df)
75 |
76 | def qc_cells(df, min_umis, min_genes, min_tss, min_frags):
77 | pass_umis = df["umis"] >= min_umis
78 | pass_genes = df["genes"] >= min_genes
79 | pass_tss = df["tss"] >= min_tss
80 | pass_frags = df["frags"] >= min_frags
81 |
82 | # add df column with QC outcome
83 | qc_conditions = [(pass_umis & pass_genes & pass_tss & pass_frags),
84 | (pass_umis & pass_genes),
85 | (pass_tss & pass_frags),
86 | (~(pass_umis & pass_genes) & (~(pass_tss & pass_frags)))]
87 | qc_choices = ["both", "RNA only", "ATAC only", "neither"]
88 | df["QC"] = np.select(qc_conditions, qc_choices)
89 |
90 | # get counts of each outcome type (used in plot legend)
91 | outcome_counts = df["QC"].value_counts()
92 |
93 | df["QC_count"] = [f"{outcome} ({outcome_counts[outcome]})" for outcome in df["QC"]]
94 |
95 | return(df)
96 |
97 | def round_to_power_10(x):
98 | return(10**np.ceil(np.log10(x)))
99 |
100 | def label_func(breaks):
101 | return [int(x) for x in breaks]
102 |
103 | def plot_cells(df, pkr, min_umis, min_genes, min_tss, min_frags, plot_file):
104 | # get max x and y coords to set plot limits
105 | max_x = max(df["frags"])
106 | max_y = max(df["umis"])
107 | xy_lim = round_to_power_10(max(max_x, max_y))
108 |
109 | plot = (ggplot(df, aes("frags", "umis", color="QC_count"))
110 | + geom_point(size=0.5)
111 | + labs(title = f"Joint Cell Calling ({pkr})",
112 | caption = f"ATAC cutoffs: TSS ≥ {min_tss}, frags ≥ {min_frags}. RNA cutoffs: UMIs ≥ {min_umis}, genes ≥ {min_genes}",
113 | x = "ATAC Unique Fragments per Barcode",
114 | y = "RNA UMIs per Barcode",
115 | color = "QC")
116 | + theme_light()
117 | + theme(figure_size = (8,6),
118 | title = element_text(size=12),
119 | axis_title = element_text(size=10),
120 | axis_text = element_text(size=8),
121 | legend_box_margin = 0,
122 | legend_title = element_text(size=8),
123 | legend_text = element_text(size=6),
124 | legend_key = element_blank(),
125 | plot_caption=element_text(size=8, ha="center", margin={"r": 3.2, "t": -0.2, "units": "in"}),
126 | panel_grid_minor = element_blank())
127 | + scale_x_log10(limits=(10,xy_lim), labels=label_func)
128 | + scale_y_log10(limits=(10,xy_lim), labels=label_func)
129 | )
130 |
131 | plot.save(filename=plot_file, dpi=1000)
132 |
133 | def main():
134 | # create log file
135 | logging.basicConfig(filename="joint_cell_plotting.log", level=logging.INFO)
136 |
137 | # get arguments
138 | args = parse_arguments()
139 | pkr = getattr(args, "pkr")
140 | rna_metrics_file = getattr(args, "rna_metrics_file")
141 | atac_metrics_file = getattr(args, "atac_metrics_file")
142 | remove_low_yielding_cells = getattr(args, "remove_low_yielding_cells")
143 | barcode_metadata_file = getattr(args, "barcode_metadata_file")
144 | min_umis = getattr(args, "min_umis")
145 | min_genes = getattr(args, "min_genes")
146 | min_tss = getattr(args, "min_tss")
147 | min_frags = getattr(args, "min_frags")
148 | plot_file = getattr(args, "plot_file")
149 |
150 | # read rna and atac files, get cell metrics
151 | logging.info("Getting metrics\n")
152 | metrics_df = get_metrics(rna_metrics_file, atac_metrics_file, remove_low_yielding_cells)
153 |
154 | # QC cells based on inputted cutoffs
155 | logging.info("QCing cells\n")
156 | metrics_df = qc_cells(metrics_df, min_umis, min_genes, min_tss, min_frags)
157 |
158 | # generate plot
159 | logging.info("Generating joint cell calling plot\n")
160 | plot_cells(metrics_df, pkr, min_umis, min_genes, min_tss, min_frags, plot_file)
161 |
162 | # save dataframe
163 | logging.info("Saving dataframe as csv\n")
164 | metrics_df.to_csv(barcode_metadata_file)
165 | logging.info("All done!")
166 |
167 |
168 | if __name__ == "__main__":
169 | main()
170 |
171 |
--------------------------------------------------------------------------------
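The joint plot hinges on two small steps in the script above: `merge_dicts` concatenates the per-modality tuples for each barcode, padding with zeros when a barcode was seen by only one assay, and `qc_cells` turns the four cutoffs into a categorical label. A toy run-through with invented barcodes and the default cutoffs (this sketch uses `np.select` with a default instead of the script's explicit fourth condition):

```python
# Illustrative only: exercises the merge-and-label logic from joint_cell_plotting.py
# on invented barcodes and cutoffs.
import numpy as np
import pandas as pd

def merge_dicts(dict_1, dict_2):
    keys = dict_1.keys() | dict_2.keys()
    return {k: dict_1.get(k, (0, 0)) + dict_2.get(k, (0, 0)) for k in keys}

rna  = {"AAAC": (500, 300), "GGTT": (40, 25)}   # barcode -> (umis, genes)
atac = {"AAAC": (8.2, 1500)}                    # barcode -> (tss, frags); GGTT has no ATAC signal

df = pd.DataFrame.from_dict(merge_dicts(rna, atac), orient="index",
                            columns=["umis", "genes", "tss", "frags"])

min_umis, min_genes, min_tss, min_frags = 100, 200, 4, 100
pass_rna  = (df["umis"] >= min_umis) & (df["genes"] >= min_genes)
pass_atac = (df["tss"] >= min_tss) & (df["frags"] >= min_frags)
df["QC"] = np.select([pass_rna & pass_atac, pass_rna, pass_atac],
                     ["both", "RNA only", "ATAC only"], default="neither")
print(df)
# AAAC passes both modalities; GGTT has no ATAC signal and too few UMIs/genes -> "neither"
```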