├── docs
│   ├── input.md
│   └── images
│       └── pipeline_overview.png
├── .gitattributes
├── .dockerignore
├── .gcloudignore
├── src
│   ├── R
│   │   ├── TSSRanges.RData
│   │   ├── joint_cell_plotting_density.R
│   │   ├── atac_qc_plots.R
│   │   ├── cell_annotation_helper_functions.R
│   │   ├── barcode_rank_functions.R
│   │   └── rna_qc_plots.R
│   ├── python
│   │   ├── get_cellxgene_data.py
│   │   ├── qc_atac_count_duplicates_per_barcode.py
│   │   ├── pbc_stats.py
│   │   ├── flexible_import_entities_standard.py
│   │   ├── plot_insert_size_hist.py
│   │   ├── assign_multimappers.py
│   │   ├── barcode_revcomp_detect.py
│   │   ├── write_html.py
│   │   ├── bam_to_fragments.py
│   │   ├── filter_mito_reads.py
│   │   ├── qc_atac_compute_reads_in_peaks.py
│   │   ├── infer_barcodes.py
│   │   ├── generate_h5_rna.py
│   │   ├── rna_barcode_metadata.py
│   │   ├── trim_fastq.py
│   │   ├── match_barcodes.py
│   │   └── joint_cell_plotting.py
│   └── bash
│       └── monitor_script.sh
├── dockerfiles
│   ├── notes-for-bowtie
│   ├── share_task_html_report.dockerfile
│   ├── share_task_generate_h5.dockerfile
│   ├── terra_archr_and_seurat.dockerfile
│   ├── share_task_correct_fastq.dockerfile
│   ├── 10x_task_preprocess.dockerfile
│   ├── share_task_joint_qc.dockerfile
│   ├── share_task_trim_fastqs_atac.dockerfile
│   ├── share_task_qc_rna.dockerfile
│   ├── share_task_seurat.dockerfile
│   ├── share_task_preprocess.dockerfile
│   ├── share_task_merge_bams.dockerfile
│   ├── share_task_archr.dockerfile
│   ├── dorcs_task_find_dorcs.dockerfile
│   ├── share_task_bowtie2.dockerfile
│   ├── share_task_star.dockerfile
│   ├── share_task_cell_annotation.dockerfile
│   ├── share_task_filter_atac.dockerfile
│   └── share_task_qc_atac.dockerfile
├── example_input_json
│   ├── subwf_preprocess.json
│   └── inputs-short-share.json
├── .gitignore
├── tasks
│   ├── raise_exception.wdl
│   ├── share_task_log_atac.wdl
│   ├── 10x_create_barcode_mapping.wdl
│   ├── share_task_log_rna.wdl
│   ├── get_cellxgene_data.wdl
│   ├── share_task_correct_fastq.wdl
│   ├── share_task_trim_fastqs_atac.wdl
│   ├── share_task_generate_h5.wdl
│   ├── share_task_html_report.wdl
│   ├── dorcs_task_find_dorcs.wdl
│   ├── share_task_star.wdl
│   ├── share_task_cell_annotation.wdl
│   ├── share_task_qc_rna.wdl
│   ├── share_task_joint_qc.wdl
│   ├── 10x_task_preprocess.wdl
│   └── share_task_merge_bams.wdl
├── LICENSE
├── .dockstore.yml
├── workflows
│   ├── subwf-cell-annotation.wdl
│   ├── subwf-atac-archr.wdl
│   ├── subwf-rna-seurat.wdl
│   └── subwf-find-dorcs.wdl
├── .vimrc
└── README.md
/docs/input.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | .cache
3 | data
4 | input_examples
5 | tasks
6 | tests
7 | tmp
8 |
--------------------------------------------------------------------------------
/.gcloudignore:
--------------------------------------------------------------------------------
1 | input_examples
2 | LICENSE
3 | README.md
4 | share-seq.wdl
5 | tasks
6 | tests
7 | workflows
8 |
--------------------------------------------------------------------------------
/src/R/TSSRanges.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/epi-SHARE-seq-pipeline/HEAD/src/R/TSSRanges.RData
--------------------------------------------------------------------------------
/dockerfiles/notes-for-bowtie:
--------------------------------------------------------------------------------
1 | https://community.arm.com/developer/tools-software/hpc/b/hpc-blog/posts/tuning-bowtie2-better-performance
2 |
--------------------------------------------------------------------------------
/docs/images/pipeline_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/epi-SHARE-seq-pipeline/HEAD/docs/images/pipeline_overview.png
--------------------------------------------------------------------------------
/example_input_json/subwf_preprocess.json:
--------------------------------------------------------------------------------
1 | {
2 | "wf_preprocess.atac_primers" : "P1.01,P1.02",
3 | "wf_preprocess.rna_primers" : "P1.17,P1.18",
4 | "wf_preprocess.read1" : "other-files-for-testing/Undetermined_S1_R1_001.fastq.gz",
5 | "wf_preprocess.read2" : "other-files-for-testing/Undetermined_S1_R4_001.fastq.gz",
6 | "wf_preprocess.index1" : "other-files-for-testing/Undetermined_S1_R2_001.fastq.gz",
7 | "wf_preprocess.index2" : "other-files-for-testing/Undetermined_S1_R3_001.fastq.gz"
8 |
9 | }
10 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Generated by Cargo
2 | # will have compiled files and executables
3 | /target/
4 |
5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
7 | Cargo.lock
8 |
9 | # These are backup files generated by rustfmt
10 | **/*.rs.bk
11 |
12 | .DS_Store
13 |
14 | .ipynb_checkpoints*
15 |
16 | .dockstore.yml
17 |
18 | src/jupyter_nb/log/
19 | src/jupyter_nb/prefix.rna.cell.annotation.plots.mm10/
20 | build_docker.sh
21 |
--------------------------------------------------------------------------------
/tasks/raise_exception.wdl:
--------------------------------------------------------------------------------
1 | # From https://github.com/ENCODE-DCC/chip-seq-pipeline2/blob/master/chip.wdl
2 |
3 |
4 | task raise_exception {
5 | input {
6 | String msg
7 | Array[String]? vals
8 | }
9 | command {
10 | echo -e "\n* Error: ${msg}\n" >&2
11 | echo -e "* Vals: ${sep=',' vals}\n" >&2
12 | exit 2
13 | }
14 | output {
15 | String error_msg = '${msg}'
16 | }
17 | runtime {
18 | maxRetries : 0
19 | cpu : 1
20 | memory : '2 GB'
21 | time : 1
22 | disks : 'local-disk 10 SSD'
23 | docker : 'encodedcc/chip-seq-pipeline:v2.2.1'
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_html_report.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Ubuntu 18.04.3
4 | ############################################################
5 |
6 | # Set the base image to Ubuntu 18.04.3
7 | #FROM ubuntu:focal
8 | FROM ubuntu@sha256:d1d454df0f579c6be4d8161d227462d69e163a8ff9d20a847533989cf0c94d90
9 |
10 | LABEL maintainer="Neva Durand"
11 |
12 | # To prevent time zone prompt
13 | ENV DEBIAN_FRONTEND=noninteractive
14 |
15 | # Install software from the apt repo
16 | RUN apt-get update && apt-get install -y \
17 | python3 \
18 | && rm -rf /var/lib/apt/lists/*
19 |
20 | # Make directory for all software
21 | RUN mkdir /software
22 | WORKDIR /software
23 | ENV PATH="/software:${PATH}"
24 |
25 | # Copy the external scripts inside
26 | COPY src/python/write_html.py /software
27 |
--------------------------------------------------------------------------------
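Because the dockerfiles COPY scripts from src/, they are presumably meant to be built with the repository root as the build context; a hypothetical build command (the image tag is illustrative, not from this repo) would be:

    docker build -f dockerfiles/share_task_html_report.dockerfile -t share_task_html_report .
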
/dockerfiles/share_task_generate_h5.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM python@sha256:7ad180fdf785219c4a23124e53745fbd683bd6e23d0885e3554aff59eddbc377
7 |
8 | LABEL maintainer = "Eugenio Mattei"
9 | LABEL software = "Share-seq pipeline"
10 | LABEL software.version="1.0.0"
11 | LABEL software.organization="Broad Institute of MIT and Harvard"
12 | LABEL software.version.is-production="Yes"
13 | LABEL software.task="generate_h5"
14 |
15 | # Install python packages
16 | RUN pip install --no-cache-dir h5py scipy
17 |
18 | # Create and setup new user
19 | ENV USER=shareseq
20 | WORKDIR /home/$USER
21 | RUN groupadd -r $USER &&\
22 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
23 | chown $USER:$USER /home/$USER
24 |
25 | # Copy scripts
26 | COPY --chown=$USER:$USER src/python/generate_h5_rna.py /usr/local/bin/
27 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
28 |
29 | USER ${USER}
30 |
--------------------------------------------------------------------------------
/dockerfiles/terra_archr_and_seurat.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for Terra to support ArchR
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM us.gcr.io/broad-dsp-gcr-public/terra-jupyter-r:2.1.3
7 |
8 | LABEL maintainer = "Siddarth Wekhande"
9 | LABEL software = "ArchR on Terra"
10 | LABEL software.version="0.0.1"
11 | LABEL software.organization="Broad Institute of MIT and Harvard"
12 | LABEL software.version.is-production="No"
13 | LABEL software.task="archr"
14 |
15 | USER root
16 |
17 | RUN R --no-echo --no-restore --no-save -e "install.packages(c('hdf5r','remotes'))"
18 |
19 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.1.1')"
20 |
21 | RUN R --no-echo --no-restore --no-save -e "remotes::install_github('GreenleafLab/ArchR@v1.0.1', repos = BiocManager::repositories());ArchR::installExtraPackages()"
22 |
23 | RUN R --no-echo --no-restore --no-save -e "remotes::install_github('immunogenomics/presto')"
24 |
25 | ENV USER jupyter
26 | USER $USER
27 |
28 | ENTRYPOINT ["/bin/bash"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Broad Institute
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/python/get_cellxgene_data.py:
--------------------------------------------------------------------------------
1 | """
2 | This script takes a dataset id as input and downloads an h5ad file
3 | from the cellxgene server using the cellxgene_census API.
4 | """
5 |
6 | import argparse
7 | import logging
8 | import cellxgene_census
9 | import scanpy as sc
10 |
11 | def parse_arguments():
12 | parser = argparse.ArgumentParser(description="Download data from cellxgene server")
13 | parser.add_argument("--id", type=str, required=True,
14 | help="Cellxgene dataset id to download.")
15 | parser.add_argument("--out", type=str, required=True,
16 | help="Output filename", default="reference")
17 |
18 | return parser.parse_args()
19 |
20 |
21 | if __name__ == '__main__':
22 | # create log file
23 | logging.basicConfig(filename="get_cellxgene_data.log", level=logging.INFO)
24 |
25 | # get arguments
26 | args = parse_arguments()
27 |
28 | logging.info("Downloading data\n")
29 | cellxgene_census.download_source_h5ad(
30 | dataset_id=args.id,
31 | to_path=f"{args.out}.h5ad")
32 |
33 | adata = sc.read_h5ad(f"{args.out}.h5ad")
34 |
35 | # get counts
36 | if not adata.raw:
37 | adata.raw = adata.copy()
38 |
39 | adata.write_h5ad(f"{args.out}.h5ad")
40 |
41 | logging.info("All done!")
42 |
--------------------------------------------------------------------------------
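Based on the argparse definition above, a typical invocation would look like the following (the dataset id is the example given in tasks/get_cellxgene_data.wdl; the output name is illustrative). The script writes <out>.h5ad plus a get_cellxgene_data.log file:

    python3 src/python/get_cellxgene_data.py --id 3bbb6cf9-72b9-41be-b568-656de6eb18b5 --out reference
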
/dockerfiles/share_task_correct_fastq.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM python@sha256:7ad180fdf785219c4a23124e53745fbd683bd6e23d0885e3554aff59eddbc377
7 |
8 | LABEL maintainer = "Eugenio Mattei"
9 | LABEL software = "Share-seq pipeline"
10 | LABEL software.version="1.0.0"
11 | LABEL software.organization="Broad Institute of MIT and Harvard"
12 | LABEL software.version.is-production="Yes"
13 | LABEL software.task="correct_fastq"
14 |
15 | # To prevent time zone prompt
16 | ENV DEBIAN_FRONTEND=noninteractive
17 |
18 | # Install software from the apt repo
19 | RUN apt-get update && apt-get install -y \
20 | pigz && \
21 | rm -rf /var/lib/apt/lists/*
22 |
23 | # Install python packages
24 | RUN pip install --no-cache-dir --break-system-packages xopen
25 |
26 | # Create and setup new user
27 | ENV USER=shareseq
28 | WORKDIR /home/$USER
29 | RUN groupadd -r $USER &&\
30 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
31 | chown $USER:$USER /home/$USER
32 |
33 | # Copy scripts
34 | COPY --chown=$USER:$USER src/python/correct_fastq.py /usr/local/bin/
35 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
36 |
37 | USER ${USER}
38 |
--------------------------------------------------------------------------------
/dockerfiles/10x_task_preprocess.dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f
2 |
3 | LABEL maintainer = "Eugenio Mattei"
4 | LABEL software = "Share-seq pipeline"
5 | LABEL software.version="1.0.0"
6 | LABEL software.organization="Broad Institute of MIT and Harvard"
7 | LABEL software.version.is-production="Yes"
8 | LABEL software.task="10x preprocess"
9 |
10 | RUN apt-get update && apt-get install -y \
11 | gcc \
12 | git \
13 | python3 \
14 | python3-dev \
15 | python3-pip \
16 | zlib1g-dev \
17 | wget &&\
18 | rm -rf /var/lib/apt/lists/*
19 |
20 | # Install python3 packages (numpy, pandas, pybind11, matcha)
21 | RUN python3 -m pip install --no-cache-dir --break-system-packages --ignore-installed numpy pandas pybind11 --editable=git+https://github.com/GreenleafLab/matcha.git#egg=matcha
22 |
23 | # Create and setup new user
24 | ENV USER=shareseq
25 | WORKDIR /home/$USER
26 |
27 | RUN groupadd -r $USER &&\
28 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
29 | chown $USER:$USER /home/$USER
30 |
31 | # Add folder with software to the path
32 | ENV PATH="/software:${PATH}"
33 |
34 | # Copy the helper scripts into the image
35 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
36 | COPY --chown=$USER:$USER src/python/barcode_revcomp_detect.py /usr/local/bin
37 | COPY --chown=$USER:$USER src/python/match_barcodes.py /usr/local/bin
38 |
39 | USER ${USER}
40 |
--------------------------------------------------------------------------------
/example_input_json/inputs-short-share.json:
--------------------------------------------------------------------------------
1 | {
2 | "share.chemistry": "String",
3 | "share.read1_atac": "Array[File]",
4 | "share.read2_atac": "Array[File]",
5 | "share.read1_rna": "Array[File]",
6 | "share.read2_rna": "Array[File]",
7 | "share.genome_name_input": "String",
8 | "share.pipeline_type": "['full', 'count_only', 'no-align']",
9 |
10 |
11 | "share.pkr": "String? (optional, default = \"\")",
12 | "share.prefix": "String (optional, default = \"shareseq-project\")",
13 | "share.atac.align_multimappers": "Int? (optional)",
14 | "share.whitelist": "File? (optional)",
15 | "share.atac.barcode_tag_fragments": "String? (optional)",
16 |
17 | "share.trim_fastqs": "Boolean (optional, default = true)",
18 |
19 |
20 | "share.append_comment": "Boolean (optional, default = false)",
21 | "share.fastq_barcode": "Array[File] (optional, default = [])",
22 | "share.preprocess_tenx.barcode_dist": "Int? (optional, default = 2)",
23 | "share.preprocess_tenx.threshold_pct_barcode_matching": "Float? (optional, default = 0.6)",
24 | "share.whitelist_atac": "File? (optional)",
25 | "share.whitelist_rna": "File? (optional)",
26 |
27 |
28 | "share.atac.barcode_tag": "String? (optional, default = \"CB\")",
29 |
30 | "share.atac_genome_index_tar": "File? (optional)",
31 | "share.idx_tar_rna": "File? (optional)",
32 | "share.gtf": "File? (optional)",
33 | "share.tss_bed": "File? (optional)",
34 | "share.peak_set": "File? (optional)",
35 | "share.chrom_sizes": "File? (optional)"
36 |
37 | }
38 |
39 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_joint_qc.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | ############################################################
4 |
5 | #FROM ubuntu@sha256:d1d454df0f579c6be4d8161d227462d69e163a8ff9d20a847533989cf0c94d90
6 | FROM python:3.8-buster@sha256:7e7f4c5508b85268a93b573566c8eb321a6fdb466e3b60c663a42300c73a7400
7 |
8 | LABEL maintainer="Mei Knudson"
9 |
10 | # To prevent time zone prompt
11 | ENV DEBIAN_FRONTEND=noninteractive
12 | ENV SAMTOOLS_VERSION 1.9
13 |
14 | # Install software from the apt repo
15 | RUN apt-get update && apt-get install -y \
16 | r-base &&\
17 | rm -rf /var/lib/apt/lists/*
18 |
19 | # Install packages for python3 scripts
20 | RUN python3 -m pip install matplotlib numpy pandas plotnine
21 |
22 | # Install packages for R scripts
23 | RUN R -e "install.packages(c('ggplot2', 'remotes'))"
24 | RUN R -e "remotes::install_github('LKremer/ggpointdensity')"
25 |
26 | # Create and setup new user
27 | ENV USER=shareseq
28 | WORKDIR /home/$USER
29 | RUN groupadd -r $USER &&\
30 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
31 | chown $USER:$USER /home/$USER
32 |
33 | ENV PYTHONPATH="/usr/local/python:$PYTHONPATH"
34 | ENV R_LIBS_USER=/usr/local/lib/R
35 |
36 | COPY --chown=$USER:$USER src/python/joint_cell_plotting.py /usr/local/bin
37 | COPY --chown=$USER:$USER src/R/joint_cell_plotting_density.R /usr/local/bin
38 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
39 |
40 | USER ${USER}
41 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_trim_fastqs_atac.dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f
2 |
3 | LABEL maintainer = "Eugenio Mattei"
4 | LABEL software = "Share-seq pipeline"
5 | LABEL software.version="1.0.0"
6 | LABEL software.organization="Broad Institute of MIT and Harvard"
7 | LABEL software.version.is-production="Yes"
8 | LABEL software.task="Trim ATAC fastqs"
9 |
10 | # Install software from the apt repo
11 | RUN apt-get update && apt-get install -y \
12 | autoconf \
13 | automake \
14 | binutils \
15 | build-essential \
16 | libcurl4-openssl-dev \
17 | liblz4-dev \
18 | liblzma-dev \
19 | libncurses5-dev \
20 | libbz2-dev \
21 | pigz \
22 | python3-dev \
23 | python3-pip \
24 | wget \
25 | zlib1g-dev &&\
26 | rm -rf /var/lib/apt/lists/*
27 |
28 | # Install python packages
29 | RUN pip install --no-cache-dir --break-system-packages dnaio Levenshtein
30 | # Install fastp
31 | RUN wget http://opengene.org/fastp/fastp.0.20.1 && mv fastp.0.20.1 fastp && chmod a+x ./fastp && mv ./fastp /usr/local/bin
32 |
33 | # Create and setup new user
34 | ENV USER=shareseq
35 | WORKDIR /home/$USER
36 |
37 | RUN groupadd -r $USER &&\
38 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
39 | chown $USER:$USER /home/$USER
40 |
41 | # Add folder with software to the path
42 | ENV PATH="/software:${PATH}"
43 |
44 | # Copy the helper scripts into the image
45 | COPY --chown=$USER:$USER src/python/trim_fastq.py /usr/local/bin
46 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
47 |
48 | USER ${USER}
49 |
--------------------------------------------------------------------------------
/.dockstore.yml:
--------------------------------------------------------------------------------
1 | version: 1.2
2 | workflows:
3 | - name: "SHARE-seq"
4 | subclass: WDL
5 | primaryDescriptorPath: /share-seq.wdl
6 |     filters: # Publish from the branches listed below and from all tags
7 | branches:
8 | - main
9 | - IGVF-variant-jamboree
10 | tags:
11 | - /.*/
12 |
13 | - name: "dorcs-find-dorcs"
14 | subclass: WDL
15 | primaryDescriptorPath: /workflows/subwf-find-dorcs.wdl
16 |     filters: # Publish from the branches listed below and from all tags
17 | branches:
18 | - main
19 | tags:
20 | - /.*/
21 | - name: "SHARE-seq-atac-archr"
22 | subclass: WDL
23 | primaryDescriptorPath: /workflows/subwf-atac-archr.wdl
24 |     filters: # Publish from the branches listed below and from all tags
25 | branches:
26 | - main
27 | - dev
28 | tags:
29 | - /.*/
30 |
31 | - name: "SHARE-seq-rna-seurat"
32 | subclass: WDL
33 | primaryDescriptorPath: /workflows/subwf-rna-seurat.wdl
34 |     filters: # Publish from the branches listed below and from all tags
35 | branches:
36 | - main
37 | - dev
38 | tags:
39 | - /.*/
40 |
41 | - name: "SHARE-seq-sample-demultiplexing"
42 | subclass: WDL
43 | primaryDescriptorPath: /workflows/subwf-preprocess.wdl
44 |     filters: # Publish from the branches listed below and from all tags
45 | branches:
46 | - main
47 | tags:
48 | - /.*/
49 |
50 | - name: "SHARE-seq-cell-annotation"
51 | subclass: WDL
52 | primaryDescriptorPath: /workflows/subwf-cell-annotation.wdl
53 |     filters: # Publish from the branches listed below and from all tags
54 | branches:
55 | - main
56 | - dev
57 | - cell-annotation
58 | tags:
59 | - /.*/
60 |
--------------------------------------------------------------------------------
/src/python/qc_atac_count_duplicates_per_barcode.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Count the number of unique and duplicate reads per barcode using the duplicate flag set by Picard MarkDuplicates
4 |
5 | import argparse
6 | import pysam
7 | import sys
8 |
9 | from collections import defaultdict
10 |
11 | def count_duplicates(in_path, out_path, barcode_tag="CB"):
12 | """
13 | """
14 | # Dictionary holding the unique and duplicate count per barcode
15 | counter = defaultdict(lambda: [0,0])
16 | input = pysam.AlignmentFile(in_path, "rb")
17 | for read in input:
18 | cell_barcode = read.get_tag(barcode_tag)
19 | if read.flag & 1024 == 1024:
20 | counter[cell_barcode][1] += 1
21 | else:
22 | counter[cell_barcode][0] += 1
23 |
24 | with open(out_path, "w") as out_file:
25 | print("barcode\treads_unique\treads_duplicate\tpct_duplicates", file=out_file)
26 | for barcode, counts_vector in counter.items():
27 | print(f"{barcode}\t{counts_vector[0]}\t{counts_vector[1]}\t{round(counts_vector[1]/(counts_vector[0]+counts_vector[1])*100,1)}", file=out_file)
28 |
29 | if __name__ == '__main__':
30 |
31 | msg = "Add the description"
32 | parser = argparse.ArgumentParser(description = msg)
33 |
34 | # Adding optional argument
35 | parser.add_argument("bam_wdup", help = "Path to the coordinate-sorted bam file with duplicates marked but nor removed.")
36 | parser.add_argument("-o", "--output", help = "Path to the fragments output file.")
37 | parser.add_argument("--prefix", help = "Prefix for the metrics output file.")
38 | parser.add_argument("--bc_tag", help = "Specify the tag containing the cell barcode.", default="CB")
39 |
40 | # Read arguments from command line
41 | args = parser.parse_args()
42 |
43 | if args.prefix:
44 | prefix = args.prefix
45 | else:
46 | prefix = args.bam_wdup[:-4]
47 |
48 | if args.output:
49 | out_path = args.output
50 | else:
51 | out_path = f"{prefix}.duplicate.stats.tsv"
52 |
53 | bc_tag = args.bc_tag
54 |
55 |
56 | count_duplicates(args.bam_wdup, out_path, bc_tag)
57 |
--------------------------------------------------------------------------------
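An illustrative invocation, derived from the argparse definition above (file names are hypothetical); the output TSV has one row per barcode with unique, duplicate, and percent-duplicate counts:

    python3 src/python/qc_atac_count_duplicates_per_barcode.py sample.atac.wdup.bam --bc_tag CB --prefix sample -o sample.duplicate.stats.tsv
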
/src/python/pbc_stats.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | # Author Kundaje lab
4 | # https://github.com/kundajelab/ENCODE_scatac/blob/master/workflow/scripts/pbc_stats.py
5 | # Input QNAME sorted
6 |
7 |
8 | def calc_pbc(in_sam, out_path):
9 | total_pairs = 0
10 |     distinct_pairs = -1  # starts at -1 to offset the increment made when the first alignment is processed
11 | one_read_pairs = 0
12 | two_read_pairs = 0
13 |
14 | current_pair = None
15 | current_count = 0
16 |
17 | for al in in_sam:
18 | fields = al.strip().split('\t')
19 | flag = int(fields[1])
20 | rname = fields[2]
21 | pos = int(fields[3])
22 | pnext = int(fields[7])
23 |
24 |         if not (flag & 35 == 35):  # keep only paired, properly paired reads whose mate maps to the reverse strand (flag bits 1 + 2 + 32)
25 | continue
26 |
27 | pair = (rname, pos, pnext)
28 | if pair == current_pair:
29 | total_pairs += 1
30 | current_count += 1
31 | else:
32 | total_pairs += current_count
33 | distinct_pairs += 1
34 | if current_count == 1:
35 | one_read_pairs += 1
36 | elif current_count == 2:
37 | two_read_pairs += 1
38 |
39 | current_pair = pair
40 | current_count = 1
41 |
42 | total_pairs += current_count
43 | distinct_pairs += 1
44 | if current_count == 1:
45 | one_read_pairs += 1
46 | elif current_count == 2:
47 | two_read_pairs += 1
48 |
49 | nrf = distinct_pairs / total_pairs
50 | pbc1 = one_read_pairs / distinct_pairs
51 | pbc2 = one_read_pairs / two_read_pairs
52 |
53 | stats_str = "\t".join(str(i) for i in [
54 | total_pairs,
55 | distinct_pairs,
56 | one_read_pairs,
57 | two_read_pairs,
58 | nrf,
59 | pbc1,
60 | pbc2
61 | ])
62 | descr_str = "\t".join([
63 | "TotalReadPairs",
64 | "DistinctReadPairs",
65 | "OneReadPair",
66 | "TwoReadPairs",
67 | "NRF=Distinct/Total",
68 | "PBC1=OnePair/Distinct",
69 | "PBC2=OnePair/TwoPair"
70 | ])
71 | with open(out_path, 'w') as f:
72 | f.write(f"{descr_str}\n{stats_str}\n")
73 |
74 | if __name__ == "__main__":
75 | qc_path = sys.argv[1]
76 | calc_pbc(sys.stdin, qc_path)
77 |
78 |
--------------------------------------------------------------------------------
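The script reads SAM records on stdin (QNAME-sorted, as noted in the header) and takes the output path as its only argument; one way it could be wired up with samtools (commands are illustrative, not taken from this repo):

    samtools sort -n sample.filtered.bam | samtools view - | python3 src/python/pbc_stats.py sample.pbc_stats.tsv
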
/tasks/share_task_log_atac.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-atac-log
5 | # Gather information from log files
6 |
7 |
8 | task log_atac {
9 | meta {
10 | version: 'v0.1'
11 | author: 'Neva C. Durand (neva@broadinstitute.org) at Broad Institute of MIT and Harvard'
12 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: log atac task'
13 | }
14 |
15 | input {
16 | # This function takes as input the necessary log files and extracts
17 | # the quality metrics
18 | File alignment_log
19 | File dups_log
20 | }
21 |
22 | command <<<
23 | total_reads=$(awk 'NR==1{print $1}' ~{alignment_log})
24 | echo $total_reads > total_reads.txt
25 | aligned_uniquely=$(awk 'NR==4{print $1}' ~{alignment_log})
26 | echo $aligned_uniquely > aligned_uniquely.txt
27 | echo $(($total_reads - $aligned_uniquely)) > unaligned.txt
28 | awk 'NR>1{sum += $2}END{print sum/2}' ~{dups_log} > feature_reads.txt
29 | awk 'NR>1{sum += $3}END{print sum/2}' ~{dups_log} > duplicate_reads.txt
30 | awk 'NR>1{unique+= $2; dups+=$3}END{printf "%5.1f", 100*dups/(unique+dups)}' ~{dups_log} > pct_duplicate_reads.txt
31 | >>>
32 | output {
33 | Int atac_total_reads = read_int("total_reads.txt")
34 | Int atac_aligned_uniquely = read_int("aligned_uniquely.txt")
35 | Int atac_unaligned = read_int("unaligned.txt")
36 | Int atac_feature_reads = read_int("feature_reads.txt")
37 | Int atac_duplicate_reads = read_int("duplicate_reads.txt")
38 | Float atac_pct_dup = read_float("pct_duplicate_reads.txt")
39 | }
40 |
41 | runtime {
42 | docker: 'ubuntu:latest'
43 | }
44 | parameter_meta {
45 | alignment_log: {
46 | description: 'ATAC alignment log file',
47 | help: 'Log file from ATAC alignment step.',
48 | example: 'SS-PKR-30-96-ENTIRE-PLATE.atac.align.hg38.Log.out'
49 | }
50 | dups_log: {
51 | description: 'ATAC dups log file',
52 | help: 'Log file from ATAC rmdups step.',
53 | example: 'SS-PKR-12.atac.counts.mm10.filtered.cs.log'
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/tasks/10x_create_barcode_mapping.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # 10x_barcode_mapping
5 |
6 | task mapping_tenx_barcodes {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: preprocess 10x ATAC data.'
11 | }
12 |
13 | input {
14 |         # This task takes the ATAC and RNA barcode whitelists and builds a barcode conversion dictionary.
15 | File whitelist_atac # Barcode whitelist (chemistry specific)
16 | File whitelist_rna # Barcode whitelist (chemistry specific)
17 |
18 | Int? cpus = 16
19 | Float? disk_factor = 0.5
20 | Float? memory_factor = 0.15
21 | String? docker_image = "debian:bullseye-slim"
22 | }
23 |
24 | # Determine the size of the input
25 | Float input_file_size_gb = size(whitelist_rna, "G") + size(whitelist_atac, "G")
26 |
27 |     # Determining memory size based on the size of the input files.
28 | Float mem_gb = 5.0 + memory_factor * input_file_size_gb
29 |
30 |     # Determining disk size based on the size of the input files.
31 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb)
32 |
33 |     # Determining disk type based on the size of the disk.
34 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
35 |
36 | String barcode_conversion_dict = "barcode_conversion_dict.csv"
37 |
38 | command <<<
39 | set -e
40 |
41 | if [ "$(zcat ~{whitelist_atac} | wc -l)" -eq "$(zcat ~{whitelist_rna} | wc -l)" ]; then
42 | zcat ~{whitelist_atac} | tr ACGTacgt TGCAtgca | rev | paste -d ',' - <(zcat ~{whitelist_rna}) > ~{barcode_conversion_dict}
43 | paste -d ',' <(zcat ~{whitelist_atac}) <(zcat ~{whitelist_rna}) >> ~{barcode_conversion_dict}
44 | fi
45 | >>>
46 |
47 | output {
48 | File? tenx_barcode_conversion_dict = barcode_conversion_dict
49 | }
50 |
51 | runtime {
52 | cpu: cpus
53 | docker: "${docker_image}"
54 | disks: "local-disk ${disk_gb} ${disk_type}"
55 | maxRetries: 1
56 | memory: "${mem_gb} GB"
57 | memory_retry_multiplier: 2
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
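The first branch of the command above reverse-complements each ATAC barcode and pairs it with the RNA barcode on the same line of the whitelist; the core transformation can be checked on its own (the example barcode is made up):

    echo "AAACCCAAGAAACACT" | tr ACGTacgt TGCAtgca | rev
    # prints AGTGTTTCTTGGGTTT, the reverse complement
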
/dockerfiles/share_task_qc_rna.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM r-base@sha256:fff003a52d076e963396876b83cfa88c4f40a8bc27e341339cd3cc0236c1db79
7 |
8 | LABEL maintainer = "Eugenio Mattei"
9 | LABEL software = "Share-seq pipeline"
10 | LABEL software.version="1.0.0"
11 | LABEL software.organization="Broad Institute of MIT and Harvard"
12 | LABEL software.version.is-production="Yes"
13 | LABEL software.task="qc_rna"
14 |
15 | ENV R_VERSION=4.1.2
16 |
17 | # To prevent time zone prompt
18 | ENV DEBIAN_FRONTEND=noninteractive
19 | ENV SAMTOOLS_VERSION 1.9
20 |
21 | # Install software from the apt repo
22 | RUN apt-get update && apt-get install -y \
23 | autoconf \
24 | automake \
25 | binutils \
26 | build-essential \
27 | git \
28 | libcurl4-openssl-dev \
29 | liblz4-dev \
30 | liblzma-dev \
31 | libncurses5-dev \
32 | libbz2-dev \
33 | python3 \
34 | python3-dev \
35 | python3-full \
36 | python3-pip \
37 | wget \
38 | zlib1g-dev &&\
39 | rm -rf /var/lib/apt/lists/*
40 |
41 | # Install samtools 1.9
42 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \
43 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \
44 | cd samtools && make && make install && cd ../ && rm -rf samtools* htslib*
45 |
46 | # Install python packages
47 | RUN pip install --no-cache-dir --break-system-packages pysam
48 |
49 | # Create and setup new user
50 | ENV USER=shareseq
51 | WORKDIR /home/$USER
52 | RUN groupadd -r $USER &&\
53 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
54 | chown $USER:$USER /home/$USER
55 |
56 | ENV R_LIBS_USER=/usr/local/lib/R
57 |
58 | # Copy scripts
59 | COPY --chown=$USER:$USER src/python/rna_barcode_metadata.py /usr/local/bin/
60 | COPY --chown=$USER:$USER src/R/barcode_rank_functions.R /usr/local/bin/
61 | COPY --chown=$USER:$USER src/R/rna_qc_plots.R /usr/local/bin/
62 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
63 |
64 | USER ${USER}
65 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_seurat.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM r-base@sha256:fff003a52d076e963396876b83cfa88c4f40a8bc27e341339cd3cc0236c1db79 as builder
7 |
8 | LABEL maintainer = "Siddarth Wekhande"
9 | LABEL software = "Share-seq pipeline"
10 | LABEL software.version="0.0.1"
11 | LABEL software.organization="Broad Institute of MIT and Harvard"
12 | LABEL software.version.is-production="No"
13 | LABEL software.task="seurat"
14 |
15 | RUN echo "options(repos = 'https://cloud.r-project.org')" > $(R --no-echo --no-save -e "cat(Sys.getenv('R_HOME'))")/etc/Rprofile.site
16 |
17 | ENV R_LIBS_USER=/usr/local/lib/R
18 | ENV RETICULATE_MINICONDA_ENABLED=FALSE
19 |
20 | RUN apt-get update -qq && \
21 | apt-get install -y -qq --no-install-recommends\
22 | binutils \
23 | gtk-doc-tools \
24 | libcairo2-dev \
25 | libcurl4-openssl-dev \
26 | libfreetype-dev \
27 | libfribidi-dev \
28 | libgsl-dev \
29 | libharfbuzz-dev \
30 | libhdf5-dev \
31 | libjpeg-dev \
32 | libmpfr-dev \
33 | libpng-dev \
34 | libssl-dev \
35 | libtiff5-dev \
36 | libxml2-dev \
37 | libxt-dev \
38 | libgeos-dev \
39 | meson \
40 | pkg-config \
41 | python3 \
42 | python3-pip && \
43 | rm -rf /var/lib/apt/lists/*
44 |
45 | ENV USER=shareseq
46 | WORKDIR /home/$USER
47 |
48 | RUN groupadd -r $USER &&\
49 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
50 | chown $USER:$USER /home/$USER
51 |
52 | RUN R --no-echo --no-restore --no-save -e "install.packages(c('hdf5r','remotes','IRkernel','logr','BiocManager'))"
53 |
54 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.3.0')"
55 |
56 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install(c('rhdf5'), update=F, ask=F)"
57 |
58 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
59 |
60 |
61 | RUN python3 -m pip install --break-system-packages jupyter papermill
62 |
63 | COPY src/jupyter_nb/seurat_notebook.ipynb /usr/local/bin/
64 |
65 | RUN R -e "IRkernel::installspec()"
66 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_preprocess.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Ubuntu 18.04.3
4 | ############################################################
5 |
6 | # Set the base image to Ubuntu 18.04.3
7 | #FROM ubuntu:focal
8 | FROM ubuntu@sha256:d1d454df0f579c6be4d8161d227462d69e163a8ff9d20a847533989cf0c94d90
9 |
10 | LABEL maintainer="Neva Durand"
11 |
12 | # To prevent time zone prompt
13 | ENV DEBIAN_FRONTEND=noninteractive
14 |
15 | # Install software from the apt repo
16 | RUN apt-get update && apt-get install -y \
17 | libncurses5-dev libcurl4-openssl-dev zlib1g-dev liblzma-dev libbz2-dev \
18 | python3 python3-setuptools python3-pip \
19 | git wget xmlstarlet \
20 | openjdk-8-jre \
21 | && rm -rf /var/lib/apt/lists/*
22 |
23 | # Make directory for all software
24 | RUN mkdir /software
25 | WORKDIR /software
26 | ENV PATH="/software:${PATH}"
27 |
28 | # Install samtools 1.9
29 | RUN git clone --branch 1.9 --single-branch https://github.com/samtools/samtools.git && \
30 | git clone --branch 1.9 --single-branch https://github.com/samtools/htslib.git && \
31 | cd samtools && make && make install && cd ../ && rm -rf samtools* htslib*
32 |
33 | # Install system/math python packages (python3)
34 | RUN pip3 install --no-cache-dir python-Levenshtein==0.12.2 pysam requests oauth2client
35 |
36 | # Install Picard 2.26.11
37 | RUN wget https://github.com/broadinstitute/picard/releases/download/2.26.11/picard.jar && chmod +x picard.jar
38 |
39 | # Install gsutil
40 | # Downloading gcloud package
41 | RUN wget https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz
42 |
43 | # Installing the package
44 | RUN mkdir -p /usr/local/gcloud \
45 | && gunzip google-cloud-sdk.tar.gz \
46 | && tar -C /usr/local/gcloud -xvf google-cloud-sdk.tar \
47 | && /usr/local/gcloud/google-cloud-sdk/install.sh \
48 | && rm google-cloud-sdk.tar
49 |
50 | # Adding the package path to local
51 | ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin
52 |
53 | # Copy the external scripts inside
54 | COPY src/python/bam_to_raw_fastq.py /software
55 | COPY src/python/flexible_import_entities_standard.py /software
56 | COPY src/python/write_terra_tables.py /software
57 | COPY src/bash/monitor_script.sh /software
58 |
--------------------------------------------------------------------------------
/src/R/joint_cell_plotting_density.R:
--------------------------------------------------------------------------------
1 | library(ggplot2)
2 | library(ggpointdensity)
3 |
4 | args <- commandArgs()
5 | pkr <- args[6]
6 | barcode_metadata_file <- args[7]
7 | plot_file <- args[8]
8 |
9 | options(scipen=999)
10 |
11 | barcode_metadata <- read.csv(barcode_metadata_file)
12 | passing_df <- barcode_metadata[barcode_metadata$QC %in% c("RNA only", "ATAC only", "both"),]
13 |
14 | # get max x and y coords to set plot limits
15 | round_to_power_10 <- function(x){
16 | return(10^ceiling(log10(x)))
17 | }
18 | max_x <- max(passing_df$frags)
19 | max_y <- max(passing_df$umis)
20 | xy_lim <- round_to_power_10(max(max_x, max_y))
21 |
22 | # palette from https://rdrr.io/github/GreenleafLab/ArchR/src/R/ColorPalettes.R
23 | sambaNight <- c("6"='#1873CC',"2"='#1798E5',"8"='#00BFFF',"5"='#4AC596',"1"='#00CC00',"4"='#A2E700',"9"='#FFFF00',"7"='#FFD200',"3"='#FFA500')
24 |
25 | if (sum(barcode_metadata$QC=="both") > 0) {
26 | png(plot_file, width=8.75, height=6, units="in", res=300)
27 |
28 | density_plot <- ggplot(passing_df, aes(x=frags, y=umis)) +
29 | geom_pointdensity(size=0.7) +
30 | scale_color_gradientn(colors=sambaNight) +
31 | labs(title=paste0("Joint Cell Calling (", pkr, "): Density Plot", sep=""),
32 | x="ATAC Unique Fragments per Barcode",
33 | y="RNA UMIs per Barcode") +
34 | theme_light() +
35 | theme(plot.margin=margin(t=9, r=36.5, b=25, l=9, unit="pt"),
36 | plot.title=element_text(size=12.5, hjust=0.5),
37 | axis.title=element_text(size=11),
38 | axis.text=element_text(size=8.5),
39 | legend.title=element_text(size=8),
40 | legend.text=element_text(size=6),
41 | panel.grid.minor=element_blank()) +
42 | scale_x_continuous(trans="log10",
43 | limits=c(10,xy_lim)) +
44 | scale_y_continuous(trans="log10",
45 | limits=c(10,xy_lim))
46 | print(density_plot)
47 | dev.off()
48 | }
49 |
--------------------------------------------------------------------------------
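Because the script indexes commandArgs() at positions 6-8, it expects exactly three trailing arguments when run with Rscript: the PKR label, the joint barcode metadata CSV, and the output PNG. An illustrative call (file names are hypothetical):

    Rscript src/R/joint_cell_plotting_density.R PKR1 sample_joint_barcode_metadata.csv sample_joint_density.png
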
/src/python/flexible_import_entities_standard.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import requests
3 |
4 | from oauth2client.client import GoogleCredentials
5 |
6 | # function to get authorization bearer token for requests
7 | def get_access_token():
8 | """Get access token."""
9 |
10 | scopes = ["https://www.googleapis.com/auth/userinfo.profile", "https://www.googleapis.com/auth/userinfo.email"]
11 | credentials = GoogleCredentials.get_application_default()
12 | credentials = credentials.create_scoped(scopes)
13 | return credentials.get_access_token().access_token
14 |
15 | def call_flexible_import_entities(workspace_name, project, tsv):
16 | """Post entities to Terra workspace using flexibleImportEntities."""
17 |
18 |     # FireCloud API request URL for flexibleImportEntities
19 | uri = f"https://api.firecloud.org/api/workspaces/{project}/{workspace_name}/flexibleImportEntities?async=false&deleteEmptyValues=false"
20 |     # Get access token and add to headers for requests.
21 | # -H "accept: */*" -H "Authorization: Bearer [token] -H "Content-Type: application/json"
22 | headers = {"Authorization": "Bearer " + get_access_token(), "accept": "*/*"}
23 |
24 | # Create file dictionary to be passed to request
25 | files = {'entities': open(tsv ,'rb')}
26 |
27 | # capture response from API and parse out status code
28 | response = requests.post(uri, headers=headers, files=files)
29 | status_code = response.status_code
30 |
31 | if status_code != 200: # entities upsert fail
32 | print(f"ERROR: Code {status_code} returned.")
33 | print(response.text)
34 | print(response.raise_for_status())
35 |
36 | # entities upsert success
37 | print(f"Successfully uploaded entities." + "\n")
38 |
39 | if __name__ == '__main__':
40 | parser = argparse.ArgumentParser(description='')
41 | parser.add_argument('-w', '--workspace_name', required=True, help='name of workspace in which to make changes')
42 | parser.add_argument('-p', '--project', required=True, help='billing project (namespace) of workspace in which to make changes')
43 | parser.add_argument('-t', '--tsv', required=True, help='.tsv file formatted in load format to Terra UI')
44 |
45 | args = parser.parse_args()
46 |
47 | # call import API (firecloud)
48 | call_flexible_import_entities(args.workspace_name, args.project, args.tsv)
--------------------------------------------------------------------------------
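An illustrative invocation, derived from the argparse definition above (the workspace, billing project, and TSV names are hypothetical); the script authenticates with Google application default credentials:

    python3 src/python/flexible_import_entities_standard.py -w my-workspace -p my-billing-project -t sample_entities.tsv
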
/tasks/share_task_log_rna.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-rna-log
5 | # Gather information from log files
6 |
7 |
8 | task log_rna {
9 | meta {
10 | version: 'v0.1'
11 | author: 'Neva C. Durand (neva@broadinstitute.org) at Broad Institute of MIT and Harvard'
12 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: log rna task'
13 | }
14 |
15 | input {
16 | # This function takes as input the necessary log files and extracts
17 | # the quality metrics
18 | File alignment_log
19 | File dups_log
20 | }
21 |
22 | command <<<
23 | total_reads=$(awk -F"|" '$1~/input reads/{print $2}' ~{alignment_log})
24 | echo $total_reads > total_reads.txt
25 | aligned_uniquely=$(awk -F"|" '$1~/Uniquely mapped reads number/{print $2}' ~{alignment_log})
26 | echo $aligned_uniquely > aligned_uniquely.txt
27 | aligned_multimap=$(awk -F"|" '$1~/Number of reads mapped to multiple loci/{print $2}' ~{alignment_log})
28 | echo $aligned_multimap > aligned_multimap.txt
29 | echo $(($total_reads - $aligned_uniquely - $aligned_multimap)) > unaligned.txt
30 | awk -F":" '$1~/total reads/{print $2}' ~{dups_log} > feature_reads.txt
31 | awk -F":" '$1~/duplicate reads/{print $2}' ~{dups_log} > duplicate_reads.txt
32 | >>>
33 | output {
34 | Int rna_total_reads = read_int("total_reads.txt")
35 | Int rna_aligned_uniquely = read_int("aligned_uniquely.txt")
36 | Int rna_aligned_multimap = read_int("aligned_multimap.txt")
37 | Int rna_unaligned = read_int("unaligned.txt")
38 | Int rna_feature_reads = read_int("feature_reads.txt")
39 | Int rna_duplicate_reads = read_int("duplicate_reads.txt")
40 | }
41 |
42 | runtime {
43 | docker: 'ubuntu:latest'
44 | }
45 | parameter_meta {
46 | alignment_log: {
47 | description: 'RNA alignment log file',
48 | help: 'Log file from RNA alignment step.',
49 | example: 'SS-PKR-30-96-ENTIRE-PLATE.rna.align.hg38.Log.out'
50 | }
51 |
52 | dups_log: {
53 | description: 'Group UMI dups log file',
54 | help: 'Log file from group UMI task',
55 | example: 'SS-PKR-30-96-ENTIRE-PLATE.rm_dup_barcode.log.txt'
56 | }
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/src/python/plot_insert_size_hist.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | This script takes in the Picard CollectInsertSizeMetrics histogram txt file output,
5 | and generates the histogram as a png.
6 | """
7 |
8 | import argparse
9 | import pandas as pd
10 | from plotnine import *
11 |
12 | def parse_arguments():
13 | parser = argparse.ArgumentParser(description="Plot insert size histogram")
14 | parser.add_argument("histogram_file", help="Histogram txt file name")
15 | parser.add_argument("pkr", help="PKR ID")
16 | parser.add_argument("out_file", help="Name of output png file")
17 |
18 | return parser.parse_args()
19 |
20 | def get_hist_vals(histogram_file):
21 | """Get dataframe of histogram values"""
22 | with open(histogram_file, "r") as f:
23 | begin_vals = False
24 | insert_size = []
25 | count = []
26 | for line in f:
27 | vals = line.rstrip().split(sep="\t")
28 | if begin_vals and len(vals) == 2: # last line is blank
29 | insert_size.append(int(vals[0]))
30 | count.append(int(vals[1]))
31 | elif vals[0] == "insert_size": # desired values occur after line beginning with "insert_size"
32 | begin_vals = True
33 |
34 | df = pd.DataFrame(list(zip(insert_size, count)), columns=["insert_size","count"])
35 |
36 | return(df)
37 |
38 | def label_func(breaks):
39 | return ["{:.0e}".format(x) for x in breaks]
40 |
41 | def plot_hist(df, pkr, out_file):
42 | plot = (ggplot(df, aes(x="insert_size", y="count")) +
43 | geom_line(color="red") +
44 | geom_area(fill="red") +
45 | labs(title = f"Insert Size Histogram ({pkr})",
46 | x = "Insert size",
47 | y = "Count") +
48 | scale_y_continuous(labels = label_func) +
49 | theme_classic())
50 |
51 | plot.save(filename = out_file, dpi=1000)
52 |
53 | def main():
54 | print("Starting histogram plotting script")
55 | args = parse_arguments()
56 | histogram_file = getattr(args, "histogram_file")
57 | pkr = getattr(args, "pkr")
58 | out_file = getattr(args, "out_file")
59 |
60 | df = get_hist_vals(histogram_file)
61 |
62 | plot_hist(df, pkr, out_file)
63 | print("Finished plotting")
64 |
65 | if __name__ == "__main__":
66 | main()
67 |
--------------------------------------------------------------------------------
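An illustrative invocation, following the positional arguments defined above (file names and PKR label are hypothetical):

    python3 src/python/plot_insert_size_hist.py sample.insert_size_metrics.txt PKR1 sample.insert_size_hist.png
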
/workflows/subwf-cell-annotation.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # Import the tasks called by the pipeline
4 | import "../tasks/share_task_cell_annotation.wdl" as share_task_cell_annotation
5 |
6 | workflow wf_cell_annotation {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Zhijian Li'
10 | affiliation: 'Broad Institute of MIT and Harvard'
11 | email: 'lizhijia@broadinstitute.org'
12 | description: 'SHARE-Seq pipeline: cell type annotation using RNA-seq data.'
13 | }
14 |
15 | input {
16 | # Sample name
17 | String? prefix="prefix"
18 |
19 | # Reference genome
20 | String genome
21 |
22 | # Reference data for cell annotation
23 | String reference_data_id
24 | String reference_data_name
25 | String reference_label
26 |
27 | # Set true if the reference data uses gene id as feature name.
28 | # This is usually true for data downloaded from cellxgene server
29 | String? gene_id_to_symbol = "TRUE"
30 |
31 | # Query data
32 | File query_data
33 |
34 | # Docker images
35 | String? docker_image="lzj1769/cell_annotation"
36 |
37 | # Runtime parameters
38 | Float? memory_factor = 5
39 | Float? disk_factor = 10
40 | }
41 |
42 | call share_task_cell_annotation.cell_annotation as cell_annotation{
43 | input:
44 | reference_data_id = reference_data_id,
45 | reference_data_name = reference_data_name,
46 | reference_label = reference_label,
47 | query_data = query_data,
48 | genome = genome,
49 | gene_id_to_symbol = gene_id_to_symbol,
50 | prefix = prefix,
51 | docker_image = docker_image,
52 | disk_factor = disk_factor,
53 | memory_factor = memory_factor
54 | }
55 |
56 | output {
57 | File share_cell_annotation_reference_h5ad = cell_annotation.reference_h5ad
58 | File share_cell_annotation_notebook_log = cell_annotation.notebook_log
59 | File share_cell_annotation_monitor_log = cell_annotation.monitor_log
60 | File share_cell_annotation_prediction = cell_annotation.prediction
61 | File share_cell_annotation_prediction_labels = cell_annotation.prediction_labels
62 | File share_cell_annotation_prediction_scores = cell_annotation.prediction_scores
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/src/python/assign_multimappers.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import argparse
3 |
4 | """
5 | From https://github.com/ENCODE-DCC/atac-seq-pipeline/blob/master/src/assign_multimappers.py
6 | Script to take multimappers and randomly assign
7 | REQUIRES A QNAME SORTED FILE!
8 | """
9 |
10 | def parse_args():
11 | '''
12 | Gives options
13 | '''
14 |     parser = argparse.ArgumentParser(description='Saves reads below an alignment threshold and discards all others')
15 | parser.add_argument('-k', help='Alignment number cutoff')
16 | parser.add_argument('--paired-end', dest='paired_ended', action='store_true', help='Data is paired-end')
17 | args = parser.parse_args()
18 | alignment_cutoff = int(args.k)
19 | paired_ended = args.paired_ended
20 |
21 | return alignment_cutoff, paired_ended
22 |
23 |
24 | if __name__ == "__main__":
25 | '''
26 | Runs the filtering step of choosing multimapped reads
27 | '''
28 |
29 | [alignment_cutoff, paired_ended] = parse_args()
30 |
31 | if paired_ended:
32 | alignment_cutoff = int(alignment_cutoff) * 2
33 |
34 | # Store each line in sam file as a list of reads,
35 | # where each read is a list of elements to easily
36 | # modify or grab things
37 | current_reads = []
38 | current_qname = ''
39 |
40 | for line in sys.stdin:
41 |
42 | read_elems = line.strip().split('\t')
43 |
44 | if read_elems[0].startswith('@'):
45 | sys.stdout.write(line)
46 | continue
47 |
48 | # Keep taking lines that have the same qname
49 | if read_elems[0] == current_qname:
50 | # Add line to current reads
51 | current_reads.append(line)
52 | pass
53 | else:
54 | # Discard if there are more than the alignment cutoff
55 | if len(current_reads) > alignment_cutoff:
56 | current_reads = [line]
57 | current_qname = read_elems[0]
58 | elif len(current_reads) > 0:
59 | # Just output all reads, which are then filtered with samtools
60 | for read in current_reads:
61 | sys.stdout.write(str(read))
62 |
63 | # And then discard
64 | current_reads = [line]
65 | current_qname = read_elems[0]
66 | else:
67 | # First read in file
68 | current_reads.append(line)
69 | current_qname = read_elems[0]
70 |
--------------------------------------------------------------------------------
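A sketch of how this filter could be placed in a stream, following the ENCODE ATAC convention it was taken from (the samtools commands are illustrative, not part of this repo):

    samtools view -h qname_sorted.bam \
      | python3 src/python/assign_multimappers.py -k 4 --paired-end \
      | samtools view -bS - > multimappers_assigned.bam
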
/src/python/barcode_revcomp_detect.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import sys
3 |
4 | REV_COMP = str.maketrans("ATGC", "TACG")
5 | def reverse_complement(seq):
6 | return str.translate(seq, REV_COMP)[::-1]
7 |
8 | def get_open_fn(path):
9 | with open(path, "rb") as f:
10 | is_gzipped = (f.read(2) == b'\x1f\x8b')
11 | return gzip.open if is_gzipped else open
12 |
13 | def read_barcodes(path):
14 | open_fn = get_open_fn(path)
15 | with open_fn(path, 'rt') as file:
16 | bc = [b.strip() for b in file]
17 | bcrc = [reverse_complement(b) for b in bc]
18 | return set(bc), set(bcrc)
19 |
20 | def bc_detect(fastq, whitelist, out, qc, offset, num_reads=100000, thresh=0.45):
21 | bc, bcrc = read_barcodes(whitelist)
22 |
23 | bc_match = 0
24 | bcrc_match = 0
25 | num_lines = num_reads * 4
26 | with gzip.open(fastq, 'rt') as f:
27 | for lnum, line in enumerate(f):
28 | if lnum >= num_lines:
29 | break
30 | if lnum % 4 != 1:
31 | continue
32 | seq = line.strip()[offset:]
33 | if seq in bc:
34 | bc_match += 1
35 | if seq in bcrc:
36 | bcrc_match += 1
37 |
38 | bc_match_prop = bc_match / num_reads
39 | bcrc_match_prop = bcrc_match / num_reads
40 | valid = (bc_match_prop >= thresh) or (bcrc_match_prop >= thresh)
41 | fc_chosen = (bc_match_prop >= bcrc_match_prop)
42 |
43 | with open(qc, 'w') as f:
44 | f.write(f"Direct match proportion: {bc_match_prop}\n")
45 | f.write(f"Reverse-complement match proportion: {bcrc_match_prop}\n")
46 | f.write(f"Reverse-complement chosen: {not fc_chosen}\n")
47 |
48 | if not valid:
49 | raise ValueError(f"Insufficient barcode match rate: {bc_match_prop}, {bcrc_match_prop}")
50 | with open(out, 'w') as f:
51 | if fc_chosen:
52 | f.write(f"{0}\n")
53 | else:
54 | f.write(f"{1}\n")
55 |
56 | try:
57 | fastq = sys.argv[1]
58 | modality = sys.argv[2]
59 | whitelist = sys.argv[3]
60 |
61 | qc = sys.argv[4]
62 | out = sys.argv[5]
63 | thres = sys.argv[6]
64 |
65 | if modality == "10x":
66 | offset = 0
67 | bc_detect(fastq, whitelist, out, qc, offset, 100000, float(thres))
68 | elif modality == "10x_multiome":
69 | offset = 8
70 | bc_detect(fastq, whitelist, out, qc, offset, 100000, float(thres))
71 |
72 | except NameError:
73 | pass
74 |
--------------------------------------------------------------------------------
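The script is driven by positional sys.argv values: FASTQ, modality (10x or 10x_multiome), whitelist, QC report path, result path, and match threshold. An illustrative call (file names are hypothetical):

    python3 src/python/barcode_revcomp_detect.py sample_R2.fastq.gz 10x atac_whitelist.txt.gz sample_barcode_qc.txt sample_revcomp_flag.txt 0.45
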
/tasks/get_cellxgene_data.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task get_cellxgene_data {
4 | meta {
5 | version: 'v0.1'
6 | author: 'Zhijian Li'
7 | affiliation: 'Broad Institute of MIT and Harvard'
8 | email: 'lizhijia@broadinstitute.org'
9 | description: 'SHARE-Seq pipeline: get data from cellxgene server.'
10 | }
11 |
12 | input {
13 | # Reference data id and name
14 | String reference_data_id
15 | String reference_data_name
16 |
17 | # Docker image
18 | String? docker_image
19 | }
20 |
21 |     # Determining memory size based on the size of the input files.
22 | Float mem_gb = 32.0
23 |
24 |     # Determining disk size based on the size of the input files.
25 | Int disk_gb = 100
26 |
27 |     # Determining disk type based on the size of the disk.
28 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
29 |
30 |     String reference_h5ad = "${reference_data_name}.h5ad"
31 | String monitor_log = "monitoring.log"
32 | String running_log = "get_cellxgene_data.log"
33 |
34 | command {
35 | set -e
36 |
37 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 &
38 |
39 | # Download data from cellxgene
40 |         python3 $(which get_cellxgene_data.py) --id ${reference_data_id} --out ${reference_data_name}
41 |
42 | }
43 |
44 | output {
45 | File reference_h5ad = reference_h5ad
46 | File monitor_log = monitor_log
47 | File running_log = running_log
48 | }
49 |
50 | runtime {
51 | memory : "${mem_gb} GB"
52 | memory_retry_multiplier: 2
53 | disks: "local-disk ${disk_gb} ${disk_type}"
54 | docker : "${docker_image}"
55 | maxRetries:1
56 | }
57 |
58 | parameter_meta {
59 | reference_data_id: {
60 | description: 'Reference dataset id',
61 | help: 'The dataset id from cellxgene server.',
62 | examples: ['3bbb6cf9-72b9-41be-b568-656de6eb18b5']
63 | }
64 |
65 | reference_data_name: {
66 | description: 'Reference dataset name',
67 | help: 'String used to name the reference data.',
68 | examples: ['reference.h5ad']
69 | }
70 |
71 | docker_image: {
72 | description: 'Docker image.',
73 | help: 'Docker image for preprocessing step.',
74 | example: ['put link to gcr or dockerhub']
75 | }
76 |
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/tasks/share_task_correct_fastq.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-correct-fastq
5 |
6 | task share_correct_fastq {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Mei Knudson (mknudson@broadinstitute.org) at Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Correct FASTQs task'
11 | }
12 |
13 | input {
14 | File fastq_R1
15 | File fastq_R2
16 | File whitelist
17 | String sample_type
18 | String? pkr
19 | String? prefix
20 |
21 | Int? cpus = 16
22 | Float? disk_factor = 8.0
23 | Float? memory_factor = 0.08
24 | String? docker_image = "us.gcr.io/buenrostro-share-seq/share_task_correct_fastq:v1.0.0"
25 | }
26 |
27 | # Determine the size of the input
28 | Float input_file_size_gb = size(fastq_R1, "G") + size(fastq_R2, "G")
29 |
30 |     # Determining memory size based on the size of the input files.
31 | Float mem_gb = 16.0 + memory_factor * input_file_size_gb
32 |
33 |     # Determining disk size based on the size of the input files.
34 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb)
35 |
36 |     # Determining disk type based on the size of the disk.
37 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
38 |
39 | String corrected_fastq_R1 = basename(fastq_R1, ".fastq.gz") + "_corrected.fastq"
40 | String corrected_fastq_R2 = basename(fastq_R2, ".fastq.gz") + "_corrected.fastq"
41 | String monitor_log = "correct_fastqs_monitor.log"
42 |
43 | command <<<
44 | set -e
45 |
46 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 &
47 |
48 | # Perform barcode error correction on FASTQs
49 | python3 $(which correct_fastq.py) \
50 | ~{fastq_R1} \
51 | ~{fastq_R2} \
52 | ~{corrected_fastq_R1} \
53 | ~{corrected_fastq_R2} \
54 | ~{whitelist} \
55 | ~{sample_type} \
56 | ~{prefix} \
57 | ~{pkr}
58 |
59 | pigz -p ~{cpus} *.fastq
60 | >>>
61 |
62 | output {
63 | File corrected_fastq_R1 = "~{corrected_fastq_R1}.gz"
64 | File corrected_fastq_R2 = "~{corrected_fastq_R2}.gz"
65 | File barcode_qc = "~{prefix}_barcode_qc.txt"
66 | File monitor_log = "~{monitor_log}"
67 | }
68 |
69 | runtime {
70 | cpu : cpus
71 | memory : "~{mem_gb} GB"
72 | disks: "local-disk ~{disk_gb} ~{disk_type}"
73 | docker : "~{docker_image}"
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/.vimrc:
--------------------------------------------------------------------------------
1 | " Vim syntax file
2 | " Language: Workflow Description Language
3 | " Maintainer: Scott Frazer
4 | " Latest Revision: 21 July 2015
5 | "
6 | if exists("b:current_syn")
7 | finish
8 | endif
9 |
10 | " command { ... } section
11 | syntax region wdlCommandSection start="command\s*{" end="\v\}" contains=wdlCommand,wdlCommandParameter,wdlKeyword,wdlCommandDelimiter
12 | syntax region wdlCommandSection2 start="command\s*<<<" end="\v>>>" contains=wdlCommand,wdlCommandParameter,wdlKeyword,wdlCommandDelimiter
13 | syntax keyword wdlCommandKeyword command contained containedin=wdlCommandSection
14 | syntax match wdlCommand "\zs.\{-}\ze\${" contained containedin=wdlCommandSection
15 | syntax region wdlCommandParameter start=/\v\$\{/ end=/\v\}/ oneline contained containedin=wdlCommandSection contains=wdlType,wdlString,wdlCommandParameterName
16 | syntax match wdlCommandParameterName /\v\zs\w+\ze([\\?\\*\\+]?\})/ contained containedin=wdlCommandParameter
17 |
18 | " Keywords
19 | syntax keyword wdlKeyword workflow task call nextgroup=wdlTaskName
20 | syntax keyword wdlKeyword output scatter if then else runtime
21 | syntax keyword wdlType Boolean Int Float String File Uri nextgroup=wdlIdentifier
22 | syntax keyword wdlImport import
23 |
24 | " Compound Types
25 | syntax region wdlType start=/\(Map\|Array\)\[/ end=/\]/ contains=wdlType nextgroup=wdlIdentifier
26 |
27 | " Identifiers
28 | syntax match wdlIdentifier /\v\s*\w+/ contained
29 | syntax match wdlTaskName /\v\s*\w+/ contained
30 |
31 | " Strings
32 | syntax region wdlString start=/"/ skip=/\\"/ end=/"/ oneline contains=wdlInterpolationWrapper
33 | syntax region wdlInterpolationWrapper start="\v\$\{" end="\v\}" contained containedin=wdlString contains=wdlInterpolatedString
34 | syntax match wdlInterpolatedString "\v\w+" contained containedin=wdlInterpolationWrapper
35 |
36 | " Comments
37 | syntax match wdlComment "\v#.*$"
38 |
39 | highlight link wdlCommandParameter Comment
40 | highlight link wdlKeyword Keyword
41 | highlight link wdlCommandKeyword Keyword
42 | highlight link wdlCommand Punctuation
43 | highlight link wdlTaskName Identifier
44 |
45 | highlight link wdlCommandParameterName Identifier
46 | highlight link wdlIdentifier Identifier
47 | highlight link wdlType Type
48 | highlight link wdlString String
49 | highlight link wdlImport Include
50 | highlight link wdlInterpolationWrapper Include
51 | highlight link wdlInterpolatedString Include
52 | highlight link wdlComment Comment
53 |
54 | setlocal commentstring=#\ %s
55 | " WDL comments start with #, matching the wdlComment pattern above
56 | setlocal tabstop=2
57 | setlocal softtabstop=2
58 | setlocal shiftwidth=2
59 |
60 | au BufRead,BufNewFile *.wdl set filetype=wdl
61 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_merge_bams.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f as builder
7 |
8 | ENV SAMBAMBA_VERSION 0.6.6
9 | ENV PICARD_VERSION 2.27.5
10 |
11 | # To prevent time zone prompt
12 | ENV DEBIAN_FRONTEND=noninteractive
13 |
14 | # Install software from apt repo
15 | RUN apt-get update && apt-get install -y \
16 | autoconf \
17 | build-essential \
18 | git \
19 | libcurl4-openssl-dev \
20 | liblz4-dev \
21 | liblzma-dev \
22 | libncurses5-dev \
23 | libbz2-dev \
24 | python3 \
25 | unzip \
26 | wget \
27 | zlib1g-dev && \
28 | rm -rf /var/lib/apt/lists/*
29 |
30 |
31 | # Make directory for all software
32 | RUN mkdir /software
33 | WORKDIR /software
34 | ENV PATH="/software:${PATH}"
35 |
36 | # Install sambamba 0.6.6
37 | RUN wget https://github.com/lomereiter/sambamba/releases/download/v${SAMBAMBA_VERSION}/sambamba_v${SAMBAMBA_VERSION}_linux.tar.bz2 && \
38 | tar -xvjf sambamba_v${SAMBAMBA_VERSION}_linux.tar.bz2 && \
39 | mv sambamba_v${SAMBAMBA_VERSION} /usr/local/bin/sambamba && \
40 | rm -rf sambamba_*
41 |
42 | # Install Picard 2.27.5
43 | RUN wget https://github.com/broadinstitute/picard/releases/download/${PICARD_VERSION}/picard.jar && chmod +x picard.jar && mv picard.jar /usr/local/bin
44 |
45 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f
46 |
47 | LABEL maintainer = "Eugenio Mattei"
48 | LABEL software = "Share-seq pipeline"
49 | LABEL software.version="1.0.0"
50 | LABEL software.organization="Broad Institute of MIT and Harvard"
51 | LABEL software.version.is-production="Yes"
52 | LABEL software.task="merge"
53 |
54 | RUN apt-get update && apt-get install -y \
55 | openjdk-17-jre && \
56 | rm -rf /var/lib/apt/lists/*
57 |
58 | # Create and setup new user
59 | ENV USER=shareseq
60 | WORKDIR /home/$USER
61 |
62 | RUN groupadd -r $USER &&\
63 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
64 | chown $USER:$USER /home/$USER
65 |
66 | # Add folder with software to the path
67 | ENV PATH="/software:${PATH}"
68 |
69 | # Copy the compiled software from the builder
70 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/
71 | COPY --from=builder --chown=$USER:$USER /lib/x86_64-linux-gnu/* /lib/x86_64-linux-gnu/
72 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
73 |
74 |
75 | USER ${USER}
76 |
77 |
78 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_archr.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM r-base@sha256:fff003a52d076e963396876b83cfa88c4f40a8bc27e341339cd3cc0236c1db79 as builder
7 |
8 | LABEL maintainer = "Siddarth Wekhande"
9 | LABEL software = "Share-seq pipeline"
10 | LABEL software.version="1.0.0"
11 | LABEL software.organization="Broad Institute of MIT and Harvard"
12 | LABEL software.version.is-production="Yes"
13 | LABEL software.task="archr"
14 |
15 | RUN echo "options(repos = 'https://cloud.r-project.org')" > $(R --no-echo --no-save -e "cat(Sys.getenv('R_HOME'))")/etc/Rprofile.site
16 |
17 | ENV R_LIBS_USER=/usr/local/lib/R
18 | ENV RETICULATE_MINICONDA_ENABLED=FALSE
19 |
20 | RUN apt-get update -qq && \
21 | apt-get install -y -qq --no-install-recommends\
22 | binutils \
23 | gtk-doc-tools \
24 | libcairo2-dev \
25 | libcurl4-openssl-dev \
26 | libfreetype-dev \
27 | libfribidi-dev \
28 | libgsl-dev \
29 | libharfbuzz-dev \
30 | libhdf5-dev \
31 | libjpeg-dev \
32 | libmpfr-dev \
33 | libpng-dev \
34 | libssl-dev \
35 | libtiff5-dev \
36 | libxml2-dev \
37 | libxt-dev \
38 | libmagick++-dev \
39 | libgeos-dev \
40 | meson \
41 | python3 \
42 | python3-pip && \
43 | rm -rf /var/lib/apt/lists/*
44 |
45 | ENV USER=shareseq
46 | WORKDIR /home/$USER
47 |
48 | RUN groupadd -r $USER &&\
49 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
50 | chown $USER:$USER /home/$USER
51 |
52 | RUN R --no-echo --no-restore --no-save -e "install.packages(c('devtools','hdf5r','IRkernel','BiocManager','Cairo','magick'))"
53 |
54 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install(c('GenomeInfoDbData','GenomicRanges','Rsamtools'), update=F, ask=F)"
55 |
56 | RUN R --no-echo --no-restore --no-save -e "devtools::install_github('GreenleafLab/ArchR@v1.0.1', repos = BiocManager::repositories());ArchR::installExtraPackages()"
57 |
58 | RUN R --no-echo --no-restore --no-save -e "devtools::install_github('immunogenomics/presto')"
59 |
60 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.3.0')"
61 |
62 |
63 | RUN R --no-echo --no-restore --no-save -e "install.packages(c('logr','hexbin', 'ggpointdensity'))"
64 |
65 | RUN python3 -m pip install --break-system-packages jupyter papermill
66 |
67 | COPY src/jupyter_nb/archr_notebook.ipynb /usr/local/bin/
68 |
69 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
70 |
71 |
72 | RUN R -e "IRkernel::installspec()"
73 |
--------------------------------------------------------------------------------
/workflows/subwf-atac-archr.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # Import the tasks called by the pipeline
4 | import "../tasks/share_task_archr.wdl" as share_task_archr
5 |
6 |
7 | workflow wf_atac {
8 | meta {
9 | version: 'v0.1'
10 | author: 'Eugenio Mattei (emattei@broadinstitute.org) and Sai Ma @ Broad Institute of MIT and Harvard'
11 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Sub-workflow to process the ATAC portion of SHARE-seq libraries.'
12 | }
13 |
14 | input {
15 |         # ATAC sub-workflow inputs
16 | File atac_fragments_filtered
17 | String genome_name
18 | String peak_set
19 | Int? cpus = 4
20 | String? docker
21 | String? prefix
22 | Int? min_tss = 4
23 | Int? min_frags = 1000
24 | Float? archr_disk_factor = 8.0
25 | Float? archr_memory_factor = 4.0
26 | }
27 |
28 | call share_task_archr.archr as archr{
29 | input:
30 | atac_frag = atac_fragments_filtered,
31 | genome = genome_name,
32 | peak_set = peak_set,
33 | min_tss = min_tss,
34 | min_frags = min_frags,
35 | doublet_k = 10,
36 | doublet_knn_method = "UMAP",
37 | lsi_method = 1,
38 | docker_image = docker,
39 | prefix = prefix,
40 | disk_factor = archr_disk_factor,
41 | memory_factor = archr_memory_factor
42 | }
43 |
44 | output {
45 | File share_atac_archr_notebook_output = archr.notebook_output
46 | File share_atac_archr_notebook_log = archr.notebook_log
47 |
48 | File? share_atac_archr_raw_tss_enrichment = archr.archr_raw_tss_by_uniq_frags_plot
49 | File? share_atac_archr_filtered_tss_enrichment = archr.archr_filtered_tss_by_uniq_frags_plot
50 | File? share_atac_archr_raw_fragment_size_plot = archr.archr_raw_frag_size_dist_plot
51 | File? share_atac_archr_filtered_fragment_size_plot = archr.archr_filtered_frag_size_dist_plot
52 |
53 | File? share_atac_archr_umap_doublets = archr.archr_umap_doublets
54 | File? share_atac_archr_umap_cluster_plot = archr.archr_umap_cluster_plot
55 | File? share_atac_archr_umap_num_frags_plot = archr.archr_umap_num_frags_plot
56 | File? share_atac_archr_umap_tss_score_plot = archr.archr_umap_tss_score_plot
57 | File? share_atac_archr_umap_frip_plot = archr.archr_umap_frip_plot
58 |
59 | File? share_atac_archr_gene_heatmap_plot = archr.archr_heatmap_plot
60 | File? share_atac_archr_arrow = archr.archr_arrow
61 | File? share_atac_archr_obj = archr.archr_raw_obj
62 | File? share_atac_archr_plots_zip = archr.plots_zip
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/dockerfiles/dorcs_task_find_dorcs.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM r-base@sha256:fff003a52d076e963396876b83cfa88c4f40a8bc27e341339cd3cc0236c1db79 as builder
7 |
8 | RUN echo "options(repos = 'https://cloud.r-project.org')" > $(R --no-echo --no-save -e "cat(Sys.getenv('R_HOME'))")/etc/Rprofile.site
9 |
10 | ENV R_LIBS_USER=/usr/local/lib/R
11 | ENV RETICULATE_MINICONDA_ENABLED=FALSE
12 |
13 | RUN apt-get update -qq && \
14 | apt-get install -y --no-install-recommends \
15 | binutils \
16 | gtk-doc-tools \
17 | libcairo2-dev \
18 | libcurl4-openssl-dev \
19 | libfreetype-dev \
20 | libfribidi-dev \
21 | libgsl-dev \
22 | libharfbuzz-dev \
23 | libhdf5-dev \
24 | libjpeg-dev \
25 | libmpfr-dev \
26 | libpng-dev \
27 | libssl-dev \
28 | libtiff5-dev \
29 | libxml2-dev \
30 | libxt-dev \
31 | libmagick++-dev \
32 | libgeos-dev \
33 | meson \
34 | python3 \
35 | python3-pip && \
36 | rm -rf /var/lib/apt/lists/*
37 |
38 | RUN R --no-echo --no-restore --no-save -e "install.packages(c('dplyr','patchwork','ggplot2','ggrepel','reshape2','circlize','networkD3','GGally','igraph','network','foreach','iterators','hdf5r','ggrastr','BiocManager','remotes','pbmcapply','doSNOW','Rmpfr', 'glue','magrittr','pillar','RcppArmadillo','reticulate','rlang','yaml','rpart','IRkernel','data.table', 'tidyft','qlcMatrix','logr'))"
39 |
40 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.1.1')"
41 |
42 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install(c('Biostrings','rtracklayer','GenomicRanges','motifmatchr','ComplexHeatmap','chromVAR'), update=T, ask=F)"
43 |
44 | RUN R --no-echo --no-restore --no-save -e "remotes::install_github('caleblareau/BuenColors')"
45 |
46 | ENV USER=shareseq
47 | WORKDIR /home/$USER
48 | RUN groupadd -r $USER &&\
49 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
50 | chown $USER:$USER /home/$USER
51 |
52 | RUN python3 -m pip install --break-system-packages jupyter papermill
53 |
54 | RUN chown $USER:$USER /usr/local/lib/R
55 |
56 | COPY --chown=$USER:$USER src/jupyter_nb/dorcs_jplot_notebook.ipynb /usr/local/bin/
57 |
58 | #COPY --chown=$USER:$USER src/jupyter_nb/dorcs_notebook_rds.ipynb /usr/local/bin/
59 |
60 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
61 |
62 |
63 | RUN mkdir -p /home/R/
64 |
65 | COPY --chown=$USER:$USER src/R/DORCS_helper_functions_optimized.R src/R/TSSRanges.RData /home/R/
66 |
67 | USER ${USER}
68 |
69 | RUN R -e "IRkernel::installspec()"
70 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_bowtie2.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM debian:buster-slim as builder
7 |
8 | ENV BOWTIE2_VERSION 2.4.3
9 | ENV SAMTOOLS_VERSION 1.9
10 |
11 | # To prevent time zone prompt
12 | ENV DEBIAN_FRONTEND=noninteractive
13 |
14 | # Install software from apt repo
15 | RUN apt-get update && apt-get install -y \
16 | build-essential \
17 | cpanminus \
18 | git \
19 | liblz4-dev \
20 | liblzma-dev \
21 | libncurses5-dev \
22 | libbz2-dev \
23 | unzip \
24 | wget \
25 | zlib1g-dev &&\
26 | rm -rf /var/lib/apt/lists/*
27 |
28 |
29 | # Make directory for all software
30 | RUN mkdir /software
31 | WORKDIR /software
32 | ENV PATH="/software:${PATH}"
33 |
34 | RUN cpanm Sys::Hostname
35 |
36 | # Install Bowtie2 2.4.3
37 | RUN wget https://sourceforge.net/projects/bowtie-bio/files/bowtie2/${BOWTIE2_VERSION}/bowtie2-${BOWTIE2_VERSION}-source.zip && \
38 | unzip bowtie2-${BOWTIE2_VERSION}-source.zip && cd bowtie2-${BOWTIE2_VERSION} && make static-libs && make STATIC_BUILD=1 && \
39 | cp bowtie2* .. && \
40 | cd .. && rm -rf bowtie2-${BOWTIE2_VERSION}*
41 |
42 | # Install samtools 1.9
43 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \
44 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \
45 | cd samtools && make && make install && cd ../ && rm -rf samtools* htslib*
46 |
47 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f
48 |
49 | LABEL maintainer = "Eugenio Mattei"
50 | LABEL software = "Share-seq pipeline"
51 | LABEL software.version="1.0.0"
52 | LABEL software.organization="Broad Institute of MIT and Harvard"
53 | LABEL software.version.is-production="Yes"
54 | LABEL software.task="Bowtie2"
55 |
56 | RUN apt-get update && apt-get install -y \
57 | cpanminus && \
58 | rm -rf /var/lib/apt/lists/*
59 |
60 | # Create and setup new user
61 | ENV USER=shareseq
62 | WORKDIR /home/$USER
63 |
64 | RUN groupadd -r $USER &&\
65 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
66 | chown $USER:$USER /home/$USER
67 |
68 | # Add folder with software to the path
69 | ENV PATH="/software:${PATH}"
70 |
71 | # Copy the compiled software from the builder
72 | COPY --from=builder --chown=$USER:$USER /software/bowtie2* /software/
73 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/
74 | COPY --from=builder --chown=$USER:$USER /lib/x86_64-linux-gnu/* /lib/x86_64-linux-gnu/
75 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
76 |
77 |
78 |
79 |
80 | USER $USER
81 |
--------------------------------------------------------------------------------
/src/python/write_html.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Write output HTML file from list of images and text
4 |
5 | @author Neva Durand (c) 2021
6 | """
7 | import argparse
8 | import base64
9 | import io
10 | import os.path
11 |
12 | def main(output_file_name, image_file_list, log_file_list, input_file_name=None):
13 | """
14 | Write to the input file
15 | Image file list is list of png images
16 | Log file list is list of text log files to link to
17 |
18 | Separates images by br tag and encodes directly in utf-8
19 | Log files separated by their title and encoded via pre tag
20 | """
21 | # Open output file, write input if exists
22 | output_file = io.open(output_file_name, 'w', encoding='utf8')
23 |     output_file.write('<html><body><h2>Results summary</h2>')
24 | if input_file_name is not None:
25 | with open(input_file_name) as input_file:
26 | output_file.write(input_file.read())
27 |
28 | with open(image_file_list) as fname:
29 | images = fname.read().splitlines()
30 |
31 | # loop through images in image list and encode
32 |     output_file.write('<br>')
33 | for image in images:
34 | data = open(image, 'rb').read() # read bytes from file
35 | data_base64 = base64.b64encode(data) # encode to base64 (bytes)
36 | data_base64 = data_base64.decode('utf-8') # convert bytes to string
37 |         output_file.write('<img src="data:image/png;base64,' + data_base64 + '"><br>') # embed in html
38 |
39 | with open(log_file_list) as fname:
40 | logs = fname.read().splitlines()
41 |
42 | # loop through log files in log list and write
43 | for log in logs:
44 | output_file.write(log)
45 |         output_file.write("<br>")
46 |     output_file.write('</body></html>')
47 | output_file.close()
48 |
49 | if __name__ == '__main__':
50 | parser = argparse.ArgumentParser(
51 | formatter_class=argparse.RawDescriptionHelpFormatter,
52 | description=__doc__.split('\n\n\n')[0])
53 | group = parser.add_argument_group()
54 | group.add_argument('output_file_name',
55 | help='html file to write to')
56 | group.add_argument('image_file_list',
57 | help='file containing list of image files to paste in HTML file')
58 | group.add_argument('log_file_list',
59 | help='file containing list of text log files to append to end of HTML file')
60 | group.add_argument('--input_file_name',
61 | help='optional file with html text to add at top of file', nargs='?')
62 | args = parser.parse_args()
63 | main(args.output_file_name, args.image_file_list, args.log_file_list, args.input_file_name)
64 |
65 |
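66 | # Usage sketch (hypothetical report name): image_list.txt and log_list.txt each hold one path per line,
67 | # matching how the html_report WDL task invokes this script:
68 | #   PYTHONIOENCODING=utf-8 python3 write_html.py report.html image_list.txt log_list.txt --input_file_name output.txt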
--------------------------------------------------------------------------------
/src/python/bam_to_fragments.py:
--------------------------------------------------------------------------------
1 | # From Kundaje lab
2 | # https://github.com/kundajelab/ENCODE_scatac/blob/master/workflow/scripts/bam_to_fragments.py
3 |
4 | import argparse
5 | import pysam
6 | import sys
7 |
8 | def bam_to_frag(in_path, out_path, barcode_tag="CB", shift_plus=4, shift_minus=-4):
9 | """
10 | Convert coordinate-sorted BAM file to a fragment file format, while adding Tn5 coordinate adjustment
11 | BAM should be pre-filtered for PCR duplicates, secondary alignments, and unpaired reads
12 | Output fragment file is sorted by chr, start, end, barcode
13 | """
14 |
15 | input = pysam.AlignmentFile(in_path, "rb")
16 | with open(out_path, "w") as out_file:
17 | buf = []
18 | curr_pos = None
19 | for read in input:
20 | if read.flag & 16 == 16:
21 | continue # ignore reverse (coordinate-wise second) read in pair
22 |
23 | chromosome = read.reference_name
24 | start = read.reference_start + shift_plus
25 | end = read.reference_start + read.template_length + shift_minus
26 | cell_barcode = read.get_tag(barcode_tag)
27 | # assert(read.next_reference_start >= read.reference_start) ####
28 | data = (chromosome, start, end, cell_barcode, 1)
29 | pos = (chromosome, start)
30 |
31 | if pos == curr_pos:
32 | buf.append(data)
33 | else:
34 | buf.sort()
35 | for i in buf:
36 | print(*i, sep="\t", file=out_file)
37 | buf.clear()
38 | buf.append(data)
39 |                 curr_pos = pos
40 |
41 |         # Flush any fragments still buffered at the last position so they are not dropped
42 |         buf.sort()
43 |         for i in buf:
44 |             print(*i, sep="\t", file=out_file)
45 |
41 | if __name__ == '__main__':
42 |
43 |     msg = "Convert a coordinate-sorted BAM file into a Tn5 shift-corrected fragment file."
44 | parser = argparse.ArgumentParser(description = msg)
45 |
46 |     # Define command-line arguments
47 | parser.add_argument("bam", help = "Path to the coordinate-sorted bam file.")
48 | parser.add_argument("-o", "--output", help = "Path to the fragments output file.")
49 | parser.add_argument("--prefix", help = "Prefix for the metrics output file.")
50 | parser.add_argument("--shift_plus", help = "Tn5 coordinate adjustment for the plus strand.", type = int, default = 4)
51 | parser.add_argument("--shift_minus", help = "Tn5 coordinate adjustment for the minus strand.", type = int, default = -4)
52 | parser.add_argument("--bc_tag", help = "Specify the tag containing the cell barcode.", default="CB")
53 |
54 | # Read arguments from command line
55 | args = parser.parse_args()
56 |
57 | if args.prefix:
58 | prefix = args.prefix
59 | else:
60 | prefix = args.bam[:-4]
61 |
62 | if args.output:
63 | out_path = args.output
64 | else:
65 | out_path = f"{prefix}.fragments.tsv"
66 |
67 | bc_tag = args.bc_tag
68 |
69 |
70 | bam_to_frag(args.bam, out_path, bc_tag, shift_plus=args.shift_plus, shift_minus=args.shift_minus)
71 |
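72 | # Usage sketch (hypothetical file names; the BAM should be coordinate-sorted and duplicate-filtered):
73 | #   python3 bam_to_fragments.py sample.filtered.sorted.bam -o sample.fragments.tsv --bc_tag CB --shift_plus 4 --shift_minus -4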
--------------------------------------------------------------------------------
/dockerfiles/share_task_star.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f as builder
7 |
8 | ENV STAR_VERSION 2.7.10a_alpha_220818
9 | ENV SAMTOOLS_VERSION 1.9
10 |
11 | # To prevent time zone prompt
12 | ENV DEBIAN_FRONTEND=noninteractive
13 |
14 | # Install software from apt repo
15 | RUN apt-get update && apt-get install -y \
16 | build-essential \
17 | git \
18 | liblz4-dev \
19 | liblzma-dev \
20 | libncurses5-dev \
21 | libbz2-dev \
22 | unzip \
23 | wget \
24 | zlib1g-dev &&\
25 | rm -rf /var/lib/apt/lists/*
26 |
27 |
28 | # Make directory for all software
29 | RUN mkdir /software
30 | WORKDIR /software
31 | ENV PATH="/software:${PATH}"
32 |
33 | # Install STAR 2.7.10a
34 | RUN wget https://github.com/alexdobin/STAR/releases/download/2.7.10a_alpha_220818/STAR_2.7.10a_alpha_220818_Linux_x86_64_static.zip && unzip STAR_2.7.10a_alpha_220818_Linux_x86_64_static.zip && mv STAR /usr/local/bin/
35 | #RUN wget https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.tar.gz && tar -xzf ${STAR_VERSION}.tar.gz
36 | #RUN cd STAR-${STAR_VERSION}/source && make STAR && rm ../../${STAR_VERSION}.tar.gz && mv /software/STAR-${STAR_VERSION}/bin/Linux_x86_64/* /usr/local/bin/
37 |
38 | # Install samtools 1.9
39 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \
40 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \
41 | cd samtools && make && make install && cd ../ && rm -rf samtools* htslib*
42 |
43 | FROM debian@sha256:3ecce669b6be99312305bc3acc90f91232880c68b566f257ae66647e9414174f
44 |
45 | LABEL maintainer = "Eugenio Mattei"
46 | LABEL software = "Share-seq pipeline"
47 | LABEL software.version="1.0.0"
48 | LABEL software.organization="Broad Institute of MIT and Harvard"
49 | LABEL software.version.is-production="Yes"
50 | LABEL software.task="STAR"
51 |
52 | RUN apt-get update && apt-get install -y \
53 | libgc-dev &&\
54 | rm -rf /var/lib/apt/lists/*
55 |
56 | # Create and setup new user
57 | ENV USER=shareseq
58 | WORKDIR /home/$USER
59 |
60 | RUN groupadd -r $USER &&\
61 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
62 | chown $USER:$USER /home/$USER
63 |
64 | # Add folder with software to the path
65 | ENV PATH="/software:${PATH}"
66 |
67 | # Copy the compiled software from the builder
68 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/
69 | COPY --from=builder --chown=$USER:$USER /usr/lib/x86_64-linux-gnu/libgomp.so.1 /lib/x86_64-linux-gnu/libncurses.so.6 /lib/x86_64-linux-gnu/
70 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
71 |
72 | USER $USER
73 |
--------------------------------------------------------------------------------
/workflows/subwf-rna-seurat.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # Import the tasks called by the pipeline
4 | import "../tasks/share_task_seurat.wdl" as share_task_seurat
5 |
6 | workflow wf_rna {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) and Sai Ma @ Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Sub-workflow to process the RNA portion of SHARE-seq libraries.'
11 | }
12 |
13 | input {
14 | # RNA Seurat inputs
15 |
16 | String prefix
17 | String genome_name
18 | String? docker
19 | File h5_matrix
20 |
21 | #Seurat filtering parameters
22 | Int? min_features
23 | Float? percent_mt
24 | Int? min_cells
25 |
26 | # Seurat UMAP parameters
27 | Int? umap_dim
28 | Float? umap_resolution
29 |
30 | #Seurat runtime parameters
31 | Float? disk_factor
32 | Float? memory_factor
33 | }
34 |
35 | call share_task_seurat.seurat as seurat{
36 | input:
37 | rna_matrix = h5_matrix,
38 | genome_name = genome_name,
39 | min_features = min_features,
40 | percent_mt = percent_mt,
41 | min_cells = min_cells,
42 | umap_dim = umap_dim,
43 | umap_resolution = umap_resolution,
44 | prefix = prefix,
45 | docker_image = docker,
46 | disk_factor = disk_factor,
47 | memory_factor = memory_factor
48 | }
49 |
50 | output {
51 | File share_rna_seurat_notebook_output = seurat.notebook_output
52 | File share_rna_seurat_notebook_log = seurat.notebook_log
53 | File? share_rna_seurat_raw_violin_plot = seurat.seurat_raw_violin_plot
54 | File? share_rna_seurat_filtered_violin_plot = seurat.seurat_filtered_violin_plot
55 | File? share_rna_seurat_raw_qc_scatter_plot = seurat.seurat_raw_qc_scatter_plot
56 | File? share_rna_seurat_filtered_qc_scatter_plot = seurat.seurat_filtered_qc_scatter_plot
57 | File? share_rna_seurat_variable_genes_plot = seurat.seurat_variable_genes_plot
58 | File? share_rna_seurat_PCA_dim_loadings_plot = seurat.seurat_PCA_dim_loadings_plot
59 | File? share_rna_seurat_PCA_plot = seurat.seurat_PCA_plot
60 | File? share_rna_seurat_heatmap_plot = seurat.seurat_heatmap_plot
61 | File? share_rna_seurat_jackstraw_plot = seurat.seurat_jackstraw_plot
62 | File? share_rna_seurat_elbow_plot = seurat.seurat_elbow_plot
63 | File? share_rna_seurat_umap_cluster_plot = seurat.seurat_umap_cluster_plot
64 | File? share_rna_seurat_umap_rna_count_plot = seurat.seurat_umap_rna_count_plot
65 | File? share_rna_seurat_umap_gene_count_plot = seurat.seurat_umap_gene_count_plot
66 | File? share_rna_seurat_umap_mito_plot = seurat.seurat_umap_mito_plot
67 | File? share_rna_seurat_obj = seurat.seurat_filtered_obj
68 | File? share_rna_plots_zip = seurat.plots_zip
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/src/R/atac_qc_plots.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/Rscript
2 |
3 | ### Takes ATAC barcode metadata tsv file, and outputs barcode rank plots as a png file.
4 |
5 | ## Import helper functions
6 | source("/usr/local/bin/barcode_rank_functions.R")
7 |
8 | ## Get arguments, read input
9 | args <- commandArgs()
10 |
11 | barcode_metadata_file <- args[6]
12 | fragment_cutoff <- as.integer(args[7])
13 | fragment_rank_plot_file <- args[8]
14 |
15 | barcode_metadata <- read.table(barcode_metadata_file, header=T)
16 |
17 | ## Get plot inputs
18 |
19 | # Impose fragment cutoff, sort in decreasing order, assign rank
20 | # 1 fragment = 2 reads
21 | fragment <- barcode_metadata$reads_unique / 2
22 | fragment_filtered <- fragment[fragment >= fragment_cutoff]
23 | fragment_filtered_sort <- sort(fragment_filtered, decreasing=T)
24 | fragment_rank <- 1:length(fragment_filtered_sort)
25 |
26 | # Find elbow/knee of fragment barcode rank plot and top-ranked fragment barcode rank plot
27 | fragment_points <- get_elbow_knee_points(x=fragment_rank, y=log10(fragment_filtered_sort))
28 | # For each valid plot, make factor for coloring plot points
29 | if (length(fragment_points) > 0) { # Elbow found in first plot
30 | fragment_plot1 <- TRUE
31 | is_top_ranked_fragment <- factor(ifelse(fragment_rank <= fragment_points[1], 1, 0))
32 | if (length(fragment_points) > 2) { # Elbow/knee found in second plot
33 | fragment_plot2 <- TRUE
34 | fragment_top_rank <- fragment_rank[1:fragment_points[1]]
35 | fragment_top_fragment <- fragment_filtered_sort[1:fragment_points[1]]
36 | is_top_top_ranked_fragment <- factor(ifelse(fragment_top_rank <= fragment_points[3], 1, 0))
37 | } else {
38 | fragment_plot2 <- FALSE
39 | }
40 | } else {
41 |   fragment_plot1 <- fragment_plot2 <- FALSE # no elbow found; skip both plots
42 | }
43 |
44 | ## Generate plots
45 |
46 | options(scipen=999)
47 |
48 | # Make fragment barcode rank plots
49 | png(fragment_rank_plot_file, width=8, height=8, units='in', res=300)
50 | par(mfrow = c(2,1))
51 |
52 | # Plot 1 (all barcodes passing fragment filter vs log10(fragments))
53 | if (fragment_plot1) {
54 | plot(x=fragment_rank,
55 | y=fragment_filtered_sort,
56 | log="y",
57 | xlab=paste0(" Barcode rank (", length(fragment_rank)-fragment_points[1], " low quality cells)"),
58 | ylab="Fragments per barcode (log10 scale)",
59 | main="ATAC Fragments per Barcode",
60 | col=c("dimgrey","darkblue")[is_top_ranked_fragment],
61 | pch=16,
62 | ylim=c(1,100000))
63 | abline(v=fragment_points[1], h=10^(fragment_points[2]))
64 | text(fragment_points[1], 10^(fragment_points[2]),
65 | paste0("(", fragment_points[1], ", ", 10^(fragment_points[2]), ")"),
66 | adj=c(-0.1,-0.5))
67 | }
68 |
69 | # Plot 2 (top ranked barcodes vs log10(fragments))
70 | if (fragment_plot2) {
71 | plot(x=fragment_top_rank,
72 | y=fragment_top_fragment,
73 | log="y",
74 | xlab="Barcode rank",
75 | ylab="Fragments per barcode (log10 scale)",
76 | main="ATAC Fragments per Top-Ranked Barcode",
77 | col=c("dimgrey","darkblue")[is_top_top_ranked_fragment],
78 | pch=16,
79 | ylim=c(1,100000))
80 | abline(v=fragment_points[3], h=10^(fragment_points[4]))
81 | text(fragment_points[3], 10^(fragment_points[4]),
82 | paste("(", fragment_points[3], ", ", 10^(fragment_points[4]), ")", sep=""),
83 | adj=c(-0.1,-0.5))
84 | }
85 | dev.off()
86 |
87 |
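88 | ## Usage sketch (hypothetical arguments): with a plain Rscript invocation, commandArgs() places the
89 | ## first user argument at index 6, which is why the script reads args[6:8] above.
90 | ##   Rscript atac_qc_plots.R barcode_metadata.tsv 10 atac_fragment_rank_plots.png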
--------------------------------------------------------------------------------
/dockerfiles/share_task_cell_annotation.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM ubuntu@sha256:2fdb1cf4995abb74c035e5f520c0f3a46f12b3377a59e86ecca66d8606ad64f9
7 |
8 | LABEL maintainer = "Zhijian Li"
9 | LABEL software = "Share-seq pipeline"
10 | LABEL software.version="0.0.1"
11 | LABEL software.organization="Broad Institute of MIT and Harvard"
12 | LABEL software.version.is-production="No"
13 | LABEL software.task="cell-annotation"
14 |
15 | # To prevent time zone prompt
16 | ENV DEBIAN_FRONTEND=noninteractive
17 | ENV RETICULATE_MINICONDA_ENABLED=FALSE
18 |
19 | ## Create new user
20 | ENV USER=shareseq
21 | WORKDIR /home/$USER
22 | RUN groupadd -r $USER && \
23 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
24 | chown $USER:$USER /home/$USER
25 |
26 | # Install libraries
27 | RUN apt-get update
28 | RUN apt-get install -y --no-install-recommends \
29 | gcc \
30 | g++ \
31 | gfortran \
32 | patch \
33 | build-essential \
34 | binutils \
35 | gtk-doc-tools \
36 | libcairo2-dev \
37 | libcurl4-openssl-dev \
38 | libfreetype6-dev \
39 | libfribidi-dev \
40 | libgsl-dev \
41 | libharfbuzz-dev \
42 | libhdf5-dev \
43 | libjpeg-dev \
44 | libmpfr-dev \
45 | libpng-dev \
46 | libssl-dev \
47 | libtiff5-dev \
48 | libxml2-dev \
49 | libxt-dev \
50 | libgeos-dev \
51 | meson \
52 | libblas-dev \
53 | liblapack-dev \
54 | libbz2-dev
55 |
56 | # Install python and R
57 | RUN apt-get install -y --no-install-recommends \
58 | python3 python3-pip python3-dev python3-venv r-base
59 |
60 | RUN rm -rf /var/lib/apt/lists/*
61 |
62 | RUN echo "options(repos = 'https://cloud.r-project.org')" > $(R --no-echo --no-save -e "cat(Sys.getenv('R_HOME'))")/etc/Rprofile.site
63 | ENV R_LIBS_USER=/usr/local/lib/R
64 |
65 | RUN R --no-echo --no-restore --no-save -e "install.packages('hdf5r')"
66 | RUN R --no-echo --no-restore --no-save -e "install.packages('remotes')"
67 | RUN R --no-echo --no-restore --no-save -e "install.packages('IRkernel')"
68 | RUN R --no-echo --no-restore --no-save -e "install.packages('logr')"
69 | RUN R --no-echo --no-restore --no-save -e "install.packages('BiocManager')"
70 | RUN R --no-echo --no-restore --no-save -e "install.packages('glue')"
71 | RUN R --no-echo --no-restore --no-save -e "install.packages('Matrix')"
72 | RUN R --no-echo --no-restore --no-save -e "install.packages('SeuratObject')"
73 | RUN R --no-echo --no-restore --no-save -e "remotes::install_version('Seurat', version = '4.3.0')"
74 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install('rhdf5', update=F, ask=F)"
75 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install('EnsDb.Mmusculus.v79', update=F, ask=F)"
76 | RUN R --no-echo --no-restore --no-save -e "BiocManager::install('EnsDb.Hsapiens.v86', update=F, ask=F)"
77 | RUN R --no-echo --no-restore --no-save -e "install.packages('optparse')"
78 |
79 | RUN python3 -m pip install anndata cellxgene-census
80 |
81 | COPY src/bash/monitor_script.sh /usr/local/bin
82 | COPY src/python/get_cellxgene_data.py /usr/local/bin
83 | COPY src/R/cell_annotation.R /usr/local/bin/
84 | COPY src/R/cell_annotation_helper_functions.R /usr/local/bin/
85 |
86 |
--------------------------------------------------------------------------------
/tasks/share_task_trim_fastqs_atac.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # trim_fastqs_atac
5 |
6 | task share_trim_fastqs_atac {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: trim ATAC FASTQs.'
11 | }
12 |
13 | input {
14 | File fastq_R1 # Pair 1 reads
15 | File fastq_R2 # Pair 2 reads
16 | String chemistry
17 |
18 | Int? cpus = 16
19 | Float? disk_factor = 8.0
20 | Float? memory_factor = 0.15
21 | String? docker_image = "us.gcr.io/buenrostro-share-seq/share_task_trim_fastqs_atac:v1.0.0"
22 | }
23 |
24 | # Determine the size of the input
25 | Float input_file_size_gb = size(fastq_R1, "G") + size(fastq_R2, "G")
26 |
27 |     # Determining memory size based on the size of the input files.
28 | Float mem_gb = 16.0 + memory_factor * input_file_size_gb
29 |
30 |     # Determining disk size based on the size of the input files.
31 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb)
32 |
33 |     # Determining disk type based on the size of the disk.
34 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
35 |
36 | # Read trimming outfiles
37 | String fastq_R1_trimmed = basename(fastq_R1, ".fastq.gz") + "_trimmed.fastq"
38 | String fastq_R2_trimmed = basename(fastq_R2, ".fastq.gz") + "_trimmed.fastq"
39 | String trimming_log_json = basename(fastq_R1, "R1.fastq.gz") + ".atac.preprocess.trimming.log.json"
40 | String trimming_log_html = basename(fastq_R1, "R1.fastq.gz") + ".atac.preprocess.trimming.log.html"
41 | String trimming_stats = basename(fastq_R1, "R1.fastq.gz") + ".atac.preprocess.trimming.adapter.stats.txt"
42 | String monitor_log = 'trim_fastqs_atac_monitor.log'
43 |
44 | command <<<
45 | set -e
46 |
47 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 &
48 |
49 | # Use trim_fastq script for SHARE ATAC trimming
50 | if [ '~{chemistry}' == 'shareseq' ]; then
51 | python3 $(which trim_fastq.py) ~{fastq_R1} ~{fastq_R2} ~{fastq_R1_trimmed} ~{fastq_R2_trimmed} ~{trimming_stats}
52 |
53 | # Use fastp for 10X ATAC trimming
54 | else
55 | fastp -i ~{fastq_R1} -I ~{fastq_R2} -o ~{fastq_R1_trimmed} -O ~{fastq_R2_trimmed} -h ~{trimming_log_html} -j ~{trimming_log_json} -G -Q -L -w ~{cpus} 2> ~{trimming_stats}
56 |
57 | fi
58 |
59 | pigz -p ~{cpus} *.fastq
60 | >>>
61 |
62 | output {
63 | File fastq_R1_trimmed = fastq_R1_trimmed + ".gz"
64 | File fastq_R2_trimmed = fastq_R2_trimmed + ".gz"
65 | File? tenx_trimming_log_json = trimming_log_json
66 | File? tenx_trimming_log_html = trimming_log_html
67 | File trimming_stats = trimming_stats
68 | File trim_fastqs_atac_monitor = monitor_log
69 | }
70 |
71 | runtime {
72 | cpu: cpus
73 | docker: "${docker_image}"
74 | disks: "local-disk ${disk_gb} ${disk_type}"
75 | memory: "${mem_gb} GB"
76 | }
77 |
78 | parameter_meta {
79 | fastq_R1: {
80 | description: 'Pairs 1 fastq',
81 | help: 'Pairs 1 fastq',
82 | }
83 | fastq_R2: {
84 | description: 'Pairs 2 fastq',
85 | help: 'Pairs 2 fastq',
86 | }
87 | }
88 |
89 | }
90 |
--------------------------------------------------------------------------------
/tasks/share_task_generate_h5.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-rna-generate-h5
5 |
6 |
7 | task generate_h5 {
8 | meta {
9 | version: 'v0.1'
10 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard'
11 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: RNA gene x cell matrix'
12 | }
13 |
14 | input {
15 |         # This task computes the gene x cell matrix.
16 | File tar
17 | String genome_name
18 | String? pkr
19 | String prefix
20 | String? gene_naming
21 |
22 | Float? disk_factor = 8.0
23 | Float? memory_factor = 2.0
24 | String? docker_image = "us.gcr.io/buenrostro-share-seq/share_task_generate_h5:v1.0.0"
25 | }
26 |
27 | # Determine the size of the input
28 | Float input_file_size_gb = size(tar, "G")
29 |
30 | # Determining memory size based on the size of the input files.
31 | Float mem_gb = 10.0 + memory_factor * input_file_size_gb
32 |
33 | # Determining disk size based on the size of the input files.
34 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb)
35 |
36 | # Determining disk type based on the size of disk.
37 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
38 |
39 | String ensembl_option = if "${gene_naming}"=="ensembl" then "--ensembl" else ""
40 | String h5 = "${default="share-seq" prefix}.${genome_name}.rna.h5"
41 | String monitor_log = "monitor.log"
42 |
43 | command <<<
44 | set -e
45 |
46 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 &
47 |
48 | # Untar
49 | tar xzvf ~{tar}
50 |
51 | # Generate h5 file
52 | python3 $(which generate_h5_rna.py) \
53 | ./matrix.mtx.gz \
54 | ./features.tsv.gz \
55 | ./barcodes.tsv.gz \
56 | ~{h5} \
57 | ~{pkr} \
58 | ~{ensembl_option}
59 | >>>
60 |
61 | output {
62 | File h5_matrix = "${h5}"
63 | }
64 |
65 | runtime {
66 | memory : "${mem_gb} GB"
67 | disks: "local-disk ${disk_gb} ${disk_type}"
68 | docker : "${docker_image}"
69 | }
70 |
71 | parameter_meta {
72 | tar: {
73 | description: 'STARsolo output tar.gz file',
74 | help: 'tar.gz file containing raw matrix, features, and barcodes file from STARsolo.',
75 | example: 'raw.tar.gz'
76 | }
77 | genome_name: {
78 | description: 'Reference name',
79 | help: 'The name genome reference used to align.',
80 | example: ['hg38', 'mm10', 'hg19', 'mm9']
81 | }
82 | prefix: {
83 | description: 'Prefix for output files',
84 | help: 'Prefix that will be used to name the output files.',
85 | example: 'MyExperiment'
86 | }
87 | gene_naming: {
88 | description: 'Gene naming convention',
89 | help: 'Convention for gene naming in h5 matrix; either "gene_name" (default) or "ensembl".',
90 | example: ['gene_name', 'ensembl']
91 | }
92 | docker_image: {
93 | description: 'Docker image.',
94 | help: 'Docker image for preprocessing step. Dependencies: python3 -m pip install h5py scipy',
95 | example: ['put link to gcr or dockerhub']
96 | }
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/tasks/share_task_html_report.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-html-report
5 | # Gather information from log files
6 |
7 |
8 | task html_report {
9 | meta {
10 | version: 'v0.1'
11 | author: 'Neva C. Durand (neva@broadinstitute.org) at Broad Institute of MIT and Harvard'
12 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: create html report task'
13 | }
14 |
15 | input {
16 | # This function takes as input the files to append to the report
17 | # and the metrics and writes out an html file
18 |
19 | String? prefix
20 |
21 | # Stats for ATAC and RNA, will go at top of html
22 | Int? atac_total_reads
23 | Int? atac_aligned_uniquely
24 | Int? atac_unaligned
25 | Int? atac_feature_reads
26 | Int? atac_duplicate_reads
27 | Float? atac_percent_duplicates
28 | Int? rna_total_reads
29 | Int? rna_aligned_uniquely
30 | Int? rna_aligned_multimap
31 | Int? rna_unaligned
32 | Int? rna_feature_reads
33 | Int? rna_duplicate_reads
34 |
35 | ## JPEG files to be encoded and appended to html
36 | Array[File?] image_files
37 |
38 | ## Raw text logs to append to end of html
39 | Array[String?] log_files
40 |
41 | }
42 |
43 | String output_file = "${default="share-seq" prefix}.html"
44 | # need to select from valid files since some are optional
45 | Array[File] valid_image_files = select_all(image_files)
46 | Array[String] valid_log_files = select_all(log_files)
47 |
48 | command <<<
49 |
50 | echo "~{sep="\n" valid_image_files}" > image_list.txt
51 | echo "~{sep="\n" valid_log_files}" > log_list.txt
52 |
53 |         echo "<h3>Summary Statistics</h3><table><tr><th>ATAC</th><th></th></tr><tr><td>Total reads</td><td>" ~{atac_total_reads} "</td></tr>" > output.txt
54 |         echo "<tr><td>Aligned uniquely</td><td>" ~{atac_aligned_uniquely} "</td></tr>" >> output.txt
55 |         echo "<tr><td>Unaligned</td><td>" ~{atac_unaligned} "</td></tr>" >> output.txt
56 |         echo "<tr><td>Unique Reads</td><td>" ~{atac_feature_reads} "</td></tr>" >> output.txt
57 |         echo "<tr><td>Duplicate Reads</td><td>" ~{atac_duplicate_reads} "</td></tr>" >> output.txt
58 |         echo "<tr><td>Percent Duplicates</td><td>" ~{atac_percent_duplicates} "</td></tr>" >> output.txt
59 |         echo "<tr><th>RNA</th><th></th></tr><tr><td>Total reads</td><td>" ~{rna_total_reads} "</td></tr>" >> output.txt
60 |         echo "<tr><td>Aligned uniquely</td><td>" ~{rna_aligned_uniquely} "</td></tr>" >> output.txt
61 |         echo "<tr><td>Aligned multimap</td><td>" ~{rna_aligned_multimap} "</td></tr>" >> output.txt
62 |         echo "<tr><td>Unaligned</td><td>" ~{rna_unaligned} "</td></tr>" >> output.txt
63 |         echo "<tr><td>Filtered (feature) Reads</td><td>" ~{rna_feature_reads} "</td></tr>" >> output.txt
64 |         echo "<tr><td>Duplicate Reads</td><td>" ~{rna_duplicate_reads} "</td></tr>" >> output.txt
65 |         percent=$(( ~{default=0 rna_duplicate_reads}*100/~{default=1 rna_feature_reads} ))
66 |         echo "<tr><td>Percent Duplicates</td><td>" $percent "</td></tr></table>" >> output.txt
67 | PYTHONIOENCODING=utf-8 python3 /software/write_html.py ~{output_file} image_list.txt log_list.txt --input_file_name output.txt
68 | >>>
69 | output {
70 | File html_report_file = "~{output_file}"
71 | }
72 |
73 | runtime {
74 | docker: 'us.gcr.io/buenrostro-share-seq/share_task_html_report:v1.0.0'
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/python/filter_mito_reads.py:
--------------------------------------------------------------------------------
1 | # From Kundaje lab
2 | # https://github.com/kundajelab/ENCODE_scatac/blob/master/workflow/scripts/filter_mito.py
3 | #
4 | # Modified by: Eugenio Mattei
5 | # Affiliation: The Broad Institute of MIT and Harvard
6 | #
7 | # Changelog:
8 | # 2023/01/20: Now it returns the statistics per barcode
9 | #
10 |
11 | import argparse
12 | import pysam
13 | from collections import defaultdict
14 |
15 |
16 |
17 | def filter_mito(in_path, out_path, barcode_tag, cutoff, prefix, threads=1):
18 | """
19 | Removes mitochondrial alignments from BAM
20 | Calculates number of mapped mitochondrial and non-mitochondrial reads (not alignments)
21 | Assumes mitochondrial chromosome is "chrM"
22 | """
23 |
24 | infile = pysam.AlignmentFile(in_path, "rb", threads=threads)
25 | outfile = pysam.AlignmentFile(out_path, "wb", template=infile, threads=threads)
26 | outfile_bulk_metrics = f"{prefix}.mito.bulk-metrics.tsv"
27 | outfile_barcode_metrics = f"{prefix}.mito.bc-metrics.tsv"
28 |
29 | number_mito = 0
30 | number_non_mito = 0
31 |
32 | # Initializing the dictionary setting the counts for non-mito and mito.
33 | barcode_metrics = defaultdict(lambda: [0,0])
34 |
35 | for read in infile.fetch(until_eof=True,multiple_iterators=True):
36 | if read.reference_name == "chrM":
37 | if read.flag & 260 == 0: # Alignment is mapped and is primary
38 | number_mito += 1
39 | barcode_metrics[read.get_tag(barcode_tag)][1] += 1
40 |
41 | else:
42 | if read.flag & 260 == 0:
43 | number_non_mito += 1
44 | barcode_metrics[read.get_tag(barcode_tag)][0] += 1
45 | #outfile.write(read)
46 |
47 | # Write the summary metrics
48 | with open(outfile_bulk_metrics, "w") as fh:
49 | print("raw_reads_nonmito\traw_reads_mito", file = fh)
50 | print(f"{number_non_mito}\t{number_mito}", file = fh)
51 |
52 | # Write the metrics per barcode
53 | with open(outfile_barcode_metrics, "w") as fh:
54 | # Print header
55 | print("barcode\traw_reads_nonmito\traw_reads_mito", file = fh)
56 | for barcode,counts in barcode_metrics.items():
57 | print(f"{barcode}\t{counts[0]}\t{counts[1]}", file = fh)
58 |
59 | # Write a filtered bam
60 | for read in infile:
61 | if read.flag & 260 == 0 and read.reference_name != "chrM" and barcode_metrics[read.get_tag(barcode_tag)][0] > cutoff*2:
62 | outfile.write(read)
63 |
64 | outfile.close()
65 | return
66 |
67 |
68 |
69 | if __name__ == '__main__':
70 |
71 |     msg = "Remove mitochondrial alignments from a BAM file and report bulk and per-barcode mitochondrial read counts."
72 | parser = argparse.ArgumentParser(description = msg)
73 |
74 |     # Define command-line arguments
75 | parser.add_argument("bam", help = "Path to the coordinate-sorted bam file.")
76 | parser.add_argument("-o", "--output", help = "Path to the mitochondrial-free bam file.")
77 | parser.add_argument("-p", help = "Number of threads to use.", type=int, default=1)
78 | parser.add_argument("--prefix", help = "Prefix for the metrics output file.")
79 | parser.add_argument("--cutoff", help = "Remove barcodes with a number of fragments less than the cutoff.", type=int, default=1)
80 | parser.add_argument("--bc_tag", help = "Specify the tag containing the cell barcode.", default="CB")
81 |
82 | # Read arguments from command line
83 | args = parser.parse_args()
84 |
85 | if args.prefix:
86 | prefix = args.prefix
87 | else:
88 | prefix = args.bam[:-4]
89 |
90 | if args.output:
91 | out_path = args.output
92 | else:
93 | out_path = f"{prefix}.no_mito.bam"
94 |
95 | bc_tag = args.bc_tag
96 |
97 | filter_mito(args.bam, out_path, bc_tag, args.cutoff, prefix, threads=args.p)
98 |
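99 | # Usage sketch (hypothetical file names). --cutoff is expressed in fragments, so reads are kept only for
100 | # barcodes with more than 2*cutoff non-mitochondrial reads:
101 | #   python3 filter_mito_reads.py sample.sorted.bam -o sample.no_mito.bam -p 4 --prefix sample --cutoff 1 --bc_tag CB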
--------------------------------------------------------------------------------
/src/R/cell_annotation_helper_functions.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/Rscript
2 |
3 | ## ---------------------------
4 | ## Helper functions for cell annotation
5 | ## Author: Zhijian Li
6 | ## Date Created: 2023-05-29
7 | ## Email: lzj1769@gmail.com
8 | ## ---------------------------
9 | library(reticulate)
10 | use_python("/usr/bin/python3")
11 |
12 | read_h5ad <- function(
13 | filename,
14 | backed = NULL
15 | ) {
16 | python_anndata <- reticulate::import("anndata", convert = FALSE)
17 | filename <- normalizePath(filename, mustWork = FALSE)
18 | py_to_r_ifneedbe(python_anndata$read_h5ad(
19 | filename = filename,
20 | backed = backed
21 | ))
22 | }
23 |
24 | py_to_r_ifneedbe <- function(x) {
25 | if (inherits(x, "python.builtin.object")) {
26 | py_to_r(x)
27 | } else {
28 | x
29 | }
30 | }
31 |
32 | #' @name r-py-conversion
33 | #' @export
34 | py_to_r.pandas.core.indexes.base.Index <- function(x) {
35 | python_builtins <- reticulate::import_builtins()
36 | out <- python_builtins$list(x)
37 | attr(out, "name") <- py_to_r_ifneedbe(x$name)
38 | out
39 | }
40 |
41 | #' Convert between Python and R objects
42 | #'
43 | #' @param x A Python object.
44 | #' @param name A name
45 | #' @param value A value
46 | #'
47 | #' @return An \R object, as converted from the Python object.
48 | #'
49 | #' @name r-py-conversion
50 | #' @export
51 | `[[<-.collections.abc.MutableMapping` <- function(x, name, value) {
52 | if (!is.null(value)) {
53 | reticulate::py_set_item(x, name, value)
54 | } else if (name %in% x$keys()) {
55 | reticulate::py_del_item(x, name)
56 | }
57 | }
58 |
59 | #' @name r-py-conversion
60 | #' @export
61 | `[[.collections.abc.Mapping` <- function(x, name) {
62 | if (name %in% x$keys()) {
63 | py_to_r_ifneedbe(reticulate::py_get_item(x, name))
64 | } else {
65 | NULL
66 | }
67 | }
68 |
69 | #' @name r-py-conversion
70 | #' @export
71 | `[<-.collections.abc.MutableMapping` <- `[[<-.collections.abc.MutableMapping`
72 | #
73 | #' @name r-py-conversion
74 | #' @export
75 | `[.collections.abc.Mapping` <- `[[.collections.abc.Mapping`
76 | #
77 | #' @name r-py-conversion
78 | #' @export
79 | `names.collections.abc.Mapping` <- function(x) {
80 | python_builtins <- reticulate::import_builtins()
81 | python_builtins$list(x$keys())
82 | }
83 |
84 | #' @name r-py-conversion
85 | #' @export
86 | `py_to_r.collections.abc.Set` <- function(x) {
87 | python_builtins <- reticulate::import_builtins()
88 | python_builtins$list(x)
89 | }
90 |
91 | #' @name r-py-conversion
92 | #' @export
93 | py_to_r.pandas.core.indexes.base.Index <- function(x) {
94 | python_builtins <- reticulate::import_builtins()
95 | out <- python_builtins$list(x)
96 | attr(out, "name") <- py_to_r_ifneedbe(x$name)
97 | out
98 | }
99 |
100 | #' @name r-py-conversion
101 | #' @export
102 | py_to_r.collections.abc.KeysView <- function(x) {
103 | python_builtins <- reticulate::import_builtins()
104 | python_builtins$list(x)
105 | }
106 |
107 | #' @name r-py-conversion
108 | #' @export
109 | `py_to_r.collections.abc.Mapping` <- function(x) {
110 | python_builtins <- reticulate::import_builtins()
111 |
112 | x_list <- python_builtins$dict(x)
113 |
114 | # convert members of x_list if need be
115 | for (i in seq_along(x_list)) {
116 | if (inherits(x_list[[i]], "python.builtin.object")) {
117 | x_list[[i]] <- py_to_r_ifneedbe(x_list[[i]])
118 | }
119 | }
120 |
121 | x_list
122 | }
123 |
124 |
125 | #' @importFrom Matrix sparseMatrix
126 | py_to_r.scipy.sparse.csc.csc_matrix <- function(x) {
127 | Matrix::sparseMatrix(
128 | i = as.integer(py_to_r_ifneedbe(x$indices))+1,
129 | p = as.integer(py_to_r_ifneedbe(x$indptr)),
130 | x = as.vector(py_to_r_ifneedbe(x$data)),
131 | dims = as.integer(dim(x))
132 | )
133 | }
134 |
--------------------------------------------------------------------------------
/workflows/subwf-find-dorcs.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 |
4 | # Import the tasks called by the pipeline
5 | import "../tasks/dorcs_task_find_dorcs.wdl" as find_dorcs
6 |
7 | workflow wf_dorcs {
8 |
9 | meta {
10 | version: 'v0.1'
11 | author: 'Siddarth Wekhande (swekhand@broadinstitute.org)'
12 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Sub-workflow to find DORCs from SHARE-seq data.'
13 | }
14 |
15 | input {
16 | File? rna_matrix
17 | File? atac_fragments
18 | File peak_file
19 |
20 | String genome
21 | Int n_cores = 4
22 | String save_plots_to_dir = "TRUE"
23 | String? output_filename
24 |
25 | Int minFeature_RNA = 200
26 | Int maxFeature_RNA = 2500
27 | Float percentMT_RNA = 5
28 | Int minCells_RNA = 3
29 |
30 | Int dorcGeneCutOff = 10
31 | Float fripCutOff = 0.3
32 | Float corrPVal = 0.05
33 | Int topNGene = 20
34 | Int windowPadSize = 50000
35 |
36 | Int numNearestNeighbor = 30
37 | Float numBackgroundPairs = 100000
38 | Float chunkSize = 50000
39 |
40 | String? prefix
41 | Int mem_gb = 64
42 | Int disk_gb = 100
43 | String? docker
44 | }
45 |
46 | File rna_matrix_ = select_first([rna_matrix])
47 | File atac_fragments_ = select_first([atac_fragments])
48 |
49 | if ( !defined(rna_matrix) || !defined(atac_fragments) ){
50 | call raise_exception as missing_input {
51 | input:
52 |             msg = "The genes-by-cell matrix or the DNA fragments file is missing."
53 | }
54 | }
55 |
56 | call find_dorcs.find_dorcs as find_dorcs{
57 | input:
58 | rna_matrix = rna_matrix_,
59 | atac_fragments = atac_fragments_,
60 | peak_file = peak_file,
61 | genome = genome,
62 | n_cores = n_cores,
63 | save_plots_to_dir = save_plots_to_dir,
64 | output_filename = output_filename,
65 | minFeature_RNA = minFeature_RNA,
66 | maxFeature_RNA = maxFeature_RNA,
67 | percentMT_RNA = percentMT_RNA,
68 | minCells_RNA = minCells_RNA,
69 | dorcGeneCutOff = dorcGeneCutOff,
70 | fripCutOff = fripCutOff,
71 | corrPVal = corrPVal,
72 | topNGene = topNGene,
73 | windowPadSize = windowPadSize,
74 | numNearestNeighbor = numNearestNeighbor,
75 | numBackgroundPairs = numBackgroundPairs,
76 | chunkSize = chunkSize,
77 | mem_gb = mem_gb,
78 | disk_gb = disk_gb,
79 | docker_image = docker,
80 | prefix = prefix
81 | }
82 |
83 | output {
84 | File dorcs_notebook_output = find_dorcs.notebook_output
85 | File dorcs_notebook_log = find_dorcs.notebook_log
86 | File? seurat_violin_plot = find_dorcs.seurat_violin_plot
87 | File? j_plot = find_dorcs.j_plot
88 | File? plots_zip = find_dorcs.plots_zip
89 | File? dorcs_genes_summary = find_dorcs.dorcs_genes_summary
90 | File? dorcs_regions_summary = find_dorcs.dorcs_regions_summary
91 | }
92 |
93 | }
94 |
95 | # Task to report errors to user.
96 | # From https://github.com/ENCODE-DCC/chip-seq-pipeline2/blob/master/chip.wdl
97 | task raise_exception {
98 | input {
99 | String msg
100 | Array[String]? vals
101 | }
102 | command {
103 | echo -e "\n* Error: ${msg}\n" >&2
104 | echo -e "* Vals: ${sep=',' vals}\n" >&2
105 | exit 2
106 | }
107 | output {
108 | String error_msg = '${msg}'
109 | }
110 | runtime {
111 | maxRetries : 0
112 | cpu : 1
113 | memory : '2 GB'
114 | time : 1
115 | disks : 'local-disk 10 SSD'
116 | docker : 'encodedcc/chip-seq-pipeline:v2.2.1'
117 | }
118 | }
119 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_filter_atac.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM debian:buster-slim as builder
7 |
8 | ENV BEDTOOLS_VERSION v2.29.0
9 | ENV PICARD_VERSION 2.27.5
10 | ENV SAMTOOLS_VERSION 1.16
11 | ENV SAMBAMBA_VERSION 0.6.6
12 |
13 | # To prevent time zone prompt
14 | ENV DEBIAN_FRONTEND=noninteractive
15 |
16 | # Install software from apt repo
17 | RUN apt-get update && apt-get install -y \
18 | autoconf \
19 | automake \
20 | build-essential \
21 | git \
22 | libcurl4-openssl-dev \
23 | liblz4-dev \
24 | liblzma-dev \
25 | libncurses5-dev \
26 | libncursesw5-dev \
27 | libbz2-dev \
28 | perl \
29 | python \
30 | unzip \
31 | xz-utils \
32 | wget \
33 | zlib1g-dev &&\
34 | rm -rf /var/lib/apt/lists/*
35 |
36 | # Make directory for all software
37 | RUN mkdir /software
38 | WORKDIR /software
39 | ENV PATH="/software:${PATH}"
40 |
41 | # Install bedtools 2.29.0
42 | RUN git clone --branch ${BEDTOOLS_VERSION} --single-branch https://github.com/arq5x/bedtools2.git && \
43 | cd bedtools2 && make && make install && cd ../ && rm -rf bedtools2*
44 |
45 | # Install sambamba 0.6.6
46 | RUN wget https://github.com/lomereiter/sambamba/releases/download/v${SAMBAMBA_VERSION}/sambamba_v${SAMBAMBA_VERSION}_linux.tar.bz2 && \
47 | tar -xvjf sambamba_v${SAMBAMBA_VERSION}_linux.tar.bz2 && \
48 | mv sambamba_v${SAMBAMBA_VERSION} /usr/local/bin/sambamba && \
49 | rm -rf sambamba_*
50 |
51 | # Install samtools 1.16
52 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \
53 | cd htslib && git submodule update --init --recursive && autoreconf -i && make && make install && cd ../ && \
54 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \
55 | cd samtools && make && make install && cd ../ && rm -rf samtools* && rm -rf htslib*
56 |
57 |
58 | # Install Picard 2.27.5
59 | RUN wget https://github.com/broadinstitute/picard/releases/download/${PICARD_VERSION}/picard.jar && chmod +x picard.jar && mv picard.jar /usr/local/bin
60 |
61 |
62 |
63 | FROM debian:buster-slim
64 |
65 | LABEL maintainer = "Eugenio Mattei"
66 | LABEL software = "Share-seq pipeline"
67 | LABEL software.version="1.0.0"
68 | LABEL software.organization="Broad Institute of MIT and Harvard"
69 | LABEL software.version.is-production="Yes"
70 | LABEL software.task="filter"
71 |
72 | RUN apt-get update && apt-get install -y \
73 | gcc \
74 | libcurl4-openssl-dev \
75 | libbz2-dev \
76 | liblzma-dev \
77 | python3 \
78 | python3-dev \
79 | python3-pip \
80 | openjdk-11-jre \
81 | zlib1g-dev &&\
82 | rm -rf /var/lib/apt/lists/*
83 |
84 | # Install packages for python3 scripts
85 | RUN python3 -m pip install --upgrade pip
86 | RUN python3 -m pip install --no-cache-dir --ignore-installed pysam
87 |
88 | # Create and setup new user
89 | ENV USER=shareseq
90 | WORKDIR /home/$USER
91 |
92 | RUN groupadd -r $USER &&\
93 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
94 | chown $USER:$USER /home/$USER
95 |
96 | # Add folder with software to the path
97 | ENV PATH="/software:${PATH}"
98 |
99 | # Copy the compiled software from the builder
100 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/
101 | COPY --from=builder --chown=$USER:$USER /lib/x86_64-linux-gnu/* /lib/x86_64-linux-gnu/
102 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
103 | COPY --chown=$USER:$USER src/python/filter_mito_reads.py /usr/local/bin
104 | COPY --chown=$USER:$USER src/python/bam_to_fragments.py /usr/local/bin
105 | COPY --chown=$USER:$USER src/python/assign_multimappers.py /usr/local/bin
106 |
107 |
108 | USER ${USER}
109 |
--------------------------------------------------------------------------------
/dockerfiles/share_task_qc_atac.dockerfile:
--------------------------------------------------------------------------------
1 | ############################################################
2 | # Dockerfile for BROAD GRO share-seq-pipeline
3 | # Based on Debian slim
4 | ############################################################
5 |
6 | FROM debian:buster-slim as builder
7 |
8 | ENV SAMTOOLS_VERSION 1.9
9 | ENV BEDTOOLS_VERSION v2.29.0
10 | ENV PICARD_VERSION 2.27.5
11 |
12 | # To prevent time zone prompt
13 | ENV DEBIAN_FRONTEND=noninteractive
14 |
15 | # Install software from apt repo
16 | RUN apt-get update && apt-get install -y \
17 | autoconf \
18 | build-essential \
19 | git \
20 | libcurl4-openssl-dev \
21 | liblz4-dev \
22 | liblzma-dev \
23 | libncurses5-dev \
24 | libbz2-dev \
25 | python \
26 | unzip \
27 | wget \
28 | zlib1g-dev &&\
29 | rm -rf /var/lib/apt/lists/*
30 |
31 |
33 | # Make directory for all software
33 | RUN mkdir /software
34 | WORKDIR /software
35 | ENV PATH="/software:${PATH}"
36 |
37 | # Install bedtools 2.29.0
38 | RUN git clone --branch ${BEDTOOLS_VERSION} --single-branch https://github.com/arq5x/bedtools2.git && \
39 | cd bedtools2 && make && make install && cd ../ && rm -rf bedtools2*
40 |
41 | # Install samtools 1.9
42 | RUN git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/samtools.git && \
43 | git clone --branch ${SAMTOOLS_VERSION} --single-branch https://github.com/samtools/htslib.git && \
44 | cd samtools && make && make install && cd ../ && rm -rf samtools* && \
45 | cd htslib && autoreconf -i && make && make install && cd ../ && rm -rf htslib*
46 |
47 | # Install Picard 2.27.5
48 | RUN wget https://github.com/broadinstitute/picard/releases/download/${PICARD_VERSION}/picard.jar && chmod +x picard.jar && mv picard.jar /usr/local/bin
49 |
50 |
51 |
52 | FROM debian:buster-slim
53 |
54 | LABEL maintainer = "Eugenio Mattei"
55 | LABEL software = "Share-seq pipeline"
56 | LABEL software.version="1.0.0"
57 | LABEL software.organization="Broad Institute of MIT and Harvard"
58 | LABEL software.version.is-production="Yes"
59 | LABEL software.task="qc-atac"
60 |
61 | RUN apt-get update && apt-get install -y \
62 | gcc \
63 | git \
64 | python3 \
65 | python3-dev \
66 | python3-pip \
67 | openjdk-11-jre \
68 | r-base \
69 | zlib1g-dev &&\
70 | rm -rf /var/lib/apt/lists/*
71 |
72 | # Install packages for python3 scripts (pysam, SAMstats)
73 | RUN python3 -m pip install --upgrade pip
74 | RUN python3 -m pip install --no-cache-dir --ignore-installed numpy matplotlib pandas plotnine pysam --editable=git+https://github.com/kundajelab/SAMstats@75e60f1e67c6d5d066371a0b53729e4b1f6f76c5#egg=SAMstats
75 |
76 | # Create and setup new user
77 | ENV USER=shareseq
78 | WORKDIR /home/$USER
79 |
80 | RUN groupadd -r $USER &&\
81 | useradd -r -g $USER --home /home/$USER -s /sbin/nologin -c "Docker image user" $USER &&\
82 | chown $USER:$USER /home/$USER
83 |
84 | # Add folder with software to the path
85 | ENV PATH="/software:${PATH}"
86 |
87 | # Copy the compiled software from the builder
88 | COPY --from=builder --chown=$USER:$USER /usr/local/bin/* /usr/local/bin/
89 | COPY --from=builder --chown=$USER:$USER /lib/x86_64-linux-gnu/* /lib/x86_64-linux-gnu/
90 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
91 | COPY --chown=$USER:$USER src/python/pbc_stats.py /usr/local/bin
92 | COPY --chown=$USER:$USER src/python/qc_atac_compute_tss_enrichment.py /usr/local/bin
93 | COPY --chown=$USER:$USER src/python/qc_atac_count_duplicates_per_barcode.py /usr/local/bin
94 | COPY --chown=$USER:$USER src/python/qc_atac_compute_reads_in_peaks.py /usr/local/bin
95 | COPY --chown=$USER:$USER src/python/plot_insert_size_hist.py /usr/local/bin
96 | COPY --chown=$USER:$USER src/R/barcode_rank_functions.R /usr/local/bin
97 | COPY --chown=$USER:$USER src/R/atac_qc_plots.R /usr/local/bin
98 | COPY --chown=$USER:$USER src/bash/monitor_script.sh /usr/local/bin
99 |
100 |
101 | USER ${USER}
102 |
103 |
104 |
--------------------------------------------------------------------------------
/tasks/dorcs_task_find_dorcs.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task find_dorcs {
4 | meta {
5 | version: 'v0.1'
6 | author: 'Siddarth Wekhande (swekhand@broadinstitute.org) at Broad Institute of MIT and Harvard'
7 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: find DORCs task'
8 | }
9 |
10 | input {
11 | #This task takes in the RNA and ATAC files and finds the DORCs based on the cut-off criteria provided
12 |
13 | #DORCs parameters
14 | File rna_matrix
15 | File atac_fragments
16 | File? peak_file
17 | String genome
18 | Int n_cores = 4
19 | String save_plots_to_dir = "TRUE"
20 | String prefix = "prefix"
21 |
22 | #RNA QC parameters
23 | Int minFeature_RNA = 200
24 | Int maxFeature_RNA = 2500
25 | Float percentMT_RNA = 5
26 | Int minCells_RNA = 3
27 |
28 | #ATAC QC parameter
29 | Float fripCutOff = 0.3
30 | Float chunkSize = 50000
31 |
32 | #Background correlation parameters
33 | Int numNearestNeighbor = 100
34 | Float numBackgroundPairs = 100000
35 |
36 | #DORC genes parameter
37 | # Regulatory region around TSS. Default is +/- 50Kb
38 | Int windowPadSize = 50000
39 | Int dorcGeneCutOff = 10
40 | Float corrPVal = 0.05
41 | Int topNGene = 20
42 |
43 | String output_filename = "${prefix}.dorcs.notebook.${genome}.ipynb"
44 | String docker_image = "us.gcr.io/buenrostro-share-seq/dorcs_task_find_dorcs:v1.0.0"
45 | #String docker_image = "swekhande/shareseq-prod:share-task-dorcs"
46 | Int mem_gb = 64
47 | Int disk_gb = 100
48 | }
49 |
50 | #Output filepaths
51 |
52 | String violin_plot = '${prefix}.dorcs.plots.${genome}/${prefix}.dorcs.rna_violin_plot.${genome}.png'
53 | String jplot = '${prefix}.dorcs.plots.${genome}/${prefix}.dorcs.jplot.${genome}.png'
54 | String dorc_genes_summ = '${prefix}.dorcs.dorc_genes_summary.${genome}.csv'
55 | String all_regions_summ = '${prefix}.dorcs.all_regions_summary.${genome}.csv'
56 | String plots_zip_dir = '${prefix}.dorcs.plots.${genome}.zip'
57 | #String papermill_log_filename = 'papermill.logfile.txt'
58 | String log_filename = "log/${prefix}.dorcs.logfile.${genome}.txt"
59 |
60 | command {
61 | gzip -dc ${atac_fragments} > tmp_fragments.bedpe
62 |
63 | papermill $(which dorcs_jplot_notebook.ipynb) ${output_filename} \
64 | -p rnaCountMatrix ${rna_matrix} \
65 | -p atacFragFile tmp_fragments.bedpe \
66 | -p peakFile ${peak_file} \
67 | -p savePlotsToDir ${save_plots_to_dir} \
68 | -p nCores ${n_cores} \
69 | -p genome ${genome} \
70 | -p minFeature_RNA ${minFeature_RNA} \
71 | -p maxFeature_RNA ${maxFeature_RNA} \
72 | -p percentMT_RNA ${percentMT_RNA} \
73 | -p minCells_RNA ${minCells_RNA} \
74 | -p dorcGeneCutOff ${dorcGeneCutOff} \
75 | -p fripCutOff ${fripCutOff} \
76 | -p corrPVal ${corrPVal} \
77 | -p topNGene ${topNGene} \
78 | -p windowPadSize ${windowPadSize} \
79 | -p numNearestNeighbor ${numNearestNeighbor} \
80 | -p numBackgroundPairs ${numBackgroundPairs} \
81 | -p chunkSize ${chunkSize} \
82 | -p prefix ${prefix}
83 | }
84 |
85 | output {
86 | File notebook_output = output_filename
87 | File notebook_log = log_filename
88 | #File papermill_log = papermill_log_filename
89 |
90 | File? seurat_violin_plot = violin_plot
91 | File? j_plot = jplot
92 | File? plots_zip = plots_zip_dir
93 |
94 | File? dorcs_genes_summary = dorc_genes_summ
95 | File? dorcs_regions_summary = all_regions_summ
96 |
97 |
98 | }
99 |
100 | runtime {
101 | cpu : 4
102 | memory : mem_gb+'G'
103 | docker : docker_image
104 | disks : 'local-disk ${disk_gb} LOCAL'
105 | maxRetries : 0
106 | }
107 | }
108 |
109 |
110 |
--------------------------------------------------------------------------------
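For orientation, a minimal sketch (assuming papermill is installed) of what the `papermill -p ...` invocation in the command block above does programmatically; the notebook names and parameter values here are illustrative placeholders, not the pipeline's actual defaults.

import papermill as pm

pm.execute_notebook(
    "dorcs_jplot_notebook.ipynb",        # template notebook shipped in the docker image
    "sample.dorcs.notebook.hg38.ipynb",  # executed copy, returned as the task output
    parameters=dict(
        rnaCountMatrix="rna.h5",
        atacFragFile="tmp_fragments.bedpe",
        peakFile="peaks.bed",
        genome="hg38",
        windowPadSize=50000,             # +/- 50 kb regulatory window around each TSS
        dorcGeneCutOff=10,
        corrPVal=0.05,
    ),
)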
/tasks/share_task_star.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-rna-STAR
5 |
6 | task share_rna_align {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: align RNA task'
11 | }
12 |
13 | input {
14 |         # This task takes as input the pre-processed fastq files and aligns them to the genome
15 | # using STAR.
16 |
17 | Array[File] fastq_R1
18 | Array[File]? fastq_R2
19 | File? genome_index_tar
20 | String genome_name
21 | String? prefix
22 | String docker_image = "docker.io/nchernia/share_task_star:1"
23 | Int cpus = 16
24 | }
25 | #Float input_file_size_gb = size(input[0], "G")
26 | Int samtools_cpus = 6
27 | Int samtools_mem_gb = 8
28 | Int mem_gb = 64
29 | Int disk_gb = 850
30 | #Int disk_gb = round(20.0 + 4 * input_file_size_gb)
31 |
32 | # Define the output names
33 | String sorted_bam = "${default="share-seq" prefix}.rna.align.${genome_name}.sorted.bam"
34 | String sorted_bai = "${default="share-seq" prefix}.rna.align.${genome_name}.sorted.bam.bai"
35 | String alignment_log = "${default="share-seq" prefix}.rna.align.${genome_name}.log"
36 |
37 | command {
38 | set -e
39 | # Untar the genome
40 | tar xvzf ${genome_index_tar} --no-same-owner -C ./
41 |
42 | mkdir out
43 |
44 | $(which STAR) \
45 | --runThreadN ${cpus} \
46 | --chimOutType WithinBAM \
47 | --genomeDir ./ \
48 | --readFilesIn ${sep=',' fastq_R1} ${sep=',' fastq_R2} \
49 | --outFileNamePrefix out/${default="share-seq" prefix}.rna.align.${genome_name}. \
50 | --outFilterMultimapNmax 20 \
51 | --outFilterScoreMinOverLread 0.3 \
52 | --outFilterMatchNminOverLread 0.3 \
53 | --outSAMattributes NH HI AS nM MD \
54 | --limitOutSJcollapsed 2000000 \
55 | --outSAMtype BAM Unsorted \
56 | --limitIObufferSize 400000000 400000000 \
57 | --outReadsUnmapped Fastx \
58 | --readFilesCommand zcat
59 |
60 | $(which samtools) sort \
61 | -@ ${samtools_cpus} \
62 | -m ${samtools_mem_gb}G \
63 | -o out/${sorted_bam} \
64 | out/${default="share-seq" prefix}.rna.align.${genome_name}.Aligned.out.bam
65 |
66 | $(which samtools) index \
67 | -@ ${cpus} \
68 | out/${sorted_bam}
69 | }
70 |
71 | output {
72 | File rna_alignment = "out/${sorted_bam}"
73 | File rna_alignment_index = "out/${sorted_bai}"
74 | File rna_alignment_log = glob('out/*.Log.final.out')[0]
75 | }
76 |
77 | runtime {
78 | cpu : cpus
79 | memory : mem_gb+'G'
80 | disks : 'local-disk ${disk_gb} SSD'
81 | maxRetries: 0
82 | docker: docker_image
83 | }
84 |
85 | parameter_meta {
86 | fastq_R1: {
87 | description: 'Read1 fastq',
88 | help: 'Processed fastq for read1.',
89 | example: 'processed.atac.R1.fq.gz'
90 | }
91 | genome_index_tar: {
92 | description: 'STAR indexes',
93 | help: 'Index files for STAR to use during alignment in tar.gz.',
94 | example: ['']
95 | }
96 | genome_name: {
97 | description: 'Reference name',
98 | help: 'The name of the reference genome used by the aligner.',
99 | example: ['hg38', 'mm10', 'both']
100 | }
101 | prefix: {
102 | description: 'Prefix for output files',
103 | help: 'Prefix that will be used to name the output files',
104 | example: 'MyExperiment'
105 | }
106 | cpus: {
107 | description: 'Number of cpus',
108 |                 help: 'Set the number of cpus used by STAR',
109 | example: '4'
110 | }
111 | docker_image: {
112 | description: 'Docker image.',
113 | help: 'Docker image for preprocessing step. Dependencies: STAR',
114 | example: ['put link to gcr or dockerhub']
115 | }
116 | }
117 | }
118 |
--------------------------------------------------------------------------------
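The sort-and-index step after STAR corresponds to the samtools calls in the command block above. As a minimal sketch, the same operations can be done with pysam's samtools wrappers (the file names below are placeholders):

import pysam

unsorted_bam = "sample.rna.align.hg38.Aligned.out.bam"
sorted_bam = "sample.rna.align.hg38.sorted.bam"

# Equivalent of `samtools sort -@ 6 -m 8G -o <sorted> <unsorted>`
pysam.sort("-@", "6", "-m", "8G", "-o", sorted_bam, unsorted_bam)

# Equivalent of `samtools index <sorted>`
pysam.index(sorted_bam)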
/src/python/qc_atac_compute_reads_in_peaks.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Author: Eugenio Mattei, Broad Institute of MIT and Harvard
4 | # modified from Jason Buenrostro's tool
5 |
6 | import argparse
7 | import os
8 | import pysam
9 | from collections import Counter
10 | from collections import defaultdict
11 | import numpy as np
12 |
13 | #import os
14 | #import sys
15 | import matplotlib
16 | matplotlib.use('Agg')
17 | import matplotlib.pyplot as plt
18 | #from multiprocessing import Pool
19 |
20 |
21 | ##### DEFINE FUNCTIONS #####
22 | def count_fragments_in_peaks(tabix_filename,
23 | peaks_list,
24 | mapq_threshold = 30):
25 | """
26 |     This function counts the per-barcode number of reads in peak regions.
27 |
28 | Parameters
29 | ----------
30 | tabix_filename : str
31 | Path to the tabix file containing the fragments.
32 | File needs to be coordinate-sorted and indexed.
33 | peaks_list : array
34 | Array containing the list of peaks to be included.
35 |         Each member of the array contains the following three elements:
36 |         Chr, Start, End
37 |     mapq_threshold : int
38 |         Keep only the reads with mapq score greater or equal.
39 |         default: 30
40 |         (The per-fragment barcode is taken from the fourth column of the
41 |         fragments file, so no BAM barcode tag is needed.)
42 |
43 | Returns
44 | -------
45 |
46 | Dictionary
47 | Key: Barcode
48 | Value: Number of fragments in peaks.
49 | """
50 | # To count the number of fragments in peaks
51 | reads_in_peaks_counter = defaultdict(set)
52 | fragments_in_peaks_counter = defaultdict(set)
53 |
54 | tabixfile = pysam.TabixFile(tabix_filename)
55 |
56 | for peak in peaks_list:
57 | peak_chr = str(peak[0])
58 | peak_start = int(peak[1])
59 | peak_end = int(peak[2])
60 |
61 |         # Find all the fragments overlapping the peak.
62 | for fragment in tabixfile.fetch(peak_chr, peak_start, peak_end):
63 | fragment_fields = fragment.split("\t")
64 |
65 | fragment_contig = fragment_fields[0]
66 | fragment_start = int(fragment_fields[1])
67 | fragment_end = int(fragment_fields[2])
68 | barcode = fragment_fields[3]
69 |
70 | fragment_id = "-".join(fragment_fields)
71 | fragments_in_peaks_counter[barcode].add(fragment_id)
72 |
73 | # Increment the counter for the specific barcode.
74 | if fragment_start >= peak_start and fragment_start <= peak_end-1:
75 | reads_in_peaks_counter[barcode].add(fragment_id+"start")
76 |
77 | if fragment_end >= peak_start and fragment_end <= peak_end-1:
78 | reads_in_peaks_counter[barcode].add(fragment_id+"end")
79 |
80 | return reads_in_peaks_counter, fragments_in_peaks_counter
81 |
82 |
83 | if __name__ == '__main__':
84 |
85 | #args = _parse_sanitize_cmdline_arguments()
86 |
87 | msg = "Add the description"
88 | parser = argparse.ArgumentParser(description = msg)
89 |
90 | # Adding optional argument
91 | parser.add_argument("tabix", help= "Fragments file in tabix format and indexed.")
92 | parser.add_argument("--prefix", help = "Prefix for the metrics output fil.")
93 | parser.add_argument("--peaks", help= "Peaks bed file")
94 |
95 | # Read arguments from command line
96 | args = parser.parse_args()
97 |
98 | if args.prefix:
99 | prefix = args.prefix
100 | else:
101 |         prefix = args.tabix[:-4]
102 |
103 | # It is extremely fast. Don't think we need parallel processing.
104 | #cpus = len(os.sched_getaffinity(0))/2
105 |     # Use the chr, start, and end columns of the peaks file; strand information is not needed here.
106 | peaks_list = np.loadtxt(args.peaks, 'str', usecols = (0,1,2))
107 |
108 | reads_in_peaks, fragments_in_peaks = count_fragments_in_peaks(args.tabix,
109 | peaks_list
110 | )
111 | output_fnp = f"{prefix}.reads.in.peak.tsv"
112 |
113 | with open(output_fnp,"w") as out_file:
114 | print(f"barcode\treads_peaks\tfragment_peaks", file=out_file)
115 | for barcode,fragments_in_peak in fragments_in_peaks.items():
116 | print(f"{barcode}\t{len(reads_in_peaks[barcode])}\t{len(fragments_in_peak)}", file=out_file)
117 |
--------------------------------------------------------------------------------
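The script above expects a coordinate-sorted, bgzip-compressed, tabix-indexed fragments file. A minimal sketch (assuming pysam is installed; file names and coordinates are placeholders) of preparing and querying such a file:

import pysam

# Compress and index a sorted BED-like fragments file (chr, start, end, barcode).
# preset="bed" tells tabix which columns hold the coordinates.
indexed = pysam.tabix_index("sample.fragments.sorted.tsv", preset="bed", force=True)

# Fragments overlapping any interval can then be fetched, as the script does per peak.
tbx = pysam.TabixFile(indexed)
for fragment in tbx.fetch("chr1", 10000, 10500):
    print(fragment.split("\t")[3])  # barcode column
tbx.close()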
/src/python/infer_barcodes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # This script is used to infer molecular barcodes
4 | # from raw sequencing BCL data.
5 | #
6 | # It requires running Picard ExtractIlluminaBarcodes with BARCODE=N,
7 | # to extract all barcodes into *_barcode.txt.gz files first.
8 |
9 | import glob
10 | import gzip
11 | import sys
12 |
13 | [
14 | _name,
15 | multiplex_params_file,
16 | candidate_molecular_barcodes_file,
17 | barcode_matches_file,
18 | ] = sys.argv
19 |
20 | YIELD_THRESHOLD = 0.1
21 | MIN_READ_COUNT = 1e6
22 |
23 |
24 | def parse_barcodes(file_path):
25 | with open(file_path) as f:
26 | barcodes = {}
27 | for row in f.readlines():
28 | row = row.strip().split('\t')
29 | copa = row[0]
30 | barcode = ''.join(row[1:])
31 | barcodes[barcode] = copa
32 | return barcodes
33 |
34 |
35 | copa_barcodes = parse_barcodes(multiplex_params_file)
36 | molecular_barcodes = parse_barcodes(candidate_molecular_barcodes_file)
37 |
38 | # count each unique barcode combination
39 | counts = {}
40 | for extracted in glob.glob('*_barcode.txt.gz'):
41 | with gzip.open(extracted, 'rt') as f:
42 | for row in f.readlines():
43 | barcode = row.split('\t')[0]
44 | if barcode in counts:
45 | counts[barcode] += 1
46 | else:
47 | counts[barcode] = 1
48 |
49 | # add any missing barcodes from the list of CoPAs
50 | for barcode in copa_barcodes.keys():
51 | if barcode not in counts:
52 | counts[barcode] = 0
53 |
54 |
55 | def distance(b1, b2):
56 | return sum(c1 != c2 for c1, c2 in zip(b1, b2))
57 |
58 |
59 | COPA_UNDEFINED = 'UNDEFINED'
60 |
61 | # match barcodes to candidates
62 | results = {}
63 | molecular_barcode_len = len(next(iter(molecular_barcodes)))
64 | for barcode, count in counts.items():
65 | molecular_barcode_matched = False
66 | molecular_barcode_match = barcode[:molecular_barcode_len]
67 | molecular_barcode_match_name = molecular_barcode_match
68 |
69 | if molecular_barcode_match in molecular_barcodes:
70 | molecular_barcode_matched = True
71 | molecular_barcode_match_name = molecular_barcodes[molecular_barcode_match]
72 |
73 | barcode_match = molecular_barcode_match
74 | copa = copa_barcodes[barcode_match] if barcode_match in copa_barcodes else COPA_UNDEFINED
75 | if barcode_match in results:
76 | results[barcode_match]['Count'] += count
77 | else:
78 | results[barcode_match] = {
79 | 'CoPA': copa,
80 | 'Molecular Barcode': molecular_barcode_match_name,
81 | 'Count': count,
82 | 'Matched': molecular_barcode_matched
83 | }
84 |
85 | # show barcodes that correspond to a CoPA or have a matched
86 | # barcode at the top of the output file, otherwise
87 | # sort by count
88 | results = sorted(
89 | results.values(),
90 | key=lambda r: (r['CoPA'], int(not r['Matched']), -r['Count'])
91 | )
92 |
93 | # calculate % of average yield
94 | total_yield = 0
95 | copa_count = 0
96 | for r in results:
97 | if r['CoPA'] != COPA_UNDEFINED:
98 | total_yield += r['Count']
99 | copa_count += 1
100 | avg_yield = total_yield / copa_count if copa_count else None
101 | for r in results:
102 | percent_avg_yield = ''
103 | if r['CoPA'] != COPA_UNDEFINED:
104 | percent_avg_yield = '{:.2f}%'.format(
105 | 100 * r['Count'] / avg_yield) if avg_yield else 0
106 | r['Percent of average'] = percent_avg_yield
107 |
108 | # report results as a TSV
109 | with open(barcode_matches_file, 'w') as f:
110 | header = (
111 | 'CoPA', 'Molecular Barcode',
112 | 'Count', 'Percent of average',
113 | )
114 | print('\t'.join(header), file=f)
115 |
116 | # print CoPA matches, barcode matches, and the top barcodes
117 | # with the highest read count
118 | for r in results:
119 | if (
120 | r['CoPA'] != COPA_UNDEFINED or
121 | r['Matched'] or
122 | r['Count'] >= MIN_READ_COUNT
123 | ):
124 | print('\t'.join((str(r[col]) for col in header)), file=f)
125 |
126 | # fail the task (and the workflow) for low yield
127 | if not avg_yield:
128 | raise Exception('None of the candidate barcodes matched any CoPAs!')
129 | failed_copas = []
130 | for r in results:
131 | if (r['CoPA'] != COPA_UNDEFINED and
132 | float(r['Percent of average'].replace('%', '')) < YIELD_THRESHOLD):
133 | failed_copas.append(r['CoPA'])
134 | failed_copas = ', '.join(failed_copas)
135 | if failed_copas:
136 | raise Exception(
137 | f'Found CoPA(s) with < {YIELD_THRESHOLD}% yield: {failed_copas}')
--------------------------------------------------------------------------------
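The `distance` helper above computes a Hamming distance between equal-length barcodes. A minimal, self-contained sketch of how such a helper is typically used to rescue near-miss barcodes (the candidate list and observed sequence are illustrative only):

def distance(b1, b2):
    # Number of mismatched positions between two equal-length barcodes.
    return sum(c1 != c2 for c1, c2 in zip(b1, b2))

candidates = {"ACGTACGT": "sample_A", "TTGGCCAA": "sample_B"}
observed = "ACGTACGA"  # one mismatch away from sample_A's barcode

matches = [name for bc, name in candidates.items() if distance(observed, bc) <= 1]
print(matches)  # ['sample_A']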
/tasks/share_task_cell_annotation.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task cell_annotation {
4 | meta {
5 | version: 'v0.1'
6 | author: 'Zhijian Li'
7 | affiliation: 'Broad Institute of MIT and Harvard'
8 | email: 'lizhijia@broadinstitute.org'
9 | description: 'SHARE-Seq pipeline: cell type annotation using RNA-seq data.'
10 | }
11 |
12 | input {
13 | # Sample or project name
14 | String? prefix = "prefix"
15 |
16 | # Reference genome
17 | String genome
18 |
19 | # Reference data name and id
20 | String reference_data_id
21 | String reference_data_name
22 | String reference_label
23 |
24 | # Query data
25 | File query_data
26 |
27 | String? gene_id_to_symbol
28 |
29 | # Docker image
30 | String? docker_image
31 |
32 | # Runtime parameter
33 | Float? memory_factor
34 | Float? disk_factor
35 | }
36 |
37 |     # Determine the size of the input
38 |     Float input_file_size_gb = size(query_data, "G")
39 | 
40 |     # Determining memory size based on the size of the input files.
41 |     Float mem_gb = 64.0 + memory_factor * input_file_size_gb
42 | 
43 |     # Determining disk size based on the size of the input files.
44 |     Int disk_gb = round(disk_factor * input_file_size_gb)
45 | 
46 |     # Determining disk type based on the size of the disk.
47 |     String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
48 |
49 | #Output files
50 | String reference_h5ad = "${reference_data_name}.h5ad"
51 | String monitor_log = "cell_annotation_monitor.log"
52 | String notebook_log = "log/${prefix}.cell.annotation.logfile.${genome}.txt"
53 | String prediction = "${prefix}.cell.annotation.prediction.${genome}.csv"
54 | String prediction_labels = "${prefix}.cell.annotation.labels.${genome}.png"
55 | String prediction_scores = "${prefix}.cell.annotation.scores.${genome}.pdf"
56 |
57 | command {
58 | set -e
59 |
60 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 &
61 |
62 | # Download data from cellxgene
63 | python3 $(which get_cellxgene_data.py) \
64 | --id ${reference_data_id} \
65 | --out ${reference_data_name}
66 |
67 |
68 | # Perform cell annotation
69 | Rscript $(which cell_annotation.R) \
70 | --prefix ${prefix} \
71 | --reference_data_name ${reference_data_name} \
72 | --reference_label ${reference_label} \
73 | --query_data ${query_data} \
74 | --genome ${genome} \
75 | --gene_id_to_symbol ${gene_id_to_symbol}
76 |
77 | }
78 |
79 | output {
80 | File reference_h5ad = "${reference_h5ad}"
81 | File monitor_log = "${monitor_log}"
82 | File notebook_log = "${notebook_log}"
83 | File prediction = '${prediction}'
84 | File prediction_labels = '${prediction_labels}'
85 | File prediction_scores = '${prediction_scores}'
86 | }
87 |
88 | runtime {
89 | memory : "${mem_gb} GB"
90 | memory_retry_multiplier: 2
91 | disks: "local-disk ${disk_gb} ${disk_type}"
92 | docker : "${docker_image}"
93 | maxRetries:1
94 | }
95 |
96 | parameter_meta {
97 | reference_data_id: {
98 | description: 'Reference dataset id',
99 | help: 'The dataset id from cellxgene server.',
100 | examples: ['3bbb6cf9-72b9-41be-b568-656de6eb18b5']
101 | }
102 |
103 | reference_data_name: {
104 | description: 'Reference data',
105 | help: 'This file will be used as reference',
106 | examples: ['reference.h5ad']
107 | }
108 |
109 | query_data: {
110 | description: 'Query data',
111 | help: 'scRNA-seq data used as query',
112 | examples: ['put link to gcr']
113 | }
114 |
115 | genome: {
116 | description: 'Reference name',
117 | help: 'Reference genome.',
118 | examples: ['hg38', 'mm10', 'hg19', 'mm9']
119 | }
120 |
121 | prefix: {
122 | description: 'Project name',
123 | help: 'String used to name your project and associated file names',
124 | example: "shareseq"
125 | }
126 |
127 | docker_image: {
128 | description: 'Docker image.',
129 | help: 'Docker image for preprocessing step.',
130 | example: ['put link to gcr or dockerhub']
131 | }
132 |
133 | disk_factor: {
134 | description: 'Disk factor',
135 |                 help: 'Multiply this value by the input .h5 file size (GB) to determine disk space (GB)',
136 | example: 16.0
137 | }
138 |
139 | memory_factor: {
140 | description: 'Memory factor',
141 |                 help: 'Multiply this value by the input .h5 file size (GB) and add to the default 64GB memory to determine RAM (GB)',
142 | example: 1.0
143 | }
144 | }
145 | }
146 |
--------------------------------------------------------------------------------
/src/R/barcode_rank_functions.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/Rscript
2 |
3 | ## Define functions needed for plotting barcode rank
4 |
5 | # Helper function to get vectors on which to call the elbow_knee_finder.
6 | # Takes in xy values of the curve, outputs appropriate xy vectors to be passed to elbow_knee_finder.
7 | #
8 | # Function computes the second derivative of the curve, and uses the shape of the second
9 | # derivative curve to determine whether the curve has multiple "joints" (i.e. if knee should be found).
10 | # If the second derivative is uniformly positive or uniformly negative, the curve has a single "joint",
11 | # and so elbow_knee_finder can be called on the original input vectors.
12 | # Otherwise (multiple "joints"), find the zeroes of the second derivative to the left and right of the
13 | # absolute minimum of the second derivative.
14 | # These will be the endpoints of the elbow_knee_finder, so return the slices of the xy vectors
15 | # between these zeroes.
16 | get_vectors <- function(x, y){
17 | smooth_spline <- smooth.spline(x, y, spar=1)
18 | second_deriv <- predict(smooth_spline, x, deriv=2)
19 |
20 | # Second derivative values can be noisy at beginning and end of graph; exclude first 10% and last 10%
21 | # of values when establishing uniformity of second derivative sign
22 | ten_percent <- round(length(second_deriv$x)*0.1)
23 | mid_second_deriv <- second_deriv$y[(ten_percent+1):(length(second_deriv$y)-ten_percent)]
24 |
25 | if (all(mid_second_deriv >= 0) | all(mid_second_deriv <= 0)){
26 | print("Returning original vectors")
27 | return(list(x,y)) }
28 | else {
29 | # Find absolute minimum
30 | abs_min_idx <- second_deriv$x[which.min(second_deriv$y)]
31 | # Find last non-negative value before absolute minimum
32 | left_vect <- second_deriv$y[0:abs_min_idx]
33 | endpt_1_idx <- tail(which(left_vect >= 0), n=1)
34 |     # Find first non-negative value after absolute minimum
35 | right_vect <- second_deriv$y[abs_min_idx:length(second_deriv$y)]
36 | endpt_2_idx <- abs_min_idx + which(right_vect >= 0)[1] - 1
37 |
38 | # Error cases: revert to elbow finder
39 | # Used when second derivative curve has both positive and negative values,
40 | # but doesn't match positive-negative-positive shape expected of a knee's second derivative
41 | if (length(endpt_1_idx)==0 | length(endpt_2_idx)==0){
42 | print("Returning original vectors")
43 | return(list(x,y))
44 | } else if (is.na(endpt_1_idx) | is.na(endpt_2_idx)){
45 | print("Returning original vectors")
46 | return(list(x,y))
47 | } else {
48 | print("Returning sliced vectors")
49 | return(list(x[endpt_1_idx:endpt_2_idx], y[endpt_1_idx:endpt_2_idx]))
50 | }
51 | }
52 | }
53 |
54 | # Function to find the elbow or knee of a plot.
55 | # Takes in set of xy coordinates of the plot and mode, returns point which is farthest
56 | # from the line formed by the endpoints.
57 | # Basic mode (default) is used when the plot is known to have only one "joint",
58 | # whereas advanced mode is used when it is not known whether the function needs to find an
59 | # elbow or a knee.
60 | elbow_knee_finder <- function(x, y, mode="basic") {
61 | # With advanced mode, use helper function to determine which vectors to perform calculation on
62 | if (mode == "advanced") {
63 | # smooth.spline() function used in get_vectors() requires at least 4 unique
64 | # x values; preempt this error
65 | if (length(unique(x)) < 4) {
66 | return(NULL)
67 | } else {
68 | xy_vects <- get_vectors(x, y)
69 | x <- xy_vects[[1]]
70 | y <- xy_vects[[2]]
71 | }
72 | }
73 | # Error case: return null if vectors have length 0
74 | if (length(x)==0 | length(y)==0) {
75 | return(NULL)
76 | }
77 | # Get endpoints (point with smallest x value, point with largest x value)
78 | endpts_df <- data.frame(x_coords=c(x[1], x[length(x)]),
79 | y_coords=c(y[1], y[length(y)]))
80 | # Fit line between endpoints
81 | fit <- lm(endpts_df$y_coords ~ endpts_df$x_coords)
82 | # For each point, get distance from line
83 | distances <- numeric(length(x))
84 | for(i in 1:length(x)) {
85 | distances[i] <- abs(coef(fit)[2]*x[i] - y[i] + coef(fit)[1]) / sqrt(coef(fit)[2]^2 + 1^2)
86 | }
87 |
88 | # Get point farthest from line
89 | x_max_dist <- x[which.max(distances)]
90 | y_max_dist <- y[which.max(distances)]
91 |
92 | return(c(x_max_dist, y_max_dist))
93 | }
94 |
95 | # Function to find the elbow/knee of a plot, and the elbow/knee of the points
96 | # before the first elbow/knee (i.e. elbow/knee of all barcodes, and elbow/knee
97 | # of top-ranked barcodes).
98 | # Takes in xy coordinates of the plot and returns vector of four coordinates:
99 | # xy coordinates of first elbow/knee, and xy coordinates of second elbow/knee.
100 | get_elbow_knee_points <- function(x, y) {
101 |   point_1 <- elbow_knee_finder(x, y, mode="basic")
102 |   point_2 <- NULL
103 |   if (!is.null(point_1)) {
104 |     point_2 <- elbow_knee_finder(x[1:point_1[1]], y[1:point_1[1]], mode="advanced")
105 |   }
106 |   return(c(point_1, point_2))
107 | }
108 | 
--------------------------------------------------------------------------------
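As a minimal numpy sketch of the geometric idea behind `elbow_knee_finder` above (for illustration only; the pipeline uses the R implementation): fit the line through the curve's endpoints and return the point with the largest perpendicular distance from it.

import numpy as np

def find_elbow(x, y):
    x, y = np.asarray(x, float), np.asarray(y, float)
    if len(x) == 0:
        return None
    # Line through the two endpoints: y = slope * x + intercept
    slope = (y[-1] - y[0]) / (x[-1] - x[0])
    intercept = y[0] - slope * x[0]
    # Perpendicular distance of every point from that line
    distances = np.abs(slope * x - y + intercept) / np.sqrt(slope ** 2 + 1)
    i = int(np.argmax(distances))
    return x[i], y[i]

# Toy barcode-rank style curve with a sharp drop around rank 50
ranks = np.arange(1, 101)
counts = np.where(ranks <= 50, 1000 - ranks, 100 - ranks)
print(find_elbow(ranks, counts))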
/src/python/generate_h5_rna.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf8
3 |
4 | """
5 | This script takes in the STARsolo barcodes tsv file, features tsv file,
6 | and raw count matrix mtx file, and generates an h5 file containing the
7 | genes x barcodes count matrix.
8 | """
9 |
10 | import argparse
11 | from collections import defaultdict
12 | import gzip
13 | import h5py
14 | import logging
15 | from scipy.sparse import csc_matrix
16 |
17 | def parse_arguments():
18 | parser = argparse.ArgumentParser(description="Generate an h5 count matrix of genes x barcodes")
19 | parser.add_argument("matrix_file", help="Filename for STARsolo raw matrix mtx file")
20 | parser.add_argument("features_file", help="Filename for STARsolo features tsv file")
21 | parser.add_argument("barcodes_file", help="Filename for STARsolo barcodes tsv file")
22 | parser.add_argument("output_file", help="Filename for output h5 file")
23 | parser.add_argument("pkr", help="Experiment prefix", nargs = '?')
24 | parser.add_argument("--ensembl", help="Flag for outputting genes using ENSEMBL ID, rather than gene name", action="store_true")
25 |
26 | return parser.parse_args()
27 |
28 | def get_split_lines(file_name, delimiter, skip=0):
29 | """Read file contents and yield generator with line entries"""
30 | opener = gzip.open if file_name.endswith('.gz') else open
31 |
32 | with opener(file_name, "rt") as f:
33 | for i in range(skip):
34 | next(f)
35 | for line in f:
36 | yield line.rstrip().split(sep=delimiter)
37 |
38 | def rename_duplicates(duplicate_list):
39 | """Rename duplicate entries as entry, entry.1, entry.2, etc."""
40 | seen = defaultdict(int)
41 | renamed_list = []
42 |
43 | for entry in duplicate_list:
44 | renamed_list.append(f"{entry}.{seen[entry]}" if entry in seen else entry)
45 | seen[entry] += 1
46 |
47 | return renamed_list
48 |
49 | def build_count_matrix(matrix):
50 | """Convert contents of mtx file to csc matrix"""
51 | # first line of matrix contains dimensions
52 | dimensions = next(matrix)
53 | n_rows = int(dimensions[0])
54 | n_cols = int(dimensions[1])
55 |
56 | gene_indices = []
57 | barcode_indices = []
58 | counts = []
59 |
60 | for line in matrix:
61 | # subtract 1 from indices to convert to zero-based indexing
62 | gene_indices.append(int(line[0])-1)
63 | barcode_indices.append(int(line[1])-1)
64 | counts.append(int(line[2]))
65 |
66 | count_matrix = csc_matrix((counts, (gene_indices,barcode_indices)), shape=(n_rows,n_cols))
67 |
68 | return count_matrix
69 |
70 | def write_h5(output_file, count_matrix, barcode_list, gene_list):
71 | h5_file = h5py.File(output_file, "w")
72 |
73 | # create datasets expected for Seurat import
74 | g = h5_file.create_group("group")
75 | g.create_dataset("barcodes", data=barcode_list)
76 | g.create_dataset("data", data=count_matrix.data)
77 | g.create_dataset("gene_names", data=gene_list)
78 | g.create_dataset("genes", data=gene_list)
79 | g.create_dataset("indices", data=count_matrix.indices)
80 | g.create_dataset("indptr", data=count_matrix.indptr)
81 | g.create_dataset("shape", data=count_matrix.shape)
82 |
83 | h5_file.close()
84 |
85 | def main():
86 | # create log file
87 | logging.basicConfig(filename="generate_h5_rna.log", level=logging.INFO)
88 |
89 | # get arguments
90 | args = parse_arguments()
91 | matrix_file = getattr(args, "matrix_file")
92 | features_file = getattr(args, "features_file")
93 | barcodes_file = getattr(args, "barcodes_file")
94 | pkr = getattr(args, "pkr", None)
95 | output_file = getattr(args, "output_file")
96 | ensembl = getattr(args, "ensembl")
97 |
98 | # read input files
99 | logging.info("Reading input files\n")
100 |
101 | # get indices and counts from matrix file; skip first two lines of matrix file (header)
102 | matrix = get_split_lines(matrix_file, delimiter=" ", skip=2)
103 |
104 | # get genes from features file
105 | features = get_split_lines(features_file, delimiter="\t")
106 | if ensembl:
107 | gene_list = [line[0] for line in features]
108 | else:
109 | gene_list_duplicated = [line[1] for line in features]
110 | # append .1, .2, etc. for duplicated genes
111 | gene_list = rename_duplicates(gene_list_duplicated)
112 |
113 | # get barcodes from barcodes file, reformat as R1R2R3_PKR
114 | barcodes = get_split_lines(barcodes_file, delimiter="\t")
115 | barcode_list = [line[0] for line in barcodes]
116 | if pkr is None:
117 | formatted_barcode_list = barcode_list
118 | else:
119 | formatted_barcode_list = [barcode + "_" + pkr for barcode in barcode_list]
120 |
121 | # generate count matrix
122 | logging.info("Generating count matrix\n")
123 | count_matrix = build_count_matrix(matrix)
124 |
125 | # write h5 file
126 | logging.info(f"Writing to {output_file}.h5\n")
127 | write_h5(output_file, count_matrix, formatted_barcode_list, gene_list)
128 | logging.info("Finished writing h5 file\n")
129 |
130 | if __name__ == "__main__":
131 | main()
132 |
--------------------------------------------------------------------------------
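A minimal sketch (assuming scipy is installed) of the index conversion done in `build_count_matrix` above: MatrixMarket entries are 1-based (gene, barcode, count) triplets, while scipy expects 0-based indices. The three entries are illustrative.

from scipy.sparse import csc_matrix

mtx_entries = [(1, 1, 5), (3, 2, 2), (2, 2, 7)]  # gene, barcode, count (1-based)
genes = [g - 1 for g, _, _ in mtx_entries]       # convert to 0-based row indices
barcodes = [b - 1 for _, b, _ in mtx_entries]    # convert to 0-based column indices
counts = [c for _, _, c in mtx_entries]

m = csc_matrix((counts, (genes, barcodes)), shape=(3, 2))
print(m.toarray())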
/src/python/rna_barcode_metadata.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | This script takes in a bam file, and outputs a txt file containing the number of
5 | total reads, duplicate reads, UMIs, genes, and percent mitochondrial reads for each barcode.
6 | """
7 |
8 | import argparse
9 | import logging
10 | import pysam
11 | from collections import defaultdict
12 |
13 | logging.basicConfig(filename='barcode_metadata.log', encoding='utf-8', level=logging.DEBUG)
14 | logging.debug('Creating the barcode metadata for RNA from bam.')
15 |
16 | def parse_arguments():
17 | parser = argparse.ArgumentParser(description="Get total reads, duplicate reads, UMIs, genes, and percent mitochondrial reads for each barcode from bam file")
18 | parser.add_argument("bam_file", help="Filename for input bam file")
19 | parser.add_argument("bai_file", help="Filename for bam index file")
20 | parser.add_argument("barcode_metadata_file", help="Filename for output barcode metadata txt file")
21 | parser.add_argument("pkr", help="PKR id for shareseq", default = None, nargs='?')
22 | parser.add_argument("--barcode_tag", help="PKR id for shareseq", default="CB")
23 |
24 | return parser.parse_args()
25 |
26 | def get_metrics(bam, barcode_tag="CB", pkr=None):
27 | """
28 | Get barcode metrics from bam file; all counts are only for reads overlapping genes.
29 | Reported metrics are total counts, UMIs (one UMI counted per unique UMI-gene mapping),
30 | duplicate counts, genes, percent mitochondrial reads
31 | """
32 | total_counts = defaultdict(int)
33 | genes = defaultdict(set)
34 | umi_gene = defaultdict(set)
35 | mitochondrial_counts = defaultdict(int)
36 | barcodes = set()
37 | formatted_barcodes = {}
38 |
39 | for read in bam:
40 | try:
41 | # get barcode; skip read if not present
42 | barcode = read.get_tag(barcode_tag)
43 | if barcode == "-":
44 | #logging.warning(f"Skipping {read.qname} because the {barcode_tag} tag is empty") slowing down
45 | continue
46 |
47 | # get gene id; skip read if not present
48 | gene_id = read.get_tag("GX")
49 | if gene_id == "-":
50 | #logging.warning(f"Skipping {read.qname} because the GX tag is empty")
51 | continue
52 |
53 | # get UMI; skip read if not present
54 | umi = read.get_tag("UB")
55 | if umi == "-":
56 | #logging.warning(f"Skipping {read.qname} because the UB tag is empty")
57 | continue
58 |
59 | barcodes.add(barcode)
60 |
61 | total_counts[barcode] += 1
62 |
63 | genes[barcode].add(gene_id)
64 |
65 | umi_gene[barcode].add(umi + gene_id)
66 |
67 | if read.reference_name == "chrM":
68 | mitochondrial_counts[barcode] += 1
69 | except KeyError:
70 |             logging.error(f"Skipping {read.qname} because one of the tags {barcode_tag}, GX, or UB is missing.")
71 |
72 | # count unique genes per barcode
73 | genes_per_barcode = {barcode:len(gene_set) for (barcode, gene_set) in genes.items()}
74 |
75 | # count unique umi-gene mappings per barcode
76 | umis_per_barcode = {barcode:len(umi_gene_set) for (barcode, umi_gene_set) in umi_gene.items()}
77 |
78 | # create list with barcodes and associated metrics
79 | barcode_metadata = []
80 | for barcode in barcodes:
81 | total_val = str(total_counts[barcode])
82 | umi_val = str(umis_per_barcode.get(barcode, 0))
83 | duplicate_val = str(total_counts[barcode] - umis_per_barcode.get(barcode, 0))
84 | gene_val = str(genes_per_barcode.get(barcode, 0))
85 | mitochondrial_val = str(round(mitochondrial_counts.get(barcode, 0) / total_counts[barcode] * 100, 2))
86 | out_barcode = barcode + "_" + pkr if pkr else barcode
87 |
88 | metrics = [out_barcode, total_val, duplicate_val, umi_val, gene_val, mitochondrial_val]
89 |
90 | barcode_metadata.append(metrics)
91 |
92 | return barcode_metadata
93 |
94 | def write_metadata_file(barcode_metadata, output_file):
95 | fields = ["barcode", "total_counts", "duplicate_counts", "umis", "genes", "percent_mitochondrial"]
96 |
97 | with open(output_file, "w") as f:
98 | # write header
99 | f.write("\t".join(fields) + "\n")
100 | # write rows
101 | for metrics_list in barcode_metadata:
102 | f.write("\t".join(metrics_list[:]) + "\n")
103 |
104 | def main():
105 | # get arguments
106 | args = parse_arguments()
107 | bam_file = getattr(args, "bam_file")
108 | bai_file = getattr(args, "bai_file")
109 |
110 | pkr = getattr(args, "pkr")
111 | barcode_tag = getattr(args, "barcode_tag")
112 |
113 | barcode_metadata_file = getattr(args, "barcode_metadata_file")
114 |
115 | # load bam file
116 | bam = pysam.AlignmentFile(bam_file, "rb", index_filename=bai_file)
117 |
118 | # get metrics for each barcode
119 | barcode_metadata = get_metrics(bam, barcode_tag, pkr)
120 |
121 | # write txt file
122 | write_metadata_file(barcode_metadata, barcode_metadata_file)
123 |
124 | if __name__ == "__main__":
125 |
126 | main()
127 |
--------------------------------------------------------------------------------
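A minimal sketch (assuming pandas is available) of consuming the tab-separated barcode metadata written by `write_metadata_file` above; the file name and thresholds are illustrative.

import pandas as pd

meta = pd.read_csv("sample.barcode.metadata.tsv", sep="\t")

# Keep barcodes with at least 100 UMIs and under 20% mitochondrial reads.
passing = meta[(meta["umis"] >= 100) & (meta["percent_mitochondrial"] < 20)]
print(len(passing), "barcodes pass")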
/src/bash/monitor_script.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | declare -a TEMP=$(mktemp temp_monitoring.XXXXXXXX)
4 |
5 | if [[ -z "${BACKEND}" ]]; then
6 | backend=""
7 | else
8 | backend=${BACKEND}
9 | fi
10 |
11 | function get_disk_info() {
12 | # df command and cromwell root field
13 | if [ "$backend" = "aws" ]; then
14 | df | grep '/$'
15 | else
16 | df | grep cromwell_root
17 | fi
18 | }
19 |
20 | function get_disk_usage() {
21 | # get disk usage field
22 | get_disk_info | awk '{ print $5 }'
23 | }
24 |
25 | function get_mem_info() {
26 | # /proc/meminfo
27 | cat /proc/meminfo
28 | }
29 |
30 | function get_mem_available() {
31 | # mem unused from /proc/meminfo
32 | get_mem_info | grep MemAvailable | awk 'BEGIN { FS=" " } ; { print $2 }'
33 | }
34 |
35 | function get_mem_total() {
36 | # mem total from /proc/meminfo
37 | get_mem_info | grep MemTotal | awk 'BEGIN { FS=" " } ; { print $2 }'
38 | }
39 |
40 | function get_mem_usage() {
41 | # memTotal and memAvailable
42 | local -r mem_total=$(get_mem_total)
43 | local -r mem_available=$(get_mem_available)
44 |
45 | # usage = 100 * mem_used / mem_total
46 | local -r mem_used=$(($mem_total-$mem_available))
47 | echo "$mem_used" "$mem_total" "%"| awk '{ print 100*($1/$2)$3 }'
48 | }
49 |
50 | function get_cpu_info() {
51 | # cpu info from /proc/stat
52 | cat /proc/stat | grep "cpu "
53 | }
54 |
55 | function get_cpu_total() {
56 | # get the total cpu usage since a given time (including idle and iowait)
57 | # user+nice+system+idle+iowait+irq+softirq+steal
58 | get_cpu_info | awk 'BEGIN { FS=" " } ; { print $2+$3+$4+$5+$6+$7+$8+$9 }'
59 | }
60 |
61 | function get_cpu_used() {
62 | # get the cpu usage since a given time (w/o idle or iowait)
63 | # user+nice+system+irq+softirq+steal
64 | get_cpu_info | awk 'BEGIN { FS=" " } ; { print $2+$3+$4+$7+$8+$9 }'
65 | }
66 |
67 | function get_cpu_usage() {
68 | # get the cpu usage since a given time (w/o idle or iowait)
69 | # user+nice+system+irq+softirq+steal
70 | local -r cpu_used_cur=$(get_cpu_used)
71 |
72 | # get the total cpu usage since a given time (including idle and iowait)
73 | # user+nice+system+idle+iowait+irq+softirq+steal
74 | local -r cpu_total_cur=$(get_cpu_total)
75 |
76 | # read in previous cpu usage values
77 | read -r -a cpu_prev < ${TEMP}
78 | local -r cpu_used_prev=${cpu_prev[0]}
79 | local -r cpu_total_prev=${cpu_prev[1]}
80 |
81 | # save current values as prev values for next iteration
82 | cpu_prev[0]=$cpu_used_cur
83 | cpu_prev[1]=$cpu_total_cur
84 | echo "${cpu_prev[@]}" > ${TEMP}
85 |
86 | # usage = 100 * (cpu_used_cur - cpu_used_prev) / (cpu_total_cur-cpu_total_prev)
87 | echo "$cpu_used_cur" "$cpu_used_prev" "$cpu_total_cur" "$cpu_total_prev" "%"| awk 'BEGIN {FS=" "} ; { print 100*(($1-$2)/($3-$4))$5 }'
88 |
89 | }
90 |
91 | function print_usage() {
92 | echo [$(date)]
93 | echo \* CPU usage: "$(get_cpu_usage)"
94 | echo \* Memory usage: "$(get_mem_usage)"
95 | echo \* Disk usage: $(get_disk_usage)
96 | }
97 |
98 | function print_summary() {
99 | # display header information
100 | echo ==================================
101 | echo =========== MONITORING ===========
102 | echo ==================================
103 |
104 | # summary info
105 | echo --- General Information ---
106 | # number of cores
107 | echo \#CPU: $(nproc)
108 | # multiply by 10^-6 to convert KB to GB
109 | echo Total Memory: $(echo $(get_mem_total) 1000000 | awk '{ print $1/$2 }')G
110 |
111 | if [ "$backend" = "aws" ]; then
112 | echo Total Disk space: $(df -h | grep '/$' | awk '{ print $2 }')
113 | else
114 | echo Total Disk space: $(df -h | grep cromwell_root | awk '{ print $2}')
115 | fi
116 | }
117 |
118 | function main() {
119 |     # disk, mem and cpu general statistics
120 | print_summary
121 |
122 | # create variable to store cpu being used (cpu_prev[0]) and total cpu total (cpu_prev[1])
123 | # save variable to a temp file to allow passing in values to a function
124 | declare -a cpu_prev
125 | cpu_prev[0]=$(get_cpu_used)
126 | cpu_prev[1]=$(get_cpu_total)
127 | # save global values to temp file to allow passing in values to a function
128 | echo "${cpu_prev[@]}" > ${TEMP}
129 |
130 |     # sleep b/w getting usage and initially storing the cpu_previous usage values
131 | # this is b/c cpu usage values are time dependent
132 | # to calculate cpu usage, values must be determined from 2 diff time stamps
133 | if [ -z "$MONITOR_SCRIPT_SLEEP" ]; then
134 | MONITOR_SCRIPT_SLEEP=30
135 | fi
136 | # get usage of disk, cpu and mem every MONITOR_SCRIPT_SLEEP sec
137 | echo
138 | echo --- Runtime Information ---
139 |
140 | sleep "$MONITOR_SCRIPT_SLEEP";
141 | while true; do print_usage; sleep "$MONITOR_SCRIPT_SLEEP"; done
142 | }
143 |
144 | main
145 |
--------------------------------------------------------------------------------
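A minimal Python sketch of the CPU-usage calculation in `get_cpu_usage` above (Linux only, since it reads /proc/stat): sample the aggregate cpu line twice and compare the used and total deltas.

import time

def read_cpu():
    with open("/proc/stat") as f:
        fields = [int(v) for v in f.readline().split()[1:]]
    # fields: user nice system idle iowait irq softirq steal ...
    total = sum(fields[:8])
    used = total - fields[3] - fields[4]  # exclude idle and iowait
    return used, total

used_prev, total_prev = read_cpu()
time.sleep(1)
used_cur, total_cur = read_cpu()

print(f"CPU usage: {100 * (used_cur - used_prev) / (total_cur - total_prev):.1f}%")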
/tasks/share_task_qc_rna.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-qc-rna
5 |
6 | task qc_rna {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Mei Knudson (mknudson@broadinstitute.org) at Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: QC RNA task'
11 | }
12 |
13 | input {
14 |         # This task takes as input the sorted bam file produced by STARsolo
15 | File bam
16 | Int? umi_cutoff = 100
17 | Int? gene_cutoff = 100
18 | String genome_name
19 | String? barcode_tag = "CB"
20 | String? pkr
21 | String? prefix
22 |
23 | Int? cpus = 16
24 | Float? disk_factor = 1.0
25 | Float? memory_factor = 1.5
26 | String docker_image = "us.gcr.io/buenrostro-share-seq/share_task_qc_rna:v1.0.0"
27 | }
28 |
29 | # Determine the size of the input
30 | Float input_file_size_gb = size(bam, "G")
31 |
32 | # Determining memory size based on the size of the input files.
33 | Float mem_gb = 5.0 + memory_factor * input_file_size_gb
34 |
35 | # Determining disk size based on the size of the input files.
36 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb)
37 |
38 | # Determining disk type based on the size of disk.
39 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
40 |
41 | String assay = "RNA"
42 | String bai = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.bam.bai"
43 | String barcode_metadata = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.barcode.metadata.tsv"
44 | String duplicates_log = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.duplicates.log.txt"
45 | String umi_barcode_rank_plot = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.umi.barcode.rank.plot.png"
46 | String gene_barcode_rank_plot = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.gene.barcode.rank.plot.png"
47 | String gene_umi_scatter_plot = "~{default="share-seq" prefix}.qc.rna.~{genome_name}.gene.umi.scatter.plot.png"
48 | String monitor_log = "monitor.log"
49 |
50 | command <<<
51 | set -e
52 |
53 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 &
54 |
55 | # Index bam file
56 | samtools index -@ ~{cpus} ~{bam} ~{bai}
57 |
58 | # Extract barcode metadata (total counts, unique counts, duplicate counts, genes, percent mitochondrial) from bam file
59 | python3 $(which rna_barcode_metadata.py) ~{bam} \
60 | ~{bai} \
61 | ~{barcode_metadata} \
62 | ~{pkr} ~{"--barcode_tag " + barcode_tag}
63 |
64 | awk '{total+=$2; duplicate+=$3; unique+=$4} END {print "total reads:", total; print "unique reads:", unique; print "duplicate reads:", duplicate}' ~{barcode_metadata} > ~{duplicates_log}
65 |
66 | # Make QC plots
67 | Rscript $(which rna_qc_plots.R) ~{barcode_metadata} ~{umi_cutoff} ~{gene_cutoff} ~{umi_barcode_rank_plot} ~{gene_barcode_rank_plot} ~{gene_umi_scatter_plot}
68 | >>>
69 |
70 | output {
71 | File rna_barcode_metadata = "~{barcode_metadata}"
72 | File rna_duplicates_log = "~{duplicates_log}"
73 | File rna_barcode_metadata_log = "barcode_metadata.log"
74 | File? rna_umi_barcode_rank_plot = "~{umi_barcode_rank_plot}"
75 | File? rna_gene_barcode_rank_plot = "~{gene_barcode_rank_plot}"
76 | File? rna_gene_umi_scatter_plot = "~{gene_umi_scatter_plot}"
77 | }
78 |
79 | runtime {
80 | cpu : cpus
81 | memory : "~{mem_gb} GB"
82 | disks: "local-disk ~{disk_gb} ~{disk_type}"
83 | docker : "${docker_image}"
84 | }
85 |
86 | parameter_meta {
87 | bam: {
88 | description: 'Alignment bam file',
89 | help: 'Aligned reads in bam format.',
90 | example: 'hg38.aligned.bam'
91 | }
92 | umi_cutoff: {
93 | description: 'UMI cutoff',
94 | help: 'Cutoff for number of UMIs required when making UMI barcode rank plot.',
95 | example: 10
96 | }
97 | gene_cutoff: {
98 | description: 'Gene cutoff',
99 | help: 'Cutoff for number of genes required when making gene barcode rank plot.',
100 | example: 10
101 | }
102 | pkr: {
103 | description: 'Experiment pkr',
104 | help: 'Id of the sample pkr (share-seq specific).',
105 | examples: ['SS-PKR-000']
106 | }
107 | genome_name: {
108 | description: 'Reference name',
109 |                 help: 'The name of the genome reference used to align.',
110 | example: ['hg38', 'mm10', 'hg19', 'mm9']
111 | }
112 | prefix: {
113 | description: 'Prefix for output files',
114 | help: 'Prefix that will be used to name the output files',
115 | example: 'MyExperiment'
116 | }
117 | docker_image: {
118 | description: 'Docker image.',
119 | help: 'Docker image for preprocessing step. Dependencies: samtools',
120 | example: ['put link to gcr or dockerhub']
121 | }
122 | }
123 | }
124 |
--------------------------------------------------------------------------------
/tasks/share_task_joint_qc.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-joint-qc-plotting
5 |
6 |
7 | task joint_qc_plotting {
8 | meta {
9 | version: 'v0.1'
10 | author: 'Mei Knudson (mknudson@broadinstitute.org) at Broad Institute of MIT and Harvard'
11 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: Joint QC plot'
12 | }
13 |
14 | input {
15 | # This task generates a plot of barcodes QC'd jointly by RNA and ATAC metrics, as well as a
16 | # density plot of all barcodes passing at least one filter.
17 | File? atac_barcode_metadata
18 | File? rna_barcode_metadata
19 | Int remove_low_yielding_cells = 10
20 | Int min_umis = 100
21 | Int min_genes = 200
22 | Int min_tss = 4
23 | Int min_frags = 100
24 |
25 | Float? disk_factor = 8.0
26 | Float? memory_factor = 2.0
27 |
28 | String? prefix
29 | String genome_name
30 |
31 | String docker_image = "us.gcr.io/buenrostro-share-seq/share_task_joint_qc:v1.0.0"
32 | }
33 |
34 | # Determine the size of the input
35 | Float input_file_size_gb = size(atac_barcode_metadata, "G") + size(rna_barcode_metadata, "G")
36 |
37 | # Determine memory size based on the size of the input files
38 | Float mem_gb = 5.0 + memory_factor * input_file_size_gb
39 |
40 | # Determine disk size based on the size of the input files
41 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb)
42 |
43 |     # Determining disk type based on the size of the disk.
44 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
45 |
46 | String joint_qc_plot = '${default="share-seq" prefix}.${genome_name}.joint.qc.plot.png'
47 | String joint_density_plot = '${default="share-seq" prefix}.${genome_name}.joint.density.plot.png'
48 | String joint_barcode_metadata = '${default="share-seq" prefix}.joint.barcode.metadata.${genome_name}.csv'
49 |
50 | command {
51 | set -e
52 |
53 | bash $(which monitor_script.sh) > monitoring.log &
54 |
55 | # Make joint qc plot
56 | python3 $(which joint_cell_plotting.py) ${rna_barcode_metadata} ${atac_barcode_metadata} ${remove_low_yielding_cells} ${min_umis} ${min_genes} ${min_tss} ${min_frags} ${joint_qc_plot} ${joint_barcode_metadata} ${default="share-seq" prefix}
57 |
58 | # Make joint density plot
59 | Rscript $(which joint_cell_plotting_density.R) ${default="share-seq" prefix} ${joint_barcode_metadata} ${joint_density_plot}
60 | }
61 |
62 | output {
63 | File joint_calling_monitor = "monitoring.log"
64 | File joint_calling_log = "joint_cell_plotting.log"
65 | File? joint_qc_plot = "${joint_qc_plot}"
66 | File? joint_density_plot = "${joint_density_plot}"
67 | File joint_barcode_metadata = "${joint_barcode_metadata}"
68 | }
69 |
70 | runtime {
71 | memory : "${mem_gb} GB"
72 | disks: "local-disk ${disk_gb} ${disk_type}"
73 | docker : "${docker_image}"
74 | }
75 |
76 | parameter_meta {
77 | atac_barcode_metadata: {
78 | description: 'File containing ATAC barcode metrics.',
79 | help: 'tsv file with ATAC barcode (R1,R2,R3,PKR), fragments, TSS enrichment.',
80 | example: 'qc.atac.barcode.metadata.tsv'
81 | }
82 | rna_barcode_metadata: {
83 | description: 'File containing RNA barcode metrics.',
84 | help: 'tsv file with RNA barcode (R1,R2,R3,PKR), UMIs, genes.',
85 | example: 'qc.rna.barcode.metadata.tsv'
86 | }
87 | remove_low_yielding_cells: {
88 | description: 'UMI and fragments cutoff for plotting.',
89 | help: 'Minimum number of UMIs/fragments required for barcode to be plotted.',
90 | example: 10
91 | }
92 | min_umis: {
93 | description: 'UMI cutoff for RNA QC.',
94 | help: 'Minimum number of UMIs required for barcode to pass RNA QC.',
95 | example: 100
96 | }
97 | min_genes: {
98 | description: 'Gene cutoff for RNA QC.',
99 | help: 'Minimum number of genes required for barcode to pass RNA QC.',
100 | example: 200
101 | }
102 | min_tss: {
103 | description: 'TSS cutoff for ATAC QC.',
104 | help: 'Minimum TSS score required for barcode to pass ATAC QC.',
105 | example: 4
106 | }
107 | min_frags: {
108 | description: 'Fragments cutoff for ATAC QC.',
109 | help: 'Minimum number of fragments required for barcode to pass ATAC QC.',
110 | example: 100
111 | }
112 | prefix: {
113 | description: 'Prefix for output files',
114 | help: 'Prefix that will be used to name the output files',
115 |             example: 'MyExperiment'
116 | }
117 | genome_name: {
118 | description: 'Reference name',
119 |             help: 'The name of the genome reference used for alignment.',
120 | example: ['hg38', 'mm10', 'hg19', 'mm9']
121 | }
122 | docker_image: {
123 | description: 'Docker image.',
124 |             help: 'Docker image for the joint QC plotting step.',
125 | example: ['put link to gcr or dockerhub']
126 | }
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
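The runtime block above is sized from the metadata file sizes: memory is 5 GB plus `memory_factor` times the input size, disk is 40 GB plus `disk_factor` times the input size, and anything over 375 GB switches from LOCAL to SSD. A minimal Python sketch of that arithmetic, using a hypothetical 1.5 GB of input, for readers who want to sanity-check the factors:

```python
# Illustrative only: mirrors the resource arithmetic declared in share_task_joint_qc.wdl.
def joint_qc_resources(input_size_gb, memory_factor=2.0, disk_factor=8.0):
    """Return (mem_gb, disk_gb, disk_type) the way the WDL task computes them."""
    mem_gb = 5.0 + memory_factor * input_size_gb
    disk_gb = round(40.0 + disk_factor * input_size_gb)
    disk_type = "SSD" if disk_gb > 375 else "LOCAL"
    return mem_gb, disk_gb, disk_type

print(joint_qc_resources(1.5))  # -> (8.0, 52, 'LOCAL') for 1.5 GB of barcode metadata
```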
/tasks/10x_task_preprocess.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # 10x_task_preprocess
5 |
6 | task preprocess_tenx {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: preprocess 10x ATAC data.'
11 | }
12 |
13 | input {
14 |         # This task takes as input the three FASTQs produced by cellranger mkfastq and preprocesses them.
15 | File fastq_R1 # Pair 1 reads
16 | File fastq_R3 # Pair 2 reads
17 | File fastq_R2 # Barcode fastq
18 | File? whitelist # Barcode whitelist (chemistry specific)
19 | Int? barcode_dist = 2
20 | Float? threshold_pct_barcode_matching = 0.60
21 | String chemistry
22 | String? prefix
23 | Int? cpus = 16
24 | Float? disk_factor = 8.0
25 | Float? memory_factor = 0.15
26 | String docker_image = "us.gcr.io/buenrostro-share-seq/10x_task_preprocess:v1.0.0"
27 | }
28 |
29 | # Determine the size of the input
30 | Float input_file_size_gb = size(fastq_R1, "G") + size(fastq_R2, "G") + size(fastq_R3, "G")
31 |
32 |     # Determining memory size based on the size of the input files.
33 | Float mem_gb = 5.0 + memory_factor * input_file_size_gb
34 |
35 |     # Determining disk size based on the size of the input files.
36 | Int disk_gb = round(40.0 + disk_factor * input_file_size_gb)
37 |
38 |     # Determining disk type based on the size of the disk.
39 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
40 |
41 | # auto-detect barcode complementation outfiles
42 | String barcode_complementation_qc = "${default="10x" prefix}.atac.preprocess.complementation.qc.txt"
43 | String barcode_complementation_out = "${default="10x" prefix}.atac.preprocess.complementation.out.txt"
44 |
45 | # barcode correction and filtering outfiles
46 | String barcode_correction_qc = "${default="10x" prefix}.atac.preprocess.barcode.correction.qc.txt"
47 | String cleaned_fastq_R1 = "${default="10x" prefix}.atac.preprocess.cleaned.R1.fastq.gz"
48 | String cleaned_fastq_R2 = "${default="10x" prefix}.atac.preprocess.cleaned.R2.fastq.gz"
49 |
50 | # read trimming outfiles
51 | String final_fastq_R1 = "${default="10x" prefix}.atac.preprocess.cleaned.trimmed.R1.fastq.gz"
52 | String final_fastq_R2 = "${default="10x" prefix}.atac.preprocess.cleaned.trimmed.R2.fastq.gz"
53 | String trimming_log_json = "${default="10x" prefix}.atac.preprocess.trimming.log.json"
54 | String trimming_log_html = "${default="10x" prefix}.atac.preprocess.trimming.log.html"
55 | String trimming_stats = "${default="10x" prefix}.atac.preprocess.trimming.adapter.stats.txt"
56 |
57 | String barcode_conversion_dict = "barcode_conversion_dict.csv"
58 |
59 | String monitor_log = 'monitor_10x_preprocessing.log.txt'
60 |
61 | command <<<
62 | set -e
63 |
64 | bash $(which monitor_script.sh) | tee ~{monitor_log} 1>&2 &
65 |
66 | # Strip read description
67 | zcat ~{fastq_R1} | sed 's/ .*//' | gzip > stripped_R1.fastq.gz
68 | zcat ~{fastq_R3} | sed 's/ .*//' | gzip > stripped_R2.fastq.gz
69 | zcat ~{fastq_R2} | sed 's/ .*//' | gzip > stripped_barcode.fastq.gz
70 |
71 | if [[ '~{whitelist}' == *.gz ]]; then
72 | gunzip -c ~{whitelist} > whitelist.txt
73 | else
74 | ln -s ~{whitelist} whitelist.txt
75 | fi
76 |
77 | # auto-detect barcode complementation
78 | # python3 barcode_revcomp_detect.py barcode_fastq chemistry whitelist qc_out out threshold
79 |
80 | python3 $(which barcode_revcomp_detect.py) stripped_barcode.fastq.gz ~{chemistry} whitelist.txt ~{barcode_complementation_qc} ~{barcode_complementation_out} ~{threshold_pct_barcode_matching}
81 |
82 | # barcode correction and filtering
83 | # python3 match_barcodes.py
84 |
85 | python3 $(which match_barcodes.py) stripped_R1.fastq.gz stripped_R2.fastq.gz stripped_barcode.fastq.gz ~{chemistry} ~{barcode_dist} ~{barcode_complementation_out} whitelist.txt ~{cleaned_fastq_R1} ~{cleaned_fastq_R2} ~{barcode_correction_qc} ~{cpus}
86 |
87 |         # Clean up intermediate files
88 | rm stripped_R1.fastq.gz stripped_R2.fastq.gz stripped_barcode.fastq.gz
89 | >>>
90 |
91 | output {
92 | File fastq_R1_preprocessed = cleaned_fastq_R1
93 | File fastq_R2_preprocessed = cleaned_fastq_R2
94 | File tenx_barcode_complementation_qc = barcode_complementation_qc
95 | File tenx_barcode_correction_qc = barcode_correction_qc
96 | File? tenx_barcode_conversion_dict = barcode_conversion_dict
97 | #File tenx_trimming_log_json = trimming_log_json
98 | #File trimming_log_html = trimming_log_html
99 | #File tenx_trimming_stats = trimming_stats
100 | }
101 |
102 | runtime {
103 | cpu: cpus
104 | docker: "${docker_image}"
105 | disks: "local-disk ${disk_gb} ${disk_type}"
106 | memory: "${mem_gb} GB"
107 | }
108 |
109 | parameter_meta {
110 |         fastq_R1: {
111 |             description: 'Pair 1 FASTQ',
112 |             help: 'FASTQ file containing the pair 1 (genomic) reads.',
113 |         }
114 |         fastq_R2: {
115 |             description: 'Barcode FASTQ',
116 |             help: 'FASTQ file containing the cell barcode reads.',
117 |         }
118 |         fastq_R3: {
119 |             description: 'Pair 2 FASTQ',
120 |             help: 'FASTQ file containing the pair 2 (genomic) reads.',
121 |         }
122 | }
123 |
124 | }
125 |
--------------------------------------------------------------------------------
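The first step of the command above strips the read description (everything after the first space) from each FASTQ header with `sed 's/ .*//'` so that downstream barcode matching sees bare read names. A rough Python equivalent, shown only to illustrate what the one-liner does; the file names are placeholders:

```python
# Illustrative equivalent of: zcat in.fastq.gz | sed 's/ .*//' | gzip > out.fastq.gz
import gzip

def strip_read_descriptions(in_path, out_path):
    """Copy a gzipped FASTQ, keeping only the first space-delimited token of every line.

    Header lines lose their description; sequence and quality lines contain no
    spaces, so they pass through unchanged (just as with the sed call).
    """
    with gzip.open(in_path, "rt") as fin, gzip.open(out_path, "wt") as fout:
        for line in fin:
            fout.write(line.split(" ", 1)[0].rstrip("\n") + "\n")

# strip_read_descriptions("R1.fastq.gz", "stripped_R1.fastq.gz")  # hypothetical paths
```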
/src/R/rna_qc_plots.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/Rscript
2 |
3 | ### Takes an RNA barcode metadata TSV file and outputs QC plots as PNG files.
4 | ### QC plots include barcode rank by number of UMIs (all barcodes and top-ranked barcodes),
5 | ### barcode rank by number of genes (all barcodes and top-ranked barcodes),
6 | ### and genes vs UMIs scatter plot.
7 |
8 | ## Import helper functions
9 | source("/usr/local/bin/barcode_rank_functions.R")
10 |
11 | ## Get arguments, read input
12 | args <- commandArgs()
13 |
14 | barcode_metadata_file <- args[6]
15 | umi_cutoff <- as.integer(args[7])
16 | gene_cutoff <- as.integer(args[8])
17 | umi_rank_plot_file <- args[9]
18 | gene_rank_plot_file <- args[10]
19 | gene_umi_plot_file <- args[11]
20 |
21 | barcode_metadata <- read.table(barcode_metadata_file, header=T)
22 |
23 | ## Get plot inputs
24 |
25 | # Impose UMI cutoff, sort in decreasing order, assign rank
26 | umi_filtered <- barcode_metadata$umis[barcode_metadata$umis >= umi_cutoff]
27 | umi_filtered_sort <- sort(umi_filtered, decreasing=T)
28 | umi_rank <- 1:length(umi_filtered_sort)
29 |
30 | # Find elbow/knee of UMI barcode rank plot and top-ranked UMI barcode rank plot
31 | umi_points <- get_elbow_knee_points(x=umi_rank, y=log10(umi_filtered_sort))
32 | # For each valid plot, make factor for coloring plot points
33 | if (length(umi_points) > 0) { # Elbow found in first plot
34 | umi_plot1 <- TRUE
35 | is_top_ranked_umi <- factor(ifelse(umi_rank <= umi_points[1], 1, 0))
36 | if (length(umi_points) > 2) { # Elbow/knee found in second plot
37 | umi_plot2 <- TRUE
38 | umi_top_rank <- umi_rank[1:umi_points[1]]
39 | umi_top_umi <- umi_filtered_sort[1:umi_points[1]]
40 | is_top_top_ranked_umi <- factor(ifelse(umi_top_rank <= umi_points[3], 1, 0))
41 | } else {
42 | umi_plot2 <- FALSE
43 | }
44 | } else {
45 |   umi_plot1 <- umi_plot2 <- FALSE # set both flags so the later if (umi_plot2) check does not error when no elbow is found
46 | }
47 |
48 | # Impose gene cutoff, sort in decreasing order, assign rank
49 | gene_filtered <- barcode_metadata$genes[barcode_metadata$genes >= gene_cutoff]
50 | gene_filtered_sort <- sort(gene_filtered, decreasing=T)
51 | gene_rank <- 1:length(gene_filtered_sort)
52 |
53 | # Find elbow/knee of gene barcode rank plot and top-ranked gene barcode rank plot
54 | gene_points <- get_elbow_knee_points(x=gene_rank, y=log10(gene_filtered_sort))
55 | # For each valid plot, make factor for coloring plot points
56 | if (length(gene_points) > 0) { # Elbow found in first plot
57 | gene_plot1 <- TRUE
58 | is_top_ranked_gene <- factor(ifelse(gene_rank <= gene_points[1], 1, 0))
59 | if (length(gene_points) > 2) { # Elbow/knee found in second plot
60 | gene_plot2 <- TRUE
61 | gene_top_rank <- gene_rank[1:gene_points[1]]
62 | gene_top_gene <- gene_filtered_sort[1:gene_points[1]]
63 | is_top_top_ranked_gene <- factor(ifelse(gene_top_rank <= gene_points[3], 1, 0))
64 | } else {
65 | gene_plot2 <- FALSE
66 | }
67 | } else {
68 |   gene_plot1 <- gene_plot2 <- FALSE # set both flags so the later if (gene_plot2) check does not error when no elbow is found
69 | }
70 |
71 | ## Generate plots
72 |
73 | options(scipen=999)
74 |
75 | # Make UMI barcode rank plots
76 | png(umi_rank_plot_file, width=8, height=8, units='in', res=300)
77 | par(mfrow = c(2,1))
78 |
79 | # Plot 1 (all barcodes passing UMI filter vs log10(UMIs))
80 | if (umi_plot1) {
81 | plot(x=umi_rank,
82 | y=umi_filtered_sort,
83 | log="y",
84 | xlab=paste0(" Barcode rank (", length(umi_rank)-umi_points[1], " low quality cells)"),
85 | ylab="log10(UMIs)",
86 | main="RNA UMIs per Barcode",
87 | col=c("dimgrey","darkblue")[is_top_ranked_umi],
88 | pch=16,
89 | ylim=c(1,100000))
90 | abline(v=umi_points[1], h=10^(umi_points[2]))
91 | text(umi_points[1], 10^(umi_points[2]),
92 | paste0("(", umi_points[1], ", ", 10^(umi_points[2]), ")"),
93 | adj=c(-0.1,-0.5))
94 | }
95 |
96 | # Plot 2 (top ranked barcodes vs log10(UMIs))
97 | if (umi_plot2) {
98 | plot(x=umi_top_rank,
99 | y=umi_top_umi,
100 | log="y",
101 | xlab="Barcode rank",
102 | ylab="log10(UMIs)",
103 | main="RNA UMIs per Top-Ranked Barcode",
104 | col=c("dimgrey","darkblue")[is_top_top_ranked_umi],
105 | pch=16,
106 | ylim=c(1,100000))
107 | abline(v=umi_points[3], h=10^(umi_points[4]))
108 | text(umi_points[3], 10^(umi_points[4]),
109 | paste("(", umi_points[3], ", ", 10^(umi_points[4]), ")", sep=""),
110 | adj=c(-0.1,-0.5))
111 | }
112 | dev.off()
113 |
114 |
115 | # Make gene barcode rank plots
116 | png(gene_rank_plot_file, width=8, height=8, units='in', res=300)
117 | par(mfrow = c(2,1))
118 |
119 | # Plot 1 (all barcodes passing gene filter vs log10(genes))
120 | if (gene_plot1) {
121 | plot(x=gene_rank,
122 | y=gene_filtered_sort,
123 | log="y",
124 | xlab=paste0(" Barcode rank (", length(gene_rank)-gene_points[1], " low quality cells)"),
125 | ylab="log10(genes)",
126 | main="RNA Genes per Barcode",
127 | col=c("dimgrey","darkblue")[is_top_ranked_gene],
128 | pch=16,
129 | ylim=c(1,10000))
130 | abline(v=gene_points[1], h=10^(gene_points[2]))
131 | text(gene_points[1], 10^(gene_points[2]),
132 | paste0("(", gene_points[1], ", ", 10^(gene_points[2]), ")"),
133 | adj=c(-0.1,-0.5))
134 | }
135 |
136 | # Plot 2 (top ranked barcodes vs log10(genes))
137 | if (gene_plot2) {
138 | plot(x=gene_top_rank,
139 | y=gene_top_gene,
140 | log="y",
141 | xlab="Barcode rank",
142 | ylab="log10(genes)",
143 | main="RNA Genes per Top-Ranked Barcode",
144 | col=c("dimgrey","darkblue")[is_top_top_ranked_gene],
145 | pch=16,
146 | ylim=c(1,10000))
147 | abline(v=gene_points[3], h=10^(gene_points[4]))
148 | text(gene_points[3], 10^(gene_points[4]),
149 | paste("(", gene_points[3], ", ", 10^(gene_points[4]), ")", sep=""),
150 | adj=c(-0.1,-0.5))
151 | }
152 | dev.off()
153 |
154 | # Make genes vs UMIs scatter plot
155 | png(gene_umi_plot_file, width=8, height=8, units='in', res=300)
156 |
157 | plot(x=barcode_metadata$umis,
158 | y=barcode_metadata$genes,
159 | xlab="UMIs",
160 | ylab="Genes",
161 | main="RNA Genes vs UMIs",
162 | col="darkblue",
163 | pch=16)
164 |
165 | dev.off()
166 |
--------------------------------------------------------------------------------
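The script relies on `get_elbow_knee_points()` from `barcode_rank_functions.R`, which is sourced rather than shown here. As a rough illustration of what such a cutoff marks, here is a common elbow heuristic (the point farthest from the chord joining the ends of the rank vs. log10(count) curve) written in Python; this is an assumption for illustration only, not the pipeline's actual implementation:

```python
# Illustrative only: a generic barcode-rank elbow heuristic, NOT the pipeline's
# get_elbow_knee_points() implementation.
import numpy as np

def elbow_index(counts):
    """Return the index of an elbow on a rank vs. log10(count) curve."""
    y = np.log10(np.sort(np.asarray(counts, dtype=float))[::-1])  # sort descending, log10
    x = np.arange(1, len(y) + 1, dtype=float)                     # barcode rank
    chord = np.array([x[-1] - x[0], y[-1] - y[0]])                # vector from first to last point
    chord = chord / np.linalg.norm(chord)
    rel = np.stack([x - x[0], y - y[0]], axis=1)
    dist = np.abs(rel[:, 0] * chord[1] - rel[:, 1] * chord[0])    # perpendicular distance to chord
    return int(np.argmax(dist))

umis = [5000] * 200 + [50] * 5000          # toy data: 200 cells plus 5000 background barcodes
print(elbow_index(umis))                   # index near the cell/background transition
```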
/tasks/share_task_merge_bams.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | # TASK
4 | # SHARE-atac-merge_bams
5 |
6 | task share_atac_merge_bams {
7 | meta {
8 | version: 'v0.1'
9 | author: 'Eugenio Mattei (emattei@broadinstitute.org) at Broad Institute of MIT and Harvard'
10 | description: 'Broad Institute of MIT and Harvard SHARE-Seq pipeline: merge the individual bams together'
11 | }
12 |
13 | input {
14 |         # This task takes as input the individual aligned BAMs from the scatter and merges them into a single sorted BAM.
15 | Array[File] bams
16 | Array[File] logs
17 | String genome_name
18 | String prefix = "sample-share"
19 | Int? multimappers # = 5
20 | Int? cpus = 16
21 | Float? disk_factor = 8.0
22 | Float? memory_factor = 0.15
23 | String? docker_image = "us.gcr.io/buenrostro-share-seq/share_task_merge_bams:v1.0.0"
24 | }
25 |
26 | # Determine the size of the input
27 | Float input_file_size_gb = size(bams, "G")
28 |
29 |     # Determining memory size based on the size of the input files.
30 | Float mem_gb = 16.0 + memory_factor * input_file_size_gb
31 |
32 |     # Determining disk size based on the size of the input files.
33 | Int disk_gb = round(20.0 + disk_factor * input_file_size_gb)
34 |
35 |     # Determining disk type based on the size of the disk.
36 | String disk_type = if disk_gb > 375 then "SSD" else "LOCAL"
37 |
38 | # Determining memory for samtools.
39 | Float samtools_memory_gb = 0.8 * mem_gb # Samtools has overheads so reducing the memory to 80% of the total.
40 |
41 |     # Number of threads: giving each thread 4GB of memory seems to be the fastest configuration.
42 | Int samtools_threads_ = floor(samtools_memory_gb / 4)
43 | Int samtools_threads = if samtools_threads_ == 0 then 1 else samtools_threads_
44 |
45 | Int sambamba_threads = floor(cpus/2)
46 |
47 |     # Now that we know how many threads we can use while ensuring 4GB of memory per thread,
48 |     # we assign any remaining memory to the threads.
49 | Int samtools_memory_per_thread_ = floor(samtools_memory_gb * 1024 / samtools_threads) # Computing the memory per thread for samtools in MB.
50 | Int samtools_memory_per_thread = if samtools_memory_per_thread_ < 768 then 768 else samtools_memory_per_thread_
51 |
52 | # Tim parameters
53 | Int machine_mem_mb = 18150
54 | Int cpu = 1
55 | Int compression_level = 5
56 | # default to 500GiB of space
57 | Int disk = 500
58 | Int command_mem_mb = machine_mem_mb - 500
59 |
60 | # Define tmp file name
61 | String unsorted_bam = "${prefix}.atac.merge.${genome_name}.bam"
62 |
63 | # Define the output names
64 | String merged_bam = "${prefix}.atac.merged.k${multimappers}.${genome_name}.sorted.bam"
65 | String merged_bai = "${prefix}.atac.merged.k${multimappers}.${genome_name}.sorted.bam.bai"
66 | String alignment_log = "${prefix}.atac.merged.k${multimappers}.${genome_name}.log"
67 |
68 | String monitor_log = "atac_merge_monitor.log"
69 |
70 | command <<<
71 | set -e
72 |
73 | bash $(which monitor_script.sh) 2>&1 &
74 |
75 | #sambamba merge -t ~{cpus} ~{unsorted_bam} ~{sep=" " bams}
76 |
77 | #sambamba sort -t ~{cpus} -m ~{command_mem_mb}M -o ~{merged_bam} ~{unsorted_bam}
78 |
79 | #sambamba index -t ~{cpus} ~{merged_bam}
80 |
81 | # Trying picard
82 |
83 | java -Dsamjdk.compression_level=~{compression_level} -Xms~{command_mem_mb}m -Xmx~{command_mem_mb}m -jar /usr/local/bin/picard.jar \
84 | MergeSamFiles \
85 | USE_THREADING=true \
86 | SORT_ORDER="coordinate" \
87 | INPUT=~{sep=' INPUT=' bams} \
88 | OUTPUT=~{merged_bam}
89 |
90 | sambamba index -t ~{cpus} ~{merged_bam}
91 |
92 | sed 's/^[[:space:]]*//g' ~{sep=" " logs} | cut -f 1 -d ' ' | awk '{ sum[FNR%15]+=$1 } END {n_total=length(sum);for (idx=1; idx <= n_total; idx++){print sum[idx]}}' > ~{alignment_log}
93 |
94 | >>>
95 |
96 | output {
97 | File atac_merged_alignment = merged_bam
98 | File atac_merged_alignment_index = merged_bai
99 | File atac_merged_alignment_log = alignment_log
100 | }
101 |
102 | runtime {
103 | cpu: cpu
104 | docker: "${docker_image}"
105 | disks: "local-disk ${disk} HDD"
106 | disk: disk + " GB" # TES
107 | #disks: "local-disk ${disk_gb} ${disk_type}"
108 | maxRetries:1
109 | memory: "${machine_mem_mb} MiB"
110 | #memory: "${mem_gb} GB"
111 | memory_retry_multiplier: 2
112 | }
113 |
114 | parameter_meta {
115 | bams: {
116 |             description: 'Individual BAMs from the scatter alignment task.',
117 |             help: 'Individual BAMs from the scatter alignment task.',
118 | example: 'align.raw.L1.bam',
119 | }
120 | cpus: {
121 | description: 'Number of cpus.',
122 |             help: 'Set the number of CPUs available to the task.',
123 | default: 16
124 | }
125 | disk_factor: {
126 |             description: 'Multiplication factor to determine disk required for the merge task.',
127 |             help: 'This factor will be multiplied by the total size of the input BAMs to determine the required disk of instance (GCP/AWS) or job (HPCs).',
128 | default: 8.0
129 | }
130 | memory_factor: {
131 |             description: 'Multiplication factor to determine memory required for the merge task.',
132 |             help: 'This factor will be multiplied by the total size of the input BAMs to determine the required memory of instance (GCP/AWS) or job (HPCs).',
133 | default: 0.15
134 | }
135 | prefix: {
136 | description: 'Prefix for output files.',
137 | help: 'Prefix that will be used to name the output files',
138 |             example: 'my-experiment'
139 | }
140 | docker_image: {
141 | description: 'Docker image.',
142 |             help: 'Docker image for the merging step.',
143 |             example: ["us.gcr.io/buenrostro-share-seq/share_task_merge_bams"]
144 | }
145 | }
146 |
147 |
148 | }
149 |
--------------------------------------------------------------------------------
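The thread/memory bookkeeping declared above reserves 80% of the instance memory for samtools, targets roughly 4 GB per sort thread, and never lets the per-thread allowance drop below 768 MB. A small Python sketch of the same arithmetic (the 16 GB figure is just an example):

```python
# Illustrative only: mirrors the samtools thread/memory declarations in share_task_merge_bams.wdl.
import math

def samtools_threading(mem_gb):
    """Return (threads, memory_per_thread_mb) as the WDL computes them."""
    samtools_memory_gb = 0.8 * mem_gb                     # leave 20% headroom for samtools overhead
    threads = max(1, math.floor(samtools_memory_gb / 4))  # ~4 GB of memory per thread
    per_thread_mb = math.floor(samtools_memory_gb * 1024 / threads)
    return threads, max(768, per_thread_mb)               # never drop below 768 MB per thread

print(samtools_threading(16.0))   # -> (3, 4369) for a 16 GB instance
```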
/src/python/trim_fastq.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Trim fastq
5 | Removes dovetail (overlap) between R1 and R2
6 | """
7 |
8 | import argparse
9 | import Levenshtein
10 | import xopen
11 | from collections import deque
12 |
13 | def parse_arguments():
14 | parser = argparse.ArgumentParser(description="Trim dovetail (overlap) between read1 and read2")
15 | parser.add_argument("input_read1_fastq_file", help="Filename for untrimmed input read 1 FASTQ file")
16 | parser.add_argument("input_read2_fastq_file", help="Filename for untrimmed input read 2 FASTQ file")
17 | parser.add_argument("output_read1_fastq_file", help="Filename for corrected output read 1 FASTQ file")
18 | parser.add_argument("output_read2_fastq_file", help="Filename for corrected output read 2 FASTQ file")
19 | parser.add_argument("trimming_stats_file", help="Filename for txt file containing trimming statistics")
20 |
21 | return parser.parse_args()
22 |
23 | REV_COMP = str.maketrans("ATGC", "TACG")
24 | def reverse_complement(seq):
25 | return str.translate(seq, REV_COMP)[::-1]
26 |
27 | def trim_fastqs(input_read1_fastq_file, input_read2_fastq_file,
28 | output_read1_fastq_file, output_read2_fastq_file,
29 | trimming_stats_file):
30 | """
31 | Trim reads if overlapping, write reads to output FASTQ files.
32 | Produces file enumerating how many reads were processed and trimmed.
33 | """
34 | # counters
35 | total = trimmed = 0
36 |
37 | read1_out_writer = xopen.xopen(output_read1_fastq_file, mode="w")
38 | read2_out_writer = xopen.xopen(output_read2_fastq_file, mode="w")
39 |
40 | buffer1 = deque()
41 | buffer2 = deque()
42 | buffer_counter = 0
43 |
44 | # process FASTQs together
45 | with xopen.xopen(input_read1_fastq_file, mode= "r", threads= 8) as read1_fh, xopen.xopen(input_read2_fastq_file, mode= "r", threads= 8) as read2_fh:
46 | for readline1, readline2 in zip(read1_fh, read2_fh):
47 | total += 2
48 |
49 | name1 = readline1.strip()
50 | name2 = readline2.strip()
51 |
52 | readline1 = next(read1_fh)
53 | readline2 = next(read2_fh)
54 |
55 | sequence1 = readline1.strip()
56 | sequence2 = readline2.strip()
57 |
58 | next(read1_fh)
59 | next(read2_fh)
60 |
61 | readline1 = next(read1_fh)
62 | readline2 = next(read2_fh)
63 |
64 | quality1 = readline1.strip()
65 | quality2 = readline2.strip()
66 |
67 | # trim adapters for ATAC
68 | where = trim(sequence1, sequence2)
69 |
70 | if where > -1:
71 | trimmed += 2
72 |
73 | # add trimmed read 1 to buffer
74 | trimmed_read1 = f"{name1}\n{sequence1[:where]}\n+\n{quality1[:where]}\n"
75 | buffer1.append(trimmed_read1)
76 |
77 | # add trimmed read 2 to buffer
78 | trimmed_read2 = f"{name2}\n{sequence2[:where]}\n+\n{quality2[:where]}\n"
79 | buffer2.append(trimmed_read2)
80 |
81 | else:
82 | # add original read 1 to buffer
83 | read1 = f"{name1}\n{sequence1}\n+\n{quality1}\n"
84 | buffer1.append(read1)
85 |
86 |                 # add original read 2 to buffer
87 | read2 = f"{name2}\n{sequence2}\n+\n{quality2}\n"
88 | buffer2.append(read2)
89 |
90 | buffer_counter += 1
91 |
92 | # write reads to trimmed FASTQ files
93 | if buffer_counter == 10000000:
94 | read1_out_writer.write("".join(buffer1))
95 | buffer1.clear()
96 | read2_out_writer.write("".join(buffer2))
97 | buffer2.clear()
98 | buffer_counter = 0
99 |
100 | # write out remaining reads
101 | if buffer_counter > 0:
102 | read1_out_writer.write("".join(buffer1))
103 | buffer1.clear()
104 | read2_out_writer.write("".join(buffer2))
105 | buffer2.clear()
106 | buffer_counter = 0
107 |
108 | # write trimming statistics output file
109 | with open(trimming_stats_file, "w") as f:
110 | fields = ["total_reads", "untrimmed_reads", "trimmed_reads", "%trimmed"]
111 | f.write("\t".join(fields) + "\n")
112 | f.write("%i\t%i\t%i\t%0.1f" % (total, total-trimmed, trimmed, trimmed/total*100 if total > 0 else 0))
113 |
114 | def trim(seq1, seq2):
115 | """
116 | Find overlap between read1 and read2 and return location
117 | """
118 | query = reverse_complement(seq2[0:20])
119 | idx = seq1.rfind(query) # look for perfect match
120 | if idx == -1:
121 | idx = fuzz_align(query,seq1)
122 |
123 | # found it, return everything through match
124 | if idx > -1:
125 | idx = idx+20
126 | else:
127 | idx = -1
128 | return idx
129 |
130 | def fuzz_align(s_seq, l_seq):
131 | """
132 | Align allowing Levenshtein distance of 1
133 | This iteration should go from the right end of l_seq
134 | since we want to do a rfind
135 | """
136 | for i, base in enumerate(l_seq): # loop through equal size windows
137 | l_subset = l_seq[i:i+len(s_seq)]
138 | dist = Levenshtein.distance(l_subset, s_seq, score_cutoff= 1)
139 | if dist <= 1: # find first then break
140 | return i
141 | return -1
142 |
143 | def main():
144 | args = parse_arguments()
145 | input_read1_fastq_file = getattr(args, "input_read1_fastq_file")
146 | input_read2_fastq_file = getattr(args, "input_read2_fastq_file")
147 | output_read1_fastq_file = getattr(args, "output_read1_fastq_file")
148 | output_read2_fastq_file = getattr(args, "output_read2_fastq_file")
149 | trimming_stats_file = getattr(args, "trimming_stats_file")
150 |
151 | trim_fastqs(input_read1_fastq_file, input_read2_fastq_file,
152 | output_read1_fastq_file, output_read2_fastq_file,
153 | trimming_stats_file)
154 |
155 |
156 | if __name__ == "__main__":
157 | main()
158 |
--------------------------------------------------------------------------------
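To make the dovetail logic concrete: `trim()` reverse-complements the first 20 bp of read 2 and looks for it near the end of read 1; a hit means the reads overlap and anything past the match is adapter read-through. A self-contained toy example (the insert and adapter sequences are made up) using the perfect-match path only:

```python
# Stand-alone illustration of the dovetail trimming idea; all sequences below are invented.
REV_COMP = str.maketrans("ATGC", "TACG")
def reverse_complement(seq):
    return str.translate(seq, REV_COMP)[::-1]

insert  = "ACGTTAGCCATGGCATTACGGATCCAGTAC"          # 30 bp genomic insert
adapter = "CTGTCTCTTA"                               # 10 bp of hypothetical adapter

read1 = insert + adapter                             # read 1 runs through the insert into adapter
read2 = reverse_complement(insert) + "AGATGTGTAT"    # read 2: RC of the insert, then its own adapter

# Same idea as trim(): take the first 20 bp of read 2, reverse complement it,
# and look for it near the end of read 1.
query = reverse_complement(read2[0:20])
idx = read1.rfind(query)
where = idx + 20 if idx > -1 else -1

print(where)            # 30 -> everything past the insert is adapter read-through
print(read1[:where])    # the trimmed read 1 equals the original insert
assert read1[:where] == insert
```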
/src/python/match_barcodes.py:
--------------------------------------------------------------------------------
1 | import gzip
2 |
3 | import numpy as np
4 | # import pandas as pd
5 |
6 | import matcha
7 | import sys
8 |
9 | REV_COMP = str.maketrans("ATGC", "TACG")
10 | def reverse_complement(seq):
11 | return str.translate(seq, REV_COMP)[::-1]
12 |
13 | def get_open_fn(path):
14 | with open(path, "rb") as f:
15 | is_gzipped = (f.read(2) == b'\x1f\x8b')
16 | return gzip.open if is_gzipped else open
17 |
18 | def read_barcodes(path, revcomp):
19 | # if path.endswith(".tsv"):
20 | # bc = pd.read_csv(path, sep="\t")["sequence"]
21 | # else:
22 | open_fn = get_open_fn(path)
23 | with open_fn(path, 'rt') as file:
24 | bc = [b.strip() for b in file]
25 | if revcomp:
26 | valid = [reverse_complement(b) for b in bc]
27 | else:
28 | valid = bc
29 |
30 | return valid
31 |
32 | def match_one_bc(fastqs, whitelists, revcomp, max_barcode_dist, offsets, fastq1_out_path, fastq2_out_path, qc_path, threads):
33 | f = matcha.FastqReader(threads = threads)
34 | f.add_sequence("R1", fastqs["R1"], output_path=fastq1_out_path)
35 | f.add_sequence("R2", fastqs["R2"])
36 | f.add_sequence("R3", fastqs["R3"], output_path=fastq2_out_path)
37 |
38 | with open(revcomp["R2"]) as rf:
39 | rc = (int(rf.read().strip()) == 1)
40 |
41 | barcode_sequences = read_barcodes(whitelists["R2"], rc)
42 | cell_barcode = matcha.HashMatcher(
43 | sequences = barcode_sequences,
44 | labels = barcode_sequences,
45 | max_mismatches=max_barcode_dist,
46 | subsequence_count=2
47 | )
48 | f.add_barcode("cell", cell_barcode, "R2", match_start=offsets["R2"])
49 | f.set_output_names("{read_name} CB:Z:{cell}")
50 |
51 | barcode_counts = np.zeros(max_barcode_dist + 2, int)
52 |
53 | total_reads = 0
54 | total_pass = 0
55 |
56 | # print("start read") ####
57 | chunk_size = 10000
58 | while f.read_chunk(chunk_size):
59 | pass_filter = (f.get_match_result("cell", "dist") <=max_barcode_dist) & \
60 | (f.get_match_result("cell", "second_best_dist") > f.get_match_result("cell", "dist"))
61 |
62 | total_reads += len(pass_filter)
63 | total_pass += pass_filter.sum()
64 | values, counts = np.unique(f.get_match_result("cell", "dist"), return_counts=True)
65 | barcode_counts[np.minimum(values, max_barcode_dist + 1)] += counts
66 |
67 | f.write_chunk(pass_filter)
68 |
69 | with open(qc_path, "w") as stats_output:
70 | print(f"{total_pass}/{total_reads} reads passing, ({total_pass/total_reads*100:.2f}%)\n", file=stats_output)
71 | print("mismatches\treads", file=stats_output)
72 | for dist in range(max_barcode_dist + 2):
73 | print(
74 | dist if dist <= max_barcode_dist else f">{max_barcode_dist}",
75 | barcode_counts[dist],
76 | sep = "\t",
77 | file=stats_output
78 | )
79 |
80 |
81 | # def match_two_bc(fastqs, whitelists, revcomp, max_barcode_dist, offsets, fastq1_out_path, fastq2_out_path, qc_path, threads):
82 | # f = matcha.FastqReader(threads = threads)
83 | # f.add_sequence("R1", fastqs["R1"], output_path=fastq1_out_path)
84 | # f.add_sequence("R2", fastqs["R2"], output_path=fastq2_out_path)
85 | # f.add_sequence("I1", fastqs["I1"])
86 | # f.add_sequence("I2", fastqs["I2"])
87 |
88 | # i5_sequences, i5_maybe_rc = read_barcodes(whitelists["I2"], revcomp["I2"])
89 | # T7_sequences, T7_maybe_rc = read_barcodes(whitelists["I1"], revcomp["I1"])
90 |
91 | # i5_barcode = matcha.HashMatcher(
92 | # sequences = i5_maybe_rc,
93 | # labels = i5_sequences,
94 | # max_mismatches=max_barcode_dist,
95 | # subsequence_count=2
96 | # )
97 |
98 | # T7_barcode = matcha.HashMatcher(
99 | # sequences = T7_maybe_rc,
100 | # labels = T7_sequences,
101 | # max_mismatches=max_barcode_dist,
102 | # subsequence_count=2
103 | # )
104 |
105 | # f.add_barcode("i5", i5_barcode, "I2", match_start=offsets["I2"])
106 | # f.add_barcode("T7", T7_barcode, "I1", match_start=offsets["I1"])
107 |
108 | # f.set_output_names("{read_name} CB:Z:{i5}{T7}")
109 |
110 | # barcode_counts = np.zeros((max_barcode_dist + 2, max_barcode_dist + 2), int)
111 |
112 | # total_reads = 0
113 | # total_pass = 0
114 |
115 | # chunk_size = 10000
116 |
117 | # dists = [None, None]
118 | # second_dists = [None, None]
119 | # while f.read_chunk(chunk_size):
120 | # dists[0] = f.get_match_result("i5", "dist")
121 | # second_dists[0] = f.get_match_result("i5", "second_best_dist")
122 | # dists[1] = f.get_match_result("T7", "dist")
123 | # second_dists[1] = f.get_match_result("T7", "second_best_dist")
124 |
125 | # pass_filter = (dists[0] < max_barcode_dist) & \
126 | # (dists[1] < max_barcode_dist) & \
127 | # (dists[0] + dists[1] < second_dists[0] + second_dists[1])
128 |
129 | # total_reads += len(pass_filter)
130 | # total_pass += pass_filter.sum()
131 |
132 | # values, counts = np.unique(dists, axis = 1, return_counts=True)
133 | # indices = np.minimum(values, max_barcode_dist+1)
134 | # barcode_counts[(indices[0], indices[1])] += counts
135 |
136 | # f.write_chunk(pass_filter)
137 |
138 | # with open(qc_path, "w") as stats_output:
139 | # print(f"{total_pass}/{total_reads} reads passing, ({total_pass/total_reads*100:.2f}%)\n", file=stats_output)
140 | # print("mismatches_i5\tmismatches_T7\treads", file=stats_output)
141 | # for i5_dist in range(max_barcode_dist + 2):
142 | # for T7_dist in range(max_barcode_dist + 2):
143 | # print(
144 | # i5_dist if i5_dist <= max_barcode_dist else f">{max_barcode_dist}",
145 | # T7_dist if T7_dist <= max_barcode_dist else f">{max_barcode_dist}",
146 | # barcode_counts[i5_dist, T7_dist],
147 | # sep = "\t",
148 | # file=stats_output
149 | # )
150 |
151 | modality = sys.argv[4]               # chemistry, e.g. "10x" or "10x_multiome" (see the call in tasks/10x_task_preprocess.wdl)
152 | whitelist = sys.argv[7]               # barcode whitelist file
153 | fastq1_out_path = sys.argv[8]         # corrected read 1 FASTQ output
154 | fastq2_out_path = sys.argv[9]         # corrected read 2 FASTQ output
155 | qc_path = sys.argv[10]                # barcode correction QC stats output
156 | threads = int(sys.argv[11])           # number of threads for matcha
157 | max_barcode_dist = int(sys.argv[5])   # maximum mismatches allowed when matching a barcode
158 | fastqs = {
159 |     "R1": sys.argv[1],  # read 1 (genomic) FASTQ
160 |     "R2": sys.argv[3],  # barcode FASTQ (matched against the whitelist)
161 |     "R3": sys.argv[2],  # read 2 (genomic) FASTQ
162 | }
163 | revcomp = {
164 |     "R2": sys.argv[6],  # file holding 1 if the whitelist must be reverse complemented, 0 otherwise
165 | }
166 | if modality == "10x":
167 | whitelists = {
168 | "R2": whitelist,
169 | }
170 | offsets = {
171 | "R2": 0,
172 | }
173 | match_one_bc(fastqs, whitelists, revcomp, max_barcode_dist, offsets, fastq1_out_path, fastq2_out_path, qc_path, threads)
174 |
175 | elif modality == "10x_multiome":
176 | whitelists = {
177 | "R2": whitelist,
178 | }
179 | offsets = {
180 | "R2": 8,
181 | }
182 | match_one_bc(fastqs, whitelists, revcomp, max_barcode_dist, offsets, fastq1_out_path, fastq2_out_path, qc_path, threads)
183 |
--------------------------------------------------------------------------------
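A barcode is kept only when its best whitelist match is within `max_barcode_dist` mismatches and strictly better than the runner-up, which is what the `pass_filter` expression in `match_one_bc` encodes. A toy numpy illustration of that acceptance rule, with invented distances:

```python
# Illustrative only: reproduces the acceptance rule used in match_one_bc on hand-made match results.
import numpy as np

max_barcode_dist = 2
best_dist        = np.array([0, 1, 2, 2, 3])   # mismatches to the best whitelist barcode
second_best_dist = np.array([3, 1, 4, 3, 5])   # mismatches to the runner-up barcode

pass_filter = (best_dist <= max_barcode_dist) & (second_best_dist > best_dist)
print(pass_filter)   # [ True False  True  True False]
# read 2 is rejected because the match is ambiguous (two barcodes at distance 1),
# read 5 because even the best match has too many mismatches.
```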
/README.md:
--------------------------------------------------------------------------------
1 | # Broad Institute of MIT and Harvard Single-Cell/Nucleus Multiomic Processing Pipeline
2 |
3 | Pipeline specifications can be found [here](https://docs.google.com/document/d/1J-NWpDLkEGLsLjVe6h6-Rx4nxzTdgy1TJZvuMnYiiyg/edit?usp=sharing).
4 |
5 | Pipeline main page on [dockstore](https://dockstore.org/workflows/github.com/broadinstitute/epi-SHARE-seq-pipeline/SHARE-seq:release?tab=info).
6 |
7 |
8 |
9 |
10 |
11 | ### Structure of this repo
12 | * The **tasks** directory contains the tasks called from the main workflow share-seq.wdl. Each task corresponds to a different step of the pipeline: *align*, *filter*, etc.
13 | * The **src** directory contains bash, Python, R, and notebook scripts called within the tasks.
14 | * The **dockerfiles** directory contains the Dockerfiles used to build the Docker images used by the pipeline.
15 |
16 | ## Introduction
17 |
18 | The **SHARE-seq** multiomic pipeline is based on the original Buenrostro SHARE-seq pipeline specifications (by Sai Ma) in [this github repo](https://github.com/masai1116/SHARE-seq-alignment).
19 |
20 | This **10X** single-cell multiomic pipeline is based on the ENCODE (phase-3) single-cell pipeline specifications (by Anshul Kundaje) in [this google doc](https://docs.google.com/document/u/2/d/e/2PACX-1vTlgtT4WeXbvRicybUHXnhZs8RKyB4EkTbcWooQ6qBxxQ_zIHpFEVHy38D5lC_s8_YDGfUTsyomJcs3/pub).
21 |
22 | ### Features
23 |
24 | * **Portability**: The pipeline can be run on different cloud platforms such as Google, AWS and DNAnexus, as well as on cluster engines such as SLURM, SGE and PBS.
25 | * **User-friendly HTML report**: In addition to the standard outputs, the pipeline generates an HTML report that consists of quality metrics including alignment statistics along with many useful plots. An example of the [HTML report](). # TODO: add an example html.
26 | * **Supported genomes**: The pipeline requires genome-specific data such as aligner indices, chromosome sizes, and blacklisted regions. We provide genome references for hg38, mm10, mm39.
27 |
28 | ## Installation
29 |
30 | 1) Install Caper (Python Wrapper/CLI for [Cromwell](https://github.com/broadinstitute/cromwell)).
31 | ```bash
32 | $ pip install caper
33 | ```
34 |
35 | 2) **IMPORTANT**: Read Caper's [README](https://github.com/ENCODE-DCC/caper/blob/master/README.md) carefully to choose a backend for your system. Follow the instructions in the configuration file.
36 | ```bash
37 | # backend: local or your HPC type (e.g. slurm, sge, pbs, lsf). read Caper's README carefully.
38 | $ caper init [YOUR_BACKEND]
39 |
40 | # IMPORTANT: edit the conf file and follow commented instructions in there
41 | $ vi ~/.caper/default.conf
42 | ```
43 |
44 | 3) Git clone this pipeline.
45 | ```bash
46 | $ cd
47 | $ git clone https://github.com/broadinstitute/epi-SHARE-seq-pipeline/ #TODO: This should point to the release
48 | ```
49 |
50 | 4) Define test input JSON.
51 | ```bash
52 | INPUT_JSON="" #TODO: We need a test dataset available for everyone
53 | ```
54 |
55 | 5) If you have Docker and want to run the pipeline locally on your laptop, `--max-concurrent-tasks 1` limits the number of concurrent tasks for a test run on a laptop. Drop this flag if you are running on a workstation/HPC.
56 | ```bash
57 | # check if Docker works on your machine
58 | $ docker run ubuntu:latest echo hello
59 |
60 | # --max-concurrent-tasks 1 is for computers with limited resources
61 | $ caper run share-seq.wdl -i "${INPUT_JSON}" --docker --max-concurrent-tasks 1
62 | ```
63 |
64 | 6) Otherwise, install Singularity on your system. Please follow [these instructions](https://neuro.debian.net/install_pkg.html?p=singularity-container) to install Singularity on a Debian-based OS. Or ask your system administrator to install Singularity on your HPC.
65 | ```bash
66 | # check if Singularity works on your machine
67 | $ singularity exec docker://ubuntu:latest echo hello
68 |
69 | # on your local machine (--max-concurrent-tasks 1 is for computers with limited resources)
70 | $ caper run share-seq.wdl -i "${INPUT_JSON}" --singularity --max-concurrent-tasks 1
71 |
72 | # on HPC, make sure that Caper's conf ~/.caper/default.conf is correctly configured to work with your HPC
73 | # the following command will submit Caper as a leader job to SLURM with Singularity
74 | $ caper hpc submit share-seq.wdl -i "${INPUT_JSON}" --singularity --leader-job-name ANY_GOOD_LEADER_JOB_NAME
75 |
76 | # check job ID and status of your leader jobs
77 | $ caper hpc list
78 |
79 | # cancel the leader node to close all of its children jobs
80 | # If you directly use cluster command like scancel or qdel then
81 | # child jobs will not be terminated
82 | $ caper hpc abort [JOB_ID]
83 | ```
84 |
85 | ## Input JSON file
86 |
87 | > **IMPORTANT**: DO NOT BLINDLY USE A TEMPLATE/EXAMPLE INPUT JSON. READ THROUGH THE FOLLOWING GUIDE TO MAKE A CORRECT INPUT JSON FILE.
88 |
89 | An input JSON file specifies all of the input parameters and files that are necessary for successfully running this pipeline. This includes a specification of the path to the genome reference files and the raw data FASTQ files. Please make sure to specify absolute paths rather than relative paths in your input JSON files.
90 |
91 | 1) [Input JSON file specification (short)](docs/input_short.md)
92 | 2) [Input JSON file specification (long)](docs/input.md)
93 |
94 |
95 | ## Running on Terra/Anvil (using Dockstore)
96 |
97 | Visit our pipeline repo on [Dockstore](https://dockstore.org/my-workflows/github.com/broadinstitute/epi-SHARE-seq-pipeline/SHARE-seq). Click on `Terra` or `Anvil`. Follow Terra's instructions to create a workspace on Terra and add Terra's billing bot to your Google Cloud account.
98 |
99 | Download this [test input JSON for Terra](we don't have one at the moment), upload it to Terra's UI, and then run the analysis.
100 |
101 | If you would like to use your own input JSON file, make sure that all files in the input JSON are on a Google Cloud Storage bucket (`gs://`). URLs will not work.
102 |
103 | ## How to organize outputs
104 |
105 | Install [Croo](https://github.com/ENCODE-DCC/croo#installation). Make sure that you have Python 3 (>3.4.1) installed on your system. Find the `metadata.json` file in Caper's output directory.
106 |
107 | ```bash
108 | $ pip install croo
109 | $ croo [METADATA_JSON_FILE]
110 | ```
111 |
112 | ## How to make a spreadsheet of QC metrics
113 |
114 | Install [qc2tsv](https://github.com/ENCODE-DCC/qc2tsv#installation). Make sure that you have Python 3 (>3.4.1) installed on your system.
115 |
116 | Once you have [organized the output with Croo](#how-to-organize-outputs), you will be able to find the pipeline's final output file `qc/qc.json` which contains all the QC metrics. Simply feed `qc2tsv` with multiple `qc.json` files. It can take various URIs such as local paths, `gs://`, and `s3://`.
117 |
118 | ```bash
119 | $ pip install qc2tsv
120 | $ qc2tsv /sample1/qc.json gs://sample2/qc.json s3://sample3/qc.json ... > spreadsheet.tsv
121 | ```
122 |
123 | QC metrics for each experiment (`qc.json`) will be split into multiple rows (1 for overall experiment + 1 for each bio replicate) in a spreadsheet.
124 |
125 |
126 | TODO:\
127 | Sambamba\
128 | add track generation \
129 |
130 | Thank you to the **ENCODE DAC** for writing excellent documentation for their pipelines that we used as templates.
131 |
--------------------------------------------------------------------------------
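Since the README insists on absolute paths (or `gs://` URIs) in the input JSON, a throw-away check such as the following can catch relative paths before a run is submitted. This helper is not part of the pipeline, and the file name in the usage comment is hypothetical:

```python
# Quick, unofficial sanity check: flag input-JSON string values that look like
# relative local paths. Not part of the pipeline; shown only as an idea.
import json
import sys

def relative_path_like(value):
    """Heuristic: a string with a path separator that is neither absolute nor a URI."""
    return ("/" in value
            and not value.startswith(("/", "gs://", "s3://", "http://", "https://")))

def check_input_json(path):
    with open(path) as fh:
        inputs = json.load(fh)
    for key, value in inputs.items():
        if isinstance(value, str) and relative_path_like(value):
            print(f"WARNING: {key} looks like a relative path: {value}")

if __name__ == "__main__":
    check_input_json(sys.argv[1])   # e.g. python check_inputs.py my_input.json
```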
/src/python/joint_cell_plotting.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | This script QCs barcodes via ATAC frags & TSS and RNA UMIs & genes,
5 | and plots all barcodes colored by joint QC status. It also generates the
6 | same plot with transparency added to show density.
7 | """
8 |
9 | import argparse
10 | import logging
11 | import numpy as np
12 | import pandas as pd
13 | from plotnine import *
14 |
15 | def parse_arguments():
16 | parser = argparse.ArgumentParser(description="Plot barcodes by RNA and ATAC QC status")
17 | parser.add_argument("rna_metrics_file", help="Filename for RNA metrics tsv file")
18 | parser.add_argument("atac_metrics_file", help="Filename for ATAC metrics tsv file")
19 | parser.add_argument("remove_low_yielding_cells", type=int, help="Minimum number of UMIs/fragments required for a cell to be plotted")
20 | parser.add_argument("min_umis", type=int, help="Cutoff for minimum number of UMIs")
21 | parser.add_argument("min_genes", type=int, help="Cutoff for minimum number of genes")
22 | parser.add_argument("min_tss", type=int, help="Cutoff for minimum TSS score")
23 | parser.add_argument("min_frags", type=int, help="Cutoff for minimum number of ATAC fragments")
24 | parser.add_argument("plot_file", help="Filename for plot png file")
25 | parser.add_argument("barcode_metadata_file", help="Filename for barcode metadata csv file")
26 | parser.add_argument("pkr", help="PKR name", nargs='?', default="")
27 |
28 | return parser.parse_args()
29 |
30 | def get_split_lines(file_name, delimiter, skip_header):
31 | with open(file_name, "r") as f:
32 | if skip_header:
33 | next(f)
34 | for line in f:
35 | yield line.rstrip().split(sep=delimiter)
36 |
37 | def merge_dicts(dict_1, dict_2):
38 | """Merge dictionaries by key; combine values into quadruple, fill with 0s if key not in both dicts"""
39 | keys = set(dict_1.keys() | dict_2.keys())
40 | merged = {k: (dict_1.get(k, (0,0)) + dict_2.get(k, (0,0))) for k in keys}
41 |
42 | return(merged)
43 |
44 | def get_metrics(rna_metrics_file, atac_metrics_file, remove_low_yielding_cells):
45 | """Read files and aggregate metrics into Pandas dataframe"""
46 | rna_metrics_contents = get_split_lines(rna_metrics_file, delimiter="\t", skip_header=True)
47 | umis = []
48 | genes = []
49 | rna_barcodes = []
50 |     # remove cells with fewer than remove_low_yielding_cells UMIs
51 | for line in rna_metrics_contents:
52 | if int(line[3]) >= remove_low_yielding_cells:
53 | umis.append(int(line[3]))
54 | genes.append(int(line[4]))
55 | rna_barcodes.append(line[0])
56 | rna_metrics = dict(zip(rna_barcodes, zip(umis, genes)))
57 |
58 | atac_metrics_contents = get_split_lines(atac_metrics_file, delimiter="\t", skip_header=True)
59 | tss = []
60 | frags = []
61 | atac_barcodes = []
62 |     # remove cells with fewer than remove_low_yielding_cells fragments
63 | for line in atac_metrics_contents:
64 | if int(line[6])/2 >= remove_low_yielding_cells:
65 | tss.append(float(line[4]))
66 | frags.append(int(line[6])/2)
67 | atac_barcodes.append(line[0])
68 | atac_metrics = dict(zip(atac_barcodes, zip(tss, frags)))
69 |
70 | # merge metrics by barcodes
71 | metrics = merge_dicts(rna_metrics, atac_metrics)
72 | df = pd.DataFrame.from_dict(metrics, orient="index", columns=["umis","genes","tss","frags"])
73 |
74 | return(df)
75 |
76 | def qc_cells(df, min_umis, min_genes, min_tss, min_frags):
77 | pass_umis = df["umis"] >= min_umis
78 | pass_genes = df["genes"] >= min_genes
79 | pass_tss = df["tss"] >= min_tss
80 | pass_frags = df["frags"] >= min_frags
81 |
82 | # add df column with QC outcome
83 | qc_conditions = [(pass_umis & pass_genes & pass_tss & pass_frags),
84 | (pass_umis & pass_genes),
85 | (pass_tss & pass_frags),
86 | (~(pass_umis & pass_genes) & (~(pass_tss & pass_frags)))]
87 | qc_choices = ["both", "RNA only", "ATAC only", "neither"]
88 | df["QC"] = np.select(qc_conditions, qc_choices)
89 |
90 | # get counts of each outcome type (used in plot legend)
91 | outcome_counts = df["QC"].value_counts()
92 |
93 | df["QC_count"] = [f"{outcome} ({outcome_counts[outcome]})" for outcome in df["QC"]]
94 |
95 | return(df)
96 |
97 | def round_to_power_10(x):
98 | return(10**np.ceil(np.log10(x)))
99 |
100 | def label_func(breaks):
101 | return [int(x) for x in breaks]
102 |
103 | def plot_cells(df, pkr, min_umis, min_genes, min_tss, min_frags, plot_file):
104 | # get max x and y coords to set plot limits
105 | max_x = max(df["frags"])
106 | max_y = max(df["umis"])
107 | xy_lim = round_to_power_10(max(max_x, max_y))
108 |
109 | plot = (ggplot(df, aes("frags", "umis", color="QC_count"))
110 | + geom_point(size=0.5)
111 | + labs(title = f"Joint Cell Calling ({pkr})",
112 | caption = f"ATAC cutoffs: TSS ≥ {min_tss}, frags ≥ {min_frags}. RNA cutoffs: UMIs ≥ {min_umis}, genes ≥ {min_genes}",
113 | x = "ATAC Unique Fragments per Barcode",
114 | y = "RNA UMIs per Barcode",
115 | color = "QC")
116 | + theme_light()
117 | + theme(figure_size = (8,6),
118 | title = element_text(size=12),
119 | axis_title = element_text(size=10),
120 | axis_text = element_text(size=8),
121 | legend_box_margin = 0,
122 | legend_title = element_text(size=8),
123 | legend_text = element_text(size=6),
124 | legend_key = element_blank(),
125 | plot_caption=element_text(size=8, ha="center", margin={"r": 3.2, "t": -0.2, "units": "in"}),
126 | panel_grid_minor = element_blank())
127 | + scale_x_log10(limits=(10,xy_lim), labels=label_func)
128 | + scale_y_log10(limits=(10,xy_lim), labels=label_func)
129 | )
130 |
131 | plot.save(filename=plot_file, dpi=1000)
132 |
133 | def main():
134 | # create log file
135 | logging.basicConfig(filename="joint_cell_plotting.log", level=logging.INFO)
136 |
137 | # get arguments
138 | args = parse_arguments()
139 | pkr = getattr(args, "pkr")
140 | rna_metrics_file = getattr(args, "rna_metrics_file")
141 | atac_metrics_file = getattr(args, "atac_metrics_file")
142 | remove_low_yielding_cells = getattr(args, "remove_low_yielding_cells")
143 | barcode_metadata_file = getattr(args, "barcode_metadata_file")
144 | min_umis = getattr(args, "min_umis")
145 | min_genes = getattr(args, "min_genes")
146 | min_tss = getattr(args, "min_tss")
147 | min_frags = getattr(args, "min_frags")
148 | plot_file = getattr(args, "plot_file")
149 |
150 | # read rna and atac files, get cell metrics
151 | logging.info("Getting metrics\n")
152 | metrics_df = get_metrics(rna_metrics_file, atac_metrics_file, remove_low_yielding_cells)
153 |
154 | # QC cells based on inputted cutoffs
155 | logging.info("QCing cells\n")
156 | metrics_df = qc_cells(metrics_df, min_umis, min_genes, min_tss, min_frags)
157 |
158 | # generate plot
159 | logging.info("Generating joint cell calling plot\n")
160 | plot_cells(metrics_df, pkr, min_umis, min_genes, min_tss, min_frags, plot_file)
161 |
162 | # save dataframe
163 | logging.info("Saving dataframe as csv\n")
164 | metrics_df.to_csv(barcode_metadata_file)
165 | logging.info("All done!")
166 |
167 |
168 | if __name__ == "__main__":
169 | main()
170 |
171 |
--------------------------------------------------------------------------------
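The joint plot hinges on two small steps in the script above: `merge_dicts` concatenates the per-modality tuples for each barcode, padding with zeros when a barcode was seen by only one assay, and `qc_cells` turns the four cutoffs into a categorical label. A toy run-through with invented barcodes and the default cutoffs (this sketch uses `np.select` with a default instead of the script's explicit fourth condition):

```python
# Illustrative only: exercises the merge-and-label logic from joint_cell_plotting.py
# on invented barcodes and cutoffs.
import numpy as np
import pandas as pd

def merge_dicts(dict_1, dict_2):
    keys = dict_1.keys() | dict_2.keys()
    return {k: dict_1.get(k, (0, 0)) + dict_2.get(k, (0, 0)) for k in keys}

rna  = {"AAAC": (500, 300), "GGTT": (40, 25)}   # barcode -> (umis, genes)
atac = {"AAAC": (8.2, 1500)}                    # barcode -> (tss, frags); GGTT has no ATAC signal

df = pd.DataFrame.from_dict(merge_dicts(rna, atac), orient="index",
                            columns=["umis", "genes", "tss", "frags"])

min_umis, min_genes, min_tss, min_frags = 100, 200, 4, 100
pass_rna  = (df["umis"] >= min_umis) & (df["genes"] >= min_genes)
pass_atac = (df["tss"] >= min_tss) & (df["frags"] >= min_frags)
df["QC"] = np.select([pass_rna & pass_atac, pass_rna, pass_atac],
                     ["both", "RNA only", "ATAC only"], default="neither")
print(df)
# AAAC passes both modalities; GGTT has no ATAC signal and too few UMIs/genes -> "neither"
```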