├── .gitattributes ├── .gitignore ├── .gitmodules ├── .test ├── NexteraPE-PE.fa ├── config.yaml ├── gtf_biotypes.yaml └── samples.csv ├── .travis.yml ├── LICENSE.txt ├── README.md ├── Snakefile ├── docs ├── .gitignore ├── Snakefile ├── docs │ ├── CHANGELOG.md │ ├── Clusters.md │ ├── Create-config-files.md │ ├── FAQ.md │ ├── Installation.md │ ├── Plots.md │ ├── Reference-Files.md │ ├── Running-dropSeqPipe.md │ ├── images │ │ ├── adapter_content.png │ │ ├── hum_mus_species_plot_transcripts.png │ │ ├── mac_Count_vs_gene.png │ │ ├── mac_UMI_vs_counts.png │ │ ├── mac_UMI_vs_gene.png │ │ ├── mac_violinplots_comparison_UMI.png │ │ ├── sample1_knee_plot.png │ │ ├── sample1_rna_metrics.png │ │ └── yield.png │ └── index.md ├── mkdocs.yml └── mkdocs_env.yml ├── envs ├── bbmap.yaml ├── cutadapt.yaml ├── dropseq_tools.yaml ├── merge.yaml ├── merge_bam.yaml ├── merge_long.yaml ├── picard.yaml ├── pigz.yaml ├── r.yaml ├── samtools.yaml ├── star.yaml ├── umi_tools.yaml └── velocyto.yaml ├── rules ├── cell_barcodes.smk ├── download_meta_mixed.smk ├── download_meta_single.smk ├── extract_expression_single.smk ├── extract_expression_species.smk ├── fastqc.smk ├── filter.smk ├── generate_meta.smk ├── map.smk ├── merge.smk ├── prepare.smk ├── report.smk └── split_species.smk ├── schemas ├── config.schema.yaml └── samples.schema.yaml ├── scripts ├── clean_cutadapt.py ├── convert_mtx.py ├── create_summary_stats.R ├── detect_barcodes.py ├── fa2tsv.py ├── generate_extended_ref.py ├── merge_bam.py ├── plot_adapter_content.R ├── plot_knee_plot.R ├── plot_rna_metrics.R ├── plot_species_plot.R ├── plot_violine.R ├── plot_yield.R ├── publication_text.Rmd ├── repair_barcodes.py └── umi_tools_extended_ref.py └── templates ├── NexteraPE-PE.fa ├── TruSeq2-PE.fa ├── TruSeq2-SE.fa ├── TruSeq3-PE-2.fa ├── TruSeq3-PE.fa ├── TruSeq3-SE.fa ├── cluster.yaml ├── config.yaml ├── config_nadia.yaml ├── custom_adapters.fa ├── gtf_biotypes.yaml └── samples.csv /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto !eol 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .snakemake 2 | scripts/__pycache__* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule ".test/data"] 2 | path = .test/data 3 | url = https://github.com/Hoohm/scngs-test-data.git 4 | -------------------------------------------------------------------------------- /.test/NexteraPE-PE.fa: -------------------------------------------------------------------------------- 1 | >PrefixNX/1 2 | AGATGTGTATAAGAGACAG 3 | >PrefixNX/2 4 | AGATGTGTATAAGAGACAG 5 | >Trans1 6 | TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG 7 | >Trans1_rc 8 | CTGTCTCTTATACACATCTGACGCTGCCGACGA 9 | >Trans2 10 | GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG 11 | >Trans2_rc 12 | CTGTCTCTTATACACATCTCCGAGCCCACGAGAC -------------------------------------------------------------------------------- /.test/config.yaml: -------------------------------------------------------------------------------- 1 | CONTACT: 2 | email: user.name@provider.com 3 | person: John Doe 4 | LOCAL: 5 | temp-directory: /tmp 6 | memory: 4g 7 | raw_data: data 8 | results: results 9 | META: 10 | species: 11 | mus_musculus: 12 | build: 38 13 | release: 91 14 | ratio: 0.2 15 | reference-directory: data/ref 16 | gtf_biotypes: gtf_biotypes.yaml 
17 | 18 | FILTER: 19 | barcode-whitelist: '' 20 | 5-prime-smart-adapter: CCTACACGACGCTCTTCCGATCT 21 | cell-barcode: 22 | start: 2 23 | end: 6 24 | UMI-barcode: 25 | start: 7 26 | end: 16 27 | cutadapt: 28 | adapters-file: NexteraPE-PE.fa 29 | R1: 30 | quality-filter: 20 31 | maximum-Ns: 0 32 | extra-params: '' 33 | R2: 34 | quality-filter: 20 35 | minimum-adapters-overlap: 6 36 | minimum-length: 15 37 | extra-params: '' 38 | MAPPING: 39 | STAR: 40 | genomeChrBinNbits: 18 41 | outFilterMismatchNmax: 10 42 | outFilterMismatchNoverLmax: 0.3 43 | outFilterMismatchNoverReadLmax: 1 44 | outFilterMatchNmin: 0 45 | outFilterMatchNminOverLread: 0.66 46 | outFilterScoreMinOverLread: 0.66 47 | EXTRACTION: 48 | LOCUS: 49 | - CODING 50 | - UTR 51 | strand-strategy: SENSE 52 | UMI-edit-distance: 1 53 | minimum-counts-per-UMI: 0 54 | DEBUG: True -------------------------------------------------------------------------------- /.test/gtf_biotypes.yaml: -------------------------------------------------------------------------------- 1 | biotypes: 2 | - 3prime_overlapping_ncRNA 3 | - antisense 4 | - bidirectional_promoter_lncRNA 5 | - IG_C_gene 6 | - IG_C_pseudogene 7 | - IG_D_gene 8 | - IG_J_gene 9 | - IG_J_pseudogene 10 | - IG_pseudogene 11 | - IG_V_gene 12 | - IG_V_pseudogene 13 | - lincRNA 14 | - macro_lncRNA 15 | - miRNA 16 | - misc_RNA 17 | - Mt_rRNA 18 | - Mt_tRNA 19 | - non_coding 20 | - polymorphic_pseudogene 21 | - processed_pseudogene 22 | - processed_transcript 23 | - protein_coding 24 | - pseudogene 25 | - ribozyme 26 | - rRNA 27 | - scaRNA 28 | - scRNA 29 | - sense_intronic 30 | - sense_overlapping 31 | - snoRNA 32 | - snRNA 33 | - sRNA 34 | - TEC 35 | - transcribed_processed_pseudogene 36 | - transcribed_unitary_pseudogene 37 | - transcribed_unprocessed_pseudogene 38 | - translated_processed_pseudogene 39 | - TR_C_gene 40 | - TR_D_gene 41 | - TR_J_gene 42 | - TR_J_pseudogene 43 | - TR_V_gene 44 | - TR_V_pseudogene 45 | - unitary_pseudogene 46 | - unprocessed_pseudogene 47 | - vaultRNA 48 | -------------------------------------------------------------------------------- /.test/samples.csv: -------------------------------------------------------------------------------- 1 | samples,expected_cells,read_length,batch 2 | sample1,100,75,Batch1 3 | sample2,100,75,Batch2 -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | 5 | branches: 6 | only: 7 | - master 8 | - develop 9 | 10 | install: 11 | - sudo apt-get update 12 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 13 | - bash miniconda.sh -b -p $HOME/miniconda 14 | - export PATH="$HOME/miniconda/bin:$PATH" 15 | - hash -r 16 | - conda config --set always_yes yes --set changeps1 no 17 | - conda update -q conda 18 | # Useful for debugging any issues with conda 19 | - conda info -a 20 | - conda config --add channels defaults 21 | - conda config --add channels conda-forge 22 | - conda config --add channels bioconda 23 | - conda install -c bioconda -c conda-forge snakemake 24 | - conda create -q -n snakemake snakemake>=5.3.1 python=$TRAVIS_PYTHON_VERSION 25 | script: 26 | # run the workflow 27 | - snakemake --use-conda --directory .test -p 28 | 29 | after_success: 30 | - cd docs && snakemake --use-conda build_docs 31 | 32 | deploy: 33 | provider: pages 34 | skip-cleanup: true 35 | github-token: $GITHUB_PAT 36 | keep-history: true 37 | 
local-dir: docs/site 38 | on: 39 | branch: master 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Snakemake](https://img.shields.io/badge/snakemake-≥4.1.0-brightgreen.svg)](https://snakemake.bitbucket.io) 2 | [![Build Status](https://travis-ci.org/Hoohm/dropSeqPipe.svg?branch=master)](https://travis-ci.org/Hoohm/dropSeqPipe) 3 | 4 | Description 5 | ------------------ 6 | This pipeline is based on [snakemake](https://snakemake.readthedocs.io/en/stable/) and the dropseq tools provided by the [McCarroll Lab](http://mccarrolllab.com/dropseq/). It allows to go from raw data of your Single Cell RNA seq experiment until the final count matrix with QC plots along the way. 7 | 8 | This is the tool we use in our lab to improve our wetlab protocol as well as provide an easy framework to reproduce and compare different experiments with different parameters. 9 | 10 | It uses STAR to map the reads. It is usable for any single cell protocol using two reads where the first one holds the Cell and UMI barcodes and the second read holds the RNA. Here is a non-exhausitve list of compatible protocols/brands: 11 | 12 | * Drop-Seq 13 | * SCRB-Seq 14 | * 10x Genomics 15 | * DroNc-seq 16 | * Dolomite Bio ([Nadia Instrument](https://www.dolomite-bio.com/product/nadia-instrument/)) 17 | 18 | This package is trying to be as user friendly as possible. One of the hopes is that non-bioinformatician can make use of it without too much hassle. It will still require some command line execution, this is not going to be fully interactive package. 19 | 20 | 21 | ## Authors 22 | 23 | * Patrick Roelli ([@Hoohm)](https://github.com/Hoohm)) 24 | * Sebastian Mueller ([@seb-mueller)](https://github.com/seb-mueller)) 25 | * Charles Girardot ([@cgirardot)](https://github.com/cgirardot)) 26 | 27 | ## Usage 28 | 29 | ### Step 1: Install workflow 30 | 31 | If you simply want to use this workflow, download and extract the [latest release](https://github.com/Hoohm/dropSeqPipe/releases). 32 | If you intend to modify and further develop this workflow, fork this reposity. Please consider providing any generally applicable modifications via a pull request. 33 | 34 | In any case, if you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this repository and, once available, its DOI. 35 | 36 | ### Step 2: Configure workflow 37 | 38 | Configure the workflow according to your needs via editing the file `config.yaml` and the `samples.tsv` following those [instructions](https://github.com/Hoohm/dropSeqPipe/wiki/Create-config-files) 39 | 40 | ### Step 3: Execute workflow 41 | 42 | All you need to execute this workflow is to install Snakemake via the [Conda package manager](http://snakemake.readthedocs.io/en/stable/getting_started/installation.html#installation-via-conda). Software needed by this workflow is automatically deployed into isolated environments by Snakemake. 43 | 44 | Test your configuration by performing a dry-run via 45 | 46 | snakemake --use-conda -n --directory $WORKING_DIR 47 | 48 | Execute the workflow locally via 49 | 50 | snakemake --use-conda --cores $N --directory $WORKING_DIR 51 | 52 | using `$N` cores on the `$WORKING_DIR`. Alternatively, it can be run in cluster or cloud environments (see [the docs](http://snakemake.readthedocs.io/en/stable/executable.html) for details). 
53 | 54 | If you not only want to fix the software stack but also the underlying OS, use 55 | 56 | snakemake --use-conda --use-singularity 57 | 58 | in combination with any of the modes above. 59 | 60 | ### Step 4: Investigate results 61 | 62 | After successful execution, you can create a self-contained report with all results via: 63 | 64 | snakemake --report report.html 65 | 66 | 67 | Documentation 68 | ------------------ 69 | You can find the documentation [here](https://hoohm.github.io/dropSeqPipe/) 70 | 71 | Future implementations 72 | --------------------------- 73 | I'm actively seeking help to implement the points listed bellow. Don't hesitate to contact me if you wish to contribute. 74 | 75 | * Create a sharing platform where quality plots/logs can be discussed and troubleshooted. 76 | * Create a full html report for the whole pipeline 77 | * Multiqc module for drop-seq-tools 78 | * Implement an elegant "preview" mode where the pipeline would only run on a couple of millions of reads and allow you to have an approximated view before running all of the data. This would dramatically reduce the time needed to get an idea of what filters whould be used. 79 | * 80 | 81 | I hope it can help you out in your single cell experiments! 82 | 83 | Feel free to comment and point out potential improvements via [issues](https://github.com/Hoohm/dropSeqPipe/issues) 84 | 85 | 86 | TODO 87 | --------------------------------------------- 88 | * Add a mixed reference reference for testing purposes 89 | * Finalize the parameters validation schema 90 | * Make the debug feature a bit "cleaner". Deal with automatic naming of the debug variables 91 | * Implement ddseq barcoding strategies 92 | -------------------------------------------------------------------------------- /Snakefile: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import re 4 | import glob 5 | from snakemake.utils import validate, min_version 6 | 7 | singularity: 8 | "shub://seb-mueller/singularity_dropSeqPipe:v04" 9 | 10 | min_version("5.1.2") 11 | 12 | #print(os.path.abspath(os.path.dirname(workflow.snakefile))) 13 | 14 | # Load configuration files 15 | 16 | try: 17 | configfile_path = config['configfile_path'] 18 | except: 19 | configfile_path = "config.yaml" 20 | configfile: configfile_path 21 | 22 | 23 | #Include the gtf biotypes yaml 24 | configfile: config['META']['gtf_biotypes'] 25 | 26 | # Define a few variables to make them easier to reference 27 | snakefile_root_path = os.path.abspath(os.path.dirname(workflow.snakefile)) 28 | ref_path = config['META']['reference-directory'] 29 | barcode_whitelist = config['FILTER']['barcode-whitelist'] 30 | results_dir = config['LOCAL']['results'] 31 | raw_data_dir = config['LOCAL']['raw_data'] 32 | 33 | # dropSeqPipe version 34 | config['version'] = '0.5' 35 | validate(config, schema=os.path.join(snakefile_root_path,"schemas","config.schema.yaml")) 36 | 37 | 38 | # In order to deal with single species or mixed species experiment 39 | # we define the same variables for each case. 
40 | 41 | 42 | #Define variables for mixed species experiments 43 | if len(config['META']['species'].keys()) == 2: 44 | print('Running the pipeline for a mixed experiment') 45 | species_list = list(config['META']['species']) 46 | build_list = [ 47 | config['META']['species'][species_list[0]]['build'], 48 | config['META']['species'][species_list[1]]['build']] 49 | release_list = [ 50 | config['META']['species'][species_list[0]]['release'], 51 | config['META']['species'][species_list[1]]['release']] 52 | 53 | for species in config['META']['species']: 54 | release = '{}.{}'.format( 55 | config['META']['species'][species_list[0]]['release'], 56 | config['META']['species'][species_list[1]]['release']) 57 | build = '{}.{}'.format( 58 | config['META']['species'][species_list[0]]['build'], 59 | config['META']['species'][species_list[1]]['build']) 60 | species = 'mixed_{}_{}'.format( 61 | species_list[0], 62 | species_list[1]) 63 | 64 | #Define variables for single species experiments 65 | elif len(config['META']['species'].keys()) == 1: 66 | species_list=list(config['META']['species']) 67 | species=species_list[0] 68 | release_list = [config['META']['species'][species]['release']] 69 | release=release_list[0] 70 | build_list = [config['META']['species'][species]['build']] 71 | build=build_list[0] 72 | else: 73 | exit("Number of species in the config.yaml must be one or two. Exiting") 74 | 75 | # Get sample names from samples.csv 76 | samples = pd.read_table("samples.csv", sep=',').set_index("samples", drop=False) 77 | validate(samples, schema=os.path.join(snakefile_root_path,"schemas","samples.schema.yaml")) 78 | types=['read','umi'] 79 | # Get read_lengths from samples.csv 80 | read_lengths = list(samples.loc[:,'read_length']) 81 | 82 | wildcard_constraints: 83 | sample="({})".format("|".join(samples.index)), 84 | type="({})".format("|".join(types)) 85 | 86 | 87 | # Flexible ways to get the R1 and R2 files 88 | def get_R1_files(wildcards): 89 | samples = [f for f in glob.glob("{}/*.fastq.gz".format(raw_data_dir)) if (re.search('R1', re.sub(wildcards.sample,'',f)) and re.search(wildcards.sample,f))] 90 | if len(samples)>1 & isinstance(samples,list): 91 | exit('Multiple read files for one sample. Please check file names or run snakemake -s rules/prepare.smk for multilane samples first.') 92 | if samples == []: 93 | exit('\tNo sample files found in the {}/ directory.\n\t\tPlease check that the path for the raw data is set properly in config.yaml'.format(raw_data_dir)) 94 | return(samples) 95 | 96 | def get_R2_files(wildcards): 97 | samples = [f for f in glob.glob("{}/*.fastq.gz".format(raw_data_dir)) if (re.search('R2', re.sub(wildcards.sample,'',f)) and re.search(wildcards.sample,f))] 98 | if len(samples)>1 & isinstance(samples,list): 99 | exit('Multiple read files for one sample. 
Please check file names or run snakemake -s rules/prepare.smk for multilane samples first.') 100 | if samples == []: 101 | exit('\tNo sample files found in the {} directory.\n\t\tPlease check that the path for the raw data is set properly in config.yaml'.format(raw_data_dir)) 102 | return(samples) 103 | 104 | 105 | if len(config['META']['species'].keys()) == 2: 106 | rule all: 107 | input: 108 | expand( 109 | ['{ref_path}/{species}_{build}_{release}/STAR_INDEX/SA_{read_length}/SA', 110 | #qc 111 | '{results_dir}/reports/fastqc_reads.html', 112 | '{results_dir}/reports/fastqc_barcodes.html', 113 | #fastqc_adapter 114 | 'fastqc_adapter.tsv', 115 | #filter 116 | '{results_dir}/plots/adapter_content.pdf', 117 | '{results_dir}/reports/barcode_filtering.html', 118 | '{results_dir}/reports/RNA_filtering.html', 119 | '{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz', 120 | '{results_dir}/samples/{sample}/top_barcodes.csv', 121 | #mapping 122 | '{results_dir}/plots/knee_plots/{sample}_knee_plot.pdf', 123 | '{results_dir}/reports/star.html', 124 | '{results_dir}/plots/yield.pdf', 125 | '{results_dir}/samples/{sample}/Unmapped.out.mate1.gz', 126 | #splitting 127 | '{results_dir}/plots/barnyard/{sample}_genes.pdf', 128 | '{results_dir}/plots/barnyard/{sample}_transcripts.pdf'], 129 | read_length=read_lengths, 130 | sample=samples.index, 131 | type=types, 132 | results_dir=results_dir, 133 | ref_path=config['META']['reference-directory'], 134 | build=build, 135 | release=release, 136 | species=species), 137 | expand( 138 | ['{results_dir}/samples/{sample}/{species}/umi/matrix.mtx', 139 | '{results_dir}/samples/{sample}/{species}/read/matrix.mtx', 140 | '{results_dir}/plots/rna_metrics/{sample}_{species}_rna_metrics.pdf'], 141 | results_dir=results_dir, 142 | sample=samples.index, 143 | species=species_list) 144 | 145 | elif len(config['META']['species'].keys()) == 1: 146 | rule all: 147 | input: 148 | #meta 149 | expand( 150 | ['{ref_path}/{species}_{build}_{release}/STAR_INDEX/SA_{read_length}/SA', 151 | #qc 152 | '{results_dir}/reports/fastqc_reads.html', 153 | '{results_dir}/reports/fastqc_barcodes.html', 154 | #filter 155 | '{results_dir}/plots/adapter_content.pdf', 156 | '{results_dir}/reports/barcode_filtering.html', 157 | '{results_dir}/reports/RNA_filtering.html', 158 | #mapping 159 | '{results_dir}/plots/knee_plots/{sample}_knee_plot.pdf', 160 | '{results_dir}/reports/star.html', 161 | '{results_dir}/plots/yield.pdf', 162 | '{results_dir}/samples/{sample}/Unmapped.out.mate1.gz', 163 | #extract 164 | '{results_dir}/plots/rna_metrics/{sample}_rna_metrics.pdf', 165 | '{results_dir}/summary/{type}/matrix.mtx', 166 | '{results_dir}/samples/{sample}/{type}/matrix.mtx', 167 | #merge 168 | '{results_dir}/plots/UMI_vs_counts.pdf', 169 | '{results_dir}/plots/UMI_vs_gene.pdf', 170 | '{results_dir}/plots/Count_vs_gene.pdf', 171 | '{results_dir}/summary/R_Seurat_objects.rdata', 172 | '{results_dir}/summary/barcode_stats_pre_filter.csv', 173 | '{results_dir}/summary/barcode_stats_post_filter.csv', 174 | '{results_dir}/plots/violinplots_comparison_UMI.pdf'], 175 | read_length=read_lengths, 176 | sample=samples.index, 177 | type=types, 178 | results_dir=results_dir, 179 | ref_path=config['META']['reference-directory'], 180 | build=build, 181 | release=release, 182 | species=species) 183 | rule download_meta: 184 | input: 185 | expand( 186 | ["{ref_path}/{species}_{build}_{release}/annotation.gtf", 187 | "{ref_path}/{species}_{build}_{release}/genome.fa"], 188 | 
ref_path=config['META']['reference-directory'], 189 | species=species_list, 190 | release=release, 191 | build=build) 192 | 193 | 194 | rule qc: 195 | input: 196 | expand( 197 | ['{results_dir}/reports/fastqc_reads.html', 198 | '{results_dir}/reports/fastqc_barcodes.html', 199 | 'fastqc_adapter.tsv'], 200 | results_dir=results_dir) 201 | 202 | rule filter: 203 | input: 204 | expand( 205 | ['{results_dir}/plots/adapter_content.pdf', 206 | '{results_dir}/reports/barcode_filtering.html', 207 | '{results_dir}/reports/RNA_filtering.html', 208 | '{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz', 209 | '{results_dir}/samples/{sample}/top_barcodes.csv'], 210 | results_dir=results_dir, 211 | sample=samples.index) 212 | 213 | rule map: 214 | input: 215 | expand( 216 | ['{results_dir}/plots/knee_plots/{sample}_knee_plot.pdf', 217 | '{results_dir}/reports/star.html', 218 | '{results_dir}/plots/yield.pdf', 219 | '{results_dir}/samples/{sample}/final.bam', 220 | '{results_dir}/samples/{sample}/Unmapped.out.mate1.gz'], 221 | sample=samples.index, 222 | results_dir=results_dir) 223 | 224 | rule extract: 225 | input: 226 | expand( 227 | ['{results_dir}/plots/rna_metrics/{sample}_rna_metrics.pdf', 228 | '{results_dir}/summary/{type}/matrix.mtx', 229 | '{results_dir}/samples/{sample}/{type}/matrix.mtx.gz'], 230 | results_dir=results_dir, 231 | sample=samples.index, 232 | type=types) 233 | 234 | rule split_species: 235 | input: 236 | expand( 237 | ['{results_dir}/samples/{sample}/{species}/barcodes.csv', 238 | '{results_dir}/plots/barnyard/{sample}_genes.pdf', 239 | '{results_dir}/plots/barnyard/{sample}_transcripts.pdf', 240 | '{results_dir}/samples/{sample}/{species}/unfiltered.bam'], 241 | sample=samples.index, 242 | species=config['META']['species'], 243 | results_dir=results_dir) 244 | 245 | 246 | rule extract_species: 247 | input: 248 | expand( 249 | ['{results_dir}/samples/{sample}/{species}/{type}/matrix.mtx', 250 | '{results_dir}/plots/rna_metrics/{sample}_{species}_rna_metrics.pdf'], 251 | sample=samples.index, 252 | species=config['META']['species'], 253 | results_dir=results_dir, 254 | type=types) 255 | 256 | rule merge: 257 | input: 258 | #merge 259 | expand( 260 | ['{results_dir}/plots/UMI_vs_counts.pdf', 261 | '{results_dir}/plots/UMI_vs_gene.pdf', 262 | '{results_dir}/plots/Count_vs_gene.pdf', 263 | '{results_dir}/summary/R_Seurat_objects.rdata', 264 | '{results_dir}/summary/barcode_stats_pre_filter.csv', 265 | '{results_dir}/summary/barcode_stats_post_filter.csv', 266 | '{results_dir}/plots/violinplots_comparison_UMI.pdf', 267 | '{results_dir}/summary/{type}/matrix.mtx'], 268 | results_dir=results_dir, 269 | type=types) 270 | 271 | rule make_report: 272 | input: 273 | expand('{results_dir}/reports/publication_text.html', results_dir=results_dir) 274 | 275 | if len(config['META']['species'].keys()) == 2: 276 | include: "rules/download_meta_mixed.smk" 277 | if len(config['META']['species'].keys()) == 1: 278 | include: "rules/download_meta_single.smk" 279 | 280 | include: "rules/generate_meta.smk" 281 | include: "rules/fastqc.smk" 282 | include: "rules/filter.smk" 283 | include: "rules/cell_barcodes.smk" 284 | include: "rules/map.smk" 285 | include: "rules/extract_expression_single.smk" 286 | include: "rules/split_species.smk" 287 | include: "rules/extract_expression_species.smk" 288 | include: "rules/merge.smk" 289 | include: "rules/report.smk" 290 | -------------------------------------------------------------------------------- /docs/.gitignore: 
-------------------------------------------------------------------------------- 1 | site 2 | .snakemake 3 | -------------------------------------------------------------------------------- /docs/Snakefile: -------------------------------------------------------------------------------- 1 | rule build_docs: 2 | """Build docs using mkdocs""" 3 | conda: 4 | "mkdocs_env.yml" 5 | shell: 6 | "mkdocs build" 7 | 8 | rule serve_docs: 9 | """Build docs and run through developement server""" 10 | conda: 11 | "mkdocs_env.yml" 12 | shell: 13 | "mkdocs serve" 14 | 15 | -------------------------------------------------------------------------------- /docs/docs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](http://keepachangelog.com/) 5 | and this project adheres to [Semantic Versioning](http://semver.org/). 6 | 7 | 8 | ## [0.5] 9 | ### Added 10 | - Singularity usage. Try out the `--use-singularity` option instead of `--use-conda` 11 | 12 | ### Changed 13 | - Lots off small bugfixes 14 | 15 | 16 | ## [0.4.1] 17 | ### Added 18 | - samples.csv and config.yaml schema validation. This will help users fix missing values. 19 | - DetectBeadSubstitutionErrors was added in the mapping steps. 20 | 21 | ### Changed 22 | - Minimum read length after trimming is now the index of the end of the UMI 23 | - dropSeqPipe can now run with a docker image if you use the `--use-singularity` option. This should help people with package issues and different linux setups. You need to have installed singularity system wide to use this option. 24 | 25 | 26 | ## [0.4] - 2018-12-19 27 | ### Added 28 | - Top barcode detection using [umi-tools](https://github.com/CGATOxford/UMI-tools) based on number of expected cells. 29 | - Genome reference and annotation automatically downloaded now base on build and release number from configuration file. 30 | - On the fly detection of mixed experiment. 31 | - **beta**: Generation of a report for publication describing tools used in each steps. run `make_report` after the preprocessing is done to get `reports/publication_text.html`. This is a really early stage. Feel free to suggest PR for text modifications. 32 | - Raw data, results, reference are now independent from the working dir and can be chosen via the configuration file. 33 | - dropseq_tools v2.0 implemented. This opens up new options such as choosing which locus to use for gene counting. See configuration file. 34 | - Possibility to edit which biotypes are selected from the annotation file via a gtf_biotypes.yaml file provided. 35 | - Cell barcodes are now corrected. One hamming distance for known/given whitelists, graphbased correction based on umi-tools for unknown lists. Those corrections are written in the bam files. This makes final bam files compatible for other tools using the XC/XM bam TAGS. 36 | - UMI are now also corrected based on dropseq_tools v2.0. 37 | - Possibility to choose SENSE, ANTISENSE or BOTH for read counting. 38 | - Adapter content for R1 and R2 have now their own plot, `adapter_content.pdf`. 39 | - New plot called `yield.pdf` makes a summary of total reads and how they are distributed among filtered, trimmed, mapped, etc. 40 | - Configuration file has now a CONTACT section providing a field for a person and a contact e-mail address. 41 | 42 | ### Changed 43 | - Expression matrices output are now sparse (mtx format). 
This will decrease the size of the output and loading time for downstream analysis. 44 | - Logfiles, plots and samples output are now grouped together in folders by category. This should make browsing results easier. 45 | - Fixed most of the packages versions. 46 | - Summary plots and Seurat object are now in the `all` rule and will be created by default. 47 | 48 | ### Removed 49 | - Merging of species expression accross samples. Since the mixed experiments are mostly used to test out the doublet rate of a platform and not for downstream analysis, this last part has not been updated. Single expression matrices are still there. 50 | - Cell barcodes dropped, umi barcodes dropped, starttrim and polyA trim plots are now gone. BC_drop is also removed. Replacements are adapter_content and yield plots. 51 | - Quality trimming via dropseq_tools has been removed and is now down by cutadapt. Those modifications decrease the running time of the pipeline. 52 | 53 | 54 | ## [0.32] 55 | ### Added 56 | - Documentation generated from the markdown files directly on travis-ci. 57 | 58 | 59 | ## [0.31a] 60 | ### Changed 61 | - fix on species plot. 62 | - fix on rule STAR_align adding now unmapped read to a fastq file. 63 | 64 | ### Added 65 | - Added travis integration. The pipeline is now automatically getting tested when updated and when pull requests are proposed. 66 | - There is now a small git submodule in .test which will provide a sampled file for testing the pipeline on travis-ci. 67 | 68 | ### Removed 69 | - `environment.yaml` has been removed. Youjust have to install snakemake now instead of activating the env. 70 | 71 | ## [0.31] 72 | ### Changed 73 | - Fixed error for STAR index generation. It crashed saying it couldn't write in folder. 74 | - Fixed a missing plot for plot_knee_plot_whitelist. 75 | - Input files for the STAR_align rule have been changed. Adding samples in an already aligned experiment with a different R2 length, will only align the new data and not realign the old one. 76 | - Split reads and barcodes multiqc reports for qc step. 77 | - Modified a few rules to follow the guidelines for [snakemake workflows](https://github.com/snakemake-workflows/docs) 78 | - Fixed an issue where snakemake would crash on clusters if using `expand()` on fixed variables such as `annotation_prefix`. Now using normal python formatting. 79 | - Changed the config.yaml parameters names to lowercase and hyphens! Software specific variables have their original style making it easier to search in manuals. You will have to either copy the new config.yaml from the templates or modify your own accordingly. 80 | - cell-barcode-edit-distance changed to what it actually is, UMI-edit-distance. 81 | - Updated all the envs to fix bugs. 82 | - Fixed a bug where the mixed species would not run properly. 83 | 84 | ### Added 85 | - Added ggpubr in environment.yaml file. 86 | - Added a `templates` folder which will hold `config.yaml`, `samples.csv`, `cluster.yaml` as well as adapters files. This will also help cloning the repository without overwritting your own config.yaml file when updating the pipeline. 87 | - Added the possibility of using your own adapters fasta file for trimmomatic. To use it, please refer to the [WIKI](https://github.com/Hoohm/dropSeqPipe/wiki/Create-config-files#filter) 88 | - Added fastqc, multiqc, STAR wrappers. You have now to use the `--use-conda` option to run the pipeline. 89 | - Added cluster recommendations on the wiki. 90 | - Added Localrules for certain rules. 
This allows to run low ressource rules on the host computer instead of nodes when using clusters. 91 | - genomeChrBinNbits will be calculated automacially for STAR. 92 | - Exposed all variables for trimmomatic in config.yaml under trimming. 93 | 94 | ### Removed 95 | - png plots have been removed. It was causing some issues on clusters with cairo. Usability is more important than png plots to me. 96 | 97 | 98 | ## [0.3] 99 | ### Changed 100 | - Complete overhaul of how the pipeline is organized to follow the structure proposed for snakemake-workflows. This will allow ease of deployement on any platform having conda installed. It will also help to run on clusters. 101 | - The way to call the pipeline is now simplified. Changes are shown in the [WIKI](https://github.com/Hoohm/dropSeqPipe/wiki/) 102 | - Dependency to Drop-seq-tools updated from version 1.12 to 1.13 103 | - Full compatibility with barcode whitelist. Makes it easier to use for SCRBseq protocols or whitelist from other source (UMI-tools). 104 | - Modified cell and UMI drop plots in order to reflect the option chosen. See [plots](https://github.com/Hoohm/dropSeqPipe/wiki/Plots) 105 | 106 | ### Removed 107 | - Bulk sequencing compatiblity. 108 | - Fastqc and STAR logs plots are removed and replaced by multiqc. 109 | - Automatic determination of STAMPS via knee_plot. Please use an estimated number of cells as the main threshold and filter in downstream analysis for other parameters such as high number of mitochondrial genes. 110 | - `MinCellFraction` entry in config.yaml. This parameter wasn't adding much value and was confusing. 111 | - Base frequency plot has been removed. This will come back with autodetermination of the STAMPS. 112 | 113 | ### Added 114 | - Wrapper for Drop-seq tools. Makes it easier to switch temp folder and choose maximum memory heap. 115 | - More parameters for STAR exposed. See [WIKI](https://github.com/Hoohm/dropSeqPipe/wiki/) 116 | 117 | ## [0.24] 118 | ### Changed 119 | - All the QCplots are now generated inside the snakefiles. No more `generate-plots` mode. 120 | 121 | 122 | ## [0.23a] 123 | ### Changed 124 | - Will now allow you to run `generate-meta` without having a `config.yaml` file in the reference foder. 125 | - Changed the code for Cell and UMI barcode quality drop (per sample and overall). There was an error in the code not givint the right amount of dropped reads. Updated the images on the wiki accordingly. 126 | - Fixed the setup where r2py was called before getting installed. 127 | - Big change in the mapping. From now on the STAR index will be done without a GTF file. This allows to change the overhang option on the fly for each sample based on the mean read length. This also opens up 2-pass mapping. You will have to regenerate your index for it to work. 128 | - Changed `generate_meta` in order to fit the new STAR index without a GTF. You now have to give the path to the GTF file in the config.yaml 129 | 130 | ### Added 131 | - `min_count_per_umi` in the `config.yaml` to decide how many times a Gene - UMI has to be found to be counted as one. 132 | 133 | 134 | ## [0.23] 135 | ### Changed 136 | - pre_align steps will output a fastq.gz instead of a fastq file. 137 | - `fastqc.R` is now compatible with paired and single end data. 138 | - Changed a few options in `GLOBAL` for `UMI` and `Cell_barcodes` options. Now possible to change filtering settings. See [WIKI](https://github.com/Hoohm/dropSeqPipe/wiki/Create-config-files) 139 | - STAR logs have been stripped of the `STAR` string. 
This is to allow for better compatibility with [multiqc](https://github.com/ewels/MultiQC/) 140 | - Removed `fastqc` folder and moved items to `logs` folder. Grouping all logs files for better [multiqc](https://github.com/ewels/MultiQC/) compatibility. 141 | - Changed `generate_meta` to `generate-meta` for keeping similar syntax between modes. 142 | - Added seperate log files for stats and summary in the DetectBeadSynthesisErrors. 143 | - Moved part of the `README`to the wiki. 144 | - Changed the name of the first expression matrix extracted before the species plot to `unfiltered_expression.` 145 | 146 | 147 | ### Added 148 | - You can now run Bulk Single or paired end RNAseq data. 149 | - Started a wiki with a FAQ 150 | - Added options in `GLOBAL` config.yaml. You can now choose a range of options for UMI and Barcode filtering. please refer to the wiki for more information. 151 | - Support for [MultiQC](https://github.com/ewels/MultiQC/). MultiQC is a great way of summarising all of the logs from your experiment. As of today it supports 46 different modules (such as fastqc, trimmomatic, STAR, etc...) The `generate-plots` mode now produces a `multiqc_report.html` file in the plots folder. 152 | - New plot! BCDrop.pdf is a new plot showing you how many barcode and UMIs you dropped from the raw data before aligning. This helps to track how many samples you might loose because of low quality reads in the barcoding. 153 | 154 | ## [0.22] 155 | ### Changed 156 | - all `subprocess.call` replaced by `shell` from snakemake 157 | - STAR aligner now not limited to 8 cores or threads but will use the maximum number provided in the local.yaml file 158 | - Name from dropSeqPip to dropSeqPipe 159 | - Fixed a bug where all stage1 steps used the same summary file. Now BC tagging, UMI tagging, starting trim and polyA trim have different summary files 160 | - extract-expression now merges all the samples final count matrix into one per run (folder) 161 | - Fixed a bug where the amount of total reads on the knee-plot was overinflated. 162 | - Changed `knee-plot` mode to `generate-plots`. 163 | 164 | ### Added 165 | - Temp files have been added in the pipeline. You can turn this off by using the `--notemp` option 166 | - fastqc mode now available. Generates fastqc reports plus summary plots 167 | - Summary file and plot for fastqc and STAR logs 168 | - Missing R packages should install automatically now. No need to install them beforehand. Report any problem plz 169 | - `GLOBAL` values in the config files are now available. They allow to change UMI and BC ranges as well as mismatches for STAR aligner 170 | - Added a new mode: generate_meta. This allows to create all the metadata files needed for the pipeline. 
You just need a folder with a genome.fa and an annotation.gtf 171 | 172 | ## [0.21] 173 | ### Added 174 | - Changelog file to track changes 175 | - --rerun option to force a rerun 176 | - Multiple steps allowd now 177 | 178 | ## [0.2] - 2017-03-14 179 | ### Changed 180 | - The pipeline is now a python package being called as an executable 181 | - Went from json to yaml for config files 182 | 183 | ### Added 184 | - setup.py and dependencies 185 | - Species plot available 186 | 187 | ### Removed 188 | - primer handling, went to default: AAGCAGTGGTATCAACGCAGAGTAC 189 | 190 | 191 | ## [0.1] - 2017-02-13 192 | ### First release 193 | - Allows for preprocessing, alignement with STAR, post align processing until knee-plot -------------------------------------------------------------------------------- /docs/docs/Clusters.md: -------------------------------------------------------------------------------- 1 | Running on clusters 2 | ---------------------------------- 3 | There is a file in the `templates` called `cluster.yaml`. This can be used to modify ressources needed for your data. I generally recommand moving the file to the root of the folder so that it doesn't get replaced by updates. 4 | 5 | Bellow is an example of running on a cluster using the template file `cluster.yaml` on SLURM. 6 | 7 | ``` 8 | snakemake --cluster 'sbatch -n {cluster.n} -t {cluster.time} --clusters=CLUSTERNAME --output={cluster.output}' --jobs N --cluster-config cluster.yaml --use-conda --local-cores C 9 | ``` 10 | 11 | * N: is the number of jobs you are allowed to run at the same time 12 | * C: is the local-cores of the host machine. A few simple rules are gonna be run locally (not sent to nodes) because they are not that heavy (mostly plotting) 13 | * CLUSTERNAME: the name of the cluster you want to use 14 | 15 | Note: The default path for cluster logs in the cluster.yaml is `logs/cluster/`. If that folder doesn't exist, our cluster can't write and will crash without an error message. -------------------------------------------------------------------------------- /docs/docs/Create-config-files.md: -------------------------------------------------------------------------------- 1 | # Config file and sample file 2 | --------------------------- 3 | 4 | In order to run the pipeline you will need to complete the config.yaml file and the samples.csv file. Both are located in the `templates` folder , should be moved to the root folder of the experiment and filled in for missing entries before running the pipeline. 5 | 6 | The goal for this is to provide the config.yaml when you finally upload the data to a repository for a publication as well as the pipeline version. This provides other users to ability to rerun the processing from scratch exactly as you did. This is possible because snakemake will download and create the exact same environnment for each rule using the envs files provided with the pipeline. 7 | 8 | ## 1. config.yaml - Executables, system and experiment parameters 9 | The config.yaml contains all the necessary parameters and paths for the pipeline. 
10 | ``` 11 | CONTACT: 12 | email: user.name@provider.com 13 | person: John Doe 14 | LOCAL: 15 | temp-directory: /tmp 16 | memory: 4g 17 | raw_data: 18 | results: 19 | META: 20 | species: 21 | mus_musculus: 22 | build: 38 23 | release: 94 24 | homo_sapiens: 25 | build: 38 26 | release: 91 27 | ratio: 0.2 28 | reference-directory: /path/to/references/ 29 | gtf_biotypes: gtf_biotypes.yaml 30 | FILTER: 31 | barcode_whitelist: '' 32 | 5-prime-smart-adapter: AAAAAAAAAAA 33 | cell-barcode: 34 | start: 1 35 | end: 12 36 | UMI-barcode: 37 | start: 13 38 | end: 20 39 | cutadapt: 40 | adapters-file: 'adapters.fa' 41 | R1: 42 | quality-filter: 20 43 | maximum-Ns: 0 44 | extra-params: '' 45 | R2: 46 | quality-filter: 20 47 | minimum-adapters-overlap: 6 48 | minimum-length: 15 49 | extra-params: '' 50 | MAPPING: 51 | STAR: 52 | genomeChrBinNbits: 18 53 | outFilterMismatchNmax: 10 54 | outFilterMismatchNoverLmax: 0.3 55 | outFilterMismatchNoverReadLmax: 1 56 | outFilterMatchNmin: 0 57 | outFilterMatchNminOverLread: 0.66 58 | outFilterScoreMinOverLread: 0.66 59 | EXTRACTION: 60 | LOCUS: 61 | - CODING 62 | - UTR 63 | strand-strategy: SENSE 64 | UMI-edit-distance: 1 65 | minimum-counts-per-UMI: 0 66 | ``` 67 | Please note the "space" after the colon, is needed for the yaml to work. 68 | 69 | ## Subsections 70 | 71 | ### [CONTACT] 72 | * `email` and `person` This is not requested. You can provide the e-mail and name address of the person who processed the data using this configuration. Ideally you should provide the config.yaml with the data repository to allow people to rerun the data using dropSeqPipe. 73 | 74 | ### [LOCAL] 75 | * `temp-directory` is the temp or scratch folder with enough space to keep temporary files. 76 | * `memory` is the maximum memory allocation pool for a Java Virtual Machine. 77 | * `raw_data` is the folder containing all your raw fastq.gz files. 78 | * `results` is the folder that will contain all the results of the pipeline. 79 | 80 | ### [META] 81 | * `species` is where you list the species of your samples. It can be a mixed experiment with two entries. 82 | * `SPECIES_ONE` can be for example: mus_musculus, homo_sapiens, etc... It has to be the name used on ensembl for automatic download to work. 83 | * `build` is the genome build number. 84 | * `release` is the annotation release number. 85 | * `SPECIES_TWO` can be your second species. 86 | * `ratio` is how much "contamination" from another species you allow to validate them as a species or mixed. 0.2 means you allow a maximum of 20% mixing. 87 | * `reference-directory` is where you want to store your references files. 88 | * `gtf_biotypes` is the gtf_biotypes.yaml file containing the selection of biotypes you want to keep for your gene to read attribution. Using less biotypes may decrease your multimapping counts. 89 | 90 | ### [FILTER] 91 | * `barcode_whitelist` is the filename of your whitelist fi you have one. Well plate base protocols often have one. 92 | * `5-prime-smart-adapter` is the 5" smart adapter used in your protocol. 93 | * `cell-barcode and UMI-barcode`: Is the section for cell/umi barcode filtering. 94 | * `start` is the first base position of your cell/umi barcode. 95 | * `end` is the last base position of your cell/umi barcode. 96 | * `cutadapt`: Is the section for trimming. 97 | * `adapters-file` is the file containing your list of adapters as fasta. you can choose between 6 files in the `templates` folder, add any sequence to existing files or provide your own custom one. 
98 | * NexteraPE-PE.fa 99 | * TruSeq2-PE.fa 100 | * TruSeq2-SE.fa 101 | * TruSeq3-PE-2.fa 102 | * TruSeq3-PE.fa 103 | * TruSeq3-SE.fa 104 | Provide the path to the file you want to use for trimming. If you want to add custom sequences or create a complete new one, I would advise to store it in the ROOT folder of the experiment. This will ensure that your custom file will not be overwritten if you update the pipeline. 105 | 106 | Example: `NexteraPE-PE.fa` 107 | * `R1` lists the options for read1 (cell barcode and umi) filtering/trimming 108 | * `quality-filter` is the minimum mean score of the sliding window for quality filtering. 109 | * `maximum-Ns` how many Ns you allow in the cell barcode and umi barcode. By default it is one because we want to be able to collapse barcodes that have one mismatch. 110 | * `extra-params` if you usually add extra paramters to cutadapt, you can do it here. *Only for experienced cutadapt users*. 111 | * `R2` lists the options for read2 (mRNA) filtering/trimming 112 | that have one mismatch. 113 | * `maximum-length` is the maximum length of your mRNA read before alignement. 114 | * `extra-params` if you usually add extra paramters to cutadapt, you can do it here. *Only for experienced cutadapt users*. 115 | For more information about trimming and filtering please visit the [cutadapt](https://cutadapt.readthedocs.io/en/stable/guide.html) website. 116 | 117 | ### [MAPPING] 118 | * `STAR` 119 | * `genomeChrBinNbits` is a value used for index generation in STAR. The formula is min(18,int(log2(genomeLength/referenceNumber))) 120 | * `outFilterMismatchNmax` (default:10) is the maximum number of mismatches allowed. 121 | * `outFilterMismatchNoverLmax` (default:0.3) is the maximum ratio of mismatched bases that mapped. 122 | * `outFilterMismatchNoverReadLmax` (default:1.0) is the maximum ratio of mismatched bases of the whole read. 123 | * `outFilterMatchNmin` (default:0) is the minimum number of matched bases. 124 | * `outFilterMatchNminOverLread` (default:0.66) alignment will be output only if the ratio of matched bases is higher than or equal to this value. 125 | * `outFilterScoreMinOverLread` (default:0.66) alignment will be output only if its ratio score is higher than or equal to this value. 126 | 127 | All of the values for STAR are the default ones. For details about STAR parameters and what they do, please refer to the [STAR manual on git](https://github.com/alexdobin/STAR/tree/master/doc). 128 | 129 | ### [EXTRACTION] 130 | * `LOCUS` are the overlapping regions that reads overlap and are counted in the final expression matrix. Possible values are `CODING`, `UTR`, `INTRON` 131 | * `UMI-edit-distance` This is the maximum manhattan distance between two UMI barcode when extracting count matrices. 132 | * `min-count-per-umi` is the minimum UMI/Gene pair needed to be counted as one. 133 | * `strand-strategy` `SENSE` defines that you only count genes where the forward strand mapped to the forward region on the DNA. Other possibilities are `ANTISENSE` (only count reads that mapped on the opposite strand) or `BOTH` (count all). 134 | 135 | # 2. samples.csv - Samples parameters 136 | This file holds the sample names, expected cell numbers and read length for each sample. 137 | The file has to have this format: 138 | 139 | ``` 140 | samples,expected_cells,read_lengths,batch 141 | sample_name1,500,100,Batch1 142 | sample_name2,500,100,Batch2 143 | ``` 144 | 145 | * `expected_cells` is the amount of cells you expect from your sample. 
146 | * `read_length` is the read length of the mRNA (Read2). This is necessary for STAR index generation 147 | * `batch` is the batch of your sample. If you are added new samples to the same experiment, this is typically a good place to add the main batch. 148 | 149 | `Note:` You can add any other column you wish here, it won't affect the pipeline and you can use it later on in your analysis. 150 | 151 | Finally, you can now [run the pipeline](https://github.com/Hoohm/dropSeqPipe/wiki/Running-dropSeqPipe) 152 | 153 | or 154 | 155 | Create a [custom reference](https://github.com/Hoohm/dropSeqPipe/wiki/Reference-Files) 156 | 157 | -------------------------------------------------------------------------------- /docs/docs/FAQ.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## 1. I get `error='Cannot allocate memory' (errno=12)`, what should I do. [Fixed] 4 | 5 | This has been fixed by using a wrapper exposing the TMPDIR to the pipeline. 6 | 7 | First, be sure that your TMPDIR from the first configuration yaml has at least 100Go. 8 | If you still have problems, you should edit the following files in the Drop-seq_tools-1.12: 9 | 10 | * TagBamWithReadSequenceExtended 11 | * FilterBAM 12 | * TrimStartingSequence 13 | * PolyATrimmer 14 | * TagReadWithGeneExon 15 | * DetectBeadSynthesisErrors 16 | * SingleCellRnaSeqMetricsCollector 17 | * BAMTagHistogram 18 | 19 | In each of those files, the last line should be something like: 20 | `java -Xmx${xmx} -Djava.io.tmpdir=/path/to/temp/folder/ -jar $jar_deploy_dir/dropseq.jar $progname $*` 21 | 22 | You can also use this simple bash script to do it: 23 | Replace `/path/to/temp/folder/` with your temp path and don't forget to use escapes for / 24 | ``` 25 | for f in BAMTagHistogram SingleCellRnaSeqMetricsCollector DetectBeadSynthesisErrors TagReadWithGeneExon PolyATrimmer TrimStartingSequence FilterBAM TagBamWithReadSequenceExtended 26 | do 27 | sed -i 's/java -Xmx${xmx}/java -Xmx${xmx} -Djava.io.tmpdir=/path/to/temp/folder/ /g' $f 28 | done 29 | ``` -------------------------------------------------------------------------------- /docs/docs/Installation.md: -------------------------------------------------------------------------------- 1 | This pipeline is dependent on conda. 2 | 3 | ### Step 1: Download and install miniconda3 4 | First you need to download and install miniconda3: 5 | 6 | for linux 7 | ``` 8 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 9 | bash Miniconda3-latest-Linux-x86_64.sh 10 | ``` 11 | 12 | for mac os 13 | ``` 14 | curl https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o Miniconda3-latest-MacOSX-x86_64.sh 15 | bash Miniconda3-latest-MacOSX-x86_64.sh 16 | ``` 17 | 18 | 19 | ### Step 2: Clone the workflow 20 | 21 | Clone the worflow 22 | ``` 23 | git clone https://github.com/Hoohm/dropSeqPipe.git 24 | ``` 25 | 26 | ### Step 3: Install snakemake 27 | 28 | ``` 29 | conda install -c bioconda -c conda-forge snakemake 30 | ``` 31 | 32 | Next step is config files completion 33 | 34 | [Complete the config.yaml](https://github.com/Hoohm/dropSeqPipe/wiki/Create-config-files) with the missing information 35 | 36 | ### UPDATES: How to update the pipeline 37 | 38 | Go to your experiment folder, then pull. 
39 | ``` 40 | git pull https://github.com/Hoohm/dropSeqPipe.git 41 | ``` 42 | 43 | If you want to update files/plots based on the updates you can use this command: 44 | ``` 45 | snakemake -R `snakemake --list-codes-changes` 46 | ``` 47 | This will update all the files that would be modified by the changes in the code (rules or script). Depending on how much and where the changes have been made, this might rerun the whole pipeline. -------------------------------------------------------------------------------- /docs/docs/Plots.md: -------------------------------------------------------------------------------- 1 | On of the main purpose of this package is getting information about your data to improve your protocol and filter your data for further downstream analysis. 2 | 3 | 4 | Here is a list of plots and reports that you will get from the pipeline. 5 | 6 | Fastqc, STAR and cutadapt reports are generated as [multiqc reports](http://multiqc.info/docs/#using-multiqc-reports) in the reports folder. 7 | 8 | 9 | ## 1. Adapter content 10 | ![Adapter content](images/adapter_content.pdf) 11 | On the x axis are the samples. 12 | On the y axis are the percentages of total adapters that have been found (and trimmed) in respective fastq files based on the `adapter-file` provided via `config.yaml`. 13 | 14 | The top plot is for read1 and the bottom for read2. 15 | 16 | This plot provides an idea of the which adapter has been found and in which proportion in each sample. 17 | 18 | ## 2. Yield (across samples) 19 | ![Yield](images/yield.png) 20 | On the x axis are the samples. 21 | TOP: On the y axis are the number of reads attributed to each category. 22 | BOTTOM: On the y axis are the percentage of attributed to each category. 23 | This plot gives you an overview of all the reads from your samples and how they are distributed in all the possible categories. The reads that are uniquely mapped ar the ones you will keep at the end for the UMI count matrix. 24 | 25 | ## 3. Knee plot (per sample) 26 | ![Knee plot](images/sample1_knee_plot.png) 27 | On the x axis is the cumulative fraction of reads per STAMPS (captured cell). 28 | On the y axis is the ordered STAMPS (based on total reads). 29 | This allows you to determine how much of the reads you actually captured with the number of cells you expected. 30 | The cutting is based on the `expected_cells` parameter in the `samples.csv` file. 31 | The green `selected cells` are the cells that are going to be in the final expression matrix. 32 | If you see a clear bend on the plot that is higher in the number of cells than what you expected, you should increase the `expected_cells` value and rerun the `extract` step. If it is under, I would advise to filter out your data with a downstream analysis tool such as Seurat. 33 | *Note: I advise not to try to discover "real" cells/STAMPS at this stage. I suggest to extract the expected number of cells and filter out later in post-processing with other kind of meta data.* 34 | 35 | 36 | ## 4. RNA metrics (per sample) 37 | ![RNA metrics](images/sample1_rna_metrics.png) 38 | On the x axis are top barcodes based on your `expected_cells` values or the `barcodes.csv` file. 39 | Top plot: On the y axis are the number of bases classified by region of mapping. 40 | Bottom plot: On the y axis are the percentage of bases classified by region of mapping. 41 | This plot gives a lot of different informations. The top plot allows you to quickly compare cells between them in terms of how much has been mapped. 
This can sometimes help identify outliers or bad runs. 42 | The bottom plot allows you to find cells that have an "abnormal" mapped base distribution compared to other cells. 43 | 44 | 45 | 46 | ## 5. Violine plots for barcode properties (across samples) 47 | ![Violine plots](images/mac_violinplots_comparison_UMI.png) 48 | Various statistic for barcodes that were taken forward as STAMPs as set as `expected_cells` in `config.yaml`. 49 | Each point represents a barcode augmented by a violine-plot density estimator of barcode distribution along the y-axis. 50 | 51 | On the x axis are the samples for each panel (Note: the dot distribution along the x-axis does't not bear information, it's just a visual aid to better assess density). 52 | On the y axis are the respecitve statistics described below for each panel. 53 | 54 | TOP panel from left to right: 55 | 56 | - nUMI: number of UMI per barcode 57 | - nCounts: number of Counts per barcode 58 | - top50: fraction (percentage/100) of the highest expressed genes compared to entire set of genes. 59 | 60 | BOTTOM: 61 | 62 | - nUMI: average number of UMI per Gene per barcode 63 | - pct.Ribo: Fraction of ribosomal RNA (Note: ribsomal transcripts defined as starting with "^Rpl") 64 | - pct.mito: Fraction of mitochondrial RNA (Note: mitchondrial transcripts defined as starting with "^mt-") 65 | 66 | ## 6. Saturation plot: UMI per barcode (across samples) 67 | ![umi per barcode](images/mac_UMI_vs_gene.png) 68 | Number of UMI (x-axis) vs number of Genes (y-axis) for each barcode (points in plot) broken down by sample (different colors). 69 | Number of Genes defined as Genes having at least 1 read mapped to them. 70 | Individual samples are color-coded. A loess regression curve of barcodes for each sample is fitted. 71 | Various statistic for barcodes that were taken forward as STAMPs as set as `expected_cells` in `config.yaml`. 72 | 73 | This plot can indicate how many counts per barcode are required on average to find all expressed genes in a cell. 74 | Given enought coverage, it can also indicate how many genes are expressed for the examined cell type. 75 | 76 | ## 7. Saturation plot: Counts per barcode (across samples) 77 | ![counts per barcode](images/mac_Count_vs_gene.png) 78 | Number of Counts (x-axis) vs number of Genes (y-axis) for each barcode (points in plot) broken down by sample (different colors). 79 | Number of Genes defined as Genes having at least 1 read mapped to them. 80 | Individual samples are color-coded. A loess regression curve of barcodes for each sample is fitted. 81 | Various statistic for barcodes that were taken forward as STAMPs as set as `expected_cells` in `config.yaml`. 82 | 83 | ## 8. Counts per UMI per barcode (across samples) 84 | ![counts per UMI](images/mac_UMI_vs_counts.png) 85 | Number of UMI (x-axis) vs number of Counts (y-axis) for each barcode (points in plot) broken down by sample (different colors). 86 | Individual samples are color-coded. A loess regression curve of barcodes for each sample is fitted. 87 | Black line indicate an optimal 1:1 ratio between UMI and Counts (i.e. no Duplicates!) 88 | 89 | This plots can give an indication on the level of duplication for each sample. The close to black line the lower duplication. 90 | 91 | # Mixed experiment 92 | 93 | ## 9. Barnyard plot (per sample) 94 | ![Barnyard plot](images/hum_mus_species_plot_transcripts.png) 95 | This plot shows you species purity for each STAMPS. Mixed and No call STAMPS are dropped and only single species are kept for extraction. 
-------------------------------------------------------------------------------- /docs/docs/Reference-Files.md: -------------------------------------------------------------------------------- 1 | Reference files 2 | ----------------- 3 | From version 0.4 on, reference files are automatically downloaded by the pipeline. Mixed references are also downloaded and merged automatically. Since you may still want to use your own reference, you can bypass the download by providing your own `genome.fa` and `annotation.gtf` files. 4 | 5 | Snakemake generates files based on paths. If you want to use a custom reference, you have to name it properly for snakemake to find it. 6 | 7 | Here is an example: 8 | 9 | Let's assume this is your configuration for the META section: 10 | ``` 11 | META: 12 | species: 13 | funky_species_name: 14 | build: A 15 | release: 1 16 | ratio: 0.2 17 | reference-directory: /absolute/path/to/references 18 | gtf_biotypes: gtf_biotypes.yaml 19 | ``` 20 | 21 | You then need to provide the following files: 22 | 23 | ``` 24 | /absolute/path/to/references/funky_species_name_A_1/genome.fa 25 | /absolute/path/to/references/funky_species_name_A_1/annotation.gtf 26 | ``` 27 | 28 | This will stop dropSeqPipe from downloading a new reference. 29 | 30 | 31 | Once the pipeline has run completely, the folder will look like this: 32 | 33 | ``` 34 | genome.fa 35 | annotation.gtf 36 | annotation.refFlat 37 | annotation_reduced.gtf 38 | genome.consensus_introns.intervals 39 | genome.dict 40 | genome.exons.intervals 41 | genome.genes.intervals 42 | genome.intergenic.intervals 43 | genome.rRNA.intervals 44 | STAR_INDEX/SA_read_length/ 45 | ``` 46 | 47 | Note: The STAR index will be built based on the read length of your mRNA read (Read2). 48 | If you have different read lengths, the pipeline will produce one index per length. 49 | 
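For a rough idea of how those indexes are parameterized, here is a minimal Python sketch of the two STAR parameters derived in `rules/generate_meta.smk`; the genome size and sequence count below are made-up placeholders.
```
from math import log2

# One STAR index is built per distinct 'read_length' in samples.csv,
# with the splice junction overhang set to read length minus one.
read_length = 75
sjdbOverhang = read_length - 1

# genomeChrBinNbits is capped at 18 (the value in config.yaml) and is
# otherwise derived from the genome size and the number of sequences.
genome_length = 2_730_871_774  # placeholder: size of genome.fa in bytes
n_sequences = 239              # placeholder: number of '>' headers
genomeChrBinNbits = min(18, int(log2(genome_length / n_sequences)))

print(f"--sjdbOverhang {sjdbOverhang} --genomeChrBinNbits {genomeChrBinNbits}")
```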
50 | Finally, you can now [run the pipeline](https://github.com/Hoohm/dropSeqPipe/wiki/Running-dropSeqPipe) -------------------------------------------------------------------------------- /docs/docs/Running-dropSeqPipe.md: -------------------------------------------------------------------------------- 1 | Example 2 | ----------------------- 3 | The pipeline is cloned once and can then be run on any folder containing the configuration files and your raw data. The working directory can contain multiple runs (aka batches), since you can easily add new samples when receiving new data and rerun the same commands. This will simply run the pipeline on the newly added data and recreate the reports as well as the plots containing all the samples. 4 | 5 | Example: You run 2 biological conditions with 2 replicates. This makes 4 samples. Assume a simple dropseq protocol with only human cells. 6 | 1. You sequence the data, receive the 8 files (two files per sample) and download the pipeline 7 | 2. You run the pipeline with the command: `snakemake --use-conda --cores N --directory WORKING_DIR`. `N` being the number of cores available and `WORKING_DIR` being the folder containing your `config.yaml`, `samples.csv`, adapter file and `gtf_biotypes.yaml`. 8 | 3. You see that there is an issue with the protocol and you modify it 9 | 4. You create a new set of libraries and sequence them (same 2x2 design) 10 | 5. You add the new files to the data folder of `WORKING_DIR` and edit `samples.csv` to add the missing samples. 11 | 6. You run the pipeline as you did the first time: `snakemake --use-conda --cores N --directory WORKING_DIR` 12 | 7. This will run the new samples only and recreate the reports as well as the yield plots. 13 | 8. It is now easy to compare the impact of the change in your protocol 14 | 15 | Working dir folder preparation 16 | ---------------- 17 | The raw data from the sequencer should be stored in the `RAW_DATA` folder of the `WORKING_DIR` folder like this: 18 | ``` 19 | /path/to/your/WORKING_DIR/ 20 | | -- RAW_DATA/ 21 | | -- -- sample1_R1.fastq.gz 22 | | -- -- sample1_R2.fastq.gz 23 | | -- -- sample2_R1.fastq.gz 24 | | -- -- sample2_R2.fastq.gz 25 | | samples.csv 26 | | config.yaml 27 | | barcodes.csv 28 | | adapters.fa 29 | ``` 30 | *Note: In DropSeq or ScrbSeq you expect paired-end sequencing. R1 will hold your barcode and UMI information, R2 will hold the 3' end of the captured mRNA.* 31 | 32 | 33 | Once everything is in place, you can run the pipeline using the normal snakemake commands. 34 | 35 | Running the pipeline (TLDR version) 36 | ---------------------------- 37 | 38 | For a simple single cell run you only need to run: `snakemake --cores N --use-conda --directory WORKING_DIR` 39 | This will run the whole pipeline, using the `N` cores you gave it. 40 | 41 | 42 | Running the pipeline 43 | --------------------------------- 44 | 45 | I highly recommend taking a [look at the options](http://snakemake.readthedocs.io/en/latest/) that are available, since I won't cover everything here. 46 | 47 | 48 | Modes 49 | ------------------------------ 50 | You have two main ways to run the pipeline. 51 | 52 | You can either just run `snakemake --use-conda --directory WORKING_DIR` in the root folder containing your experiment and it will run everything without stopping. 53 | 54 | You can also run each step separately. The main advantage of the second way is that you are able to fine-tune your parameters based on the results of fastqc, filtering, mapping quality, etc... 55 | I would suggest using the second approach when you work on a new protocol and the first one when you are confident about your parameters. 56 | 57 | There are seven different modes available; to run one specifically, you call it by name. 58 | 59 | Example: To run the `qc` mode: `snakemake --cores 8 qc --use-conda --directory WORKING_DIR` 60 | You can also run multiple modes at the same time if you want: `snakemake --cores 8 qc filter --use-conda --directory WORKING_DIR` 61 | 62 | ### Single species: 63 | * `meta`: Downloads and generates all the subsequent reference files and the STAR index needed to run the pipeline. You can run this alone if you just want to create the meta-data files before running a new set of data. 64 | * `qc`: Creates fastqc reports of your data. 65 | * `filter`: Goes from sample_R1.fastq.gz to sample_filtered.fastq.gz, ready to be mapped to the genome. This step filters out low quality reads and trims the adapters you provided in the FILTER section. 66 | * `map`: Goes from sample_filtered.fastq.gz to the sample_final.bam, ready for extracting the expression data. This maps the data to the genome. 67 | * `extract`: Extracts the expression data. You'll get a UMI and a count expression matrix for your whole experiment. 68 | 69 | ### Mixed species 70 | Since v`0.4` the pipeline detects mixed experiments on the fly. Simply run `snakemake --directory WORKING_DIR --use-conda`. 
The stepwise approach is not available for mixed experiments. 71 | 72 | Barcode whitelist 73 | --------------------- 74 | In protocols such as SCRBseq, the expected barcode sequences are known. This pipeline therefore also allows the use of known barcodes instead of a number of expected cells. 75 | In order to use this functionality, you just need to add a barcode whitelist file and provide its name in the configuration, in this section: 76 | 77 | ``` 78 | FILTER: 79 | barcode-whitelist: name_of_your_whitelist_file 80 | ``` 81 | The file should be in the WORKING_DIR. Run the pipeline as usual. 82 | 83 | Advanced options 84 | ------------------- 85 | If you have specific adapters that are not among the defaults in the `templates` folder, you can add whatever adapters you want to trim (as many as you need), following the fasta syntax. 86 | 87 | ``` 88 | FILTER: 89 | cutadapt: 90 | adapters-file: name_of_your_adapter_file.fa 91 | ``` 92 | 93 | Further options 94 | --------------------- 95 | * `--cores N` Use this argument to use up to `N` of the available cores. 96 | * `--notemp` Use this to keep all the temporary files. Without this option, intermediate files are deleted once the steps that need them have run. Use this option if you are troubleshooting the pipeline or you want to analyze the intermediate files yourself. 97 | * `--dryrun` or `-n` Use this to check what is going to be run by your command. This is nice for spotting potentially missing files. 98 | 99 | 100 | 101 | Folder Structure 102 | ----------------------- 103 | This is the folder structure you get in the end: 104 | ``` 105 | /path/to/your/WORKING_DIR/ 106 | | -- RAW_DATA/ 107 | | -- RESULT_DIR/ 108 | | -- -- logs/ 109 | | -- -- -- cluster/ 110 | | -- -- plots/ 111 | | -- -- reports/ 112 | | -- -- summary/ 113 | | -- -- samples/ 114 | | samples.csv 115 | | config.yaml 116 | | barcodes.csv 117 | | adapters.fa 118 | | .snakemake/ 119 | ``` 120 | 121 | * `RAW_DATA/` Contains all your samples as well as the intermediate files 122 | * `RESULT_DIR/logs/` Contains all the logfiles generated by the pipeline 123 | * `RESULT_DIR/logs/cluster` Contains all the logfiles generated by the cluster 124 | * `RESULT_DIR/plots/` Contains all the plots generated by the pipeline 125 | * `RESULT_DIR/reports/` Contains all the reports generated by the pipeline 126 | * `RESULT_DIR/summary/` Contains all the files you might use for downstream analysis (the barcodes selected per sample and per species, and the final umi/counts expression matrices) 127 | * `RESULT_DIR/samples/` Contains all the sample specific files: bam files, barcodes used, single sample expression files, etc... 128 | * `samples.csv` File containing the sample details 129 | * `config.yaml` File containing the pipeline parameters as well as the system parameters 130 | * `adapters.fa` File containing all the adapters you wish to trim from the raw data. 131 | * `.snakemake/` Folder that contains all the environments created for the run, as well as a lot of other things. 
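Since `RESULT_DIR/summary/` stores the merged expression matrices in matrix-market format, loading them for downstream analysis is straightforward. Here is a minimal Python sketch; it assumes `RESULT_DIR` is named `results`, picks the `umi` matrix, and the genes-by-barcodes orientation is an assumption.
```
import pandas as pd
from scipy.io import mmread

# Load the merged UMI expression matrix written by rules/merge.smk.
mtx = mmread('results/summary/umi/matrix.mtx').tocsc()
genes = pd.read_csv('results/summary/umi/genes.tsv', header=None)[0]
barcodes = pd.read_csv('results/summary/umi/barcodes.tsv', header=None)[0]

print(f"{len(genes)} genes x {len(barcodes)} barcodes, "
      f"{mtx.nnz} non-zero entries")
```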
-------------------------------------------------------------------------------- /docs/docs/images/adapter_content.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/adapter_content.png -------------------------------------------------------------------------------- /docs/docs/images/hum_mus_species_plot_transcripts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/hum_mus_species_plot_transcripts.png -------------------------------------------------------------------------------- /docs/docs/images/mac_Count_vs_gene.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/mac_Count_vs_gene.png -------------------------------------------------------------------------------- /docs/docs/images/mac_UMI_vs_counts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/mac_UMI_vs_counts.png -------------------------------------------------------------------------------- /docs/docs/images/mac_UMI_vs_gene.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/mac_UMI_vs_gene.png -------------------------------------------------------------------------------- /docs/docs/images/mac_violinplots_comparison_UMI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/mac_violinplots_comparison_UMI.png -------------------------------------------------------------------------------- /docs/docs/images/sample1_knee_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/sample1_knee_plot.png -------------------------------------------------------------------------------- /docs/docs/images/sample1_rna_metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/sample1_rna_metrics.png -------------------------------------------------------------------------------- /docs/docs/images/yield.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/yield.png -------------------------------------------------------------------------------- /docs/docs/index.md: -------------------------------------------------------------------------------- 1 | Welcome 2 | ------------------------------ 3 | 4 | Welcome to the documentation of dropSeqPipe v`0.4`. 
-------------------------------------------------------------------------------- /docs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: dropSeqPipe 2 | theme: readthedocs 3 | repo_name: 'GitHub' 4 | repo_url: https://github.com/Hoohm/dropSeqPipe 5 | nav: 6 | - 'index.md' 7 | - 'Installation.md' 8 | - 'Reference-Files.md' 9 | - 'Create-config-files.md' 10 | - 'Running-dropSeqPipe.md' 11 | - 'Clusters.md' 12 | - 'Plots.md' 13 | - 'CHANGELOG.md' 14 | - 'FAQ.md' 15 | 16 | google_analytics: 17 | - 'UA-128943644-1' 18 | - 'auto' -------------------------------------------------------------------------------- /docs/mkdocs_env.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - defaults 4 | dependencies: 5 | - bzip2=1.0.6=h470a237_2 6 | - ca-certificates=2018.8.24=ha4d7672_0 7 | - certifi=2018.8.24=py37_1001 8 | - click=7.0=py_0 9 | - jinja2=2.10=py_1 10 | - libffi=3.2.1=hfc679d8_5 11 | - libgcc-ng=7.2.0=hdf63c60_3 12 | - libstdcxx-ng=7.2.0=hdf63c60_3 13 | - livereload=2.5.2=py_0 14 | - markdown=2.6.11=py_0 15 | - markupsafe=1.0=py37h470a237_1 16 | - mkdocs=1.0.4=py_0 17 | - ncurses=6.1=hfc679d8_1 18 | - openssl=1.0.2p=h470a237_0 19 | - pip=18.1=py37_1000 20 | - python=3.7.0=h5001a0f_4 21 | - python-markdown-math=0.6=py_0 22 | - pyyaml=3.13=py37h470a237_1 23 | - readline=7.0=haf1bffa_1 24 | - setuptools=40.4.3=py37_0 25 | - six=1.11.0=py37_1001 26 | - sqlite=3.25.2=hb1c47c0_0 27 | - tk=8.6.8=ha92aebf_0 28 | - tornado=5.1.1=py37h470a237_0 29 | - wheel=0.32.1=py37_0 30 | - xz=5.2.4=h470a237_1 31 | - yaml=0.1.7=h470a237_1 32 | - zlib=1.2.11=h470a237_3 33 | 34 | -------------------------------------------------------------------------------- /envs/bbmap.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - bbmap=38.22 5 | -------------------------------------------------------------------------------- /envs/cutadapt.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python>=3.3 7 | - cutadapt=1.16 8 | -------------------------------------------------------------------------------- /envs/dropseq_tools.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - anaconda 4 | - conda-forge 5 | dependencies: 6 | - dropseq_tools=2.0.0 7 | - font-ttf-dejavu-sans-mono=2.37 8 | - fontconfig=2.13.1 9 | -------------------------------------------------------------------------------- /envs/merge.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - r 3 | - conda-forge 4 | dependencies: 5 | - r=3.4.1 6 | - readline=6.2 7 | - r-matrix=1.2_14 8 | -------------------------------------------------------------------------------- /envs/merge_bam.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - pysam=0.15.1 5 | - biopython=1.72 6 | - python>=3.6 7 | -------------------------------------------------------------------------------- /envs/merge_long.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - pandas=0.25.1 5 | -------------------------------------------------------------------------------- 
/envs/picard.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - anaconda 4 | - conda-forge 5 | dependencies: 6 | - picard=2.14.1.0 7 | - font-ttf-dejavu-sans-mono=2.37 8 | - fontconfig=2.13.1 9 | 10 | -------------------------------------------------------------------------------- /envs/pigz.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - anaconda 3 | dependencies: 4 | - pigz=2.4 5 | -------------------------------------------------------------------------------- /envs/r.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - r=3.4.1 6 | - r-ggplot2=2.2.1 7 | - r-gridextra 8 | - r-viridis 9 | - r-stringdist 10 | - r-dplyr=0.7.6 11 | - r-mvtnorm 12 | - r-seurat=2 13 | - r-hmisc 14 | - r-tidyverse 15 | - r-devtools 16 | - r-rcolorbrewer 17 | - font-ttf-dejavu-sans-mono=2.37 18 | - fontconfig=2.13.1 19 | -------------------------------------------------------------------------------- /envs/samtools.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - conda-forge 4 | dependencies: 5 | - samtools=1.9 6 | - ncurses=6.1 7 | -------------------------------------------------------------------------------- /envs/star.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - star=2.6.1b 5 | -------------------------------------------------------------------------------- /envs/umi_tools.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - umi_tools=0.5.5 5 | - scipy=1.1.0 -------------------------------------------------------------------------------- /envs/velocyto.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - numpy 6 | - scipy 7 | - cython 8 | - numba 9 | - matplotlib 10 | - scikit-learn 11 | - h5py 12 | - click 13 | - pip: 14 | - velocyto -------------------------------------------------------------------------------- /rules/cell_barcodes.smk: -------------------------------------------------------------------------------- 1 | 2 | 3 | ruleorder: extend_barcode_whitelist > extend_barcode_top 4 | ruleorder: extend_barcode_whitelist > get_cell_whitelist 5 | 6 | 7 | localrules: 8 | get_cell_whitelist, 9 | extend_barcode_top 10 | 11 | rule extend_barcode_whitelist: 12 | input: 13 | whitelist=barcode_whitelist 14 | output: 15 | barcodes='{results_dir}/samples/{sample}/barcodes.csv', 16 | barcode_ref='{results_dir}/samples/{sample}/barcode_ref.pkl', 17 | barcode_ext_ref='{results_dir}/samples/{sample}/barcode_ext_ref.pkl', 18 | barcode_mapping='{results_dir}/samples/{sample}/empty_barcode_mapping.pkl' 19 | script: 20 | '../scripts/generate_extended_ref.py' 21 | 22 | rule get_top_barcodes: 23 | input: 24 | '{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz' 25 | output: 26 | '{results_dir}/samples/{sample}/top_barcodes.csv' 27 | conda: '../envs/umi_tools.yaml' 28 | params: 29 | cell_barcode_length=(config['FILTER']['cell-barcode']['end'] - config['FILTER']['cell-barcode']['start'] + 1), 30 | umi_barcode_length=(config['FILTER']['UMI-barcode']['end'] - config['FILTER']['UMI-barcode']['start'] + 1), 31 | num_cells=lambda 
wildcards: round(int(samples.loc[wildcards.sample,'expected_cells'])*1.2), 32 | shell: 33 | """umi_tools whitelist\ 34 | --stdin {input}\ 35 | --bc-pattern='(?P<cell_1>.{{{params.cell_barcode_length}}})(?P<umi_1>.{{{params.umi_barcode_length}}})'\ 36 | --extract-method=regex\ 37 | --set-cell-number={params.num_cells}\ 38 | --log2stderr > {output}""" 39 | 40 | rule get_cell_whitelist: 41 | input: 42 | '{results_dir}/samples/{sample}/top_barcodes.csv' 43 | output: 44 | '{results_dir}/samples/{sample}/barcodes.csv' 45 | shell: 46 | """cat {input} | cut -f 1 > {output}""" 47 | 48 | 49 | rule extend_barcode_top: 50 | input: 51 | whitelist='{results_dir}/samples/{sample}/top_barcodes.csv' 52 | output: 53 | barcode_ref='{results_dir}/samples/{sample}/barcode_ref.pkl', 54 | barcode_ext_ref='{results_dir}/samples/{sample}/barcode_ext_ref.pkl', 55 | barcode_mapping='{results_dir}/samples/{sample}/empty_barcode_mapping.pkl' 56 | script: 57 | '../scripts/umi_tools_extended_ref.py' 58 | 59 | 60 | rule repair_barcodes: 61 | input: 62 | bam='{results_dir}/samples/{sample}/Aligned.merged.bam', 63 | barcode_ref='{results_dir}/samples/{sample}/barcode_ref.pkl', 64 | barcode_ext_ref='{results_dir}/samples/{sample}/barcode_ext_ref.pkl', 65 | barcode_mapping='{results_dir}/samples/{sample}/empty_barcode_mapping.pkl' 66 | conda: '../envs/merge_bam.yaml' 67 | output: 68 | bam=temp('{results_dir}/samples/{sample}/Aligned.repaired.bam'), 69 | barcode_mapping_counts='{results_dir}/samples/{sample}/barcode_mapping_counts.pkl' 70 | script: 71 | '../scripts/repair_barcodes.py' -------------------------------------------------------------------------------- /rules/download_meta_mixed.smk: -------------------------------------------------------------------------------- 1 | from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider 2 | FTP = FTPRemoteProvider() 3 | 4 | localrules: 5 | download_annotation, 6 | download_genome, 7 | rename_genome, 8 | merge_genomes, 9 | merge_annotations 10 | 11 | def get_annotation(wildcards): 12 | return FTP.remote("ftp.ensembl.org/pub/release-{0}/gtf/{1}/{2}.GRC{3}{4}.{0}.gtf.gz".format( 13 | wildcards.release, 14 | wildcards.species, 15 | wildcards.species.capitalize(), 16 | wildcards.species.lower()[0], 17 | wildcards.build), 18 | static=True, 19 | keep_local=True) 20 | 21 | def get_genome(wildcards): 22 | return FTP.remote("ftp.ensembl.org/pub/release-{0}/fasta/{1}/dna/{2}.GRC{3}{4}.dna.primary_assembly.fa.gz".format( 23 | wildcards.release, 24 | wildcards.species, 25 | wildcards.species.capitalize(), 26 | wildcards.species.lower()[0], 27 | wildcards.build), 28 | static=True, 29 | keep_local=True) 30 | 31 | rule download_annotation: 32 | input: 33 | get_annotation 34 | output: 35 | "{ref_path}/{species}_{build}_{release}/annotation.gtf" 36 | shell: 37 | "gunzip -c -d {input} > {output}" 38 | 39 | rule download_genome: 40 | input: 41 | get_genome 42 | output: 43 | "{ref_path}/{species}_{build}_{release}/genome.fa" 44 | shell: 45 | "gunzip -d -c {input} > {output}" 46 | 47 | 48 | rule rename_genome: 49 | input: 50 | "{ref_path}/{species}_{build}_{release}/genome.fa" 51 | output: 52 | temp("{ref_path}/{species}_{build}_{release}/renamed_genome.fa") 53 | params: 54 | species= lambda wildcards: wildcards.species 55 | shell: 56 | """sed -e 's/>/>{params.species}/g' {input} > {output}""" 57 | 58 | 59 | rule merge_genomes: 60 | input: 61 | genome1=expand("{ref_path}/{species}_{build}_{release}/renamed_genome.fa", 62 | species=species_list[0], 63 | build=build_list[0], 64 | release=release_list[0], 
65 | ref_path=config['META']['reference-directory']), 66 | genome2=expand("{ref_path}/{species}_{build}_{release}/renamed_genome.fa", 67 | species=species_list[1], 68 | build=build_list[1], 69 | release=release_list[1], 70 | ref_path=config['META']['reference-directory']) 71 | output: 72 | "{}/{}_{}_{}/genome.fa".format( 73 | config['META']['reference-directory'], 74 | species, 75 | build, 76 | release) 77 | shell: 78 | """cat {input.genome1} {input.genome2} > {output}""" 79 | 80 | rule merge_annotations: 81 | input: 82 | annotation1=expand("{ref_path}/{species}_{build}_{release}/annotation.gtf", 83 | species=species_list[0], 84 | build=build_list[0], 85 | release=release_list[0], 86 | ref_path=config['META']['reference-directory']), 87 | annotation2=expand("{ref_path}/{species}_{build}_{release}/annotation.gtf", 88 | species=species_list[1], 89 | build=build_list[1], 90 | release=release_list[1], 91 | ref_path=config['META']['reference-directory']), 92 | output: 93 | "{}/{}_{}_{}/annotation.gtf".format( 94 | config['META']['reference-directory'], 95 | species, 96 | build, 97 | release) 98 | params: 99 | build_list=build_list, 100 | release_list=release_list, 101 | species_list=species_list 102 | run: 103 | import datetime 104 | import re 105 | header1="#!Mixed reference of {} and {}\n".format( 106 | species_list[0], 107 | species_list[1]) 108 | header2="#!genome-builds GRC{}{} GRC{}{}\n".format( 109 | species_list[0].lower()[0], 110 | build_list[0], 111 | species_list[1].lower()[0], 112 | build_list[1]) 113 | header3="#!genome-releases {} {}\n".format( 114 | release_list[0], 115 | release_list[1]) 116 | header4="#!genome-date {}\n".format(str(datetime.date.today())) 117 | header=[header1,header2,header3,header4] 118 | with open(input.annotation1[0]) as annotation1: 119 | with open(input.annotation2[0]) as annotation2: 120 | with open(output[0], 'w') as outfile: 121 | outfile.writelines(header) 122 | for line in annotation1: 123 | if(not line.startswith('#!')): 124 | outfile.write(re.sub('^',species_list[0],line)) 125 | for line in annotation2: 126 | if(not line.startswith('#!')): 127 | outfile.write(re.sub('^',species_list[1],line)) 128 | 129 | 130 | -------------------------------------------------------------------------------- /rules/download_meta_single.smk: -------------------------------------------------------------------------------- 1 | from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider 2 | FTP = FTPRemoteProvider() 3 | 4 | localrules: 5 | download_annotation, 6 | download_genome 7 | 8 | def get_annotation(wildcards): 9 | return FTP.remote("ftp.ensembl.org/pub/release-{0}/gtf/{1}/{2}.GRC{3}{4}.{0}.gtf.gz".format( 10 | wildcards.release, 11 | wildcards.species, 12 | wildcards.species.capitalize(), 13 | wildcards.species.lower()[0], 14 | wildcards.build), 15 | static=True, 16 | keep_local=True) 17 | 18 | def get_genome(wildcards): 19 | return FTP.remote("ftp.ensembl.org/pub/release-{0}/fasta/{1}/dna/{2}.GRC{3}{4}.dna.primary_assembly.fa.gz".format( 20 | wildcards.release, 21 | wildcards.species, 22 | wildcards.species.capitalize(), 23 | wildcards.species.lower()[0], 24 | wildcards.build), 25 | static=True, 26 | keep_local=True) 27 | 28 | rule download_annotation: 29 | input: 30 | get_annotation 31 | output: 32 | "{ref_path}/{species}_{build}_{release}/annotation.gtf" 33 | shell: 34 | "gunzip -c -d {input} > {output}" 35 | 36 | rule download_genome: 37 | input: 38 | get_genome 39 | output: 40 | "{ref_path}/{species}_{build}_{release}/genome.fa" 41 | shell: 42 | 
"gunzip -d -c {input} > {output}" -------------------------------------------------------------------------------- /rules/extract_expression_single.smk: -------------------------------------------------------------------------------- 1 | """Extract expression fof single species""" 2 | 3 | #Which rules will be run on the host computer and not sent to nodes 4 | localrules: 5 | plot_rna_metrics, 6 | convert_long_to_mtx, 7 | compress_mtx 8 | 9 | rule extract_umi_expression: 10 | input: 11 | data='{results_dir}/samples/{sample}/final.bam', 12 | barcode_whitelist='{results_dir}/samples/{sample}/barcodes.csv' 13 | output: 14 | long='{results_dir}/samples/{sample}/umi/expression.long', 15 | dense=temp('{results_dir}/samples/{sample}/umi/expression.tsv') 16 | params: 17 | count_per_umi=config['EXTRACTION']['minimum-counts-per-UMI'], 18 | num_cells=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells']), 19 | umiBarcodeEditDistance=config['EXTRACTION']['UMI-edit-distance'], 20 | temp_directory=config['LOCAL']['temp-directory'], 21 | memory=config['LOCAL']['memory'], 22 | locus_list=','.join(config['EXTRACTION']['LOCUS']), 23 | strand_strategy=config['EXTRACTION']['strand-strategy'] 24 | conda: '../envs/dropseq_tools.yaml' 25 | shell: 26 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && DigitalExpression -m {params.memory}\ 27 | I={input.data}\ 28 | O={output.dense}\ 29 | EDIT_DISTANCE={params.umiBarcodeEditDistance}\ 30 | OUTPUT_LONG_FORMAT={output.long}\ 31 | STRAND_STRATEGY={params.strand_strategy}\ 32 | OUTPUT_READS_INSTEAD=false\ 33 | LOCUS_FUNCTION_LIST={{{params.locus_list}}}\ 34 | MIN_BC_READ_THRESHOLD={params.count_per_umi}\ 35 | CELL_BC_FILE={input.barcode_whitelist}""" 36 | 37 | rule extract_reads_expression: 38 | input: 39 | data='{results_dir}/samples/{sample}/final.bam', 40 | barcode_whitelist='{results_dir}/samples/{sample}/barcodes.csv' 41 | output: 42 | long=temp('{results_dir}/samples/{sample}/read/expression.long'), 43 | dense=temp('{results_dir}/samples/{sample}/read/expression.tsv') 44 | params: 45 | count_per_umi=config['EXTRACTION']['minimum-counts-per-UMI'], 46 | num_cells=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells']), 47 | umiBarcodeEditDistance=config['EXTRACTION']['UMI-edit-distance'], 48 | temp_directory=config['LOCAL']['temp-directory'], 49 | memory=config['LOCAL']['memory'], 50 | locus_list=','.join(config['EXTRACTION']['LOCUS']), 51 | strand_strategy=config['EXTRACTION']['strand-strategy'] 52 | conda: '../envs/dropseq_tools.yaml' 53 | shell: 54 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && DigitalExpression -m {params.memory}\ 55 | I={input.data}\ 56 | O={output.dense}\ 57 | EDIT_DISTANCE={params.umiBarcodeEditDistance}\ 58 | OUTPUT_LONG_FORMAT={output.long}\ 59 | STRAND_STRATEGY={params.strand_strategy}\ 60 | OUTPUT_READS_INSTEAD=true\ 61 | LOCUS_FUNCTION_LIST={{{params.locus_list}}}\ 62 | MIN_BC_READ_THRESHOLD={params.count_per_umi}\ 63 | CELL_BC_FILE={input.barcode_whitelist}""" 64 | 65 | 66 | rule SingleCellRnaSeqMetricsCollector: 67 | input: 68 | data='{results_dir}/samples/{sample}/final.bam', 69 | barcode_whitelist='{results_dir}/samples/{sample}/barcodes.csv', 70 | refFlat=expand("{ref_path}/{species}_{build}_{release}/curated_annotation.refFlat", 71 | ref_path=config['META']['reference-directory'], 72 | species=species, 73 | release=release, 74 | build=build), 75 | rRNA_intervals=expand("{ref_path}/{species}_{build}_{release}/annotation.rRNA.intervals", 76 | 
ref_path=config['META']['reference-directory'], 77 | species=species, 78 | release=release, 79 | build=build) 80 | params: 81 | temp_directory=config['LOCAL']['temp-directory'], 82 | memory=config['LOCAL']['memory'] 83 | output: 84 | rna_metrics='{results_dir}/logs/dropseq_tools/{sample}_rna_metrics.txt', 85 | conda: '../envs/dropseq_tools.yaml' 86 | shell: 87 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && SingleCellRnaSeqMetricsCollector -m {params.memory}\ 88 | INPUT={input.data}\ 89 | OUTPUT={output}\ 90 | ANNOTATIONS_FILE={input.refFlat}\ 91 | CELL_BC_FILE={input.barcode_whitelist}\ 92 | RIBOSOMAL_INTERVALS={input.rRNA_intervals} 93 | """ 94 | 95 | rule plot_rna_metrics: 96 | input: 97 | rna_metrics='{results_dir}/logs/dropseq_tools/{sample}_rna_metrics.txt', 98 | barcodes='{results_dir}/samples/{sample}/barcodes.csv' 99 | conda: '../envs/r.yaml' 100 | output: 101 | pdf='{results_dir}/plots/rna_metrics/{sample}_rna_metrics.pdf' 102 | script: 103 | '../scripts/plot_rna_metrics.R' 104 | 105 | 106 | rule convert_long_to_mtx: 107 | input: 108 | '{results_dir}/samples/{sample}/{type}/expression.long' 109 | output: 110 | barcodes='{results_dir}/samples/{sample}/{type}/barcodes.tsv', 111 | features='{results_dir}/samples/{sample}/{type}/genes.tsv', 112 | mtx='{results_dir}/samples/{sample}/{type}/matrix.mtx' 113 | params: 114 | samples=lambda wildcards: wildcards.sample 115 | script: 116 | "../scripts/convert_mtx.py" 117 | 118 | rule compress_mtx: 119 | input: 120 | barcodes='{results_dir}/samples/{sample}/{type}/barcodes.tsv', 121 | features='{results_dir}/samples/{sample}/{type}/genes.tsv', 122 | mtx='{results_dir}/samples/{sample}/{type}/matrix.mtx' 123 | output: 124 | barcodes='{results_dir}/samples/{sample}/{type}/barcodes.tsv.gz', 125 | features='{results_dir}/samples/{sample}/{type}/genes.tsv.gz', 126 | mtx='{results_dir}/samples/{sample}/{type}/matrix.mtx.gz' 127 | conda: '../envs/pigz.yaml' 128 | threads: 3 129 | shell: 130 | """pigz -p {threads} {input.barcodes} {input.features} {input.mtx}""" -------------------------------------------------------------------------------- /rules/extract_expression_species.smk: -------------------------------------------------------------------------------- 1 | """Extract expression for mixed species""" 2 | 3 | #Which rules will be run on the host computer and not sent to nodes 4 | localrules: 5 | plot_rna_metrics_species, 6 | convert_long_to_mtx_species 7 | 8 | rule extract_umi_expression_species: 9 | input: 10 | data='{results_dir}/samples/{sample}/{species}/unfiltered.bam', 11 | barcode_whitelist='{results_dir}/samples/{sample}/{species}/barcodes.csv' 12 | output: 13 | dense=temp('{results_dir}/samples/{sample}/{species}/umi/expression.txt'), 14 | long=temp('{results_dir}/samples/{sample}/{species}/umi/expression.long') 15 | params: 16 | count_per_umi=config['EXTRACTION']['minimum-counts-per-UMI'], 17 | num_cells=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells']), 18 | umiBarcodeEditDistance=config['EXTRACTION']['UMI-edit-distance'], 19 | temp_directory=config['LOCAL']['temp-directory'], 20 | memory=config['LOCAL']['memory'], 21 | locus_list=','.join(config['EXTRACTION']['LOCUS']), 22 | strand_strategy=config['EXTRACTION']['strand-strategy'] 23 | conda: '../envs/dropseq_tools.yaml' 24 | shell: 25 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && DigitalExpression -m {params.memory}\ 26 | I={input.data}\ 27 | O={output.dense}\ 28 | EDIT_DISTANCE={params.umiBarcodeEditDistance}\ 29 | 
OUTPUT_LONG_FORMAT={output.long}\ 30 | STRAND_STRATEGY={params.strand_strategy}\ 31 | OUTPUT_READS_INSTEAD=false\ 32 | LOCUS_FUNCTION_LIST={{{params.locus_list}}}\ 33 | MIN_BC_READ_THRESHOLD={params.count_per_umi}\ 34 | CELL_BC_FILE={input.barcode_whitelist}""" 35 | 36 | 37 | rule extract_reads_expression_species: 38 | input: 39 | data='{results_dir}/samples/{sample}/{species}/unfiltered.bam', 40 | barcode_whitelist='{results_dir}/samples/{sample}/{species}/barcodes.csv' 41 | params: 42 | count_per_umi=config['EXTRACTION']['minimum-counts-per-UMI'], 43 | num_cells=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells']), 44 | umiBarcodeEditDistance=config['EXTRACTION']['UMI-edit-distance'], 45 | temp_directory=config['LOCAL']['temp-directory'], 46 | memory=config['LOCAL']['memory'], 47 | locus_list=','.join(config['EXTRACTION']['LOCUS']), 48 | strand_strategy=config['EXTRACTION']['strand-strategy'] 49 | output: 50 | dense=temp('{results_dir}/samples/{sample}/{species}/read/expression.txt'), 51 | long=temp('{results_dir}/samples/{sample}/{species}/read/expression.long') 52 | conda: '../envs/dropseq_tools.yaml' 53 | shell: 54 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && DigitalExpression -m {params.memory}\ 55 | I={input.data}\ 56 | O={output.dense}\ 57 | EDIT_DISTANCE={params.umiBarcodeEditDistance}\ 58 | OUTPUT_LONG_FORMAT={output.long}\ 59 | STRAND_STRATEGY={params.strand_strategy}\ 60 | OUTPUT_READS_INSTEAD=true\ 61 | LOCUS_FUNCTION_LIST={{{params.locus_list}}}\ 62 | MIN_BC_READ_THRESHOLD={params.count_per_umi}\ 63 | CELL_BC_FILE={input.barcode_whitelist}""" 64 | 65 | rule convert_long_to_mtx_species: 66 | input: 67 | '{results_dir}/samples/{sample}/{species}/{type}/expression.long' 68 | output: 69 | barcodes='{results_dir}/samples/{sample}/{species}/{type}/barcodes.tsv', 70 | features='{results_dir}/samples/{sample}/{species}/{type}/genes.tsv', 71 | mtx='{results_dir}/samples/{sample}/{species}/{type}/matrix.mtx' 72 | params: 73 | samples=lambda wildcards: wildcards.sample 74 | script: 75 | "../scripts/convert_mtx.py" 76 | 77 | rule compress_mtx_species: 78 | input: 79 | barcodes='{results_dir}/samples/{sample}/{species}/{type}/barcodes.tsv', 80 | features='{results_dir}/samples/{sample}/{species}/{type}/genes.tsv', 81 | mtx='{results_dir}/samples/{sample}/{species}/{type}/matrix.mtx' 82 | output: 83 | barcodes='{results_dir}/samples/{sample}/{species}/{type}/barcodes.tsv.gz', 84 | features='{results_dir}/samples/{sample}/{species}/{type}/genes.tsv.gz', 85 | mtx='{results_dir}/samples/{sample}/{species}/{type}/matrix.mtx.gz' 86 | conda: '../envs/pigz.yaml' 87 | threads: 3 88 | shell: 89 | """pigz -p {threads} {input.barcodes} {input.features} {input.mtx}""" 90 | 91 | rule SingleCellRnaSeqMetricsCollector_species: 92 | input: 93 | data='{results_dir}/samples/{sample}/{species}/unfiltered.bam', 94 | barcode_whitelist='{results_dir}/samples/{sample}/{species}/barcodes.csv', 95 | refFlat=expand("{ref_path}/{species}_{build}_{release}/curated_annotation.refFlat", 96 | ref_path=ref_path, 97 | release=release, 98 | species=species, 99 | build=build), 100 | rRNA_intervals=expand("{ref_path}/{species}_{build}_{release}/annotation.rRNA.intervals", 101 | ref_path=ref_path, 102 | release=release, 103 | build=build, 104 | species=species) 105 | params: 106 | cells=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells']), 107 | memory=config['LOCAL']['memory'], 108 | temp_directory=config['LOCAL']['temp-directory'] 109 | output: 110 | 
'{results_dir}/logs/dropseq_tools/{sample}/{species}/rna_metrics.txt' 111 | conda: '../envs/dropseq_tools.yaml' 112 | shell: 113 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && SingleCellRnaSeqMetricsCollector -m {params.memory}\ 114 | INPUT={input.data}\ 115 | OUTPUT={output}\ 116 | ANNOTATIONS_FILE={input.refFlat}\ 117 | CELL_BC_FILE={input.barcode_whitelist}\ 118 | RIBOSOMAL_INTERVALS={input.rRNA_intervals} 119 | """ 120 | rule plot_rna_metrics_species: 121 | input: 122 | rna_metrics='{results_dir}/logs/dropseq_tools/{sample}/{species}/rna_metrics.txt', 123 | barcode='{results_dir}/samples/{sample}/{species}/barcodes.csv' 124 | conda: '../envs/r.yaml' 125 | output: 126 | pdf='{results_dir}/plots/rna_metrics/{sample}_{species}_rna_metrics.pdf' 127 | script: 128 | '../scripts/plot_rna_metrics.R' 129 | -------------------------------------------------------------------------------- /rules/fastqc.smk: -------------------------------------------------------------------------------- 1 | """Get fastqc reports""" 2 | 3 | #Which rules will be run on the host computer and not sent to nodes 4 | localrules: 5 | multiqc_fastqc_reads, 6 | multiqc_fastqc_barcodes, 7 | fasta_fastq_adapter 8 | 9 | 10 | rule fastqc_barcodes: 11 | """Create fastqc report""" 12 | input: 13 | get_R1_files, 14 | 'fastqc_adapter.tsv', 15 | output: 16 | html='{results_dir}/logs/fastqc/{sample}_R1_fastqc.html', 17 | zip='{results_dir}/logs/fastqc/{sample}_R1_fastqc.zip' 18 | params: '--extract -a fastqc_adapter.tsv' 19 | wrapper: 20 | '0.36.0/bio/fastqc' 21 | 22 | rule fastqc_reads: 23 | """Create fastqc report""" 24 | input: 25 | get_R2_files, 26 | 'fastqc_adapter.tsv', 27 | output: 28 | html='{results_dir}/logs/fastqc/{sample}_R2_fastqc.html', 29 | zip='{results_dir}/logs/fastqc/{sample}_R2_fastqc.zip' 30 | params: '--extract -a fastqc_adapter.tsv' 31 | wrapper: 32 | '0.36.0/bio/fastqc' 33 | 34 | 35 | rule multiqc_fastqc_barcodes: 36 | input: 37 | expand('{results_dir}/logs/fastqc/{sample}_R1_fastqc.html', sample=samples.index, results_dir=results_dir) 38 | output: 39 | html='{results_dir}/reports/fastqc_barcodes.html' 40 | params: '-m fastqc --ignore *_R2*' 41 | wrapper: 42 | '0.36.0/bio/multiqc' 43 | 44 | rule multiqc_fastqc_reads: 45 | input: 46 | expand('{results_dir}/logs/fastqc/{sample}_R2_fastqc.html', sample=samples.index, results_dir=results_dir) 47 | output: 48 | html='{results_dir}/reports/fastqc_reads.html' 49 | params: '-m fastqc --ignore *_R1*' 50 | wrapper: 51 | '0.36.0/bio/multiqc' 52 | 53 | rule fasta_fastq_adapter: 54 | input: 55 | fa=config['FILTER']['cutadapt']['adapters-file'] 56 | output: 57 | tsv="fastqc_adapter.tsv" 58 | conda: '../envs/merge_bam.yaml' 59 | script: 60 | '../scripts/fa2tsv.py' 61 | -------------------------------------------------------------------------------- /rules/filter.smk: -------------------------------------------------------------------------------- 1 | """Filter data""" 2 | 3 | 4 | #Which rules will be run on the host computer and not sent to nodes 5 | localrules: 6 | clean_cutadapt, 7 | plot_adapter_content, 8 | multiqc_cutadapt_barcodes, 9 | multiqc_cutadapt_RNA, 10 | detect_barcodes 11 | 12 | 13 | rule cutadapt_R1: 14 | input: 15 | R1=get_R1_files, 16 | adapters=config['FILTER']['cutadapt']['adapters-file'] 17 | output: 18 | fastq=temp('{results_dir}/samples/{sample}/trimmed_R1.fastq.gz') 19 | params: 20 | cell_barcode_length=round((config['FILTER']['cell-barcode']['end'] - config['FILTER']['cell-barcode']['start'] + 1) * 1.3), 21 | 
barcode_length=config['FILTER']['UMI-barcode']['end'], 22 | extra_params=config['FILTER']['cutadapt']['R1']['extra-params'], 23 | max_n=config['FILTER']['cutadapt']['R1']['maximum-Ns'], 24 | barcode_quality=config['FILTER']['cutadapt']['R1']['quality-filter'] 25 | threads: 10 26 | log: 27 | qc='{results_dir}/logs/cutadapt/{sample}_R1.qc.txt' 28 | conda: '../envs/cutadapt.yaml' 29 | shell: 30 | """cutadapt\ 31 | --max-n {params.max_n}\ 32 | -a file:{input.adapters}\ 33 | -g file:{input.adapters}\ 34 | -q {params.barcode_quality},{params.barcode_quality}\ 35 | --minimum-length {params.barcode_length}\ 36 | --cores={threads}\ 37 | --overlap {params.cell_barcode_length}\ 38 | -o {output.fastq} {input.R1}\ 39 | {params.extra_params} > {log.qc}""" 40 | 41 | rule cutadapt_R2: 42 | input: 43 | R2=get_R2_files, 44 | adapters=config['FILTER']['cutadapt']['adapters-file'] 45 | output: 46 | fastq=temp('{results_dir}/samples/{sample}/trimmed_R2.fastq.gz') 47 | params: 48 | extra_params=config['FILTER']['cutadapt']['R2']['extra-params'], 49 | read_quality=config['FILTER']['cutadapt']['R2']['quality-filter'], 50 | minimum_length=config['FILTER']['cutadapt']['R2']['minimum-length'], 51 | adapters_minimum_overlap=config['FILTER']['cutadapt']['R2']['minimum-adapters-overlap'], 52 | threads: 10 53 | log: 54 | qc='{results_dir}/logs/cutadapt/{sample}_R2.qc.txt' 55 | conda: '../envs/cutadapt.yaml' 56 | shell: 57 | """cutadapt\ 58 | -a file:{input.adapters}\ 59 | -g file:{input.adapters}\ 60 | -q {params.read_quality}\ 61 | --minimum-length {params.minimum_length}\ 62 | --cores={threads}\ 63 | --overlap {params.adapters_minimum_overlap}\ 64 | -o {output.fastq} {input.R2}\ 65 | {params.extra_params} > {log.qc}""" 66 | 67 | rule clean_cutadapt: 68 | input: 69 | R1='{results_dir}/logs/cutadapt/{sample}_R1.qc.txt', 70 | R2='{results_dir}/logs/cutadapt/{sample}_R2.qc.txt' 71 | output: 72 | '{results_dir}/logs/cutadapt/{sample}.clean_qc.csv' 73 | script: 74 | '../scripts/clean_cutadapt.py' 75 | 76 | rule repair: 77 | input: 78 | R1='{results_dir}/samples/{sample}/trimmed_R1.fastq.gz', 79 | R2='{results_dir}/samples/{sample}/trimmed_R2.fastq.gz' 80 | output: 81 | R1='{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz', 82 | R2='{results_dir}/samples/{sample}/trimmed_repaired_R2.fastq.gz' 83 | log: 84 | '{results_dir}/logs/bbmap/{sample}_repair.txt' 85 | params: 86 | memory='{}g'.format(int(config['LOCAL']['memory'].rstrip('g')) ) 87 | conda: '../envs/bbmap.yaml' 88 | threads: 4 89 | shell: 90 | """repair.sh\ 91 | -Xmx{params.memory}\ 92 | in={input.R1}\ 93 | in2={input.R2}\ 94 | out1={output.R1}\ 95 | out2={output.R2}\ 96 | repair=t\ 97 | threads={threads} 2> {log}""" 98 | 99 | rule detect_barcodes: 100 | input: 101 | R1='{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz' 102 | output: 103 | positions='{results_dir}/samples/{sample}/test.csv' 104 | conda: '../envs/merge_bam.yaml' 105 | script: 106 | '../scripts/detect_barcodes.py' 107 | 108 | rule plot_adapter_content: 109 | input: 110 | expand('{results_dir}/logs/cutadapt/{sample}.clean_qc.csv', sample=samples.index, results_dir=results_dir) 111 | params: 112 | Cell_length=config['FILTER']['cell-barcode']['end'] - config['FILTER']['cell-barcode']['start'] + 1, 113 | UMI_length=config['FILTER']['UMI-barcode']['end'] - config['FILTER']['UMI-barcode']['start'] + 1, 114 | sample_names=lambda wildcards: samples.index, 115 | batches=lambda wildcards: samples.loc[samples.index, 'batch'] 116 | conda: '../envs/r.yaml' 117 | output: 118 | 
pdf='{results_dir}/plots/adapter_content.pdf' 119 | script: 120 | '../scripts/plot_adapter_content.R' 121 | 122 | rule multiqc_cutadapt_barcodes: 123 | input: 124 | expand('{results_dir}/logs/cutadapt/{sample}_R1.qc.txt', sample=samples.index, results_dir=results_dir) 125 | params: '-m cutadapt --ignore *_R2*' 126 | output: 127 | html='{results_dir}/reports/barcode_filtering.html' 128 | wrapper: 129 | '0.36.0/bio/multiqc' 130 | 131 | rule multiqc_cutadapt_RNA: 132 | input: 133 | expand('{results_dir}/logs/cutadapt/{sample}_R2.qc.txt', sample=samples.index, results_dir=results_dir) 134 | params: '-m cutadapt --ignore *_R1*' 135 | output: 136 | html='{results_dir}/reports/RNA_filtering.html' 137 | wrapper: 138 | '0.36.0/bio/multiqc' 139 | -------------------------------------------------------------------------------- /rules/generate_meta.smk: -------------------------------------------------------------------------------- 1 | import math 2 | import platform 3 | """Generate all the meta data files""" 4 | # To add missing fields for an annotation of ERCC: awk -F'[\t|;]' '{printf $0" "; gsub(/id/,"name"); print $9";"$10"; exon_version \"1\";"}' 5 | #Which rules will be run on the host computer and not sent to nodes 6 | localrules: 7 | create_dict, 8 | reduce_gtf, 9 | create_refFlat, 10 | create_intervals, 11 | curate_annotation 12 | 13 | 14 | rule curate_annotation: 15 | input: 16 | biotypes=config['META']['gtf_biotypes'], 17 | annotation="{ref_path}/{species}_{build}_{release}/annotation.gtf" 18 | output: 19 | temp("{ref_path}/{species}_{build}_{release}/curated_annotation.gtf") 20 | params: 21 | patterns='|'.join(config['biotypes']) 22 | shell: 23 | """cat {input.annotation} | grep -E "{params.patterns}" > {output}""" 24 | 25 | 26 | rule create_dict: 27 | input: 28 | "{ref_path}/{species}_{build}_{release}/genome.fa" 29 | output: 30 | "{ref_path}/{species}_{build}_{release}/genome.dict" 31 | threads:1 32 | params: 33 | picard="$CONDA_PREFIX/share/picard-2.14.1-0/picard.jar", 34 | temp_directory=config['LOCAL']['temp-directory'] 35 | conda: '../envs/picard.yaml' 36 | shell: 37 | """java -jar -Djava.io.tmpdir={params.temp_directory} {params.picard} CreateSequenceDictionary\ 38 | REFERENCE={input}\ 39 | OUTPUT={output} 40 | """ 41 | 42 | rule reduce_gtf: 43 | input: 44 | reference_dict="{ref_path}/{species}_{build}_{release}/genome.dict", 45 | annotation="{ref_path}/{species}_{build}_{release}/curated_annotation.gtf" 46 | params: 47 | memory=config['LOCAL']['memory'], 48 | temp_directory=config['LOCAL']['temp-directory'] 49 | output: 50 | "{ref_path}/{species}_{build}_{release}/curated_reduced_annotation.gtf" 51 | conda: '../envs/dropseq_tools.yaml' 52 | shell: 53 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && ReduceGtf -m {params.memory}\ 54 | GTF={input.annotation}\ 55 | OUTPUT={output}\ 56 | SEQUENCE_DICTIONARY={input.reference_dict}\ 57 | IGNORE_FUNC_TYPE='null'\ 58 | ENHANCE_GTF='false'""" 59 | 60 | rule create_refFlat: 61 | input: 62 | reference_dict="{ref_path}/{species}_{build}_{release}/genome.dict", 63 | annotation="{ref_path}/{species}_{build}_{release}/curated_annotation.gtf" 64 | params: 65 | memory=config['LOCAL']['memory'], 66 | temp_directory=config['LOCAL']['temp-directory'] 67 | output: 68 | "{ref_path}/{species}_{build}_{release}/curated_annotation.refFlat" 69 | conda: '../envs/dropseq_tools.yaml' 70 | shell: 71 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && ConvertToRefFlat -m {params.memory}\ 72 | 
ANNOTATIONS_FILE={input.annotation}\ 73 | OUTPUT={output}\ 74 | SEQUENCE_DICTIONARY={input.reference_dict} 75 | """ 76 | 77 | rule create_intervals: 78 | input: 79 | annotation_reduced="{ref_path}/{species}_{build}_{release}/curated_reduced_annotation.gtf", 80 | reference_dict="{ref_path}/{species}_{build}_{release}/genome.dict" 81 | params: 82 | memory=config['LOCAL']['memory'], 83 | reference_directory=config['META']['reference-directory'], 84 | temp_directory=config['LOCAL']['temp-directory'], 85 | prefix="{species}_{build}_{release}/annotation" 86 | output: 87 | intervals="{ref_path}/{species}_{build}_{release}/annotation.rRNA.intervals" 88 | conda: '../envs/dropseq_tools.yaml' 89 | shell: 90 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && CreateIntervalsFiles -m {params.memory}\ 91 | REDUCED_GTF={input.annotation_reduced}\ 92 | SEQUENCE_DICTIONARY={input.reference_dict}\ 93 | O={params.reference_directory}\ 94 | PREFIX={params.prefix} 95 | """ 96 | 97 | rule get_genomeChrBinNbits: 98 | input: 99 | reference_file="{ref_path}/{species}_genome.fa" 100 | params: 101 | samples_file='samples.csv', 102 | reference_directory=config['META']['reference-directory'] 103 | output: 104 | '{params.reference_directory}/index_params.txt' 105 | run: 106 | """ 107 | from math import log2 108 | from platform import system 109 | if (system() == 'Darwin'): 110 | genomeLength = shell("wc -c {} | cut -d' ' -f2".format(snakemake.reference_file), iterable=True) 111 | else: 112 | genomeLength = shell("wc -c {} | cut -d' ' -f1".format(snakemake.reference_file), iterable=True) 113 | genomeLength = int(next(genomeLength)) 114 | referenceNumber = shell('grep "^>" {} | wc -l'.format(snakemake.reference_file), iterable=True) 115 | referenceNumber = int(next(referenceNumber)) 116 | value = min([18,int(log2(genomeLength/referenceNumber))]) 117 | """ 118 | 119 | def get_sjdbOverhang(wildcards): 120 | return(int(wildcards.read_length)-1) 121 | 122 | 123 | rule prep_star_index: 124 | input: 125 | reference_file="{ref_path}/{species}_genome.fa", 126 | config_file='config.yaml' 127 | output: 128 | '{reference_directory}/star_ref_config.txt' 129 | conda: 130 | '../envs/pyyaml.yaml' 131 | script: 132 | '../scripts/prep_star.py' 133 | 134 | 135 | 136 | 137 | rule create_star_index: 138 | input: 139 | reference_file="{ref_path}/{species}_{build}_{release}/genome.fa", 140 | annotation_file="{ref_path}/{species}_{build}_{release}/curated_annotation.gtf" 141 | params: 142 | sjdbOverhang=lambda wildcards: get_sjdbOverhang(wildcards), 143 | genomeDir='{ref_path}/{species}_{build}_{release}/STAR_INDEX/SA_{read_length}', 144 | genomeChrBinNbits=config['MAPPING']['STAR']['genomeChrBinNbits'] 145 | output: 146 | '{ref_path}/{species}_{build}_{release}/STAR_INDEX/SA_{read_length}/SA' 147 | threads: 24 148 | conda: '../envs/star.yaml' 149 | shell: 150 | """mkdir -p {params.genomeDir}; STAR\ 151 | --runThreadN {threads}\ 152 | --runMode genomeGenerate\ 153 | --genomeDir {params.genomeDir}\ 154 | --genomeFastaFiles {input.reference_file}\ 155 | --sjdbGTFfile {input.annotation_file}\ 156 | --sjdbOverhang {params.sjdbOverhang}\ 157 | --genomeChrBinNbits {params.genomeChrBinNbits}\ 158 | --genomeSAsparseD 2 159 | """ -------------------------------------------------------------------------------- /rules/map.smk: -------------------------------------------------------------------------------- 1 | """Align the data with STAR.""" 2 | 3 | 4 | #Which rules will be run on the host computer and not sent to nodes 5 | localrules: 
6 | multiqc_star, 7 | plot_yield, 8 | plot_knee_plot, 9 | pigz_unmapped 10 | 11 | 12 | rule STAR_align: 13 | input: 14 | fq1='{results_dir}/samples/{sample}/trimmed_repaired_R2.fastq.gz', 15 | index=lambda wildcards: '{}/{}_{}_{}/STAR_INDEX/SA'.format( 16 | config['META']['reference-directory'], 17 | species, 18 | build, 19 | release) + '_' + str(samples.loc[wildcards.sample,'read_length']) + '/SA' 20 | output: 21 | temp('{results_dir}/samples/{sample}/Aligned.out.bam'), 22 | '{results_dir}/samples/{sample}/Unmapped.out.mate1' 23 | 24 | log: 25 | '{results_dir}/samples/{sample}/Log.final.out' 26 | params: 27 | extra="""--outReadsUnmapped Fastx\ 28 | --outFilterMismatchNmax {}\ 29 | --outFilterMismatchNoverLmax {}\ 30 | --outFilterMismatchNoverReadLmax {}\ 31 | --outFilterMatchNmin {}\ 32 | --outFilterScoreMinOverLread {}\ 33 | --outFilterMatchNminOverLread {}""".format( 34 | config['MAPPING']['STAR']['outFilterMismatchNmax'], 35 | config['MAPPING']['STAR']['outFilterMismatchNoverLmax'], 36 | config['MAPPING']['STAR']['outFilterMismatchNoverReadLmax'], 37 | config['MAPPING']['STAR']['outFilterMatchNmin'], 38 | config['MAPPING']['STAR']['outFilterMatchNminOverLread'], 39 | config['MAPPING']['STAR']['outFilterScoreMinOverLread'],), 40 | index=lambda wildcards: '{}/{}_{}_{}/STAR_INDEX/SA'.format( 41 | config['META']['reference-directory'], 42 | species, 43 | build, 44 | release) + '_' + str(samples.loc[wildcards.sample,'read_length']) + '/' 45 | singularity: 46 | "shub://seb-mueller/singularity_dropSeqPipe:v04" 47 | threads: 24 48 | wrapper: 49 | "0.27.1/bio/star/align" 50 | # rule alevin: 51 | # input: 52 | # index='{salmon_index}', 53 | # R1="samples/{sample}/trimmed_repaired_R1.fastq.gz", 54 | # R2="samples/{sample}/trimmed_repaired_R2.fastq.gz", 55 | # conda: '../envs/salmon.yaml' 56 | # params: 57 | # cell_barcode_length=(config['FILTER']['cell-barcode']['end'] - config['FILTER']['cell-barcode']['start'] + 1), 58 | # umi_barcode_length=(config['FILTER']['UMI-barcode']['end'] - config['FILTER']['UMI-barcode']['start'] + 1) 59 | # output: 60 | # out_folder='samples/{sample}/salmon/', 61 | # counts='samples/{sample}/salmon/mapping.tsv' 62 | # shell: 63 | # """salmon alevin\ 64 | # -l ISR\ 65 | # -1 {input.R1}\ 66 | # -2 {input.R2}\ 67 | # -i {inout.index}\ 68 | # -p 10\ 69 | # -o {output.out_folder}\ 70 | # --tgMap {output.counts}\ 71 | # --barcodeLength {params.cell_barcode_length}\ 72 | # --umiLength {params.umi_barcode_length}\ 73 | # --end 5""" 74 | 75 | 76 | rule multiqc_star: 77 | input: 78 | expand('{results_dir}/samples/{sample}/Log.final.out', sample=samples.index, results_dir=results_dir) 79 | output: 80 | html='{results_dir}/reports/star.html' 81 | params: '-m star' 82 | wrapper: 83 | '0.36.0/bio/multiqc' 84 | 85 | rule pigz_unmapped: 86 | input: 87 | '{results_dir}/samples/{sample}/Unmapped.out.mate1' 88 | output: 89 | '{results_dir}/samples/{sample}/Unmapped.out.mate1.gz' 90 | threads: 4 91 | conda: '../envs/pigz.yaml' 92 | shell: 93 | """pigz -p 4 {input}""" 94 | 95 | rule MergeBamAlignment: 96 | input: 97 | mapped='{results_dir}/samples/{sample}/Aligned.out.bam', 98 | R1_ref = '{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz' 99 | output: 100 | temp('{results_dir}/samples/{sample}/Aligned.merged.bam') 101 | params: 102 | BC_start=config['FILTER']['cell-barcode']['start']-1, 103 | BC_end=config['FILTER']['cell-barcode']['end'], 104 | UMI_start=config['FILTER']['UMI-barcode']['start']-1, 105 | UMI_end=config['FILTER']['UMI-barcode']['end'], 106 | 
discard_secondary_alignements=True 107 | conda: '../envs/merge_bam.yaml' 108 | script: 109 | '../scripts/merge_bam.py' 110 | 111 | # Note: rule repair_barcodes (cell_barcodes.smk) creates Aligned.repaired.bam 112 | # this is using barcode information (i.e. dependent on expected_cells in config.yaml) 113 | 114 | 115 | rule TagReadWithGeneExon: 116 | input: 117 | data='{results_dir}/samples/{sample}/Aligned.repaired.bam', 118 | refFlat=expand("{ref_path}/{species}_{build}_{release}/curated_annotation.refFlat", 119 | ref_path=config['META']['reference-directory'], 120 | species=species, 121 | release=release, 122 | build=build) 123 | params: 124 | memory=config['LOCAL']['memory'], 125 | temp_directory=config['LOCAL']['temp-directory'] 126 | output: 127 | temp('{results_dir}/samples/{sample}/gene_exon_tagged.bam') 128 | conda: '../envs/dropseq_tools.yaml' 129 | shell: 130 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && TagReadWithGeneFunction -m {params.memory}\ 131 | INPUT={input.data}\ 132 | OUTPUT={output}\ 133 | ANNOTATIONS_FILE={input.refFlat} 134 | """ 135 | 136 | rule DetectBeadSubstitutionErrors: 137 | input: 138 | '{results_dir}/samples/{sample}/gene_exon_tagged.bam' 139 | output: 140 | data=temp('{results_dir}/samples/{sample}/gene_exon_tagged_bead_sub.bam'), 141 | report='{results_dir}/logs/dropseq_tools/{sample}_beadSubstitutionReport.txt', 142 | summary='{results_dir}/logs/dropseq_tools/{sample}_beadSubstitutionSummary.txt' 143 | params: 144 | SmartAdapter=config['FILTER']['5-prime-smart-adapter'], 145 | memory=config['LOCAL']['memory'], 146 | temp_directory=config['LOCAL']['temp-directory'] 147 | conda: '../envs/dropseq_tools.yaml' 148 | threads: 5 149 | shell: 150 | """ 151 | export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && DetectBeadSubstitutionErrors -m {params.memory}\ 152 | I={input}\ 153 | O={output.data}\ 154 | OUTPUT_REPORT={output.report}\ 155 | OUTPUT_SUMMARY={output.summary}\ 156 | NUM_THREADS={threads} 157 | """ 158 | 159 | rule bead_errors_metrics: 160 | input: 161 | '{results_dir}/samples/{sample}/gene_exon_tagged_bead_sub.bam' 162 | output: 163 | '{results_dir}/samples/{sample}/final.bam' 164 | params: 165 | out_stats='{results_dir}/logs/dropseq_tools/{sample}_synthesis_stats.txt', 166 | summary='{results_dir}/logs/dropseq_tools/{sample}_synthesis_stats_summary.txt', 167 | barcodes=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells']) * 2, 168 | memory =config['LOCAL']['memory'], 169 | SmartAdapter=config['FILTER']['5-prime-smart-adapter'], 170 | temp_directory=config['LOCAL']['temp-directory'] 171 | conda: '../envs/dropseq_tools.yaml' 172 | threads: 5 173 | shell: 174 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && DetectBeadSynthesisErrors -m {params.memory}\ 175 | INPUT={input}\ 176 | OUTPUT={output}\ 177 | OUTPUT_STATS={params.out_stats}\ 178 | SUMMARY={params.summary}\ 179 | NUM_BARCODES={params.barcodes}\ 180 | PRIMER_SEQUENCE={params.SmartAdapter}\ 181 | NUM_THREADS={threads} 182 | """ 183 | 184 | 185 | rule bam_hist: 186 | input: 187 | '{results_dir}/samples/{sample}/final.bam' 188 | params: 189 | memory=config['LOCAL']['memory'], 190 | temp_directory=config['LOCAL']['temp-directory'] 191 | output: 192 | '{results_dir}/logs/dropseq_tools/{sample}_hist_out_cell.txt' 193 | conda: '../envs/dropseq_tools.yaml' 194 | shell: 195 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && BamTagHistogram -m {params.memory}\ 196 | TAG=XC\ 197 | I={input}\ 198 | READ_MQ=10\ 199 | 
200 | """
201 | 
202 | 
203 | rule plot_yield:
204 | input:
205 | R1_filtered=expand('{results_dir}/logs/cutadapt/{sample}_R1.qc.txt', sample=samples.index, results_dir=results_dir),
206 | R2_filtered=expand('{results_dir}/logs/cutadapt/{sample}_R2.qc.txt', sample=samples.index, results_dir=results_dir),
207 | repaired=expand('{results_dir}/logs/bbmap/{sample}_repair.txt', sample=samples.index, results_dir=results_dir),
208 | STAR_output=expand('{results_dir}/samples/{sample}/Log.final.out', sample=samples.index, results_dir=results_dir),
209 | params:
210 | BC_length=config['FILTER']['cell-barcode']['end'] - config['FILTER']['cell-barcode']['start']+1,
211 | UMI_length=config['FILTER']['UMI-barcode']['end'] - config['FILTER']['UMI-barcode']['start']+1,
212 | sample_names=lambda wildcards: samples.index,
213 | batches=lambda wildcards: samples.loc[samples.index, 'batch']
214 | conda: '../envs/r.yaml'
215 | output:
216 | pdf='{results_dir}/plots/yield.pdf'
217 | script:
218 | '../scripts/plot_yield.R'
219 | 
220 | 
221 | rule plot_knee_plot:
222 | input:
223 | data='{results_dir}/logs/dropseq_tools/{sample}_hist_out_cell.txt',
224 | barcodes='{results_dir}/samples/{sample}/barcodes.csv'
225 | params:
226 | cells=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells'])
227 | conda: '../envs/r.yaml'
228 | output:
229 | pdf='{results_dir}/plots/knee_plots/{sample}_knee_plot.pdf'
230 | script:
231 | '../scripts/plot_knee_plot.R'
232 | 
-------------------------------------------------------------------------------- /rules/merge.smk: --------------------------------------------------------------------------------
1 | 
2 | localrules:
3 | merge_long,
4 | violine_plots,
5 | summary_stats
6 | 
7 | rule merge_long:
8 | input:
9 | expand('{results_dir}/samples/{sample}/{{type}}/expression.long', sample=samples.index, results_dir=results_dir)
10 | output:
11 | mtx='{results_dir}/summary/{type}/matrix.mtx',
12 | barcodes='{results_dir}/summary/{type}/barcodes.tsv',
13 | features='{results_dir}/summary/{type}/genes.tsv',
14 | params:
15 | samples=lambda wildcards: samples.index
16 | conda: '../envs/merge_long.yaml'
17 | script:
18 | "../scripts/convert_mtx.py"
19 | 
20 | # rule compress_mtx_summary:
21 | # input:
22 | # barcodes='{results_dir}/summary/{type}/barcodes.tsv',
23 | # features='{results_dir}/summary/{type}/features.tsv',
24 | # mtx='{results_dir}/summary/{type}/matrix.mtx'
25 | # output:
26 | # barcodes='{results_dir}/summary/{type}/barcodes.tsv.gz',
27 | # features='{results_dir}/summary/{type}/features.tsv.gz',
28 | # mtx='{results_dir}/summary/{type}/matrix.mtx.gz'
29 | # conda: '../envs/pigz.yaml'
30 | # threads: 3
31 | # shell:
32 | # """pigz -p {threads} {input.barcodes} {input.features} {input.mtx}"""
33 | 
34 | rule violine_plots:
35 | input:
36 | umi_mtx='{results_dir}/summary/umi/matrix.mtx',
37 | read_mtx='{results_dir}/summary/read/matrix.mtx',
38 | design='samples.csv'
39 | conda: '../envs/r.yaml'
40 | output:
41 | pdf_violine='{results_dir}/plots/violinplots_comparison_UMI.pdf',
42 | pdf_umivscounts='{results_dir}/plots/UMI_vs_counts.pdf',
43 | pdf_umi_vs_gene='{results_dir}/plots/UMI_vs_gene.pdf',
44 | pdf_count_vs_gene='{results_dir}/plots/Count_vs_gene.pdf',
45 | R_objects='{results_dir}/summary/R_Seurat_objects.rdata'
46 | script:
47 | '../scripts/plot_violine.R'
48 | 
49 | rule summary_stats:
50 | input:
51 | R_objects='{results_dir}/summary/R_Seurat_objects.rdata',
52 | R2qc=expand('{results_dir}/logs/cutadapt/{sample}_R2.qc.txt', sample=samples.index, results_dir=results_dir),
53 | hist_cell=expand('{results_dir}/logs/dropseq_tools/{sample}_hist_out_cell.txt', sample=samples.index, results_dir=results_dir)
54 | conda: '../envs/r.yaml'
55 | output:
56 | stats_pre='{results_dir}/summary/barcode_stats_pre_filter.csv',
57 | stats_post='{results_dir}/summary/barcode_stats_post_filter.csv',
58 | params:
59 | sample_names=lambda wildcards: samples.index,
60 | batches=lambda wildcards: samples.loc[samples.index, 'batch']
61 | script:
62 | '../scripts/create_summary_stats.R'
63 | 
-------------------------------------------------------------------------------- /rules/prepare.smk: --------------------------------------------------------------------------------
1 | import re
2 | import glob
3 | import gzip
4 | from collections import defaultdict
5 | 
6 | multi_lane_pattern = re.compile("../data\/(.*)_(L[0-9]{3})_(R[1-2])_001.fastq.gz")
7 | 
8 | 
9 | def get_input_files(wildcards):
10 | samples = [f for f in glob.glob("../{results_dir}/samples/*.fastq.gz") if re.match(multi_lane_pattern,f)]
11 | return(samples)
12 | 
13 | lanes = sorted(list(set([re.findall(multi_lane_pattern,f)[0][1] for f in glob.glob("../{results_dir}/samples/*.fastq.gz") if re.match(multi_lane_pattern,f)])))
14 | samples = [re.findall(multi_lane_pattern,f)[0][0] for f in glob.glob("../{results_dir}/samples/*.fastq.gz") if re.match(multi_lane_pattern,f)]
15 | 
16 | 
17 | 
18 | 
19 | rule all:
20 | input:
21 | expand('{results_dir}/samples/{sample}_R1.fastq.gz',sample=samples),
22 | expand('{results_dir}/samples/{sample}_R2.fastq.gz',sample=samples)
23 | 
24 | 
25 | rule generate_samples:
26 | input:
27 | get_input_files
28 | output:
29 | 'samples.csv'
30 | run:
31 | samples = defaultdict(lambda: {'sample_lanes':[],'read_length':0})
32 | with open(output[0],'w') as sample_file:
33 | sample_file.write("samples,expected_cells,read_length,batch\n")
34 | for file in input:
35 | if('R2' in file):
36 | with gzip.open(file) as fastq_file:
37 | next(fastq_file)
38 | read_length = len(next(fastq_file).strip())
39 | re_results = re.findall(multi_lane_pattern,file)
40 | samples[re_results[0][0]]['sample_lanes'].append(re_results[0][1])
41 | samples[re_results[0][0]]['read_length']=read_length
42 | for sample_name in samples:
43 | sample_file.write("{},,{},\n".format(sample_name, samples[sample_name]['read_length'])) # per-sample read length, not the last loop value
44 | 
45 | rule concat_lanes:
46 | input:
47 | R1=expand('{results_dir}/samples/{{sample}}_{lane}_R1_001.fastq.gz', lane=lanes),
48 | R2=expand('{results_dir}/samples/{{sample}}_{lane}_R2_001.fastq.gz', lane=lanes),
49 | lanes='samples.csv'
50 | output:
51 | R1='{results_dir}/samples/{sample}_R1.fastq.gz',
52 | R2='{results_dir}/samples/{sample}_R2.fastq.gz'
53 | shell:
54 | """cat {input.R1} > {output.R1};cat {input.R2} > {output.R2}"""
-------------------------------------------------------------------------------- /rules/report.smk: --------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | 
4 | localrules: create_publication_text
5 | 
6 | def get_yamls(wildcards):
7 | files = glob.glob('.snakemake/conda/*.yaml')
8 | return(files)
9 | 
10 | rule create_publication_text:
11 | input:
12 | config_file=configfile_path,
13 | yaml_files=get_yamls
14 | output:
15 | '{results_dir}/reports/publication_text.html'
16 | script:
17 | "../scripts/publication_text.Rmd"
-------------------------------------------------------------------------------- /rules/split_species.smk: --------------------------------------------------------------------------------
1 | """Extract species-specific expression to prepare the species plot."""
2 | 
3 | 
4 | # Which rules will be run on the host computer and not sent to nodes
5 | localrules: plot_barnyard
6 | 
7 | rule split_bam_species:
8 | input:
9 | '{results_dir}/samples/{sample}/final.bam'
10 | output:
11 | '{results_dir}/samples/{sample}/{species}/unfiltered.bam'
12 | params:
13 | species=lambda wildcards: wildcards.species,
14 | memory=config['LOCAL']['memory'],
15 | temp_directory=config['LOCAL']['temp-directory']
16 | conda: '../envs/dropseq_tools.yaml'
17 | shell:
18 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && FilterBam -m {params.memory}\
19 | REF_SOFT_MATCHED_RETAINED={params.species}\
20 | INPUT={input}\
21 | OUTPUT={output}"""
22 | 
23 | 
24 | rule extract_all_umi_expression:
25 | input:
26 | data='{results_dir}/samples/{sample}/{species}/unfiltered.bam',
27 | barcode_whitelist='{results_dir}/samples/{sample}/barcodes.csv'
28 | output:
29 | umi_matrix=temp('{results_dir}/samples/{sample}/{species}/unfiltered_umi_expression_matrix.tsv'),
30 | summary='{results_dir}/samples/{sample}/{species}/dge.summary.txt'
31 | params:
32 | count_per_umi=config['EXTRACTION']['minimum-counts-per-UMI'],
33 | cellBarcodeEditDistance=config['EXTRACTION']['UMI-edit-distance'],
34 | memory=config['LOCAL']['memory'],
35 | temp_directory=config['LOCAL']['temp-directory'],
36 | locus_list=','.join(config['EXTRACTION']['LOCUS'])
37 | conda: '../envs/dropseq_tools.yaml'
38 | shell:
39 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && DigitalExpression -m {params.memory}\
40 | I={input.data}\
41 | O={output.umi_matrix}\
42 | SUMMARY={output.summary}\
43 | EDIT_DISTANCE={params.cellBarcodeEditDistance}\
44 | CELL_BC_FILE={input.barcode_whitelist}\
45 | LOCUS_FUNCTION_LIST={{{params.locus_list}}}\
46 | MIN_BC_READ_THRESHOLD={params.count_per_umi}"""
47 | 
48 | 
49 | rule plot_barnyard:
50 | input:
51 | expand('{results_dir}/samples/{{sample}}/{species}/dge.summary.txt',species=config['META']['species'], results_dir=results_dir)
52 | output:
53 | genes_pdf='{results_dir}/plots/barnyard/{sample}_genes.pdf',
54 | transcripts_pdf='{results_dir}/plots/barnyard/{sample}_transcripts.pdf',
55 | barcodes_species=expand('{{results_dir}}/samples/{{sample}}/{species}/barcodes.csv', species=species_list)
56 | params:
57 | expected_cells=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells'])
58 | script:
59 | '../scripts/plot_species_plot.R'
-------------------------------------------------------------------------------- /schemas/config.schema.yaml: --------------------------------------------------------------------------------
1 | $schema: "http://json-schema.org/draft-06/schema#"
2 | 
3 | description: Validation schema for all config entries
4 | 
5 | properties:
6 | CONTACT:
7 | type: object
8 | description: Details for contacting the person that ran the pipeline.
9 | properties:
10 | person:
11 | type: string
12 | description: Name of the contact person.
13 | email:
14 | type: string
15 | pattern: ".*@.*"
16 | description: Email address of the contact person.
17 | default: john.doe@john.com
18 | LOCAL:
19 | type: object
20 | description: Computer/experiment local details, paths, options.
21 | properties:
22 | temp-directory:
23 | type: string
24 | description: Path of the temporary folder. Must have enough space to handle big files. Use scratch on clusters.
25 | memory:
26 | type: string
27 | description: Amount of memory needed for the java virtual machine as well as default for clusters.
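# Editor's illustrative example (placeholder values, not defaults): a
# config.yaml LOCAL section satisfying this part of the schema could look like
#     LOCAL:
#         temp-directory: /scratch/tmp
#         memory: 8g
#         raw_data: raw_data
#         results: results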
28 | raw_data:
29 | type: string
30 | description: Raw data folder path.
31 | results:
32 | type: string
33 | description: Results folder path.
34 | default: results
35 | required:
36 | - temp-directory
37 | - memory
38 | - raw_data
39 | - results
40 | META:
41 | type: object
42 | description: Details about metadata. Reference genomes and annotations.
43 | properties:
44 | species:
45 | type: object
46 | description: Details about species used in the experiment.
47 | properties:
48 | first_species:
49 | type: object
50 | description: First or only species of the experiment as in ensembl.
51 | properties:
52 | name:
53 | type: string
54 | description: Species name in lowercase.
55 | build:
56 | type: number
57 | description: Build number of the first species.
58 | release:
59 | type: number
60 | description: Release number of the first species
61 | required:
62 | - build
63 | - release
64 | second_species:
65 | type: object
66 | description: Second species of a mixed experiment as in ensembl.
67 | properties:
68 | name:
69 | type: string
70 | description: Species name in lowercase.
71 | build:
72 | type: number
73 | description: Build number of the second species.
74 | release:
75 | type: number
76 | description: Release number of the second species
77 | required:
78 | - build
79 | - release
80 | required:
81 | - first_species
82 | ratio:
83 | type: number
84 | description: Minimum percentage of total transcripts in one cell to validate a species
85 | reference-directory:
86 | type: string
87 | description: Folder that will contain all the references and metadata files.
88 | gtf_biotypes:
89 | type: string
90 | default: gtf_biotypes.yaml
91 | description: File that contains a list of biotypes that are kept for the annotation curation.
92 | FILTER:
93 | type: object
94 | description: Details about trimming, filtering and cell/UMI barcode structure.
95 | properties:
96 | barcode-whitelist:
97 | type: string
98 | description: Filename of the barcode whitelist
99 | 5-prime-smart-adapter:
100 | type: string
101 | pattern: "[ATGC]*"
102 | description: This is the adapter that comes before the cell barcode in a 3' protocol.
103 | cell-barcode:
104 | type: object
105 | description: Start and end positions for cell barcodes.
106 | properties:
107 | start:
108 | type: number
109 | description: Cell barcode's first position in R1.
110 | end:
111 | type: number
112 | description: Cell barcode's last position in R1.
113 | UMI-barcode:
114 | type: object
115 | description: Start and end positions for umi barcodes.
116 | properties:
117 | start:
118 | type: number
119 | description: UMI barcode's first position in R1
120 | end:
121 | type: number
122 | description: UMI barcode's last position in R1
123 | cutadapt:
124 | type: object
125 | description: Details about trimming and filtering in cutadapt.
126 | properties:
127 | adapters-file:
128 | type: string
129 | description: Adapters file name.
130 | R1:
131 | type: object
132 | description: Details for R1 trimming/filtering.
133 | properties:
134 | quality-filter:
135 | type: number
136 | description: Quality filtering value as described in cutadapt's documentation for the 3' end. https://cutadapt.readthedocs.io/en/stable/algorithms.html#quality-trimming-algorithm
137 | maximum-Ns:
138 | type: number
139 | description: Maximum number of Ns in R1.
140 | extra-params:
141 | type: string
142 | description: Additional parameters for R1 filtering/trimming. For experienced cutadapt users.
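# Editor's illustrative example (assumed drop-seq-style geometry, 1-based
# inclusive positions as described for cell-barcode/UMI-barcode above): a 12 bp
# cell barcode followed by an 8 bp UMI on R1 would be declared as
#     cell-barcode: {start: 1, end: 12}
#     UMI-barcode: {start: 13, end: 20}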
143 | required:
144 | - quality-filter
145 | - maximum-Ns
146 | R2:
147 | type: object
148 | description: Details for R2 trimming.
149 | properties:
150 | quality-filter:
151 | type: number
152 | description: Quality filtering value as described in cutadapt's documentation for the 3' end. https://cutadapt.readthedocs.io/en/stable/algorithms.html#quality-trimming-algorithm
153 | minimum-adapters-overlap:
154 | type: number
155 | description: Minimum number of bases that overlap with the mRNA.
156 | minimum-length:
157 | type: number
158 | description: Minimum length of R2 once it's trimmed. Anything under this value will be filtered out.
159 | extra-params:
160 | type: string
161 | description: Additional parameters for R2 filtering/trimming. For experienced cutadapt users.
162 | required:
163 | - quality-filter
164 | - minimum-adapters-overlap
165 | - minimum-length
166 | required:
167 | - adapters-file
168 | - R1
169 | - R2
170 | EXTRACTION:
171 | type: object
172 | description: Details for count extraction.
173 | properties:
174 | LOCUS:
175 | type: array
176 | description: Any combination of UTR, CODING and INTRON as an array.
177 | UMI-edit-distance:
178 | type: number
179 | description: Number of mismatches allowed between UMI barcodes when demultiplexing.
180 | default: 1
181 | minimum-counts-per-UMI:
182 | type: number
183 | description: Minimum number of UMI-GENE counts to count as a detected gene in a cell.
184 | default: 0
185 | strand-strategy:
186 | type: string
187 | description: Defines which read orientations relative to the annotated gene are counted. Can be SENSE (only count reads that mapped on the same strand), ANTISENSE (only count reads that mapped on the opposite strand) or BOTH (count all).
188 | required:
189 | - LOCUS
190 | - UMI-edit-distance
191 | - minimum-counts-per-UMI
192 | - strand-strategy
193 | MAPPING:
194 | type: object
195 | properties:
196 | STAR:
197 | type: object
198 | description: STAR mapper parameters
199 | properties:
200 | outFilterMismatchNmax:
201 | type: number
202 | description: Maximum number of mismatches allowed per alignment.
203 | default: 10
204 | outFilterMismatchNoverLmax:
205 | type: number
206 | description: Maximum ratio of mismatches to mapped length.
207 | default: 0.3
208 | outFilterMismatchNoverReadLmax:
209 | type: number
210 | description: Maximum ratio of mismatches to read length.
211 | default: 1
212 | outFilterMatchNmin:
213 | type: number
214 | description: Minimum number of matched bases for an alignment to be reported.
215 | default: 0
216 | outFilterMatchNminOverLread:
217 | type: number
218 | description: Minimum number of matched bases normalized to read length.
219 | default: 0
220 | outFilterScoreMinOverLread:
221 | type: number
222 | description: Minimum alignment score normalized to read length.
223 | default: 0
224 | genomeChrBinNbits:
225 | type: number
226 | description: log2 of the bin size used to store the genome; reduce for genomes with many small scaffolds.
227 | default: 18
228 | required:
229 | - outFilterMismatchNmax
230 | - outFilterMismatchNoverLmax
231 | - outFilterMismatchNoverReadLmax
232 | - outFilterMatchNmin
233 | - outFilterMatchNminOverLread
234 | - outFilterScoreMinOverLread
235 | - genomeChrBinNbits
236 | required:
237 | - STAR
238 | DEBUG:
239 | type: boolean
240 | description: Boolean value that enables debug mode for R scripts, providing Rdata of the snakemake object as well as the R env.
241 | default: FALSE
242 | required:
243 | - LOCAL
244 | - META
245 | - FILTER
246 | - EXTRACTION
247 | - MAPPING
-------------------------------------------------------------------------------- /schemas/samples.schema.yaml: --------------------------------------------------------------------------------
1 | $schema: "http://json-schema.org/draft-06/schema#"
2 | description: an entry in the sample sheet
3 | properties:
4 | samples:
5 | type: string
6 | description: sample name/identifier
7 | expected_cells:
8 | type: number
9 | description: Number of cells expected in an experiment. dropSeqPipe will extract 20% more than the value given.
10 | read_length:
11 | type: number
12 | description: Length of read2 (mRNA). Necessary for generating the STAR index.
13 | batch:
14 | type: string
15 | description: String value that gives a batch id
16 | 
17 | required:
18 | - samples
19 | - expected_cells
20 | - read_length
21 | - batch
-------------------------------------------------------------------------------- /scripts/clean_cutadapt.py: --------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | import re
3 | 
4 | def fill_results(snakemake,pair, adapter_results):
5 | adapter_pattern = re.compile(pattern="=== Adapter (.*) ===\n")
6 | with open(snakemake.input[pair], 'r') as logfile:
7 | line = logfile.readline()
8 | while(line):
9 | adapter_matched = re.findall(pattern=adapter_pattern,string=line)
10 | if(adapter_matched):
11 | logfile.readline()
12 | line = logfile.readline().rstrip('.\n')
13 | line_list = line.split(';')
14 | adapter_results[adapter_matched[0]]['Pair'] = pair
15 | adapter_results[adapter_matched[0]]['Sequence'] = line_list[0].split(':')[1].strip()
16 | # The key is guaranteed to exist at this point (defaultdict plus the
17 | # assignments above), so the former identical if/else branches collapse
18 | # into a single increment:
19 | adapter_results[adapter_matched[0]]['Times'] += int(line_list[3].split(':')[1].split(' ')[1].strip())
20 | line = logfile.readline()
21 | return(adapter_results)
22 | 
23 | adapter_results_R1 = defaultdict(lambda :{'Pair':'','Sequence':'','Times':0})
24 | adapter_results_R2 = defaultdict(lambda :{'Pair':'','Sequence':'','Times':0})
25 | 
26 | 
27 | adapter_results_R1 = fill_results(snakemake, 'R1', adapter_results_R1)
28 | adapter_results_R2 = fill_results(snakemake, 'R2', adapter_results_R2)
29 | 
30 | with open(snakemake.output[0],'w') as outfile:
31 | outfile.write('Adapter,Sequence,Pair,Count\n')
32 | for adapter in adapter_results_R1:
33 | outfile.write("{},{},{},{}\n".format(adapter, adapter_results_R1[adapter]['Sequence'],adapter_results_R1[adapter]['Pair'],adapter_results_R1[adapter]['Times']))
34 | for adapter in adapter_results_R2:
35 | outfile.write("{},{},{},{}\n".format(adapter, adapter_results_R2[adapter]['Sequence'],adapter_results_R2[adapter]['Pair'],adapter_results_R2[adapter]['Times']))
-------------------------------------------------------------------------------- /scripts/convert_mtx.py: --------------------------------------------------------------------------------
1 | # Converts the long format given by dropseq-tools v2.0.0 into the sparse mtx format.
2 | # Output provides features, cell barcodes and counts in separate files.
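# Editor's illustrative example (made-up values, for clarity): one long-format
# input line such as
#     AAACCTGAGCGT<TAB>Gene1<TAB>42
# is rewritten below as the MatrixMarket coordinate entry
#     <feature_index> <barcode_index> 42
# while barcodes.tsv and genes.tsv each record the barcode/feature once.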
3 | # Can handle one or multiple samples at a time
4 | import sys
5 | import os
6 | import subprocess
7 | 
8 | 
9 | samples = snakemake.params['samples']
10 | barcodes = {}
11 | features = {}
12 | 
13 | out_folder = os.path.dirname(snakemake.output['mtx'])
14 | out_barcodes = snakemake.output.barcodes
15 | out_features = snakemake.output.features
16 | mtx = snakemake.output.mtx
17 | temp_mtx = os.path.join(out_folder,'temp_umi.mtx')
18 | header = os.path.join(out_folder,'header.mtx')
19 | n_lines=0
20 | barcode_index = 1
21 | feature_index = 1
22 | 
23 | with open(temp_mtx,'w') as mtx_stream:
24 | for i,sample in enumerate(snakemake.input):
25 | if samples[i] not in sample:
26 | sys.exit("Sample name not found in file path")
27 | with open(sample,'r') as input_file:
28 | next(input_file) # skip first line
29 | for line in input_file:
30 | barcode,feature,count = line.strip().split('\t')
31 | if(not isinstance(samples,str)):
32 | barcode = samples[i] + '_' + barcode
33 | if(barcode not in barcodes):
34 | barcodes[barcode] = barcode_index
35 | barcode_index += 1
36 | if(feature not in features):
37 | features[feature] = feature_index
38 | feature_index += 1
39 | mtx_stream.write('{} {} {}\n'.format(features[feature],barcodes[barcode],count))
40 | n_lines +=1
41 | 
42 | with open(out_barcodes,'w') as barcodes_outfile:
43 | for barcode in barcodes:
44 | barcodes_outfile.write('{}\n'.format(barcode))
45 | 
46 | with open(out_features,'w') as features_outfile:
47 | for feature in features:
48 | features_outfile.write('{}\n'.format(feature))
49 | 
50 | with open(header,'w') as header_outfile:
51 | header_outfile.write("%%MatrixMarket matrix coordinate real general\n")
52 | header_outfile.write('{} {} {}\n'.format(len(features), len(barcodes), n_lines))
53 | 
54 | subprocess.call("cat {} {} > {}".format(header, temp_mtx, mtx), shell=True)
55 | 
56 | os.remove(temp_mtx)
57 | os.remove(header)
-------------------------------------------------------------------------------- /scripts/create_summary_stats.R: --------------------------------------------------------------------------------
1 | #' ---
2 | #' title: create_summary_stats.R
3 | #' author: Sebastian Mueller (sebm_at_posteo.de)
4 | #' date: 2019-03-04
5 | #' desc: Creating various summary statistics for barcodes (pre and post filtered)
6 | 
7 | # o A delimited file containing information for each STAMP (before cut-off) on number of UMIs, number of Genes detected/captured and the number of NGS-reads
8 | # o A separate delimited file containing information for each STAMP (after cut-off) on number of UMIs, number of Genes detected/captured and the number of NGS-reads
9 | # Example of the format could be as follows:
10 | # STAMP id Number of NGS-reads Number of UMIs Number of Genes Detected
11 | # STAMP1 1000000 50000 6000
12 | #' ---
13 | 
14 | #------------------------------------ for debugging:
15 | # add the following line in config.yaml (without the #)
16 | # DEBUG: True
17 | # This will create R objects in the debug directory containing the snakemake
18 | # object that can be loaded into a custom R session as below:
19 | # load("debug/snakemake_create_summary_stats.rdata")
20 | # load(file="debug/R_image_create_summary_stats.rdata")
21 | 
22 | debug_flag <- FALSE
23 | # check if DEBUG flag is set
24 | if (snakemake@config$DEBUG) {
25 | debug_flag <- TRUE
26 | message("In debug mode: saving R objects to inspect later")
27 | path_debug <- file.path(snakemake@config$LOCAL$results, "debug")
28 | dir.create(path_debug, showWarnings = FALSE)
29 | save(snakemake, file = file.path(path_debug, "create_summary_stats_snakemake.rdata"))
30 | }
31 | #### /debug
32 | 
33 | library(dplyr) # Dataframe manipulation
34 | library(Matrix) # Sparse matrices
35 | library(stringr)
36 | library(RColorBrewer)
37 | library(devtools)
38 | library(Seurat)
39 | library(plotly)
40 | 
41 | samples <- snakemake@params$sample_names
42 | batches <- snakemake@params$batches
43 | 
44 | 
45 | # importing Seurat object
46 | 
47 | seuratobj <- readRDS(file = file.path(snakemake@input$R_objects))
48 | meta.data <- seuratobj@meta.data
49 | 
50 | # subset only the highest STAMPs as set by expected_cells in samples.csv
51 | # This is necessary since more STAMPs are selected as a safety margin, which now have to be taken out again to calculate stats.
52 | 
53 | meta.data.sub <- meta.data %>%
54 | group_by(orig.ident) %>%
55 | arrange(desc(nCounts)) %>%
56 | slice(1:expected_cells[1]) %>% # makes sure only the expected number of cells is kept
57 | as.data.frame()
58 | 
59 | gini_index <- function (x, weights = rep(1, length = length(x))) {
60 | ox <- order(x)
61 | x <- x[ox]
62 | weights <- weights[ox] / sum(weights)
63 | p <- cumsum(weights)
64 | nu <- cumsum(weights * x)
65 | n <- length(nu)
66 | nu <- nu/nu[n]
67 | sum(nu[-1] * p[-n]) - sum(nu[-n] * p[-1])
68 | }
69 | 
70 | #------------------------------------ post-filter-stats
71 | # stats based only on the most abundant `expected-cells` barcodes,
72 | # taken from the Seurat object generated in the violine_plots rule.
73 | 
74 | # median calculator
75 | 
76 | stats_post <- meta.data.sub %>%
77 | group_by(orig.ident) %>%
78 | summarise(
79 | Total_nb_reads = sum(nCounts),
80 | Nb_STAMPS = mean(expected_cells), # should be all the same anyway..
81 | Median_reads_per_STAMP = round(median(nCounts), 2),
82 | Mean_reads_per_STAMP = round(mean(nCounts), 2),
83 | Total_nb_UMIs = sum(nUMI),
84 | Median_UMIs_per_STAMP = round(median(nUMI), 2),
85 | Mean_UMIs_per_STAMP = round(mean(nUMI), 2),
86 | Mean_UMIs_per_Gene = round(mean(umi.per.gene), 2),
87 | Median_number_genes_per_STAMP = round(median(nGene), 2),
88 | Mean_number_genes_per_STAMP = round(mean(nGene), 2),
89 | Mean_Ribo_pct = round(100 * mean(pct.Ribo), 2),
90 | Mean_Mito_pct = round(100 * mean(pct.mito), 2),
91 | Mean_Count_per_UMI = round(sum(nCounts) / sum(nUMI), 2),
92 | Read_length = mean(read_length), # should be all the same anyway..
93 | Number_barcodes_used_for_debug = n()
94 | ) %>%
95 | as.data.frame()
96 | 
97 | row.names(stats_post) <- stats_post$orig.ident
98 | 
99 | 
100 | 
101 | # highest, lowest count/UMI Stamp
102 | # pre STAMP stats
103 | 
104 | # hist out goes into knee plots
105 | # 'results/logs/{sample}_hist_out_cell.txt'
106 | # """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && BAMTagHistogram -m {params.memory}\
107 | # TAG=XC\
108 | # https://hpc.nih.gov/apps/dropseq.html
109 | # there is no hint in the documentation on any filtering (only read quality)
110 | 
111 | #------------------------------------ pre-filter-stats
112 | # calculating statistics based on barcodes before thresholding (i.e. keeping the most abundant barcodes based on `expected cells`)
113 | # This is based on 'logs/{sample}_hist_out_cell.txt' generated by `BAMTagHistogram` from dropseq-tools
114 | # TODO: The documentation only mentions duplicate and quality filters. But it seems to do more filtering since most barcodes are expected to have only one read assigned but there are usually more. Find out.
115 | # https://hpc.nih.gov/apps/dropseq.html
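# Editor's note -- a quick illustrative sanity check for gini_index() above:
#   gini_index(rep(1, 4))         # 0    : reads spread perfectly evenly
#   gini_index(c(100, 0, 0, 0))   # 0.75 : one barcode holds all the reads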
116 | 
117 | # TODO: Nr UMI for pre filter
118 | 
119 | stats_pre <- data.frame(matrix(nrow = length(samples), ncol = 10))
120 | colnames(stats_pre) <- c(
121 | "Sample",
122 | "Batch",
123 | "Total_raw_reads",
124 | "Nr_barcodes_total",
125 | "Nr_barcodes_more_than_1_reads",
126 | "Nr_barcodes_more_than_10_reads",
127 | "percentile99",
128 | "percentile95",
129 | "percentile50",
130 | "Gini-index"
131 | )
132 | 
133 | stats_pre[, "Sample"] <- samples
134 | stats_pre[, "Batch"] <- batches
135 | 
136 | 
137 | for (i in 1:length(samples)) {
138 | # importing 'logs/{sample}_hist_out_cell.txt'
139 | hist_out <- read.table(
140 | file = snakemake@input$hist_cell[i],
141 | header = FALSE, stringsAsFactors = FALSE
142 | )
143 | mysample <- samples[i]
144 | reads <- hist_out$V1
145 | barcodes <- hist_out$V2
146 | # calculations on reads
147 | # total reads are not sum(reads)! Needs to be taken from
148 | # results/logs/cutadapt/{sample}_R2.qc.txt
149 | #
150 | # read in full text file "sample_R2.qc.txt"
151 | filedump <- readLines(snakemake@input$R2qc[i])
152 | # subset line matching a pattern
153 | total_reads <- filedump[grep("Total reads processed:", filedump)] %>% # extract line
154 | str_extract("[0-9,]+") %>% # extract number from line
155 | str_replace_all(",", "") %>% # delete comma for subsequent numeric casting
156 | as.numeric()
157 | reads_cumsum <- cumsum(reads)
158 | reads_cumsum_perc <- (reads_cumsum / sum(reads))
159 | # reporting stats
160 | stats_pre[i, "Total_raw_reads"] <- total_reads
161 | stats_pre[i, "Reads_assigned_to_expected_STAMPs"] <- sum(reads[1:stats_post$Nb_STAMPS[i]])
162 | stats_pre[i, "Nr_barcodes_total"] <- length(barcodes)
163 | stats_pre[i, "percentile99"] <- which.min(reads_cumsum_perc < 0.99)
164 | stats_pre[i, "percentile95"] <- which.min(reads_cumsum_perc < 0.95)
165 | stats_pre[i, "percentile50"] <- which.min(reads_cumsum_perc < 0.50)
166 | stats_pre[i, "Nr_barcodes_more_than_1_reads"] <- sum(reads > 1)
167 | stats_pre[i, "Nr_barcodes_more_than_10_reads"] <- sum(reads > 10)
168 | stats_pre[i, "Gini-index"] <- round(gini_index(reads), 2)
169 | expected_cells <- as.numeric(filter(stats_post, orig.ident==mysample) %>% select(Nb_STAMPS))
170 | # % of reads left after applying the expected_cells cutoff
171 | stats_post[mysample, "Pct_reads_after_filter_expected_cells"] <-
172 | round(100 * (
173 | reads_cumsum[expected_cells] / sum(reads)
174 | ), 2)
175 | # % of reads left after applying all filters including mapping etc.
176 | # That's the effective number of usable reads of the sequencing run
177 | stats_post[mysample, "Pct_reads_after_filter_everything"] <-
178 | round(100 * (
179 | filter(stats_post, orig.ident==mysample) %>% select(Total_nb_reads) /
180 | stats_pre [i, "Total_raw_reads"]), 2
181 | )
182 | }
183 | 
184 | stats_pre <- stats_pre %>%
185 | arrange(Sample)
186 | 
187 | stats_post <- stats_post %>%
188 | arrange((orig.ident))
189 | 
190 | # output
191 | write.csv(stats_pre, file.path(snakemake@output$stats_pre))
192 | write.csv(stats_post, file.path(snakemake@output$stats_post)) # writes table for excel
193 | 
194 | if (debug_flag) {
195 | save.image(file = file.path(path_debug, "create_summary_stats_workspace.rdata"))
196 | }
197 | 
-------------------------------------------------------------------------------- /scripts/detect_barcodes.py: --------------------------------------------------------------------------------
1 | from Bio import SeqIO
2 | import gzip
3 | from collections import Counter
4 | 
5 | fastq_parser = SeqIO.parse(gzip.open(snakemake.input.R1, "rt"), "fastq")
6 | sequences = []
7 | n=0
8 | for fastq_R1 in fastq_parser:
9 | sequences.append(str(fastq_R1.seq))
10 | n+=1
11 | if(n==10000000):
12 | break
13 | def parse_barcodes(sequences):
14 | counts={}
15 | ranges = range(5,len(sequences[0]))
16 | 
17 | for cell_bc_length in ranges:
18 | counts[cell_bc_length] = list()
19 | for fastq_R1 in sequences:
20 | counts[cell_bc_length].append(fastq_R1[0:cell_bc_length])
21 | return(counts)
22 | 
23 | 
24 | 
25 | counts = parse_barcodes(sequences)
26 | 
27 | with open(snakemake.output[0], "w") as outfile:
28 | outfile.write('bc_length,first_counts\n')
29 | for cell_bc_length in counts:
30 | outfile.write('{},{}\n'.format(cell_bc_length,str(Counter(counts[cell_bc_length]).most_common(100)[0][1])))
31 | 
-------------------------------------------------------------------------------- /scripts/fa2tsv.py: --------------------------------------------------------------------------------
1 | #' ---
2 | #' title: fa2tsv.py
3 | #' author: Sebastian Mueller (sebm_at_posteo.de)
4 | #' date: 2019-03-04
5 | #' Converts a fasta file into a tab-separated file suitable as input for FastQC.
6 | #' This allows FastQC to use customized adapters via the -a option
7 | #' ---
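#' Editor's illustrative example (made-up record): the FASTA entry
#'     >CustomAdapter1
#'     ACGTACGTACGTACGT
#' becomes the tab-separated line (first number_bp + 1 = 13 bases, see below)
#'     CustomAdapter1<TAB>ACGTACGTACGTA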
8 | 
9 | import sys
10 | from Bio import SeqIO
11 | 
12 | number_bp = 12
13 | 
14 | with open( snakemake.output['tsv'], "w" ) as output:
15 | for seq_record in SeqIO.parse(snakemake.input['fa'], "fasta"):
16 | myline = (str(seq_record.id)) + "\t" + str(seq_record.seq[0:(number_bp + 1)]) + "\n" # keeps the first number_bp + 1 bases
17 | output.write(myline)
18 | 
19 | 
-------------------------------------------------------------------------------- /scripts/generate_extended_ref.py: --------------------------------------------------------------------------------
1 | from itertools import combinations, product
2 | from collections import defaultdict
3 | from copy import deepcopy
4 | import pickle
5 | from shutil import copyfile
6 | 
7 | 
8 | def save_obj(obj, name):
9 | with open(name, 'wb') as f:
10 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
11 | 
12 | def generate_all(barcode, reference, mapping, edit_distance):
13 | mutants = generate_mutants(barcode, edit_distance)
14 | for mutant in mutants:
15 | if(mutant not in reference):
16 | reference.add(mutant)
17 | mapping[edit_distance][mutant]['ref'] = barcode
18 | mapping[edit_distance][mutant]['count'] = 0
19 | mapping[edit_distance][mutant]['lanes'] = {'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0}
20 | 
21 | mapping['unknown']=defaultdict()
22 | return(reference, mapping)
23 | 
24 | 
25 | def generate_mutants(sequence, d=1):
26 | """Taken from stackoverflow: https://stackoverflow.com/a/19823295/9178565"""
27 | N = len(sequence)
28 | letters = 'ACGTN'
29 | pool = list(sequence)
30 | for indices in combinations(range(N), d):
31 | for replacements in product(letters, repeat=d):
32 | skip = False
33 | for i, a in zip(indices, replacements):
34 | if pool[i] == a: skip = True
35 | if skip: continue
36 | 
37 | keys = dict(zip(indices, replacements))
38 | yield ''.join([pool[i] if i not in indices else keys[i]
39 | for i in range(N)])
40 | 
41 | # Create empty sets and defaultdicts
42 | barcode_ref = set()
43 | mapping=defaultdict(dict)
44 | 
45 | # Initiate ref and mapping with the given barcodes
46 | with open(snakemake.input.whitelist,'r') as ref_file:
47 | for line in ref_file.readlines():
48 | barcode = line.strip()
49 | barcode_ref.add(barcode)
50 | mapping[0][barcode]=defaultdict(dict)
51 | mapping[0][barcode]['ref'] = barcode
52 | mapping[0][barcode]['count'] = 0
53 | mapping[0][barcode]['lanes'] = {'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0}
54 | 
55 | barcode_ext_ref = deepcopy(barcode_ref)
56 | # For now the edit distance is one, but it can be extended to a higher number later on.
57 | max_edit_distance = 1
58 | for edit_distance in range(1,max_edit_distance+1):
59 | mapping[edit_distance]=defaultdict(dict)
60 | for barcode in mapping[0]:
61 | (barcode_ext_ref,mapping) = generate_all(barcode, barcode_ext_ref, mapping, edit_distance)
62 | 
63 | # Delete given barcodes out of the new reference. This helps later on when running "repair_barcodes.py"
64 | barcode_ref = set(mapping[0])
65 | barcode_ext_ref.difference_update(barcode_ref)
66 | 
67 | # Save mapping and references to reuse later.
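# Editor's illustrative example (toy 2 bp barcode; real barcodes are longer):
# for whitelist entry 'AC' at edit distance 1, generate_mutants('AC', 1) yields
# CC, GC, TC, NC, AA, AG, AT, AN, and afterwards e.g.
#     mapping[1]['CC'] == {'ref': 'AC', 'count': 0, 'lanes': {'1': 0, ..., '8': 0}}
# while 'AC' stays in barcode_ref and its mutants end up in barcode_ext_ref.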
68 | save_obj(obj=mapping, name=snakemake.output.barcode_mapping)
69 | save_obj(obj=barcode_ref,name=snakemake.output.barcode_ref)
70 | save_obj(obj=barcode_ext_ref,name=snakemake.output.barcode_ext_ref)
71 | 
72 | 
73 | copyfile(snakemake.input['whitelist'], snakemake.output['barcodes'])
-------------------------------------------------------------------------------- /scripts/merge_bam.py: --------------------------------------------------------------------------------
1 | import pysam
2 | import re
3 | import os
4 | from Bio import SeqIO
5 | import gzip
6 | from collections import defaultdict
7 | import sys
8 | 
9 | # This function fills in a dict with read ids
10 | # and their corresponding cell and umi barcodes until it finds
11 | # a specific read id
12 | 
13 | discard_secondary_alignements = snakemake.params['discard_secondary_alignements']
14 | 
15 | barcodes_struct = {
16 | 'BC_start':snakemake.params['BC_start'],
17 | 'BC_end':snakemake.params['BC_end'],
18 | 'UMI_start':snakemake.params['UMI_start'],
19 | 'UMI_end':snakemake.params['UMI_end']
20 | }
21 | 
22 | def parse_barcodes(fastq_parser, query_name, read_barcodes, barcodes_struct):
23 | for fastq_R1 in fastq_parser:
24 | # Some sequencers give a /1 and /2 to R1 and R2 read ids respectively. This attempts to solve issue #69.
25 | if '/' in fastq_R1.id:
26 | R1_id = fastq_R1.id[:fastq_R1.id.find("/")]
27 | else:
28 | R1_id = fastq_R1.id
29 | read_barcodes[R1_id]['XC'] = str(fastq_R1.seq)[barcodes_struct['BC_start']:barcodes_struct['BC_end']]
30 | read_barcodes[R1_id]['XM'] = str(fastq_R1.seq)[barcodes_struct['UMI_start']:barcodes_struct['UMI_end']]
31 | if(read_barcodes[R1_id]['XM']==''):
32 | raise SystemExit('UMI empty for read {}.\n The barcode is: {}.\nWhole entry is:{}'.format(R1_id, fastq_R1.seq,fastq_R1))
33 | if (R1_id == query_name):
34 | return(fastq_parser,read_barcodes)
35 | return(fastq_parser,read_barcodes)
36 | 
37 | infile_bam = pysam.AlignmentFile(snakemake.input[0], "rb")
38 | 
39 | fastq_parser = SeqIO.parse(gzip.open(snakemake.input[1], "rt"), "fastq")
40 | 
41 | outfile = pysam.AlignmentFile(snakemake.output[0], "wb", template=infile_bam)
42 | 
43 | read_barcodes = defaultdict(lambda :{'XC':'','XM':''})
44 | 
45 | for bam_read in infile_bam:
46 | if(discard_secondary_alignements and bam_read.is_secondary):
47 | continue
48 | if (bam_read.query_name) in read_barcodes:
49 | current_barcodes = read_barcodes.pop(bam_read.query_name)
50 | tags = bam_read.get_tags()
51 | tags.extend([
52 | ('XC', current_barcodes['XC'],'Z'),
53 | ('XM', current_barcodes['XM'],'Z')])
54 | bam_read.set_tags(tags)
55 | else:
56 | fastq_parser,read_barcodes = parse_barcodes(fastq_parser, bam_read.query_name, read_barcodes, barcodes_struct)
57 | if (bam_read.query_name) not in read_barcodes:
58 | os.remove(snakemake.output[0]) # clean up the partial output before aborting
59 | raise SystemExit('Read {} from mapped file is missing in reference fastq file!'.format(bam_read.query_name))
60 | current_barcodes = read_barcodes.pop(bam_read.query_name)
61 | tags = bam_read.get_tags()
62 | tags.extend([
63 | ('XC', current_barcodes['XC'],'Z'),
64 | ('XM', current_barcodes['XM'],'Z')])
65 | bam_read.set_tags(tags)
66 | outfile.write(bam_read)
-------------------------------------------------------------------------------- /scripts/plot_adapter_content.R: --------------------------------------------------------------------------------
1 | #------------------------------------ for debugging:
2 | # For debugging add the following line in config.yaml (without the #)
3 | # DEBUG: True
4 | # This will create R objects in the debug directory containing the snakemake
5 | # object that can be loaded into a custom R session as below:
6 | 
7 | debug_flag <- FALSE
8 | # check if DEBUG flag is set
9 | if (snakemake@config$DEBUG) {
10 | debug_flag <- TRUE
11 | message("In debug mode: saving R objects to inspect later")
12 | path_debug <- file.path(snakemake@config$LOCAL$results, "debug")
13 | dir.create(path_debug, showWarnings = FALSE)
14 | save(snakemake, file = file.path(path_debug, "plot_adapter_content_snakemake.rdata"))
15 | }
16 | 
17 | #------------------------------------ debugging
18 | 
19 | library(ggplot2)
20 | library(dplyr)
21 | library(viridis)
22 | 
23 | samples <- snakemake@params$sample_names
24 | batches <- snakemake@params$batches
25 | 
26 | # Read files into a list
27 | cutadapt_clean_list <- list()
28 | for (i in seq_along(samples)){
29 | cutadapt_clean <- read.csv(snakemake@input[[i]][1], header = TRUE)
30 | cutadapt_clean$Sample <- samples[i]
31 | cutadapt_clean$Batch <- batches[i]
32 | cutadapt_clean_list[[i]] <- cutadapt_clean
33 | }
34 | 
35 | # combining adapters across samples
36 | cutadapt_counts <- Reduce(rbind, cutadapt_clean_list, NULL)
37 | 
38 | # Transform it into percentages
39 | cutadapt_counts <- group_by(cutadapt_counts, Sample, Pair) %>%
40 | mutate(Percentages=Count/sum(Count))
41 | # Adapter Sequence Pair Count Sample Batch
42 | # 1 PrefixNX/1 AGATGTGTATAAGAGACAG R1 7 sample1 Batch1
43 | # ...
44 | # 6 Trans2_rc CTGTCTCTTATACACATCTCCGAGCCCACGAGAC R2 5 sample2 Batch2
45 | 
46 | p1 <- ggplot(cutadapt_counts, aes(x=Sample, y = Percentages, fill = Adapter)) +
47 | geom_bar(stat = "identity") +
48 | facet_grid(Pair ~ Batch, scales = "free") +
49 | theme_minimal() +
50 | ggtitle("Comparison across samples of adapter content") +
51 | scale_x_discrete(label=abbreviate) +
52 | scale_y_continuous(labels = scales::percent) +
53 | theme(axis.text.x=element_text(angle = 90, hjust = 0)) +
54 | scale_fill_viridis(discrete=TRUE)
55 | 
56 | ggsave(plot=p1, filename=snakemake@output$pdf)
57 | 
58 | if (debug_flag) {
59 | save.image(file = file.path(path_debug, "plot_adapter_content_workspace.rdata"))
60 | }
61 | 
-------------------------------------------------------------------------------- /scripts/plot_knee_plot.R: --------------------------------------------------------------------------------
1 | #------------------------------------ for debugging:
2 | # add the following line in config.yaml (without the #)
3 | # DEBUG: True
4 | # This will create R objects in the debug directory containing the snakemake
5 | # object that can be loaded into a custom R session as below:
6 | 
7 | debug_flag <- FALSE
8 | if (snakemake@config$DEBUG) {
9 | debug_flag <- TRUE
10 | message("In debug mode: saving R objects to inspect later")
11 | path_debug <- file.path(snakemake@config$LOCAL$results, "debug")
12 | dir.create(path_debug, showWarnings = FALSE)
13 | save(snakemake, file = file.path(path_debug,
14 | paste0("plot_knee_plot_snakemake_",
15 | attr(snakemake, "wildcard")$sample, ".rdata"))
16 | )
17 | }
18 | #### /debug
19 | 
20 | library(ggplot2)
21 | library(plyr)
22 | # Create the cumulative plot
23 | data=read.table(file = snakemake@input[[1]][1], header=FALSE, stringsAsFactors=FALSE)
24 | barcodes = data$V2
25 | total_reads = sum(data$V1)
26 | y_raw=cumsum(data$V1)
27 | y=(y_raw/total_reads)
28 | x = 1:length(y)
29 | plot_data = data.frame(rank = x,cum_sum=y, Barcode=data$V2)
30 | x_scale = snakemake@params$cells * 4
31 | 
32 | knee_plot <- ggplot(plot_data, aes(x=rank, y=cum_sum)) +
33 | geom_point(size = 0.1) +
34 | xlim(0,x_scale) +
35 | geom_vline(xintercept=snakemake@params$cells, linetype="dashed", color = "red") +
36 | ggtitle(paste0(snakemake@wildcards$sample, '\nTotal reads: ', prettyNum(total_reads))) +
37 | theme(plot.title = element_text(size=10)) +
38 | labs(x='STAMPS', y='Cumulative fraction of reads') +
39 | scale_y_continuous(labels = scales::percent) +
40 | theme_classic()
41 | 
42 | if(!is.null(snakemake@input$barcodes))
43 | {
44 | selected_cells <- read.csv(snakemake@input$barcodes, header=FALSE, stringsAsFactors=FALSE)
45 | knee_plot <- knee_plot +
46 | geom_point(data = plot_data[plot_data$Barcode %in% selected_cells$V1,],
47 | aes(x=rank, y=cum_sum, color='Selected'), size=0.1) +
48 | scale_color_manual(values=c('Selected'='green'))
49 | }
50 | ggsave(knee_plot, file=snakemake@output$pdf, width = 4, height = 3)
51 | 
52 | 
53 | if (debug_flag) {
54 | library(gridExtra)
55 | library(grid)
56 | # potentially useful to change the lag of the diff calculation
57 | mylag <- 1
58 | # data <- read.table(file = file, header=FALSE, stringsAsFactors=FALSE)
59 | # head(data,3)
60 | # V1 V2
61 | # 1 1145137 CCCTTCGTCTGC
62 | # 2 1039974 ATAGTTTTTTAA
63 | # 3 912199 GCATGAAACTTC
64 | 
65 | # borrowed from https://stackoverflow.com/questions/6836409/finding-local-maxima-and-minima
66 | localMaxima <- function(x) {
67 | # Use -Inf instead if x is numeric (non-integer)
68 | y <- diff(c(-.Machine$integer.max, x)) > 0L
69 | rle(y)$lengths
70 | y <- cumsum(rle(y)$lengths)
71 | y <- y[seq.int(1L, length(y), 2L)]
72 | if (x[[1]] == x[[2]]) {
73 | y <- y[-1]
74 | }
75 | y
76 | }
77 | 
78 | reads <- data$V1
79 | barcodes <- data$V2
80 | total_reads <- sum(reads)
81 | reads_cumsum <- cumsum(reads)
82 | # 1st derivative (diff) also needs to be padded to keep the same vector length
83 | reads_diff <- c(diff(reads,lag=mylag,differences = 1),rep(0,mylag))
84 | 
85 | # 2nd derivative: twice as much padding:
86 | reads_diff_diff <- c(diff(reads,lag=mylag,differences = 2),rep(0,mylag*2))
87 | reads_cumsum_perc <- (reads_cumsum/total_reads)
88 | x <- 1:length(reads_cumsum_perc)
89 | plot_data <- data.frame(rank = x,
90 | cum_sum = reads_cumsum_perc,
91 | read_count = reads,
92 | Barcode = data$V2,
93 | diff = reads_diff,
94 | diffdiff = reads_diff_diff)
95 | 
96 | # head(plot_data,6)
97 | # rank cum_sum read_count Barcode diff diffdiff
98 | # 1 1 0.007192916 1145137 CCCTTCGTCTGC -105163 -22612
99 | # 2 2 0.013725274 1039974 ATAGTTTTTTAA -127775 14025
100 | # 3 3 0.019455043 912199 GCATGAAACTTC -113750 61486
101 | # 4 4 0.024470318 798449 GTGTGGGTCTCT -52264 34985
102 | # 5 5 0.029157308 746185 CGTACTGACTAC -17279 -60441
103 | # 6 6 0.033735764 728906 GTTCGTCCCGCC -77720 69104
104 | mystats <- paste0("| Nr barcodes total: ", length(barcodes), ' \n ',
105 | "Nr barcodes for 50% reads: ", which.min(reads_cumsum_perc<0.50)," | ",
106 | "Nr barcodes for 95% reads: ", which.min(reads_cumsum_perc<0.95)," | ",
107 | "Nr barcodes for 99% reads: ", which.min(reads_cumsum_perc<0.99)
108 | )
109 | 
110 | # x_scale <- which(reads_cumsum_perc>0.99)[1]
111 | x_scale <- snakemake@params$cells * 4
112 | plot_data_head <- head(plot_data, x_scale)
113 | # plot_data_head$reads_diff_smooth <- predict(loess(diff~rank,data=plot_data_head))
114 | plot_data_head$reads_diffdiff_smooth <- predict(loess(diffdiff~rank,span=0.2,data=plot_data_head))
115 | 
116 | ## Finding the knee in the knee-plot:
117 | # The best approach so far is to calculate the 2nd derivative of the read counts per STAMP (reads_diff_diff), smooth it (loess) and find its maxima. Since there can be several maxima, they are just plotted and it's up to the user to visually assess and decide.
118 | # finding local maxima in 2nd derivative:
119 | # https://stackoverflow.com/questions/6836409/finding-local-maxima-and-minima
120 | loc.max <- localMaxima(plot_data_head$reads_diffdiff_smooth)
121 | # local maxima are then colored green as lines in plots
122 | plot_data_head_sub <- plot_data_head[loc.max,]
123 | # which.peaks(plot_data_head$reads_diff_smooth2)
124 | 
125 | knee_plot_ext <- ggplot(plot_data_head, aes(x=rank, y=cum_sum)) +
126 | xlim(0,x_scale) +
127 | ylim(0,1) +
128 | geom_text(data=plot_data_head_sub,aes(label = round(cum_sum,2)),nudge_y=-0.05, vjust = "inward", hjust = "inward") +
129 | geom_vline(xintercept=snakemake@params$cells, linetype="dashed", color = "red") +
130 | # geom_vline(xintercept=100, linetype="dashed", color = "red") +
131 | geom_vline(xintercept=loc.max, col="lightgreen") +
132 | geom_point(size = 0.1) +
133 | ggtitle(paste0(snakemake@wildcards$sample, '\nTotal reads: ', prettyNum(total_reads), mystats)) +
134 | theme(plot.title = element_text(size=10)) +
135 | labs(x='STAMPS', y='Cumulative fraction of reads')
136 | read_count_plot <- ggplot(plot_data_head, aes(x=rank, y=read_count)) +
137 | geom_text(data=plot_data_head_sub,aes(label = read_count),nudge_y=-0.05, vjust = "inward", hjust = "inward", check_overlap = TRUE) +
138 | # geom_smooth() +
139 | xlim(0,x_scale) +
140 | # ylim(0,1000) +
141 | geom_vline(xintercept=snakemake@params$cells, linetype="dashed", color = "red") +
142 | # geom_vline(xintercept=100, linetype="dashed", color = "red") +
143 | geom_vline(xintercept=loc.max, col="lightgreen") +
144 | geom_point(size = 0.1) +
145 | theme(plot.title = element_text(size=10)) +
146 | labs(x='STAMPS', y='Read counts per STAMP')
147 | 
148 | # knee_plot_ext = knee_plot_ext + scale_y_continuous(labels = scales::percent)
149 | diff_plot <- ggplot(plot_data_head, aes(x=rank, y=diff)) +
150 | geom_text(data=plot_data_head_sub,aes(label = diff),nudge_y=-0.05, vjust = "inward", hjust = "inward", check_overlap = TRUE) +
151 | # geom_smooth() +
152 | xlim(0,x_scale) +
153 | # ylim(0,1000) +
154 | geom_vline(xintercept=snakemake@params$cells, linetype="dashed", color = "red") +
155 | # geom_vline(xintercept=100, linetype="dashed", color = "red") +
156 | geom_vline(xintercept=loc.max, col="lightgreen") +
157 | geom_point(size = 0.1) +
158 | theme(plot.title = element_text(size=10)) +
159 | # 1st derivative: read count diff to next STAMP
160 | labs(x='STAMPS', y="1st derivative of read counts")
161 | diff_diff_plot <- ggplot(plot_data_head,
162 | aes(x=rank, y=reads_diffdiff_smooth)) +
163 | geom_text(data=plot_data_head_sub,
164 | aes(label = rank),nudge_y=-1,
165 | vjust = "inward",
166 | hjust = "inward",
167 | check_overlap = TRUE) +
168 | xlim(0,x_scale) +
169 | ylim(-30,30) +
170 | geom_vline(xintercept=snakemake@params$cells,
171 | linetype="dashed", color = "red") +
172 | # geom_vline(xintercept=100, linetype="dashed", color = "red") +
173 | geom_vline(xintercept=loc.max, col="lightgreen") +
174 | geom_point(size = 0.1) +
175 | theme(plot.title = element_text(size=10)) +
176 | labs(x='STAMPS', y='2nd derivative of read counts')
177 | 
178 | if(!is.null(snakemake@input$barcodes))
179 | {
180 | selected_cells <- read.csv(snakemake@input$barcodes, header=FALSE, stringsAsFactors=FALSE)
181 | knee_plot_ext <- knee_plot_ext +
182 | geom_point(data = plot_data_head[plot_data_head$Barcode %in% selected_cells$V1,],
183 | aes(x=rank, y=cum_sum, color='Selected'), size=0.1) +
184 | scale_color_manual(values=c('Selected'='green')) +
185 | theme(legend.position="none")
186 | diff_diff_plot <- diff_diff_plot +
187 | geom_point(data = plot_data_head[plot_data_head$Barcode %in% selected_cells$V1,],
188 | aes(x=rank, y=reads_diffdiff_smooth, color='Selected'), size=0.1) +
189 | scale_color_manual(values=c('Selected'='green')) +
190 | theme(legend.position="none")
191 | }
192 | # scale_y_continuous(position = "right")
193 | gp1 <- ggplotGrob(knee_plot_ext)
194 | gp2 <- ggplotGrob(read_count_plot)
195 | gp3 <- ggplotGrob(diff_plot)
196 | gp4 <- ggplotGrob(diff_diff_plot)
197 | # grid::grid.newpage()
198 | # gg <- grid::grid.draw(rbind(gp1, gp2, gp3, gp4, size = "last"))
199 | gg <- gridExtra::arrangeGrob(rbind(gp1, gp2, gp3, gp4, size = "last"))
200 | # gg <- gridExtra::arrangeGrob(gp1, gp2, gp3, gp4, ncol = 1)
201 | 
202 | # if barcodes.csv is present in the base directory, only use barcodes in there (rule plot_knee_plot_whitelist in map.smk)
203 | ggsave(gg, file=paste0(snakemake@output$pdf, "_extended.pdf"), width = 9, height = 11)
204 | }
205 | 
206 | if (debug_flag) {
207 | save.image(file = file.path(path_debug,
208 | paste0("plot_knee_plot_workspace_",
209 | attr(snakemake, "wildcard")$sample, ".rdata"))
210 | )
211 | }
212 | 
-------------------------------------------------------------------------------- /scripts/plot_rna_metrics.R: --------------------------------------------------------------------------------
1 | library(ggplot2)
2 | library(tidyr)
3 | library(gridExtra)
4 | library(grid)
5 | library(viridis)
6 | debug_flag <- FALSE
7 | if (snakemake@config$DEBUG) {
8 | debug_flag <- TRUE
9 | message("In debug mode: saving R objects to inspect later")
10 | path_debug <- file.path(snakemake@config$LOCAL$results, "debug")
11 | dir.create(path_debug, showWarnings = FALSE)
12 | save(snakemake, file = file.path(path_debug, "plot_rna_metrics_snakemake.rdata"))
13 | }
14 | 
15 | #### /debug
16 | 
17 | mydata <- read.csv(file = snakemake@input$rna_metrics, header = T,
18 | stringsAsFactors = F, skip = 6, sep = "\t")
19 | mydata <- mydata[order(mydata$PF_ALIGNED_BASES, decreasing = T), ]
20 | mydata_pct <- mydata[, c("READ_GROUP",
21 | "PCT_INTERGENIC_BASES",
22 | "PCT_UTR_BASES",
23 | "PCT_RIBOSOMAL_BASES",
24 | "PCT_INTRONIC_BASES",
25 | "PCT_CODING_BASES")
26 | ]
27 | colnames(mydata_pct) = c('Cell Barcode', 'Intergenic', 'UTR', 'Ribosomal', 'Intronic', 'Coding')
28 | 
29 | mydata <- mydata[, c("READ_GROUP",
30 | "INTERGENIC_BASES",
31 | "UTR_BASES",
32 | "RIBOSOMAL_BASES",
33 | "INTRONIC_BASES",
34 | "CODING_BASES")
35 | ]
36 | colnames(mydata) = c('Cell Barcode', 'Intergenic', 'UTR', 'Ribosomal', 'Intronic', 'Coding')
37 | 
38 | # converting into long format for plotting
39 | mydata_long <- mydata %>% gather("Read Overlap", count, -"Cell Barcode")
40 | 
41 | # Keep the original order of the barcodes using factor and levels.
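# Editor's note (illustrative): without explicit levels, ggplot would reorder
# the x axis alphabetically; factor(x, levels = unique(x)) pins the bars to the
# PF_ALIGNED_BASES ordering established above, e.g.
#   factor(c("b", "a"), levels = unique(c("b", "a")))  # levels: "b" "a"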
42 | mydata_long$`Cell Barcode` <- factor(mydata_long$`Cell Barcode`,
43 | levels = factor(unique(mydata_long$`Cell Barcode`)))
44 | mydata_long$`Read Overlap` <- factor(mydata_long$`Read Overlap`,
45 | levels = unique(mydata_long$`Read Overlap`))
46 | 
47 | p1 <- ggplot(mydata_long, aes(x = `Cell Barcode`, y = count, fill = `Read Overlap`)) +
48 | geom_bar(stat = "identity") +
49 | theme(axis.text.x = element_text(angle = 90, hjust = 0), legend.position = "none")
50 | p1 <- p1 + labs(title = paste(nrow(mydata),
51 | "selected barcodes for",
52 | snakemake@wildcards$sample),
53 | x = "Barcodes", y = "Bases")
54 | p1 <- p1 + theme(axis.title.x = element_blank(),
55 | axis.text.x = element_blank(),
56 | axis.ticks.x = element_blank())
57 | p1 <- p1 + scale_y_continuous(labels = scales::scientific)
58 | p1 <- p1 + scale_fill_viridis(discrete = TRUE, option = "viridis")
59 | 
60 | 
61 | mydata_long_pct <- mydata_pct %>% gather("Read Overlap", fraction, -"Cell Barcode")
62 | # Keep the original order of the barcodes using factor and levels.
63 | mydata_long_pct$`Cell Barcode` <- factor(mydata_long_pct$`Cell Barcode`,
64 | levels = factor(unique(mydata_long_pct$`Cell Barcode`)))
65 | mydata_long_pct$`Read Overlap` <- factor(mydata_long_pct$`Read Overlap`,
66 | levels = unique(mydata_long_pct$`Read Overlap`))
67 | 
68 | p2 <- ggplot(mydata_long_pct, aes(x = `Cell Barcode`, y = fraction, fill = `Read Overlap`)) +
69 | geom_bar(stat = "identity") +
70 | theme(axis.text.x = element_text(angle = 90, hjust = 0, size=8, vjust = 0.05), legend.position = "bottom") +
71 | labs(x = "Barcodes", y = "%Bases") +
72 | scale_y_continuous(labels = scales::percent) + scale_fill_viridis(discrete = TRUE, option = "viridis")
73 | # This aligns the two plots so that the top panel can be related directly to the barcode labels of the bottom one.
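# Editor's note: ggplotGrob() converts each plot to a grid graphical object;
# rbind(gp1, gp2, size = "last") (used below) stacks them while forcing both
# panels onto the widths of the last grob, so the barcode columns line up.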
74 | gp1 <- ggplotGrob(p1) 75 | gp2 <- ggplotGrob(p2) 76 | pdf(file = snakemake@output$pdf, width = 16, height = 13) 77 | grid::grid.newpage() 78 | grid::grid.draw(rbind(gp1, gp2, size = "last")) 79 | dev.off() 80 | 81 | if (debug_flag) { 82 | save.image(file = file.path(path_debug, "plot_rna_metrics_workspace.rdata")) 83 | } 84 | -------------------------------------------------------------------------------- /scripts/plot_species_plot.R: -------------------------------------------------------------------------------- 1 | # Functions used to plot the species plot for drop-seq mixed protocol 2 | # Authors: James Nemesh, Roelli Patrick, Sebastian Y Mueller 3 | 4 | debug_flag <- FALSE 5 | if (snakemake@config$DEBUG) { 6 | debug_flag <- TRUE 7 | message("In debug mode: saving R objects to inspect later") 8 | path_debug <- file.path(snakemake@config$LOCAL$results, "debug") 9 | dir.create(path_debug, showWarnings = FALSE) 10 | save(snakemake, file = file.path( 11 | path_debug, 12 | paste0("plot_species_plot_snakemake_", attr(snakemake, "wildcard")$sample, ".rdata") 13 | )) 14 | } 15 | 16 | #### /debug 17 | 18 | categorizeCellsUsingKneeKnownNumCellsPaper <- function( 19 | digitalExpressionFileO1, 20 | digitalExpressionFileO2, 21 | organismOne, 22 | organismTwo, 23 | pureRatio = 0.2, 24 | numCells, 25 | numBeads, 26 | point.cex = 1.5, 27 | xlim_range = NULL, 28 | ylim_range = NULL, 29 | category = "transcripts") { 30 | dfFull <- getNumTranscriptsPerCellBarcodeByOrganismPair( 31 | digitalExpressionFileO1, 32 | digitalExpressionFileO2, 33 | organismOne, 34 | organismTwo, 35 | category) 36 | dfFull <- dfFull[order(dfFull$total, decreasing = T), ] 37 | dfFull$ratio_one <- dfFull[, 2] / dfFull[, 4] 38 | dfFull <- head(dfFull, n = numBeads) 39 | df <- head(dfFull, n = numCells) 40 | 41 | dfNoCall <- dfFull[-1:-numCells, ] 42 | if (dim(dfNoCall)[1] > 0) { 43 | dfNoCall$organism <- "No Call" 44 | } 45 | 46 | df$organism <- "Mixed" 47 | 48 | idx <- which(df$ratio_one >= (1 - pureRatio)) 49 | # checks if the species is actually assigned at all 50 | if (length(idx) > 0) { 51 | df[idx, ]$organism <- organismOne 52 | } 53 | idx <- which(df$ratio_one <= (pureRatio)) 54 | if (length(idx) > 0) { 55 | df[idx, ]$organism <- organismTwo 56 | } 57 | 58 | result <- rbind(df, dfNoCall) 59 | 60 | maxRange <- max(result[, 2], result[, 3]) 61 | 62 | dforganismOne <- result[result$organism == organismOne, ] 63 | dforganismTwo <- result[result$organism == organismTwo, ] 64 | dfMixed <- result[result$organism == "Mixed", ] 65 | dfNoCall <- result[result$organism == "No Call", ] 66 | 67 | if (is.null(xlim_range)) { 68 | xlim_range <- c(0, maxRange) 69 | } 70 | 71 | if (is.null(ylim_range)) { 72 | ylim_range <- c(0, maxRange) 73 | } 74 | colors <- c("blue", "red", "purple", "grey") 75 | plot(dforganismOne[, 2], dforganismOne[, 3], col = colors[1], 76 | pch = 16, xlim = xlim_range, ylim = ylim_range, 77 | xlab = paste(organismOne, category), 78 | ylab = paste(organismTwo, category), 79 | cex = point.cex) 80 | points(dforganismTwo[, 2], dforganismTwo[, 3], 81 | col = colors[2], pch = 16, cex = point.cex) 82 | points(dfMixed[, 2], dfMixed[, 3], 83 | col = colors[3], pch = 16, cex = point.cex) 84 | points(dfNoCall[, 2], dfNoCall[, 3], 85 | col = colors[4], pch = 16, cex = point.cex) 86 | l <- c(paste(organismOne, dim(dforganismOne)[1]), 87 | paste(organismTwo, dim(dforganismTwo)[1]), 88 | paste("Mixed", dim(dfMixed)[1]), 89 | paste("No Call", dim(dfNoCall)[1])) 90 | legend("topright", legend = l, fill = colors) 91 | 
title(paste("Species plot based on", category)) 92 | return(df) 93 | } 94 | 95 | getNumTranscriptsPerCellBarcodeByOrganismPair <- function( 96 | digitalExpressionFileO1, 97 | digitalExpressionFileO2, 98 | organismOne, 99 | organismTwo, 100 | category) { 101 | if (is.null(organismOne) || is.null(organismTwo)) { 102 | return(NULL) 103 | } 104 | 105 | o1 <- getGenesAndTranscriptsPerCellBarcode(digitalExpressionFileO1) 106 | o2 <- getGenesAndTranscriptsPerCellBarcode(digitalExpressionFileO2) 107 | 108 | commonBC <- union(o1$cellBC, o2$cellBC) 109 | o1p <- o1[match(commonBC, o1$cellBC), ] 110 | o2p <- o2[match(commonBC, o2$cellBC), ] 111 | if (category == "genes") { 112 | df <- data.frame(tag = commonBC, o1Count = o1p$numGenes, 113 | o2Count = o2p$numGenes, stringsAsFactors = F) 114 | } 115 | else { 116 | df <- data.frame(tag = commonBC, o1Count = o1p$numTranscripts, 117 | o2Count = o2p$numTranscripts, stringsAsFactors = F) 118 | } 119 | 120 | idx1 <- which(is.na(df$o1Count)) 121 | idx2 <- which(is.na(df$o2Count)) 122 | if (length(idx1) > 0) df[idx1, ]$o1Count <- 0 123 | if (length(idx2) > 0) df[idx2, ]$o2Count <- 0 124 | 125 | df$total <- apply(df[, 2:3], 1, sum, na.rm = T) 126 | df <- df[order(df$total, decreasing = T), ] 127 | colnames(df)[2] <- organismOne 128 | colnames(df)[3] <- organismTwo 129 | return(df) 130 | } 131 | 132 | 133 | getGenesAndTranscriptsPerCellBarcode <- function(digitalExpressionFile) { 134 | a <- read.table(digitalExpressionFile, header = T, stringsAsFactors = F) 135 | colnames(a) <- c("cellBC", "numGenicReads", "numTranscripts", "numGenes") 136 | return(a) 137 | } 138 | 139 | digitalExpressionFileO1 <- snakemake@input[[1]][1] 140 | digitalExpressionFileO2 <- snakemake@input[[2]][1] 141 | 142 | num_cells <- snakemake@params$expected_cells 143 | 144 | organismOne <- names(snakemake@config$META$species)[1] 145 | organismTwo <- names(snakemake@config$META$species)[2] 146 | 147 | par(mar = c(5, 4, 4, 2) + 0.5) 148 | 149 | pdf(snakemake@output$genes_pdf, height = 8, width = 8) 150 | df_temp <- categorizeCellsUsingKneeKnownNumCellsPaper( 151 | digitalExpressionFileO1, 152 | digitalExpressionFileO2, 153 | organismOne = organismOne, 154 | organismTwo = organismTwo, 155 | pureRatio = snakemake@config$META$ratio, 156 | numCells = num_cells, 157 | numBeads = num_cells * 2, 158 | point.cex = 1, 159 | category = "genes" 160 | ) 161 | dev.off() 162 | 163 | 164 | 165 | pdf(snakemake@output$transcripts_pdf, height = 8, width = 8) 166 | df <- categorizeCellsUsingKneeKnownNumCellsPaper( 167 | digitalExpressionFileO1, 168 | digitalExpressionFileO2, 169 | organismOne = organismOne, 170 | organismTwo = organismTwo, 171 | pureRatio = snakemake@config$META$ratio, 172 | numCells = num_cells, 173 | numBeads = num_cells * 2, 174 | point.cex = 1, 175 | category = "transcripts" 176 | ) 177 | dev.off() 178 | organism1 <- subset(df, df$organism == organismOne) 179 | organism2 <- subset(df, df$organism == organismTwo) 180 | 181 | write.table(organism1$tag, snakemake@output$barcodes_species[1], 182 | row.names = F, col.names = F, quote = F) 183 | write.table(organism2$tag, snakemake@output$barcodes_species[2], 184 | row.names = F, col.names = F, quote = F) 185 | 186 | # save.image(paste0(snakemake@output$genes_pdf,".rdata")) 187 | if (debug_flag) { 188 | save.image(file = file.path( 189 | path_debug, 190 | paste0( 191 | "plot_species_plot_workspace_", 192 | attr(snakemake, "wildcard")$sample, ".rdata" 193 | ) 194 | )) 195 | } 196 | 
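# For reference, getGenesAndTranscriptsPerCellBarcode() above expects the
# dropseq_tools DGE summary layout read by read.table(): one row per cell
# barcode with genic read, transcript (UMI) and gene counts, e.g.
# (illustrative values only):
#   CELL_BARCODE  NUM_GENIC_READS  NUM_TRANSCRIPTS  NUM_GENES
#   ACGTACGTACGT  1200             800              350
# A hypothetical standalone call outside of snakemake could then look like:
#   df <- categorizeCellsUsingKneeKnownNumCellsPaper(
#     "hum_dge_summary.txt", "mus_dge_summary.txt",
#     organismOne = "homo_sapiens", organismTwo = "mus_musculus",
#     pureRatio = 0.2, numCells = 100, numBeads = 200)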
-------------------------------------------------------------------------------- /scripts/plot_violine.R: --------------------------------------------------------------------------------
1 | #' ---
2 | #' title: plot_violine.R
3 | #' author: Sebastian Mueller (sebm_at_posteo.de)
4 | #' date: 2018-04-10
5 | #' ---
6 | ### for debug
7 | # To inspect the snakemake object, first invoke snakemake and save the session automatically.
8 | # Since there are no debug flags to my knowledge, just uncomment the line below and run snakemake, which
9 | # creates an R workspace file that can be loaded into a custom R session.
10 | # save.image(file="R_workspace_debug.rdata")
11 | # load("R_workspace_debug.rdata")
12 | #### /debug
13 | debug_flag <- FALSE
14 | if (snakemake@config$DEBUG) {
15 |   debug_flag <- TRUE
16 |   message("In debug mode: saving R objects to inspect later")
17 |   path_debug <- file.path(snakemake@config$LOCAL$results, "debug")
18 |   dir.create(path_debug, showWarnings = FALSE)
19 |   save(snakemake, file = file.path(path_debug, "plot_violin_snakemake.rdata"))
20 | }
21 | 
22 | 
23 | options(warn = -1)
24 | library(plyr, quietly = TRUE, warn.conflicts = FALSE)
25 | library(dplyr, quietly = TRUE, warn.conflicts = FALSE) # Dataframe manipulation
26 | library(Matrix, quietly = TRUE, warn.conflicts = FALSE) # Sparse matrices
27 | library(stringr, quietly = TRUE, warn.conflicts = FALSE)
28 | library(RColorBrewer, quietly = TRUE, warn.conflicts = FALSE)
29 | library(devtools, quietly = TRUE, warn.conflicts = FALSE)
30 | library(Seurat, quietly = TRUE, warn.conflicts = FALSE)
31 | library(plotly, quietly = TRUE, warn.conflicts = FALSE)
32 | 
33 | # rule map in Snakefile
34 | # rule map:
35 | #     input:
36 | #         'plots/violinplots_comparison_UMI.pdf',
37 | #         ...
38 | 
39 | # importing UMI
40 | # importing counts ( summary/counts_expression_matrix.tsv )
41 | 
42 | ReadMTX <- function(mtx_path) {
43 |   data_dir <- dirname(mtx_path)
44 |   files <- list.files(data_dir)
45 |   # Find files
46 |   barcodes_file <- grep("barcodes", files, value = TRUE)
47 |   features_file <- grep(pattern = "genes|features", x = files, value = TRUE)
48 |   mtx <- grep("mtx", files, value = TRUE)
49 |   # load the data
50 |   data <- readMM(file.path(data_dir, mtx))
51 |   barcodes <- read.csv(file.path(data_dir, barcodes_file), header = FALSE)$V1
52 |   features <- read.csv(file.path(data_dir, features_file), header = FALSE)$V1
53 | 
54 |   colnames(data) <- barcodes
55 |   rownames(data) <- features
56 |   return(data)
57 | }
58 | 
59 | #count_matrix <- ReadMTX(snakemake@input$counts)
60 | # importing UMIs ( summary/umi_expression_matrix.tsv )
61 | #umi_matrix <- ReadMTX(snakemake@input$UMIs)
62 | 
63 | count_matrix <- Read10X(file.path(snakemake@wildcards$results_dir, 'summary', 'read'))
64 | umi_matrix <- Read10X(file.path(snakemake@wildcards$results_dir, 'summary', 'umi'))
65 | 
66 | design <- read.csv(snakemake@input$design,
67 |                    stringsAsFactors = TRUE,
68 |                    header = TRUE,
69 |                    row.names = NULL
70 | )
71 | metaData <- data.frame(cellNames = colnames(umi_matrix)) %>%
72 |   mutate(samples = factor(str_replace(cellNames, "_[^_]*$", ""))) %>%
73 |   mutate(barcode = factor(str_replace(cellNames, ".+_", ""))) %>%
74 |   left_join(design, by = "samples")
75 | rownames(metaData) <- metaData$cellNames
76 | 
77 | # possible to set is.expr = -1 to avoid filtering whilst creating
78 | # seuratobj <- CreateSeuratObject(raw.data = umi_matrix, meta.data = metaData, is.expr = -1)
79 | seuratobj <- CreateSeuratObject(raw.data = umi_matrix, meta.data = metaData)
80 | seuratobj <- SetAllIdent(object = seuratobj, id = "samples")
81 | # relabel cell identity (https://github.com/satijalab/seurat/issues/380)
82 | seuratobj@meta.data$orig.ident <- seuratobj@meta.data$samples
83 | 
84 | mycount <- CreateSeuratObject(raw.data = count_matrix, meta.data = metaData)
85 | mycount <- SetAllIdent(object = mycount, id = "samples")
86 | mycount@meta.data$orig.ident <- mycount@meta.data$samples
87 | # turn off filtering
88 | # note, the @meta.data slot contains useful summary stuff
89 | # head(mycount@meta.data,2)
90 | # nGene nUMI expected_cells read_length barcode
91 | # dropseqLib1_ACTAACATTATT 15 33 400 100 ACTAACATTATT
92 | # dropseqLib1_GAGTCTGAGGCG 5 9 400 100 GAGTCTGAGGCG
93 | # origin origin
94 | # dropseqLib1_ACTAACATTATT dropseqLib1 dropseqLib1
95 | # dropseqLib1_GAGTCTGAGGCG dropseqLib1 dropseqLib1
96 | meta.data <- seuratobj@meta.data
97 | # combining UMIs and Counts into one Seurat object
98 | meta.data$nCounts <- mycount@meta.data$nUMI
99 | seuratobj@meta.data <- meta.data
100 | # delete since Counts have been added to seuratobj as nCounts column
101 | rm(mycount)
102 | 
103 | 
104 | # mytheme <- theme_bw(base_size = 9) +
105 | mytheme <- theme_bw() +
106 |   theme(
107 |     legend.position = "right",
108 |     axis.ticks = element_blank(),
109 |     axis.text.x = element_text(angle = 300, hjust = 0)
110 |   )
111 | theme_set(mytheme)
112 | 
113 | # predefined ggplot layers for subsequent plots
114 | gglayers <- list(
115 |   geom_smooth(method = "loess"),
116 |   geom_point(size = .5),
117 |   scale_y_continuous(
118 |     labels = scales::unit_format(unit = "", scale = 1e-3, digits = 2),
119 |     breaks = scales::pretty_breaks(n = 8)
120 |   ),
121 |   scale_x_continuous(
122 |     labels = scales::unit_format(unit = "", scale = 1e-3, digits = 2),
123 |     breaks = scales::pretty_breaks(n = 8)
124 |   )
125 | )
126 | 
127 | gg <- ggplot(meta.data, aes(x = nUMI, y = nCounts, color = orig.ident)) +
128 |   # coord_trans(y="log10",x = "log10") +
129 |   gglayers +
130 |   geom_abline(intercept = 0, slope = 1) +
131 |   labs(
132 |     title = "UMI counts vs raw Counts",
133 |     subtitle = "Number of UMIs and raw Counts for each Bead",
134 |     x = "Number of UMIs per Bead [k]",
135 |     y = "Number of Counts per Bead [k]"
136 |   )
137 | 
138 | # dev.new()
139 | # htmlwidgets::saveWidget(ggplotly(gg), file.path(getwd(), snakemake@output$html_umivscounts))
140 | ggsave(gg, file = file.path(getwd(), snakemake@output$pdf_umivscounts), width = 12, height = 7)
141 | 
142 | # how about unaligned reads/UMI?
143 | # Note(Seb): raw.data is actually filtered data, i.e. the number of genes is likely to be smaller than in the input data!
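# The case-insensitive prefix patterns below cover both mouse- and human-style
# gene symbols: "^mt-" matches e.g. "mt-Nd1" as well as "MT-ND1", while
# "^Rps"/"^Rpl" match small/large ribosomal subunit genes such as "Rps6" or "RPL13A".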
144 | mito.gene.names <- grep("^mt-", rownames(seuratobj@raw.data), value = TRUE, ignore.case = TRUE)
145 | sribo.gene.names <- grep("^Rps", rownames(seuratobj@raw.data), value = TRUE, ignore.case = TRUE)
146 | lribo.gene.names <- grep("^Rpl", rownames(seuratobj@raw.data), value = TRUE, ignore.case = TRUE)
147 | 
148 | col.total <- Matrix::colSums(seuratobj@raw.data)
149 | meta.data$col.total <- col.total
150 | 
151 | seuratobj.top_50 <- apply(seuratobj@raw.data, 2, function(x) sum(x[order(x, decreasing = TRUE)][1:50]) / sum(x))
152 | # mycount.top_50 <- apply(mycount@raw.data, 2, function(x) sum(x[order(x, decreasing = TRUE)][1:50])/sum(x))
153 | 
154 | seuratobj <- AddMetaData(seuratobj, Matrix::colSums(seuratobj@raw.data[sribo.gene.names, ]) / col.total, "pct.sribo")
155 | seuratobj <- AddMetaData(seuratobj, Matrix::colSums(seuratobj@raw.data[lribo.gene.names, ]) / col.total, "pct.lribo")
156 | seuratobj <- AddMetaData(seuratobj, Matrix::colSums(seuratobj@raw.data[unique(c(sribo.gene.names, lribo.gene.names)), ]) / col.total, "pct.Ribo")
157 | seuratobj <- AddMetaData(seuratobj, Matrix::colSums(seuratobj@raw.data[mito.gene.names, ]) / col.total, "pct.mito")
158 | seuratobj <- AddMetaData(seuratobj, seuratobj.top_50, "top50")
159 | tmp <- seuratobj@meta.data$nUMI / seuratobj@meta.data$nGene
160 | names(tmp) <- rownames(seuratobj@meta.data)
161 | seuratobj <- AddMetaData(seuratobj, tmp, "umi.per.gene")
162 | 
163 | 
164 | gg <- VlnPlot(seuratobj,
165 |               c("nUMI", "nGene", "top50", "umi.per.gene", "pct.Ribo", "pct.mito"),
166 |               x.lab.rot = TRUE, do.return = TRUE
167 | )
168 | # ggsave(gg,file=file.path("violinplots_comparison_UMI.pdf"),width=18,height=18)
169 | ggsave(gg, file = snakemake@output$pdf_violine, width = 18, height = 18)
170 | # gg <- VlnPlot(mycount,c("nUMI", "nGene", "top50", "count.per.gene","pct.Ribo", "pct.mito"), x.lab.rot = TRUE, do.return = TRUE)
171 | # ggsave(gg,file=file.path("violinplots_comparison_count.pdf"),width=18,height=18)
172 | 
173 | # gg <- GenePlot(object = seuratobj, gene1 = "nUMI", gene2 = "nGene")
174 | # ggsave(gg,file=file.path("violinplots_comparison.pdf"),width=18,height=18)
175 | 
176 | 
177 | gg <- ggplot(meta.data, aes(x = nUMI, y = nGene, color = orig.ident)) +
178 |   gglayers +
179 |   labs(
180 |     title = "Genes (pooled mouse and human set) vs UMIs for each bead",
181 |     x = "Number of UMIs per Bead [k]",
182 |     y = "Number of Genes per Bead [k]"
183 |   )
184 | 
185 | # dev.new()
186 | # htmlwidgets::saveWidget(ggplotly(gg),
187 | #   file.path(getwd(), snakemake@output$html_umi_vs_gene))
188 | ggsave(gg, file = snakemake@output$pdf_umi_vs_gene, width = 12, height = 7)
189 | 
190 | 
191 | 
192 | ################################################################################
193 | ## same for Counts instead of UMIs (using the nCounts column added above)
194 | gg <- ggplot(meta.data, aes(x = nCounts, y = nGene, color = orig.ident)) +
195 |   gglayers +
196 |   labs(
197 |     title = "Genes (pooled mouse and human set) vs Counts for each bead",
198 |     x = "Number of Counts per Bead [k]",
199 |     y = "Number of Genes per Bead [k]"
200 |   )
201 | 
202 | # dev.new()
203 | # htmlwidgets::saveWidget(ggplotly(gg),
204 | #   file.path(getwd(), snakemake@output$html_count_vs_gene))
205 | 
206 | ggsave(gg, file = snakemake@output$pdf_count_vs_gene, width = 12, height = 7)
207 | 
208 | 
209 | # head(meta.data,2)
210 | # nGene nUMI cellNames samples barcode expected_cells read_length batch orig.ident pct.sribo pct.lribo pct.Ribo pct.mito top50 umi.per.gene
211 | # sample1_GAGTCTGAGGCG 6 6 sample1_GAGTCTGAGGCG sample1 GAGTCTGAGGCG 100 100 batch1 sample1 0.0000000 0.00000000 0.0000000 0.0000000 1.0000000 1.000000
212 | # sample1_CAGCCCTCAGTA 264 437 sample1_CAGCCCTCAGTA sample1 CAGCCCTCAGTA 100 100 batch1 sample1 0.0389016 0.07551487 0.1144165 0.0228833 0.5102975 1.655303
213 | 
214 | # saving snakemake meta information into misc slot so all can be exported as one object
215 | seuratobj@misc <- snakemake
216 | # exporting R Seurat objects into summary/R_Seurat_objects.rdata
217 | saveRDS(seuratobj, file = file.path(snakemake@output$R_objects))
218 | 
219 | if (debug_flag) {
220 |   save.image(file = file.path(path_debug, "plot_violin_workspace.rdata"))
221 | }
222 | 
-------------------------------------------------------------------------------- /scripts/plot_yield.R: --------------------------------------------------------------------------------
1 | #------------------------------------ for debugging:
2 | # For debugging add the following line in config.yaml (without the #)
3 | # DEBUG: True
4 | # This will create R objects in the debug directory containing the snakemake
5 | # object, which can be loaded into a custom R session as below:
6 | debug_flag <- FALSE
7 | if (snakemake@config$DEBUG) {
8 |   debug_flag <- TRUE
9 |   message("In debug mode: saving R objects to inspect later")
10 |   path_debug <- file.path(snakemake@config$LOCAL$results, "debug")
11 |   dir.create(path_debug, showWarnings = FALSE)
12 |   save(snakemake, file = file.path(path_debug, "plot_yield_snakemake.rdata"))
13 | }
14 | #------------------------------------ debugging
15 | 
16 | library(ggplot2)
17 | library(tidyr)
18 | library(grid)
19 | library(gridExtra)
20 | library(viridis)
21 | library(stringr)
22 | 
23 | samples <- snakemake@params$sample_names
24 | batches <- snakemake@params$batches
25 | mydata <- data.frame(matrix(nrow = length(samples), ncol = 7))
26 | colnames(mydata) <- c("Sample", "Batch", "Cutadapt filtered", "Unmapped",
27 |                       "Multi mapped", "Uniquely mapped", "Total reads")
28 | mydata[, "Sample"] <- samples
29 | mydata[, "Batch"] <- batches
30 | for (i in 1:length(samples)) {
31 |   # Input files and variables
32 |   STAR_output <- read.table(snakemake@input$STAR_output[i],
33 |                             skip = 5, sep = "\t",
34 |                             fill = TRUE, stringsAsFactors = FALSE)
35 |   mysample <- samples[i]
36 |   # Read files
37 |   # bbmap_log = read.table(snakemake@input$repaired[i], sep=':', header=FALSE, skip=8, row.names=1, nrows=4)
38 |   # reads_after_filtering = as.numeric(str_match(bbmap_log['Pairs',], pattern = "\t([0-9]{1,20}) reads.*")[,2])/2
39 |   bbmap_log <- readLines(snakemake@input$repaired[i])
40 |   reads_after_filtering <- as.numeric(str_match(bbmap_log[grep("^Pairs:", bbmap_log)],
41 |                                                 pattern = "\t([0-9]{1,20}) reads.*")[, 2]) / 2
42 |   R1_filtered <- read.table(snakemake@input$R1_filtered[i], header = FALSE, skip = 7, sep = ":", nrows = 7, row.names = 1)
43 |   total_reads <- as.numeric(str_replace_all(R1_filtered["Total reads processed", ], pattern = (" |,"), ""))
44 | 
45 |   # R2_filtered = read.table(snakemake@input$R2_filtered[i], header = FALSE, skip=8, sep=':', nrows=7, row.names=1)
46 | 
47 |   # R1_adapters = as.numeric(str_remove_all(str_match(R1_filtered['Reads with adapters',], pattern = "(.*) \\(")[,2], pattern = (' |,')))
48 |   # R1_too_short = as.numeric(str_remove_all(str_match(R1_filtered['Reads that were too short',], pattern = "(.*) \\(")[,2], pattern = (' |,')))
49 |   # R1_passed = as.numeric(str_remove_all(str_match(R1_filtered['Reads written (passing filters)',], pattern = "(.*) \\(")[,2], pattern = (' |,')))
50 |   # R1_filtered = total_reads - R1_passed
51 | 
52 |   # R2_adapters = as.numeric(str_remove_all(str_match(R2_filtered['Reads with adapters',], pattern = "(.*) \\(")[,2], pattern = (' |,')))
53 |   # R2_too_short = as.numeric(str_remove_all(str_match(R2_filtered['Reads that were too short',], pattern = "(.*) \\(")[,2], pattern = (' |,')))
54 |   # R2_passed = as.numeric(str_remove_all(str_match(R2_filtered['Reads written (passing filters)',], pattern = "(.*) \\(")[,2], pattern = (' |,')))
55 |   # R2_filtered = total_reads - R2_passed
56 | 
57 |   mydata[which(mydata$Sample == mysample), "Cutadapt filtered"] <- total_reads - reads_after_filtering
58 |   mydata[which(mydata$Sample == mysample), "Total reads"] <- total_reads
59 | 
60 |   # STAR output
61 |   reads_in <- as.numeric(STAR_output$V2[1])
62 |   uniquely_mapped <- as.numeric(STAR_output$V2[4])
63 |   multi_mapped <- as.numeric(STAR_output$V2[19])
64 |   unmapped <- reads_in - uniquely_mapped - multi_mapped
65 | 
66 |   mydata[which(mydata$Sample == mysample), "Uniquely mapped"] <- uniquely_mapped
67 |   mydata[which(mydata$Sample == mysample), "Multi mapped"] <- multi_mapped
68 |   mydata[which(mydata$Sample == mysample), "Unmapped"] <- unmapped
69 | }
70 | 
71 | # tidyr version
72 | mydata_long <- mydata %>% gather(variable, value, -Sample, -Batch)
73 | # melt will be retired, use gather instead: https://github.com/hadley/reshape
74 | # Force factor order.
75 | mydata_long$variable <- factor(mydata_long$variable, levels = c('Cutadapt filtered', 'Multi mapped', 'Total reads', 'Unmapped', 'Uniquely mapped'))
76 | color_palette <- c('#e88270', '#cb7262', '#ae6254', '#70d6e8')
77 | 
78 | 
79 | p1 <- ggplot(subset(mydata_long, mydata_long$variable != "Total reads"),
80 |              aes(x = Sample, y = value, fill = variable)) +
81 |   geom_bar(stat = "identity") +
82 |   theme(axis.text.x = element_text(angle = 90, hjust = 0)) +
83 |   labs(title = paste("Yield of all the reads for each category"),
84 |        x = "Samples",
85 |        y = "Number of reads") +
86 |   theme(axis.title.x = element_blank(),
87 |         axis.text.x = element_blank(),
88 |         axis.ticks.x = element_blank(),
89 |         legend.position = "none",
90 |         plot.title = element_text(size = 20, face = "bold")) +
91 |   facet_grid(~Batch, scales = "free") +
92 |   scale_fill_viridis(discrete = TRUE, option = "viridis") +
93 |   scale_y_continuous(labels = scales::scientific)
94 | 
95 | mydata_pct <- mydata[, -c(1, 2)] / mydata[, "Total reads"]
96 | mydata_pct <- cbind(Sample = mydata[, "Sample"],
97 |                     Batch = mydata[, "Batch"], mydata_pct)
98 | 
99 | mydata_long_pct <- mydata_pct %>% gather(variable, value, -Sample, -Batch)
100 | 
101 | mydata_long_pct$variable <- factor(mydata_long_pct$variable, levels = c('Cutadapt filtered', 'Multi mapped', 'Total reads', 'Unmapped', 'Uniquely mapped'))
102 | 
103 | p2 <- ggplot(subset(mydata_long_pct, mydata_long_pct$variable != "Total reads"),
104 |              aes(x = Sample, y = value, fill = variable)) +
105 |   labs(fill = "Filters") +
106 |   geom_bar(stat = "identity") +
107 |   theme(axis.text.x = element_text(angle = 90, hjust = 0),
108 |         legend.position = "bottom",
109 |         strip.background = element_blank(),
110 |         strip.text.x = element_blank()) +
111 |   labs(x = "Samples",
112 |        y = "Percentage of reads") +
113 |   facet_grid(~Batch, scales = "free") +
114 |   scale_fill_viridis(discrete = TRUE, option = "viridis") +
115 |   scale_y_continuous(labels = scales::percent)
116 | 
117 | gp1 <- ggplotGrob(p1)
118 | gp2 <- ggplotGrob(p2)
119 | 
120 | pdf(file = snakemake@output$pdf, width = 16, height = 13)
121 | grid::grid.newpage()
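# rbind() on the two gtables stacks them while size = "last" forces a common
# panel width, so the bars of both panels line up. A plain
# gridExtra::grid.arrange(gp1, gp2, ncol = 1) would also stack the plots but
# does not align panel widths, hence the gtable approach below.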
122 | grid::grid.draw(rbind(gp1, gp2, size = "last"))
123 | dev.off()
124 | 
125 | if (debug_flag) {
126 |   save.image(file = file.path(path_debug, "plot_yield_workspace.rdata"))
127 | }
128 | 
-------------------------------------------------------------------------------- /scripts/publication_text.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Publication_text"
3 | output: html_document
4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
5 | ---
6 | 
7 | ```{r libraries, message=FALSE, warning=FALSE, include=FALSE}
8 | library(yaml)
9 | ```
10 | 
11 | ```{r load_yaml, message=FALSE, warning=FALSE, include=FALSE, paged.print=FALSE}
12 | versions = list()
13 | for (yaml_file in snakemake@input$yaml_files){
14 |   current_env = yaml.load_file(yaml_file)
15 |   for (package in current_env$dependencies){
16 |     if(grepl(pattern = 'cutadapt', package)){
17 |       versions[['cutadapt']] = strsplit(package,'=|==| ==| == ')[[1]][2]
18 |     }
19 |     else if(grepl(pattern = 'star', package)){
20 |       versions[['star']] = strsplit(package,'=|==| ==| == ')[[1]][2]
21 |     }
22 |     else if(grepl(pattern = 'dropseq_tools', package)){
23 |       versions[['dropseq_tools']] = strsplit(package,'=|==| ==| == ')[[1]][2]
24 |     }
25 |     else if(grepl(pattern = 'bbmap', package)){
26 |       versions[['bbmap']] = strsplit(package,'=|==| ==| == ')[[1]][2]
27 |     }
28 |   }
29 | }
30 | 
31 | umi_distance=snakemake@config$EXTRACTION$`UMI-edit-distance`
32 | ```
33 | 
34 | Pipeline
35 | --------------------------
36 | Data was processed using dropSeqPipe `r paste0('v',snakemake@config$version)`. The parameters used are provided in the configuration file in the repository XXXXXX. Rerunning the pipeline can easily be done by following the instructions at this address: https://hoohm.github.io/dropSeqPipe/
37 | 
38 | Trimming and filtering
39 | --------------------------
40 | Read trimming and filtering were performed with cutadapt `r paste0('v',versions[['cutadapt']])` on both fastq files separately. Reads with a missing mate were discarded using bbmap `r paste0('v',versions[['bbmap']])`.
41 | 
42 | 
43 | Mapping
44 | --------------------------
45 | Mapping was performed with STAR `r paste0('v',versions[['star']])`. Multimapped reads were discarded. Annotation release `r paste0(snakemake@config$META$species[[1]]$release)` and genome build `r paste0(snakemake@config$META$species[[1]]$build)` for `r paste0(names(snakemake@config$META$species)[1])` were downloaded from Ensembl.
46 | 
47 | 
48 | Barcodes
49 | --------------------------
50 | Demultiplexing as well as file manipulation were performed using dropseq_tools `r paste0('v',versions[['dropseq_tools']])`. We used an edit distance of 1 base for cell barcodes and `r paste0(umi_distance)` for UMI barcodes.
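<!-- Example of the version strings parsed in the load_yaml chunk above
(illustrative only): a conda environment dependency such as "cutadapt=1.16"
is split on '=' so that versions[['cutadapt']] becomes "1.16". -->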
-------------------------------------------------------------------------------- /scripts/repair_barcodes.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import pysam 3 | 4 | def load_obj(name): 5 | with open(name, 'rb') as f: 6 | return pickle.load(f) 7 | 8 | def save_obj(obj, name): 9 | with open(name, 'wb') as f: 10 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 11 | 12 | 13 | infile_bam = pysam.AlignmentFile(snakemake.input.bam, "rb") 14 | outfile = pysam.AlignmentFile(snakemake.output.bam, "wb", template=infile_bam) 15 | 16 | mapping = load_obj(snakemake.input.barcode_mapping) 17 | barcode_ref = load_obj(snakemake.input.barcode_ref) 18 | barcode_ext_ref = load_obj(snakemake.input.barcode_ext_ref) 19 | unknown_barcodes = set() 20 | 21 | for bam_read in infile_bam: 22 | barcode = bam_read.get_tag('XC') 23 | #lane_number = bam_read.query_name.split(':')[3] 24 | if barcode in barcode_ref: 25 | mapping[0][barcode]['count'] += 1 26 | #mapping[0][barcode]['lanes'][lane_number] += 1 27 | outfile.write(bam_read) 28 | continue 29 | elif barcode in barcode_ext_ref: 30 | # The barcode is in our extended reference. Change the barcode to the original one 31 | reference_barcode = mapping[1][barcode]['ref'] 32 | mapping[1][barcode]['count'] += 1 33 | #mapping[1][barcode]['lanes'][lane_number] += 1 34 | bam_read.set_tag('XC',reference_barcode,value_type='Z',replace=True) 35 | outfile.write(bam_read) 36 | continue 37 | else: 38 | # If the barcode is not found in the extended ref, then don't modify it. 39 | if barcode in unknown_barcodes: 40 | mapping['unknown'][barcode]['count'] += 1 41 | #mapping['unknown'][barcode]['lanes'][lane_number] += 1 42 | else: 43 | #mapping['unknown'][barcode] = {'count':1, 'lanes':{'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0}} 44 | mapping['unknown'][barcode] = {'count':1} 45 | #mapping['unknown'][barcode]['lanes'][lane_number] += 1 46 | unknown_barcodes.add(barcode) 47 | outfile.write(bam_read) 48 | 49 | save_obj(obj=mapping, name=snakemake.output.barcode_mapping_counts) 50 | -------------------------------------------------------------------------------- /scripts/umi_tools_extended_ref.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from collections import defaultdict 4 | import pickle 5 | 6 | 7 | def save_obj(obj, name): 8 | with open(name, 'wb') as f: 9 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 10 | 11 | 12 | mapping=defaultdict(dict) 13 | barcode_ref = set() 14 | barcode_ext_ref = set() 15 | 16 | 17 | with open(snakemake.input['whitelist'],'r') as whitelist: 18 | for line in whitelist: 19 | if len(line.strip().split()) == 2: # This means we didn't find any other linked barcode 20 | (reference,counts_ref) = line.strip().split() 21 | mapping[0][reference]= defaultdict() 22 | mapping[0][reference]['ref'] = reference 23 | mapping[0][reference]['count'] = 0 24 | mapping[0][reference]['lanes'] = {'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0} 25 | barcode_ref.add(reference) 26 | continue 27 | (reference,extended_ref,counts_ref,counts_ext) = line.strip().split() 28 | mapping[0][reference]= defaultdict() 29 | mapping[0][reference]['ref'] = reference 30 | mapping[0][reference]['count'] = 0 31 | mapping[0][reference]['lanes'] = {'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0} 32 | barcode_ref.add(reference) 33 | for barcode in extended_ref.split(','): 34 | mapping[1][barcode] = defaultdict() 35 | mapping[1][barcode]['ref'] = reference 36 | 
mapping[1][barcode]['count'] = 0 37 | mapping[1][barcode]['lanes'] = {'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0} 38 | barcode_ext_ref.update(extended_ref.split(',')) 39 | 40 | # Save mapping and references to reuse later. 41 | save_obj(obj=mapping, name=snakemake.output.barcode_mapping) 42 | save_obj(obj=barcode_ref,name=snakemake.output.barcode_ref) 43 | save_obj(obj=barcode_ext_ref,name=snakemake.output.barcode_ext_ref) -------------------------------------------------------------------------------- /templates/NexteraPE-PE.fa: -------------------------------------------------------------------------------- 1 | >PrefixNX/1 2 | AGATGTGTATAAGAGACAG 3 | >PrefixNX/2 4 | AGATGTGTATAAGAGACAG 5 | >Trans1 6 | TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG 7 | >Trans1_rc 8 | CTGTCTCTTATACACATCTGACGCTGCCGACGA 9 | >Trans2 10 | GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG 11 | >Trans2_rc 12 | CTGTCTCTTATACACATCTCCGAGCCCACGAGAC -------------------------------------------------------------------------------- /templates/TruSeq2-PE.fa: -------------------------------------------------------------------------------- 1 | >PrefixPE/1 2 | AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 3 | >PrefixPE/2 4 | CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT 5 | >PCR_Primer1 6 | AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 7 | >PCR_Primer1_rc 8 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT 9 | >PCR_Primer2 10 | CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT 11 | >PCR_Primer2_rc 12 | AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG 13 | >FlowCell1 14 | TTTTTTTTTTAATGATACGGCGACCACCGAGATCTACAC 15 | >FlowCell2 16 | TTTTTTTTTTCAAGCAGAAGACGGCATACGA -------------------------------------------------------------------------------- /templates/TruSeq2-SE.fa: -------------------------------------------------------------------------------- 1 | >TruSeq2_SE 2 | AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG 3 | >TruSeq2_PE_f 4 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT 5 | >TruSeq2_PE_r 6 | AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG -------------------------------------------------------------------------------- /templates/TruSeq3-PE-2.fa: -------------------------------------------------------------------------------- 1 | >PrefixPE/1 2 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT 3 | >PrefixPE/2 4 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 5 | >PE1 6 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT 7 | >PE1_rc 8 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA 9 | >PE2 10 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 11 | >PE2_rc 12 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC -------------------------------------------------------------------------------- /templates/TruSeq3-PE.fa: -------------------------------------------------------------------------------- 1 | >PrefixPE/1 2 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT 3 | >PrefixPE/2 4 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT -------------------------------------------------------------------------------- /templates/TruSeq3-SE.fa: -------------------------------------------------------------------------------- 1 | >TruSeq3_IndexedAdapter 2 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC 3 | >TruSeq3_UniversalAdapter 4 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA -------------------------------------------------------------------------------- /templates/cluster.yaml: -------------------------------------------------------------------------------- 1 | __default__: 2 | time: "03:00:00" 3 | mem: 4g 4 | output: "logs/cluster/{rule}.{wildcards.sample}.out" 5 | error: "logs/cluster/{rule}.{wildcards.sample}.err" 6 | n: 
'{threads}' 7 | fastqc_barcodes: 8 | jobname: fastqc_barcodes 9 | time: "01:00:00" 10 | mem: 1g 11 | fastqc_reads: 12 | jobname: fastqc_reads 13 | time: "01:00:00" 14 | mem: 1g 15 | STAR_align: 16 | n: 24 17 | create_star_index: 18 | n: 1 19 | time: "04:00:00" 20 | mem: 64g 21 | output: "logs/cluster/{rule}.out" 22 | error: "logs/cluster/{rule}.err" 23 | -------------------------------------------------------------------------------- /templates/config.yaml: -------------------------------------------------------------------------------- 1 | CONTACT: 2 | email: john.doe@john.com 3 | person: John Doe 4 | LOCAL: 5 | temp-directory: 6 | memory: 4g 7 | raw_data: 8 | results: results 9 | META: 10 | species: 11 | SPECIES_ONE: 12 | build: 13 | release: 14 | ratio: 0.2 15 | reference-directory: 16 | gtf_biotypes: gtf_biotypes.yaml 17 | 18 | FILTER: 19 | barcode-whitelist: '' 20 | 5-prime-smart-adapter: '' 21 | cell-barcode: 22 | start: 23 | end: 24 | UMI-barcode: 25 | start: 26 | end: 27 | cutadapt: 28 | adapters-file: 29 | R1: 30 | quality-filter: 20 31 | maximum-Ns: 1 32 | extra-params: '' 33 | R2: 34 | quality-filter: 20 35 | minimum-adapters-overlap: 6 36 | minimum-length: 15 37 | extra-params: '' 38 | MAPPING: 39 | STAR: 40 | genomeChrBinNbits: 18 41 | outFilterMismatchNmax: 10 42 | outFilterMismatchNoverLmax: 0.3 43 | outFilterMismatchNoverReadLmax: 1 44 | outFilterMatchNmin: 0 45 | outFilterMatchNminOverLread: 0.66 46 | outFilterScoreMinOverLread: 0.66 47 | EXTRACTION: 48 | LOCUS: 49 | - CODING 50 | - UTR 51 | strand-strategy: SENSE 52 | UMI-edit-distance: 1 53 | minimum-counts-per-UMI: 0 54 | DEBUG: False -------------------------------------------------------------------------------- /templates/config_nadia.yaml: -------------------------------------------------------------------------------- 1 | # Example config template for Dolomite Bio’s Nadia Instrument 2 | # https://www.dolomite-bio.com/ 3 | # Usage: Copy into project root folder and rename to 'config.yaml' 4 | DEBUG: FALSE 5 | CONTACT: 6 | email: luke@mail.com 7 | person: Luke Dropwalker 8 | LOCAL: 9 | temp-directory: ./tmp 10 | memory: 60g 11 | raw_data: data 12 | results: results 13 | META: 14 | species: 15 | # this list two species which is meant for mixed species 16 | # for single species, just delete one of the two (and/or edit the species as required) 17 | mus_musculus: 18 | build: 38 19 | release: 91 20 | homo_sapiens: 21 | build: 38 22 | release: 91 23 | # for mixed species: threshold for calling a STAMP mixed (i.e. 
0.2 means at least 20% from both species) 24 | ratio: 0.2 25 | reference-directory: /path/to/reference-dir 26 | gtf_biotypes: gtf_biotypes.yaml 27 | FILTER: 28 | barcode-whitelist: '' 29 | 5-prime-smart-adapter: CCTACACGACGCTCTTCCGATCT 30 | cell-barcode: 31 | start: 1 32 | end: 12 33 | min-quality: 3 34 | num-below-quality: 0 35 | UMI-barcode: 36 | start: 13 37 | end: 20 38 | min-quality: 3 39 | num-below-quality: 0 40 | cutadapt: 41 | adapters-file: custom_adapters.fa 42 | R1: 43 | quality-filter: 20 44 | maximum-Ns: 0 45 | extra-params: '' 46 | R2: 47 | quality-filter: 20 48 | minimum-adapters-overlap: 6 49 | minimum-length: 15 50 | extra-params: '' 51 | simpleClipThreshold: 10 52 | MAPPING: 53 | STAR: 54 | genomeChrBinNbits: 18 55 | outFilterMismatchNmax: 10 56 | outFilterMismatchNoverLmax: 0.3 57 | outFilterMismatchNoverReadLmax: 1 58 | outFilterMatchNmin: 0 59 | outFilterMatchNminOverLread: 0.66 60 | outFilterScoreMinOverLread: 0.66 61 | EXTRACTION: 62 | LOCUS: 63 | - CODING 64 | - UTR 65 | strand-strategy: SENSE 66 | UMI-edit-distance: 1 67 | minimum-counts-per-UMI: 0 68 | DOUBLET_DETECTION: 69 | min_counts: 1 70 | min_cells: 0 71 | min_gene_variability_pctl: 85 72 | n_prin_comps: 20 73 | -------------------------------------------------------------------------------- /templates/custom_adapters.fa: -------------------------------------------------------------------------------- 1 | >Illumina_Universal 2 | AGATCGGAAGAG 3 | >PrefixNX/1 4 | AGATGTGTATAAGAGACAG 5 | >Trans1 6 | TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG 7 | >Trans1_rc 8 | CTGTCTCTTATACACATCTGACGCTGCCGACGA 9 | >Trans2 10 | GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG 11 | >Trans2_rc 12 | CTGTCTCTTATACACATCTCCGAGCCCACGAGAC 13 | >polyA 14 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 15 | >polyT 16 | TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT 17 | >polyC 18 | CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC 19 | >polyG 20 | GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG 21 | >drop-seq 22 | GTACTCTGCGTTGATACCACTGCTTCCGCGGACAGGC 23 | >Nextera 24 | CTGTCTCTTATACACATCT 25 | -------------------------------------------------------------------------------- /templates/gtf_biotypes.yaml: -------------------------------------------------------------------------------- 1 | biotypes: 2 | - 3prime_overlapping_ncRNA 3 | - antisense 4 | - bidirectional_promoter_lncRNA 5 | - IG_C_gene 6 | - IG_C_pseudogene 7 | - IG_D_gene 8 | - IG_J_gene 9 | - IG_J_pseudogene 10 | - IG_pseudogene 11 | - IG_V_gene 12 | - IG_V_pseudogene 13 | - lincRNA 14 | - macro_lncRNA 15 | - miRNA 16 | - misc_RNA 17 | - Mt_rRNA 18 | - Mt_tRNA 19 | - non_coding 20 | - polymorphic_pseudogene 21 | - processed_pseudogene 22 | - processed_transcript 23 | - protein_coding 24 | - pseudogene 25 | - ribozyme 26 | - rRNA 27 | - scaRNA 28 | - scRNA 29 | - sense_intronic 30 | - sense_overlapping 31 | - snoRNA 32 | - snRNA 33 | - sRNA 34 | - TEC 35 | - transcribed_processed_pseudogene 36 | - transcribed_unitary_pseudogene 37 | - transcribed_unprocessed_pseudogene 38 | - translated_processed_pseudogene 39 | - TR_C_gene 40 | - TR_D_gene 41 | - TR_J_gene 42 | - TR_J_pseudogene 43 | - TR_V_gene 44 | - TR_V_pseudogene 45 | - unitary_pseudogene 46 | - unprocessed_pseudogene 47 | - vaultRNA 48 | -------------------------------------------------------------------------------- /templates/samples.csv: -------------------------------------------------------------------------------- 1 | samples,expected_cells,read_length,batch 2 | sample1,100,75,Batch1 --------------------------------------------------------------------------------