├── .gitattributes ├── .gitignore ├── .gitmodules ├── .test ├── NexteraPE-PE.fa ├── config.yaml ├── gtf_biotypes.yaml └── samples.csv ├── .travis.yml ├── LICENSE.txt ├── README.md ├── Snakefile ├── docs ├── .gitignore ├── Snakefile ├── docs │ ├── CHANGELOG.md │ ├── Clusters.md │ ├── Create-config-files.md │ ├── FAQ.md │ ├── Installation.md │ ├── Plots.md │ ├── Reference-Files.md │ ├── Running-dropSeqPipe.md │ ├── images │ │ ├── adapter_content.png │ │ ├── hum_mus_species_plot_transcripts.png │ │ ├── mac_Count_vs_gene.png │ │ ├── mac_UMI_vs_counts.png │ │ ├── mac_UMI_vs_gene.png │ │ ├── mac_violinplots_comparison_UMI.png │ │ ├── sample1_knee_plot.png │ │ ├── sample1_rna_metrics.png │ │ └── yield.png │ └── index.md ├── mkdocs.yml └── mkdocs_env.yml ├── envs ├── bbmap.yaml ├── cutadapt.yaml ├── dropseq_tools.yaml ├── merge.yaml ├── merge_bam.yaml ├── merge_long.yaml ├── picard.yaml ├── pigz.yaml ├── r.yaml ├── samtools.yaml ├── star.yaml ├── umi_tools.yaml └── velocyto.yaml ├── rules ├── cell_barcodes.smk ├── download_meta_mixed.smk ├── download_meta_single.smk ├── extract_expression_single.smk ├── extract_expression_species.smk ├── fastqc.smk ├── filter.smk ├── generate_meta.smk ├── map.smk ├── merge.smk ├── prepare.smk ├── report.smk └── split_species.smk ├── schemas ├── config.schema.yaml └── samples.schema.yaml ├── scripts ├── clean_cutadapt.py ├── convert_mtx.py ├── create_summary_stats.R ├── detect_barcodes.py ├── fa2tsv.py ├── generate_extended_ref.py ├── merge_bam.py ├── plot_adapter_content.R ├── plot_knee_plot.R ├── plot_rna_metrics.R ├── plot_species_plot.R ├── plot_violine.R ├── plot_yield.R ├── publication_text.Rmd ├── repair_barcodes.py └── umi_tools_extended_ref.py └── templates ├── NexteraPE-PE.fa ├── TruSeq2-PE.fa ├── TruSeq2-SE.fa ├── TruSeq3-PE-2.fa ├── TruSeq3-PE.fa ├── TruSeq3-SE.fa ├── cluster.yaml ├── config.yaml ├── config_nadia.yaml ├── custom_adapters.fa ├── gtf_biotypes.yaml └── samples.csv /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto !eol 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .snakemake 2 | scripts/__pycache__* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule ".test/data"] 2 | path = .test/data 3 | url = https://github.com/Hoohm/scngs-test-data.git 4 | -------------------------------------------------------------------------------- /.test/NexteraPE-PE.fa: -------------------------------------------------------------------------------- 1 | >PrefixNX/1 2 | AGATGTGTATAAGAGACAG 3 | >PrefixNX/2 4 | AGATGTGTATAAGAGACAG 5 | >Trans1 6 | TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG 7 | >Trans1_rc 8 | CTGTCTCTTATACACATCTGACGCTGCCGACGA 9 | >Trans2 10 | GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG 11 | >Trans2_rc 12 | CTGTCTCTTATACACATCTCCGAGCCCACGAGAC -------------------------------------------------------------------------------- /.test/config.yaml: -------------------------------------------------------------------------------- 1 | CONTACT: 2 | email: user.name@provider.com 3 | person: John Doe 4 | LOCAL: 5 | temp-directory: /tmp 6 | memory: 4g 7 | raw_data: data 8 | results: results 9 | META: 10 | species: 11 | mus_musculus: 12 | build: 38 13 | release: 91 14 | ratio: 0.2 15 | reference-directory: data/ref 16 | gtf_biotypes: gtf_biotypes.yaml 
17 | 18 | FILTER: 19 | barcode-whitelist: '' 20 | 5-prime-smart-adapter: CCTACACGACGCTCTTCCGATCT 21 | cell-barcode: 22 | start: 2 23 | end: 6 24 | UMI-barcode: 25 | start: 7 26 | end: 16 27 | cutadapt: 28 | adapters-file: NexteraPE-PE.fa 29 | R1: 30 | quality-filter: 20 31 | maximum-Ns: 0 32 | extra-params: '' 33 | R2: 34 | quality-filter: 20 35 | minimum-adapters-overlap: 6 36 | minimum-length: 15 37 | extra-params: '' 38 | MAPPING: 39 | STAR: 40 | genomeChrBinNbits: 18 41 | outFilterMismatchNmax: 10 42 | outFilterMismatchNoverLmax: 0.3 43 | outFilterMismatchNoverReadLmax: 1 44 | outFilterMatchNmin: 0 45 | outFilterMatchNminOverLread: 0.66 46 | outFilterScoreMinOverLread: 0.66 47 | EXTRACTION: 48 | LOCUS: 49 | - CODING 50 | - UTR 51 | strand-strategy: SENSE 52 | UMI-edit-distance: 1 53 | minimum-counts-per-UMI: 0 54 | DEBUG: True -------------------------------------------------------------------------------- /.test/gtf_biotypes.yaml: -------------------------------------------------------------------------------- 1 | biotypes: 2 | - 3prime_overlapping_ncRNA 3 | - antisense 4 | - bidirectional_promoter_lncRNA 5 | - IG_C_gene 6 | - IG_C_pseudogene 7 | - IG_D_gene 8 | - IG_J_gene 9 | - IG_J_pseudogene 10 | - IG_pseudogene 11 | - IG_V_gene 12 | - IG_V_pseudogene 13 | - lincRNA 14 | - macro_lncRNA 15 | - miRNA 16 | - misc_RNA 17 | - Mt_rRNA 18 | - Mt_tRNA 19 | - non_coding 20 | - polymorphic_pseudogene 21 | - processed_pseudogene 22 | - processed_transcript 23 | - protein_coding 24 | - pseudogene 25 | - ribozyme 26 | - rRNA 27 | - scaRNA 28 | - scRNA 29 | - sense_intronic 30 | - sense_overlapping 31 | - snoRNA 32 | - snRNA 33 | - sRNA 34 | - TEC 35 | - transcribed_processed_pseudogene 36 | - transcribed_unitary_pseudogene 37 | - transcribed_unprocessed_pseudogene 38 | - translated_processed_pseudogene 39 | - TR_C_gene 40 | - TR_D_gene 41 | - TR_J_gene 42 | - TR_J_pseudogene 43 | - TR_V_gene 44 | - TR_V_pseudogene 45 | - unitary_pseudogene 46 | - unprocessed_pseudogene 47 | - vaultRNA 48 | -------------------------------------------------------------------------------- /.test/samples.csv: -------------------------------------------------------------------------------- 1 | samples,expected_cells,read_length,batch 2 | sample1,100,75,Batch1 3 | sample2,100,75,Batch2 -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | 5 | branches: 6 | only: 7 | - master 8 | - develop 9 | 10 | install: 11 | - sudo apt-get update 12 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 13 | - bash miniconda.sh -b -p $HOME/miniconda 14 | - export PATH="$HOME/miniconda/bin:$PATH" 15 | - hash -r 16 | - conda config --set always_yes yes --set changeps1 no 17 | - conda update -q conda 18 | # Useful for debugging any issues with conda 19 | - conda info -a 20 | - conda config --add channels defaults 21 | - conda config --add channels conda-forge 22 | - conda config --add channels bioconda 23 | - conda install -c bioconda -c conda-forge snakemake 24 | - conda create -q -n snakemake snakemake>=5.3.1 python=$TRAVIS_PYTHON_VERSION 25 | script: 26 | # run the workflow 27 | - snakemake --use-conda --directory .test -p 28 | 29 | after_success: 30 | - cd docs && snakemake --use-conda build_docs 31 | 32 | deploy: 33 | provider: pages 34 | skip-cleanup: true 35 | github-token: $GITHUB_PAT 36 | keep-history: true 37 | 
local-dir: docs/site 38 | on: 39 | branch: master 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Snakemake](https://img.shields.io/badge/snakemake-≥4.1.0-brightgreen.svg)](https://snakemake.bitbucket.io) 2 | [![Build Status](https://travis-ci.org/Hoohm/dropSeqPipe.svg?branch=master)](https://travis-ci.org/Hoohm/dropSeqPipe) 3 | 4 | Description 5 | ------------------ 6 | This pipeline is based on [snakemake](https://snakemake.readthedocs.io/en/stable/) and the dropseq tools provided by the [McCarroll Lab](http://mccarrolllab.com/dropseq/). It allows to go from raw data of your Single Cell RNA seq experiment until the final count matrix with QC plots along the way. 7 | 8 | This is the tool we use in our lab to improve our wetlab protocol as well as provide an easy framework to reproduce and compare different experiments with different parameters. 9 | 10 | It uses STAR to map the reads. It is usable for any single cell protocol using two reads where the first one holds the Cell and UMI barcodes and the second read holds the RNA. Here is a non-exhausitve list of compatible protocols/brands: 11 | 12 | * Drop-Seq 13 | * SCRB-Seq 14 | * 10x Genomics 15 | * DroNc-seq 16 | * Dolomite Bio ([Nadia Instrument](https://www.dolomite-bio.com/product/nadia-instrument/)) 17 | 18 | This package is trying to be as user friendly as possible. One of the hopes is that non-bioinformatician can make use of it without too much hassle. It will still require some command line execution, this is not going to be fully interactive package. 19 | 20 | 21 | ## Authors 22 | 23 | * Patrick Roelli ([@Hoohm)](https://github.com/Hoohm)) 24 | * Sebastian Mueller ([@seb-mueller)](https://github.com/seb-mueller)) 25 | * Charles Girardot ([@cgirardot)](https://github.com/cgirardot)) 26 | 27 | ## Usage 28 | 29 | ### Step 1: Install workflow 30 | 31 | If you simply want to use this workflow, download and extract the [latest release](https://github.com/Hoohm/dropSeqPipe/releases). 32 | If you intend to modify and further develop this workflow, fork this reposity. Please consider providing any generally applicable modifications via a pull request. 33 | 34 | In any case, if you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this repository and, once available, its DOI. 35 | 36 | ### Step 2: Configure workflow 37 | 38 | Configure the workflow according to your needs via editing the file `config.yaml` and the `samples.tsv` following those [instructions](https://github.com/Hoohm/dropSeqPipe/wiki/Create-config-files) 39 | 40 | ### Step 3: Execute workflow 41 | 42 | All you need to execute this workflow is to install Snakemake via the [Conda package manager](http://snakemake.readthedocs.io/en/stable/getting_started/installation.html#installation-via-conda). Software needed by this workflow is automatically deployed into isolated environments by Snakemake. 43 | 44 | Test your configuration by performing a dry-run via 45 | 46 | snakemake --use-conda -n --directory $WORKING_DIR 47 | 48 | Execute the workflow locally via 49 | 50 | snakemake --use-conda --cores $N --directory $WORKING_DIR 51 | 52 | using `$N` cores on the `$WORKING_DIR`. Alternatively, it can be run in cluster or cloud environments (see [the docs](http://snakemake.readthedocs.io/en/stable/executable.html) for details). 
53 | 54 | If you not only want to fix the software stack but also the underlying OS, use 55 | 56 | snakemake --use-conda --use-singularity 57 | 58 | in combination with any of the modes above. 59 | 60 | ### Step 4: Investigate results 61 | 62 | After successful execution, you can create a self-contained report with all results via: 63 | 64 | snakemake --report report.html 65 | 66 | 67 | Documentation 68 | ------------------ 69 | You can find the documentation [here](https://hoohm.github.io/dropSeqPipe/) 70 | 71 | Future implementations 72 | --------------------------- 73 | I'm actively seeking help to implement the points listed bellow. Don't hesitate to contact me if you wish to contribute. 74 | 75 | * Create a sharing platform where quality plots/logs can be discussed and troubleshooted. 76 | * Create a full html report for the whole pipeline 77 | * Multiqc module for drop-seq-tools 78 | * Implement an elegant "preview" mode where the pipeline would only run on a couple of millions of reads and allow you to have an approximated view before running all of the data. This would dramatically reduce the time needed to get an idea of what filters whould be used. 79 | * 80 | 81 | I hope it can help you out in your single cell experiments! 82 | 83 | Feel free to comment and point out potential improvements via [issues](https://github.com/Hoohm/dropSeqPipe/issues) 84 | 85 | 86 | TODO 87 | --------------------------------------------- 88 | * Add a mixed reference reference for testing purposes 89 | * Finalize the parameters validation schema 90 | * Make the debug feature a bit "cleaner". Deal with automatic naming of the debug variables 91 | * Implement ddseq barcoding strategies 92 | -------------------------------------------------------------------------------- /Snakefile: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import re 4 | import glob 5 | from snakemake.utils import validate, min_version 6 | 7 | singularity: 8 | "shub://seb-mueller/singularity_dropSeqPipe:v04" 9 | 10 | min_version("5.1.2") 11 | 12 | #print(os.path.abspath(os.path.dirname(workflow.snakefile))) 13 | 14 | # Load configuration files 15 | 16 | try: 17 | configfile_path = config['configfile_path'] 18 | except: 19 | configfile_path = "config.yaml" 20 | configfile: configfile_path 21 | 22 | 23 | #Include the gtf biotypes yaml 24 | configfile: config['META']['gtf_biotypes'] 25 | 26 | # Define a few variables to make them easier to reference 27 | snakefile_root_path = os.path.abspath(os.path.dirname(workflow.snakefile)) 28 | ref_path = config['META']['reference-directory'] 29 | barcode_whitelist = config['FILTER']['barcode-whitelist'] 30 | results_dir = config['LOCAL']['results'] 31 | raw_data_dir = config['LOCAL']['raw_data'] 32 | 33 | # dropSeqPipe version 34 | config['version'] = '0.5' 35 | validate(config, schema=os.path.join(snakefile_root_path,"schemas","config.schema.yaml")) 36 | 37 | 38 | # In order to deal with single species or mixed species experiment 39 | # we define the same variables for each case. 
40 | 41 | 42 | #Define variables for mixed species experiments 43 | if len(config['META']['species'].keys()) == 2: 44 | print('Running the pipeline for a mixed experiment') 45 | species_list = list(config['META']['species']) 46 | build_list = [ 47 | config['META']['species'][species_list[0]]['build'], 48 | config['META']['species'][species_list[1]]['build']] 49 | release_list = [ 50 | config['META']['species'][species_list[0]]['release'], 51 | config['META']['species'][species_list[1]]['release']] 52 | 53 | for species in config['META']['species']: 54 | release = '{}.{}'.format( 55 | config['META']['species'][species_list[0]]['release'], 56 | config['META']['species'][species_list[1]]['release']) 57 | build = '{}.{}'.format( 58 | config['META']['species'][species_list[0]]['build'], 59 | config['META']['species'][species_list[1]]['build']) 60 | species = 'mixed_{}_{}'.format( 61 | species_list[0], 62 | species_list[1]) 63 | 64 | #Define variables for single species experiments 65 | elif len(config['META']['species'].keys()) == 1: 66 | species_list=list(config['META']['species']) 67 | species=species_list[0] 68 | release_list = [config['META']['species'][species]['release']] 69 | release=release_list[0] 70 | build_list = [config['META']['species'][species]['build']] 71 | build=build_list[0] 72 | else: 73 | exit("Number of species in the config.yaml must be one or two. Exiting") 74 | 75 | # Get sample names from samples.csv 76 | samples = pd.read_table("samples.csv", sep=',').set_index("samples", drop=False) 77 | validate(samples, schema=os.path.join(snakefile_root_path,"schemas","samples.schema.yaml")) 78 | types=['read','umi'] 79 | # Get read_lengths from samples.csv 80 | read_lengths = list(samples.loc[:,'read_length']) 81 | 82 | wildcard_constraints: 83 | sample="({})".format("|".join(samples.index)), 84 | type="({})".format("|".join(types)) 85 | 86 | 87 | # Flexible ways to get the R1 and R2 files 88 | def get_R1_files(wildcards): 89 | samples = [f for f in glob.glob("{}/*.fastq.gz".format(raw_data_dir)) if (re.search('R1', re.sub(wildcards.sample,'',f)) and re.search(wildcards.sample,f))] 90 | if len(samples)>1 & isinstance(samples,list): 91 | exit('Multiple read files for one sample. Please check file names or run snakemake -s rules/prepare.smk for multilane samples first.') 92 | if samples == []: 93 | exit('\tNo sample files found in the {}/ directory.\n\t\tPlease check that the path for the raw data is set properly in config.yaml'.format(raw_data_dir)) 94 | return(samples) 95 | 96 | def get_R2_files(wildcards): 97 | samples = [f for f in glob.glob("{}/*.fastq.gz".format(raw_data_dir)) if (re.search('R2', re.sub(wildcards.sample,'',f)) and re.search(wildcards.sample,f))] 98 | if len(samples)>1 & isinstance(samples,list): 99 | exit('Multiple read files for one sample. 
Please check file names or run snakemake -s rules/prepare.smk for multilane samples first.') 100 | if samples == []: 101 | exit('\tNo sample files found in the {} directory.\n\t\tPlease check that the path for the raw data is set properly in config.yaml'.format(raw_data_dir)) 102 | return(samples) 103 | 104 | 105 | if len(config['META']['species'].keys()) == 2: 106 | rule all: 107 | input: 108 | expand( 109 | ['{ref_path}/{species}_{build}_{release}/STAR_INDEX/SA_{read_length}/SA', 110 | #qc 111 | '{results_dir}/reports/fastqc_reads.html', 112 | '{results_dir}/reports/fastqc_barcodes.html', 113 | #fastqc_adapter 114 | 'fastqc_adapter.tsv', 115 | #filter 116 | '{results_dir}/plots/adapter_content.pdf', 117 | '{results_dir}/reports/barcode_filtering.html', 118 | '{results_dir}/reports/RNA_filtering.html', 119 | '{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz', 120 | '{results_dir}/samples/{sample}/top_barcodes.csv', 121 | #mapping 122 | '{results_dir}/plots/knee_plots/{sample}_knee_plot.pdf', 123 | '{results_dir}/reports/star.html', 124 | '{results_dir}/plots/yield.pdf', 125 | '{results_dir}/samples/{sample}/Unmapped.out.mate1.gz', 126 | #splitting 127 | '{results_dir}/plots/barnyard/{sample}_genes.pdf', 128 | '{results_dir}/plots/barnyard/{sample}_transcripts.pdf'], 129 | read_length=read_lengths, 130 | sample=samples.index, 131 | type=types, 132 | results_dir=results_dir, 133 | ref_path=config['META']['reference-directory'], 134 | build=build, 135 | release=release, 136 | species=species), 137 | expand( 138 | ['{results_dir}/samples/{sample}/{species}/umi/matrix.mtx', 139 | '{results_dir}/samples/{sample}/{species}/read/matrix.mtx', 140 | '{results_dir}/plots/rna_metrics/{sample}_{species}_rna_metrics.pdf'], 141 | results_dir=results_dir, 142 | sample=samples.index, 143 | species=species_list) 144 | 145 | elif len(config['META']['species'].keys()) == 1: 146 | rule all: 147 | input: 148 | #meta 149 | expand( 150 | ['{ref_path}/{species}_{build}_{release}/STAR_INDEX/SA_{read_length}/SA', 151 | #qc 152 | '{results_dir}/reports/fastqc_reads.html', 153 | '{results_dir}/reports/fastqc_barcodes.html', 154 | #filter 155 | '{results_dir}/plots/adapter_content.pdf', 156 | '{results_dir}/reports/barcode_filtering.html', 157 | '{results_dir}/reports/RNA_filtering.html', 158 | #mapping 159 | '{results_dir}/plots/knee_plots/{sample}_knee_plot.pdf', 160 | '{results_dir}/reports/star.html', 161 | '{results_dir}/plots/yield.pdf', 162 | '{results_dir}/samples/{sample}/Unmapped.out.mate1.gz', 163 | #extract 164 | '{results_dir}/plots/rna_metrics/{sample}_rna_metrics.pdf', 165 | '{results_dir}/summary/{type}/matrix.mtx', 166 | '{results_dir}/samples/{sample}/{type}/matrix.mtx', 167 | #merge 168 | '{results_dir}/plots/UMI_vs_counts.pdf', 169 | '{results_dir}/plots/UMI_vs_gene.pdf', 170 | '{results_dir}/plots/Count_vs_gene.pdf', 171 | '{results_dir}/summary/R_Seurat_objects.rdata', 172 | '{results_dir}/summary/barcode_stats_pre_filter.csv', 173 | '{results_dir}/summary/barcode_stats_post_filter.csv', 174 | '{results_dir}/plots/violinplots_comparison_UMI.pdf'], 175 | read_length=read_lengths, 176 | sample=samples.index, 177 | type=types, 178 | results_dir=results_dir, 179 | ref_path=config['META']['reference-directory'], 180 | build=build, 181 | release=release, 182 | species=species) 183 | rule download_meta: 184 | input: 185 | expand( 186 | ["{ref_path}/{species}_{build}_{release}/annotation.gtf", 187 | "{ref_path}/{species}_{build}_{release}/genome.fa"], 188 | 
ref_path=config['META']['reference-directory'], 189 | species=species_list, 190 | release=release, 191 | build=build) 192 | 193 | 194 | rule qc: 195 | input: 196 | expand( 197 | ['{results_dir}/reports/fastqc_reads.html', 198 | '{results_dir}/reports/fastqc_barcodes.html', 199 | 'fastqc_adapter.tsv'], 200 | results_dir=results_dir) 201 | 202 | rule filter: 203 | input: 204 | expand( 205 | ['{results_dir}/plots/adapter_content.pdf', 206 | '{results_dir}/reports/barcode_filtering.html', 207 | '{results_dir}/reports/RNA_filtering.html', 208 | '{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz', 209 | '{results_dir}/samples/{sample}/top_barcodes.csv'], 210 | results_dir=results_dir, 211 | sample=samples.index) 212 | 213 | rule map: 214 | input: 215 | expand( 216 | ['{results_dir}/plots/knee_plots/{sample}_knee_plot.pdf', 217 | '{results_dir}/reports/star.html', 218 | '{results_dir}/plots/yield.pdf', 219 | '{results_dir}/samples/{sample}/final.bam', 220 | '{results_dir}/samples/{sample}/Unmapped.out.mate1.gz'], 221 | sample=samples.index, 222 | results_dir=results_dir) 223 | 224 | rule extract: 225 | input: 226 | expand( 227 | ['{results_dir}/plots/rna_metrics/{sample}_rna_metrics.pdf', 228 | '{results_dir}/summary/{type}/matrix.mtx', 229 | '{results_dir}/samples/{sample}/{type}/matrix.mtx.gz'], 230 | results_dir=results_dir, 231 | sample=samples.index, 232 | type=types) 233 | 234 | rule split_species: 235 | input: 236 | expand( 237 | ['{results_dir}/samples/{sample}/{species}/barcodes.csv', 238 | '{results_dir}/plots/barnyard/{sample}_genes.pdf', 239 | '{results_dir}/plots/barnyard/{sample}_transcripts.pdf', 240 | '{results_dir}/samples/{sample}/{species}/unfiltered.bam'], 241 | sample=samples.index, 242 | species=config['META']['species'], 243 | results_dir=results_dir) 244 | 245 | 246 | rule extract_species: 247 | input: 248 | expand( 249 | ['{results_dir}/samples/{sample}/{species}/{type}/matrix.mtx', 250 | '{results_dir}/plots/rna_metrics/{sample}_{species}_rna_metrics.pdf'], 251 | sample=samples.index, 252 | species=config['META']['species'], 253 | results_dir=results_dir, 254 | type=types) 255 | 256 | rule merge: 257 | input: 258 | #merge 259 | expand( 260 | ['{results_dir}/plots/UMI_vs_counts.pdf', 261 | '{results_dir}/plots/UMI_vs_gene.pdf', 262 | '{results_dir}/plots/Count_vs_gene.pdf', 263 | '{results_dir}/summary/R_Seurat_objects.rdata', 264 | '{results_dir}/summary/barcode_stats_pre_filter.csv', 265 | '{results_dir}/summary/barcode_stats_post_filter.csv', 266 | '{results_dir}/plots/violinplots_comparison_UMI.pdf', 267 | '{results_dir}/summary/{type}/matrix.mtx'], 268 | results_dir=results_dir, 269 | type=types) 270 | 271 | rule make_report: 272 | input: 273 | expand('{results_dir}/reports/publication_text.html', results_dir=results_dir) 274 | 275 | if len(config['META']['species'].keys()) == 2: 276 | include: "rules/download_meta_mixed.smk" 277 | if len(config['META']['species'].keys()) == 1: 278 | include: "rules/download_meta_single.smk" 279 | 280 | include: "rules/generate_meta.smk" 281 | include: "rules/fastqc.smk" 282 | include: "rules/filter.smk" 283 | include: "rules/cell_barcodes.smk" 284 | include: "rules/map.smk" 285 | include: "rules/extract_expression_single.smk" 286 | include: "rules/split_species.smk" 287 | include: "rules/extract_expression_species.smk" 288 | include: "rules/merge.smk" 289 | include: "rules/report.smk" 290 | -------------------------------------------------------------------------------- /docs/.gitignore: 
-------------------------------------------------------------------------------- 1 | site 2 | .snakemake 3 | -------------------------------------------------------------------------------- /docs/Snakefile: -------------------------------------------------------------------------------- 1 | rule build_docs: 2 | """Build docs using mkdocs""" 3 | conda: 4 | "mkdocs_env.yml" 5 | shell: 6 | "mkdocs build" 7 | 8 | rule serve_docs: 9 | """Build docs and run through developement server""" 10 | conda: 11 | "mkdocs_env.yml" 12 | shell: 13 | "mkdocs serve" 14 | 15 | -------------------------------------------------------------------------------- /docs/docs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](http://keepachangelog.com/) 5 | and this project adheres to [Semantic Versioning](http://semver.org/). 6 | 7 | 8 | ## [0.5] 9 | ### Added 10 | - Singularity usage. Try out the `--use-singularity` option instead of `--use-conda` 11 | 12 | ### Changed 13 | - Lots off small bugfixes 14 | 15 | 16 | ## [0.4.1] 17 | ### Added 18 | - samples.csv and config.yaml schema validation. This will help users fix missing values. 19 | - DetectBeadSubstitutionErrors was added in the mapping steps. 20 | 21 | ### Changed 22 | - Minimum read length after trimming is now the index of the end of the UMI 23 | - dropSeqPipe can now run with a docker image if you use the `--use-singularity` option. This should help people with package issues and different linux setups. You need to have installed singularity system wide to use this option. 24 | 25 | 26 | ## [0.4] - 2018-12-19 27 | ### Added 28 | - Top barcode detection using [umi-tools](https://github.com/CGATOxford/UMI-tools) based on number of expected cells. 29 | - Genome reference and annotation automatically downloaded now base on build and release number from configuration file. 30 | - On the fly detection of mixed experiment. 31 | - **beta**: Generation of a report for publication describing tools used in each steps. run `make_report` after the preprocessing is done to get `reports/publication_text.html`. This is a really early stage. Feel free to suggest PR for text modifications. 32 | - Raw data, results, reference are now independent from the working dir and can be chosen via the configuration file. 33 | - dropseq_tools v2.0 implemented. This opens up new options such as choosing which locus to use for gene counting. See configuration file. 34 | - Possibility to edit which biotypes are selected from the annotation file via a gtf_biotypes.yaml file provided. 35 | - Cell barcodes are now corrected. One hamming distance for known/given whitelists, graphbased correction based on umi-tools for unknown lists. Those corrections are written in the bam files. This makes final bam files compatible for other tools using the XC/XM bam TAGS. 36 | - UMI are now also corrected based on dropseq_tools v2.0. 37 | - Possibility to choose SENSE, ANTISENSE or BOTH for read counting. 38 | - Adapter content for R1 and R2 have now their own plot, `adapter_content.pdf`. 39 | - New plot called `yield.pdf` makes a summary of total reads and how they are distributed among filtered, trimmed, mapped, etc. 40 | - Configuration file has now a CONTACT section providing a field for a person and a contact e-mail address. 41 | 42 | ### Changed 43 | - Expression matrices output are now sparse (mtx format). 
This will decrease the size of the output and loading time for downstream analysis. 44 | - Logfiles, plots and samples output are now grouped together in folders by category. This should make browsing results easier. 45 | - Fixed most of the packages versions. 46 | - Summary plots and Seurat object are now in the `all` rule and will be created by default. 47 | 48 | ### Removed 49 | - Merging of species expression accross samples. Since the mixed experiments are mostly used to test out the doublet rate of a platform and not for downstream analysis, this last part has not been updated. Single expression matrices are still there. 50 | - Cell barcodes dropped, umi barcodes dropped, starttrim and polyA trim plots are now gone. BC_drop is also removed. Replacements are adapter_content and yield plots. 51 | - Quality trimming via dropseq_tools has been removed and is now down by cutadapt. Those modifications decrease the running time of the pipeline. 52 | 53 | 54 | ## [0.32] 55 | ### Added 56 | - Documentation generated from the markdown files directly on travis-ci. 57 | 58 | 59 | ## [0.31a] 60 | ### Changed 61 | - fix on species plot. 62 | - fix on rule STAR_align adding now unmapped read to a fastq file. 63 | 64 | ### Added 65 | - Added travis integration. The pipeline is now automatically getting tested when updated and when pull requests are proposed. 66 | - There is now a small git submodule in .test which will provide a sampled file for testing the pipeline on travis-ci. 67 | 68 | ### Removed 69 | - `environment.yaml` has been removed. Youjust have to install snakemake now instead of activating the env. 70 | 71 | ## [0.31] 72 | ### Changed 73 | - Fixed error for STAR index generation. It crashed saying it couldn't write in folder. 74 | - Fixed a missing plot for plot_knee_plot_whitelist. 75 | - Input files for the STAR_align rule have been changed. Adding samples in an already aligned experiment with a different R2 length, will only align the new data and not realign the old one. 76 | - Split reads and barcodes multiqc reports for qc step. 77 | - Modified a few rules to follow the guidelines for [snakemake workflows](https://github.com/snakemake-workflows/docs) 78 | - Fixed an issue where snakemake would crash on clusters if using `expand()` on fixed variables such as `annotation_prefix`. Now using normal python formatting. 79 | - Changed the config.yaml parameters names to lowercase and hyphens! Software specific variables have their original style making it easier to search in manuals. You will have to either copy the new config.yaml from the templates or modify your own accordingly. 80 | - cell-barcode-edit-distance changed to what it actually is, UMI-edit-distance. 81 | - Updated all the envs to fix bugs. 82 | - Fixed a bug where the mixed species would not run properly. 83 | 84 | ### Added 85 | - Added ggpubr in environment.yaml file. 86 | - Added a `templates` folder which will hold `config.yaml`, `samples.csv`, `cluster.yaml` as well as adapters files. This will also help cloning the repository without overwritting your own config.yaml file when updating the pipeline. 87 | - Added the possibility of using your own adapters fasta file for trimmomatic. To use it, please refer to the [WIKI](https://github.com/Hoohm/dropSeqPipe/wiki/Create-config-files#filter) 88 | - Added fastqc, multiqc, STAR wrappers. You have now to use the `--use-conda` option to run the pipeline. 89 | - Added cluster recommendations on the wiki. 90 | - Added Localrules for certain rules. 
This allows to run low ressource rules on the host computer instead of nodes when using clusters. 91 | - genomeChrBinNbits will be calculated automacially for STAR. 92 | - Exposed all variables for trimmomatic in config.yaml under trimming. 93 | 94 | ### Removed 95 | - png plots have been removed. It was causing some issues on clusters with cairo. Usability is more important than png plots to me. 96 | 97 | 98 | ## [0.3] 99 | ### Changed 100 | - Complete overhaul of how the pipeline is organized to follow the structure proposed for snakemake-workflows. This will allow ease of deployement on any platform having conda installed. It will also help to run on clusters. 101 | - The way to call the pipeline is now simplified. Changes are shown in the [WIKI](https://github.com/Hoohm/dropSeqPipe/wiki/) 102 | - Dependency to Drop-seq-tools updated from version 1.12 to 1.13 103 | - Full compatibility with barcode whitelist. Makes it easier to use for SCRBseq protocols or whitelist from other source (UMI-tools). 104 | - Modified cell and UMI drop plots in order to reflect the option chosen. See [plots](https://github.com/Hoohm/dropSeqPipe/wiki/Plots) 105 | 106 | ### Removed 107 | - Bulk sequencing compatiblity. 108 | - Fastqc and STAR logs plots are removed and replaced by multiqc. 109 | - Automatic determination of STAMPS via knee_plot. Please use an estimated number of cells as the main threshold and filter in downstream analysis for other parameters such as high number of mitochondrial genes. 110 | - `MinCellFraction` entry in config.yaml. This parameter wasn't adding much value and was confusing. 111 | - Base frequency plot has been removed. This will come back with autodetermination of the STAMPS. 112 | 113 | ### Added 114 | - Wrapper for Drop-seq tools. Makes it easier to switch temp folder and choose maximum memory heap. 115 | - More parameters for STAR exposed. See [WIKI](https://github.com/Hoohm/dropSeqPipe/wiki/) 116 | 117 | ## [0.24] 118 | ### Changed 119 | - All the QCplots are now generated inside the snakefiles. No more `generate-plots` mode. 120 | 121 | 122 | ## [0.23a] 123 | ### Changed 124 | - Will now allow you to run `generate-meta` without having a `config.yaml` file in the reference foder. 125 | - Changed the code for Cell and UMI barcode quality drop (per sample and overall). There was an error in the code not givint the right amount of dropped reads. Updated the images on the wiki accordingly. 126 | - Fixed the setup where r2py was called before getting installed. 127 | - Big change in the mapping. From now on the STAR index will be done without a GTF file. This allows to change the overhang option on the fly for each sample based on the mean read length. This also opens up 2-pass mapping. You will have to regenerate your index for it to work. 128 | - Changed `generate_meta` in order to fit the new STAR index without a GTF. You now have to give the path to the GTF file in the config.yaml 129 | 130 | ### Added 131 | - `min_count_per_umi` in the `config.yaml` to decide how many times a Gene - UMI has to be found to be counted as one. 132 | 133 | 134 | ## [0.23] 135 | ### Changed 136 | - pre_align steps will output a fastq.gz instead of a fastq file. 137 | - `fastqc.R` is now compatible with paired and single end data. 138 | - Changed a few options in `GLOBAL` for `UMI` and `Cell_barcodes` options. Now possible to change filtering settings. See [WIKI](https://github.com/Hoohm/dropSeqPipe/wiki/Create-config-files) 139 | - STAR logs have been stripped of the `STAR` string. 
This is to allow for better compatibility with [multiqc](https://github.com/ewels/MultiQC/) 140 | - Removed `fastqc` folder and moved items to `logs` folder. Grouping all logs files for better [multiqc](https://github.com/ewels/MultiQC/) compatibility. 141 | - Changed `generate_meta` to `generate-meta` for keeping similar syntax between modes. 142 | - Added seperate log files for stats and summary in the DetectBeadSynthesisErrors. 143 | - Moved part of the `README`to the wiki. 144 | - Changed the name of the first expression matrix extracted before the species plot to `unfiltered_expression.` 145 | 146 | 147 | ### Added 148 | - You can now run Bulk Single or paired end RNAseq data. 149 | - Started a wiki with a FAQ 150 | - Added options in `GLOBAL` config.yaml. You can now choose a range of options for UMI and Barcode filtering. please refer to the wiki for more information. 151 | - Support for [MultiQC](https://github.com/ewels/MultiQC/). MultiQC is a great way of summarising all of the logs from your experiment. As of today it supports 46 different modules (such as fastqc, trimmomatic, STAR, etc...) The `generate-plots` mode now produces a `multiqc_report.html` file in the plots folder. 152 | - New plot! BCDrop.pdf is a new plot showing you how many barcode and UMIs you dropped from the raw data before aligning. This helps to track how many samples you might loose because of low quality reads in the barcoding. 153 | 154 | ## [0.22] 155 | ### Changed 156 | - all `subprocess.call` replaced by `shell` from snakemake 157 | - STAR aligner now not limited to 8 cores or threads but will use the maximum number provided in the local.yaml file 158 | - Name from dropSeqPip to dropSeqPipe 159 | - Fixed a bug where all stage1 steps used the same summary file. Now BC tagging, UMI tagging, starting trim and polyA trim have different summary files 160 | - extract-expression now merges all the samples final count matrix into one per run (folder) 161 | - Fixed a bug where the amount of total reads on the knee-plot was overinflated. 162 | - Changed `knee-plot` mode to `generate-plots`. 163 | 164 | ### Added 165 | - Temp files have been added in the pipeline. You can turn this off by using the `--notemp` option 166 | - fastqc mode now available. Generates fastqc reports plus summary plots 167 | - Summary file and plot for fastqc and STAR logs 168 | - Missing R packages should install automatically now. No need to install them beforehand. Report any problem plz 169 | - `GLOBAL` values in the config files are now available. They allow to change UMI and BC ranges as well as mismatches for STAR aligner 170 | - Added a new mode: generate_meta. This allows to create all the metadata files needed for the pipeline. 
You just need a folder with a genome.fa and an annotation.gtf 171 | 172 | ## [0.21] 173 | ### Added 174 | - Changelog file to track changes 175 | - --rerun option to force a rerun 176 | - Multiple steps allowd now 177 | 178 | ## [0.2] - 2017-03-14 179 | ### Changed 180 | - The pipeline is now a python package being called as an executable 181 | - Went from json to yaml for config files 182 | 183 | ### Added 184 | - setup.py and dependencies 185 | - Species plot available 186 | 187 | ### Removed 188 | - primer handling, went to default: AAGCAGTGGTATCAACGCAGAGTAC 189 | 190 | 191 | ## [0.1] - 2017-02-13 192 | ### First release 193 | - Allows for preprocessing, alignement with STAR, post align processing until knee-plot -------------------------------------------------------------------------------- /docs/docs/Clusters.md: -------------------------------------------------------------------------------- 1 | Running on clusters 2 | ---------------------------------- 3 | There is a file in the `templates` called `cluster.yaml`. This can be used to modify ressources needed for your data. I generally recommand moving the file to the root of the folder so that it doesn't get replaced by updates. 4 | 5 | Bellow is an example of running on a cluster using the template file `cluster.yaml` on SLURM. 6 | 7 | ``` 8 | snakemake --cluster 'sbatch -n {cluster.n} -t {cluster.time} --clusters=CLUSTERNAME --output={cluster.output}' --jobs N --cluster-config cluster.yaml --use-conda --local-cores C 9 | ``` 10 | 11 | * N: is the number of jobs you are allowed to run at the same time 12 | * C: is the local-cores of the host machine. A few simple rules are gonna be run locally (not sent to nodes) because they are not that heavy (mostly plotting) 13 | * CLUSTERNAME: the name of the cluster you want to use 14 | 15 | Note: The default path for cluster logs in the cluster.yaml is `logs/cluster/`. If that folder doesn't exist, our cluster can't write and will crash without an error message. -------------------------------------------------------------------------------- /docs/docs/Create-config-files.md: -------------------------------------------------------------------------------- 1 | # Config file and sample file 2 | --------------------------- 3 | 4 | In order to run the pipeline you will need to complete the config.yaml file and the samples.csv file. Both are located in the `templates` folder , should be moved to the root folder of the experiment and filled in for missing entries before running the pipeline. 5 | 6 | The goal for this is to provide the config.yaml when you finally upload the data to a repository for a publication as well as the pipeline version. This provides other users to ability to rerun the processing from scratch exactly as you did. This is possible because snakemake will download and create the exact same environnment for each rule using the envs files provided with the pipeline. 7 | 8 | ## 1. config.yaml - Executables, system and experiment parameters 9 | The config.yaml contains all the necessary parameters and paths for the pipeline. 
10 | ``` 11 | CONTACT: 12 | email: user.name@provider.com 13 | person: John Doe 14 | LOCAL: 15 | temp-directory: /tmp 16 | memory: 4g 17 | raw_data: 18 | results: 19 | META: 20 | species: 21 | mus_musculus: 22 | build: 38 23 | release: 94 24 | homo_sapiens: 25 | build: 38 26 | release: 91 27 | ratio: 0.2 28 | reference-directory: /path/to/references/ 29 | gtf_biotypes: gtf_biotypes.yaml 30 | FILTER: 31 | barcode_whitelist: '' 32 | 5-prime-smart-adapter: AAAAAAAAAAA 33 | cell-barcode: 34 | start: 1 35 | end: 12 36 | UMI-barcode: 37 | start: 13 38 | end: 20 39 | cutadapt: 40 | adapters-file: 'adapters.fa' 41 | R1: 42 | quality-filter: 20 43 | maximum-Ns: 0 44 | extra-params: '' 45 | R2: 46 | quality-filter: 20 47 | minimum-adapters-overlap: 6 48 | minimum-length: 15 49 | extra-params: '' 50 | MAPPING: 51 | STAR: 52 | genomeChrBinNbits: 18 53 | outFilterMismatchNmax: 10 54 | outFilterMismatchNoverLmax: 0.3 55 | outFilterMismatchNoverReadLmax: 1 56 | outFilterMatchNmin: 0 57 | outFilterMatchNminOverLread: 0.66 58 | outFilterScoreMinOverLread: 0.66 59 | EXTRACTION: 60 | LOCUS: 61 | - CODING 62 | - UTR 63 | strand-strategy: SENSE 64 | UMI-edit-distance: 1 65 | minimum-counts-per-UMI: 0 66 | ``` 67 | Please note the "space" after the colon, is needed for the yaml to work. 68 | 69 | ## Subsections 70 | 71 | ### [CONTACT] 72 | * `email` and `person` This is not requested. You can provide the e-mail and name address of the person who processed the data using this configuration. Ideally you should provide the config.yaml with the data repository to allow people to rerun the data using dropSeqPipe. 73 | 74 | ### [LOCAL] 75 | * `temp-directory` is the temp or scratch folder with enough space to keep temporary files. 76 | * `memory` is the maximum memory allocation pool for a Java Virtual Machine. 77 | * `raw_data` is the folder containing all your raw fastq.gz files. 78 | * `results` is the folder that will contain all the results of the pipeline. 79 | 80 | ### [META] 81 | * `species` is where you list the species of your samples. It can be a mixed experiment with two entries. 82 | * `SPECIES_ONE` can be for example: mus_musculus, homo_sapiens, etc... It has to be the name used on ensembl for automatic download to work. 83 | * `build` is the genome build number. 84 | * `release` is the annotation release number. 85 | * `SPECIES_TWO` can be your second species. 86 | * `ratio` is how much "contamination" from another species you allow to validate them as a species or mixed. 0.2 means you allow a maximum of 20% mixing. 87 | * `reference-directory` is where you want to store your references files. 88 | * `gtf_biotypes` is the gtf_biotypes.yaml file containing the selection of biotypes you want to keep for your gene to read attribution. Using less biotypes may decrease your multimapping counts. 89 | 90 | ### [FILTER] 91 | * `barcode_whitelist` is the filename of your whitelist fi you have one. Well plate base protocols often have one. 92 | * `5-prime-smart-adapter` is the 5" smart adapter used in your protocol. 93 | * `cell-barcode and UMI-barcode`: Is the section for cell/umi barcode filtering. 94 | * `start` is the first base position of your cell/umi barcode. 95 | * `end` is the last base position of your cell/umi barcode. 96 | * `cutadapt`: Is the section for trimming. 97 | * `adapters-file` is the file containing your list of adapters as fasta. you can choose between 6 files in the `templates` folder, add any sequence to existing files or provide your own custom one. 
98 | * NexteraPE-PE.fa 99 | * TruSeq2-PE.fa 100 | * TruSeq2-SE.fa 101 | * TruSeq3-PE-2.fa 102 | * TruSeq3-PE.fa 103 | * TruSeq3-SE.fa 104 | Provide the path to the file you want to use for trimming. If you want to add custom sequences or create a complete new one, I would advise to store it in the ROOT folder of the experiment. This will ensure that your custom file will not be overwritten if you update the pipeline. 105 | 106 | Example: `NexteraPE-PE.fa` 107 | * `R1` lists the options for read1 (cell barcode and umi) filtering/trimming 108 | * `quality-filter` is the minimum mean score of the sliding window for quality filtering. 109 | * `maximum-Ns` how many Ns you allow in the cell barcode and umi barcode. By default it is one because we want to be able to collapse barcodes that have one mismatch. 110 | * `extra-params` if you usually add extra paramters to cutadapt, you can do it here. *Only for experienced cutadapt users*. 111 | * `R2` lists the options for read2 (mRNA) filtering/trimming 112 | that have one mismatch. 113 | * `maximum-length` is the maximum length of your mRNA read before alignement. 114 | * `extra-params` if you usually add extra paramters to cutadapt, you can do it here. *Only for experienced cutadapt users*. 115 | For more information about trimming and filtering please visit the [cutadapt](https://cutadapt.readthedocs.io/en/stable/guide.html) website. 116 | 117 | ### [MAPPING] 118 | * `STAR` 119 | * `genomeChrBinNbits` is a value used for index generation in STAR. The formula is min(18,int(log2(genomeLength/referenceNumber))) 120 | * `outFilterMismatchNmax` (default:10) is the maximum number of mismatches allowed. 121 | * `outFilterMismatchNoverLmax` (default:0.3) is the maximum ratio of mismatched bases that mapped. 122 | * `outFilterMismatchNoverReadLmax` (default:1.0) is the maximum ratio of mismatched bases of the whole read. 123 | * `outFilterMatchNmin` (default:0) is the minimum number of matched bases. 124 | * `outFilterMatchNminOverLread` (default:0.66) alignment will be output only if the ratio of matched bases is higher than or equal to this value. 125 | * `outFilterScoreMinOverLread` (default:0.66) alignment will be output only if its ratio score is higher than or equal to this value. 126 | 127 | All of the values for STAR are the default ones. For details about STAR parameters and what they do, please refer to the [STAR manual on git](https://github.com/alexdobin/STAR/tree/master/doc). 128 | 129 | ### [EXTRACTION] 130 | * `LOCUS` are the overlapping regions that reads overlap and are counted in the final expression matrix. Possible values are `CODING`, `UTR`, `INTRON` 131 | * `UMI-edit-distance` This is the maximum manhattan distance between two UMI barcode when extracting count matrices. 132 | * `min-count-per-umi` is the minimum UMI/Gene pair needed to be counted as one. 133 | * `strand-strategy` `SENSE` defines that you only count genes where the forward strand mapped to the forward region on the DNA. Other possibilities are `ANTISENSE` (only count reads that mapped on the opposite strand) or `BOTH` (count all). 134 | 135 | # 2. samples.csv - Samples parameters 136 | This file holds the sample names, expected cell numbers and read length for each sample. 137 | The file has to have this format: 138 | 139 | ``` 140 | samples,expected_cells,read_lengths,batch 141 | sample_name1,500,100,Batch1 142 | sample_name2,500,100,Batch2 143 | ``` 144 | 145 | * `expected_cells` is the amount of cells you expect from your sample. 
146 | * `read_length` is the read length of the mRNA (Read2). This is necessary for STAR index generation 147 | * `batch` is the batch of your sample. If you are added new samples to the same experiment, this is typically a good place to add the main batch. 148 | 149 | `Note:` You can add any other column you wish here, it won't affect the pipeline and you can use it later on in your analysis. 150 | 151 | Finally, you can now [run the pipeline](https://github.com/Hoohm/dropSeqPipe/wiki/Running-dropSeqPipe) 152 | 153 | or 154 | 155 | Create a [custom reference](https://github.com/Hoohm/dropSeqPipe/wiki/Reference-Files) 156 | 157 | -------------------------------------------------------------------------------- /docs/docs/FAQ.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## 1. I get `error='Cannot allocate memory' (errno=12)`, what should I do. [Fixed] 4 | 5 | This has been fixed by using a wrapper exposing the TMPDIR to the pipeline. 6 | 7 | First, be sure that your TMPDIR from the first configuration yaml has at least 100Go. 8 | If you still have problems, you should edit the following files in the Drop-seq_tools-1.12: 9 | 10 | * TagBamWithReadSequenceExtended 11 | * FilterBAM 12 | * TrimStartingSequence 13 | * PolyATrimmer 14 | * TagReadWithGeneExon 15 | * DetectBeadSynthesisErrors 16 | * SingleCellRnaSeqMetricsCollector 17 | * BAMTagHistogram 18 | 19 | In each of those files, the last line should be something like: 20 | `java -Xmx${xmx} -Djava.io.tmpdir=/path/to/temp/folder/ -jar $jar_deploy_dir/dropseq.jar $progname $*` 21 | 22 | You can also use this simple bash script to do it: 23 | Replace `/path/to/temp/folder/` with your temp path and don't forget to use escapes for / 24 | ``` 25 | for f in BAMTagHistogram SingleCellRnaSeqMetricsCollector DetectBeadSynthesisErrors TagReadWithGeneExon PolyATrimmer TrimStartingSequence FilterBAM TagBamWithReadSequenceExtended 26 | do 27 | sed -i 's/java -Xmx${xmx}/java -Xmx${xmx} -Djava.io.tmpdir=/path/to/temp/folder/ /g' $f 28 | done 29 | ``` -------------------------------------------------------------------------------- /docs/docs/Installation.md: -------------------------------------------------------------------------------- 1 | This pipeline is dependent on conda. 2 | 3 | ### Step 1: Download and install miniconda3 4 | First you need to download and install miniconda3: 5 | 6 | for linux 7 | ``` 8 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 9 | bash Miniconda3-latest-Linux-x86_64.sh 10 | ``` 11 | 12 | for mac os 13 | ``` 14 | curl https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o Miniconda3-latest-MacOSX-x86_64.sh 15 | bash Miniconda3-latest-MacOSX-x86_64.sh 16 | ``` 17 | 18 | 19 | ### Step 2: Clone the workflow 20 | 21 | Clone the worflow 22 | ``` 23 | git clone https://github.com/Hoohm/dropSeqPipe.git 24 | ``` 25 | 26 | ### Step 3: Install snakemake 27 | 28 | ``` 29 | conda install -c bioconda -c conda-forge snakemake 30 | ``` 31 | 32 | Next step is config files completion 33 | 34 | [Complete the config.yaml](https://github.com/Hoohm/dropSeqPipe/wiki/Create-config-files) with the missing information 35 | 36 | ### UPDATES: How to update the pipeline 37 | 38 | Go to your experiment folder, then pull. 
39 | ``` 40 | git pull https://github.com/Hoohm/dropSeqPipe.git 41 | ``` 42 | 43 | If you want to update files/plots based on the updates you can use this command: 44 | ``` 45 | snakemake -R `snakemake --list-codes-changes` 46 | ``` 47 | This will update all the files that would be modified by the changes in the code (rules or script). Depending on how much and where the changes have been made, this might rerun the whole pipeline. -------------------------------------------------------------------------------- /docs/docs/Plots.md: -------------------------------------------------------------------------------- 1 | On of the main purpose of this package is getting information about your data to improve your protocol and filter your data for further downstream analysis. 2 | 3 | 4 | Here is a list of plots and reports that you will get from the pipeline. 5 | 6 | Fastqc, STAR and cutadapt reports are generated as [multiqc reports](http://multiqc.info/docs/#using-multiqc-reports) in the reports folder. 7 | 8 | 9 | ## 1. Adapter content 10 | ![Adapter content](images/adapter_content.pdf) 11 | On the x axis are the samples. 12 | On the y axis are the percentages of total adapters that have been found (and trimmed) in respective fastq files based on the `adapter-file` provided via `config.yaml`. 13 | 14 | The top plot is for read1 and the bottom for read2. 15 | 16 | This plot provides an idea of the which adapter has been found and in which proportion in each sample. 17 | 18 | ## 2. Yield (across samples) 19 | ![Yield](images/yield.png) 20 | On the x axis are the samples. 21 | TOP: On the y axis are the number of reads attributed to each category. 22 | BOTTOM: On the y axis are the percentage of attributed to each category. 23 | This plot gives you an overview of all the reads from your samples and how they are distributed in all the possible categories. The reads that are uniquely mapped ar the ones you will keep at the end for the UMI count matrix. 24 | 25 | ## 3. Knee plot (per sample) 26 | ![Knee plot](images/sample1_knee_plot.png) 27 | On the x axis is the cumulative fraction of reads per STAMPS (captured cell). 28 | On the y axis is the ordered STAMPS (based on total reads). 29 | This allows you to determine how much of the reads you actually captured with the number of cells you expected. 30 | The cutting is based on the `expected_cells` parameter in the `samples.csv` file. 31 | The green `selected cells` are the cells that are going to be in the final expression matrix. 32 | If you see a clear bend on the plot that is higher in the number of cells than what you expected, you should increase the `expected_cells` value and rerun the `extract` step. If it is under, I would advise to filter out your data with a downstream analysis tool such as Seurat. 33 | *Note: I advise not to try to discover "real" cells/STAMPS at this stage. I suggest to extract the expected number of cells and filter out later in post-processing with other kind of meta data.* 34 | 35 | 36 | ## 4. RNA metrics (per sample) 37 | ![RNA metrics](images/sample1_rna_metrics.png) 38 | On the x axis are top barcodes based on your `expected_cells` values or the `barcodes.csv` file. 39 | Top plot: On the y axis are the number of bases classified by region of mapping. 40 | Bottom plot: On the y axis are the percentage of bases classified by region of mapping. 41 | This plot gives a lot of different informations. The top plot allows you to quickly compare cells between them in terms of how much has been mapped. 
This can sometimes help identify outliers or bad runs. 42 | The bottom plot allows you to find cells that have an "abnormal" mapped base distribution compared to other cells. 43 | 44 | 45 | 46 | ## 5. Violine plots for barcode properties (across samples) 47 | ![Violine plots](images/mac_violinplots_comparison_UMI.png) 48 | Various statistic for barcodes that were taken forward as STAMPs as set as `expected_cells` in `config.yaml`. 49 | Each point represents a barcode augmented by a violine-plot density estimator of barcode distribution along the y-axis. 50 | 51 | On the x axis are the samples for each panel (Note: the dot distribution along the x-axis does't not bear information, it's just a visual aid to better assess density). 52 | On the y axis are the respecitve statistics described below for each panel. 53 | 54 | TOP panel from left to right: 55 | 56 | - nUMI: number of UMI per barcode 57 | - nCounts: number of Counts per barcode 58 | - top50: fraction (percentage/100) of the highest expressed genes compared to entire set of genes. 59 | 60 | BOTTOM: 61 | 62 | - nUMI: average number of UMI per Gene per barcode 63 | - pct.Ribo: Fraction of ribosomal RNA (Note: ribsomal transcripts defined as starting with "^Rpl") 64 | - pct.mito: Fraction of mitochondrial RNA (Note: mitchondrial transcripts defined as starting with "^mt-") 65 | 66 | ## 6. Saturation plot: UMI per barcode (across samples) 67 | ![umi per barcode](images/mac_UMI_vs_gene.png) 68 | Number of UMI (x-axis) vs number of Genes (y-axis) for each barcode (points in plot) broken down by sample (different colors). 69 | Number of Genes defined as Genes having at least 1 read mapped to them. 70 | Individual samples are color-coded. A loess regression curve of barcodes for each sample is fitted. 71 | Various statistic for barcodes that were taken forward as STAMPs as set as `expected_cells` in `config.yaml`. 72 | 73 | This plot can indicate how many counts per barcode are required on average to find all expressed genes in a cell. 74 | Given enought coverage, it can also indicate how many genes are expressed for the examined cell type. 75 | 76 | ## 7. Saturation plot: Counts per barcode (across samples) 77 | ![counts per barcode](images/mac_Count_vs_gene.png) 78 | Number of Counts (x-axis) vs number of Genes (y-axis) for each barcode (points in plot) broken down by sample (different colors). 79 | Number of Genes defined as Genes having at least 1 read mapped to them. 80 | Individual samples are color-coded. A loess regression curve of barcodes for each sample is fitted. 81 | Various statistic for barcodes that were taken forward as STAMPs as set as `expected_cells` in `config.yaml`. 82 | 83 | ## 8. Counts per UMI per barcode (across samples) 84 | ![counts per UMI](images/mac_UMI_vs_counts.png) 85 | Number of UMI (x-axis) vs number of Counts (y-axis) for each barcode (points in plot) broken down by sample (different colors). 86 | Individual samples are color-coded. A loess regression curve of barcodes for each sample is fitted. 87 | Black line indicate an optimal 1:1 ratio between UMI and Counts (i.e. no Duplicates!) 88 | 89 | This plots can give an indication on the level of duplication for each sample. The close to black line the lower duplication. 90 | 91 | # Mixed experiment 92 | 93 | ## 9. Barnyard plot (per sample) 94 | ![Barnyard plot](images/hum_mus_species_plot_transcripts.png) 95 | This plot shows you species purity for each STAMPS. Mixed and No call STAMPS are dropped and only single species are kept for extraction. 
-------------------------------------------------------------------------------- /docs/docs/Reference-Files.md: -------------------------------------------------------------------------------- 1 | Reference files 2 | ----------------- 3 | From version 0.4 on, reference files are automatically downloaded by the pipeline. Mixed references are also downloaded and merged automatically. Since you may still want to use your own reference, you can bypass the download by providing your own `genome.fa` and `annotation.gtf` files. 4 | 5 | Snakemake generates files based on paths. If you want to use a custom reference, you have to name it properly for snakemake to find it. 6 | 7 | Here is an example: 8 | 9 | Let's assume this is your configuration for the META section: 10 | ``` 11 | META: 12 | species: 13 | funky_species_name: 14 | build: A 15 | release: 1 16 | ratio: 0.2 17 | reference-directory: /absolute/path/to/references 18 | gtf_biotypes: gtf_biotypes.yaml 19 | ``` 20 | 21 | You then need to provide the following files: 22 | 23 | ``` 24 | /absolute/path/to/references/funky_species_name_A_1/genome.fa 25 | /absolute/path/to/references/funky_species_name_A_1/annotation.gtf 26 | ``` 27 | 28 | This will stop dropSeqPipe from downloading a new reference. 29 | 30 | 31 | Once the pipeline has run completely, the folder will look like this: 32 | 33 | ``` 34 | genome.fa 35 | annotation.gtf 36 | annotation.refFlat 37 | annotation_reduced.gtf 38 | genome.consensus_introns.intervals 39 | genome.dict 40 | genome.exons.intervals 41 | genome.genes.intervals 42 | genome.intergenic.intervals 43 | genome.rRNA.intervals 44 | STAR_INDEX/SA_read_length/ 45 | ``` 46 | 47 | Note: The STAR index will be built based on the read length of your mRNA read (Read2). 48 | If you have different read lengths, the pipeline will produce one index per length. 49 | 
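For a rough idea of how those indexes are parameterized, here is a minimal Python sketch of the two STAR parameters derived in `rules/generate_meta.smk`; the genome size and sequence count below are made-up placeholders.
```
from math import log2

# One STAR index is built per distinct 'read_length' in samples.csv,
# with the splice junction overhang set to read length minus one.
read_length = 75
sjdbOverhang = read_length - 1

# genomeChrBinNbits is capped at 18 (the value in config.yaml) and is
# otherwise derived from the genome size and the number of sequences.
genome_length = 2_730_871_774  # placeholder: size of genome.fa in bytes
n_sequences = 239              # placeholder: number of '>' headers
genomeChrBinNbits = min(18, int(log2(genome_length / n_sequences)))

print(f"--sjdbOverhang {sjdbOverhang} --genomeChrBinNbits {genomeChrBinNbits}")
```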
50 | Finally, you can now [run the pipeline](https://github.com/Hoohm/dropSeqPipe/wiki/Running-dropSeqPipe) -------------------------------------------------------------------------------- /docs/docs/Running-dropSeqPipe.md: -------------------------------------------------------------------------------- 1 | Example 2 | ----------------------- 3 | The pipeline is cloned once and can then be run on any folder containing the configuration files and your raw data. The working directory can contain multiple runs (aka batches), since you can easily add new samples when receiving new data and rerun the same commands. This will simply run the pipeline on the newly added data and recreate the reports as well as the plots containing all the samples. 4 | 5 | Example: You run 2 biological conditions with 2 replicates. This makes 4 samples. Assume a simple dropseq protocol with only human cells. 6 | 1. You sequence the data, receive the 8 files (two files per sample) and download the pipeline 7 | 2. You run the pipeline with the command: `snakemake --use-conda --cores N --directory WORKING_DIR`. `N` being the number of cores available and `WORKING_DIR` being the folder containing your `config.yaml`, `samples.csv`, adapter file and `gtf_biotypes.yaml`. 8 | 3. You see that there is an issue with the protocol and you modify it 9 | 4. You create a new set of libraries and sequence them (same 2x2 design) 10 | 5. You add the new files to the data folder of `WORKING_DIR` and edit `samples.csv` to add the missing samples. 11 | 6. You run the pipeline as you did the first time: `snakemake --use-conda --cores N --directory WORKING_DIR` 12 | 7. This will run the new samples only and recreate the reports as well as the yield plots. 13 | 8. It is now easy to compare the impact of the change in your protocol 14 | 15 | Working dir folder preparation 16 | ---------------- 17 | The raw data from the sequencer should be stored in the `RAW_DATA` folder of the `WORKING_DIR` folder like this: 18 | ``` 19 | /path/to/your/WORKING_DIR/ 20 | | -- RAW_DATA/ 21 | | -- -- sample1_R1.fastq.gz 22 | | -- -- sample1_R2.fastq.gz 23 | | -- -- sample2_R1.fastq.gz 24 | | -- -- sample2_R2.fastq.gz 25 | | samples.csv 26 | | config.yaml 27 | | barcodes.csv 28 | | adapters.fa 29 | ``` 30 | *Note: In DropSeq or ScrbSeq you expect paired-end sequencing. R1 will hold your barcode and UMI information, R2 will hold the 3' end of the captured mRNA.* 31 | 32 | 33 | Once everything is in place, you can run the pipeline using the normal snakemake commands. 34 | 35 | Running the pipeline (TLDR version) 36 | ---------------------------- 37 | 38 | For a simple single cell run you only need to run: `snakemake --cores N --use-conda --directory WORKING_DIR` 39 | This will run the whole pipeline, using the `N` cores you gave it. 40 | 41 | 42 | Running the pipeline 43 | --------------------------------- 44 | 45 | I highly recommend taking a [look at the options](http://snakemake.readthedocs.io/en/latest/) that are available, since I won't cover everything here. 46 | 47 | 48 | Modes 49 | ------------------------------ 50 | You have two main ways to run the pipeline. 51 | 52 | You can either just run `snakemake --use-conda --directory WORKING_DIR` in the root folder containing your experiment and it will run everything without stopping. 53 | 54 | You can also run each step separately. The main advantage of the second way is that you are able to fine-tune your parameters based on the results of fastqc, filtering, mapping quality, etc... 55 | I would suggest using the second approach when you work on a new protocol and the first one when you are confident about your parameters. 56 | 57 | There are seven different modes available; to run one specifically, you call it by name. 58 | 59 | Example: To run the `qc` mode: `snakemake --cores 8 qc --use-conda --directory WORKING_DIR` 60 | You can also run multiple modes at the same time if you want: `snakemake --cores 8 qc filter --use-conda --directory WORKING_DIR` 61 | 62 | ### Single species: 63 | * `meta`: Downloads and generates all the subsequent reference files and the STAR index needed to run the pipeline. You can run this alone if you just want to create the meta-data files before running a new set of data. 64 | * `qc`: Creates fastqc reports of your data. 65 | * `filter`: Goes from sample_R1.fastq.gz to sample_filtered.fastq.gz, ready to be mapped to the genome. This step filters out low quality reads and trims the adapters you provided in the FILTER section. 66 | * `map`: Goes from sample_filtered.fastq.gz to the sample_final.bam, ready for extracting the expression data. This maps the data to the genome. 67 | * `extract`: Extracts the expression data. You'll get a UMI and a count expression matrix for your whole experiment. 68 | 69 | ### Mixed species 70 | Since v`0.4` the pipeline detects mixed experiments on the fly. Simply run `snakemake --directory WORKING_DIR --use-conda`. 
The stepwise approach is not available for mixed experiments. 71 | 72 | Barcode whitelist 73 | --------------------- 74 | In protocols such as SCRBseq, the expected barcode sequences are known. This pipeline therefore also allows the use of known barcodes instead of a number of expected cells. 75 | In order to use this functionality, you just need to add a barcode whitelist file and provide its name in the configuration, in this section: 76 | 77 | ``` 78 | FILTER: 79 | barcode-whitelist: name_of_your_whitelist_file 80 | ``` 81 | The file should be in the WORKING_DIR. Run the pipeline as usual. 82 | 83 | Advanced options 84 | ------------------- 85 | If you have specific adapters that are not among the defaults in the `templates` folder, you can add whatever adapters you want to trim (as many as you need), following the fasta syntax. 86 | 87 | ``` 88 | FILTER: 89 | cutadapt: 90 | adapters-file: name_of_your_adapter_file.fa 91 | ``` 92 | 93 | Further options 94 | --------------------- 95 | * `--cores N` Use this argument to use up to `N` of the available cores. 96 | * `--notemp` Use this to keep all the temporary files. Without this option, intermediate files are deleted once the steps that need them have run. Use this option if you are troubleshooting the pipeline or you want to analyze the intermediate files yourself. 97 | * `--dryrun` or `-n` Use this to check what is going to be run by your command. This is nice for spotting potentially missing files. 98 | 99 | 100 | 101 | Folder Structure 102 | ----------------------- 103 | This is the folder structure you get in the end: 104 | ``` 105 | /path/to/your/WORKING_DIR/ 106 | | -- RAW_DATA/ 107 | | -- RESULT_DIR/ 108 | | -- -- logs/ 109 | | -- -- -- cluster/ 110 | | -- -- plots/ 111 | | -- -- reports/ 112 | | -- -- summary/ 113 | | -- -- samples/ 114 | | samples.csv 115 | | config.yaml 116 | | barcodes.csv 117 | | adapters.fa 118 | | .snakemake/ 119 | ``` 120 | 121 | * `RAW_DATA/` Contains all your samples as well as the intermediate files 122 | * `RESULT_DIR/logs/` Contains all the logfiles generated by the pipeline 123 | * `RESULT_DIR/logs/cluster` Contains all the logfiles generated by the cluster 124 | * `RESULT_DIR/plots/` Contains all the plots generated by the pipeline 125 | * `RESULT_DIR/reports/` Contains all the reports generated by the pipeline 126 | * `RESULT_DIR/summary/` Contains all the files you might use for downstream analysis (the barcodes selected per sample and per species, and the final umi/counts expression matrices) 127 | * `RESULT_DIR/samples/` Contains all the sample specific files: bam files, barcodes used, single sample expression files, etc... 128 | * `samples.csv` File containing the sample details 129 | * `config.yaml` File containing the pipeline parameters as well as the system parameters 130 | * `adapters.fa` File containing all the adapters you wish to trim from the raw data. 131 | * `.snakemake/` Folder that contains all the environments created for the run, as well as a lot of other things. 
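Since `RESULT_DIR/summary/` stores the merged expression matrices in matrix-market format, loading them for downstream analysis is straightforward. Here is a minimal Python sketch; it assumes `RESULT_DIR` is named `results`, picks the `umi` matrix, and the genes-by-barcodes orientation is an assumption.
```
import pandas as pd
from scipy.io import mmread

# Load the merged UMI expression matrix written by rules/merge.smk.
mtx = mmread('results/summary/umi/matrix.mtx').tocsc()
genes = pd.read_csv('results/summary/umi/genes.tsv', header=None)[0]
barcodes = pd.read_csv('results/summary/umi/barcodes.tsv', header=None)[0]

print(f"{len(genes)} genes x {len(barcodes)} barcodes, "
      f"{mtx.nnz} non-zero entries")
```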
-------------------------------------------------------------------------------- /docs/docs/images/adapter_content.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/adapter_content.png -------------------------------------------------------------------------------- /docs/docs/images/hum_mus_species_plot_transcripts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/hum_mus_species_plot_transcripts.png -------------------------------------------------------------------------------- /docs/docs/images/mac_Count_vs_gene.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/mac_Count_vs_gene.png -------------------------------------------------------------------------------- /docs/docs/images/mac_UMI_vs_counts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/mac_UMI_vs_counts.png -------------------------------------------------------------------------------- /docs/docs/images/mac_UMI_vs_gene.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/mac_UMI_vs_gene.png -------------------------------------------------------------------------------- /docs/docs/images/mac_violinplots_comparison_UMI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/mac_violinplots_comparison_UMI.png -------------------------------------------------------------------------------- /docs/docs/images/sample1_knee_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/sample1_knee_plot.png -------------------------------------------------------------------------------- /docs/docs/images/sample1_rna_metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/sample1_rna_metrics.png -------------------------------------------------------------------------------- /docs/docs/images/yield.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hoohm/dropSeqPipe/4927d6e60e9fcac3516db3df48c67c3c99af7999/docs/docs/images/yield.png -------------------------------------------------------------------------------- /docs/docs/index.md: -------------------------------------------------------------------------------- 1 | Welcome 2 | ------------------------------ 3 | 4 | Welcome to the documentation of dropSeqPipe v`0.4`. 
-------------------------------------------------------------------------------- /docs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: dropSeqPipe 2 | theme: readthedocs 3 | repo_name: 'GitHub' 4 | repo_url: https://github.com/Hoohm/dropSeqPipe 5 | nav: 6 | - 'index.md' 7 | - 'Installation.md' 8 | - 'Reference-Files.md' 9 | - 'Create-config-files.md' 10 | - 'Running-dropSeqPipe.md' 11 | - 'Clusters.md' 12 | - 'Plots.md' 13 | - 'CHANGELOG.md' 14 | - 'FAQ.md' 15 | 16 | google_analytics: 17 | - 'UA-128943644-1' 18 | - 'auto' -------------------------------------------------------------------------------- /docs/mkdocs_env.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - defaults 4 | dependencies: 5 | - bzip2=1.0.6=h470a237_2 6 | - ca-certificates=2018.8.24=ha4d7672_0 7 | - certifi=2018.8.24=py37_1001 8 | - click=7.0=py_0 9 | - jinja2=2.10=py_1 10 | - libffi=3.2.1=hfc679d8_5 11 | - libgcc-ng=7.2.0=hdf63c60_3 12 | - libstdcxx-ng=7.2.0=hdf63c60_3 13 | - livereload=2.5.2=py_0 14 | - markdown=2.6.11=py_0 15 | - markupsafe=1.0=py37h470a237_1 16 | - mkdocs=1.0.4=py_0 17 | - ncurses=6.1=hfc679d8_1 18 | - openssl=1.0.2p=h470a237_0 19 | - pip=18.1=py37_1000 20 | - python=3.7.0=h5001a0f_4 21 | - python-markdown-math=0.6=py_0 22 | - pyyaml=3.13=py37h470a237_1 23 | - readline=7.0=haf1bffa_1 24 | - setuptools=40.4.3=py37_0 25 | - six=1.11.0=py37_1001 26 | - sqlite=3.25.2=hb1c47c0_0 27 | - tk=8.6.8=ha92aebf_0 28 | - tornado=5.1.1=py37h470a237_0 29 | - wheel=0.32.1=py37_0 30 | - xz=5.2.4=h470a237_1 31 | - yaml=0.1.7=h470a237_1 32 | - zlib=1.2.11=h470a237_3 33 | 34 | -------------------------------------------------------------------------------- /envs/bbmap.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - bbmap=38.22 5 | -------------------------------------------------------------------------------- /envs/cutadapt.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python>=3.3 7 | - cutadapt=1.16 8 | -------------------------------------------------------------------------------- /envs/dropseq_tools.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - anaconda 4 | - conda-forge 5 | dependencies: 6 | - dropseq_tools=2.0.0 7 | - font-ttf-dejavu-sans-mono=2.37 8 | - fontconfig=2.13.1 9 | -------------------------------------------------------------------------------- /envs/merge.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - r 3 | - conda-forge 4 | dependencies: 5 | - r=3.4.1 6 | - readline=6.2 7 | - r-matrix=1.2_14 8 | -------------------------------------------------------------------------------- /envs/merge_bam.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - pysam=0.15.1 5 | - biopython=1.72 6 | - python>=3.6 7 | -------------------------------------------------------------------------------- /envs/merge_long.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - pandas=0.25.1 5 | -------------------------------------------------------------------------------- 
/envs/picard.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - anaconda 4 | - conda-forge 5 | dependencies: 6 | - picard=2.14.1.0 7 | - font-ttf-dejavu-sans-mono=2.37 8 | - fontconfig=2.13.1 9 | 10 | -------------------------------------------------------------------------------- /envs/pigz.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - anaconda 3 | dependencies: 4 | - pigz=2.4 5 | -------------------------------------------------------------------------------- /envs/r.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - r=3.4.1 6 | - r-ggplot2=2.2.1 7 | - r-gridextra 8 | - r-viridis 9 | - r-stringdist 10 | - r-dplyr=0.7.6 11 | - r-mvtnorm 12 | - r-seurat=2 13 | - r-hmisc 14 | - r-tidyverse 15 | - r-devtools 16 | - r-rcolorbrewer 17 | - font-ttf-dejavu-sans-mono=2.37 18 | - fontconfig=2.13.1 19 | -------------------------------------------------------------------------------- /envs/samtools.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - conda-forge 4 | dependencies: 5 | - samtools=1.9 6 | - ncurses=6.1 7 | -------------------------------------------------------------------------------- /envs/star.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - star=2.6.1b 5 | -------------------------------------------------------------------------------- /envs/umi_tools.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - umi_tools=0.5.5 5 | - scipy=1.1.0 -------------------------------------------------------------------------------- /envs/velocyto.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - numpy 6 | - scipy 7 | - cython 8 | - numba 9 | - matplotlib 10 | - scikit-learn 11 | - h5py 12 | - click 13 | - pip: 14 | - velocyto -------------------------------------------------------------------------------- /rules/cell_barcodes.smk: -------------------------------------------------------------------------------- 1 | 2 | 3 | ruleorder: extend_barcode_whitelist > extend_barcode_top 4 | ruleorder: extend_barcode_whitelist > get_cell_whitelist 5 | 6 | 7 | localrules: 8 | get_cell_whitelist, 9 | extend_barcode_top 10 | 11 | rule extend_barcode_whitelist: 12 | input: 13 | whitelist=barcode_whitelist 14 | output: 15 | barcodes='{results_dir}/samples/{sample}/barcodes.csv', 16 | barcode_ref='{results_dir}/samples/{sample}/barcode_ref.pkl', 17 | barcode_ext_ref='{results_dir}/samples/{sample}/barcode_ext_ref.pkl', 18 | barcode_mapping='{results_dir}/samples/{sample}/empty_barcode_mapping.pkl' 19 | script: 20 | '../scripts/generate_extended_ref.py' 21 | 22 | rule get_top_barcodes: 23 | input: 24 | '{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz' 25 | output: 26 | '{results_dir}/samples/{sample}/top_barcodes.csv' 27 | conda: '../envs/umi_tools.yaml' 28 | params: 29 | cell_barcode_length=(config['FILTER']['cell-barcode']['end'] - config['FILTER']['cell-barcode']['start'] + 1), 30 | umi_barcode_length=(config['FILTER']['UMI-barcode']['end'] - config['FILTER']['UMI-barcode']['start'] + 1), 31 | num_cells=lambda 
wildcards: round(int(samples.loc[wildcards.sample,'expected_cells'])*1.2), 32 | shell: 33 | """umi_tools whitelist\ 34 | --stdin {input}\ 35 | --bc-pattern='(?P<cell_1>.{{{params.cell_barcode_length}}})(?P<umi_1>.{{{params.umi_barcode_length}}})'\ 36 | --extract-method=regex\ 37 | --set-cell-number={params.num_cells}\ 38 | --log2stderr > {output}""" 39 | 40 | rule get_cell_whitelist: 41 | input: 42 | '{results_dir}/samples/{sample}/top_barcodes.csv' 43 | output: 44 | '{results_dir}/samples/{sample}/barcodes.csv' 45 | shell: 46 | """cat {input} | cut -f 1 > {output}""" 47 | 48 | 49 | rule extend_barcode_top: 50 | input: 51 | whitelist='{results_dir}/samples/{sample}/top_barcodes.csv' 52 | output: 53 | barcode_ref='{results_dir}/samples/{sample}/barcode_ref.pkl', 54 | barcode_ext_ref='{results_dir}/samples/{sample}/barcode_ext_ref.pkl', 55 | barcode_mapping='{results_dir}/samples/{sample}/empty_barcode_mapping.pkl' 56 | script: 57 | '../scripts/umi_tools_extended_ref.py' 58 | 59 | 60 | rule repair_barcodes: 61 | input: 62 | bam='{results_dir}/samples/{sample}/Aligned.merged.bam', 63 | barcode_ref='{results_dir}/samples/{sample}/barcode_ref.pkl', 64 | barcode_ext_ref='{results_dir}/samples/{sample}/barcode_ext_ref.pkl', 65 | barcode_mapping='{results_dir}/samples/{sample}/empty_barcode_mapping.pkl' 66 | conda: '../envs/merge_bam.yaml' 67 | output: 68 | bam=temp('{results_dir}/samples/{sample}/Aligned.repaired.bam'), 69 | barcode_mapping_counts='{results_dir}/samples/{sample}/barcode_mapping_counts.pkl' 70 | script: 71 | '../scripts/repair_barcodes.py' -------------------------------------------------------------------------------- /rules/download_meta_mixed.smk: -------------------------------------------------------------------------------- 1 | from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider 2 | FTP = FTPRemoteProvider() 3 | 4 | localrules: 5 | download_annotation, 6 | download_genome, 7 | rename_genome, 8 | merge_genomes, 9 | merge_annotations 10 | 11 | def get_annotation(wildcards): 12 | return FTP.remote("ftp.ensembl.org/pub/release-{0}/gtf/{1}/{2}.GRC{3}{4}.{0}.gtf.gz".format( 13 | wildcards.release, 14 | wildcards.species, 15 | wildcards.species.capitalize(), 16 | wildcards.species.lower()[0], 17 | wildcards.build), 18 | static=True, 19 | keep_local=True) 20 | 21 | def get_genome(wildcards): 22 | return FTP.remote("ftp.ensembl.org/pub/release-{0}/fasta/{1}/dna/{2}.GRC{3}{4}.dna.primary_assembly.fa.gz".format( 23 | wildcards.release, 24 | wildcards.species, 25 | wildcards.species.capitalize(), 26 | wildcards.species.lower()[0], 27 | wildcards.build), 28 | static=True, 29 | keep_local=True) 30 | 31 | rule download_annotation: 32 | input: 33 | get_annotation 34 | output: 35 | "{ref_path}/{species}_{build}_{release}/annotation.gtf" 36 | shell: 37 | "gunzip -c -d {input} > {output}" 38 | 39 | rule download_genome: 40 | input: 41 | get_genome 42 | output: 43 | "{ref_path}/{species}_{build}_{release}/genome.fa" 44 | shell: 45 | "gunzip -d -c {input} > {output}" 46 | 47 | 48 | rule rename_genome: 49 | input: 50 | "{ref_path}/{species}_{build}_{release}/genome.fa" 51 | output: 52 | temp("{ref_path}/{species}_{build}_{release}/renamed_genome.fa") 53 | params: 54 | species= lambda wildcards: wildcards.species 55 | shell: 56 | """sed -e 's/>/>{params.species}/g' {input} > {output}""" 57 | 58 | 59 | rule merge_genomes: 60 | input: 61 | genome1=expand("{ref_path}/{species}_{build}_{release}/renamed_genome.fa", 62 | species=species_list[0], 63 | build=build_list[0], 64 | release=release_list[0], 
65 | ref_path=config['META']['reference-directory']), 66 | genome2=expand("{ref_path}/{species}_{build}_{release}/renamed_genome.fa", 67 | species=species_list[1], 68 | build=build_list[1], 69 | release=release_list[1], 70 | ref_path=config['META']['reference-directory']) 71 | output: 72 | "{}/{}_{}_{}/genome.fa".format( 73 | config['META']['reference-directory'], 74 | species, 75 | build, 76 | release) 77 | shell: 78 | """cat {input.genome1} {input.genome2} > {output}""" 79 | 80 | rule merge_annotations: 81 | input: 82 | annotation1=expand("{ref_path}/{species}_{build}_{release}/annotation.gtf", 83 | species=species_list[0], 84 | build=build_list[0], 85 | release=release_list[0], 86 | ref_path=config['META']['reference-directory']), 87 | annotation2=expand("{ref_path}/{species}_{build}_{release}/annotation.gtf", 88 | species=species_list[1], 89 | build=build_list[1], 90 | release=release_list[1], 91 | ref_path=config['META']['reference-directory']), 92 | output: 93 | "{}/{}_{}_{}/annotation.gtf".format( 94 | config['META']['reference-directory'], 95 | species, 96 | build, 97 | release) 98 | params: 99 | build_list=build_list, 100 | release_list=release_list, 101 | species_list=species_list 102 | run: 103 | import datetime 104 | import re 105 | header1="#!Mixed reference of {} and {}\n".format( 106 | species_list[0], 107 | species_list[1]) 108 | header2="#!genome-builds GRC{}{} GRC{}{}\n".format( 109 | species_list[0].lower()[0], 110 | build_list[0], 111 | species_list[1].lower()[0], 112 | build_list[1]) 113 | header3="#!genome-releases {} {}\n".format( 114 | release_list[0], 115 | release_list[1]) 116 | header4="#!genome-date {}\n".format(str(datetime.date.today())) 117 | header=[header1,header2,header3,header4] 118 | with open(input.annotation1[0]) as annotation1: 119 | with open(input.annotation2[0]) as annotation2: 120 | with open(output[0], 'w') as outfile: 121 | outfile.writelines(header) 122 | for line in annotation1: 123 | if(not line.startswith('#!')): 124 | outfile.write(re.sub('^',species_list[0],line)) 125 | for line in annotation2: 126 | if(not line.startswith('#!')): 127 | outfile.write(re.sub('^',species_list[1],line)) 128 | 129 | 130 | -------------------------------------------------------------------------------- /rules/download_meta_single.smk: -------------------------------------------------------------------------------- 1 | from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider 2 | FTP = FTPRemoteProvider() 3 | 4 | localrules: 5 | download_annotation, 6 | download_genome 7 | 8 | def get_annotation(wildcards): 9 | return FTP.remote("ftp.ensembl.org/pub/release-{0}/gtf/{1}/{2}.GRC{3}{4}.{0}.gtf.gz".format( 10 | wildcards.release, 11 | wildcards.species, 12 | wildcards.species.capitalize(), 13 | wildcards.species.lower()[0], 14 | wildcards.build), 15 | static=True, 16 | keep_local=True) 17 | 18 | def get_genome(wildcards): 19 | return FTP.remote("ftp.ensembl.org/pub/release-{0}/fasta/{1}/dna/{2}.GRC{3}{4}.dna.primary_assembly.fa.gz".format( 20 | wildcards.release, 21 | wildcards.species, 22 | wildcards.species.capitalize(), 23 | wildcards.species.lower()[0], 24 | wildcards.build), 25 | static=True, 26 | keep_local=True) 27 | 28 | rule download_annotation: 29 | input: 30 | get_annotation 31 | output: 32 | "{ref_path}/{species}_{build}_{release}/annotation.gtf" 33 | shell: 34 | "gunzip -c -d {input} > {output}" 35 | 36 | rule download_genome: 37 | input: 38 | get_genome 39 | output: 40 | "{ref_path}/{species}_{build}_{release}/genome.fa" 41 | shell: 42 | 
"gunzip -d -c {input} > {output}" -------------------------------------------------------------------------------- /rules/extract_expression_single.smk: -------------------------------------------------------------------------------- 1 | """Extract expression fof single species""" 2 | 3 | #Which rules will be run on the host computer and not sent to nodes 4 | localrules: 5 | plot_rna_metrics, 6 | convert_long_to_mtx, 7 | compress_mtx 8 | 9 | rule extract_umi_expression: 10 | input: 11 | data='{results_dir}/samples/{sample}/final.bam', 12 | barcode_whitelist='{results_dir}/samples/{sample}/barcodes.csv' 13 | output: 14 | long='{results_dir}/samples/{sample}/umi/expression.long', 15 | dense=temp('{results_dir}/samples/{sample}/umi/expression.tsv') 16 | params: 17 | count_per_umi=config['EXTRACTION']['minimum-counts-per-UMI'], 18 | num_cells=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells']), 19 | umiBarcodeEditDistance=config['EXTRACTION']['UMI-edit-distance'], 20 | temp_directory=config['LOCAL']['temp-directory'], 21 | memory=config['LOCAL']['memory'], 22 | locus_list=','.join(config['EXTRACTION']['LOCUS']), 23 | strand_strategy=config['EXTRACTION']['strand-strategy'] 24 | conda: '../envs/dropseq_tools.yaml' 25 | shell: 26 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && DigitalExpression -m {params.memory}\ 27 | I={input.data}\ 28 | O={output.dense}\ 29 | EDIT_DISTANCE={params.umiBarcodeEditDistance}\ 30 | OUTPUT_LONG_FORMAT={output.long}\ 31 | STRAND_STRATEGY={params.strand_strategy}\ 32 | OUTPUT_READS_INSTEAD=false\ 33 | LOCUS_FUNCTION_LIST={{{params.locus_list}}}\ 34 | MIN_BC_READ_THRESHOLD={params.count_per_umi}\ 35 | CELL_BC_FILE={input.barcode_whitelist}""" 36 | 37 | rule extract_reads_expression: 38 | input: 39 | data='{results_dir}/samples/{sample}/final.bam', 40 | barcode_whitelist='{results_dir}/samples/{sample}/barcodes.csv' 41 | output: 42 | long=temp('{results_dir}/samples/{sample}/read/expression.long'), 43 | dense=temp('{results_dir}/samples/{sample}/read/expression.tsv') 44 | params: 45 | count_per_umi=config['EXTRACTION']['minimum-counts-per-UMI'], 46 | num_cells=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells']), 47 | umiBarcodeEditDistance=config['EXTRACTION']['UMI-edit-distance'], 48 | temp_directory=config['LOCAL']['temp-directory'], 49 | memory=config['LOCAL']['memory'], 50 | locus_list=','.join(config['EXTRACTION']['LOCUS']), 51 | strand_strategy=config['EXTRACTION']['strand-strategy'] 52 | conda: '../envs/dropseq_tools.yaml' 53 | shell: 54 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && DigitalExpression -m {params.memory}\ 55 | I={input.data}\ 56 | O={output.dense}\ 57 | EDIT_DISTANCE={params.umiBarcodeEditDistance}\ 58 | OUTPUT_LONG_FORMAT={output.long}\ 59 | STRAND_STRATEGY={params.strand_strategy}\ 60 | OUTPUT_READS_INSTEAD=true\ 61 | LOCUS_FUNCTION_LIST={{{params.locus_list}}}\ 62 | MIN_BC_READ_THRESHOLD={params.count_per_umi}\ 63 | CELL_BC_FILE={input.barcode_whitelist}""" 64 | 65 | 66 | rule SingleCellRnaSeqMetricsCollector: 67 | input: 68 | data='{results_dir}/samples/{sample}/final.bam', 69 | barcode_whitelist='{results_dir}/samples/{sample}/barcodes.csv', 70 | refFlat=expand("{ref_path}/{species}_{build}_{release}/curated_annotation.refFlat", 71 | ref_path=config['META']['reference-directory'], 72 | species=species, 73 | release=release, 74 | build=build), 75 | rRNA_intervals=expand("{ref_path}/{species}_{build}_{release}/annotation.rRNA.intervals", 76 | 
ref_path=config['META']['reference-directory'], 77 | species=species, 78 | release=release, 79 | build=build) 80 | params: 81 | temp_directory=config['LOCAL']['temp-directory'], 82 | memory=config['LOCAL']['memory'] 83 | output: 84 | rna_metrics='{results_dir}/logs/dropseq_tools/{sample}_rna_metrics.txt', 85 | conda: '../envs/dropseq_tools.yaml' 86 | shell: 87 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && SingleCellRnaSeqMetricsCollector -m {params.memory}\ 88 | INPUT={input.data}\ 89 | OUTPUT={output}\ 90 | ANNOTATIONS_FILE={input.refFlat}\ 91 | CELL_BC_FILE={input.barcode_whitelist}\ 92 | RIBOSOMAL_INTERVALS={input.rRNA_intervals} 93 | """ 94 | 95 | rule plot_rna_metrics: 96 | input: 97 | rna_metrics='{results_dir}/logs/dropseq_tools/{sample}_rna_metrics.txt', 98 | barcodes='{results_dir}/samples/{sample}/barcodes.csv' 99 | conda: '../envs/r.yaml' 100 | output: 101 | pdf='{results_dir}/plots/rna_metrics/{sample}_rna_metrics.pdf' 102 | script: 103 | '../scripts/plot_rna_metrics.R' 104 | 105 | 106 | rule convert_long_to_mtx: 107 | input: 108 | '{results_dir}/samples/{sample}/{type}/expression.long' 109 | output: 110 | barcodes='{results_dir}/samples/{sample}/{type}/barcodes.tsv', 111 | features='{results_dir}/samples/{sample}/{type}/genes.tsv', 112 | mtx='{results_dir}/samples/{sample}/{type}/matrix.mtx' 113 | params: 114 | samples=lambda wildcards: wildcards.sample 115 | script: 116 | "../scripts/convert_mtx.py" 117 | 118 | rule compress_mtx: 119 | input: 120 | barcodes='{results_dir}/samples/{sample}/{type}/barcodes.tsv', 121 | features='{results_dir}/samples/{sample}/{type}/genes.tsv', 122 | mtx='{results_dir}/samples/{sample}/{type}/matrix.mtx' 123 | output: 124 | barcodes='{results_dir}/samples/{sample}/{type}/barcodes.tsv.gz', 125 | features='{results_dir}/samples/{sample}/{type}/genes.tsv.gz', 126 | mtx='{results_dir}/samples/{sample}/{type}/matrix.mtx.gz' 127 | conda: '../envs/pigz.yaml' 128 | threads: 3 129 | shell: 130 | """pigz -p {threads} {input.barcodes} {input.features} {input.mtx}""" -------------------------------------------------------------------------------- /rules/extract_expression_species.smk: -------------------------------------------------------------------------------- 1 | """Extract expression for mixed species""" 2 | 3 | #Which rules will be run on the host computer and not sent to nodes 4 | localrules: 5 | plot_rna_metrics_species, 6 | convert_long_to_mtx_species 7 | 8 | rule extract_umi_expression_species: 9 | input: 10 | data='{results_dir}/samples/{sample}/{species}/unfiltered.bam', 11 | barcode_whitelist='{results_dir}/samples/{sample}/{species}/barcodes.csv' 12 | output: 13 | dense=temp('{results_dir}/samples/{sample}/{species}/umi/expression.txt'), 14 | long=temp('{results_dir}/samples/{sample}/{species}/umi/expression.long') 15 | params: 16 | count_per_umi=config['EXTRACTION']['minimum-counts-per-UMI'], 17 | num_cells=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells']), 18 | umiBarcodeEditDistance=config['EXTRACTION']['UMI-edit-distance'], 19 | temp_directory=config['LOCAL']['temp-directory'], 20 | memory=config['LOCAL']['memory'], 21 | locus_list=','.join(config['EXTRACTION']['LOCUS']), 22 | strand_strategy=config['EXTRACTION']['strand-strategy'] 23 | conda: '../envs/dropseq_tools.yaml' 24 | shell: 25 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && DigitalExpression -m {params.memory}\ 26 | I={input.data}\ 27 | O={output.dense}\ 28 | EDIT_DISTANCE={params.umiBarcodeEditDistance}\ 29 | 
OUTPUT_LONG_FORMAT={output.long}\ 30 | STRAND_STRATEGY={params.strand_strategy}\ 31 | OUTPUT_READS_INSTEAD=false\ 32 | LOCUS_FUNCTION_LIST={{{params.locus_list}}}\ 33 | MIN_BC_READ_THRESHOLD={params.count_per_umi}\ 34 | CELL_BC_FILE={input.barcode_whitelist}""" 35 | 36 | 37 | rule extract_reads_expression_species: 38 | input: 39 | data='{results_dir}/samples/{sample}/{species}/unfiltered.bam', 40 | barcode_whitelist='{results_dir}/samples/{sample}/{species}/barcodes.csv' 41 | params: 42 | count_per_umi=config['EXTRACTION']['minimum-counts-per-UMI'], 43 | num_cells=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells']), 44 | umiBarcodeEditDistance=config['EXTRACTION']['UMI-edit-distance'], 45 | temp_directory=config['LOCAL']['temp-directory'], 46 | memory=config['LOCAL']['memory'], 47 | locus_list=','.join(config['EXTRACTION']['LOCUS']), 48 | strand_strategy=config['EXTRACTION']['strand-strategy'] 49 | output: 50 | dense=temp('{results_dir}/samples/{sample}/{species}/read/expression.txt'), 51 | long=temp('{results_dir}/samples/{sample}/{species}/read/expression.long') 52 | conda: '../envs/dropseq_tools.yaml' 53 | shell: 54 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && DigitalExpression -m {params.memory}\ 55 | I={input.data}\ 56 | O={output.dense}\ 57 | EDIT_DISTANCE={params.umiBarcodeEditDistance}\ 58 | OUTPUT_LONG_FORMAT={output.long}\ 59 | STRAND_STRATEGY={params.strand_strategy}\ 60 | OUTPUT_READS_INSTEAD=true\ 61 | LOCUS_FUNCTION_LIST={{{params.locus_list}}}\ 62 | MIN_BC_READ_THRESHOLD={params.count_per_umi}\ 63 | CELL_BC_FILE={input.barcode_whitelist}""" 64 | 65 | rule convert_long_to_mtx_species: 66 | input: 67 | '{results_dir}/samples/{sample}/{species}/{type}/expression.long' 68 | output: 69 | barcodes='{results_dir}/samples/{sample}/{species}/{type}/barcodes.tsv', 70 | features='{results_dir}/samples/{sample}/{species}/{type}/genes.tsv', 71 | mtx='{results_dir}/samples/{sample}/{species}/{type}/matrix.mtx' 72 | params: 73 | samples=lambda wildcards: wildcards.sample 74 | script: 75 | "../scripts/convert_mtx.py" 76 | 77 | rule compress_mtx_species: 78 | input: 79 | barcodes='{results_dir}/samples/{sample}/{species}/{type}/barcodes.tsv', 80 | features='{results_dir}/samples/{sample}/{species}/{type}/genes.tsv', 81 | mtx='{results_dir}/samples/{sample}/{species}/{type}/matrix.mtx' 82 | output: 83 | barcodes='{results_dir}/samples/{sample}/{species}/{type}/barcodes.tsv.gz', 84 | features='{results_dir}/samples/{sample}/{species}/{type}/genes.tsv.gz', 85 | mtx='{results_dir}/samples/{sample}/{species}/{type}/matrix.mtx.gz' 86 | conda: '../envs/pigz.yaml' 87 | threads: 3 88 | shell: 89 | """pigz -p {threads} {input.barcodes} {input.features} {input.mtx}""" 90 | 91 | rule SingleCellRnaSeqMetricsCollector_species: 92 | input: 93 | data='{results_dir}/samples/{sample}/{species}/unfiltered.bam', 94 | barcode_whitelist='{results_dir}/samples/{sample}/{species}/barcodes.csv', 95 | refFlat=expand("{ref_path}/{species}_{build}_{release}/curated_annotation.refFlat", 96 | ref_path=ref_path, 97 | release=release, 98 | species=species, 99 | build=build), 100 | rRNA_intervals=expand("{ref_path}/{species}_{build}_{release}/annotation.rRNA.intervals", 101 | ref_path=ref_path, 102 | release=release, 103 | build=build, 104 | species=species) 105 | params: 106 | cells=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells']), 107 | memory=config['LOCAL']['memory'], 108 | temp_directory=config['LOCAL']['temp-directory'] 109 | output: 110 | 
'{results_dir}/logs/dropseq_tools/{sample}/{species}/rna_metrics.txt' 111 | conda: '../envs/dropseq_tools.yaml' 112 | shell: 113 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && SingleCellRnaSeqMetricsCollector -m {params.memory}\ 114 | INPUT={input.data}\ 115 | OUTPUT={output}\ 116 | ANNOTATIONS_FILE={input.refFlat}\ 117 | CELL_BC_FILE={input.barcode_whitelist}\ 118 | RIBOSOMAL_INTERVALS={input.rRNA_intervals} 119 | """ 120 | rule plot_rna_metrics_species: 121 | input: 122 | rna_metrics='{results_dir}/logs/dropseq_tools/{sample}/{species}/rna_metrics.txt', 123 | barcode='{results_dir}/samples/{sample}/{species}/barcodes.csv' 124 | conda: '../envs/r.yaml' 125 | output: 126 | pdf='{results_dir}/plots/rna_metrics/{sample}_{species}_rna_metrics.pdf' 127 | script: 128 | '../scripts/plot_rna_metrics.R' 129 | -------------------------------------------------------------------------------- /rules/fastqc.smk: -------------------------------------------------------------------------------- 1 | """Get fastqc reports""" 2 | 3 | #Which rules will be run on the host computer and not sent to nodes 4 | localrules: 5 | multiqc_fastqc_reads, 6 | multiqc_fastqc_barcodes, 7 | fasta_fastq_adapter 8 | 9 | 10 | rule fastqc_barcodes: 11 | """Create fastqc report""" 12 | input: 13 | get_R1_files, 14 | 'fastqc_adapter.tsv', 15 | output: 16 | html='{results_dir}/logs/fastqc/{sample}_R1_fastqc.html', 17 | zip='{results_dir}/logs/fastqc/{sample}_R1_fastqc.zip' 18 | params: '--extract -a fastqc_adapter.tsv' 19 | wrapper: 20 | '0.36.0/bio/fastqc' 21 | 22 | rule fastqc_reads: 23 | """Create fastqc report""" 24 | input: 25 | get_R2_files, 26 | 'fastqc_adapter.tsv', 27 | output: 28 | html='{results_dir}/logs/fastqc/{sample}_R2_fastqc.html', 29 | zip='{results_dir}/logs/fastqc/{sample}_R2_fastqc.zip' 30 | params: '--extract -a fastqc_adapter.tsv' 31 | wrapper: 32 | '0.36.0/bio/fastqc' 33 | 34 | 35 | rule multiqc_fastqc_barcodes: 36 | input: 37 | expand('{results_dir}/logs/fastqc/{sample}_R1_fastqc.html', sample=samples.index, results_dir=results_dir) 38 | output: 39 | html='{results_dir}/reports/fastqc_barcodes.html' 40 | params: '-m fastqc --ignore *_R2*' 41 | wrapper: 42 | '0.36.0/bio/multiqc' 43 | 44 | rule multiqc_fastqc_reads: 45 | input: 46 | expand('{results_dir}/logs/fastqc/{sample}_R2_fastqc.html', sample=samples.index, results_dir=results_dir) 47 | output: 48 | html='{results_dir}/reports/fastqc_reads.html' 49 | params: '-m fastqc --ignore *_R1*' 50 | wrapper: 51 | '0.36.0/bio/multiqc' 52 | 53 | rule fasta_fastq_adapter: 54 | input: 55 | fa=config['FILTER']['cutadapt']['adapters-file'] 56 | output: 57 | tsv="fastqc_adapter.tsv" 58 | conda: '../envs/merge_bam.yaml' 59 | script: 60 | '../scripts/fa2tsv.py' 61 | -------------------------------------------------------------------------------- /rules/filter.smk: -------------------------------------------------------------------------------- 1 | """Filter data""" 2 | 3 | 4 | #Which rules will be run on the host computer and not sent to nodes 5 | localrules: 6 | clean_cutadapt, 7 | plot_adapter_content, 8 | multiqc_cutadapt_barcodes, 9 | multiqc_cutadapt_RNA, 10 | detect_barcodes 11 | 12 | 13 | rule cutadapt_R1: 14 | input: 15 | R1=get_R1_files, 16 | adapters=config['FILTER']['cutadapt']['adapters-file'] 17 | output: 18 | fastq=temp('{results_dir}/samples/{sample}/trimmed_R1.fastq.gz') 19 | params: 20 | cell_barcode_length=round((config['FILTER']['cell-barcode']['end'] - config['FILTER']['cell-barcode']['start'] + 1) * 1.3), 21 | 
barcode_length=config['FILTER']['UMI-barcode']['end'], 22 | extra_params=config['FILTER']['cutadapt']['R1']['extra-params'], 23 | max_n=config['FILTER']['cutadapt']['R1']['maximum-Ns'], 24 | barcode_quality=config['FILTER']['cutadapt']['R1']['quality-filter'] 25 | threads: 10 26 | log: 27 | qc='{results_dir}/logs/cutadapt/{sample}_R1.qc.txt' 28 | conda: '../envs/cutadapt.yaml' 29 | shell: 30 | """cutadapt\ 31 | --max-n {params.max_n}\ 32 | -a file:{input.adapters}\ 33 | -g file:{input.adapters}\ 34 | -q {params.barcode_quality},{params.barcode_quality}\ 35 | --minimum-length {params.barcode_length}\ 36 | --cores={threads}\ 37 | --overlap {params.cell_barcode_length}\ 38 | -o {output.fastq} {input.R1}\ 39 | {params.extra_params} > {log.qc}""" 40 | 41 | rule cutadapt_R2: 42 | input: 43 | R2=get_R2_files, 44 | adapters=config['FILTER']['cutadapt']['adapters-file'] 45 | output: 46 | fastq=temp('{results_dir}/samples/{sample}/trimmed_R2.fastq.gz') 47 | params: 48 | extra_params=config['FILTER']['cutadapt']['R2']['extra-params'], 49 | read_quality=config['FILTER']['cutadapt']['R2']['quality-filter'], 50 | minimum_length=config['FILTER']['cutadapt']['R2']['minimum-length'], 51 | adapters_minimum_overlap=config['FILTER']['cutadapt']['R2']['minimum-adapters-overlap'], 52 | threads: 10 53 | log: 54 | qc='{results_dir}/logs/cutadapt/{sample}_R2.qc.txt' 55 | conda: '../envs/cutadapt.yaml' 56 | shell: 57 | """cutadapt\ 58 | -a file:{input.adapters}\ 59 | -g file:{input.adapters}\ 60 | -q {params.read_quality}\ 61 | --minimum-length {params.minimum_length}\ 62 | --cores={threads}\ 63 | --overlap {params.adapters_minimum_overlap}\ 64 | -o {output.fastq} {input.R2}\ 65 | {params.extra_params} > {log.qc}""" 66 | 67 | rule clean_cutadapt: 68 | input: 69 | R1='{results_dir}/logs/cutadapt/{sample}_R1.qc.txt', 70 | R2='{results_dir}/logs/cutadapt/{sample}_R2.qc.txt' 71 | output: 72 | '{results_dir}/logs/cutadapt/{sample}.clean_qc.csv' 73 | script: 74 | '../scripts/clean_cutadapt.py' 75 | 76 | rule repair: 77 | input: 78 | R1='{results_dir}/samples/{sample}/trimmed_R1.fastq.gz', 79 | R2='{results_dir}/samples/{sample}/trimmed_R2.fastq.gz' 80 | output: 81 | R1='{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz', 82 | R2='{results_dir}/samples/{sample}/trimmed_repaired_R2.fastq.gz' 83 | log: 84 | '{results_dir}/logs/bbmap/{sample}_repair.txt' 85 | params: 86 | memory='{}g'.format(int(config['LOCAL']['memory'].rstrip('g')) ) 87 | conda: '../envs/bbmap.yaml' 88 | threads: 4 89 | shell: 90 | """repair.sh\ 91 | -Xmx{params.memory}\ 92 | in={input.R1}\ 93 | in2={input.R2}\ 94 | out1={output.R1}\ 95 | out2={output.R2}\ 96 | repair=t\ 97 | threads={threads} 2> {log}""" 98 | 99 | rule detect_barcodes: 100 | input: 101 | R1='{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz' 102 | output: 103 | positions='{results_dir}/samples/{sample}/test.csv' 104 | conda: '../envs/merge_bam.yaml' 105 | script: 106 | '../scripts/detect_barcodes.py' 107 | 108 | rule plot_adapter_content: 109 | input: 110 | expand('{results_dir}/logs/cutadapt/{sample}.clean_qc.csv', sample=samples.index, results_dir=results_dir) 111 | params: 112 | Cell_length=config['FILTER']['cell-barcode']['end'] - config['FILTER']['cell-barcode']['start'] + 1, 113 | UMI_length=config['FILTER']['UMI-barcode']['end'] - config['FILTER']['UMI-barcode']['start'] + 1, 114 | sample_names=lambda wildcards: samples.index, 115 | batches=lambda wildcards: samples.loc[samples.index, 'batch'] 116 | conda: '../envs/r.yaml' 117 | output: 118 | 
pdf='{results_dir}/plots/adapter_content.pdf' 119 | script: 120 | '../scripts/plot_adapter_content.R' 121 | 122 | rule multiqc_cutadapt_barcodes: 123 | input: 124 | expand('{results_dir}/logs/cutadapt/{sample}_R1.qc.txt', sample=samples.index, results_dir=results_dir) 125 | params: '-m cutadapt --ignore *_R2*' 126 | output: 127 | html='{results_dir}/reports/barcode_filtering.html' 128 | wrapper: 129 | '0.36.0/bio/multiqc' 130 | 131 | rule multiqc_cutadapt_RNA: 132 | input: 133 | expand('{results_dir}/logs/cutadapt/{sample}_R2.qc.txt', sample=samples.index, results_dir=results_dir) 134 | params: '-m cutadapt --ignore *_R1*' 135 | output: 136 | html='{results_dir}/reports/RNA_filtering.html' 137 | wrapper: 138 | '0.36.0/bio/multiqc' 139 | -------------------------------------------------------------------------------- /rules/generate_meta.smk: -------------------------------------------------------------------------------- 1 | import math 2 | import platform 3 | """Generate all the meta data files""" 4 | # To add missing fields for an annotation of ERCC: awk -F'[\t|;]' '{printf $0" "; gsub(/id/,"name"); print $9";"$10"; exon_version \"1\";"}' 5 | #Which rules will be run on the host computer and not sent to nodes 6 | localrules: 7 | create_dict, 8 | reduce_gtf, 9 | create_refFlat, 10 | create_intervals, 11 | curate_annotation 12 | 13 | 14 | rule curate_annotation: 15 | input: 16 | biotypes=config['META']['gtf_biotypes'], 17 | annotation="{ref_path}/{species}_{build}_{release}/annotation.gtf" 18 | output: 19 | temp("{ref_path}/{species}_{build}_{release}/curated_annotation.gtf") 20 | params: 21 | patterns='|'.join(config['biotypes']) 22 | shell: 23 | """cat {input.annotation} | grep -E "{params.patterns}" > {output}""" 24 | 25 | 26 | rule create_dict: 27 | input: 28 | "{ref_path}/{species}_{build}_{release}/genome.fa" 29 | output: 30 | "{ref_path}/{species}_{build}_{release}/genome.dict" 31 | threads:1 32 | params: 33 | picard="$CONDA_PREFIX/share/picard-2.14.1-0/picard.jar", 34 | temp_directory=config['LOCAL']['temp-directory'] 35 | conda: '../envs/picard.yaml' 36 | shell: 37 | """java -jar -Djava.io.tmpdir={params.temp_directory} {params.picard} CreateSequenceDictionary\ 38 | REFERENCE={input}\ 39 | OUTPUT={output} 40 | """ 41 | 42 | rule reduce_gtf: 43 | input: 44 | reference_dict="{ref_path}/{species}_{build}_{release}/genome.dict", 45 | annotation="{ref_path}/{species}_{build}_{release}/curated_annotation.gtf" 46 | params: 47 | memory=config['LOCAL']['memory'], 48 | temp_directory=config['LOCAL']['temp-directory'] 49 | output: 50 | "{ref_path}/{species}_{build}_{release}/curated_reduced_annotation.gtf" 51 | conda: '../envs/dropseq_tools.yaml' 52 | shell: 53 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && ReduceGtf -m {params.memory}\ 54 | GTF={input.annotation}\ 55 | OUTPUT={output}\ 56 | SEQUENCE_DICTIONARY={input.reference_dict}\ 57 | IGNORE_FUNC_TYPE='null'\ 58 | ENHANCE_GTF='false'""" 59 | 60 | rule create_refFlat: 61 | input: 62 | reference_dict="{ref_path}/{species}_{build}_{release}/genome.dict", 63 | annotation="{ref_path}/{species}_{build}_{release}/curated_annotation.gtf" 64 | params: 65 | memory=config['LOCAL']['memory'], 66 | temp_directory=config['LOCAL']['temp-directory'] 67 | output: 68 | "{ref_path}/{species}_{build}_{release}/curated_annotation.refFlat" 69 | conda: '../envs/dropseq_tools.yaml' 70 | shell: 71 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && ConvertToRefFlat -m {params.memory}\ 72 | 
ANNOTATIONS_FILE={input.annotation}\ 73 | OUTPUT={output}\ 74 | SEQUENCE_DICTIONARY={input.reference_dict} 75 | """ 76 | 77 | rule create_intervals: 78 | input: 79 | annotation_reduced="{ref_path}/{species}_{build}_{release}/curated_reduced_annotation.gtf", 80 | reference_dict="{ref_path}/{species}_{build}_{release}/genome.dict" 81 | params: 82 | memory=config['LOCAL']['memory'], 83 | reference_directory=config['META']['reference-directory'], 84 | temp_directory=config['LOCAL']['temp-directory'], 85 | prefix="{species}_{build}_{release}/annotation" 86 | output: 87 | intervals="{ref_path}/{species}_{build}_{release}/annotation.rRNA.intervals" 88 | conda: '../envs/dropseq_tools.yaml' 89 | shell: 90 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && CreateIntervalsFiles -m {params.memory}\ 91 | REDUCED_GTF={input.annotation_reduced}\ 92 | SEQUENCE_DICTIONARY={input.reference_dict}\ 93 | O={params.reference_directory}\ 94 | PREFIX={params.prefix} 95 | """ 96 | 97 | rule get_genomeChrBinNbits: 98 | input: 99 | reference_file="{ref_path}/{species}_genome.fa" 100 | params: 101 | samples_file='samples.csv', 102 | reference_directory=config['META']['reference-directory'] 103 | output: 104 | '{params.reference_directory}/index_params.txt' 105 | run: 106 | """ 107 | from math import log2 108 | from platform import system 109 | if (system() == 'Darwin'): 110 | genomeLength = shell("wc -c {} | cut -d' ' -f2".format(snakemake.reference_file), iterable=True) 111 | else: 112 | genomeLength = shell("wc -c {} | cut -d' ' -f1".format(snakemake.reference_file), iterable=True) 113 | genomeLength = int(next(genomeLength)) 114 | referenceNumber = shell('grep "^>" {} | wc -l'.format(snakemake.reference_file), iterable=True) 115 | referenceNumber = int(next(referenceNumber)) 116 | value = min([18,int(log2(genomeLength/referenceNumber))]) 117 | """ 118 | 119 | def get_sjdbOverhang(wildcards): 120 | return(int(wildcards.read_length)-1) 121 | 122 | 123 | rule prep_star_index: 124 | input: 125 | reference_file="{ref_path}/{species}_genome.fa", 126 | config_file='config.yaml' 127 | output: 128 | '{reference_directory}/star_ref_config.txt' 129 | conda: 130 | '../envs/pyyaml.yaml' 131 | script: 132 | '../scripts/prep_star.py' 133 | 134 | 135 | 136 | 137 | rule create_star_index: 138 | input: 139 | reference_file="{ref_path}/{species}_{build}_{release}/genome.fa", 140 | annotation_file="{ref_path}/{species}_{build}_{release}/curated_annotation.gtf" 141 | params: 142 | sjdbOverhang=lambda wildcards: get_sjdbOverhang(wildcards), 143 | genomeDir='{ref_path}/{species}_{build}_{release}/STAR_INDEX/SA_{read_length}', 144 | genomeChrBinNbits=config['MAPPING']['STAR']['genomeChrBinNbits'] 145 | output: 146 | '{ref_path}/{species}_{build}_{release}/STAR_INDEX/SA_{read_length}/SA' 147 | threads: 24 148 | conda: '../envs/star.yaml' 149 | shell: 150 | """mkdir -p {params.genomeDir}; STAR\ 151 | --runThreadN {threads}\ 152 | --runMode genomeGenerate\ 153 | --genomeDir {params.genomeDir}\ 154 | --genomeFastaFiles {input.reference_file}\ 155 | --sjdbGTFfile {input.annotation_file}\ 156 | --sjdbOverhang {params.sjdbOverhang}\ 157 | --genomeChrBinNbits {params.genomeChrBinNbits}\ 158 | --genomeSAsparseD 2 159 | """ -------------------------------------------------------------------------------- /rules/map.smk: -------------------------------------------------------------------------------- 1 | """Align the data with STAR.""" 2 | 3 | 4 | #Which rules will be run on the host computer and not sent to nodes 5 | localrules: 
6 | multiqc_star, 7 | plot_yield, 8 | plot_knee_plot, 9 | pigz_unmapped 10 | 11 | 12 | rule STAR_align: 13 | input: 14 | fq1='{results_dir}/samples/{sample}/trimmed_repaired_R2.fastq.gz', 15 | index=lambda wildcards: '{}/{}_{}_{}/STAR_INDEX/SA'.format( 16 | config['META']['reference-directory'], 17 | species, 18 | build, 19 | release) + '_' + str(samples.loc[wildcards.sample,'read_length']) + '/SA' 20 | output: 21 | temp('{results_dir}/samples/{sample}/Aligned.out.bam'), 22 | '{results_dir}/samples/{sample}/Unmapped.out.mate1' 23 | 24 | log: 25 | '{results_dir}/samples/{sample}/Log.final.out' 26 | params: 27 | extra="""--outReadsUnmapped Fastx\ 28 | --outFilterMismatchNmax {}\ 29 | --outFilterMismatchNoverLmax {}\ 30 | --outFilterMismatchNoverReadLmax {}\ 31 | --outFilterMatchNmin {}\ 32 | --outFilterScoreMinOverLread {}\ 33 | --outFilterMatchNminOverLread {}""".format( 34 | config['MAPPING']['STAR']['outFilterMismatchNmax'], 35 | config['MAPPING']['STAR']['outFilterMismatchNoverLmax'], 36 | config['MAPPING']['STAR']['outFilterMismatchNoverReadLmax'], 37 | config['MAPPING']['STAR']['outFilterMatchNmin'], 38 | config['MAPPING']['STAR']['outFilterMatchNminOverLread'], 39 | config['MAPPING']['STAR']['outFilterScoreMinOverLread'],), 40 | index=lambda wildcards: '{}/{}_{}_{}/STAR_INDEX/SA'.format( 41 | config['META']['reference-directory'], 42 | species, 43 | build, 44 | release) + '_' + str(samples.loc[wildcards.sample,'read_length']) + '/' 45 | singularity: 46 | "shub://seb-mueller/singularity_dropSeqPipe:v04" 47 | threads: 24 48 | wrapper: 49 | "0.27.1/bio/star/align" 50 | # rule alevin: 51 | # input: 52 | # index='{salmon_index}', 53 | # R1="samples/{sample}/trimmed_repaired_R1.fastq.gz", 54 | # R2="samples/{sample}/trimmed_repaired_R2.fastq.gz", 55 | # conda: '../envs/salmon.yaml' 56 | # params: 57 | # cell_barcode_length=(config['FILTER']['cell-barcode']['end'] - config['FILTER']['cell-barcode']['start'] + 1), 58 | # umi_barcode_length=(config['FILTER']['UMI-barcode']['end'] - config['FILTER']['UMI-barcode']['start'] + 1) 59 | # output: 60 | # out_folder='samples/{sample}/salmon/', 61 | # counts='samples/{sample}/salmon/mapping.tsv' 62 | # shell: 63 | # """salmon alevin\ 64 | # -l ISR\ 65 | # -1 {input.R1}\ 66 | # -2 {input.R2}\ 67 | # -i {inout.index}\ 68 | # -p 10\ 69 | # -o {output.out_folder}\ 70 | # --tgMap {output.counts}\ 71 | # --barcodeLength {params.cell_barcode_length}\ 72 | # --umiLength {params.umi_barcode_length}\ 73 | # --end 5""" 74 | 75 | 76 | rule multiqc_star: 77 | input: 78 | expand('{results_dir}/samples/{sample}/Log.final.out', sample=samples.index, results_dir=results_dir) 79 | output: 80 | html='{results_dir}/reports/star.html' 81 | params: '-m star' 82 | wrapper: 83 | '0.36.0/bio/multiqc' 84 | 85 | rule pigz_unmapped: 86 | input: 87 | '{results_dir}/samples/{sample}/Unmapped.out.mate1' 88 | output: 89 | '{results_dir}/samples/{sample}/Unmapped.out.mate1.gz' 90 | threads: 4 91 | conda: '../envs/pigz.yaml' 92 | shell: 93 | """pigz -p 4 {input}""" 94 | 95 | rule MergeBamAlignment: 96 | input: 97 | mapped='{results_dir}/samples/{sample}/Aligned.out.bam', 98 | R1_ref = '{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz' 99 | output: 100 | temp('{results_dir}/samples/{sample}/Aligned.merged.bam') 101 | params: 102 | BC_start=config['FILTER']['cell-barcode']['start']-1, 103 | BC_end=config['FILTER']['cell-barcode']['end'], 104 | UMI_start=config['FILTER']['UMI-barcode']['start']-1, 105 | UMI_end=config['FILTER']['UMI-barcode']['end'], 106 | 
discard_secondary_alignements=True 107 | conda: '../envs/merge_bam.yaml' 108 | script: 109 | '../scripts/merge_bam.py' 110 | 111 | # Note: rule repair_barcodes (cell_barcodes.smk) creates Aligned.repaired.bam 112 | # this is using barcode information (i.e. dependent on expected_cells in config.yaml) 113 | 114 | 115 | rule TagReadWithGeneExon: 116 | input: 117 | data='{results_dir}/samples/{sample}/Aligned.repaired.bam', 118 | refFlat=expand("{ref_path}/{species}_{build}_{release}/curated_annotation.refFlat", 119 | ref_path=config['META']['reference-directory'], 120 | species=species, 121 | release=release, 122 | build=build) 123 | params: 124 | memory=config['LOCAL']['memory'], 125 | temp_directory=config['LOCAL']['temp-directory'] 126 | output: 127 | temp('{results_dir}/samples/{sample}/gene_exon_tagged.bam') 128 | conda: '../envs/dropseq_tools.yaml' 129 | shell: 130 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && TagReadWithGeneFunction -m {params.memory}\ 131 | INPUT={input.data}\ 132 | OUTPUT={output}\ 133 | ANNOTATIONS_FILE={input.refFlat} 134 | """ 135 | 136 | rule DetectBeadSubstitutionErrors: 137 | input: 138 | '{results_dir}/samples/{sample}/gene_exon_tagged.bam' 139 | output: 140 | data=temp('{results_dir}/samples/{sample}/gene_exon_tagged_bead_sub.bam'), 141 | report='{results_dir}/logs/dropseq_tools/{sample}_beadSubstitutionReport.txt', 142 | summary='{results_dir}/logs/dropseq_tools/{sample}_beadSubstitutionSummary.txt' 143 | params: 144 | SmartAdapter=config['FILTER']['5-prime-smart-adapter'], 145 | memory=config['LOCAL']['memory'], 146 | temp_directory=config['LOCAL']['temp-directory'] 147 | conda: '../envs/dropseq_tools.yaml' 148 | threads: 5 149 | shell: 150 | """ 151 | export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && DetectBeadSubstitutionErrors -m {params.memory}\ 152 | I={input}\ 153 | O={output.data}\ 154 | OUTPUT_REPORT={output.report}\ 155 | OUTPUT_SUMMARY={output.summary}\ 156 | NUM_THREADS={threads} 157 | """ 158 | 159 | rule bead_errors_metrics: 160 | input: 161 | '{results_dir}/samples/{sample}/gene_exon_tagged_bead_sub.bam' 162 | output: 163 | '{results_dir}/samples/{sample}/final.bam' 164 | params: 165 | out_stats='{results_dir}/logs/dropseq_tools/{sample}_synthesis_stats.txt', 166 | summary='{results_dir}/logs/dropseq_tools/{sample}_synthesis_stats_summary.txt', 167 | barcodes=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells']) * 2, 168 | memory =config['LOCAL']['memory'], 169 | SmartAdapter=config['FILTER']['5-prime-smart-adapter'], 170 | temp_directory=config['LOCAL']['temp-directory'] 171 | conda: '../envs/dropseq_tools.yaml' 172 | threads: 5 173 | shell: 174 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && DetectBeadSynthesisErrors -m {params.memory}\ 175 | INPUT={input}\ 176 | OUTPUT={output}\ 177 | OUTPUT_STATS={params.out_stats}\ 178 | SUMMARY={params.summary}\ 179 | NUM_BARCODES={params.barcodes}\ 180 | PRIMER_SEQUENCE={params.SmartAdapter}\ 181 | NUM_THREADS={threads} 182 | """ 183 | 184 | 185 | rule bam_hist: 186 | input: 187 | '{results_dir}/samples/{sample}/final.bam' 188 | params: 189 | memory=config['LOCAL']['memory'], 190 | temp_directory=config['LOCAL']['temp-directory'] 191 | output: 192 | '{results_dir}/logs/dropseq_tools/{sample}_hist_out_cell.txt' 193 | conda: '../envs/dropseq_tools.yaml' 194 | shell: 195 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && BamTagHistogram -m {params.memory}\ 196 | TAG=XC\ 197 | I={input}\ 198 | READ_MQ=10\ 199 | 
200 | """
201 | 
202 | 
203 | rule plot_yield:
204 | input:
205 | R1_filtered=expand('{results_dir}/logs/cutadapt/{sample}_R1.qc.txt', sample=samples.index, results_dir=results_dir),
206 | R2_filtered=expand('{results_dir}/logs/cutadapt/{sample}_R2.qc.txt', sample=samples.index, results_dir=results_dir),
207 | repaired=expand('{results_dir}/logs/bbmap/{sample}_repair.txt', sample=samples.index, results_dir=results_dir),
208 | STAR_output=expand('{results_dir}/samples/{sample}/Log.final.out', sample=samples.index, results_dir=results_dir),
209 | params:
210 | BC_length=config['FILTER']['cell-barcode']['end'] - config['FILTER']['cell-barcode']['start']+1,
211 | UMI_length=config['FILTER']['UMI-barcode']['end'] - config['FILTER']['UMI-barcode']['start']+1,
212 | sample_names=lambda wildcards: samples.index,
213 | batches=lambda wildcards: samples.loc[samples.index, 'batch']
214 | conda: '../envs/r.yaml'
215 | output:
216 | pdf='{results_dir}/plots/yield.pdf'
217 | script:
218 | '../scripts/plot_yield.R'
219 | 
220 | 
221 | rule plot_knee_plot:
222 | input:
223 | data='{results_dir}/logs/dropseq_tools/{sample}_hist_out_cell.txt',
224 | barcodes='{results_dir}/samples/{sample}/barcodes.csv'
225 | params:
226 | cells=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells'])
227 | conda: '../envs/r.yaml'
228 | output:
229 | pdf='{results_dir}/plots/knee_plots/{sample}_knee_plot.pdf'
230 | script:
231 | '../scripts/plot_knee_plot.R'
232 | 
-------------------------------------------------------------------------------- /rules/merge.smk: --------------------------------------------------------------------------------
1 | 
2 | localrules:
3 | merge_long,
4 | violine_plots,
5 | summary_stats
6 | 
7 | rule merge_long:
8 | input:
9 | expand('{results_dir}/samples/{sample}/{{type}}/expression.long', sample=samples.index, results_dir=results_dir)
10 | output:
11 | mtx='{results_dir}/summary/{type}/matrix.mtx',
12 | barcodes='{results_dir}/summary/{type}/barcodes.tsv',
13 | features='{results_dir}/summary/{type}/genes.tsv',
14 | params:
15 | samples=lambda wildcards: samples.index
16 | conda: '../envs/merge_long.yaml'
17 | script:
18 | "../scripts/convert_mtx.py"
19 | 
20 | # rule compress_mtx_summary:
21 | # input:
22 | # barcodes='{results_dir}/summary/{type}/barcodes.tsv',
23 | # features='{results_dir}/summary/{type}/features.tsv',
24 | # mtx='{results_dir}/summary/{type}/matrix.mtx'
25 | # output:
26 | # barcodes='{results_dir}/summary/{type}/barcodes.tsv.gz',
27 | # features='{results_dir}/summary/{type}/features.tsv.gz',
28 | # mtx='{results_dir}/summary/{type}/matrix.mtx.gz'
29 | # conda: '../envs/pigz.yaml'
30 | # threads: 3
31 | # shell:
32 | # """pigz -p {threads} {input.barcodes} {input.features} {input.mtx}"""
33 | 
34 | rule violine_plots:
35 | input:
36 | umi_mtx='{results_dir}/summary/umi/matrix.mtx',
37 | read_mtx='{results_dir}/summary/read/matrix.mtx',
38 | design='samples.csv'
39 | conda: '../envs/r.yaml'
40 | output:
41 | pdf_violine='{results_dir}/plots/violinplots_comparison_UMI.pdf',
42 | pdf_umivscounts='{results_dir}/plots/UMI_vs_counts.pdf',
43 | pdf_umi_vs_gene='{results_dir}/plots/UMI_vs_gene.pdf',
44 | pdf_count_vs_gene='{results_dir}/plots/Count_vs_gene.pdf',
45 | R_objects='{results_dir}/summary/R_Seurat_objects.rdata'
46 | script:
47 | '../scripts/plot_violine.R'
48 | 
49 | rule summary_stats:
50 | input:
51 | R_objects='{results_dir}/summary/R_Seurat_objects.rdata',
52 | R2qc=expand('{results_dir}/logs/cutadapt/{sample}_R2.qc.txt', sample=samples.index, results_dir=results_dir),
53 | hist_cell=expand('{results_dir}/logs/dropseq_tools/{sample}_hist_out_cell.txt', sample=samples.index, results_dir=results_dir)
54 | conda: '../envs/r.yaml'
55 | output:
56 | stats_pre='{results_dir}/summary/barcode_stats_pre_filter.csv',
57 | stats_post='{results_dir}/summary/barcode_stats_post_filter.csv',
58 | params:
59 | sample_names=lambda wildcards: samples.index,
60 | batches=lambda wildcards: samples.loc[samples.index, 'batch']
61 | script:
62 | '../scripts/create_summary_stats.R'
63 | 
-------------------------------------------------------------------------------- /rules/prepare.smk: --------------------------------------------------------------------------------
1 | import re
2 | import glob
3 | import gzip
4 | from collections import defaultdict
5 | 
6 | multi_lane_pattern = re.compile("../data\/(.*)_(L[0-9]{3})_(R[1-2])_001.fastq.gz")
7 | 
8 | 
9 | def get_input_files(wildcards):
10 | samples = [f for f in glob.glob("../{results_dir}/samples/*.fastq.gz") if re.match(multi_lane_pattern,f)]
11 | return(samples)
12 | 
13 | lanes = sorted(list(set([re.findall(multi_lane_pattern,f)[0][1] for f in glob.glob("../{results_dir}/samples/*.fastq.gz") if re.match(multi_lane_pattern,f)])))
14 | samples = [re.findall(multi_lane_pattern,f)[0][0] for f in glob.glob("../{results_dir}/samples/*.fastq.gz") if re.match(multi_lane_pattern,f)]
15 | 
16 | 
17 | 
18 | 
19 | rule all:
20 | input:
21 | expand('{results_dir}/samples/{sample}_R1.fastq.gz',sample=samples),
22 | expand('{results_dir}/samples/{sample}_R2.fastq.gz',sample=samples)
23 | 
24 | 
25 | rule generate_samples:
26 | input:
27 | get_input_files
28 | output:
29 | 'samples.csv'
30 | run:
31 | samples = defaultdict(lambda: {'sample_lanes':[],'read_length':0})
32 | with open(output[0],'w') as sample_file:
33 | sample_file.write("samples,expected_cells,read_length,batch\n")
34 | for file in input:
35 | if('R2' in file):
36 | with gzip.open(file) as fastq_file:
37 | next(fastq_file)
38 | read_length = len(next(fastq_file).strip())
39 | re_results = re.findall(multi_lane_pattern,file)
40 | samples[re_results[0][0]]['sample_lanes'].append(re_results[0][1])
41 | samples[re_results[0][0]]['read_length']=read_length
42 | for sample_name in samples:
43 | sample_file.write("{},,{},\n".format(sample_name, samples[sample_name]['read_length'])) # per-sample read length, not the last loop value
44 | 
45 | rule concat_lanes:
46 | input:
47 | R1=expand('{results_dir}/samples/{{sample}}_{lane}_R1_001.fastq.gz', lane=lanes),
48 | R2=expand('{results_dir}/samples/{{sample}}_{lane}_R2_001.fastq.gz', lane=lanes),
49 | lanes='samples.csv'
50 | output:
51 | R1='{results_dir}/samples/{sample}_R1.fastq.gz',
52 | R2='{results_dir}/samples/{sample}_R2.fastq.gz'
53 | shell:
54 | """cat {input.R1} > {output.R1};cat {input.R2} > {output.R2}"""
-------------------------------------------------------------------------------- /rules/report.smk: --------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | 
4 | localrules: create_publication_text
5 | 
6 | def get_yamls(wildcards):
7 | files = glob.glob('.snakemake/conda/*.yaml')
8 | return(files)
9 | 
10 | rule create_publication_text:
11 | input:
12 | config_file=configfile_path,
13 | yaml_files=get_yamls
14 | output:
15 | '{results_dir}/reports/publication_text.html'
16 | script:
17 | "../scripts/publication_text.Rmd"
-------------------------------------------------------------------------------- /rules/split_species.smk: --------------------------------------------------------------------------------
1 | """Extract species-specific expression to prepare the species plot."""
2 | 
3 | 
4 | # Which rules will be run on the host computer and not sent to nodes
5 | localrules: plot_barnyard
6 | 
7 | rule split_bam_species:
8 | input:
9 | '{results_dir}/samples/{sample}/final.bam'
10 | output:
11 | '{results_dir}/samples/{sample}/{species}/unfiltered.bam'
12 | params:
13 | species=lambda wildcards: wildcards.species,
14 | memory=config['LOCAL']['memory'],
15 | temp_directory=config['LOCAL']['temp-directory']
16 | conda: '../envs/dropseq_tools.yaml'
17 | shell:
18 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && FilterBam -m {params.memory}\
19 | REF_SOFT_MATCHED_RETAINED={params.species}\
20 | INPUT={input}\
21 | OUTPUT={output}"""
22 | 
23 | 
24 | rule extract_all_umi_expression:
25 | input:
26 | data='{results_dir}/samples/{sample}/{species}/unfiltered.bam',
27 | barcode_whitelist='{results_dir}/samples/{sample}/barcodes.csv'
28 | output:
29 | umi_matrix=temp('{results_dir}/samples/{sample}/{species}/unfiltered_umi_expression_matrix.tsv'),
30 | summary='{results_dir}/samples/{sample}/{species}/dge.summary.txt'
31 | params:
32 | count_per_umi=config['EXTRACTION']['minimum-counts-per-UMI'],
33 | cellBarcodeEditDistance=config['EXTRACTION']['UMI-edit-distance'],
34 | memory=config['LOCAL']['memory'],
35 | temp_directory=config['LOCAL']['temp-directory'],
36 | locus_list=','.join(config['EXTRACTION']['LOCUS'])
37 | conda: '../envs/dropseq_tools.yaml'
38 | shell:
39 | """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && DigitalExpression -m {params.memory}\
40 | I={input.data}\
41 | O={output.umi_matrix}\
42 | SUMMARY={output.summary}\
43 | EDIT_DISTANCE={params.cellBarcodeEditDistance}\
44 | CELL_BC_FILE={input.barcode_whitelist}\
45 | LOCUS_FUNCTION_LIST={{{params.locus_list}}}\
46 | MIN_BC_READ_THRESHOLD={params.count_per_umi}"""
47 | 
48 | 
49 | rule plot_barnyard:
50 | input:
51 | expand('{results_dir}/samples/{{sample}}/{species}/dge.summary.txt',species=config['META']['species'], results_dir=results_dir)
52 | output:
53 | genes_pdf='{results_dir}/plots/barnyard/{sample}_genes.pdf',
54 | transcripts_pdf='{results_dir}/plots/barnyard/{sample}_transcripts.pdf',
55 | barcodes_species=expand('{{results_dir}}/samples/{{sample}}/{species}/barcodes.csv', species=species_list)
56 | params:
57 | expected_cells=lambda wildcards: int(samples.loc[wildcards.sample,'expected_cells'])
58 | script:
59 | '../scripts/plot_species_plot.R'
-------------------------------------------------------------------------------- /schemas/config.schema.yaml: --------------------------------------------------------------------------------
1 | $schema: "http://json-schema.org/draft-06/schema#"
2 | 
3 | description: Validation schema for all config entries
4 | 
5 | properties:
6 | CONTACT:
7 | type: object
8 | description: Details for contacting the person that ran the pipeline.
9 | properties:
10 | person:
11 | type: string
12 | description: Name of the contact person.
13 | email:
14 | type: string
15 | pattern: ".*@.*"
16 | description: Email address of the contact person.
17 | default: john.doe@john.com
18 | LOCAL:
19 | type: object
20 | description: Computer/experiment local details, paths, options.
21 | properties:
22 | temp-directory:
23 | type: string
24 | description: Path of the temporary folder. Must have enough space to handle big files. Use scratch on clusters.
25 | memory:
26 | type: string
27 | description: Amount of memory needed for the java virtual machine as well as default for clusters.
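# Editor's illustrative example (placeholder values, not defaults): a
# config.yaml LOCAL section satisfying this part of the schema could look like
#     LOCAL:
#         temp-directory: /scratch/tmp
#         memory: 8g
#         raw_data: raw_data
#         results: results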
28 | raw_data:
29 | type: string
30 | description: Raw data folder path.
31 | results:
32 | type: string
33 | description: Results folder path.
34 | default: results
35 | required:
36 | - temp-directory
37 | - memory
38 | - raw_data
39 | - results
40 | META:
41 | type: object
42 | description: Details about metadata. Reference genomes and annotations.
43 | properties:
44 | species:
45 | type: object
46 | description: Details about species used in the experiment.
47 | properties:
48 | first_species:
49 | type: object
50 | description: First or only species of the experiment as in ensembl.
51 | properties:
52 | name:
53 | type: string
54 | description: Species name in lowercase.
55 | build:
56 | type: number
57 | description: Build number of the first species.
58 | release:
59 | type: number
60 | description: Release number of the first species
61 | required:
62 | - build
63 | - release
64 | second_species:
65 | type: object
66 | description: Second species of a mixed experiment as in ensembl.
67 | properties:
68 | name:
69 | type: string
70 | description: Species name in lowercase.
71 | build:
72 | type: number
73 | description: Build number of the second species.
74 | release:
75 | type: number
76 | description: Release number of the second species
77 | required:
78 | - build
79 | - release
80 | required:
81 | - first_species
82 | ratio:
83 | type: number
84 | description: Minimum percentage of total transcripts in one cell to validate a species
85 | reference-directory:
86 | type: string
87 | description: Folder that will contain all the references and metadata files.
88 | gtf_biotypes:
89 | type: string
90 | default: gtf_biotypes.yaml
91 | description: File that contains a list of biotypes that are kept for the annotation curation.
92 | FILTER:
93 | type: object
94 | description: Details about trimming, filtering and cell/UMI barcode structure.
95 | properties:
96 | barcode-whitelist:
97 | type: string
98 | description: Filename of the barcode whitelist
99 | 5-prime-smart-adapter:
100 | type: string
101 | pattern: "[ATGC]*"
102 | description: This is the adapter that comes before the cell barcode in a 3' protocol.
103 | cell-barcode:
104 | type: object
105 | description: Start and end positions for cell barcodes.
106 | properties:
107 | start:
108 | type: number
109 | description: Cell barcode's first position in R1.
110 | end:
111 | type: number
112 | description: Cell barcode's last position in R1.
113 | UMI-barcode:
114 | type: object
115 | description: Start and end positions for umi barcodes.
116 | properties:
117 | start:
118 | type: number
119 | description: UMI barcode's first position in R1
120 | end:
121 | type: number
122 | description: UMI barcode's last position in R1
123 | cutadapt:
124 | type: object
125 | description: Details about trimming and filtering in cutadapt.
126 | properties:
127 | adapters-file:
128 | type: string
129 | description: Adapters file name.
130 | R1:
131 | type: object
132 | description: Details for R1 trimming/filtering.
133 | properties:
134 | quality-filter:
135 | type: number
136 | description: Quality filtering value as described in cutadapt's documentation for the 3' end. https://cutadapt.readthedocs.io/en/stable/algorithms.html#quality-trimming-algorithm
137 | maximum-Ns:
138 | type: number
139 | description: Maximum number of Ns in R1.
140 | extra-params:
141 | type: string
142 | description: Additional parameters for R1 filtering/trimming. For experienced cutadapt users.
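# Editor's illustrative example (assumed drop-seq-style geometry, 1-based
# inclusive positions as described for cell-barcode/UMI-barcode above): a 12 bp
# cell barcode followed by an 8 bp UMI on R1 would be declared as
#     cell-barcode: {start: 1, end: 12}
#     UMI-barcode: {start: 13, end: 20}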
143 | required:
144 | - quality-filter
145 | - maximum-Ns
146 | R2:
147 | type: object
148 | description: Details for R2 trimming.
149 | properties:
150 | quality-filter:
151 | type: number
152 | description: Quality filtering value as described in cutadapt's documentation for the 3' end. https://cutadapt.readthedocs.io/en/stable/algorithms.html#quality-trimming-algorithm
153 | minimum-adapters-overlap:
154 | type: number
155 | description: Minimum number of bases that overlap with the mRNA.
156 | minimum-length:
157 | type: number
158 | description: Minimum length of R2 once it's trimmed. Anything under this value will be filtered out.
159 | extra-params:
160 | type: string
161 | description: Additional parameters for R2 filtering/trimming. For experienced cutadapt users.
162 | required:
163 | - quality-filter
164 | - minimum-adapters-overlap
165 | - minimum-length
166 | required:
167 | - adapters-file
168 | - R1
169 | - R2
170 | EXTRACTION:
171 | type: object
172 | description: Details for count extraction.
173 | properties:
174 | LOCUS:
175 | type: array
176 | description: Any combination of UTR, CODING and INTRON as an array.
177 | UMI-edit-distance:
178 | type: number
179 | description: Number of mismatches allowed between UMI barcodes when demultiplexing.
180 | default: 1
181 | minimum-counts-per-UMI:
182 | type: number
183 | description: Minimum number of UMI-GENE counts to count as a detected gene in a cell.
184 | default: 0
185 | strand-strategy:
186 | type: string
187 | description: Defines which read orientations relative to the annotated gene are counted. Can be SENSE (only count reads that mapped on the same strand), ANTISENSE (only count reads that mapped on the opposite strand) or BOTH (count all).
188 | required:
189 | - LOCUS
190 | - UMI-edit-distance
191 | - minimum-counts-per-UMI
192 | - strand-strategy
193 | MAPPING:
194 | type: object
195 | properties:
196 | STAR:
197 | type: object
198 | description: STAR mapper parameters
199 | properties:
200 | outFilterMismatchNmax:
201 | type: number
202 | description: Maximum number of mismatches allowed per alignment.
203 | default: 10
204 | outFilterMismatchNoverLmax:
205 | type: number
206 | description: Maximum ratio of mismatches to mapped length.
207 | default: 0.3
208 | outFilterMismatchNoverReadLmax:
209 | type: number
210 | description: Maximum ratio of mismatches to read length.
211 | default: 1
212 | outFilterMatchNmin:
213 | type: number
214 | description: Minimum number of matched bases for an alignment to be reported.
215 | default: 0
216 | outFilterMatchNminOverLread:
217 | type: number
218 | description: Minimum number of matched bases normalized to read length.
219 | default: 0
220 | outFilterScoreMinOverLread:
221 | type: number
222 | description: Minimum alignment score normalized to read length.
223 | default: 0
224 | genomeChrBinNbits:
225 | type: number
226 | description: log2 of the bin size used to store the genome; reduce for genomes with many small scaffolds.
227 | default: 18
228 | required:
229 | - outFilterMismatchNmax
230 | - outFilterMismatchNoverLmax
231 | - outFilterMismatchNoverReadLmax
232 | - outFilterMatchNmin
233 | - outFilterMatchNminOverLread
234 | - outFilterScoreMinOverLread
235 | - genomeChrBinNbits
236 | required:
237 | - STAR
238 | DEBUG:
239 | type: boolean
240 | description: Boolean value that enables debug mode for R scripts, providing Rdata of the snakemake object as well as the R env.
241 | default: FALSE
242 | required:
243 | - LOCAL
244 | - META
245 | - FILTER
246 | - EXTRACTION
247 | - MAPPING
-------------------------------------------------------------------------------- /schemas/samples.schema.yaml: --------------------------------------------------------------------------------
1 | $schema: "http://json-schema.org/draft-06/schema#"
2 | description: an entry in the sample sheet
3 | properties:
4 | samples:
5 | type: string
6 | description: sample name/identifier
7 | expected_cells:
8 | type: number
9 | description: Number of cells expected in an experiment. dropSeqPipe will extract 20% more than the value given.
10 | read_length:
11 | type: number
12 | description: Length of read2 (mRNA). Necessary for generating the STAR index.
13 | batch:
14 | type: string
15 | description: String value that gives a batch id
16 | 
17 | required:
18 | - samples
19 | - expected_cells
20 | - read_length
21 | - batch
-------------------------------------------------------------------------------- /scripts/clean_cutadapt.py: --------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | import re
3 | 
4 | def fill_results(snakemake,pair, adapter_results):
5 | adapter_pattern = re.compile(pattern="=== Adapter (.*) ===\n")
6 | with open(snakemake.input[pair], 'r') as logfile:
7 | line = logfile.readline()
8 | while(line):
9 | adapter_matched = re.findall(pattern=adapter_pattern,string=line)
10 | if(adapter_matched):
11 | logfile.readline()
12 | line = logfile.readline().rstrip('.\n')
13 | line_list = line.split(';')
14 | adapter_results[adapter_matched[0]]['Pair'] = pair
15 | adapter_results[adapter_matched[0]]['Sequence'] = line_list[0].split(':')[1].strip()
16 | # The key is guaranteed to exist at this point (defaultdict plus the
17 | # assignments above), so the former identical if/else branches collapse
18 | # into a single increment:
19 | adapter_results[adapter_matched[0]]['Times'] += int(line_list[3].split(':')[1].split(' ')[1].strip())
20 | line = logfile.readline()
21 | return(adapter_results)
22 | 
23 | adapter_results_R1 = defaultdict(lambda :{'Pair':'','Sequence':'','Times':0})
24 | adapter_results_R2 = defaultdict(lambda :{'Pair':'','Sequence':'','Times':0})
25 | 
26 | 
27 | adapter_results_R1 = fill_results(snakemake, 'R1', adapter_results_R1)
28 | adapter_results_R2 = fill_results(snakemake, 'R2', adapter_results_R2)
29 | 
30 | with open(snakemake.output[0],'w') as outfile:
31 | outfile.write('Adapter,Sequence,Pair,Count\n')
32 | for adapter in adapter_results_R1:
33 | outfile.write("{},{},{},{}\n".format(adapter, adapter_results_R1[adapter]['Sequence'],adapter_results_R1[adapter]['Pair'],adapter_results_R1[adapter]['Times']))
34 | for adapter in adapter_results_R2:
35 | outfile.write("{},{},{},{}\n".format(adapter, adapter_results_R2[adapter]['Sequence'],adapter_results_R2[adapter]['Pair'],adapter_results_R2[adapter]['Times']))
-------------------------------------------------------------------------------- /scripts/convert_mtx.py: --------------------------------------------------------------------------------
1 | # Converts the long format given by dropseq-tools v2.0.0 into the sparse mtx format.
2 | # Output provides features, cell barcodes and counts in separate files.
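# Editor's illustrative example (made-up values, for clarity): one long-format
# input line such as
#     AAACCTGAGCGT<TAB>Gene1<TAB>42
# is rewritten below as the MatrixMarket coordinate entry
#     <feature_index> <barcode_index> 42
# while barcodes.tsv and genes.tsv each record the barcode/feature once.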
3 | # Can handle one or multiple samples at a time
4 | import sys
5 | import os
6 | import subprocess
7 | 
8 | 
9 | samples = snakemake.params['samples']
10 | barcodes = {}
11 | features = {}
12 | 
13 | out_folder = os.path.dirname(snakemake.output['mtx'])
14 | out_barcodes = snakemake.output.barcodes
15 | out_features = snakemake.output.features
16 | mtx = snakemake.output.mtx
17 | temp_mtx = os.path.join(out_folder,'temp_umi.mtx')
18 | header = os.path.join(out_folder,'header.mtx')
19 | n_lines=0
20 | barcode_index = 1
21 | feature_index = 1
22 | 
23 | with open(temp_mtx,'w') as mtx_stream:
24 | for i,sample in enumerate(snakemake.input):
25 | if samples[i] not in sample:
26 | sys.exit("Sample name not found in file path")
27 | with open(sample,'r') as input_file:
28 | next(input_file) # skip first line
29 | for line in input_file:
30 | barcode,feature,count = line.strip().split('\t')
31 | if(not isinstance(samples,str)):
32 | barcode = samples[i] + '_' + barcode
33 | if(barcode not in barcodes):
34 | barcodes[barcode] = barcode_index
35 | barcode_index += 1
36 | if(feature not in features):
37 | features[feature] = feature_index
38 | feature_index += 1
39 | mtx_stream.write('{} {} {}\n'.format(features[feature],barcodes[barcode],count))
40 | n_lines +=1
41 | 
42 | with open(out_barcodes,'w') as barcodes_outfile:
43 | for barcode in barcodes:
44 | barcodes_outfile.write('{}\n'.format(barcode))
45 | 
46 | with open(out_features,'w') as features_outfile:
47 | for feature in features:
48 | features_outfile.write('{}\n'.format(feature))
49 | 
50 | with open(header,'w') as header_outfile:
51 | header_outfile.write("%%MatrixMarket matrix coordinate real general\n")
52 | header_outfile.write('{} {} {}\n'.format(len(features), len(barcodes), n_lines))
53 | 
54 | subprocess.call("cat {} {} > {}".format(header, temp_mtx, mtx), shell=True)
55 | 
56 | os.remove(temp_mtx)
57 | os.remove(header)
-------------------------------------------------------------------------------- /scripts/create_summary_stats.R: --------------------------------------------------------------------------------
1 | #' ---
2 | #' title: create_summary_stats.R
3 | #' author: Sebastian Mueller (sebm_at_posteo.de)
4 | #' date: 2019-03-04
5 | #' desc: Creating various summary statistics for barcodes (pre and post filtered)
6 | 
7 | # o A delimited file containing information for each STAMP (before cut-off) on number of UMIs, number of Genes detected/captured and the number of NGS-reads
8 | # o A separate delimited file containing information for each STAMP (after cut-off) on number of UMIs, number of Genes detected/captured and the number of NGS-reads
9 | # Example of the format could be as follows:
10 | # STAMP id Number of NGS-reads Number of UMIs Number of Genes Detected
11 | # STAMP1 1000000 50000 6000
12 | #' ---
13 | 
14 | #------------------------------------ for debugging:
15 | # add the following line in config.yaml (without the #)
16 | # DEBUG: True
17 | # This will create R objects in the debug directory containing the snakemake
18 | # object that can be loaded into a custom R session as below:
19 | # load("debug/snakemake_create_summary_stats.rdata")
20 | # load(file="debug/R_image_create_summary_stats.rdata")
21 | 
22 | debug_flag <- FALSE
23 | # check if DEBUG flag is set
24 | if (snakemake@config$DEBUG) {
25 | debug_flag <- TRUE
26 | message("In debug mode: saving R objects to inspect later")
27 | path_debug <- file.path(snakemake@config$LOCAL$results, "debug")
28 | dir.create(path_debug, showWarnings = FALSE)
29 | save(snakemake, file = file.path(path_debug, "create_summary_stats_snakemake.rdata"))
30 | }
31 | #### /debug
32 | 
33 | library(dplyr) # Dataframe manipulation
34 | library(Matrix) # Sparse matrices
35 | library(stringr)
36 | library(RColorBrewer)
37 | library(devtools)
38 | library(Seurat)
39 | library(plotly)
40 | 
41 | samples <- snakemake@params$sample_names
42 | batches <- snakemake@params$batches
43 | 
44 | 
45 | # importing Seurat object
46 | 
47 | seuratobj <- readRDS(file = file.path(snakemake@input$R_objects))
48 | meta.data <- seuratobj@meta.data
49 | 
50 | # subset only the highest STAMPs as set by expected_cells in samples.csv
51 | # This is necessary since more STAMPs are selected as a safety margin, which now have to be taken out again to calculate stats.
52 | 
53 | meta.data.sub <- meta.data %>%
54 | group_by(orig.ident) %>%
55 | arrange(desc(nCounts)) %>%
56 | slice(1:expected_cells[1]) %>% # makes sure only the expected number of cells is kept
57 | as.data.frame()
58 | 
59 | gini_index <- function (x, weights = rep(1, length = length(x))) {
60 | ox <- order(x)
61 | x <- x[ox]
62 | weights <- weights[ox] / sum(weights)
63 | p <- cumsum(weights)
64 | nu <- cumsum(weights * x)
65 | n <- length(nu)
66 | nu <- nu/nu[n]
67 | sum(nu[-1] * p[-n]) - sum(nu[-n] * p[-1])
68 | }
69 | 
70 | #------------------------------------ post-filter-stats
71 | # stats based only on the most abundant `expected-cells` barcodes,
72 | # taken from the Seurat object generated in the violine_plots rule.
73 | 
74 | # median calculator
75 | 
76 | stats_post <- meta.data.sub %>%
77 | group_by(orig.ident) %>%
78 | summarise(
79 | Total_nb_reads = sum(nCounts),
80 | Nb_STAMPS = mean(expected_cells), # should be all the same anyway..
81 | Median_reads_per_STAMP = round(median(nCounts), 2),
82 | Mean_reads_per_STAMP = round(mean(nCounts), 2),
83 | Total_nb_UMIs = sum(nUMI),
84 | Median_UMIs_per_STAMP = round(median(nUMI), 2),
85 | Mean_UMIs_per_STAMP = round(mean(nUMI), 2),
86 | Mean_UMIs_per_Gene = round(mean(umi.per.gene), 2),
87 | Median_number_genes_per_STAMP = round(median(nGene), 2),
88 | Mean_number_genes_per_STAMP = round(mean(nGene), 2),
89 | Mean_Ribo_pct = round(100 * mean(pct.Ribo), 2),
90 | Mean_Mito_pct = round(100 * mean(pct.mito), 2),
91 | Mean_Count_per_UMI = round(sum(nCounts) / sum(nUMI), 2),
92 | Read_length = mean(read_length), # should be all the same anyway..
93 | Number_barcodes_used_for_debug = n()
94 | ) %>%
95 | as.data.frame()
96 | 
97 | row.names(stats_post) <- stats_post$orig.ident
98 | 
99 | 
100 | 
101 | # highest, lowest count/UMI Stamp
102 | # pre STAMP stats
103 | 
104 | # hist out goes into knee plots
105 | # 'results/logs/{sample}_hist_out_cell.txt'
106 | # """export _JAVA_OPTIONS=-Djava.io.tmpdir={params.temp_directory} && BAMTagHistogram -m {params.memory}\
107 | # TAG=XC\
108 | # https://hpc.nih.gov/apps/dropseq.html
109 | # there is no hint in the documentation on any filtering (only read quality)
110 | 
111 | #------------------------------------ pre-filter-stats
112 | # calculating statistics based on barcodes before thresholding (i.e. keeping the most abundant barcodes based on `expected cells`)
113 | # This is based on 'logs/{sample}_hist_out_cell.txt' generated by `BAMTagHistogram` from dropseq-tools
114 | # TODO: The documentation only mentions duplicate and quality filters. But it seems to do more filtering since most barcodes are expected to have only one read assigned but there are usually more. Find out.
115 | # https://hpc.nih.gov/apps/dropseq.html
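# Editor's note -- a quick illustrative sanity check for gini_index() above:
#   gini_index(rep(1, 4))         # 0    : reads spread perfectly evenly
#   gini_index(c(100, 0, 0, 0))   # 0.75 : one barcode holds all the reads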
116 | 
117 | # TODO: Nr UMI for pre filter
118 | 
119 | stats_pre <- data.frame(matrix(nrow = length(samples), ncol = 10))
120 | colnames(stats_pre) <- c(
121 | "Sample",
122 | "Batch",
123 | "Total_raw_reads",
124 | "Nr_barcodes_total",
125 | "Nr_barcodes_more_than_1_reads",
126 | "Nr_barcodes_more_than_10_reads",
127 | "percentile99",
128 | "percentile95",
129 | "percentile50",
130 | "Gini-index"
131 | )
132 | 
133 | stats_pre[, "Sample"] <- samples
134 | stats_pre[, "Batch"] <- batches
135 | 
136 | 
137 | for (i in 1:length(samples)) {
138 | # importing 'logs/{sample}_hist_out_cell.txt'
139 | hist_out <- read.table(
140 | file = snakemake@input$hist_cell[i],
141 | header = FALSE, stringsAsFactors = FALSE
142 | )
143 | mysample <- samples[i]
144 | reads <- hist_out$V1
145 | barcodes <- hist_out$V2
146 | # calculations on reads
147 | # total reads are not sum(reads)! Needs to be taken from
148 | # results/logs/cutadapt/{sample}_R2.qc.txt
149 | #
150 | # read in full text file "sample_R2.qc.txt"
151 | filedump <- readLines(snakemake@input$R2qc[i])
152 | # subset line matching a pattern
153 | total_reads <- filedump[grep("Total reads processed:", filedump)] %>% # extract line
154 | str_extract("[0-9,]+") %>% # extract number from line
155 | str_replace_all(",", "") %>% # delete comma for subsequent numeric casting
156 | as.numeric()
157 | reads_cumsum <- cumsum(reads)
158 | reads_cumsum_perc <- (reads_cumsum / sum(reads))
159 | # reporting stats
160 | stats_pre[i, "Total_raw_reads"] <- total_reads
161 | stats_pre[i, "Reads_assigned_to_expected_STAMPs"] <- sum(reads[1:stats_post$Nb_STAMPS[i]])
162 | stats_pre[i, "Nr_barcodes_total"] <- length(barcodes)
163 | stats_pre[i, "percentile99"] <- which.min(reads_cumsum_perc < 0.99)
164 | stats_pre[i, "percentile95"] <- which.min(reads_cumsum_perc < 0.95)
165 | stats_pre[i, "percentile50"] <- which.min(reads_cumsum_perc < 0.50)
166 | stats_pre[i, "Nr_barcodes_more_than_1_reads"] <- sum(reads > 1)
167 | stats_pre[i, "Nr_barcodes_more_than_10_reads"] <- sum(reads > 10)
168 | stats_pre[i, "Gini-index"] <- round(gini_index(reads), 2)
169 | expected_cells <- as.numeric(filter(stats_post, orig.ident==mysample) %>% select(Nb_STAMPS))
170 | # % of reads left after applying the expected_cells cutoff
171 | stats_post[mysample, "Pct_reads_after_filter_expected_cells"] <-
172 | round(100 * (
173 | reads_cumsum[expected_cells] / sum(reads)
174 | ), 2)
175 | # % of reads left after applying all filters including mapping etc.
176 | # That's the effective number of usable reads of the sequencing run
177 | stats_post[mysample, "Pct_reads_after_filter_everything"] <-
178 | round(100 * (
179 | filter(stats_post, orig.ident==mysample) %>% select(Total_nb_reads) /
180 | stats_pre [i, "Total_raw_reads"]), 2
181 | )
182 | }
183 | 
184 | stats_pre <- stats_pre %>%
185 | arrange(Sample)
186 | 
187 | stats_post <- stats_post %>%
188 | arrange((orig.ident))
189 | 
190 | # output
191 | write.csv(stats_pre, file.path(snakemake@output$stats_pre))
192 | write.csv(stats_post, file.path(snakemake@output$stats_post)) # writes table for excel
193 | 
194 | if (debug_flag) {
195 | save.image(file = file.path(path_debug, "create_summary_stats_workspace.rdata"))
196 | }
197 | 
-------------------------------------------------------------------------------- /scripts/detect_barcodes.py: --------------------------------------------------------------------------------
1 | from Bio import SeqIO
2 | import gzip
3 | from collections import Counter
4 | 
5 | fastq_parser = SeqIO.parse(gzip.open(snakemake.input.R1, "rt"), "fastq")
6 | sequences = []
7 | n=0
8 | for fastq_R1 in fastq_parser:
9 | sequences.append(str(fastq_R1.seq))
10 | n+=1
11 | if(n==10000000):
12 | break
13 | def parse_barcodes(sequences):
14 | counts={}
15 | ranges = range(5,len(sequences[0]))
16 | 
17 | for cell_bc_length in ranges:
18 | counts[cell_bc_length] = list()
19 | for fastq_R1 in sequences:
20 | counts[cell_bc_length].append(fastq_R1[0:cell_bc_length])
21 | return(counts)
22 | 
23 | 
24 | 
25 | counts = parse_barcodes(sequences)
26 | 
27 | with open(snakemake.output[0], "w") as outfile:
28 | outfile.write('bc_length,first_counts\n')
29 | for cell_bc_length in counts:
30 | outfile.write('{},{}\n'.format(cell_bc_length,str(Counter(counts[cell_bc_length]).most_common(100)[0][1])))
31 | 
-------------------------------------------------------------------------------- /scripts/fa2tsv.py: --------------------------------------------------------------------------------
1 | #' ---
2 | #' title: fa2tsv.py
3 | #' author: Sebastian Mueller (sebm_at_posteo.de)
4 | #' date: 2019-03-04
5 | #' Converts a fasta file into a tab-separated file suitable as input for FastQC.
6 | #' This allows FastQC to use customized adapters via the -a option
7 | #' ---
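#' Editor's illustrative example (made-up record): the FASTA entry
#'     >CustomAdapter1
#'     ACGTACGTACGTACGT
#' becomes the tab-separated line (first number_bp + 1 = 13 bases, see below)
#'     CustomAdapter1<TAB>ACGTACGTACGTA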
8 | 
9 | import sys
10 | from Bio import SeqIO
11 | 
12 | number_bp = 12
13 | 
14 | with open( snakemake.output['tsv'], "w" ) as output:
15 | for seq_record in SeqIO.parse(snakemake.input['fa'], "fasta"):
16 | myline = (str(seq_record.id)) + "\t" + str(seq_record.seq[0:(number_bp + 1)]) + "\n" # keeps the first number_bp + 1 bases
17 | output.write(myline)
18 | 
19 | 
-------------------------------------------------------------------------------- /scripts/generate_extended_ref.py: --------------------------------------------------------------------------------
1 | from itertools import combinations, product
2 | from collections import defaultdict
3 | from copy import deepcopy
4 | import pickle
5 | from shutil import copyfile
6 | 
7 | 
8 | def save_obj(obj, name):
9 | with open(name, 'wb') as f:
10 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
11 | 
12 | def generate_all(barcode, reference, mapping, edit_distance):
13 | mutants = generate_mutants(barcode, edit_distance)
14 | for mutant in mutants:
15 | if(mutant not in reference):
16 | reference.add(mutant)
17 | mapping[edit_distance][mutant]['ref'] = barcode
18 | mapping[edit_distance][mutant]['count'] = 0
19 | mapping[edit_distance][mutant]['lanes'] = {'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0}
20 | 
21 | mapping['unknown']=defaultdict()
22 | return(reference, mapping)
23 | 
24 | 
25 | def generate_mutants(sequence, d=1):
26 | """Taken from stackoverflow: https://stackoverflow.com/a/19823295/9178565"""
27 | N = len(sequence)
28 | letters = 'ACGTN'
29 | pool = list(sequence)
30 | for indices in combinations(range(N), d):
31 | for replacements in product(letters, repeat=d):
32 | skip = False
33 | for i, a in zip(indices, replacements):
34 | if pool[i] == a: skip = True
35 | if skip: continue
36 | 
37 | keys = dict(zip(indices, replacements))
38 | yield ''.join([pool[i] if i not in indices else keys[i]
39 | for i in range(N)])
40 | 
41 | # Create empty sets and defaultdicts
42 | barcode_ref = set()
43 | mapping=defaultdict(dict)
44 | 
45 | # Initiate ref and mapping with the given barcodes
46 | with open(snakemake.input.whitelist,'r') as ref_file:
47 | for line in ref_file.readlines():
48 | barcode = line.strip()
49 | barcode_ref.add(barcode)
50 | mapping[0][barcode]=defaultdict(dict)
51 | mapping[0][barcode]['ref'] = barcode
52 | mapping[0][barcode]['count'] = 0
53 | mapping[0][barcode]['lanes'] = {'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0}
54 | 
55 | barcode_ext_ref = deepcopy(barcode_ref)
56 | # For now the edit distance is one, but it can be extended to a higher number later on.
57 | max_edit_distance = 1
58 | for edit_distance in range(1,max_edit_distance+1):
59 | mapping[edit_distance]=defaultdict(dict)
60 | for barcode in mapping[0]:
61 | (barcode_ext_ref,mapping) = generate_all(barcode, barcode_ext_ref, mapping, edit_distance)
62 | 
63 | # Delete given barcodes out of the new reference. This helps later on when running "repair_barcodes.py"
64 | barcode_ref = set(mapping[0])
65 | barcode_ext_ref.difference_update(barcode_ref)
66 | 
67 | # Save mapping and references to reuse later.
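# Editor's illustrative example (toy 2 bp barcode; real barcodes are longer):
# for whitelist entry 'AC' at edit distance 1, generate_mutants('AC', 1) yields
# CC, GC, TC, NC, AA, AG, AT, AN, and afterwards e.g.
#     mapping[1]['CC'] == {'ref': 'AC', 'count': 0, 'lanes': {'1': 0, ..., '8': 0}}
# while 'AC' stays in barcode_ref and its mutants end up in barcode_ext_ref.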
68 | save_obj(obj=mapping, name=snakemake.output.barcode_mapping)
69 | save_obj(obj=barcode_ref,name=snakemake.output.barcode_ref)
70 | save_obj(obj=barcode_ext_ref,name=snakemake.output.barcode_ext_ref)
71 | 
72 | 
73 | copyfile(snakemake.input['whitelist'], snakemake.output['barcodes'])
-------------------------------------------------------------------------------- /scripts/merge_bam.py: --------------------------------------------------------------------------------
1 | import pysam
2 | import re
3 | import os
4 | from Bio import SeqIO
5 | import gzip
6 | from collections import defaultdict
7 | import sys
8 | 
9 | # This function fills in a dict with read ids
10 | # and their corresponding cell and umi barcodes until it finds
11 | # a specific read id
12 | 
13 | discard_secondary_alignements = snakemake.params['discard_secondary_alignements']
14 | 
15 | barcodes_struct = {
16 | 'BC_start':snakemake.params['BC_start'],
17 | 'BC_end':snakemake.params['BC_end'],
18 | 'UMI_start':snakemake.params['UMI_start'],
19 | 'UMI_end':snakemake.params['UMI_end']
20 | }
21 | 
22 | def parse_barcodes(fastq_parser, query_name, read_barcodes, barcodes_struct):
23 | for fastq_R1 in fastq_parser:
24 | # Some sequencers give a /1 and /2 to R1 and R2 read ids respectively. This attempts to solve issue #69.
25 | if '/' in fastq_R1.id:
26 | R1_id = fastq_R1.id[:fastq_R1.id.find("/")]
27 | else:
28 | R1_id = fastq_R1.id
29 | read_barcodes[R1_id]['XC'] = str(fastq_R1.seq)[barcodes_struct['BC_start']:barcodes_struct['BC_end']]
30 | read_barcodes[R1_id]['XM'] = str(fastq_R1.seq)[barcodes_struct['UMI_start']:barcodes_struct['UMI_end']]
31 | if(read_barcodes[R1_id]['XM']==''):
32 | raise SystemExit('UMI empty for read {}.\n The barcode is: {}.\nWhole entry is:{}'.format(R1_id, fastq_R1.seq,fastq_R1))
33 | if (R1_id == query_name):
34 | return(fastq_parser,read_barcodes)
35 | return(fastq_parser,read_barcodes)
36 | 
37 | infile_bam = pysam.AlignmentFile(snakemake.input[0], "rb")
38 | 
39 | fastq_parser = SeqIO.parse(gzip.open(snakemake.input[1], "rt"), "fastq")
40 | 
41 | outfile = pysam.AlignmentFile(snakemake.output[0], "wb", template=infile_bam)
42 | 
43 | read_barcodes = defaultdict(lambda :{'XC':'','XM':''})
44 | 
45 | for bam_read in infile_bam:
46 | if(discard_secondary_alignements and bam_read.is_secondary):
47 | continue
48 | if (bam_read.query_name) in read_barcodes:
49 | current_barcodes = read_barcodes.pop(bam_read.query_name)
50 | tags = bam_read.get_tags()
51 | tags.extend([
52 | ('XC', current_barcodes['XC'],'Z'),
53 | ('XM', current_barcodes['XM'],'Z')])
54 | bam_read.set_tags(tags)
55 | else:
56 | fastq_parser,read_barcodes = parse_barcodes(fastq_parser, bam_read.query_name, read_barcodes, barcodes_struct)
57 | if (bam_read.query_name) not in read_barcodes:
58 | os.remove(snakemake.output[0]) # clean up the partial output before aborting
59 | raise SystemExit('Read {} from mapped file is missing in reference fastq file!'.format(bam_read.query_name))
60 | current_barcodes = read_barcodes.pop(bam_read.query_name)
61 | tags = bam_read.get_tags()
62 | tags.extend([
63 | ('XC', current_barcodes['XC'],'Z'),
64 | ('XM', current_barcodes['XM'],'Z')])
65 | bam_read.set_tags(tags)
66 | outfile.write(bam_read)
-------------------------------------------------------------------------------- /scripts/plot_adapter_content.R: --------------------------------------------------------------------------------
1 | #------------------------------------ for debugging:
2 | # For debugging add the following line in config.yaml (without the #)
3 | # DEBUG: True
4 | # This will create R objects in the debug directory containing the snakemake
5 | # object that can be loaded into a custom R session as below:
6 | 
7 | debug_flag <- FALSE
8 | # check if DEBUG flag is set
9 | if (snakemake@config$DEBUG) {
10 | debug_flag <- TRUE
11 | message("In debug mode: saving R objects to inspect later")
12 | path_debug <- file.path(snakemake@config$LOCAL$results, "debug")
13 | dir.create(path_debug, showWarnings = FALSE)
14 | save(snakemake, file = file.path(path_debug, "plot_adapter_content_snakemake.rdata"))
15 | }
16 | 
17 | #------------------------------------ debugging
18 | 
19 | library(ggplot2)
20 | library(dplyr)
21 | library(viridis)
22 | 
23 | samples <- snakemake@params$sample_names
24 | batches <- snakemake@params$batches
25 | 
26 | # Read files into a list
27 | cutadapt_clean_list <- list()
28 | for (i in seq_along(samples)){
29 | cutadapt_clean <- read.csv(snakemake@input[[i]][1], header = TRUE)
30 | cutadapt_clean$Sample <- samples[i]
31 | cutadapt_clean$Batch <- batches[i]
32 | cutadapt_clean_list[[i]] <- cutadapt_clean
33 | }
34 | 
35 | # combining adapters across samples
36 | cutadapt_counts <- Reduce(rbind, cutadapt_clean_list, NULL)
37 | 
38 | # Transform it into percentages
39 | cutadapt_counts <- group_by(cutadapt_counts, Sample, Pair) %>%
40 | mutate(Percentages=Count/sum(Count))
41 | # Adapter Sequence Pair Count Sample Batch
42 | # 1 PrefixNX/1 AGATGTGTATAAGAGACAG R1 7 sample1 Batch1
43 | # ...
44 | # 6 Trans2_rc CTGTCTCTTATACACATCTCCGAGCCCACGAGAC R2 5 sample2 Batch2
45 | 
46 | p1 <- ggplot(cutadapt_counts, aes(x=Sample, y = Percentages, fill = Adapter)) +
47 | geom_bar(stat = "identity") +
48 | facet_grid(Pair ~ Batch, scales = "free") +
49 | theme_minimal() +
50 | ggtitle("Comparison across samples of adapter content") +
51 | scale_x_discrete(label=abbreviate) +
52 | scale_y_continuous(labels = scales::percent) +
53 | theme(axis.text.x=element_text(angle = 90, hjust = 0)) +
54 | scale_fill_viridis(discrete=TRUE)
55 | 
56 | ggsave(plot=p1, filename=snakemake@output$pdf)
57 | 
58 | if (debug_flag) {
59 | save.image(file = file.path(path_debug, "plot_adapter_content_workspace.rdata"))
60 | }
61 | 
-------------------------------------------------------------------------------- /scripts/plot_knee_plot.R: --------------------------------------------------------------------------------
1 | #------------------------------------ for debugging:
2 | # add the following line in config.yaml (without the #)
3 | # DEBUG: True
4 | # This will create R objects in the debug directory containing the snakemake
5 | # object that can be loaded into a custom R session as below:
6 | 
7 | debug_flag <- FALSE
8 | if (snakemake@config$DEBUG) {
9 | debug_flag <- TRUE
10 | message("In debug mode: saving R objects to inspect later")
11 | path_debug <- file.path(snakemake@config$LOCAL$results, "debug")
12 | dir.create(path_debug, showWarnings = FALSE)
13 | save(snakemake, file = file.path(path_debug,
14 | paste0("plot_knee_plot_snakemake_",
15 | attr(snakemake, "wildcard")$sample, ".rdata"))
16 | )
17 | }
18 | #### /debug
19 | 
20 | library(ggplot2)
21 | library(plyr)
22 | # Create the cumulative plot
23 | data=read.table(file = snakemake@input[[1]][1], header=FALSE, stringsAsFactors=FALSE)
24 | barcodes = data$V2
25 | total_reads = sum(data$V1)
26 | y_raw=cumsum(data$V1)
27 | y=(y_raw/total_reads)
28 | x = 1:length(y)
29 | plot_data = data.frame(rank = x,cum_sum=y, Barcode=data$V2)
30 | x_scale = snakemake@params$cells * 4
31 | 
32 | knee_plot <- ggplot(plot_data, aes(x=rank, y=cum_sum)) +
33 | geom_point(size = 0.1) +
34 | xlim(0,x_scale) +
35 | geom_vline(xintercept=snakemake@params$cells, linetype="dashed", color = "red") +
36 | ggtitle(paste0(snakemake@wildcards$sample, '\nTotal reads: ', prettyNum(total_reads))) +
37 | theme(plot.title = element_text(size=10)) +
38 | labs(x='STAMPS', y='Cumulative fraction of reads') +
39 | scale_y_continuous(labels = scales::percent) +
40 | theme_classic()
41 | 
42 | if(!is.null(snakemake@input$barcodes))
43 | {
44 | selected_cells <- read.csv(snakemake@input$barcodes, header=FALSE, stringsAsFactors=FALSE)
45 | knee_plot <- knee_plot +
46 | geom_point(data = plot_data[plot_data$Barcode %in% selected_cells$V1,],
47 | aes(x=rank, y=cum_sum, color='Selected'), size=0.1) +
48 | scale_color_manual(values=c('Selected'='green'))
49 | }
50 | ggsave(knee_plot, file=snakemake@output$pdf, width = 4, height = 3)
51 | 
52 | 
53 | if (debug_flag) {
54 | library(gridExtra)
55 | library(grid)
56 | # potentially useful to change the lag of the diff calculation
57 | mylag <- 1
58 | # data <- read.table(file = file, header=FALSE, stringsAsFactors=FALSE)
59 | # head(data,3)
60 | # V1 V2
61 | # 1 1145137 CCCTTCGTCTGC
62 | # 2 1039974 ATAGTTTTTTAA
63 | # 3 912199 GCATGAAACTTC
64 | 
65 | # borrowed from https://stackoverflow.com/questions/6836409/finding-local-maxima-and-minima
66 | localMaxima <- function(x) {
67 | # Use -Inf instead if x is numeric (non-integer)
68 | y <- diff(c(-.Machine$integer.max, x)) > 0L
69 | rle(y)$lengths
70 | y <- cumsum(rle(y)$lengths)
71 | y <- y[seq.int(1L, length(y), 2L)]
72 | if (x[[1]] == x[[2]]) {
73 | y <- y[-1]
74 | }
75 | y
76 | }
77 | 
78 | reads <- data$V1
79 | barcodes <- data$V2
80 | total_reads <- sum(reads)
81 | reads_cumsum <- cumsum(reads)
82 | # 1st derivative (diff) also needs to be padded to keep the same vector length
83 | reads_diff <- c(diff(reads,lag=mylag,differences = 1),rep(0,mylag))
84 | 
85 | # 2nd derivative: twice as much padding:
86 | reads_diff_diff <- c(diff(reads,lag=mylag,differences = 2),rep(0,mylag*2))
87 | reads_cumsum_perc <- (reads_cumsum/total_reads)
88 | x <- 1:length(reads_cumsum_perc)
89 | plot_data <- data.frame(rank = x,
90 | cum_sum = reads_cumsum_perc,
91 | read_count = reads,
92 | Barcode = data$V2,
93 | diff = reads_diff,
94 | diffdiff = reads_diff_diff)
95 | 
96 | # head(plot_data,6)
97 | # rank cum_sum read_count Barcode diff diffdiff
98 | # 1 1 0.007192916 1145137 CCCTTCGTCTGC -105163 -22612
99 | # 2 2 0.013725274 1039974 ATAGTTTTTTAA -127775 14025
100 | # 3 3 0.019455043 912199 GCATGAAACTTC -113750 61486
101 | # 4 4 0.024470318 798449 GTGTGGGTCTCT -52264 34985
102 | # 5 5 0.029157308 746185 CGTACTGACTAC -17279 -60441
103 | # 6 6 0.033735764 728906 GTTCGTCCCGCC -77720 69104
104 | mystats <- paste0("| Nr barcodes total: ", length(barcodes), ' \n ',
105 | "Nr barcodes for 50% reads: ", which.min(reads_cumsum_perc<0.50)," | ",
106 | "Nr barcodes for 95% reads: ", which.min(reads_cumsum_perc<0.95)," | ",
107 | "Nr barcodes for 99% reads: ", which.min(reads_cumsum_perc<0.99)
108 | )
109 | 
110 | # x_scale <- which(reads_cumsum_perc>0.99)[1]
111 | x_scale <- snakemake@params$cells * 4
112 | plot_data_head <- head(plot_data, x_scale)
113 | # plot_data_head$reads_diff_smooth <- predict(loess(diff~rank,data=plot_data_head))
114 | plot_data_head$reads_diffdiff_smooth <- predict(loess(diffdiff~rank,span=0.2,data=plot_data_head))
115 | 
116 | ## Finding the knee in the knee-plot:
117 | # The best approach so far is to calculate the 2nd derivative of the read counts per STAMP (reads_diff_diff), smooth it (loess) and find its maxima. Since there can be several maxima, they are just plotted and it's up to the user to visually assess and decide.
118 | # finding local maxima in 2nd derivative:
119 | # https://stackoverflow.com/questions/6836409/finding-local-maxima-and-minima
120 | loc.max <- localMaxima(plot_data_head$reads_diffdiff_smooth)
121 | # local maxima are then colored green as lines in plots
122 | plot_data_head_sub <- plot_data_head[loc.max,]
123 | # which.peaks(plot_data_head$reads_diff_smooth2)
124 | 
125 | knee_plot_ext <- ggplot(plot_data_head, aes(x=rank, y=cum_sum)) +
126 | xlim(0,x_scale) +
127 | ylim(0,1) +
128 | geom_text(data=plot_data_head_sub,aes(label = round(cum_sum,2)),nudge_y=-0.05, vjust = "inward", hjust = "inward") +
129 | geom_vline(xintercept=snakemake@params$cells, linetype="dashed", color = "red") +
130 | # geom_vline(xintercept=100, linetype="dashed", color = "red") +
131 | geom_vline(xintercept=loc.max, col="lightgreen") +
132 | geom_point(size = 0.1) +
133 | ggtitle(paste0(snakemake@wildcards$sample, '\nTotal reads: ', prettyNum(total_reads), mystats)) +
134 | theme(plot.title = element_text(size=10)) +
135 | labs(x='STAMPS', y='Cumulative fraction of reads')
136 | read_count_plot <- ggplot(plot_data_head, aes(x=rank, y=read_count)) +
137 | geom_text(data=plot_data_head_sub,aes(label = read_count),nudge_y=-0.05, vjust = "inward", hjust = "inward", check_overlap = TRUE) +
138 | # geom_smooth() +
139 | xlim(0,x_scale) +
140 | # ylim(0,1000) +
141 | geom_vline(xintercept=snakemake@params$cells, linetype="dashed", color = "red") +
142 | # geom_vline(xintercept=100, linetype="dashed", color = "red") +
143 | geom_vline(xintercept=loc.max, col="lightgreen") +
144 | geom_point(size = 0.1) +
145 | theme(plot.title = element_text(size=10)) +
146 | labs(x='STAMPS', y='Read counts per STAMP')
147 | 
148 | # knee_plot_ext = knee_plot_ext + scale_y_continuous(labels = scales::percent)
149 | diff_plot <- ggplot(plot_data_head, aes(x=rank, y=diff)) +
150 | geom_text(data=plot_data_head_sub,aes(label = diff),nudge_y=-0.05, vjust = "inward", hjust = "inward", check_overlap = TRUE) +
151 | # geom_smooth() +
152 | xlim(0,x_scale) +
153 | # ylim(0,1000) +
154 | geom_vline(xintercept=snakemake@params$cells, linetype="dashed", color = "red") +
155 | # geom_vline(xintercept=100, linetype="dashed", color = "red") +
156 | geom_vline(xintercept=loc.max, col="lightgreen") +
157 | geom_point(size = 0.1) +
158 | theme(plot.title = element_text(size=10)) +
159 | # 1st derivative: read count diff to next STAMP
160 | labs(x='STAMPS', y="1st derivative of read counts")
161 | diff_diff_plot <- ggplot(plot_data_head,
162 | aes(x=rank, y=reads_diffdiff_smooth)) +
163 | geom_text(data=plot_data_head_sub,
164 | aes(label = rank),nudge_y=-1,
165 | vjust = "inward",
166 | hjust = "inward",
167 | check_overlap = TRUE) +
168 | xlim(0,x_scale) +
169 | ylim(-30,30) +
170 | geom_vline(xintercept=snakemake@params$cells,
171 | linetype="dashed", color = "red") +
172 | # geom_vline(xintercept=100, linetype="dashed", color = "red") +
173 | geom_vline(xintercept=loc.max, col="lightgreen") +
174 | geom_point(size = 0.1) +
175 | theme(plot.title = element_text(size=10)) +
176 | labs(x='STAMPS', y='2nd derivative of read counts')
177 | 
178 | if(!is.null(snakemake@input$barcodes))
179 | {
180 | selected_cells <- read.csv(snakemake@input$barcodes, header=FALSE, stringsAsFactors=FALSE)
181 | knee_plot_ext <- knee_plot_ext +
182 | geom_point(data = plot_data_head[plot_data_head$Barcode %in% selected_cells$V1,],
183 | aes(x=rank, y=cum_sum, color='Selected'), size=0.1) +
184 | scale_color_manual(values=c('Selected'='green')) +
185 | theme(legend.position="none")
186 | diff_diff_plot <- diff_diff_plot +
187 | geom_point(data = plot_data_head[plot_data_head$Barcode %in% selected_cells$V1,],
188 | aes(x=rank, y=reads_diffdiff_smooth, color='Selected'), size=0.1) +
189 | scale_color_manual(values=c('Selected'='green')) +
190 | theme(legend.position="none")
191 | }
192 | # scale_y_continuous(position = "right")
193 | gp1 <- ggplotGrob(knee_plot_ext)
194 | gp2 <- ggplotGrob(read_count_plot)
195 | gp3 <- ggplotGrob(diff_plot)
196 | gp4 <- ggplotGrob(diff_diff_plot)
197 | # grid::grid.newpage()
198 | # gg <- grid::grid.draw(rbind(gp1, gp2, gp3, gp4, size = "last"))
199 | gg <- gridExtra::arrangeGrob(rbind(gp1, gp2, gp3, gp4, size = "last"))
200 | # gg <- gridExtra::arrangeGrob(gp1, gp2, gp3, gp4, ncol = 1)
201 | 
202 | # if barcodes.csv is present in the base directory, only use barcodes in there (rule plot_knee_plot_whitelist in map.smk)
203 | ggsave(gg, file=paste0(snakemake@output$pdf, "_extended.pdf"), width = 9, height = 11)
204 | }
205 | 
206 | if (debug_flag) {
207 | save.image(file = file.path(path_debug,
208 | paste0("plot_knee_plot_workspace_",
209 | attr(snakemake, "wildcard")$sample, ".rdata"))
210 | )
211 | }
212 | 
-------------------------------------------------------------------------------- /scripts/plot_rna_metrics.R: --------------------------------------------------------------------------------
1 | library(ggplot2)
2 | library(tidyr)
3 | library(gridExtra)
4 | library(grid)
5 | library(viridis)
6 | debug_flag <- FALSE
7 | if (snakemake@config$DEBUG) {
8 | debug_flag <- TRUE
9 | message("In debug mode: saving R objects to inspect later")
10 | path_debug <- file.path(snakemake@config$LOCAL$results, "debug")
11 | dir.create(path_debug, showWarnings = FALSE)
12 | save(snakemake, file = file.path(path_debug, "plot_rna_metrics_snakemake.rdata"))
13 | }
14 | 
15 | #### /debug
16 | 
17 | mydata <- read.csv(file = snakemake@input$rna_metrics, header = T,
18 | stringsAsFactors = F, skip = 6, sep = "\t")
19 | mydata <- mydata[order(mydata$PF_ALIGNED_BASES, decreasing = T), ]
20 | mydata_pct <- mydata[, c("READ_GROUP",
21 | "PCT_INTERGENIC_BASES",
22 | "PCT_UTR_BASES",
23 | "PCT_RIBOSOMAL_BASES",
24 | "PCT_INTRONIC_BASES",
25 | "PCT_CODING_BASES")
26 | ]
27 | colnames(mydata_pct) = c('Cell Barcode', 'Intergenic', 'UTR', 'Ribosomal', 'Intronic', 'Coding')
28 | 
29 | mydata <- mydata[, c("READ_GROUP",
30 | "INTERGENIC_BASES",
31 | "UTR_BASES",
32 | "RIBOSOMAL_BASES",
33 | "INTRONIC_BASES",
34 | "CODING_BASES")
35 | ]
36 | colnames(mydata) = c('Cell Barcode', 'Intergenic', 'UTR', 'Ribosomal', 'Intronic', 'Coding')
37 | 
38 | # converting into long format for plotting
39 | mydata_long <- mydata %>% gather("Read Overlap", count, -"Cell Barcode")
40 | 
41 | # Keep the original order of the barcodes using factor and levels.
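# Editor's note (illustrative): without explicit levels, ggplot would reorder
# the x axis alphabetically; factor(x, levels = unique(x)) pins the bars to the
# PF_ALIGNED_BASES ordering established above, e.g.
#   factor(c("b", "a"), levels = unique(c("b", "a")))  # levels: "b" "a"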
42 | mydata_long$`Cell Barcode` <- factor(mydata_long$`Cell Barcode`,
43 | levels = factor(unique(mydata_long$`Cell Barcode`)))
44 | mydata_long$`Read Overlap` <- factor(mydata_long$`Read Overlap`,
45 | levels = unique(mydata_long$`Read Overlap`))
46 | 
47 | p1 <- ggplot(mydata_long, aes(x = `Cell Barcode`, y = count, fill = `Read Overlap`)) +
48 | geom_bar(stat = "identity") +
49 | theme(axis.text.x = element_text(angle = 90, hjust = 0), legend.position = "none")
50 | p1 <- p1 + labs(title = paste(nrow(mydata),
51 | "selected barcodes for",
52 | snakemake@wildcards$sample),
53 | x = "Barcodes", y = "Bases")
54 | p1 <- p1 + theme(axis.title.x = element_blank(),
55 | axis.text.x = element_blank(),
56 | axis.ticks.x = element_blank())
57 | p1 <- p1 + scale_y_continuous(labels = scales::scientific)
58 | p1 <- p1 + scale_fill_viridis(discrete = TRUE, option = "viridis")
59 | 
60 | 
61 | mydata_long_pct <- mydata_pct %>% gather("Read Overlap", fraction, -"Cell Barcode")
62 | # Keep the original order of the barcodes using factor and levels.
63 | mydata_long_pct$`Cell Barcode` <- factor(mydata_long_pct$`Cell Barcode`,
64 | levels = factor(unique(mydata_long_pct$`Cell Barcode`)))
65 | mydata_long_pct$`Read Overlap` <- factor(mydata_long_pct$`Read Overlap`,
66 | levels = unique(mydata_long_pct$`Read Overlap`))
67 | 
68 | p2 <- ggplot(mydata_long_pct, aes(x = `Cell Barcode`, y = fraction, fill = `Read Overlap`)) +
69 | geom_bar(stat = "identity") +
70 | theme(axis.text.x = element_text(angle = 90, hjust = 0, size=8, vjust = 0.05), legend.position = "bottom") +
71 | labs(x = "Barcodes", y = "%Bases") +
72 | scale_y_continuous(labels = scales::percent) + scale_fill_viridis(discrete = TRUE, option = "viridis")
73 | # This aligns the two plots so that the top panel can be related directly to the barcode labels of the bottom one.
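# Editor's note: ggplotGrob() converts each plot to a grid graphical object;
# rbind(gp1, gp2, size = "last") (used below) stacks them while forcing both
# panels onto the widths of the last grob, so the barcode columns line up.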
74 | gp1 <- ggplotGrob(p1) 75 | gp2 <- ggplotGrob(p2) 76 | pdf(file = snakemake@output$pdf, width = 16, height = 13) 77 | grid::grid.newpage() 78 | grid::grid.draw(rbind(gp1, gp2, size = "last")) 79 | dev.off() 80 | 81 | if (debug_flag) { 82 | save.image(file = file.path(path_debug, "plot_rna_metrics_workspace.rdata")) 83 | } 84 | -------------------------------------------------------------------------------- /scripts/plot_species_plot.R: -------------------------------------------------------------------------------- 1 | # Functions used to plot the species plot for drop-seq mixed protocol 2 | # Authors: James Nemesh, Roelli Patrick, Sebastian Y Mueller 3 | 4 | debug_flag <- FALSE 5 | if (snakemake@config$DEBUG) { 6 | debug_flag <- TRUE 7 | message("In debug mode: saving R objects to inspect later") 8 | path_debug <- file.path(snakemake@config$LOCAL$results, "debug") 9 | dir.create(path_debug, showWarnings = FALSE) 10 | save(snakemake, file = file.path( 11 | path_debug, 12 | paste0("plot_species_plot_snakemake_", attr(snakemake, "wildcard")$sample, ".rdata") 13 | )) 14 | } 15 | 16 | #### /debug 17 | 18 | categorizeCellsUsingKneeKnownNumCellsPaper <- function( 19 | digitalExpressionFileO1, 20 | digitalExpressionFileO2, 21 | organismOne, 22 | organismTwo, 23 | pureRatio = 0.2, 24 | numCells, 25 | numBeads, 26 | point.cex = 1.5, 27 | xlim_range = NULL, 28 | ylim_range = NULL, 29 | category = "transcripts") { 30 | dfFull <- getNumTranscriptsPerCellBarcodeByOrganismPair( 31 | digitalExpressionFileO1, 32 | digitalExpressionFileO2, 33 | organismOne, 34 | organismTwo, 35 | category) 36 | dfFull <- dfFull[order(dfFull$total, decreasing = T), ] 37 | dfFull$ratio_one <- dfFull[, 2] / dfFull[, 4] 38 | dfFull <- head(dfFull, n = numBeads) 39 | df <- head(dfFull, n = numCells) 40 | 41 | dfNoCall <- dfFull[-1:-numCells, ] 42 | if (dim(dfNoCall)[1] > 0) { 43 | dfNoCall$organism <- "No Call" 44 | } 45 | 46 | df$organism <- "Mixed" 47 | 48 | idx <- which(df$ratio_one >= (1 - pureRatio)) 49 | # checks if the species is actually assigned at all 50 | if (length(idx) > 0) { 51 | df[idx, ]$organism <- organismOne 52 | } 53 | idx <- which(df$ratio_one <= (pureRatio)) 54 | if (length(idx) > 0) { 55 | df[idx, ]$organism <- organismTwo 56 | } 57 | 58 | result <- rbind(df, dfNoCall) 59 | 60 | maxRange <- max(result[, 2], result[, 3]) 61 | 62 | dforganismOne <- result[result$organism == organismOne, ] 63 | dforganismTwo <- result[result$organism == organismTwo, ] 64 | dfMixed <- result[result$organism == "Mixed", ] 65 | dfNoCall <- result[result$organism == "No Call", ] 66 | 67 | if (is.null(xlim_range)) { 68 | xlim_range <- c(0, maxRange) 69 | } 70 | 71 | if (is.null(ylim_range)) { 72 | ylim_range <- c(0, maxRange) 73 | } 74 | colors <- c("blue", "red", "purple", "grey") 75 | plot(dforganismOne[, 2], dforganismOne[, 3], col = colors[1], 76 | pch = 16, xlim = xlim_range, ylim = ylim_range, 77 | xlab = paste(organismOne, category), 78 | ylab = paste(organismTwo, category), 79 | cex = point.cex) 80 | points(dforganismTwo[, 2], dforganismTwo[, 3], 81 | col = colors[2], pch = 16, cex = point.cex) 82 | points(dfMixed[, 2], dfMixed[, 3], 83 | col = colors[3], pch = 16, cex = point.cex) 84 | points(dfNoCall[, 2], dfNoCall[, 3], 85 | col = colors[4], pch = 16, cex = point.cex) 86 | l <- c(paste(organismOne, dim(dforganismOne)[1]), 87 | paste(organismTwo, dim(dforganismTwo)[1]), 88 | paste("Mixed", dim(dfMixed)[1]), 89 | paste("No Call", dim(dfNoCall)[1])) 90 | legend("topright", legend = l, fill = colors) 91 | 
title(paste("Species plot based on", category)) 92 | return(df) 93 | } 94 | 95 | getNumTranscriptsPerCellBarcodeByOrganismPair <- function( 96 | digitalExpressionFileO1, 97 | digitalExpressionFileO2, 98 | organismOne, 99 | organismTwo, 100 | category) { 101 | if (is.null(organismOne) || is.null(organismTwo)) { 102 | return(NULL) 103 | } 104 | 105 | o1 <- getGenesAndTranscriptsPerCellBarcode(digitalExpressionFileO1) 106 | o2 <- getGenesAndTranscriptsPerCellBarcode(digitalExpressionFileO2) 107 | 108 | commonBC <- union(o1$cellBC, o2$cellBC) 109 | o1p <- o1[match(commonBC, o1$cellBC), ] 110 | o2p <- o2[match(commonBC, o2$cellBC), ] 111 | if (category == "genes") { 112 | df <- data.frame(tag = commonBC, o1Count = o1p$numGenes, 113 | o2Count = o2p$numGenes, stringsAsFactors = F) 114 | } 115 | else { 116 | df <- data.frame(tag = commonBC, o1Count = o1p$numTranscripts, 117 | o2Count = o2p$numTranscripts, stringsAsFactors = F) 118 | } 119 | 120 | idx1 <- which(is.na(df$o1Count)) 121 | idx2 <- which(is.na(df$o2Count)) 122 | if (length(idx1) > 0) df[idx1, ]$o1Count <- 0 123 | if (length(idx2) > 0) df[idx2, ]$o2Count <- 0 124 | 125 | df$total <- apply(df[, 2:3], 1, sum, na.rm = T) 126 | df <- df[order(df$total, decreasing = T), ] 127 | colnames(df)[2] <- organismOne 128 | colnames(df)[3] <- organismTwo 129 | return(df) 130 | } 131 | 132 | 133 | getGenesAndTranscriptsPerCellBarcode <- function(digitalExpressionFile) { 134 | a <- read.table(digitalExpressionFile, header = T, stringsAsFactors = F) 135 | colnames(a) <- c("cellBC", "numGenicReads", "numTranscripts", "numGenes") 136 | return(a) 137 | } 138 | 139 | digitalExpressionFileO1 <- snakemake@input[[1]][1] 140 | digitalExpressionFileO2 <- snakemake@input[[2]][1] 141 | 142 | num_cells <- snakemake@params$expected_cells 143 | 144 | organismOne <- names(snakemake@config$META$species)[1] 145 | organismTwo <- names(snakemake@config$META$species)[2] 146 | 147 | par(mar = c(5, 4, 4, 2) + 0.5) 148 | 149 | pdf(snakemake@output$genes_pdf, height = 8, width = 8) 150 | df_temp <- categorizeCellsUsingKneeKnownNumCellsPaper( 151 | digitalExpressionFileO1, 152 | digitalExpressionFileO2, 153 | organismOne = organismOne, 154 | organismTwo = organismTwo, 155 | pureRatio = snakemake@config$META$ratio, 156 | numCells = num_cells, 157 | numBeads = num_cells * 2, 158 | point.cex = 1, 159 | category = "genes" 160 | ) 161 | dev.off() 162 | 163 | 164 | 165 | pdf(snakemake@output$transcripts_pdf, height = 8, width = 8) 166 | df <- categorizeCellsUsingKneeKnownNumCellsPaper( 167 | digitalExpressionFileO1, 168 | digitalExpressionFileO2, 169 | organismOne = organismOne, 170 | organismTwo = organismTwo, 171 | pureRatio = snakemake@config$META$ratio, 172 | numCells = num_cells, 173 | numBeads = num_cells * 2, 174 | point.cex = 1, 175 | category = "transcripts" 176 | ) 177 | dev.off() 178 | organism1 <- subset(df, df$organism == organismOne) 179 | organism2 <- subset(df, df$organism == organismTwo) 180 | 181 | write.table(organism1$tag, snakemake@output$barcodes_species[1], 182 | row.names = F, col.names = F, quote = F) 183 | write.table(organism2$tag, snakemake@output$barcodes_species[2], 184 | row.names = F, col.names = F, quote = F) 185 | 186 | # save.image(paste0(snakemake@output$genes_pdf,".rdata")) 187 | if (debug_flag) { 188 | save.image(file = file.path( 189 | path_debug, 190 | paste0( 191 | "plot_species_plot_workspace_", 192 | attr(snakemake, "wildcard")$sample, ".rdata" 193 | ) 194 | )) 195 | } 196 | 
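# For reference, getGenesAndTranscriptsPerCellBarcode() above expects the
# dropseq_tools DGE summary layout read by read.table(): one row per cell
# barcode with genic read, transcript (UMI) and gene counts, e.g.
# (illustrative values only):
#   CELL_BARCODE  NUM_GENIC_READS  NUM_TRANSCRIPTS  NUM_GENES
#   ACGTACGTACGT  1200             800              350
# A hypothetical standalone call outside of snakemake could then look like:
#   df <- categorizeCellsUsingKneeKnownNumCellsPaper(
#     "hum_dge_summary.txt", "mus_dge_summary.txt",
#     organismOne = "homo_sapiens", organismTwo = "mus_musculus",
#     pureRatio = 0.2, numCells = 100, numBeads = 200)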
-------------------------------------------------------------------------------- /scripts/plot_violine.R: --------------------------------------------------------------------------------
1 | #' ---
2 | #' title: plot_violine.R
3 | #' author: Sebastian Mueller (sebm_at_posteo.de)
4 | #' date: 2018-04-10
5 | #' ---
6 | ### for debug
7 | # To inspect the snakemake object, first invoke snakemake and save the session automatically.
8 | # Since there are no debug flags to my knowledge, just uncomment the line below and run snakemake, which
9 | # creates an R workspace file that can be loaded into a custom R session.
10 | # save.image(file="R_workspace_debug.rdata")
11 | # load("R_workspace_debug.rdata")
12 | #### /debug
13 | debug_flag <- FALSE
14 | if (snakemake@config$DEBUG) {
15 |   debug_flag <- TRUE
16 |   message("In debug mode: saving R objects to inspect later")
17 |   path_debug <- file.path(snakemake@config$LOCAL$results, "debug")
18 |   dir.create(path_debug, showWarnings = FALSE)
19 |   save(snakemake, file = file.path(path_debug, "plot_violin_snakemake.rdata"))
20 | }
21 | 
22 | 
23 | options(warn = -1)
24 | library(plyr, quietly = TRUE, warn.conflicts = FALSE)
25 | library(dplyr, quietly = TRUE, warn.conflicts = FALSE) # Dataframe manipulation
26 | library(Matrix, quietly = TRUE, warn.conflicts = FALSE) # Sparse matrices
27 | library(stringr, quietly = TRUE, warn.conflicts = FALSE)
28 | library(RColorBrewer, quietly = TRUE, warn.conflicts = FALSE)
29 | library(devtools, quietly = TRUE, warn.conflicts = FALSE)
30 | library(Seurat, quietly = TRUE, warn.conflicts = FALSE)
31 | library(plotly, quietly = TRUE, warn.conflicts = FALSE)
32 | 
33 | # rule map in Snakefile
34 | # rule map:
35 | #     input:
36 | #         'plots/violinplots_comparison_UMI.pdf',
37 | #         ...
38 | 
39 | # importing UMI
40 | # importing counts ( summary/counts_expression_matrix.tsv )
41 | 
42 | ReadMTX <- function(mtx_path) {
43 |   data_dir <- dirname(mtx_path)
44 |   files <- list.files(data_dir)
45 |   # Find files
46 |   barcodes_file <- grep("barcodes", files, value = TRUE)
47 |   features_file <- grep(pattern = "genes|features", x = files, value = TRUE)
48 |   mtx <- grep("mtx", files, value = TRUE)
49 |   # load the data
50 |   data <- readMM(file.path(data_dir, mtx))
51 |   barcodes <- read.csv(file.path(data_dir, barcodes_file), header = FALSE)$V1
52 |   features <- read.csv(file.path(data_dir, features_file), header = FALSE)$V1
53 | 
54 |   colnames(data) <- barcodes
55 |   rownames(data) <- features
56 |   return(data)
57 | }
58 | 
59 | #count_matrix <- ReadMTX(snakemake@input$counts)
60 | # importing UMIs ( summary/umi_expression_matrix.tsv )
61 | #umi_matrix <- ReadMTX(snakemake@input$UMIs)
62 | 
63 | count_matrix <- Read10X(file.path(snakemake@wildcards$results_dir, 'summary', 'read'))
64 | umi_matrix <- Read10X(file.path(snakemake@wildcards$results_dir, 'summary', 'umi'))
65 | 
66 | design <- read.csv(snakemake@input$design,
67 |                    stringsAsFactors = TRUE,
68 |                    header = TRUE,
69 |                    row.names = NULL
70 | )
71 | metaData <- data.frame(cellNames = colnames(umi_matrix)) %>%
72 |   mutate(samples = factor(str_replace(cellNames, "_[^_]*$", ""))) %>%
73 |   mutate(barcode = factor(str_replace(cellNames, ".+_", ""))) %>%
74 |   left_join(design, by = "samples")
75 | rownames(metaData) <- metaData$cellNames
76 | 
77 | # possible to set is.expr = -1 to avoid filtering whilst creating
78 | # seuratobj <- CreateSeuratObject(raw.data = umi_matrix, meta.data = metaData, is.expr = -1)
79 | seuratobj <- CreateSeuratObject(raw.data = umi_matrix, meta.data = metaData)
80 | seuratobj <- SetAllIdent(object = seuratobj, id = "samples")
81 | # relabel cell identity (https://github.com/satijalab/seurat/issues/380)
82 | seuratobj@meta.data$orig.ident <- seuratobj@meta.data$samples
83 | 
84 | mycount <- CreateSeuratObject(raw.data = count_matrix, meta.data = metaData)
85 | mycount <- SetAllIdent(object = mycount, id = "samples")
86 | mycount@meta.data$orig.ident <- mycount@meta.data$samples
87 | # turn off filtering
88 | # note, the @meta.data slot contains useful summary stuff
89 | # head(mycount@meta.data,2)
90 | # nGene nUMI expected_cells read_length barcode
91 | # dropseqLib1_ACTAACATTATT 15 33 400 100 ACTAACATTATT
92 | # dropseqLib1_GAGTCTGAGGCG 5 9 400 100 GAGTCTGAGGCG
93 | # origin origin
94 | # dropseqLib1_ACTAACATTATT dropseqLib1 dropseqLib1
95 | # dropseqLib1_GAGTCTGAGGCG dropseqLib1 dropseqLib1
96 | meta.data <- seuratobj@meta.data
97 | # combining UMIs and Counts into one Seurat object
98 | meta.data$nCounts <- mycount@meta.data$nUMI
99 | seuratobj@meta.data <- meta.data
100 | # delete since Counts have been added to seuratobj as nCounts column
101 | rm(mycount)
102 | 
103 | 
104 | # mytheme <- theme_bw(base_size = 9) +
105 | mytheme <- theme_bw() +
106 |   theme(
107 |     legend.position = "right",
108 |     axis.ticks = element_blank(),
109 |     axis.text.x = element_text(angle = 300, hjust = 0)
110 |   )
111 | theme_set(mytheme)
112 | 
113 | # predefined ggplot layers for subsequent plots
114 | gglayers <- list(
115 |   geom_smooth(method = "loess"),
116 |   geom_point(size = .5),
117 |   scale_y_continuous(
118 |     labels = scales::unit_format(unit = "", scale = 1e-3, digits = 2),
119 |     breaks = scales::pretty_breaks(n = 8)
120 |   ),
121 |   scale_x_continuous(
122 |     labels = scales::unit_format(unit = "", scale = 1e-3, digits = 2),
123 |     breaks = scales::pretty_breaks(n = 8)
124 |   )
125 | )
126 | 
127 | gg <- ggplot(meta.data, aes(x = nUMI, y = nCounts, color = orig.ident)) +
128 |   # coord_trans(y="log10",x = "log10") +
129 |   gglayers +
130 |   geom_abline(intercept = 0, slope = 1) +
131 |   labs(
132 |     title = "UMI counts vs raw Counts",
133 |     subtitle = "Number of UMIs and raw Counts for each Bead",
134 |     x = "Number of UMIs per Bead [k]",
135 |     y = "Number of Counts per Bead [k]"
136 |   )
137 | 
138 | # dev.new()
139 | # htmlwidgets::saveWidget(ggplotly(gg), file.path(getwd(), snakemake@output$html_umivscounts))
140 | ggsave(gg, file = file.path(getwd(), snakemake@output$pdf_umivscounts), width = 12, height = 7)
141 | 
142 | # how about unaligned reads/UMI?
143 | # Note(Seb): raw.data is actually filtered data, i.e. the number of genes is likely to be smaller than in the input data!
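# The case-insensitive prefix patterns below cover both mouse- and human-style
# gene symbols: "^mt-" matches e.g. "mt-Nd1" as well as "MT-ND1", while
# "^Rps"/"^Rpl" match small/large ribosomal subunit genes such as "Rps6" or "RPL13A".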
144 | mito.gene.names <- grep("^mt-", rownames(seuratobj@raw.data), value = TRUE, ignore.case = TRUE)
145 | sribo.gene.names <- grep("^Rps", rownames(seuratobj@raw.data), value = TRUE, ignore.case = TRUE)
146 | lribo.gene.names <- grep("^Rpl", rownames(seuratobj@raw.data), value = TRUE, ignore.case = TRUE)
147 | 
148 | col.total <- Matrix::colSums(seuratobj@raw.data)
149 | meta.data$col.total <- col.total
150 | 
151 | seuratobj.top_50 <- apply(seuratobj@raw.data, 2, function(x) sum(x[order(x, decreasing = TRUE)][1:50]) / sum(x))
152 | # mycount.top_50 <- apply(mycount@raw.data, 2, function(x) sum(x[order(x, decreasing = TRUE)][1:50])/sum(x))
153 | 
154 | seuratobj <- AddMetaData(seuratobj, Matrix::colSums(seuratobj@raw.data[sribo.gene.names, ]) / col.total, "pct.sribo")
155 | seuratobj <- AddMetaData(seuratobj, Matrix::colSums(seuratobj@raw.data[lribo.gene.names, ]) / col.total, "pct.lribo")
156 | seuratobj <- AddMetaData(seuratobj, Matrix::colSums(seuratobj@raw.data[unique(c(sribo.gene.names, lribo.gene.names)), ]) / col.total, "pct.Ribo")
157 | seuratobj <- AddMetaData(seuratobj, Matrix::colSums(seuratobj@raw.data[mito.gene.names, ]) / col.total, "pct.mito")
158 | seuratobj <- AddMetaData(seuratobj, seuratobj.top_50, "top50")
159 | tmp <- seuratobj@meta.data$nUMI / seuratobj@meta.data$nGene
160 | names(tmp) <- rownames(seuratobj@meta.data)
161 | seuratobj <- AddMetaData(seuratobj, tmp, "umi.per.gene")
162 | 
163 | 
164 | gg <- VlnPlot(seuratobj,
165 |               c("nUMI", "nGene", "top50", "umi.per.gene", "pct.Ribo", "pct.mito"),
166 |               x.lab.rot = TRUE, do.return = TRUE
167 | )
168 | # ggsave(gg,file=file.path("violinplots_comparison_UMI.pdf"),width=18,height=18)
169 | ggsave(gg, file = snakemake@output$pdf_violine, width = 18, height = 18)
170 | # gg <- VlnPlot(mycount,c("nUMI", "nGene", "top50", "count.per.gene","pct.Ribo", "pct.mito"), x.lab.rot = TRUE, do.return = TRUE)
171 | # ggsave(gg,file=file.path("violinplots_comparison_count.pdf"),width=18,height=18)
172 | 
173 | # gg <- GenePlot(object = seuratobj, gene1 = "nUMI", gene2 = "nGene")
174 | # ggsave(gg,file=file.path("violinplots_comparison.pdf"),width=18,height=18)
175 | 
176 | 
177 | gg <- ggplot(meta.data, aes(x = nUMI, y = nGene, color = orig.ident)) +
178 |   gglayers +
179 |   labs(
180 |     title = "Genes (pooled mouse and human set) vs UMIs for each bead",
181 |     x = "Number of UMIs per Bead [k]",
182 |     y = "Number of Genes per Bead [k]"
183 |   )
184 | 
185 | # dev.new()
186 | # htmlwidgets::saveWidget(ggplotly(gg),
187 | #   file.path(getwd(), snakemake@output$html_umi_vs_gene))
188 | ggsave(gg, file = snakemake@output$pdf_umi_vs_gene, width = 12, height = 7)
189 | 
190 | 
191 | 
192 | ################################################################################
193 | ## same for Counts instead of UMIs (using the nCounts column added above)
194 | gg <- ggplot(meta.data, aes(x = nCounts, y = nGene, color = orig.ident)) +
195 |   gglayers +
196 |   labs(
197 |     title = "Genes (pooled mouse and human set) vs Counts for each bead",
198 |     x = "Number of Counts per Bead [k]",
199 |     y = "Number of Genes per Bead [k]"
200 |   )
201 | 
202 | # dev.new()
203 | # htmlwidgets::saveWidget(ggplotly(gg),
204 | #   file.path(getwd(), snakemake@output$html_count_vs_gene))
205 | 
206 | ggsave(gg, file = snakemake@output$pdf_count_vs_gene, width = 12, height = 7)
207 | 
208 | 
209 | # head(meta.data,2)
210 | # nGene nUMI cellNames samples barcode expected_cells read_length batch orig.ident pct.sribo pct.lribo pct.Ribo pct.mito top50 umi.per.gene
211 | # sample1_GAGTCTGAGGCG 6 6 sample1_GAGTCTGAGGCG sample1 GAGTCTGAGGCG 100 100 batch1 sample1 0.0000000 0.00000000 0.0000000 0.0000000 1.0000000 1.000000
212 | # sample1_CAGCCCTCAGTA 264 437 sample1_CAGCCCTCAGTA sample1 CAGCCCTCAGTA 100 100 batch1 sample1 0.0389016 0.07551487 0.1144165 0.0228833 0.5102975 1.655303
213 | 
214 | # saving snakemake meta information into misc slot so all can be exported as one object
215 | seuratobj@misc <- snakemake
216 | # exporting R Seurat objects into summary/R_Seurat_objects.rdata
217 | saveRDS(seuratobj, file = file.path(snakemake@output$R_objects))
218 | 
219 | if (debug_flag) {
220 |   save.image(file = file.path(path_debug, "plot_violin_workspace.rdata"))
221 | }
222 | 
-------------------------------------------------------------------------------- /scripts/plot_yield.R: --------------------------------------------------------------------------------
1 | #------------------------------------ for debugging:
2 | # For debugging add the following line in config.yaml (without the #)
3 | # DEBUG: True
4 | # This will create R objects in the debug directory containing the snakemake
5 | # object, which can be loaded into a custom R session as below:
6 | debug_flag <- FALSE
7 | if (snakemake@config$DEBUG) {
8 |   debug_flag <- TRUE
9 |   message("In debug mode: saving R objects to inspect later")
10 |   path_debug <- file.path(snakemake@config$LOCAL$results, "debug")
11 |   dir.create(path_debug, showWarnings = FALSE)
12 |   save(snakemake, file = file.path(path_debug, "plot_yield_snakemake.rdata"))
13 | }
14 | #------------------------------------ debugging
15 | 
16 | library(ggplot2)
17 | library(tidyr)
18 | library(grid)
19 | library(gridExtra)
20 | library(viridis)
21 | library(stringr)
22 | 
23 | samples <- snakemake@params$sample_names
24 | batches <- snakemake@params$batches
25 | mydata <- data.frame(matrix(nrow = length(samples), ncol = 7))
26 | colnames(mydata) <- c("Sample", "Batch", "Cutadapt filtered", "Unmapped",
27 |                       "Multi mapped", "Uniquely mapped", "Total reads")
28 | mydata[, "Sample"] <- samples
29 | mydata[, "Batch"] <- batches
30 | for (i in 1:length(samples)) {
31 |   # Input files and variables
32 |   STAR_output <- read.table(snakemake@input$STAR_output[i],
33 |                             skip = 5, sep = "\t",
34 |                             fill = TRUE, stringsAsFactors = FALSE)
35 |   mysample <- samples[i]
36 |   # Read files
37 |   # bbmap_log = read.table(snakemake@input$repaired[i], sep=':', header=FALSE, skip=8, row.names=1, nrows=4)
38 |   # reads_after_filtering = as.numeric(str_match(bbmap_log['Pairs',], pattern = "\t([0-9]{1,20}) reads.*")[,2])/2
39 |   bbmap_log <- readLines(snakemake@input$repaired[i])
40 |   reads_after_filtering <- as.numeric(str_match(bbmap_log[grep("^Pairs:", bbmap_log)],
41 |                                                 pattern = "\t([0-9]{1,20}) reads.*")[, 2]) / 2
42 |   R1_filtered <- read.table(snakemake@input$R1_filtered[i], header = FALSE, skip = 7, sep = ":", nrows = 7, row.names = 1)
43 |   total_reads <- as.numeric(str_replace_all(R1_filtered["Total reads processed", ], pattern = (" |,"), ""))
44 | 
45 |   # R2_filtered = read.table(snakemake@input$R2_filtered[i], header = FALSE, skip=8, sep=':', nrows=7, row.names=1)
46 | 
47 |   # R1_adapters = as.numeric(str_remove_all(str_match(R1_filtered['Reads with adapters',], pattern = "(.*) \\(")[,2], pattern = (' |,')))
48 |   # R1_too_short = as.numeric(str_remove_all(str_match(R1_filtered['Reads that were too short',], pattern = "(.*) \\(")[,2], pattern = (' |,')))
49 |   # R1_passed = as.numeric(str_remove_all(str_match(R1_filtered['Reads written (passing filters)',], pattern = "(.*) \\(")[,2], pattern = (' |,')))
50 |   # R1_filtered = total_reads - R1_passed
51 | 
52 |   # R2_adapters = as.numeric(str_remove_all(str_match(R2_filtered['Reads with adapters',], pattern = "(.*) \\(")[,2], pattern = (' |,')))
53 |   # R2_too_short = as.numeric(str_remove_all(str_match(R2_filtered['Reads that were too short',], pattern = "(.*) \\(")[,2], pattern = (' |,')))
54 |   # R2_passed = as.numeric(str_remove_all(str_match(R2_filtered['Reads written (passing filters)',], pattern = "(.*) \\(")[,2], pattern = (' |,')))
55 |   # R2_filtered = total_reads - R2_passed
56 | 
57 |   mydata[which(mydata$Sample == mysample), "Cutadapt filtered"] <- total_reads - reads_after_filtering
58 |   mydata[which(mydata$Sample == mysample), "Total reads"] <- total_reads
59 | 
60 |   # STAR output
61 |   reads_in <- as.numeric(STAR_output$V2[1])
62 |   uniquely_mapped <- as.numeric(STAR_output$V2[4])
63 |   multi_mapped <- as.numeric(STAR_output$V2[19])
64 |   unmapped <- reads_in - uniquely_mapped - multi_mapped
65 | 
66 |   mydata[which(mydata$Sample == mysample), "Uniquely mapped"] <- uniquely_mapped
67 |   mydata[which(mydata$Sample == mysample), "Multi mapped"] <- multi_mapped
68 |   mydata[which(mydata$Sample == mysample), "Unmapped"] <- unmapped
69 | }
70 | 
71 | # tidyr version
72 | mydata_long <- mydata %>% gather(variable, value, -Sample, -Batch)
73 | # melt will be retired, use gather instead: https://github.com/hadley/reshape
74 | # Force factor order.
75 | mydata_long$variable <- factor(mydata_long$variable, levels = c('Cutadapt filtered', 'Multi mapped', 'Total reads', 'Unmapped', 'Uniquely mapped'))
76 | color_palette <- c('#e88270', '#cb7262', '#ae6254', '#70d6e8')
77 | 
78 | 
79 | p1 <- ggplot(subset(mydata_long, mydata_long$variable != "Total reads"),
80 |              aes(x = Sample, y = value, fill = variable)) +
81 |   geom_bar(stat = "identity") +
82 |   theme(axis.text.x = element_text(angle = 90, hjust = 0)) +
83 |   labs(title = paste("Yield of all the reads for each category"),
84 |        x = "Samples",
85 |        y = "Number of reads") +
86 |   theme(axis.title.x = element_blank(),
87 |         axis.text.x = element_blank(),
88 |         axis.ticks.x = element_blank(),
89 |         legend.position = "none",
90 |         plot.title = element_text(size = 20, face = "bold")) +
91 |   facet_grid(~Batch, scales = "free") +
92 |   scale_fill_viridis(discrete = TRUE, option = "viridis") +
93 |   scale_y_continuous(labels = scales::scientific)
94 | 
95 | mydata_pct <- mydata[, -c(1, 2)] / mydata[, "Total reads"]
96 | mydata_pct <- cbind(Sample = mydata[, "Sample"],
97 |                     Batch = mydata[, "Batch"], mydata_pct)
98 | 
99 | mydata_long_pct <- mydata_pct %>% gather(variable, value, -Sample, -Batch)
100 | 
101 | mydata_long_pct$variable <- factor(mydata_long_pct$variable, levels = c('Cutadapt filtered', 'Multi mapped', 'Total reads', 'Unmapped', 'Uniquely mapped'))
102 | 
103 | p2 <- ggplot(subset(mydata_long_pct, mydata_long_pct$variable != "Total reads"),
104 |              aes(x = Sample, y = value, fill = variable)) +
105 |   labs(fill = "Filters") +
106 |   geom_bar(stat = "identity") +
107 |   theme(axis.text.x = element_text(angle = 90, hjust = 0),
108 |         legend.position = "bottom",
109 |         strip.background = element_blank(),
110 |         strip.text.x = element_blank()) +
111 |   labs(x = "Samples",
112 |        y = "Percentage of reads") +
113 |   facet_grid(~Batch, scales = "free") +
114 |   scale_fill_viridis(discrete = TRUE, option = "viridis") +
115 |   scale_y_continuous(labels = scales::percent)
116 | 
117 | gp1 <- ggplotGrob(p1)
118 | gp2 <- ggplotGrob(p2)
119 | 
120 | pdf(file = snakemake@output$pdf, width = 16, height = 13)
121 | grid::grid.newpage()
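# rbind() on the two gtables stacks them while size = "last" forces a common
# panel width, so the bars of both panels line up. A plain
# gridExtra::grid.arrange(gp1, gp2, ncol = 1) would also stack the plots but
# does not align panel widths, hence the gtable approach below.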
122 | grid::grid.draw(rbind(gp1, gp2, size = "last"))
123 | dev.off()
124 | 
125 | if (debug_flag) {
126 |   save.image(file = file.path(path_debug, "plot_yield_workspace.rdata"))
127 | }
128 | 
-------------------------------------------------------------------------------- /scripts/publication_text.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Publication_text"
3 | output: html_document
4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
5 | ---
6 | 
7 | ```{r libraries, message=FALSE, warning=FALSE, include=FALSE}
8 | library(yaml)
9 | ```
10 | 
11 | ```{r load_yaml, message=FALSE, warning=FALSE, include=FALSE, paged.print=FALSE}
12 | versions = list()
13 | for (yaml_file in snakemake@input$yaml_files){
14 |   current_env = yaml.load_file(yaml_file)
15 |   for (package in current_env$dependencies){
16 |     if(grepl(pattern = 'cutadapt', package)){
17 |       versions[['cutadapt']] = strsplit(package,'=|==| ==| == ')[[1]][2]
18 |     }
19 |     else if(grepl(pattern = 'star', package)){
20 |       versions[['star']] = strsplit(package,'=|==| ==| == ')[[1]][2]
21 |     }
22 |     else if(grepl(pattern = 'dropseq_tools', package)){
23 |       versions[['dropseq_tools']] = strsplit(package,'=|==| ==| == ')[[1]][2]
24 |     }
25 |     else if(grepl(pattern = 'bbmap', package)){
26 |       versions[['bbmap']] = strsplit(package,'=|==| ==| == ')[[1]][2]
27 |     }
28 |   }
29 | }
30 | 
31 | umi_distance=snakemake@config$EXTRACTION$`UMI-edit-distance`
32 | ```
33 | 
34 | Pipeline
35 | --------------------------
36 | Data was processed using dropSeqPipe `r paste0('v',snakemake@config$version)`. The parameters used are provided in the configuration file in the repository XXXXXX. Rerunning the pipeline can easily be done by following the instructions at this address: https://hoohm.github.io/dropSeqPipe/
37 | 
38 | Trimming and filtering
39 | --------------------------
40 | Read trimming and filtering were performed with cutadapt `r paste0('v',versions[['cutadapt']])` on both fastq files separately. Reads with a missing mate were discarded using bbmap `r paste0('v',versions[['bbmap']])`.
41 | 
42 | 
43 | Mapping
44 | --------------------------
45 | Mapping was performed with STAR `r paste0('v',versions[['star']])`. Multimapped reads were discarded. Annotation release `r paste0(snakemake@config$META$species[[1]]$release)` and genome build `r paste0(snakemake@config$META$species[[1]]$build)` for `r paste0(names(snakemake@config$META$species)[1])` were downloaded from Ensembl.
46 | 
47 | 
48 | Barcodes
49 | --------------------------
50 | Demultiplexing as well as file manipulation were performed using dropseq_tools `r paste0('v',versions[['dropseq_tools']])`. We used an edit distance of 1 base for cell barcodes and `r paste0(umi_distance)` for UMI barcodes.
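<!-- Example of the version strings parsed in the load_yaml chunk above
(illustrative only): a conda environment dependency such as "cutadapt=1.16"
is split on '=' so that versions[['cutadapt']] becomes "1.16". -->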
-------------------------------------------------------------------------------- /scripts/repair_barcodes.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import pysam 3 | 4 | def load_obj(name): 5 | with open(name, 'rb') as f: 6 | return pickle.load(f) 7 | 8 | def save_obj(obj, name): 9 | with open(name, 'wb') as f: 10 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 11 | 12 | 13 | infile_bam = pysam.AlignmentFile(snakemake.input.bam, "rb") 14 | outfile = pysam.AlignmentFile(snakemake.output.bam, "wb", template=infile_bam) 15 | 16 | mapping = load_obj(snakemake.input.barcode_mapping) 17 | barcode_ref = load_obj(snakemake.input.barcode_ref) 18 | barcode_ext_ref = load_obj(snakemake.input.barcode_ext_ref) 19 | unknown_barcodes = set() 20 | 21 | for bam_read in infile_bam: 22 | barcode = bam_read.get_tag('XC') 23 | #lane_number = bam_read.query_name.split(':')[3] 24 | if barcode in barcode_ref: 25 | mapping[0][barcode]['count'] += 1 26 | #mapping[0][barcode]['lanes'][lane_number] += 1 27 | outfile.write(bam_read) 28 | continue 29 | elif barcode in barcode_ext_ref: 30 | # The barcode is in our extended reference. Change the barcode to the original one 31 | reference_barcode = mapping[1][barcode]['ref'] 32 | mapping[1][barcode]['count'] += 1 33 | #mapping[1][barcode]['lanes'][lane_number] += 1 34 | bam_read.set_tag('XC',reference_barcode,value_type='Z',replace=True) 35 | outfile.write(bam_read) 36 | continue 37 | else: 38 | # If the barcode is not found in the extended ref, then don't modify it. 39 | if barcode in unknown_barcodes: 40 | mapping['unknown'][barcode]['count'] += 1 41 | #mapping['unknown'][barcode]['lanes'][lane_number] += 1 42 | else: 43 | #mapping['unknown'][barcode] = {'count':1, 'lanes':{'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0}} 44 | mapping['unknown'][barcode] = {'count':1} 45 | #mapping['unknown'][barcode]['lanes'][lane_number] += 1 46 | unknown_barcodes.add(barcode) 47 | outfile.write(bam_read) 48 | 49 | save_obj(obj=mapping, name=snakemake.output.barcode_mapping_counts) 50 | -------------------------------------------------------------------------------- /scripts/umi_tools_extended_ref.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from collections import defaultdict 4 | import pickle 5 | 6 | 7 | def save_obj(obj, name): 8 | with open(name, 'wb') as f: 9 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 10 | 11 | 12 | mapping=defaultdict(dict) 13 | barcode_ref = set() 14 | barcode_ext_ref = set() 15 | 16 | 17 | with open(snakemake.input['whitelist'],'r') as whitelist: 18 | for line in whitelist: 19 | if len(line.strip().split()) == 2: # This means we didn't find any other linked barcode 20 | (reference,counts_ref) = line.strip().split() 21 | mapping[0][reference]= defaultdict() 22 | mapping[0][reference]['ref'] = reference 23 | mapping[0][reference]['count'] = 0 24 | mapping[0][reference]['lanes'] = {'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0} 25 | barcode_ref.add(reference) 26 | continue 27 | (reference,extended_ref,counts_ref,counts_ext) = line.strip().split() 28 | mapping[0][reference]= defaultdict() 29 | mapping[0][reference]['ref'] = reference 30 | mapping[0][reference]['count'] = 0 31 | mapping[0][reference]['lanes'] = {'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0} 32 | barcode_ref.add(reference) 33 | for barcode in extended_ref.split(','): 34 | mapping[1][barcode] = defaultdict() 35 | mapping[1][barcode]['ref'] = reference 36 | 
mapping[1][barcode]['count'] = 0 37 | mapping[1][barcode]['lanes'] = {'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0} 38 | barcode_ext_ref.update(extended_ref.split(',')) 39 | 40 | # Save mapping and references to reuse later. 41 | save_obj(obj=mapping, name=snakemake.output.barcode_mapping) 42 | save_obj(obj=barcode_ref,name=snakemake.output.barcode_ref) 43 | save_obj(obj=barcode_ext_ref,name=snakemake.output.barcode_ext_ref) -------------------------------------------------------------------------------- /templates/NexteraPE-PE.fa: -------------------------------------------------------------------------------- 1 | >PrefixNX/1 2 | AGATGTGTATAAGAGACAG 3 | >PrefixNX/2 4 | AGATGTGTATAAGAGACAG 5 | >Trans1 6 | TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG 7 | >Trans1_rc 8 | CTGTCTCTTATACACATCTGACGCTGCCGACGA 9 | >Trans2 10 | GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG 11 | >Trans2_rc 12 | CTGTCTCTTATACACATCTCCGAGCCCACGAGAC -------------------------------------------------------------------------------- /templates/TruSeq2-PE.fa: -------------------------------------------------------------------------------- 1 | >PrefixPE/1 2 | AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 3 | >PrefixPE/2 4 | CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT 5 | >PCR_Primer1 6 | AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 7 | >PCR_Primer1_rc 8 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT 9 | >PCR_Primer2 10 | CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT 11 | >PCR_Primer2_rc 12 | AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG 13 | >FlowCell1 14 | TTTTTTTTTTAATGATACGGCGACCACCGAGATCTACAC 15 | >FlowCell2 16 | TTTTTTTTTTCAAGCAGAAGACGGCATACGA -------------------------------------------------------------------------------- /templates/TruSeq2-SE.fa: -------------------------------------------------------------------------------- 1 | >TruSeq2_SE 2 | AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG 3 | >TruSeq2_PE_f 4 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT 5 | >TruSeq2_PE_r 6 | AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG -------------------------------------------------------------------------------- /templates/TruSeq3-PE-2.fa: -------------------------------------------------------------------------------- 1 | >PrefixPE/1 2 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT 3 | >PrefixPE/2 4 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 5 | >PE1 6 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT 7 | >PE1_rc 8 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA 9 | >PE2 10 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 11 | >PE2_rc 12 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC -------------------------------------------------------------------------------- /templates/TruSeq3-PE.fa: -------------------------------------------------------------------------------- 1 | >PrefixPE/1 2 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT 3 | >PrefixPE/2 4 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT -------------------------------------------------------------------------------- /templates/TruSeq3-SE.fa: -------------------------------------------------------------------------------- 1 | >TruSeq3_IndexedAdapter 2 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC 3 | >TruSeq3_UniversalAdapter 4 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA -------------------------------------------------------------------------------- /templates/cluster.yaml: -------------------------------------------------------------------------------- 1 | __default__: 2 | time: "03:00:00" 3 | mem: 4g 4 | output: "logs/cluster/{rule}.{wildcards.sample}.out" 5 | error: "logs/cluster/{rule}.{wildcards.sample}.err" 6 | n: 
'{threads}' 7 | fastqc_barcodes: 8 | jobname: fastqc_barcodes 9 | time: "01:00:00" 10 | mem: 1g 11 | fastqc_reads: 12 | jobname: fastqc_reads 13 | time: "01:00:00" 14 | mem: 1g 15 | STAR_align: 16 | n: 24 17 | create_star_index: 18 | n: 1 19 | time: "04:00:00" 20 | mem: 64g 21 | output: "logs/cluster/{rule}.out" 22 | error: "logs/cluster/{rule}.err" 23 | -------------------------------------------------------------------------------- /templates/config.yaml: -------------------------------------------------------------------------------- 1 | CONTACT: 2 | email: john.doe@john.com 3 | person: John Doe 4 | LOCAL: 5 | temp-directory: 6 | memory: 4g 7 | raw_data: 8 | results: results 9 | META: 10 | species: 11 | SPECIES_ONE: 12 | build: 13 | release: 14 | ratio: 0.2 15 | reference-directory: 16 | gtf_biotypes: gtf_biotypes.yaml 17 | 18 | FILTER: 19 | barcode-whitelist: '' 20 | 5-prime-smart-adapter: '' 21 | cell-barcode: 22 | start: 23 | end: 24 | UMI-barcode: 25 | start: 26 | end: 27 | cutadapt: 28 | adapters-file: 29 | R1: 30 | quality-filter: 20 31 | maximum-Ns: 1 32 | extra-params: '' 33 | R2: 34 | quality-filter: 20 35 | minimum-adapters-overlap: 6 36 | minimum-length: 15 37 | extra-params: '' 38 | MAPPING: 39 | STAR: 40 | genomeChrBinNbits: 18 41 | outFilterMismatchNmax: 10 42 | outFilterMismatchNoverLmax: 0.3 43 | outFilterMismatchNoverReadLmax: 1 44 | outFilterMatchNmin: 0 45 | outFilterMatchNminOverLread: 0.66 46 | outFilterScoreMinOverLread: 0.66 47 | EXTRACTION: 48 | LOCUS: 49 | - CODING 50 | - UTR 51 | strand-strategy: SENSE 52 | UMI-edit-distance: 1 53 | minimum-counts-per-UMI: 0 54 | DEBUG: False -------------------------------------------------------------------------------- /templates/config_nadia.yaml: -------------------------------------------------------------------------------- 1 | # Example config template for Dolomite Bio’s Nadia Instrument 2 | # https://www.dolomite-bio.com/ 3 | # Usage: Copy into project root folder and rename to 'config.yaml' 4 | DEBUG: FALSE 5 | CONTACT: 6 | email: luke@mail.com 7 | person: Luke Dropwalker 8 | LOCAL: 9 | temp-directory: ./tmp 10 | memory: 60g 11 | raw_data: data 12 | results: results 13 | META: 14 | species: 15 | # this list two species which is meant for mixed species 16 | # for single species, just delete one of the two (and/or edit the species as required) 17 | mus_musculus: 18 | build: 38 19 | release: 91 20 | homo_sapiens: 21 | build: 38 22 | release: 91 23 | # for mixed species: threshold for calling a STAMP mixed (i.e. 
0.2 means at least 20% from both species) 24 | ratio: 0.2 25 | reference-directory: /path/to/reference-dir 26 | gtf_biotypes: gtf_biotypes.yaml 27 | FILTER: 28 | barcode-whitelist: '' 29 | 5-prime-smart-adapter: CCTACACGACGCTCTTCCGATCT 30 | cell-barcode: 31 | start: 1 32 | end: 12 33 | min-quality: 3 34 | num-below-quality: 0 35 | UMI-barcode: 36 | start: 13 37 | end: 20 38 | min-quality: 3 39 | num-below-quality: 0 40 | cutadapt: 41 | adapters-file: custom_adapters.fa 42 | R1: 43 | quality-filter: 20 44 | maximum-Ns: 0 45 | extra-params: '' 46 | R2: 47 | quality-filter: 20 48 | minimum-adapters-overlap: 6 49 | minimum-length: 15 50 | extra-params: '' 51 | simpleClipThreshold: 10 52 | MAPPING: 53 | STAR: 54 | genomeChrBinNbits: 18 55 | outFilterMismatchNmax: 10 56 | outFilterMismatchNoverLmax: 0.3 57 | outFilterMismatchNoverReadLmax: 1 58 | outFilterMatchNmin: 0 59 | outFilterMatchNminOverLread: 0.66 60 | outFilterScoreMinOverLread: 0.66 61 | EXTRACTION: 62 | LOCUS: 63 | - CODING 64 | - UTR 65 | strand-strategy: SENSE 66 | UMI-edit-distance: 1 67 | minimum-counts-per-UMI: 0 68 | DOUBLET_DETECTION: 69 | min_counts: 1 70 | min_cells: 0 71 | min_gene_variability_pctl: 85 72 | n_prin_comps: 20 73 | -------------------------------------------------------------------------------- /templates/custom_adapters.fa: -------------------------------------------------------------------------------- 1 | >Illumina_Universal 2 | AGATCGGAAGAG 3 | >PrefixNX/1 4 | AGATGTGTATAAGAGACAG 5 | >Trans1 6 | TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG 7 | >Trans1_rc 8 | CTGTCTCTTATACACATCTGACGCTGCCGACGA 9 | >Trans2 10 | GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG 11 | >Trans2_rc 12 | CTGTCTCTTATACACATCTCCGAGCCCACGAGAC 13 | >polyA 14 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 15 | >polyT 16 | TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT 17 | >polyC 18 | CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC 19 | >polyG 20 | GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG 21 | >drop-seq 22 | GTACTCTGCGTTGATACCACTGCTTCCGCGGACAGGC 23 | >Nextera 24 | CTGTCTCTTATACACATCT 25 | -------------------------------------------------------------------------------- /templates/gtf_biotypes.yaml: -------------------------------------------------------------------------------- 1 | biotypes: 2 | - 3prime_overlapping_ncRNA 3 | - antisense 4 | - bidirectional_promoter_lncRNA 5 | - IG_C_gene 6 | - IG_C_pseudogene 7 | - IG_D_gene 8 | - IG_J_gene 9 | - IG_J_pseudogene 10 | - IG_pseudogene 11 | - IG_V_gene 12 | - IG_V_pseudogene 13 | - lincRNA 14 | - macro_lncRNA 15 | - miRNA 16 | - misc_RNA 17 | - Mt_rRNA 18 | - Mt_tRNA 19 | - non_coding 20 | - polymorphic_pseudogene 21 | - processed_pseudogene 22 | - processed_transcript 23 | - protein_coding 24 | - pseudogene 25 | - ribozyme 26 | - rRNA 27 | - scaRNA 28 | - scRNA 29 | - sense_intronic 30 | - sense_overlapping 31 | - snoRNA 32 | - snRNA 33 | - sRNA 34 | - TEC 35 | - transcribed_processed_pseudogene 36 | - transcribed_unitary_pseudogene 37 | - transcribed_unprocessed_pseudogene 38 | - translated_processed_pseudogene 39 | - TR_C_gene 40 | - TR_D_gene 41 | - TR_J_gene 42 | - TR_J_pseudogene 43 | - TR_V_gene 44 | - TR_V_pseudogene 45 | - unitary_pseudogene 46 | - unprocessed_pseudogene 47 | - vaultRNA 48 | -------------------------------------------------------------------------------- /templates/samples.csv: -------------------------------------------------------------------------------- 1 | samples,expected_cells,read_length,batch 2 | sample1,100,75,Batch1 --------------------------------------------------------------------------------