├── .DS_Store ├── Examples ├── Cluster_config │ └── lsf │ │ └── cluster.json ├── Runs │ ├── Autism │ │ ├── NCBI_accession_list.txt │ │ └── config.yaml │ ├── COSMIC │ │ ├── config.yaml │ │ └── local_samples.tsv │ ├── C_elegans │ │ └── config.yaml │ ├── ENCODE │ │ ├── config.yaml │ │ └── sample_url.tsv │ ├── Parada_et_al │ │ ├── NCBI_accession_list.txt │ │ ├── config.yaml │ │ ├── sample_url.tsv │ │ └── whippet_delta.yaml │ ├── README.md │ └── Zebrafish │ │ ├── NCBI_accession_list.txt │ │ └── config.yaml └── Single_cell │ ├── run_metadata.super_clusters.tsv │ └── run_metadata.tsv ├── MicroExonator.smk ├── PWM ├── Human │ ├── hg19_GT_AG_U2_3.good.matrix │ └── hg19_GT_AG_U2_5.good.matrix └── Mouse │ ├── mm10_GT_AG_U2_3.good.matrix │ └── mm10_GT_AG_U2_5.good.matrix ├── README.md ├── config.py ├── docs ├── .DS_Store ├── Makefile ├── build │ ├── doctrees │ │ ├── differential_inclusion_analysis.doctree │ │ ├── discovery_and_quantification.doctree │ │ ├── environment.pickle │ │ ├── index.doctree │ │ ├── install.doctree │ │ ├── licence.doctree │ │ ├── setup.doctree │ │ └── support.doctree │ └── html │ │ ├── .buildinfo │ │ ├── _sources │ │ ├── differential_inclusion_analysis.rst.txt │ │ ├── discovery_and_quantification.rst.txt │ │ ├── index.rst.txt │ │ ├── install.rst.txt │ │ ├── licence.rst.txt │ │ ├── setup.rst.txt │ │ └── support.rst.txt │ │ ├── _static │ │ ├── basic.css │ │ ├── css │ │ │ ├── badge_only.css │ │ │ ├── fonts │ │ │ │ ├── Roboto-Slab-Bold.woff │ │ │ │ ├── Roboto-Slab-Bold.woff2 │ │ │ │ ├── Roboto-Slab-Regular.woff │ │ │ │ ├── Roboto-Slab-Regular.woff2 │ │ │ │ ├── fontawesome-webfont.eot │ │ │ │ ├── fontawesome-webfont.svg │ │ │ │ ├── fontawesome-webfont.ttf │ │ │ │ ├── fontawesome-webfont.woff │ │ │ │ ├── fontawesome-webfont.woff2 │ │ │ │ ├── lato-bold-italic.woff │ │ │ │ ├── lato-bold-italic.woff2 │ │ │ │ ├── lato-bold.woff │ │ │ │ ├── lato-bold.woff2 │ │ │ │ ├── lato-normal-italic.woff │ │ │ │ ├── lato-normal-italic.woff2 │ │ │ │ ├── lato-normal.woff │ │ │ │ 
└── lato-normal.woff2 │ │ │ └── theme.css │ │ ├── doctools.js │ │ ├── documentation_options.js │ │ ├── file.png │ │ ├── jquery-3.5.1.js │ │ ├── jquery.js │ │ ├── js │ │ │ ├── badge_only.js │ │ │ ├── html5shiv-printshiv.min.js │ │ │ ├── html5shiv.min.js │ │ │ └── theme.js │ │ ├── language_data.js │ │ ├── minus.png │ │ ├── plus.png │ │ ├── pygments.css │ │ ├── searchtools.js │ │ ├── underscore-1.3.1.js │ │ └── underscore.js │ │ ├── differential_inclusion_analysis.html │ │ ├── discovery_and_quantification.html │ │ ├── genindex.html │ │ ├── index.html │ │ ├── install.html │ │ ├── licence.html │ │ ├── objects.inv │ │ ├── search.html │ │ ├── searchindex.js │ │ ├── setup.html │ │ └── support.html ├── make.bat └── source │ ├── conf.py │ ├── differential_inclusion_analysis.rst │ ├── discovery_and_quantification.rst │ ├── index.rst │ ├── install.rst │ ├── licence.rst │ ├── setup.rst │ ├── single_cell_analysis.rst │ └── support.rst ├── envs ├── MicroExonator.yaml ├── MicroExonator.yml ├── R.yaml ├── biopython_py3.yaml ├── core.yaml ├── core_py3.yaml ├── pybedtools.yaml └── snakemake.yaml ├── rules ├── Benchmark.smk ├── Get_data.smk ├── Round1.smk ├── Round1_post_processing.smk ├── Round2.smk ├── Round2_post_processing.smk ├── Snakepool.backup.py ├── Snakepool.py ├── Whippet_delta.smk ├── Whippet_quant.smk ├── init.smk ├── pseudo_pool.smk └── sashimi.smk ├── src ├── Filter1_round2.py ├── GTFtoBED12.py ├── GetPSI.py ├── Get_ME_matches.py ├── Get_annotated_microexons.py ├── Get_exons_from_sam.py ├── Get_fasta_from_bed12.py ├── Get_introns_from_sam.py ├── Get_splicing_PWMs.py ├── ME_SJ_coverage.py ├── ME_centric_table.py ├── ME_filter1.py ├── Micro_exons_tags.py ├── Replace_PSI_whippet.py ├── Report │ └── report_files │ │ └── figure-html │ │ ├── unnamed-chunk-4-1.png │ │ └── unnamed-chunk-5-1.png ├── SJ_tags_generator_for_micro_exons.py ├── Snakefile ├── Snakepool_BetaDist.R ├── alingment_pre_processing.py ├── alingment_pre_processing_round2_bowtie.py ├── counts_to_PSI.py ├── 
coverage_sample_filter.py ├── final_filters.R ├── final_filters.Rmd ├── final_filters2.Rmd ├── final_filters3.R ├── get_diff_ME_single_cell.py ├── get_isoforms2.py ├── high_confident_list.py ├── merge_pairs.py ├── merge_quant.py ├── round2_ME_reads_fastq.py ├── round2_ME_reads_fastq2.py ├── row_ME2.py ├── sashimi-plot.py ├── sashimi_input_generator.py ├── split_coverage.py ├── split_paired_end.py ├── stats │ └── discovery_stats.py ├── validate_fastq.py ├── whippet_delta_to_ME.py ├── write_bam_tsv.py └── write_sig_node_files.py └── touch ├── MicroExonator └── github_clone ├── VastDb.bed12 ├── gencode.vM16.annotation.bed12 ├── gencode.vM16.annotation.gtf ├── miniconda └── envs │ └── julia_0.6.1 │ └── share │ └── julia │ └── site │ └── v0.6 │ └── Whippet │ └── bin │ └── whipet_scripts ├── mm10.60way.phyloP60way.bw ├── mm10.fa ├── mm10_GT_AG_U2_3.good.matrix └── mm10_GT_AG_U2_5.good.matrix /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/.DS_Store -------------------------------------------------------------------------------- /Examples/Cluster_config/lsf/cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "__default__" : 3 | { 4 | "queue" : "normal", 5 | "nCPUs" : "1", 6 | "memory" : 10000, 7 | "resources" : "\"select[mem>10000] rusage[mem=10000] span[hosts=1]\"", 8 | "name" : "JOBNAME.{rule}.{wildcards}", 9 | "output" : "logs/{rule}.{wildcards}.out", 10 | "error" : "logs/{rule}.{wildcards}.err", 11 | "Group" : "team_hemberg", 12 | "tCPU" : "99999" 13 | }, 14 | 15 | "Round1_bwa_mem_to_tags" : 16 | { 17 | "nCPUs" : 5 18 | }, 19 | 20 | 21 | "hisat2_genome_index" : 22 | { 23 | "nCPUs" : 5 24 | }, 25 | 26 | "whippet_quant" : 27 | { 28 | "nCPUs" : 1, 29 | "memory" : 2000, 30 | "resources" : "\"select[mem>2000] rusage[mem=2000] span[hosts=1]\"" 31 | 32 | }, 33 | 34 | 35 | 
"Round2_bowtie_to_tags" : 36 | { 37 | "nCPUs" : 5 38 | }, 39 | 40 | "bowtie_genome_index" : 41 | { 42 | "memory" : 30000, 43 | "resources" : "\"select[mem>30000] rusage[mem=30000] span[hosts=1]\"" 44 | }, 45 | 46 | "bowtie_to_genome" : 47 | { 48 | "nCPUs" : 2 49 | }, 50 | 51 | "Output" : 52 | { 53 | "nCPUs" : 2, 54 | "memory" : 30000, 55 | "resources" : "\"select[mem>30000] rusage[mem=30000] span[hosts=1]\"" 56 | }, 57 | 58 | "total_hisat2_to_genome" : 59 | { 60 | "nCPUs" : 5 61 | }, 62 | 63 | } 64 | -------------------------------------------------------------------------------- /Examples/Runs/Autism/NCBI_accession_list.txt: -------------------------------------------------------------------------------- 1 | SRR309144 2 | SRR309143 3 | SRR309142 4 | SRR309141 5 | SRR309136 6 | SRR309135 7 | SRR309134 8 | SRR309133 9 | SRR309140 10 | SRR309139 11 | SRR309138 12 | SRR309137 13 | -------------------------------------------------------------------------------- /Examples/Runs/Autism/config.yaml: -------------------------------------------------------------------------------- 1 | Genome_fasta : /lustre/scratch117/cellgen/team218/gp7/Genome/hg19/hg19.fa 2 | Gene_anontation_bed12 : /lustre/scratch117/cellgen/team218/gp7/Genome/hg19/Tracks/Gene_annotation/gencode.v19.chr_patch_hapl_scaff.annotation.bed12 3 | GT_AG_U2_5 : /lustre/scratch117/cellgen/team218/gp7/Genome/hg19/Tracks/SpliceRack/hg19_GT_AG_U2_5.good.matrix 4 | GT_AG_U2_3 : /lustre/scratch117/cellgen/team218/gp7/Genome/hg19/Tracks/SpliceRack/hg19_GT_AG_U2_3.good.matrix 5 | conservation_bigwig : /lustre/scratch117/cellgen/team218/gp7/Genome/hg19/Tracks/Phylop/hg19.100way.phyloP100way.bw 6 | working_directory : /lustre/scratch117/cellgen/team218/gp7/Micro-exons/Runs/Test_Martin/Autism/MicroExonator/ 7 | ME_DB : /lustre/scratch117/cellgen/team218/gp7/Genome/hg19/Tracks/Gene_annotation/hg19.VastDb.bed12 8 | ME_len : 30 9 | Optimize_hard_drive : F 10 | min_number_files_detected : 3 11 | 12 | 13 | # Whippet 14 | 15 | 
downstream_only : T 16 | whippet_bin_folder : /lustre/scratch117/cellgen/team218/gp7/miniconda/envs/julia_0.6.1/share/julia/site/v0.6/Whippet/bin 17 | Gene_anontation_GTF : /lustre/scratch117/cellgen/team218/gp7/Genome/hg19/Tracks/Gene_annotation/gencode.v19.chr_patch_hapl_scaff.annotation.gtf 18 | 19 | whippet_delta: 20 | Control_vs_Autism-temporal_cortex : 21 | A : SRR309144,SRR309143,SRR309142,SRR309141 22 | B : SRR309136,SRR309135,SRR309134,SRR309133 23 | Control_vs_Autism-frontal_cortex : 24 | A : SRR309140,SRR309139 25 | B : SRR309138,SRR309137 26 | -------------------------------------------------------------------------------- /Examples/Runs/COSMIC/config.yaml: -------------------------------------------------------------------------------- 1 | Genome_fasta : /lustre/scratch117/cellgen/team218/gp7/Genome/hg19/hg19.fa 2 | Gene_anontation_bed12 : /lustre/scratch117/cellgen/team218/gp7/Genome/hg19/Tracks/Gene_annotation/gencode.v19.chr_patch_hapl_scaff.annotation.bed12 3 | Gene_anontation_fasta : /lustre/scratch117/cellgen/team218/gp7/Genome/hg19/Tracks/Gene_annotation/gencode.v19.pc_transcripts.fa 4 | GT_AG_U2_5 : /lustre/scratch117/cellgen/team218/gp7/Genome/hg19/Tracks/SpliceRack/hg19_GT_AG_U2_5.good.matrix 5 | GT_AG_U2_3 : /lustre/scratch117/cellgen/team218/gp7/Genome/hg19/Tracks/SpliceRack/hg19_GT_AG_U2_3.good.matrix 6 | conservation_bigwig : /lustre/scratch117/cellgen/team218/gp7/Genome/hg19/Tracks/Phylop/hg19.100way.phyloP100way.bw 7 | ME_len : 30 8 | working_directory : /lustre/scratch117/cellgen/team218/igs_gp7/MicroExons/COSMIC/ 9 | ME_DB : /lustre/scratch117/cellgen/team218/gp7/Genome/hg19/Tracks/Gene_annotation/hg19.VastDb.bed12 10 | Optimize_hard_drive : T 11 | -------------------------------------------------------------------------------- /Examples/Runs/C_elegans/config.yaml: -------------------------------------------------------------------------------- 1 | 2 | Genome_fasta : 
/lustre/scratch117/cellgen/team218/gp7/Fabian/Caenorhabditis_elegans.WBcel235.dna.fa 3 | Gene_anontation_bed12 : /lustre/scratch117/cellgen/team218/gp7/Fabian/ce11.Ensembl.genes.bed12.Ensembl 4 | GT_AG_U2_5 : NA 5 | GT_AG_U2_3 : NA 6 | conservation_bigwig : NA 7 | working_directory : /lustre/scratch117/cellgen/team218/gp7/Fabian/MicroExonator/ 8 | ME_DB : /lustre/scratch117/cellgen/team218/gp7/Fabian/empty_DB 9 | ME_len : 30 10 | Optimize_hard_drive : F 11 | min_number_files_detected : 2 12 | 13 | #Whippet 14 | 15 | downstream_only : F 16 | whippet_bin_folder : /lustre/scratch117/cellgen/team218/gp7/miniconda/envs/julia_0.6.1/share/julia/site/v0.6/Whippet/bin 17 | Gene_anontation_GTF : /lustre/scratch117/cellgen/team218/gp7/Genome/danRer11/Danio_rerio.GRCz11.96.chr.gtf.UCSC_style 18 | 19 | 20 | whippet_delta: 21 | WT_vs_SID1: 22 | A : FB264-RNA-01_S1,FB264-RNA-06_S6,FB264-RNA-11_S11 23 | B : FB264-RNA-02_S2,FB264-RNA-07_S7,FB264-RNA-12_S12,FB264-RNA-04_S4,FB264-RNA-09_S9,FB264-RNA-14_S14 24 | WT_vs_SID1_qt129: 25 | A : FB264-RNA-01_S1,FB264-RNA-06_S6,FB264-RNA-11_S11 26 | B : FB264-RNA-02_S2,FB264-RNA-07_S7,FB264-RNA-12_S12 27 | WT_vs_SID1_mj444: 28 | A : FB264-RNA-01_S1,FB264-RNA-06_S6,FB264-RNA-11_S11 29 | B : FB264-RNA-04_S4,FB264-RNA-09_S9,FB264-RNA-14_S14 30 | WT_vs_SID2: 31 | A : FB264-RNA-01_S1,FB264-RNA-06_S6,FB264-RNA-11_S11 32 | B : FB264-RNA-03_S3,FB264-RNA-08_S8,FB264-RNA-13_S13,FB264-RNA-05_S5,FB264-RNA-10_S10,FB264-RNA-15_S15 33 | WT_vs_SID2_qt142: 34 | A : FB264-RNA-01_S1,FB264-RNA-06_S6,FB264-RNA-11_S11 35 | B : FB264-RNA-03_S3,FB264-RNA-08_S8,FB264-RNA-13_S13 36 | WT_vs_SID2_mj465: 37 | A : FB264-RNA-01_S1,FB264-RNA-06_S6,FB264-RNA-11_S11 38 | B : FB264-RNA-05_S5,FB264-RNA-10_S10,FB264-RNA-15_S15 39 | -------------------------------------------------------------------------------- /Examples/Runs/ENCODE/config.yaml: -------------------------------------------------------------------------------- 1 | Genome_fasta : /touch/mm10.fa 2 | 
Gene_anontation_bed12 : /touch/gencode.vM16.annotation.bed12 3 | GT_AG_U2_5 : /touch/mm10_GT_AG_U2_5.good.matrix 4 | GT_AG_U2_3 : /touch/mm10_GT_AG_U2_3.good.matrix 5 | conservation_bigwig : /touch/mm10.60way.phyloP60way.bw 6 | working_directory : /touch/MicroExonator/ 7 | ME_DB : /touch/VastDb.bed12 8 | ME_len : 30 9 | Optimize_hard_drive : F 10 | min_number_files_detected : 2 11 | 12 | # Whippet 13 | 14 | whippet_bin_folder : /touch/miniconda/envs/julia_0.6.1/share/julia/site/v0.6/Whippet/bin/ 15 | Gene_anontation_GTF : /touch/gencode.vM16.annotation.gtf 16 | condition1 : ENCFF920CNZ,ENCFF320FJX,ENCFF528EVC,ENCFF663SNC 17 | condition2 : ENCFF270GKY,ENCFF460TCF,ENCFF126IRS,ENCFF748SRJ 18 | comparison_name : forebrain_10.5_vs_forebrain_14.5 19 | -------------------------------------------------------------------------------- /Examples/Runs/ENCODE/sample_url.tsv: -------------------------------------------------------------------------------- 1 | url sample 2 | https://www.encodeproject.org/files/ENCFF270GKY/@@download/ENCFF920CNZ.fastq.gz ENCFF920CNZ 3 | https://www.encodeproject.org/files/ENCFF270GKY/@@download/ENCFF320FJX.fastq.gz ENCFF320FJX 4 | https://www.encodeproject.org/files/ENCFF270GKY/@@download/ENCFF528EVC.fastq.gz ENCFF528EVC 5 | https://www.encodeproject.org/files/ENCFF270GKY/@@download/ENCFF663SNC.fastq.gz ENCFF663SNC 6 | https://www.encodeproject.org/files/ENCFF270GKY/@@download/ENCFF270GKY.fastq.gz ENCFF270GKY 7 | https://www.encodeproject.org/files/ENCFF270GKY/@@download/ENCFF460TCF.fastq.gz ENCFF460TCF 8 | https://www.encodeproject.org/files/ENCFF270GKY/@@download/ENCFF126IRS.fastq.gz ENCFF126IRS 9 | https://www.encodeproject.org/files/ENCFF270GKY/@@download/ENCFF748SRJ.fastq.gz ENCFF748SRJ 10 | -------------------------------------------------------------------------------- /Examples/Runs/Parada_et_al/config.yaml: -------------------------------------------------------------------------------- 1 | Genome_fasta : 
/lustre/scratch117/cellgen/team218/gp7/Genome/mm10/mm10.fa 2 | Gene_anontation_bed12 : /lustre/scratch117/cellgen/team218/gp7/Genome/mm10/Tracks/Gene_annotation/gencode.vM16.annotation.bed12 3 | GT_AG_U2_5 : /lustre/scratch117/cellgen/team218/MH/MicroExonator/PWM/Mouse/mm10_GT_AG_U2_5.good.matrix 4 | GT_AG_U2_3 : /lustre/scratch117/cellgen/team218/MH/MicroExonator/PWM/Mouse/mm10_GT_AG_U2_3.good.matrix 5 | conservation_bigwig : /lustre/scratch117/cellgen/team218/gp7/Genome/mm10/Tracks/Phylop/mm10.60way.phyloP60way.bw 6 | working_directory : /lustre/scratch117/cellgen/team218/gp7/Micro-exons/Runs/Paper/MicroExonator/ 7 | ME_DB : /lustre/scratch117/cellgen/team218/gp7/Genome/mm10/Tracks/Gene_annotation/VastDb.bed12 8 | ME_len : 30 9 | Optimize_hard_drive : T 10 | min_number_files_detected : 2 11 | 12 | # Whippet 13 | 14 | #whippet_bin_folder : /lustre/scratch117/cellgen/team218/gp7/miniconda/envs/julia_0.6.1/share/julia/site/v0.6/Whippet/bin 15 | #Gene_anontation_GTF : /lustre/scratch117/cellgen/team218/gp7/Genome/mm10/Tracks/Gene_annotation/gencode.vM16.annotation.gtf 16 | #condition1 : ENCFF920CNZ,ENCFF320FJX,ENCFF528EVC,ENCFF663SNC 17 | #condition2 : ENCFF270GKY,ENCFF460TCF,ENCFF126IRS,ENCFF748SRJ 18 | #comparison_name : forebrain_10.5_vs_forebrain_14.5 19 | -------------------------------------------------------------------------------- /Examples/Runs/README.md: -------------------------------------------------------------------------------- 1 | Here we provide example runs that we have implemented for two different projects: 2 | 3 | # Zebrafish 4 | 5 | Small project that were ran using SRA accession codes as input. These accession codes are inputed inside `NCBI_accession_list.txt` file. 6 | 7 | # COSMIC 8 | 9 | Large cancer cell-lines project, where we used a local copy of the input fastq.gz files as an input. The paths and the name of the samples needs to be provided inside a `desing.tvs` file. 
10 | 11 | 12 | # Running under lsf 13 | 14 | `snakemake -s MicroExonator.smk --cluster-config cluster.json --cluster "bsub -n {cluster.nCPUs} -R {cluster.resources} -c {cluster.tCPU} -G {cluster.Group} -q {cluster.queue} -o {cluster.output} -e {cluster.error} -M {cluster.memory}" --use-conda -k -j 1000000` 15 | -------------------------------------------------------------------------------- /Examples/Runs/Zebrafish/NCBI_accession_list.txt: -------------------------------------------------------------------------------- 1 | SRR6652888 2 | SRR6652889 3 | SRR6652890 4 | SRR6652891 5 | SRR6652892 6 | SRR6652893 7 | SRR6652894 8 | SRR6652895 9 | SRR6652896 10 | SRR6652897 11 | SRR6652898 12 | SRR6652899 13 | SRR6652900 14 | SRR6652901 15 | SRR6652902 16 | SRR6652903 17 | SRR6652904 18 | SRR6652905 19 | SRR6652906 20 | SRR6652907 21 | SRR6652908 22 | SRR6652909 23 | SRR6652910 24 | -------------------------------------------------------------------------------- /Examples/Runs/Zebrafish/config.yaml: -------------------------------------------------------------------------------- 1 | Genome_fasta : /lustre/scratch117/cellgen/team218/gp7/Genome/danRer11/danRer11.fa 2 | Gene_anontation_bed12 : /lustre/scratch117/cellgen/team218/gp7/Genome/danRer11/darRer11.Ensembl.genes.bed12 3 | GT_AG_U2_5 : /lustre/scratch117/cellgen/team218/igs_gp7/Zebrafish/Data/danRer11_GT_AG_U2_5.good.matrix 4 | GT_AG_U2_3 : /lustre/scratch117/cellgen/team218/igs_gp7/Zebrafish/Data/danRer11_GT_AG_U2_3.good.matrix 5 | conservation_bigwig : NA 6 | ME_len : 30 7 | working_directory : /lustre/scratch117/cellgen/team218/gp7/Micro-exons/Runs/Zebrafish/MicroExonator/ 8 | ME_DB : /lustre/scratch117/cellgen/team218/igs_gp7/Zebrafish/Data/VastDb.bed12 9 | Optimize_hard_drive : F 10 | min_number_files_detected : 2 11 | 12 | #Whippet 13 | 14 | downstream_only : F 15 | whippet_bin_folder : /lustre/scratch117/cellgen/team218/gp7/miniconda/envs/julia_0.6.1/share/julia/site/v0.6/Whippet/bin 16 | Gene_anontation_GTF : 
/lustre/scratch117/cellgen/team218/gp7/Genome/danRer11/Danio_rerio.GRCz11.96.chr.gtf.UCSC_style 17 | 18 | whippet_delta: 19 | ZT16_vs_ZT4-20Mo: 20 | A : SRR6652910,SRR6652909,SRR6652908,SRR6652907 21 | B : SRR6652906,SRR6652905,SRR6652904,SRR6652903 22 | ZT16_vs_ZT4-16Mo : 23 | A : SRR6652902,SRR6652901,SRR6652900 24 | B : SRR6652899,SRR6652898,SRR6652897,SRR6652896 25 | ZT16_vs_ZT4-4Mo: 26 | A : SRR6652895,SRR6652894,SRR6652893,SRR6652892 27 | B : SRR6652891,SRR6652890,SRR6652889,SRR6652888 28 | 29 | 20Mo_vs_16Mo-ZT16: 30 | A : SRR6652910,SRR6652909,SRR6652908,SRR6652907 31 | B : SRR6652902,SRR6652901,SRR6652900 32 | 20Mo_vs_4M-ZT16: 33 | A : SRR6652910,SRR6652909,SRR6652908,SRR6652907 34 | B : SRR6652895,SRR6652894,SRR6652893,SRR6652892 35 | 16Mo_vs_4Mo-ZT16: 36 | A : SRR6652902,SRR6652901,SRR6652900 37 | B : SRR6652895,SRR6652894,SRR6652893,SRR6652892 38 | 39 | 20Mo_vs_16Mo-ZT4: 40 | A : SRR6652906,SRR6652905,SRR6652904,SRR6652903 41 | B : SRR6652899,SRR6652898,SRR6652897,SRR6652896 42 | 20Mo_vs_4M-ZT4: 43 | A : SRR6652906,SRR6652905,SRR6652904,SRR6652903 44 | B : SRR6652891,SRR6652890,SRR6652889,SRR6652888 45 | 16Mo_vs_4Mo-ZT4: 46 | A : SRR6652899,SRR6652898,SRR6652897,SRR6652896 47 | B : SRR6652891,SRR6652890,SRR6652889,SRR6652888 48 | 49 | 20Mo_vs_16Mo-TOTAL: 50 | A : SRR6652910,SRR6652909,SRR6652908,SRR6652907,SRR6652906,SRR6652905,SRR6652904,SRR6652903 51 | B : SRR6652902,SRR6652901,SRR6652900,SRR6652899,SRR6652898,SRR6652897,SRR6652896 52 | 20Mo_vs_4M-TOTAL: 53 | A : SRR6652910,SRR6652909,SRR6652908,SRR6652907,SRR6652906,SRR6652905,SRR6652904,SRR6652903 54 | B : SRR6652895,SRR6652894,SRR6652893,SRR6652892,SRR6652891,SRR6652890,SRR6652889,SRR6652888 55 | 16Mo_vs_4Mo-TOTAL: 56 | A : SRR6652902,SRR6652901,SRR6652900,SRR6652899,SRR6652898,SRR6652897,SRR6652896 57 | B : SRR6652895,SRR6652894,SRR6652893,SRR6652892,SRR6652891,SRR6652890,SRR6652889,SRR6652888 58 | -------------------------------------------------------------------------------- 
/Examples/Single_cell/run_metadata.super_clusters.tsv: -------------------------------------------------------------------------------- 1 | Compare_ID A.cluster_names A.number_of_pools B.cluster_names B.number_of_pools Repeat 2 | Mesoderm_vs_Neuroectoderm_10 Ect_caudal_neuroectoderm,Ect_For_mid_hindbrain,Ect_Neural_crest 10 M_Mesoderm_unknown,M_Mesenchyme,M_Paraxial_Mesoderm,M_Intermediate_Mesoderm 10 50 3 | Mesoderm_vs_Neuroectoderm_15 Ect_caudal_neuroectoderm,Ect_For_mid_hindbrain,Ect_Neural_crest 15 M_Mesoderm_unknown,M_Mesenchyme,M_Paraxial_Mesoderm,M_Intermediate_Mesoderm 15 50 4 | Mesoderm_vs_Neuroectoderm_20 Ect_caudal_neuroectoderm,Ect_For_mid_hindbrain,Ect_Neural_crest 20 M_Mesoderm_unknown,M_Mesenchyme,M_Paraxial_Mesoderm,M_Intermediate_Mesoderm 20 50 5 | Mesoderm_vs_Neuroectoderm_25 Ect_caudal_neuroectoderm,Ect_For_mid_hindbrain,Ect_Neural_crest 25 M_Mesoderm_unknown,M_Mesenchyme,M_Paraxial_Mesoderm,M_Intermediate_Mesoderm 25 50 6 | Mesoderm_vs_Neuroectoderm_30 Ect_caudal_neuroectoderm,Ect_For_mid_hindbrain,Ect_Neural_crest 30 M_Mesoderm_unknown,M_Mesenchyme,M_Paraxial_Mesoderm,M_Intermediate_Mesoderm 30 50 7 | -------------------------------------------------------------------------------- /Examples/Single_cell/run_metadata.tsv: -------------------------------------------------------------------------------- 1 | Compare_ID A.cluster_names A.number_of_pools B.cluster_names B.number_of_pools Repeat 2 | E85_NMP_5_vs_E85_SC_5 E85_NMP 5 E85_SC 5 50 3 | E85_NMP_10_vs_E85_SC_10 E85_NMP 10 E85_SC 10 50 4 | E85_NMP_15_vs_E85_SC_15 E85_NMP 15 E85_SC 15 50 5 | E85_NMP_20_vs_E85_SC_20 E85_NMP 20 E85_SC 20 50 6 | E85_NMP_35_vs_E85_SC_35 E85_NMP 35 E85_SC 35 50 7 | E85_NMP_35_vs_E85_SC_40 E85_NMP 40 E85_SC 40 50 8 | E85_NMP_35_vs_E85_SC_45 E85_NMP 45 E85_SC 45 50 9 | E85_NMP_35_vs_E85_SC_50 E85_NMP 50 E85_SC 50 50 10 | E85_NMP_35_vs_E85_SC_55 E85_NMP 55 E85_SC 55 50 11 | E85_NMP_35_vs_E85_SC_60 E85_NMP 60 E85_SC 60 50 12 | 
-------------------------------------------------------------------------------- /MicroExonator.smk: -------------------------------------------------------------------------------- 1 | #version 0.9.0 2 | 3 | import yaml 4 | from collections import defaultdict 5 | import csv 6 | 7 | configfile : "config.yaml" 8 | DATA = set([]) 9 | 10 | def str2bool(v): 11 | if v==True: 12 | return True 13 | elif v==False: 14 | return False 15 | else: 16 | return v.lower() in ("yes", "true", "t", "1") 17 | 18 | rule quant: 19 | input: 20 | "Report/out.high_quality.txt", 21 | "Report/out_filtered_ME.PSI.txt", 22 | #"Report/stats/Microexons.not_consensus", 23 | #"Report/stats/Microexons.annotation.stats" 24 | 25 | #"Report/out_filtered_ME.txt" 26 | #expand("Genome_aligments/{Software}/TOTAL.exons.{Software}", Software=["Hisat2", "STAR", "Olego"]) 27 | # expand("Genome_aligments/{Software}/{sample}.sam.SJ_count", sample=DATA, Software=["Hisat2", "STAR"]), 28 | #expand("Whippet/Quant/{sample}.psi.gz", sample=DATA), 29 | #expand("Ground_Truth/{sample}.GT.SJ_count", sample=DATA) 30 | 31 | 32 | 33 | 34 | if 'cluster_metadata' in config: 35 | 36 | cluster_files = defaultdict(list) 37 | cluster_files_metadata = defaultdict(list) 38 | single_cell_files = set([]) 39 | 40 | with open(config["cluster_metadata"]) as Single_Cell: 41 | 42 | Single_Cell_clustering = csv.DictReader(Single_Cell, delimiter="\t") 43 | 44 | for row in Single_Cell_clustering: 45 | 46 | cluster_files[row[config["cluster_name"]].replace(" ", "_")].append(row[config["file_basename"]]) 47 | single_cell_files.add(row[config["file_basename"]]) 48 | 49 | 50 | 51 | #### MicroExonator #### 52 | 53 | if ("deletion_penalty" in config)==False: 54 | config["deletion_penalty"]="6" 55 | 56 | if ("insertion_penalty" in config)==False: 57 | config["insertion_penalty"]="2" 58 | 59 | config["indel_penalty"] = ",".join([str(config["deletion_penalty"]), str(config["insertion_penalty"])]) 60 | 61 | if ("ME_DB" in config)==False: 62 | 
config["ME_DB"]="touch/VastDb.bed12" 63 | 64 | if ("paired_samples" in config)==False: 65 | config["paired_samples"]="F" 66 | 67 | if ("min_reads_PSI" in config)==False: 68 | config["min_reads_PSI"]="5" 69 | 70 | 71 | include : "rules/init.smk" 72 | include : "rules/Get_data.smk" 73 | 74 | 75 | rule bamfiles: 76 | input: 77 | expand("Whippet/BAM/{samples}.bam", samples=DATA), 78 | expand("Whippet/BAM/{samples}.bam.bai", samples=DATA) 79 | 80 | 81 | if str2bool(config.get("downstream_only", False)): 82 | pass 83 | elif str2bool(config.get("skip_discovery_and_quant", False)): 84 | include : "rules/Round2_post_processing.smk" 85 | elif str2bool(config.get("skip_discovery", False)): 86 | include : "rules/Round2.smk" 87 | include : "rules/Round2_post_processing.smk" 88 | else: 89 | include : "rules/Round1.smk" 90 | include : "rules/Round1_post_processing.smk" 91 | include : "rules/Round2.smk" 92 | include : "rules/Round2_post_processing.smk" 93 | 94 | rule discovery: 95 | input: 96 | expand("Round1/{sample}.sam.row_ME.filter1", sample=DATA ) 97 | # "Round2/ME_canonical_SJ_tags.de_novo.fa" 98 | 99 | ##### Downstream Analysis #### 100 | 101 | if "whippet_bin_folder" in config: 102 | include : "rules/Whippet_quant.smk" 103 | 104 | if "whippet_delta" in config: 105 | with open(config["whippet_delta"], 'r') as stream: 106 | whippet_delta = yaml.safe_load(stream) 107 | include : "rules/Whippet_delta.smk" 108 | 109 | 110 | #### Single Cell ### 111 | 112 | if not "Single_Cell" in config: 113 | config["Single_Cell"]="F" 114 | 115 | if str2bool(config["Single_Cell"]): 116 | include : "rules/Snakepool.py" 117 | include : "rules/pseudo_pool.smk" 118 | 119 | #### Benchmark #### 120 | 121 | #include : "rules/Benchmark.smk 122 | 123 | 124 | 125 | #### Re-run incomplete round1 #### 126 | 127 | import os 128 | 129 | round1_incomplete = [] 130 | 131 | for file in DATA: 132 | if os.path.isfile('./Round1/' + file + '.sam.row_ME.filter1')!=True: 133 | round1_incomplete.append(file) 134 | 
135 | rule rerun_incomplete_round1: 136 | input: 137 | expand("Round1/{sample}.sam.row_ME.filter1", sample=round1_incomplete ) 138 | 139 | 140 | 141 | round2_incomplete = [] 142 | 143 | for file in DATA: 144 | if os.path.isfile('./Round2/' + file + '.sam.pre_processed.filter1.ME_SJ_coverage')!=True: 145 | round2_incomplete.append(file) 146 | 147 | rule rerun_incomplete_round2: 148 | input: 149 | expand("Round2/{sample}.sam.pre_processed.filter1.ME_SJ_coverage", sample=round2_incomplete ) 150 | 151 | 152 | include : "rules/sashimi.smk" 153 | -------------------------------------------------------------------------------- /PWM/Human/hg19_GT_AG_U2_3.good.matrix: -------------------------------------------------------------------------------- 1 | A C G T 2 | 0.113282768418731 0.28224866187244 0.124165705041168 0.480302864667662 3 | 0.102437941129138 0.27905292210387 0.117779669694435 0.500729467072556 4 | 0.0934659153393549 0.280898502651647 0.108372108672138 0.51726347333686 5 | 0.0858168278183322 0.261479075471941 0.103641107208957 0.54906298950077 6 | 0.0878692876015532 0.284682214984111 0.109673270179273 0.517775227235063 7 | 0.0987522242239905 0.296523329118078 0.114725478876432 0.489998967781499 8 | 0.108900195141561 0.324043711622541 0.105410469091044 0.461645624144854 9 | 0.113745524603276 0.336391135464306 0.0922355283075035 0.457627811624914 10 | 0.0869165542804294 0.343577466800782 0.0651125717027097 0.504393407216078 11 | 0.0898074193862394 0.298875219373652 0.063942070765329 0.547375290474779 12 | 0.239419215945119 0.273510736270133 0.205616237711646 0.281453810073102 13 | 0.0597772651044163 0.646824316628888 0.00363677363339388 0.289761644633302 14 | 0.999999836674288 5.44419040642188e-08 5.44419040642188e-08 5.44419040642188e-08 15 | 5.44419040642188e-08 5.44419040642188e-08 0.999999836674288 5.44419040642188e-08 16 | 0.256470420298032 0.143568799649655 0.487298649339914 0.112662130712398 17 | 0.245941356052012 0.190350727812039 0.195789474028054 
0.367918442107895 18 | 0.258746091887917 0.234448670104056 0.236626346266625 0.270178891741403 19 | -------------------------------------------------------------------------------- /PWM/Human/hg19_GT_AG_U2_5.good.matrix: -------------------------------------------------------------------------------- 1 | A C G T 2 | 0.334191682540111 0.362272816656435 0.184356674174568 0.119178826628885 3 | 0.636937666850825 0.107185275163538 0.115226344393823 0.140650713591813 4 | 0.0994980783096703 0.0267038083854034 0.805261145836577 0.0685369674683491 5 | 5.44419040642188e-08 5.44419040642188e-08 0.999999836674288 5.44419040642188e-08 6 | 5.44419040642188e-08 5.44419040642188e-08 5.44419040642188e-08 0.999999836674288 7 | 0.59709708145663 0.027112122665885 0.349772955483291 0.0260178403941942 8 | 0.699219205100292 0.0712155091483087 0.118830398442874 0.110734887308525 9 | 0.0886695835912972 0.0543439630788073 0.781791240994492 0.0751952123354031 10 | 0.180088428895933 0.149170871577864 0.193464804724512 0.477275894801691 11 | 0.295799251794024 0.193960226051496 0.294514422858109 0.215726099296371 12 | 0.225972065641257 0.250378371233246 0.236942109310197 0.2867074538153 13 | 0.223870608144378 0.262028938702989 0.242048759911421 0.272051693241212 14 | 0.226886689629536 0.237960172916198 0.255446912501625 0.279706224952641 15 | -------------------------------------------------------------------------------- /PWM/Mouse/mm10_GT_AG_U2_3.good.matrix: -------------------------------------------------------------------------------- 1 | A C G T 2 | 0.108773468722263 0.285810783269321 0.1286274787743 0.476788269234116 3 | 0.100375016373261 0.281768992295637 0.122713640139999 0.495142351191103 4 | 0.0920681626865521 0.277984822559651 0.112780910197587 0.517166104556209 5 | 0.0808874009703962 0.263214538264883 0.108618895979643 0.547279164785077 6 | 0.0832460665244445 0.279496200487488 0.113914443643465 0.523343289344602 7 | 0.0920681626865521 0.297793033280542 0.119072593313847 
0.491066210719059 8 | 0.105246920223977 0.331054797525737 0.108573096648497 0.455125185601789 9 | 0.110473768891079 0.336401869437099 0.094152032253721 0.458972329418101 10 | 0.088713361680066 0.347376534163095 0.064456890921565 0.499453213235274 11 | 0.0863604210424111 0.301279507364074 0.0613081869052384 0.551051884688276 12 | 0.252234148622493 0.268109341781173 0.201986557438315 0.277669952158019 13 | 0.0596823106495353 0.644299322902688 0.00347508149990978 0.292543284947867 14 | 0.999999828252508 5.72491639332099e-08 5.72491639332099e-08 5.72491639332099e-08 15 | 5.72491639332099e-08 5.72491639332099e-08 0.999999828252508 5.72491639332099e-08 16 | 0.256545010666664 0.142596274774003 0.487110293491274 0.113748421068059 17 | 0.245404323365262 0.1904279512402 0.19521970626141 0.368948019133129 18 | 0.260712749801002 0.235322745596623 0.231847721345877 0.272116783256497 19 | -------------------------------------------------------------------------------- /PWM/Mouse/mm10_GT_AG_U2_5.good.matrix: -------------------------------------------------------------------------------- 1 | A C G T 2 | 0.336585066761685 0.360131647887414 0.184525562438686 0.118757722912215 3 | 0.637057303665137 0.107680009691138 0.11313013009758 0.142132556546144 4 | 0.100048696138842 0.0261285756682809 0.803160027900953 0.0706627002919249 5 | 5.72491639332099e-08 5.72491639332099e-08 0.999999828252508 5.72491639332099e-08 6 | 5.72491639332099e-08 5.72491639332099e-08 5.72491639332099e-08 0.999999828252508 7 | 0.608776216682132 0.0282009954026631 0.336224397028906 0.0267983908862995 8 | 0.701914881485071 0.0713496902591235 0.115213999664749 0.111521428591057 9 | 0.0799542395982849 0.0569458006135278 0.793198673376574 0.0699012864116132 10 | 0.173985991358582 0.156794067429439 0.188699026489417 0.480520914722561 11 | 0.297627010705136 0.199450419476074 0.293304698828178 0.209617870990612 12 | 0.223231722173929 0.250115929556965 0.239272937908015 0.287379410361091 13 | 0.213465014806924 
0.270422208004074 0.24902247052584 0.267090306663161 14 | 0.220403613475629 0.247516817514397 0.256676683743711 0.275402885266264 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | MicroExonator is a fully-integrated computational pipeline that allows for systematic de novo discovery and quantification of microexons using raw RNA-seq data for any organism with a gene annotation. Compared to other available methods MicroExonator is more sensitive for discovering smaller microexons and it provides higher specificity for all lengths. Moreover, MicroExonator provides integrated downstream comparative analysis between cell types or tissues using [Whippet](https://github.com/timbitz/Whippet.jl) ([Sterne-Weiler et al. 2018](https://doi.org/10.1016/j.molcel.2018.08.018)). 4 | 5 | 6 | # Installation 7 | 8 | Start by cloning MicroExonator 9 | 10 | git clone https://github.com/hemberg-lab/MicroExonator 11 | 12 | Install [Miniconda 3](https://docs.conda.io/en/latest/miniconda.html) 13 | 14 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 15 | chmod +x Miniconda3-latest-Linux-x86_64.sh 16 | ./Miniconda3-latest-Linux-x86_64.sh 17 | 18 | 19 | Finally, create an enviroment to run [snakemake](https://snakemake.readthedocs.io/en/stable/) 20 | 21 | conda create -n snakemake_env -c bioconda -c conda-forge snakemake 22 | 23 | 24 | # Documentation 25 | 26 | Extended documentation can be found at https://microexonator.readthedocs.io. 
27 | 28 | 29 | # Contact 30 | 31 | For questions, ideas, feature requests and potential bug reports please contact geparada@utoronto.ca 32 | 33 | 34 | -------------------------------------------------------------------------------- /docs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/.DS_Store -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/build/doctrees/differential_inclusion_analysis.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/doctrees/differential_inclusion_analysis.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/discovery_and_quantification.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/doctrees/discovery_and_quantification.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/doctrees/environment.pickle -------------------------------------------------------------------------------- /docs/build/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/doctrees/index.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/install.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/doctrees/install.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/licence.doctree: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/doctrees/licence.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/setup.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/doctrees/setup.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/support.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/doctrees/support.doctree -------------------------------------------------------------------------------- /docs/build/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: de1274e5a5ba83764ffdbcdd6538995f 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/build/html/_sources/differential_inclusion_analysis.rst.txt: -------------------------------------------------------------------------------- 1 | .. differential_inclusion_analysis 2 | 3 | 4 | =============================== 5 | Differential inclusion analysis 6 | =============================== 7 | 8 | 9 | In this section we describe a downstream module that was developed to perform alternative splicing analysis between sample groups.
To quantify and assess differential inclusion of novel and annotated microexons, in this module we have integrated `Whippet `_, which enables a fast and accurate assessment of alternative splicing events across user-defined sample groups. 10 | 11 | Install 12 | ======= 13 | 14 | To run this downstream module for the first time you need to create an environment that has `snakemake` and the version of `julia` that is compatible with `Whippet v0.11`. To create this environment execute the following command inside the ``MicroExonator/`` folder: 15 | 16 | .. code-block:: bash 17 | 18 | conda env create -f Whippet/julia_0.6.1.yaml 19 | 20 | Then, activate the newly created environment: 21 | 22 | .. code-block:: bash 23 | 24 | source activate julia_0.6.1 25 | 26 | Enter julia's interactive mode: 27 | 28 | .. code-block:: bash 29 | 30 | julia 31 | 32 | Install Whippet by executing the following command in the interactive session: 33 | 34 | .. code-block:: bash 35 | 36 | Pkg.add("Whippet") 37 | 38 | .. note:: 39 | 40 | To exit julia interactive session press ``control + d``. 41 | 42 | 43 | Configure 44 | ========= 45 | 46 | Here is a list of the additional keys that need to be incorporated as a part of config.yaml: 47 | 48 | .. code-block:: bash 49 | 50 | whippet_bin_folder : /path/to/miniconda/envs/julia_0.6.1/share/julia/site/v0.6/Whippet/bin 51 | Gene_anontation_GTF : /path/to/gene.annotation.gtf 52 | whippet_delta : /path/to/whippet_delta.yaml 53 | 54 | * ``whippet_bin_folder`` corresponds to the path of the whippet binary folder (``Whippet/bin``) that is located inside the ``julia_0.6.1`` virtual environment folder. The specific path to ``Whippet/bin`` may vary, so it is important that you manually identify the correct path. 55 | 56 | * ``Gene_anontation_GTF`` corresponds to the path of a gene annotation file in Gene Transfer Format (`GTF `_). Working with the same annotation database as the one used in the previous steps is recommended.
57 | 58 | * ``whippet_delta`` indicates the path of a `YAML `_ file you need to create to provide information about the desired comparisons between groups of samples. 59 | 60 | 61 | whippet_delta YAML file 62 | ----------------------- 63 | 64 | This file can contain the information to schedule any number of comparisons between sample groups of any size. Every comparison should have the following structure inside the YAML file: 65 | 66 | .. code-block:: bash 67 | 68 | comparison_ID: 69 | A : sample1,sample2,sample3 70 | B : sample4,sample5,sample6 71 | 72 | Where ``sample1 ... sample6`` correspond to base names given to each RNA-seq sample in the corresponding input files (See :doc:`setup`) and `comparison_ID` to any given name for the scheduled comparison. As an example see the :download:`YAML file <../../Examples/Runs/Parada_et_al/whippet_delta.yaml>` we used in our publication. 73 | 74 | .. warning:: 75 | 76 | Inside this YAML file sample groups must be named ``A`` and ``B``. 77 | 78 | 79 | Optional parameters 80 | ------------------- 81 | 82 | If you just want to skip the Discovery and Quantification modules and only assess alternative splicing events annotated in the provided GTF file, then include the following line in the configuration file: 83 | 84 | .. code-block:: bash 85 | 86 | downstream_only : T 87 | 88 | Output 89 | ====== 90 | 91 | Quantification files generated for each sample can be found at ``Whippet/Quant``. Differentially included microexon analyses that can be obtained with Whippet are reported in the ``Whippet/Delta`` folder. MicroExonator performs these analyses using both PSI values calculated internally by the pipeline and PSI values directly calculated with Whippet. These results are reported in the same format as the ``diff.gz`` described at the `Whippet's GitHub page `_.
However, to provide easier interpretation, we filter the Whippet splicing nodes that correspond to microexon inclusion events, these are reported as ``.microexons`` files, where ``.diff.ME.microexons`` files correspond to the output when MicroExonator PSI values are taken as input and ``.diff.microexons`` when Whippet PSI values are taken as input. 92 | 93 | -------------------------------------------------------------------------------- /docs/build/html/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | ======== 2 | Overview 3 | ======== 4 | 5 | MicroExonator is a fully-integrated computational pipeline that allows for systematic de novo discovery and quantification 6 | of microexons using raw RNA-seq data for any organism with a gene annotation. Compared to other available methods MicroExonator 7 | is more sensitive for discovering smaller microexons and it provides higher specificity for all lengths. Moreover, MicroExonator 8 | provides integrated downstream comparative analysis between cell types or tissues using 9 | `Whippet `_. (`Sterne-Weiler et al. 2018 `_). 10 | As a proof of principle MicroExonator identified X novel microexons in Y RNA-seq samples from mouse early development to provide a systematic characterization 11 | based on time and tissue specificity. 12 | 13 | MicroExonator pipeline is divided in several modules: 14 | * Discover 15 | * Quantification 16 | * Differential Inclusion 17 | * Single cell analysis 18 | 19 | **Support** 20 | 21 | For questions, ideas, feature requests and potential bug reports submit an issue on our GitHub page or write us at gp7@sanger.ac.uk. 22 | 23 | .. toctree:: 24 | :name: MicroExonator-install 25 | :maxdepth: 1 26 | :hidden: 27 | 28 | install 29 | 30 | .. toctree:: 31 | :name: MicroExonator-setup 32 | :maxdepth: 1 33 | :hidden: 34 | 35 | setup 36 | 37 | .. 
toctree:: 38 | :name: MicroExonator-discovery-and-quantification 39 | :maxdepth: 3 40 | :hidden: 41 | 42 | discovery_and_quantification 43 | 44 | .. toctree:: 45 | :name: MicroExonator-differential_inclusion_analysis 46 | :maxdepth: 3 47 | :hidden: 48 | 49 | differential_inclusion_analysis 50 | 51 | .. toctree:: 52 | :name: MicroExonator-single_cell_analysis 53 | :maxdepth: 3 54 | :hidden: 55 | 56 | single_cell_analysis 57 | 58 | .. toctree:: 59 | :name: MicroExonator-Licence 60 | :maxdepth: 1 61 | :hidden: 62 | 63 | licence 64 | 65 | .. toctree:: 66 | :name: MicroExonator-Support 67 | :maxdepth: 1 68 | :hidden: 69 | 70 | support -------------------------------------------------------------------------------- /docs/build/html/_sources/install.rst.txt: -------------------------------------------------------------------------------- 1 | .. _Installation: 2 | 3 | ===================== 4 | Installation 5 | ===================== 6 | 7 | To install MicroExonator follow these instructions: 8 | 9 | Clone repository 10 | ================= 11 | Clone the GitHub repository 12 | 13 | .. code-block:: bash 14 | 15 | git clone https://github.com/hemberg-lab/MicroExonator 16 | 17 | Install Miniconda 3 18 | 19 | .. code-block:: bash 20 | 21 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 22 | chmod +x Miniconda3-latest-Linux-x86_64.sh && ./Miniconda3-latest-Linux-x86_64.sh 23 | 24 | 25 | 26 | Set up a master virtual environment 27 | =================================== 28 | 29 | Create a conda virtual environment with the necessary dependencies 30 | 31 | .. code-block:: bash 32 | 33 | conda create -n snakemake_env -c bioconda -c conda-forge snakemake 34 | 35 | 36 | -------------------------------------------------------------------------------- /docs/build/html/_sources/licence.rst.txt: -------------------------------------------------------------------------------- 1 | .. 
_Licence: 2 | 3 | ===================== 4 | MIT License (MIT) 5 | ===================== 6 | 7 | Copyright (c) 2020 Guillermo Parada 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /docs/build/html/_sources/setup.rst.txt: -------------------------------------------------------------------------------- 1 | .. _input_files: 2 | 3 | =========== 4 | Setup 5 | =========== 6 | 7 | Before running MicroExonator there are several files that need to be created inside the ``MicroExonator/`` root folder: 8 | 9 | RNA-seq samples 10 | =============== 11 | 12 | To input RNA-seq data, either a ``local_samples.tsv``, ``NCBI_accession_list.txt`` or ``sample_url.tsv`` needs to be defined. 13 | If you want to run MicroExonator over RNA-seq samples that are locally stored, they need to be defined inside ``local_samples.tsv``.
14 | MicroExonator can also download and run samples from NCBI if the corresponding SRA accession names are defined inside of ``NCBI_accession_list.txt``, 15 | in addition any ``fastq.gz`` that can be directly downloaded from a URL can be included into the analysis by defining them inside a ``sample_url.tsv``. 16 | You can find examples of these files inside the ``Examples/`` folder. 17 | It is possible to combine different types of input sources, but at least one of these files needs to be defined inside the ``MicroExonator/`` root folder. 18 | 19 | Cluster configuration 20 | ===================== 21 | 22 | If you are working on a high-performance cluster, then it is very likely that you need to submit jobs to queueing systems such as lsf, qsub, SLURM, etc. 23 | To make MicroExonator work with these queueing systems, you need to create a `cluster.json` file. 24 | We currently provide in the Examples folder a ``cluster.json`` file to run MicroExonator with `lsf `_. 25 | To adapt MicroExonator to other queueing systems please see the `SnakeMake documentation `_. 26 | 27 | Config file 28 | =========== 29 | 30 | Each MicroExonator module has certain compulsory and optional parameters that need to be defined inside a ``config.yaml`` file. 31 | The necessary content of ``config.yaml`` is described in each module section and examples can be found in the ``Examples/`` folder.
8 | 9 | 10 | -------------------------------------------------------------------------------- /docs/build/html/_static/css/badge_only.css: -------------------------------------------------------------------------------- 1 | .fa:before{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li .fa{width:.8em}ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions 
.rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}} -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff 
-------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/Roboto-Slab-Bold.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/Roboto-Slab-Regular.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- 
/docs/build/html/_static/css/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/lato-bold-italic.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/lato-bold-italic.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/lato-bold.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-bold.woff2: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/lato-bold.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/lato-normal-italic.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/lato-normal-italic.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/lato-normal.woff -------------------------------------------------------------------------------- /docs/build/html/_static/css/fonts/lato-normal.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/css/fonts/lato-normal.woff2 -------------------------------------------------------------------------------- /docs/build/html/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: 
document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: '', 4 | LANGUAGE: 'None', 5 | COLLAPSE_INDEX: false, 6 | BUILDER: 'html', 7 | FILE_SUFFIX: '.html', 8 | LINK_SUFFIX: '.html', 9 | HAS_SOURCE: true, 10 | SOURCELINK_SUFFIX: '.txt', 11 | NAVIGATION_WITH_KEYS: false 12 | }; -------------------------------------------------------------------------------- /docs/build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/_static/file.png -------------------------------------------------------------------------------- /docs/build/html/_static/js/badge_only.js: -------------------------------------------------------------------------------- 1 | !function(e){var t={};function r(n){if(t[n])return t[n].exports;var o=t[n]={i:n,l:!1,exports:{}};return e[n].call(o.exports,o,o.exports,r),o.l=!0,o.exports}r.m=e,r.c=t,r.d=function(e,t,n){r.o(e,t)||Object.defineProperty(e,t,{enumerable:!0,get:n})},r.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,"a",t),t},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r.p="",r(r.s=4)}({4:function(e,t,r){}}); -------------------------------------------------------------------------------- /docs/build/html/_static/js/html5shiv-printshiv.min.js: 
-------------------------------------------------------------------------------- 1 | /** 2 | * @preserve HTML5 Shiv 3.7.3-pre | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed 3 | */ 4 | !function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=y.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=y.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),y.elements=c+" "+a,j(b)}function f(a){var b=x[a[v]];return b||(b={},w++,a[v]=w,x[w]=b),b}function g(a,c,d){if(c||(c=b),q)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():u.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||t.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),q)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return y.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(y,b.frag)}function j(a){a||(a=b);var d=f(a);return!y.shivCSS||p||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),q||i(a,d),a}function k(a){for(var b,c=a.getElementsByTagName("*"),e=c.length,f=RegExp("^(?:"+d().join("|")+")$","i"),g=[];e--;)b=c[e],f.test(b.nodeName)&&g.push(b.applyElement(l(b)));return g}function l(a){for(var 
b,c=a.attributes,d=c.length,e=a.ownerDocument.createElement(A+":"+a.nodeName);d--;)b=c[d],b.specified&&e.setAttribute(b.nodeName,b.nodeValue);return e.style.cssText=a.style.cssText,e}function m(a){for(var b,c=a.split("{"),e=c.length,f=RegExp("(^|[\\s,>+~])("+d().join("|")+")(?=[[\\s,>+~#.:]|$)","gi"),g="$1"+A+"\\:$2";e--;)b=c[e]=c[e].split("}"),b[b.length-1]=b[b.length-1].replace(f,g),c[e]=b.join("}");return c.join("{")}function n(a){for(var b=a.length;b--;)a[b].removeNode()}function o(a){function b(){clearTimeout(g._removeSheetTimer),d&&d.removeNode(!0),d=null}var d,e,g=f(a),h=a.namespaces,i=a.parentWindow;return!B||a.printShived?a:("undefined"==typeof h[A]&&h.add(A),i.attachEvent("onbeforeprint",function(){b();for(var f,g,h,i=a.styleSheets,j=[],l=i.length,n=Array(l);l--;)n[l]=i[l];for(;h=n.pop();)if(!h.disabled&&z.test(h.media)){try{f=h.imports,g=f.length}catch(o){g=0}for(l=0;g>l;l++)n.push(f[l]);try{j.push(h.cssText)}catch(o){}}j=m(j.reverse().join("")),e=k(a),d=c(a,j)}),i.attachEvent("onafterprint",function(){n(e),clearTimeout(g._removeSheetTimer),g._removeSheetTimer=setTimeout(b,500)}),a.printShived=!0,a)}var p,q,r="3.7.3",s=a.html5||{},t=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,u=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,v="_html5shiv",w=0,x={};!function(){try{var a=b.createElement("a");a.innerHTML="",p="hidden"in a,q=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){p=!0,q=!0}}();var y={elements:s.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time 
video",version:r,shivCSS:s.shivCSS!==!1,supportsUnknownElements:q,shivMethods:s.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=y,j(b);var z=/^$|\b(?:all|print)\b/,A="html5shiv",B=!q&&function(){var c=b.documentElement;return!("undefined"==typeof b.namespaces||"undefined"==typeof b.parentWindow||"undefined"==typeof c.applyElement||"undefined"==typeof c.removeNode||"undefined"==typeof a.attachEvent)}();y.type+=" print",y.shivPrint=o,o(b),"object"==typeof module&&module.exports&&(module.exports=y)}("undefined"!=typeof window?window:this,document); -------------------------------------------------------------------------------- /docs/build/html/_static/js/html5shiv.min.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @preserve HTML5 Shiv 3.7.3 | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed 3 | */ 4 | !function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=t.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=t.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),t.elements=c+" "+a,j(b)}function f(a){var b=s[a[q]];return b||(b={},r++,a[q]=r,s[r]=b),b}function g(a,c,d){if(c||(c=b),l)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():p.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||o.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),l)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return 
t.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(t,b.frag)}function j(a){a||(a=b);var d=f(a);return!t.shivCSS||k||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||i(a,d),a}var k,l,m="3.7.3-pre",n=a.html5||{},o=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,q="_html5shiv",r=0,s={};!function(){try{var a=b.createElement("a");a.innerHTML="",k="hidden"in a,l=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){k=!0,l=!0}}();var t={elements:n.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:m,shivCSS:n.shivCSS!==!1,supportsUnknownElements:l,shivMethods:n.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=t,j(b),"object"==typeof module&&module.exports&&(module.exports=t)}("undefined"!=typeof window?window:this,document); -------------------------------------------------------------------------------- /docs/build/html/_static/js/theme.js: -------------------------------------------------------------------------------- 1 | !function(n){var e={};function t(i){if(e[i])return e[i].exports;var o=e[i]={i:i,l:!1,exports:{}};return 
n[i].call(o.exports,o,o.exports,t),o.l=!0,o.exports}t.m=n,t.c=e,t.d=function(n,e,i){t.o(n,e)||Object.defineProperty(n,e,{enumerable:!0,get:i})},t.r=function(n){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(n,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(n,"__esModule",{value:!0})},t.t=function(n,e){if(1&e&&(n=t(n)),8&e)return n;if(4&e&&"object"==typeof n&&n&&n.__esModule)return n;var i=Object.create(null);if(t.r(i),Object.defineProperty(i,"default",{enumerable:!0,value:n}),2&e&&"string"!=typeof n)for(var o in n)t.d(i,o,function(e){return n[e]}.bind(null,o));return i},t.n=function(n){var e=n&&n.__esModule?function(){return n.default}:function(){return n};return t.d(e,"a",e),e},t.o=function(n,e){return Object.prototype.hasOwnProperty.call(n,e)},t.p="",t(t.s=0)}([function(n,e,t){t(1),n.exports=t(3)},function(n,e,t){(function(){var e="undefined"!=typeof window?window.jQuery:t(2);n.exports.ThemeNav={navBar:null,win:null,winScroll:!1,winResize:!1,linkScroll:!1,winPosition:0,winHeight:null,docHeight:null,isRunning:!1,enable:function(n){var t=this;void 0===n&&(n=!0),t.isRunning||(t.isRunning=!0,e((function(e){t.init(e),t.reset(),t.win.on("hashchange",t.reset),n&&t.win.on("scroll",(function(){t.linkScroll||t.winScroll||(t.winScroll=!0,requestAnimationFrame((function(){t.onScroll()})))})),t.win.on("resize",(function(){t.winResize||(t.winResize=!0,requestAnimationFrame((function(){t.onResize()})))})),t.onResize()})))},enableSticky:function(){this.enable(!0)},init:function(n){n(document);var e=this;this.navBar=n("div.wy-side-scroll:first"),this.win=n(window),n(document).on("click","[data-toggle='wy-nav-top']",(function(){n("[data-toggle='wy-nav-shift']").toggleClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift")})).on("click",".wy-menu-vertical .current ul li a",(function(){var 
t=n(this);n("[data-toggle='wy-nav-shift']").removeClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift"),e.toggleCurrent(t),e.hashChange()})).on("click","[data-toggle='rst-current-version']",(function(){n("[data-toggle='rst-versions']").toggleClass("shift-up")})),n("table.docutils:not(.field-list,.footnote,.citation)").wrap("
"),n("table.docutils.footnote").wrap("
"),n("table.docutils.citation").wrap("
"),n(".wy-menu-vertical ul").not(".simple").siblings("a").each((function(){var t=n(this);expand=n(''),expand.on("click",(function(n){return e.toggleCurrent(t),n.stopPropagation(),!1})),t.prepend(expand)}))},reset:function(){var n=encodeURI(window.location.hash)||"#";try{var e=$(".wy-menu-vertical"),t=e.find('[href="'+n+'"]');if(0===t.length){var i=$('.document [id="'+n.substring(1)+'"]').closest("div.section");0===(t=e.find('[href="#'+i.attr("id")+'"]')).length&&(t=e.find('[href="#"]'))}t.length>0&&($(".wy-menu-vertical .current").removeClass("current"),t.addClass("current"),t.closest("li.toctree-l1").addClass("current"),t.closest("li.toctree-l1").parent().addClass("current"),t.closest("li.toctree-l1").addClass("current"),t.closest("li.toctree-l2").addClass("current"),t.closest("li.toctree-l3").addClass("current"),t.closest("li.toctree-l4").addClass("current"),t.closest("li.toctree-l5").addClass("current"),t[0].scrollIntoView())}catch(n){console.log("Error expanding nav for anchor",n)}},onScroll:function(){this.winScroll=!1;var n=this.win.scrollTop(),e=n+this.winHeight,t=this.navBar.scrollTop()+(n-this.winPosition);n<0||e>this.docHeight||(this.navBar.scrollTop(t),this.winPosition=n)},onResize:function(){this.winResize=!1,this.winHeight=this.win.height(),this.docHeight=$(document).height()},hashChange:function(){this.linkScroll=!0,this.win.one("hashchange",(function(){this.linkScroll=!1}))},toggleCurrent:function(n){var e=n.closest("li");e.siblings("li.current").removeClass("current"),e.siblings().find("li.current").removeClass("current"),e.find("> ul li.current").removeClass("current"),e.toggleClass("current")}},"undefined"!=typeof window&&(window.SphinxRtdTheme={Navigation:n.exports.ThemeNav,StickyNav:n.exports.ThemeNav}),function(){for(var n=0,e=["ms","moz","webkit","o"],t=0;t 4 | 5 | 6 | 7 | 8 | 9 | 10 | Index — MicroExonator documentation 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 
38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 |
46 | 47 | 112 | 113 |
114 | 115 | 116 | 122 | 123 | 124 |
125 | 126 |
127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 |
145 | 146 |
    147 | 148 |
  • »
  • 149 | 150 |
  • Index
  • 151 | 152 | 153 |
  • 154 | 155 | 156 | 157 |
  • 158 | 159 |
160 | 161 | 162 |
163 |
164 |
165 |
166 | 167 | 168 |

Index

169 | 170 |
171 | 172 |
173 | 174 | 175 |
176 | 177 |
178 |
179 | 180 | 181 |
182 | 183 |
184 |

185 | 186 | © Copyright 2020, Guillermo E. Parada 187 | 188 |

189 |
190 | 191 | 192 | 193 | Built with Sphinx using a 194 | 195 | theme 196 | 197 | provided by Read the Docs. 198 | 199 |
200 | 201 |
202 |
203 | 204 |
205 | 206 |
207 | 208 | 209 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | -------------------------------------------------------------------------------- /docs/build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hemberg-lab/MicroExonator/3f8d4aa9c8ace8d1fcc99e8e5cb14f782eccb5c6/docs/build/html/objects.inv -------------------------------------------------------------------------------- /docs/build/html/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Search — MicroExonator documentation 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 |
48 | 49 | 114 | 115 |
116 | 117 | 118 | 124 | 125 | 126 |
127 | 128 |
129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 |
147 | 148 |
    149 | 150 |
  • »
  • 151 | 152 |
  • Search
  • 153 | 154 | 155 |
  • 156 | 157 | 158 | 159 |
  • 160 | 161 |
162 | 163 | 164 |
165 |
166 |
167 |
168 | 169 | 176 | 177 | 178 |
179 | 180 |
181 | 182 |
183 | 184 |
185 |
186 | 187 | 188 |
189 | 190 |
191 |

192 | 193 | © Copyright 2020, Guillermo E. Parada 194 | 195 |

196 |
197 | 198 | 199 | 200 | Built with Sphinx using a 201 | 202 | theme 203 | 204 | provided by Read the Docs. 205 | 206 |
207 | 208 |
209 |
210 | 211 |
212 | 213 |
214 | 215 | 216 | 221 | 222 | 223 | 224 | 225 | 226 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | -------------------------------------------------------------------------------- /docs/build/html/support.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Support — MicroExonator documentation 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 |
47 | 48 | 113 | 114 |
115 | 116 | 117 | 123 | 124 | 125 |
126 | 127 |
128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 |
146 | 147 |
    148 | 149 |
  • »
  • 150 | 151 |
  • Support
  • 152 | 153 | 154 |
  • 155 | 156 | 157 | View page source 158 | 159 | 160 |
  • 161 | 162 |
163 | 164 | 165 |
166 |
167 |
168 |
169 | 170 |
171 |

Support

172 |

For questions, ideas, feature requests and potential bug reports please contact gp7@sanger.ac.uk.

173 |
174 | 175 | 176 |
177 | 178 |
179 |
180 | 181 | 187 | 188 | 189 |
190 | 191 |
192 |

193 | 194 | © Copyright 2020, Guillermo E. Parada 195 | 196 |

197 |
198 | 199 | 200 | 201 | Built with Sphinx using a 202 | 203 | theme 204 | 205 | provided by Read the Docs. 206 | 207 |
208 | 209 |
210 |
211 | 212 |
213 | 214 |
215 | 216 | 217 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'MicroExonator' 21 | copyright = '2020, Guillermo E. Parada' 22 | author = 'Guillermo E. Parada' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | 'sphinx_rtd_theme', 32 | ] 33 | 34 | # Add any paths that contain templates here, relative to this directory. 35 | templates_path = ['_templates'] 36 | 37 | # List of patterns, relative to source directory, that match files and 38 | # directories to ignore when looking for source files. 39 | # This pattern also affects html_static_path and html_extra_path. 40 | exclude_patterns = [] 41 | 42 | 43 | # -- Options for HTML output ------------------------------------------------- 44 | 45 | # The theme to use for HTML and HTML Help pages. See the documentation for 46 | # a list of builtin themes. 47 | # 48 | #html_theme = 'alabaster' 49 | html_theme = "sphinx_rtd_theme" 50 | 51 | # Add any paths that contain custom static files (such as style sheets) here, 52 | # relative to this directory. They are copied after the builtin static files, 53 | # so a file named "default.css" will overwrite the builtin "default.css". 54 | html_static_path = ['_static'] 55 | 56 | html_context = { 57 | 'css_files': [ 58 | '_static/theme_overrides.css', # override wide tables in RTD theme 59 | ], 60 | } 61 | 62 | master_doc = 'index' 63 | -------------------------------------------------------------------------------- /docs/source/differential_inclusion_analysis.rst: -------------------------------------------------------------------------------- 1 | .. 
differential_inclusion_analysis 2 | 3 | 4 | =============================== 5 | Differential inclusion analysis 6 | =============================== 7 | 8 | 9 | In this section we describe a downstream module that was developed to perform alternative splicing analysis between sample groups. To quantify and assess differential inclusion of novel and annotated microexons, in this module we have integrated `Whippet `_, which enables a fast and accurate assessment of alternative splicing events across user-defined sample groups. 10 | 11 | Install 12 | ======= 13 | 14 | To run this downstream module for the first time you need to create an environment that has `snakemake` and the version of `julia` that is compatible with `Whippet v0.11`. To create this environment execute the following command inside the ``MicroExonator/`` folder: 15 | 16 | .. code-block:: bash 17 | 18 | conda env create -f Whippet/julia_0.6.1.yaml 19 | 20 | Then, activate the newly created environment: 21 | 22 | .. code-block:: bash 23 | 24 | source activate julia_0.6.1 25 | 26 | Enter julia's interactive mode: 27 | 28 | .. code-block:: bash 29 | 30 | julia 31 | 32 | Install Whippet by executing the following command on the interactive session: 33 | 34 | .. code-block:: bash 35 | 36 | Pkg.add("Whippet") 37 | 38 | .. note:: 39 | 40 | To exit julia's interactive session press ``control + d``. 41 | 42 | 43 | Configure 44 | ========= 45 | 46 | Here is a list of the additional keys that need to be incorporated as part of config.yaml: 47 | 48 | .. code-block:: bash 49 | 50 | whippet_bin_folder : /path/to/miniconda/envs/julia_0.6.1/share/julia/site/v0.6/Whippet/bin 51 | Gene_anontation_GTF : /path/to/gene.annotation.gtf 52 | whippet_delta : /path/to/whippet_delta.yaml 53 | 54 | * ``whippet_bin_folder`` corresponds to the path of the Whippet binary folder (``Whippet/bin``) that is located inside the ``julia_0.6.1`` virtual environment folder.
The specific route to ``Whippet/bin`` may vary, so it is important that you manually identify the correct path. 55 | 56 | * ``Gene_anontation_GTF`` corresponds to the path of a gene annotation file in Gene Transfer Format (`GTF `_). Working with the same annotation database as the one used on the previous steps is recommended. 57 | 58 | * ``whippet_delta`` indicates the path of a `YAML `_ file you need to create to provide information about the desired comparisons between groups of samples. 59 | 60 | 61 | whippet_delta YAML file 62 | ----------------------- 63 | 64 | This file can contain the information to schedule any number of comparisons between sample groups of any size. Every comparison should have the following structure inside the YAML file: 65 | 66 | .. code-block:: bash 67 | 68 | comparison_ID: 69 | A : sample1,sample2,sample3 70 | B : sample4,sample5,sample6 71 | 72 | Where ``sample1 ... sample6`` correspond to base names given to each RNA-seq sample at the corresponding input files (See :doc:`setup`) and `comparison_ID` to any given name for the scheduled comparison. As an example see the :download:`YAML file <../../Examples/Runs/Parada_et_al/whippet_delta.yaml>` we used in our publication. 73 | 74 | .. warning:: 75 | 76 | Inside this YAML file sample groups must be named ``A`` and ``B``. 77 | 78 | 79 | Optional parameters 80 | ------------------- 81 | 82 | If you just want to skip the Discovery and Quantification modules and only assess alternative splicing events annotated in the provided GTF file, then include the following line in the configuration file: 83 | 84 | .. code-block:: bash 85 | 86 | downstream_only : T 87 | 88 | Run 89 | === 90 | 91 | In order to run this module you need to run the standard MicroExonator command, but providing ``differential_inclusion`` as a target.
If you have not run previous ``discovery`` and ``quantification`` modules, MicroExonator will include them into the job plan (unless ``downstream_only`` is set as ``T``) 92 | 93 | .. code-block:: bash 94 | 95 | snakemake -s MicroExonator.smk --cluster-config cluster.json --cluster {cluster system params} --use-conda -k -j {number of parallel jobs} differential_inclusion 96 | 97 | 98 | 99 | Output 100 | ====== 101 | 102 | Quantification files generated per each sample can be found at ``Whipet/Quant``. Differentially included microexon analyses that can be obtained with Whippet, are reported at ``Whippet/Delta`` folder. MicroExonator performs these analyses using both PSI values calculated internally by the pipeline and PSI values directly calculated with Whippet. These results are reported under the same format than the ``diff.gz`` descrived at the `Whippet's GitHub page `_. However, to provide easier interpretation, we filter the Whippet splicing nodes that correspond to microexon inclusion events, these are reported as ``.microexons`` files, where ``.diff.ME.microexons`` files correspond to the output when MicroExonator PSI values are taken as input and ``.diff.microexons`` when Whippet PSI values are taken as input. 103 | 104 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Overview 3 | ======== 4 | 5 | MicroExonator is a fully-integrated computational pipeline that allows for systematic de novo discovery and quantification 6 | of microexons using raw RNA-seq data for any organism with a gene annotation. Compared to other available methods MicroExonator 7 | is more sensitive for discovering smaller microexons and it provides higher specificity for all lengths. Moreover, MicroExonator 8 | provides integrated downstream comparative analysis between cell types or tissues using 9 | `Whippet `_. 
(`Sterne-Weiler et al. 2018 `_). 10 | 11 | MicroExonator pipeline is divided in several modules: 12 | * Discover 13 | * Quantification 14 | * Differential Inclusion 15 | * Single cell analysis 16 | 17 | **Support** 18 | 19 | For questions, ideas, feature requests and potential bug reports submit an issue on our GitHub page or write us at gp7@sanger.ac.uk. 20 | 21 | .. toctree:: 22 | :name: MicroExonator-install 23 | :maxdepth: 1 24 | :hidden: 25 | 26 | install 27 | 28 | .. toctree:: 29 | :name: MicroExonator-setup 30 | :maxdepth: 1 31 | :hidden: 32 | 33 | setup 34 | 35 | .. toctree:: 36 | :name: MicroExonator-discovery-and-quantification 37 | :maxdepth: 3 38 | :hidden: 39 | 40 | discovery_and_quantification 41 | 42 | .. toctree:: 43 | :name: MicroExonator-differential_inclusion_analysis 44 | :maxdepth: 3 45 | :hidden: 46 | 47 | differential_inclusion_analysis 48 | 49 | .. toctree:: 50 | :name: MicroExonator-single_cell_analysis 51 | :maxdepth: 3 52 | :hidden: 53 | 54 | single_cell_analysis 55 | 56 | .. toctree:: 57 | :name: MicroExonator-Licence 58 | :maxdepth: 1 59 | :hidden: 60 | 61 | licence 62 | 63 | .. toctree:: 64 | :name: MicroExonator-Support 65 | :maxdepth: 1 66 | :hidden: 67 | 68 | support 69 | -------------------------------------------------------------------------------- /docs/source/install.rst: -------------------------------------------------------------------------------- 1 | .. _Installation: 2 | 3 | ===================== 4 | Installation 5 | ===================== 6 | 7 | To install MicroExonator follow these instructions: 8 | 9 | Clone repository 10 | ================= 11 | Clone the github repository 12 | 13 | .. code-block:: bash 14 | 15 | git clone https://github.com/hemberg-lab/MicroExonator 16 | 17 | Install Miniconda 3 18 | 19 | .. 
code-block:: bash 20 | 21 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 22 | chmod +x Miniconda3-latest-Linux-x86_64.sh 23 | ./Miniconda3-latest-Linux-x86_64.sh 24 | 25 | Start using conda by opening a new terminal or just running: 26 | 27 | .. code-block:: bash 28 | 29 | bash 30 | 31 | 32 | Set up a master virtual environment 33 | =================================== 34 | 35 | Create a conda virtual enviroment with the necesary dependencies 36 | 37 | .. code-block:: bash 38 | 39 | conda create -n snakemake_env -c bioconda -c conda-forge snakemake 40 | 41 | 42 | -------------------------------------------------------------------------------- /docs/source/licence.rst: -------------------------------------------------------------------------------- 1 | .. _Licence: 2 | 3 | ===================== 4 | MIT License (MIT) 5 | ===================== 6 | 7 | Copyright (c) 2020 Guillermo Parada 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /docs/source/setup.rst: -------------------------------------------------------------------------------- 1 | .. _input_files: 2 | 3 | =========== 4 | Setup 5 | =========== 6 | 7 | Before runnung MicroExonator there are several files that needs to be created inside ``MicroExonator/`` root folder: 8 | 9 | RNA-seq samples 10 | =============== 11 | 12 | Input RNA-seq data either a ``local_samples.tsv``, ``NCBI_accession_list.txt`` or ``sample_url.tsv`` needs to be defined. 13 | If you want to run MicroExonator over RNA-seq samples that are locally stored, they need to be defined inside ``local_samples.tsv``. 14 | MicroExonator can also download and run samples from NCBI if the corresponding SRA accession names are defined inside of ``NCBI_accession_list.txt``, 15 | in addition any ``fastq.gz`` that can be directly download from a URL can be included into the aalysis by defining them inside a ``sample_url.tsv``. 16 | You can find examples of these files inside the ``Examples/`` folder. 17 | Is posible to combine different types of input sources, but at least one of these files needs to be defined inside ``MicroExonator/`` root folder. 18 | 19 | Cluster configuration 20 | ===================== 21 | 22 | If you are working on a high performace cluster, then it is very likely that you need to submit jobs to queueing systems such as lsf, qsub, SLURM, etc. 23 | To make MicroExonator work with these queueing systems, you need to create a `cluster.json` file. 24 | We currently provide in the Examples folder a ``cluster.json`` file to run MicroExonator with `lsf `_. 25 | To adapt MicroExonator to other quequing systems please see the `SnakeMake documentation `_. 26 | 27 | Config file 28 | =========== 29 | 30 | Each MicroExonator's module has certain compulsory and optional parameters that needs to be defined inside a ``config.yaml`` file. 
31 | The necesary content of ``config.yaml`` is described on each moudle section and examples can be found at the ``Examples/`` folder. 32 | -------------------------------------------------------------------------------- /docs/source/support.rst: -------------------------------------------------------------------------------- 1 | .. support 2 | 3 | ======== 4 | Support 5 | ======== 6 | 7 | For questions, ideas, feature requests and potential bug reports please contact gp7@sanger.ac.uk. 8 | 9 | 10 | -------------------------------------------------------------------------------- /envs/MicroExonator.yaml: -------------------------------------------------------------------------------- 1 | #name: Micro-Exonator 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - cgat 7 | dependencies: 8 | - r-stringi 9 | - bcftools=1.7=0 10 | - bedtools=2.27.1=he941832_2 11 | - hisat2=2.1.0=py36pl5.22.0_0 12 | - htslib=1.7=0 13 | - libdeflate=1.0=h470a237_0 14 | - perl-threaded=5.22.0=10 15 | - pybedtools=0.7.10=py36_2 16 | - pysam=0.14.1=py36hae42fb6_1 17 | - samtools=1.9=h8ee4bcc_1 18 | - snakemake=5.2.2=py36_1 19 | - snakemake-minimal=5.2.2=py36_1 20 | - sra-tools=2.9.1_1=h470a237_0 21 | - r-reshape2 22 | - aioeasywebdav=2.2.0=py36_0 23 | - aiohttp=3.3.2=py36h470a237_1 24 | - appdirs=1.4.3=py_1 25 | - asn1crypto=0.24.0=py36_0 26 | - async-timeout=3.0.0=py36_0 27 | - attrs=18.1.0=py_1 28 | - biopython=1.72=py36_0 29 | - boto3=1.7.76=py_0 30 | - botocore=1.10.77=py_0 31 | - ca-certificates=2018.8.24=ha4d7672_0 32 | - cachetools=2.1.0=py_0 33 | - certifi=2018.8.24=py36_1001 34 | - cffi=1.11.5=py36_0 35 | - chardet=3.0.4=py36_0 36 | - configargparse=0.13.0=py_1 37 | - cryptography=2.1.4=py36_0 38 | - curl=7.61.0=h93b3f91_2 39 | - decorator=4.3.0=py_0 40 | - docutils=0.14=py36_0 41 | - dropbox=8.7.1=py_0 42 | - expat=2.2.5=hfc679d8_1 43 | - filechunkio=1.8=py36_1 44 | - freetype=2.8.1=hfa320df_1 45 | - ftputil=3.4=py_0 46 | - google-api-core=0.1.4=py_0 47 | - 
google-auth=1.5.1=py_0 48 | - google-cloud-core=0.28.1=py_0 49 | - google-cloud-storage=1.10.0=py_0 50 | - google-resumable-media=0.3.1=py_0 51 | - googleapis-common-protos=1.5.3=py_1 52 | - graphite2=1.3.12=hfc679d8_0 53 | - graphviz=2.38.0=7 54 | - icu=58.2=0 55 | - idna=2.6=py36_1 56 | - idna_ssl=1.0.0=0 57 | - jinja2=2.10=py_1 58 | - jmespath=0.9.3=py_1 59 | - jpeg=9c=h470a237_1 60 | - jsonschema=2.6.0=py36_1 61 | - krb5=1.14.6=0 62 | - libgfortran=3.0.0=1 63 | - libpng=1.6.34=0 64 | - libprotobuf=3.6.0=hd28b015_0 65 | - libtiff=4.0.9 66 | - libtool=2.4.6=h470a237_1 67 | - libuuid=1.0.3=1 68 | - libxcb=1.13=0 69 | - markupsafe=1.0=py36_0 70 | - multidict=4.3.1=py36h470a237_0 71 | - ncurses=6.1=hfc679d8_1 72 | - networkx=2.1=py_1 73 | - openssl=1.0.2p=h470a237_0 74 | - packaging=17.1=py_0 75 | - pandas=0.23.4=py36hf8a1672_0 76 | - pango=1.40.14=0 77 | - paramiko=2.4.1=py36_0 78 | - perl=5.22.0.1=0 79 | - pip=9.0.* 80 | - prettytable=0.7.2=py_2 81 | - protobuf=3.6.0=py36hfc679d8_0 82 | - psutil=5.4.3=py36_0 83 | - pyasn1=0.4.2=py_0 84 | - pyasn1-modules=0.2.1=py_0 85 | - pycparser=2.18=py36_0 86 | - pygraphviz=1.4rc1=py36h470a237_0 87 | - pynacl=1.1.2=py36_0 88 | - pyopenssl=17.5.0 89 | - pyparsing=2.2.0=py36_0 90 | - pysftp=0.2.9=py36_0 91 | - pysocks=1.6.8=py36_1 92 | - python=3.6.6=h5001a0f_0 93 | - python-dateutil=2.7.3=py_0 94 | - python-irodsclient=0.7.0=py_0 95 | - pytz=2018.5=py_0 96 | - pyyaml=3.12=py36_1 97 | - r=3.4.1=r3.4.1_0 98 | - r-assertthat=0.2.0=r3.4.1_0 99 | - r-backports=1.0.5=r3.4.1_0 100 | - r-base 101 | - r-base64enc 102 | - r-bitops=1.0_6=r3.4.1_0 103 | - r-boot=1.3_20=r3.4.1_0 104 | - r-catools=1.17.1=r3.4.1_0 105 | - r-class=7.3_14=r3.4.1_0 106 | - r-cli=1.0.0=r3.4.1_0 107 | - r-cluster=2.0.6=r3.4.1_0 108 | - r-codetools=0.2_15=r3.4.1_0 109 | - r-colorspace=1.3_2=r3.4.1_0 110 | - r-crayon=1.3.4=r3.4.1_0 111 | - r-data.table=1.10.4=r3.4.1_0 112 | - r-dichromat=2.0_0=r3.4.1_0 113 | - r-digest=0.6.12=r3.4.1_0 114 | - 
r-evaluate=0.10.1=r3.4.1_0 115 | - r-foreign=0.8_67=r3.4.1_0 116 | - r-formatr=1.5=r3.4.1_0 117 | - r-ggplot2=2.2.1=r3.4.1_0 118 | - r-glue=1.2.0=r3.4.1_0 119 | - r-gtable=0.2.0=r3.4.1_0 120 | - r-highr=0.6=r3.4.1_0 121 | - r-htmltools=0.3.6=r3.4.1_0 122 | - r-jsonlite=1.5=r3.4.1_0 123 | - r-kernsmooth=2.23_15=r3.4.1_0 124 | - r-knitr=1.20=r3.4.1_0 125 | - r-labeling=0.3=r3.4.1_0 126 | - r-lattice=0.20_34=r3.4.1_0 127 | - r-lazyeval=0.2.1=r3.4.1_0 128 | - r-magrittr=1.5=r3.4.1_0 129 | - r-markdown=0.8=r3.4.1_1 130 | - r-mass=7.3_48=r3.4.1_0 131 | - r-matrix=1.2_12=r3.4.1_0 132 | - r-mgcv=1.8_17=r3.4.1_0 133 | - r-mime=0.5=r3.4.1_0 134 | - r-mixtools=1.1.0=r3.4.1_0 135 | - r-munsell=0.4.3=r3.4.1_0 136 | - r-nlme=3.1_131=r3.4.1_0 137 | - r-nnet=7.3_12=r3.4.1_0 138 | - r-pillar=1.2.1=r3.4.1_0 139 | - r-plyr=1.8.4=r3.4.1_0 140 | - r-r6=2.2.2=r3.4.1_0 141 | - r-rcolorbrewer=1.1_2=r3.4.1_0 142 | - r-rcpp=0.12.15=r3.4.1_0 143 | - r-recommended=3.4.1=r3.4.1_0 144 | - r-rlang=0.2.0=r3.4.1_0 145 | - r-rmarkdown=1.8=r3.4.1_0 146 | - r-rpart=4.1_13=r3.4.1_0 147 | - r-rprojroot=1.2=r3.4.1_0 148 | - r-scales=0.5.0=r3.4.1_0 149 | - r-segmented=0.5_2.1=r3.4.1_0 150 | - r-spatial=7.3_11=r3.4.1_0 151 | #- r-stringi 152 | - r-stringr 153 | - r-survival=2.40_1=r3.4.1_0 154 | - r-tibble=1.4.2=r3.4.1_0 155 | - r-utf8=1.1.3=r3.4.1_0 156 | - r-viridislite=0.2.0=r3.4.1_0 157 | - r-yaml=2.1.14=r3.4.1_0 158 | - ratelimiter=1.2.0=py36_0 159 | - readline=7.0=haf1bffa_1 160 | - requests=2.18.4=py36_1 161 | - rsa=3.4.2=py_1 162 | - s3transfer=0.1.13=py36_0 163 | - setuptools=39.0.1=py36_0 164 | - six=1.11.0=py36_1 165 | - sqlite=3.25.2=hb1c47c0_0 166 | - tk=8.6.8=ha92aebf_0 167 | - urllib3=1.22=py36_0 168 | - wheel=0.30.0=py36_2 169 | - wrapt=1.10.11=py36_0 170 | - xmlrunner=1.7.7=py_0 171 | - xorg-kbproto=1.0.7=1 172 | - xorg-libice=1.0.9=2 173 | - xorg-libsm=1.2.2=2 174 | - xorg-libx11=1.6.5=0 175 | - xorg-libxau=1.0.8=3 176 | - xorg-libxdmcp=1.1.2=3 177 | - xorg-libxext=1.3.3=2 178 | - 
xorg-libxrender=0.9.10=0 179 | - xorg-libxt=1.1.5=h470a237_2 180 | - xorg-renderproto=0.11.1=1 181 | - xorg-xextproto=7.3.0=1 182 | - xorg-xproto=7.0.31=6 183 | - xz=5.2.4=h470a237_1 184 | - yarl=1.2.6=py36h470a237_0 185 | - zlib=1.2.11=0 186 | - bcrypt=3.1.4=py36h621fe67_0 187 | - bzip2=1.0.6=3 188 | - cairo=1.14.12=h77bcde2_0 189 | - datrie=0.7.1=py36_0 190 | - fontconfig=2.12.4=h88586e7_1 191 | - glib=2.53.6=h5d9569c_2 192 | - gmp=6.1.0=0 193 | - gsl=2.2.1=h0c605f7_3 194 | - harfbuzz=1.7.6=hc5b324e_0 195 | - jbig=2.1=0 196 | - libffi=3.2.1=1 197 | - libgcc=5.2.0=0 198 | - libgcc-ng=7.2.0=hdf63c60_3 199 | - libgfortran-ng=7.2.0=hdf63c60_3 200 | - libiconv=1.14=0 201 | - libopenblas=0.2.20=h9ac9557_7 202 | - libssh2=1.8.0=0 203 | - libstdcxx-ng=7.2.0=hdf63c60_3 204 | - libxml2=2.9.8=h26e45fe_1 205 | - numpy=1.14.3 206 | - numpy-base=1.14.3=py36h0ea5e3f_1 207 | - pandoc=1.15.0.6=0 208 | - pcre=8.39=1 209 | - pixman=0.34.0=0 210 | - yaml=0.1.6=0 211 | #prefix: /lustre/scratch117/cellgen/team218/gp7/miniconda/envs/Micro-Exonator 212 | -------------------------------------------------------------------------------- /envs/MicroExonator.yml: -------------------------------------------------------------------------------- 1 | #name: Micro-Exonator 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - cgat 7 | dependencies: 8 | - bcftools=1.7=0 9 | - bedtools=2.27.1=he941832_2 10 | - hisat2=2.1.0=py36pl5.22.0_0 11 | - htslib=1.7=0 12 | - libdeflate=1.0=h470a237_0 13 | - perl-threaded=5.22.0=10 14 | - pybedtools=0.7.10=py36_2 15 | - pysam=0.14.1=py36hae42fb6_1 16 | - samtools=1.9=h8ee4bcc_1 17 | - snakemake=5.2.2=py36_1 18 | - snakemake-minimal=5.2.2=py36_1 19 | - sra-tools=2.9.1_1=h470a237_0 20 | - r-reshape2=1.4.1=0 21 | - aioeasywebdav=2.2.0=py36_0 22 | - aiohttp=3.3.2=py36h470a237_1 23 | - appdirs=1.4.3=py_1 24 | - asn1crypto=0.24.0=py36_0 25 | - async-timeout=3.0.0=py36_0 26 | - attrs=18.1.0=py_1 27 | - biopython=1.72=py36_0 28 | - boto3=1.7.76=py_0 29 | 
- botocore=1.10.77=py_0 30 | - ca-certificates=2018.8.24=ha4d7672_0 31 | - cachetools=2.1.0=py_0 32 | - certifi=2018.8.24=py36_1001 33 | - cffi=1.11.5=py36_0 34 | - chardet=3.0.4=py36_0 35 | - configargparse=0.13.0=py_1 36 | - cryptography=2.1.4=py36_0 37 | - curl=7.61.0=h93b3f91_2 38 | - decorator=4.3.0=py_0 39 | - docutils=0.14=py36_0 40 | - dropbox=8.7.1=py_0 41 | - expat=2.2.5=hfc679d8_1 42 | - filechunkio=1.8=py36_1 43 | - freetype=2.8.1=hfa320df_1 44 | - ftputil=3.4=py_0 45 | - google-api-core=0.1.4=py_0 46 | - google-auth=1.5.1=py_0 47 | - google-cloud-core=0.28.1=py_0 48 | - google-cloud-storage=1.10.0=py_0 49 | - google-resumable-media=0.3.1=py_0 50 | - googleapis-common-protos=1.5.3=py_1 51 | - graphite2=1.3.12=hfc679d8_0 52 | - graphviz=2.38.0=7 53 | - icu=58.2=0 54 | - idna=2.6=py36_1 55 | - idna_ssl=1.0.0=0 56 | - jinja2=2.10=py_1 57 | - jmespath=0.9.3=py_1 58 | - jpeg=9c=h470a237_1 59 | - jsonschema=2.6.0=py36_1 60 | - krb5=1.14.6=0 61 | - libgfortran=3.0.0=1 62 | - libpng=1.6.34=0 63 | - libprotobuf=3.6.0=hd28b015_0 64 | - libtiff=4.0.9=0 65 | - libtool=2.4.6=h470a237_1 66 | - libuuid=1.0.3=1 67 | - libxcb=1.13=0 68 | - markupsafe=1.0=py36_0 69 | - multidict=4.3.1=py36h470a237_0 70 | - ncurses=6.1=hfc679d8_1 71 | - networkx=2.1=py_1 72 | - openssl=1.0.2p=h470a237_0 73 | - packaging=17.1=py_0 74 | - pandas=0.23.4=py36hf8a1672_0 75 | - pango=1.40.14=0 76 | - paramiko=2.4.1=py36_0 77 | - perl=5.22.0.1=0 78 | - pip=9.0.2=py36_0 79 | - prettytable=0.7.2=py_2 80 | - protobuf=3.6.0=py36hfc679d8_0 81 | - psutil=5.4.3=py36_0 82 | - pyasn1=0.4.2=py_0 83 | - pyasn1-modules=0.2.1=py_0 84 | - pycparser=2.18=py36_0 85 | - pygraphviz=1.4rc1=py36h470a237_0 86 | - pynacl=1.1.2=py36_0 87 | - pyopenssl=17.5.0=py36_0 88 | - pyparsing=2.2.0=py36_0 89 | - pysftp=0.2.9=py36_0 90 | - pysocks=1.6.8=py36_1 91 | - python=3.6.6=h5001a0f_0 92 | - python-dateutil=2.7.3=py_0 93 | - python-irodsclient=0.7.0=py_0 94 | - pytz=2018.5=py_0 95 | - pyyaml=3.12=py36_1 96 | - 
r=3.4.1=r3.4.1_0 97 | - r-assertthat=0.2.0=r3.4.1_0 98 | - r-backports=1.0.5=r3.4.1_0 99 | - r-base=3.4.1=h4fe35fd_8 100 | - r-base64enc=0.1_3=r3.4.1_0 101 | - r-bitops=1.0_6=r3.4.1_0 102 | - r-boot=1.3_20=r3.4.1_0 103 | - r-catools=1.17.1=r3.4.1_0 104 | - r-class=7.3_14=r3.4.1_0 105 | - r-cli=1.0.0=r3.4.1_0 106 | - r-cluster=2.0.6=r3.4.1_0 107 | - r-codetools=0.2_15=r3.4.1_0 108 | - r-colorspace=1.3_2=r3.4.1_0 109 | - r-crayon=1.3.4=r3.4.1_0 110 | - r-data.table=1.10.4=r3.4.1_0 111 | - r-dichromat=2.0_0=r3.4.1_0 112 | - r-digest=0.6.12=r3.4.1_0 113 | - r-evaluate=0.10.1=r3.4.1_0 114 | - r-foreign=0.8_67=r3.4.1_0 115 | - r-formatr=1.5=r3.4.1_0 116 | - r-ggplot2=2.2.1=r3.4.1_0 117 | - r-glue=1.2.0=r3.4.1_0 118 | - r-gtable=0.2.0=r3.4.1_0 119 | - r-highr=0.6=r3.4.1_0 120 | - r-htmltools=0.3.6=r3.4.1_0 121 | - r-jsonlite=1.5=r3.4.1_0 122 | - r-kernsmooth=2.23_15=r3.4.1_0 123 | - r-knitr=1.20=r3.4.1_0 124 | - r-labeling=0.3=r3.4.1_0 125 | - r-lattice=0.20_34=r3.4.1_0 126 | - r-lazyeval=0.2.1=r3.4.1_0 127 | - r-magrittr=1.5=r3.4.1_0 128 | - r-markdown=0.8=r3.4.1_1 129 | - r-mass=7.3_48=r3.4.1_0 130 | - r-matrix=1.2_12=r3.4.1_0 131 | - r-mgcv=1.8_17=r3.4.1_0 132 | - r-mime=0.5=r3.4.1_0 133 | - r-mixtools=1.1.0=r3.4.1_0 134 | - r-munsell=0.4.3=r3.4.1_0 135 | - r-nlme=3.1_131=r3.4.1_0 136 | - r-nnet=7.3_12=r3.4.1_0 137 | - r-pillar=1.2.1=r3.4.1_0 138 | - r-plyr=1.8.4=r3.4.1_0 139 | - r-r6=2.2.2=r3.4.1_0 140 | - r-rcolorbrewer=1.1_2=r3.4.1_0 141 | - r-rcpp=0.12.15=r3.4.1_0 142 | - r-recommended=3.4.1=r3.4.1_0 143 | - r-rlang=0.2.0=r3.4.1_0 144 | - r-rmarkdown=1.8=r3.4.1_0 145 | - r-rpart=4.1_13=r3.4.1_0 146 | - r-rprojroot=1.2=r3.4.1_0 147 | - r-scales=0.5.0=r3.4.1_0 148 | - r-segmented=0.5_2.1=r3.4.1_0 149 | - r-spatial=7.3_11=r3.4.1_0 150 | - r-stringi=1.1.6=r3.4.1_0 151 | - r-stringr=1.3.0=r3.4.1_0 152 | - r-survival=2.40_1=r3.4.1_0 153 | - r-tibble=1.4.2=r3.4.1_0 154 | - r-utf8=1.1.3=r3.4.1_0 155 | - r-viridislite=0.2.0=r3.4.1_0 156 | - r-yaml=2.1.14=r3.4.1_0 157 | - 
ratelimiter=1.2.0=py36_0 158 | - readline=7.0=haf1bffa_1 159 | - requests=2.18.4=py36_1 160 | - rsa=3.4.2=py_1 161 | - s3transfer=0.1.13=py36_0 162 | - setuptools=39.0.1=py36_0 163 | - six=1.11.0=py36_1 164 | - sqlite=3.25.2=hb1c47c0_0 165 | - tk=8.6.8=ha92aebf_0 166 | - urllib3=1.22=py36_0 167 | - wheel=0.30.0=py36_2 168 | - wrapt=1.10.11=py36_0 169 | - xmlrunner=1.7.7=py_0 170 | - xorg-kbproto=1.0.7=1 171 | - xorg-libice=1.0.9=2 172 | - xorg-libsm=1.2.2=2 173 | - xorg-libx11=1.6.5=0 174 | - xorg-libxau=1.0.8=3 175 | - xorg-libxdmcp=1.1.2=3 176 | - xorg-libxext=1.3.3=2 177 | - xorg-libxrender=0.9.10=0 178 | - xorg-libxt=1.1.5=h470a237_2 179 | - xorg-renderproto=0.11.1=1 180 | - xorg-xextproto=7.3.0=1 181 | - xorg-xproto=7.0.31=6 182 | - xz=5.2.4=h470a237_1 183 | - yarl=1.2.6=py36h470a237_0 184 | - zlib=1.2.11=0 185 | - bcrypt=3.1.4=py36h621fe67_0 186 | - bzip2=1.0.6=3 187 | - cairo=1.14.12=h77bcde2_0 188 | - datrie=0.7.1=py36_0 189 | - fontconfig=2.12.4=h88586e7_1 190 | - glib=2.53.6=h5d9569c_2 191 | - gmp=6.1.0=0 192 | - gsl=2.2.1=h0c605f7_3 193 | - harfbuzz=1.7.6=hc5b324e_0 194 | - jbig=2.1=0 195 | - libffi=3.2.1=1 196 | - libgcc=5.2.0=0 197 | - libgcc-ng=7.2.0=hdf63c60_3 198 | - libgfortran-ng=7.2.0=hdf63c60_3 199 | - libiconv=1.14=0 200 | - libopenblas=0.2.20=h9ac9557_7 201 | - libssh2=1.8.0=0 202 | - libstdcxx-ng=7.2.0=hdf63c60_3 203 | - libxml2=2.9.8=h26e45fe_1 204 | - numpy=1.14.3=py36h28100ab_2 205 | - numpy-base=1.14.3=py36h0ea5e3f_1 206 | - pandoc=1.15.0.6=0 207 | - pcre=8.39=1 208 | - pixman=0.34.0=0 209 | - yaml=0.1.6=0 210 | #prefix: /lustre/scratch117/cellgen/team218/gp7/miniconda/envs/Micro-Exonator 211 | -------------------------------------------------------------------------------- /envs/R.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - r-base =3.6.3 7 | - r-ggplot2 8 | - r-mixtools 9 | - r-data.table 10 | 
-------------------------------------------------------------------------------- /envs/biopython_py3.yaml: -------------------------------------------------------------------------------- 1 | 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - biopython 6 | - python=3 7 | -------------------------------------------------------------------------------- /envs/core.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | - cgat 6 | dependencies: 7 | - bcftools=1.7 8 | - bedtools=2.27.1 9 | - hisat2=2.1.0 10 | - htslib=1.7 11 | - pybedtools=0.7.10 12 | - pysam=0.14.1 13 | - samtools=1.9 14 | - sra-tools 15 | - biopython 16 | - pandas=0.23.4 17 | - python=2.7 18 | - numpy=1.14.3 19 | - numpy-base=1.14.3 20 | - pybedtools 21 | - pyBigWig 22 | - bwa=0.7.15 23 | - bowtie 24 | - cramtools 25 | - tbb=2020.2 26 | -------------------------------------------------------------------------------- /envs/core_py3.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | - cgat 6 | dependencies: 7 | - bcftools=1.7=0 8 | - bedtools=2.27.1=he941832_2 9 | - hisat2=2.1.0=py36pl5.22.0_0 10 | - htslib=1.7=0 11 | - pybedtools=0.7.10=py36_2 12 | - pysam=0.14.1=py36hae42fb6_1 13 | - samtools=1.9=h8ee4bcc_1 14 | - sra-tools=2.9.1_1 15 | - biopython 16 | - pandas=0.23.4=py36hf8a1672_0 17 | - python=3.6.6=h5001a0f_0 18 | - numpy=1.14.3 19 | - numpy-base=1.14.3=py36h0ea5e3f_1 20 | - pybedtools 21 | - pyBigWig 22 | - bwa=0.7.15 23 | - bowtie 24 | -------------------------------------------------------------------------------- /envs/pybedtools.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | - cgat 6 | dependencies: 7 | - pybedtools=0.8.0 8 | - bedtools=2.27.* 9 | - biopython 10 | - pyBigWig 11 | - 
python=2.7.* 12 | -------------------------------------------------------------------------------- /envs/snakemake.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | -conda-forge 3 | -bioconda 4 | -defaults 5 | -cgat 6 | dependencies: 7 | -snakemake 8 | -pybedtools=0.8.0 9 | -bedtools=2.27.1 10 | -biopython 11 | -pyBigWig 12 | -python=3.* 13 | -------------------------------------------------------------------------------- /rules/Benchmark.smk: -------------------------------------------------------------------------------- 1 | 2 | ############## Gene Count ####### 3 | 4 | # rule generate_star_olego: 5 | # input: 6 | # "config["Genome_fasta"]", 7 | # 8 | # shell: 9 | # "start --runThreadN 5 --runMode genomeGenerate --genomeDir data/ " 10 | # 11 | # 12 | # 13 | # rule generate_star_index: 14 | # input: 15 | # "config["Genome_fasta"]", 16 | # 17 | # shell: 18 | # "start --runThreadN 5 --runMode genomeGenerate --genomeDir data/ " 19 | # 20 | 21 | 22 | 23 | rule total_hisat2_to_genome: 24 | input: 25 | "FASTQ/{sample}.fastq", 26 | "data/Genome.1.ht2" 27 | output: 28 | "Genome_aligments/Hisat2/{sample}.sam" 29 | threads: 5 30 | shell: 31 | "hisat2 -x data/Genome -U {input[0]} -p 5 > {output}" 32 | 33 | rule total_olego_to_Genome: 34 | input: 35 | "FASTQ/{sample}.fastq" 36 | output: 37 | "Genome_aligments/Olego/{sample}.sam" 38 | threads: 10 39 | shell: 40 | "/lustre/scratch117/cellgen/team218/gp7/olego/olego -t 10 data/Genome_olego {input} > {output}" 41 | 42 | 43 | rule total_STAR_to_Genome: 44 | input: 45 | "FASTQ/{sample}.fastq" 46 | output: 47 | "Genome_aligments/STAR/{sample}.samAligned.out.sam" 48 | threads: 5 49 | shell: 50 | "STAR --genomeDir data --readFilesIn {input} --runThreadN 5 --outFileNamePrefix {output}" 51 | 52 | rule mv_STAR: 53 | input: 54 | "Genome_aligments/STAR/{sample}.samAligned.out.sam" 55 | output: 56 | "Genome_aligments/STAR/{sample}.sam" 57 | threads: 5 58 | shell: 59 | "mv {input} 
{output}" 60 | 61 | 62 | 63 | 64 | rule total_tophat_to_Genome: 65 | input: 66 | "FASTQ/{sample}.fastq" 67 | output: 68 | dir = "Genome_aligments/Tophat2/{sample}", 69 | sam = "Genome_aligments/Tophat2/{sample}.sam" 70 | threads: 5 71 | shell: 72 | "tophat2 -p 5 --no-convert-bam --microexon-search -o {output.dir} data/Genome_bowtie2 {input} && mv {output.dir}/accepted_hit.sam {output}" 73 | 74 | 75 | rule SJ_count: 76 | input: 77 | "Genome_aligments/{Software}/{sample}.sam" 78 | output: 79 | "Genome_aligments/{Software}/{sample}.sam.SJ_count" 80 | shell: 81 | "python2 src/Get_introns_from_sam.py {input} Rd1 40 1000000 8 > {output}" 82 | 83 | 84 | rule sam_merge: 85 | input: 86 | ["Genome_aligments/{Software}/" + x for x in expand("{sample}.sam", sample=DATA ) ] 87 | output: 88 | temp("Genome_aligments/{Software}/TOTAL.sam") 89 | shell: 90 | "samtools merge {output} {input}" 91 | 92 | 93 | rule get_exons: 94 | input: 95 | "Genome_aligments/{Software}/TOTAL.sam" 96 | output: 97 | "Genome_aligments/{Software}/TOTAL.exons.{Software}" 98 | shell: 99 | "python2 Get_exons_from_sam.py {input} > {output}" 100 | 101 | 102 | 103 | 104 | rule SJ_ground_count: 105 | input: 106 | config["fastq_path"] + '{sample}.fastq.gz' 107 | output: 108 | "Ground_Truth/{sample}.GT.SJ_count" 109 | shell: 110 | "python2 SJ_count_truth.py /lustre/scratch117/cellgen/team218/gp7/Genome/mm10/Tracks/Gene_annotation/gencode.vM11.annotation.bed12 simulated_ME_isoforms.bed12 {input} > {output}" 111 | 112 | 113 | 114 | rule gene_count: 115 | input: 116 | "/lustre/scratch117/cellgen/team218/gp7/Genome/mm10/Tracks/Gene_annotation/gencode.vM11.annotation.gtf", 117 | "Genome_aligments/{sample}.sam" 118 | output: 119 | "Genome_aligments/{sample}.gene_count.txt" 120 | threads: 1 121 | shell: 122 | "featureCounts -a {input[0]} -o {output} {input[1]}" 123 | 124 | 125 | 126 | rule done_gene_count: 127 | input: 128 | expand("Genome_aligments/{sample}.gene_count.txt", sample=DATA ) 129 | output: 130 | 
"Round2/done.txt" 131 | shell: 132 | "echo done > {output}" 133 | ##### 134 | -------------------------------------------------------------------------------- /rules/Get_data.smk: -------------------------------------------------------------------------------- 1 | if str2bool(config.get("Keep_fastq_gz", False)): 2 | rule download_fastq: 3 | input: 4 | "download/{sample}.download.sh" 5 | output: 6 | "FASTQ/{sample}.fastq.gz" 7 | resources: 8 | get_data = 1 9 | conda: 10 | "../envs/core.yaml" 11 | priority: -10 12 | shell: 13 | "bash {input}" 14 | 15 | else: 16 | rule download_fastq: 17 | input: 18 | "download/{sample}.download.sh" 19 | output: 20 | temp("FASTQ/{sample}.fastq.gz") 21 | resources: 22 | get_data = 1 23 | conda: 24 | "../envs/core.yaml" 25 | priority: -10 26 | shell: 27 | "bash {input}" 28 | 29 | rule unzip: 30 | input: 31 | "FASTQ/{sample}.fastq.gz" 32 | output: 33 | temp("FASTQ/{sample}.fastq") 34 | shell: 35 | "zcat {input} > {output}" 36 | 37 | rule get_fastq: 38 | input: 39 | expand("FASTQ/{sample}.fastq.gz", sample=DATA) 40 | 41 | if "Gene_anontation_bed12" in config: 42 | pass 43 | else: 44 | rule generate_bed12: 45 | input: 46 | config["Gene_anontation_GTF"] 47 | output: 48 | "data/transcriptome.bed12" 49 | shell: 50 | "python2 src/GTFtoBED12.py {input} > {output}" 51 | 52 | config["Gene_anontation_bed12"] = "data/transcriptome.bed12" 53 | 54 | 55 | rule generate_fasta_from_bed12: 56 | input: 57 | config["Genome_fasta"], 58 | config["Gene_anontation_bed12"] 59 | output: 60 | "data/transcripts.fa" 61 | conda: 62 | "../envs/pybedtools.yaml" 63 | shell: 64 | "python2 src/Get_fasta_from_bed12.py {input} > {output}" 65 | 66 | if not "ME_len" in config: 67 | config["ME_len"] = 30 68 | 69 | if not "max_read_len" in config: 70 | config["max_read_len"] = 100 71 | 72 | rule Splice_Junction_Library: 73 | input: 74 | config["Genome_fasta"], 75 | "data/transcripts.fa", 76 | config["Gene_anontation_bed12"] 77 | params: 78 | ME_len = config["ME_len"], 79 | 
max_read_len = config["max_read_len"] 80 | output: 81 | "Round1/ME_TAGs.fa" 82 | conda: 83 | "../envs/core.yaml" 84 | shell: 85 | "python2 src/SJ_tags_generator_for_micro_exons.py {input} {params.ME_len} {params.max_read_len} > {output}" 86 | 87 | 88 | rule GetPWM: 89 | input: 90 | config["Genome_fasta"], 91 | config["Gene_anontation_bed12"] 92 | params: 93 | config["GT_AG_U2_5"], 94 | config["GT_AG_U2_3"] 95 | output: 96 | "data/GT_AG_U2_5.pwm", 97 | "data/GT_AG_U2_3.pwm" 98 | conda: 99 | "../envs/biopython_py3.yaml" 100 | shell: 101 | "python3 src/Get_splicing_PWMs.py {input} {params} {output}" 102 | 103 | #if str2bool(config.get("Only_whippet", False))==False: 104 | # rule gzip_fastq: 105 | # input: 106 | # "FASTQ/{sample}.fastq" 107 | # output: 108 | # temp("FASTQ/{sample}.fastq.gz") 109 | # priority: 100 110 | # shell: 111 | # "gzip -c {input} > {output}" 112 | 113 | #else: 114 | # rule gzip_fastq: 115 | # input: 116 | # "FASTQ/{sample}.fastq" 117 | # output: 118 | # temp("FASTQ/{sample}.fastq.gz") 119 | # priority: 100 120 | # shell: 121 | # "gzip {input}" 122 | 123 | 124 | 125 | # rule sra_to_fastq: 126 | # input: 127 | # config["input_dir"] + "/{sample}.sra" 128 | # output: 129 | # temp("data/fastq_paired/{sample}.fastq") 130 | # shell: 131 | # "fastq-dump {input} -O data/fastq_paired/" 132 | 133 | 134 | # rule fastq_gz_to_fastq: 135 | # input: 136 | # config["input_dir"] + "/{sample}.fastq.gz" 137 | # output: 138 | # temp("data/fastq/{sample}.fastq") 139 | # shell: 140 | # "gzip -dc {input} > {output}" 141 | # 142 | # rule fastq_input: 143 | # input: 144 | # config["input_dir"] + "/{sample}.fastq" 145 | # output: 146 | # "data/fastq/{sample}.fastq" 147 | # shell: 148 | # "ln -s {input} {output}" 149 | 150 | #rule download_to_fastq: 151 | # input: 152 | # "download/{sample}.download.sh" 153 | # output: 154 | # "data/fastq/{sample}.fastq" 155 | # shell: 156 | # "bash {input}" 157 | 158 | 159 | # rule split_fastq: 160 | # input: 161 | # 
"data/fastq_paired/{sample}.fastq" 162 | # output: 163 | # temp("data/fastq/{sample}.fastq") 164 | # shell: 165 | # "python2 src/split_paired_end.py {input} > {output}" 166 | -------------------------------------------------------------------------------- /rules/Round1.smk: -------------------------------------------------------------------------------- 1 | 2 | rule bwa_index: 3 | input: 4 | "Round1/ME_TAGs.fa" 5 | output: 6 | "Round1/ME_TAGs.fa.amb" 7 | conda: 8 | "../envs/core.yaml" 9 | shell: 10 | "bwa index {input}" 11 | 12 | rule Round1_bwa_mem_to_tags: 13 | input: 14 | "Round1/ME_TAGs.fa", 15 | "FASTQ/{sample}.fastq.gz", 16 | "Round1/ME_TAGs.fa.amb" 17 | output: 18 | temp("Round1/{sample}.sam") 19 | threads: 5 20 | priority: 100 21 | params: 22 | indel = config["indel_penalty"] 23 | conda: 24 | "../envs/core.yaml" 25 | shell: 26 | "bwa mem -t {threads} -O {params.indel} -L 25 {input[0]} {input[1]} | awk '$6 ~ /I/' > {output}" 27 | 28 | 29 | rule Round1_alingment_pre_processing: 30 | input: 31 | "Round1/{sample}.sam" 32 | output: 33 | temp("Round1/{sample}.sam.pre_processed") 34 | priority: 100 35 | conda: 36 | "../envs/core.yaml" 37 | shell: 38 | "python2 src/alingment_pre_processing.py {input} F > {output}" 39 | -------------------------------------------------------------------------------- /rules/Round1_post_processing.smk: -------------------------------------------------------------------------------- 1 | 2 | 3 | rule row_Micro_Exon_reads: 4 | input: 5 | config["Genome_fasta"], 6 | "Round1/{sample}.sam.pre_processed", 7 | "FASTQ/{sample}.fastq.gz" 8 | output: 9 | temp("Round1/{sample}.sam.row_ME"), 10 | temp("Round1/{sample}.sam.row_ME.fastq") 11 | conda: 12 | "../envs/core.yaml" 13 | shell: 14 | "python2 src/row_ME2.py {input} > {output[0]}" 15 | 16 | 17 | rule hisat2_genome_index: 18 | input: 19 | config["Genome_fasta"] 20 | output: 21 | "data/Genome.1.ht2" 22 | threads: 5 23 | conda: 24 | "../envs/core.yaml" 25 | shell: 26 | "hisat2-build {input} 
data/Genome" 27 | 28 | if str2bool(config.get("skip_genome_alignment", False)): 29 | 30 | rule hisat2_to_Genome: 31 | input: 32 | "Round1/{sample}.sam.row_ME.fastq", 33 | "data/Genome.1.ht2" 34 | output: 35 | temp("Round1/{sample}.sam.row_ME.Genome.Aligned.out.sam") 36 | threads: 1 37 | conda: 38 | "../envs/core.yaml" 39 | shell: 40 | "touch {output}" 41 | else: 42 | 43 | rule hisat2_to_Genome: 44 | input: 45 | "Round1/{sample}.sam.row_ME.fastq", 46 | "data/Genome.1.ht2" 47 | output: 48 | temp("Round1/{sample}.sam.row_ME.Genome.Aligned.out.sam") 49 | threads: 1 50 | conda: 51 | "../envs/core.yaml" 52 | shell: 53 | "hisat2 -x data/Genome -U {input[0]} > {output}" 54 | 55 | 56 | rule Round1_filter: 57 | input: 58 | config["Genome_fasta"], 59 | "Round1/{sample}.sam.row_ME", 60 | "Round1/{sample}.sam.row_ME.Genome.Aligned.out.sam", 61 | "data/GT_AG_U2_5.pwm", 62 | "data/GT_AG_U2_3.pwm" 63 | params: 64 | bw = config["conservation_bigwig"], 65 | ME_len = config["ME_len"] 66 | output: 67 | protected("Round1/{sample}.sam.row_ME.filter1") 68 | conda: 69 | "../envs/pybedtools.yaml" 70 | shell: 71 | "python2 src/ME_filter1.py {input} {params.bw} {params.ME_len} > {output}" 72 | 73 | 74 | rule Micro_Exon_table: 75 | input: 76 | expand("Round1/{sample}.sam.row_ME.filter1", sample=DATA ) 77 | output: 78 | protected("Round1/TOTAL/TOTAL.sam.row_ME.filter1.ME_centric") 79 | conda: 80 | "../envs/core.yaml" 81 | shell: 82 | "cat Round1/*.sam.row_ME.filter1 | awk 'NF==16' > Round1/TOTAL/TOTAL.sam.row_ME.filter1 &&" 83 | "python2 src/ME_centric_table.py Round1/TOTAL/TOTAL.sam.row_ME.filter1 > {output}" 84 | 85 | -------------------------------------------------------------------------------- /rules/Round2.smk: -------------------------------------------------------------------------------- 1 | rule Micro_Exon_Tags: 2 | input: 3 | "Round1/ME_TAGs.fa", 4 | "Round1/TOTAL/TOTAL.sam.row_ME.filter1.ME_centric" 5 | output: 6 | "Round2/ME_canonical_SJ_tags.de_novo.fa" 7 | conda: 8 | 
"../envs/core.yaml" 9 | shell: 10 | "python2 src/Micro_exons_tags.py {input} > {output}" 11 | 12 | rule Get_ME_from_annotation: 13 | input: 14 | config["Genome_fasta"], 15 | "Round1/TOTAL/TOTAL.sam.row_ME.filter1.ME_centric", 16 | config["Gene_anontation_bed12"], 17 | "data/GT_AG_U2_5.pwm", 18 | "data/GT_AG_U2_3.pwm", 19 | config["ME_DB"] 20 | params: 21 | bw = config["conservation_bigwig"], 22 | ME_len = config["ME_len"] 23 | output: 24 | "data/ME_canonical_SJ_tags.DB.fa", 25 | "data/DB.ME_centric" 26 | conda: 27 | "../envs/pybedtools.yaml" 28 | shell: 29 | "python2 src/Get_annotated_microexons.py {input[0]} {input[1]} {input[2]} {input[3]} {input[4]} {params.bw} {params.ME_len} {input[5]} " 30 | 31 | 32 | rule merge_tags: 33 | input: 34 | "Round2/ME_canonical_SJ_tags.de_novo.fa", 35 | "data/ME_canonical_SJ_tags.DB.fa" 36 | output: 37 | "Round2/ME_canonical_SJ_tags.fa" 38 | conda: 39 | "../envs/core.yaml" 40 | shell: 41 | "cat {input[0]} {input[1]} > {output}" 42 | 43 | 44 | rule merge_ME_centric: 45 | input: 46 | "Round1/TOTAL/TOTAL.sam.row_ME.filter1.ME_centric", 47 | "data/DB.ME_centric" 48 | output: 49 | "Round2/TOTAL.ME_centric.txt" 50 | conda: 51 | "../envs/core.yaml" 52 | shell: 53 | "cat {input[0]} {input[1]} > {output}" 54 | 55 | 56 | rule Round2_bowtie_tags_index: 57 | input: 58 | "Round2/ME_canonical_SJ_tags.fa" 59 | output: 60 | "Round2/ME_canonical_SJ_tags.fa.1.ebwt" 61 | conda: 62 | "../envs/core.yaml" 63 | shell: 64 | "bowtie-build {input} {input}" 65 | 66 | rule download_fastq2: 67 | input: 68 | "download/{sample}.download.sh", 69 | "Round2/TOTAL.ME_centric.txt" 70 | params: 71 | "FASTQ/{sample}.fastq" 72 | output: 73 | temp("FASTQ/round2/{sample}.fastq") 74 | priority: -10 75 | resources: 76 | get_data = 1 77 | conda: 78 | "../envs/core.yaml" 79 | shell: 80 | #"bash {input[0]}" 81 | "bash {input[0]} && mv {params} {output}" 82 | 83 | def hard_drive_behavior(fastq): 84 | if config.get("Optimize_hard_drive", False)=="T": 85 | 86 | if 
"validate_fastq_list" in config: 87 | 88 | to_validate = set[()] 89 | 90 | with open(config["validate_fastq_list"]) as fastq_list: 91 | reader = csv.reader(fastq_list, delimiter="\t") 92 | for row in reader: 93 | to_validate.add(row[0]) 94 | 95 | if fastq in to_validate: 96 | return("FASTQ/round2/" + fastq + ".fastq.gz.valid") 97 | else: 98 | return( "FASTQ/round2/" + fastq + ".fastq.gz") 99 | 100 | else: 101 | return( "FASTQ/round2/" + fastq + ".fastq.gz") 102 | else: 103 | 104 | if "validate_fastq_list" in config: 105 | 106 | to_validate = set([]) 107 | 108 | with open(config["validate_fastq_list"]) as fastq_list: 109 | reader = csv.reader(fastq_list, delimiter="\t") 110 | for row in reader: 111 | to_validate.add(row[0]) 112 | 113 | if fastq in to_validate: 114 | return("FASTQ/" + fastq + ".fastq.gz.valid") 115 | else: 116 | return( "FASTQ/" + fastq + ".fastq.gz") 117 | else: 118 | 119 | return("FASTQ/" + fastq + ".fastq.gz") 120 | 121 | 122 | rule validate_fastq: 123 | input: 124 | "FASTQ/{sample}.fastq.gz" 125 | output: 126 | "FASTQ/{sample}.fastq.gz.valid" 127 | shell: 128 | "python3 src/validate_fastq.py {input}" 129 | 130 | rule validate_fastq2: 131 | input: 132 | "FASTQ/round2/{sample}.fastq.gz" 133 | output: 134 | "FASTQ/round2/{sample}.fastq.gz.valid" 135 | shell: 136 | "python3 src/validate_fastq.py {input}" 137 | 138 | rule Round2_bowtie_to_tags: 139 | input: 140 | "Round2/ME_canonical_SJ_tags.fa", 141 | hard_drive_behavior("{sample}"), 142 | "Round2/ME_canonical_SJ_tags.fa.1.ebwt" 143 | output: 144 | temp("Round2/{sample}.sam") 145 | threads: 5 146 | priority: 100 147 | conda: 148 | "../envs/core.yaml" 149 | shell: 150 | "gzip -dc {input[1]} | bowtie {input[0]} -p {threads} -q - -S -v 2 --seed 123 | awk '!($6 ~ /I/) && !($6 ~ /D/) && !($6 ~ /S/) && !($6 ~ /*/)' > {output}" 151 | 152 | 153 | rule Round2_alingment_pre_processing: 154 | input: 155 | "Round2/{sample}.sam" 156 | output: 157 | temp("Round2/{sample}.sam.pre_processed") 158 | priority: 100 159 
| conda: 160 | "../envs/core.yaml" 161 | shell: 162 | "python2 src/alingment_pre_processing_round2_bowtie.py {input} F > {output}" 163 | -------------------------------------------------------------------------------- /rules/Round2_post_processing.smk: -------------------------------------------------------------------------------- 1 | 2 | rule ME_reads: 3 | input: 4 | "Round2/{sample}.sam.pre_processed", 5 | "FASTQ/{sample}.fastq.gz" 6 | output: 7 | temp("Round2/{sample}.sam.pre_processed.fastq") 8 | priority: 100 9 | conda: 10 | "../envs/core.yaml" 11 | shell: 12 | "python2 src/round2_ME_reads_fastq2.py {input}" 13 | 14 | rule Get_Genome: 15 | input: 16 | config["Genome_fasta"] 17 | output: 18 | "data/Genome" 19 | priority: 100 20 | shell: 21 | "cp {input} {output}" 22 | 23 | rule bowtie_genome_index: 24 | input: 25 | "data/Genome" 26 | output: 27 | "data/Genome" + ".1.ebwt" 28 | priority: 100 29 | conda: 30 | "../envs/core.yaml" 31 | shell: 32 | "bowtie-build {input} {input}" 33 | 34 | if str2bool(config.get("skip_genome_alignment", False)): 35 | 36 | rule bowtie_to_genome: 37 | input: 38 | "Round2/{sample}.sam.pre_processed.fastq", 39 | "data/Genome", 40 | "data/Genome" + ".1.ebwt" 41 | output: 42 | temp("Round2/{sample}.sam.pre_processed.hg19.sam") 43 | priority: 100 44 | conda: 45 | "../envs/core.yaml" 46 | shell: 47 | "touch {output}" 48 | else: 49 | 50 | rule bowtie_to_genome: 51 | input: 52 | "Round2/{sample}.sam.pre_processed.fastq", 53 | "data/Genome", 54 | "data/Genome" + ".1.ebwt" 55 | output: 56 | temp("Round2/{sample}.sam.pre_processed.hg19.sam") 57 | priority: 100 58 | conda: 59 | "../envs/core.yaml" 60 | shell: 61 | "bowtie {input[1]} -p 1 -q {input[0]} -S -v 2 --seed 123| awk '$2==0 || $2==16'> {output}" 62 | 63 | 64 | rule Round2_filter: 65 | input: 66 | "Round2/{sample}.sam.pre_processed", 67 | "Round2/{sample}.sam.pre_processed.hg19.sam", 68 | output: 69 | temp("Round2/{sample}.sam.pre_processed.filter1") 70 | priority: 100 71 | conda: 72 | 
"../envs/core.yaml" 73 | shell: 74 | "python2 src/Filter1_round2.py {input} > {output}" 75 | 76 | 77 | rule ME_SJ_coverage: 78 | input: 79 | "Round2/ME_canonical_SJ_tags.fa", 80 | "Round2/TOTAL.ME_centric.txt", 81 | config["Gene_anontation_bed12"], 82 | "Round2/{sample}.sam.pre_processed.filter1" 83 | params: 84 | ME_len = config["ME_len"] 85 | output: 86 | protected("Round2/{sample}.sam.pre_processed.filter1.ME_SJ_coverage") 87 | priority: 100 88 | conda: 89 | "../envs/core.yaml" 90 | shell: 91 | "python2 src/ME_SJ_coverage.py {input} {params.ME_len} > {output}" 92 | 93 | 94 | rule Total_sample_exon_counts: 95 | input: 96 | expand("Round2/{sample}.sam.pre_processed.filter1.ME_SJ_coverage", sample=DATA ) 97 | output: 98 | "Round2/TOTAL.filter1.ME_SJ_coverage" 99 | conda: 100 | "../envs/core.yaml" 101 | shell: 102 | "cat Round2/*.filter1.ME_SJ_coverage > {output}" 103 | 104 | rule write_ME_matches: 105 | input: 106 | "Round2/TOTAL.ME_centric.txt" 107 | output: 108 | "Round2/TOTAL.ME_centric.ME_matches.txt" 109 | conda: 110 | "../envs/core_py3.yaml" 111 | shell: 112 | "python3 src/Get_ME_matches.py {input} > {output}" 113 | 114 | 115 | def get_min_reads(): 116 | if 'min_reads_PSI' in config: 117 | return(int(config['min_reads_PSI'])) 118 | else: 119 | return(5) 120 | 121 | 122 | rule coverage_filter: 123 | input: 124 | "Round2/TOTAL.filter1.ME_SJ_coverage" 125 | params: 126 | min_reads_sample = get_min_reads() 127 | output: 128 | "Round2/TOTAL.sample_cov_filter.txt" 129 | script: 130 | "../src/coverage_sample_filter.py" 131 | 132 | def get_min_conservation(): 133 | if "min_conservation" in config: 134 | return(int(config["min_conservation"])) 135 | else: 136 | return(2) #default value for min_conservation is 2 137 | 138 | rule Output: 139 | input: 140 | ME_table = "Round2/TOTAL.ME_centric.txt", 141 | ME_coverage = "Round2/TOTAL.sample_cov_filter.txt", 142 | ME_matches_file = "Round2/TOTAL.ME_centric.ME_matches.txt" 143 | params: 144 | wd = 
config["working_directory"], 145 | min_number_files_detected = config["min_number_files_detected"], 146 | skip_mixture = str(str2bool(config.get("skip_mixture_model_filter", False))), 147 | min_conservation = get_min_conservation() 148 | output: 149 | out_filtered_ME = "Report/out_filtered_ME.txt", 150 | out_low_scored_ME = "Report/out_low_scored_ME.txt", 151 | out_shorter_than_3_ME = "Report/out_shorter_than_3_ME.txt", 152 | #"Report/report.html", 153 | #out_filtered_ME_cov = "Report/out_filtered_ME.cov.txt" 154 | log: 155 | "logs/Output.log" 156 | conda: 157 | "../envs/R.yaml" 158 | script: 159 | "../src/final_filters3.R" 160 | 161 | # shell: 162 | # '''R -e 'rmarkdown::render("src/final_filters2.Rmd",params = list(ME_table="{params.wd}{input[0]}", ME_coverage="{params.wd}{input[1]}", ME_matches_file="{params.wd}{input[2]}", out_filtered_ME="{params.wd}{output[0]}", out_low_scored_ME="{params.wd}{output[1]}", out_shorter_than_3_ME="{params.wd}{output[2]}", min_number_files_detected={params.min_number_files_detected}, out_filtered_ME_cov="{params.wd}{output[4]}" ), output_file="{params.wd}{output[3]}")' 2> {log} ''' 163 | 164 | 165 | rule high_confident_filters: 166 | input: 167 | config["Genome_fasta"], 168 | config["Gene_anontation_bed12"], 169 | "Round2/TOTAL.filter1.ME_SJ_coverage", 170 | "Report/out_filtered_ME.txt", 171 | "Report/out_low_scored_ME.txt" 172 | output: 173 | "Report/out.high_quality.txt" 174 | conda: 175 | "../envs/core_py3.yaml" 176 | shell: 177 | "python src/high_confident_list.py {input} > {output}" 178 | 179 | 180 | rule coverage_to_PSI: 181 | input: 182 | "Round2/TOTAL.filter1.ME_SJ_coverage" 183 | params: 184 | config["min_reads_PSI"], 185 | config["paired_samples"] 186 | output: 187 | "Report/out_filtered_ME.PSI.txt" 188 | conda: 189 | "../envs/core_py3.yaml" 190 | shell: 191 | "python src/counts_to_PSI.py {input} {params} > {output}" 192 | 193 | 194 | rule annotation_stats: 195 | input: 196 | config["Gene_anontation_bed12"], 197 | 
"Report/out.high_quality.txt", 198 | params: 199 | 30 200 | output: 201 | "Report/stats/Microexons.not_consensus", 202 | "Report/stats/Microexons.annotation.stats" 203 | conda: 204 | "../envs/core_py3.yaml" 205 | shell: 206 | "python3 src/stats/discovery_stats.py {input} {params}" 207 | -------------------------------------------------------------------------------- /rules/Whippet_delta.smk: -------------------------------------------------------------------------------- 1 | 2 | comparison_names = whippet_delta.keys() 3 | 4 | 5 | if "whippet_delta" in config: 6 | 7 | if str2bool(config.get("Only_whippet", False)): 8 | rule differential_inclusion: 9 | input: 10 | expand("Whippet/Delta/{comparison_name}.diff.gz", comparison_name=comparison_names) 11 | else: 12 | rule differential_inclusion: 13 | input: 14 | expand("Whippet/Delta/{comparison_name}.diff.microexons", comparison_name=comparison_names), 15 | expand("Whippet/Delta/{comparison_name}.diff.ME.microexons", comparison_name=comparison_names) 16 | 17 | 18 | rule whippet_delta: 19 | input: 20 | lambda wildcards : expand("Whippet/Quant/{sample}.psi.gz", sample= whippet_delta[wildcards.comparison_name]["A"].split(",")), 21 | lambda wildcards : expand("Whippet/Quant/{sample}.psi.gz", sample= whippet_delta[wildcards.comparison_name]["B"].split(",")) 22 | output: 23 | "Whippet/Delta/{comparison_name}.diff.gz" 24 | params: 25 | bin = config["whippet_bin_folder"], 26 | a = lambda wildcards : ",".join(expand("Whippet/Quant/{sample}.psi.gz", sample= whippet_delta[wildcards.comparison_name]["A"].split(","))), 27 | b = lambda wildcards : ",".join(expand("Whippet/Quant/{sample}.psi.gz", sample= whippet_delta[wildcards.comparison_name]["B"].split(","))), 28 | o = lambda wildcards : "Whippet/Delta/" + wildcards.comparison_name, 29 | julia = config["julia"] 30 | shell: 31 | "{params.julia} {params.bin}/whippet-delta.jl -a {params.a} -b {params.b} -o {params.o}" 32 | 33 | 34 | 35 | rule whippet_delta_ME: 36 | input: 37 | lambda 
wildcards : expand("Whippet/Quant/{sample}.psi.ME.gz", sample= whippet_delta[wildcards.comparison_name]["A"].split(",")), 38 | lambda wildcards : expand("Whippet/Quant/{sample}.psi.ME.gz", sample= whippet_delta[wildcards.comparison_name]["B"].split(",")) 39 | output: 40 | "Whippet/Delta/{comparison_name}.ME.diff.gz" 41 | params: 42 | bin = config["whippet_bin_folder"], 43 | a = lambda wildcards : ",".join(expand("Whippet/Quant/{sample}.psi.ME.gz", sample= whippet_delta[wildcards.comparison_name]["A"].split(","))), 44 | b = lambda wildcards : ",".join(expand("Whippet/Quant/{sample}.psi.ME.gz", sample= whippet_delta[wildcards.comparison_name]["B"].split(","))), 45 | o = lambda wildcards : "Whippet/Delta/" + wildcards.comparison_name + ".ME", 46 | julia = config["julia"] 47 | shell: 48 | "{params.julia} {params.bin}/whippet-delta.jl -a {params.a} -b {params.b} -o {params.o} " 49 | 50 | 51 | -------------------------------------------------------------------------------- /rules/pseudo_pool.smk: -------------------------------------------------------------------------------- 1 | import glob, os 2 | import random 3 | import csv 4 | import gzip 5 | from collections import defaultdict 6 | 7 | 8 | def partition (list_in, n): # Function to do random pooling 9 | random.shuffle(list_in) 10 | return [list_in[i::n] for i in range(n)] 11 | 12 | #n_sb = 5 13 | if "n_pseudo_bulks" in config: 14 | n_cells = int(config["n_pseudo_bulks"]) 15 | else: 16 | n_cells = 15 17 | 18 | cluster_files_pb = dict() 19 | sb_IDs = set() 20 | 21 | for cluster, files in cluster_files.items(): 22 | sb = 1 23 | n_sb = round(len(files)/n_cells) 24 | if n_sb<3: 25 | n_sb=3 26 | 27 | for pool in partition(files, n_sb): 28 | cluster_files_pb[(cluster, sb)] = pool 29 | sb_IDs.add(cluster + "_" + str(sb)) 30 | sb += 1 31 | 32 | rule get_pseudo_bulk_membership: 33 | output: 34 | table = "Whippet/Quant/Single_Cell/Pseudo_bulks/pseudo_bulk_membership.tsv" 35 | run: 36 | with open(output.table, "w") as out: 37 | 
def get_files_by_cluster_pb(cluster, pool_ID):
    """Return the FASTQ file paths belonging to one pseudo-bulk pool.

    Looks up the sample names registered for ``(cluster, pool_ID)`` in
    the module-level ``cluster_files_pb`` mapping and turns each one
    into a ``FASTQ/<sample>.fastq.gz`` path.
    """
    samples = cluster_files_pb[(cluster, int(pool_ID))]
    return ["FASTQ/{}.fastq.gz".format(name) for name in samples]
def get_sasshimi_window(ME, w):
    """Build a browser-style region string centred on a micro-exon start.

    ``ME`` is an underscore-joined ID whose *last three* fields are
    strand, start and end — the chromosome name itself may contain
    underscores, so splitting is done from the right.  The window spans
    ``w`` bases either side of the micro-exon *start* coordinate,
    e.g. ``"chr1:50-150"``.
    """
    fields = ME.split("_")
    chrom = "_".join(fields[:-3])
    _strand, me_start, _me_end = fields[-3:]
    anchor = int(me_start)
    return "{}:{}-{}".format(chrom, anchor - w, anchor + w)
float(row["mean_PSI"])>0.9: 22 | target_ME.add(row["ME"]) 23 | 24 | rule ggsashmi_bulk_scripts: 25 | params: 26 | gtf = config["Gene_anontation_GTF"], 27 | tsv = config["sashimi_tsv"], 28 | region = lambda w: get_sasshimi_window(w.ME, 10000), 29 | out = "ggsashimi/{ME}", 30 | pallete = config["sashimi_pallete"] 31 | output: 32 | "ggsashimi/{ME}.sh" 33 | #"ggsashimi/{ME}.pdf" 34 | shell: 35 | "echo python src/sashimi-plot.py -b {params.tsv} -c {params.region} -g {params.gtf} -o {params.out} -P {params.pallete} -C 3 -O 3 -A mean > {output}" 36 | 37 | rule run_sashimi: 38 | input: 39 | "ggsashimi/{ME}.sh" 40 | output: 41 | "ggsashimi/{ME}.pdf" 42 | shell: 43 | "bash {input}" 44 | 45 | rule get_sashimis_bulk: 46 | input: 47 | expand("ggsashimi/{ME}.pdf", ME=target_ME) 48 | -------------------------------------------------------------------------------- /src/Filter1_round2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | from collections import defaultdict 4 | 5 | csv.field_size_limit(100000000) 6 | 7 | def main(pre_processed, genome_sam): 8 | 9 | read_SJ = defaultdict(set) 10 | black_list = set([]) 11 | 12 | # for row in csv.reader(open(dust), delimiter = '>'): 13 | 14 | # black_list.add(row[1]) 15 | 16 | # for row in csv.reader(open(repbase), delimiter = '\t'): 17 | 18 | # black_list.add(row[9]) 19 | 20 | for row in csv.reader(open(genome_sam), delimiter = '\t'): 21 | 22 | try: 23 | if row[1]=="0" or row[1]=="16": 24 | black_list.add(row[0]) 25 | except ValueError: 26 | pass 27 | 28 | for row in csv.reader(open(pre_processed), delimiter = '\t'): 29 | 30 | try: 31 | read, flag, tag, start, cigar, seq, qual = row 32 | 33 | SJ = tag.split("|")[0] 34 | read_SJ[read].add(SJ) 35 | except ValueError: 36 | pass 37 | 38 | for row in csv.reader(open(pre_processed), delimiter = '\t'): 39 | try: 40 | read, flag, tag, start, cigar, seq, qual = row 41 | 42 | #if (read in black_list)==False and 
len(read_SJ[read])==1: 43 | if (read in black_list)==False: 44 | print "\t".join(row) 45 | except ValueError: 46 | pass 47 | #print black_list 48 | 49 | 50 | main(sys.argv[1], sys.argv[2]) #, sys.argv[3], sys.argv[4]) 51 | -------------------------------------------------------------------------------- /src/GTFtoBED12.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | from collections import defaultdict 4 | 5 | def main(gtf_file): 6 | 7 | with open(gtf_file) as gtf: 8 | 9 | transcript_coords = dict() 10 | transcript_qstarts_blocksize = defaultdict(list) 11 | reader = csv.reader(gtf, delimiter="\t") 12 | 13 | for row in reader: 14 | 15 | if row[0][0]!="#": 16 | 17 | chrom = row[0] 18 | group = row[1] 19 | blocktype = row[2] 20 | block_start = int(row[3]) - 1 21 | block_end = int(row[4]) 22 | block_size = block_end - block_start 23 | strand = row[6] 24 | 25 | tags = row[8].strip(" ").split(";") 26 | 27 | for t in tags: 28 | pair = t.strip(" ").split(" ") 29 | if len(pair)==2: 30 | ID_type, ID = pair 31 | if ID_type == "transcript_id": 32 | transcript = ID.strip('"') 33 | 34 | if blocktype == 'transcript': 35 | 36 | transcript_coords[transcript] = (chrom, block_start, block_end, strand) 37 | 38 | if blocktype == 'exon': 39 | 40 | exon_size = block_end - block_start 41 | 42 | 43 | transcript_qstarts_blocksize[transcript].append((block_start, exon_size)) 44 | 45 | 46 | for transcript in transcript_coords: 47 | 48 | 49 | chrom, start, end, strand = transcript_coords[transcript] 50 | 51 | n_blocks = len(transcript_qstarts_blocksize[transcript]) 52 | 53 | 54 | q_b_tuples = sorted(transcript_qstarts_blocksize[transcript] , key=lambda x: x[0]) 55 | 56 | qstarts_list = [x[0] for x in q_b_tuples ] 57 | blocksizes_list = [x[1] for x in q_b_tuples ] 58 | 59 | qstarts = ",".join(map(str, [x - start for x in qstarts_list] )) 60 | blocksizes = ",".join(map(str, blocksizes_list)) 61 | 62 | 63 | bed12 = [chrom, 
def main(ME_centric):
    """Expand the per-match list of an ME-centric table into a flat TSV.

    Each input row carries a comma-separated ``total_ME`` field whose
    entries are ``coords|U2_score|conservation`` triplets; one output
    line is printed per match, annotated with the micro-exon length and
    U2 scores of the parent row.

    Closes the input handle (it was left open before) and drops the
    unused strand/chrom locals of the original.
    """
    header = ["ME", "U2_score", "Vertebrate_conservation", "ME_len", "ME_max_U2"]
    print("\t".join(header))

    with open(ME_centric) as table:
        for row in csv.reader(table, delimiter='\t'):
            # 12-column ME-centric row; only three fields are used here.
            len_micro_exon_seq_found = row[5]
            U2_scores = row[8]
            total_ME = row[11]

            for ME_match in total_ME.split(","):
                print("\t".join(ME_match.split("|") + [len_micro_exon_seq_found, U2_scores]))
def main(sam):
    """Count internal exons implied by spliced (N-containing) SAM alignments.

    For every alignment whose CIGAR contains an ``N`` operation, exon
    blocks are reconstructed from the CIGAR and each *internal* exon
    (flanked by introns on both sides) is counted.  Output is one line
    per exon: ``chrom_start_end count`` (0-based, half-open coords).

    Cleanups vs. the original: the strand/mate bookkeeping and the
    reverse-complement of the read sequence were dead code — neither
    ever influenced the reported exon IDs — so they are removed
    (this also stops unusual SAM flags from raising KeyError); the
    redundant in-loop ``pair_ori`` recomputation is gone; the file is
    closed via ``with``; the Python-2 print is now ``print()``.
    """
    exon_count = defaultdict(int)

    with open(sam) as handle:
        for row in csv.reader(handle, delimiter='\t'):
            # Skip headers and unspliced alignments.
            if row[0][0] == "@" or "N" not in row[5]:
                continue

            chrom = row[2]
            start = int(row[3]) - 1  # SAM POS is 1-based; work 0-based
            cigar = row[5]

            # Parse CIGAR into (op, length) pairs.
            num = ""
            ops = []
            for ch in cigar:
                if ch.isdigit():
                    num += ch
                else:
                    ops.append((ch, int(num)))
                    num = ""

            # Rebuild exon blocks: M/D consume reference, I consumes
            # none, N closes the current exon and opens the next one.
            exon_starts = [start]
            exon_ends = []
            block = 0
            for op, length in ops:
                if op in ("M", "D"):
                    block += length
                elif op == "N":
                    exon_ends.append(exon_starts[-1] + block)
                    exon_starts.append(exon_ends[-1] + length)
                    block = 0
            exon_ends.append(exon_starts[-1] + block)

            # Internal exons only (need at least 3 blocks).
            if len(exon_starts) >= 3:
                for estart, eend in zip(exon_starts[1:-1], exon_ends[1:-1]):
                    exon = "_".join(map(str, [chrom, estart, eend]))
                    exon_count[exon] += 1

    for exon, count in exon_count.items():
        print(exon, count)
# SAM flag -> [mate number (1 = Rd1 / -1 = Rd2), alignment strand (1 / -1)]
flag_dict = {'73':[1,1], '89':[1,1], '121':[1,-1], '153':[-1,-1], '185':[-1,-1], '137':[-1,1], '99':[1,1], '147':[-1,-1], '83':[1,-1], '163':[-1,1], '67':[1,1], '115':[1,-1], '179':[-1,-1], '81':[1,-1], "161":[-1,1], '97':[1,1], '145':[-1,-1], '65':[1,1], '129':[-1,1], '113':[1,-1], '177':[-1,-1] }


def main(sam, forward, min_ilen, max_ilen, anchor):
    """Count introns implied by spliced (N-containing) SAM alignments.

    *forward* says which mate reads the sense strand ("Rd1" or "Rd2");
    it determines the ``+``/``-`` in each intron ID.  Output is one
    line per intron: ``chrom_strand_start_end count`` (0-based).

    *min_ilen*, *max_ilen* and *anchor* are kept for interface
    compatibility, but the length/anchor filter that used them is
    commented out upstream — as in the original they have no effect.

    Cleanups vs. the original: the reverse-complement of the read
    sequence was computed and never used (dead code, removed); the
    in-loop recomputation of ``pair_ori`` duplicated the pre-loop one
    (hoisted); the file is closed via ``with``; ``print()`` replaces
    the Python-2 print statement.
    """
    intron_count = defaultdict(int)

    # Orientation of the library: +1 if Rd1 is forward, -1 if Rd2 is.
    pair_ori = 0
    if forward == "Rd1":
        pair_ori = 1
    elif forward == "Rd2":
        pair_ori = -1

    with open(sam) as handle:
        for row in csv.reader(handle, delimiter='\t'):
            if row[0][0] == "@" or "N" not in row[5]:
                continue

            flag = row[1]
            chrom = row[2]
            start = int(row[3]) - 1  # SAM POS is 1-based; work 0-based
            cigar = row[5]

            # Derive the transcribed strand from the SAM flag.
            pair_strand = '+'
            iflag = int(flag)
            if iflag & 1:  # paired-end
                pair_number, self_strand = flag_dict[flag]
                if pair_ori * self_strand * pair_number == -1:
                    pair_strand = '-'
            elif iflag & 16:  # single-end, reverse strand
                pair_strand = '-'

            # Parse CIGAR into (op, length) pairs.
            num = ""
            ops = []
            for ch in cigar:
                if ch.isdigit():
                    num += ch
                else:
                    ops.append((ch, int(num)))
                    num = ""

            # Rebuild exon blocks; each N op is one intron.
            exon_starts = [start]
            exon_ends = []
            block = 0
            for op, length in ops:
                if op in ("M", "D"):
                    block += length
                elif op == "N":
                    exon_ends.append(exon_starts[-1] + block)
                    exon_starts.append(exon_ends[-1] + length)
                    block = 0
            exon_ends.append(exon_starts[-1] + block)

            # Intron = gap between consecutive exon blocks.
            for e5_end, e3_start in zip(exon_ends, exon_starts[1:]):
                intron = "_".join(map(str, [chrom, pair_strand, e5_end, e3_start]))
                intron_count[intron] += 1

    for intron, count in intron_count.items():
        print(intron, count)
145 | 146 | if __name__ == '__main__': 147 | main(sys.argv[1], sys.argv[2], int(sys.argv[3]), int(sys.argv[4]), int(sys.argv[5])) 148 | -------------------------------------------------------------------------------- /src/Get_splicing_PWMs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | from Bio import SeqIO 4 | from Bio.Seq import Seq 5 | from collections import defaultdict 6 | from shutil import copyfile 7 | 8 | Genome = {} 9 | 10 | def Genomictabulator(fasta): 11 | 12 | f = open(fasta) 13 | 14 | for chrfa in SeqIO.parse(f, "fasta"): 15 | Genome[chrfa.id] = chrfa.seq 16 | 17 | 18 | f.close() 19 | 20 | def main(bed12, in_GT_AG_U2_5, in_GT_AG_U2_3, out_GT_AG_U2_5, out_GT_AG_U2_3): 21 | 22 | if in_GT_AG_U2_5=="NA" and in_GT_AG_U2_3=="NA": 23 | 24 | 25 | GT_AG_U2_5 = defaultdict(int) #intro-centric 26 | GT_AG_U2_3 = defaultdict(int) 27 | 28 | for row in csv.reader(open(bed12), delimiter = '\t'): 29 | 30 | csv.field_size_limit(1000000000) 31 | 32 | qstarts = list(map (int, row[11].strip(",").split(",")))[1:-1] 33 | blocksizes = list(map(int, row[10].strip(",").split(",")))[1:-1] 34 | 35 | start = int(row[1]) 36 | strand = row[5] 37 | bn = int(row[9]) 38 | chrom = row[0] 39 | 40 | if chrom in Genome: 41 | 42 | for q1, b in zip(qstarts, blocksizes): 43 | estart = start + q1 44 | eend = start + q1 + b 45 | elenght = eend - estart 46 | 47 | 48 | ME5 = str(Genome[chrom][estart-14:estart+3]).upper() #exon-centric 49 | ME3 = str(Genome[chrom][eend-3:eend+10]).upper() 50 | 51 | 52 | if strand == "-": 53 | 54 | ME5 = str(Genome[chrom][eend-3:eend+14].reverse_complement()).upper() 55 | ME3 = str(Genome[chrom][estart-10:estart+3].reverse_complement()).upper() 56 | 57 | 58 | dn = ME3[3:5] + ME5[-5:-3] 59 | 60 | 61 | if dn=="GTAG": 62 | 63 | for pos, nt in enumerate(ME3): 64 | 65 | GT_AG_U2_5[(pos, nt)] += 1 66 | 67 | for pos, nt in enumerate(ME5): 68 | 69 | GT_AG_U2_3[(pos, nt)] += 1 70 | 71 | 72 | 73 | with 
open(out_GT_AG_U2_5, "w") as GT_AG_U2_5_out: 74 | 75 | 76 | GT_AG_U2_5_out.write( "\t".join(["A", "C", "G", "T"]) +"\n") 77 | 78 | for i in range(len(GT_AG_U2_5)): #This range is about 4 times biger 79 | A = GT_AG_U2_5[(i, "A")] 80 | G = GT_AG_U2_5[(i, "G")] 81 | C = GT_AG_U2_5[(i, "C")] 82 | T = GT_AG_U2_5[(i, "T")] 83 | 84 | TOTAL = A + G + C + T 85 | 86 | if TOTAL >0: 87 | GT_AG_U2_5_out.write("\t".join(map(str, [ x/TOTAL for x in [A, C, G, T]])) + "\n" ) 88 | 89 | 90 | 91 | with open(out_GT_AG_U2_3, "w") as GT_AG_U2_3_out: 92 | 93 | 94 | GT_AG_U2_3_out.write( "\t".join(["A", "C", "G", "T"]) +"\n") 95 | 96 | for i in range(len(GT_AG_U2_3)): #This range is about 4 times biger 97 | A = GT_AG_U2_3[(i, "A")] 98 | G = GT_AG_U2_3[(i, "G")] 99 | C = GT_AG_U2_3[(i, "C")] 100 | T = GT_AG_U2_3[(i, "T")] 101 | 102 | TOTAL = A + G + C + T 103 | 104 | if TOTAL >0: 105 | GT_AG_U2_3_out.write("\t".join(map(str, [ x/TOTAL for x in [A, C, G, T]])) + "\n" ) 106 | 107 | 108 | else: 109 | 110 | copyfile(in_GT_AG_U2_5, out_GT_AG_U2_5) 111 | copyfile(in_GT_AG_U2_3, out_GT_AG_U2_3) 112 | 113 | 114 | 115 | 116 | if __name__ == '__main__': 117 | Genomictabulator(sys.argv[1]) 118 | main(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6]) 119 | -------------------------------------------------------------------------------- /src/ME_centric_table.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | from collections import defaultdict 4 | import re 5 | from Bio import SeqIO 6 | from Bio.Seq import Seq 7 | from Bio.Alphabet import generic_dna 8 | 9 | csv.field_size_limit(1000000000) 10 | 11 | 12 | def main(row_ME_filter1): 13 | 14 | ME_reads = defaultdict(set) 15 | ME_SJs = defaultdict(set) 16 | SJ_info = {} 17 | 18 | SJ_SJ = defaultdict(set) 19 | SJ_same_ME = set([]) 20 | 21 | for row in csv.reader(open(row_ME_filter1), delimiter = ' '): 22 | 23 | read, seq, qual, tag_alingment, t_score, genome_alingment, 
g_score, same_ME, len_micro_exon_seq_found, micro_exon_seq_found, number_of_micro_exons_matches, max_U2_scores, max_mean_conservations, micro_exons_coords, U2_scores, mean_conservations = row 24 | SJ, transcript, anchors, cigar = tag_alingment.split("|") 25 | info = " ".join([SJ, transcript, len_micro_exon_seq_found, micro_exon_seq_found, number_of_micro_exons_matches, max_U2_scores, max_mean_conservations, micro_exons_coords, U2_scores, mean_conservations]) 26 | 27 | ME_reads[info].add(seq) 28 | 29 | 30 | # if max_mean_conservations_primates != "None" and max_mean_conservations !="None": #I need to fix this 31 | 32 | for ME in micro_exons_coords.split(","): 33 | ME_SJs[ME].add(SJ + "_" + micro_exon_seq_found) 34 | 35 | 36 | ###Cheking if any ME is in two or more SJ 37 | 38 | for i in ME_SJs.items(): 39 | ME, SJs = i 40 | 41 | for SJ_A in SJs: 42 | 43 | for SJ_B in SJs: 44 | 45 | SJ_SJ[SJ_A].add(SJ_B) 46 | 47 | 48 | for i in SJ_SJ.items(): 49 | 50 | SJ_same_ME.add(" ".join(sorted(list(i[1])))) 51 | 52 | ####Coverage dict ##### 53 | 54 | for i in ME_reads.items(): 55 | 56 | SJ, transcript, len_micro_exon_seq_found, micro_exon_seq_found, number_of_micro_exons_matches, max_U2_scores, max_mean_conservations, micro_exons_coords, U2_scores, mean_conservations, = i[0].split(" ") 57 | 58 | coverage = len(i[1]) 59 | 60 | info = " ".join([str(coverage), transcript, len_micro_exon_seq_found, micro_exon_seq_found, number_of_micro_exons_matches, max_U2_scores, max_mean_conservations, micro_exons_coords, U2_scores, mean_conservations]) 61 | SJ_MEseq = "_".join([SJ, micro_exon_seq_found]) 62 | 63 | SJ_info[SJ_MEseq] = info 64 | 65 | 66 | #### 67 | 68 | for i in SJ_same_ME: 69 | 70 | #if len(i.split(" ")) > 1: 71 | 72 | SJs = [] 73 | SJ_Coverages = [] 74 | SJ_number_of_micro_exons_matches = [] 75 | SJ_max_U2_scores = [] 76 | SJ_max_mean_conservations = [] 77 | ME = [] 78 | 79 | 80 | P_MEs = [] 81 | 82 | info = set([]) 83 | 84 | for SJ_MEseq in i.split(" "): 85 | 86 | SJ = 
"_".join(SJ_MEseq.split("_")[:-1]) 87 | 88 | coverage, transcript, len_micro_exon_seq_found, micro_exon_seq_found, number_of_micro_exons_matches, max_U2_scores, max_mean_conservations, micro_exons_coords, U2_scores, mean_conservations = SJ_info[SJ_MEseq].split(" ") 89 | 90 | SJs.append(SJ) 91 | SJ_Coverages.append(int(coverage)) 92 | info.add((transcript, len_micro_exon_seq_found, micro_exon_seq_found)) 93 | SJ_number_of_micro_exons_matches.append(int(number_of_micro_exons_matches)) 94 | SJ_max_U2_scores.append(float(max_U2_scores)) 95 | 96 | 97 | # print SJ_info 98 | 99 | # if max_mean_conservations_primates == "None": 100 | 101 | # print SJ_info 102 | 103 | try: 104 | SJ_max_mean_conservations.append(float(max_mean_conservations)) 105 | except ValueError: 106 | SJ_max_mean_conservations.append(0) 107 | 108 | 109 | #print SJ_MEseq, SJ 110 | SJ_chr = "_".join((re.findall(r"[\w']+", SJ)[:-2])) 111 | SJ_istart, SJ_iend = re.findall(r"[\w']+", SJ)[-2:] 112 | SJ_istart = int(SJ_istart) 113 | SJ_iend = int(SJ_iend) 114 | 115 | len_micro_exon_seq_found = int(len_micro_exon_seq_found) 116 | 117 | SJ_len = SJ_iend - SJ_istart 118 | Kmer = SJ_len - (len_micro_exon_seq_found+4) 119 | #P_ME = 1 - ( 1 - (float(1)/float(4**len_micro_exon_seq_found+4)))**Kmer 120 | #P_ME = 1 - ( 1 - (float(1)/float(4**len_micro_exon_seq_found+4 )))**( SJ_len - (len_micro_exon_seq_found+4)) 121 | 122 | P_ME = 1 - ( 1 - (float(1)/float(4**(len_micro_exon_seq_found+4) )))**( SJ_len - (len_micro_exon_seq_found+4)) 123 | 124 | P_MEs.append(P_ME) 125 | 126 | set_ME = set([]) 127 | 128 | for a, b, c in zip(micro_exons_coords.split(","), U2_scores.split(","), mean_conservations.split(",")): 129 | 130 | set_ME.add("|".join([a,b,c])) 131 | 132 | ME.append(set_ME) 133 | 134 | 135 | 136 | sum_total_coverage = sum(SJ_Coverages) 137 | total_SJs = ",".join(SJs) 138 | total_coverages = ",".join(map(str, SJ_Coverages)) 139 | 140 | # total_max_U2_scores = min(SJ_max_U2_scores) 141 | # total_max_mean_conservations 
= min(SJ_max_mean_conservations) 142 | # total_max_mean_conservations_primates = min(SJ_max_mean_conservations_primates) 143 | 144 | 145 | total_ME = ",".join(set.intersection(*ME)) 146 | total_number_of_micro_exons_matches = len(total_ME.split(",")) 147 | 148 | transcript, len_micro_exon_seq_found, micro_exon_seq_found = list(info)[0] 149 | 150 | 151 | if total_ME!="": #The empty fields refeclts no interesection between micro-exons present on the splice junctions 152 | 153 | true_ME = max([i.split("|") for i in total_ME.split(",")], key=lambda item:float(item[1])) 154 | 155 | 156 | ME, U2_scores, mean_conservations = true_ME 157 | 158 | #if 6 >= len(micro_exon_seq_found) >= 3: 159 | 160 | #### Probabilidad ### 161 | 162 | # if total_ME!="": 163 | 164 | #out = map(str, [ME, transcript, sum_total_coverage, total_SJs, total_coverages, len_micro_exon_seq_found, micro_exon_seq_found, total_number_of_micro_exons_matches, total_max_U2_scores, total_max_mean_conservations, total_max_mean_conservations_primates, min(P_MEs), total_ME]) 165 | 166 | out = map(str, [ME, transcript, sum_total_coverage, total_SJs, total_coverages, len_micro_exon_seq_found, micro_exon_seq_found, total_number_of_micro_exons_matches, U2_scores, mean_conservations, min(P_MEs), total_ME]) 167 | 168 | 169 | 170 | print "\t".join(out) 171 | 172 | 173 | 174 | 175 | 176 | if __name__ == '__main__': 177 | main(sys.argv[1]) 178 | 179 | 180 | #python ~/my_src/ME/Pipeline/ME_centric_table.py _clip1.trim.sam.row_ME.filter1 181 | -------------------------------------------------------------------------------- /src/Micro_exons_tags.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | import re 4 | from Bio import SeqIO 5 | from Bio.Seq import Seq 6 | from Bio.Alphabet import generic_dna 7 | 8 | SJ_Tags_seq = {} 9 | SJ_Tags_info = {} 10 | 11 | def Tagloader(fasta): 12 | 13 | print >> sys.stderr, "Loading SJ Tags in RAM memory ...", 14 | 15 | f = 
def main(ME_PSI, whippet_PSI):
    """Overwrite Whippet PSI rows with MicroExonator micro-exon PSIs.

    *ME_PSI* is a plain TSV with columns (ME, Coord, PSI, CI_Lo, CI_Hi,
    Class); *whippet_PSI* is a gzipped Whippet ``.psi`` table.  The
    Whippet table is echoed to stdout, header included, except that
    nodes whose ``Coord`` has a non-``NA`` micro-exon PSI get their
    Psi, CI_Width and CI_Lo,Hi columns replaced.

    Fixes vs. the original: file handles are closed via ``with``, and
    the loop no longer rebinds the ``ME_PSI`` *parameter* with the
    per-row PSI value (shadowing bug).
    """
    coord_info = {}
    with open(ME_PSI) as me_table:
        me_reader = csv.reader(me_table, delimiter="\t")
        next(me_reader)  # skip header
        for row in me_reader:
            # row = [ME, Coord, PSI, CI_Lo, CI_Hi, Class]
            coord_info[row[1]] = row

    with gzip.open(whippet_PSI, mode="rt") as wp:
        reader = csv.reader(wp, delimiter="\t")
        print("\t".join(next(reader)))  # pass the header through

        for row in reader:
            (Gene, Node, Coord, Strand, Type, Psi, CI_Width, CI_Lo_Hi,
             Total_Reads, Complexity, Entropy, Inc_Paths, Exc_Paths, Edges) = row

            if Coord not in coord_info:
                print("\t".join(row))
                continue

            _me, _coord, me_psi, ci_lo, ci_hi, _cls = coord_info[Coord]
            if me_psi == "NA":
                # No usable micro-exon PSI: keep the Whippet row as-is.
                print("\t".join(row))
            else:
                out = [Gene, Node, Coord, Strand, Type, me_psi,
                       str(float(ci_hi) - float(ci_lo)),   # CI width
                       ",".join([ci_lo, ci_hi]),           # CI bounds
                       Total_Reads, Complexity, Entropy,
                       Inc_Paths, Exc_Paths, Edges]
                print("\t".join(out))
from collections import defaultdict
from operator import itemgetter

# NOTE(review): `sys`, `csv` and Bio's SeqIO are imported in the file header
# above this chunk -- confirm against the full file.

# Sequence caches, keyed by chromosome / transcript ID.  Filled once by the
# two *tabulator() loaders below and read by main().
Transcriptome = {}
Genome = {}


def Genomictabulator(fasta):
    """Load every chromosome sequence of `fasta` into the Genome dict."""

    print("Loading the genome into RAM memory ...", end=" ", file=sys.stderr)

    with open(fasta) as f:
        for chrfa in SeqIO.parse(f, "fasta"):
            Genome[chrfa.id] = chrfa.seq

    print("OK", file=sys.stderr)


def Transcriptometabulator(genecode_fasta):
    """Load transcript sequences into Transcriptome.

    GENCODE FASTA headers carry extra `|`/space-separated fields; only the
    leading transcript ID is used as the key.
    """

    print("Loading the transcriptome into RAM memory ...", end=" ", file=sys.stderr)

    for record in SeqIO.parse(genecode_fasta, "fasta"):
        transcript_id = str(record.id).split("|")[0].split(" ")[0]
        Transcriptome[transcript_id] = record.seq

    print("OK", file=sys.stderr)


def main(bed12, ME_len, max_read_len):
    """Print a junction tag for every canonical intron found in `bed12`.

    For each intron, the transcript whose alignment provides the longest
    combined flanking sequence is selected and reported to stdout as:

        >chrom:istart<strand>iend|transcript|block_up_block_down
        <transcript sequence, up to `max_read_len` nt on each side>

    Introns are kept only when both flanking blocks are longer than `ME_len`,
    the intron is at least 80 nt long and its boundary dinucleotides are
    canonical (GT-AG, GC-AG or AT-AC).
    """

    n = max_read_len
    transcript_intron_info = defaultdict(list)
    min_intron_length = 80

    with open(bed12) as bed_fh:
        for row in csv.reader(bed_fh, delimiter='\t'):

            try:
                qName = row[3]
                seq = Transcriptome[qName]

                qstarts = list(map(int, row[11].strip(",").split(",")))
                blocksizes = list(map(int, row[10].strip(",").split(",")))

                start = int(row[1])
                strand = row[5]
                chrom = row[0]
                qstart = 0

                # Walk consecutive block pairs; each pair flanks one intron.
                for q1, q2, b, b2 in zip(qstarts, qstarts[1:],
                                         blocksizes, blocksizes[1:]):

                    qstart += b          # junction position in transcript coords
                    tag_start = qstart - n
                    tag_end = qstart + n

                    istart = start + q1 + b   # genomic intron start
                    iend = start + q2         # genomic intron end
                    intron = chrom + ":" + str(istart) + strand + str(iend)
                    ilength = iend - istart

                    block_up = n
                    block_down = n
                    dn = str(Genome[chrom][istart:istart + 2] +
                             Genome[chrom][iend - 2:iend]).upper()

                    if strand == '+':
                        # Clip the tag at the transcript boundaries.
                        if tag_start < 0:
                            tag_start = 0
                            block_up = qstart

                        if tag_end > len(seq):
                            tag_end = len(seq)
                            block_down = tag_end - qstart

                        tag = seq[tag_start:tag_end]

                    if strand == '-':
                        dn = str((Genome[chrom][istart:istart + 2] +
                                  Genome[chrom][iend - 2:iend])
                                 .reverse_complement()).upper()

                        # Minus-strand alignments are mirrored, so the tag is
                        # taken from the end of the transcript sequence.
                        if tag_end > len(seq):
                            tag_end = len(seq)
                            block_up = tag_end - qstart

                        tag = seq[-tag_end:-tag_start]

                        if tag_start <= 0:
                            tag = seq[-tag_end:]
                            block_down = qstart

                    if (b > ME_len and b2 > ME_len
                            and ilength >= min_intron_length
                            and dn in ("GTAG", "GCAG", "ATAC")):
                        info = (qName, tag, chrom, istart, iend, strand,
                                block_up, block_down, block_up + block_down)
                        transcript_intron_info[intron].append(info)

            except KeyError:
                # Transcripts absent from the transcriptome FASTA are skipped.
                pass

    for intron, infos in transcript_intron_info.items():

        # Keep the transcript with the longest combined flanking blocks.
        (qName, tag, chrom, istart, iend, strand,
         block_up, block_down, sum_blocks) = max(infos, key=itemgetter(8))

        print(">" + intron + "|" + qName + "|"
              + str(block_up) + "_" + str(block_down))
        print(tag)


# Expected output shape:
# >chr12:3701518+3702264|ENST00000562877.1|100_19
# AGCTTTCTGTTTAGTTGTGTCAATCGCAGGCCACTCTGCTGAGCATCTTCTCC...


if __name__ == '__main__':
    Genomictabulator(sys.argv[1])
    Transcriptometabulator(sys.argv[2])
    main(sys.argv[3], int(sys.argv[4]), int(sys.argv[5]))


# The canonical-intron filter was added later.
# Flag differentially-included Whippet nodes across pseudo-replicate runs by
# modelling the replicate-averaged probability with a Beta distribution.

# Redirect output to the Snakemake-provided log file.
log <- file(snakemake@log[[1]], open = "wt")

# Parameters injected by Snakemake.
cdf_t            <- snakemake@params[["ct"]]          # Beta CDF evaluation point
min_rep          <- snakemake@params[["mr"]]          # minimum replicate support
min.p.mean       <- snakemake@params[["mm"]]          # minimum mean probability
path_run_metatda <- snakemake@params[["pm"]]          # run metadata table
path_delta       <- snakemake@params[["path_delta"]]  # directory of .diff files
path_out         <- snakemake@params[["path_out"]]    # output directory
min_delta        <- snakemake@params[["min_delta"]]   # minimum |DeltaPsi|


library(data.table)
library(distributions3)


# Stack the per-replicate Whippet `.diff` tables of one comparison into a
# single data.table, tagging every row with its replicate index.
get_rep_table <- function(file_path, rep) {

    stacked <- data.table()

    for (k in seq_len(rep)) {
        print(k)
        rep_dt <- fread(paste0(file_path, k, ".diff"))
        rep_dt[, Rep := k]
        stacked <- rbind(stacked, rep_dt)
    }

    colnames(stacked) <- c("Gene", "Node", "Coord", "Strand", "Type",
                           "Psi_A", "Psi_B", "DeltaPsi", "Probability",
                           "Complexity", "Entropy", "V1", "Rep")
    stacked[, V1 := NULL]   # drop the unnamed trailing column

    stacked
}


# Beta-distribution CDF at `p`, with the shape parameters recovered from a
# mean/variance pair via the method of moments.
cdf.beta <- function(mu, var, p) {
    shape1 <- ((1 - mu) / var - 1 / mu) * mu^2
    shape2 <- shape1 * (1 / mu - 1)

    cdf(Beta(shape1, shape2), p)
}


# Summarise one comparison across its replicates and flag differential nodes.
# NOTE(review): the DeltaPsi threshold `min_delta` is read from the enclosing
# script environment rather than passed as an argument -- confirm intended.
get_diff_nodes <- function(path, comp_name, reps, beta_t, min.p.mean, min_number_reps) {

    print(comp_name)

    comp <- get_rep_table(paste0(path, comp_name, "_rep_"), reps)

    # Per-node summary statistics over replicates.
    comp.stats <- comp[, .(Psi_A.mean = mean(Psi_A),
                           Psi_B.mean = mean(Psi_B),
                           DeltaPsi.mean = mean(DeltaPsi),
                           DeltaPsi.sd = sd(DeltaPsi),
                           Probability.mean = mean(Probability, na.rm = T),
                           Probability.sd = sd(Probability, na.rm = T),
                           Probability.var = var(Probability, na.rm = T),
                           Number = .N),
                       by = c("Gene", "Node", "Coord", "Strand", "Type")]

    comp.stats[, cdf.beta := cdf.beta(Probability.mean, Probability.var, beta_t)]

    # A node is "diff" when mean |DeltaPsi| and probability pass their
    # thresholds, the Beta CDF is small, it is not a TS/TE node, and it was
    # seen in more than `min_number_reps` replicates.
    comp.stats[, diff := (abs(DeltaPsi.mean) >= min_delta &
                          cdf.beta < 0.05 &
                          Probability.mean >= min.p.mean &
                          !Type %in% c("TE", "TS") &
                          Number > min_number_reps)]
}


# Run get_diff_nodes() for every comparison listed in the run metadata and
# write one result table per comparison.
snakepool_BetaDist <- function(beta_t, min.p.mean, min_number_reps,
                               path_metadata, path_delta, out_dir) {

    run_metadata <- fread(path_metadata)

    for (k in 1:nrow(run_metadata)) {

        res <- get_diff_nodes(path_delta,
                              run_metadata[k, Compare_ID],
                              run_metadata[k, Repeat],
                              beta_t, min.p.mean, min_number_reps)

        fwrite(res,
               file = paste0(out_dir, run_metadata[k, Compare_ID], ".txt"),
               append = FALSE, quote = "auto", sep = "\t",
               row.names = FALSE, col.names = TRUE)
    }
}

snakepool_BetaDist(cdf_t, min.p.mean, min_rep,
                   path_run_metatda,
                   path_delta,
                   path_out)
from snakemake.utils import min_version
import csv
from collections import defaultdict
import sys


# Count, for every detected micro-exon, in how many samples its unique
# up/down splice-junction coverage reaches the per-sample read threshold,
# and write a two-column ME / N_samples table.
with open(snakemake.output[0], "w") as out_table:

    min_reads = int(snakemake.params[0])
    samples_passing = defaultdict(int)   # ME -> number of qualifying samples
    observed_MEs = set()                 # every ME seen in any coverage file

    for cov_path in snakemake.input:

        with open(cov_path) as cov_fh:

            for fields in csv.reader(cov_fh, delimiter="\t"):

                # Coverage rows carry 18 tab-separated columns; unpacking
                # them all also enforces the expected row width.
                (file_name, me, total_SJs, me_SJ_coverages, sum_me_coverage,
                 sum_uniq_up_down, sum_cov_up, sum_cov_down, SJ_coverages,
                 sum_SJ_coverage, is_alt_5, is_alt_3, alts_5, cov_alts_5,
                 total_cov_alts_5, alts_3, cov_alts_3, total_cov_alts_3) = fields

                observed_MEs.add(me)
                if int(sum_uniq_up_down) >= min_reads:
                    samples_passing[me] += 1

    out_table.write("\t".join(["ME", "N_samples"]) + "\n")

    for me in observed_MEs:
        out_table.write("\t".join([me, str(samples_passing[me])]) + "\n")
#' ## (Disabled) command-line interface
#'
#' The optparse-based CLI below is kept for reference but is currently
#' commented out; the script reads a hard-coded input path instead (NOTE).

#option_list <- list(
#  make_option(c("-met", "--micro_exon_table"), type="character", default=NULL,
#              help="Micro-exon centric table", metavar="character"),
#  make_option(c("-c", "--micro_exon_coverages"), type="character", default=NULL,
#              help="Micro-exon coverage table", metavar="character"),
#  make_option(c("-o", "--out"), type="character", default="out.txt",
#              help="output file name [default= %default]", metavar="character")
#);

#opt_parser <- OptionParser(option_list=option_list);
#opt <- parse_args(opt_parser);


# NOTE(review): hard-coded personal path -- parameterise this (optparse block
# above, or a snakemake param) before running the report elsewhere.
ME_centric_raw <- read.delim("~/Google_Drive/Results/ME/Single_cell/TOTAL.sam.row_ME.filter1.ME_centric", header=FALSE, stringsAsFactors=FALSE)
colnames(ME_centric_raw) <- c('ME', 'transcript', 'sum_total_coverage', 'total_SJs', 'total_coverages', 'len_micro_exon_seq_found', 'micro_exon_seq_found', 'total_number_of_micro_exons_matches', 'U2_scores', 'mean_conservations_vertebrates', 'P_MEs', 'total_ME')

ME_centric_raw <- data.table(ME_centric_raw)

# Violin plot: spurious micro-exon/intron match probability by length (<=15 nt).
ggplot(ME_centric_raw[len_micro_exon_seq_found<=15, ],
       aes(x=factor(len_micro_exon_seq_found), y=P_MEs) ) +
  geom_violin(scale = "width") +
  xlab("Micro-exon length") +
  ylab("Spurious micro-exon/intron match probability") +
  theme(panel.background = element_rect(fill = 'white', colour = 'black'))

#' The higher spurious micro-exon/intron match probability is reflected in the
#' number of micro-exon/intron matches inside

ggplot(ME_centric_raw[len_micro_exon_seq_found<=15, ],
       aes(x=factor(len_micro_exon_seq_found), y=total_number_of_micro_exons_matches) ) +
  geom_jitter() +
  ylim(0,100) +
  theme(panel.background = element_rect(fill = 'white', colour = 'black'))


#' True
#' splicing events rely on splicing signals, therefore false micro-exons will have weaker splicing signals
#' than the true micro-exons. The following plot shows the distribution of U2/GT-AG splicing signal strength
#' (U2_score) for the total micro-exon population and those longer or equal than 3, 6, and 9 nt. Micro-exons equal
#' or longer than 9 nt are less prone to be artefacts, therefore have a U2_score distribution which is expected
#' from real splicing events.


# (Disabled) U2-score distributions under >=3 / >=6 / >=9 nt length filters.
#ME_matches <- unlist(strsplit(ME_centric_raw$total_ME, "[,]"))
#ME_matches <- read.table(text=ME_matches, sep="|")
#colnames(ME_matches) <- c("ME", "U2_score", "Vertebrate_conservation", "Primate_conservation")
#ME_matches$Filter = "None"
#ME_centric_raw_longer_3 <- subset(ME_centric_raw, len_micro_exon_seq_found>=3)
#ME_centric_raw_longer_6 <- subset(ME_centric_raw, len_micro_exon_seq_found>=6)
#ME_centric_raw_longer_9 <- subset(ME_centric_raw, len_micro_exon_seq_found>=9)
#ME_matches_3 <- unlist(strsplit(ME_centric_raw_longer_3$total_ME, "[,]"))
#ME_matches_3 <- read.table(text=ME_matches_3, sep="|")
#colnames(ME_matches_3) <- c("ME", "U2_score", "Vertebrate_conservation", "Primate_conservation")
#ME_matches_3$Filter = ">=3"
#ME_matches_6 <- unlist(strsplit(ME_centric_raw_longer_6$total_ME, "[,]"))
#ME_matches_6 <- read.table(text=ME_matches_6, sep="|")
#colnames(ME_matches_6) <- c("ME", "U2_score", "Vertebrate_conservation", "Primate_conservation")
#ME_matches_6$Filter = ">=6"
#ME_matches_9 <- unlist(strsplit(ME_centric_raw_longer_9$total_ME, "[,]"))
#ME_matches_9 <- read.table(text=ME_matches_9, sep="|")
#colnames(ME_matches_9) <- c("ME", "U2_score", "Vertebrate_conservation", "Primate_conservation")
#ME_matches_9$Filter = ">=9"
#ME_matches_Filters <- rbind(ME_matches, ME_matches_3, ME_matches_6, ME_matches_9)
#ggplot(ME_matches_Filters, aes(x=U2_score, ..density.., colour=Filter)) +
#  geom_freqpoly(binwidth=5) +
#  xlim(40, 100) +
#  theme(panel.background = element_rect(fill = 'white', colour = 'black'))
import csv, sys
from collections import defaultdict
import gzip


def main(jls_exons_tab, delta, high_qual_ME):
    """Annotate Whippet differential-inclusion nodes with micro-exon IDs.

    jls_exons_tab : gzipped table linking Whippet nodes to potential exons
                    (columns Gene, Potential_Exon, Is_Annotated, Whippet_Nodes).
    delta         : per-node differential-inclusion table (see `header`).
    high_qual_ME  : table with an "ME" column of high-quality micro-exon IDs
                    (chrom_strand_start_end, 0-based starts).

    Prints the delta table to stdout with an extra `microexon_ID` column
    ("NA" when the node does not match a high-quality micro-exon).  Only
    delta rows whose (Gene, Node) maps to a potential exon are reported.
    """

    node_exons = dict()   # (Gene, Node) -> [Potential_Exon, Is_Annotated]
    MEs = set()           # high-quality micro-exon IDs

    header_out = ["Gene", "Node", "Coord", "Strand", "Type", "Psi_A.mean", "Psi_B.mean", "DeltaPsi.mean", "DeltaPsi.sd", "Probability.mean", "Probability.sd", "Probability.var", "N.detected.reps", "cdf.beta", "is.diff", "microexon_ID"]

    header = ["Gene", "Node", "Coord", "Strand", "Type", "Psi_A.mean", "Psi_B.mean", "DeltaPsi.mean", "DeltaPsi.sd", "Probability.mean", "Probability.sd", "Probability.var", "Number", "cdf.beta", "diff"]

    print("\t".join(header_out))

    with open(high_qual_ME) as F:

        reader = csv.DictReader(F, delimiter="\t")

        for row in reader:
            MEs.add(row["ME"])

    with gzip.open(jls_exons_tab, mode="rt") as F:

        reader = csv.DictReader(F, delimiter="\t")

        for row in reader:
            # Every Whippet node listed for a potential exon maps to it.
            for node in row["Whippet_Nodes"].split(","):
                node_exons[(row["Gene"], node)] = [row["Potential_Exon"], row["Is_Annotated"]]

    with open(delta) as F:

        reader = csv.DictReader(F, delimiter="\t")

        for row in reader:

            chrom, pos = row["Coord"].split(":")
            estart, eend = pos.split("-")
            # Whippet coordinates are 1-based; micro-exon IDs use 0-based starts.
            estart = str(int(estart) - 1)
            exon_ID = "_".join([chrom, row["Strand"], estart, eend])

            if (row["Gene"], row["Node"]) in node_exons:

                Potential_Exon, Is_Annotated = node_exons[(row["Gene"], row["Node"])]

                if row["Type"] == "AD":

                    # Alternative donor: when the fixed (acceptor-side)
                    # boundary agrees, rebuild the ID from the potential exon.
                    nchrom, nstrand, nstart, nend = exon_ID.split("_")

                    echrom, eloci, estrand = Potential_Exon.split(":")
                    estart, eend = eloci.split("-")

                    if estrand == "+" and eend == nend:
                        exon_ID = "_".join([echrom, estrand, str(int(estart) - 1), eend])

                    if estrand == "-" and str(int(estart) - 1) == nstart:
                        exon_ID = "_".join([echrom, estrand, str(int(estart) - 1), eend])

                elif row["Type"] == "AA":

                    # Alternative acceptor: mirror of the AD case.
                    nchrom, nstrand, nstart, nend = exon_ID.split("_")

                    echrom, eloci, estrand = Potential_Exon.split(":")
                    estart, eend = eloci.split("-")

                    if estrand == "-" and eend == nend:
                        exon_ID = "_".join([echrom, estrand, str(int(estart) - 1), eend])

                    if estrand == "+" and str(int(estart) - 1) == nstart:
                        exon_ID = "_".join([echrom, estrand, str(int(estart) - 1), eend])

                if exon_ID in MEs:
                    print("\t".join([row[x] for x in header] + [exon_ID]))
                else:
                    print("\t".join([row[x] for x in header] + ["NA"]))


if __name__ == '__main__':
    main(sys.argv[1], sys.argv[2], sys.argv[3])

import sys
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord


def _copy_tagged(src_fh, suffix, out_fh):
    """Re-emit every FASTQ record of src_fh with `suffix` appended to its id."""
    for read in SeqIO.parse(src_fh, "fastq"):
        quals = read.letter_annotations["phred_quality"]
        tagged = SeqRecord(read.seq, read.id + suffix, description="")
        tagged.letter_annotations["phred_quality"] = quals
        out_fh.write(tagged.format("fastq"))


def main(fastq1, fastq2, fastq12):
    """Concatenate two FASTQ files into `fastq12`, tagging read ids with
    _1 / _2 so mate pairs stay distinguishable after merging.

    Context managers guarantee the handles are closed and the output is
    flushed even on error (the original left all three files open).
    """
    with open(fastq1) as r1, open(fastq2) as r2, open(fastq12, 'w') as out:
        _copy_tagged(r1, "_1", out)
        _copy_tagged(r2, "_2", out)


if __name__ == '__main__':
    main(sys.argv[1], sys.argv[2], sys.argv[3])
import sys
import csv
import gzip

# Whippet quantification rows can be very wide; raise the csv field cap.
# (The original also called csv.field_size_limit() with no argument -- a
# no-op that merely returned the current limit -- removed.)
csv.field_size_limit(100000000)


def main(mode, out_file, file_list, trim=None):
    """Merge per-sample Whippet quantification tables into one gzipped TSV.

    mode      : "Isoform", "Gene" or "PSI" -- selects the output columns.
    out_file  : gzipped output path.
    file_list : gzipped per-sample input tables.
    trim      : filename suffix removed to derive the sample name; defaults
                to the snakemake "trim" param for backward compatibility.

    Raises ValueError for an unknown `mode` (the original crashed with a
    NameError instead).
    """
    if trim is None:
        trim = snakemake.params["trim"]

    if mode == "Isoform" or mode == "Gene":
        header = ["Sample", mode, "TpM", "Read_Counts"]
    elif mode == "PSI":
        header = ['Sample', 'Gene', 'Node', 'Coord', 'Strand', 'Type', 'Psi', 'CI_Width', 'CI_Lo,Hi', 'Total_Reads', 'Complexity', 'Entropy', 'Inc_Paths', 'Exc_Paths', 'Edges']
    else:
        raise ValueError("unknown feature type: " + mode)

    with gzip.open(out_file, 'wt') as out:

        writer = csv.DictWriter(out, fieldnames=header, extrasaction='ignore', delimiter="\t")
        # Write the header exactly once; the original re-wrote it for every
        # input file, interleaving header lines into the merged table.
        writer.writeheader()

        for path in file_list:

            # Derive the sample name by removing the trailing suffix.  The
            # original used str.strip(trim), which strips a *character set*
            # from both ends and could mangle sample names.
            sample = path[:-len(trim)] if trim and path.endswith(trim) else path

            with gzip.open(path, mode="rt") as f:

                for row in csv.DictReader(f, delimiter="\t"):
                    row["Sample"] = sample
                    writer.writerow(row)


if __name__ == '__main__':
    main(snakemake.params["feature"], snakemake.output["merged"], snakemake.input["files"])
55 | 56 | strand = "+" 57 | 58 | if "-" in intron_tag: 59 | strand = "-" 60 | 61 | if strand == "+" and island in intron_seq: 62 | 63 | for i in [i for i in range(len(intron_seq)) if intron_seq.startswith(island, i)]: 64 | 65 | ME_start = i + 2 + istart 66 | ME_end = ME_start + len(DR_corrected_micro_exon_seq_found) 67 | ME_chr = chr 68 | ME_strand = strand 69 | 70 | micro_exons_coords.append("_".join((map(str, [ME_chr, ME_strand, ME_start, ME_end])))) 71 | 72 | 73 | elif strand == "-" and rev_island in intron_seq: 74 | 75 | 76 | for i in [i for i in range(len(intron_seq)) if intron_seq.startswith(rev_island, i)]: 77 | 78 | ME_start = i + 2 + istart 79 | ME_end = ME_start + len(DR_corrected_micro_exon_seq_found) 80 | ME_chr = chr 81 | ME_strand = strand 82 | 83 | micro_exons_coords.append("_".join((map(str, [ME_chr, ME_strand, ME_start, ME_end])))) 84 | 85 | micro_exons_coords = ",".join(micro_exons_coords) 86 | 87 | if micro_exons_coords!="": 88 | print "\t".join(row) + "\t" + micro_exons_coords 89 | ME_reads.add(read) 90 | 91 | # if flag==16: 92 | # seq = str(Seq(island).reverse_complement()) 93 | # qual = qual[::-1] 94 | 95 | # if len(seq)==len(qual): 96 | 97 | # fastq_out.write("@" + read + "\n") 98 | # fastq_out.write(seq + "\n") 99 | # fastq_out.write("+" + "\n") 100 | # fastq_out.write(qual + "\n") 101 | 102 | # elif len(seq)>len(qual): ## preventing errors with hisat 103 | 104 | # qual2 = qual + qual[ -(len(seq) - len(qual)) : ] 105 | 106 | # fastq_out.write("@" + read + "\n") 107 | # fastq_out.write(seq + "\n") 108 | # fastq_out.write("+" + "\n") 109 | # fastq_out.write(qual2 + "\n") 110 | 111 | # elif len(seq)