├── .DS_Store ├── At_rRNA_AGIs.txt ├── ChIP ├── README.md └── chip-seq_v0.1.sh ├── DNA └── dna_wgs_pipe.v1.sh ├── MethylC ├── 100bp_dmr_merge.r ├── 100bp_dmrs.v0.1.sh ├── 100bp_heatmap.sh ├── 100bp_wig_to_dmrs.r ├── 5mC_rates.sh ├── BS-SNPer.sh ├── DSS_file_prep.r ├── README.md ├── bed_to_rel_dist.sh ├── dmr_merge.r ├── dss_calling.r ├── merge_wigs.r ├── met_signatures.sh ├── methimpute.r ├── pca_wigs.r ├── rel_methylation_plots.r ├── rel_methylation_plots_v2.r ├── scatman_smooth.sh ├── smooth_scat.r ├── wgbs_cov_to_TDF.sh ├── wgbs_custom_bins.sh ├── wgbs_pipeline_v0.4.sh ├── wgbs_pipeline_v0.5.sh ├── wgbs_pipeline_v0.6.sh └── wgbs_pipeline_v0.7.sh ├── README.md ├── RNA ├── BAM_to_5p_bigWigs.sh ├── BAM_to_EJC.sh ├── BAM_to_ESI.sh ├── BAM_to_STOP.sh ├── BAM_to_TSI.sh ├── BAM_to_bedgraph.sh ├── BAM_to_bedgraph_5p.sh ├── BAM_to_bigWig.sh ├── BAM_to_wigs.sh ├── ESI_calculation.r ├── README.md ├── STAR_pipe_v1.sh ├── SUPPA_pipe_v1.sh ├── SUPPA_pipe_v2.sh ├── TSI_calculation.r ├── featureCounts_to_edgeR.r ├── featureCounts_v1.sh ├── featureCounts_v2.sh ├── featureCounts_v3-gtf.sh ├── get_peak_length.sh ├── gmuct_pipe_v1.sh ├── graft-nad-seq.sh ├── kallisto_pipe_v1.sh ├── macs_peaks.sh ├── nadBAM_to_ADPRC_sites.sh ├── pare_pipe_v1.sh ├── rel_expression_plots.r ├── rel_expression_plots_ejc.r ├── rel_expression_plots_nad.r ├── rel_expression_plots_stop.r ├── smrna_pipe_v1.sh ├── split_file.R ├── stringtie_extract_tpm.r ├── stringtie_pipe_v1.sh ├── stringtie_pipe_v2.sh ├── subread_pipe_v1.sh ├── subread_pipe_v2.sh ├── subread_pipe_v3.sh ├── total_expression.r ├── trim_5p_graft_nad.r └── trim_fastq.sh ├── TAIR10_annotation.sh ├── TruSeq-adapters.fa ├── VennPieces.R ├── araport11_assemble.sh ├── average_cov.sh ├── bashrc ├── gene_to_gene_anno.sh ├── pe_insert_size.sh ├── project_workflows ├── Exp555_cordycepin_comparison.Rmd ├── HL_RNAseq_vs_protein-stability.Rmd ├── RNA_protein_alignments.Rmd ├── Smith_etal_2022.Rmd ├── covid19_qPCR_analysis.Rmd ├── covid19_qPCR_analysis.html 
├── diffsegR_v1 │ ├── diffsegR_WT-NvsC_gmuct_3p_v1.r │ ├── diffsegR_WT-NvsC_gmuct_5p_v1.r │ ├── diffsegR_WT-Nvsbulk_gmuct_3p.r │ ├── diffsegR_WT-Nvsbulk_gmuct_5p.r │ ├── diffsegR_abh1-CvsWT-C_gmuct_3p_v1.r │ ├── diffsegR_abh1-CvsWT-C_gmuct_3p_v1_stranded.r │ ├── diffsegR_abh1-CvsWT-C_gmuct_5p_v1.r │ ├── diffsegR_abh1-CvsWT-C_gmuct_5p_v1_stranded.r │ ├── diffsegR_abh1-NvsWT-N_gmuct_3p_v1.r │ ├── diffsegR_abh1-NvsWT-N_gmuct_3p_v1_stranded.r │ ├── diffsegR_abh1-NvsWT-N_gmuct_5p_v1.r │ ├── diffsegR_abh1-NvsWT-N_gmuct_5p_v1_stranded.r │ ├── diffsegR_dxo1-NvsC_gmuct_3p_v1.r │ ├── diffsegR_dxo1-NvsC_gmuct_5p_v1.r │ └── diffsegR_dxo1_gmuct.r ├── diffsegR_v2 │ ├── diffsegR_WT-NvsC_gmuct_3p.r │ ├── diffsegR_WT-NvsC_gmuct_5p.r │ ├── diffsegR_WT-Nvsbulk_gmuct_5p.r │ ├── diffsegR_abh1-NvsC_gmuct_5p.r │ ├── diffsegR_abh1-NvsWT-N_gmuct_3p.r │ └── diffsegR_abh1-NvsWT-N_gmuct_5p.r ├── lowD_5mC_dendrograms_281019.R ├── lowD_SNP_dendrograms_281019.R └── stress_coexpression_networks.Rmd ├── qPCR ├── qPCR_analysis_v1_linreg.R ├── qPCR_analysis_v2_chipPCR.R ├── qPCR_analysis_v2_chipPCR.v2.R ├── qPCR_analysis_v2_chipPCR.v3.R └── standardize_format_LC480_qPCR_DG.R └── screenrc /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dtrain16/NGS-scripts/d42b46723fc3991ac95420186ef686e2a6ed024c/.DS_Store -------------------------------------------------------------------------------- /At_rRNA_AGIs.txt: -------------------------------------------------------------------------------- 1 | AT2G01010 2 | AT2G01020 3 | AT3G41768 4 | AT3G41979 5 | ATCG00920 6 | ATCG00950 7 | ATCG00960 8 | ATCG00970 9 | ATCG01160 10 | ATCG01170 11 | ATCG01180 12 | ATCG01210 13 | ATMG00020 14 | ATMG01380 15 | ATMG01390 16 | -------------------------------------------------------------------------------- /ChIP/README.md: -------------------------------------------------------------------------------- 1 | # ChIP-seq scripts repository 2 | 3 | #### 
chip-seq_v0.1.sh 4 | Perform quality trimming and align raw reads from ChIP-seq experiments using SubRead (Subjunc). 5 | 6 | -------------------------------------------------------------------------------- /MethylC/100bp_dmr_merge.r: --------------------------------------------------------------------------------
# Merge the per-sample bedtools results (*.dmr) into one wide DMR table.
#
# Usage: Rscript 100bp_dmr_merge.r <context> <difference> <coverage>
#   The three arguments are only used to build the output file name.
#
# Run from the directory that holds the *.dmr files. Each file is read without
# a header (columns V1..V8): V1-V5 are the id columns of the final table and
# V6 / V7 / V8 become the per-sample "_prop", "_met" and "_unmet" columns.
# The sample name is the part of the file name before the first ".".
options(echo = TRUE)
library(reshape2)
args <- commandArgs(trailingOnly = TRUE)
print(args)

context <- args[1]
difference <- as.numeric(args[2])
coverage <- as.numeric(args[3])

# grab all the individual bedtools results (*.dmr)
filelist <- dir(pattern = "*.dmr$")
if (length(filelist) == 0) {
  stop("No .dmr files found in ", getwd(), call. = FALSE)
}

# Read one .dmr file and tag every row with its sample name.
read_dmr <- function(path) {
  dmr <- read.delim(path, header = FALSE)
  dmr$group <- rep(strsplit(path, "\\.")[[1]][1], nrow(dmr))
  dmr
}

# lapply + a single rbind works for one file as well as many; the former
# `2:length(filelist)` loop counted down (2, 1) when only one file was present.
tes <- do.call(rbind, lapply(filelist, read_dmr))

# Spread one value column into one column per sample (reshape2::dcast) and
# append `suffix` to the sample column names.
cast_value <- function(value_col, suffix) {
  wide <- dcast(tes, V1 + V2 + V3 + V4 + V5 ~ group, value.var = value_col)
  sample_cols <- 6:ncol(wide)
  colnames(wide)[sample_cols] <- paste0(colnames(wide)[sample_cols], suffix)
  wide
}

t1 <- cast_value("V6", "_prop")
t2 <- cast_value("V7", "_met")
t3 <- cast_value("V8", "_unmet")

# make a table of it all; drop = FALSE keeps the column names when there is
# only a single sample column
tout <- cbind(
  t1,
  t2[, 6:ncol(t2), drop = FALSE],
  t3[, 6:ncol(t3), drop = FALSE]
)

# write it out (rows with missing samples are kept as NA)
write.table(
  tout,
  paste0("100bp_DMRs_", context, "_", difference, "diff_", coverage,
         "cov.output.txt"),
  sep = "\t", row.names = FALSE, quote = FALSE
)
-------------------------------------------------------------------------------- /MethylC/100bp_dmrs.v0.1.sh: --------------------------------------------------------------------------------
#!/bin/bash
set -u

export PATH=$PATH:/home/diep/bin/
# 100bp_dmrs.sh
# This script identifies 100bp windows (from bismark alignment pipeline) that display
# a difference (of choice) in methylation (context of choice) with a required coverage
# level (of choice). These windows are selected across every pairwise comparison of 100bp
# window wig files and aggregated into a single (bed) file. All adjacent windows are
# collapsed into a single DMR. This file is then used to grab all individual met/unmet
# reads for each DMR (from .cov files in bismark pipeline) for all samples.


#REQUIRES THAT sample names do not contain '_', as this will screw up the final steps
######################

#execute from directory containing all the wig and cov files from all samples
#legacy positional interface (superseded by the getopts interface below):
#if [ "$#" -ne 3 ]; then
#echo "USAGE: 100bp_dmrs.v0.1.sh "
#echo "EXAMPLE: 100bp_dmrs.v0.1.sh CpG 80 10"
#echo "Look at CG context, difference of 80% methylation with 10 reads minimum over window"
#exit 1
#fi

# Print the help text and exit with status 1.
usage() {
	echo "############################################################"
	echo
	echo "Usage: $0 [-c <context>] [-m <0|100>] [-d <coverage>] [-s <sitecount>]" 1>&2
	echo
	echo "This script will create DMRs from 100bp window wig files"
	echo
	echo "REQUIREMENTS:"
	echo "Execute the script from a directory containing wig files and cov files"
	echo "R and the package 'reshape2'"
	echo "bedtools > v2.20"
	echo
	echo "EXAMPLE: $0 -c CpG -m 80 -d 10 -s 2"
	echo "Look at CpG context, difference of 80% methylation with 10 reads minimum over window"
	echo "and at least 2 covered sites per window"
	echo

	echo "############################################################"
	exit 1
}

# All four options are mandatory; they start empty so that `set -u` never trips
# on an option the user left out.
context=""
difference=""
coverage=""
sitecount=""
while getopts ":c:m:d:s:" opt; do
	case $opt in
		c) context=$OPTARG ;;
		m) difference=$OPTARG ;;
		d) coverage=$OPTARG ;;
		s) sitecount=$OPTARG ;;
		\?) usage ;;
		:) echo "Option -$OPTARG requires an argument." >&2; usage ;;
		*) usage ;;
	esac
done

# Report every missing option before bailing out, so the user can fix them all
# in one go. -s is checked too: it was announced as required but never enforced.
missing=0
if [ -z "$context" ]; then
	echo "############################################################"
	echo "context argument ( -c ) required!"
	missing=1
fi
if [ -z "$difference" ]; then
	echo "############################################################"
	echo "methylation difference argument ( -m ) required!"
	missing=1
fi
if [ -z "$coverage" ]; then
	echo "############################################################"
	echo "coverage argument ( -d ) required!"
	missing=1
fi
if [ -z "$sitecount" ]; then
	echo "############################################################"
	echo "site count argument ( -s ) required!"
	missing=1
fi
if [ "$missing" -ne 0 ]; then
	usage
fi

echo "c = ${context}"
echo "m = ${difference}"
echo "d = ${coverage}"
echo "s = ${sitecount}"

######################

Rscript /home/diep/scripts/100bp_wig_to_dmrs.r "${context}" "${difference}" "${coverage}" "${sitecount}"

#bedtools to intersect the bed file w.
the coverage files 92 | for file in *${context}*.cov 93 | do 94 | bedtools intersect -wa -wb -a 100bp_${context}_${difference}diff_${coverage}collapsed.bed -b "$file" | bedtools groupby -i stdin -g 4,1,2,3 -c 5,9,10,11 -o mean,mean,sum,sum > "${file}.${context}_${difference}diff_${coverage}.dmr" 95 | done 96 | 97 | #file structure cleanup 98 | mkdir 100bp_${context}_${difference}diff_${coverage}_${sitecount}_out 99 | mv 100bp_${context}_${difference}diff_${coverage}* 100bp_${context}_${difference}diff_${coverage}_${sitecount}_out/ 100 | mv *.${context}_${difference}diff_${coverage}* 100bp_${context}_${difference}diff_${coverage}_${sitecount}_out/ 101 | cd 100bp_${context}_${difference}diff_${coverage}_${sitecount}_out/ 102 | 103 | #collapse them all into a summary table 104 | Rscript /home/diep/scripts/100bp_dmr_merge.r ${context} ${difference} ${coverage} 105 | -------------------------------------------------------------------------------- /MethylC/100bp_heatmap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -u 3 | # Obtaining single mC information from bed files across DMRs identified in `100bp_dmrs.v0.1.sh` 4 | # Handy when throwing lots of samples against DMRs 5 | 6 | if [ "$#" -ne 2 ]; then 7 | echo "USAGE: 100bp_heatmap.sh <100bp_dmrs output>" 8 | exit 1 9 | fi 10 | 11 | context=$1 12 | dmrs=$2 13 | 14 | echo ${context} 15 | echo ${dmrs} 16 | 17 | echo "Performing intersectBed..." 18 | 19 | for FILE in *_${context}.bed 20 | do 21 | intersectBed -wo -a ${dmrs} -b $FILE > DMRs-${FILE} 22 | bedtools groupby -i DMRs-${FILE} -g 1,2,3 -c 9 -o mean > avgDMRs-${FILE} 23 | done 24 | -------------------------------------------------------------------------------- /MethylC/100bp_wig_to_dmrs.r: -------------------------------------------------------------------------------- 1 | options(echo=T) 2 | library(reshape2) 3 | args=commandArgs(trailingOnly=T) 4 | print(args) 5 | 6 | # 1. grab wig files 7 | # 2. 
#    take each pairwise sample of a context and ID windows showing differences of at least X%
# 3. collapse all windows that are adjacent to each other. Give IDs

# Re-read the command line so this section does not depend on earlier state.
args <- commandArgs(trailingOnly = TRUE)

# define arguments
context <- args[1]                   # pattern used to pick the input files
difference <- as.numeric(args[2])    # minimum |prop1 - prop2| for a window
coverage <- as.numeric(args[3])      # minimum value of column 7 in both samples
sitecounts <- as.numeric(args[4])    # minimum value of column 8 in both samples
print(args)

if (anyNA(c(difference, coverage, sitecounts))) {
  stop("difference, coverage and sitecount must all be numeric", call. = FALSE)
}

# Two windows whose starts are at most this far apart count as adjacent.
window_gap <- 100

# grab all the wig files, select those for your context
bed_files <- dir(pattern = "*100bp.bed")
bed_files <- bed_files[grepl(context, bed_files)]
if (length(bed_files) < 2) {
  stop("Need at least two '", context, "' 100bp.bed files for a pairwise ",
       "comparison; found ", length(bed_files), call. = FALSE)
}
pairs <- combn(bed_files, 2)

# Compare two samples. Returns the windows covered in both files that pass the
# difference, coverage and sitecount thresholds (sorted by chromosome and
# start, with a per-contrast group id), or NULL when no window passes.
call_pair <- function(path1, path2) {
  # take windows where there is coverage for both samples
  merged <- merge(
    read.delim(path1, header = FALSE),
    read.delim(path2, header = FALSE),
    by = c("V1", "V2", "V3")
  )

  pass <- abs(merged$V4.x - merged$V4.y) >= difference &
    merged$V7.x >= coverage & merged$V7.y >= coverage &
    merged$V8.x >= sitecounts & merged$V8.y >= sitecounts
  hits <- merged[which(pass), , drop = FALSE]  # which() drops NA like subset()
  if (nrow(hits) == 0) {
    return(NULL)
  }

  hits <- hits[order(hits[, 1], hits[, 2]), , drop = FALSE]
  hits$test.diff <- 1

  # A new group starts on a chromosome change or a gap larger than window_gap.
  # Vectorised, so a single passing window is handled (2:nrow() counted down).
  chr <- as.character(hits[, 1])
  n <- nrow(hits)
  new_group <- c(TRUE, chr[-1] != chr[-n] | diff(hits[, 2]) > window_gap)
  hits$group.id <- cumsum(new_group)

  hits$calling <- paste0(path1, "vs", path2)
  hits
}

# loop through all pairwise combinations #########################
results <- lapply(
  seq_len(ncol(pairs)),
  function(i) call_pair(pairs[1, i], pairs[2, i])
)
results <- Filter(Negate(is.null), results)
if (length(results) == 0) {
  stop("No windows passed the difference/coverage/sitecount thresholds",
       call. = FALSE)
}
out <- do.call(rbind, results)

colnames(out) <- c("chr", "start", "stop",
                   "prop1", "met1", "unmet1", "total1", "site1",
                   "prop2", "met2", "unmet2", "total2", "site2",
                   "difference.pass", "group.id", "contrast")

# sort out
out <- out[order(out[, 1], out[, 2]), , drop = FALSE]

# A row joins the DMR of the previous row when it is on the same chromosome,
# starts 0..window_gap bp after it, and the two prop1 values together sit at
# one extreme (sum <= 200 - 2*difference or sum >= 2*difference).
n <- nrow(out)
chr <- as.character(out$chr)
gap <- diff(out$start)
prop_sum <- out$prop1[-1] + out$prop1[-n]
joins_previous <- chr[-1] == chr[-n] &
  gap <= window_gap & gap >= 0 &
  (prop_sum <= 200 - 2 * difference | prop_sum >= 2 * difference)
out$dmr.id <- cumsum(c(TRUE, !joins_previous))

write.table(
  out,
  paste0("100bp_", context, "_", difference, "diff", "_", coverage, "cov.txt"),
  sep = "\t", row.names = FALSE
)

# collapse each DMR to one BED row: chr, min start, max stop, id, size
starts <- as.vector(tapply(out$start, out$dmr.id, min))
stops <- as.vector(tapply(out$stop, out$dmr.id, max))
collapsed.dmrs <- data.frame(
  chr = chr[!duplicated(out$dmr.id)],
  start = starts,
  stop = stops,
  dmrid = seq_along(starts),
  size = stops - starts
)

write.table(
  collapsed.dmrs,
  paste0("100bp_", context, "_", difference, "diff", "_", coverage,
         "collapsed.bed"),
  sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE
)
-------------------------------------------------------------------------------- /MethylC/5mC_rates.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -u 4 | 5 | # Get methylation rates for all contexts across Chr1-5 as well as % CHH in chloroplast genome as an indication of sodium bisulfite conversion efficiency (unconverted CHH in Cp and Mt genome) 6 | 7 | if [ "$#" -lt 2 ]; then 8 | echo "Missing required arguments!"
9 | echo "USAGE: methylation_rates.sh " 10 | echo "EXAMPLE: methylation_rates.sh col0-r1 bed/cov" 11 | exit 1 12 | fi 13 | 14 | sample=$1 15 | file=$2 16 | 17 | cg="${sample}_CG*.${file}" 18 | chg="${sample}_CHG*.${file}" 19 | chh="${sample}_CHH*.${file}" 20 | 21 | echo "5mC % in $1" 22 | 23 | echo "mCG Chr1-5: "$cg" " 24 | grep -e "Chr1" -e "Chr2" -e "Chr3" -e "Chr4" -e "Chr5" $cg | awk '{ met+= $5} { unmet += $6} { total = met + unmet } END {print ((met / total))}' 25 | 26 | echo "mCHG Chr1-5: "$chg" " 27 | grep -e "Chr1" -e "Chr2" -e "Chr3" -e "Chr4" -e "Chr5" $chg | awk '{ met+= $5} { unmet += $6} { total = met + unmet } END {print ((met / total))}' 28 | 29 | echo "mCHH Chr1-5: "$chh" " 30 | grep -e "Chr1" -e "Chr2" -e "Chr3" -e "Chr4" -e "Chr5" $chh | awk '{ met+= $5} { unmet += $6} { total = met + unmet } END {print ((met / total))}' 31 | 32 | echo "mCHH ChrC: "$chh" " 33 | grep -e "ChrC" $chh | awk '{ met+= $5} { unmet += $6} { total = met + unmet } END {print ((met / total))}' 34 | 35 | echo "mCHH ChrM: "$chh" " 36 | grep -e "ChrM" $chh | awk '{ met+= $5} { unmet += $6} { total = met + unmet } END {print ((met / total))}' 37 | 38 | echo "DONE" 39 | -------------------------------------------------------------------------------- /MethylC/BS-SNPer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # use on sorted BAM file output from bismark alignment to call SNPs using default BS-SNPer settings 4 | 5 | ## require: 6 | # perl 7 | # BS-SNPer http://bioinformatics.oxfordjournals.org/content/31/24/4006.long 8 | 9 | if [ "$#" -ne 3 ]; then 10 | echo "USAGE: " 11 | echo "EXAMPLE: BS-SNPer.sh alx8-r1.sorted.bam $HOME/TAIR10/TAIR10_Chr.all.fasta alx8-r1" 12 | exit 1 13 | fi 14 | 15 | file=$1 16 | fa=$2 17 | out=$3 18 | 19 | perl ~/bin/BS-Snper-master/BS-Snper.pl --fa $fa --input $file --output temp.out --methcg meth.cg --methchg meth.chg --methchh meth.chh --minhetfreq 0.15 --minhomfreq 0.85 --minquali 30 
--mincover 15 --maxcover 1000 --minread2 2 --errorate 0.02 --mapvalue 20 > ${out}.SNP.bed 2>${out}_ERR.log 20 | 21 | rm temp.out meth.cg meth.chg meth.chh 22 | -------------------------------------------------------------------------------- /MethylC/DSS_file_prep.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Produce DSS input files from BED files 3 | # run in folder with cov files of interest and tell which context you want to merge together 4 | # USAGE: DSS_file_prep.r 5 | 6 | options(echo=T) 7 | args=commandArgs(trailingOnly=T) 8 | print(args) 9 | 10 | ## mC context to test 11 | context = args[1] 12 | 13 | ## get cov files 14 | files=dir(pattern=paste0(context,".bed")) 15 | 16 | ## get total and met counts for each sample in given context and write out as separate files 17 | for(i in 1:length(files)){ 18 | file <- read.delim(files[i], head=F) 19 | file <- file[file$V1 != "ChrM",] 20 | file <- file[file$V1 != "ChrC",] 21 | file <- file[,c(1,2,7,5)] 22 | test <- as.numeric(regexec(text = as.character(files[i]), pattern=".bed")) 23 | sample <- substr(as.character(files[i]), start = 1, stop = test-1) 24 | colnames(file)=c('chr','pos','N','X') 25 | write.table(x=file, file=paste0(sample,"_output.txt"),sep='\t', quote = F, col.names=T, row.names=F) 26 | } 27 | -------------------------------------------------------------------------------- /MethylC/README.md: -------------------------------------------------------------------------------- 1 | # MethylC-seq scripts repository 2 | 3 | #### 100bp_dmr_merge.r 4 | Supplemental script for *100bp_dmrs.v0.1.sh* for collapsing/summarising DMRs 5 | 6 | #### 100bp_dmrs.v0.1.sh 7 | This script uses 100bp windowed methylation data to call differences across the genome in a defined context with required coverage and a defined difference. These are performed in a pairwise manner between all samples of interest. 
8 | 9 | #### 100bp_heatmap.sh 10 | Uses the output from *100bp_dmrs.v0.1.sh* to get mC from bed files (individual mC resolution). 11 | 12 | #### 100bp_wig_to_dmrs.r 13 | First step of *100bp_dmrs.v0.1.sh* that makes pairwise comparisons of windows showing a defined difference in a given context. 14 | 15 | #### 5mC\_rates.sh 16 | Calculate mCG, mCHG and mCHH % across At Chr1-5 and mCHH across Cp and Mt genomes (conversion efficiency). 17 | 18 | #### BS-SNPer.sh 19 | Script to perform SNP calling from aligned bisulfite-converted reads. Use on a sorted BAM file. 20 | 21 | #### dss_calling.r 22 | Script for performing DSS DMR calling on re-formatted (using DSS_file_prep.r) bed.cov file output. 23 | 24 | #### DSS_file_prep.r 25 | Script for re-formatting bed.cov file methylation output for input to DSS. 26 | 27 | #### bed_to_rel_dist.sh 28 | Produce genome-wide summarised methylation plots across features of interest, e.g. gene models. 29 | 30 | #### dmr_merge.r 31 | Supplemental file for 100bp DMR calling to produce final DMR table. 32 | 33 | #### merge_wigs.r 34 | Merge 100bp binned weighted methylation BED files to produce correlation matrices with hierarchical clustering of samples of interest. 35 | 36 | #### met_signatures.sh 37 | Extract cytosine reports for methylation at non-canonical methylation sequence contexts (see Gouil & Baulcombe, PLoS Gen 2015). 38 | 39 | #### methimpute.r 40 | Use METHimpute to perform HMM-based imputation of methylation state at single C resolution across genome. Also produces 100bp.bed output files (methylation levels binned into 100bp windows). 41 | 42 | #### pca_wigs.r 43 | Perform PCA on 100bp binned weighted methylation levels. 44 | 45 | #### rel_methylation_plots.r 46 | Supplementary R script for bed_to_rel_dist.sh to produce binned methylation values summarised across supplied features of interest. 
47 | 48 | #### rel_methylation_plots_v2.r 49 | Variation on rel_methylation_plots.r to get binned summarised methylation values for non-canonical sequence contexts. 50 | 51 | #### scatman_smooth.sh & smooth_scat.r 52 | Pair of scripts (use .sh to run) to take annotation file, bed file, and feature name to make scattersmooth plots in R to correlate methylation levels and feature characteristics (e.g. 5mC vs TE length). 53 | 54 | #### wgbs_cov_to_TDF.sh 55 | Produce TDF files from a bismark cov file of interest. Check that an IGV-compatible genome build is available. 56 | 57 | #### wgbs_custom_bins.sh 58 | Bin weighted methylation levels into windows of the user's choosing (typically 100bp). 59 | 60 | ### wgbs_pipeline.sh 61 | #### v0.4 62 | Bismark alignment script using Bowtie1 aligner. Has SE and PE options. 63 | #### v0.5 64 | Bismark alignment script using Bowtie2 aligner. Has SE and PE options. 65 | #### v0.6 66 | The Perl script for 100bp windows was deprecated and has been removed. 67 | #### v0.7 68 | Added deduplicate\_bismark; only .cov files are kept. Will consider adding a coverage filter on single Cs. 
69 | -------------------------------------------------------------------------------- /MethylC/bed_to_rel_dist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Plotting DNA methylation over gene models 3 | 4 | ####################### 5 | # REQUIREMENTS 6 | # bedtools 7 | # awk 8 | # R with libraries: fields 9 | ####################### 10 | 11 | if [ "$#" -ne 3 ]; then 12 | echo "USAGE: bed_to_rel_dist.sh " 13 | echo "EXAMPLE: bed_to_rel_dist.sh $HOME/Araport11/Araport11_genes.sorted.bed sample-r1 genes" 14 | exit 1 15 | fi 16 | 17 | bedpath=$1 18 | filename=$2 19 | outname=$3 20 | 21 | sort -k1,1 -k2,2n ${filename}_CG_100bp*.bed -o ${filename}_CG_100bp.bed 22 | sort -k1,1 -k2,2n ${filename}_CHG_100bp*.bed -o ${filename}_CHG_100bp.bed 23 | sort -k1,1 -k2,2n ${filename}_CHH_100bp*.bed -o ${filename}_CHH_100bp.bed 24 | 25 | #get total number of columns for both input files 26 | l1="$(cat ${filename}_CG_100bp.bed | awk 'BEGIN{FS="\t"};{print NF}' | head -n 1)" 27 | l2="$(cat ${bedpath} | awk 'BEGIN{FS="\t"};{print NF}' | head -n 1)" 28 | 29 | #convert the wigs to bed 30 | ####################### some bedtools stuff 31 | echo "Performing closestBed of CHG methylation..." 32 | closestBed -D "ref" -a ${filename}_CHG_100bp.bed -b $bedpath > ${filename}_CHG_${outname}.bed 33 | echo "Performing closestBed of CHH methylation..." 34 | closestBed -D "ref" -a ${filename}_CHH_100bp.bed -b $bedpath > ${filename}_CHH_${outname}.bed 35 | echo "Performing closestBed of CG methylation..." 36 | closestBed -D "ref" -a ${filename}_CG_100bp.bed -b $bedpath > ${filename}_CG_${outname}.bed 37 | 38 | #subset to the regions within 100bp of a gene (make the files more manageable for R) 39 | echo "subsetting files to within 1kb..." 
40 | awk -F$'\t' '$NF<1000 && $NF>-1000' ${filename}_CHG_${outname}.bed > ${filename}_CHG_${outname}.1k.bed 41 | awk -F$'\t' '$NF<1000 && $NF>-1000' ${filename}_CHH_${outname}.bed > ${filename}_CHH_${outname}.1k.bed 42 | awk -F$'\t' '$NF<1000 && $NF>-1000' ${filename}_CG_${outname}.bed > ${filename}_CG_${outname}.1k.bed 43 | ####################### 44 | 45 | rm ${filename}_CG_100bp.bed ${filename}_CHG_100bp.bed ${filename}_CHH_100bp.bed 46 | rm ${filename}_CHG_${outname}.bed ${filename}_CHH_${outname}.bed ${filename}_CG_${outname}.bed 47 | 48 | echo "Performing R plots..." 49 | #initiate the R script to create the plots 50 | Rscript $HOME/scripts/MethylC/rel_methylation_plots.r ${filename} ${outname} ${l1} ${l2} 51 | -------------------------------------------------------------------------------- /MethylC/dmr_merge.r: -------------------------------------------------------------------------------- 1 | options(echo=T) 2 | library(reshape2) 3 | args=commandArgs(trailingOnly=T) 4 | print(args) 5 | 6 | ############ 7 | # quick script to grab bedtools results, sum across C's in DMR window, and make final table 8 | ############ 9 | 10 | 11 | #grab all the individual bedtools results (*.dmr) 12 | 13 | filelist=dir(pattern="*.dmr$") 14 | 15 | #read them in and add a row with the samplename. 
Rbind them all together 16 | tes=read.delim(filelist[1],head=F) 17 | group=rep(strsplit(filelist[1],"_")[[1]][1],nrow(tes)) 18 | tes=cbind(tes,group) 19 | for(i in 2:length(filelist)){ 20 | ss=read.delim(filelist[i],head=F) 21 | group=rep(strsplit(filelist[i],"_")[[1]][1],nrow(ss)) 22 | ss=cbind(ss,group) 23 | tes=rbind(tes,ss) 24 | } 25 | 26 | #use dcast (reshape2) to get it into a summary table 27 | t1=dcast(tes, V1 + V2 + V3 ~ group,value.var='V4') 28 | colnames(t1)[4:ncol(t1)]=paste(names(t1[4:ncol(t1)]),"_prop",sep='') 29 | t2=dcast(tes, V1 + V2 + V3 ~ group,value.var='V5') 30 | colnames(t2)[4:ncol(t2)]=paste(names(t2[4:ncol(t2)]),"_met",sep='') 31 | t3=dcast(tes, V1 + V2 + V3 ~ group,value.var='V6') 32 | colnames(t3)[4:ncol(t3)]=paste(names(t3[4:ncol(t3)]),"_unmet",sep='') 33 | 34 | #make a table of it all 35 | tout=cbind(t1,t2[,4:ncol(t2)],t3[,4:ncol(t3)]) 36 | 37 | #write it out, and write out a version with only rows with data for all samples 38 | write.table(tout,'all_dmr_metvalues.txt',sep='\t',row.names=F,quote=F) 39 | toutnarm=na.omit(tout) 40 | write.table(toutnarm,'all_dmr_metvalues_noNA.txt',sep='\t',row.names=F,quote=F) 41 | 42 | # 43 | -------------------------------------------------------------------------------- /MethylC/dss_calling.r: -------------------------------------------------------------------------------- 1 | # Script to perform DSS DMR calling 2 | # Need to enter manually; not setup for running 3 | # Make sure files are converted into right format using DSS_file_prep.r 4 | options(echo=T) 5 | args=commandArgs(trailingOnly=T) 6 | print(args) 7 | 8 | # install DSS 9 | # source("http://bioconductor.org/biocLite.R") 10 | # biocLite("DSS") 11 | 12 | # Define arguments 13 | context = args[1] 14 | pvalue = args[2] 15 | dlt = args[3] 16 | condition1 = args[4] 17 | condition2 = args[5] 18 | 19 | # Read in correctly formatted files 20 | files <- dir(pattern = paste0(context,".output")) 21 | 22 | # Define sample groups 23 | group1 <- 
files[condition1] 24 | group2 <- files[condition2] 25 | 26 | library(DSS) 27 | # read input files in DSS format (chr, pos, N, X) 28 | dat1.1 <- read.delim(unlist(group1)[1]) 29 | dat1.2 <- read.delim(unlist(group1)[2]) 30 | dat1.3 <- read.delim(unlist(group1)[3]) 31 | 32 | dat2.1 <- read.delim(unlist(group2)[1]) 33 | dat2.2 <- read.delim(unlist(group2)[2]) 34 | dat2.3 <- read.delim(unlist(group2)[3]) 35 | 36 | # setup bsseq object 37 | BSobj <- makeBSseqData(list(dat1.1,dat1.2,dat1.3,dat2.1,dat2.2,dat2.3),sampleNames=c("C1","C2","C3","N1","N2","N3")) 38 | 39 | # Estimation of methylation means with smoothing by moving averages and smaller smoothing window 40 | dmlTest <- DMLtest(BSobj,group1=c("C1","C2","C3"), group2=c("N1","N2","N3"),smoothing=TRUE,smoothing.span=100) 41 | 42 | # identify DMRs based on dmltesting and write out to file 43 | dmrs <- callDMR(dmlTest, delta=dlt, minlen=50, minCG=3, pct.sig=0.5, dis.merge=50, p.threshold=pvalue) 44 | 45 | ## look at distributions of test statistics and p-values 46 | par(mfrow=c(2,2)) 47 | hist(dmlTest$stat, 100, main="test statistics") 48 | hist(dmlTest$pval, 100, main="P values") 49 | hist(dmlTest$fdr, 100, main="FDR values") 50 | hist(dmlTest$diff, 100, main="estimates") 51 | dev.off() 52 | 53 | # filename 54 | file1=paste0(group1,"vs",group2,"_",context,"_delta=",dlt,"_p=",pvalue,".bed") 55 | 56 | # write out file 57 | write.table(dmrs,file=file1,quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE) 58 | -------------------------------------------------------------------------------- /MethylC/merge_wigs.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Produce methylome correlation matrix using specified BED files 3 | 4 | args = commandArgs(trailingOnly=T) 5 | 6 | if(length(args) != 4){ 7 | print('Args missing!') 8 | print('USAGE: merge_wigs.r ') 9 | print('EXAMPLE merge_wigs.r CHH 0 0 1') 10 | quit() 11 | n 12 | } 13 | 14 | context=args[1] 15 | 
bin=paste0(args[2],"bp") 16 | cov=paste0(args[3],"cov") 17 | type=args[4] 18 | 19 | print(c(context, bin, cov, type)) 20 | 21 | library(tidyverse) 22 | library(gplots) 23 | 24 | if(bin == '0bp' & cov == '0cov' ){ 25 | files <- dir(pattern=paste(context)) %>% 26 | subset(subset=substr(., start=nchar(.)-2, stop=nchar(.)) == 'bed') 27 | } else { 28 | files <- dir(pattern=paste(context,bin,cov,sep="_")) 29 | } 30 | 31 | print(files) 32 | 33 | if(type==1){ 34 | 35 | data <- data_frame(files) %>% 36 | mutate(file_contents = map(files, read_delim, delim='\t', col_names=F, skip=1)) %>% 37 | unnest() %>% 38 | filter(X1 != 'Mt' & X1 != 'ChrM' & X1 != 'Pt' & X1 != 'ChrC') %>% 39 | select(files, X1, X2, X3, X4) %>% 40 | mutate(sample=sapply(strsplit(files, '_'), function(l) l[1])) %>% 41 | mutate(genotype=sapply(strsplit(sample, '-'), function(l) l[1])) %>% 42 | mutate(rep=sapply(strsplit(sample, '-'), function(l) l[2])) %>% 43 | mutate(X1=ifelse(substr(X1, start=1, stop=3)=="Chr",paste0(X1),paste0("Chr",X1))) %>% 44 | na.omit() %>% 45 | group_by(X1, X2, X3, genotype, rep) %>% 46 | summarise(met = mean(X4)) %>% 47 | unite(temp, genotype, rep) %>% 48 | spread(key=temp, value=met) %>% 49 | na.omit() %>% 50 | ungroup() %>% 51 | select(-X1, -X2, -X3) %>% 52 | cor() %>% 53 | as.matrix() 54 | 55 | ### heatmap 56 | pdf(file=paste0('wig_cor_',context,'_reps.pdf'), width = 0, height = 0, paper="a4r") 57 | 58 | heatmap.2(data, 59 | trace='none', 60 | density.info='none', 61 | symm=F, 62 | symkey=F, 63 | key=T, 64 | colsep = 1:ncol(data), 65 | rowsep = 1:nrow(data), 66 | sepcolor = "white", 67 | sepwidth = c(0.001,0.001), 68 | dendrogram='both', 69 | margins = c(8,8), 70 | cexCol = 1, 71 | cexRow = 1) 72 | dev.off() 73 | 74 | } else { 75 | 76 | data <- data_frame(files) %>% 77 | mutate(file_contents = map(files, read_delim, delim='\t', col_names=F, skip=1)) %>% 78 | unnest() %>% 79 | filter(X1 != 'Mt' & X1 != 'ChrM' & X1 != 'Pt' & X1 != 'ChrC') %>% 80 | select(files, X1, X2, X3, X4) %>% 
81 | mutate(sample=sapply(strsplit(files, '_'), function(l) l[1])) %>% 82 | mutate(genotype=sapply(strsplit(sample, '-'), function(l) l[1])) %>% 83 | mutate(rep=sapply(strsplit(sample, '-'), function(l) l[2])) %>% 84 | mutate(X1=ifelse(substr(X1, start=1, stop=3)=="Chr",paste0(X1),paste0("Chr",X1))) %>% 85 | na.omit() %>% 86 | group_by(X1, X2, X3, genotype) %>% 87 | summarise(met = mean(X4)) %>% 88 | spread(key=genotype, value=met) %>% 89 | na.omit() %>% 90 | ungroup() %>% 91 | select(-X1, -X2, -X3) %>% 92 | cor() %>% 93 | as.matrix() 94 | 95 | ## heatmap 96 | pdf(file=paste0('wig_cor_',context,'.pdf'), width = 0, height = 0, paper="a4r") 97 | 98 | heatmap.2(data, 99 | trace='none', 100 | density.info='none', 101 | symm=F, 102 | symkey=F, 103 | key=T, 104 | colsep = 1:ncol(data), 105 | rowsep = 1:nrow(data), 106 | sepcolor = "white", 107 | sepwidth = c(0.001,0.001), 108 | dendrogram='both', 109 | margins = c(8,8), 110 | cexCol = 1, 111 | cexRow = 1) 112 | dev.off() 113 | } 114 | -------------------------------------------------------------------------------- /MethylC/met_signatures.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # re-extract DNA methylation, at custom sequence contexts, from BAM file and produce cytosine report 5 | # perform in 4_bismark output sub-directory of wgbs workflow 6 | 7 | if [ "$#" -ne 6 ]; then 8 | echo "USAGE: met-sign.sh " 9 | echo "EXAMPLE: met-sign.sh SE CHH sample.bam Araport_mRNA.sorted.bed sample mRNA" 10 | exit 1 11 | fi 12 | 13 | layout=$1 14 | context=$2 15 | fl=$3 16 | annopath=$4 17 | sample=$5 18 | outname=$6 19 | 20 | echo "Extracting CX report from $1 BAM ..." 
21 | 22 | if [ $layout == "SE" ]; then 23 | bismark_methylation_extractor --comprehensive --multicore 4 --cytosine_report --CX --genome_folder ~/TAIR10/ --report --buffer_size 8G -s ${fl} 24 | fi 25 | 26 | if [ $layout == "PE" ]; then 27 | bismark_methylation_extractor --comprehensive --multicore 4 --cytosine_report --CX --genome_folder ~/TAIR10/ --report --buffer_size 8G -p ${fl} 28 | fi 29 | 30 | gzip -d *cov.gz 31 | sortBed -i ${fl::-3}bismark.cov > ${fl::-3}bismark.bed 32 | bedfile="${fl::-3}bismark.bed" 33 | 34 | echo "reports extracted" 35 | echo "$context from $bedfile" 36 | echo $1 $2 $3 $4 $5 $6 37 | 38 | # re-organise report, grep context, and awk to remove C and M 39 | awk '{print $1 "\t" $2 "\t" $2+1 "\t" $6 "\t" $7}' ${fl::-3}CX_report.txt | grep "$context" | awk -F$'\t' ' $1 != "ChrC" && $1 != "ChrM" ' > ${fl::-3}${context}_report.bed 40 | 41 | sortBed -i ${fl::-3}${context}_report.bed > ${fl::-3}${context}_report.sorted.bed 42 | 43 | # intersect sub-context info to bismark.cov 44 | intersectBed -wo -sorted -a ${fl::-3}${context}_report.sorted.bed -b $bedfile | awk 'BEGIN { OFS = "\t" } {print $1, $2, $3, $4, $5, $9}' > ${sample}-${outname}-sub${context}-report.bed 45 | 46 | # closest to get info across annotation file and subset to within 1kb 47 | closestBed -D "b" -a ${sample}-${outname}-sub${context}-report.bed -b $annopath | awk -F$'\t' '$NF<1000 && $NF>-1000' > ${sample}-${outname}-sub${context}-report.1k.bed 48 | 49 | echo "done... cleaning..." 
50 | 51 | rm C*txt 52 | rm *_report.txt 53 | rm *bedGraph.gz 54 | rm *M-bias.txt 55 | rm $bedfile 56 | rm *cov 57 | rm ${fl::-3}${context}_report*bed 58 | rm ${sample}-${outname}-sub${context}-report.bed 59 | 60 | echo "R" 61 | 62 | Rscript ~/scripts/rel_methylation_plots_v2.r ${sample} ${outname} ${context} 63 | 64 | echo "DONE" 65 | -------------------------------------------------------------------------------- /MethylC/methimpute.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | options(echo=T) 3 | library(fields) 4 | args=commandArgs(trailingOnly=T) 5 | print(args) 6 | 7 | ## Citation 8 | # Taudt, A., Roquis, D., Vidalis, A., Wardenaar, R., Johannes, F., and Colome-Tatché́-Tatché, M. (2018). METHimpute: imputation-guided construction of complete methylomes from WGBS data. BMC Genomics 19: 444. 9 | 10 | ## Perform METHimpute to get imputed/recalibrated genome-wide methylation levels at single Cs and 100bp tiles 11 | # https://github.com/ataudt/methimpute/blob/master/README.md 12 | # https://github.com/ataudt/methimpute/blob/master/vignettes/methimpute.pdf 13 | 14 | ### Installation 15 | # install.packages("devtools") 16 | # source("http://bioconductor.org/biocLite.R") 17 | # biocLite(c("GenomicRanges")) 18 | # library(devtools) 19 | # install_github("ataudt/methimpute") 20 | 21 | ### Input files 22 | # Run in bash to get 1-based genome-wide cytosine report 23 | # bismark_methylation_extractor --multicore 4 --cytosine_report --CX --genome_folder $HOME/TAIR10 *sorted.bam 24 | 25 | ## load library 26 | library(methimpute) 27 | library(tidyverse) 28 | 29 | ## file "CX_report.txt" 30 | file <- args[1] 31 | outname <- args[2] 32 | 33 | ## chromosome lengths from methimpute 34 | data(arabidopsis_chromosomes) 35 | arabidopsis_chromosomes$chromosome <- sub('chr', 'Chr', arabidopsis_chromosomes$chromosome) 36 | 37 | ## data import 38 | bismark.data <- importBismark(file, 
chrom.lengths=arabidopsis_chromosomes) 39 | 40 | ## Get positions of all cytosines to inflate methylation data (include non-covered sites) 41 | fasta.file <- '~/TAIR10/chromosomes/arabidopsis_seq.fa' 42 | cytosine.positions = extractCytosinesFromFASTA(fasta.file, contexts = c('CG','CHG','CHH')) 43 | methylome = inflateMethylome(bismark.data,cytosine.positions) 44 | print(methylome) 45 | 46 | ## Obtain correlation parameters (methylation levels from adjacent cytosines) 47 | distcor = distanceCorrelation(methylome, separate.contexts = TRUE) 48 | 49 | ## Estimate decay parameter for distancce dependeny of the transition probabilities in HMM 50 | fit = estimateTransDist(distcor) 51 | 52 | ## HMM for complete set using transition probabilities 53 | model = callMethylationSeparate(data = methylome, transDist = fit$transDist, num.threads = 4) 54 | # print(model) 55 | 56 | ## At genes and TE coordinates 57 | data(arabidopsis_genes) 58 | seqlevels(arabidopsis_genes) <- sub('chr', 'Chr', seqlevels(arabidopsis_genes)) 59 | data(arabidopsis_TEs) 60 | seqlevels(arabidopsis_TEs) <- sub('chr', 'Chr', seqlevels(arabidopsis_TEs)) 61 | 62 | ## METHimpute plotting 63 | pdf(paste0(outname,"_methimpute_HMMfit_enrichment.pdf")) 64 | print(fit$plot) 65 | plotHistogram(model, total.counts=5) 66 | plotScatter(model) 67 | plotTransitionProbs(model) 68 | plotConvergence(model) 69 | plotPosteriorDistance(model$data) 70 | plotEnrichment(model, annotation=arabidopsis_genes) 71 | plotEnrichment(model, annotation=arabidopsis_TEs) 72 | dev.off() 73 | 74 | ## Export full fitted HMM model 75 | # exportMethylome(model, paste0(outname,"_methimpute_HMMfit.tsv")) 76 | 77 | ## Output recalibrated methylation levels for downstream analysis akin to bismark cov files 78 | df <- methods::as(model$data, 'data.frame') %>% 79 | select(seqnames, start, end, context, rc.meth.lvl) 80 | 81 | df_CG <- subset(df, context == "CG") %>% 82 | select(-context) %>% 83 | mutate(rc.meth.lvl = rc.meth.lvl * 100) %>% 84 | 
utils::write.table(., file = paste0(outname,"_recal_CG.bed.cov"), quote = F, sep = '\t', row.names = F, col.names = F) 85 | 86 | df_CHG <- subset(df, context == "CHG") %>% 87 | select(-context) %>% 88 | mutate(rc.meth.lvl = rc.meth.lvl * 100) %>% 89 | utils::write.table(., file = paste0(outname,"_recal_CHG.bed.cov"), quote = F, sep = '\t', row.names = F, col.names = F) 90 | 91 | df_CHH <- subset(df, context == "CHH") %>% 92 | select(-context) %>% 93 | mutate(rc.meth.lvl = rc.meth.lvl * 100) %>% 94 | utils::write.table(., file = paste0(outname,"_recal_CHH.bed.cov"), quote = F, sep = '\t', row.names = F, col.names = F) 95 | 96 | ## Binned methylation output of recalibrated weighted methylation levels 97 | df_100bp <- binMethylome(model$data, binsize=100, contexts=c("CG","CHG","CHH"), columns.average="rc.meth.lvl") 98 | 99 | df_100bp_CG <- methods::as(df_100bp$CG, 'data.frame') %>% 100 | select(seqnames, start, end, rc.meth.lvl) %>% 101 | mutate(rc.meth.lvl = rc.meth.lvl * 100) %>% 102 | mutate(start = start - 1) %>% 103 | mutate(end = end - 1) %>% 104 | utils::write.table(., file = paste0(outname,"_recal_CG_100bp.bed"), quote = F, sep = '\t', row.names = F, col.names = F) 105 | 106 | df_100bp_CHG <- methods::as(df_100bp$CHG, 'data.frame') %>% 107 | select(seqnames, start, end, rc.meth.lvl) %>% 108 | mutate(rc.meth.lvl = rc.meth.lvl * 100) %>% 109 | mutate(start = start - 1) %>% 110 | mutate(end = end - 1) %>% 111 | utils::write.table(., file = paste0(outname,"_recal_CHG_100bp.bed"), quote = F, sep = '\t', row.names = F, col.names = F) 112 | 113 | df_100bp_CHH <- methods::as(df_100bp$CHH, 'data.frame') %>% 114 | select(seqnames, start, end, rc.meth.lvl) %>% 115 | mutate(rc.meth.lvl = rc.meth.lvl * 100) %>% 116 | mutate(start = start - 1) %>% 117 | mutate(end = end - 1) %>% 118 | utils::write.table(., file = paste0(outname,"_recal_CHH_100bp.bed"), quote = F, sep = '\t', row.names = F, col.names = F) 119 | 120 | 
--------------------------------------------------------------------------------
/MethylC/pca_wigs.r:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# merge wigs and perform PCAs
#
# Usage: pca_wigs.r <context> [cor method]
#   args[1]  sequence context used to find files named *<context>_100bp.wig
#   args[2]  correlation method: "pearson", "kendall" or "spearman";
#            anything else (including a missing argument) falls back to
#            "pearson"

args <- commandArgs(trailingOnly = TRUE)
print(args)
context <- args[1]

# Validate the requested method once. `%in%` returns FALSE for NA, so a missing
# second argument and an unsupported name both resolve to the default.
# (Previously a second ifelse() re-assigned the raw args[2], which let
# unsupported names through.)
valid_methods <- c("pearson", "kendall", "spearman")
method <- if (args[2] %in% valid_methods) args[2] else "pearson"
print(paste("cor method = ", method))

library(tidyverse)

# One row per input file, each file read into a list-column and then expanded.
files <- dir(pattern = paste0(context, "_100bp.wig"))
data <- tibble(files) %>%
  mutate(file_contents = map(files, read_delim, delim = "\t", col_names = FALSE, skip = 1)) %>%
  unnest(file_contents) %>%
  # drop organelle sequences under either naming scheme
  filter(X1 != "Mt" & X1 != "ChrM" & X1 != "Pt" & X1 != "ChrC") %>%
  # file names are expected to look like <genotype>-<rep>_...
  mutate(sample = vapply(strsplit(files, "_"), function(l) l[1], character(1))) %>%
  mutate(genotype = vapply(strsplit(sample, "-"), function(l) l[1], character(1))) %>%
  mutate(rep = vapply(strsplit(sample, "-"), function(l) l[2], character(1))) %>%
  # harmonise chromosome names to the "Chr" prefixed form
  mutate(X1 = ifelse(substr(X1, start = 1, stop = 3) == "Chr", paste0(X1), paste0("Chr", X1))) %>%
  na.omit() %>%
  # average replicates per genotype within each bin
  group_by(X1, X2, X3, genotype) %>%
  summarise(met = mean(X4)) %>%
  spread(key = genotype, value = met) %>%
  # keep only bins present in every genotype
  na.omit() %>%
  ungroup() %>%
  select(-X1, -X2, -X3) %>%
  as.matrix()

# PCA analysis
# pc=prcomp(data)
# plot(pc, type ='l' , main='Variance of PCs')
# plot(pc$x[1,], pc$x[2,], xlab = 'PC1', ylab='PC2')
# text(pc$x[1,], pc$x[2,], colnames(data), cex = 0.8, pos=4)
# library(devtools)
# install_github("ggbiplot","vqv")
# library(ggbiplot)
# ggbiplot(pc, obs.scale=1, var.scale=1, groups=ir.species, ellipse = TRUE, circle = TRUE) + theme(legend.direction = 'horizontal', legend.position = 'top')



--------------------------------------------------------------------------------
/MethylC/rel_methylation_plots.r:
--------------------------------------------------------------------------------
# Plot mean CG / CHG / CHH methylation across a feature and its flanks.
#
# args[1]  sample name          (input files: <sample>_<context>_<feature>.1k.bed)
# args[2]  feature/output name
# args[3]  column offset added to reach the annotation columns
#          (presumably the number of columns of the methylation BED - confirm)
# args[4]  combined with args[3] to locate the distance column
#          (distance column index = args[3] + args[4] + 1)
options(echo = TRUE)
library(fields)
args <- commandArgs(trailingOnly = TRUE)
print(args)

f1.end <- as.numeric(args[3])
f2.end <- as.numeric(args[4]) + 1

# Organelle sequence names to discard. The original filter tested 'chrC' only,
# which never matches the "ChrC" naming used elsewhere in this repository;
# both capitalisations are accepted here.
organelles <- c("ChrM", "ChrC", "chrM", "chrC")

# Read one context's closestBed table, drop organelle rows and rows whose
# distance column equals -1.
read_context <- function(context) {
  tab <- read.delim(paste0(args[1], "_", context, "_", args[2], ".1k.bed"), header = FALSE)
  tab <- subset(tab, !is.na(V1) & !(V1 %in% organelles))
  subset(tab, tab[, f1.end + f2.end] != -1)
}

# Convert each site to a relative coordinate and average V4 in 100 bins.
#   < 0        flank before the feature (distance in bp)
#   0 .. 1000  inside the feature, scaled to its length
#   > 1000     flank after the feature (distance in bp + 1000)
# Returns a two-column matrix: bin centre, mean of V4.
# NOTE(review): columns f1.end + 2 / + 3 / + 6 are treated as feature start,
# feature end and strand - confirm against the annotation BED layout.
bin_relative_methylation <- function(tab) {
  strand <- tab[, f1.end + 6]
  feat_start <- tab[, f1.end + 2]
  feat_end <- tab[, f1.end + 3]
  site <- tab[, 2]
  dist <- tab[, f1.end + f2.end]

  # flip the sign of the reported distance for '+' strand features
  real.dist <- ifelse(strand == "+", -1 * dist, dist)

  # position within the feature as a fraction of its length, read 5' -> 3'
  within <- ifelse(strand == "-",
                   ((feat_end - site) / (feat_end - feat_start)) * 1000,
                   ((site - feat_start) / (feat_end - feat_start)) * 1000)
  rel.dist <- ifelse(real.dist == 0,
                     within,
                     ifelse(real.dist > 0, real.dist + 1000, real.dist))

  # clamp overlapping sites that fall just outside 0..1000
  rel.dist <- ifelse(rel.dist < 0 & real.dist == 0, 0,
                     ifelse(rel.dist > 1000 & real.dist == 0, 1000, rel.dist))

  binned <- stats.bin(rel.dist, tab$V4, N = 100)
  cbind(matrix(binned$centers, ncol = 1), binned$stats["mean", ])
}

p.cpg.bin <- bin_relative_methylation(read_context("CG"))
p.chg.bin <- bin_relative_methylation(read_context("CHG"))
p.chh.bin <- bin_relative_methylation(read_context("CHH"))

#create plots
pdf(paste(args[1], '_', args[2], '_methylation.pdf', sep = ''), height = 10, width = 12)
plot(x = NULL, y = NULL, xlim = c(-1000, 2000), ylim = c(0, 100), xlab = '', ylab = '% methylation',
     main = paste(args[1], ' methylation over ', args[2], sep = ''))
lines(p.cpg.bin, col = 1, lwd = 2)
lines(p.chg.bin, col = 2, lwd = 2)
lines(p.chh.bin, col = 3, lwd = 2)
abline(v = 0, lty = 2)
abline(v = 1000, lty = 2)
legend('topright',
       c(paste(args[1], ' - CpG', sep = ''), paste(args[1], ' - CHG', sep = ''), paste(args[1], ' - CHH', sep = '')),
       col = c(1, 2, 3), lwd = 2, lty = 1)
dev.off()

#####################################################################################

# table of the plotted values: bin centre plus one column per context
out <- cbind(p.cpg.bin, p.chg.bin[, 2], p.chh.bin[, 2])
colnames(out) <- c('pos', 'CG', 'CHG', 'CHH')
write.table(out, paste(args[1], '_', args[2], 'values.txt', sep = ''), sep = '\t', row.names = FALSE, quote = FALSE)
--------------------------------------------------------------------------------
/MethylC/rel_methylation_plots_v2.r:
--------------------------------------------------------------------------------
# produce mean 5mC levels for R plotting
#
# args[1] sample name, args[2] feature/output name, args[3] sequence context.
# Reads <sample>-<outname>-sub<context>-report.1k.bed and writes
# <sample>_<context>_<outname>.txt with binned means per value of column X5.
options(echo = TRUE)
library(tidyverse)
library(fields)
args <- commandArgs(trailingOnly = TRUE)
print(args)

smplname <- paste0(args[1])
outname <- paste0(args[2])
context <- paste0(args[3])

# X13 is used as the distance to the feature (0 = overlapping); X8/X9/X12 are
# used as feature start/end/strand - TODO confirm against the closestBed output.
data <- dir(pattern = paste0(smplname, "-", outname, "-sub", context, "-report.1k.bed")) %>%
  read_delim(delim = '\t', col_names = FALSE) %>%
  mutate(rel.dist = ifelse(X13 == 0,
                           ifelse(X12 == "-",
                                  ((X9 - X2) / (X9 - X8)) * 1000,
                                  ((X2 - X8) / (X9 - X8)) * 1000),
                           ifelse(X13 > 0, X13 + 1000, X13))) %>%
  # clamp overlapping sites to the 0..1000 feature body
  mutate(fixy = ifelse(rel.dist < 0 & X13 == 0, 0,
                       ifelse(rel.dist > 1000 & X13 == 0, 1000, rel.dist)))

# Bin each group of X5 separately; the pieces are bound once at the end instead
# of growing a data frame inside a loop. rev() keeps the row order of the
# previous implementation, which prepended each new group.
per_motif <- lapply(unique(data$X5), function(i) {
  a <- subset(data, X5 == i)
  a <- stats.bin(a$fixy, a$X6, N = 100)
  temp <- as.data.frame(cbind(matrix(a$centers, ncol = 1), a$stats["mean", ]))
  temp$motiff <- paste0(i)
  temp
})
out <- do.call(rbind, rev(per_motif))

write.table(out, paste0(paste(smplname, context, outname, sep = '_'), '.txt'),
            quote = FALSE, col.names = TRUE, row.names = FALSE, sep = '\t')
--------------------------------------------------------------------------------
/MethylC/scatman_smooth.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Observe whether annotation feature length correlates with methylation level (technical bias).
# E.g. Do longer TEs have higher levels of methylation?
5 | # Intersect bed file with annotation file; 6 | # Produce scattersmooth plot of methylation level vs feature length. 7 | 8 | if [ "$#" -ne 3 ]; then 9 | echo "scatman_smooth.sh annotation sample outname" 10 | echo "e.g. scatman_smooth.sh ./TAIR_TE_subset.bed 317-1-4 TE" 11 | exit 1 12 | fi 13 | 14 | annotation=$1 15 | sample=$2 16 | outname=$3 17 | 18 | # intersect bed files with annotation file 19 | echo "Performing intersectBed of CHG methylation..." 20 | intersectBed -wa -wb -a ${sample}_CHG.bed -b $annotation > ${sample}_CHG_${outname}.bed 21 | echo "Performing intersectBed of CHH methylation..." 22 | intersectBed -wa -wb -a ${sample}_CHH.bed -b $annotation > ${sample}_CHH_${outname}.bed 23 | echo "Performing intersectBed of CpG methylation..." 24 | intersectBed -wa -wb -a ${sample}_CpG.bed -b $annotation > ${sample}_CpG_${outname}.bed 25 | 26 | # take output and produce smoothed scatterplots 27 | echo "Performing scatterSmooth in R" 28 | 29 | Rscript $HOME/scripts/smooth_scat.r ${sample} ${outname} 30 | 31 | echo "cleanup intermediates" 32 | rm ${sample}_*_${outname}.bed 33 | -------------------------------------------------------------------------------- /MethylC/smooth_scat.r: -------------------------------------------------------------------------------- 1 | # R script for scatman_smooth.sh to produce smoothed scatterplots for methylation levels vs feature length 2 | # Will produce plots in the order: CpG, CHG, CHH 3 | 4 | args=commandArgs(trailingOnly=T) 5 | print(args) 6 | 7 | #cpg 8 | a <- read.delim(paste0(args[1],"_CpG_",args[2],".bed"), header=F) 9 | a$length <- a$V7 - a$V6 10 | 11 | #chg 12 | b <- read.delim(paste0(args[1],"_CHG_",args[2],".bed"), header=F) 13 | b$length <- b$V7 - b$V6 14 | 15 | #chh 16 | c <- read.delim(paste0(args[1],"_CHH_",args[2],".bed"), header=F) 17 | c$length <- c$V7 - c$V6 18 | 19 | pdf(file=paste0(args[1],"_",args[2],".pdf")) 20 | par(mfrow=c(2,2)) 21 | smoothScatter(x=a$length, y=a$V4, ylab="CpG Methylation", xlab="TE 
Length (bp)", colramp = colorRampPalette(c("royalblue", "yellow", "red"))) 22 | smoothScatter(x=b$length, y=b$V4, ylab="CHG Methylation", xlab="TE Length (bp)", colramp = colorRampPalette(c("royalblue", "yellow", "red"))) 23 | smoothScatter(x=c$length, y=c$V4, ylab="CHH Methylation", xlab="TE Length (bp)", colramp = colorRampPalette(c("royalblue", "yellow", "red"))) 24 | dev.off() 25 | 26 | -------------------------------------------------------------------------------- /MethylC/wgbs_cov_to_TDF.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # convert bismark cov files to IGV format, then produce TDF files of that data. 5 | # Use in directory with all cov files of interest. 6 | # Can produce TDF for depth or prop. methylation 7 | 8 | for FILE in *bismark.cov 9 | do 10 | cat -n $FILE | awk -v OFS="\t" '{print $2, $3-1, $4, $1, $5}' > ${FILE%%.bismark.cov}.igv 11 | java -Xmx2g -Djava.awt.headless=true -jar /home/diep/bin/IGVTools/igvtools.jar toTDF ${FILE%%.bismark.cov}.igv ${FILE%%.bismark.cov}.tdf /home/diep/Araport11/Araport11.genome 12 | done 13 | 14 | # Depth of coverage 15 | #cov=read.delim(flist[i],head=F) 16 | #cov$V2=cov$V2-1 17 | #cov$id=seq(1:nrow(cov)) 18 | #cov$V7=cov$V5+cov$V6 19 | #cov=cov[,c(1,2,3,7,8)] 20 | -------------------------------------------------------------------------------- /MethylC/wgbs_custom_bins.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # Generate mean methylation levels into custom bins with desired read depth/coverage from per-site BED files 5 | 6 | if [ "$#" -lt 5 ]; then 7 | echo "Missing arguments!" 
8 | echo "USAGE: wgbs_custom_bins.sh " 9 | echo "EXAMPLE: wgbs_custom_bins.sh col0-r1 bed /home/diep/TAIR10/TAIR10_Chr.all.fasta 15 100" 10 | exit 1 11 | fi 12 | 13 | bed=$1 14 | file=$2 15 | fas=$3 16 | cov=$4 17 | bin=$5 18 | window=$(expr $bin - 1) 19 | 20 | echo "Weighted methylation in $bed across $bin bp windows with depth >= $cov ..." 21 | 22 | cg="${bed}_CG*.${file}" 23 | chg="${bed}_CHG*.${file}" 24 | chh="${bed}_CHH*.${file}" 25 | 26 | # use samtools to generate fasta index 27 | samtools faidx $fas 28 | 29 | # use awk on index to make genome file 30 | # https://www.biostars.org/p/70795/ 31 | awk -v OFS='\t' {'print $1,$2'} ${fas}.fai > temp.genome 32 | 33 | # use genome file to make 100bp windows across genome 34 | bedtools makewindows -g temp.genome -w ${window} -s ${bin} | sortBed | awk -F$'\t' ' $1 != "ChrC" && $1 != "ChrM" ' > temp.genome.${bin}bp.sorted.bed 35 | 36 | if [ "$file" == "cov" ]; then 37 | # use bedtool intersect and groupBy to get mean methylation levels per bin based on per-site methylation 38 | echo "Bedtools $cg ..." 39 | sort -k1,1 -k2,2n $cg | bedtools intersect -sorted -wo -a temp.genome.${bin}bp.sorted.bed -b "stdin" | groupBy -g 1,2,3 -c 7,8,9 -o mean,sum,sum | awk -v OFS='\t' '{print $1,$2,$3,$4 = ($5 / ($5+$6)*100 ),$5 = ($5 + $6)}' | awk '{ if ($5 >= '$cov') { print } }' > ${bed}_CG_${bin}bp_${cov}cov.bed 40 | 41 | echo "Bedtools $chg ..." 42 | sort -k1,1 -k2,2n $chg | bedtools intersect -sorted -wo -a temp.genome.${bin}bp.sorted.bed -b "stdin" | groupBy -g 1,2,3 -c 7,8,9 -o mean,sum,sum | awk -v OFS='\t' '{print $1,$2,$3,$4 = ($5 / ($5+$6)*100 ), $5 = ($5 + $6)}' | awk '{ if ($5 >= '$cov') { print } }' > ${bed}_CHG_${bin}bp_${cov}cov.bed 43 | 44 | echo "Bedtools $chh ..." 
45 | sort -k1,1 -k2,2n $chh | bedtools intersect -sorted -wo -a temp.genome.${bin}bp.sorted.bed -b "stdin" | groupBy -g 1,2,3 -c 7,8,9 -o mean,sum,sum | awk -v OFS='\t' '{print $1,$2,$3,$4 = ($5 / ($5+$6)*100 ), $5 = ($5 + $6)}' | awk '{ if ($5 >= '$cov') { print } }' > ${bed}_CHH_${bin}bp_${cov}cov.bed 46 | 47 | fi 48 | 49 | if [ "$file" == "bed" ]; then 50 | # use bedtool intersect and groupBy to get mean methylation levels per bin based on per-site methylation 51 | echo "Bedtools $cg ..." 52 | sort -k1,1 -k2,2n $cg | bedtools intersect -sorted -wo -a temp.genome.${bin}bp.sorted.bed -b "stdin" | groupBy -g 1,2,3 -c 8,9 -o sum,sum | awk -v OFS='\t' '{print $1,$2,$3,$4 = ($4 / ($4+$5)*100 ),$5 = ($4 + $5)}' | awk '{ if ($5 >= '$cov') { print } }' > ${bed}_CG_${bin}bp_${cov}cov.bed 53 | 54 | echo "Bedtools $chg ..." 55 | sort -k1,1 -k2,2n $chg | bedtools intersect -sorted -wo -a temp.genome.${bin}bp.sorted.bed -b "stdin" | groupBy -g 1,2,3 -c 8,9 -o sum,sum | awk -v OFS='\t' '{print $1,$2,$3,$4 = ($4 / ($4+$5)*100 ), $5 = ($4 + $5)}' | awk '{ if ($5 >= '$cov') { print } }' > ${bed}_CHG_${bin}bp_${cov}cov.bed 56 | 57 | echo "Bedtools $chh ..." 58 | sort -k1,1 -k2,2n $chh | bedtools intersect -sorted -wo -a temp.genome.${bin}bp.sorted.bed -b "stdin" | groupBy -g 1,2,3 -c 8,9 -o sum,sum | awk -v OFS='\t' '{print $1,$2,$3,$4 = ($4 / ($4+$5)*100 ), $5 = ($4 + $5)}' | awk '{ if ($5 >= '$cov') { print } }' > ${bed}_CHH_${bin}bp_${cov}cov.bed 59 | 60 | fi 61 | 62 | echo 'cleaning ...' 63 | # CLEAN 64 | rm temp.genome* 65 | 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NGS-scripts 2 | Repository for scripts used for analysing next generation sequencing data organised by technique. 3 | 4 | #### At_rRNA_AGIs.txt 5 | List of rRNA loci for removal prior to DEG calling. 
6 | 7 | #### TAIR10_annotation.sh 8 | Download TAIR10 gff and produce annotation files 9 | 10 | #### TruSeq-adapters.fa 11 | FASTA file containing illumina adapter sequences for scythe step in RNA-seq alignment. 12 | 13 | #### Araport11_assemble.sh 14 | Build annotation files based on araport11 gff. 15 | 16 | #### average_cov.sh 17 | Calculate average depth using samtools depth on sorted BAM file. 18 | 19 | #### gene_to_gene_anno.sh 20 | Set of commands (bash & R) to take TAIR GFF files and produce annotation files, here specifically making a annotation file of genes in tandem orientation. 21 | 22 | -------------------------------------------------------------------------------- /RNA/BAM_to_5p_bigWigs.sh: -------------------------------------------------------------------------------- 1 | set -eu 2 | 3 | # Produce 5p end coverage data from BAM files from GMUCT or PARE-seq in bedgraph format 4 | # Then produce bigWigs files for viewing delight 5 | # Run in directory with sam converted, sorted, indexed bam file 6 | # Ensure genome index genome & chromosome sizes are prepared: 7 | # samtools faidx TAIR10_Chr.all.fasta | cut -f1,2 TAIR10_Chr.all.fasta.fai > TAIR10_Chr.all.fasta.len 8 | # Make sure you have kentUtils: https://github.com/ENCODE-DCC/kentUtils 9 | # git clone git://github.com/ENCODE-DCC/kentUtils.git 10 | 11 | ### CONDA environment is installed 12 | # conda create --name Bedtools 13 | # conda install -n Bedtools -c bioconda bedtools 14 | 15 | if [ "$#" -lt 4 ]; then 16 | echo "Missing arguments!" 17 | echo "USAGE: BAM_to_5p_bigWig.sh " 18 | echo "EXAMPLE: BAM_to_5p_bigWig.sh col0-r1.bam SE unstranded,stranded TAIR10_Chr.all.fasta.len" 19 | exit 1 20 | fi 21 | 22 | smp=$1 23 | lay=$2 24 | str=$3 25 | chrc_sizes=$4 26 | 27 | echo "" 28 | echo "sample = $1" 29 | echo "layout = $2" 30 | echo "strand = $3" 31 | echo "chr_size = $4" 32 | echo "" 33 | echo "Produce bigWig file(s) for 5p read ends from $smp ..." 
34 | echo "" 35 | 36 | 37 | if [[ "$lay" == "SE" ]] && [[ "$str" == "unstranded" ]]; then 38 | 39 | reads=$(samtools view -F 260 -c $smp) 40 | scaling_factor=$(bc <<< "scale=6;1000000/$reads") 41 | 42 | echo "BAM to bedgraph ..." 43 | # unstranded bedgraph of 5' read end coverage scaled to RPM 44 | bedtools genomecov -bga -5 -scale $scaling_factor -ibam $smp > ${smp%%bam}5p.bg 45 | 46 | # convert bedgraph to bigWig 47 | echo "bigWig ..." 48 | $HOME/bin/kentUtils/bin/linux.x86_64/bedGraphToBigWig ${smp%%bam}5p.bg ${chrc_sizes} ${smp%%bam}5p.bigWig 49 | 50 | 51 | fi 52 | 53 | if [[ "$lay" == "SE" ]] && [[ "$str" == "stranded" ]] ; then 54 | # https://www.biostars.org/p/179035/ 55 | # extract reads from + and - strand 56 | 57 | reads=$(samtools view -F 260 -c $smp) 58 | scl=$(bc <<< "scale=6;1000000/$reads") 59 | 60 | # reverse strand 61 | samtools view -@ 2 -f 16 -b $smp > ${smp%%bam}reverse.bam 62 | # forward strand 63 | samtools view -@ 2 -F 16 -b $smp > ${smp%%bam}forward.bam 64 | 65 | echo "BAM to stranded bedgraphs ..." 66 | # reverse/minus bg 67 | bedtools genomecov -bga -5 -scale -${scl} -ibam ${smp%%bam}reverse.bam > ${smp%%bam}minus.5p.bg 68 | # forward/plus bg 69 | bedtools genomecov -bga -5 -scale $scl -ibam ${smp%%bam}forward.bam > ${smp%%bam}plus.5p.bg 70 | 71 | echo "bigWigs..." 72 | $HOME/bin/kentUtils/bin/linux.x86_64/bedGraphToBigWig ${smp%%bam}plus.5p.bg ${chrc_sizes} ${smp%%bam}plus.5p.bigWig 73 | $HOME/bin/kentUtils/bin/linux.x86_64/bedGraphToBigWig ${smp%%bam}minus.5p.bg ${chrc_sizes} ${smp%%bam}minus.5p.bigWig 74 | 75 | rm ${smp%%bam}reverse.bam ${smp%%bam}forward.bam 76 | 77 | fi 78 | 79 | 80 | if [[ "$lay" == "PE" ]] && [[ "$str" == "unstranded" ]] ; then 81 | 82 | reads=$(samtools view -F 260 -c $smp) 83 | frags=$(expr $reads / 2) 84 | scaling_factor=$(bc <<< "scale=6;1000000/$frags") 85 | 86 | echo "BAM to bedgraph ..." 
87 | # unstraned bedgraph of 5' read end coverage scaled to RPM 88 | bedtools genomecov -bga -5 -scale $scaling_factor -ibam $smp > ${smp%%bam}5p.bg 89 | 90 | # convert bedgraph to bigWig 91 | echo "bigWig ..." 92 | $HOME/bin/kentUtils/bin/linux.x86_64/bedGraphToBigWig ${smp%%bam}5p.bg ${chrc_sizes} ${smp%%bam}5p.bigWig 93 | 94 | fi 95 | 96 | 97 | if [[ "$lay" == "PE" ]] && [[ "$str" == "stranded" ]] ; then 98 | 99 | reads=$(samtools view -F 260 -c $smp) 100 | frags=$(expr $reads / 2) 101 | scl=$(bc <<< "scale=6;1000000/$frags") 102 | 103 | 104 | echo "Extract properly-paired read mates (+ flags 99/147; - flags 83/163) from paired-end BAM files" 105 | # http://seqanswers.com/forums/showthread.php?t=29399 106 | 107 | # need sorted bam 108 | samtools sort -@ 8 ${smp} -o ${smp%%bam}sorted.bam 109 | smp="${smp%%bam}sorted.bam" 110 | 111 | # R1 forward 112 | samtools view -@ 8 -f 99 -b $smp > ${smp%%bam}R1F.bam 113 | # R2 reverse 114 | samtools view -@ 8 -f 147 -b $smp > ${smp%%bam}R2R.bam 115 | # FORWARD R1 read pairs 116 | samtools merge -f ${smp%%bam}forward.bam ${smp%%bam}R1F.bam ${smp%%bam}R2R.bam 117 | 118 | # R1 reverse 119 | samtools view -@ 8 -f 83 -b $smp > ${smp%%bam}R1R.bam 120 | # R2 forward 121 | samtools view -@ 8 -f 163 -b $smp > ${smp%%bam}R2F.bam 122 | # REVERSE R1 read pairs 123 | samtools merge -f ${smp%%bam}reverse.bam ${smp%%bam}R1R.bam ${smp%%bam}R2F.bam 124 | 125 | rm $smp ${smp%%bam}R1F.bam ${smp%%bam}R2R.bam ${smp%%bam}R1R.bam ${smp%%bam}R2F.bam 126 | 127 | echo "BAM to stranded bedgraph ..." 128 | # minus strand 129 | bedtools genomecov -bga -5 -scale -${scl} -ibam ${smp%%bam}reverse.bam > ${smp%%bam}minus.5p.bg 130 | # plus strand 131 | bedtools genomecov -bga -5 -scale ${scl} -ibam ${smp%%bam}forward.bam > ${smp%%bam}plus.5p.bg 132 | 133 | echo "bigWigs..." 
$HOME/bin/kentUtils/bin/linux.x86_64/bedGraphToBigWig ${smp%%bam}plus.5p.bg ${chrc_sizes} ${smp%%bam}plus.5p.bigWig
$HOME/bin/kentUtils/bin/linux.x86_64/bedGraphToBigWig ${smp%%bam}minus.5p.bg ${chrc_sizes} ${smp%%bam}minus.5p.bigWig

rm ${smp%%bam}forward.bam ${smp%%bam}reverse.bam

fi


--------------------------------------------------------------------------------
/RNA/BAM_to_EJC.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -u

# Script to extract genome coverage across features of interest
# optimised to calculate 5'-P end frequency at exons from PARE or GMUCT
# layout (SE or PE) is only used to derive the RPM scaling factor

### CONDA environment is installed
# conda create --name ngs_plots
# conda install -n ngs_plots -c bioconda bedtools
# conda install -n ngs_plots r-fields
# conda install -n ngs_plots -c r r-tidyverse
# conda activate ngs_plots

if [ "$#" -lt 4 ]; then
    echo "Missing arguments!"
    echo "USAGE: BAM_to_EJC.sh <.BAM> <layout> <bedfile> <feature>"
    echo "treat as unstranded only (degradome)"
    echo "EXAMPLE: BAM_to_EJC.sh col0_rep1.sorted.bam PE Arabidopsis_thaliana.TAIR10.54_exon-mRNA.bed exon-mRNA"
    exit 1
fi

smp=$1
lay=$2
bedfile=$3
out=$4

echo ""
echo "sample = $1"
echo "layout = $2"
echo "bedfile = $3"
echo "feature = $4"
echo ""

echo "calculate scaling factor"
# primary mapped reads (-F 260 drops unmapped and secondary alignments)
reads=$(samtools view -F 260 -c "$smp")
if [[ "$lay" == "SE" ]]; then
    scl=$(bc <<< "scale=6;1000000/$reads")
elif [[ "$lay" == "PE" ]]; then
    # scale per fragment (two reads each), as BAM_to_5p_bigWigs.sh does.
    # The previous "1000000/reads/2" evaluated left-to-right to
    # 1e6/(2*reads), four times smaller than the per-fragment factor.
    frags=$(( reads / 2 ))
    scl=$(bc <<< "scale=6;1000000/$frags")
else
    echo "layout must be SE or PE (got: $lay)"
    exit 1
fi

echo "BAM to bed..."
40 | bedtools genomecov -bg -5 -ibam $smp > ${smp%%.bam}.5p.bed 41 | closestBed -D "b" -a ${smp%%.bam}.5p.bed -b $bedfile > ${smp%%.bam}_${out}.5p.bed 42 | awk -F$'\t' '$NF<2 && $NF>-2' ${smp%%.bam}_${out}.5p.bed > ${smp%%.bam}_${out}_10bp.5p.bed 43 | 44 | echo 'do maths' 45 | Rscript /home/dganguly/scripts/RNA/rel_expression_plots_ejc.r ${smp%%.bam}_${out}_10bp.5p.bed $scl 46 | 47 | 48 | -------------------------------------------------------------------------------- /RNA/BAM_to_ESI.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -u 3 | 4 | # Calculate terminal stalling index for EJC binding based on 5'-P end counts upstream of exon-exon junctions 5 | 6 | ### CONDA environment is installed 7 | # conda create --name ngs_plots 8 | # conda install -n ngs_plots -c bioconda bedtools 9 | # conda install -n ngs_plots r-fields 10 | # conda install -n ngs_plots -c r r-tidyverse 11 | # conda activate ngs_plots 12 | 13 | if [ "$#" -lt 2 ]; then 14 | echo "Missing arguments!" 15 | echo "USAGE: BAM_to_ESI.sh <.BAM> " 16 | echo "EXAMPLE: BAM_to_ESI.sh col0_rep1.sorted.bam Arabidopsis_thaliana.TAIR10.54_exon-mRNA.bed" 17 | echo "annotation provided should represent exons" 18 | exit 1 19 | fi 20 | 21 | smp=$1 22 | bedfile=$2 23 | 24 | echo "sample = $1" 25 | echo "bedfile = $2" 26 | 27 | echo "BAM to bed..." 
28 | bedtools genomecov -bg -5 -ibam $smp > ${smp%%.bam}.5p.bed 29 | closestBed -D "b" -a ${smp%%.bam}.5p.bed -b $bedfile > ${smp%%.bam}_exon.5p.bed 30 | awk -F$'\t' '$NF<1 && $NF>-51' ${smp%%.bam}_exon.5p.bed > ${smp%%.bam}_exon_ESI.5p.bed 31 | 32 | echo 'do maths' 33 | Rscript /home/dganguly/scripts/RNA/ESI_calculation.r ${smp%%.bam}_exon_ESI.5p.bed 34 | 35 | echo 'cleaning' 36 | rm -v ${smp%%.bam}_exon.5p.bed 37 | 38 | 39 | -------------------------------------------------------------------------------- /RNA/BAM_to_STOP.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -u 3 | 4 | # Script to extract genome coverage across features of interest 5 | # optimised to caluclate 5'-P end frequency adjacent to START or STOP codon from PARE or GMUCT 6 | # only SE 7 | 8 | ### CONDA environment is installed 9 | # conda create --name ngs_plots 10 | # conda install -n ngs_plots -c bioconda bedtools 11 | # conda install -n ngs_plots r-fields 12 | # conda install -n ngs_plots -c r r-tidyverse 13 | # conda activate ngs_plots 14 | 15 | if [ "$#" -lt 5 ]; then 16 | echo "Missing arguments!" 17 | echo "USAGE: BAM_to_STOP.sh <.BAM> " 18 | echo "unstranded only (degradomes)" 19 | echo "EXAMPLE: BAM_to_STOP.sh col0_rep1.sorted.bam SE Arabidopsis_thaliana.TAIR10.54_stop.bed stop 40" 20 | echo "annotation should be start or stop codons (see TAIR_annotation.sh)" 21 | exit 1 22 | fi 23 | 24 | smp=$1 25 | lay=$2 26 | bedfile=$3 27 | out=$4 28 | dis=$5 29 | 30 | echo "" 31 | echo "sample = $1" 32 | echo "layout = $2" 33 | echo "bedfile = $3" 34 | echo "feature = $4" 35 | echo "distance = $5" 36 | echo "" 37 | 38 | echo "calculate scaling factor" 39 | if [[ "$lay" == "SE" ]] ; then scl=$(bc <<< "scale=6;1000000/$(samtools view -F 260 -c $smp)"); fi 40 | if [[ "$lay" == "PE" ]] ; then scl=$(bc <<< "scale=6;1000000/$(samtools view -F 260 -c $smp)/2"); fi 41 | 42 | echo "BAM to bed..." 
43 | bedtools genomecov -bg -5 -ibam $smp > ${smp%%.bam}.5p.bed 44 | closestBed -D "b" -a ${smp%%.bam}.5p.bed -b $bedfile > ${smp%%.bam}_${out}.5p.bed 45 | awk -F$'\t' -v a=$dis '$NF<a && $NF>-a' ${smp%%.bam}_${out}.5p.bed > ${smp%%.bam}_${out}_${dis}bp.5p.bed # keep only 5'-P ends whose closestBed distance (last column) lies within +/- $dis 46 | 47 | echo 'do maths' 48 | Rscript /home/dganguly/scripts/RNA/rel_expression_plots_stop.r ${smp%%.bam}_${out}_${dis}bp.5p.bed $scl 49 | 50 | 51 | -------------------------------------------------------------------------------- /RNA/BAM_to_TSI.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -u 3 | 4 | # Calculate terminal stalling index based on 5'-P end counts at STOP codon 5 | 6 | ### CONDA environment is installed 7 | # conda create --name ngs_plots 8 | # conda install -n ngs_plots -c bioconda bedtools 9 | # conda install -n ngs_plots r-fields 10 | # conda install -n ngs_plots -c r r-tidyverse 11 | # conda activate ngs_plots 12 | 13 | if [ "$#" -lt 2 ]; then 14 | echo "Missing arguments!" 15 | echo "USAGE: BAM_to_TSI.sh <.BAM> " 16 | echo "EXAMPLE: BAM_to_TSI.sh col0_rep1.sorted.bam Arabidopsis_thaliana.TAIR10.54_stop.bed" 17 | echo "annotation provided should represent coordinates of stop codons" 18 | exit 1 19 | fi 20 | 21 | smp=$1 22 | bedfile=$2 23 | 24 | echo "sample = $1" 25 | echo "bedfile = $2" 26 | 27 | echo "BAM to bed..." 
28 | bedtools genomecov -bg -5 -ibam $smp > ${smp%%.bam}.5p.bed 29 | closestBed -D "b" -a ${smp%%.bam}.5p.bed -b $bedfile > ${smp%%.bam}_stop.5p.bed 30 | awk -F$'\t' '$NF<1 && $NF>-51' ${smp%%.bam}_stop.5p.bed > ${smp%%.bam}_stop_TSI.5p.bed 31 | 32 | echo 'do maths' 33 | Rscript /home/dganguly/scripts/RNA/TSI_calculation.r ${smp%%.bam}_stop_TSI.5p.bed 34 | 35 | echo 'cleaning' 36 | rm -v ${smp%%.bam}.5p.bed ${smp%%.bam}_stop.5p.bed 37 | 38 | 39 | -------------------------------------------------------------------------------- /RNA/BAM_to_bedgraph_5p.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # Script to summarise 5'P read ends (e.g. PARE-seq, GMUCT) across features of interest in bedGraph format and scale to reads per million (RPM) 5 | 6 | ### CONDA environment is installed 7 | # conda create --name ngs_plots 8 | # conda install -n ngs_plots -c bioconda bedtools 9 | # conda install -n ngs_plots r-fields 10 | # conda install -n ngs_plots -c r r-tidyverse 11 | # conda activate ngs_plots 12 | 13 | if [ "$#" -lt 5 ]; then 14 | echo "Missing arguments!" 15 | echo "USAGE: BAM_to_bedgraph_5p.sh <.BAM> " 16 | echo "EXAMPLE: BAM_to_bedgraph_5p.sh col0_rep1.sorted.bam PE Arabidopsis_thaliana.TAIR10.54_stop.bed stop 50" 17 | exit 1 18 | fi 19 | 20 | smp=$1 21 | lay=$2 22 | bedfile=$3 23 | out=$4 24 | dis=$5 25 | 26 | echo "" 27 | echo "sample = $1" 28 | echo "layout = $2" 29 | echo "bedfile = $3" 30 | echo "feature = $4" 31 | echo "distance = $5" 32 | echo "" 33 | 34 | echo "calculate scaling factor" 35 | if [[ "$lay" == "SE" ]] ; then scl=$(bc <<< "scale=6;1000000/$(samtools view -F 260 -c $smp)"); fi 36 | if [[ "$lay" == "PE" ]] ; then scl=$(bc <<< "scale=6;1000000/$(samtools view -F 260 -c $smp)/2"); fi 37 | 38 | echo "BAM to bed..." 39 | bedtools genomecov -bg -5 -scale $scl -ibam $smp > ${smp%%bam}5p.bed 40 | 41 | echo 'bedtools for coverage across chosen features...' 
42 | closestBed -D "b" -a ${smp%%bam}5p.bed -b $bedfile > ${smp%%.bam}_${out}.5p.bed 43 | 44 | echo 'subset ...' 45 | awk -F$'\t' -v a=$dis '$NF<a && $NF>-a' ${smp%%.bam}_${out}.5p.bed > ${smp%%.bam}_${out}_${dis}bp.5p.bed # keep only rows whose closestBed distance (last column) lies within +/- $dis 46 | 47 | echo 'do maths' 48 | Rscript /home/dganguly/scripts/RNA/rel_expression_plots.r ${smp%%.bam}_${out}_${dis}bp.5p.bed 49 | 50 | echo 'cleaning' 51 | rm -v ${smp%%.bam}_${out}.5p.bed 52 | 53 | 54 | -------------------------------------------------------------------------------- /RNA/BAM_to_wigs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # Produce files with windowed coverage of RNAseq data (BAM aligned reads) across annotations of interest 5 | # Run in directory with sam converted, sorted, indexed bam file 6 | # Provide path of genome .fa file to produce windowed genome 7 | 8 | if [ "$#" -lt 5 ]; then 9 | echo "Missing arguments!" 10 | echo "USAGE: BAM_to_wigs.sh <.BAM> " 11 | echo "EXAMPLE: BAM_to_wigs.sh col0_rep1.sorted.bam TAIR10_Chr.all.fasta Araport11_TE.bed TE 100" 12 | exit 1 13 | fi 14 | 15 | bam=$1 16 | fas=$2 17 | bedfile=$3 18 | out=$4 19 | size=$5 20 | size_2=$(($size - 1)) 21 | 22 | echo "Make $size bp genome bed ..." 23 | 24 | # use samtools to generate fasta index 25 | samtools faidx $fas 26 | # use awk on index to make genome file 27 | # https://www.biostars.org/p/70795/ 28 | awk -v OFS='\t' {'print $1,$2'} ${fas}.fai > temp.genome 29 | # use genome file to make 100bp windows across genome 30 | bedtools makewindows -g temp.genome -w $size_2 -s $size > temp.genome.${size}bp.bed 31 | sortBed -i temp.genome.${size}bp.bed > temp.genome.${size}bp.sorted.bed 32 | 33 | # use bedtools coverage to get coverage across windows from BAM 34 | # MAKE SURE TO USE -sorted FLAG 35 | bedtools coverage -sorted -a temp.genome.${size}bp.bed -b $bam > ${bam%%.sorted*}_${size}bp.bed 36 | 37 | echo 'cleaning ...' 
38 | # CLEAN 39 | rm temp.genome* 40 | 41 | # sort Bed 42 | sortBed -i ${bam%%.sorted*}_${size}bp.bed > ${bam%%.sorted*}_${size}bp.sorted.bed 43 | 44 | echo 'bedtools ...' 45 | # bedtools to desired annotation 46 | closestBed -D "b" -a ${bam%%.sorted*}_${size}bp.sorted.bed -b $bedfile > ${bam%%.sorted*}_${out}_${size}.bed 47 | 48 | echo 'subset to +1k/-1k ...' 49 | # awk to subset 50 | awk -F$'\t' '$NF<1000 && $NF>-1000' ${bam%%.sorted*}_${out}_${size}.bed > ${bam%%.sorted*}_${out}_${size}.1k.bed 51 | 52 | echo 'final clean ...' 53 | rm ${bam%%.sorted*}_${size}bp.bed ${bam%%.sorted*}_${size}bp.sorted.bed ${bam%%.sorted*}_${out}_${size}.bed 54 | 55 | -------------------------------------------------------------------------------- /RNA/ESI_calculation.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # args[1] = filename 3 | # Required for BAM_to_ESI.sh 4 | # Calculate EJC stalling index from 5'P end counts upstream of exon-exon junctions 5 | 6 | options(echo=T) 7 | library(fields) 8 | library(tidyverse) 9 | args=commandArgs(trailingOnly=T) 10 | print(args) 11 | 12 | # Read in file 13 | input <- read.delim(args[1],head=F) %>% 14 | # Remove reads to plastid and mitochondria 15 | subset(V1 != 'ChrM' & V1 != 'ChrC' & V1 != 'Mt' & V1 != 'Pt') %>% 16 | #exon at least 50 nucleotides 17 | mutate(length = V7 - V6) %>% 18 | subset(length > 49) %>% 19 | # calculate position relative to 3' end of feature 20 | mutate(pos = ifelse(V10 == "+", V2-V7, V6-V3)) %>% 21 | subset(pos > -50 & pos < 1) 22 | 23 | # get total end counts in window 24 | stop_5p_sum <- group_by(input, V8) %>% 25 | summarise(avg_frame=mean(V4), avg_frame_true=sum(V4)/50, total_counts=sum(V4)) 26 | 27 | # Get sum of normalized reads (i.e.normalized occurrence of 5'P ends [Pi] in Lee et al 2019 Plant Cell) then calculate relative frequency per nt 28 | a1 <- group_by(input, V8) %>% 29 | subset(pos == -28 | pos == -27) %>% 30 | summarise(avg_ejc = 
mean(V4), avg_ejc_true = sum(V4)/2) %>% 31 | mutate(avg_frame = stop_5p_sum$avg_frame[match(V8, stop_5p_sum$V8)]) %>% 32 | mutate(total_counts = stop_5p_sum$total_counts[match(V8, stop_5p_sum$V8)]) %>% 33 | subset(total_counts >= 10) %>% 34 | mutate(esi = avg_ejc/avg_frame) 35 | 36 | # name output file 37 | name <- sapply(strsplit(as.character(args[1]),'Aligned'), function(l) l[1]) 38 | 39 | ## diagnostic plot on single sample; points with ESI > 2 are highlighted 40 | pdf(paste0(name,"ESI.pdf")) 41 | plot(y=log2(a1$avg_ejc), x=log2(a1$avg_frame), col = ifelse(a1$esi > 2, "salmon", "grey")) 42 | abline(a=0, b=1) 43 | dev.off() 44 | 45 | ## output 46 | write.table(a1, paste0(name,"ESI.txt"), sep='\t', quote=F, row.names=F) 47 | 48 | -------------------------------------------------------------------------------- /RNA/README.md: -------------------------------------------------------------------------------- 1 | ## Scripts for analysing RNA-sequencing 2 | 3 | ### kallisto_pipe 4 | Pipeline for using Kallisto (Bray et al 2016) for alignment-free transcript quantification. See individual version scripts for running updates and changes. 5 | 6 | ### subread_pipe 7 | Pipeline for using Subread (more specifically Subjunc) for read alignment for subsequent quantification with featureCounts. See individual script version for running changes and modifications. 8 | 9 | ### featureCounts 10 | Use featureCounts for gene expression quantification. featureCounts_exon performs the same but at the exon-level rather than gene-level. 11 | 12 | ### featureCounts_to_edgeR 13 | Template script for differential gene expression testing with edgeR (Robinson et al 2010). 14 | 15 | ### RNAseq_bam_to_100bpwigs 16 | Produce 100bp windows (100bp.bed) of RNAseq coverage from BAMs across annotations of interest. 17 | 18 | ### rel_expression_plots 19 | Get raw read coverage across features of interest from using WIG files as input (see RNAseq_bam_to_100bp wigs). 
20 | 21 | ### RNAseq_bam_to_bedgraph 22 | Produce coverage data from BAM files for RNAseq or ChIP data in bedgraph format, subsequently converting to bigWig files (IGV browsing). 23 | 24 | ### SUPPA_pipe 25 | SUPPA2 pipeline using event-based analysis to detect alternative splicing and isoform usage. 26 | 27 | ### split_file 28 | R script required for splitting files appropriately for SUPPA_pipe. 29 | 30 | ### stringtie_pipe 31 | StringTie2 pipeline that performs reference-guided de novo transcript assembly and quantification. 32 | 33 | ### stringtie_extract_tpm 34 | Supplementary script for stringtie_pipe to extract TPM-based quantification from StringTie2 output. 35 | 36 | 37 | -------------------------------------------------------------------------------- /RNA/SUPPA_pipe_v1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -u 3 | 4 | # Performs event-based splicing analysis using SUPPA2 based on kallisto output 5 | # https://github.com/comprna/SUPPA#command-and-subcommand-structure 6 | # tutorial: https://github.com/comprna/SUPPA/wiki/SUPPA2-tutorial 7 | 8 | ### CONDA environment 9 | # conda create --name 10 | # conda install -n -c bioconda suppa 11 | 12 | 13 | if [ "$#" -lt 5 ]; then 14 | echo "Missing arguments!" 
15 | echo "USAGE: SUPPA_pipe_v1.sh " 16 | echo "EXAMPLE: SUPPA_pipe_v1.sh /home/diepg/ref_seqs/AtRTD2/AtRTD2_QUASI_19April2016.gtf /home/diepg/ws/sal1_AS/raw_files/ col0_rep1,col0_rep2,col0_rep3 grp7_rep1,grp7_rep2,grp7_rep3 RTD2-quasi" 17 | exit 1 18 | fi 19 | 20 | #### Parameters 21 | ## annotation file 22 | I=$1 23 | ## events output name 24 | N=$5 25 | ## kallisto quant files 26 | S=$2 27 | # group 1 IDs 28 | grp1=$3 29 | # group 2 IDs 30 | grp2=$4 31 | 32 | ## quantification 33 | mkdir kallisto_output 34 | 35 | fls=$(dir $S) 36 | for i in $fls; do 37 | mkdir kallisto_output/${i%%_kallisto*}; 38 | cp $S/${i}/*/abundance.tsv kallisto_output/${i%%_kallisto*}/abundance.tsv; done 39 | 40 | #python3 ~/bin/SUPPA-2.3/multipleFieldSelection.py -i kallisto_output/*/abundance.tsv -k 1 -f 5 -o iso_tpm.txt 41 | multipleFieldSelection.py -i kallisto_output/*/abundance.tsv -k 1 -f 5 -o iso_tpm.txt 42 | 43 | ### generateEvents 44 | mkdir generateEvents 45 | cd generateEvents 46 | 47 | ## generate transcript events 48 | # python3 ~/bin/SUPPA-2.3/suppa.py generateEvents -i $I -o $N -f ioi 49 | suppa.py generateEvents -i $I -o $N -f ioi 50 | M="${N}.ioi" 51 | 52 | ## generate local AS events 53 | # python3 ~/bin/SUPPA-2.3/suppa.py generateEvents -i $I -o $N -f ioe -e SE SS MX RI FL 54 | suppa.py generateEvents -i $I -o $N -f ioe -e SE SS MX RI FL 55 | 56 | #Put all the ioe events in the same file: 57 | awk ' 58 | FNR==1 && NR!=1 { while (/^
/) getline; } 59 | 1 {print} 60 | ' *.ioe > ${N}.allevents.ioe 61 | N="${N}.allevents.ioe" 62 | 63 | mv $M ../ 64 | mv $N ../ 65 | 66 | awk ' 67 | FNR==1 && NR!=1 { while (/^
/) getline; } 68 | 1 {print} 69 | ' *.gtf > ${N%%.allevents*}.allevents.gtf 70 | mv *.allevents.gtf ../ 71 | 72 | cd ../ 73 | 74 | ### PSI per event 75 | # python3 ~/bin/SUPPA-2.3/suppa.py psiPerEvent -i $N -e iso_tpm.txt -o ${N%%.allevents*}_events 76 | suppa.py psiPerEvent -i $N -e iso_tpm.txt -o ${N%%.allevents*}_events 77 | 78 | ### Differential splicing with local events 79 | ## PSI and TPM per condition 80 | Rscript $HOME/scripts/RNA/split_file.R ./iso_tpm.txt $grp1 $grp2 ${grp1%%_rep*}_iso.tpm ${grp2%%_rep*}_iso.tpm 81 | 82 | Rscript $HOME/scripts/RNA/split_file.R ./${N%%.allevents*}_events.psi $grp1 $grp2 ${grp1%%_rep*}_events.psi ${grp2%%_rep*}_events.psi 83 | 84 | ## differential splicing analysis 85 | # python3 ~/bin/SUPPA-2.3/suppa.py diffSplice -m empirical -gc -i $N -p ${grp2%%_rep*}_events.psi ${grp1%%_rep*}_events.psi -e ${grp2%%_rep*}_iso.tpm ${grp1%%_rep*}_iso.tpm -o ${N%%.ioe}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_events 86 | suppa.py diffSplice -m empirical -gc -i $N -p ${grp2%%_rep*}_events.psi ${grp1%%_rep*}_events.psi -e ${grp2%%_rep*}_iso.tpm ${grp1%%_rep*}_iso.tpm -o ${N%%.ioe}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_events 87 | 88 | ## differential trascript usage 89 | ### PSI per isoform 90 | # python3 ~/bin/SUPPA-2.3/suppa.py psiPerIsoform -g $I -e iso_tpm.txt -o ${M%%.ioi} 91 | suppa.py psiPerIsoform -g $I -e iso_tpm.txt -o ${M%%.ioi} 92 | 93 | ### Split PSI between 2 conditions: 94 | Rscript $HOME/scripts/RNA/split_file.R ./${M%%.ioi}_isoform.psi $grp1 $grp2 ${grp1%%_rep*}_iso.psi ${grp2%%_rep*}_iso.psi 95 | 96 | ### diffsplice 97 | # python3 ~/bin/SUPPA-2.3/suppa.py diffSplice -m empirical -gc -i $M -p ${grp2%%_rep*}_iso.psi ${grp1%%_rep*}_iso.psi -e ${grp2%%_rep*}_iso.tpm ${grp1%%_rep*}_iso.tpm -o ${M%%.ioi}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_iso 98 | suppa.py diffSplice -m empirical -gc -i $M -p ${grp2%%_rep*}_iso.psi ${grp1%%_rep*}_iso.psi -e ${grp2%%_rep*}_iso.tpm ${grp1%%_rep*}_iso.tpm -o 
${M%%.ioi}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_iso 99 | 100 | ## collect output 101 | mkdir suppa2_output 102 | mv ${N%%.ioe}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_events* -t suppa2_output 103 | mv ${M%%.ioi}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_iso* -t suppa2_output 104 | 105 | 106 | -------------------------------------------------------------------------------- /RNA/SUPPA_pipe_v2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -u 3 | 4 | # Performs event-based splicing analysis using SUPPA2 using stringtie TPM output 5 | 6 | if [ "$#" -lt 5 ]; then 7 | echo "Missing arguments!" 8 | echo "USAGE: SUPPA_pipe_v2.sh " 9 | echo "EXAMPLE: SUPPA_pipe_v2.sh AtRTD2_QUASI_19April2016.gtf raw_files/ col0_rep1,col0_rep2,col0_rep3 grp7_rep1,grp7_rep2,grp7_rep3 RTD2-quasi" 10 | exit 1 11 | fi 12 | 13 | #### Parameters 14 | ## annotation file 15 | I=$1 16 | ## events output name 17 | N=$5 18 | ## extracted TPM (see stringtie_extract_tpm.r) 19 | S=$2 20 | # group 1 IDs 21 | grp1=$3 22 | # group 2 IDs 23 | grp2=$4 24 | 25 | ## quantification 26 | mkdir tpm_output 27 | 28 | fls=$(dir $S) 29 | 30 | for i in $fls; do 31 | mkdir tpm_output/${i%%_stringtie.tpm*}; 32 | cp $S/$i tpm_output/${i%%_stringtie.tpm*}/abundance.tpm; done 33 | 34 | python3 ~/bin/SUPPA-2.3/multipleFieldSelection.py -i tpm_output/*/abundance.tpm -k 1 -f 3 -o iso_tpm.txt 35 | 36 | ### generateEvents 37 | mkdir generateEvents 38 | cd generateEvents 39 | 40 | ## generate transcript events 41 | python3 ~/bin/SUPPA-2.3/suppa.py generateEvents -i $I -o $N -f ioi 42 | M="${N}.ioi" 43 | 44 | ## generate local AS events 45 | python3 ~/bin/SUPPA-2.3/suppa.py generateEvents -i $I -o $N -f ioe -e SE SS MX RI FL 46 | 47 | #Put all the ioe events in the same file: 48 | awk ' 49 | FNR==1 && NR!=1 { while (/^
/) getline; } 50 | 1 {print} 51 | ' *.ioe > ${N}.allevents.ioe 52 | N="${N}.allevents.ioe" 53 | 54 | mv $M ../ 55 | mv $N ../ 56 | 57 | awk ' 58 | FNR==1 && NR!=1 { while (/^
/) getline; } 59 | 1 {print} 60 | ' *.gtf > ${N%%.allevents*}.allevents.gtf 61 | mv *.allevents.gtf ../ 62 | 63 | cd ../ 64 | 65 | ### PSI per event 66 | python3 ~/bin/SUPPA-2.3/suppa.py psiPerEvent -i $N -e iso_tpm.txt -o ${N%%.allevents*}_events 67 | 68 | ### Differential splicing with local events 69 | ## PSI and TPM per condition 70 | Rscript $HOME/scripts/RNA/split_file.R ./iso_tpm.txt $grp1 $grp2 ${grp1%%_rep*}_iso.tpm ${grp2%%_rep*}_iso.tpm 71 | 72 | Rscript $HOME/scripts/RNA/split_file.R ./${N%%.allevents*}_events.psi $grp1 $grp2 ${grp1%%_rep*}_events.psi ${grp2%%_rep*}_events.psi 73 | 74 | ## differential splicing analysis 75 | python3 ~/bin/SUPPA-2.3/suppa.py diffSplice -m empirical -gc -i $N -p ${grp2%%_rep*}_events.psi ${grp1%%_rep*}_events.psi -e ${grp2%%_rep*}_iso.tpm ${grp1%%_rep*}_iso.tpm -o ${N%%.ioe}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_events 76 | 77 | ## differential trascript usage 78 | ### PSI per isoform 79 | python3 ~/bin/SUPPA-2.3/suppa.py psiPerIsoform -g $I -e iso_tpm.txt -o ${M%%.ioi} 80 | 81 | ### Split PSI between 2 conditions: 82 | Rscript $HOME/scripts/RNA/split_file.R ./${M%%.ioi}_isoform.psi $grp1 $grp2 ${grp1%%_rep*}_iso.psi ${grp2%%_rep*}_iso.psi 83 | 84 | ### diffsplice 85 | python3 ~/bin/SUPPA-2.3/suppa.py diffSplice -m empirical -gc -i $M -p ${grp2%%_rep*}_iso.psi ${grp1%%_rep*}_iso.psi -e ${grp2%%_rep*}_iso.tpm ${grp1%%_rep*}_iso.tpm -o ${M%%.ioi}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_iso 86 | 87 | ## collect output 88 | mkdir suppa2_output 89 | mv ${N%%.ioe}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_events* -t suppa2_output 90 | mv ${M%%.ioi}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_iso* -t suppa2_output 91 | 92 | 93 | -------------------------------------------------------------------------------- /RNA/TSI_calculation.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # args[1] = filename 3 | # Runs with BAM_to_TSI.sh 4 | # Calculate terminal stalling 
index from 5'P end counts surrounding the stop codon 5 | 6 | options(echo=TRUE) 7 | library(fields) 8 | library(tidyverse) 9 | args <- commandArgs(trailingOnly=TRUE) 10 | print(args) 11 | 12 | # Read in file 13 | input <- read.delim(args[1],header=FALSE) %>% 14 | # Remove reads to plastid and mitochondria 15 | subset(V1 != 'ChrM' & V1 != 'ChrC' & V1 != 'Mt' & V1 != 'Pt') %>% 16 | # calculate position relative to first base of stop codon 17 | mutate(pos = ifelse(V10 == "+", V2-V6, V7-V3)) %>% 18 | subset(pos > -50 & pos < 1) 19 | 20 | # get total end counts in window 21 | stop_5p_sum <- group_by(input, V8) %>% 22 | summarise(avg_frame=mean(V4), avg_frame_true=sum(V4)/50, total_counts=sum(V4)) 23 | 24 | # Get sum of normalized reads (i.e.normalized occurrence of 5'P ends [Pi] in Lee et al 2019 Plant Cell) then calculate relative frequency per nt 25 | a1 <- group_by(input, V8) %>% 26 | subset(pos == -16 | pos == -17) %>% 27 | summarise(avg_ctrd = mean(V4), avg_ctrd_true = sum(V4)/2) %>% 28 | mutate(avg_frame = stop_5p_sum$avg_frame[match(V8, stop_5p_sum$V8)]) %>% 29 | mutate(total_counts = stop_5p_sum$total_counts[match(V8, stop_5p_sum$V8)]) %>% 30 | subset(total_counts >= 10) %>% 31 | mutate(tsi = avg_ctrd/avg_frame) 32 | 33 | # name output file 34 | name <- sapply(strsplit(as.character(args[1]),'Aligned'), function(l) l[1]) 35 | 36 | ## diagnostic plot on single sample; points with TSI > 2 are highlighted 37 | pdf(paste0(name,"TSI.pdf")) 38 | plot(y=log2(a1$avg_ctrd), x=log2(a1$avg_frame), col = ifelse(a1$tsi > 2, "salmon", "grey")) 39 | abline(a=0, b=1) 40 | dev.off() 41 | 42 | ## output 43 | write.table(a1, paste0(name,"TSI.txt"), sep='\t', quote=FALSE, row.names=FALSE) 44 | 45 | -------------------------------------------------------------------------------- /RNA/featureCounts_v1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Use featureCounts to assign counts to annotated features (e.g. 
genes, transposons) from aligned BAM files 4 | 5 | set -eu 6 | 7 | if [ "$#" -lt 6 ]; then # six positional arguments are required: $6 (outname) is read below under set -u 8 | echo "Missing arguments!" 9 | echo "USAGE: RNAseq_featureCounts.sh <sample.bam> <SE|PE> <strandedness> <annotation file> <format> <outname>" 10 | echo "EXAMPLE: RNAseq_featureCounts.sh col0-r1.sorted.bam PE 1 AtRTD2_19April2016.gtf gtf RTD2" 11 | echo "library strandedness: 0 = unstranded, 1 = stranded, 2 = reverse stranded" 12 | echo "format: saf, bed, gtf" 13 | exit 1 14 | fi 15 | 16 | sample=$1 17 | layout=$2 18 | strand=$3 19 | bedfile=$4 20 | format=$5 21 | outname=$6 22 | 23 | echo "" 24 | echo "sample = $1" 25 | echo "layout = $2" 26 | echo "strand = $3" 27 | echo "bedfile = $4 ($format) ($outname)" 28 | echo "" 29 | echo "$layout $strand featureCounts on $bedfile $format ($outname) in $sample ..." 30 | echo "" 31 | 32 | if [[ $format == "saf" ]]; then 33 | 34 | if [[ $layout == "SE" ]]; then 35 | featureCounts\ 36 | -F 'SAF'\ 37 | -C\ 38 | -T 2\ 39 | -s $strand\ 40 | -a $bedfile\ 41 | -o "${1%%.bam*}_${outname}.counts"\ 42 | $sample 2>&1 | tee -a ../*log 43 | fi 44 | 45 | if [[ $layout == "PE" ]]; then 46 | featureCounts\ 47 | -F SAF\ 48 | -p\ 49 | -C\ 50 | -T 2\ 51 | -s $strand\ 52 | -a $bedfile\ 53 | -o "${1%%.bam*}_${outname}.counts"\ 54 | $sample 2>&1 | tee -a ../*log 55 | fi 56 | 57 | fi 58 | 59 | if [[ $format == "bed" ]]; then 60 | ## convert BED to SAF format 61 | awk -F'\t' '{print $4"\t"$1"\t"$2"\t"$3"\t"$6}' $bedfile > temp.saf 62 | awk 'BEGIN {print "GeneID""\t""Chr""\t""Start""\t""End""\t""Strand"}{print}' temp.saf > temp2.saf 63 | 64 | if [[ $layout == "SE" ]]; then 65 | featureCounts\ 66 | -F SAF\ 67 | -C\ 68 | -T 2\ 69 | -s $strand\ 70 | -a temp2.saf\ 71 | -o "${1%%.bam*}_${outname}.counts"\ 72 | $sample 2>&1 | tee -a ../*log 73 | fi 74 | 75 | if [[ $layout == "PE" ]]; then 76 | featureCounts\ 77 | -F SAF\ 78 | -p\ 79 | -C\ 80 | -T 2\ 81 | -s $strand\ 82 | -a temp2.saf\ 83 | -o "${1%%.bam*}_${outname}.counts"\ 84 | $sample 2>&1 | tee -a ../*log 85 | fi 86 | 87 | rm temp*.saf -v 88 | 89 | fi 90 | 91 | if [[ $format == 
"gtf" ]]; then 92 | 93 | if [[ $layout == "SE" ]]; then 94 | featureCounts\ 95 | -F GTF\ 96 | -C\ 97 | -T 4\ 98 | -s $strand\ 99 | -a $bedfile\ 100 | -o "${1%%.bam*}_${outname}.counts"\ 101 | $sample 2>&1 | tee -a ../*log 102 | fi 103 | 104 | if [[ $layout == "PE" ]]; then 105 | featureCounts\ 106 | -F GTF\ 107 | -p\ 108 | -C\ 109 | -T 2\ 110 | -s $strand\ 111 | -a $bedfile\ 112 | -o "${1%%.bam*}_${outname}.counts"\ 113 | $sample 2>&1 | tee -a ../*log 114 | fi 115 | fi 116 | 117 | echo "DONE" 118 | -------------------------------------------------------------------------------- /RNA/featureCounts_v2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Perform featureCounts on aligned BAM files to assign transcript counts PER EXON based on the AtRTD2 reference transcript dataset (or quasi - for alternative splicing) 4 | # Additional info https://www.biostars.org/p/321379/ 5 | # https://www.bioconductor.org/packages/devel/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf 6 | # Zhang R et al (2017). A high quality Arabidopsis transcriptome for accurate transcript-level analysis of alternative splicing. Nucleic Acids Res. 45: 5061–5073. 7 | 8 | set -eu 9 | 10 | if [ "$#" -lt 4 ]; then 11 | echo "Missing arguments!" 
12 | echo "USAGE: RNAseq_featureCounts.sh " 13 | echo "EXAMPLE: RNAseq_featureCounts.sh col0-r1.bam PE 2 rtd2" 14 | echo "library strandedness: 0 = unstranded, 1 = stranded, 2 = reverse stranded" 15 | exit 1 16 | fi 17 | 18 | sample=$1 19 | layout=$2 20 | strand=$3 21 | ref=$4 22 | 23 | if [[ $ref == "rtd2" ]]; then 24 | bedfile="$HOME/ref_seqs/AtRTD2/AtRTD2_19April2016.gtf" 25 | outname="${sample%%.bam*}_RTD2.counts" 26 | elif [[ $ref == "padded" ]]; then 27 | bedfile="$HOME/ref_seqs/AtRTD2/AtRTDv2_QUASI_19April2016.gtf" 28 | outname="${sample%%.bam*}_quasi.counts" 29 | else 30 | echo " bad argument - pick 'rtd2' or 'padded' " 31 | exit 1 32 | fi 33 | 34 | echo "" 35 | echo "sample = $1" 36 | echo "layout = $2" 37 | echo "strand = $3 where 0 = unstranded, 1 = stranded, 2 = reverse stranded" 38 | echo "$bedfile" 39 | echo "" 40 | echo "Exon feature counting - $layout $strand featureCounts on $bedfile in $sample ..." 41 | echo "" 42 | 43 | if [[ $layout == "SE" ]]; then 44 | 45 | featureCounts -F GTF -C -T 4 -f -t exon -g gene_id -O -s $strand -a $bedfile -o $outname $sample 2>&1 | tee -a ../*log 46 | 47 | fi 48 | 49 | if [[ $layout == "PE" ]]; then 50 | 51 | featureCounts -F GTF -p -C -T 4 -f -t exon -g gene_id -O -s $strand -a $bedfile -o $outname $sample 2>&1 | tee -a ../*log 52 | 53 | fi 54 | 55 | # -C - Do not count read pairs matching different chromosomes 56 | # -f - Perform read counting at feature level (e.g. 
exons vs genes) 57 | # -t - Specify feature 58 | # -g - Specify attribute 59 | # -O - Keep reads assigned to multiple features 60 | # -p - paired-end reads, count fragments 61 | 62 | echo "DONE" 63 | -------------------------------------------------------------------------------- /RNA/featureCounts_v3-gtf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Perform featureCounts on aligned BAM files to assign counts across specified features in a GTF file 4 | 5 | set -eu 6 | 7 | if [ "$#" -lt 5 ]; then 8 | echo "Missing arguments!" 9 | echo "USAGE: RNAseq_featureCounts_v3-gtf.sh " 10 | echo "EXAMPLE: RNAseq_featureCounts_v3-gtf.sh col0-r1.bam PE 2 Arabidopsis.gtf gene" 11 | echo "library strandedness: 0 = unstranded, 1 = stranded, 2 = reverse stranded" 12 | exit 1 13 | fi 14 | 15 | sample=$1 16 | layout=$2 17 | strand=$3 18 | bedfile=$4 19 | feat=$5 20 | outname="${sample%%.bam*}_${feat}.gtf.counts" 21 | 22 | echo "" 23 | echo "sample = $1" 24 | echo "layout = $2" 25 | echo "strand = $3 where 0 = unstranded, 1 = stranded, 2 = reverse stranded" 26 | echo "Counts at $5 in $bedfile" 27 | echo "" 28 | 29 | if [[ $layout == "SE" ]]; then 30 | 31 | featureCounts -F GTF -C -T 4 -M -t $feat -g gene_id -O -s $strand -a $bedfile -o $outname $sample 32 | 33 | fi 34 | 35 | if [[ $layout == "PE" ]]; then 36 | 37 | featureCounts -F GTF -p -C -M -T 4 -t $feat -g gene_id -O -s $strand -a $bedfile -o $outname $sample 38 | 39 | fi 40 | 41 | # -C - Do not count read pairs matching different chromosomes 42 | # -t - Specify feature 43 | # -g - Specify attribute 44 | # -O - Keep reads assigned to multiple features 45 | # -p - paired-end reads, count fragments 46 | # -M - Multi-mapping reads will also be counted. 
47 | 48 | echo "DONE" 49 | -------------------------------------------------------------------------------- /RNA/get_peak_length.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -u 4 | 5 | ## get average peak length from macs2 peak calling output (bed file) 6 | 7 | if [ "$#" -lt 1 ]; then 8 | echo "Missing required arguments!" 9 | echo "USAGE: get_peak_length.sh " 10 | echo "EXAMPLE: get_peak_length.sh col0-r1.merged.bed" 11 | exit 1 12 | fi 13 | 14 | sample=$1 15 | 16 | awk '{ $10= $3 - $2} { sum += $10} END {print sum / NR}' $sample 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /RNA/graft-nad-seq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # Single-end read alignment for GRAFT-NAD-seq libraries with stringent STAR parameters 5 | 6 | # Prepare STAR index based on TAIR10 reference 7 | # wget ftp://ftp.ensemblgenomes.org/pub/release-47/plants/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz 8 | # samtools faidx Arabidopsis_thaliana.TAIR10.dna.toplevel.fa 9 | # cut -f1,2 Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.fai > Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len 10 | 11 | # Build STAR genome index 12 | # STAR --runThreadN 4 --runMode genomeGenerate --genomeSAindexNbases 12 --sjdbGTFfile Arabidopsis_thaliana.TAIR10.54.gtf --genomeDir /path/to/GenomeDir/ --genomeFastaFiles Arabidopsis_thaliana.TAIR10.dna.toplevel.fa 13 | 14 | ### CONDA environment is installed 15 | # conda create --name graft_nad 16 | # conda install -n graft_nad -c bioconda fastqc 17 | # conda install -n graft_nad -c bioconda cutadapt 18 | # conda install -n graft_nad -c bioconda star 19 | # conda install -n graft_nad -c bioconda seqkit 20 | # conda install -n graft_nad -c conda-forge r-tidyverse 21 | # conda install -n graft_nad conda-forge::parallel ## NOT IMPLEMENTED YET 
22 | 23 | if [ "$#" -lt 3 ] || [ "$#" -gt 3 ]; then 24 | echo "Missing required arguments!" 25 | echo "USAGE: graft-nad-seq.sh " 26 | echo "EXAMPLE: graft-nad-seq.sh sample.fastq ~/ref_seqs/STAR/TAIR10/GenomeDir sample_rep1" 27 | exit 1 28 | fi 29 | 30 | #gather input variables 31 | fq=$1 32 | index=$2; 33 | fileID=$3; 34 | dow=$(date +"%F-%H-%M") # run stamp: date-hour-minute (%M = minute; %m would repeat the month) 35 | 36 | echo "##################" 37 | echo "Performing single-end alignment with the following parameters:" 38 | echo "Input Files: $fq" 39 | echo "genome index: $index" 40 | echo "Output ID: $fileID" 41 | echo "Time of analysis: $dow" 42 | echo "##################" 43 | 44 | # make sample work directory 45 | mkdir ${fileID}_graft-nad_${dow} 46 | mv $fq ${fileID}_graft-nad_${dow} 47 | cd ${fileID}_graft-nad_${dow} 48 | 49 | echo "Extract branch sequence and trim adapters" 50 | mkdir 1_read_trimming 51 | mv $fq 1_read_trimming 52 | cd 1_read_trimming 53 | 54 | ## extract reads beginning with branch sequence (GCTTGTTGTG) with flexibility at first and last base 55 | if [[ $fq == *"fq.gz" ]]; 56 | then seqkit grep -j 12 -s -r -p "^.CTTGTTGT" $fq -o ${fq%%.fq*}_branch.fq.gz; 57 | else seqkit grep -j 12 -s -r -p "^.CTTGTTGT" $fq -o ${fq%%.fastq*}_branch.fq.gz; 58 | fi 59 | 60 | if [[ $fq == *"fq.gz" ]]; then fq_branch=${fq%%.fq*}_branch.fq.gz; else fq_branch=${fq%%.fastq*}_branch.fq.gz; fi 61 | 62 | echo "Trim universal PCR primer sequence from 3' end of read" 63 | ## remove universal PCR primer at the 3' end of reads 64 | cutadapt -a "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT" -e 0.2 -m 25 -o "${fq_branch%%.fq*}_3p_trimmed.fq.gz" ${fq_branch} 2>&1 | tee -a ../${fileID}_logs_${dow}.log 65 | 66 | ## old flags to trim 5' adapter sequences -- obsolete 67 | #cutadapt -g "^NCTTGTTGTB" -g "^NCTTGTTGTBB" -g "^NCTTGTTGTBBB" -g "^NCTTGTTGTBBBG" 68 | 69 | ## R script - trim reads first A at the 5' end of the read, retain read only if A within first 15 bp (10bp branch + flexibility for RT jumping) 70 | echo "Filter and 
trim reads based on A at 5' end" 71 | 72 | ## split fq file in 12 files for multi-threading 73 | zcat ${fq_branch%%.fq*}_3p_trimmed.fq.gz 2>&1 | seqkit split -p 12 | tee -a ../${fileID}_logs_${dow}.log 74 | 75 | ## run triming R script on split files in parallel - first numeric argument determines length in which A needs to occur (branch sequence = 10 nts) 76 | parallel -j 12 Rscript ~/scripts/RNA/trim_5p_graft_nad.r 15 {} ::: stdin.split/stdin.part_*.fastq 2>&1 | tee -a ../${fileID}_logs_${dow}.log 77 | 78 | ## concatenate output files 79 | cd stdin.split 80 | cat *processed.fq > ${fileID}_processed_output.fq 81 | pigz -p 4 ${fileID}_processed_output.fq 82 | mv ${fileID}_processed_output.fq.gz ../ 83 | cd ../ 84 | 85 | # clean up intermediates 86 | rm -r stdin.split 87 | 88 | ## qc filtered and trimmed reads 89 | fastqc -t 12 ${fileID}_processed_output.fq.gz 2>&1 | tee -a ../${fileID}_logs_${dow}.log 90 | 91 | cd ../ 92 | mkdir 0_fastq 93 | mv 1_read_trimming/$fq 0_fastq/ 94 | mv 1_read_trimming/$fq_branch 0_fastq/ 95 | 96 | # read alignment 97 | echo "Align filtered and trimmed reads" 98 | 99 | mkdir 2_align 100 | mv 1_read_trimming/${fileID}_processed_output.fq.gz 2_align/ 101 | cd 2_align 102 | 103 | STAR --runThreadN 12 --outFilterMismatchNmax 0 --outFilterMultimapNmax 1 --genomeDir $index --readFilesCommand gunzip -c --readFilesIn ${fileID}_processed_output.fq.gz --outFileNamePrefix "${fileID}_" --outSAMtype BAM SortedByCoordinate --limitBAMsortRAM 8000000000 2>&1 | tee -a ../${fileID}_logs_${dow}.log 104 | 105 | mv ${fileID}_processed_output.fq.gz ../1_read_trimming/ 106 | 107 | echo "Alignment complete" 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /RNA/nadBAM_to_ADPRC_sites.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -u 3 | 4 | # Compute NAD enrichment per site (start of read) by providing +ADPRC and -ADPRC samples as BAM aligned 
reads 5 | 6 | ### CONDA environment is installed 7 | # conda create --name ngs_plots 8 | # conda install -n ngs_plots -c bioconda bedtools 9 | # conda install -n ngs_plots r-fields 10 | # conda install -n ngs_plots -c r r-tidyverse 11 | # conda activate ngs_plots 12 | 13 | 14 | if [ "$#" -lt 5 ]; then 15 | echo "Missing arguments!" 16 | echo "USAGE: nadBAM_to_ADPRC_sites.sh PLUS_ADPRC_BAM MINUS_ADPRC_BAM SAMPLE_ID BEDFILE FEATURE_NAME" 17 | echo "EXAMPLE: nadBAM_to_ADPRC_sites.sh WT_plus-a_rep1.sorted.bam WT_minus-a_rep1.sorted.bam WT_rep1 tss.bed tss" 18 | exit 1 19 | fi 20 | 21 | smp_p=$1 22 | smp_m=$2 23 | out=$3 24 | bedfile=$4 # only referenced by the commented-out closestBed step 25 | feature=$5 # only referenced by the commented-out closestBed step 26 | 27 | echo "###############" 28 | echo "+ADPRC sample = $1" 29 | echo "-ADPRC sample= $2" 30 | echo "Sample ID = $3" 31 | echo "Bedfile = $4" 32 | echo "Feature name = $5" 33 | echo "#############" 34 | 35 | echo "calculate scaling factors" 36 | scl_p=$(bc <<< "scale=6;1000000/$(samtools view -F 260 -c "$smp_p")") # per-million factor; -F 260 skips unmapped (4) and secondary (256) alignments 37 | scl_m=$(bc <<< "scale=6;1000000/$(samtools view -F 260 -c "$smp_m")") 38 | 39 | echo "BAM to bed..." 40 | bedtools genomecov -bg -5 -scale $scl_p -ibam "$smp_p" > "${smp_p%%.bam}.5p.bed" 41 | bedtools genomecov -bg -5 -scale $scl_m -ibam "$smp_m" > "${smp_m%%.bam}.5p.bed" 42 | 43 | echo "Combine + and -ADPRC samples to calculate per-nt NAD%" 44 | bedtools unionbedg -names plus minus -i "${smp_p%%.bam}.5p.bed" "${smp_m%%.bam}.5p.bed" | awk 'BEGIN {FS=OFS="\t"} ($4 + $5) > 0 {prop = $4 / ($4 + $5); if (($4 > 3) && (prop > 0.9)) print $0, prop}' > "${out}.nad.5p.bed" # prop = plus / (plus + minus); rows with zero total are skipped instead of aborting awk on a division by zero 45 | 46 | #echo "closestBed..." 47 | #closestBed -D "a" -a ${out}.nad.5p.bed -b $bedfile > ${out}_${feature}_nad.5p.bed 48 | #awk -F$'\t' '$NF<51 && $NF>-51' ${out}_${feature}_nad.5p.bed > ${out}_${feature}_20bp_nad.5p.bed 49 | 50 | #echo "Maths ..." 
51 | #Rscript /home/dganguly/scripts/RNA/rel_expression_plots_nad.r ${out}_${feature}_20bp_nad.5p.bed 52 | 53 | echo 'cleaning' 54 | rm -v ${smp_p%%.bam}.5p.bed ${smp_m%%.bam}.5p.bed 55 | -------------------------------------------------------------------------------- /RNA/pare_pipe_v1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # Read alignment for PARE libraries with STAR 5 | # SE only, trim adapters and low quality basecalls (reduce length cutoff for trimming), map reads up to 20 bp length with 0 mismatches using STAR. 6 | 7 | # Retrieve TAIR10 reference and prepare STAR index 8 | # wget ftp://ftp.ensemblgenomes.org/pub/release-47/plants/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz 9 | # samtools faidx Arabidopsis_thaliana.TAIR10.dna.toplevel.fa 10 | # cut -f1,2 Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.fai > Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len 11 | 12 | # Build STAR genome index 13 | # STAR --runThreadN 4 --runMode genomeGenerate --genomeSAindexNbases 12 --sjdbGTFfile Arabidopsis_thaliana.TAIR10.54.gtf --genomeDir /path/to/GenomeDir/ --genomeFastaFiles Arabidopsis_thaliana.TAIR10.dna.toplevel.fa 14 | 15 | ### CONDA environment is installed (replace ENV_NAME with the environment to use) 16 | # conda create --name ENV_NAME 17 | # conda install -n ENV_NAME -c bioconda fastqc 18 | # conda install -n ENV_NAME -c bioconda star 19 | # conda install -n ENV_NAME -c bioconda bedtools 20 | # conda install -c bioconda fastx_toolkit 21 | 22 | if [ "$#" -lt 3 ]; then 23 | echo "Missing required arguments!" 
24 | echo "USAGE: pare_pipe_v1.sh FASTQ STAR_INDEX_DIR FILE_ID" 25 | echo "EXAMPLE: pare_pipe_v1.sh sample.fastq /home/dganguly/ref_seqs/STAR/TAIR10/GenomeDir sample_rep1" 26 | exit 1 27 | fi 28 | 29 | #gather input variables 30 | fq=$1 31 | index=$(realpath "$2"); #path to STAR index, made absolute because the script changes directory before STAR runs 32 | fileID=$3; 33 | dow=$(date +"%F-%H-%M") # %M = minute (the previous %m repeated the month) 34 | 35 | echo "##################" 36 | echo "Performing single-end alignment with the following parameters:" 37 | echo "Input Files: $fq" 38 | echo "genome index: $index" 39 | echo "Output ID: $fileID" 40 | echo "Time of analysis: $dow" 41 | echo "##################" 42 | 43 | # make sample work directory 44 | mkdir ${fileID}_pare_${dow} 45 | mv $fq ${fileID}_pare_${dow} 46 | cd ${fileID}_pare_${dow}; fq=$(basename "$fq") # the input now sits in the work directory, so drop any leading path 47 | 48 | # gzip if unzipped input file 49 | if [[ $fq != *.gz ]];then gzip $fq; fq="${fq}.gz"; fi 50 | 51 | # initial fastqc 52 | mkdir 1_fastqc 53 | fastqc -t 8 $fq 2>&1 | tee -a ${fileID}_logs_${dow}.log 54 | mv ${fq%%.fastq*}_fastqc* 1_fastqc 55 | 56 | echo "Read trimming... " 57 | # Trim_galore: remove adapters and low quality base-calls, retain reads as small as 10 bp, generate fastqc report on trimmed reads. 58 | mkdir 2_read_trimming 59 | cd 2_read_trimming 60 | trim_galore --length 10 --fastqc --fastqc_args "-t 8" ../$fq 2>&1 | tee -a ../${fileID}_logs_${dow}.log 61 | 62 | cd ../ 63 | mkdir 0_fastq 64 | mv $fq 0_fastq 65 | 66 | mkdir 3_align 67 | mv 2_read_trimming/${fq%%.fastq*}_trimmed.fq.gz -t 3_align/ 68 | cd 3_align 69 | echo "Beginning alignment ..." 
70 | 71 | # truncate reads to their first 20 nt; NOTE(review): fastx_trimmer -l only sets the last base to keep and no minimum-length filter is applied here, so reads shorter than 20 nt appear to pass through unchanged - confirm 72 | zcat ${fq%%.fastq*}_trimmed.fq.gz | fastx_trimmer -z -l 20 -o ${fq%%.fastq*}.20bp.trimmed.fq.gz 73 | input=${fq%%.fastq*}.20bp.trimmed.fq.gz # fq always ends in .gz at this point, so the old "%%.fastq" pattern never matched and left ".fastq.gz" inside the output name 74 | 75 | # align using STAR allowing 0 mismatches 76 | STAR --runThreadN 8 --outFilterMismatchNmax 0 --outFilterScoreMinOverLread 0.75 --outFilterMatchNminOverLread 0.75 --outFilterMultimapNmax 1 --genomeDir $index --readFilesCommand gunzip -c --readFilesIn $input --outFileNamePrefix "${fileID}_" --outSAMtype BAM SortedByCoordinate | tee -a ../${fileID}_logs_${dow}.log 77 | 78 | echo "cleaning..." 79 | 80 | outbam="${fileID}*.sortedByCoord.out.bam" 81 | samtools index $outbam 2>&1 | tee -a ../${fileID}_logs_${dow}.log 82 | mv *trimmed.fq.gz ../2_read_trimming/ 83 | 84 | echo "Alignment complete" 85 | 86 | 87 | -------------------------------------------------------------------------------- /RNA/rel_expression_plots.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # args[1] = filename 3 | # Run on output of BAM_to_bedgraph.sh or BAM_to_bedgraph_5p.sh 4 | # Summarise read depth across all features of interest by binned positions 5 | 6 | options(echo=TRUE) 7 | library(fields) 8 | args=commandArgs(trailingOnly=TRUE) 9 | print(args) 10 | 11 | # Read in file 12 | input=read.delim(args[1],header=FALSE) 13 | 14 | # Remove plastids and unmatched rows 15 | input=subset(input,input$V1!='ChrM' & input$V1!='ChrC' & input$V1 != 'Mt' & input$V1 != 'Pt') 16 | input=subset(input,input[,ncol(input)] != -1) 17 | 18 | # calculate normalized distance values for reads relative to feature 19 | rel.dist=matrix(ifelse(input$V11==0,ifelse(input[,10]=="-",((input[,7] - (input[,2]))/(input[,7] - input[,6]))*100,(((input[,2]) - input[,6])/(input[,7] - input[,6]))*100),ifelse(input$V11>0,input$V11 + 100,input$V11)),ncol=1) 20 | input=cbind(input,rel.dist) 21 | fixy=ifelse(input$rel.dist < 0 & 
input$V11==0,0,ifelse(input$rel.dist > 100 & input$V11==0, 100, input$rel.dist)) 22 | input$rel.dist=fixy 23 | 24 | # bin read depth by distance 25 | exp.bin=stats.bin(input$rel.dist,input$V4,N=100) 26 | p.bin=cbind(matrix(exp.bin$centers,ncol=1),exp.bin$stats["mean",]) 27 | out=cbind(p.bin) 28 | name <- sapply(strsplit(as.character(args[1]),'_'), function(l) l[1]) 29 | colnames(out)=c('pos',paste(name)) 30 | name2 <- sub('bed$', '', args[1]) # strip only the trailing "bed"; splitting on 'bed' cut the name at the first "bed" anywhere in the path 31 | write.table(out,paste(name2,'values.txt',sep=''),sep='\t', quote=FALSE, row.names=FALSE) 32 | -------------------------------------------------------------------------------- /RNA/rel_expression_plots_ejc.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # args[1] = filename; args[2] = numeric scaling factor multiplied with V4 to give rpm 3 | # Run on output of BAM_to_EJC.sh 4 | # Computes normalized 5'P end frequency upstream of exon-exon junction (> 49 nt length, see Lee et al 2020 The Plant Cell) 5 | 6 | options(echo=TRUE) 7 | library(fields) 8 | library(tidyverse) 9 | args=commandArgs(trailingOnly=TRUE) 10 | print(args) 11 | 12 | rpm_scale <- as.numeric(args[2]); if (is.na(rpm_scale)) stop("args[2] must be a numeric per-million scaling factor") # fail early instead of silently writing NA-only output 13 | 14 | # Read in file 15 | input <- read.delim(args[1],header=FALSE) %>% 16 | # Remove reads to plastid and mitochondria 17 | subset(V1 != 'ChrM' & V1 != 'ChrC' & V1 != 'Mt' & V1 != 'Pt') %>% 18 | mutate(length = V7 - V6) %>% 19 | # features at least 50 bp in length 20 | subset(length > 49) %>% 21 | # calculate position relative to 3' end of feature 22 | mutate(pos = ifelse(V10 == "+", V2-V7, V6-V3)) %>% 23 | mutate(rpm = V4 * rpm_scale) 24 | 25 | # sum all reads in 50 nt window upstream of 3' end 26 | exon_3p_sum <- subset(input, pos < 0 & pos > -51) %>% 27 | group_by(V8) %>% 28 | summarise(sum_rpm=sum(rpm)) 29 | 30 | # normalise depth per nt by sum of reads across 50 nt window, then keep features whose window sum exceeds 1 rpm; NOTE(review): this window includes pos == 0 while the sum above excludes it - confirm that is intended 31 | exon_3p <- subset(input, pos <= 0 & pos > -51) %>% 32 | mutate(sum_rpm = exon_3p_sum$sum_rpm[match(V8, exon_3p_sum$V8)]) %>% 
33 | mutate(norm_counts = rpm/sum_rpm) %>% 34 | subset(abs(sum_rpm) > 1) # rows whose feature has no window sum (NA) are dropped here as well 35 | 36 | # Get sum of normalized reads (i.e. normalized occurrence of 5'P ends, Pi) then calculate relative frequency per nt 37 | sum_exon_3p <- group_by(exon_3p, pos) %>% 38 | summarise(sum_norm_counts = sum(norm_counts), counts_raw = sum(V4), counts_rpm = sum(rpm)) %>% ## sum of normalized counts (Pi), raw counts, and scaled counts (rpm) per nt 39 | mutate(total_counts = sum(sum_norm_counts)) %>% ## sum of all normalized counts 40 | mutate(rel_freq = sum_norm_counts/total_counts) %>% ## freq of normalized counts per position relative to all normalized counts 41 | select(pos, sum_norm_counts, rel_freq) # take columns for Pi and relative frequency of Pi 42 | 43 | # name output file: everything before the first literal ".bed" (fixed=TRUE stops "." matching any character) 44 | name <- sapply(strsplit(as.character(args[1]),'.bed', fixed=TRUE), function(l) l[1]) 45 | 46 | ## diagnostic plot on single sample 47 | pdf(paste0(name,".3p.pdf")) 48 | plot(y=sum_exon_3p$rel_freq, x= sum_exon_3p$pos) 49 | dev.off() 50 | 51 | ## output 52 | write.table(sum_exon_3p, paste0(name,".ejc.txt"), sep='\t', quote=FALSE, row.names=FALSE) 53 | 54 | -------------------------------------------------------------------------------- /RNA/rel_expression_plots_nad.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Run on output of graft_nad_adprc_enrich.sh 3 | 4 | options(echo=TRUE) 5 | library(fields) 6 | library(tidyverse) 7 | args=commandArgs(trailingOnly=TRUE) 8 | print(args) 9 | 10 | # Read in file 11 | input <- read.delim(args[1],header=FALSE) %>% 12 | # Remove reads to plastid and mitochondria 13 | subset(V1 != 'ChrM' & V1 != 'ChrC' & V1 != 'Mt' & V1 != 'Pt') %>% 14 | # calculate position relative to 5' end of feature 15 | mutate(pos = ifelse(V12 == "+", V2-V8, V9-V3)) %>% 16 | subset(pos <= 50 & pos >= -50) %>% 17 | subset(V4 >= 1) ## at least 1 RPM in ADPRC sample 18 | 19 | # summarise prop NAD+ 20 | df1 <- group_by(input, pos) %>% 21 | summarise(mean_prop = mean(V6)) 
# mean of V6 per position; presumably V6 is the proportion column appended upstream as plus / (plus + minus) - confirm against the input table 22 | 23 | # name output file: everything before the first literal ".bed" (fixed=TRUE stops "." matching any character) 24 | name <- sapply(strsplit(as.character(args[1]),'.bed', fixed=TRUE), function(l) l[1]) 25 | 26 | ## diagnostic plot on single sample 27 | pdf(paste0(name,".pdf")) 28 | plot(y=df1$mean_prop, x= df1$pos) 29 | dev.off() 30 | 31 | ## output; NOTE(review): the ".ejc.txt" suffix looks copied from rel_expression_plots_ejc.r - kept unchanged because downstream steps may expect it 32 | write.table(df1, paste0(name,".ejc.txt"), sep='\t', quote=FALSE, row.names=FALSE) 33 | 34 | -------------------------------------------------------------------------------- /RNA/rel_expression_plots_stop.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # args[1] = filename; args[2] = numeric scaling factor multiplied with V4 to give rpm 3 | # Run on output of BAM_to_STOP.sh 4 | # computes 5'P end depth relative to first nucleotide of STOP or START codon 5 | 6 | options(echo=TRUE) 7 | library(fields) 8 | library(tidyverse) 9 | args=commandArgs(trailingOnly=TRUE) 10 | print(args) 11 | 12 | rpm_scale <- as.numeric(args[2]); if (is.na(rpm_scale)) stop("args[2] must be a numeric per-million scaling factor") # fail early instead of silently writing NA-only output 13 | 14 | # Read in file 15 | input <- read.delim(args[1],header=FALSE) %>% 16 | # Remove reads to plastid and mitochondria 17 | subset(V1 != 'ChrM' & V1 != 'ChrC' & V1 != 'Mt' & V1 != 'Pt') %>% 18 | # calculate position relative to first base of STOP or START codon 19 | mutate(pos = ifelse(V10 == "+", V2-V6, V7-V3)) %>% 20 | mutate(rpm = V4 * rpm_scale) 21 | 22 | # sum all reads (scaled and raw) per feature (V8) across every row present in the input 23 | stop_5p_sum <- group_by(input, V8) %>% 24 | summarise(sum_rpm=sum(rpm), sum_reads=sum(V4)) 25 | 26 | # normalise depth per nt by the per-feature sum and keep features whose sum exceeds 1 rpm 27 | stop_5p <- mutate(input, sum_rpm = stop_5p_sum$sum_rpm[match(V8, stop_5p_sum$V8)]) %>% 28 | mutate(norm_counts = rpm/sum_rpm) %>% 29 | subset(sum_rpm > 1) 30 | 31 | # Get sum of normalized reads (i.e.normalized occurrence of 5'P ends, Pi) then calculate relative frequency per nt 32 | sum_stop_5p <- group_by(stop_5p, pos) %>% 33 | summarise(sum_norm_counts = sum(norm_counts), counts_raw = sum(V4), counts_rpm = sum(rpm)) %>% ## sum of normalized counts (Pi), raw counts, 
and scaled counts (rpm) per nt 34 | mutate(total_counts = sum(sum_norm_counts)) %>% ## sum of all normalized counts 35 | mutate(rel_freq = sum_norm_counts/total_counts) %>% ## freq of normalized counts per position relative to all normalized counts 36 | select(pos, sum_norm_counts, rel_freq) 37 | 38 | # name output file: everything before the first literal ".bed" (fixed=TRUE stops "." matching any character) 39 | name <- sapply(strsplit(as.character(args[1]),'.bed', fixed=TRUE), function(l) l[1]) 40 | 41 | ## diagnostic plot on single sample 42 | pdf(paste0(name,".stop.pdf")) 43 | plot(y=sum_stop_5p$rel_freq, x= sum_stop_5p$pos) 44 | dev.off() 45 | 46 | ## output 47 | write.table(sum_stop_5p, paste0(name,".stop.txt"), sep='\t', quote=FALSE, row.names=FALSE) 48 | 49 | -------------------------------------------------------------------------------- /RNA/smrna_pipe_v1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # Read alignment for smRNA-seq libraries with STAR 5 | # SE only, trim adapters and low quality bases in trim_galore small RNA mode with --max_length 30. Map with STAR with 0 mismatches, min mapped length 18 and at most 4 alignments per read. 
6 | 7 | # Retrieve TAIR10 reference and prepare STAR index 8 | # wget ftp://ftp.ensemblgenomes.org/pub/release-47/plants/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz 9 | # samtools faidx Arabidopsis_thaliana.TAIR10.dna.toplevel.fa 10 | # cut -f1,2 Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.fai > Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len 11 | 12 | # Build STAR genome index 13 | # STAR --runThreadN 4 --runMode genomeGenerate --genomeSAindexNbases 12 --sjdbGTFfile Arabidopsis_thaliana.TAIR10.54.gtf --genomeDir /path/to/GenomeDir/ --genomeFastaFiles Arabidopsis_thaliana.TAIR10.dna.toplevel.fa 14 | 15 | ### CONDA environment is installed (replace ENV_NAME with the environment to use) 16 | # conda create --name ENV_NAME 17 | # conda install -n ENV_NAME -c bioconda fastqc 18 | # conda install -n ENV_NAME -c bioconda star 19 | # conda install -n ENV_NAME -c bioconda bedtools 20 | # conda install -c bioconda fastx_toolkit 21 | 22 | if [ "$#" -lt 3 ]; then 23 | echo "Missing required arguments!" 24 | echo "USAGE: smrna_pipe_v1.sh FASTQ STAR_INDEX_DIR FILE_ID" 25 | echo "EXAMPLE: smrna_pipe_v1.sh sample.fastq /home/dganguly/ref_seqs/STAR/TAIR10/GenomeDir sample_rep1" 26 | exit 1 27 | fi 28 | 29 | #gather input variables 30 | fq=$1 31 | index=$(realpath "$2"); #path to STAR index, made absolute because the script changes directory before STAR runs 32 | fileID=$3; 33 | dow=$(date +"%F-%H-%M-%S") # %M = minute (the previous %m repeated the month) 34 | 35 | echo "##################" 36 | echo "Performing single-end alignment with the following parameters:" 37 | echo "Input Files: $fq" 38 | echo "genome index: $index" 39 | echo "Output ID: $fileID" 40 | echo "Time of analysis: $dow" 41 | echo "##################" 42 | 43 | # make sample work directory 44 | mkdir ${fileID}_srna_${dow} 45 | mv $fq ${fileID}_srna_${dow} 46 | cd ${fileID}_srna_${dow}; fq=$(basename "$fq") # the input now sits in the work directory, so drop any leading path 47 | 48 | # gzip if unzipped input file 49 | if [[ $fq != *.gz ]];then gzip $fq; fq="${fq}.gz"; fi; fqbase=${fq%.gz}; fqbase=${fqbase%.fastq}; fqbase=${fqbase%.fq} # sample name without .gz and without .fastq/.fq, shared by the fastqc and trim_galore output names 50 | 51 | # initial fastqc 52 | mkdir 1_fastqc 53 | fastqc -t 8 $fq 2>&1 | tee -a ${fileID}_logs_${dow}.log 54 | mv ${fqbase}_fastqc* 1_fastqc 55 | 56 | echo "Read trimming... 
" 57 | # Trim_galore: remove adapters and low quality base-calls in small RNA mode, discarding reads longer than 30 nt (--max_length 30); the minimum length is whatever --small_rna applies by default. Generate fastqc report on trimmed reads. 58 | mkdir 2_read_trimming 59 | cd 2_read_trimming 60 | trim_galore --small_rna --max_length 30 --fastqc --fastqc_args "-t 8" ../$fq 2>&1 | tee -a ../${fileID}_logs_${dow}.log 61 | 62 | cd ../ 63 | mkdir 0_fastq 64 | mv $fq 0_fastq 65 | 66 | ## prep folder for STAR alignment 67 | mkdir 3_align 68 | mv 2_read_trimming/${fqbase}_trimmed.fq.gz -t 3_align/ 69 | cd 3_align 70 | echo "Beginning alignment ..." 71 | 72 | ## define input file 73 | if [[ $fq == *"fq.gz" ]]; then input=${fq%%.fq*}_trimmed.fq*; else input=${fq%%.fastq*}_trimmed.fq*; fi 74 | 75 | # STAR alignment: 0 mismatches, min mapped length 18 nt, no more than 4 alignments 76 | STAR --runThreadN 8 --outFilterMismatchNmax 0 --outFilterMatchNmin 18 --outFilterMultimapNmax 4 --genomeDir $index --readFilesCommand gunzip -c --readFilesIn $input --outFileNamePrefix "${fileID}_" --outSAMtype BAM SortedByCoordinate | tee -a ../${fileID}_logs_${dow}.log 77 | 78 | echo "cleaning..." 
79 | 80 | outbam="${fileID}*.sortedByCoord.out.bam" 81 | samtools index $outbam 2>&1 | tee -a ../${fileID}_logs_${dow}.log 82 | mv *trimmed.fq.gz ../2_read_trimming/ 83 | 84 | echo "Alignment complete" 85 | 86 | 87 | -------------------------------------------------------------------------------- /RNA/split_file.R: -------------------------------------------------------------------------------- 1 | #Given two pairs of lists of samples, split [1] in two files with the samples indicated in [2] and [3] 2 | #[1] First argument: input file that we want to split 3 | #[2] Second argument: list of samples of the first condition 4 | #[3] Third argument: list of samples of the second condition 5 | #[4] Fourth argument: output file of the first condition 6 | #[5] Fifth argument: output file of the second condition 7 | 8 | # Parse command line arguments 9 | print("Parsing samples...") 10 | CHARACTER_command_args <- commandArgs(trailingOnly=TRUE) 11 | 12 | #Load the input file 13 | print(paste0("Loading ",CHARACTER_command_args[1],"...")) 14 | input_file <- read.table(CHARACTER_command_args[1],header=TRUE) 15 | 16 | #Load the list of samples of the first condition 17 | first_condition <- unlist(strsplit(CHARACTER_command_args[2],",")) 18 | 19 | #Take the samples of first condition and generate a file with just these columns 20 | first_output <- input_file[first_condition] 21 | 22 | #Load the list of samples of the second condition 23 | second_condition <- unlist(strsplit(CHARACTER_command_args[3],",")) 24 | 25 | #Take the samples of second condition and generate a file with just these columns 26 | second_output <- input_file[second_condition] 27 | 28 | #Save the output files next to the input file 29 | # dirname() returns "." for a bare file name; the previous split/paste on "/" produced "" there, so outputs were written to the filesystem root 30 | string2 <- dirname(CHARACTER_command_args[1]) 31 | path1 <- paste0(string2,"/",CHARACTER_command_args[4]) 32 | write.table(first_output,file=path1,quote=FALSE,sep="\t") 33 | print(paste0("Saved ",path1)) 34 | 35 | path2 <- 
paste0(string2,"/",CHARACTER_command_args[5]) 36 | write.table(second_output,file=path2,quote=FALSE,sep="\t") 37 | print(paste0("Saved ",path2)) 38 | 39 | 40 | -------------------------------------------------------------------------------- /RNA/stringtie_extract_tpm.r: -------------------------------------------------------------------------------- 1 | ## extract TPM values for transcripts assembled by stringtie 2 | ## input is one or more stringtie GTF output files; each writes a *_stringtie.tpm table (target_id, length, tpm) 3 | options(echo = FALSE) 4 | library(tidyverse) 5 | args=commandArgs(trailingOnly=TRUE) 6 | 7 | for(i in args){ 8 | 9 | a <- read.delim(i, header=FALSE, skip=2) %>% # skip=2 assumes exactly two header lines at the top of the GTF 10 | subset(V3 == "transcript") %>% 11 | mutate(target_id = sapply(strsplit(as.character(V9), ';'), function(l) l[2])) %>% # assumes transcript_id is the second attribute in column 9 12 | mutate(target_id = sapply(strsplit(target_id, "transcript_id "), function(l) l[2])) %>% 13 | mutate(length = V5 - V4 + 1) %>% # GTF coordinates are 1-based and inclusive, so the span is end - start + 1; this is the genomic span of the transcript record 14 | mutate(tpm = sapply(strsplit(as.character(V9), 'TPM '), function(l) l[2])) %>% 15 | mutate(tpm = as.numeric(sapply(strsplit(tpm, ';'), function(l) l[1]))) %>% 16 | select(target_id, length, tpm) 17 | 18 | outfile <- sapply(strsplit(i, "_stringtie"), function(l) l[1]) 19 | write.table(a, paste0(outfile,"_stringtie.tpm"), sep='\t', quote=FALSE, col.names=TRUE, row.names=FALSE) 20 | } 21 | 22 | -------------------------------------------------------------------------------- /RNA/stringtie_pipe_v1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # StringTie2 v1: assemble transcripts using aligned reads (BAM output), merge assemblies, and quantify transcript abundance (-eB) for each sample. 5 | # software required: samtools, stringtie, gffcompare. 6 | 7 | ## e.g. generate list of sample names 8 | ## dir *bam > files.txt 9 | 10 | if [ "$#" -lt 3 ]; then 11 | echo "Missing required arguments!" 
12 | echo "USAGE: stringtie_pipe_v1.sh BAM_LIST_FILE LIBRARY_TYPE(un|fr|rf) REFERENCE_ANNOTATION" 13 | echo "EXAMPLE: stringtie_pipe_v1.sh files.txt rf Arabidopsis_thaliana.TAIR10.46.gff3" 14 | exit 1 15 | fi 16 | 17 | ## gather input variables 18 | smpls=$(cat $1) 19 | type=$2; case "$type" in un|fr|rf) ;; *) echo "Library type must be one of: un, fr, rf (got '$type')"; exit 1 ;; esac # any other value would skip every assembly branch below 20 | ref=$3 # pass "" to run without a reference annotation 21 | dow=$(date +"%F-%H-%M") # %M = minute (the previous %m repeated the month) 22 | 23 | echo "" 24 | echo "Organising BAM files" 25 | echo "" 26 | 27 | ## ensure all BAM files are sorted (files not named *sorted.bam are sorted into a new *.sorted.bam) 28 | new_smpls="" 29 | 30 | for i in $smpls; 31 | do 32 | if [[ $i != *sorted.bam ]]; then 33 | samtools sort -@ 4 $i -o "${i%%.bam}.sorted.bam" 34 | fq="${i%%.bam}.sorted.bam" 35 | new_smpls="$new_smpls $fq"; 36 | else new_smpls="$new_smpls $i"; 37 | fi; 38 | done 39 | 40 | echo "" 41 | echo "Transcript assembly per sample" 42 | echo "" 43 | 44 | ## Use each sample to produce transcript assembly 45 | 46 | if [ "$type" == "un" ] && [ -z "$ref" ]; then 47 | 48 | echo "" 49 | echo "Unstranded library with no reference annotation" 50 | echo "" 51 | 52 | for i in $new_smpls; do 53 | stringtie --conservative $i -p 4 -g 10 -o "${i%%.sorted.bam}_stringtie.out"; 54 | done 55 | fi 56 | 57 | if [ "$type" == "un" ] && [ ! -z "$ref" ]; then 58 | 59 | echo "" 60 | echo "Unstranded library with annotation $ref" 61 | echo "" 62 | 63 | for i in $new_smpls; do 64 | stringtie --conservative $i -p 4 -g 10 -G $ref -o "${i%%.sorted.bam}_stringtie.out"; 65 | done 66 | fi 67 | 68 | if [ "$type" == "fr" ] && [ -z "$ref" ]; then 69 | 70 | echo "" 71 | echo "Forward stranded without reference annotation" 72 | echo "" 73 | 74 | for i in $new_smpls; do 75 | stringtie --conservative -p 4 -g 10 $i --fr -o "${i%%.sorted.bam}_stringtie.out"; 76 | done 77 | fi 78 | 79 | if [ "$type" == "fr" ] && [ ! 
-z "$ref" ]; then 80 | 81 | echo "" 82 | echo "Forward stranded with reference $ref" 83 | echo "" 84 | 85 | for i in $new_smpls; do 86 | stringtie --conservative -p 4 -g 10 $i --fr -G $ref -o "${i%%.sorted.bam}_stringtie.out"; 87 | done 88 | fi 89 | 90 | if [ "$type" == "rf" ] && [ -z "$ref" ]; then 91 | 92 | echo "" 93 | echo "Reverse stranded without reference annotation" 94 | echo "" 95 | 96 | for i in $new_smpls; do 97 | stringtie --conservative -p 4 -g 10 $i --rf -o "${i%%.sorted.bam}_stringtie.out"; 98 | done 99 | fi 100 | 101 | if [ "$type" == "rf" ] && [ ! -z "$ref" ]; then 102 | 103 | echo "" 104 | echo "Reverse stranded with reference $ref" 105 | echo "" 106 | 107 | for i in $new_smpls; do 108 | stringtie --conservative -p 4 -g 10 $i --rf -G $ref -o "${i%%.sorted.bam}_stringtie.out"; 109 | done 110 | fi 111 | 112 | ## Merge assemblies (the -f 0.05 / -T 1 filters are only applied when a reference is given) 113 | echo "" 114 | echo "Merging assemblies" 115 | echo "" 116 | 117 | strng="*_stringtie.out" 118 | 119 | if [ ! -z "$ref" ]; then 120 | stringtie --merge $strng -G $ref -f 0.05 -T 1 -o "merged_stringtie_out.gtf"; 121 | else stringtie --merge $strng -o "merged_stringtie_out.gtf"; 122 | fi 123 | 124 | ## clean-up 125 | rm *_stringtie.out 126 | 127 | ## Determine abundance of assembled transcript in each sample 128 | 129 | echo "" 130 | echo "Abundance estimation" 131 | echo "" 132 | 133 | if [ "$type" == "un" ]; then 134 | 135 | for i in $new_smpls; do 136 | stringtie $i -eB -G merged_stringtie_out.gtf -o "${i%%.sorted.bam}_stringtie_out.gtf"; 137 | mv t_data.ctab ${i%%.sorted.bam}_tdata.ctab 138 | done 139 | fi 140 | 141 | if [ "$type" == "fr" ]; then 142 | 143 | for i in $new_smpls; do 144 | stringtie $i -eB --fr -G merged_stringtie_out.gtf -o "${i%%.sorted.bam}_stringtie_out.gtf"; 145 | mv t_data.ctab ${i%%.sorted.bam}_tdata.ctab 146 | done 147 | fi 148 | 149 | if [ "$type" == "rf" ]; then 150 | 151 | for i in $new_smpls; do 152 | stringtie $i -eB --rf -G merged_stringtie_out.gtf -o 
"${i%%.sorted.bam}_stringtie_out.gtf"; 153 | mv t_data.ctab ${i%%.sorted.bam}_tdata.ctab 154 | done 155 | fi 156 | 157 | echo "cleaning ..." 158 | 159 | for i in $smpls; 160 | do 161 | if [[ $i != *sorted.bam ]]; then 162 | rm "${i%%.bam}.sorted.bam" 163 | fi; 164 | done 165 | 166 | ## coverage estimates 167 | mkdir abundance_estimates 168 | mv *tdata.ctab -t abundance_estimates/ 169 | rm *ctab 170 | 171 | ## export TPM 172 | 173 | for i in $new_smpls; do 174 | Rscript $HOME/scripts/RNA/stringtie_extract_tpm.r "${i%%.sorted.bam}_stringtie_out.gtf" 175 | mv *tpm -t abundance_estimates/ 176 | done 177 | 178 | if [ ! -z "$ref" ]; then 179 | 180 | echo "" 181 | echo "compare with $ref" 182 | echo "" 183 | 184 | gffcompare -R -r $ref -o strtcmp merged_stringtie_out.gtf 185 | 186 | mkdir gffcompare_results 187 | mv strtcmp* -t gffcompare_results/ 188 | 189 | fi 190 | 191 | echo "Complete!" 192 | ############################## 193 | 194 | 195 | -------------------------------------------------------------------------------- /RNA/stringtie_pipe_v2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | # StringTie2 v2: use stringtie to compute transcript abundance for pre-assembled transcripts and export TPM. 5 | ## e.g. generate list of sample names 6 | ## dir *bam > files.txt 7 | 8 | if [ "$#" -lt 3 ]; then 9 | echo "Missing required arguments!" 
10 | echo "USAGE: stringtie_pipe_v2.sh BAM_LIST_FILE LIBRARY_TYPE(un|fr|rf) REFERENCE_GTF" 11 | echo "EXAMPLE: stringtie_pipe_v2.sh files.txt rf merged_stringtie_out.gtf" 12 | exit 1 13 | fi 14 | 15 | ## gather input variables 16 | smpls=$(cat $1) 17 | type=$2; case "$type" in un|fr|rf) ;; *) echo "Library type must be one of: un, fr, rf (got '$type')"; exit 1 ;; esac # any other value would skip every quantification branch below 18 | ref=$3 19 | dow=$(date +"%F-%H-%M") # %M = minute (the previous %m repeated the month) 20 | 21 | echo "" 22 | echo "Organising BAM files" 23 | echo "" 24 | 25 | ## ensure all BAM files are sorted (files not named *sorted.bam are sorted into a new *.sorted.bam) 26 | new_smpls="" 27 | 28 | for i in $smpls; 29 | do 30 | if [[ $i != *sorted.bam ]]; then 31 | samtools sort -@ 6 $i -o "${i%%.bam}.sorted.bam" 32 | fq="${i%%.bam}.sorted.bam" 33 | new_smpls="$new_smpls $fq"; 34 | else new_smpls="$new_smpls $i"; 35 | fi; 36 | done 37 | 38 | ## Transcript abundance 39 | 40 | echo "" 41 | echo "Abundance estimation (TPM)" 42 | echo "" 43 | 44 | if [ "$type" == "un" ]; then 45 | 46 | for i in $new_smpls; do 47 | echo $i $type 48 | stringtie $i -e -G $ref -o "${i%%.sorted.bam}_stringtie.gtf"; 49 | done 50 | fi 51 | 52 | if [ "$type" == "fr" ]; then 53 | 54 | for i in $new_smpls; do # was "for i in $i", which only re-processed the last unsorted name left over from the sorting loop 55 | echo $i $type 56 | stringtie $i -e --fr -G $ref -o "${i%%.sorted.bam}_stringtie.gtf"; 57 | done 58 | fi 59 | 60 | if [ "$type" == "rf" ]; then 61 | 62 | for i in $new_smpls; do # was "for i in $i", see the fr branch 63 | echo $i $type 64 | stringtie $i -e --rf -G $ref -o "${i%%.sorted.bam}_stringtie.gtf"; 65 | done 66 | fi 67 | 68 | echo "cleaning ..." 69 | 70 | for i in $smpls; 71 | do 72 | if [[ $i != *sorted.bam ]]; then 73 | rm "${i%%.bam}.sorted.bam" 74 | fi; 75 | done 76 | 77 | ## export TPM 78 | 79 | for i in $new_smpls; do 80 | Rscript $HOME/scripts/RNA/stringtie_extract_tpm.r "${i%%.sorted.bam}_stringtie.gtf" 81 | done 82 | 83 | mkdir tpm_abundance 84 | mv *_stringtie.tpm tpm_abundance 85 | 86 | echo "Complete!" 
87 | ############################## 88 | 89 | -------------------------------------------------------------------------------- /RNA/total_expression.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # args[1] = filename 3 | # Run on output of BAM_to_bedgraph.sh or BAM_to_bedgraph_5p.sh 4 | # calculate total read depth across features of interest 5 | 6 | options(echo=TRUE) 7 | library(fields) 8 | library(tidyverse) 9 | args=commandArgs(trailingOnly=TRUE) 10 | print(args) 11 | 12 | # Read in file 13 | input <- read.delim(args[1],header=FALSE) 14 | 15 | input <- subset(input,input[,ncol(input)] != -1) # drop rows whose last column is -1 16 | 17 | # remove organellar chromosomes and calculate length of feature 18 | input <- subset(input, V1 !='ChrM' & V1!='ChrC' & V1 != 'Mt' & V1 != 'Pt') %>% 19 | mutate(length = V7 - V6) # the piped table is already mutate()'s data argument; passing `input` again handed dplyr the unfiltered table as an extra column source 20 | 21 | # Determine total read depth per feature (V8) and feature length 22 | out <- group_by(input, V8, length) %>% 23 | summarise(read_depth = sum(V4)) 24 | 25 | name <- sub('bed$', '', args[1]) # strip only the trailing "bed"; splitting on 'bed' cut the name at the first "bed" anywhere in the path 26 | write.table(out, paste(name,'feature_depth.txt',sep=''), sep='\t', quote=FALSE, row.names=FALSE) 27 | 28 | -------------------------------------------------------------------------------- /RNA/trim_5p_graft_nad.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Filter reads from GRAFT-NAD-seq libraries based on presence of A within a defined distance 3 | # Trim filtered reads to the first A at the 5' end of reads 4 | # args[1] = number of leading bases in which an A must occur; args[2] = uncompressed FASTQ file 5 | library(tidyverse) 6 | args=commandArgs(trailingOnly=TRUE) 7 | d <- as.numeric(paste(args[1])) 8 | fastq_data <- readLines(args[2]) 9 | 10 | # Check that the file has a proper number of lines 11 | if (length(fastq_data) %% 4 != 0) {stop("The FASTQ file is not properly formatted.")} 12 | 13 | # Group lines into a matrix with one row per read and four columns (header, sequence, separator, quality) 14 | reads_matrix <- 
matrix(fastq_data, ncol = 4, byrow = TRUE) 15 | 16 | processed_reads <- apply(reads_matrix, 1, function(l) { 17 | head <- l[1] 18 | seq <- l[2] 19 | opt <- l[3] 20 | qual <- l[4] 21 | 22 | # get sequences that contain 'A' within the first #d bases 23 | if (grepl("A", substr(seq, 1, d))){ 24 | 25 | # Find the position of the first 'A' 26 | pos <- regexpr("A", seq)[1] 27 | 28 | # replace original sequence with trimmed sequence 29 | if (pos > 0) { 30 | 31 | # Trim sequence to position of first 'A' 32 | trimmed_sequence <- substr(seq, pos, nchar(seq)) 33 | trimmed_quality <- substr(qual, pos, nchar(qual)) 34 | } 35 | 36 | return(c(head,trimmed_sequence,opt,trimmed_quality))} 37 | else {return(NULL) } 38 | } 39 | ) 40 | 41 | processed_reads <- as.character(unlist(processed_reads)) # as.character() turns the NULL produced when no read passes into character(0), so an empty output file is written instead of writeLines() failing 42 | 43 | # Write the filtered reads to a new FASTQ file; fixed=TRUE matches the literal ".fastq" rather than any character followed by "fastq" 44 | name <- sapply(strsplit(args[2], ".fastq", fixed=TRUE), function(l) l[1]) 45 | writeLines(processed_reads, paste0(name,"_processed.fq")) 46 | cat(paste0("Filtered and trimmed reads saved to '", name,"_processed.fq'.\n")) 47 | 48 | -------------------------------------------------------------------------------- /RNA/trim_fastq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## hard trim FASTQs sequences e.g. for PARE-seq libraries 4 | ## https://wikis.utexas.edu/display/CoreNGSTools/Pre-processing+raw+sequences#Preprocessingrawsequences-FASTXToolkit 5 | 6 | ### CONDA environment is installed (replace ENV_NAME with the environment to use) 7 | # conda create --name ENV_NAME 8 | # conda install -c bioconda fastx_toolkit 9 | 10 | 11 | if [ "$#" -lt 2 ]; then 12 | echo "Missing required arguments!" 
13 | echo "USAGE: trim_fastq.sh " 14 | echo "EXAMPLE: trim_fastq.sh sample.fastq.gz 15" 15 | exit 1 16 | fi 17 | 18 | fq=$1 19 | NSEQS=$2 20 | 21 | if [[ $fq != *.gz ]];then fastx_trimmer -z -l $NSEQS -i $fq -o ${fq%%.fastq}.${NSEQS}bp.fastq.gz; fi 22 | 23 | if [[ $fq == *.gz ]];then zcat $fq | fastx_trimmer -z -l $NSEQS -o ${fq%%.fastq.gz}.${NSEQS}bp.fastq.gz; fi 24 | 25 | 26 | -------------------------------------------------------------------------------- /TruSeq-adapters.fa: -------------------------------------------------------------------------------- 1 | >D701 2 | ATTACTCG 3 | >D702 4 | TCCGGAGA 5 | >D703 6 | CGCTCATT 7 | >D704 8 | GAGATTCC 9 | >D705 10 | ATTCAGAA 11 | >D706 12 | GAATTCGT 13 | >D707 14 | CTGAAGCT 15 | >D708 16 | TAATGCGC 17 | >D709 18 | CGGCTATG 19 | >D710 20 | TCCGCGAA 21 | >D711 22 | TCTCGCGC 23 | >D712 24 | AGCGATAG 25 | >D501 26 | TATAGCCT 27 | >D502 28 | ATAGAGGC 29 | >D503 30 | CCTATCCT 31 | >D504 32 | GGCTCTGA 33 | >D505 34 | AGGCGAAG 35 | >D506 36 | TAATCTTA 37 | >D507 38 | CAGGACGT 39 | >D508 40 | GTACTGA 41 | >multiplexing-forward 42 | GATCGGAAGAGCACACGTCT 43 | >solexa-forward 44 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT 45 | >truseq-forward-contam 46 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC 47 | >truseq-reverse-contam 48 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA 49 | >nextera-forward-read-contam 50 | CTGTCTCTTATACACATCTCCGAGCCCACGAGAC 51 | >nextera-reverse-read-contam 52 | CTGTCTCTTATACACATCTGACGCTGCCGACGA 53 | >solexa-reverse 54 | AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG 55 | >truseq-sRNA 56 | TGGAATTCTCC 57 | >truseq-universal 58 | AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 59 | >truseq-index1 60 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG 61 | >truseq-index2 62 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG 63 | >truseq-index3 64 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG 65 | >truseq-index4 66 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG 67 | 
>truseq-index5 68 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG 69 | >truseq-index6 70 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG 71 | >truseq-index7 72 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG 73 | >truseq-index8 74 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG 75 | >truseq-index9 76 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG 77 | >truseq-index10 78 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG 79 | >truseq-index11 80 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG 81 | >truseq-index12 82 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG 83 | >truseq-index13 84 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACAATCTCGTATGCCGTCTTCTGCTTG 85 | >truseq-index14 86 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTATCTCGTATGCCGTCTTCTGCTTG 87 | >truseq-index15 88 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGAATCTCGTATGCCGTCTTCTGCTTG 89 | >truseq-index16 90 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCGATCTCGTATGCCGTCTTCTGCTTG 91 | >truseq-index18 92 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCACATCTCGTATGCCGTCTTCTGCTTG 93 | >truseq-index19 94 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACGATCTCGTATGCCGTCTTCTGCTTG 95 | >truseq-index20 96 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTATCTCGTATGCCGTCTTCTGCTTG 97 | >truseq-index21 98 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGAATCTCGTATGCCGTCTTCTGCTTG 99 | >truseq-index22 100 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTAATCTCGTATGCCGTCTTCTGCTTG 101 | >truseq-index23 102 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTGGATATCTCGTATGCCGTCTTCTGCTTG 103 | >truseq-index25 104 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATATCTCGTATGCCGTCTTCTGCTTG 105 | >truseq-index27 106 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTATCTCGTATGCCGTCTTCTGCTTG 107 | -------------------------------------------------------------------------------- /VennPieces.R: 
-------------------------------------------------------------------------------- 1 | VennPieces <- function(test.results, comparison1, comparison2, comparison3, out.dir){ 2 | #requires an object called results to exist outside this function (it is not an argument). It has 1, 0 or -1 for differentially 3 | #expressed up, not, or down for each gene, with one column per comparison in the order comparison1-3. test.results is a dataframe with the actual 4 | #data you want to output, such as the df with the fold changes and gene names 5 | #in it; its rows are picked with masks built from results, so both need the same row order. comparison1-3 are the names used in the output file names for each part of the venn. out.dir is 6 | #the directory to print the results to. 7 | 8 | #prints the venn diagram to file too (vennDiagram is not defined here; presumably limma's - TODO confirm it is loaded before calling) 9 | pdf(paste0(out.dir,"/Venn.pdf")) 10 | vennDiagram(results, 11 | include=c("up","down"), 12 | counts.col=c('red', 'blue'), 13 | show.include=TRUE, cex = c(0.75, 0.75, 0.75)) 14 | dev.off() 15 | #up: one csv per venn region, genes with 1 in exactly the named comparison(s) 16 | c1 <- test.results[results[,1]==1 & results[,2]!=1 & results[,3]!=1,] 17 | write.csv(c1, paste0(out.dir,"/",comparison1,"_up.csv")) 18 | c1.c2 <- test.results[results[,1]==1 & results[,2]==1 & results[,3]!=1,] 19 | write.csv(c1.c2, paste0(out.dir,"/",comparison1,".",comparison2,"_up.csv")) 20 | c2 <- test.results[results[,1]!=1 & results[,2]==1 & results[,3]!=1,] 21 | write.csv(c2, paste0(out.dir,"/",comparison2,"_up.csv")) 22 | c2.c3 <- test.results[results[,1]!=1 & results[,2]==1 & results[,3]==1,] 23 | write.csv(c2.c3, paste0(out.dir,"/",comparison2,".",comparison3,"_up.csv")) 24 | c3 <- test.results[results[,1]!=1 & results[,2]!=1 & results[,3]==1,] 25 | write.csv(c3, paste0(out.dir,"/",comparison3,"_up.csv")) 26 | c1.c3 <- test.results[results[,1]==1 & results[,2]!=1 & results[,3]==1,] 27 | write.csv(c1.c3, paste0(out.dir,"/",comparison1,".",comparison3,"_up.csv")) 28 | c1.c2.c3 <- test.results[results[,1]==1 & results[,2]==1 & results[,3]==1,] 29 | write.csv(c1.c2.c3, paste0(out.dir,"/",comparison1,".",comparison2,".",comparison3,"_up.csv")) 30 | 31 | #down: same regions, genes with -1 in exactly the named comparison(s) 32 | c1 <- test.results[results[,1]==-1 & results[,2]!=-1 & results[,3]!=-1,] 33 | write.csv(c1, 
paste0(out.dir,"/",comparison1,"_down.csv")) 34 | c1.c2 <- test.results[results[,1]==-1 & results[,2]==-1 & results[,3]!=-1,] 35 | write.csv(c1.c2, paste0(out.dir,"/",comparison1,".",comparison2,"_down.csv")) 36 | c2 <- test.results[results[,1]!=-1 & results[,2]==-1 & results[,3]!=-1,] 37 | write.csv(c2, paste0(out.dir,"/",comparison2,"_down.csv")) 38 | c2.c3 <- test.results[results[,1]!=-1 & results[,2]==-1 & results[,3]==-1,] 39 | write.csv(c2.c3, paste0(out.dir,"/",comparison2,".",comparison3,"_down.csv")) 40 | c3 <- test.results[results[,1]!=-1 & results[,2]!=-1 & results[,3]==-1,] 41 | write.csv(c3, paste0(out.dir,"/",comparison3,"_down.csv")) 42 | c1.c3 <- test.results[results[,1]==-1 & results[,2]!=-1 & results[,3]==-1,] 43 | write.csv(c1.c3, paste0(out.dir,"/",comparison1,".",comparison3,"_down.csv")) 44 | c1.c2.c3 <- test.results[results[,1]==-1 & results[,2]==-1 & results[,3]==-1,] 45 | write.csv(c1.c2.c3, paste0(out.dir,"/",comparison1,".",comparison2,".",comparison3,"_down.csv")) 46 | } 47 | 48 | VennPieces(test.results=unfilteredgenelist, comparison1="sco3", comparison2="phyb", comparison3="sco3phyb", out.dir="venns") -------------------------------------------------------------------------------- /araport11_assemble.sh: -------------------------------------------------------------------------------- 1 | # build annotations based off gff3 files from araport11 release 2 | # https://www.araport.org/data/araport11 3 | # derived from SRE gene_to_gene.sh 4 | # Run lines manually in annotation directory 5 | 6 | # Readme file 7 | wget https://www.araport.org/download_file/Araport11_Release_201606/annotation/README.201606.md 8 | 9 | # Araport11 annotation in GFF3 10 | 11 | # curl -sO -H 'Authorization: Bearer 745dd29759980b058db8fb9efc7af5' https://api.araport.org/files/v2/media/system/araport-public-files//Araport11_Release_201606/annotation/Araport11_GFF3_genes_transposons.201606.gff.gz 12 | 13 | wget 
http://www.arabidopsis.org/download_files/Genes/Araport11_genome_release/Araport11_GFF3_genes_transposons.201606.gff.gz 14 | 15 | gzip -d *gff.gz 16 | 17 | # Make bed files 18 | # R (next line) opens an interactive R session; the lines that follow it are R code 19 | R 20 | 21 | library(tidyverse) 22 | # getAttributeField: x is a character vector of GFF attribute strings; each is split on attrsep and then on "=", and the value of the first key equal to field is returned for every element (NA if the key is absent). Only the text up to a second "=" within a pair is kept. 23 | getAttributeField <- function (x, field, attrsep = ";") { 24 | s = strsplit(x, split = attrsep, fixed = TRUE) 25 | sapply(s, function(atts) { 26 | a = strsplit(atts, split = "=", fixed = TRUE) 27 | m = match(field, sapply(a, "[", 1)) 28 | if (!is.na(m)) { 29 | rv = a[[m]][2] 30 | } 31 | else { 32 | rv = as.character(NA) 33 | } 34 | return(rv) 35 | }) 36 | } 37 | # gffRead: read a tab-separated, headerless GFF file into a data.frame with the nine columns named below (start/end as integer, the rest character), print the row count and column classes, and stop if any start or end is NA. Text after a # is dropped (comment.char); nrows is passed on to read.table. 38 | gffRead <- function(gffFile, nrows = -1) { 39 | cat("Reading ", gffFile, ": ", sep="") 40 | gff = read.table(gffFile, sep="\t", as.is=TRUE, quote="", 41 | header=FALSE, comment.char="#", nrows = nrows, 42 | colClasses=c("character", "character", "character", "integer", 43 | "integer", 44 | "character", "character", "character", "character")) 45 | colnames(gff) = c("seqname", "source", "feature", "start", "end", 46 | "score", "strand", "frame", "attributes") 47 | cat("found", nrow(gff), "rows with classes:", 48 | paste(sapply(gff, class), collapse=", "), "\n") 49 | stopifnot(!any(is.na(gff$start)), !any(is.na(gff$end))) 50 | return(gff) 51 | } 52 | # NOTE(review): the file fetched and unzipped above is named Araport11_GFF3_genes_transposons.201606.gff, but *.current.gff is read here - confirm which file is intended 53 | ara=gffRead("Araport11_GFF3_genes_transposons.current.gff") 54 | 55 | ### Gene annotation (6-column bed: seqname, start, end, Name, score, strand) 56 | gene=subset(ara,ara$feature=='gene') 57 | gene$Name=getAttributeField(gene$attributes, 'Name') 58 | gene$ID=getAttributeField(gene$attributes, 'ID') 59 | gene.out=gene[,c('seqname','start','end','Name','score','strand')] 60 | 61 | write.table(gene.out,'Araport11_genes.bed',sep='\t',row.names=F,col.names=F,quote=F) 62 | 63 | ### TE annotation (here "Chr" is removed from seqname; the gene bed above keeps it) 64 | te=subset(ara,ara$feature=='transposable_element') 65 | te$Name=getAttributeField(te$attributes, 'Name') 66 | te$ID=getAttributeField(te$attributes, 'ID') 67 | te$alias=getAttributeField(te$attributes, 'Alias') 68 | te$seqname=gsub(pattern="Chr",replacement='', x=te$seqname) 69 | 
te.out=te[,c('seqname','start','end','Name','feature','strand')] 70 | 71 | write.table(te.out,'Araport11_TE.bed',sep='\t',row.names=F,col.names=F,quote=F) 72 | 73 | ### Transcript annotation 74 | mRNA <- subset(ara, feature == "mRNA") 75 | mRNA$name=getAttributeField(mRNA$attributes, 'Name') 76 | mRNA$parent=getAttributeField(mRNA$attributes, 'Parent') 77 | mRNA.out=mRNA[,c('seqname','start','end','name','score','strand')] 78 | 79 | write.table(mRNA.out,'Araport11_mRNA.bed',sep='\t',row.names=F,col.names=F,quote=F) 80 | 81 | quit() 82 | n 83 | 84 | ### 5' and 3' UTR annotation 85 | utr <- subset(ara, feature == "five_prime_UTR" | feature == "three_prime_UTR") %>% 86 | mutate(id = getAttributeField(attributes, 'Parent')) %>% 87 | select(seqname, start, end, strand, id, feature) %>% 88 | mutate(id = sapply(strsplit(id, "\\."), function(l) l[1])) 89 | 90 | write.table(utr, "Araport11_UTR.bed", sep='\t', row.names=F, col.names=F, quote=F) 91 | 92 | ## use bedtools getfasta to obtain sequences in utr intervals 93 | ## bedtools getfasta -bedOut -s -fi TAIR10_Chr.all.fasta -bed Araport11_UTR.sorted.bed > Araport11_UTR_seq.bed 94 | 95 | ########## 96 | 97 | rm *gff 98 | 99 | -------------------------------------------------------------------------------- /average_cov.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | bam=$1 4 | samtools sort -@ 4 $bam | samtools depth - | awk '{sum+=$3} END { print "Mean depth = ",sum/NR}' 5 | -------------------------------------------------------------------------------- /bashrc: -------------------------------------------------------------------------------- 1 | # ~/.bashrc: executed by bash(1) for non-login shells. 
2 | # see /usr/share/doc/bash/examples/startup-files (in the package bash-doc) 3 | # for examples 4 | 5 | # If not running interactively, don't do anything 6 | case $- in 7 | *i*) ;; 8 | *) return;; 9 | esac 10 | 11 | # don't put duplicate lines or lines starting with space in the history. 12 | # See bash(1) for more options 13 | HISTCONTROL=ignoreboth 14 | 15 | # append to the history file, don't overwrite it 16 | shopt -s histappend 17 | 18 | # for setting history length see HISTSIZE and HISTFILESIZE in bash(1) 19 | HISTSIZE=1000 20 | HISTFILESIZE=2000 21 | 22 | # check the window size after each command and, if necessary, 23 | # update the values of LINES and COLUMNS. 24 | shopt -s checkwinsize 25 | 26 | # If set, the pattern "**" used in a pathname expansion context will 27 | # match all files and zero or more directories and subdirectories. 28 | #shopt -s globstar 29 | 30 | # make less more friendly for non-text input files, see lesspipe(1) 31 | #[ -x /usr/bin/lesspipe ] && eval "$(SHELL=/bin/sh lesspipe)" 32 | 33 | # set variable identifying the chroot you work in (used in the prompt below) 34 | if [ -z "${debian_chroot:-}" ] && [ -r /etc/debian_chroot ]; then 35 | debian_chroot=$(cat /etc/debian_chroot) 36 | fi 37 | 38 | # set a fancy prompt (non-color, unless we know we "want" color) 39 | case "$TERM" in 40 | xterm-color|*-256color) color_prompt=yes;; 41 | esac 42 | 43 | # uncomment for a colored prompt, if the terminal has the capability; turned 44 | # off by default to not distract the user: the focus in a terminal window 45 | # should be on the output of commands, not on the prompt 46 | #force_color_prompt=yes 47 | 48 | if [ -n "$force_color_prompt" ]; then 49 | if [ -x /usr/bin/tput ] && tput setaf 1 >&/dev/null; then 50 | # We have color support; assume it's compliant with Ecma-48 51 | # (ISO/IEC-6429). (Lack of such support is extremely rare, and such 52 | # a case would tend to support setf rather than setaf.) 
53 | color_prompt=yes 54 | else 55 | color_prompt= 56 | fi 57 | fi 58 | 59 | if [ "$color_prompt" = yes ]; then 60 | PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ ' 61 | else 62 | PS1='${debian_chroot:+($debian_chroot)}\u@\h:\w\$ ' 63 | fi 64 | unset color_prompt force_color_prompt 65 | 66 | # If this is an xterm set the title to user@host:dir 67 | case "$TERM" in 68 | xterm*|rxvt*) 69 | PS1="\[\e]0;${debian_chroot:+($debian_chroot)}\u@\h: \w\a\]$PS1" 70 | ;; 71 | *) 72 | ;; 73 | esac 74 | 75 | # enable color support of ls and also add handy aliases 76 | if [ -x /usr/bin/dircolors ]; then 77 | test -r ~/.dircolors && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)" 78 | alias ls='ls --color=auto' 79 | #alias dir='dir --color=auto' 80 | #alias vdir='vdir --color=auto' 81 | 82 | #alias grep='grep --color=auto' 83 | #alias fgrep='fgrep --color=auto' 84 | #alias egrep='egrep --color=auto' 85 | fi 86 | 87 | # colored GCC warnings and errors 88 | #export GCC_COLORS='error=01;31:warning=01;35:note=01;36:caret=01;32:locus=01:quote=01' 89 | 90 | # some more ls aliases 91 | alias ll='ls -l' 92 | alias la='ls -A' 93 | alias l='ls -lh' 94 | 95 | # Alias definitions. 96 | # You may want to put all your additions into a separate file like 97 | # ~/.bash_aliases, instead of adding them here directly. 98 | # See /usr/share/doc/bash-doc/examples in the bash-doc package. 99 | 100 | if [ -f ~/.bash_aliases ]; then 101 | . ~/.bash_aliases 102 | fi 103 | 104 | # enable programmable completion features (you don't need to enable 105 | # this, if it's already enabled in /etc/bash.bashrc and /etc/profile 106 | # sources /etc/bash.bashrc). 107 | if ! shopt -oq posix; then 108 | if [ -f /usr/share/bash-completion/bash_completion ]; then 109 | . /usr/share/bash-completion/bash_completion 110 | elif [ -f /etc/bash_completion ]; then 111 | . 
/etc/bash_completion 112 | fi 113 | fi 114 | 115 | export PATH=/home/diepg/bin/gffcompare-0.11.6.Linux_x86_64:/home/diepg/bin/subread-2.0.0-Linux-x86_64/bin:/home/diepg/bin/stringtie:/home/diepg/bin/kallisto:/home/diepg/bin/TrimGalore-0.6.5:/home/diepg/bin/kentUtils/bin/linux.x86_64:$PATH 116 | 117 | 118 | -------------------------------------------------------------------------------- /gene_to_gene_anno.sh: -------------------------------------------------------------------------------- 1 | #Using TAIR GFF file to make annotation files 2 | #Derived from SRE gene_to_gene.sh 3 | 4 | mkdir $(date +"%Y-%m-%d")_TAIR10_gene_to_gene_annotation 5 | cd *_TAIR10_gene_to_gene_annotation 6 | 7 | #get TAIR10 gene annotations 8 | 9 | ###GFF file containing features (mRNA, exon, CDS,...) of ALL TAIR10 genes ,including non-protein coding genes(pseudogenes, RNA genes, transposable element genes) 10 | wget https://www.arabidopsis.org/download_files/Genes/TAIR10_genome_release/TAIR10_gff3/TAIR10_GFF3_genes.gff 11 | 12 | ### GFF file containing features (mRNA, exon, CDS,...) 
of ALL TAIR10 genes ,including TRANSPOSABLE ELEMENTS and all non-protein coding genes (pseudogenes, RNA genes, transposable element genes) 13 | wget https://www.arabidopsis.org/download_files/Genes/TAIR10_genome_release/TAIR10_gff3/TAIR10_GFF3_genes_transposons.gff 14 | 15 | #make bed file of all TAIR10 genes with proper strand information 16 | 17 | ########################### 18 | R 19 | # getAttributeField: x is a character vector of GFF attribute strings; each is split on attrsep and then on "=", and the value of the first key equal to field is returned for every element (NA if the key is absent). Only the text up to a second "=" within a pair is kept. 20 | getAttributeField <- function (x, field, attrsep = ";") { 21 | s = strsplit(x, split = attrsep, fixed = TRUE) 22 | sapply(s, function(atts) { 23 | a = strsplit(atts, split = "=", fixed = TRUE) 24 | m = match(field, sapply(a, "[", 1)) 25 | if (!is.na(m)) { 26 | rv = a[[m]][2] 27 | } 28 | else { 29 | rv = as.character(NA) 30 | } 31 | return(rv) 32 | }) 33 | } 34 | # gffRead: read a tab-separated, headerless GFF file into a data.frame with the nine columns named below (start/end as integer, the rest character), print the row count and column classes, and stop if any start or end is NA. Text after a # is dropped (comment.char); nrows is passed on to read.table. 35 | gffRead <- function(gffFile, nrows = -1) { 36 | cat("Reading ", gffFile, ": ", sep="") 37 | gff = read.table(gffFile, sep="\t", as.is=TRUE, quote="", 38 | header=FALSE, comment.char="#", nrows = nrows, 39 | colClasses=c("character", "character", "character", "integer", 40 | "integer", 41 | "character", "character", "character", "character")) 42 | colnames(gff) = c("seqname", "source", "feature", "start", "end", 43 | "score", "strand", "frame", "attributes") 44 | cat("found", nrow(gff), "rows with classes:", 45 | paste(sapply(gff, class), collapse=", "), "\n") 46 | stopifnot(!any(is.na(gff$start)), !any(is.na(gff$end))) 47 | return(gff) 48 | } 49 | 50 | gene=gffRead('TAIR10_GFF3_genes.gff') 51 | #gene_te=gffRead('TAIR10_GFF3_genes_transposons.gff') 52 | 53 | #I am subsetting to annotated 'gene' features, of which there are 28,775 in total for TAIR10. This may be modified if we are looking for other things. 
54 | gene=subset(gene,gene$feature=='gene') 55 | gene$Name=getAttributeField(gene$attributes, 'Name') 56 | gene$ID=getAttributeField(gene$attributes, 'ID') 57 | 58 | gene.out=gene[,c('seqname','start','end','Name','score','strand')] 59 | write.table(gene.out,'TAIR10_genes.bed',sep='\t',row.names=F,col.names=F,quote=F) 60 | quit() 61 | n 62 | 63 | ######################## 64 | 65 | 66 | #ID closest gene to each gene 67 | sort -k1,1 -k2,2n TAIR10_genes.bed > TAIR10_genes.sorted.bed 68 | 69 | #flags for bedtools (v2.25.0) 70 | # -N cannot match same name of gene (i.e. you don't match yourself) 71 | # -iu ignore upstream features 72 | # -s require feature on same strand 73 | # -D a report distance in relationship to the orientation of file 'a' 74 | 75 | bedtools closest -N -iu -s -D a -a TAIR10_genes.sorted.bed -b TAIR10_genes.sorted.bed > TAIR10_gene_to_gene.samestrand.bed 76 | 77 | 78 | #parse output and create final annotation file 79 | 80 | ######################## 81 | 82 | R 83 | 84 | input=read.delim('TAIR10_gene_to_gene.samestrand.bed',head=F) 85 | colnames(input)=c('chr1','start1','stop1','gene1','score1','strand1','chr2','start2','stop2','gene2','score2','strand2','distance') 86 | 87 | input.flt <- input[,c('chr1','start1','stop1','gene1','strand1','chr2','start2','stop2','gene2','strand2','distance')] 88 | input.flt$chr2 <- ifelse(input.flt$chr2=='.',NA,as.character(input.flt$chr2)) 89 | input.flt$chr2 <- as.factor(input.flt$chr2) 90 | input.flt$start2 <- ifelse(input.flt$start2=='-1',NA,input.flt$start2) 91 | input.flt$stop2 <- ifelse(input.flt$stop2=='-1',NA,input.flt$stop2) 92 | input.flt$gene2 <- ifelse(input.flt$gene2=='.',NA,as.character(input.flt$gene2)) 93 | input.flt$gene2 <- as.factor(input.flt$gene2) 94 | input.flt$strand2 <- ifelse(input.flt$strand2=='.',NA,as.character(input.flt$strand2)) 95 | input.flt$strand2 <- as.factor(input.flt$strand2) 96 | input.flt$distance <- ifelse(is.na(input.flt$chr2)==T,NA,input.flt$distance) 97 | input.flt <- 
na.omit(input.flt) 98 | input.flt <- subset(input.flt, input.flt$chr1 != 'M') 99 | input.flt <- subset(input.flt, input.flt$chr1 != 'C') 100 | 101 | # there are still overlapping records 102 | # These appear to be either microRNAs or closely adjacent genes with one having a long secondary transcript 103 | # just omit these; can browse too 104 | test <- subset(input.flt, input.flt$distance == 0) 105 | output <- subset(input.flt, input.flt$distance != 0) 106 | 107 | # write.table(input.flt,'TAIR10_gene_to_gene.samestrand.anno',sep='\t',row.names=F,quote=F) 108 | write.table(output,'Araport11_gene_to_gene.samestrand.bed',sep='\t',col.names=F,row.names=F,quote=F) 109 | 110 | quit() 111 | n 112 | 113 | # 114 | 115 | 116 | -------------------------------------------------------------------------------- /pe_insert_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | bam=$1 4 | 5 | java -jar ~/bin/picard.jar CleanSam I=$bam O=test.bam 6 | java -jar ~/bin/picard.jar ValidateSamFile I=test.bam IGNORE_WARNINGS=true MODE=VERBOSE 7 | 8 | # AddOrReplaceReadGroups 9 | # FixMateInformation 10 | ## Manually remove reads with errors 11 | # samtools view -h test.bam | grep -v 'D00775:83:CC65GANXX:4:2304:19617:7489' | samtools view -b > test2.bam 12 | 13 | samtools sort -@ 6 test.bam -o test.sorted.bam 14 | 15 | java -jar ~/bin/picard.jar CollectInsertSizeMetrics I=test.sorted.bam O=insert_size_metrics.txt H=insert_size_histogram.pdf 16 | 17 | rm test.bam 18 | rm test.sorted.bam 19 | 20 | echo "DONE" 21 | -------------------------------------------------------------------------------- /project_workflows/diffsegR_v1/diffsegR_WT-NvsC_gmuct_3p_v1.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Installation 4 | #### Conda 5 | #conda create --name diffseg_v1 6 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse 7 | #conda install -n diffseg_v1 
-c bioconda bioconductor-deseq2 8 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread 9 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer 10 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats 11 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats 12 | #conda install -n diffseg_v1 -c conda-forge r-remotes 13 | #conda activate diffseg_v1 14 | 15 | #### R 16 | #remotes::install_github("sanssouci-org/sanssouci") 17 | #remotes::install_github("aLiehrmann/DiffSegR@f657435") 18 | 19 | library(tidyverse) 20 | library(DiffSegR) 21 | 22 | nb_threads = 10 23 | 24 | working_directory <- getwd() 25 | 26 | #- create sample information table --------------------------------------------# 27 | sample_info <- data.frame( 28 | sample = c("WT.N_1", "WT.N_2", "WT.N_3", "WT.C_1", "WT.C_2", "WT.C_3"), 29 | condition = rep(c("WT.N", "WT.C"), each = 3), 30 | replicate = rep(1:3,2), 31 | bam = sapply( 32 | c("S5-3N_Aligned.sortedByCoord.out.bam", 33 | "S7-4N_Aligned.sortedByCoord.out.bam", 34 | "S11-10N_Aligned.sortedByCoord.out.bam", 35 | "S6-3C_Aligned.sortedByCoord.out.bam", 36 | "S8-4C_Aligned.sortedByCoord.out.bam", 37 | "S12-10C_Aligned.sortedByCoord.out.bam" 38 | ), 39 | function(bam) file.path(working_directory, bam)), 40 | coverage = file.path( 41 | working_directory, 42 | paste0(c("WT-N_1", "WT-N_2", "WT-N_3", "WT-C_1", "WT-C_2", "WT-C_3"), ".rds"))) 43 | 44 | #- save sample information table ----------------------------------------------# 45 | write.table( 46 | sample_info, 47 | file.path(working_directory, "sample_info.txt") 48 | ) 49 | 50 | #- display sample information table -------------------------------------------# 51 | knitr::kable(sample_info, row.names = FALSE) 52 | 53 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names=F) 54 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt") 55 | 56 | ### setup comparisons and loop for each chromosome 57 | out_DERs <- 
NULL 58 | 59 | for(i in unique(genome$X1)){ 60 | chr <- paste(i) 61 | stop <- genome$X2[genome$X1==i] 62 | 63 | ## load data 64 | data <- loadData( 65 | sampleInfo = file.path(working_directory,"sample_info.txt"), 66 | locus = list(seqid = i, chromStart = 1, chromEnd = stop), 67 | referenceCondition = "WT.C", 68 | isPairedEnd = TRUE, 69 | readLength = 150, 70 | coverageType = "threePrime", 71 | stranded = FALSE, 72 | strandSpecific = 0, 73 | fromBam = TRUE, 74 | nbThreads = nb_threads, 75 | verbose = TRUE, 76 | ) 77 | 78 | ## Changepoint detection to define segments 79 | SExp <- segmentation( 80 | data = data, 81 | weightType = "unweighted", #zeroInflated : low counts have less weight 82 | modelSelectionType = "yao", 83 | featureCountsType = "fromBam", 84 | compressed = TRUE, 85 | alpha = 2, 86 | segmentNeighborhood = FALSE, 87 | Kmax = NULL, 88 | verbose = FALSE, 89 | nbThreadsGridSearch = 1, 90 | alphas = NULL, 91 | gridSearch = FALSE, 92 | outputDirectory = working_directory, 93 | nbThreadsFeatureCounts = nb_threads, 94 | strandSpecific = 0, 95 | read2pos = 3, 96 | isPairedEnd = TRUE 97 | ) 98 | 99 | SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,] 100 | 101 | dds <- dea( 102 | data = data, 103 | SExp = SExp_10, 104 | design = ~condition, 105 | predicate = NULL, 106 | significanceLevel = 0.01, 107 | verbose = TRUE 108 | ) 109 | 110 | #- extract DERs based on signifiance ----------------------------------------# 111 | DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses,] 112 | DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs)) 113 | 114 | out_DERs <- rbind(out_DERs,DERs) 115 | 116 | } 117 | 118 | #clear memory cache 119 | gc() 120 | 121 | out_DERs <- mutate(out_DERs, derId = sapply(strsplit(featureId, "_"), function(l) paste0(l[1],":",l[2],"-",l[3]))) 122 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj) 123 | out <- subset(out, baseMean > 10) 124 | 125 | 
write_tsv(out, "WT-N_DERs_3p.bed", col_names=F) 126 | 127 | -------------------------------------------------------------------------------- /project_workflows/diffsegR_v1/diffsegR_WT-NvsC_gmuct_5p_v1.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Installation 4 | #### Conda 5 | #conda create --name diffseg_v1 6 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse 7 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2 8 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread 9 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer 10 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats 11 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats 12 | #conda install -n diffseg_v1 -c conda-forge r-remotes 13 | #conda activate diffseg_v1 14 | 15 | #### R 16 | #remotes::install_github("sanssouci-org/sanssouci") 17 | #remotes::install_github("aLiehrmann/DiffSegR@f657435") 18 | 19 | library(tidyverse) 20 | library(DiffSegR) 21 | 22 | nb_threads = 10 23 | 24 | working_directory <- getwd() 25 | 26 | #- create sample information table --------------------------------------------# 27 | sample_info <- data.frame( 28 | sample = c("WT.N_1", "WT.N_2", "WT.N_3", "WT.C_1", "WT.C_2", "WT.C_3"), 29 | condition = rep(c("WT.N", "WT.C"), each = 3), 30 | replicate = rep(1:3,2), 31 | bam = sapply( 32 | c("S5-3N_Aligned.sortedByCoord.out.bam", 33 | "S7-4N_Aligned.sortedByCoord.out.bam", 34 | "S11-10N_Aligned.sortedByCoord.out.bam", 35 | "S6-3C_Aligned.sortedByCoord.out.bam", 36 | "S8-4C_Aligned.sortedByCoord.out.bam", 37 | "S12-10C_Aligned.sortedByCoord.out.bam" 38 | ), 39 | function(bam) file.path(working_directory, bam) 40 | ), 41 | coverage = file.path( 42 | working_directory, paste0(c("WT.N_1", "WT.N_2", "WT.N_3", "WT.C_1", "WT.C_2", "WT.C_3"), ".rds"))) 43 | 44 | #- save sample information table 
----------------------------------------------# 45 | write.table( 46 | sample_info, 47 | file.path(working_directory, "sample_info.txt") 48 | ) 49 | 50 | #- display sample information table -------------------------------------------# 51 | knitr::kable(sample_info, row.names = FALSE) 52 | 53 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names=F) 54 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt") 55 | 56 | ### setup comparisons and loop for each chromosome 57 | out_DERs <- NULL 58 | 59 | for(i in unique(genome$X1)){ 60 | chr <- paste(i) 61 | stop <- genome$X2[genome$X1==i] 62 | 63 | ## load data 64 | data <- loadData( 65 | sampleInfo = file.path(working_directory,"sample_info.txt"), 66 | locus = list(seqid = i, chromStart = 1, chromEnd = stop), 67 | referenceCondition = "WT.C", 68 | isPairedEnd = TRUE, 69 | readLength = 150, 70 | coverageType = "fivePrime", 71 | stranded = FALSE, 72 | strandSpecific = 0, 73 | fromBam = TRUE, 74 | nbThreads = nb_threads, 75 | verbose = TRUE, 76 | ) 77 | 78 | ## Changepoint detection to define segments 79 | SExp <- segmentation( 80 | data = data, 81 | weightType = "unweighted", #zeroInflated : low counts have less weight 82 | modelSelectionType = "yao", 83 | featureCountsType = "fromBam", 84 | compressed = TRUE, 85 | alpha = 2, 86 | segmentNeighborhood = FALSE, 87 | Kmax = NULL, 88 | verbose = FALSE, 89 | nbThreadsGridSearch = 1, 90 | alphas = NULL, 91 | gridSearch = FALSE, 92 | outputDirectory = working_directory, 93 | nbThreadsFeatureCounts = nb_threads, 94 | strandSpecific = 0, 95 | read2pos = 5, 96 | #featureCountsOtherParams = list(allowMultiOverlap = FALSE) 97 | isPairedEnd = TRUE 98 | ) 99 | 100 | SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,] 101 | 102 | dds <- dea( 103 | data = data, 104 | SExp = SExp_10, 105 | design = ~condition, 106 | predicate = NULL, 107 | significanceLevel = 0.01, 108 | verbose = TRUE 109 | ) 110 | 111 | #- extract DERs 
based on signifiance ----------------------------------------# 112 | DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses,] 113 | DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs)) 114 | 115 | out_DERs <- rbind(out_DERs,DERs) 116 | 117 | } 118 | 119 | #clear memory cache 120 | gc() 121 | 122 | out_DERs <- mutate(out_DERs, derId = sapply(strsplit(featureId, "_"), function(l) paste0(l[1],":",l[2],"-",l[3]))) 123 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj) 124 | out <- subset(out, baseMean > 10) 125 | 126 | write_tsv(out, "WT-N_DERs_5p.bed", col_names=F) 127 | 128 | -------------------------------------------------------------------------------- /project_workflows/diffsegR_v1/diffsegR_abh1-CvsWT-C_gmuct_3p_v1.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Installation 4 | #### Conda 5 | #conda create --name diffseg_v1 6 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse 7 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2 8 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread 9 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer 10 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats 11 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats 12 | #conda install -n diffseg_v1 -c conda-forge r-remotes 13 | #conda activate diffseg_v1 14 | 15 | #### R 16 | #remotes::install_github("sanssouci-org/sanssouci") 17 | #remotes::install_github("aLiehrmann/DiffSegR@f657435") 18 | 19 | library(tidyverse) 20 | library(DiffSegR) 21 | nb_threads = 10 22 | working_directory <- getwd() 23 | 24 | #- create sample information table --------------------------------------------# 25 | sample_info <- data.frame( 26 | sample = c("abh1.C_1","abh1.C_2","abh1.C_3","WT.C_1","WT.C_2","WT.C_3"), 27 | condition = c(rep("abh1.C", 3), rep( "WT.C", 3)), 28 | replicate = 
c(1:3,1:3), 29 | bam = sapply( 30 | c("S16-5C_Aligned.sortedByCoord.out.bam", "S33-9C_Aligned.sortedByCoord.out.bam", "S35-11C_Aligned.sortedByCoord.out.bam", 31 | "S6-3C_Aligned.sortedByCoord.out.bam", "S8-4C_Aligned.sortedByCoord.out.bam", "S12-10C_Aligned.sortedByCoord.out.bam"), 32 | function(bam) file.path(working_directory, bam)), 33 | coverage = file.path( 34 | working_directory, 35 | paste0(c("abh1.C_1","abh1.C_2","abh1.C_3","WT.C_1", "WT.C_2", "WT.C_3"), ".rds"))) 36 | 37 | #- save sample information table ----------------------------------------------# 38 | write.table(sample_info, file.path(working_directory, "sample_info.txt")) 39 | 40 | #- display sample information table -------------------------------------------# 41 | knitr::kable(sample_info, row.names = FALSE) 42 | 43 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names=F) 44 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt") 45 | 46 | ### setup comparisons and loop for each chromosome 47 | out_DERs <- NULL 48 | 49 | for(i in unique(genome$X1)){ 50 | chr <- paste(i) 51 | stop <- genome$X2[genome$X1==i] 52 | 53 | ## load data 54 | data <- loadData( 55 | sampleInfo = file.path(working_directory,"sample_info.txt"), 56 | locus = list(seqid = i, chromStart = 1, chromEnd = stop), 57 | referenceCondition = "WT.C", 58 | isPairedEnd = TRUE, 59 | readLength = 150, 60 | coverageType = "threePrime", 61 | stranded = FALSE, 62 | strandSpecific = 0, 63 | fromBam = TRUE, 64 | nbThreads = nb_threads, 65 | verbose = TRUE, 66 | ) 67 | 68 | ## Changepoint detection to define segments 69 | SExp <- segmentation( 70 | data = data, 71 | weightType = "unweighted", #zeroInflated : low counts have less weight 72 | modelSelectionType = "yao", 73 | featureCountsType = "fromBam", 74 | compressed = TRUE, 75 | alpha = 2, 76 | segmentNeighborhood = FALSE, 77 | Kmax = NULL, 78 | verbose = FALSE, 79 | nbThreadsGridSearch = 1, 80 | alphas = NULL, 81 | gridSearch = FALSE, 82 | outputDirectory 
= working_directory, 83 | nbThreadsFeatureCounts = nb_threads, 84 | strandSpecific = 0, 85 | read2pos = 3, 86 | isPairedEnd = TRUE 87 | ) 88 | 89 | SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,] 90 | 91 | dds <- dea( 92 | data = data, 93 | SExp = SExp_10, 94 | design = ~condition, 95 | predicate = NULL, 96 | significanceLevel = 0.01, 97 | verbose = TRUE 98 | ) 99 | 100 | #- extract DERs based on signifiance ----------------------------------------# 101 | DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses,] 102 | DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs)) 103 | 104 | out_DERs <- rbind(out_DERs,DERs) 105 | 106 | } 107 | 108 | #clear memory cache 109 | gc() 110 | 111 | out_DERs <- mutate(out_DERs, derId = sapply(strsplit(featureId, "_"), function(l) paste0(l[1],":",l[2],"-",l[3]))) 112 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj) 113 | out <- subset(out, baseMean > 10) 114 | 115 | write_tsv(out, "abh1-CvsWT-C_DERs_3p.bed", col_names=F) 116 | 117 | -------------------------------------------------------------------------------- /project_workflows/diffsegR_v1/diffsegR_abh1-CvsWT-C_gmuct_3p_v1_stranded.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Installation 4 | #### Conda 5 | #conda create --name diffseg_v1 6 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse 7 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2 8 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread 9 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer 10 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats 11 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats 12 | #conda install -n diffseg_v1 -c conda-forge r-remotes 13 | #conda activate diffseg_v1 14 | 15 | #### R 16 | #remotes::install_github("sanssouci-org/sanssouci") 
#remotes::install_github("aLiehrmann/DiffSegR@f657435")

# Strand-specific DiffSegR analysis of 3' read ends: abh1.C vs WT.C (reference).
# Segments each nuclear chromosome, tests segments of width <= 10 for
# differential signal and writes the significant ones to a BED-like TSV.

library(tidyverse)
library(DiffSegR)

# Thread count handed to loadData() and to featureCounts in segmentation().
nb_threads <- 10
# BAM inputs, sample sheet and DiffSegR outputs all live in the launch directory.
working_directory <- getwd()

#- create sample information table --------------------------------------------#
sample_names <- c(
  "abh1.C_1", "abh1.C_2", "abh1.C_3",
  "WT.C_1", "WT.C_2", "WT.C_3"
)
bam_files <- c(
  "S16-5C_Aligned.sortedByCoord.out.bam",
  "S33-9C_Aligned.sortedByCoord.out.bam",
  "S35-11C_Aligned.sortedByCoord.out.bam",
  "S6-3C_Aligned.sortedByCoord.out.bam",
  "S8-4C_Aligned.sortedByCoord.out.bam",
  "S12-10C_Aligned.sortedByCoord.out.bam"
)
sample_info <- data.frame(
  sample = sample_names,
  condition = c(rep("abh1.C", 3), rep("WT.C", 3)),
  replicate = c(1:3, 1:3),
  # vapply() keeps the BAM file names as element names (as sapply() did) but
  # always returns a character vector.
  bam = vapply(
    bam_files,
    function(bam) file.path(working_directory, bam),
    character(1)
  ),
  coverage = file.path(working_directory, paste0(sample_names, ".rds"))
)

#- save sample information table ----------------------------------------------#
sample_info_path <- file.path(working_directory, "sample_info.txt")
write.table(sample_info, sample_info_path)

#- display sample information table -------------------------------------------#
knitr::kable(sample_info, row.names = FALSE)

# Sequence-length table: X1 is used as the sequence id, X2 as its end coordinate.
genome <- read_tsv(
  "~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len",
  col_names = FALSE
)
genome <- subset(genome, X1 != "Mt" & X1 != "Pt")

### setup comparisons and loop for each chromosome
chromosomes <- unique(genome$X1)
# One slot per chromosome; bound together once after the loop.
der_list <- vector("list", length(chromosomes))

for (k in seq_along(chromosomes)) {
  seqid <- chromosomes[[k]]
  # Named `chrom_end` rather than `stop` so base::stop() is not masked.
  chrom_end <- genome$X2[genome$X1 == seqid]

  ## load data
  data <- loadData(
    sampleInfo = sample_info_path,
    locus = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
    referenceCondition = "WT.C",
    isPairedEnd = TRUE,
    readLength = 150,
    coverageType = "threePrime",
    stranded = TRUE,
    strandSpecific = 1,
    fromBam = TRUE,
    nbThreads = nb_threads,
    verbose = TRUE
  )

  ## Changepoint detection to define segments
  SExp <- segmentation(
    data = data,
    weightType = "unweighted", #zeroInflated : low counts have less weight
    modelSelectionType = "yao",
    featureCountsType = "fromBam",
    compressed = TRUE,
    alpha = 2,
    segmentNeighborhood = FALSE,
    Kmax = NULL,
    verbose = FALSE,
    nbThreadsGridSearch = 1,
    alphas = NULL,
    gridSearch = FALSE,
    outputDirectory = working_directory,
    nbThreadsFeatureCounts = nb_threads,
    strandSpecific = 1,
    read2pos = 3,
    isPairedEnd = TRUE
  )

  # Keep only short segments (width of at most 10) for testing.
  segment_widths <- as.data.frame(SummarizedExperiment::rowRanges(SExp))$width
  SExp_10 <- SExp[segment_widths < 11, ]

  dds <- dea(
    data = data,
    SExp = SExp_10,
    design = ~condition,
    predicate = NULL,
    significanceLevel = 0.01,
    verbose = TRUE
  )

  #- extract DERs based on significance ---------------------------------------#
  DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
  der_list[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
}

# Bind once instead of growing a data frame with rbind() inside the loop.
out_DERs <- do.call(rbind, der_list)

#clear memory cache
gc()

# derId is rebuilt from the "_"-separated fields of featureId as f1:f2-f3.f4
# (presumably seqid:start-end.strand -- confirm against DiffSegR's featureId).
out_DERs <- mutate(
  out_DERs,
  derId = vapply(
    strsplit(featureId, "_"),
    function(l) paste0(l[1], ":", l[2], "-", l[3], ".", l[4]),
    character(1)
  )
)
out <- select(
  out_DERs,
  seqnames, start, end, derId, baseMean, strand, baseVar, log2FoldChange, padj
)
out <- subset(out, baseMean > 10)

# NOTE(review): the unstranded 3p script for this comparison writes the same
# file name; run them in separate directories or one result is overwritten.
write_tsv(out, "abh1-CvsWT-C_DERs_3p.bed", col_names = FALSE)

-------------------------------------------------------------------------------- /project_workflows/diffsegR_v1/diffsegR_abh1-CvsWT-C_gmuct_5p_v1.r: --------------------------------------------------------------------------------
#!/usr/bin/env Rscript

## Installation
#### Conda
#conda create --name diffseg_v1
#conda install -n diffseg_v1 -c conda-forge r-tidyverse
#conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
#conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
#conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
#conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
#conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
#conda install -n diffseg_v1 -c conda-forge r-remotes
#conda activate diffseg_v1

#### R
#remotes::install_github("sanssouci-org/sanssouci")
#remotes::install_github("aLiehrmann/DiffSegR@f657435")

# Unstranded DiffSegR analysis of 5' read ends: abh1.C vs WT.C (reference).
# Segments each nuclear chromosome, tests segments of width <= 10 for
# differential signal and writes the significant ones to a BED-like TSV.

library(tidyverse)
library(DiffSegR)

# Thread count handed to loadData() and to featureCounts in segmentation().
nb_threads <- 10
# BAM inputs, sample sheet and DiffSegR outputs all live in the launch directory.
working_directory <- getwd()

#- create sample information table --------------------------------------------#
sample_names <- c(
  "abh1.C_1", "abh1.C_2", "abh1.C_3",
  "WT.C_1", "WT.C_2", "WT.C_3"
)
bam_files <- c(
  "S16-5C_Aligned.sortedByCoord.out.bam",
  "S33-9C_Aligned.sortedByCoord.out.bam",
  "S35-11C_Aligned.sortedByCoord.out.bam",
  "S6-3C_Aligned.sortedByCoord.out.bam",
  "S8-4C_Aligned.sortedByCoord.out.bam",
  "S12-10C_Aligned.sortedByCoord.out.bam"
)
sample_info <- data.frame(
  sample = sample_names,
  condition = c(rep("abh1.C", 3), rep("WT.C", 3)),
  replicate = c(1:3, 1:3),
  # vapply() keeps the BAM file names as element names (as sapply() did) but
  # always returns a character vector.
  bam = vapply(
    bam_files,
    function(bam) file.path(working_directory, bam),
    character(1)
  ),
  coverage = file.path(working_directory, paste0(sample_names, ".rds"))
)

#- save sample information table ----------------------------------------------#
sample_info_path <- file.path(working_directory, "sample_info.txt")
write.table(sample_info, sample_info_path)

#- display sample information table -------------------------------------------#
knitr::kable(sample_info, row.names = FALSE)

# Sequence-length table: X1 is used as the sequence id, X2 as its end coordinate.
genome <- read_tsv(
  "~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len",
  col_names = FALSE
)
genome <- subset(genome, X1 != "Mt" & X1 != "Pt")

### setup comparisons and loop for each chromosome
chromosomes <- unique(genome$X1)
# One slot per chromosome; bound together once after the loop.
der_list <- vector("list", length(chromosomes))

for (k in seq_along(chromosomes)) {
  seqid <- chromosomes[[k]]
  # Named `chrom_end` rather than `stop` so base::stop() is not masked.
  chrom_end <- genome$X2[genome$X1 == seqid]

  ## load data
  data <- loadData(
    sampleInfo = sample_info_path,
    locus = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
    referenceCondition = "WT.C",
    isPairedEnd = TRUE,
    readLength = 150,
    coverageType = "fivePrime",
    stranded = FALSE,
    strandSpecific = 0,
    fromBam = TRUE,
    nbThreads = nb_threads,
    verbose = TRUE
  )

  ## Changepoint detection to define segments
  SExp <- segmentation(
    data = data,
    weightType = "unweighted", #zeroInflated : low counts have less weight
    modelSelectionType = "yao",
    featureCountsType = "fromBam",
    compressed = TRUE,
    alpha = 2,
    segmentNeighborhood = FALSE,
    Kmax = NULL,
    verbose = FALSE,
    nbThreadsGridSearch = 1,
    alphas = NULL,
    gridSearch = FALSE,
    outputDirectory = working_directory,
    nbThreadsFeatureCounts = nb_threads,
    strandSpecific = 0,
    read2pos = 5,
    isPairedEnd = TRUE
  )

  # Keep only short segments (width of at most 10) for testing.
  segment_widths <- as.data.frame(SummarizedExperiment::rowRanges(SExp))$width
  SExp_10 <- SExp[segment_widths < 11, ]

  dds <- dea(
    data = data,
    SExp = SExp_10,
    design = ~condition,
    predicate = NULL,
    significanceLevel = 0.01,
    orderBy = "pvalue",
    verbose = TRUE
  )

  #- extract DERs based on significance ---------------------------------------#
  DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
  der_list[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
}

# Bind once instead of growing a data frame with rbind() inside the loop.
out_DERs <- do.call(rbind, der_list)

#clear memory cache
gc()

# derId is rebuilt from the "_"-separated fields of featureId as f1:f2-f3
# (presumably seqid:start-end -- confirm against DiffSegR's featureId format).
out_DERs <- mutate(
  out_DERs,
  derId = vapply(
    strsplit(featureId, "_"),
    function(l) paste0(l[1], ":", l[2], "-", l[3]),
    character(1)
  )
)
out <- select(
  out_DERs,
  seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj
)
out <- subset(out, baseMean > 10)

# NOTE(review): the stranded 5p script for this comparison writes the same
# file name; run them in separate directories or one result is overwritten.
write_tsv(out, "abh1-CvsWT-C_DERs_5p.bed", col_names = FALSE)

-------------------------------------------------------------------------------- /project_workflows/diffsegR_v1/diffsegR_abh1-CvsWT-C_gmuct_5p_v1_stranded.r: --------------------------------------------------------------------------------
#!/usr/bin/env Rscript

## Installation
#### Conda
#conda create --name diffseg_v1
#conda install -n diffseg_v1 -c conda-forge r-tidyverse
#conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
#conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
#conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
#conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
#conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
#conda install -n diffseg_v1 -c conda-forge r-remotes
#conda activate diffseg_v1

#### R
#remotes::install_github("sanssouci-org/sanssouci")
#remotes::install_github("aLiehrmann/DiffSegR@f657435")

# Strand-specific DiffSegR analysis of 5' read ends: abh1.C vs WT.C (reference).
# Segments each nuclear chromosome, tests segments of width <= 10 for
# differential signal and writes the significant ones to a BED-like TSV.

library(tidyverse)
library(DiffSegR)

# Thread count handed to loadData() and to featureCounts in segmentation().
nb_threads <- 10
# BAM inputs, sample sheet and DiffSegR outputs all live in the launch directory.
working_directory <- getwd()

#- create sample information table --------------------------------------------#
sample_names <- c(
  "abh1.C_1", "abh1.C_2", "abh1.C_3",
  "WT.C_1", "WT.C_2", "WT.C_3"
)
bam_files <- c(
  "S16-5C_Aligned.sortedByCoord.out.bam",
  "S33-9C_Aligned.sortedByCoord.out.bam",
  "S35-11C_Aligned.sortedByCoord.out.bam",
  "S6-3C_Aligned.sortedByCoord.out.bam",
  "S8-4C_Aligned.sortedByCoord.out.bam",
  "S12-10C_Aligned.sortedByCoord.out.bam"
)
sample_info <- data.frame(
  sample = sample_names,
  condition = c(rep("abh1.C", 3), rep("WT.C", 3)),
  replicate = c(1:3, 1:3),
  # vapply() keeps the BAM file names as element names (as sapply() did) but
  # always returns a character vector.
  bam = vapply(
    bam_files,
    function(bam) file.path(working_directory, bam),
    character(1)
  ),
  coverage = file.path(working_directory, paste0(sample_names, ".rds"))
)

#- save sample information table ----------------------------------------------#
sample_info_path <- file.path(working_directory, "sample_info.txt")
write.table(sample_info, sample_info_path)

#- display sample information table -------------------------------------------#
knitr::kable(sample_info, row.names = FALSE)

# Sequence-length table: X1 is used as the sequence id, X2 as its end coordinate.
genome <- read_tsv(
  "~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len",
  col_names = FALSE
)
genome <- subset(genome, X1 != "Mt" & X1 != "Pt")

### setup comparisons and loop for each chromosome
chromosomes <- unique(genome$X1)
# One slot per chromosome; bound together once after the loop.
der_list <- vector("list", length(chromosomes))

for (k in seq_along(chromosomes)) {
  seqid <- chromosomes[[k]]
  # Named `chrom_end` rather than `stop` so base::stop() is not masked.
  chrom_end <- genome$X2[genome$X1 == seqid]

  ## load data
  data <- loadData(
    sampleInfo = sample_info_path,
    locus = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
    referenceCondition = "WT.C",
    isPairedEnd = TRUE,
    readLength = 150,
    coverageType = "fivePrime",
    stranded = TRUE,
    strandSpecific = 1,
    fromBam = TRUE,
    nbThreads = nb_threads,
    verbose = TRUE
  )

  ## Changepoint detection to define segments
  SExp <- segmentation(
    data = data,
    weightType = "unweighted", #zeroInflated : low counts have less weight
    modelSelectionType = "yao",
    featureCountsType = "fromBam",
    compressed = TRUE,
    alpha = 2,
    segmentNeighborhood = FALSE,
    Kmax = NULL,
    verbose = FALSE,
    nbThreadsGridSearch = 1,
    alphas = NULL,
    gridSearch = FALSE,
    outputDirectory = working_directory,
    nbThreadsFeatureCounts = nb_threads,
    strandSpecific = 1,
    read2pos = 5,
    isPairedEnd = TRUE
  )

  # Keep only short segments (width of at most 10) for testing.
  segment_widths <- as.data.frame(SummarizedExperiment::rowRanges(SExp))$width
  SExp_10 <- SExp[segment_widths < 11, ]

  dds <- dea(
    data = data,
    SExp = SExp_10,
    design = ~condition,
    predicate = NULL,
    significanceLevel = 0.01,
    orderBy = "pvalue",
    verbose = TRUE
  )

  #- extract DERs based on significance ---------------------------------------#
  DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
  der_list[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
}

# Bind once instead of growing a data frame with rbind() inside the loop.
out_DERs <- do.call(rbind, der_list)

#clear memory cache
gc()

# derId is rebuilt from the "_"-separated fields of featureId as f1:f2-f3.f4
# (presumably seqid:start-end.strand -- confirm against DiffSegR's featureId).
out_DERs <- mutate(
  out_DERs,
  derId = vapply(
    strsplit(featureId, "_"),
    function(l) paste0(l[1], ":", l[2], "-", l[3], ".", l[4]),
    character(1)
  )
)
out <- select(
  out_DERs,
  seqnames, start, end, derId, baseMean, strand, baseVar, log2FoldChange, padj
)
out <- subset(out, baseMean > 10)

# NOTE(review): the unstranded 5p script for this comparison writes the same
# file name; run them in separate directories or one result is overwritten.
write_tsv(out, "abh1-CvsWT-C_DERs_5p.bed", col_names = FALSE)

-------------------------------------------------------------------------------- /project_workflows/diffsegR_v1/diffsegR_abh1-NvsWT-N_gmuct_3p_v1.r: --------------------------------------------------------------------------------
#!/usr/bin/env Rscript

## Installation
#### Conda
#conda create --name diffseg_v1
#conda install -n diffseg_v1 -c conda-forge r-tidyverse
#conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
#conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
#conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
#conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
#conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
#conda install -n diffseg_v1 -c conda-forge r-remotes
#conda activate diffseg_v1

#### R
#remotes::install_github("sanssouci-org/sanssouci")
#remotes::install_github("aLiehrmann/DiffSegR@f657435")

# Unstranded DiffSegR analysis of 3' read ends: abh1.N vs WT.N (reference).
# Segments each nuclear chromosome, tests segments of width <= 10 for
# differential signal and writes the significant ones to a BED-like TSV.

library(tidyverse)
library(DiffSegR)

# Thread count handed to loadData() and to featureCounts in segmentation().
nb_threads <- 10
# BAM inputs, sample sheet and DiffSegR outputs all live in the launch directory.
working_directory <- getwd()

#- create sample information table --------------------------------------------#
sample_names <- c(
  "abh1.N_1", "abh1.N_2", "abh1.N_3",
  "WT.N_1", "WT.N_2", "WT.N_3"
)
bam_files <- c(
  "S15-5N_Aligned.sortedByCoord.out.bam",
  "S32-9N_Aligned.sortedByCoord.out.bam",
  "S34-11N_Aligned.sortedByCoord.out.bam",
  "S5-3N_Aligned.sortedByCoord.out.bam",
  "S7-4N_Aligned.sortedByCoord.out.bam",
  "S11-10N_Aligned.sortedByCoord.out.bam"
)
sample_info <- data.frame(
  sample = sample_names,
  condition = c(rep("abh1.N", 3), rep("WT.N", 3)),
  replicate = c(1:3, 1:3),
  # vapply() keeps the BAM file names as element names (as sapply() did) but
  # always returns a character vector.
  bam = vapply(
    bam_files,
    function(bam) file.path(working_directory, bam),
    character(1)
  ),
  coverage = file.path(working_directory, paste0(sample_names, ".rds"))
)

#- save sample information table ----------------------------------------------#
sample_info_path <- file.path(working_directory, "sample_info.txt")
write.table(sample_info, sample_info_path)

#- display sample information table -------------------------------------------#
knitr::kable(sample_info, row.names = FALSE)

# Sequence-length table: X1 is used as the sequence id, X2 as its end coordinate.
genome <- read_tsv(
  "~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len",
  col_names = FALSE
)
genome <- subset(genome, X1 != "Mt" & X1 != "Pt")

### setup comparisons and loop for each chromosome
chromosomes <- unique(genome$X1)
# One slot per chromosome; bound together once after the loop.
der_list <- vector("list", length(chromosomes))

for (k in seq_along(chromosomes)) {
  seqid <- chromosomes[[k]]
  # Named `chrom_end` rather than `stop` so base::stop() is not masked.
  chrom_end <- genome$X2[genome$X1 == seqid]

  ## load data
  data <- loadData(
    sampleInfo = sample_info_path,
    locus = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
    referenceCondition = "WT.N",
    isPairedEnd = TRUE,
    readLength = 150,
    coverageType = "threePrime",
    stranded = FALSE,
    strandSpecific = 0,
    fromBam = TRUE,
    nbThreads = nb_threads,
    verbose = TRUE
  )

  ## Changepoint detection to define segments
  SExp <- segmentation(
    data = data,
    weightType = "unweighted", #zeroInflated : low counts have less weight
    modelSelectionType = "yao",
    featureCountsType = "fromBam",
    compressed = TRUE,
    alpha = 2,
    segmentNeighborhood = FALSE,
    Kmax = NULL,
    verbose = FALSE,
    nbThreadsGridSearch = 1,
    alphas = NULL,
    gridSearch = FALSE,
    outputDirectory = working_directory,
    nbThreadsFeatureCounts = nb_threads,
    strandSpecific = 0,
    read2pos = 3,
    isPairedEnd = TRUE
  )

  # Keep only short segments (width of at most 10) for testing.
  segment_widths <- as.data.frame(SummarizedExperiment::rowRanges(SExp))$width
  SExp_10 <- SExp[segment_widths < 11, ]

  dds <- dea(
    data = data,
    SExp = SExp_10,
    design = ~condition,
    predicate = NULL,
    significanceLevel = 0.01,
    verbose = TRUE
  )

  #- extract DERs based on significance ---------------------------------------#
  DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
  der_list[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
}

# Bind once instead of growing a data frame with rbind() inside the loop.
out_DERs <- do.call(rbind, der_list)

#clear memory cache
gc()

# derId is rebuilt from the "_"-separated fields of featureId as f1:f2-f3
# (presumably seqid:start-end -- confirm against DiffSegR's featureId format).
out_DERs <- mutate(
  out_DERs,
  derId = vapply(
    strsplit(featureId, "_"),
    function(l) paste0(l[1], ":", l[2], "-", l[3]),
    character(1)
  )
)
out <- select(
  out_DERs,
  seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj
)
out <- subset(out, baseMean > 10)

# NOTE(review): the stranded 3p script for this comparison writes the same
# file name; run them in separate directories or one result is overwritten.
write_tsv(out, "abh1-NvsWT-N_DERs_3p.bed", col_names = FALSE)

-------------------------------------------------------------------------------- /project_workflows/diffsegR_v1/diffsegR_abh1-NvsWT-N_gmuct_3p_v1_stranded.r: --------------------------------------------------------------------------------
#!/usr/bin/env Rscript

## Installation
#### Conda
#conda create --name diffseg_v1
#conda install -n diffseg_v1 -c conda-forge r-tidyverse
#conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
#conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
#conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
#conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
#conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
#conda install -n diffseg_v1 -c conda-forge r-remotes
#conda activate diffseg_v1

#### R
#remotes::install_github("sanssouci-org/sanssouci")
#remotes::install_github("aLiehrmann/DiffSegR@f657435")

library(tidyverse)
library(DiffSegR)
# Strand-specific DiffSegR analysis of 3' read ends: abh1.N vs WT.N (reference).
# Segments each nuclear chromosome, tests segments of width <= 10 for
# differential signal and writes the significant ones to a BED-like TSV.

# Thread count handed to loadData() and to featureCounts in segmentation().
nb_threads <- 10
# BAM inputs, sample sheet and DiffSegR outputs all live in the launch directory.
working_directory <- getwd()

#- create sample information table --------------------------------------------#
sample_names <- c(
  "abh1.N_1", "abh1.N_2", "abh1.N_3",
  "WT.N_1", "WT.N_2", "WT.N_3"
)
bam_files <- c(
  "S15-5N_Aligned.sortedByCoord.out.bam",
  "S32-9N_Aligned.sortedByCoord.out.bam",
  "S34-11N_Aligned.sortedByCoord.out.bam",
  "S5-3N_Aligned.sortedByCoord.out.bam",
  "S7-4N_Aligned.sortedByCoord.out.bam",
  "S11-10N_Aligned.sortedByCoord.out.bam"
)
sample_info <- data.frame(
  sample = sample_names,
  condition = c(rep("abh1.N", 3), rep("WT.N", 3)),
  replicate = c(1:3, 1:3),
  # vapply() keeps the BAM file names as element names (as sapply() did) but
  # always returns a character vector.
  bam = vapply(
    bam_files,
    function(bam) file.path(working_directory, bam),
    character(1)
  ),
  coverage = file.path(working_directory, paste0(sample_names, ".rds"))
)

#- save sample information table ----------------------------------------------#
sample_info_path <- file.path(working_directory, "sample_info.txt")
write.table(sample_info, sample_info_path)

#- display sample information table -------------------------------------------#
knitr::kable(sample_info, row.names = FALSE)

# Sequence-length table: X1 is used as the sequence id, X2 as its end coordinate.
genome <- read_tsv(
  "~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len",
  col_names = FALSE
)
genome <- subset(genome, X1 != "Mt" & X1 != "Pt")

### setup comparisons and loop for each chromosome
chromosomes <- unique(genome$X1)
# One slot per chromosome; bound together once after the loop.
der_list <- vector("list", length(chromosomes))

for (k in seq_along(chromosomes)) {
  seqid <- chromosomes[[k]]
  # Named `chrom_end` rather than `stop` so base::stop() is not masked.
  chrom_end <- genome$X2[genome$X1 == seqid]

  ## load data
  data <- loadData(
    sampleInfo = sample_info_path,
    locus = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
    referenceCondition = "WT.N",
    isPairedEnd = TRUE,
    readLength = 150,
    coverageType = "threePrime",
    stranded = TRUE,
    strandSpecific = 1,
    fromBam = TRUE,
    nbThreads = nb_threads,
    verbose = TRUE
  )

  ## Changepoint detection to define segments
  SExp <- segmentation(
    data = data,
    weightType = "unweighted", #zeroInflated : low counts have less weight
    modelSelectionType = "yao",
    featureCountsType = "fromBam",
    compressed = TRUE,
    alpha = 2,
    segmentNeighborhood = FALSE,
    Kmax = NULL,
    verbose = FALSE,
    nbThreadsGridSearch = 1,
    alphas = NULL,
    gridSearch = FALSE,
    outputDirectory = working_directory,
    nbThreadsFeatureCounts = nb_threads,
    strandSpecific = 1,
    read2pos = 3,
    isPairedEnd = TRUE
  )

  # Keep only short segments (width of at most 10) for testing.
  segment_widths <- as.data.frame(SummarizedExperiment::rowRanges(SExp))$width
  SExp_10 <- SExp[segment_widths < 11, ]

  dds <- dea(
    data = data,
    SExp = SExp_10,
    design = ~condition,
    predicate = NULL,
    significanceLevel = 0.01,
    verbose = TRUE
  )

  #- extract DERs based on significance ---------------------------------------#
  DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
  der_list[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
}

# Bind once instead of growing a data frame with rbind() inside the loop.
out_DERs <- do.call(rbind, der_list)

#clear memory cache
gc()

# derId is rebuilt from the "_"-separated fields of featureId as f1:f2-f3.f4
# (presumably seqid:start-end.strand -- confirm against DiffSegR's featureId).
out_DERs <- mutate(
  out_DERs,
  derId = vapply(
    strsplit(featureId, "_"),
    function(l) paste0(l[1], ":", l[2], "-", l[3], ".", l[4]),
    character(1)
  )
)
out <- select(
  out_DERs,
  seqnames, start, end, derId, baseMean, strand, baseVar, log2FoldChange, padj
)
out <- subset(out, baseMean > 10)

# NOTE(review): the unstranded 3p script for this comparison writes the same
# file name; run them in separate directories or one result is overwritten.
write_tsv(out, "abh1-NvsWT-N_DERs_3p.bed", col_names = FALSE)

-------------------------------------------------------------------------------- /project_workflows/diffsegR_v1/diffsegR_abh1-NvsWT-N_gmuct_5p_v1.r: --------------------------------------------------------------------------------
#!/usr/bin/env Rscript

## Installation
#### Conda
#conda create --name diffseg_v1
#conda install -n diffseg_v1 -c conda-forge r-tidyverse
#conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
#conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
#conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
#conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
#conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
#conda install -n diffseg_v1 -c conda-forge r-remotes
#conda activate diffseg_v1

#### R
#remotes::install_github("sanssouci-org/sanssouci")
#remotes::install_github("aLiehrmann/DiffSegR@f657435")

# Unstranded DiffSegR analysis of 5' read ends: abh1.N vs WT.N (reference).
# Segments each nuclear chromosome, tests segments of width <= 10 for
# differential signal and writes the significant ones to a BED-like TSV.

library(tidyverse)
library(DiffSegR)

# Thread count handed to loadData() and to featureCounts in segmentation().
nb_threads <- 10
# BAM inputs, sample sheet and DiffSegR outputs all live in the launch directory.
working_directory <- getwd()

#- create sample information table --------------------------------------------#
sample_names <- c(
  "abh1.N_1", "abh1.N_2", "abh1.N_3",
  "WT.N_1", "WT.N_2", "WT.N_3"
)
bam_files <- c(
  "S15-5N_Aligned.sortedByCoord.out.bam",
  "S32-9N_Aligned.sortedByCoord.out.bam",
  "S34-11N_Aligned.sortedByCoord.out.bam",
  "S5-3N_Aligned.sortedByCoord.out.bam",
  "S7-4N_Aligned.sortedByCoord.out.bam",
  "S11-10N_Aligned.sortedByCoord.out.bam"
)
sample_info <- data.frame(
  sample = sample_names,
  condition = c(rep("abh1.N", 3), rep("WT.N", 3)),
  replicate = c(1:3, 1:3),
  # vapply() keeps the BAM file names as element names (as sapply() did) but
  # always returns a character vector.
  bam = vapply(
    bam_files,
    function(bam) file.path(working_directory, bam),
    character(1)
  ),
  coverage = file.path(working_directory, paste0(sample_names, ".rds"))
)

#- save sample information table ----------------------------------------------#
sample_info_path <- file.path(working_directory, "sample_info.txt")
write.table(sample_info, sample_info_path)

#- display sample information table -------------------------------------------#
knitr::kable(sample_info, row.names = FALSE)

# Sequence-length table: X1 is used as the sequence id, X2 as its end coordinate.
genome <- read_tsv(
  "~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len",
  col_names = FALSE
)
genome <- subset(genome, X1 != "Mt" & X1 != "Pt")

### setup comparisons and loop for each chromosome
chromosomes <- unique(genome$X1)
# One slot per chromosome; bound together once after the loop.
der_list <- vector("list", length(chromosomes))

for (k in seq_along(chromosomes)) {
  seqid <- chromosomes[[k]]
  # Named `chrom_end` rather than `stop` so base::stop() is not masked.
  chrom_end <- genome$X2[genome$X1 == seqid]

  ## load data
  data <- loadData(
    sampleInfo = sample_info_path,
    locus = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
    referenceCondition = "WT.N",
    isPairedEnd = TRUE,
    readLength = 150,
    coverageType = "fivePrime",
    stranded = FALSE,
    strandSpecific = 0,
    fromBam = TRUE,
    nbThreads = nb_threads,
    verbose = TRUE
  )

  ## Changepoint detection to define segments
  SExp <- segmentation(
    data = data,
    weightType = "unweighted", #zeroInflated : low counts have less weight
    modelSelectionType = "yao",
    featureCountsType = "fromBam",
    compressed = TRUE,
    alpha = 2,
    segmentNeighborhood = FALSE,
    Kmax = NULL,
    verbose = FALSE,
    nbThreadsGridSearch = 1,
    alphas = NULL,
    gridSearch = FALSE,
    outputDirectory = working_directory,
    nbThreadsFeatureCounts = nb_threads,
    strandSpecific = 0,
    read2pos = 5,
    isPairedEnd = TRUE
  )

  # Keep only short segments (width of at most 10) for testing.
  segment_widths <- as.data.frame(SummarizedExperiment::rowRanges(SExp))$width
  SExp_10 <- SExp[segment_widths < 11, ]

  dds <- dea(
    data = data,
    SExp = SExp_10,
    design = ~condition,
    predicate = NULL,
    significanceLevel = 0.01,
    orderBy = "pvalue",
    verbose = TRUE
  )

  #- extract DERs based on significance ---------------------------------------#
  DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
  der_list[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
}

# Bind once instead of growing a data frame with rbind() inside the loop.
out_DERs <- do.call(rbind, der_list)

#clear memory cache
gc()

# derId is rebuilt from the "_"-separated fields of featureId as f1:f2-f3
# (presumably seqid:start-end -- confirm against DiffSegR's featureId format).
out_DERs <- mutate(
  out_DERs,
  derId = vapply(
    strsplit(featureId, "_"),
    function(l) paste0(l[1], ":", l[2], "-", l[3]),
    character(1)
  )
)
out <- select(
  out_DERs,
  seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj
)
out <- subset(out, baseMean > 10)

# NOTE(review): the stranded 5p script for this comparison writes the same
# file name; run them in separate directories or one result is overwritten.
write_tsv(out, "abh1-NvsWT-N_DERs_5p.bed", col_names = FALSE)

--------------------------------------------------------------------------------
/project_workflows/diffsegR_v1/diffsegR_abh1-NvsWT-N_gmuct_5p_v1_stranded.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Installation 4 | #### Conda 5 | #conda create --name diffseg_v1 6 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse 7 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2 8 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread 9 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer 10 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats 11 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats 12 | #conda install -n diffseg_v1 -c conda-forge r-remotes 13 | #conda activate diffseg_v1 14 | 15 | #### R 16 | #remotes::install_github("sanssouci-org/sanssouci") 17 | #remotes::install_github("aLiehrmann/DiffSegR@f657435") 18 | 19 | library(tidyverse) 20 | library(DiffSegR) 21 | nb_threads = 10 22 | working_directory <- getwd() 23 | 24 | #- create sample information table --------------------------------------------# 25 | sample_info <- data.frame( 26 | sample = c("abh1.N_1","abh1.N_2","abh1.N_3","WT.N_1","WT.N_2","WT.N_3"), 27 | condition = c(rep("abh1.N", 3), rep( "WT.N", 3)), 28 | replicate = c(1:3,1:3), 29 | bam = sapply( 30 | c("S15-5N_Aligned.sortedByCoord.out.bam", "S32-9N_Aligned.sortedByCoord.out.bam", "S34-11N_Aligned.sortedByCoord.out.bam", 31 | "S5-3N_Aligned.sortedByCoord.out.bam", "S7-4N_Aligned.sortedByCoord.out.bam", "S11-10N_Aligned.sortedByCoord.out.bam"), 32 | function(bam) file.path(working_directory, bam)), 33 | coverage = file.path( 34 | working_directory, 35 | paste0(c("abh1.N_1","abh1.N_2","abh1.N_3","WT.N_1", "WT.N_2", "WT.N_3"), ".rds"))) 36 | 37 | #- save sample information table ----------------------------------------------# 38 | write.table(sample_info, file.path(working_directory, "sample_info.txt")) 39 | 40 | #- display sample information table 
-------------------------------------------# 41 | knitr::kable(sample_info, row.names = FALSE) 42 | 43 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names=F) 44 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt") 45 | 46 | ### setup comparisons and loop for each chromosome 47 | out_DERs <- NULL 48 | 49 | for(i in unique(genome$X1)){ 50 | chr <- paste(i) 51 | stop <- genome$X2[genome$X1==i] 52 | 53 | ## load data 54 | data <- loadData( 55 | sampleInfo = file.path(working_directory,"sample_info.txt"), 56 | locus = list(seqid = i, chromStart = 1, chromEnd = stop), 57 | referenceCondition = "WT.N", 58 | isPairedEnd = TRUE, 59 | readLength = 150, 60 | coverageType = "fivePrime", 61 | stranded = TRUE, 62 | strandSpecific = 1, 63 | fromBam = TRUE, 64 | nbThreads = nb_threads, 65 | verbose = TRUE, 66 | ) 67 | 68 | ## Changepoint detection to define segments 69 | SExp <- segmentation( 70 | data = data, 71 | weightType = "unweighted", #zeroInflated : low counts have less weight 72 | modelSelectionType = "yao", 73 | featureCountsType = "fromBam", 74 | compressed = TRUE, 75 | alpha = 2, 76 | segmentNeighborhood = FALSE, 77 | Kmax = NULL, 78 | verbose = FALSE, 79 | nbThreadsGridSearch = 1, 80 | alphas = NULL, 81 | gridSearch = FALSE, 82 | outputDirectory = working_directory, 83 | nbThreadsFeatureCounts = nb_threads, 84 | strandSpecific = 1, 85 | read2pos = 5, 86 | isPairedEnd = TRUE 87 | ) 88 | 89 | SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,] 90 | 91 | dds <- dea( 92 | data = data, 93 | SExp = SExp_10, 94 | design = ~condition, 95 | predicate = NULL, 96 | significanceLevel = 0.01, 97 | orderBy = "pvalue", 98 | verbose = TRUE 99 | ) 100 | 101 | #- extract DERs based on signifiance ----------------------------------------# 102 | DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses,] 103 | DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs)) 104 | 105 | out_DERs <- 
rbind(out_DERs,DERs) 106 | 107 | } 108 | 109 | #clear memory cache 110 | gc() 111 | 112 | out_DERs <- mutate(out_DERs, derId = vapply(strsplit(featureId, "_"), function(l) paste0(l[1],":",l[2],"-",l[3],".",l[4]), character(1))) 113 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, strand, baseVar, log2FoldChange, padj) 114 | out <- subset(out, baseMean > 10) 115 | 116 | write_tsv(out, "abh1-NvsWT-N_DERs_5p.bed", col_names = FALSE) 117 | 118 | -------------------------------------------------------------------------------- /project_workflows/diffsegR_v1/diffsegR_dxo1-NvsC_gmuct_3p_v1.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Installation 4 | #### Conda 5 | #conda create --name diffseg_v1 6 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse 7 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2 8 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread 9 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer 10 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats 11 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats 12 | #conda install -n diffseg_v1 -c conda-forge r-remotes 13 | #conda activate diffseg_v1 14 | 15 | #### R 16 | #remotes::install_github("sanssouci-org/sanssouci") 17 | #remotes::install_github("aLiehrmann/DiffSegR@f657435") 18 | 19 | library(tidyverse) 20 | library(DiffSegR) 21 | nb_threads <- 10 22 | working_directory <- getwd() 23 | 24 | #- create sample information table --------------------------------------------# 25 | sample_info <- data.frame( 26 | sample = c("dxo1.N_1","dxo1.N_2","dxo1.N_3","dxo1.C_1","dxo1.C_2","dxo1.C_3"), 27 | condition = c(rep("dxo1.N", 3), rep( "dxo1.C", 3)), 28 | replicate = c(1:3,1:3), 29 | bam = sapply( 30 | c("S2-2N_Aligned.sortedByCoord.out.bam", "S17-6N_Aligned.sortedByCoord.out.bam", "S13-19N_Aligned.sortedByCoord.out.bam", 31 | 
"S23-2C_Aligned.sortedByCoord.out.bam", "S18-6C_Aligned.sortedByCoord.out.bam", "S14-19C_Aligned.sortedByCoord.out.bam"), 32 | function(bam) file.path(working_directory, bam)), 33 | coverage = file.path( 34 | working_directory, 35 | paste0(c("dxo1.N_1","dxo1.N_2","dxo1.N_3","dxo1.C_1", "dxo1.C_2", "dxo1.C_3"), ".rds"))) 36 | 37 | #- save sample information table ----------------------------------------------# 38 | write.table(sample_info, file.path(working_directory, "sample_info.txt")) 39 | 40 | #- display sample information table -------------------------------------------# 41 | knitr::kable(sample_info, row.names = FALSE) 42 | 43 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE) 44 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt") 45 | 46 | ### setup comparisons and loop for each chromosome 47 | out_DERs <- NULL 48 | 49 | for(i in unique(genome$X1)){ 50 | chr <- paste(i) 51 | stop <- genome$X2[genome$X1==i] 52 | 53 | ## load data 54 | data <- loadData( 55 | sampleInfo = file.path(working_directory,"sample_info.txt"), 56 | locus = list(seqid = i, chromStart = 1, chromEnd = stop), 57 | referenceCondition = "dxo1.C", 58 | isPairedEnd = TRUE, 59 | readLength = 150, 60 | coverageType = "threePrime", 61 | stranded = FALSE, 62 | strandSpecific = 0, 63 | fromBam = TRUE, 64 | nbThreads = nb_threads, 65 | verbose = TRUE 66 | ) 67 | 68 | ## Changepoint detection to define segments 69 | SExp <- segmentation( 70 | data = data, 71 | weightType = "unweighted", #zeroInflated : low counts have less weight 72 | modelSelectionType = "yao", 73 | featureCountsType = "fromBam", 74 | compressed = TRUE, 75 | alpha = 2, 76 | segmentNeighborhood = FALSE, 77 | Kmax = NULL, 78 | verbose = FALSE, 79 | nbThreadsGridSearch = 1, 80 | alphas = NULL, 81 | gridSearch = FALSE, 82 | outputDirectory = working_directory, 83 | nbThreadsFeatureCounts = nb_threads, 84 | strandSpecific = 0, 85 | read2pos = 3, 86 | isPairedEnd = TRUE 87 | ) 88 | 89 | 
SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,] # keep only segments narrower than 11 nt 90 | 91 | dds <- dea( 92 | data = data, 93 | SExp = SExp_10, 94 | design = ~condition, 95 | predicate = NULL, 96 | significanceLevel = 0.01, 97 | verbose = TRUE 98 | ) 99 | 100 | #- extract DERs based on significance ---------------------------------------# 101 | DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses,] 102 | DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs)) 103 | 104 | out_DERs <- rbind(out_DERs,DERs) 105 | 106 | } 107 | 108 | #clear memory cache 109 | gc() 110 | 111 | out_DERs <- mutate(out_DERs, derId = vapply(strsplit(featureId, "_"), function(l) paste0(l[1],":",l[2],"-",l[3]), character(1))) 112 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj) 113 | out <- subset(out, baseMean > 10) 114 | 115 | write_tsv(out, "dxo1-N_DERs_3p.bed", col_names = FALSE) 116 | 117 | -------------------------------------------------------------------------------- /project_workflows/diffsegR_v1/diffsegR_dxo1-NvsC_gmuct_5p_v1.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Installation 4 | #### Conda 5 | #conda create --name diffseg_v1 6 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse 7 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2 8 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread 9 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer 10 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats 11 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats 12 | #conda install -n diffseg_v1 -c conda-forge r-remotes 13 | #conda activate diffseg_v1 14 | 15 | #### R 16 | #remotes::install_github("sanssouci-org/sanssouci") 17 | #remotes::install_github("aLiehrmann/DiffSegR@f657435") 18 | 19 | library(tidyverse) 20 | library(DiffSegR) 21 | nb_threads <- 10 22 | working_directory <- 
getwd() 23 | 24 | #- create sample information table --------------------------------------------# 25 | sample_info <- data.frame( 26 | sample = c("dxo1.N_1","dxo1.N_2","dxo1.N_3","dxo1.C_1","dxo1.C_2","dxo1.C_3"), 27 | condition = c(rep("dxo1.N", 3), rep( "dxo1.C", 3)), 28 | replicate = c(1:3,1:3), 29 | bam = sapply( 30 | c("S2-2N_Aligned.sortedByCoord.out.bam", "S17-6N_Aligned.sortedByCoord.out.bam", "S13-19N_Aligned.sortedByCoord.out.bam", 31 | "S23-2C_Aligned.sortedByCoord.out.bam", "S18-6C_Aligned.sortedByCoord.out.bam", "S14-19C_Aligned.sortedByCoord.out.bam"), 32 | function(bam) file.path(working_directory, bam)), 33 | coverage = file.path( 34 | working_directory, 35 | paste0(c("dxo1.N_1","dxo1.N_2","dxo1.N_3","dxo1.C_1", "dxo1.C_2", "dxo1.C_3"), ".rds"))) 36 | 37 | #- save sample information table ----------------------------------------------# 38 | write.table(sample_info, file.path(working_directory, "sample_info.txt")) 39 | 40 | #- display sample information table -------------------------------------------# 41 | knitr::kable(sample_info, row.names = FALSE) 42 | 43 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE) 44 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt") 45 | 46 | ### setup comparisons and loop for each chromosome 47 | out_DERs <- NULL 48 | 49 | for(i in unique(genome$X1)){ 50 | chr <- paste(i) 51 | stop <- genome$X2[genome$X1==i] 52 | 53 | ## load data 54 | data <- loadData( 55 | sampleInfo = file.path(working_directory,"sample_info.txt"), 56 | locus = list(seqid = i, chromStart = 1, chromEnd = stop), 57 | referenceCondition = "dxo1.C", 58 | isPairedEnd = TRUE, 59 | readLength = 150, 60 | coverageType = "fivePrime", 61 | stranded = FALSE, 62 | strandSpecific = 0, 63 | fromBam = TRUE, 64 | nbThreads = nb_threads, 65 | verbose = TRUE 66 | ) 67 | 68 | ## Changepoint detection to define segments 69 | SExp <- segmentation( 70 | data = data, 71 | weightType = "unweighted", #zeroInflated : low counts 
have less weight 72 | modelSelectionType = "yao", 73 | featureCountsType = "fromBam", 74 | compressed = TRUE, 75 | alpha = 2, 76 | segmentNeighborhood = FALSE, 77 | Kmax = NULL, 78 | verbose = FALSE, 79 | nbThreadsGridSearch = 1, 80 | alphas = NULL, 81 | gridSearch = FALSE, 82 | outputDirectory = working_directory, 83 | nbThreadsFeatureCounts = nb_threads, 84 | strandSpecific = 0, 85 | read2pos = 5, 86 | isPairedEnd = TRUE 87 | ) 88 | 89 | SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,] # keep only segments narrower than 11 nt 90 | 91 | dds <- dea( 92 | data = data, 93 | SExp = SExp_10, 94 | design = ~condition, 95 | predicate = NULL, 96 | significanceLevel = 0.01, 97 | orderBy = "pvalue", 98 | verbose = TRUE 99 | ) 100 | 101 | #- extract DERs based on significance ---------------------------------------# 102 | DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses,] 103 | DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs)) 104 | 105 | out_DERs <- rbind(out_DERs,DERs) 106 | 107 | } 108 | 109 | #clear memory cache 110 | gc() 111 | 112 | out_DERs <- mutate(out_DERs, derId = vapply(strsplit(featureId, "_"), function(l) paste0(l[1],":",l[2],"-",l[3]), character(1))) 113 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj) 114 | out <- subset(out, baseMean > 10) 115 | 116 | write_tsv(out, "dxo1-N_DERs_5p.bed", col_names = FALSE) 117 | 118 | -------------------------------------------------------------------------------- /project_workflows/diffsegR_v2/diffsegR_WT-NvsC_gmuct_3p.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Installation 4 | #### Conda 5 | #conda create --name diffseg 6 | #conda install -n diffseg -c conda-forge r-tidyverse 7 | #conda install -n diffseg -c bioconda bioconductor-deseq2 8 | #conda install -n diffseg -c bioconda bioconductor-rsubread 9 | #conda install -n diffseg -c bioconda bioconductor-rtracklayer 10 | #conda install -n diffseg -c 
bioconda bioconductor-sparsematrixstats 11 | #conda install -n diffseg -c bioconda bioconductor-delayedmatrixstats 12 | #conda install -n diffseg -c conda-forge r-remotes 13 | #conda activate diffseg 14 | 15 | #### R 16 | #remotes::install_github("sanssouci-org/sanssouci") 17 | #remotes::install_github("aLiehrmann/DiffSegR") 18 | 19 | library(tidyverse) 20 | library(DiffSegR) 21 | 22 | ## multi-threading options 23 | nb_threads <- 10 24 | nb_threads_locus <- 10 25 | 26 | working_directory <- getwd() 27 | 28 | #- create sample information table --------------------------------------------# 29 | sample_info <- data.frame( 30 | sample = c("WT.N_1", "WT.N_2", "WT.N_3", "WT.C_1", "WT.C_2", "WT.C_3"), 31 | condition = rep(c("WT.N", "WT.C"), each = 3), 32 | replicate = rep(1:3,2), 33 | bam = sapply( 34 | c("S5-3N_Aligned.sortedByCoord.out.bam", 35 | "S7-4N_Aligned.sortedByCoord.out.bam", 36 | "S11-10N_Aligned.sortedByCoord.out.bam", 37 | "S6-3C_Aligned.sortedByCoord.out.bam", 38 | "S8-4C_Aligned.sortedByCoord.out.bam", 39 | "S12-10C_Aligned.sortedByCoord.out.bam" 40 | ), function(bam) file.path(working_directory, bam)), 41 | isPairedEnd = rep(TRUE, 6), 42 | strandSpecific = rep(0, 6) 43 | ) 44 | 45 | #- display sample information table -------------------------------------------# 46 | knitr::kable(sample_info, row.names = FALSE) 47 | 48 | ## genome file 49 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE) 50 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt") 51 | 52 | ### setup comparisons and loop for each chromosome 53 | out_DERs <- NULL 54 | 55 | for(i in unique(genome$X1)){ 56 | 57 | chr <- paste(i) 58 | stop <- genome$X2[genome$X1==i] 59 | 60 | ## import data on experiment 61 | data <- newExperiment( 62 | sampleInfo = sample_info, 63 | loci = data.frame(seqid = i, chromStart = 1, chromEnd = stop), 64 | referenceCondition = "WT.C", 65 | otherCondition = "WT.N", 66 | nbThreads = nb_threads, 67 | nbThreadsByLocus = 
nb_threads_locus, 68 | coverage = working_directory 69 | ) 70 | 71 | print(data) 72 | 73 | ## generate coverage profile from BAM 74 | coverage(data = data, coverageType = "threePrime", verbose = TRUE) 75 | 76 | ## transform coverage profile into per-base log2-FC and perform changepoint detection to define segments 77 | features <- segmentationLFC( 78 | data = data, 79 | alpha = 2, 80 | modelSelectionType = "yao", 81 | verbose = TRUE 82 | ) 83 | 84 | ## Quantify expression of segments 85 | SExp <- counting( 86 | data = data, 87 | features = features, 88 | featureCountsType = "fromBam", 89 | featureCountsOtherParams = list(read2pos = 3), 90 | verbose = TRUE 91 | ) 92 | 93 | #- subset to segments with width < 11 nt ------------------------------------------------# 94 | SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,] 95 | 96 | # differential expression analysis 97 | dds <- dea( 98 | SExp = SExp_10, 99 | design = ~condition, 100 | significanceLevel = 0.01, 101 | verbose = TRUE 102 | ) 103 | 104 | #- extract DERs based on significance ---------------------------------------# 105 | DERs <- dds[SummarizedExperiment::mcols(dds)$DER,] 106 | DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs)) 107 | 108 | out_DERs <- rbind(out_DERs,DERs) 109 | } 110 | 111 | ## clear memory cache 112 | gc() 113 | 114 | out_DERs <- mutate(out_DERs, derId = vapply(strsplit(featureId, "_"), function(l) paste0(l[1],":",l[2],"-",l[3]), character(1))) 115 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj) 116 | out <- subset(out, baseMean > 10) 117 | 118 | write_tsv(out, "WT-N_DERs_3p.bed", col_names = FALSE) 119 | 120 | 121 | -------------------------------------------------------------------------------- /project_workflows/diffsegR_v2/diffsegR_WT-NvsC_gmuct_5p.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Installation 4 | #### Conda 5 | #conda create --name 
diffseg 6 | #conda install -n diffseg -c conda-forge r-tidyverse 7 | #conda install -n diffseg -c bioconda bioconductor-deseq2 8 | #conda install -n diffseg -c bioconda bioconductor-rsubread 9 | #conda install -n diffseg -c bioconda bioconductor-rtracklayer 10 | #conda install -n diffseg -c bioconda bioconductor-sparsematrixstats 11 | #conda install -n diffseg -c bioconda bioconductor-delayedmatrixstats 12 | #conda install -n diffseg -c conda-forge r-remotes 13 | #conda activate diffseg 14 | #### R 15 | #remotes::install_github("sanssouci-org/sanssouci") 16 | #remotes::install_github("aLiehrmann/DiffSegR") 17 | 18 | library(tidyverse) 19 | library(DiffSegR) 20 | 21 | ## multi-threading options 22 | nb_threads <- 10 23 | nb_threads_locus <- 10 24 | 25 | working_directory <- getwd() 26 | 27 | #- create sample information table --------------------------------------------# 28 | sample_info <- data.frame( 29 | sample = c("WT.N_1", "WT.N_2", "WT.N_3", "WT.C_1", "WT.C_2", "WT.C_3"), 30 | condition = rep(c("WT.N", "WT.C"), each = 3), 31 | replicate = rep(1:3,2), 32 | bam = sapply( 33 | c("S5-3N_Aligned.sortedByCoord.out.bam", 34 | "S7-4N_Aligned.sortedByCoord.out.bam", 35 | "S11-10N_Aligned.sortedByCoord.out.bam", 36 | "S6-3C_Aligned.sortedByCoord.out.bam", 37 | "S8-4C_Aligned.sortedByCoord.out.bam", 38 | "S12-10C_Aligned.sortedByCoord.out.bam" 39 | ), function(bam) file.path(working_directory, bam)), 40 | isPairedEnd = rep(TRUE, 6), 41 | strandSpecific = rep(0, 6) 42 | ) 43 | 44 | #- display sample information table -------------------------------------------# 45 | knitr::kable(sample_info, row.names = FALSE) 46 | 47 | ## genome file 48 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE) 49 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt") 50 | 51 | ### setup comparisons and loop for each chromosome 52 | out_DERs <- NULL 53 | 54 | for(i in unique(genome$X1)){ 55 | 56 | chr <- paste(i) 57 | stop <- genome$X2[genome$X1==i] 58 | 
59 | ## import data on experiment 60 | data <- newExperiment( 61 | sampleInfo = sample_info, 62 | loci = data.frame(seqid = i, chromStart = 1, chromEnd = stop), 63 | referenceCondition = "WT.C", 64 | otherCondition = "WT.N", 65 | nbThreads = nb_threads, 66 | nbThreadsByLocus = nb_threads_locus, 67 | coverage = working_directory 68 | ) 69 | 70 | print(data) 71 | 72 | ## generate coverage profile from BAM 73 | coverage(data = data, coverageType = "fivePrime", verbose = TRUE) 74 | 75 | ## transform coverage profile into per-base log2-FC and perform changepoint detection to define segments 76 | features <- segmentationLFC( 77 | data = data, 78 | alpha = 2, 79 | modelSelectionType = "yao", 80 | verbose = TRUE 81 | ) 82 | 83 | ## Quantify expression of segments 84 | SExp <- counting( 85 | data = data, 86 | features = features, 87 | featureCountsType = "fromBam", 88 | featureCountsOtherParams = list(read2pos = 5), 89 | verbose = TRUE 90 | ) 91 | 92 | #- subset to segments with width < 11 nt --------------------------------------# 93 | SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,] 94 | 95 | 96 | # differential expression analysis 97 | dds <- dea( 98 | SExp = SExp_10, 99 | design = ~condition, 100 | significanceLevel = 0.01, 101 | verbose = TRUE, 102 | predicate = NULL 103 | ) 104 | 105 | #- extract DERs based on significance ---------------------------------------# 106 | DERs <- dds[SummarizedExperiment::mcols(dds)$DER,] 107 | DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs)) 108 | 109 | out_DERs <- rbind(out_DERs,DERs) 110 | } 111 | 112 | ## clear memory cache 113 | gc() 114 | 115 | out_DERs <- mutate(out_DERs, derId = vapply(strsplit(featureId, "_"), function(l) paste0(l[1],":",l[2],"-",l[3]), character(1))) 116 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj) 117 | out <- subset(out, baseMean > 10) 118 | 119 | write_tsv(out, "WT-N_DERs_5p.bed", col_names = FALSE) 120 | 121 | 
-------------------------------------------------------------------------------- /project_workflows/diffsegR_v2/diffsegR_WT-Nvsbulk_gmuct_5p.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Installation 4 | #### Conda 5 | #conda create --name diffseg 6 | #conda install -n diffseg -c conda-forge r-tidyverse 7 | #conda install -n diffseg -c bioconda bioconductor-deseq2 8 | #conda install -n diffseg -c bioconda bioconductor-rsubread 9 | #conda install -n diffseg -c bioconda bioconductor-rtracklayer 10 | #conda install -n diffseg -c bioconda bioconductor-sparsematrixstats 11 | #conda install -n diffseg -c bioconda bioconductor-delayedmatrixstats 12 | #conda activate diffseg 13 | #### R 14 | #remotes::install_github("sanssouci-org/sanssouci") 15 | #remotes::install_github("aLiehrmann/DiffSegR") 16 | 17 | library(tidyverse) 18 | library(DiffSegR) 19 | 20 | ## multi-threading options 21 | nb_threads <- 10 22 | nb_threads_locus <- 10 23 | 24 | working_directory <- getwd() 25 | 26 | #- create sample information table --------------------------------------------# 27 | sample_info <- data.frame( 28 | sample = c("WT.N_1", "WT.N_2", "WT.N_3", "BDG_1", "BDG_2", "HMC_1", "HMC_2"), 29 | condition = c(rep("WT.N", each = 3),rep("BDG", each=2),rep("HMC",each=2)), 30 | replicate = c(1:3,1:2,1:2), 31 | bam = sapply( 32 | c("S5-3N_Aligned.sortedByCoord.out.bam", 33 | "S7-4N_Aligned.sortedByCoord.out.bam", 34 | "S11-10N_Aligned.sortedByCoord.out.bam", 35 | "WT-BDG_rep1_Aligned.sortedByCoord.out.bam", 36 | "WT-BDG_rep2_Aligned.sortedByCoord.out.bam", 37 | "WT-HMC_rep1_Aligned.sortedByCoord.out.bam", 38 | "WT-HMC_rep2_Aligned.sortedByCoord.out.bam"), 39 | function(bam) file.path(working_directory, bam)), 40 | isPairedEnd = c(rep(TRUE, 3),rep(FALSE,4)), 41 | strandSpecific = rep(0, 7) 42 | ) 43 | 44 | #- display sample information table -------------------------------------------# 45 | knitr::kable(sample_info, row.names 
= FALSE) 46 | 47 | ## genome file 48 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE) 49 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt") 50 | 51 | ### setup comparisons and loop for each chromosome 52 | out_DERs <- NULL 53 | 54 | for(i in unique(genome$X1)){ 55 | 56 | chr <- paste(i) 57 | stop <- genome$X2[genome$X1==i] 58 | 59 | ## import data on experiment 60 | data <- newExperiment( 61 | sampleInfo = sample_info, 62 | loci = data.frame(seqid = i, chromStart = 1, chromEnd = stop), 63 | referenceCondition = "BDG", 64 | otherCondition = "WT.N", 65 | nbThreads = nb_threads, 66 | nbThreadsByLocus = nb_threads_locus, 67 | coverage = working_directory 68 | ) 69 | 70 | print(data) 71 | 72 | data2 <- newExperiment( 73 | sampleInfo = sample_info, 74 | loci = data.frame(seqid = i, chromStart = 1, chromEnd = stop), 75 | referenceCondition = "HMC", 76 | otherCondition = "WT.N", 77 | nbThreads = nb_threads, 78 | nbThreadsByLocus = nb_threads_locus, 79 | coverage = working_directory 80 | ) 81 | 82 | print(data2) 83 | 84 | ## generate coverage profile from BAM 85 | coverage(data = data, coverageType = "fivePrime", verbose = TRUE) 86 | 87 | ## transform coverage profile into per-base log2-FC and perform changepoint detection to define segments 88 | features <- segmentationLFC( 89 | data = data, 90 | alpha = 2, 91 | modelSelectionType = "yao", 92 | verbose = TRUE 93 | ) 94 | 95 | ## Quantify expression of segments 96 | SExp <- counting( 97 | data = data, 98 | features = features, 99 | featureCountsType = "fromBam", 100 | featureCountsOtherParams = list(read2pos = 5), 101 | verbose = TRUE 102 | ) 103 | 104 | #- subset to segments with width < 11 nt --------------------------------------# 105 | SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,] 106 | 107 | 108 | # differential expression analysis 109 | dds <- dea( 110 | SExp = SExp_10, 111 | design = ~condition, 112 | significanceLevel = 0.01, 113 | 
verbose = TRUE 114 | ) 115 | 116 | #- extract DERs based on significance ---------------------------------------# 117 | DERs <- dds[SummarizedExperiment::mcols(dds)$DER,] 118 | DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs)) 119 | DERs <- subset(DERs, baseMean > 10) 120 | 121 | out_DERs <- rbind(out_DERs,DERs) 122 | } 123 | 124 | ## clear memory cache 125 | gc() 126 | 127 | out_DERs <- mutate(out_DERs, derId = vapply(strsplit(featureId, "_"), function(l) paste0(l[1],":",l[2],"-",l[3]), character(1))) 128 | out <- select(out_DERs, seqnames, start, end, derId, log2FoldChange, padj, baseMean) 129 | 130 | write_tsv(out, "WT-NvsBDG_5p.bed", col_names = FALSE) 131 | 132 | 133 | -------------------------------------------------------------------------------- /project_workflows/diffsegR_v2/diffsegR_abh1-NvsC_gmuct_5p.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Installation 4 | #### Conda 5 | #conda create --name diffseg 6 | #conda install -n diffseg -c conda-forge r-tidyverse 7 | #conda install -n diffseg -c bioconda bioconductor-deseq2 8 | #conda install -n diffseg -c bioconda bioconductor-rsubread 9 | #conda install -n diffseg -c bioconda bioconductor-rtracklayer 10 | #conda install -n diffseg -c bioconda bioconductor-sparsematrixstats 11 | #conda install -n diffseg -c bioconda bioconductor-delayedmatrixstats 12 | #conda install -n diffseg -c conda-forge r-remotes 13 | #conda activate diffseg 14 | #### R 15 | #remotes::install_github("sanssouci-org/sanssouci") 16 | #remotes::install_github("aLiehrmann/DiffSegR") 17 | 18 | library(tidyverse) 19 | library(DiffSegR) 20 | 21 | ## multi-threading options 22 | nb_threads <- 10 23 | nb_threads_locus <- 10 24 | 25 | working_directory <- getwd() 26 | 27 | #- create sample information table --------------------------------------------# 28 | sample_info <- data.frame( 29 | sample = c("abh1.N_1", "abh1.N_2", "abh1.N_3", "abh1.C_1", "abh1.C_2", "abh1.C_3"), 30 | 
condition = rep(c("abh1.N", "abh1.C"), each = 3), 31 | replicate = rep(1:3,2), 32 | bam = sapply( 33 | c("S15-5N_Aligned.sortedByCoord.out.bam", 34 | "S9-20N_Aligned.sortedByCoord.out.bam", 35 | "S24-34N_Aligned.sortedByCoord.out.bam", 36 | "S16-5C_Aligned.sortedByCoord.out.bam", 37 | "S10-20C_Aligned.sortedByCoord.out.bam", 38 | "S25-34C_Aligned.sortedByCoord.out.bam" 39 | ), function(bam) file.path(working_directory, bam)), 40 | isPairedEnd = rep(TRUE, 6), 41 | strandSpecific = rep(0, 6) 42 | ) 43 | 44 | #- display sample information table -------------------------------------------# 45 | knitr::kable(sample_info, row.names = FALSE) 46 | 47 | ## genome file 48 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE) 49 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt") 50 | 51 | ### setup comparisons and loop for each chromosome 52 | out_DERs <- NULL 53 | 54 | for(i in unique(genome$X1)){ 55 | 56 | chr <- paste(i) 57 | stop <- genome$X2[genome$X1==i] 58 | 59 | ## import data on experiment 60 | data <- newExperiment( 61 | sampleInfo = sample_info, 62 | loci = data.frame(seqid = i, chromStart = 1, chromEnd = stop), 63 | referenceCondition = "abh1.C", 64 | otherCondition = "abh1.N", 65 | nbThreads = nb_threads, 66 | nbThreadsByLocus = nb_threads_locus, 67 | coverage = working_directory 68 | ) 69 | 70 | print(data) 71 | 72 | ## generate coverage profile from BAM 73 | coverage(data = data, coverageType = "fivePrime", verbose = TRUE) 74 | 75 | ## transform coverage profile into per-base log2-FC and perform changepoint detection to define segments 76 | features <- segmentationLFC( 77 | data = data, 78 | alpha = 2, 79 | modelSelectionType = "yao", 80 | verbose = TRUE 81 | ) 82 | 83 | ## Quantify expression of segments 84 | SExp <- counting( 85 | data = data, 86 | features = features, 87 | featureCountsType = "fromBam", 88 | featureCountsOtherParams = list(read2pos = 5), 89 | verbose = TRUE 90 | ) 91 | 92 | #- subset to segments 
with width < 11 nt --------------------------------------# 93 | SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,] 94 | 95 | 96 | # differential expression analysis 97 | dds <- dea( 98 | SExp = SExp_10, 99 | design = ~condition, 100 | significanceLevel = 0.01, 101 | verbose = TRUE, 102 | predicate = NULL 103 | ) 104 | 105 | #- extract DERs based on significance ---------------------------------------# 106 | DERs <- dds[SummarizedExperiment::mcols(dds)$DER,] 107 | DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs)) 108 | 109 | out_DERs <- rbind(out_DERs,DERs) 110 | } 111 | 112 | ## clear memory cache 113 | gc() 114 | 115 | out_DERs <- mutate(out_DERs, derId = vapply(strsplit(featureId, "_"), function(l) paste0(l[1],":",l[2],"-",l[3]), character(1))) 116 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, maxCooks, log2FoldChange, padj) 117 | out$cov <- sqrt(out$baseVar)/out$baseMean 118 | out <- subset(out, baseMean > 10 & cov < 1) 119 | 120 | write_tsv(out, "abh1-N_DERs_5p.bed", col_names = FALSE) 121 | 122 | 123 | -------------------------------------------------------------------------------- /project_workflows/diffsegR_v2/diffsegR_abh1-NvsWT-N_gmuct_3p.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Installation 4 | #### Conda 5 | #conda create --name diffseg 6 | #conda install -n diffseg -c conda-forge r-tidyverse 7 | #conda install -n diffseg -c bioconda bioconductor-deseq2 8 | #conda install -n diffseg -c bioconda bioconductor-rsubread 9 | #conda install -n diffseg -c bioconda bioconductor-rtracklayer 10 | #conda install -n diffseg -c bioconda bioconductor-sparsematrixstats 11 | #conda install -n diffseg -c bioconda bioconductor-delayedmatrixstats 12 | #conda install -n diffseg -c conda-forge r-remotes 13 | #conda activate diffseg 14 | #### R 15 | #remotes::install_github("sanssouci-org/sanssouci") 16 | 
#remotes::install_github("aLiehrmann/DiffSegR") 17 | 18 | library(tidyverse) 19 | library(DiffSegR) 20 | 21 | ## multi-threading options 22 | nb_threads <- 10 23 | nb_threads_locus <- 10 24 | 25 | working_directory <- getwd() 26 | 27 | #- create sample information table --------------------------------------------# 28 | sample_info <- data.frame( 29 | sample = c("abh1.N_1", "abh1.N_2", "abh1.N_3", "WT.N_1", "WT.N_2", "WT.N_3"), 30 | condition = c(rep("abh1.N", 3), rep( "WT.N", 3)), 31 | replicate = c(1:3,1:3), 32 | bam = sapply( 33 | c("S15-5N_Aligned.sortedByCoord.out.bam", 34 | "S9-20N_Aligned.sortedByCoord.out.bam", 35 | "S24-34N_Aligned.sortedByCoord.out.bam", 36 | "S5-3N_Aligned.sortedByCoord.out.bam", 37 | "S7-4N_Aligned.sortedByCoord.out.bam", 38 | "S11-10N_Aligned.sortedByCoord.out.bam"), 39 | function(bam) file.path(working_directory, bam) ), 40 | isPairedEnd = rep(TRUE, 6), 41 | strandSpecific = rep(0, 6) 42 | ) 43 | 44 | #- display sample information table -------------------------------------------# 45 | knitr::kable(sample_info, row.names = FALSE) 46 | 47 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE) 48 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt") 49 | 50 | ### setup comparisons and loop for each chromosome 51 | out_DERs <- NULL 52 | out_segments <- NULL 53 | 54 | for(i in unique(genome$X1)){ 55 | 56 | chr <- paste(i) 57 | stop <- genome$X2[genome$X1==i] 58 | 59 | ## import data on experiment 60 | data <- newExperiment( 61 | sampleInfo = sample_info, 62 | loci = data.frame(seqid = i, chromStart = 1, chromEnd = stop), 63 | referenceCondition = "WT.N", 64 | otherCondition = "abh1.N", 65 | nbThreads = nb_threads, 66 | nbThreadsByLocus = nb_threads_locus, 67 | coverage = working_directory 68 | ) 69 | 70 | print(data) 71 | 72 | ## generate coverage profile from BAM 73 | coverage(data = data, coverageType = "threePrime", verbose = TRUE) 74 | 75 | ## transform coverage profile into per-base log2-FC and 
perform changepoint detection to define segments 76 | features <- segmentationLFC( 77 | data = data, 78 | alpha = 2, 79 | modelSelectionType = "yao", 80 | verbose = TRUE 81 | ) 82 | 83 | ## Quantify expression of segments 84 | SExp <- counting( 85 | data = data, 86 | features = features, 87 | featureCountsType = "fromBam", 88 | featureCountsOtherParams = list(read2pos = 3), 89 | verbose = TRUE 90 | ) 91 | 92 | #- subset to segments with width < 11 nt --------------------------------------# 93 | SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,] 94 | 95 | 96 | # differential expression analysis 97 | dds <- dea( 98 | SExp = SExp_10, 99 | design = ~condition, 100 | significanceLevel = 0.01, 101 | verbose = TRUE 102 | ) 103 | 104 | #- extract DERs based on significance ---------------------------------------# 105 | DERs <- dds[SummarizedExperiment::mcols(dds)$DER,] 106 | DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs)) 107 | DERs <- subset(DERs, baseMean > 10) 108 | 109 | out_DERs <- rbind(out_DERs,DERs) 110 | } 111 | 112 | ## clear memory cache 113 | gc() 114 | 115 | out_DERs <- mutate(out_DERs, derId = vapply(strsplit(featureId, "_"), function(l) paste0(l[1],":",l[2],"-",l[3]), character(1))) 116 | out <- select(out_DERs, seqnames, start, end, derId, log2FoldChange, padj, baseMean) 117 | write_tsv(out, "abh1-NvsWT-N_DERs_3p.bed", col_names = FALSE) 118 | 119 | -------------------------------------------------------------------------------- /project_workflows/diffsegR_v2/diffsegR_abh1-NvsWT-N_gmuct_5p.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Installation 4 | #### Conda 5 | #conda create --name diffseg 6 | #conda install -n diffseg -c conda-forge r-tidyverse 7 | #conda install -n diffseg -c bioconda bioconductor-deseq2 8 | #conda install -n diffseg -c bioconda bioconductor-rsubread 9 | #conda install -n diffseg -c bioconda bioconductor-rtracklayer 10 | #conda install 
-n diffseg -c bioconda bioconductor-sparsematrixstats 11 | #conda install -n diffseg -c bioconda bioconductor-delayedmatrixstats 12 | #conda install -n diffseg -c conda-forge r-remotes 13 | #conda install -n diffseg bioconda::r-scatterplot3d 14 | #conda activate diffseg 15 | #### R 16 | #remotes::install_github("sanssouci-org/sanssouci") 17 | #remotes::install_github("aLiehrmann/DiffSegR") 18 | 19 | library(tidyverse) 20 | library(DiffSegR) 21 | 22 | ## multi-threading options 23 | nb_threads <- 10 24 | nb_threads_locus <- 10 25 | 26 | working_directory <- getwd() 27 | 28 | #- create sample information table --------------------------------------------# 29 | sample_info <- data.frame( 30 | sample = c("abh1.N_1", "abh1.N_2", "abh1.N_3", "WT.N_1", "WT.N_2", "WT.N_3"), 31 | condition = c(rep("abh1.N", 3), rep( "WT.N", 3)), 32 | replicate = c(1:3,1:3), 33 | bam = sapply( 34 | c("S15-5N_Aligned.sortedByCoord.out.bam", 35 | "S9-20N_Aligned.sortedByCoord.out.bam", 36 | "S24-34N_Aligned.sortedByCoord.out.bam", 37 | "S5-3N_Aligned.sortedByCoord.out.bam", 38 | "S7-4N_Aligned.sortedByCoord.out.bam", 39 | "S11-10N_Aligned.sortedByCoord.out.bam"), 40 | function(bam) file.path(working_directory, bam)), 41 | isPairedEnd = rep(TRUE, 6), 42 | strandSpecific = rep(0, 6) 43 | ) 44 | 45 | #- display sample information table -------------------------------------------# 46 | knitr::kable(sample_info, row.names = FALSE) 47 | 48 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE) 49 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt") 50 | 51 | ### setup comparisons and loop for each chromosome 52 | out_DERs <- NULL 53 | out_segments <- NULL 54 | 55 | for(i in unique(genome$X1)){ 56 | 57 | chr <- paste(i) 58 | stop <- genome$X2[genome$X1==i] 59 | 60 | ## import data on experiment 61 | data <- newExperiment( 62 | sampleInfo = sample_info, 63 | loci = data.frame(seqid = i, chromStart = 1, chromEnd = stop), 64 | referenceCondition = "WT.N", 65 | 
otherCondition = "abh1.N", 66 | nbThreads = nb_threads, 67 | nbThreadsByLocus = nb_threads_locus, 68 | coverage = working_directory 69 | ) 70 | 71 | print(data) 72 | 73 | ## generate coverage profile from BAM 74 | coverage(data = data, coverageType = "fivePrime", verbose = TRUE) 75 | 76 | ## transform coverage profile into per-base log2-FC and perform changepoint detection to define segments 77 | features <- segmentationLFC( 78 | data = data, 79 | alpha = 2, 80 | modelSelectionType = "yao", 81 | verbose = TRUE 82 | ) 83 | 84 | ## Quantify expression of segments 85 | SExp <- counting( 86 | data = data, 87 | features = features, 88 | featureCountsType = "fromBam", 89 | featureCountsOtherParams = list(read2pos = 5), 90 | verbose = TRUE 91 | ) 92 | 93 | #- subset to segments with width < 11 nt --------------------------------------# 94 | SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,] 95 | 96 | 97 | # differential expression analysis 98 | dds <- dea( 99 | SExp = SExp_10, 100 | design = ~condition, 101 | significanceLevel = 0.01, 102 | verbose = TRUE 103 | ) 104 | 105 | #- extract DERs based on significance ---------------------------------------# 106 | DERs <- dds[SummarizedExperiment::mcols(dds)$DER,] 107 | DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs)) 108 | DERs <- subset(DERs, baseMean > 10) 109 | 110 | out_DERs <- rbind(out_DERs,DERs) 111 | } 112 | 113 | ## clear memory cache 114 | gc() 115 | 116 | out_DERs <- mutate(out_DERs, derId = vapply(strsplit(featureId, "_"), function(l) paste0(l[1],":",l[2],"-",l[3]), character(1))) 117 | out <- select(out_DERs, seqnames, start, end, derId, log2FoldChange, padj, baseMean) 118 | write_tsv(out, "abh1-NvsWT-N_DERs_5p.bed", col_names = FALSE) 119 | 120 | -------------------------------------------------------------------------------- /project_workflows/lowD_5mC_dendrograms_281019.R: -------------------------------------------------------------------------------- 1 | # making dendrograms for Eichten 
lowD brachy epigenomics 2 | # libraries 3 | library(tidyverse) 4 | suppressPackageStartupMessages(library(dendextend)) 5 | library(pheatmap) 6 | 7 | # read in metadata file 8 | meta <- read.delim("lowD_metadata.txt") %>% 9 | select(SampleID, condition, PlantName, Accession, ClonalGroup) 10 | 11 | my_grps <- data.frame(acc = unique(meta$Accession)) %>% 12 | mutate(Group = meta$ClonalGroup[match(acc, meta$Accession)]) %>% 13 | column_to_rownames("acc") 14 | 15 | # read in tiled 5mC 16 | i <- "CG_alltiles_merged_2017-05-05.txt" 17 | 18 | ## CG = "CG_alltiles_merged_2017-05-05.txt" 19 | ## CHG = "CHG_alltiles_merged.2017-05-04.txt" 20 | # colnames(a)[4:ncol(a)] <- sapply(strsplit(colnames(a)[4:ncol(a)], ".fastq"), function(l) l[1]) 21 | # colnames(a)[4:ncol(a)] <- sapply(strsplit(colnames(a)[4:ncol(a)], ".S"), function(l) l[2]) 22 | # colnames(a)[4:ncol(a)] <- paste0("S",colnames(a)[4:ncol(a)]) 23 | ## CHH = "CHH_alltiles_merged.2017-05-09.txt" 24 | 25 | a <- read.delim(paste(i)) %>% 26 | gather(sample, met, -V1, -V2, -V3) %>% 27 | na.omit() %>% 28 | #mutate(condition = meta$condition[match(sample, meta$SampleID)]) %>% 29 | mutate(acc = meta$Accession[match(sample, meta$SampleID)]) %>% 30 | select(V1, V2, acc, met) 31 | 32 | a <- group_by(a, V1, V2, acc) %>% 33 | summarise(avg_met = mean(met)) 34 | 35 | # memory allocation too great to pipe 36 | a <- spread(a, acc, avg_met) 37 | 38 | ann_colors <- list( 39 | Group = c(Bd21="coral", Clone1="royalblue", Clone2="darkgoldenrod1", 40 | Clone3="darkolivegreen2", Clone4="darkorchid2", Clone5="forestgreen", 41 | Clone6="firebrick1", Clone7="pink", HYB1="coral4", HYB2="bisque2") 42 | ) 43 | 44 | # produce correlation matrix & heatmap 45 | cor_matrix=as.matrix(cor(a[,3:ncol(a)],use='pairwise.complete.obs')) 46 | pheatmap(cor_matrix, 47 | cutree_cols = 3, 48 | cutree_rows = 3, 49 | show_colnames = F, 50 | fontsize_row = 5, 51 | border_color = NA, 52 | annotation_colors = ann_colors, 53 | annotation_row = my_grps) 54 | 55 | 
dev.off()

--------------------------------------------------------------------------------
/project_workflows/lowD_SNP_dendrograms_281019.R:
--------------------------------------------------------------------------------
## check out https://github.com/borevitzlab/brachy-genotyping-notes/blob/master/snprelate.Rmd
## Produce SNP relationships of Turkish brachypodium lines for Eichten et al 2019

library(SNPRelate)
library(tidyverse)

metadata <- read.csv("brachy-metadata.csv")
lowd_names <- read.delim("lowD_GBS_sample_names.txt")

#snpgdsVCF2GDS("freebayes~GBS~lowD.sorted.vcf.gz",
#              "freebayes~GBS~lowD.gds")

geno <- snpgdsOpen("freebayes~GBS~lowD.gds", allow.duplicate = TRUE, readonly = TRUE)
samp <- snpgdsSummary(geno)$sample.id
## Filter out very bad missing data
snps <- snpgdsSelectSNP(geno, missing.rate = 0.999, autosome.only = FALSE)

## Functions for further filtering

## Filter samples, then SNPs, by missing-data rate and minor allele frequency.
##
## geno               open GDS handle
## samps, snps        sample ids / SNP ids to start from
## max.snp.miss.rate  SNP missing-rate threshold passed to snpgdsSelectSNP()
## max.samp.miss.rate samples with a missing rate above this are dropped
## min.maf            minor-allele-frequency threshold passed to snpgdsSelectSNP()
##
## Draws four diagnostic histograms on the current graphics device and prints
## the retained SNP / sample counts.
## Returns list(snps, samps, miss.samp), where miss.samp is the per-sample
## missing rate recomputed on the retained SNPs and samples.
ssp.filt <- function(geno, samps, snps, max.snp.miss.rate = 0.99,
                     max.samp.miss.rate = 0.99, min.maf = 0.001) {
  miss.samp <- snpgdsSampMissRate(geno, snp.id = snps, sample.id = samps)

  hist(miss.samp, breaks = 100, main = "Sample Missing Data (pre-filt)")
  abline(v = max.samp.miss.rate, col = "blue", lwd = 2)

  samps <- samps[miss.samp <= max.samp.miss.rate]

  # per-SNP missing rate and MAF, computed on the retained samples only
  srf <- snpgdsSNPRateFreq(geno, sample.id = samps, snp.id = snps)
  miss.snp <- srf$MissingRate
  hist(miss.snp, breaks = 100, main = "SNP missing data")
  abline(v = max.snp.miss.rate, col = "blue", lwd = 2)

  maf <- srf$MinorFreq
  hist(maf, breaks = 50, main = "SNP MAF")
  abline(v = min.maf, lwd = 2, col = "blue")

  snps <- snpgdsSelectSNP(geno, sample.id = samps, snp.id = snps, maf = min.maf,
                          missing.rate = max.snp.miss.rate, autosome.only = FALSE)

  miss.samp <- snpgdsSampMissRate(geno, snp.id = snps, sample.id = samps)
  hist(miss.samp, breaks = 100, main = "Sample Missing Data (post-filt)")

  print(paste("Num SNPs:", length(snps)))
  print(paste("Num Samples:", length(samps)))
  list(snps = snps, samps = samps, miss.samp = miss.samp)
}

## Identity-by-state matrix for the samples / SNPs kept by ssp.filt().
## `filt` is the list returned by ssp.filt(). Returns the snpgdsIBS() result.
ssp.geno <- function(geno, filt) {
  ibs <- snpgdsIBS(geno, sample.id = filt$samps, snp.id = filt$snps,
                   autosome.only = FALSE, num.thread = 4)
  ibs.nacnt <- rowSums(is.na(ibs$ibs))
  # a bare table() call inside a function is discarded; print it so the
  # per-sample NA counts are actually shown
  print(table(ibs.nacnt))
  ibs
}

filt.dis <- ssp.filt(geno, samp, snps, min.maf = 0.01, max.samp.miss.rate = 0.995,
                     max.snp.miss.rate = 0.95)
dev.off()

dist <- snpgdsDiss(geno, sample.id = filt.dis$samps, snp.id = filt.dis$snps,
                   autosome.only = FALSE)
# relabel anonymised sample ids with accession names
dist$sample.id <- paste(lowd_names$acc[match(dist$sample.id, lowd_names$anon)])
hc.dis.plt <- snpgdsHCluster(dist) %>% snpgdsCutTree(label.H = FALSE, label.Z = FALSE)

pdf("Brachy_SNP_dendro.pdf", pointsize = 4)
snpgdsDrawTree(hc.dis.plt, leaflab = "perpendicular", cex.lab = 0.01)
dev.off()

# release the GDS file handle opened with snpgdsOpen() above
snpgdsClose(geno)

--------------------------------------------------------------------------------
/qPCR/standardize_format_LC480_qPCR_DG.R:
--------------------------------------------------------------------------------
## convert LC480 output to standardized format for analysis

options(stringsAsFactors = FALSE)
library(tidyverse)
library(janitor)

## NOTE(review): hard-coded, machine-specific working directory; outdir below
## is resolved relative to it.
setwd("C://Users/u4667515/Dropbox/Collab_Projects/Covid19/RawData_RSB_v1/")
outdir <- "../RawData_v2_standardised/"

## probe fluorescence files
fls <- dir(pattern = "Probe_LC480_raw-fluorescence")

## iterate over the file names directly so that an empty match skips the loop
## (1:length(fls) would run with fls[1] = NA and fail in read_delim)
for (f in fls) {
  a <- read_delim(f, delim = "\t", skip = 1, col_names = TRUE) %>%
    clean_names() %>%
    select(sample_pos, cycle_number, x483_533, x523_568, x558_610) %>%
    rename(well = sample_pos, cycle = cycle_number) %>%
    gather(fluorescence_name, fluorescence, -well, -cycle) %>%
    mutate(fluorescence_name = ifelse(fluorescence_name == "x483_533", yes = "FAM_483-533",
                                      no = ifelse(fluorescence_name == "x523_568", yes = "HEX_523-568", no = "Red_558-610")))

  ## output path passed positionally: works with both the old `path` and the
  ## newer `file` argument name of write_delim()
  write_delim(a, file.path(outdir, f), delim = "\t")
}

## SYBR fluorescence files
fls <- dir(pattern = "SYBR_LC480_raw-fluorescence")

for (f in fls) {
  a <- read_delim(f, delim = "\t", skip = 1, col_names = TRUE) %>%
    clean_names() %>%
    filter(!(cycle_number == 1 & temp != 59.90)) %>% ## remove melt curve values QC = plot(cycle_number ~ temp, a)
    select(sample_pos, cycle_number, x483_533) %>%
    rename(well = sample_pos, cycle = cycle_number) %>%
    gather(fluorescence_name, fluorescence, -well, -cycle) %>%
    mutate(fluorescence_name = ifelse(fluorescence_name == "x483_533", yes = "SYBR_483-533", no = ""))

  write_delim(a, file.path(outdir, f), delim = "\t")
}

## sample sheet files
fls <- dir(pattern = "sample-sheet")

for (f in fls) {
  a <- read_delim(f, delim = "\t", col_names = TRUE) %>%
    clean_names() %>%
    select(general_pos, general_sample_name, general_filt_comb, general_target_name) %>%
    ## keep an existing target name; otherwise derive it from the filter combination
    mutate(general_target_name = ifelse(!is.na(general_target_name), yes = general_target_name,
                                        no = ifelse(general_filt_comb == "483_533", yes = "FAM_RdRP",
                                                    no = ifelse(general_filt_comb == "523_568", yes = "VIC_human_RP",
                                                                no = "ROX_N")))) %>%
    rename(well = general_pos,
           sample_name = general_sample_name,
           filter = general_filt_comb,
           probe_amplicon = general_target_name)

  write_delim(a, file.path(outdir, f), delim = "\t")
}

## view output files to transfer to LabArchives
dir(outdir)

## this script opens no graphics device itself; an unconditional dev.off()
## errors on the null device (device 1), so only close one if it is open
if (dev.cur() > 1L) {
  dev.off()
}
## NOTE(review): wipes the caller's whole workspace when the script is sourced
rm(list = ls())

--------------------------------------------------------------------------------
/screenrc:
--------------------------------------------------------------------------------
## .screenrc file
## move to home directory and rename ".screenrc"

# Get rid of the startup message
startup_message off

# Stop flashing when I get an alert
vbell off

# Automatically detach if a disconnect occurs
autodetach on

# Think of it as xterm and give me scrolling
termcapinfo xterm-color|xterm|xterms|xs|rxvt ti@:te@

# Tabs along the bottom of the session
caption always "%{= k}%-w%{= G}%n %t%{-}%+w %-= %{= G}%H"

# Start at Window 1
bind c screen 1
bind ^c screen 1
bind 0 select 10
screen 1

# Allow h and l to move between tabs as well
bind h prev
bind ^h prev
bind l next
bind ^l next
--------------------------------------------------------------------------------