├── .DS_Store
├── At_rRNA_AGIs.txt
├── ChIP
    ├── README.md
    └── chip-seq_v0.1.sh
├── DNA
    └── dna_wgs_pipe.v1.sh
├── MethylC
    ├── 100bp_dmr_merge.r
    ├── 100bp_dmrs.v0.1.sh
    ├── 100bp_heatmap.sh
    ├── 100bp_wig_to_dmrs.r
    ├── 5mC_rates.sh
    ├── BS-SNPer.sh
    ├── DSS_file_prep.r
    ├── README.md
    ├── bed_to_rel_dist.sh
    ├── dmr_merge.r
    ├── dss_calling.r
    ├── merge_wigs.r
    ├── met_signatures.sh
    ├── methimpute.r
    ├── pca_wigs.r
    ├── rel_methylation_plots.r
    ├── rel_methylation_plots_v2.r
    ├── scatman_smooth.sh
    ├── smooth_scat.r
    ├── wgbs_cov_to_TDF.sh
    ├── wgbs_custom_bins.sh
    ├── wgbs_pipeline_v0.4.sh
    ├── wgbs_pipeline_v0.5.sh
    ├── wgbs_pipeline_v0.6.sh
    └── wgbs_pipeline_v0.7.sh
├── README.md
├── RNA
    ├── BAM_to_5p_bigWigs.sh
    ├── BAM_to_EJC.sh
    ├── BAM_to_ESI.sh
    ├── BAM_to_STOP.sh
    ├── BAM_to_TSI.sh
    ├── BAM_to_bedgraph.sh
    ├── BAM_to_bedgraph_5p.sh
    ├── BAM_to_bigWig.sh
    ├── BAM_to_wigs.sh
    ├── ESI_calculation.r
    ├── README.md
    ├── STAR_pipe_v1.sh
    ├── SUPPA_pipe_v1.sh
    ├── SUPPA_pipe_v2.sh
    ├── TSI_calculation.r
    ├── featureCounts_to_edgeR.r
    ├── featureCounts_v1.sh
    ├── featureCounts_v2.sh
    ├── featureCounts_v3-gtf.sh
    ├── get_peak_length.sh
    ├── gmuct_pipe_v1.sh
    ├── graft-nad-seq.sh
    ├── kallisto_pipe_v1.sh
    ├── macs_peaks.sh
    ├── nadBAM_to_ADPRC_sites.sh
    ├── pare_pipe_v1.sh
    ├── rel_expression_plots.r
    ├── rel_expression_plots_ejc.r
    ├── rel_expression_plots_nad.r
    ├── rel_expression_plots_stop.r
    ├── smrna_pipe_v1.sh
    ├── split_file.R
    ├── stringtie_extract_tpm.r
    ├── stringtie_pipe_v1.sh
    ├── stringtie_pipe_v2.sh
    ├── subread_pipe_v1.sh
    ├── subread_pipe_v2.sh
    ├── subread_pipe_v3.sh
    ├── total_expression.r
    ├── trim_5p_graft_nad.r
    └── trim_fastq.sh
├── TAIR10_annotation.sh
├── TruSeq-adapters.fa
├── VennPieces.R
├── araport11_assemble.sh
├── average_cov.sh
├── bashrc
├── gene_to_gene_anno.sh
├── pe_insert_size.sh
├── project_workflows
    ├── Exp555_cordycepin_comparison.Rmd
    ├── HL_RNAseq_vs_protein-stability.Rmd
    ├── RNA_protein_alignments.Rmd
    ├── Smith_etal_2022.Rmd
    ├── covid19_qPCR_analysis.Rmd
    ├── covid19_qPCR_analysis.html
    ├── diffsegR_v1
    │   ├── diffsegR_WT-NvsC_gmuct_3p_v1.r
    │   ├── diffsegR_WT-NvsC_gmuct_5p_v1.r
    │   ├── diffsegR_WT-Nvsbulk_gmuct_3p.r
    │   ├── diffsegR_WT-Nvsbulk_gmuct_5p.r
    │   ├── diffsegR_abh1-CvsWT-C_gmuct_3p_v1.r
    │   ├── diffsegR_abh1-CvsWT-C_gmuct_3p_v1_stranded.r
    │   ├── diffsegR_abh1-CvsWT-C_gmuct_5p_v1.r
    │   ├── diffsegR_abh1-CvsWT-C_gmuct_5p_v1_stranded.r
    │   ├── diffsegR_abh1-NvsWT-N_gmuct_3p_v1.r
    │   ├── diffsegR_abh1-NvsWT-N_gmuct_3p_v1_stranded.r
    │   ├── diffsegR_abh1-NvsWT-N_gmuct_5p_v1.r
    │   ├── diffsegR_abh1-NvsWT-N_gmuct_5p_v1_stranded.r
    │   ├── diffsegR_dxo1-NvsC_gmuct_3p_v1.r
    │   ├── diffsegR_dxo1-NvsC_gmuct_5p_v1.r
    │   └── diffsegR_dxo1_gmuct.r
    ├── diffsegR_v2
    │   ├── diffsegR_WT-NvsC_gmuct_3p.r
    │   ├── diffsegR_WT-NvsC_gmuct_5p.r
    │   ├── diffsegR_WT-Nvsbulk_gmuct_5p.r
    │   ├── diffsegR_abh1-NvsC_gmuct_5p.r
    │   ├── diffsegR_abh1-NvsWT-N_gmuct_3p.r
    │   └── diffsegR_abh1-NvsWT-N_gmuct_5p.r
    ├── lowD_5mC_dendrograms_281019.R
    ├── lowD_SNP_dendrograms_281019.R
    └── stress_coexpression_networks.Rmd
├── qPCR
    ├── qPCR_analysis_v1_linreg.R
    ├── qPCR_analysis_v2_chipPCR.R
    ├── qPCR_analysis_v2_chipPCR.v2.R
    ├── qPCR_analysis_v2_chipPCR.v3.R
    └── standardize_format_LC480_qPCR_DG.R
└── screenrc


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dtrain16/NGS-scripts/d42b46723fc3991ac95420186ef686e2a6ed024c/.DS_Store


--------------------------------------------------------------------------------
/At_rRNA_AGIs.txt:
--------------------------------------------------------------------------------
 1 | AT2G01010
 2 | AT2G01020
 3 | AT3G41768
 4 | AT3G41979
 5 | ATCG00920
 6 | ATCG00950
 7 | ATCG00960
 8 | ATCG00970
 9 | ATCG01160
10 | ATCG01170
11 | ATCG01180
12 | ATCG01210
13 | ATMG00020
14 | ATMG01380
15 | ATMG01390
16 | 


--------------------------------------------------------------------------------
/ChIP/README.md:
--------------------------------------------------------------------------------
1 | # ChIP-seq scripts repository
2 | 
3 | #### chip-seq_v0.1.sh
4 | Perform quality trimming and align raw reads from ChIP-seq experiments using SubRead (Subjunc).
5 | 
6 | 


--------------------------------------------------------------------------------
/MethylC/100bp_dmr_merge.r:
--------------------------------------------------------------------------------
 1 | options(echo=TRUE)
 2 | library(reshape2)
 3 | args <- commandArgs(trailingOnly=TRUE)
 4 | print(args)
 5 | 
 6 | ############
 7 | # Merge the per-sample bedtools results (*.dmr) written by 100bp_dmrs.v0.1.sh
 8 | # into one wide table: one row per DMR and, for every sample, the value of
 9 | # column V6 (suffix _prop), V7 (suffix _met) and V8 (suffix _unmet).
10 | # USAGE: Rscript 100bp_dmr_merge.r <context> <difference> <coverage>
11 | ############
12 | context <- args[1]
13 | difference <- as.numeric(args[2])
14 | coverage <- as.numeric(args[3])
15 | 
16 | # grab all the individual bedtools results; the dot is escaped and the
17 | # pattern anchored so only names that really end in ".dmr" are used
18 | filelist <- dir(pattern="\\.dmr$")
19 | if (length(filelist) == 0) {
20 | 	stop("no .dmr files found in ", getwd(), call.=FALSE)
21 | }
22 | 
23 | # Read one file and add a 'group' column holding the sample name, i.e. the
24 | # file name up to its first '.'.
25 | read_dmr <- function(f) {
26 | 	d <- read.delim(f, header=FALSE)
27 | 	cbind(d, group=rep(strsplit(f, "\\.")[[1]][1], nrow(d)))
28 | }
29 | 
30 | # Stack all files with a single rbind. This also handles a directory holding
31 | # exactly one .dmr file, where looping over 2:length(filelist) counts down
32 | # (2, 1) and fails on the non-existent second file.
33 | tes <- do.call(rbind, lapply(filelist, read_dmr))
34 | 
35 | # Spread one value column across samples with dcast (reshape2) and append
36 | # 'suffix' to the per-sample columns (column 6 onwards).
37 | cast_stat <- function(value, suffix) {
38 | 	tab <- dcast(tes, V1 + V2 + V3 + V4 + V5 ~ group, value.var=value)
39 | 	colnames(tab)[6:ncol(tab)] <- paste0(colnames(tab)[6:ncol(tab)], suffix)
40 | 	tab
41 | }
42 | t1 <- cast_stat('V6', "_prop")
43 | t2 <- cast_stat('V7', "_met")
44 | t3 <- cast_stat('V8', "_unmet")
45 | 
46 | # make a table of it all; drop=FALSE keeps the column names when there is
47 | # only one sample
48 | tout <- cbind(t1, t2[, 6:ncol(t2), drop=FALSE], t3[, 6:ncol(t3), drop=FALSE])
49 | 
50 | # write it out
51 | write.table(tout, paste0('100bp_DMRs_',context,'_',difference,'diff_',coverage,'cov.output.txt'), sep='\t', row.names=FALSE, quote=FALSE)
52 | 


--------------------------------------------------------------------------------
/MethylC/100bp_dmrs.v0.1.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -u
  3 | 
  4 | export PATH=$PATH:/home/diep/bin/
  5 | # 100bp_dmrs.sh
  6 | # This script identifies 100bp windows (from bismark alignment pipeline) that display
  7 | # a difference (of choice) in methylation (context of choice) with a required coverage
  8 | # level (of choice). These windows are selected across every pairwise comparison of 100bp
  9 | # window wig files and aggregated into a single (bed) file. All adjacent windows are
 10 | # collapsed into a single DMR. This file is then used to grab all individual met/unmet
 11 | # reads for each DMR (from .cov files in bismark pipeline) for all samples.
 12 | 
 13 | #REQUIRES THAT sample names do not contain '_', as this will screw up the final steps
 14 | #execute from directory containing all the wig and cov files from all samples
 15 | ######################
 16 | 
 17 | # Print the help text and exit with status 1.
 18 | usage() {
 19 | echo "############################################################"
 20 | echo
 21 | echo "Usage: $0 [-c <CpG | CHG | CHH>] [-m <0-100>] [-d <integer>] [-s <integer>]" 1>&2
 22 | echo
 23 | echo "This script will create DMRs from 100bp window wig files"
 24 | echo
 25 | echo "REQUIREMENTS:"
 26 | echo "Execute the script from a directory containing wig files and cov files"
 27 | echo "R and the package 'reshape2'"
 28 | echo "bedtools > v2.20"
 29 | echo
 30 | echo "EXAMPLE: $0 -c CpG -m 80 -d 10 -s 2"
 31 | echo "Look at CpG context, difference of 80% methylation with 10 reads and 2 covered sites minimum over window"
 32 | echo
 33 | echo "############################################################"
 34 | exit 1
 35 | }
 36 | 
 37 | # flagN records whether the corresponding option was supplied
 38 | flag1=0
 39 | flag2=0
 40 | flag3=0
 41 | flag4=0
 42 | while getopts ":c:m:d:s:" opt; do
 43 | 	case $opt in
 44 |      c)  context=$OPTARG; flag1=1;;
 45 |      m)  difference=$OPTARG; flag2=1;;
 46 |      d)  coverage=$OPTARG; flag3=1;;
 47 |      s)  sitecount=$OPTARG; flag4=1;;
 48 |     \?)  usage;;
 49 |      :)  echo "Option -$OPTARG requires an argument." >&2; usage;;
 50 |      *)  usage;;
 51 | 	esac
 52 | done
 53 | 
 54 | if [ "$flag1" -eq 0 ]; then
 55 | 	echo "############################################################"
 56 | 	echo "context argument ( -c ) required!"
 57 | fi
 58 | if [ "$flag2" -eq 0 ]; then
 59 | 	echo "############################################################"
 60 | 	echo "methylation difference argument ( -m ) required!"
 61 | fi
 62 | if [ "$flag3" -eq 0 ]; then
 63 | 	echo "############################################################"
 64 | 	echo "coverage argument ( -d ) required!"
 65 | fi
 66 | if [ "$flag4" -eq 0 ]; then
 67 | 	echo "############################################################"
 68 | 	echo "site count argument ( -s ) required!"
 69 | fi
 70 | # All four options are mandatory: the R step takes the site count as its
 71 | # fourth argument, and with 'set -u' an unset ${sitecount} would abort the
 72 | # script at the echo below instead of showing the usage text.
 73 | if [ $((flag1+flag2+flag3+flag4)) -ne 4 ]; then
 74 | 	usage
 75 | fi
 76 | 
 77 | echo "c = ${context}"
 78 | echo "m = ${difference}"
 79 | echo "d = ${coverage}"
 80 | echo "s = ${sitecount}"
 81 | 
 82 | ######################
 83 | 
 84 | # common prefix of the files written by 100bp_wig_to_dmrs.r, and the output directory
 85 | prefix="100bp_${context}_${difference}diff_${coverage}"
 86 | outdir="${prefix}_${sitecount}_out"
 87 | 
 88 | Rscript /home/diep/scripts/100bp_wig_to_dmrs.r "${context}" "${difference}" "${coverage}" "${sitecount}"
 89 | 
 90 | # stop here if the R step did not produce the collapsed DMR file used below
 91 | if [ ! -s "${prefix}collapsed.bed" ]; then
 92 | 	echo "${prefix}collapsed.bed was not created; aborting." >&2
 93 | 	exit 1
 94 | fi
 95 | 
 96 | #bedtools to intersect the bed file w. the coverage files
 97 | for file in *"${context}"*.cov
 98 | do
 99 | 	bedtools intersect -wa -wb -a "${prefix}collapsed.bed" -b "$file" | bedtools groupby -i stdin -g 4,1,2,3 -c 5,9,10,11 -o mean,mean,sum,sum > "${file}.${context}_${difference}diff_${coverage}.dmr"
100 | done
101 | 
102 | #file structure cleanup: move exactly the files created by this run (a bare
103 | #"${prefix}*" glob would also match the output directory itself)
104 | mkdir -p "${outdir}"
105 | mv "${prefix}cov.txt" "${prefix}collapsed.bed" "${outdir}/"
106 | mv ./*".${context}_${difference}diff_${coverage}.dmr" "${outdir}/"
107 | cd "${outdir}/" || exit 1
108 | 
109 | #collapse them all into a summary table
110 | Rscript /home/diep/scripts/100bp_dmr_merge.r "${context}" "${difference}" "${coverage}"
111 | 


--------------------------------------------------------------------------------
/MethylC/100bp_heatmap.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -u
 3 | # Obtaining single mC information from bed files across DMRs identified in `100bp_dmrs.v0.1.sh`
 4 | # Handy when throwing lots of samples against DMRs
 5 | 
 6 | if [ "$#" -ne 2 ]; then
 7 | echo "USAGE: 100bp_heatmap.sh <context> <100bp_dmrs output>"
 8 | exit 1
 9 | fi
10 | 
11 | context=$1
12 | dmrs=$2
13 | 
14 | echo "${context}"
15 | echo "${dmrs}"
16 | 
17 | echo "Performing intersectBed..."
18 | 
19 | # For every *_<context>.bed file: write its overlaps with the DMR file to DMRs-<file>,
20 | # then the mean of column 9 per DMR (grouped on columns 1-3) to avgDMRs-<file>.
21 | for FILE in *_"${context}".bed
22 | do
23 | 	# skip outputs of earlier runs (their names also end in _<context>.bed)
24 | 	# and the unexpanded pattern left over when no file matches
25 | 	case "$FILE" in
26 | 		DMRs-*|avgDMRs-*) continue;;
27 | 	esac
28 | 	[ -e "$FILE" ] || continue
29 | 	intersectBed -wo -a "${dmrs}" -b "$FILE" > "DMRs-${FILE}"
30 | 	bedtools groupby -i "DMRs-${FILE}" -g 1,2,3 -c 9 -o mean > "avgDMRs-${FILE}"
31 | done
32 | 


--------------------------------------------------------------------------------
/MethylC/100bp_wig_to_dmrs.r:
--------------------------------------------------------------------------------
  1 | options(echo=T)
  2 | library(reshape2)
  3 | args=commandArgs(trailingOnly=T)
  4 | print(args)
  5 | 
  6 | # 1. grab wig files
  7 | # 2. take each pairwise sample of a context and ID windows showing differences of at least X%
  8 | # 3. collapse all windows that are adjacent to each other. Give IDs
  9 | 
 10 | #define argments
 11 | context=args[1]
 12 | difference=as.numeric(args[2])
 13 | coverage=as.numeric(args[3])
 14 | sitecounts=as.numeric(args[4])
 15 | 
 16 | print(args)
 17 | #grab all the wig files, select those for your context
 18 | a=dir(pattern="*100bp.bed")
 19 | a=subset(a,grepl(context,a)==T)
 20 | biglist=as.list(a)
 21 | 
 22 | aa=combn(a,2)
 23 | bb=combn(1:length(biglist),2) #combinations of elements in the biglist
 24 | 
 25 | 
 26 | out=NULL
 27 | #loop through all pairwise combinations#########################
 28 | for(i in 1:(length(aa)/2)){
 29 | 
 30 | file1=read.delim(aa[1,i],head=F)
 31 | file2=read.delim(aa[2,i],head=F)
 32 | 
 33 | #take windows where there is coverage for both samples
 34 | merged=merge(file1,file2,by=c('V1','V2','V3'))
 35 | 
 36 | #ID windows that show the selected difference
 37 | test.diff=matrix(ifelse(abs(merged$V4.x - merged$V4.y) >= difference,1,0),ncol=1)
 38 | 
 39 | merged=cbind(merged,test.diff)
 40 | 
 41 | diff.windows=subset(merged,merged$test.diff==1)
 42 | 
 43 | #select windows that meet the coverage threshold
 44 | diff.windows.cov=subset(diff.windows,diff.windows$V7.x>=coverage & diff.windows$V7.y>=coverage)
 45 | 
 46 | #select windows that also meet the sitecount threshold (looking at at least <s> CG/CHG/CHH sites with coverage in the window)
 47 | diff.windows.cov=subset(diff.windows.cov,diff.windows.cov$V8.x >=sitecounts & diff.windows.cov$V8.y >=sitecounts)
 48 | 
 49 | if(nrow(diff.windows.cov)==0){ next}
 50 | 
 51 | diff.windows.cov=diff.windows.cov[with(diff.windows.cov, order(diff.windows.cov[,1],diff.windows.cov[,2])),]
 52 | group.id=c(1,rep(NA,nrow(diff.windows.cov)-1))
 53 | 
 54 | for(q in 2:nrow(diff.windows.cov)){
 55 | 	group.id[q]=ifelse(diff.windows.cov[q,2] - diff.windows.cov[q-1,2]<=100,group.id[q-1],group.id[q-1]+1)
 56 | 	}
 57 | group.id=matrix(group.id,ncol=1)
 58 | diff.windows.cov=cbind(diff.windows.cov,group.id)
 59 | 
 60 | calling=matrix(rep(paste(aa[1,i],"vs",aa[2,i],sep=''),nrow(diff.windows.cov)),ncol=1)
 61 | 
 62 | diff.windows.cov=cbind(diff.windows.cov,calling)
 63 | 
 64 | out=rbind(out,diff.windows.cov)
 65 | 
 66 | }
 67 | 
 68 | colnames(out)=c('chr','start','stop','prop1','met1','unmet1','total1','site1','prop2','met2','unmet2','total2','site2','difference.pass','group.id','contrast')
 69 | 
 70 | #sort out
 71 | out=out[with(out,order(out[,1],out[,2])),]
 72 | 
 73 | dmr.id=c(1,rep(NA,nrow(out)-1))
 74 | 
 75 | 
 76 | #UPDATED
 77 | for(q in 2:nrow(out)){
 78 | 	dmr.id[q]=ifelse(out[q,2] - out[q-1,2]<=100 & out[q,2] - out[q-1,2] >= 0 & ((out[q,4] + out[q-1,4] <= 200 - 2*difference) | (out[q,4] + out[q-1,4] >= 2*difference)),dmr.id[q-1],dmr.id[q-1]+1)
 79 | 	}
 80 | dmr.id=matrix(dmr.id,ncol=1)
 81 | 
 82 | out=cbind(out,dmr.id)
 83 | 
 84 | write.table(out,paste('100bp_',context,'_',difference,'diff','_',coverage,'cov.txt',sep=''),sep='\t',row.names=F)
 85 | 
 86 | #collapse that shiz
 87 | 
 88 | collapsed.dmrs=matrix(NA,ncol=5,nrow=max(dmr.id))
 89 | for(i in 1:max(dmr.id)){
 90 | 	subs=subset(out,out$dmr.id==i)
 91 | 	chr=as.character(subs[1,1])
 92 | 	starts=min(subs[,2])
 93 | 	stops=max(subs[,3])
 94 | 	dmrid=i
 95 | 	size=stops-starts
 96 | 	collapsed.dmrs[i,]=c(chr,starts,stops,dmrid,size)
 97 | 	}
 98 | 	
 99 | write.table(collapsed.dmrs,paste('100bp_',context,'_',difference,'diff','_',coverage,'collapsed.bed',sep=''),sep='\t',row.names=F,col.names=F,quote=F)
100 | 


--------------------------------------------------------------------------------
/MethylC/5mC_rates.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -u
 4 | 
 5 | # Get methylation rates for all contexts across Chr1-5 as well as % CHH in chloroplast genome as an indication of sodium bisulfite conversion efficiency (unconverted CHH in Cp and Mt genome)
 6 | 
 7 | if [ "$#" -lt 2 ]; then
 8 | 	echo "Missing required arguments!"
 9 | 	echo "USAGE: 5mC_rates.sh <sample> <file>"
10 | 	echo "EXAMPLE: 5mC_rates.sh col0-r1 bed/cov"
11 | 	exit 1
12 | fi
13 | 
14 | sample=$1
15 | file=$2
16 | 
17 | cg="${sample}_CG*.${file}"
18 | chg="${sample}_CHG*.${file}"
19 | chh="${sample}_CHH*.${file}"
20 | 
21 | # met_rate <file glob> <grep options...>
22 | # Sum column 5 (met) and column 6 (unmet) over the lines grep selects and
23 | # print met / (met + unmet). Prints NA when nothing was selected, where the
24 | # bare division would make awk abort with a division-by-zero error.
25 | met_rate() {
26 | 	local glob=$1
27 | 	shift
28 | 	# $glob stays unquoted so the shell expands its wildcard into file names
29 | 	grep "$@" $glob | awk '{ met += $5; unmet += $6 } END { total = met + unmet; if (total > 0) print met / total; else print "NA" }'
30 | }
31 | 
32 | echo "5mC % in $1"
33 | 
34 | echo "mCG Chr1-5: "$cg" "
35 | met_rate "$cg" -e "Chr1" -e "Chr2" -e "Chr3" -e "Chr4" -e "Chr5"
36 | 
37 | echo "mCHG Chr1-5: "$chg" "
38 | met_rate "$chg" -e "Chr1" -e "Chr2" -e "Chr3" -e "Chr4" -e "Chr5"
39 | 
40 | echo "mCHH Chr1-5: "$chh" "
41 | met_rate "$chh" -e "Chr1" -e "Chr2" -e "Chr3" -e "Chr4" -e "Chr5"
42 | 
43 | echo "mCHH ChrC: "$chh" "
44 | met_rate "$chh" -e "ChrC"
45 | 
46 | echo "mCHH ChrM: "$chh" "
47 | met_rate "$chh" -e "ChrM"
48 | 
49 | echo "DONE"
50 | 


--------------------------------------------------------------------------------
/MethylC/BS-SNPer.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # use on sorted BAM file output from bismark alignment to call SNPs using default BS-SNPer settings
 4 | 
 5 | ## require: 
 6 | # perl
 7 | # BS-SNPer http://bioinformatics.oxfordjournals.org/content/31/24/4006.long
 8 | 
 9 | if [ "$#" -ne 3 ]; then
10 | echo "USAGE: <file> <fasta> <outname>"
11 | echo "EXAMPLE: BS-SNPer.sh alx8-r1.sorted.bam $HOME/TAIR10/TAIR10_Chr.all.fasta alx8-r1"
12 | exit 1
13 | fi
14 | 
15 | file=$1
16 | fa=$2
17 | out=$3
18 | 
19 | # Arguments are quoted so paths containing spaces reach BS-Snper intact.
20 | # stdout is captured as <outname>.SNP.bed and stderr as <outname>_ERR.log.
21 | perl ~/bin/BS-Snper-master/BS-Snper.pl --fa "$fa" --input "$file" --output temp.out --methcg meth.cg --methchg meth.chg --methchh meth.chh --minhetfreq 0.15 --minhomfreq 0.85 --minquali 30 --mincover 15 --maxcover 1000 --minread2 2 --errorate 0.02 --mapvalue 20 > "${out}.SNP.bed" 2>"${out}_ERR.log"
22 | 
23 | # remove the intermediate files; -f keeps this quiet if BS-Snper failed before writing them
24 | rm -f temp.out meth.cg meth.chg meth.chh
25 | 


--------------------------------------------------------------------------------
/MethylC/DSS_file_prep.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Produce DSS input files from BED files
 3 | # run in folder with the BED files of interest and tell which context you want to convert
 4 | # USAGE: DSS_file_prep.r <context>
 5 | 
 6 | options(echo=TRUE)
 7 | args <- commandArgs(trailingOnly=TRUE)
 8 | print(args)
 9 | 
10 | ## mC context to test
11 | context <- args[1]
12 | 
13 | ## get the BED files for this context; keep only names containing a literal
14 | ## ".bed" so the sample name can be cut from them below
15 | files <- dir(pattern=paste0(context,".bed"))
16 | files <- files[grepl(".bed", files, fixed=TRUE)]
17 | 
18 | ## write columns 1, 2, 7 and 5 of each file as chr, pos, N, X to <sample>_output.txt
19 | ## (iterating over the names does nothing when no file matches, where 1:length(files) would try files[1] = NA)
20 | for (f in files) {
21 | 	bed <- read.delim(f, header=FALSE)
22 | 	## drop the ChrM and ChrC rows
23 | 	bed <- bed[bed$V1 != "ChrM" & bed$V1 != "ChrC", c(1,2,7,5)]
24 | 	colnames(bed) <- c('chr','pos','N','X')
25 | 	## sample = file name up to the first literal ".bed"; fixed=TRUE stops the
26 | 	## '.' acting as a regex wildcard (which would cut "x_bed_CG.bed" at "_bed")
27 | 	sample <- substr(f, 1, regexpr(".bed", f, fixed=TRUE) - 1)
28 | 	write.table(x=bed, file=paste0(sample,"_output.txt"), sep='\t', quote=FALSE, col.names=TRUE, row.names=FALSE)
29 | }
30 | 


--------------------------------------------------------------------------------
/MethylC/README.md:
--------------------------------------------------------------------------------
 1 | # MethylC-seq scripts repository
 2 | 
 3 | #### 100bp_dmr_merge.r
 4 | Supplemental script for *100bp_dmrs.v0.1.sh* for collapsing/summarising DMRs
 5 | 
 6 | #### 100bp_dmrs.v0.1.sh
 7 | This script uses 100bp windowed methylation data to call differences across the genome in a defined context with required coverage and a defined difference. These are performed in a pairwise manner between all samples of interest.
 8 | 
 9 | #### 100bp_heatmap.sh
10 | Uses the output from *100bp_dmrs.v0.1.sh* to get mC from bed files (individual mC resolution).
11 | 
12 | #### 100bp_wig_to_dmrs.r
13 | First step of *100bp_dmrs.v0.1.sh* that makes pairwise comparisons of windows showing a defined difference in a given context.
14 | 
15 | #### 5mC\_rates.sh
16 | Calculate mCG, mCHG and mCHH % across At Chr1-5 and mCHH across Cp and Mt genomes (conversion efficiency)
17 | 
18 | #### BS-SNPer.sh		
19 | Script to perform SNP calling from aligned bisulfite converted reads. Use on sorted BAM file.
20 | 
21 | #### dss_calling.r
22 | Script for performing DSS DMR calling on re-formatted (using DSS_file_prep.r) bed.cov file output.
23 | 
24 | #### DSS_file_prep.r
25 | Script for re-formatting bed.cov file methylation output for input to DSS.
26 | 
27 | #### bed_to_rel_dist.sh
28 | Produce genome summarised methylation plots across features of interest e.g. gene models.
29 | 
30 | #### dmr_merge.r
31 | Supplemental file for 100bp DMR calling to produce final DMR table.
32 | 
33 | #### merge_wigs.r
34 | Merge 100bp binned weighted methylation BED files to produce correlation matrices with hierarchical clustering of samples of interest.
35 | 
36 | #### met_signatures.sh
37 | Extract cytosine reports for methylation at non-canonical methylation sequence contexts (see Gouil & Baulcombe, PLoS Gen 2015).
38 | 
39 | #### methimpute.r
40 | Use METHimpute to perform HMM-based imputation of methylation state at single C resolution across genome. Also produces 100bp.bed output files (methylation levels binned into 100bp windows).
41 | 
42 | #### pca_wigs.r
43 | Perform PCA on 100bp binned weighted methylation levels.
44 | 
45 | #### rel_methylation_plots.r
46 | Supplementary R script for bed_to_rel_dist.sh to produce binned methylation values summarised across supplied features of interest.
47 | 
48 | #### rel_methylation_plots_v2.r
49 | Variation on rel_methylation_plots.r to get binned summarised methylation values for non-canonical sequence contexts.
50 | 
51 | #### scatman_smooth.sh & smooth_scat.r
52 | Pair of scripts (use .sh to run) to take annotation file, bed file, and feature name to make scattersmooth plots in R to correlate methylation levels and feature characteristics (e.g. 5mC vs TE length).
53 | 
54 | #### wgbs_cov_to_TDF.sh
55 | Produce TDF files from a bismark cov file of interest. Check that an IGV-compatible genome build is available first.
56 | 
57 | #### wgbs_custom_bins.sh
58 | Bin weighted methylation levels into windows of a size of the user's choosing (typically 100bp).
59 | 
60 | ### wgbs_pipeline.sh
61 | #### v0.4
62 | Bismark alignment script using Bowtie1 aligner. Has SE and PE options.
63 | #### v0.5
64 | Bismark alignment script using Bowtie2 aligner. Has SE and PE options.
65 | #### v0.6
66 | The Perl script for 100bp windows was deprecated and has been removed.
67 | #### v0.7
68 | Added deduplicate\_bismark; only .cov files are now kept. A coverage filter on single Cs may be added later.
69 | 


--------------------------------------------------------------------------------
/MethylC/bed_to_rel_dist.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Plotting DNA methylation over gene models
 3 | 
 4 | #######################
 5 | # REQUIREMENTS
 6 | # bedtools
 7 | # awk
 8 | # R with libraries: fields
 9 | #######################
10 | 
11 | if [ "$#" -ne 3 ]; then
12 | echo "USAGE: bed_to_rel_dist.sh <input path to bed file> <filename prefix> <output map name>"
13 | echo "EXAMPLE: bed_to_rel_dist.sh $HOME/Araport11/Araport11_genes.sorted.bed sample-r1 genes"
14 | exit 1
15 | fi
16 | 
17 | bedpath=$1
18 | filename=$2
19 | outname=$3
20 | 
21 | # sort each context's 100bp window file(s) into <prefix>_<context>_100bp.bed
22 | # NOTE: that file is deleted again further down, so an input carrying exactly
23 | # this name is sorted in place and then removed
24 | for ctx in CG CHG CHH
25 | do
26 | 	sort -k1,1 -k2,2n "${filename}_${ctx}_100bp"*.bed -o "${filename}_${ctx}_100bp.bed"
27 | done
28 | 
29 | #get total number of columns for both input files (taken from their first line)
30 | l1="$(awk 'BEGIN{FS="\t"} NR==1{print NF; exit}' "${filename}_CG_100bp.bed")"
31 | l2="$(awk 'BEGIN{FS="\t"} NR==1{print NF; exit}' "${bedpath}")"
32 | 
33 | ####################### some bedtools stuff
34 | # annotate every window with its closest feature; the awk filter below treats
35 | # the last column of the result as the distance to that feature
36 | for ctx in CHG CHH CG
37 | do
38 | 	echo "Performing closestBed of ${ctx} methylation..."
39 | 	closestBed -D "ref" -a "${filename}_${ctx}_100bp.bed" -b "$bedpath" > "${filename}_${ctx}_${outname}.bed"
40 | done
41 | 
42 | #subset to the windows within 1kb of a feature (make the files more manageable for R)
43 | echo "subsetting files to within 1kb..."
44 | for ctx in CHG CHH CG
45 | do
46 | 	awk -F$'\t' '$NF<1000 && $NF>-1000' "${filename}_${ctx}_${outname}.bed" > "${filename}_${ctx}_${outname}.1k.bed"
47 | done
48 | #######################
49 | 
50 | # remove the sorted copies and the unfiltered closestBed output
51 | for ctx in CG CHG CHH
52 | do
53 | 	rm "${filename}_${ctx}_100bp.bed" "${filename}_${ctx}_${outname}.bed"
54 | done
55 | 
56 | echo "Performing R plots..."
57 | #initiate the R script to create the plots
58 | Rscript "$HOME/scripts/MethylC/rel_methylation_plots.r" "${filename}" "${outname}" "${l1}" "${l2}"
59 | 


--------------------------------------------------------------------------------
/MethylC/dmr_merge.r:
--------------------------------------------------------------------------------
 1 | options(echo=TRUE)
 2 | library(reshape2)
 3 | args <- commandArgs(trailingOnly=TRUE)
 4 | print(args)
 5 | 
 6 | ############
 7 | # Merge the per-sample bedtools results (*.dmr) into one wide table: one row
 8 | # per DMR (columns V1-V3) and, for every sample, the value of column V4
 9 | # (suffix _prop), V5 (suffix _met) and V6 (suffix _unmet).
10 | ############
11 | 
12 | # grab all the individual bedtools results; the dot is escaped and the
13 | # pattern anchored so only names that really end in ".dmr" are used
14 | filelist <- dir(pattern="\\.dmr$")
15 | if (length(filelist) == 0) {
16 | 	stop("no .dmr files found in ", getwd(), call.=FALSE)
17 | }
18 | 
19 | # Read one file and add a 'group' column holding the sample name, i.e. the
20 | # file name up to its first '_'.
21 | read_dmr <- function(f) {
22 | 	d <- read.delim(f, header=FALSE)
23 | 	cbind(d, group=rep(strsplit(f, "_")[[1]][1], nrow(d)))
24 | }
25 | 
26 | # Stack all files with a single rbind. This also handles a directory holding
27 | # exactly one .dmr file, where looping over 2:length(filelist) counts down
28 | # (2, 1) and fails on the non-existent second file.
29 | tes <- do.call(rbind, lapply(filelist, read_dmr))
30 | 
31 | # Spread one value column across samples with dcast (reshape2) and append
32 | # 'suffix' to the per-sample columns (column 4 onwards).
33 | cast_stat <- function(value, suffix) {
34 | 	tab <- dcast(tes, V1 + V2 + V3 ~ group, value.var=value)
35 | 	colnames(tab)[4:ncol(tab)] <- paste0(colnames(tab)[4:ncol(tab)], suffix)
36 | 	tab
37 | }
38 | t1 <- cast_stat('V4', "_prop")
39 | t2 <- cast_stat('V5', "_met")
40 | t3 <- cast_stat('V6', "_unmet")
41 | 
42 | # make a table of it all; drop=FALSE keeps the column names when there is
43 | # only one sample
44 | tout <- cbind(t1, t2[, 4:ncol(t2), drop=FALSE], t3[, 4:ncol(t3), drop=FALSE])
45 | 
46 | # write it out, and write out a version with only rows with data for all samples
47 | write.table(tout, 'all_dmr_metvalues.txt', sep='\t', row.names=FALSE, quote=FALSE)
48 | toutnarm <- na.omit(tout)
49 | write.table(toutnarm, 'all_dmr_metvalues_noNA.txt', sep='\t', row.names=FALSE, quote=FALSE)
50 | 


--------------------------------------------------------------------------------
/MethylC/dss_calling.r:
--------------------------------------------------------------------------------
 1 | # Script to perform DSS DMR calling between two groups of three samples each
 2 | # Need to enter manually; not set up for running (see the NOTE(review) comments below)
 3 | # Make sure files are converted into right format using DSS_file_prep.r
 4 | options(echo=T)
 5 | args=commandArgs(trailingOnly=T)
 6 | print(args)
 7 | 
 8 | # install DSS
 9 | # source("http://bioconductor.org/biocLite.R")
10 | # biocLite("DSS")
11 | # Positional arguments: <context> <pvalue> <dlt> <condition1> <condition2>
12 | # Define arguments (NOTE(review): all five stay character strings; nothing below converts them)
13 | context = args[1]
14 | pvalue = args[2]
15 | dlt = args[3]
16 | condition1 = args[4]
17 | condition2 = args[5]
18 | 
19 | # List files in the working directory matching "<context>.output" (a regex, so '.' matches any character)
20 | files <- dir(pattern = paste0(context,".output"))
21 | 
22 | # Define sample groups (NOTE(review): 'files' is unnamed, so indexing it with a character string gives NA - confirm how groups should be selected)
23 | group1 <- files[condition1]
24 | group2 <- files[condition2]
25 | 
26 | library(DSS)
27 | # read input files in DSS format (chr, pos, N, X); exactly three files are read per group
28 | dat1.1 <- read.delim(unlist(group1)[1])
29 | dat1.2 <- read.delim(unlist(group1)[2])
30 | dat1.3 <- read.delim(unlist(group1)[3])
31 | 
32 | dat2.1 <- read.delim(unlist(group2)[1])
33 | dat2.2 <- read.delim(unlist(group2)[2])
34 | dat2.3 <- read.delim(unlist(group2)[3])
35 | 
36 | # setup bsseq object; group1 samples are labelled C1-C3 and group2 samples N1-N3
37 | BSobj <- makeBSseqData(list(dat1.1,dat1.2,dat1.3,dat2.1,dat2.2,dat2.3),sampleNames=c("C1","C2","C3","N1","N2","N3"))
38 | 
39 | # Estimation of methylation means with smoothing by moving averages and smaller smoothing window
40 | dmlTest <- DMLtest(BSobj,group1=c("C1","C2","C3"), group2=c("N1","N2","N3"),smoothing=TRUE,smoothing.span=100)
41 | 
42 | # identify DMRs based on dmltesting (delta=dlt, p.threshold=pvalue); the result is written to file further down
43 | dmrs <- callDMR(dmlTest, delta=dlt, minlen=50, minCG=3, pct.sig=0.5, dis.merge=50, p.threshold=pvalue)
44 | # NOTE(review): no graphics device is opened explicitly before the plots below - presumably they go to R's default device; confirm
45 | ## look at distributions of test statistics and p-values
46 | par(mfrow=c(2,2))
47 | hist(dmlTest$stat, 100, main="test statistics")
48 | hist(dmlTest$pval, 100, main="P values")
49 | hist(dmlTest$fdr, 100, main="FDR values")
50 | hist(dmlTest$diff, 100, main="estimates")
51 | dev.off()
52 | 
53 | # filename (NOTE(review): group1 and group2 each hold several file names, so paste0() returns one name per element - verify)
54 | file1=paste0(group1,"vs",group2,"_",context,"_delta=",dlt,"_p=",pvalue,".bed")
55 | 
56 | # write out file
57 | write.table(dmrs,file=file1,quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE)
58 | 


--------------------------------------------------------------------------------
/MethylC/merge_wigs.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | # Produce methylome correlation matrix using specified BED files
  3 | 
  4 | args <- commandArgs(trailingOnly=TRUE)
  5 | 
  6 | if (length(args) != 4) {
  7 | 	print('Args missing!')
  8 | 	print('USAGE: merge_wigs.r <context> <bin size> <cov> <type [1,reps vs 2,sum]>')
  9 | 	print('EXAMPLE merge_wigs.r CHH 0 0 1')
 10 | 	# exit non-zero so a calling script can tell that nothing was produced
 11 | 	quit(status=1)
 12 | }
 13 | 
 14 | context <- args[1]
 15 | bin <- paste0(args[2],"bp")
 16 | cov <- paste0(args[3],"cov")
 17 | type <- args[4]
 18 | 
 19 | print(c(context, bin, cov, type))
 20 | 
 21 | library(tidyverse)
 22 | library(gplots)
 23 | 
 24 | # bin size 0 and cov 0: every file matching <context> whose name ends in 'bed';
 25 | # otherwise only files matching <context>_<bin>bp_<cov>cov
 26 | if (bin == '0bp' && cov == '0cov') {
 27 | 	files <- dir(pattern=context)
 28 | 	files <- files[endsWith(files, 'bed')]
 29 | } else {
 30 | 	files <- dir(pattern=paste(context,bin,cov,sep="_"))
 31 | }
 32 | 
 33 | print(files)
 34 | if (length(files) == 0) {
 35 | 	stop("no input files matched", call.=FALSE)
 36 | }
 37 | 
 38 | # Read every file into one long table (the first line of each file is skipped),
 39 | # drop Mt/ChrM/Pt/ChrC, prefix chromosome names lacking it with "Chr", and split
 40 | # the file name into sample (text before the first '_'), genotype and rep
 41 | # (first and second '-'-separated parts of the sample).
 42 | read_bins <- function(files) {
 43 | 	tibble(files) %>%
 44 | 		mutate(file_contents = map(files, read_delim, delim='\t', col_names=FALSE, skip=1)) %>%
 45 | 		unnest(file_contents) %>%
 46 | 		filter(X1 != 'Mt' & X1 != 'ChrM' & X1 != 'Pt' & X1 != 'ChrC') %>%
 47 | 		select(files, X1, X2, X3, X4) %>%
 48 | 		mutate(sample=vapply(strsplit(files, '_'), function(l) l[1], character(1))) %>%
 49 | 		mutate(genotype=vapply(strsplit(sample, '-'), function(l) l[1], character(1))) %>%
 50 | 		mutate(rep=vapply(strsplit(sample, '-'), function(l) l[2], character(1))) %>%
 51 | 		mutate(X1=ifelse(substr(X1, start=1, stop=3)=="Chr",paste0(X1),paste0("Chr",X1))) %>%
 52 | 		na.omit()
 53 | }
 54 | 
 55 | # Write the clustered heatmap of a correlation matrix to 'outfile' (PDF, paper "a4r")
 56 | plot_cor_heatmap <- function(mat, outfile) {
 57 | 	pdf(file=outfile, width = 0, height = 0, paper="a4r")
 58 | 	on.exit(dev.off(), add=TRUE)
 59 | 	heatmap.2(mat,
 60 | 		trace='none',
 61 | 		density.info='none',
 62 | 		symm=FALSE,
 63 | 		symkey=FALSE,
 64 | 		key=TRUE,
 65 | 		colsep = 1:ncol(mat),
 66 | 		rowsep = 1:nrow(mat),
 67 | 		sepcolor = "white",
 68 | 		sepwidth = c(0.001,0.001),
 69 | 		dendrogram='both',
 70 | 		margins = c(8,8),
 71 | 		cexCol = 1,
 72 | 		cexRow = 1)
 73 | }
 74 | 
 75 | bins <- read_bins(files)
 76 | 
 77 | if (type == 1) {
 78 | 	# type 1: one column per replicate (<genotype>_<rep>)
 79 | 	wide <- bins %>%
 80 | 		group_by(X1, X2, X3, genotype, rep) %>%
 81 | 		summarise(met = mean(X4)) %>%
 82 | 		unite(temp, genotype, rep) %>%
 83 | 		spread(key=temp, value=met)
 84 | 	outfile <- paste0('wig_cor_',context,'_reps.pdf')
 85 | } else {
 86 | 	# any other type: replicates averaged, one column per genotype
 87 | 	wide <- bins %>%
 88 | 		group_by(X1, X2, X3, genotype) %>%
 89 | 		summarise(met = mean(X4)) %>%
 90 | 		spread(key=genotype, value=met)
 91 | 	outfile <- paste0('wig_cor_',context,'.pdf')
 92 | }
 93 | 
 94 | # correlate the columns over the windows that have a value in every column
 95 | data <- wide %>%
 96 | 	na.omit() %>%
 97 | 	ungroup() %>%
 98 | 	select(-X1, -X2, -X3) %>%
 99 | 	cor() %>%
100 | 	as.matrix()
101 | 
102 | plot_cor_heatmap(data, outfile)
103 | 


--------------------------------------------------------------------------------
/MethylC/met_signatures.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eu
 3 | 
 4 | # re-extract DNA methylation, at custom sequence contexts, from BAM file and produce cytosine report
 5 | # perform in 4_bismark output sub-directory of wgbs workflow
 6 | 
 7 | if [ "$#" -ne 6 ]; then
 8 | echo "USAGE: met-sign.sh <SE/PE> <context> <file> <annotation> <sample> <outname>"
 9 | echo "EXAMPLE: met-sign.sh SE CHH sample.bam Araport_mRNA.sorted.bed sample mRNA"
10 | exit 1
11 | fi
12 | 
13 | layout=$1
14 | context=$2
15 | fl=$3
16 | annopath=$4
17 | sample=$5
18 | outname=$6
19 | 
20 | echo "Extracting CX report from $1 BAM ..."
21 | 
22 | if [ "$layout" == "SE" ]; then
23 | bismark_methylation_extractor --comprehensive --multicore 4 --cytosine_report --CX --genome_folder ~/TAIR10/  --report --buffer_size 8G -s ${fl}
24 | elif [ "$layout" == "PE" ]; then
25 | bismark_methylation_extractor --comprehensive --multicore 4 --cytosine_report --CX --genome_folder ~/TAIR10/  --report --buffer_size 8G -p ${fl}
26 | else
27 | echo "ERROR: layout must be SE or PE (got '$layout')" >&2; exit 1
28 | fi
29 | 
30 | gzip -d *cov.gz
31 | sortBed -i ${fl::-3}bismark.cov > ${fl::-3}bismark.bed
32 | bedfile="${fl::-3}bismark.bed"
33 | 
34 | echo "reports extracted"
35 | echo "$context from $bedfile"
36 | echo $1 $2 $3 $4 $5 $6
37 | 
38 | # re-organise report: keep rows whose context (col 6) or trinucleotide (col 7) equals $context exactly (a substring grep for CG would also pull in CCG sites), drop ChrC and ChrM
39 | awk -v ctx="$context" -v OFS='\t' '$1 != "ChrC" && $1 != "ChrM" && ($6 == ctx || $7 == ctx) {print $1, $2, $2+1, $6, $7}' ${fl::-3}CX_report.txt > ${fl::-3}${context}_report.bed
40 | 
41 | sortBed -i ${fl::-3}${context}_report.bed > ${fl::-3}${context}_report.sorted.bed
42 | 
43 | # intersect sub-context info to bismark.cov
44 | intersectBed -wo -sorted -a ${fl::-3}${context}_report.sorted.bed -b $bedfile | awk 'BEGIN { OFS = "\t" } {print $1, $2, $3, $4, $5, $9}' > ${sample}-${outname}-sub${context}-report.bed
45 | 
46 | # closest to get info across annotation file and subset to within 1kb
47 | closestBed -D "b" -a ${sample}-${outname}-sub${context}-report.bed -b $annopath | awk -F$'\t' '$NF<1000 && $NF>-1000' > ${sample}-${outname}-sub${context}-report.1k.bed
48 | 
49 | echo "done... cleaning..."
50 | 
51 | rm C*txt
52 | rm *_report.txt
53 | rm *bedGraph.gz
54 | rm *M-bias.txt
55 | rm $bedfile
56 | rm *cov
57 | rm ${fl::-3}${context}_report*bed
58 | rm ${sample}-${outname}-sub${context}-report.bed
59 | 
60 | echo "R"
61 | 
62 | Rscript ~/scripts/rel_methylation_plots_v2.r ${sample} ${outname} ${context}
63 | 
64 | echo "DONE"
65 | 


--------------------------------------------------------------------------------
/MethylC/methimpute.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | options(echo=T)
  3 | library(fields)
  4 | args=commandArgs(trailingOnly=T)
  5 | print(args)
  6 | # args[1] = bismark cytosine report ("CX_report.txt"), args[2] = prefix for all output files
  7 | ## Citation
  8 | # Taudt, A., Roquis, D., Vidalis, A., Wardenaar, R., Johannes, F., and Colomé-Tatché, M. (2018). METHimpute: imputation-guided construction of complete methylomes from WGBS data. BMC Genomics 19: 444.
  9 | 
 10 | ## Perform METHimpute to get imputed/recalibrated genome-wide methylation levels at single Cs and 100bp tiles
 11 | # https://github.com/ataudt/methimpute/blob/master/README.md
 12 | # https://github.com/ataudt/methimpute/blob/master/vignettes/methimpute.pdf
 13 | 
 14 | ### Installation
 15 | # install.packages("devtools")
 16 | # source("http://bioconductor.org/biocLite.R")
 17 | # biocLite(c("GenomicRanges"))
 18 | # library(devtools)
 19 | # install_github("ataudt/methimpute")
 20 | 
 21 | ### Input files
 22 | # Run in bash to get 1-based genome-wide cytosine report
 23 | # bismark_methylation_extractor --multicore 4 --cytosine_report --CX --genome_folder $HOME/TAIR10 *sorted.bam
 24 | 
 25 | ## load library
 26 | library(methimpute)
 27 | library(tidyverse)
 28 | 
 29 | ## file "CX_report.txt"
 30 | file <- args[1]
 31 | outname <- args[2]
 32 | 
 33 | ## chromosome lengths from methimpute (names changed from 'chr' to 'Chr' prefix)
 34 | data(arabidopsis_chromosomes)
 35 | arabidopsis_chromosomes$chromosome <- sub('chr', 'Chr', arabidopsis_chromosomes$chromosome)
 36 | 
 37 | ## data import
 38 | bismark.data <- importBismark(file, chrom.lengths=arabidopsis_chromosomes)
 39 | 
 40 | ## Get positions of all cytosines to inflate methylation data (include non-covered sites)
 41 | fasta.file <- '~/TAIR10/chromosomes/arabidopsis_seq.fa'
 42 | cytosine.positions = extractCytosinesFromFASTA(fasta.file, contexts = c('CG','CHG','CHH'))
 43 | methylome = inflateMethylome(bismark.data,cytosine.positions)
 44 | print(methylome)
 45 | 
 46 | ## Obtain correlation parameters (methylation levels from adjacent cytosines)
 47 | distcor = distanceCorrelation(methylome, separate.contexts = TRUE)
 48 | 
 49 | ## Estimate decay parameter for distance dependency of the transition probabilities in HMM
 50 | fit = estimateTransDist(distcor)
 51 | 
 52 | ## HMM for complete set using transition probabilities
 53 | model = callMethylationSeparate(data = methylome, transDist = fit$transDist, num.threads = 4)
 54 | # print(model)
 55 | 
 56 | ## At genes and TE coordinates (seqlevels changed from 'chr' to 'Chr' prefix)
 57 | data(arabidopsis_genes)
 58 | seqlevels(arabidopsis_genes) <-  sub('chr', 'Chr', seqlevels(arabidopsis_genes))
 59 | data(arabidopsis_TEs)
 60 | seqlevels(arabidopsis_TEs) <- sub('chr', 'Chr', seqlevels(arabidopsis_TEs))
 61 | 
 62 | ## METHimpute plotting: all diagnostic plots go into one PDF
 63 | pdf(paste0(outname,"_methimpute_HMMfit_enrichment.pdf"))
 64 | print(fit$plot)
 65 | plotHistogram(model, total.counts=5)
 66 | plotScatter(model)
 67 | plotTransitionProbs(model)
 68 | plotConvergence(model)
 69 | plotPosteriorDistance(model$data)
 70 | plotEnrichment(model, annotation=arabidopsis_genes)
 71 | plotEnrichment(model, annotation=arabidopsis_TEs)
 72 | dev.off()
 73 | 
 74 | ## Export full fitted HMM model
 75 | # exportMethylome(model, paste0(outname,"_methimpute_HMMfit.tsv"))
 76 | 
 77 | ## Output recalibrated methylation levels for downstream analysis akin to bismark cov files
 78 | df <- methods::as(model$data, 'data.frame') %>%
 79 | select(seqnames, start, end, context, rc.meth.lvl)
 80 | # one headerless tab-separated file per context; rc.meth.lvl is multiplied by 100 before writing
 81 | df_CG <- subset(df, context == "CG") %>%
 82 | select(-context) %>%
 83 | mutate(rc.meth.lvl = rc.meth.lvl * 100) %>%
 84 | utils::write.table(., file = paste0(outname,"_recal_CG.bed.cov"), quote = F, sep = '\t', row.names = F, col.names = F)
 85 | 
 86 | df_CHG <- subset(df, context == "CHG") %>%
 87 | select(-context) %>%
 88 | mutate(rc.meth.lvl = rc.meth.lvl * 100) %>%
 89 | utils::write.table(., file = paste0(outname,"_recal_CHG.bed.cov"), quote = F, sep = '\t', row.names = F, col.names = F)
 90 | 
 91 | df_CHH <- subset(df, context == "CHH") %>%
 92 | select(-context) %>%
 93 | mutate(rc.meth.lvl = rc.meth.lvl * 100) %>%
 94 | utils::write.table(., file = paste0(outname,"_recal_CHH.bed.cov"), quote = F, sep = '\t', row.names = F, col.names = F)
 95 | 
 96 | ## Binned methylation output of recalibrated weighted methylation levels
 97 | df_100bp <- binMethylome(model$data, binsize=100, contexts=c("CG","CHG","CHH"), columns.average="rc.meth.lvl")
 98 | # per-context 100bp tables: level multiplied by 100, start and end both shifted down by 1 before writing
 99 | df_100bp_CG <- methods::as(df_100bp$CG, 'data.frame') %>%
100 | select(seqnames, start, end, rc.meth.lvl) %>%
101 | mutate(rc.meth.lvl = rc.meth.lvl * 100) %>%
102 | mutate(start = start - 1) %>%
103 | mutate(end = end - 1) %>%
104 | utils::write.table(., file = paste0(outname,"_recal_CG_100bp.bed"), quote = F, sep = '\t', row.names = F, col.names = F)
105 | 
106 | df_100bp_CHG <- methods::as(df_100bp$CHG, 'data.frame') %>%
107 | select(seqnames, start, end, rc.meth.lvl) %>%
108 | mutate(rc.meth.lvl = rc.meth.lvl * 100) %>%
109 | mutate(start = start - 1) %>%
110 | mutate(end = end - 1) %>%
111 | utils::write.table(., file = paste0(outname,"_recal_CHG_100bp.bed"), quote = F, sep = '\t', row.names = F, col.names = F)
112 | 
113 | df_100bp_CHH <- methods::as(df_100bp$CHH, 'data.frame') %>%
114 | select(seqnames, start, end, rc.meth.lvl) %>%
115 | mutate(rc.meth.lvl = rc.meth.lvl * 100) %>%
116 | mutate(start = start - 1) %>%
117 | mutate(end = end - 1) %>%
118 | utils::write.table(., file = paste0(outname,"_recal_CHH_100bp.bed"), quote = F, sep = '\t', row.names = F, col.names = F)
120 | 


--------------------------------------------------------------------------------
/MethylC/pca_wigs.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # merge wigs and perform PCAs
 3 | 
 4 | args = commandArgs(trailingOnly=T)
 5 | print(args)
 6 | context=args[1]
 7 | # correlation method: use args[2] only when it is one cor() accepts; a missing or unknown value falls back to pearson
 8 | method <- if (args[2] %in% c("pearson", "kendall", "spearman")) args[2] else "pearson"
 9 | print(paste("cor method = " ,method))
10 | 
11 | library(tidyverse)
12 | # read every file matching '<context>_100bp.wig' and build a bins x genotype matrix of mean X4 values
13 | files <- dir(pattern=paste0(context,"_100bp.wig"))
14 | data <- data_frame(files) %>%
15 | mutate(file_contents = map(files, read_delim, delim='\t', col_names=F, skip=1)) %>%
16 | unnest() %>%
17 | filter(X1!='Mt'&X1!='ChrM'&X1!='Pt'&X1!='ChrC') %>%
18 | mutate(sample=sapply(strsplit(files, '_'), function(l) l[1])) %>%
19 | mutate(genotype=sapply(strsplit(sample, '-'), function(l) l[1])) %>%
20 | mutate(rep=sapply(strsplit(sample, '-'), function(l) l[2])) %>%
21 | mutate(X1=ifelse(substr(X1, start=1, stop=3)=="Chr",paste0(X1),paste0("Chr",X1))) %>%
22 | na.omit() %>%
23 | group_by(X1, X2, X3, genotype) %>%
24 | summarise(met = mean(X4)) %>%
25 | spread(key=genotype, value=met) %>%
26 | na.omit() %>%
27 | ungroup() %>%
28 | select(-X1, -X2, -X3) %>%
29 | as.matrix()
30 | # NOTE(review): everything below is commented out, so the matrix is built but nothing is plotted or written
31 | # PCA analysis
32 | # pc=prcomp(data)
33 | # plot(pc, type ='l' , main='Variance of PCs')
34 | # plot(pc$x[1,], pc$x[2,], xlab = 'PC1', ylab='PC2')
35 | # text(pc$x[1,], pc$x[2,], colnames(data), cex = 0.8, pos=4)
36 | # library(devtools)
37 | # install_github("ggbiplot","vqv")
38 | # library(ggbiplot)
39 | # ggbiplot(pc, obs.scale=1, var.scale=1, groups=ir.species, ellipse = TRUE, circle = TRUE) + theme(legend.direction = 'horizontal', legend.position = 'top')
40 | 
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/MethylC/rel_methylation_plots.r:
--------------------------------------------------------------------------------
 1 | options(echo=T)
 2 | library(fields)
 3 | args=commandArgs(trailingOnly=T)
 4 | print(args)
 5 | # args[1], args[2]: parts of the input/output file names; args[3], args[4]: numeric column offsets used further down
 6 | #read in files#
 7 | cpg=read.delim(paste(args[1],'_CG_',args[2],'.1k.bed',sep=''),head=F)
 8 | chg=read.delim(paste(args[1],'_CHG_',args[2],'.1k.bed',sep=''),head=F)
 9 | chh=read.delim(paste(args[1],'_CHH_',args[2],'.1k.bed',sep=''),head=F)
10 | 
11 | #remove organellar chromosomes (chloroplast matched as 'ChrC' like the other scripts; the old 'chrC' spelling is still excluded)
12 | cpg.sub=subset(cpg,cpg$V1!='ChrM' & cpg$V1!='ChrC' & cpg$V1!='chrC')
13 | chg.sub=subset(chg,chg$V1!='ChrM' & chg$V1!='ChrC' & chg$V1!='chrC')
14 | chh.sub=subset(chh,chh$V1!='ChrM' & chh$V1!='ChrC' & chh$V1!='chrC')
15 | 
16 | f1.end=as.numeric(args[3])
17 | f2.end=as.numeric(args[4])+1
18 | cpg.sub=subset(cpg.sub,cpg.sub[,f1.end + f2.end]!= -1)
19 | chg.sub=subset(chg.sub,chg.sub[,f1.end + f2.end]!= -1)
20 | chh.sub=subset(chh.sub,chh.sub[,f1.end + f2.end]!= -1)
21 | # per context: real.dist = distance column with sign flipped where column f1.end+6 is '+'; rel.dist = 0-1000 when real.dist is 0, real.dist+1000 when positive, unchanged when negative
22 | #CpG
23 | real.dist=matrix(ifelse(cpg.sub[,f1.end + 6]=='+',-1*cpg.sub[,f1.end+f2.end],cpg.sub[,f1.end + f2.end]),ncol=1)
24 | cpg.sub=cbind(cpg.sub,real.dist)
25 | rel.dist=matrix(ifelse(cpg.sub$real.dist==0,ifelse(cpg.sub[,f1.end + 6]=="-",((cpg.sub[,f1.end + 3] - (cpg.sub[,2]))/(cpg.sub[,f1.end + 3] - cpg.sub[,f1.end + 2]))*1000,(((cpg.sub[,2]) - cpg.sub[,f1.end + 2])/(cpg.sub[,f1.end + 3] - cpg.sub[, f1.end + 2]))*1000),ifelse(cpg.sub$real.dist>0,cpg.sub$real.dist + 1000,cpg.sub$real.dist)),ncol=1)
26 | cpg.sub=cbind(cpg.sub,rel.dist)
27 | fixy=ifelse(cpg.sub$rel.dist < 0 & cpg.sub$real.dist==0,0,ifelse(cpg.sub$rel.dist >1000 & cpg.sub$real.dist==0,1000,cpg.sub$rel.dist))
28 | cpg.sub$rel.dist=fixy
29 | cpg.bin=stats.bin(cpg.sub$rel.dist,cpg.sub$V4,N=100)
30 | p.cpg.bin=cbind(matrix(cpg.bin$centers,ncol=1),cpg.bin$stats["mean",])
31 | 
32 | #CHG
33 | real.dist=matrix(ifelse(chg.sub[,f1.end + 6]=='+',-1*chg.sub[,f1.end + f2.end],chg.sub[,f1.end + f2.end]),ncol=1)
34 | chg.sub=cbind(chg.sub,real.dist)
35 | rel.dist=matrix(ifelse(chg.sub$real.dist==0,ifelse(chg.sub[,f1.end + 6]=="-",((chg.sub[,f1.end + 3] - (chg.sub$V2))/(chg.sub[,f1.end + 3] - chg.sub[,f1.end + 2]))*1000,(((chg.sub$V2) - chg.sub[,f1.end + 2])/(chg.sub[,f1.end + 3] - chg.sub[,f1.end + 2]))*1000),ifelse(chg.sub$real.dist>0,chg.sub$real.dist + 1000,chg.sub$real.dist)),ncol=1)
36 | chg.sub=cbind(chg.sub,rel.dist)
37 | fixy=ifelse(chg.sub$rel.dist < 0 & chg.sub$real.dist==0,0,ifelse(chg.sub$rel.dist >1000 & chg.sub$real.dist==0,1000,chg.sub$rel.dist))
38 | chg.sub$rel.dist=fixy
39 | chg.bin=stats.bin(chg.sub$rel.dist,chg.sub$V4,N=100)
40 | p.chg.bin=cbind(matrix(chg.bin$centers,ncol=1),chg.bin$stats["mean",])
41 | 
42 | #CHH
43 | real.dist=matrix(ifelse(chh.sub[,f1.end + 6]=='+',-1*chh.sub[,f1.end + f2.end],chh.sub[,f1.end + f2.end]),ncol=1)
44 | chh.sub=cbind(chh.sub,real.dist)
45 | rel.dist=matrix(ifelse(chh.sub$real.dist==0,ifelse(chh.sub[,f1.end + 6]=="-",((chh.sub[,f1.end + 3] - (chh.sub$V2))/(chh.sub[,f1.end + 3] - chh.sub[,f1.end + 2]))*1000,(((chh.sub$V2) - chh.sub[,f1.end + 2])/(chh.sub[,f1.end + 3] - chh.sub[,f1.end + 2]))*1000),ifelse(chh.sub$real.dist>0,chh.sub$real.dist + 1000,chh.sub$real.dist)),ncol=1)
46 | chh.sub=cbind(chh.sub,rel.dist)
47 | fixy=ifelse(chh.sub$rel.dist < 0 & chh.sub$real.dist==0,0,ifelse(chh.sub$rel.dist >1000 & chh.sub$real.dist==0,1000,chh.sub$rel.dist))
48 | chh.sub$rel.dist=fixy
49 | chh.bin=stats.bin(chh.sub$rel.dist,chh.sub$V4,N=100)
50 | p.chh.bin=cbind(matrix(chh.bin$centers,ncol=1),chh.bin$stats["mean",])
51 | 
52 | #create plots: bin means of V4 per context against relative position, dashed lines at 0 and 1000
53 | pdf(paste(args[1],'_',args[2],'_methylation.pdf',sep=''),h=10,w=12)
54 | plot(x=NULL,y=NULL,xlim=c(-1000,2000),ylim=c(0,100),xlab='',ylab='% methylation',main=paste(args[1],' methylation over ',args[2],sep=''))
55 | lines(p.cpg.bin,col=1,lwd=2)
56 | lines(p.chg.bin,col=2,lwd=2)
57 | lines(p.chh.bin,col=3,lwd=2)
58 | abline(v=0,lty=2)
59 | abline(v=1000,lty=2)
60 | legend('topright',c(paste(args[1],' - CpG',sep=''),paste(args[1],' - CHG',sep=''),paste(args[1],' - CHH',sep='')),col=c(1,2,3),lwd=2,lty=1)
61 | dev.off()
62 | #####################################################################################
63 | # write the binned means as a table; the CpG bin centres are used as the shared 'pos' column
64 | out=cbind(p.cpg.bin,p.chg.bin[,2],p.chh.bin[,2])
65 | colnames(out)=c( 'pos' , 'CG' , 'CHG' , 'CHH')
66 | write.table(out,paste(args[1],'_',args[2],'values.txt',sep=''), sep='\t', row.names=F, quote=F)
67 | 


--------------------------------------------------------------------------------
/MethylC/rel_methylation_plots_v2.r:
--------------------------------------------------------------------------------
 1 | # Bin mean 5mC levels by relative position around annotated features, one table per motif, for R plotting
 2 | options(echo=T)
 3 | library(tidyverse)
 4 | library(fields)
 5 | args=commandArgs(trailingOnly=T)
 6 | print(args)
 7 | 
 8 | smplname <- paste0(args[1])
 9 | outname <- paste0(args[2])
10 | context <- paste0(args[3])
11 | infile <- dir(pattern=paste0(smplname,"-",outname,"-sub",context,"-report.1k.bed"))
12 | # rel.dist: scaled to 0-1000 where X13 is 0 (direction set by X12), X13+1000 where X13 is positive, X13 itself where negative
13 | data <- read_delim(infile, delim = '\t', col_names=F) %>%
14 | mutate(rel.dist=ifelse(X13==0,ifelse(X12=="-",((X9-X2)/(X9-X8))*1000,((X2-X8)/(X9-X8))*1000),ifelse(X13>0,X13+1000,X13))) %>%
15 | mutate(fixy=ifelse(rel.dist<0 & X13==0,0,ifelse(rel.dist>1000 & X13==0, 1000, rel.dist)))
16 | 
17 | # bin centres and bin means of X6 for one value of X5, labelled in column 'motiff'
18 | bin_motif <- function(motif){
19 | hits <- subset(data, X5 == motif)
20 | binned <- stats.bin(hits$fixy,hits$X6,N=100)
21 | tbl <- as.data.frame(cbind(matrix(binned$centers,ncol=1),binned$stats["mean",]))
22 | tbl$motiff <- paste0(motif)
23 | tbl
24 | }
25 | out <- do.call(rbind, lapply(rev(unique(data$X5)), bin_motif))
26 | write.table(out,paste0(paste(smplname,context,outname,sep='_'),'.txt'),quote=F, col.names=T, row.names=F, sep='\t')
27 | 


--------------------------------------------------------------------------------
/MethylC/scatman_smooth.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Observe whether annotation feature length correlates with methylation level (technical bias).
 4 | # E.g. Do longer TEs have higher levels of methylation? 
 5 | # Intersect bed file with annotation file;
 6 | # Produce scattersmooth plot of methylation level vs feature length.
 7 | 
 8 | if [ "$#" -ne 3 ]; then
 9 | echo "scatman_smooth.sh annotation sample outname"
10 | echo "e.g. scatman_smooth.sh ./TAIR_TE_subset.bed 317-1-4 TE" 
11 | exit 1
12 | fi
13 | 
14 | annotation=$1
15 | sample=$2
16 | outname=$3
17 | 
18 | # intersect each context's bed file with the annotation file
19 | # (expansions are quoted so paths containing spaces or glob characters survive)
20 | for ctx in CHG CHH CpG
21 | do
22 | echo "Performing intersectBed of ${ctx} methylation..."
23 | intersectBed -wa -wb -a "${sample}_${ctx}.bed" -b "$annotation" > "${sample}_${ctx}_${outname}.bed"
24 | done
25 | 
26 | # take output and produce smoothed scatterplots
27 | echo "Performing scatterSmooth in R"
28 | 
29 | Rscript "$HOME/scripts/smooth_scat.r" "${sample}" "${outname}"
30 | 
31 | echo "cleanup intermediates"
32 | rm "${sample}"_*_"${outname}".bed
33 | 


--------------------------------------------------------------------------------
/MethylC/smooth_scat.r:
--------------------------------------------------------------------------------
 1 | # R script for scatman_smooth.sh to produce smoothed scatterplots for methylation levels vs feature length
 2 | # Will produce plots in the order: CpG, CHG, CHH
 3 | 
 4 | args=commandArgs(trailingOnly=T)
 5 | print(args)
 6 | 
 7 | # read one intersected bed file and append the feature length (V7 - V6)
 8 | load_context <- function(ctx){
 9 | tbl <- read.delim(paste0(args[1],"_",ctx,"_",args[2],".bed"), header=F)
10 | tbl$length <- tbl$V7 - tbl$V6
11 | tbl
12 | }
13 | 
14 | contexts <- c("CpG","CHG","CHH")
15 | met <- lapply(contexts, load_context)
16 | heat <- colorRampPalette(c("royalblue", "yellow", "red"))
17 | 
18 | # all three panels share one PDF page laid out as a 2 x 2 grid
19 | pdf(file=paste0(args[1],"_",args[2],".pdf"))
20 | par(mfrow=c(2,2))
21 | for(k in seq_along(contexts)){
22 | smoothScatter(x=met[[k]]$length, y=met[[k]]$V4, ylab=paste(contexts[k],"Methylation"), xlab="TE Length (bp)", colramp = heat)
23 | }
24 | dev.off()
25 | 
26 | 


--------------------------------------------------------------------------------
/MethylC/wgbs_cov_to_TDF.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eu
 3 | 
 4 | # convert bismark cov files to IGV format, then produce TDF files of that data.
 5 | # Use in directory with all cov files of interest.
 6 | # Can produce TDF for depth or prop. methylation
 7 | # IGV columns written: chr, start-1, end, running line number (feature id), value from cov column 4
 8 | for FILE in *bismark.cov
 9 | do
10 | awk -v OFS="\t" '{print $1, $2-1, $3, NR, $4}' "$FILE" > "${FILE%%.bismark.cov}.igv"
11 | java -Xmx2g -Djava.awt.headless=true -jar /home/diep/bin/IGVTools/igvtools.jar toTDF "${FILE%%.bismark.cov}.igv" "${FILE%%.bismark.cov}.tdf" /home/diep/Araport11/Araport11.genome
12 | done
13 | 
14 | # Depth of coverage
15 | #cov=read.delim(flist[i],head=F)
16 | #cov$V2=cov$V2-1
17 | #cov$id=seq(1:nrow(cov))
18 | #cov$V7=cov$V5+cov$V6
19 | #cov=cov[,c(1,2,3,7,8)]
20 | 


--------------------------------------------------------------------------------
/MethylC/wgbs_custom_bins.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eu
 3 | 
 4 | # Generate mean methylation levels into custom bins with desired read depth/coverage from per-site BED files
 5 | 
 6 | if [ "$#" -lt 5 ]; then
 7 | echo "Missing arguments!"
 8 | echo "USAGE: wgbs_custom_bins.sh <sample> <file> <genome fasta> <coverage> <bin size>"
 9 | echo "EXAMPLE: wgbs_custom_bins.sh col0-r1 bed /home/diep/TAIR10/TAIR10_Chr.all.fasta 15 100"
10 | exit 1
11 | fi
12 | # bed = sample prefix, file = input extension (cov or bed), fas = genome fasta, cov = minimum depth, bin = bin size
13 | bed=$1
14 | file=$2
15 | fas=$3
16 | cov=$4
17 | bin=$5
18 | window=$(expr $bin - 1)
19 | 
20 | echo "Weighted methylation in $bed across $bin bp windows with depth >= $cov ..."
21 | # glob patterns for the per-context inputs; used unquoted below so the shell expands them
22 | cg="${bed}_CG*.${file}"
23 | chg="${bed}_CHG*.${file}"
24 | chh="${bed}_CHH*.${file}"
25 | 
26 | # use samtools to generate fasta index
27 | samtools faidx $fas
28 | 
29 | # use awk on index to make genome file
30 | # https://www.biostars.org/p/70795/
31 | awk -v OFS='\t' {'print $1,$2'} ${fas}.fai > temp.genome
32 | 
33 | # use genome file to make windows of (bin - 1) bp every bin bp across genome, dropping ChrC and ChrM
34 | bedtools makewindows -g temp.genome -w ${window} -s ${bin} | sortBed | awk -F$'\t' ' $1 != "ChrC" && $1 != "ChrM" ' > temp.genome.${bin}bp.sorted.bed
35 | 
36 | if [ "$file" == "cov" ]; then
37 | # intersect sites with bins, then groupBy: col 5 and col 6 are per-bin sums of intersect cols 8 and 9; output is col5/(col5+col6)*100 and col5+col6, kept when the latter >= cov
38 | 	echo "Bedtools $cg ..."
39 | 	sort -k1,1 -k2,2n $cg | bedtools intersect -sorted -wo -a temp.genome.${bin}bp.sorted.bed -b "stdin" | groupBy -g 1,2,3 -c 7,8,9 -o mean,sum,sum | awk -v OFS='\t' '{print $1,$2,$3,$4 = ($5 / ($5+$6)*100 ),$5 = ($5 + $6)}' | awk '{ if ($5 >= '$cov') { print } }' > ${bed}_CG_${bin}bp_${cov}cov.bed
40 | 
41 | 	echo "Bedtools $chg ..."
42 | 	sort -k1,1 -k2,2n $chg | bedtools intersect -sorted -wo -a temp.genome.${bin}bp.sorted.bed -b "stdin" | groupBy -g 1,2,3 -c 7,8,9 -o mean,sum,sum | awk -v OFS='\t' '{print $1,$2,$3,$4 = ($5 / ($5+$6)*100 ), $5 = ($5 + $6)}' | awk '{ if ($5 >= '$cov') { print } }' > ${bed}_CHG_${bin}bp_${cov}cov.bed
43 | 
44 | 	echo "Bedtools $chh ..."
45 | 	sort -k1,1 -k2,2n $chh | bedtools intersect -sorted -wo -a temp.genome.${bin}bp.sorted.bed -b "stdin" | groupBy -g 1,2,3 -c 7,8,9 -o mean,sum,sum | awk -v OFS='\t' '{print $1,$2,$3,$4 = ($5 / ($5+$6)*100 ), $5 = ($5 + $6)}' | awk '{ if ($5 >= '$cov') { print } }' > ${bed}_CHH_${bin}bp_${cov}cov.bed
46 | 
47 | fi
48 | 
49 | if [ "$file" == "bed" ]; then
50 | # per bin: sum intersect cols 8 and 9, then take the bin total BEFORE printing so the percentage cannot leak into the depth column or the coverage filter
51 |         echo "Bedtools $cg ..."
52 | 	sort -k1,1 -k2,2n $cg | bedtools intersect -sorted -wo -a temp.genome.${bin}bp.sorted.bed -b "stdin" | groupBy -g 1,2,3 -c 8,9 -o sum,sum | awk -v OFS='\t' '{tot = $4 + $5; print $1,$2,$3,($4 / tot * 100),tot}' | awk '{ if ($5 >= '$cov') { print } }' > ${bed}_CG_${bin}bp_${cov}cov.bed
53 | 
54 | 	echo "Bedtools $chg ..."
55 | 	sort -k1,1 -k2,2n $chg | bedtools intersect -sorted -wo -a temp.genome.${bin}bp.sorted.bed -b "stdin" | groupBy -g 1,2,3 -c 8,9 -o sum,sum | awk -v OFS='\t' '{tot = $4 + $5; print $1,$2,$3,($4 / tot * 100),tot}' | awk '{ if ($5 >= '$cov') { print } }' > ${bed}_CHG_${bin}bp_${cov}cov.bed
56 | 
57 | 	echo "Bedtools $chh ..."
58 | 	sort -k1,1 -k2,2n $chh | bedtools intersect -sorted -wo -a temp.genome.${bin}bp.sorted.bed -b "stdin" | groupBy -g 1,2,3 -c 8,9 -o sum,sum | awk -v OFS='\t' '{tot = $4 + $5; print $1,$2,$3,($4 / tot * 100),tot}' | awk '{ if ($5 >= '$cov') { print } }' > ${bed}_CHH_${bin}bp_${cov}cov.bed
59 | 
60 | fi
61 | 	
62 | echo 'cleaning ...'
63 | # CLEAN: remove the temporary genome and window files
64 | rm temp.genome*
65 | 
66 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # NGS-scripts
 2 | Repository for scripts used for analysing next generation sequencing data organised by technique.
 3 | 
 4 | #### At_rRNA_AGIs.txt
 5 | List of rRNA loci for removal prior to DEG calling.
 6 | 
 7 | #### TAIR10_annotation.sh
 8 | Download TAIR10 gff and produce annotation files
 9 | 
10 | #### TruSeq-adapters.fa
11 | FASTA file containing illumina adapter sequences for scythe step in RNA-seq alignment.
12 | 
13 | #### araport11_assemble.sh
14 | Build annotation files based on araport11 gff.
15 | 
16 | #### average_cov.sh
17 | Calculate average depth using samtools depth on sorted BAM file.
18 | 
19 | #### gene_to_gene_anno.sh
20 | Set of commands (bash & R) to take TAIR GFF files and produce annotation files, here specifically making an annotation file of genes in tandem orientation.
21 | 
22 | 


--------------------------------------------------------------------------------
/RNA/BAM_to_5p_bigWigs.sh:
--------------------------------------------------------------------------------
  1 | set -eu
  2 | # NOTE(review): there is no shebang line above, so run this explicitly with bash
  3 | # Produce 5p end coverage data from BAM files from GMUCT or PARE-seq in bedgraph format
  4 | # Then produce bigWigs files for viewing delight
  5 | # Run in directory with sam converted, sorted, indexed  bam file
  6 | # Ensure genome index genome & chromosome sizes are prepared:
  7 | # samtools faidx TAIR10_Chr.all.fasta && cut -f1,2 TAIR10_Chr.all.fasta.fai > TAIR10_Chr.all.fasta.len
  8 | # Make sure you have kentUtils: https://github.com/ENCODE-DCC/kentUtils
  9 | # git clone git://github.com/ENCODE-DCC/kentUtils.git
 10 | 
 11 | ### CONDA environment is installed
 12 | # conda create --name Bedtools
 13 | # conda install -n Bedtools -c bioconda bedtools
 14 | 
 15 | if [ "$#" -lt 4 ]; then
 16 | echo "Missing arguments!"
 17 | echo "USAGE: BAM_to_5p_bigWig.sh <BAM> <layout> <strand> <chr_sizes>"
 18 | echo "EXAMPLE: BAM_to_5p_bigWig.sh col0-r1.bam SE unstranded,stranded TAIR10_Chr.all.fasta.len"
 19 | exit 1
 20 | fi
 21 | # smp = input BAM, lay = SE or PE, str = unstranded or stranded, chrc_sizes = chromosome sizes file for bedGraphToBigWig
 22 | smp=$1
 23 | lay=$2
 24 | str=$3
 25 | chrc_sizes=$4
 26 | 
 27 | echo ""
 28 | echo "sample = $1"
 29 | echo "layout = $2"
 30 | echo "strand = $3"
 31 | echo "chr_size = $4"
 32 | echo ""
 33 | echo "Produce bigWig file(s) for 5p read ends from $smp ..."
 34 | echo ""
 35 | 
 36 | 
 37 | if [[ "$lay" == "SE" ]] && [[ "$str"  == "unstranded" ]]; then
 38 | 	# scaling factor = 1e6 / number of reads passing -F 260 (flag bits 4 and 256 excluded)
 39 | 	reads=$(samtools view -F 260 -c $smp)
 40 | 	scaling_factor=$(bc <<< "scale=6;1000000/$reads")
 41 | 
 42 | 	echo "BAM to bedgraph ..."
 43 | 	# unstranded bedgraph of 5' read end coverage scaled to RPM
 44 | 	bedtools genomecov -bga -5 -scale $scaling_factor -ibam $smp > ${smp%%bam}5p.bg
 45 | 
 46 | 	# convert bedgraph to bigWig
 47 | 	echo "bigWig ..."
 48 | 	$HOME/bin/kentUtils/bin/linux.x86_64/bedGraphToBigWig ${smp%%bam}5p.bg ${chrc_sizes} ${smp%%bam}5p.bigWig
 49 | 
 50 | 
 51 | fi
 52 | 
 53 | if [[ "$lay" == "SE" ]] && [[ "$str"  == "stranded" ]] ; then
 54 | 	# https://www.biostars.org/p/179035/
 55 | 	# extract reads from + and - strand
 56 | 	# the scaling factor is computed from the whole BAM, before it is split by strand
 57 | 	reads=$(samtools view -F 260 -c $smp)
 58 | 	scl=$(bc <<< "scale=6;1000000/$reads")
 59 | 
 60 | 	# reverse strand
 61 | 	samtools view -@ 2 -f 16 -b $smp > ${smp%%bam}reverse.bam
 62 | 	# forward strand
 63 | 	samtools view -@ 2 -F 16 -b $smp > ${smp%%bam}forward.bam
 64 | 	
 65 | 	echo "BAM to stranded bedgraphs ..."
 66 | 	# reverse/minus bg (negative scale factor, so minus-strand values are negative)
 67 | 	bedtools genomecov -bga -5 -scale -${scl} -ibam ${smp%%bam}reverse.bam > ${smp%%bam}minus.5p.bg
 68 | 	# forward/plus bg
 69 | 	bedtools genomecov -bga -5 -scale $scl -ibam ${smp%%bam}forward.bam > ${smp%%bam}plus.5p.bg
 70 | 	
 71 | 	echo "bigWigs..."
 72 | 	$HOME/bin/kentUtils/bin/linux.x86_64/bedGraphToBigWig ${smp%%bam}plus.5p.bg ${chrc_sizes} ${smp%%bam}plus.5p.bigWig
 73 | 	$HOME/bin/kentUtils/bin/linux.x86_64/bedGraphToBigWig ${smp%%bam}minus.5p.bg ${chrc_sizes} ${smp%%bam}minus.5p.bigWig
 74 | 	# remove the per-strand intermediate BAMs
 75 | 	rm ${smp%%bam}reverse.bam ${smp%%bam}forward.bam
 76 | 
 77 | fi
 78 | 
 79 | 
 80 | if [[ "$lay" == "PE" ]] && [[ "$str"  == "unstranded" ]] ; then
 81 | 	# paired-end: scale by fragments (reads / 2, integer division) rather than by reads
 82 | 	reads=$(samtools view -F 260 -c $smp)
 83 | 	frags=$(expr $reads / 2)
 84 | 	scaling_factor=$(bc <<< "scale=6;1000000/$frags")
 85 | 
 86 | 	echo "BAM to bedgraph ..."
 87 | 	# unstranded bedgraph of 5' read end coverage scaled to RPM
 88 | 	bedtools genomecov -bga -5 -scale $scaling_factor -ibam $smp > ${smp%%bam}5p.bg
 89 | 
 90 | 	# convert bedgraph to bigWig
 91 | 	echo "bigWig ..."
 92 | 	$HOME/bin/kentUtils/bin/linux.x86_64/bedGraphToBigWig ${smp%%bam}5p.bg ${chrc_sizes} ${smp%%bam}5p.bigWig
 93 | 
 94 | fi
 95 | 
 96 | 
 97 | if [[ "$lay" == "PE" ]] && [[ "$str"  == "stranded" ]] ; then
 98 | 	# scaling factor from the original BAM, by fragments (reads / 2)
 99 | 	reads=$(samtools view -F 260 -c $smp)
100 | 	frags=$(expr $reads / 2)
101 | 	scl=$(bc <<< "scale=6;1000000/$frags")
102 | 
103 | 
104 | 	echo "Extract properly-paired read mates (+ flags 99/147; - flags 83/163) from paired-end BAM files"
105 | 	# http://seqanswers.com/forums/showthread.php?t=29399
106 | 	
107 | 	# need sorted bam; smp is re-pointed at the sorted copy, so every output below carries the 'sorted.' infix
108 | 	samtools sort -@ 8 ${smp} -o ${smp%%bam}sorted.bam
109 | 	smp="${smp%%bam}sorted.bam"
110 | 
111 | 	# R1 forward
112 | 	samtools view -@ 8 -f 99 -b $smp > ${smp%%bam}R1F.bam
113 | 	# R2 reverse
114 | 	samtools view -@ 8 -f 147 -b $smp > ${smp%%bam}R2R.bam
115 | 	# FORWARD R1 read pairs
116 | 	samtools merge -f ${smp%%bam}forward.bam ${smp%%bam}R1F.bam ${smp%%bam}R2R.bam
117 | 
118 | 	# R1 reverse
119 | 	samtools view -@ 8 -f 83 -b $smp > ${smp%%bam}R1R.bam
120 | 	# R2 forward
121 | 	samtools view -@ 8 -f 163 -b $smp > ${smp%%bam}R2F.bam
122 | 	# REVERSE R1 read pairs
123 | 	samtools merge -f ${smp%%bam}reverse.bam ${smp%%bam}R1R.bam ${smp%%bam}R2F.bam
124 | 	
125 | 	rm $smp ${smp%%bam}R1F.bam ${smp%%bam}R2R.bam ${smp%%bam}R1R.bam ${smp%%bam}R2F.bam	
126 | 
127 | 	echo "BAM to stranded bedgraph ..."
128 | 	# minus strand (negative scale factor)
129 | 	bedtools genomecov -bga -5 -scale -${scl} -ibam ${smp%%bam}reverse.bam > ${smp%%bam}minus.5p.bg
130 | 	# plus strand
131 | 	bedtools genomecov -bga -5 -scale ${scl} -ibam ${smp%%bam}forward.bam > ${smp%%bam}plus.5p.bg
132 | 
133 | 	echo "bigWigs..."
134 | 	$HOME/bin/kentUtils/bin/linux.x86_64/bedGraphToBigWig ${smp%%bam}plus.5p.bg ${chrc_sizes}  ${smp%%bam}plus.5p.bigWig
135 | 	$HOME/bin/kentUtils/bin/linux.x86_64/bedGraphToBigWig ${smp%%bam}minus.5p.bg ${chrc_sizes} ${smp%%bam}minus.5p.bigWig
136 | 
137 | 	rm ${smp%%bam}forward.bam ${smp%%bam}reverse.bam
138 | 
139 | fi
140 | 
141 | 
142 | 


--------------------------------------------------------------------------------
/RNA/BAM_to_EJC.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -u
 3 | 
 4 | # Extract 5'-P read-end coverage at features of interest and summarise it.
 5 | # Optimised to calculate 5'-P end frequency at exons from PARE or GMUCT
 6 | # (degradome) libraries; reads are treated as unstranded.
 7 | #
 8 | # Arguments:
 9 | #   $1  BAM file (output names strip a trailing ".bam")
10 | #   $2  library layout, SE or PE (only used for the scaling factor)
11 | #   $3  BED annotation of the features of interest
12 | #   $4  feature name, used as a tag in output file names
13 | #
14 | # Outputs:
15 | #   <bam>.5p.bed                 bedGraph of 5' read-end counts
16 | #   <bam>_<feature>.5p.bed       the same, with the closest feature attached
17 | #   <bam>_<feature>_10bp.5p.bed  rows whose distance to that feature is -1..1
18 | 
19 | ### CONDA environment is installed
20 | # conda create --name ngs_plots
21 | # conda install -n ngs_plots -c bioconda bedtools
22 | # conda install -n ngs_plots r-fields
23 | # conda install -n ngs_plots -c r r-tidyverse
24 | # conda activate ngs_plots
25 | 
26 | if [ "$#" -lt 4 ]; then
27 | echo "Missing arguments!"
28 | echo "USAGE: BAM_to_EJC.sh <.BAM> <layout: SE/PE> <bedfile annotation> <feature name>"
29 | echo "treat as unstranded only (degradome)"
30 | echo "EXAMPLE: BAM_to_EJC.sh col0_rep1.sorted.bam PE Arabidopsis_thaliana.TAIR10.54_exon-mRNA.bed exon-mRNA"
31 | exit 1
32 | fi
33 | 
34 | smp=$1
35 | lay=$2
36 | bedfile=$3
37 | out=$4
38 | prefix=${smp%%.bam}
39 | 
40 | echo ""
41 | echo "sample = $1"
42 | echo "layout = $2"
43 | echo "bedfile = $3"
44 | echo "feature = $4"
45 | echo ""
46 | 
47 | echo "calculate scaling factor"
48 | # -F 260 excludes unmapped (4) and secondary (256) alignments from the count
49 | mapped=$(samtools view -F 260 -c "$smp")
50 | if ! [[ "$mapped" =~ ^[0-9]+$ ]] || [ "$mapped" -eq 0 ]; then
51 | 	echo "ERROR: no mapped reads counted in $smp" >&2
52 | 	exit 1
53 | fi
54 | # Per-million factor: per read for SE, per fragment (reads / 2) for PE.
55 | # The fragment count is parenthesised because bc evaluates a/b/2 left to
56 | # right, which would halve the factor instead of doubling it.
57 | case "$lay" in
58 | 	SE) scl=$(bc <<< "scale=6;1000000/$mapped") ;;
59 | 	PE) scl=$(bc <<< "scale=6;1000000/($mapped/2)") ;;
60 | 	*)
61 | 		echo "ERROR: layout must be SE or PE (got '$lay')" >&2
62 | 		exit 1
63 | 		;;
64 | esac
65 | 
66 | echo "BAM to bed..."
67 | bedtools genomecov -bg -5 -ibam "$smp" > "${prefix}.5p.bed"
68 | closestBed -D "b" -a "${prefix}.5p.bed" -b "$bedfile" > "${prefix}_${out}.5p.bed"
69 | # keep 5' ends whose signed distance to the closest feature is -1, 0 or 1
70 | awk -F$'\t' '$NF<2 && $NF>-2' "${prefix}_${out}.5p.bed" > "${prefix}_${out}_10bp.5p.bed"
71 | 
72 | echo 'do maths'
73 | Rscript /home/dganguly/scripts/RNA/rel_expression_plots_ejc.r "${prefix}_${out}_10bp.5p.bed" "$scl"


--------------------------------------------------------------------------------
/RNA/BAM_to_ESI.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -u
 3 | 
 4 | # Calculate terminal stalling index for EJC binding based on 5'-P end counts upstream of exon-exon junctions
 5 | 
 6 | ### CONDA environment is installed
 7 | # conda create --name ngs_plots
 8 | # conda install -n ngs_plots -c bioconda bedtools
 9 | # conda install -n ngs_plots r-fields
10 | # conda install -n ngs_plots -c r r-tidyverse
11 | # conda activate ngs_plots
12 | 
13 | if [ "$#" -lt 2 ]; then
14 | echo "Missing arguments!"
15 | echo "USAGE: BAM_to_ESI.sh <.BAM> <bedfile annotation>"
16 | echo "EXAMPLE: BAM_to_ESI.sh col0_rep1.sorted.bam Arabidopsis_thaliana.TAIR10.54_exon-mRNA.bed"
17 | echo "annotation provided should represent exons"
18 | exit 1
19 | fi
20 | 
21 | smp=$1
22 | bedfile=$2
23 | 
24 | echo "sample = $1"
25 | echo "bedfile = $2"
26 | 
27 | echo "BAM to bed..."
28 | bedtools genomecov -bg -5 -ibam $smp > ${smp%%.bam}.5p.bed
29 | closestBed -D "b" -a ${smp%%.bam}.5p.bed -b $bedfile > ${smp%%.bam}_exon.5p.bed
30 | awk -F$'\t' '$NF<1 && $NF>-51' ${smp%%.bam}_exon.5p.bed > ${smp%%.bam}_exon_ESI.5p.bed 
31 | 
32 | echo 'do maths'
33 | Rscript /home/dganguly/scripts/RNA/ESI_calculation.r ${smp%%.bam}_exon_ESI.5p.bed
34 | 
35 | echo 'cleaning'
36 | rm -v ${smp%%.bam}_exon.5p.bed
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/RNA/BAM_to_STOP.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -u
 3 | 
 4 | # Extract 5'-P read-end coverage at features of interest and summarise it.
 5 | # Optimised to calculate 5'-P end frequency adjacent to START or STOP codons
 6 | # from PARE or GMUCT (degradome) libraries; reads are treated as unstranded.
 7 | #
 8 | # Arguments:
 9 | #   $1  BAM file (output names strip a trailing ".bam")
10 | #   $2  library layout, SE or PE (only used for the scaling factor)
11 | #   $3  BED annotation of start or stop codons (see TAIR_annotation.sh)
12 | #   $4  feature name, used as a tag in output file names
13 | #   $5  distance: rows with |distance to closest feature| < $5 are kept
14 | 
15 | ### CONDA environment is installed
16 | # conda create --name ngs_plots
17 | # conda install -n ngs_plots -c bioconda bedtools
18 | # conda install -n ngs_plots r-fields
19 | # conda install -n ngs_plots -c r r-tidyverse
20 | # conda activate ngs_plots
21 | 
22 | if [ "$#" -lt 5 ]; then
23 | echo "Missing arguments!"
24 | echo "USAGE: BAM_to_STOP.sh <.BAM> <layout: SE/PE> <bedfile annotation> <feature name> <distance>"
25 | echo "unstranded only (degradomes)"
26 | echo "EXAMPLE: BAM_to_STOP.sh col0_rep1.sorted.bam SE Arabidopsis_thaliana.TAIR10.54_stop.bed stop 40"
27 | echo "annotation should be start or stop codons (see TAIR_annotation.sh)"
28 | exit 1
29 | fi
30 | 
31 | smp=$1
32 | lay=$2
33 | bedfile=$3
34 | out=$4
35 | dis=$5
36 | prefix=${smp%%.bam}
37 | 
38 | echo ""
39 | echo "sample = $1"
40 | echo "layout = $2"
41 | echo "bedfile = $3"
42 | echo "feature = $4"
43 | echo "distance = $5"
44 | echo ""
45 | 
46 | echo "calculate scaling factor"
47 | # -F 260 excludes unmapped (4) and secondary (256) alignments from the count
48 | mapped=$(samtools view -F 260 -c "$smp")
49 | if ! [[ "$mapped" =~ ^[0-9]+$ ]] || [ "$mapped" -eq 0 ]; then
50 | 	echo "ERROR: no mapped reads counted in $smp" >&2
51 | 	exit 1
52 | fi
53 | # Per-million factor: per read for SE, per fragment (reads / 2) for PE.
54 | # The fragment count is parenthesised because bc evaluates a/b/2 left to
55 | # right, which would halve the factor instead of doubling it.
56 | case "$lay" in
57 | 	SE) scl=$(bc <<< "scale=6;1000000/$mapped") ;;
58 | 	PE) scl=$(bc <<< "scale=6;1000000/($mapped/2)") ;;
59 | 	*)
60 | 		echo "ERROR: layout must be SE or PE (got '$lay')" >&2
61 | 		exit 1
62 | 		;;
63 | esac
64 | 
65 | echo "BAM to bed..."
66 | bedtools genomecov -bg -5 -ibam "$smp" > "${prefix}.5p.bed"
67 | closestBed -D "b" -a "${prefix}.5p.bed" -b "$bedfile" > "${prefix}_${out}.5p.bed"
68 | # last column is the signed closestBed distance; keep the open window (-dis, dis)
69 | awk -F$'\t' -v a="$dis" '$NF<a && $NF>-a' "${prefix}_${out}.5p.bed" > "${prefix}_${out}_${dis}bp.5p.bed"
70 | 
71 | echo 'do maths'
72 | Rscript /home/dganguly/scripts/RNA/rel_expression_plots_stop.r "${prefix}_${out}_${dis}bp.5p.bed" "$scl"


--------------------------------------------------------------------------------
/RNA/BAM_to_TSI.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -u
 3 | 
 4 | # Calculate terminal stalling index based on 5'-P end counts at STOP codon
 5 | 
 6 | ### CONDA environment is installed
 7 | # conda create --name ngs_plots
 8 | # conda install -n ngs_plots -c bioconda bedtools
 9 | # conda install -n ngs_plots r-fields
10 | # conda install -n ngs_plots -c r r-tidyverse
11 | # conda activate ngs_plots
12 | 
13 | if [ "$#" -lt 2 ]; then
14 | echo "Missing arguments!"
15 | echo "USAGE: BAM_to_TSI.sh <.BAM> <bedfile annotation>"
16 | echo "EXAMPLE: BAM_to_TSI.sh col0_rep1.sorted.bam Arabidopsis_thaliana.TAIR10.54_stop.bed"
17 | echo "annotation provided should represent coordinates of stop codons"
18 | exit 1
19 | fi
20 | 
21 | smp=$1
22 | bedfile=$2
23 | 
24 | echo "sample = $1"
25 | echo "bedfile = $2"
26 | 
27 | echo "BAM to bed..."
28 | bedtools genomecov -bg -5 -ibam $smp > ${smp%%.bam}.5p.bed
29 | closestBed -D "b" -a ${smp%%.bam}.5p.bed -b $bedfile > ${smp%%.bam}_stop.5p.bed
30 | awk -F$'\t' '$NF<1 && $NF>-51' ${smp%%.bam}_stop.5p.bed > ${smp%%.bam}_stop_TSI.5p.bed 
31 | 
32 | echo 'do maths'
33 | Rscript /home/dganguly/scripts/RNA/TSI_calculation.r ${smp%%.bam}_stop_TSI.5p.bed
34 | 
35 | echo 'cleaning'
36 | rm -v ${smp%%.bam}.5p.bed ${smp%%.bam}_stop.5p.bed
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/RNA/BAM_to_bedgraph_5p.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eu
 3 | 
 4 | # Summarise 5'P read ends (e.g. PARE-seq, GMUCT) across features of interest
 5 | # in bedGraph format, scaled to reads per million (RPM).
 6 | #
 7 | # Arguments:
 8 | #   $1  BAM file
 9 | #   $2  library layout, SE or PE (only used for the scaling factor)
10 | #   $3  BED annotation of the features of interest
11 | #   $4  feature name, used as a tag in output file names
12 | #   $5  distance: rows with |distance to closest feature| < $5 are kept
13 | 
14 | ### CONDA environment is installed
15 | # conda create --name ngs_plots
16 | # conda install -n ngs_plots -c bioconda bedtools
17 | # conda install -n ngs_plots r-fields
18 | # conda install -n ngs_plots -c r r-tidyverse
19 | # conda activate ngs_plots
20 | 
21 | if [ "$#" -lt 5 ]; then
22 | echo "Missing arguments!"
23 | echo "USAGE: BAM_to_bedgraph_5p.sh <.BAM> <layout: SE/PE> <bedfile annotation> <feature name> <distance>"
24 | echo "EXAMPLE: BAM_to_bedgraph_5p.sh col0_rep1.sorted.bam PE Arabidopsis_thaliana.TAIR10.54_stop.bed stop 50"
25 | exit 1
26 | fi
27 | 
28 | smp=$1
29 | lay=$2
30 | bedfile=$3
31 | out=$4
32 | dis=$5
33 | prefix=${smp%%.bam}
34 | # scaled 5' end bedGraph; same name the script has always produced
35 | ends_bg="${smp%%bam}5p.bed"
36 | 
37 | echo ""
38 | echo "sample = $1"
39 | echo "layout = $2"
40 | echo "bedfile = $3"
41 | echo "feature = $4"
42 | echo "distance = $5"
43 | echo ""
44 | 
45 | echo "calculate scaling factor"
46 | # -F 260 excludes unmapped (4) and secondary (256) alignments from the count
47 | mapped=$(samtools view -F 260 -c "$smp")
48 | if ! [[ "$mapped" =~ ^[0-9]+$ ]] || [ "$mapped" -eq 0 ]; then
49 | 	echo "ERROR: no mapped reads counted in $smp" >&2
50 | 	exit 1
51 | fi
52 | # Per-million factor: per read for SE, per fragment (reads / 2) for PE.
53 | # The fragment count is parenthesised because bc evaluates a/b/2 left to
54 | # right, which would halve the factor instead of doubling it.
55 | case "$lay" in
56 | 	SE) scl=$(bc <<< "scale=6;1000000/$mapped") ;;
57 | 	PE) scl=$(bc <<< "scale=6;1000000/($mapped/2)") ;;
58 | 	*)
59 | 		echo "ERROR: layout must be SE or PE (got '$lay')" >&2
60 | 		exit 1
61 | 		;;
62 | esac
63 | 
64 | echo "BAM to bed..."
65 | bedtools genomecov -bg -5 -scale "$scl" -ibam "$smp" > "$ends_bg"
66 | 
67 | echo 'bedtools for coverage across chosen features...'
68 | closestBed -D "b" -a "$ends_bg" -b "$bedfile" > "${prefix}_${out}.5p.bed"
69 | 
70 | echo 'subset ...'
71 | # last column is the signed closestBed distance; keep the open window (-dis, dis)
72 | awk -F$'\t' -v a="$dis" '$NF<a && $NF>-a' "${prefix}_${out}.5p.bed" > "${prefix}_${out}_${dis}bp.5p.bed"
73 | 
74 | echo 'do maths'
75 | Rscript /home/dganguly/scripts/RNA/rel_expression_plots.r "${prefix}_${out}_${dis}bp.5p.bed"
76 | 
77 | echo 'cleaning'
78 | rm -v "${prefix}_${out}.5p.bed"


--------------------------------------------------------------------------------
/RNA/BAM_to_wigs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eu
 3 | 
 4 | # Produce files with windowed coverage of RNAseq data (BAM aligned reads) across annotations of interest
 5 | # Run in directory with sam converted, sorted, indexed  bam file
 6 | # Provide path of genome .fa file to produce windowed genome
 7 | 
 8 | if [ "$#" -lt 5 ]; then
 9 | echo "Missing arguments!"
10 | echo "USAGE: BAM_to_wigs.sh <.BAM> <genome fasta> <bedfile annotation> <feature name> <window size>"
11 | echo "EXAMPLE: BAM_to_wigs.sh col0_rep1.sorted.bam TAIR10_Chr.all.fasta Araport11_TE.bed TE 100"
12 | exit 1
13 | fi
14 | 
15 | bam=$1
16 | fas=$2
17 | bedfile=$3
18 | out=$4
19 | size=$5
20 | size_2=$(($size - 1))
21 | 
22 | echo "Make $size bp genome bed ..."
23 | 
24 | # use samtools to generate fasta index
25 | samtools faidx $fas
26 | # use awk on index to make genome file
27 | # https://www.biostars.org/p/70795/
28 | awk -v OFS='\t' {'print $1,$2'} ${fas}.fai > temp.genome 
29 | # use genome file to make 100bp windows across genome
30 | bedtools makewindows -g temp.genome -w $size_2 -s $size > temp.genome.${size}bp.bed
31 | sortBed -i temp.genome.${size}bp.bed > temp.genome.${size}bp.sorted.bed
32 | 
33 | # use bedtools coverage to get coverage across windows from BAM
34 | # MAKE SURE TO USE -sorted FLAG
35 | bedtools coverage -sorted -a temp.genome.${size}bp.bed -b $bam > ${bam%%.sorted*}_${size}bp.bed 
36 | 
37 | echo 'cleaning ...'
38 | # CLEAN
39 | rm temp.genome*
40 | 
41 | # sort Bed
42 | sortBed -i ${bam%%.sorted*}_${size}bp.bed > ${bam%%.sorted*}_${size}bp.sorted.bed
43 | 
44 | echo 'bedtools ...'
45 | # bedtools to desired annotation
46 | closestBed -D "b" -a ${bam%%.sorted*}_${size}bp.sorted.bed -b $bedfile > ${bam%%.sorted*}_${out}_${size}.bed
47 | 
48 | echo 'subset to +1k/-1k ...'
49 | # awk to subset
50 | awk -F$'\t' '$NF<1000 && $NF>-1000' ${bam%%.sorted*}_${out}_${size}.bed > ${bam%%.sorted*}_${out}_${size}.1k.bed
51 | 
52 | echo 'final clean ...'
53 | rm ${bam%%.sorted*}_${size}bp.bed ${bam%%.sorted*}_${size}bp.sorted.bed ${bam%%.sorted*}_${out}_${size}.bed
54 | 
55 | 


--------------------------------------------------------------------------------
/RNA/ESI_calculation.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # args[1] = filename
 3 | # Required for BAM_to_ESI.sh
 4 | # Calculate EJC stalling index (ESI) from 5'P end counts upstream of
 5 | # exon-exon junctions.
 6 | #
 7 | # Input: headerless, tab-delimited table. Columns as used below:
 8 | #   V1 chromosome, V2/V3 start/end of the 5'P position, V4 5'P end count,
 9 | #   V6/V7 exon start/end, V8 exon identifier (grouping key), V10 strand
10 | # Output: <prefix>ESI.txt (per-exon table) and <prefix>ESI.pdf (diagnostic
11 | #   plot), where <prefix> is args[1] up to the first occurrence of "Aligned".
12 | 
13 | options(echo = TRUE)
14 | library(fields)
15 | library(tidyverse)
16 | args <- commandArgs(trailingOnly = TRUE)
17 | print(args)
18 | if (length(args) < 1) {
19 |   stop("Usage: ESI_calculation.r <filename>", call. = FALSE)
20 | }
21 | 
22 | # Read in file
23 | input <- read.delim(args[1], header = FALSE) %>%
24 |   # Remove reads to plastid and mitochondria
25 |   subset(V1 != "ChrM" & V1 != "ChrC" & V1 != "Mt" & V1 != "Pt") %>%
26 |   # exon at least 50 nucleotides
27 |   mutate(length = V7 - V6) %>%
28 |   subset(length > 49) %>%
29 |   # calculate position relative to 3' end of feature (strand-aware)
30 |   mutate(pos = ifelse(V10 == "+", V2 - V7, V6 - V3)) %>%
31 |   subset(pos > -50 & pos < 1)
32 | 
33 | # get total end counts in window
34 | # avg_frame is the mean over observed positions only; avg_frame_true divides
35 | # by the full 50 nt window
36 | stop_5p_sum <- group_by(input, V8) %>%
37 |   summarise(
38 |     avg_frame = mean(V4),
39 |     avg_frame_true = sum(V4) / 50,
40 |     total_counts = sum(V4)
41 |   )
42 | 
43 | # Get sum of normalized reads (i.e. normalized occurrence of 5'P ends [Pi] in
44 | # Lee et al 2019 Plant Cell) then calculate relative frequency per nt
45 | a1 <- group_by(input, V8) %>%
46 |   subset(pos == -28 | pos == -27) %>%
47 |   summarise(avg_ejc = mean(V4), avg_ejc_true = sum(V4) / 2) %>%
48 |   mutate(avg_frame = stop_5p_sum$avg_frame[match(V8, stop_5p_sum$V8)]) %>%
49 |   mutate(total_counts = stop_5p_sum$total_counts[match(V8, stop_5p_sum$V8)]) %>%
50 |   subset(total_counts >= 10) %>%
51 |   mutate(esi = avg_ejc / avg_frame)
52 | 
53 | # name output files
54 | name <- strsplit(as.character(args[1]), "Aligned")[[1]][1]
55 | 
56 | ## diagnostic plot on single sample
57 | ## points are coloured by the esi column (this script has no tsi column)
58 | if (nrow(a1) > 0) {
59 |   pdf(paste0(name, "ESI.pdf"))
60 |   plot(y=log2(a1$avg_ejc), x=log2(a1$avg_frame), col = ifelse(a1$esi > 2, "salmon", "grey"))
61 |   abline(a = 0, b = 1)
62 |   dev.off()
63 | } else {
64 |   message("No exon passed the filters; skipping diagnostic plot")
65 | }
66 | 
67 | ## output
68 | write.table(a1, paste0(name, "ESI.txt"), sep = "\t", quote = FALSE, row.names = FALSE)


--------------------------------------------------------------------------------
/RNA/README.md:
--------------------------------------------------------------------------------
 1 | ## Scripts for analysing RNA-sequencing
 2 | 
 3 | ### kallisto_pipe
 4 | Pipeline for using Kallisto (Bray et al 2016) for alignment-free transcript quantification. See individual version scripts for running updates and changes.
 5 | 
 6 | ### subread_pipe
 7 | Pipeline for using Subread (more specifically Subjunc) for read alignment for subsequent quantification with featureCounts. See individual script version for running changes and modifications.
 8 | 
 9 | ### featureCounts
10 | Use featureCounts for gene expression quantification. featureCounts_exon performs the same but at the exon-level rather than gene-level.
11 | 
12 | ### featureCounts_to_edgeR
13 | Template script for differential gene expression testing with edgeR (Robinson et al 2010).
14 | 
15 | ### RNAseq_bam_to_100bpwigs
16 | Produce 100bp windows (100bp.bed) of RNAseq coverage from BAMs across annotations of interest.
17 | 
18 | ### rel_expression_plots
19 | Get raw read coverage across features of interest, using WIG files as input (see RNAseq_bam_to_100bpwigs).
20 | 
21 | ### RNAseq_bam_to_bedgraph
22 | Produce coverage data from BAM files for RNAseq or ChIP data in bedgraph format, subsequently converting to bigWig files (IGV browsing).
23 | 
24 | ### SUPPA_pipe
25 | SUPPA2 pipeline using event-based analysis to detect alternate splicing and isoform usage.
26 | 
27 | ### split_file
28 | R script required for splitting files appropriately for SUPPA_pipe.
29 | 
30 | ### stringtie_pipe
31 | StringTie2 pipeline that performs reference-guided de novo transcript assembly and quantification.
32 | 
33 | ### stringtie_extract_tpm
34 | Supplementary script for stringtie_pipe to extract TPM based quantification from StringTie2 output.
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/RNA/SUPPA_pipe_v1.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -u
  3 | 
  4 | # Performs event-based splicing analysis using SUPPA2 based on kallisto output 
  5 | # https://github.com/comprna/SUPPA#command-and-subcommand-structure
  6 | # tutorial: https://github.com/comprna/SUPPA/wiki/SUPPA2-tutorial
  7 | 
  8 | ### CONDA environment
  9 | # conda create --name <name>
 10 | # conda install -n <name> -c bioconda suppa
 11 | 
 12 | 
 13 | if [ "$#" -lt 5 ]; then
 14 | echo "Missing arguments!"
 15 | echo "USAGE: SUPPA_pipe_v1.sh <annotation> <file dir> <group1> <group2> <name>"
 16 | echo "EXAMPLE: SUPPA_pipe_v1.sh /home/diepg/ref_seqs/AtRTD2/AtRTD2_QUASI_19April2016.gtf /home/diepg/ws/sal1_AS/raw_files/ col0_rep1,col0_rep2,col0_rep3 grp7_rep1,grp7_rep2,grp7_rep3 RTD2-quasi"
 17 | exit 1
 18 | fi
 19 | 
 20 | #### Parameters
 21 | ## annotation file
 22 | I=$1
 23 | ## events output name
 24 | N=$5
 25 | ## kallisto quant files
 26 | S=$2
 27 | # group 1 IDs
 28 | grp1=$3
 29 | # group 2 IDs
 30 | grp2=$4
 31 | 
 32 | ## quantification
 33 | mkdir kallisto_output
 34 | 
 35 | fls=$(dir $S)
 36 | for i in $fls; do 
 37 | mkdir kallisto_output/${i%%_kallisto*};
 38 | cp $S/${i}/*/abundance.tsv kallisto_output/${i%%_kallisto*}/abundance.tsv; done
 39 | 
 40 | #python3 ~/bin/SUPPA-2.3/multipleFieldSelection.py -i kallisto_output/*/abundance.tsv -k 1 -f 5 -o iso_tpm.txt
 41 | multipleFieldSelection.py -i kallisto_output/*/abundance.tsv -k 1 -f 5 -o iso_tpm.txt
 42 | 
 43 | ### generateEvents
 44 | mkdir generateEvents
 45 | cd generateEvents
 46 | 
 47 | ## generate transcript events
 48 | # python3 ~/bin/SUPPA-2.3/suppa.py generateEvents -i $I -o $N -f ioi
 49 | suppa.py generateEvents -i $I -o $N -f ioi
 50 | M="${N}.ioi"
 51 | 
 52 | ## generate local AS events
 53 | # python3 ~/bin/SUPPA-2.3/suppa.py generateEvents -i $I -o $N -f ioe -e SE SS MX RI FL
 54 | suppa.py generateEvents -i $I -o $N -f ioe -e SE SS MX RI FL
 55 | 
 56 | #Put all the ioe events in the same file:
 57 | awk '
 58 |     FNR==1 && NR!=1 { while (/^<header>/) getline; }
 59 |     1 {print}
 60 | ' *.ioe > ${N}.allevents.ioe
 61 | N="${N}.allevents.ioe"
 62 | 
 63 | mv $M ../
 64 | mv $N ../
 65 | 
 66 | awk '
 67 |     FNR==1 && NR!=1 { while (/^<header>/) getline; }
 68 |     1 {print}
 69 | ' *.gtf > ${N%%.allevents*}.allevents.gtf
 70 | mv *.allevents.gtf ../
 71 | 
 72 | cd ../
 73 | 
 74 | ### PSI per event
 75 | # python3 ~/bin/SUPPA-2.3/suppa.py psiPerEvent -i $N -e iso_tpm.txt -o ${N%%.allevents*}_events
 76 | suppa.py psiPerEvent -i $N -e iso_tpm.txt -o ${N%%.allevents*}_events
 77 | 
 78 | ### Differential splicing with local events
 79 | ## PSI and TPM per condition
 80 | Rscript $HOME/scripts/RNA/split_file.R ./iso_tpm.txt $grp1 $grp2 ${grp1%%_rep*}_iso.tpm ${grp2%%_rep*}_iso.tpm
 81 | 
 82 | Rscript $HOME/scripts/RNA/split_file.R ./${N%%.allevents*}_events.psi $grp1 $grp2 ${grp1%%_rep*}_events.psi ${grp2%%_rep*}_events.psi
 83 | 
 84 | ## differential splicing analysis
 85 | # python3 ~/bin/SUPPA-2.3/suppa.py diffSplice -m empirical -gc -i $N -p ${grp2%%_rep*}_events.psi ${grp1%%_rep*}_events.psi -e ${grp2%%_rep*}_iso.tpm ${grp1%%_rep*}_iso.tpm -o ${N%%.ioe}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_events
 86 | suppa.py diffSplice -m empirical -gc -i $N -p ${grp2%%_rep*}_events.psi ${grp1%%_rep*}_events.psi -e ${grp2%%_rep*}_iso.tpm ${grp1%%_rep*}_iso.tpm -o ${N%%.ioe}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_events
 87 | 
 88 | ## differential trascript usage
 89 | ### PSI per isoform
 90 | # python3 ~/bin/SUPPA-2.3/suppa.py psiPerIsoform -g $I -e iso_tpm.txt -o ${M%%.ioi}
 91 | suppa.py psiPerIsoform -g $I -e iso_tpm.txt -o ${M%%.ioi}
 92 | 
 93 | ### Split PSI between 2 conditions:
 94 | Rscript $HOME/scripts/RNA/split_file.R ./${M%%.ioi}_isoform.psi $grp1 $grp2 ${grp1%%_rep*}_iso.psi ${grp2%%_rep*}_iso.psi
 95 | 
 96 | ### diffsplice
 97 | # python3 ~/bin/SUPPA-2.3/suppa.py diffSplice -m empirical -gc -i $M -p  ${grp2%%_rep*}_iso.psi ${grp1%%_rep*}_iso.psi -e ${grp2%%_rep*}_iso.tpm ${grp1%%_rep*}_iso.tpm -o ${M%%.ioi}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_iso
 98 | suppa.py diffSplice -m empirical -gc -i $M -p  ${grp2%%_rep*}_iso.psi ${grp1%%_rep*}_iso.psi -e ${grp2%%_rep*}_iso.tpm ${grp1%%_rep*}_iso.tpm -o ${M%%.ioi}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_iso
 99 | 
100 | ## collect output
101 | mkdir suppa2_output
102 | mv ${N%%.ioe}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_events* -t suppa2_output
103 | mv ${M%%.ioi}_${grp2%%_rep*}-${grp1%%_rep*}_diffSplice_iso* -t suppa2_output
104 | 
105 | 
106 | 


--------------------------------------------------------------------------------
/RNA/SUPPA_pipe_v2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -u
 3 | 
 4 | # Performs event-based splicing analysis using SUPPA2 using stringtie TPM output
 5 | #
 6 | # Arguments:
 7 | #   $1  annotation (GTF) handed to SUPPA; may be a relative path
 8 | #   $2  directory of extracted TPM files, one per sample, named
 9 | #       <sample>_stringtie.tpm* (see stringtie_extract_tpm.r)
10 | #   $3  group 1 sample IDs, comma separated (e.g. col0_rep1,col0_rep2)
11 | #   $4  group 2 sample IDs, comma separated
12 | #   $5  name used as prefix for the event files
13 | #
14 | # Everything is written to the current directory; the diffSplice tables are
15 | # collected in suppa2_output/. SUPPA is run from ~/bin/SUPPA-2.3.
16 | 
17 | if [ "$#" -lt 5 ]; then
18 | echo "Missing arguments!"
19 | echo "USAGE: SUPPA_pipe_v2.sh <annotation> <file dir> <group1> <group2> <name>"
20 | echo "EXAMPLE: SUPPA_pipe_v2.sh AtRTD2_QUASI_19April2016.gtf raw_files/ col0_rep1,col0_rep2,col0_rep3 grp7_rep1,grp7_rep2,grp7_rep3 RTD2-quasi"
21 | exit 1
22 | fi
23 | 
24 | suppa_dir=~/bin/SUPPA-2.3
25 | 
26 | #### Parameters
27 | ## annotation file, made absolute because generateEvents runs after a cd
28 | ann_dir=$(cd -- "$(dirname -- "$1")" > /dev/null && pwd) || { echo "ERROR: cannot locate annotation $1" >&2; exit 1; }
29 | I="${ann_dir}/$(basename -- "$1")"
30 | ## events output name
31 | N=$5
32 | ## extracted TPM (see stringtie_extract_tpm.r)
33 | S=$2
34 | # group 1 / group 2 IDs and the condition labels derived from them
35 | grp1=$3
36 | grp2=$4
37 | c1=${grp1%%_rep*}
38 | c2=${grp2%%_rep*}
39 | 
40 | ## quantification: one directory per sample holding its abundance.tpm
41 | mkdir -p tpm_output
42 | 
43 | for f in "$S"/*; do
44 | 	[ -e "$f" ] || continue
45 | 	i=$(basename -- "$f")
46 | 	mkdir -p "tpm_output/${i%%_stringtie.tpm*}"
47 | 	cp "$f" "tpm_output/${i%%_stringtie.tpm*}/abundance.tpm"
48 | done
49 | 
50 | python3 "$suppa_dir/multipleFieldSelection.py" -i tpm_output/*/abundance.tpm -k 1 -f 3 -o iso_tpm.txt
51 | 
52 | ### generateEvents
53 | mkdir -p generateEvents
54 | cd generateEvents || { echo "ERROR: cannot enter generateEvents" >&2; exit 1; }
55 | 
56 | ## generate transcript events
57 | python3 "$suppa_dir/suppa.py" generateEvents -i "$I" -o "$N" -f ioi
58 | M="${N}.ioi"
59 | 
60 | ## generate local AS events
61 | python3 "$suppa_dir/suppa.py" generateEvents -i "$I" -o "$N" -f ioe -e SE SS MX RI FL
62 | 
63 | #Put all the ioe events in the same file:
64 | awk '
65 |     FNR==1 && NR!=1 { while (/^<header>/) getline; }
66 |     1 {print}
67 | ' *.ioe > "${N}.allevents.ioe"
68 | N="${N}.allevents.ioe"
69 | 
70 | mv "$M" ../
71 | mv "$N" ../
72 | 
73 | awk '
74 |     FNR==1 && NR!=1 { while (/^<header>/) getline; }
75 |     1 {print}
76 | ' *.gtf > "${N%%.allevents*}.allevents.gtf"
77 | mv *.allevents.gtf ../
78 | 
79 | cd ../ || exit 1
80 | 
81 | ### PSI per event
82 | python3 "$suppa_dir/suppa.py" psiPerEvent -i "$N" -e iso_tpm.txt -o "${N%%.allevents*}_events"
83 | 
84 | ### Differential splicing with local events
85 | ## PSI and TPM per condition
86 | Rscript "$HOME/scripts/RNA/split_file.R" ./iso_tpm.txt "$grp1" "$grp2" "${c1}_iso.tpm" "${c2}_iso.tpm"
87 | 
88 | Rscript "$HOME/scripts/RNA/split_file.R" "./${N%%.allevents*}_events.psi" "$grp1" "$grp2" "${c1}_events.psi" "${c2}_events.psi"
89 | 
90 | ## differential splicing analysis (group 2 vs group 1)
91 | python3 "$suppa_dir/suppa.py" diffSplice -m empirical -gc -i "$N" -p "${c2}_events.psi" "${c1}_events.psi" -e "${c2}_iso.tpm" "${c1}_iso.tpm" -o "${N%%.ioe}_${c2}-${c1}_diffSplice_events"
92 | 
93 | ## differential transcript usage
94 | ### PSI per isoform
95 | python3 "$suppa_dir/suppa.py" psiPerIsoform -g "$I" -e iso_tpm.txt -o "${M%%.ioi}"
96 | 
97 | ### Split PSI between 2 conditions:
98 | Rscript "$HOME/scripts/RNA/split_file.R" "./${M%%.ioi}_isoform.psi" "$grp1" "$grp2" "${c1}_iso.psi" "${c2}_iso.psi"
99 | 
100 | ### diffsplice
101 | python3 "$suppa_dir/suppa.py" diffSplice -m empirical -gc -i "$M" -p "${c2}_iso.psi" "${c1}_iso.psi" -e "${c2}_iso.tpm" "${c1}_iso.tpm" -o "${M%%.ioi}_${c2}-${c1}_diffSplice_iso"
102 | 
103 | ## collect output
104 | mkdir -p suppa2_output
105 | mv "${N%%.ioe}_${c2}-${c1}_diffSplice_events"* -t suppa2_output
106 | mv "${M%%.ioi}_${c2}-${c1}_diffSplice_iso"* -t suppa2_output


--------------------------------------------------------------------------------
/RNA/TSI_calculation.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Terminal stalling index (TSI) from 5'P end counts around the stop codon.
 3 | # Companion to BAM_to_TSI.sh.
 4 | # args[1] = filename (headerless, tab-delimited)
 5 | # Writes <prefix>TSI.pdf and <prefix>TSI.txt, where <prefix> is args[1] up to
 6 | # the first occurrence of "Aligned".
 7 | 
 8 | options(echo = TRUE)
 9 | library(fields)
10 | library(tidyverse)
11 | args <- commandArgs(trailingOnly = TRUE)
12 | print(args)
13 | 
14 | # Load the table and drop plastid / mitochondrial chromosomes
15 | raw <- read.delim(args[1], header = FALSE)
16 | nuclear <- subset(raw, V1 != "ChrM" & V1 != "ChrC" & V1 != "Mt" & V1 != "Pt")
17 | 
18 | # Strand-aware position relative to the first base of the stop codon,
19 | # restricted to the window -49..0
20 | input <- nuclear %>%
21 |   mutate(pos = ifelse(V10 == "+", V2 - V6, V7 - V3)) %>%
22 |   subset(pos > -50 & pos < 1)
23 | 
24 | # Per-feature totals over the window
25 | stop_5p_sum <- input %>%
26 |   group_by(V8) %>%
27 |   summarise(
28 |     avg_frame = mean(V4),
29 |     avg_frame_true = sum(V4) / 50,
30 |     total_counts = sum(V4)
31 |   )
32 | 
33 | # Counts at positions -17 / -16 (normalized occurrence of 5'P ends [Pi] in
34 | # Lee et al 2019 Plant Cell), then relative frequency per nt
35 | a1 <- input %>%
36 |   group_by(V8) %>%
37 |   subset(pos %in% c(-17, -16)) %>%
38 |   summarise(avg_ctrd = mean(V4), avg_ctrd_true = sum(V4) / 2)
39 | 
40 | # Bring in the window summaries, keep features with at least 10 counts
41 | window_row <- match(a1$V8, stop_5p_sum$V8)
42 | a1$avg_frame <- stop_5p_sum$avg_frame[window_row]
43 | a1$total_counts <- stop_5p_sum$total_counts[window_row]
44 | a1 <- subset(a1, total_counts >= 10)
45 | a1$tsi <- a1$avg_ctrd / a1$avg_frame
46 | 
47 | # Output prefix
48 | name <- strsplit(as.character(args[1]), "Aligned")[[1]][1]
49 | 
50 | ## diagnostic plot on single sample (features with tsi > 2 highlighted)
51 | pdf(paste0(name, "TSI.pdf"))
52 | plot(y=log2(a1$avg_ctrd), x=log2(a1$avg_frame), col = ifelse(a1$tsi > 2, "salmon", "grey"))
53 | abline(a = 0, b = 1)
54 | dev.off()
55 | 
56 | ## output
57 | write.table(a1, paste0(name, "TSI.txt"), sep = "\t", quote = FALSE, row.names = FALSE)


--------------------------------------------------------------------------------
/RNA/featureCounts_v1.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Use featureCounts to assign counts to annotated features (e.g. genes, transposons) from aligned BAM files
  4 | 
  5 | set -eu
  6 | 
  7 | if [ "$#" -lt 5 ]; then
  8 | echo "Missing arguments!"
  9 | echo "USAGE: RNAseq_featureCounts.sh <filename> <SE/PE> <library strandedness> <bedfile> <bedfile format> <outname>"
 10 | echo "EXAMPLE: RNAseq_featureCounts.sh col0-r1.sorted.bam PE 1 AtRTD2_19April2016.gtf gtf RTD2"
 11 | echo "library strandedness: 0 = unstranded, 1 = stranded, 2 = reverse stranded"
 12 | echo "format: saf, bed, gtf"
 13 | exit 1
 14 | fi
 15 | 
 16 | sample=$1
 17 | layout=$2
 18 | strand=$3
 19 | bedfile=$4
 20 | format=$5
 21 | outname=$6
 22 | 
 23 | echo ""
 24 | echo "sample = $1"
 25 | echo "layout = $2"
 26 | echo "strand = $3"
 27 | echo "bedfile = $4 ($format) ($outname)"
 28 | echo ""
 29 | echo "$layout $strand featureCounts on $bedfile $format ($outname) in $sample ..."
 30 | echo ""
 31 | 
 32 | if [[ $format == "saf" ]]; then
 33 | 	
 34 | 	if [[ $layout == "SE" ]]; then
 35 | 		featureCounts\
 36 | 			-F 'SAF'\
 37 | 			-C\
 38 | 			-T 2\
 39 | 			-s $strand\
 40 | 			-a $bedfile\
 41 | 		        -o "${1%%.bam*}_${outname}.counts"\
 42 | 		        $sample 2>&1 | tee -a ../*log
 43 | 	fi
 44 | 
 45 | 	if [[ $layout == "PE" ]]; then
 46 | 		featureCounts\
 47 | 			-F SAF\
 48 | 			-p\
 49 | 			-C\
 50 | 			-T 2\
 51 | 			-s $strand\
 52 | 			-a $bedfile\
 53 | 			-o "${1%%.bam*}_${outname}.counts"\
 54 | 			$sample 2>&1 | tee -a ../*log
 55 | 	fi
 56 | 
 57 | fi
 58 | 
 59 | if [[ $format == "bed" ]]; then
 60 | 	## convert BED to SAF format
 61 | 	awk -F'\t' '{print $4"\t"$1"\t"$2"\t"$3"\t"$6}' $bedfile > temp.saf
 62 | 	awk 'BEGIN {print "GeneID""\t""Chr""\t""Start""\t""End""\t""Strand"}{print}' temp.saf > temp2.saf
 63 | 	
 64 | 	if [[ $layout == "SE" ]]; then 
 65 | 		featureCounts\
 66 | 			-F SAF\
 67 | 			-C\
 68 | 			-T 2\
 69 | 			-s $strand\
 70 | 			-a temp2.saf\
 71 | 			-o "${1%%.bam*}_${outname}.counts"\
 72 | 			$sample 2>&1 | tee -a ../*log
 73 | 	fi
 74 | 		
 75 | 	if [[ $layout == "PE" ]]; then
 76 | 	       featureCounts\
 77 | 		       -F SAF\
 78 | 			-p\
 79 | 			-C\
 80 | 			-T 2\
 81 | 			-s $strand\
 82 | 			-a temp2.saf\
 83 | 			-o "${1%%.bam*}_${outname}.counts"\
 84 | 			$sample 2>&1 | tee -a ../*log
 85 | 	fi
 86 | 
 87 | rm temp*.saf -v
 88 | 
 89 | fi
 90 | 
 91 | if [[ $format == "gtf" ]]; then
 92 | 
 93 | 	if [[ $layout == "SE" ]]; then
 94 | 		featureCounts\
 95 | 			-F GTF\
 96 | 			-C\
 97 | 			-T 4\
 98 | 			-s $strand\
 99 | 			-a $bedfile\
100 | 			-o "${1%%.bam*}_${outname}.counts"\
101 | 			$sample 2>&1 | tee -a ../*log
102 | 	fi
103 | 	
104 | 	if [[ $layout == "PE" ]]; then
105 | 		featureCounts\
106 | 			-F GTF\
107 | 			-p\
108 | 			-C\
109 | 			-T 2\
110 | 			-s $strand\
111 | 			-a $bedfile\
112 | 			-o "${1%%.bam*}_${outname}.counts"\
113 | 		        $sample 2>&1 | tee -a ../*log
114 | 	fi
115 | fi
116 | 
117 | echo "DONE"
118 | 


--------------------------------------------------------------------------------
/RNA/featureCounts_v2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Perform featureCounts on aligned BAM files to assign transcript counts PER EXON based on the AtRTD2 reference transcript dataset (or quasi - for alternative splicing)
 4 | # Additional info https://www.biostars.org/p/321379/
 5 | # https://www.bioconductor.org/packages/devel/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
 6 | # Zhang R et al (2017). A high quality Arabidopsis transcriptome for accurate transcript-level analysis of alternative splicing. Nucleic Acids Res. 45: 5061–5073.
 7 | #
 8 | # Arguments:
 9 | #   $1  BAM file
10 | #   $2  layout, SE or PE (PE adds -p)
11 | #   $3  library strandedness passed to -s: 0, 1 or 2
12 | #   $4  reference: rtd2 (AtRTD2) or padded (AtRTD2 QUASI)
13 | #
14 | # Output: <BAM name up to ".bam">_RTD2.counts or ..._quasi.counts;
15 | # featureCounts output is also appended to ../*log.
16 | 
17 | # pipefail: without it a featureCounts failure is masked by the pipe to tee
18 | set -euo pipefail
19 | 
20 | if [ "$#" -lt 4 ]; then
21 | echo "Missing arguments!"
22 | echo "USAGE: RNAseq_featureCounts.sh <filename> <SE/PE> <library strandedness> <rtd2 or padded>"
23 | echo "EXAMPLE: RNAseq_featureCounts.sh col0-r1.bam PE 2 rtd2"
24 | echo "library strandedness: 0 = unstranded, 1 = stranded, 2 = reverse stranded"
25 | exit 1
26 | fi
27 | 
28 | sample=$1
29 | layout=$2
30 | strand=$3
31 | ref=$4
32 | 
33 | if [[ $ref == "rtd2" ]]; then
34 | 	bedfile="$HOME/ref_seqs/AtRTD2/AtRTD2_19April2016.gtf"
35 | 	outname="${sample%%.bam*}_RTD2.counts"
36 | elif [[ $ref == "padded" ]]; then
37 | 	bedfile="$HOME/ref_seqs/AtRTD2/AtRTDv2_QUASI_19April2016.gtf"
38 | 	outname="${sample%%.bam*}_quasi.counts"
39 | else
40 | 	echo " bad argument - pick 'rtd2' or 'padded' "
41 | 	exit 1
42 | fi
43 | 
44 | if [[ $layout != "SE" && $layout != "PE" ]]; then
45 | 	echo "ERROR: layout must be SE or PE (got '$layout')" >&2
46 | 	exit 1
47 | fi
48 | 
49 | echo ""
50 | echo "sample = $1"
51 | echo "layout = $2"
52 | echo "strand = $3 where 0 = unstranded, 1 = stranded, 2 = reverse stranded"
53 | echo "$bedfile"
54 | echo ""
55 | echo "Exon feature counting - $layout $strand featureCounts on $bedfile in $sample ..."
56 | echo ""
57 | 
58 | # -C - Do not count read pairs matching different chromosomes
59 | # -f - Perform read counting at feature level (e.g. exons vs genes)
60 | # -t - Specify feature
61 | # -g - Specify attribute
62 | # -O - Keep reads assigned to multiple features
63 | # -p - paired-end reads, count fragments
64 | fc_args=(-F GTF -C -T 4 -f -t exon -g gene_id -O -s "$strand" -a "$bedfile" -o "$outname" "$sample")
65 | if [[ $layout == "PE" ]]; then
66 | 	fc_args=(-p "${fc_args[@]}")
67 | fi
68 | 
69 | featureCounts "${fc_args[@]}" 2>&1 | tee -a ../*log
70 | 
71 | echo "DONE"


--------------------------------------------------------------------------------
/RNA/featureCounts_v3-gtf.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Perform featureCounts on aligned BAM files to assign counts across specified features in a GTF file
 4 | 
 5 | set -eu
 6 | 
 7 | if [ "$#" -lt 5 ]; then
 8 | echo "Missing arguments!"
 9 | echo "USAGE: RNAseq_featureCounts_v3-gtf.sh <filename> <SE/PE> <library strandedness> <bedfile> <gtf feature>"
10 | echo "EXAMPLE: RNAseq_featureCounts_v3-gtf.sh col0-r1.bam PE 2 Arabidopsis.gtf gene"
11 | echo "library strandedness: 0 = unstranded, 1 = stranded, 2 = reverse stranded"
12 | exit 1
13 | fi
14 | 
15 | # Positional arguments
16 | sample=$1     # aligned reads; the output name is derived by cutting at ".bam"
17 | layout=$2     # SE or PE
18 | strand=$3     # forwarded to featureCounts -s
19 | bedfile=$4    # annotation forwarded to featureCounts -a (parsed as GTF, see -F GTF)
20 | feat=$5       # feature type forwarded to featureCounts -t
21 | outname="${sample%%.bam*}_${feat}.gtf.counts"
22 | 
23 | echo ""
24 | echo "sample = $1"
25 | echo "layout = $2"
26 | echo "strand = $3 where 0 = unstranded, 1 = stranded, 2 = reverse stranded"
27 | echo "Counts at $5 in $bedfile"
28 | echo ""
29 | 
30 | # Translate the layout into the paired-end switch. Any value other than SE/PE is
31 | # rejected here; previously it fell through both tests and the script printed
32 | # DONE without counting anything.
33 | case "$layout" in
34 |     SE) layout_opt="" ;;
35 |     PE) layout_opt="-p" ;;
36 |     *)
37 |         echo "ERROR: layout must be SE or PE (got '$layout')" >&2
38 |         exit 1
39 |         ;;
40 | esac
41 | 
42 | # -C - Do not count read pairs matching different chromosomes
43 | # -t - Specify feature
44 | # -g - Specify attribute
45 | # -O - Keep reads assigned to multiple features
46 | # -p - paired-end reads, count fragments
47 | # -M - Multi-mapping reads will also be counted.
48 | 
49 | # $layout_opt is deliberately unquoted so that it disappears for SE libraries
50 | featureCounts -F GTF $layout_opt -C -M -T 4 -t "$feat" -g gene_id -O -s "$strand" -a "$bedfile" -o "$outname" "$sample"
51 | 
52 | echo "DONE"


--------------------------------------------------------------------------------
/RNA/get_peak_length.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -u
 4 | 
 5 | ## get average peak length from macs2 peak calling output (bed file)
 6 | 
 7 | if [ "$#" -lt 1 ]; then
 8 |         echo "Missing required arguments!"
 9 |         echo "USAGE: get_peak_length.sh <sample>"
10 |         echo "EXAMPLE: get_peak_length.sh col0-r1.merged.bed"
11 |         exit 1
12 | fi
13 | 
14 | sample=$1
15 | 
16 | # Mean interval length (column 3 minus column 2) over every line of the file.
17 | # A file without any lines is reported as an error instead of letting awk abort
18 | # with a division by zero.
19 | awk '
20 |     { sum += $3 - $2 }
21 |     END {
22 |         if (NR == 0) {
23 |             print "ERROR: no intervals found in input" > "/dev/stderr"
24 |             exit 1
25 |         }
26 |         print sum / NR
27 |     }
28 | ' "$sample"


--------------------------------------------------------------------------------
/RNA/graft-nad-seq.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -eu
  3 | 
  4 | # Single-end read alignment for GRAFT-NAD-seq libraries with stringent STAR parameters
  5 | 
  6 | # Prepare STAR index based on TAIR10 reference
  7 | # wget ftp://ftp.ensemblgenomes.org/pub/release-47/plants/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz
  8 | # samtools faidx Arabidopsis_thaliana.TAIR10.dna.toplevel.fa
  9 | # cut -f1,2 Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.fai > Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len
 10 | 
 11 | # Build STAR genome index
 12 | # STAR --runThreadN 4 --runMode genomeGenerate --genomeSAindexNbases 12 --sjdbGTFfile Arabidopsis_thaliana.TAIR10.54.gtf --genomeDir /path/to/GenomeDir/ --genomeFastaFiles Arabidopsis_thaliana.TAIR10.dna.toplevel.fa
 13 | 
 14 | ### CONDA environment is installed
 15 | # conda create --name graft_nad
 16 | # conda install -n graft_nad -c bioconda fastqc
 17 | # conda install -n graft_nad -c bioconda cutadapt
 18 | # conda install -n graft_nad -c bioconda star
 19 | # conda install -n graft_nad -c bioconda seqkit 
 20 | # conda install -n graft_nad -c conda-forge r-tidyverse
 21 | # conda install -n graft_nad conda-forge::parallel ## NOT IMPLEMENTED YET
 22 | 
 23 | # Make a pipeline fail when any stage fails, so errors upstream of `tee` stop the run
 24 | set -o pipefail
 25 | 
 26 | if [ "$#" -ne 3 ]; then
 27 | echo "Missing required arguments!"
 28 | echo "USAGE: graft-nad-seq.sh <fastq R2> </path/to/index> <fileID>"
 29 | echo "EXAMPLE: graft-nad-seq.sh sample.fastq ~/ref_seqs/STAR/TAIR10/GenomeDir sample_rep1"
 30 | exit 1
 31 | fi
 32 | 
 33 | #gather input variables
 34 | fq=$1
 35 | fileID=$3
 36 | # Resolve the index to an absolute path: the script changes directory before STAR runs
 37 | index=$(cd "$2" && pwd)
 38 | # %M is the minute; the former %m repeated the month in the time stamp
 39 | dow=$(date +"%F-%H-%M")
 40 | workdir="${fileID}_graft-nad_${dow}"
 41 | log="../${fileID}_logs_${dow}.log"   # every logging step runs one level below the work directory
 42 | 
 43 | echo "##################"
 44 | echo "Performing single-end alignment with the following parameters:"
 45 | echo "Input Files: $fq"
 46 | echo "genome index: $index"
 47 | echo "Output ID: $fileID"
 48 | echo "Time of analysis: $dow"
 49 | echo "##################"
 50 | 
 51 | # make sample work directory; once the input is moved only its file name is still valid
 52 | mkdir "$workdir"
 53 | mv "$fq" "$workdir"
 54 | fq=$(basename "$fq")
 55 | cd "$workdir"
 56 | 
 57 | echo "Extract branch sequence and trim adapters"
 58 | mkdir 1_read_trimming
 59 | mv "$fq" 1_read_trimming
 60 | cd 1_read_trimming
 61 | 
 62 | ## extract reads beginning with branch sequence (GCTTGTTGTG) with flexibility at first and last base
 63 | if [[ $fq == *"fq.gz" ]]; then
 64 | 	fq_branch="${fq%%.fq*}_branch.fq.gz"
 65 | else
 66 | 	fq_branch="${fq%%.fastq*}_branch.fq.gz"
 67 | fi
 68 | seqkit grep -j 12 -s -r -p "^.CTTGTTGT" "$fq" -o "$fq_branch"
 69 | 
 70 | echo "Trim universal PCR primer sequence from 3' end of read"
 71 | ## remove universal PCR primer at the 3' end of reads
 72 | trimmed="${fq_branch%%.fq*}_3p_trimmed.fq.gz"
 73 | cutadapt -a "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT" -e 0.2 -m 25 -o "$trimmed" "$fq_branch" 2>&1 | tee -a "$log"
 74 | 
 75 | ## old flags to trim 5' adapter sequences -- obsolete
 76 | #cutadapt -g "^NCTTGTTGTB" -g "^NCTTGTTGTBB" -g "^NCTTGTTGTBBB" -g  "^NCTTGTTGTBBBG"
 77 | 
 78 | ## R script - trim reads first A at the 5' end of the read, retain read only if A within first 15 bp (10bp branch + flexibility for RT jumping)
 79 | echo "Filter and trim reads based on A at 5' end"
 80 | 
 81 | ## split fq file in 12 files for multi-threading
 82 | ## only seqkit's messages are redirected to the log; zcat's stderr must stay out of the FASTQ stream
 83 | zcat "$trimmed" | seqkit split -p 12 2>&1 | tee -a "$log"
 84 | 
 85 | ## run trimming R script on split files in parallel - first numeric argument determines length in which A needs to occur (branch sequence = 10 nts)
 86 | parallel -j 12 Rscript ~/scripts/RNA/trim_5p_graft_nad.r 15 {} ::: stdin.split/stdin.part_*.fastq 2>&1 | tee -a "$log"
 87 | 
 88 | ## concatenate output files (expected next to the split parts) and compress the result
 89 | cat stdin.split/*processed.fq > "${fileID}_processed_output.fq"
 90 | pigz -p 4 "${fileID}_processed_output.fq"
 91 | 
 92 | # clean up intermediates
 93 | rm -r stdin.split
 94 | 
 95 | ## qc filtered and trimmed reads
 96 | fastqc -t 12 "${fileID}_processed_output.fq.gz" 2>&1 | tee -a "$log"
 97 | 
 98 | cd ../
 99 | mkdir 0_fastq
100 | mv "1_read_trimming/$fq" 0_fastq/
101 | mv "1_read_trimming/$fq_branch" 0_fastq/
102 | 
103 | # read alignment
104 | echo "Align filtered and trimmed reads"
105 | 
106 | mkdir 2_align
107 | mv "1_read_trimming/${fileID}_processed_output.fq.gz" 2_align/
108 | cd 2_align
109 | 
110 | STAR --runThreadN 12 --outFilterMismatchNmax 0 --outFilterMultimapNmax 1 --genomeDir "$index" --readFilesCommand gunzip -c --readFilesIn "${fileID}_processed_output.fq.gz" --outFileNamePrefix "${fileID}_" --outSAMtype BAM SortedByCoordinate --limitBAMsortRAM 8000000000 2>&1 | tee -a "$log"
111 | 
112 | mv "${fileID}_processed_output.fq.gz" ../1_read_trimming/
113 | 
114 | echo "Alignment complete"


--------------------------------------------------------------------------------
/RNA/nadBAM_to_ADPRC_sites.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -u
 3 | 
 4 | # Compute NAD enrichment per site (start of read) by providing +ADPRC and -ADPRC samples as BAM aligned reads
 5 | 
 6 | ### CONDA environment is installed
 7 | # conda create --name ngs_plots
 8 | # conda install -n ngs_plots -c bioconda bedtools
 9 | # conda install -n ngs_plots r-fields
10 | # conda install -n ngs_plots -c r r-tidyverse
11 | # conda activate ngs_plots
12 | 
13 | 
14 | if [ "$#" -lt 5 ]; then
15 | echo "Missing arguments!"
16 | echo "USAGE: graft_nad_adprc_enrich.sh <+adprc bam> <-adprc bam> <sample name> <bedfile> <feature>"
17 | echo "EXAMPLE: graft_nad_adprc_enrich.sh WT_plus-a_rep1.sorted.bam WT_minus-a_rep1.sorted.bam WT_rep1 tss.bed tss"
18 | exit 1
19 | fi
20 | 
21 | smp_p=$1      # +ADPRC BAM
22 | smp_m=$2      # -ADPRC BAM
23 | out=$3        # prefix of the output file
24 | bedfile=$4    # only referenced by the disabled closestBed step below
25 | feature=$5    # only referenced by the disabled closestBed step below
26 | 
27 | echo "###############"
28 | echo "+ADPRC sample = $1"
29 | echo "-ADPRC sample= $2"
30 | echo "Sample ID = $3"
31 | echo "Bedfile = $4"
32 | echo "Feature name = $5"
33 | echo "#############"
34 | 
35 | echo "calculate scaling factors"
36 | # Read counts with flag filter 260; the scale factor is 1e6 divided by that count
37 | n_p=$(samtools view -F 260 -c "$smp_p")
38 | n_m=$(samtools view -F 260 -c "$smp_m")
39 | 
40 | # A zero or missing count would make bc divide by zero and hand bedtools an empty -scale value
41 | if ! [[ $n_p =~ ^[1-9][0-9]*$ && $n_m =~ ^[1-9][0-9]*$ ]]; then
42 | 	echo "ERROR: could not obtain a positive read count for both BAM files (+: '$n_p', -: '$n_m')" >&2
43 | 	exit 1
44 | fi
45 | 
46 | scl_p=$(bc <<< "scale=6;1000000/$n_p")
47 | scl_m=$(bc <<< "scale=6;1000000/$n_m")
48 | 
49 | echo "BAM to bed..."
50 | bed_p="${smp_p%%.bam}.5p.bed"
51 | bed_m="${smp_m%%.bam}.5p.bed"
52 | bedtools genomecov -bg -5 -scale "$scl_p" -ibam "$smp_p" > "$bed_p"
53 | bedtools genomecov -bg -5 -scale "$scl_m" -ibam "$smp_m" > "$bed_m"
54 | 
55 | echo "Combine + and -ADPRC samples to calculate per-nt NAD%"
56 | # Column 4 = plus, column 5 = minus. Keep rows with plus > 3 whose share plus / (plus + minus)
57 | # exceeds 0.9, and append that share as an extra column. The sum is tested first so a row
58 | # with a zero total can never trigger a division by zero.
59 | bedtools unionbedg -names plus minus -i "$bed_p" "$bed_m" \
60 | 	| awk 'BEGIN {FS=OFS="\t"} ($4 > 3) && (($4 + $5) > 0) { prop = $4 / ($4 + $5); if (prop > 0.9) print $0, prop }' > "${out}.nad.5p.bed"
61 | 
62 | #echo "closestBed..."
63 | #closestBed -D "a" -a ${out}.nad.5p.bed -b $bedfile > ${out}_${feature}_nad.5p.bed
64 | #awk -F$'\t' '$NF<51 && $NF>-51' ${out}_${feature}_nad.5p.bed > ${out}_${feature}_20bp_nad.5p.bed 
65 | 
66 | #echo "Maths ..."
67 | #Rscript /home/dganguly/scripts/RNA/rel_expression_plots_nad.r ${out}_${feature}_20bp_nad.5p.bed
68 | 
69 | echo 'cleaning'
70 | rm -v "$bed_p" "$bed_m"


--------------------------------------------------------------------------------
/RNA/pare_pipe_v1.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eu
 3 | 
 4 | # Read alignment for PARE libraries with STAR
 5 | # SE only, trim adapters and low quality basecalls (reduce length cutoff for trimming), map reads up to 20 bp length with 0 mismatches using STAR.
 6 | 
 7 | # Retrieve TAIR10 reference and prepare STAR index
 8 | # wget ftp://ftp.ensemblgenomes.org/pub/release-47/plants/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz
 9 | # samtools faidx Arabidopsis_thaliana.TAIR10.dna.toplevel.fa
10 | # cut -f1,2 Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.fai > Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len
11 | 
12 | # Build STAR genome index
13 | # STAR --runThreadN 4 --runMode genomeGenerate --genomeSAindexNbases 12 --sjdbGTFfile Arabidopsis_thaliana.TAIR10.54.gtf --genomeDir /path/to/GenomeDir/ --genomeFastaFiles Arabidopsis_thaliana.TAIR10.dna.toplevel.fa
14 | 
15 | ### CONDA environment is installed
16 | # conda create --name <name>
17 | # conda install -n <name> -c bioconda fastqc
18 | # conda install -n <name> -c bioconda star
19 | # conda install -n <name> -c bioconda bedtools
20 | # conda install -c bioconda fastx_toolkit
21 | 
22 | # Make a pipeline fail when any stage fails, so errors upstream of `tee` stop the run
23 | set -o pipefail
24 | 
25 | if [ "$#" -lt 3 ]; then
26 | echo "Missing required arguments!"
27 | echo "USAGE: pare_pipe_v1.sh <fastq R1> </path/to/index> <fileID>"
28 | echo "EXAMPLE: pare_pipe_v1.sh sample.fastq /home/dganguly/ref_seqs/STAR/TAIR10/GenomeDir sample_rep1"
29 | exit 1
30 | fi
31 | 
32 | #gather input variables
33 | fq=$1
34 | index=$(cd "$2" && pwd)   # path to STAR index, made absolute because the script changes directory
35 | fileID=$3
36 | dow=$(date +"%F-%H-%M")   # %M is the minute; the former %m repeated the month
37 | workdir="${fileID}_pare_${dow}"
38 | log="${fileID}_logs_${dow}.log"   # lives in the work directory; steps in sub-folders use ../$log
39 | 
40 | echo "##################"
41 | echo "Performing single-end alignment with the following parameters:"
42 | echo "Input Files: $fq"
43 | echo "genome index: $index"
44 | echo "Output ID: $fileID"
45 | echo "Time of analysis: $dow"
46 | echo "##################"
47 | 
48 | # make sample work directory; once the input is moved only its file name is still valid
49 | mkdir "$workdir"
50 | mv "$fq" "$workdir"
51 | fq=$(basename "$fq")
52 | cd "$workdir"
53 | 
54 | # gzip if unzipped input file
55 | if [[ $fq != *.gz ]]; then gzip "$fq"; fq="${fq}.gz"; fi
56 | 
57 | # sample stem used for the derived file names: drop .gz, then a .fastq or .fq extension
58 | base=${fq%.gz}; base=${base%.fastq}; base=${base%.fq}
59 | 
60 | # initial fastqc
61 | mkdir 1_fastqc
62 | fastqc -t 8 "$fq" 2>&1 | tee -a "$log"
63 | mv "${base}"_fastqc* 1_fastqc
64 | 
65 | echo "Read trimming... "
66 | # Trim_galore: remove adapters and low quality base-calls, retain reads as small as 10 bp, generate fastqc report on trimmed reads.
67 | mkdir 2_read_trimming
68 | cd 2_read_trimming
69 | trim_galore --length 10 --fastqc --fastqc_args "-t 8" "../$fq" 2>&1 | tee -a "../$log"
70 | 
71 | cd ../
72 | mkdir 0_fastq
73 | mv "$fq" 0_fastq
74 | 
75 | mkdir 3_align
76 | mv "2_read_trimming/${base}_trimmed.fq.gz" -t 3_align/
77 | cd 3_align
78 | echo "Beginning alignment ..."
79 | 
80 | # truncate reads to their first 20 nt
81 | # NOTE(review): only -l 20 is passed, so no minimum-length filter appears to be applied here - confirm whether reads shorter than 20 nt should be discarded
82 | input="${base}.20bp.trimmed.fq.gz"
83 | zcat "${base}_trimmed.fq.gz" | fastx_trimmer -z -l 20 -o "$input"
84 | 
85 | # align using STAR allowing 0 mismatches
86 | STAR --runThreadN 8 --outFilterMismatchNmax 0 --outFilterScoreMinOverLread 0.75 --outFilterMatchNminOverLread 0.75 --outFilterMultimapNmax 1 --genomeDir "$index" --readFilesCommand gunzip -c --readFilesIn "$input" --outFileNamePrefix "${fileID}_" --outSAMtype BAM SortedByCoordinate 2>&1 | tee -a "../$log"
87 | 
88 | echo "cleaning..."
89 | 
90 | outbam="${fileID}_Aligned.sortedByCoord.out.bam"
91 | samtools index "$outbam" 2>&1 | tee -a "../$log"
92 | mv *trimmed.fq.gz ../2_read_trimming/
93 | 
94 | echo "Alignment complete"


--------------------------------------------------------------------------------
/RNA/rel_expression_plots.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # args[1] = filename
 3 | # Run on output of BAM_to_bedgraph.sh or BAM_to_bedgraph_5p.sh
 4 | # Summarise read depth across all features of interest by binned positions
 5 | 
 6 | options(echo = TRUE)
 7 | library(fields)
 8 | args <- commandArgs(trailingOnly = TRUE)
 9 | print(args)
10 | 
11 | # The input file is the only argument; fail with a usage message when it is missing
12 | if (length(args) < 1) {
13 |   stop("USAGE: rel_expression_plots.r <file.bed>", call. = FALSE)
14 | }
15 | bed_path <- args[1]
16 | 
17 | # Read in file
18 | input <- read.delim(bed_path, header = FALSE)
19 | 
20 | # Remove plastids and unmatched rows (last column equal to -1)
21 | input <- subset(input, !(V1 %in% c("ChrM", "ChrC", "Mt", "Pt")))
22 | input <- subset(input, input[, ncol(input)] != -1)
23 | 
24 | # Columns used below. Their meaning is inferred from how they are combined,
25 | # presumably read start (2), feature start/end (6, 7), strand (10) and
26 | # distance to the feature (V11) - TODO confirm against the upstream script.
27 | read_pos <- input[, 2]
28 | feat_start <- input[, 6]
29 | feat_end <- input[, 7]
30 | strand <- input[, 10]
31 | dist <- input$V11
32 | 
33 | # calculate normalized distance values for reads relative to feature:
34 | # rows with distance 0 become a 0-100 percentage along the feature (measured
35 | # from the end for "-" strand rows), positive distances are shifted by +100,
36 | # negative distances are kept as they are
37 | feat_len <- feat_end - feat_start
38 | within_pct <- ifelse(
39 |   strand == "-",
40 |   ((feat_end - read_pos) / feat_len) * 100,
41 |   ((read_pos - feat_start) / feat_len) * 100
42 | )
43 | rel_dist <- ifelse(dist == 0, within_pct, ifelse(dist > 0, dist + 100, dist))
44 | 
45 | # clamp the percentage of distance-0 rows to the 0-100 range
46 | rel_dist <- ifelse(
47 |   rel_dist < 0 & dist == 0,
48 |   0,
49 |   ifelse(rel_dist > 100 & dist == 0, 100, rel_dist)
50 | )
51 | 
52 | # bin read depth (V4) by distance and keep the mean of each of the 100 bins
53 | exp_bin <- stats.bin(rel_dist, input$V4, N = 100)
54 | out <- cbind(matrix(exp_bin$centers, ncol = 1), exp_bin$stats["mean", ])
55 | 
56 | # value column is named after the text before the first "_" of the input name
57 | sample_name <- strsplit(bed_path, "_", fixed = TRUE)[[1]][1]
58 | colnames(out) <- c("pos", sample_name)
59 | 
60 | # output name: text before the first "bed" of the input name + "values.txt"
61 | out_prefix <- strsplit(bed_path, "bed", fixed = TRUE)[[1]][1]
62 | write.table(out, paste0(out_prefix, "values.txt"),
63 |             sep = "\t", quote = FALSE, row.names = FALSE)


--------------------------------------------------------------------------------
/RNA/rel_expression_plots_ejc.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # args[1] = filename
 3 | # Run on output of BAM_to_EJC.sh
 4 | # Computes normalized 5'P end frequency upstream of exon-exon junction (> 49 nt length, see Lee et al 2020 The Plant Cell)
 5 | 
 6 | options(echo = TRUE)
 7 | library(fields)
 8 | library(tidyverse)
 9 | args <- commandArgs(trailingOnly = TRUE)
10 | print(args)
11 | 
12 | # args[1] = input file, args[2] = numeric factor multiplied onto V4 to obtain rpm
13 | if (length(args) < 2) {
14 |   stop("USAGE: rel_expression_plots_ejc.r <file.bed> <rpm scaling factor>",
15 |        call. = FALSE)
16 | }
17 | bed_path <- args[1]
18 | rpm_scale <- as.numeric(args[2])
19 | if (is.na(rpm_scale)) {
20 |   stop("rpm scaling factor must be numeric, got: ", args[2], call. = FALSE)
21 | }
22 | 
23 | # Read in file
24 | input <- read.delim(bed_path, header = FALSE) %>%
25 |   # Remove reads to plastid and mitochondria
26 |   subset(!(V1 %in% c("ChrM", "ChrC", "Mt", "Pt"))) %>%
27 |   mutate(length = V7 - V6) %>%
28 |   # features at least 50 bp in length
29 |   subset(length > 49) %>%
30 |   # calculate position relative to 3' end of feature
31 |   mutate(pos = ifelse(V10 == "+", V2 - V7, V6 - V3)) %>%
32 |   mutate(rpm = V4 * rpm_scale)
33 | 
34 | # sum all reads in 50 nt window upstream of 3' end (pos -50 to -1), per feature (V8)
35 | exon_3p_sum <- subset(input, pos < 0 & pos > -51) %>%
36 |   group_by(V8) %>%
37 |   summarise(sum_rpm = sum(rpm))
38 | 
39 | # normalise depth per nt by the per-feature window sum and keep features whose
40 | # summed rpm exceeds 1 in absolute value
41 | # NOTE(review): this selection keeps pos 0 while the window sum above excludes
42 | # it - confirm that this asymmetry is intended
43 | exon_3p <- subset(input, pos <= 0 & pos > -51) %>%
44 |   mutate(sum_rpm = exon_3p_sum$sum_rpm[match(V8, exon_3p_sum$V8)]) %>%
45 |   mutate(norm_counts = rpm / sum_rpm) %>%
46 |   subset(abs(sum_rpm) > 1)
47 | 
48 | # Get sum of normalized reads (i.e. normalized occurrence of 5'P ends, Pi) then calculate relative frequency per nt
49 | sum_exon_3p <- group_by(exon_3p, pos) %>%
50 |   # sum of normalized counts (Pi), raw counts, and scaled counts (rpm) per nt
51 |   summarise(
52 |     sum_norm_counts = sum(norm_counts),
53 |     counts_raw = sum(V4),
54 |     counts_rpm = sum(rpm)
55 |   ) %>%
56 |   # sum of all normalized counts
57 |   mutate(total_counts = sum(sum_norm_counts)) %>%
58 |   # freq of normalized counts per position relative to all normalized counts
59 |   mutate(rel_freq = sum_norm_counts / total_counts) %>%
60 |   # take columns for Pi and relative frequency of Pi
61 |   select(pos, sum_norm_counts, rel_freq)
62 | 
63 | # name output file: text before the first literal ".bed" of the input name
64 | # (fixed = TRUE so that "." is not treated as a regex wildcard)
65 | name <- strsplit(bed_path, ".bed", fixed = TRUE)[[1]][1]
66 | 
67 | ## diagnostic plot on single sample
68 | pdf(paste0(name, ".3p.pdf"))
69 | plot(y = sum_exon_3p$rel_freq, x = sum_exon_3p$pos)
70 | dev.off()
71 | 
72 | ## output
73 | write.table(sum_exon_3p, paste0(name, ".ejc.txt"),
74 |             sep = "\t", quote = FALSE, row.names = FALSE)


--------------------------------------------------------------------------------
/RNA/rel_expression_plots_nad.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Run on output of graft_nad_adprc_enrich.sh
 3 | 
 4 | options(echo = TRUE)
 5 | library(fields)
 6 | library(tidyverse)
 7 | args <- commandArgs(trailingOnly = TRUE)
 8 | print(args)
 9 | 
10 | # The input file is the only argument; fail with a usage message when it is missing
11 | if (length(args) < 1) {
12 |   stop("USAGE: rel_expression_plots_nad.r <file.bed>", call. = FALSE)
13 | }
14 | bed_path <- args[1]
15 | 
16 | # Read in file
17 | input <- read.delim(bed_path, header = FALSE) %>%
18 |   # Remove reads to plastid and mitochondria
19 |   subset(!(V1 %in% c("ChrM", "ChrC", "Mt", "Pt"))) %>%
20 |   # calculate position relative to 5' end of feature
21 |   mutate(pos = ifelse(V12 == "+", V2 - V8, V9 - V3)) %>%
22 |   subset(pos <= 50 & pos >= -50) %>%
23 |   # at least 1 RPM in ADPRC sample
24 |   subset(V4 >= 1)
25 | 
26 | # summarise prop NAD+: mean of column V6 at each position
27 | # (presumably V6 is the plus / (plus + minus) share appended upstream - TODO confirm)
28 | df1 <- group_by(input, pos) %>%
29 |   summarise(mean_prop = mean(V6))
30 | 
31 | # name output file: text before the first literal ".bed" of the input name
32 | # (fixed = TRUE so that "." is not treated as a regex wildcard)
33 | name <- strsplit(bed_path, ".bed", fixed = TRUE)[[1]][1]
34 | 
35 | ## diagnostic plot on single sample
36 | pdf(paste0(name, ".pdf"))
37 | plot(y = df1$mean_prop, x = df1$pos)
38 | dev.off()
39 | 
40 | ## output
41 | # NOTE(review): the ".ejc.txt" suffix matches the EJC script; kept unchanged so
42 | # existing consumers still find the file - confirm whether it should differ
43 | write.table(df1, paste0(name, ".ejc.txt"),
44 |             sep = "\t", quote = FALSE, row.names = FALSE)


--------------------------------------------------------------------------------
/RNA/rel_expression_plots_stop.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # args[1] = filename
 3 | # Run on output of BAM_to_STOP.sh
 4 | # computes 5'P end depth relative to first nucleotide of STOP or START codon 
 5 | 
 6 | options(echo = TRUE)
 7 | library(fields)
 8 | library(tidyverse)
 9 | args <- commandArgs(trailingOnly = TRUE)
10 | print(args)
11 | 
12 | # args[1] = input file, args[2] = numeric factor multiplied onto V4 to obtain rpm
13 | if (length(args) < 2) {
14 |   stop("USAGE: rel_expression_plots_stop.r <file.bed> <rpm scaling factor>",
15 |        call. = FALSE)
16 | }
17 | bed_path <- args[1]
18 | rpm_scale <- as.numeric(args[2])
19 | if (is.na(rpm_scale)) {
20 |   stop("rpm scaling factor must be numeric, got: ", args[2], call. = FALSE)
21 | }
22 | 
23 | # Read in file
24 | input <- read.delim(bed_path, header = FALSE) %>%
25 |   # Remove reads to plastid and mitochondria
26 |   subset(!(V1 %in% c("ChrM", "ChrC", "Mt", "Pt"))) %>%
27 |   # calculate position relative to first base of STOP or START codon
28 |   mutate(pos = ifelse(V10 == "+", V2 - V6, V7 - V3)) %>%
29 |   mutate(rpm = V4 * rpm_scale)
30 | 
31 | # sum the rpm of all rows belonging to the same feature (V8)
32 | stop_5p_sum <- group_by(input, V8) %>%
33 |   summarise(sum_rpm = sum(rpm))
34 | 
35 | # normalise depth per nt by the per-feature sum and keep features whose summed
36 | # rpm exceeds 1
37 | stop_5p <- input %>%
38 |   mutate(sum_rpm = stop_5p_sum$sum_rpm[match(V8, stop_5p_sum$V8)]) %>%
39 |   mutate(norm_counts = rpm / sum_rpm) %>%
40 |   subset(sum_rpm > 1)
41 | 
42 | # Get sum of normalized reads (i.e. normalized occurrence of 5'P ends, Pi) then calculate relative frequency per nt
43 | sum_stop_5p <- group_by(stop_5p, pos) %>%
44 |   # sum of normalized counts (Pi), raw counts, and scaled counts (rpm) per nt
45 |   summarise(
46 |     sum_norm_counts = sum(norm_counts),
47 |     counts_raw = sum(V4),
48 |     counts_rpm = sum(rpm)
49 |   ) %>%
50 |   # sum of all normalized counts
51 |   mutate(total_counts = sum(sum_norm_counts)) %>%
52 |   # freq of normalized counts per position relative to all normalized counts
53 |   mutate(rel_freq = sum_norm_counts / total_counts) %>%
54 |   select(pos, sum_norm_counts, rel_freq)
55 | 
56 | # name output file: text before the first literal ".bed" of the input name
57 | # (fixed = TRUE so that "." is not treated as a regex wildcard)
58 | name <- strsplit(bed_path, ".bed", fixed = TRUE)[[1]][1]
59 | 
60 | ## diagnostic plot on single sample
61 | pdf(paste0(name, ".stop.pdf"))
62 | plot(y = sum_stop_5p$rel_freq, x = sum_stop_5p$pos)
63 | dev.off()
64 | 
65 | ## output
66 | write.table(sum_stop_5p, paste0(name, ".stop.txt"),
67 |             sep = "\t", quote = FALSE, row.names = FALSE)


--------------------------------------------------------------------------------
/RNA/smrna_pipe_v1.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eu
 3 | 
 4 | # Read alignment for smRNA-seq libraries with STAR
 5 | # SE only, trim adapters and low quality bases (remove length cutoff for trimming), discard reads longer than 30 bp after trimming. Map with STAR with 0 mismatches and min mapped length 18.
 6 | 
 7 | # Retrieve TAIR10 reference and prepare STAR index
 8 | # wget ftp://ftp.ensemblgenomes.org/pub/release-47/plants/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz
 9 | # samtools faidx Arabidopsis_thaliana.TAIR10.dna.toplevel.fa
10 | # cut -f1,2 Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.fai > Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len
11 | 
12 | # Build STAR genome index
13 | # STAR --runThreadN 4 --runMode genomeGenerate --genomeSAindexNbases 12 --sjdbGTFfile Arabidopsis_thaliana.TAIR10.54.gtf --genomeDir /path/to/GenomeDir/ --genomeFastaFiles Arabidopsis_thaliana.TAIR10.dna.toplevel.fa
14 | 
15 | ### CONDA environment is installed
16 | # conda create --name <name>
17 | # conda install -n <name> -c bioconda fastqc
18 | # conda install -n <name> -c bioconda star
19 | # conda install -n <name> -c bioconda bedtools
20 | # conda install -c bioconda fastx_toolkit
21 | 
22 | # Make a pipeline fail when any stage fails, so errors upstream of `tee` stop the run
23 | set -o pipefail
24 | 
25 | if [ "$#" -lt 3 ]; then
26 | echo "Missing required arguments!"
27 | echo "USAGE: smrna_pipe_v1.sh <fastq R1> </path/to/index> <fileID>"
28 | echo "EXAMPLE: smrna_pipe_v1.sh sample.fastq /home/dganguly/ref_seqs/STAR/TAIR10/GenomeDir sample_rep1"
29 | exit 1
30 | fi
31 | 
32 | #gather input variables
33 | fq=$1
34 | index=$(cd "$2" && pwd)      # path to STAR index, made absolute because the script changes directory
35 | fileID=$3
36 | dow=$(date +"%F-%H-%M-%S")   # %M is the minute; the former %m repeated the month
37 | workdir="${fileID}_srna_${dow}"
38 | log="${fileID}_logs_${dow}.log"   # lives in the work directory; steps in sub-folders use ../$log
39 | 
40 | echo "##################"
41 | echo "Performing single-end alignment with the following parameters:"
42 | echo "Input Files: $fq"
43 | echo "genome index: $index"
44 | echo "Output ID: $fileID"
45 | echo "Time of analysis: $dow"
46 | echo "##################"
47 | 
48 | # make sample work directory; once the input is moved only its file name is still valid
49 | mkdir "$workdir"
50 | mv "$fq" "$workdir"
51 | fq=$(basename "$fq")
52 | cd "$workdir"
53 | 
54 | # gzip if unzipped input file
55 | if [[ $fq != *.gz ]]; then gzip "$fq"; fq="${fq}.gz"; fi
56 | 
57 | # sample stem used for every derived file name: drop .gz, then a .fastq or .fq extension
58 | base=${fq%.gz}; base=${base%.fastq}; base=${base%.fq}
59 | 
60 | # initial fastqc
61 | mkdir 1_fastqc
62 | fastqc -t 8 "$fq" 2>&1 | tee -a "$log"
63 | mv "${base}"_fastqc* 1_fastqc
64 | 
65 | echo "Read trimming... "
66 | # Trim_galore: remove adapters and low quality base-calls in small rna mode, discard reads longer than 30 bp after trimming (--max_length 30), generate fastqc report on trimmed reads.
67 | mkdir 2_read_trimming
68 | cd 2_read_trimming
69 | trim_galore --small_rna --max_length 30 --fastqc --fastqc_args "-t 8" "../$fq" 2>&1 | tee -a "../$log"
70 | 
71 | cd ../
72 | mkdir 0_fastq
73 | mv "$fq" 0_fastq
74 | 
75 | ## prep folder for STAR alignment
76 | mkdir 3_align
77 | mv "2_read_trimming/${base}_trimmed.fq.gz" -t 3_align/
78 | cd 3_align
79 | echo "Beginning alignment ..."
80 | 
81 | ## define input file
82 | input="${base}_trimmed.fq.gz"
83 | 
84 | # STAR alignment: 0 mismatches, min mapped length 18 nt, no more than 4 alignments
85 | STAR --runThreadN 8 --outFilterMismatchNmax 0 --outFilterMatchNmin 18 --outFilterMultimapNmax 4 --genomeDir "$index" --readFilesCommand gunzip -c --readFilesIn "$input" --outFileNamePrefix "${fileID}_" --outSAMtype BAM SortedByCoordinate 2>&1 | tee -a "../$log"
86 | 
87 | echo "cleaning..."
88 | 
89 | outbam="${fileID}_Aligned.sortedByCoord.out.bam"
90 | samtools index "$outbam" 2>&1 | tee -a "../$log"
91 | mv *trimmed.fq.gz ../2_read_trimming/
92 | 
93 | echo "Alignment complete"


--------------------------------------------------------------------------------
/RNA/split_file.R:
--------------------------------------------------------------------------------
 1 | #Given two pairs of lists of samples, split [1] in two files with the samples indicated in [2] and [3]
 2 | #[1] First argument: input file that we want to split
 3 | #[2] Second argument: list of samples of the first condition
 4 | #[3] Third argument: list of samples of the second condition
 5 | #[4] Fourth argument: output file of the first condition
 6 | #[5] Fifth argument: output file of the second condition
 7 | 
 8 | # Parse command line arguments
 9 | print("Parsing samples...")
10 | CHARACTER_command_args <- commandArgs(trailingOnly = TRUE)
11 | if (length(CHARACTER_command_args) < 5) {
12 |   stop("Expected 5 arguments: <input file> <samples 1> <samples 2> ",
13 |        "<output 1> <output 2>", call. = FALSE)
14 | }
15 | input_path <- CHARACTER_command_args[1]
16 | 
17 | #Load the input file
18 | print(paste0("Loading ", input_path, "..."))
19 | input_file <- read.table(input_path, header = TRUE)
20 | 
21 | #Load the comma-separated sample lists of both conditions
22 | first_condition <- unlist(strsplit(CHARACTER_command_args[2], ","))
23 | second_condition <- unlist(strsplit(CHARACTER_command_args[3], ","))
24 | 
25 | #Take the samples of each condition and keep just these columns
26 | first_output <- input_file[first_condition]
27 | second_output <- input_file[second_condition]
28 | 
29 | #Save the output files next to the input file. dirname() returns "." for an
30 | #input without a directory part, so the outputs no longer end up under "/".
31 | out_dir <- dirname(input_path)
32 | 
33 | path1 <- file.path(out_dir, CHARACTER_command_args[4])
34 | write.table(first_output, file = path1, quote = FALSE, sep = "\t")
35 | print(paste0("Saved ", path1))
36 | 
37 | path2 <- file.path(out_dir, CHARACTER_command_args[5])
38 | write.table(second_output, file = path2, quote = FALSE, sep = "\t")
39 | print(paste0("Saved ", path2))


--------------------------------------------------------------------------------
/RNA/stringtie_extract_tpm.r:
--------------------------------------------------------------------------------
 1 | ## extract TPM values for transcripts assembled by stringtie
 2 | ## input is stringtie GTF output file
 3 | options(echo = FALSE)
 4 | library(tidyverse)
 5 | args <- commandArgs(trailingOnly = TRUE)
 6 | 
 7 | # Split every element of `x` at the literal `sep` and return piece number `idx`
 8 | # (NA when that piece does not exist). vapply keeps the result a character
 9 | # vector even when `x` is empty, which sapply does not guarantee.
10 | nth_piece <- function(x, sep, idx) {
11 |   vapply(
12 |     strsplit(x, sep, fixed = TRUE),
13 |     function(pieces) pieces[idx],
14 |     character(1)
15 |   )
16 | }
17 | 
18 | # Each argument is one GTF file; one "<prefix>_stringtie.tpm" table is written per file
19 | for (gtf_path in args) {
20 |   # the first two lines of the file are skipped; only "transcript" rows are kept
21 |   tpm_table <- read.delim(gtf_path, header = FALSE, skip = 2) %>%
22 |     subset(V3 == "transcript") %>%
23 |     # transcript id: second ";"-separated attribute, text after "transcript_id "
24 |     mutate(target_id = nth_piece(as.character(V9), ";", 2)) %>%
25 |     mutate(target_id = nth_piece(target_id, "transcript_id ", 2)) %>%
26 |     # end minus start of the transcript record (columns 5 and 4)
27 |     mutate(length = V5 - V4) %>%
28 |     # TPM: text between "TPM " and the following ";"
29 |     mutate(tpm = nth_piece(as.character(V9), "TPM ", 2)) %>%
30 |     mutate(tpm = as.numeric(nth_piece(tpm, ";", 1))) %>%
31 |     select(target_id, length, tpm)
32 | 
33 |   # output name: text before the first "_stringtie" of the input name
34 |   outfile <- strsplit(gtf_path, "_stringtie", fixed = TRUE)[[1]][1]
35 |   write.table(tpm_table, paste0(outfile, "_stringtie.tpm"),
36 |               sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
37 | }


--------------------------------------------------------------------------------
/RNA/stringtie_pipe_v1.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -eu
  3 | 
  4 | # StringTie2 v1: assemble transcripts using aligned reads (BAM output), merge assemblies, and quantify transcript abundance (-eB) for each sample. 
  5 | # software required: samtools, stringtie, gffcompare.
  6 | 
  7 | ## e.g. generate list of sample names 
  8 | ## dir *bam > files.txt
  9 | 
 10 | if [ "$#" -lt 3 ]; then
 11 | echo "Missing required arguments!"
 12 | echo "USAGE: stringtie_pipe_v1.sh <sample list> <library strandedness> <reference GFF3/GTF>"
 13 | echo "EXAMPLE: stringtie_pipe_v1.sh files.txt <un / fr / rf> Arabidopsis_thaliana.TAIR10.46.gff3"
 14 | exit 1
 15 | fi
 16 | 
 17 | ## gather input variables
 18 | smpls=$(cat "$1")   # BAM names separated by any whitespace; split into words by the loops below
 19 | type=$2             # un / fr / rf
 20 | ref=$3              # reference annotation; an empty string selects the no-reference mode
 21 | 
 22 | ## Map strandedness to the stringtie flag and the banner texts. An unknown value
 23 | ## stops the run here; previously every assembly branch was skipped and the
 24 | ## script only failed later at the merge step.
 25 | case "$type" in
 26 | 	un)
 27 | 		strand_flag=""
 28 | 		msg_noref="Unstranded library with no reference annotation"
 29 | 		msg_ref="Unstranded library with annotation $ref"
 30 | 		;;
 31 | 	fr)
 32 | 		strand_flag="--fr"
 33 | 		msg_noref="Forward stranded without reference annotation"
 34 | 		msg_ref="Forward stranded with reference $ref"
 35 | 		;;
 36 | 	rf)
 37 | 		strand_flag="--rf"
 38 | 		msg_noref="Reverse stranded without reference annotation"
 39 | 		msg_ref="Reverse stranded with reference $ref"
 40 | 		;;
 41 | 	*)
 42 | 		echo "ERROR: library strandedness must be un, fr or rf (got '$type')" >&2
 43 | 		exit 1
 44 | 		;;
 45 | esac
 46 | 
 47 | echo ""
 48 | echo "Organising BAM files"
 49 | echo ""
 50 | 
 51 | ## ensure all BAM files are sorted: names not ending in sorted.bam are sorted into <name>.sorted.bam
 52 | new_smpls=""
 53 | 
 54 | for i in $smpls; do
 55 | 	if [[ $i != *sorted.bam ]]; then
 56 | 		sorted="${i%%.bam}.sorted.bam"
 57 | 		samtools sort -@ 4 "$i" -o "$sorted"
 58 | 		new_smpls="$new_smpls $sorted"
 59 | 	else
 60 | 		new_smpls="$new_smpls $i"
 61 | 	fi
 62 | done
 63 | 
 64 | echo ""
 65 | echo "Transcript assembly per sample"
 66 | echo ""
 67 | 
 68 | ## Use each sample to produce transcript assembly
 69 | 
 70 | echo ""
 71 | if [ -z "$ref" ]; then echo "$msg_noref"; else echo "$msg_ref"; fi
 72 | echo ""
 73 | 
 74 | ## $strand_flag is left unquoted so it disappears for unstranded libraries;
 75 | ## ${ref:+...} adds "-G <ref>" only when a reference was given
 76 | for i in $new_smpls; do
 77 | 	stringtie --conservative -p 4 -g 10 "$i" $strand_flag ${ref:+-G "$ref"} -o "${i%%.sorted.bam}_stringtie.out"
 78 | done
 79 | 
 80 | ## Merge assemblies (the -f / -T filters are applied only when a reference is given)
 81 | echo ""
 82 | echo "Merging assemblies"
 83 | echo ""
 84 | 
 85 | if [ -n "$ref" ]; then
 86 | 	stringtie --merge *_stringtie.out -G "$ref" -f 0.05 -T 1 -o "merged_stringtie_out.gtf"
 87 | else
 88 | 	stringtie --merge *_stringtie.out -o "merged_stringtie_out.gtf"
 89 | fi
 90 | 
 91 | ## clean-up
 92 | rm *_stringtie.out
 93 | 
 94 | ## Determine abundance of assembled transcript in each sample
 95 | 
 96 | echo ""
 97 | echo "Abundance estimation"
 98 | echo ""
 99 | 
100 | for i in $new_smpls; do
101 | 	stringtie "$i" -eB $strand_flag -G merged_stringtie_out.gtf -o "${i%%.sorted.bam}_stringtie_out.gtf"
102 | 	# rename the per-run table before the next sample overwrites it
103 | 	mv t_data.ctab "${i%%.sorted.bam}_tdata.ctab"
104 | done
105 | 
106 | echo "cleaning ..."
107 | 
108 | ## remove only the sorted BAM files created by this script
109 | for i in $smpls; do
110 | 	if [[ $i != *sorted.bam ]]; then
111 | 		rm "${i%%.bam}.sorted.bam"
112 | 	fi
113 | done
114 | 
115 | ## coverage estimates
116 | mkdir abundance_estimates
117 | mv *tdata.ctab -t abundance_estimates/
118 | rm *ctab
119 | 
120 | ## export TPM
121 | 
122 | for i in $new_smpls; do
123 | 	Rscript "$HOME/scripts/RNA/stringtie_extract_tpm.r" "${i%%.sorted.bam}_stringtie_out.gtf"
124 | 	mv *tpm -t abundance_estimates/
125 | done
126 | 
127 | if [ -n "$ref" ]; then
128 | 
129 | echo ""
130 | echo "compare with $ref"
131 | echo ""
132 | 
133 | gffcompare -R -r "$ref" -o strtcmp merged_stringtie_out.gtf
134 | 
135 | mkdir gffcompare_results
136 | mv strtcmp* -t gffcompare_results/
137 | 
138 | fi
139 | 
140 | echo "Complete!"
141 | ##############################


--------------------------------------------------------------------------------
/RNA/stringtie_pipe_v2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eu
 3 | 
 4 | # StringTie2 v2: use stringtie to compute transcript abundance for pre-assembled transcripts and export TPM.	
 5 | ## e.g. generate list of sample names 	
 6 | ## dir *bam > files.txt	
 7 | 
 8 | if [ "$#" -lt 3 ]; then	
 9 | echo "Missing required arguments!"	
10 | echo "USAGE: stringtie_pipe_v2.sh <sample list> <strandedness> <reference GFF3/GTF>"	
11 | echo "EXAMPLE: stringtie_pipe_v2.sh files.txt <un/fr/rf> merged"	
12 | exit 1	
13 | fi	
14 | 
15 | ## gather input variables	
16 | smpls=$(cat $1)	
17 | type=$2	
18 | ref=$3	
19 | dow=$(date +"%F-%H-%m")	
20 | 
21 | echo ""	
22 | echo "Organising BAM files"	
23 | echo ""	
24 | 
25 | ## ensure all BAM files are sorted and indexed	
26 | new_smpls=""	
27 | 
28 | for i in $smpls; 	
29 | 	do	
30 | 	if [[ $i != *sorted.bam ]]; then 	
31 | 		samtools sort -@ 6 $i -o "${i%%.bam}.sorted.bam"	
32 | 		fq="${i%%.bam}.sorted.bam"	
33 | 		new_smpls="$new_smpls $fq";	
34 | 	else new_smpls="$new_smpls $i";	
35 | 	fi;	
36 | done	
37 | 
38 | ## Transcript abundance	
39 | 
40 | echo ""	
41 | echo "Abundance estimation (TPM)"	
42 | echo ""	
43 | 
44 | if [ "$type" == "un" ]; then	
45 | 
46 | for i in $new_smpls; do	
47 | 	echo $i $type
48 | 	stringtie $i -e -G $ref -o "${i%%.sorted.bam}_stringtie.gtf";	
49 | 	done	
50 | fi	
51 | 
52 | if [ "$type" == "fr" ]; then	
53 | 
54 | for i in $i; do
55 | 	echo $new_smpls $type
56 |         stringtie $i -e --fr -G $ref -o "${i%%.sorted.bam}_stringtie.gtf";	
57 | 	done	
58 | fi	
59 | 
60 | if [ "$type" == "rf" ]; then	
61 | 
62 | for i in $i; do
63 | 	echo $new_smpls $type	
64 |         stringtie $i -e --rf -G $ref -o "${i%%.sorted.bam}_stringtie.gtf";	
65 | 	done	
66 | fi	
67 | 
68 | echo "cleaning ..."	
69 | 
70 | for i in $smpls;	
71 |         do	
72 |         if [[ $i != *sorted.bam ]]; then	
73 |                 rm "${i%%.bam}.sorted.bam"	
74 |         fi;	
75 | done	
76 | 
77 | ## export TPM
78 | 
79 | for i in $new_smpls; do
80 |         Rscript $HOME/scripts/RNA/stringtie_extract_tpm.r "${i%%.sorted.bam}_stringtie.gtf"
81 | done
82 | 
83 | mkdir tpm_abundance
84 | mv *_stringtie.tpm tpm_abundance
85 | 
86 | echo "Complete!"	
87 | ##############################
88 | 
89 | 


--------------------------------------------------------------------------------
/RNA/total_expression.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # args[1] = filename
 3 | # Run on output of BAM_to_bedgraph.sh or BAM_to_bedgraph_5p.sh
 4 | # calculate total read depth across features of interest
 5 | 
 6 | options(echo=T)
 7 | library(fields)
 8 | library(tidyverse)
 9 | args=commandArgs(trailingOnly=T)
10 | print(args)
11 | 
12 | # Read in file
13 | input <- read.delim(args[1],head=F)
14 | 
15 | input <- subset(input,input[,ncol(input)] != -1)
16 | 
17 | # calculate length of feature and per million scaling factor (total reads)
18 | input <- subset(input, V1 !='ChrM' & V1!='ChrC' & V1 != 'Mt' & V1 != 'Pt') %>% # Remove plastids and unmatched rows
19 | 	mutate(input, length = V7 - V6)
20 | 	
21 | # Determine total read depth per feature then calculate RPM using per million scaling factor
22 | out <- group_by(input, V8, length) %>%
23 | 	summarise(read_depth = sum(V4))
24 | 
25 | name <- sapply(strsplit(args[1], 'bed'), function(l) l[1])
26 | write.table(out, paste(name,'feature_depth.txt',sep=''), sep='\t', quote=F, row.names=F)
27 | 
28 | 


--------------------------------------------------------------------------------
/RNA/trim_5p_graft_nad.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Filter reads from GRAFT-NAD-seq libraries based on presence of A within a defined distance
 3 | # Trim filtered reads to the first A at the 5' end of reads
 4 | 
 5 | library(tidyverse)
 6 | args=commandArgs(trailingOnly=T)
 7 | d <- as.numeric(paste(args[1]))
 8 | fastq_data <- readLines(args[2])
 9 | 
10 | # Check that the file has a proper number of lines
11 | if (length(fastq_data) %% 4 != 0) {stop("The FASTQ file is not properly formatted.")}
12 | 
13 | # Group lines into a matrix (4 rows per read)
14 | reads_matrix <- matrix(fastq_data, ncol = 4, byrow = TRUE)
15 | 
16 | processed_reads <- apply(reads_matrix, 1, function(l) {
17 | 	head <- l[1]
18 | 	seq <- l[2]
19 | 	opt <- l[3]
20 | 	qual <- l[4]
21 | 
22 | 	# get sequences that contain 'A" within the first #d bases
23 | 	if (grepl("A", substr(seq, 1, d))){
24 | 		
25 | 		# Find the position of the first 'A'		
26 | 		pos <- regexpr("A", seq)[1]
27 | 		
28 | 		# replace original sequence with trimmed sequence
29 | 	        if (pos > 0) {
30 |                 
31 | 		# Trim sequence to position of first 'A'
32 |                 trimmed_sequence <- substr(seq, pos, nchar(seq))
33 |                 trimmed_quality <- substr(qual, pos, nchar(qual))
34 | 		}
35 | 	
36 | 		return(c(head,trimmed_sequence,opt,trimmed_quality))} 
37 | 	else {return(NULL) }
38 | 	}
39 | )
40 | 
41 | processed_reads <- unlist(processed_reads)
42 | 
43 | # Write the filtered reads to a new FASTQ file
44 | name <- sapply(strsplit(args[2], ".fastq"), function(l) l[1])
45 | writeLines(processed_reads, paste0(name,"_processed.fq"))
46 | cat(paste0("Filtered and trimmed reads saved to '", name,"_processed.fq'.\n"))
47 | 
48 | 


--------------------------------------------------------------------------------
/RNA/trim_fastq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | ## hard trim FASTQs sequences e.g. for PARE-seq libraries
 4 | ## https://wikis.utexas.edu/display/CoreNGSTools/Pre-processing+raw+sequences#Preprocessingrawsequences-FASTXToolkit
 5 | 
 6 | ### CONDA environment is installed
 7 | # conda create --name <name>
 8 | # conda install -c bioconda fastx_toolkit
 9 | 
10 | 
11 | if [ "$#" -lt 2 ]; then
12 | echo "Missing required arguments!"
13 | echo "USAGE: trim_fastq.sh <fastq> <length>"
14 | echo "EXAMPLE: trim_fastq.sh sample.fastq.gz 15"
15 | exit 1
16 | fi
17 | 
18 | fq=$1
19 | NSEQS=$2
20 | 
21 | if [[ $fq != *.gz ]];then fastx_trimmer -z -l $NSEQS -i $fq -o ${fq%%.fastq}.${NSEQS}bp.fastq.gz; fi
22 | 
23 | if [[ $fq == *.gz ]];then zcat $fq | fastx_trimmer -z -l $NSEQS -o ${fq%%.fastq.gz}.${NSEQS}bp.fastq.gz; fi
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/TruSeq-adapters.fa:
--------------------------------------------------------------------------------
  1 | >D701
  2 | ATTACTCG
  3 | >D702
  4 | TCCGGAGA
  5 | >D703
  6 | CGCTCATT
  7 | >D704
  8 | GAGATTCC
  9 | >D705
 10 | ATTCAGAA
 11 | >D706
 12 | GAATTCGT
 13 | >D707
 14 | CTGAAGCT
 15 | >D708
 16 | TAATGCGC
 17 | >D709
 18 | CGGCTATG
 19 | >D710
 20 | TCCGCGAA
 21 | >D711
 22 | TCTCGCGC
 23 | >D712
 24 | AGCGATAG
 25 | >D501
 26 | TATAGCCT
 27 | >D502
 28 | ATAGAGGC
 29 | >D503
 30 | CCTATCCT
 31 | >D504
 32 | GGCTCTGA
 33 | >D505
 34 | AGGCGAAG
 35 | >D506
 36 | TAATCTTA
 37 | >D507
 38 | CAGGACGT
 39 | >D508
  40 | GTACTGAC
 41 | >multiplexing-forward
 42 | GATCGGAAGAGCACACGTCT
 43 | >solexa-forward
 44 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
 45 | >truseq-forward-contam
 46 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC
 47 | >truseq-reverse-contam
 48 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA
 49 | >nextera-forward-read-contam
 50 | CTGTCTCTTATACACATCTCCGAGCCCACGAGAC
 51 | >nextera-reverse-read-contam
 52 | CTGTCTCTTATACACATCTGACGCTGCCGACGA
 53 | >solexa-reverse
 54 | AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
 55 | >truseq-sRNA
 56 | TGGAATTCTCC
 57 | >truseq-universal
 58 | AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
 59 | >truseq-index1
 60 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG
 61 | >truseq-index2
 62 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG
 63 | >truseq-index3
  64 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG
 65 | >truseq-index4
 66 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG
 67 | >truseq-index5
 68 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG
 69 | >truseq-index6
 70 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG
 71 | >truseq-index7
 72 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG
 73 | >truseq-index8
 74 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG
 75 | >truseq-index9
 76 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG
 77 | >truseq-index10
 78 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG
 79 | >truseq-index11
 80 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG
 81 | >truseq-index12
 82 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG
 83 | >truseq-index13
 84 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACAATCTCGTATGCCGTCTTCTGCTTG
 85 | >truseq-index14
 86 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTATCTCGTATGCCGTCTTCTGCTTG
 87 | >truseq-index15
 88 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGAATCTCGTATGCCGTCTTCTGCTTG
 89 | >truseq-index16
 90 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCGATCTCGTATGCCGTCTTCTGCTTG
 91 | >truseq-index18
 92 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCACATCTCGTATGCCGTCTTCTGCTTG
 93 | >truseq-index19
 94 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACGATCTCGTATGCCGTCTTCTGCTTG
 95 | >truseq-index20
 96 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTATCTCGTATGCCGTCTTCTGCTTG
 97 | >truseq-index21
 98 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGAATCTCGTATGCCGTCTTCTGCTTG
 99 | >truseq-index22
100 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTAATCTCGTATGCCGTCTTCTGCTTG
101 | >truseq-index23
102 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTGGATATCTCGTATGCCGTCTTCTGCTTG
103 | >truseq-index25
104 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATATCTCGTATGCCGTCTTCTGCTTG
105 | >truseq-index27
106 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTATCTCGTATGCCGTCTTCTGCTTG
107 | 


--------------------------------------------------------------------------------
/VennPieces.R:
--------------------------------------------------------------------------------
 1 | VennPieces <- function(test.results, comparison1, comparison2, comparison3, out.dir){
 2 |   #create a dataframe called results which has 1,0 or -1 for differentialy
 3 |   #expressed or not for each gene test.results is a dataframe with the actual
 4 |   #data you want to output, such as the df with the fold changes and gene names
 5 |   #in it comparsion1-3 are the file names for each part of the venn out.dir is
 6 |   #the directory to print the results to
 7 |   
 8 |   #prints the venn diagram to file too
 9 |   pdf(paste0(out.dir,"/Venn.pdf"))
10 |   vennDiagram(results, 
11 |               include=c("up","down"), 
12 |               counts.col=c('red', 'blue'), 
13 |               show.include=T, cex = c(0.75, 0.75, 0.75))
14 |   dev.off()
15 |   #up
16 |   c1 <- test.results[results[,1]==1 & results[,2]!=1 & results[,3]!=1,]
17 |   write.csv(c1, paste0(out.dir,"/",comparison1,"_up.csv"))
18 |   c1.c2 <- test.results[results[,1]==1 & results[,2]==1 & results[,3]!=1,]
19 |   write.csv(c1.c2, paste0(out.dir,"/",comparison1,".",comparison2,"_up.csv"))
20 |   c2 <- test.results[results[,1]!=1 & results[,2]==1 & results[,3]!=1,]
21 |   write.csv(c2, paste0(out.dir,"/",comparison2,"_up.csv"))
22 |   c2.c3 <- test.results[results[,1]!=1 & results[,2]==1 & results[,3]==1,]
23 |   write.csv(c2.c3, paste0(out.dir,"/",comparison2,".",comparison3,"_up.csv"))
24 |   c3 <- test.results[results[,1]!=1 & results[,2]!=1 & results[,3]==1,]
25 |   write.csv(c3, paste0(out.dir,"/",comparison3,"_up.csv"))
26 |   c1.c3 <- test.results[results[,1]==1 & results[,2]!=1 & results[,3]==1,]
27 |   write.csv(c1.c3, paste0(out.dir,"/",comparison1,".",comparison3,"_up.csv"))
28 |   c1.c2.c3 <- test.results[results[,1]==1 & results[,2]==1 & results[,3]==1,]
29 |   write.csv(c1.c2.c3, paste0(out.dir,"/",comparison1,".",comparison2,".","comparison3","_up.csv"))
30 |   
31 |   #down
32 |   c1 <- test.results[results[,1]==-1 & results[,2]!=-1 & results[,3]!=-1,]
33 |   write.csv(c1, paste0(out.dir,"/",comparison1,"_down.csv"))
34 |   c1.c2 <- test.results[results[,1]==-1 & results[,2]==-1 & results[,3]!=-1,]
35 |   write.csv(c1.c2, paste0(out.dir,"/",comparison1,".",comparison2,"_down.csv"))
36 |   c2 <- test.results[results[,1]!=-1 & results[,2]==-1 & results[,3]!=-1,]
37 |   write.csv(c2, paste0(out.dir,"/",comparison2,"_down.csv"))
38 |   c2.c3 <- test.results[results[,1]!=-1 & results[,2]==-1 & results[,3]==-1,]
39 |   write.csv(c2.c3, paste0(out.dir,"/",comparison2,".",comparison3,"_down.csv"))
40 |   c3 <- test.results[results[,1]!=-1 & results[,2]!=-1 & results[,3]==-1,]
41 |   write.csv(c3, paste0(out.dir,"/",comparison3,"_down.csv"))
42 |   c1.c3 <- test.results[results[,1]==-1 & results[,2]!=-1 & results[,3]==-1,]
43 |   write.csv(c1.c3, paste0(out.dir,"/",comparison1,".",comparison3,"_down.csv"))
44 |   c1.c2.c3 <- test.results[results[,1]==-1 & results[,2]==-1 & results[,3]==-1,]
45 |   write.csv(c1.c2.c3, paste0(out.dir,"/",comparison1,".",comparison2,".","comparison3","_down.csv"))
46 | }
47 | 
# Example run: split the sco3 / phyb / sco3phyb comparisons into Venn pieces
VennPieces(
  test.results = unfilteredgenelist,
  comparison1 = "sco3",
  comparison2 = "phyb",
  comparison3 = "sco3phyb",
  out.dir = "venns"
)


--------------------------------------------------------------------------------
/araport11_assemble.sh:
--------------------------------------------------------------------------------
 1 | # build annotations based off gff3 files from araport11 release
 2 | # https://www.araport.org/data/araport11
 3 | # derived from SRE gene_to_gene.sh
 4 | # Run lines manually in annotation directory
 5 | 
 6 | # Readme file
 7 | wget https://www.araport.org/download_file/Araport11_Release_201606/annotation/README.201606.md
 8 | 
 9 | # Araport11 annotation in GFF3
10 | 
11 | # curl -sO -H 'Authorization: Bearer 745dd29759980b058db8fb9efc7af5' https://api.araport.org/files/v2/media/system/araport-public-files//Araport11_Release_201606/annotation/Araport11_GFF3_genes_transposons.201606.gff.gz
12 | 
13 | wget http://www.arabidopsis.org/download_files/Genes/Araport11_genome_release/Araport11_GFF3_genes_transposons.201606.gff.gz
14 | 
15 | gzip -d *gff.gz
16 | 
17 | # Make bed files
18 | 
19 | R
20 | 
21 | library(tidyverse)
22 | 
# Extract one field from GFF3-style attribute strings ("key=value;key=value").
#   x       : character vector of attribute strings (e.g. column 9 of a GFF)
#   field   : name of the attribute to extract (e.g. "ID", "Name", "Parent")
#   attrsep : separator between attributes
# Returns a character vector the same length as x; NA where the field is
# absent or has no value. The whole text after the first "=" is returned, so
# values that themselves contain "=" are not truncated.
getAttributeField <- function (x, field, attrsep = ";") {
     s <- strsplit(x, split = attrsep, fixed = TRUE)
     # vapply keeps the result a character vector even for empty input
     vapply(s, function(atts) {
         eq <- as.integer(regexpr("=", atts, fixed = TRUE))
         # key = text before the first "=" (or the whole entry if there is none)
         key <- ifelse(eq > 0, substr(atts, 1, eq - 1), atts)
         m <- match(field, key)
         if (is.na(m) || is.na(eq[m]) || eq[m] < 1) {
             return(NA_character_)
         }
         value <- substr(atts[m], eq[m] + 1, nchar(atts[m]))
         # an empty value ("key=") is reported as missing
         if (nzchar(value)) value else NA_character_
     }, character(1))
}
37 | 
# Read a 9-column, tab-separated GFF file into a data frame.
#   gffFile : path of the GFF file
#   nrows   : maximum number of rows to read (-1 = all)
# Lines are cut at "#", so "##" header lines are skipped. Prints a short
# summary and stops if any start/end coordinate is NA.
gffRead <- function(gffFile, nrows = -1) {
  gff_columns <- c("seqname", "source", "feature", "start", "end",
                   "score", "strand", "frame", "attributes")
  # start and end are integers, every other column is kept as text
  gff_classes <- ifelse(gff_columns %in% c("start", "end"), "integer", "character")

  cat("Reading ", gffFile, ": ", sep = "")
  gff <- read.table(
    gffFile,
    sep = "\t",
    header = FALSE,
    quote = "",
    comment.char = "#",
    as.is = TRUE,
    nrows = nrows,
    colClasses = gff_classes
  )
  colnames(gff) <- gff_columns

  class_summary <- paste(sapply(gff, class), collapse = ", ")
  cat("found", nrow(gff), "rows with classes:", class_summary, "\n")

  stopifnot(!any(is.na(gff$start)), !any(is.na(gff$end)))
  gff
}
52 | 
# Locate the decompressed annotation by pattern so that both the dated release
# downloaded above (…201606.gff) and a "current" release are picked up.
gff_file <- list.files(pattern = "^Araport11_GFF3_genes_transposons\\..*\\.gff$")
if (length(gff_file) != 1) {
  stop("Expected exactly one Araport11_GFF3_genes_transposons.*.gff file, found ",
       length(gff_file), call. = FALSE)
}
ara <- gffRead(gff_file)

### Gene annotation
gene <- subset(ara, feature == "gene")
gene$Name <- getAttributeField(gene$attributes, "Name")
gene$ID <- getAttributeField(gene$attributes, "ID")
gene.out <- gene[, c("seqname", "start", "end", "Name", "score", "strand")]

write.table(gene.out, "Araport11_genes.bed", sep = "\t",
            row.names = FALSE, col.names = FALSE, quote = FALSE)

### TE annotation
te <- subset(ara, feature == "transposable_element")
te$Name <- getAttributeField(te$attributes, "Name")
te$ID <- getAttributeField(te$attributes, "ID")
te$alias <- getAttributeField(te$attributes, "Alias")
# NOTE(review): the "Chr" prefix is stripped for TEs only; genes and mRNAs keep it — confirm this is intended
te$seqname <- gsub(pattern = "Chr", replacement = "", x = te$seqname)
te.out <- te[, c("seqname", "start", "end", "Name", "feature", "strand")]

write.table(te.out, "Araport11_TE.bed", sep = "\t",
            row.names = FALSE, col.names = FALSE, quote = FALSE)

### Transcript annotation
mRNA <- subset(ara, feature == "mRNA")
mRNA$name <- getAttributeField(mRNA$attributes, "Name")
mRNA$parent <- getAttributeField(mRNA$attributes, "Parent")
mRNA.out <- mRNA[, c("seqname", "start", "end", "name", "score", "strand")]

write.table(mRNA.out, "Araport11_mRNA.bed", sep = "\t",
            row.names = FALSE, col.names = FALSE, quote = FALSE)

### 5' and 3' UTR annotation
# Runs inside the same R session as the sections above (it needs `ara`), so it
# has to come before quit(). The id is the Parent attribute up to the first ".".
utr <- subset(ara, feature == "five_prime_UTR" | feature == "three_prime_UTR") %>%
  mutate(id = getAttributeField(attributes, "Parent")) %>%
  select(seqname, start, end, strand, id, feature) %>%
  mutate(id = sapply(strsplit(id, "\\."), function(l) l[1]))

write.table(utr, "Araport11_UTR.bed", sep = "\t",
            row.names = FALSE, col.names = FALSE, quote = FALSE)

# leave R without the interactive "save workspace" prompt
quit(save = "no")
91 | 
92 | ## use bedtools getfasta to obtain sequences in utr intervals 
93 | ## bedtools getfasta -bedOut -s -fi TAIR10_Chr.all.fasta -bed Araport11_UTR.sorted.bed > Araport11_UTR_seq.bed
94 | 
95 | ##########
96 | 
97 | rm *gff
98 | 
99 | 


--------------------------------------------------------------------------------
/average_cov.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# abort on errors/unset variables and surface failures from any stage of the pipe
set -euo pipefail

## Print the mean of the depth column over all positions reported by `samtools depth`
if [ "$#" -lt 1 ]; then
echo "Missing required arguments!"
echo "USAGE: average_cov.sh <bam>"
exit 1
fi

bam=$1

## NR guard: avoid an awk division-by-zero when no positions are reported
samtools sort -@ 4 "$bam" | samtools depth - | awk '{sum+=$3} END { if (NR > 0) print "Mean depth = ",sum/NR; else print "Mean depth = ",0 }'
5 | 


--------------------------------------------------------------------------------
/bashrc:
--------------------------------------------------------------------------------
  1 | # ~/.bashrc: executed by bash(1) for non-login shells.
  2 | # see /usr/share/doc/bash/examples/startup-files (in the package bash-doc)
  3 | # for examples
  4 | 
  5 | # If not running interactively, don't do anything
  6 | case $- in
  7 |     *i*) ;;
  8 |       *) return;;
  9 | esac
 10 | 
 11 | # don't put duplicate lines or lines starting with space in the history.
 12 | # See bash(1) for more options
 13 | HISTCONTROL=ignoreboth
 14 | 
 15 | # append to the history file, don't overwrite it
 16 | shopt -s histappend
 17 | 
 18 | # for setting history length see HISTSIZE and HISTFILESIZE in bash(1)
 19 | HISTSIZE=1000
 20 | HISTFILESIZE=2000
 21 | 
 22 | # check the window size after each command and, if necessary,
 23 | # update the values of LINES and COLUMNS.
 24 | shopt -s checkwinsize
 25 | 
 26 | # If set, the pattern "**" used in a pathname expansion context will
 27 | # match all files and zero or more directories and subdirectories.
 28 | #shopt -s globstar
 29 | 
 30 | # make less more friendly for non-text input files, see lesspipe(1)
 31 | #[ -x /usr/bin/lesspipe ] && eval "$(SHELL=/bin/sh lesspipe)"
 32 | 
 33 | # set variable identifying the chroot you work in (used in the prompt below)
 34 | if [ -z "${debian_chroot:-}" ] && [ -r /etc/debian_chroot ]; then
 35 |     debian_chroot=$(cat /etc/debian_chroot)
 36 | fi
 37 | 
 38 | # set a fancy prompt (non-color, unless we know we "want" color)
 39 | case "$TERM" in
 40 |     xterm-color|*-256color) color_prompt=yes;;
 41 | esac
 42 | 
 43 | # uncomment for a colored prompt, if the terminal has the capability; turned
 44 | # off by default to not distract the user: the focus in a terminal window
 45 | # should be on the output of commands, not on the prompt
 46 | #force_color_prompt=yes
 47 | 
 48 | if [ -n "$force_color_prompt" ]; then
 49 |     if [ -x /usr/bin/tput ] && tput setaf 1 >&/dev/null; then
 50 | 	# We have color support; assume it's compliant with Ecma-48
 51 | 	# (ISO/IEC-6429). (Lack of such support is extremely rare, and such
 52 | 	# a case would tend to support setf rather than setaf.)
 53 | 	color_prompt=yes
 54 |     else
 55 | 	color_prompt=
 56 |     fi
 57 | fi
 58 | 
 59 | if [ "$color_prompt" = yes ]; then
 60 |     PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '
 61 | else
 62 |     PS1='${debian_chroot:+($debian_chroot)}\u@\h:\w\$ '
 63 | fi
 64 | unset color_prompt force_color_prompt
 65 | 
 66 | # If this is an xterm set the title to user@host:dir
 67 | case "$TERM" in
 68 | xterm*|rxvt*)
 69 |     PS1="\[\e]0;${debian_chroot:+($debian_chroot)}\u@\h: \w\a\]$PS1"
 70 |     ;;
 71 | *)
 72 |     ;;
 73 | esac
 74 | 
 75 | # enable color support of ls and also add handy aliases
 76 | if [ -x /usr/bin/dircolors ]; then
 77 |     test -r ~/.dircolors && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)"
 78 |     alias ls='ls --color=auto'
 79 |     #alias dir='dir --color=auto'
 80 |     #alias vdir='vdir --color=auto'
 81 | 
 82 |     #alias grep='grep --color=auto'
 83 |     #alias fgrep='fgrep --color=auto'
 84 |     #alias egrep='egrep --color=auto'
 85 | fi
 86 | 
 87 | # colored GCC warnings and errors
 88 | #export GCC_COLORS='error=01;31:warning=01;35:note=01;36:caret=01;32:locus=01:quote=01'
 89 | 
 90 | # some more ls aliases
 91 | alias ll='ls -l'
 92 | alias la='ls -A'
 93 | alias l='ls -lh'
 94 | 
 95 | # Alias definitions.
 96 | # You may want to put all your additions into a separate file like
 97 | # ~/.bash_aliases, instead of adding them here directly.
 98 | # See /usr/share/doc/bash-doc/examples in the bash-doc package.
 99 | 
100 | if [ -f ~/.bash_aliases ]; then
101 |     . ~/.bash_aliases
102 | fi
103 | 
104 | # enable programmable completion features (you don't need to enable
105 | # this, if it's already enabled in /etc/bash.bashrc and /etc/profile
106 | # sources /etc/bash.bashrc).
107 | if ! shopt -oq posix; then
108 |   if [ -f /usr/share/bash-completion/bash_completion ]; then
109 |     . /usr/share/bash-completion/bash_completion
110 |   elif [ -f /etc/bash_completion ]; then
111 |     . /etc/bash_completion
112 |   fi
113 | fi
114 | 
115 | export PATH=/home/diepg/bin/gffcompare-0.11.6.Linux_x86_64:/home/diepg/bin/subread-2.0.0-Linux-x86_64/bin:/home/diepg/bin/stringtie:/home/diepg/bin/kallisto:/home/diepg/bin/TrimGalore-0.6.5:/home/diepg/bin/kentUtils/bin/linux.x86_64:$PATH
116 | 
117 | 
118 | 


--------------------------------------------------------------------------------
/gene_to_gene_anno.sh:
--------------------------------------------------------------------------------
  1 | #Using TAIR GFF file to make annotation files
  2 | #Derived from SRE gene_to_gene.sh
  3 | 
  4 | mkdir $(date +"%Y-%m-%d")_TAIR10_gene_to_gene_annotation
  5 | cd *_TAIR10_gene_to_gene_annotation
  6 | 
  7 | #get TAIR10 gene annotations
  8 | 
  9 | ###GFF file containing features (mRNA, exon, CDS,...) of ALL TAIR10 genes ,including non-protein coding genes(pseudogenes, RNA genes, transposable element genes)
 10 | wget https://www.arabidopsis.org/download_files/Genes/TAIR10_genome_release/TAIR10_gff3/TAIR10_GFF3_genes.gff
 11 | 
 12 | ### GFF file containing features (mRNA, exon, CDS,...) of ALL TAIR10 genes ,including TRANSPOSABLE ELEMENTS and all non-protein coding genes (pseudogenes, RNA genes, transposable element genes)
 13 | wget https://www.arabidopsis.org/download_files/Genes/TAIR10_genome_release/TAIR10_gff3/TAIR10_GFF3_genes_transposons.gff
 14 | 
 15 | #make bed file of all TAIR10 genes with proper strand information
 16 | 
 17 | ###########################
 18 | R
 19 | 
 20 | getAttributeField <- function (x, field, attrsep = ";") {
 21 |      s = strsplit(x, split = attrsep, fixed = TRUE)
 22 |      sapply(s, function(atts) {
 23 |          a = strsplit(atts, split = "=", fixed = TRUE)
 24 |          m = match(field, sapply(a, "[", 1))
 25 |          if (!is.na(m)) {
 26 |              rv = a[[m]][2]
 27 |          }
 28 |          else {
 29 |              rv = as.character(NA)
 30 |          }
 31 |          return(rv)
 32 |      })
 33 | }
 34 | 
 35 | gffRead <- function(gffFile, nrows = -1) {
 36 |      cat("Reading ", gffFile, ": ", sep="")
 37 |      gff = read.table(gffFile, sep="\t", as.is=TRUE, quote="",
 38 |      header=FALSE, comment.char="#", nrows = nrows,
 39 |      colClasses=c("character", "character", "character", "integer",  
 40 | "integer",
 41 |      "character", "character", "character", "character"))
 42 |      colnames(gff) = c("seqname", "source", "feature", "start", "end",
 43 |              "score", "strand", "frame", "attributes")
 44 | 	cat("found", nrow(gff), "rows with classes:",
 45 |         paste(sapply(gff, class), collapse=", "), "\n")
 46 |      stopifnot(!any(is.na(gff$start)), !any(is.na(gff$end)))
 47 |      return(gff)
 48 | }
 49 | 
 50 | gene=gffRead('TAIR10_GFF3_genes.gff')
 51 | #gene_te=gffRead('TAIR10_GFF3_genes_transposons.gff')
 52 | 
 53 | #I am subsetting to annotated 'gene's which there are 28,775 in total for TAIR10. This may be modified if we are looking for other things.
 54 | gene=subset(gene,gene$feature=='gene')
 55 | gene$Name=getAttributeField(gene$attributes, 'Name')
 56 | gene$ID=getAttributeField(gene$attributes, 'ID')
 57 | 
 58 | gene.out=gene[,c('seqname','start','end','Name','score','strand')]
 59 | write.table(gene.out,'TAIR10_genes.bed',sep='\t',row.names=F,col.names=F,quote=F)
 60 | quit()
 61 | n
 62 | 
 63 | ########################
 64 | 
 65 | 
 66 | #ID closest gene to each gene
 67 | sort -k1,1 -k2,2n TAIR10_genes.bed > TAIR10_genes.sorted.bed
 68 | 
 69 | #flags for bedtools (v2.25.0)
 70 | # -N cannot match same name of gene (i.e. you don't match yourself)
 71 | # -iu ignore upstream features
 72 | # -s require feature on same strand
 73 | # -D a report distance in relationship to the orientation of file 'a'
 74 | 
 75 | bedtools closest -N -iu -s -D a -a TAIR10_genes.sorted.bed -b TAIR10_genes.sorted.bed > TAIR10_gene_to_gene.samestrand.bed
 76 | 
 77 | 
 78 | #parse output and create final annotation file
 79 | 
 80 | ########################
 81 | 
 82 | R
 83 | 
 84 | input=read.delim('TAIR10_gene_to_gene.samestrand.bed',head=F)
 85 | colnames(input)=c('chr1','start1','stop1','gene1','score1','strand1','chr2','start2','stop2','gene2','score2','strand2','distance')
 86 | 
 87 | input.flt <- input[,c('chr1','start1','stop1','gene1','strand1','chr2','start2','stop2','gene2','strand2','distance')]
 88 | input.flt$chr2 <- ifelse(input.flt$chr2=='.',NA,as.character(input.flt$chr2))
 89 | input.flt$chr2 <- as.factor(input.flt$chr2)
 90 | input.flt$start2 <- ifelse(input.flt$start2=='-1',NA,input.flt$start2)
 91 | input.flt$stop2 <- ifelse(input.flt$stop2=='-1',NA,input.flt$stop2)
 92 | input.flt$gene2 <- ifelse(input.flt$gene2=='.',NA,as.character(input.flt$gene2))
 93 | input.flt$gene2 <- as.factor(input.flt$gene2)
 94 | input.flt$strand2 <- ifelse(input.flt$strand2=='.',NA,as.character(input.flt$strand2))
 95 | input.flt$strand2 <- as.factor(input.flt$strand2)
 96 | input.flt$distance <- ifelse(is.na(input.flt$chr2)==T,NA,input.flt$distance)
 97 | input.flt <- na.omit(input.flt)
 98 | input.flt <- subset(input.flt, input.flt$chr1 != 'M')
 99 | input.flt <- subset(input.flt, input.flt$chr1 != 'C')
100 | 
101 | # there are still overlapping records
102 | # These appear to be either microRNAs or closely adjacent genes with one having a long secondary transcript
103 | # just omit these; can browse too
104 | test <- subset(input.flt, input.flt$distance == 0)
105 | output <- subset(input.flt, input.flt$distance != 0)
106 | 
107 | # write.table(input.flt,'TAIR10_gene_to_gene.samestrand.anno',sep='\t',row.names=F,quote=F)
108 | write.table(output,'Araport11_gene_to_gene.samestrand.bed',sep='\t',col.names=F,row.names=F,quote=F)
109 | 
110 | quit()
111 | n
112 | 
113 | #
114 | 
115 | 
116 | 


--------------------------------------------------------------------------------
/pe_insert_size.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eu
 3 | bam=$1
 4 | 
 5 | java -jar ~/bin/picard.jar CleanSam I=$bam O=test.bam
 6 | java -jar ~/bin/picard.jar ValidateSamFile I=test.bam IGNORE_WARNINGS=true MODE=VERBOSE
 7 | 
 8 | # AddOrReplaceReadGroups
 9 | # FixMateInformation
10 | ## Manually remove reads with errors
11 | # samtools view -h test.bam | grep -v 'D00775:83:CC65GANXX:4:2304:19617:7489' | samtools view -b > test2.bam
12 | 
13 | samtools sort -@ 6 test.bam -o test.sorted.bam
14 | 
15 | java -jar ~/bin/picard.jar CollectInsertSizeMetrics I=test.sorted.bam O=insert_size_metrics.txt H=insert_size_histogram.pdf
16 | 
17 | rm test.bam
18 | rm test.sorted.bam
19 | 
20 | echo "DONE"
21 | 


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v1/diffsegR_WT-NvsC_gmuct_3p_v1.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## Installation
  4 | #### Conda
  5 | #conda create --name diffseg_v1
  6 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse
  7 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
  8 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
  9 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
 10 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
 11 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
 12 | #conda install -n diffseg_v1 -c conda-forge r-remotes
 13 | #conda activate diffseg_v1
 14 | 
 15 | #### R
 16 | #remotes::install_github("sanssouci-org/sanssouci")
 17 | #remotes::install_github("aLiehrmann/DiffSegR@f657435")
 18 | 
 19 | library(tidyverse)
 20 | library(DiffSegR)
 21 | 
 22 | # Worker threads handed to loadData() and segmentation().
 23 | nb_threads <- 10
 24 | 
 25 | # BAMs are read from, and all outputs written to, the launch directory.
 26 | working_directory <- getwd()
 27 | 
 28 | #- create sample information table --------------------------------------------#
 29 | bam_files <- c(
 30 |   "S5-3N_Aligned.sortedByCoord.out.bam",
 31 |   "S7-4N_Aligned.sortedByCoord.out.bam",
 32 |   "S11-10N_Aligned.sortedByCoord.out.bam",
 33 |   "S6-3C_Aligned.sortedByCoord.out.bam",
 34 |   "S8-4C_Aligned.sortedByCoord.out.bam",
 35 |   "S12-10C_Aligned.sortedByCoord.out.bam"
 36 | )
 37 | # NOTE(review): these use "-" where the sample names use "."; the 5p sibling
 38 | # script uses the dotted names for its .rds files. Confirm which is intended.
 39 | coverage_ids <- c("WT-N_1", "WT-N_2", "WT-N_3", "WT-C_1", "WT-C_2", "WT-C_3")
 40 | 
 41 | sample_info <- data.frame(
 42 |   sample    = c("WT.N_1", "WT.N_2", "WT.N_3", "WT.C_1", "WT.C_2", "WT.C_3"),
 43 |   condition = rep(c("WT.N", "WT.C"), each = 3),
 44 |   replicate = rep(1:3, 2),
 45 |   # file.path() is vectorised; the BAM basenames are kept as element names,
 46 |   # as the former sapply() call produced, so any row names data.frame()
 47 |   # derives from them are unchanged.
 48 |   bam       = setNames(file.path(working_directory, bam_files), bam_files),
 49 |   coverage  = file.path(working_directory, paste0(coverage_ids, ".rds"))
 50 | )
 51 | 
 52 | #- save sample information table ----------------------------------------------#
 53 | sample_info_path <- file.path(working_directory, "sample_info.txt")
 54 | write.table(sample_info, sample_info_path)
 55 | 
 56 | #- display sample information table -------------------------------------------#
 57 | knitr::kable(sample_info, row.names = FALSE)
 58 | 
 59 | #- chromosome table: X1 holds the sequence name, X2 is used as the locus end ---#
 60 | genome <- read_tsv(
 61 |   "~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len",
 62 |   col_names = FALSE
 63 | )
 64 | # Leave out the "Mt" and "Pt" entries.
 65 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt")
 66 | 
 67 | ### run the comparison once per chromosome and collect the DERs
 68 | chromosomes <- unique(genome$X1)
 69 | if (length(chromosomes) == 0) {
 70 |   stop("No chromosomes left after filtering the genome table.", call. = FALSE)
 71 | }
 72 | ders_by_chrom <- vector("list", length(chromosomes))
 73 | 
 74 | for (k in seq_along(chromosomes)) {
 75 |   seqid <- chromosomes[[k]]
 76 |   # Not called `stop`, so base::stop() is never masked.
 77 |   chrom_end <- genome$X2[genome$X1 == seqid]
 78 | 
 79 |   ## load data
 80 |   locus_data <- loadData(
 81 |     sampleInfo         = sample_info_path,
 82 |     locus              = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
 83 |     referenceCondition = "WT.C",
 84 |     isPairedEnd        = TRUE,
 85 |     readLength         = 150,
 86 |     coverageType       = "threePrime",
 87 |     stranded           = FALSE,
 88 |     strandSpecific     = 0,
 89 |     fromBam            = TRUE,
 90 |     nbThreads          = nb_threads,
 91 |     verbose            = TRUE
 92 |   )
 93 | 
 94 |   ## Changepoint detection to define segments
 95 |   segments <- segmentation(
 96 |     data                   = locus_data,
 97 |     weightType             = "unweighted", # zeroInflated : low counts have less weight
 98 |     modelSelectionType     = "yao",
 99 |     featureCountsType      = "fromBam",
100 |     compressed             = TRUE,
101 |     alpha                  = 2,
102 |     segmentNeighborhood    = FALSE,
103 |     Kmax                   = NULL,
104 |     verbose                = FALSE,
105 |     nbThreadsGridSearch    = 1,
106 |     alphas                 = NULL,
107 |     gridSearch             = FALSE,
108 |     outputDirectory        = working_directory,
109 |     nbThreadsFeatureCounts = nb_threads,
110 |     strandSpecific         = 0,
111 |     read2pos               = 3,
112 |     isPairedEnd            = TRUE
113 |   )
114 | 
115 |   # Only segments with width < 11 go into the differential analysis.
116 |   segment_widths <- as.data.frame(SummarizedExperiment::rowRanges(segments))$width
117 |   narrow_segments <- segments[segment_widths < 11, ]
118 | 
119 |   dds <- dea(
120 |     data              = locus_data,
121 |     SExp              = narrow_segments,
122 |     design            = ~condition,
123 |     predicate         = NULL,
124 |     significanceLevel = 0.01,
125 |     verbose           = TRUE
126 |   )
127 | 
128 |   #- extract DERs based on significance -------------------------------------#
129 |   significant <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
130 |   ders_by_chrom[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(significant))
131 | }
132 | 
133 | # One rbind after the loop instead of regrowing the data frame each iteration.
134 | out_DERs <- do.call(rbind, ders_by_chrom)
135 | 
136 | #clear memory cache
137 | gc()
138 | 
139 | # derId reformats featureId (split on "_") as "<1>:<2>-<3>". vapply() keeps the
140 | # column character even when there are no DERs; sapply() would return an
141 | # empty list there.
142 | out_DERs <- mutate(
143 |   out_DERs,
144 |   derId = vapply(
145 |     strsplit(featureId, "_"),
146 |     function(parts) paste0(parts[1], ":", parts[2], "-", parts[3]),
147 |     character(1)
148 |   )
149 | )
150 | # NOTE(review): start/end are written exactly as rowRanges() reports them;
151 | # confirm downstream tools do not expect 0-based BED starts.
152 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj)
153 | out <- subset(out, baseMean > 10)
154 | 
155 | write_tsv(out, "WT-N_DERs_3p.bed", col_names = FALSE)


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v1/diffsegR_WT-NvsC_gmuct_5p_v1.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## Installation
  4 | #### Conda
  5 | #conda create --name diffseg_v1
  6 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse
  7 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
  8 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
  9 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
 10 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
 11 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
 12 | #conda install -n diffseg_v1 -c conda-forge r-remotes
 13 | #conda activate diffseg_v1
 14 | 
 15 | #### R
 16 | #remotes::install_github("sanssouci-org/sanssouci")
 17 | #remotes::install_github("aLiehrmann/DiffSegR@f657435")
 18 | 
 19 | library(tidyverse)
 20 | library(DiffSegR)
 21 | 
 22 | # Worker threads handed to loadData() and segmentation().
 23 | nb_threads <- 10
 24 | 
 25 | # BAMs are read from, and all outputs written to, the launch directory.
 26 | working_directory <- getwd()
 27 | 
 28 | #- create sample information table --------------------------------------------#
 29 | bam_files <- c(
 30 |   "S5-3N_Aligned.sortedByCoord.out.bam",
 31 |   "S7-4N_Aligned.sortedByCoord.out.bam",
 32 |   "S11-10N_Aligned.sortedByCoord.out.bam",
 33 |   "S6-3C_Aligned.sortedByCoord.out.bam",
 34 |   "S8-4C_Aligned.sortedByCoord.out.bam",
 35 |   "S12-10C_Aligned.sortedByCoord.out.bam"
 36 | )
 37 | sample_ids <- c("WT.N_1", "WT.N_2", "WT.N_3", "WT.C_1", "WT.C_2", "WT.C_3")
 38 | 
 39 | sample_info <- data.frame(
 40 |   sample    = sample_ids,
 41 |   condition = rep(c("WT.N", "WT.C"), each = 3),
 42 |   replicate = rep(1:3, 2),
 43 |   # file.path() is vectorised; the BAM basenames are kept as element names,
 44 |   # as the former sapply() call produced, so any row names data.frame()
 45 |   # derives from them are unchanged.
 46 |   bam       = setNames(file.path(working_directory, bam_files), bam_files),
 47 |   coverage  = file.path(working_directory, paste0(sample_ids, ".rds"))
 48 | )
 49 | 
 50 | #- save sample information table ----------------------------------------------#
 51 | sample_info_path <- file.path(working_directory, "sample_info.txt")
 52 | write.table(sample_info, sample_info_path)
 53 | 
 54 | #- display sample information table -------------------------------------------#
 55 | knitr::kable(sample_info, row.names = FALSE)
 56 | 
 57 | #- chromosome table: X1 holds the sequence name, X2 is used as the locus end ---#
 58 | genome <- read_tsv(
 59 |   "~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len",
 60 |   col_names = FALSE
 61 | )
 62 | # Leave out the "Mt" and "Pt" entries.
 63 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt")
 64 | 
 65 | ### run the comparison once per chromosome and collect the DERs
 66 | chromosomes <- unique(genome$X1)
 67 | if (length(chromosomes) == 0) {
 68 |   stop("No chromosomes left after filtering the genome table.", call. = FALSE)
 69 | }
 70 | ders_by_chrom <- vector("list", length(chromosomes))
 71 | 
 72 | for (k in seq_along(chromosomes)) {
 73 |   seqid <- chromosomes[[k]]
 74 |   # Not called `stop`, so base::stop() is never masked.
 75 |   chrom_end <- genome$X2[genome$X1 == seqid]
 76 | 
 77 |   ## load data
 78 |   locus_data <- loadData(
 79 |     sampleInfo         = sample_info_path,
 80 |     locus              = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
 81 |     referenceCondition = "WT.C",
 82 |     isPairedEnd        = TRUE,
 83 |     readLength         = 150,
 84 |     coverageType       = "fivePrime",
 85 |     stranded           = FALSE,
 86 |     strandSpecific     = 0,
 87 |     fromBam            = TRUE,
 88 |     nbThreads          = nb_threads,
 89 |     verbose            = TRUE
 90 |   )
 91 | 
 92 |   ## Changepoint detection to define segments
 93 |   segments <- segmentation(
 94 |     data                   = locus_data,
 95 |     weightType             = "unweighted", # zeroInflated : low counts have less weight
 96 |     modelSelectionType     = "yao",
 97 |     featureCountsType      = "fromBam",
 98 |     compressed             = TRUE,
 99 |     alpha                  = 2,
100 |     segmentNeighborhood    = FALSE,
101 |     Kmax                   = NULL,
102 |     verbose                = FALSE,
103 |     nbThreadsGridSearch    = 1,
104 |     alphas                 = NULL,
105 |     gridSearch             = FALSE,
106 |     outputDirectory        = working_directory,
107 |     nbThreadsFeatureCounts = nb_threads,
108 |     strandSpecific         = 0,
109 |     read2pos               = 5,
110 |     # featureCountsOtherParams = list(allowMultiOverlap = FALSE)
111 |     isPairedEnd            = TRUE
112 |   )
113 | 
114 |   # Only segments with width < 11 go into the differential analysis.
115 |   segment_widths <- as.data.frame(SummarizedExperiment::rowRanges(segments))$width
116 |   narrow_segments <- segments[segment_widths < 11, ]
117 | 
118 |   dds <- dea(
119 |     data              = locus_data,
120 |     SExp              = narrow_segments,
121 |     design            = ~condition,
122 |     predicate         = NULL,
123 |     significanceLevel = 0.01,
124 |     verbose           = TRUE
125 |   )
126 | 
127 |   #- extract DERs based on significance -------------------------------------#
128 |   significant <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
129 |   ders_by_chrom[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(significant))
130 | }
131 | 
132 | # One rbind after the loop instead of regrowing the data frame each iteration.
133 | out_DERs <- do.call(rbind, ders_by_chrom)
134 | 
135 | #clear memory cache
136 | gc()
137 | 
138 | # derId reformats featureId (split on "_") as "<1>:<2>-<3>". vapply() keeps the
139 | # column character even when there are no DERs; sapply() would return an
140 | # empty list there.
141 | out_DERs <- mutate(
142 |   out_DERs,
143 |   derId = vapply(
144 |     strsplit(featureId, "_"),
145 |     function(parts) paste0(parts[1], ":", parts[2], "-", parts[3]),
146 |     character(1)
147 |   )
148 | )
149 | # NOTE(review): start/end are written exactly as rowRanges() reports them;
150 | # confirm downstream tools do not expect 0-based BED starts.
151 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj)
152 | out <- subset(out, baseMean > 10)
153 | 
154 | write_tsv(out, "WT-N_DERs_5p.bed", col_names = FALSE)


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v1/diffsegR_abh1-CvsWT-C_gmuct_3p_v1.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## Installation
  4 | #### Conda
  5 | #conda create --name diffseg_v1
  6 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse
  7 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
  8 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
  9 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
 10 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
 11 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
 12 | #conda install -n diffseg_v1 -c conda-forge r-remotes
 13 | #conda activate diffseg_v1
 14 | 
 15 | #### R
 16 | #remotes::install_github("sanssouci-org/sanssouci")
 17 | #remotes::install_github("aLiehrmann/DiffSegR@f657435")
 18 | 
 19 | library(tidyverse)
 20 | library(DiffSegR)
 21 | # Worker threads handed to loadData() and segmentation().
 22 | nb_threads <- 10
 23 | 
 24 | # BAMs are read from, and all outputs written to, the launch directory.
 25 | working_directory <- getwd()
 26 | 
 27 | #- create sample information table --------------------------------------------#
 28 | bam_files <- c(
 29 |   "S16-5C_Aligned.sortedByCoord.out.bam",
 30 |   "S33-9C_Aligned.sortedByCoord.out.bam",
 31 |   "S35-11C_Aligned.sortedByCoord.out.bam",
 32 |   "S6-3C_Aligned.sortedByCoord.out.bam",
 33 |   "S8-4C_Aligned.sortedByCoord.out.bam",
 34 |   "S12-10C_Aligned.sortedByCoord.out.bam"
 35 | )
 36 | sample_ids <- c("abh1.C_1", "abh1.C_2", "abh1.C_3", "WT.C_1", "WT.C_2", "WT.C_3")
 37 | 
 38 | sample_info <- data.frame(
 39 |   sample    = sample_ids,
 40 |   condition = rep(c("abh1.C", "WT.C"), each = 3),
 41 |   replicate = rep(1:3, 2),
 42 |   # file.path() is vectorised; the BAM basenames are kept as element names,
 43 |   # as the former sapply() call produced, so any row names data.frame()
 44 |   # derives from them are unchanged.
 45 |   bam       = setNames(file.path(working_directory, bam_files), bam_files),
 46 |   coverage  = file.path(working_directory, paste0(sample_ids, ".rds"))
 47 | )
 48 | 
 49 | #- save sample information table ----------------------------------------------#
 50 | sample_info_path <- file.path(working_directory, "sample_info.txt")
 51 | write.table(sample_info, sample_info_path)
 52 | 
 53 | #- display sample information table -------------------------------------------#
 54 | knitr::kable(sample_info, row.names = FALSE)
 55 | 
 56 | #- chromosome table: X1 holds the sequence name, X2 is used as the locus end ---#
 57 | genome <- read_tsv(
 58 |   "~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len",
 59 |   col_names = FALSE
 60 | )
 61 | # Leave out the "Mt" and "Pt" entries.
 62 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt")
 63 | 
 64 | ### run the comparison once per chromosome and collect the DERs
 65 | chromosomes <- unique(genome$X1)
 66 | if (length(chromosomes) == 0) {
 67 |   stop("No chromosomes left after filtering the genome table.", call. = FALSE)
 68 | }
 69 | ders_by_chrom <- vector("list", length(chromosomes))
 70 | 
 71 | for (k in seq_along(chromosomes)) {
 72 |   seqid <- chromosomes[[k]]
 73 |   # Not called `stop`, so base::stop() is never masked.
 74 |   chrom_end <- genome$X2[genome$X1 == seqid]
 75 | 
 76 |   ## load data
 77 |   locus_data <- loadData(
 78 |     sampleInfo         = sample_info_path,
 79 |     locus              = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
 80 |     referenceCondition = "WT.C",
 81 |     isPairedEnd        = TRUE,
 82 |     readLength         = 150,
 83 |     coverageType       = "threePrime",
 84 |     stranded           = FALSE,
 85 |     strandSpecific     = 0,
 86 |     fromBam            = TRUE,
 87 |     nbThreads          = nb_threads,
 88 |     verbose            = TRUE
 89 |   )
 90 | 
 91 |   ## Changepoint detection to define segments
 92 |   segments <- segmentation(
 93 |     data                   = locus_data,
 94 |     weightType             = "unweighted", # zeroInflated : low counts have less weight
 95 |     modelSelectionType     = "yao",
 96 |     featureCountsType      = "fromBam",
 97 |     compressed             = TRUE,
 98 |     alpha                  = 2,
 99 |     segmentNeighborhood    = FALSE,
100 |     Kmax                   = NULL,
101 |     verbose                = FALSE,
102 |     nbThreadsGridSearch    = 1,
103 |     alphas                 = NULL,
104 |     gridSearch             = FALSE,
105 |     outputDirectory        = working_directory,
106 |     nbThreadsFeatureCounts = nb_threads,
107 |     strandSpecific         = 0,
108 |     read2pos               = 3,
109 |     isPairedEnd            = TRUE
110 |   )
111 | 
112 |   # Only segments with width < 11 go into the differential analysis.
113 |   segment_widths <- as.data.frame(SummarizedExperiment::rowRanges(segments))$width
114 |   narrow_segments <- segments[segment_widths < 11, ]
115 | 
116 |   dds <- dea(
117 |     data              = locus_data,
118 |     SExp              = narrow_segments,
119 |     design            = ~condition,
120 |     predicate         = NULL,
121 |     significanceLevel = 0.01,
122 |     verbose           = TRUE
123 |   )
124 | 
125 |   #- extract DERs based on significance -------------------------------------#
126 |   significant <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
127 |   ders_by_chrom[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(significant))
128 | }
129 | 
130 | # One rbind after the loop instead of regrowing the data frame each iteration.
131 | out_DERs <- do.call(rbind, ders_by_chrom)
132 | 
133 | #clear memory cache
134 | gc()
135 | 
136 | # derId reformats featureId (split on "_") as "<1>:<2>-<3>". vapply() keeps the
137 | # column character even when there are no DERs; sapply() would return an
138 | # empty list there.
139 | out_DERs <- mutate(
140 |   out_DERs,
141 |   derId = vapply(
142 |     strsplit(featureId, "_"),
143 |     function(parts) paste0(parts[1], ":", parts[2], "-", parts[3]),
144 |     character(1)
145 |   )
146 | )
147 | # NOTE(review): start/end are written exactly as rowRanges() reports them;
148 | # confirm downstream tools do not expect 0-based BED starts.
149 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj)
150 | out <- subset(out, baseMean > 10)
151 | 
152 | write_tsv(out, "abh1-CvsWT-C_DERs_3p.bed", col_names = FALSE)


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v1/diffsegR_abh1-CvsWT-C_gmuct_3p_v1_stranded.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## Installation
  4 | #### Conda
  5 | #conda create --name diffseg_v1
  6 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse
  7 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
  8 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
  9 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
 10 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
 11 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
 12 | #conda install -n diffseg_v1 -c conda-forge r-remotes
 13 | #conda activate diffseg_v1
 14 | 
 15 | #### R
 16 | #remotes::install_github("sanssouci-org/sanssouci")
 17 | #remotes::install_github("aLiehrmann/DiffSegR@f657435")
 18 | 
 19 | library(tidyverse)
 20 | library(DiffSegR)
 21 | # Worker threads handed to loadData() and segmentation().
 22 | nb_threads <- 10
 23 | 
 24 | # BAMs are read from, and all outputs written to, the launch directory.
 25 | working_directory <- getwd()
 26 | 
 27 | #- create sample information table --------------------------------------------#
 28 | bam_files <- c(
 29 |   "S16-5C_Aligned.sortedByCoord.out.bam",
 30 |   "S33-9C_Aligned.sortedByCoord.out.bam",
 31 |   "S35-11C_Aligned.sortedByCoord.out.bam",
 32 |   "S6-3C_Aligned.sortedByCoord.out.bam",
 33 |   "S8-4C_Aligned.sortedByCoord.out.bam",
 34 |   "S12-10C_Aligned.sortedByCoord.out.bam"
 35 | )
 36 | sample_ids <- c("abh1.C_1", "abh1.C_2", "abh1.C_3", "WT.C_1", "WT.C_2", "WT.C_3")
 37 | 
 38 | sample_info <- data.frame(
 39 |   sample    = sample_ids,
 40 |   condition = rep(c("abh1.C", "WT.C"), each = 3),
 41 |   replicate = rep(1:3, 2),
 42 |   # file.path() is vectorised; the BAM basenames are kept as element names,
 43 |   # as the former sapply() call produced, so any row names data.frame()
 44 |   # derives from them are unchanged.
 45 |   bam       = setNames(file.path(working_directory, bam_files), bam_files),
 46 |   coverage  = file.path(working_directory, paste0(sample_ids, ".rds"))
 47 | )
 48 | 
 49 | #- save sample information table ----------------------------------------------#
 50 | sample_info_path <- file.path(working_directory, "sample_info.txt")
 51 | write.table(sample_info, sample_info_path)
 52 | 
 53 | #- display sample information table -------------------------------------------#
 54 | knitr::kable(sample_info, row.names = FALSE)
 55 | 
 56 | #- chromosome table: X1 holds the sequence name, X2 is used as the locus end ---#
 57 | genome <- read_tsv(
 58 |   "~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len",
 59 |   col_names = FALSE
 60 | )
 61 | # Leave out the "Mt" and "Pt" entries.
 62 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt")
 63 | 
 64 | ### run the comparison once per chromosome and collect the DERs
 65 | chromosomes <- unique(genome$X1)
 66 | if (length(chromosomes) == 0) {
 67 |   stop("No chromosomes left after filtering the genome table.", call. = FALSE)
 68 | }
 69 | ders_by_chrom <- vector("list", length(chromosomes))
 70 | 
 71 | for (k in seq_along(chromosomes)) {
 72 |   seqid <- chromosomes[[k]]
 73 |   # Not called `stop`, so base::stop() is never masked.
 74 |   chrom_end <- genome$X2[genome$X1 == seqid]
 75 | 
 76 |   ## load data
 77 |   locus_data <- loadData(
 78 |     sampleInfo         = sample_info_path,
 79 |     locus              = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
 80 |     referenceCondition = "WT.C",
 81 |     isPairedEnd        = TRUE,
 82 |     readLength         = 150,
 83 |     coverageType       = "threePrime",
 84 |     stranded           = TRUE,
 85 |     strandSpecific     = 1,
 86 |     fromBam            = TRUE,
 87 |     nbThreads          = nb_threads,
 88 |     verbose            = TRUE
 89 |   )
 90 | 
 91 |   ## Changepoint detection to define segments
 92 |   segments <- segmentation(
 93 |     data                   = locus_data,
 94 |     weightType             = "unweighted", # zeroInflated : low counts have less weight
 95 |     modelSelectionType     = "yao",
 96 |     featureCountsType      = "fromBam",
 97 |     compressed             = TRUE,
 98 |     alpha                  = 2,
 99 |     segmentNeighborhood    = FALSE,
100 |     Kmax                   = NULL,
101 |     verbose                = FALSE,
102 |     nbThreadsGridSearch    = 1,
103 |     alphas                 = NULL,
104 |     gridSearch             = FALSE,
105 |     outputDirectory        = working_directory,
106 |     nbThreadsFeatureCounts = nb_threads,
107 |     strandSpecific         = 1,
108 |     read2pos               = 3,
109 |     isPairedEnd            = TRUE
110 |   )
111 | 
112 |   # Only segments with width < 11 go into the differential analysis.
113 |   segment_widths <- as.data.frame(SummarizedExperiment::rowRanges(segments))$width
114 |   narrow_segments <- segments[segment_widths < 11, ]
115 | 
116 |   dds <- dea(
117 |     data              = locus_data,
118 |     SExp              = narrow_segments,
119 |     design            = ~condition,
120 |     predicate         = NULL,
121 |     significanceLevel = 0.01,
122 |     verbose           = TRUE
123 |   )
124 | 
125 |   #- extract DERs based on significance -------------------------------------#
126 |   significant <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
127 |   ders_by_chrom[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(significant))
128 | }
129 | 
130 | # One rbind after the loop instead of regrowing the data frame each iteration.
131 | out_DERs <- do.call(rbind, ders_by_chrom)
132 | 
133 | #clear memory cache
134 | gc()
135 | 
136 | # derId reformats featureId (split on "_") as "<1>:<2>-<3>.<4>". vapply() keeps
137 | # the column character even when there are no DERs; sapply() would return an
138 | # empty list there.
139 | out_DERs <- mutate(
140 |   out_DERs,
141 |   derId = vapply(
142 |     strsplit(featureId, "_"),
143 |     function(parts) paste0(parts[1], ":", parts[2], "-", parts[3], ".", parts[4]),
144 |     character(1)
145 |   )
146 | )
147 | # NOTE(review): start/end are written exactly as rowRanges() reports them;
148 | # confirm downstream tools do not expect 0-based BED starts.
149 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, strand, baseVar, log2FoldChange, padj)
150 | out <- subset(out, baseMean > 10)
151 | 
152 | # NOTE(review): the unstranded 3p script writes this same file name; running
153 | # both in one directory overwrites one result with the other.
154 | write_tsv(out, "abh1-CvsWT-C_DERs_3p.bed", col_names = FALSE)


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v1/diffsegR_abh1-CvsWT-C_gmuct_5p_v1.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## Installation
  4 | #### Conda
  5 | #conda create --name diffseg_v1
  6 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse
  7 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
  8 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
  9 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
 10 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
 11 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
 12 | #conda install -n diffseg_v1 -c conda-forge r-remotes
 13 | #conda activate diffseg_v1
 14 | 
 15 | #### R
 16 | #remotes::install_github("sanssouci-org/sanssouci")
 17 | #remotes::install_github("aLiehrmann/DiffSegR@f657435")
 18 | 
 19 | library(tidyverse)
 20 | library(DiffSegR)
 21 | # Worker threads handed to loadData() and segmentation().
 22 | nb_threads <- 10
 23 | 
 24 | # BAMs are read from, and all outputs written to, the launch directory.
 25 | working_directory <- getwd()
 26 | 
 27 | #- create sample information table --------------------------------------------#
 28 | bam_files <- c(
 29 |   "S16-5C_Aligned.sortedByCoord.out.bam",
 30 |   "S33-9C_Aligned.sortedByCoord.out.bam",
 31 |   "S35-11C_Aligned.sortedByCoord.out.bam",
 32 |   "S6-3C_Aligned.sortedByCoord.out.bam",
 33 |   "S8-4C_Aligned.sortedByCoord.out.bam",
 34 |   "S12-10C_Aligned.sortedByCoord.out.bam"
 35 | )
 36 | sample_ids <- c("abh1.C_1", "abh1.C_2", "abh1.C_3", "WT.C_1", "WT.C_2", "WT.C_3")
 37 | 
 38 | sample_info <- data.frame(
 39 |   sample    = sample_ids,
 40 |   condition = rep(c("abh1.C", "WT.C"), each = 3),
 41 |   replicate = rep(1:3, 2),
 42 |   # file.path() is vectorised; the BAM basenames are kept as element names,
 43 |   # as the former sapply() call produced, so any row names data.frame()
 44 |   # derives from them are unchanged.
 45 |   bam       = setNames(file.path(working_directory, bam_files), bam_files),
 46 |   coverage  = file.path(working_directory, paste0(sample_ids, ".rds"))
 47 | )
 48 | 
 49 | #- save sample information table ----------------------------------------------#
 50 | sample_info_path <- file.path(working_directory, "sample_info.txt")
 51 | write.table(sample_info, sample_info_path)
 52 | 
 53 | #- display sample information table -------------------------------------------#
 54 | knitr::kable(sample_info, row.names = FALSE)
 55 | 
 56 | #- chromosome table: X1 holds the sequence name, X2 is used as the locus end ---#
 57 | genome <- read_tsv(
 58 |   "~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len",
 59 |   col_names = FALSE
 60 | )
 61 | # Leave out the "Mt" and "Pt" entries.
 62 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt")
 63 | 
 64 | ### run the comparison once per chromosome and collect the DERs
 65 | chromosomes <- unique(genome$X1)
 66 | if (length(chromosomes) == 0) {
 67 |   stop("No chromosomes left after filtering the genome table.", call. = FALSE)
 68 | }
 69 | ders_by_chrom <- vector("list", length(chromosomes))
 70 | 
 71 | for (k in seq_along(chromosomes)) {
 72 |   seqid <- chromosomes[[k]]
 73 |   # Not called `stop`, so base::stop() is never masked.
 74 |   chrom_end <- genome$X2[genome$X1 == seqid]
 75 | 
 76 |   ## load data
 77 |   locus_data <- loadData(
 78 |     sampleInfo         = sample_info_path,
 79 |     locus              = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
 80 |     referenceCondition = "WT.C",
 81 |     isPairedEnd        = TRUE,
 82 |     readLength         = 150,
 83 |     coverageType       = "fivePrime",
 84 |     stranded           = FALSE,
 85 |     strandSpecific     = 0,
 86 |     fromBam            = TRUE,
 87 |     nbThreads          = nb_threads,
 88 |     verbose            = TRUE
 89 |   )
 90 | 
 91 |   ## Changepoint detection to define segments
 92 |   segments <- segmentation(
 93 |     data                   = locus_data,
 94 |     weightType             = "unweighted", # zeroInflated : low counts have less weight
 95 |     modelSelectionType     = "yao",
 96 |     featureCountsType      = "fromBam",
 97 |     compressed             = TRUE,
 98 |     alpha                  = 2,
 99 |     segmentNeighborhood    = FALSE,
100 |     Kmax                   = NULL,
101 |     verbose                = FALSE,
102 |     nbThreadsGridSearch    = 1,
103 |     alphas                 = NULL,
104 |     gridSearch             = FALSE,
105 |     outputDirectory        = working_directory,
106 |     nbThreadsFeatureCounts = nb_threads,
107 |     strandSpecific         = 0,
108 |     read2pos               = 5,
109 |     isPairedEnd            = TRUE
110 |   )
111 | 
112 |   # Only segments with width < 11 go into the differential analysis.
113 |   segment_widths <- as.data.frame(SummarizedExperiment::rowRanges(segments))$width
114 |   narrow_segments <- segments[segment_widths < 11, ]
115 | 
116 |   dds <- dea(
117 |     data              = locus_data,
118 |     SExp              = narrow_segments,
119 |     design            = ~condition,
120 |     predicate         = NULL,
121 |     significanceLevel = 0.01,
122 |     orderBy           = "pvalue",
123 |     verbose           = TRUE
124 |   )
125 | 
126 |   #- extract DERs based on significance -------------------------------------#
127 |   significant <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
128 |   ders_by_chrom[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(significant))
129 | }
130 | 
131 | # One rbind after the loop instead of regrowing the data frame each iteration.
132 | out_DERs <- do.call(rbind, ders_by_chrom)
133 | 
134 | #clear memory cache
135 | gc()
136 | 
137 | # derId reformats featureId (split on "_") as "<1>:<2>-<3>". vapply() keeps the
138 | # column character even when there are no DERs; sapply() would return an
139 | # empty list there.
140 | out_DERs <- mutate(
141 |   out_DERs,
142 |   derId = vapply(
143 |     strsplit(featureId, "_"),
144 |     function(parts) paste0(parts[1], ":", parts[2], "-", parts[3]),
145 |     character(1)
146 |   )
147 | )
148 | # NOTE(review): start/end are written exactly as rowRanges() reports them;
149 | # confirm downstream tools do not expect 0-based BED starts.
150 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj)
151 | out <- subset(out, baseMean > 10)
152 | 
153 | write_tsv(out, "abh1-CvsWT-C_DERs_5p.bed", col_names = FALSE)


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v1/diffsegR_abh1-CvsWT-C_gmuct_5p_v1_stranded.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## Installation
  4 | #### Conda
  5 | #conda create --name diffseg_v1
  6 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse
  7 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
  8 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
  9 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
 10 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
 11 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
 12 | #conda install -n diffseg_v1 -c conda-forge r-remotes
 13 | #conda activate diffseg_v1
 14 | 
 15 | #### R
 16 | #remotes::install_github("sanssouci-org/sanssouci")
 17 | #remotes::install_github("aLiehrmann/DiffSegR@f657435")
 18 | 
 19 | library(tidyverse)
 20 | library(DiffSegR)
 21 | # Worker threads handed to loadData() and segmentation().
 22 | nb_threads <- 10
 23 | 
 24 | # BAMs are read from, and all outputs written to, the launch directory.
 25 | working_directory <- getwd()
 26 | 
 27 | #- create sample information table --------------------------------------------#
 28 | bam_files <- c(
 29 |   "S16-5C_Aligned.sortedByCoord.out.bam",
 30 |   "S33-9C_Aligned.sortedByCoord.out.bam",
 31 |   "S35-11C_Aligned.sortedByCoord.out.bam",
 32 |   "S6-3C_Aligned.sortedByCoord.out.bam",
 33 |   "S8-4C_Aligned.sortedByCoord.out.bam",
 34 |   "S12-10C_Aligned.sortedByCoord.out.bam"
 35 | )
 36 | sample_ids <- c("abh1.C_1", "abh1.C_2", "abh1.C_3", "WT.C_1", "WT.C_2", "WT.C_3")
 37 | 
 38 | sample_info <- data.frame(
 39 |   sample    = sample_ids,
 40 |   condition = rep(c("abh1.C", "WT.C"), each = 3),
 41 |   replicate = rep(1:3, 2),
 42 |   # file.path() is vectorised; the BAM basenames are kept as element names,
 43 |   # as the former sapply() call produced, so any row names data.frame()
 44 |   # derives from them are unchanged.
 45 |   bam       = setNames(file.path(working_directory, bam_files), bam_files),
 46 |   coverage  = file.path(working_directory, paste0(sample_ids, ".rds"))
 47 | )
 48 | 
 49 | #- save sample information table ----------------------------------------------#
 50 | sample_info_path <- file.path(working_directory, "sample_info.txt")
 51 | write.table(sample_info, sample_info_path)
 52 | 
 53 | #- display sample information table -------------------------------------------#
 54 | knitr::kable(sample_info, row.names = FALSE)
 55 | 
 56 | #- chromosome table: X1 holds the sequence name, X2 is used as the locus end ---#
 57 | genome <- read_tsv(
 58 |   "~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len",
 59 |   col_names = FALSE
 60 | )
 61 | # Leave out the "Mt" and "Pt" entries.
 62 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt")
 63 | 
 64 | ### run the comparison once per chromosome and collect the DERs
 65 | chromosomes <- unique(genome$X1)
 66 | if (length(chromosomes) == 0) {
 67 |   stop("No chromosomes left after filtering the genome table.", call. = FALSE)
 68 | }
 69 | ders_by_chrom <- vector("list", length(chromosomes))
 70 | 
 71 | for (k in seq_along(chromosomes)) {
 72 |   seqid <- chromosomes[[k]]
 73 |   # Not called `stop`, so base::stop() is never masked.
 74 |   chrom_end <- genome$X2[genome$X1 == seqid]
 75 | 
 76 |   ## load data
 77 |   locus_data <- loadData(
 78 |     sampleInfo         = sample_info_path,
 79 |     locus              = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
 80 |     referenceCondition = "WT.C",
 81 |     isPairedEnd        = TRUE,
 82 |     readLength         = 150,
 83 |     coverageType       = "fivePrime",
 84 |     stranded           = TRUE,
 85 |     strandSpecific     = 1,
 86 |     fromBam            = TRUE,
 87 |     nbThreads          = nb_threads,
 88 |     verbose            = TRUE
 89 |   )
 90 | 
 91 |   ## Changepoint detection to define segments
 92 |   segments <- segmentation(
 93 |     data                   = locus_data,
 94 |     weightType             = "unweighted", # zeroInflated : low counts have less weight
 95 |     modelSelectionType     = "yao",
 96 |     featureCountsType      = "fromBam",
 97 |     compressed             = TRUE,
 98 |     alpha                  = 2,
 99 |     segmentNeighborhood    = FALSE,
100 |     Kmax                   = NULL,
101 |     verbose                = FALSE,
102 |     nbThreadsGridSearch    = 1,
103 |     alphas                 = NULL,
104 |     gridSearch             = FALSE,
105 |     outputDirectory        = working_directory,
106 |     nbThreadsFeatureCounts = nb_threads,
107 |     strandSpecific         = 1,
108 |     read2pos               = 5,
109 |     isPairedEnd            = TRUE
110 |   )
111 | 
112 |   # Only segments with width < 11 go into the differential analysis.
113 |   segment_widths <- as.data.frame(SummarizedExperiment::rowRanges(segments))$width
114 |   narrow_segments <- segments[segment_widths < 11, ]
115 | 
116 |   dds <- dea(
117 |     data              = locus_data,
118 |     SExp              = narrow_segments,
119 |     design            = ~condition,
120 |     predicate         = NULL,
121 |     significanceLevel = 0.01,
122 |     orderBy           = "pvalue",
123 |     verbose           = TRUE
124 |   )
125 | 
126 |   #- extract DERs based on significance -------------------------------------#
127 |   significant <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
128 |   ders_by_chrom[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(significant))
129 | }
130 | 
131 | # One rbind after the loop instead of regrowing the data frame each iteration.
132 | out_DERs <- do.call(rbind, ders_by_chrom)
133 | 
134 | #clear memory cache
135 | gc()
136 | 
137 | # derId reformats featureId (split on "_") as "<1>:<2>-<3>.<4>". vapply() keeps
138 | # the column character even when there are no DERs; sapply() would return an
139 | # empty list there.
140 | out_DERs <- mutate(
141 |   out_DERs,
142 |   derId = vapply(
143 |     strsplit(featureId, "_"),
144 |     function(parts) paste0(parts[1], ":", parts[2], "-", parts[3], ".", parts[4]),
145 |     character(1)
146 |   )
147 | )
148 | # NOTE(review): start/end are written exactly as rowRanges() reports them;
149 | # confirm downstream tools do not expect 0-based BED starts.
150 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, strand, baseVar, log2FoldChange, padj)
151 | out <- subset(out, baseMean > 10)
152 | 
153 | # NOTE(review): the unstranded 5p script writes this same file name; running
154 | # both in one directory overwrites one result with the other.
155 | write_tsv(out, "abh1-CvsWT-C_DERs_5p.bed", col_names = FALSE)


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v1/diffsegR_abh1-NvsWT-N_gmuct_3p_v1.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## Installation
  4 | #### Conda
  5 | #conda create --name diffseg_v1
  6 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse
  7 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
  8 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
  9 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
 10 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
 11 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
 12 | #conda install -n diffseg_v1 -c conda-forge r-remotes
 13 | #conda activate diffseg_v1
 14 | 
 15 | #### R
 16 | #remotes::install_github("sanssouci-org/sanssouci")
 17 | #remotes::install_github("aLiehrmann/DiffSegR@f657435")
 18 | 
 19 | library(tidyverse)
 20 | library(DiffSegR)
 21 | # Worker threads handed to loadData() and segmentation().
 22 | nb_threads <- 10
 23 | 
 24 | # BAMs are read from, and all outputs written to, the launch directory.
 25 | working_directory <- getwd()
 26 | 
 27 | #- create sample information table --------------------------------------------#
 28 | bam_files <- c(
 29 |   "S15-5N_Aligned.sortedByCoord.out.bam",
 30 |   "S32-9N_Aligned.sortedByCoord.out.bam",
 31 |   "S34-11N_Aligned.sortedByCoord.out.bam",
 32 |   "S5-3N_Aligned.sortedByCoord.out.bam",
 33 |   "S7-4N_Aligned.sortedByCoord.out.bam",
 34 |   "S11-10N_Aligned.sortedByCoord.out.bam"
 35 | )
 36 | sample_ids <- c("abh1.N_1", "abh1.N_2", "abh1.N_3", "WT.N_1", "WT.N_2", "WT.N_3")
 37 | 
 38 | sample_info <- data.frame(
 39 |   sample    = sample_ids,
 40 |   condition = rep(c("abh1.N", "WT.N"), each = 3),
 41 |   replicate = rep(1:3, 2),
 42 |   # file.path() is vectorised; the BAM basenames are kept as element names,
 43 |   # as the former sapply() call produced, so any row names data.frame()
 44 |   # derives from them are unchanged.
 45 |   bam       = setNames(file.path(working_directory, bam_files), bam_files),
 46 |   coverage  = file.path(working_directory, paste0(sample_ids, ".rds"))
 47 | )
 48 | 
 49 | #- save sample information table ----------------------------------------------#
 50 | sample_info_path <- file.path(working_directory, "sample_info.txt")
 51 | write.table(sample_info, sample_info_path)
 52 | 
 53 | #- display sample information table -------------------------------------------#
 54 | knitr::kable(sample_info, row.names = FALSE)
 55 | 
 56 | #- chromosome table: X1 holds the sequence name, X2 is used as the locus end ---#
 57 | genome <- read_tsv(
 58 |   "~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len",
 59 |   col_names = FALSE
 60 | )
 61 | # Leave out the "Mt" and "Pt" entries.
 62 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt")
 63 | 
 64 | ### run the comparison once per chromosome and collect the DERs
 65 | chromosomes <- unique(genome$X1)
 66 | if (length(chromosomes) == 0) {
 67 |   stop("No chromosomes left after filtering the genome table.", call. = FALSE)
 68 | }
 69 | ders_by_chrom <- vector("list", length(chromosomes))
 70 | 
 71 | for (k in seq_along(chromosomes)) {
 72 |   seqid <- chromosomes[[k]]
 73 |   # Not called `stop`, so base::stop() is never masked.
 74 |   chrom_end <- genome$X2[genome$X1 == seqid]
 75 | 
 76 |   ## load data
 77 |   locus_data <- loadData(
 78 |     sampleInfo         = sample_info_path,
 79 |     locus              = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
 80 |     referenceCondition = "WT.N",
 81 |     isPairedEnd        = TRUE,
 82 |     readLength         = 150,
 83 |     coverageType       = "threePrime",
 84 |     stranded           = FALSE,
 85 |     strandSpecific     = 0,
 86 |     fromBam            = TRUE,
 87 |     nbThreads          = nb_threads,
 88 |     verbose            = TRUE
 89 |   )
 90 | 
 91 |   ## Changepoint detection to define segments
 92 |   segments <- segmentation(
 93 |     data                   = locus_data,
 94 |     weightType             = "unweighted", # zeroInflated : low counts have less weight
 95 |     modelSelectionType     = "yao",
 96 |     featureCountsType      = "fromBam",
 97 |     compressed             = TRUE,
 98 |     alpha                  = 2,
 99 |     segmentNeighborhood    = FALSE,
100 |     Kmax                   = NULL,
101 |     verbose                = FALSE,
102 |     nbThreadsGridSearch    = 1,
103 |     alphas                 = NULL,
104 |     gridSearch             = FALSE,
105 |     outputDirectory        = working_directory,
106 |     nbThreadsFeatureCounts = nb_threads,
107 |     strandSpecific         = 0,
108 |     read2pos               = 3,
109 |     isPairedEnd            = TRUE
110 |   )
111 | 
112 |   # Only segments with width < 11 go into the differential analysis.
113 |   segment_widths <- as.data.frame(SummarizedExperiment::rowRanges(segments))$width
114 |   narrow_segments <- segments[segment_widths < 11, ]
115 | 
116 |   dds <- dea(
117 |     data              = locus_data,
118 |     SExp              = narrow_segments,
119 |     design            = ~condition,
120 |     predicate         = NULL,
121 |     significanceLevel = 0.01,
122 |     verbose           = TRUE
123 |   )
124 | 
125 |   #- extract DERs based on significance -------------------------------------#
126 |   significant <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
127 |   ders_by_chrom[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(significant))
128 | }
129 | 
130 | # One rbind after the loop instead of regrowing the data frame each iteration.
131 | out_DERs <- do.call(rbind, ders_by_chrom)
132 | 
133 | #clear memory cache
134 | gc()
135 | 
136 | # derId reformats featureId (split on "_") as "<1>:<2>-<3>". vapply() keeps the
137 | # column character even when there are no DERs; sapply() would return an
138 | # empty list there.
139 | out_DERs <- mutate(
140 |   out_DERs,
141 |   derId = vapply(
142 |     strsplit(featureId, "_"),
143 |     function(parts) paste0(parts[1], ":", parts[2], "-", parts[3]),
144 |     character(1)
145 |   )
146 | )
147 | # NOTE(review): start/end are written exactly as rowRanges() reports them;
148 | # confirm downstream tools do not expect 0-based BED starts.
149 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj)
150 | out <- subset(out, baseMean > 10)
151 | 
152 | write_tsv(out, "abh1-NvsWT-N_DERs_3p.bed", col_names = FALSE)


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v1/diffsegR_abh1-NvsWT-N_gmuct_3p_v1_stranded.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## DiffSegR (v1 API): abh1.N vs WT.N (reference), 3'-end coverage, stranded.
  4 | ## For each nuclear chromosome: load coverage from the BAMs, segment it,
  5 | ## keep segments narrower than 11 nt, run dea(), and collect the rejected
  6 | ## hypotheses. The pooled table (baseMean > 10) is written as headerless TSV.
  7 | 
  8 | ## Installation
  9 | #### Conda
 10 | #conda create --name diffseg_v1
 11 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse
 12 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
 13 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
 14 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
 15 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
 16 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
 17 | #conda install -n diffseg_v1 -c conda-forge r-remotes
 18 | #conda activate diffseg_v1
 19 | 
 20 | #### R
 21 | #remotes::install_github("sanssouci-org/sanssouci")
 22 | #remotes::install_github("aLiehrmann/DiffSegR@f657435")
 23 | 
 24 | library(tidyverse)
 25 | library(DiffSegR)
 26 | 
 27 | nb_threads <- 10
 28 | working_directory <- getwd()
 29 | output_file <- "abh1-NvsWT-N_DERs_3p.bed"
 30 | 
 31 | #- create sample information table --------------------------------------------#
 32 | sample_ids <- c("abh1.N_1", "abh1.N_2", "abh1.N_3", "WT.N_1", "WT.N_2", "WT.N_3")
 33 | bam_files <- c(
 34 |   "S15-5N_Aligned.sortedByCoord.out.bam",
 35 |   "S32-9N_Aligned.sortedByCoord.out.bam",
 36 |   "S34-11N_Aligned.sortedByCoord.out.bam",
 37 |   "S5-3N_Aligned.sortedByCoord.out.bam",
 38 |   "S7-4N_Aligned.sortedByCoord.out.bam",
 39 |   "S11-10N_Aligned.sortedByCoord.out.bam"
 40 | )
 41 | 
 42 | sample_info <- data.frame(
 43 |   sample    = sample_ids,
 44 |   condition = rep(c("abh1.N", "WT.N"), each = 3),
 45 |   replicate = rep(1:3, times = 2),
 46 |   # paths stay named by BAM file, as the former sapply() call produced;
 47 |   # data.frame() can use such names as row names, which write.table() emits
 48 |   bam       = setNames(file.path(working_directory, bam_files), bam_files),
 49 |   coverage  = file.path(working_directory, paste0(sample_ids, ".rds"))
 50 | )
 51 | 
 52 | #- save sample information table ----------------------------------------------#
 53 | write.table(sample_info, file.path(working_directory, "sample_info.txt"))
 54 | 
 55 | #- display sample information table -------------------------------------------#
 56 | knitr::kable(sample_info, row.names = FALSE)
 57 | 
 58 | #- chromosome table (X1 = name, X2 = end coordinate); Mt and Pt are skipped ----#
 59 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE)
 60 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt")
 61 | 
 62 | ### run the comparison chromosome by chromosome
 63 | seqids <- unique(genome$X1)
 64 | # one pre-allocated slot per chromosome instead of growing a data frame
 65 | ders_by_chrom <- vector("list", length(seqids))
 66 | 
 67 | for (k in seq_along(seqids)) {
 68 |   seqid <- seqids[[k]]
 69 |   # not called `stop`, which would shadow the name of base::stop()
 70 |   chrom_end <- genome$X2[genome$X1 == seqid]
 71 | 
 72 |   ## load data
 73 |   data <- loadData(
 74 |     sampleInfo   = file.path(working_directory, "sample_info.txt"),
 75 |     locus        = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
 76 |     referenceCondition = "WT.N",
 77 |     isPairedEnd = TRUE,
 78 |     readLength = 150,
 79 |     coverageType = "threePrime",
 80 |     stranded = TRUE,
 81 |     strandSpecific = 1,
 82 |     fromBam    = TRUE,
 83 |     nbThreads  = nb_threads,
 84 |     verbose = TRUE
 85 |   )
 86 | 
 87 |   ## Changepoint detection to define segments
 88 |   SExp <- segmentation(
 89 |     data = data,
 90 |     weightType = "unweighted", #zeroInflated : low counts have less weight
 91 |     modelSelectionType = "yao",
 92 |     featureCountsType = "fromBam",
 93 |     compressed = TRUE,
 94 |     alpha = 2,
 95 |     segmentNeighborhood = FALSE,
 96 |     Kmax = NULL,
 97 |     verbose = FALSE,
 98 |     nbThreadsGridSearch = 1,
 99 |     alphas = NULL,
100 |     gridSearch = FALSE,
101 |     outputDirectory = working_directory,
102 |     nbThreadsFeatureCounts = nb_threads,
103 |     strandSpecific = 1,
104 |     read2pos = 3,
105 |     isPairedEnd = TRUE
106 |   )
107 | 
108 |   #- subset to segments with width < 11 nt
109 |   SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11, ]
110 | 
111 |   ## differential expression analysis on the short segments
112 |   dds <- dea(
113 |     data      = data,
114 |     SExp      = SExp_10,
115 |     design    = ~condition,
116 |     predicate = NULL,
117 |     significanceLevel = 0.01,
118 |     verbose = TRUE
119 |   )
120 | 
121 |   #- extract DERs based on significance
122 |   DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
123 |   ders_by_chrom[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
124 | }
125 | 
126 | #clear memory cache
127 | gc()
128 | 
129 | out_DERs <- do.call(rbind, ders_by_chrom)
130 | 
131 | if (is.null(out_DERs) || nrow(out_DERs) == 0) {
132 |   # nothing was rejected on any chromosome: leave an empty output file
133 |   # instead of failing inside mutate()
134 |   message("No DERs detected; writing empty file ", output_file)
135 |   invisible(file.create(output_file))
136 | } else {
137 |   # the "_"-separated fields of featureId are recombined as f1:f2-f3.f4
138 |   out_DERs <- mutate(out_DERs, derId = vapply(
139 |     strsplit(featureId, "_"),
140 |     function(l) paste0(l[1], ":", l[2], "-", l[3], ".", l[4]),
141 |     character(1)
142 |   ))
143 |   out <- select(out_DERs, seqnames, start, end, derId, baseMean, strand, baseVar, log2FoldChange, padj)
144 |   out <- subset(out, baseMean > 10)
145 | 
146 |   # NOTE(review): start/end are written as reported by rowRanges(); confirm
147 |   # whether downstream BED consumers expect 0-based starts.
148 |   # NOTE(review): another 3p script of this workflow writes the same file
149 |   # name, so running both in one directory overwrites one result.
150 |   write_tsv(out, output_file, col_names = FALSE)
151 | }


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v1/diffsegR_abh1-NvsWT-N_gmuct_5p_v1.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## DiffSegR (v1 API): abh1.N vs WT.N (reference), 5'-end coverage, unstranded.
  4 | ## For each nuclear chromosome: load coverage from the BAMs, segment it,
  5 | ## keep segments narrower than 11 nt, run dea(), and collect the rejected
  6 | ## hypotheses. The pooled table (baseMean > 10) is written as headerless TSV.
  7 | 
  8 | ## Installation
  9 | #### Conda
 10 | #conda create --name diffseg_v1
 11 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse
 12 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
 13 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
 14 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
 15 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
 16 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
 17 | #conda install -n diffseg_v1 -c conda-forge r-remotes
 18 | #conda activate diffseg_v1
 19 | 
 20 | #### R
 21 | #remotes::install_github("sanssouci-org/sanssouci")
 22 | #remotes::install_github("aLiehrmann/DiffSegR@f657435")
 23 | 
 24 | library(tidyverse)
 25 | library(DiffSegR)
 26 | 
 27 | nb_threads <- 10
 28 | working_directory <- getwd()
 29 | output_file <- "abh1-NvsWT-N_DERs_5p.bed"
 30 | 
 31 | #- create sample information table --------------------------------------------#
 32 | sample_ids <- c("abh1.N_1", "abh1.N_2", "abh1.N_3", "WT.N_1", "WT.N_2", "WT.N_3")
 33 | bam_files <- c(
 34 |   "S15-5N_Aligned.sortedByCoord.out.bam",
 35 |   "S32-9N_Aligned.sortedByCoord.out.bam",
 36 |   "S34-11N_Aligned.sortedByCoord.out.bam",
 37 |   "S5-3N_Aligned.sortedByCoord.out.bam",
 38 |   "S7-4N_Aligned.sortedByCoord.out.bam",
 39 |   "S11-10N_Aligned.sortedByCoord.out.bam"
 40 | )
 41 | 
 42 | sample_info <- data.frame(
 43 |   sample    = sample_ids,
 44 |   condition = rep(c("abh1.N", "WT.N"), each = 3),
 45 |   replicate = rep(1:3, times = 2),
 46 |   # paths stay named by BAM file, as the former sapply() call produced;
 47 |   # data.frame() can use such names as row names, which write.table() emits
 48 |   bam       = setNames(file.path(working_directory, bam_files), bam_files),
 49 |   coverage  = file.path(working_directory, paste0(sample_ids, ".rds"))
 50 | )
 51 | 
 52 | #- save sample information table ----------------------------------------------#
 53 | write.table(sample_info, file.path(working_directory, "sample_info.txt"))
 54 | 
 55 | #- display sample information table -------------------------------------------#
 56 | knitr::kable(sample_info, row.names = FALSE)
 57 | 
 58 | #- chromosome table (X1 = name, X2 = end coordinate); Mt and Pt are skipped ----#
 59 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE)
 60 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt")
 61 | 
 62 | ### run the comparison chromosome by chromosome
 63 | seqids <- unique(genome$X1)
 64 | # one pre-allocated slot per chromosome instead of growing a data frame
 65 | ders_by_chrom <- vector("list", length(seqids))
 66 | 
 67 | for (k in seq_along(seqids)) {
 68 |   seqid <- seqids[[k]]
 69 |   # not called `stop`, which would shadow the name of base::stop()
 70 |   chrom_end <- genome$X2[genome$X1 == seqid]
 71 | 
 72 |   ## load data
 73 |   data <- loadData(
 74 |     sampleInfo   = file.path(working_directory, "sample_info.txt"),
 75 |     locus        = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
 76 |     referenceCondition = "WT.N",
 77 |     isPairedEnd = TRUE,
 78 |     readLength = 150,
 79 |     coverageType = "fivePrime",
 80 |     stranded = FALSE,
 81 |     strandSpecific = 0,
 82 |     fromBam    = TRUE,
 83 |     nbThreads  = nb_threads,
 84 |     verbose = TRUE
 85 |   )
 86 | 
 87 |   ## Changepoint detection to define segments
 88 |   SExp <- segmentation(
 89 |     data = data,
 90 |     weightType = "unweighted", #zeroInflated : low counts have less weight
 91 |     modelSelectionType = "yao",
 92 |     featureCountsType = "fromBam",
 93 |     compressed = TRUE,
 94 |     alpha = 2,
 95 |     segmentNeighborhood = FALSE,
 96 |     Kmax = NULL,
 97 |     verbose = FALSE,
 98 |     nbThreadsGridSearch = 1,
 99 |     alphas = NULL,
100 |     gridSearch = FALSE,
101 |     outputDirectory = working_directory,
102 |     nbThreadsFeatureCounts = nb_threads,
103 |     strandSpecific = 0,
104 |     read2pos = 5,
105 |     isPairedEnd = TRUE
106 |   )
107 | 
108 |   #- subset to segments with width < 11 nt
109 |   SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11, ]
110 | 
111 |   ## differential expression analysis on the short segments
112 |   dds <- dea(
113 |     data      = data,
114 |     SExp      = SExp_10,
115 |     design    = ~condition,
116 |     predicate = NULL,
117 |     significanceLevel = 0.01,
118 |     orderBy = "pvalue",
119 |     verbose = TRUE
120 |   )
121 | 
122 |   #- extract DERs based on significance
123 |   DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
124 |   ders_by_chrom[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
125 | }
126 | 
127 | #clear memory cache
128 | gc()
129 | 
130 | out_DERs <- do.call(rbind, ders_by_chrom)
131 | 
132 | if (is.null(out_DERs) || nrow(out_DERs) == 0) {
133 |   # nothing was rejected on any chromosome: leave an empty output file
134 |   # instead of failing inside mutate()
135 |   message("No DERs detected; writing empty file ", output_file)
136 |   invisible(file.create(output_file))
137 | } else {
138 |   # the "_"-separated fields of featureId are recombined as f1:f2-f3
139 |   out_DERs <- mutate(out_DERs, derId = vapply(
140 |     strsplit(featureId, "_"),
141 |     function(l) paste0(l[1], ":", l[2], "-", l[3]),
142 |     character(1)
143 |   ))
144 |   out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj)
145 |   out <- subset(out, baseMean > 10)
146 | 
147 |   # NOTE(review): start/end are written as reported by rowRanges(); confirm
148 |   # whether downstream BED consumers expect 0-based starts.
149 |   # NOTE(review): the stranded 5p script of this workflow writes the same
150 |   # file name, so running both in one directory overwrites one result.
151 |   write_tsv(out, output_file, col_names = FALSE)
152 | }


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v1/diffsegR_abh1-NvsWT-N_gmuct_5p_v1_stranded.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## DiffSegR (v1 API): abh1.N vs WT.N (reference), 5'-end coverage, stranded.
  4 | ## For each nuclear chromosome: load coverage from the BAMs, segment it,
  5 | ## keep segments narrower than 11 nt, run dea(), and collect the rejected
  6 | ## hypotheses. The pooled table (baseMean > 10) is written as headerless TSV.
  7 | 
  8 | ## Installation
  9 | #### Conda
 10 | #conda create --name diffseg_v1
 11 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse
 12 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
 13 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
 14 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
 15 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
 16 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
 17 | #conda install -n diffseg_v1 -c conda-forge r-remotes
 18 | #conda activate diffseg_v1
 19 | 
 20 | #### R
 21 | #remotes::install_github("sanssouci-org/sanssouci")
 22 | #remotes::install_github("aLiehrmann/DiffSegR@f657435")
 23 | 
 24 | library(tidyverse)
 25 | library(DiffSegR)
 26 | 
 27 | nb_threads <- 10
 28 | working_directory <- getwd()
 29 | output_file <- "abh1-NvsWT-N_DERs_5p.bed"
 30 | 
 31 | #- create sample information table --------------------------------------------#
 32 | sample_ids <- c("abh1.N_1", "abh1.N_2", "abh1.N_3", "WT.N_1", "WT.N_2", "WT.N_3")
 33 | bam_files <- c(
 34 |   "S15-5N_Aligned.sortedByCoord.out.bam",
 35 |   "S32-9N_Aligned.sortedByCoord.out.bam",
 36 |   "S34-11N_Aligned.sortedByCoord.out.bam",
 37 |   "S5-3N_Aligned.sortedByCoord.out.bam",
 38 |   "S7-4N_Aligned.sortedByCoord.out.bam",
 39 |   "S11-10N_Aligned.sortedByCoord.out.bam"
 40 | )
 41 | 
 42 | sample_info <- data.frame(
 43 |   sample    = sample_ids,
 44 |   condition = rep(c("abh1.N", "WT.N"), each = 3),
 45 |   replicate = rep(1:3, times = 2),
 46 |   # paths stay named by BAM file, as the former sapply() call produced;
 47 |   # data.frame() can use such names as row names, which write.table() emits
 48 |   bam       = setNames(file.path(working_directory, bam_files), bam_files),
 49 |   coverage  = file.path(working_directory, paste0(sample_ids, ".rds"))
 50 | )
 51 | 
 52 | #- save sample information table ----------------------------------------------#
 53 | write.table(sample_info, file.path(working_directory, "sample_info.txt"))
 54 | 
 55 | #- display sample information table -------------------------------------------#
 56 | knitr::kable(sample_info, row.names = FALSE)
 57 | 
 58 | #- chromosome table (X1 = name, X2 = end coordinate); Mt and Pt are skipped ----#
 59 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE)
 60 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt")
 61 | 
 62 | ### run the comparison chromosome by chromosome
 63 | seqids <- unique(genome$X1)
 64 | # one pre-allocated slot per chromosome instead of growing a data frame
 65 | ders_by_chrom <- vector("list", length(seqids))
 66 | 
 67 | for (k in seq_along(seqids)) {
 68 |   seqid <- seqids[[k]]
 69 |   # not called `stop`, which would shadow the name of base::stop()
 70 |   chrom_end <- genome$X2[genome$X1 == seqid]
 71 | 
 72 |   ## load data
 73 |   data <- loadData(
 74 |     sampleInfo   = file.path(working_directory, "sample_info.txt"),
 75 |     locus        = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
 76 |     referenceCondition = "WT.N",
 77 |     isPairedEnd = TRUE,
 78 |     readLength = 150,
 79 |     coverageType = "fivePrime",
 80 |     stranded = TRUE,
 81 |     strandSpecific = 1,
 82 |     fromBam    = TRUE,
 83 |     nbThreads  = nb_threads,
 84 |     verbose = TRUE
 85 |   )
 86 | 
 87 |   ## Changepoint detection to define segments
 88 |   SExp <- segmentation(
 89 |     data = data,
 90 |     weightType = "unweighted", #zeroInflated : low counts have less weight
 91 |     modelSelectionType = "yao",
 92 |     featureCountsType = "fromBam",
 93 |     compressed = TRUE,
 94 |     alpha = 2,
 95 |     segmentNeighborhood = FALSE,
 96 |     Kmax = NULL,
 97 |     verbose = FALSE,
 98 |     nbThreadsGridSearch = 1,
 99 |     alphas = NULL,
100 |     gridSearch = FALSE,
101 |     outputDirectory = working_directory,
102 |     nbThreadsFeatureCounts = nb_threads,
103 |     strandSpecific = 1,
104 |     read2pos = 5,
105 |     isPairedEnd = TRUE
106 |   )
107 | 
108 |   #- subset to segments with width < 11 nt
109 |   SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11, ]
110 | 
111 |   ## differential expression analysis on the short segments
112 |   dds <- dea(
113 |     data      = data,
114 |     SExp      = SExp_10,
115 |     design    = ~condition,
116 |     predicate = NULL,
117 |     significanceLevel = 0.01,
118 |     orderBy = "pvalue",
119 |     verbose = TRUE
120 |   )
121 | 
122 |   #- extract DERs based on significance
123 |   DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
124 |   ders_by_chrom[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
125 | }
126 | 
127 | #clear memory cache
128 | gc()
129 | 
130 | out_DERs <- do.call(rbind, ders_by_chrom)
131 | 
132 | if (is.null(out_DERs) || nrow(out_DERs) == 0) {
133 |   # nothing was rejected on any chromosome: leave an empty output file
134 |   # instead of failing inside mutate()
135 |   message("No DERs detected; writing empty file ", output_file)
136 |   invisible(file.create(output_file))
137 | } else {
138 |   # the "_"-separated fields of featureId are recombined as f1:f2-f3.f4
139 |   out_DERs <- mutate(out_DERs, derId = vapply(
140 |     strsplit(featureId, "_"),
141 |     function(l) paste0(l[1], ":", l[2], "-", l[3], ".", l[4]),
142 |     character(1)
143 |   ))
144 |   out <- select(out_DERs, seqnames, start, end, derId, baseMean, strand, baseVar, log2FoldChange, padj)
145 |   out <- subset(out, baseMean > 10)
146 | 
147 |   # NOTE(review): start/end are written as reported by rowRanges(); confirm
148 |   # whether downstream BED consumers expect 0-based starts.
149 |   # NOTE(review): the unstranded 5p script of this workflow writes the same
150 |   # file name, so running both in one directory overwrites one result.
151 |   write_tsv(out, output_file, col_names = FALSE)
152 | }


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v1/diffsegR_dxo1-NvsC_gmuct_3p_v1.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## DiffSegR (v1 API): dxo1.N vs dxo1.C (reference), 3'-end coverage, unstranded.
  4 | ## For each nuclear chromosome: load coverage from the BAMs, segment it,
  5 | ## keep segments narrower than 11 nt, run dea(), and collect the rejected
  6 | ## hypotheses. The pooled table (baseMean > 10) is written as headerless TSV.
  7 | 
  8 | ## Installation
  9 | #### Conda
 10 | #conda create --name diffseg_v1
 11 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse
 12 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
 13 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
 14 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
 15 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
 16 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
 17 | #conda install -n diffseg_v1 -c conda-forge r-remotes
 18 | #conda activate diffseg_v1
 19 | 
 20 | #### R
 21 | #remotes::install_github("sanssouci-org/sanssouci")
 22 | #remotes::install_github("aLiehrmann/DiffSegR@f657435")
 23 | 
 24 | library(tidyverse)
 25 | library(DiffSegR)
 26 | 
 27 | nb_threads <- 10
 28 | working_directory <- getwd()
 29 | output_file <- "dxo1-N_DERs_3p.bed"
 30 | 
 31 | #- create sample information table --------------------------------------------#
 32 | sample_ids <- c("dxo1.N_1", "dxo1.N_2", "dxo1.N_3", "dxo1.C_1", "dxo1.C_2", "dxo1.C_3")
 33 | bam_files <- c(
 34 |   "S2-2N_Aligned.sortedByCoord.out.bam",
 35 |   "S17-6N_Aligned.sortedByCoord.out.bam",
 36 |   "S13-19N_Aligned.sortedByCoord.out.bam",
 37 |   "S23-2C_Aligned.sortedByCoord.out.bam",
 38 |   "S18-6C_Aligned.sortedByCoord.out.bam",
 39 |   "S14-19C_Aligned.sortedByCoord.out.bam"
 40 | )
 41 | 
 42 | sample_info <- data.frame(
 43 |   sample    = sample_ids,
 44 |   condition = rep(c("dxo1.N", "dxo1.C"), each = 3),
 45 |   replicate = rep(1:3, times = 2),
 46 |   # paths stay named by BAM file, as the former sapply() call produced;
 47 |   # data.frame() can use such names as row names, which write.table() emits
 48 |   bam       = setNames(file.path(working_directory, bam_files), bam_files),
 49 |   coverage  = file.path(working_directory, paste0(sample_ids, ".rds"))
 50 | )
 51 | 
 52 | #- save sample information table ----------------------------------------------#
 53 | write.table(sample_info, file.path(working_directory, "sample_info.txt"))
 54 | 
 55 | #- display sample information table -------------------------------------------#
 56 | knitr::kable(sample_info, row.names = FALSE)
 57 | 
 58 | #- chromosome table (X1 = name, X2 = end coordinate); Mt and Pt are skipped ----#
 59 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE)
 60 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt")
 61 | 
 62 | ### run the comparison chromosome by chromosome
 63 | seqids <- unique(genome$X1)
 64 | # one pre-allocated slot per chromosome instead of growing a data frame
 65 | ders_by_chrom <- vector("list", length(seqids))
 66 | 
 67 | for (k in seq_along(seqids)) {
 68 |   seqid <- seqids[[k]]
 69 |   # not called `stop`, which would shadow the name of base::stop()
 70 |   chrom_end <- genome$X2[genome$X1 == seqid]
 71 | 
 72 |   ## load data
 73 |   data <- loadData(
 74 |     sampleInfo   = file.path(working_directory, "sample_info.txt"),
 75 |     locus        = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
 76 |     referenceCondition = "dxo1.C",
 77 |     isPairedEnd = TRUE,
 78 |     readLength = 150,
 79 |     coverageType = "threePrime",
 80 |     stranded = FALSE,
 81 |     strandSpecific = 0,
 82 |     fromBam    = TRUE,
 83 |     nbThreads  = nb_threads,
 84 |     verbose = TRUE
 85 |   )
 86 | 
 87 |   ## Changepoint detection to define segments
 88 |   SExp <- segmentation(
 89 |     data = data,
 90 |     weightType = "unweighted", #zeroInflated : low counts have less weight
 91 |     modelSelectionType = "yao",
 92 |     featureCountsType = "fromBam",
 93 |     compressed = TRUE,
 94 |     alpha = 2,
 95 |     segmentNeighborhood = FALSE,
 96 |     Kmax = NULL,
 97 |     verbose = FALSE,
 98 |     nbThreadsGridSearch = 1,
 99 |     alphas = NULL,
100 |     gridSearch = FALSE,
101 |     outputDirectory = working_directory,
102 |     nbThreadsFeatureCounts = nb_threads,
103 |     strandSpecific = 0,
104 |     read2pos = 3,
105 |     isPairedEnd = TRUE
106 |   )
107 | 
108 |   #- subset to segments with width < 11 nt
109 |   SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11, ]
110 | 
111 |   ## differential expression analysis on the short segments
112 |   dds <- dea(
113 |     data      = data,
114 |     SExp      = SExp_10,
115 |     design    = ~condition,
116 |     predicate = NULL,
117 |     significanceLevel = 0.01,
118 |     verbose = TRUE
119 |   )
120 | 
121 |   #- extract DERs based on significance
122 |   DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
123 |   ders_by_chrom[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
124 | }
125 | 
126 | #clear memory cache
127 | gc()
128 | 
129 | out_DERs <- do.call(rbind, ders_by_chrom)
130 | 
131 | if (is.null(out_DERs) || nrow(out_DERs) == 0) {
132 |   # nothing was rejected on any chromosome: leave an empty output file
133 |   # instead of failing inside mutate()
134 |   message("No DERs detected; writing empty file ", output_file)
135 |   invisible(file.create(output_file))
136 | } else {
137 |   # the "_"-separated fields of featureId are recombined as f1:f2-f3
138 |   out_DERs <- mutate(out_DERs, derId = vapply(
139 |     strsplit(featureId, "_"),
140 |     function(l) paste0(l[1], ":", l[2], "-", l[3]),
141 |     character(1)
142 |   ))
143 |   out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj)
144 |   out <- subset(out, baseMean > 10)
145 | 
146 |   # NOTE(review): start/end are written as reported by rowRanges(); confirm
147 |   # whether downstream BED consumers expect 0-based starts.
148 |   write_tsv(out, output_file, col_names = FALSE)
149 | }


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v1/diffsegR_dxo1-NvsC_gmuct_5p_v1.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## DiffSegR (v1 API): dxo1.N vs dxo1.C (reference), 5'-end coverage, unstranded.
  4 | ## For each nuclear chromosome: load coverage from the BAMs, segment it,
  5 | ## keep segments narrower than 11 nt, run dea(), and collect the rejected
  6 | ## hypotheses. The pooled table (baseMean > 10) is written as headerless TSV.
  7 | 
  8 | ## Installation
  9 | #### Conda
 10 | #conda create --name diffseg_v1
 11 | #conda install -n diffseg_v1 -c conda-forge r-tidyverse
 12 | #conda install -n diffseg_v1 -c bioconda bioconductor-deseq2
 13 | #conda install -n diffseg_v1 -c bioconda bioconductor-rsubread
 14 | #conda install -n diffseg_v1 -c bioconda bioconductor-rtracklayer
 15 | #conda install -n diffseg_v1 -c bioconda bioconductor-sparsematrixstats
 16 | #conda install -n diffseg_v1 -c bioconda bioconductor-delayedmatrixstats
 17 | #conda install -n diffseg_v1 -c conda-forge r-remotes
 18 | #conda activate diffseg_v1
 19 | 
 20 | #### R
 21 | #remotes::install_github("sanssouci-org/sanssouci")
 22 | #remotes::install_github("aLiehrmann/DiffSegR@f657435")
 23 | 
 24 | library(tidyverse)
 25 | library(DiffSegR)
 26 | 
 27 | nb_threads <- 10
 28 | working_directory <- getwd()
 29 | output_file <- "dxo1-N_DERs_5p.bed"
 30 | 
 31 | #- create sample information table --------------------------------------------#
 32 | sample_ids <- c("dxo1.N_1", "dxo1.N_2", "dxo1.N_3", "dxo1.C_1", "dxo1.C_2", "dxo1.C_3")
 33 | bam_files <- c(
 34 |   "S2-2N_Aligned.sortedByCoord.out.bam",
 35 |   "S17-6N_Aligned.sortedByCoord.out.bam",
 36 |   "S13-19N_Aligned.sortedByCoord.out.bam",
 37 |   "S23-2C_Aligned.sortedByCoord.out.bam",
 38 |   "S18-6C_Aligned.sortedByCoord.out.bam",
 39 |   "S14-19C_Aligned.sortedByCoord.out.bam"
 40 | )
 41 | 
 42 | sample_info <- data.frame(
 43 |   sample    = sample_ids,
 44 |   condition = rep(c("dxo1.N", "dxo1.C"), each = 3),
 45 |   replicate = rep(1:3, times = 2),
 46 |   # paths stay named by BAM file, as the former sapply() call produced;
 47 |   # data.frame() can use such names as row names, which write.table() emits
 48 |   bam       = setNames(file.path(working_directory, bam_files), bam_files),
 49 |   coverage  = file.path(working_directory, paste0(sample_ids, ".rds"))
 50 | )
 51 | 
 52 | #- save sample information table ----------------------------------------------#
 53 | write.table(sample_info, file.path(working_directory, "sample_info.txt"))
 54 | 
 55 | #- display sample information table -------------------------------------------#
 56 | knitr::kable(sample_info, row.names = FALSE)
 57 | 
 58 | #- chromosome table (X1 = name, X2 = end coordinate); Mt and Pt are skipped ----#
 59 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE)
 60 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt")
 61 | 
 62 | ### run the comparison chromosome by chromosome
 63 | seqids <- unique(genome$X1)
 64 | # one pre-allocated slot per chromosome instead of growing a data frame
 65 | ders_by_chrom <- vector("list", length(seqids))
 66 | 
 67 | for (k in seq_along(seqids)) {
 68 |   seqid <- seqids[[k]]
 69 |   # not called `stop`, which would shadow the name of base::stop()
 70 |   chrom_end <- genome$X2[genome$X1 == seqid]
 71 | 
 72 |   ## load data
 73 |   data <- loadData(
 74 |     sampleInfo   = file.path(working_directory, "sample_info.txt"),
 75 |     locus        = list(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
 76 |     referenceCondition = "dxo1.C",
 77 |     isPairedEnd = TRUE,
 78 |     readLength = 150,
 79 |     coverageType = "fivePrime",
 80 |     stranded = FALSE,
 81 |     strandSpecific = 0,
 82 |     fromBam    = TRUE,
 83 |     nbThreads  = nb_threads,
 84 |     verbose = TRUE
 85 |   )
 86 | 
 87 |   ## Changepoint detection to define segments
 88 |   SExp <- segmentation(
 89 |     data = data,
 90 |     weightType = "unweighted", #zeroInflated : low counts have less weight
 91 |     modelSelectionType = "yao",
 92 |     featureCountsType = "fromBam",
 93 |     compressed = TRUE,
 94 |     alpha = 2,
 95 |     segmentNeighborhood = FALSE,
 96 |     Kmax = NULL,
 97 |     verbose = FALSE,
 98 |     nbThreadsGridSearch = 1,
 99 |     alphas = NULL,
100 |     gridSearch = FALSE,
101 |     outputDirectory = working_directory,
102 |     nbThreadsFeatureCounts = nb_threads,
103 |     strandSpecific = 0,
104 |     read2pos = 5,
105 |     isPairedEnd = TRUE
106 |   )
107 | 
108 |   #- subset to segments with width < 11 nt
109 |   SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11, ]
110 | 
111 |   ## differential expression analysis on the short segments
112 |   dds <- dea(
113 |     data      = data,
114 |     SExp      = SExp_10,
115 |     design    = ~condition,
116 |     predicate = NULL,
117 |     significanceLevel = 0.01,
118 |     orderBy = "pvalue",
119 |     verbose = TRUE
120 |   )
121 | 
122 |   #- extract DERs based on significance
123 |   DERs <- dds[SummarizedExperiment::mcols(dds)$rejectedHypotheses, ]
124 |   ders_by_chrom[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
125 | }
126 | 
127 | #clear memory cache
128 | gc()
129 | 
130 | out_DERs <- do.call(rbind, ders_by_chrom)
131 | 
132 | if (is.null(out_DERs) || nrow(out_DERs) == 0) {
133 |   # nothing was rejected on any chromosome: leave an empty output file
134 |   # instead of failing inside mutate()
135 |   message("No DERs detected; writing empty file ", output_file)
136 |   invisible(file.create(output_file))
137 | } else {
138 |   # the "_"-separated fields of featureId are recombined as f1:f2-f3
139 |   out_DERs <- mutate(out_DERs, derId = vapply(
140 |     strsplit(featureId, "_"),
141 |     function(l) paste0(l[1], ":", l[2], "-", l[3]),
142 |     character(1)
143 |   ))
144 |   out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj)
145 |   out <- subset(out, baseMean > 10)
146 | 
147 |   # NOTE(review): start/end are written as reported by rowRanges(); confirm
148 |   # whether downstream BED consumers expect 0-based starts.
149 |   write_tsv(out, output_file, col_names = FALSE)
150 | }


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v2/diffsegR_WT-NvsC_gmuct_3p.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## DiffSegR (v2 API): WT.N vs WT.C (reference), 3'-end coverage.
  4 | ## For each nuclear chromosome: build the experiment, compute coverage from
  5 | ## the BAMs, segment the per-base log2-FC, count reads per segment, keep
  6 | ## segments narrower than 11 nt, run dea(), and collect the rows flagged DER.
  7 | ## The pooled table (baseMean > 10) is written as headerless TSV.
  8 | 
  9 | ## Installation
 10 | #### Conda
 11 | #conda create --name diffseg
 12 | #conda install -n diffseg -c conda-forge r-tidyverse
 13 | #conda install -n diffseg -c bioconda bioconductor-deseq2
 14 | #conda install -n diffseg -c bioconda bioconductor-rsubread
 15 | #conda install -n diffseg -c bioconda bioconductor-rtracklayer
 16 | #conda install -n diffseg -c bioconda bioconductor-sparsematrixstats
 17 | #conda install -n diffseg -c bioconda bioconductor-delayedmatrixstats
 18 | #conda install -n diffseg -c conda-forge r-remotes
 19 | #conda activate diffseg
 20 | 
 21 | #### R
 22 | #remotes::install_github("sanssouci-org/sanssouci")
 23 | #remotes::install_github("aLiehrmann/DiffSegR")
 24 | 
 25 | library(tidyverse)
 26 | library(DiffSegR)
 27 | 
 28 | ## multi-threading options
 29 | nb_threads <- 10
 30 | nb_threads_locus <- 10
 31 | 
 32 | working_directory <- getwd()
 33 | output_file <- "WT-N_DERs_3p.bed"
 34 | 
 35 | #- create sample information table --------------------------------------------#
 36 | bam_files <- c(
 37 |   "S5-3N_Aligned.sortedByCoord.out.bam",
 38 |   "S7-4N_Aligned.sortedByCoord.out.bam",
 39 |   "S11-10N_Aligned.sortedByCoord.out.bam",
 40 |   "S6-3C_Aligned.sortedByCoord.out.bam",
 41 |   "S8-4C_Aligned.sortedByCoord.out.bam",
 42 |   "S12-10C_Aligned.sortedByCoord.out.bam"
 43 | )
 44 | 
 45 | sample_info <- data.frame(
 46 |   sample    = c("WT.N_1", "WT.N_2", "WT.N_3", "WT.C_1", "WT.C_2", "WT.C_3"),
 47 |   condition = rep(c("WT.N", "WT.C"), each = 3),
 48 |   replicate = rep(1:3, 2),
 49 |   # paths stay named by BAM file, as the former sapply() call produced;
 50 |   # data.frame() can use such names as the row names of the table
 51 |   bam       = setNames(file.path(working_directory, bam_files), bam_files),
 52 |   isPairedEnd = rep(TRUE, 6),
 53 |   strandSpecific = rep(0, 6)
 54 | )
 55 | 
 56 | #- display sample information table -------------------------------------------#
 57 | knitr::kable(sample_info, row.names = FALSE)
 58 | 
 59 | ## genome file (X1 = name, X2 = end coordinate); Mt and Pt are skipped
 60 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE)
 61 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt")
 62 | 
 63 | ### run the comparison chromosome by chromosome
 64 | seqids <- unique(genome$X1)
 65 | # one pre-allocated slot per chromosome instead of growing a data frame
 66 | ders_by_chrom <- vector("list", length(seqids))
 67 | 
 68 | for (k in seq_along(seqids)) {
 69 |   seqid <- seqids[[k]]
 70 |   # not called `stop`, which would shadow the name of base::stop()
 71 |   chrom_end <- genome$X2[genome$X1 == seqid]
 72 | 
 73 |   ## import data on experiment
 74 |   data <- newExperiment(
 75 |     sampleInfo = sample_info,
 76 |     loci       = data.frame(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
 77 |     referenceCondition = "WT.C",
 78 |     otherCondition = "WT.N",
 79 |     nbThreads  = nb_threads,
 80 |     nbThreadsByLocus = nb_threads_locus,
 81 |     coverage = working_directory
 82 |   )
 83 | 
 84 |   print(data)
 85 | 
 86 |   ## generate coverage profile from BAM
 87 |   coverage(data = data, coverageType = "threePrime", verbose = TRUE)
 88 | 
 89 |   ## transform coverage profile into per-base log2-FC and perform
 90 |   ## changepoint detection to define segments
 91 |   features <- segmentationLFC(
 92 |     data  = data,
 93 |     alpha = 2,
 94 |     modelSelectionType = "yao",
 95 |     verbose = TRUE
 96 |   )
 97 | 
 98 |   ## Quantify expression of segments
 99 |   SExp <- counting(
100 |     data = data,
101 |     features = features,
102 |     featureCountsType = "fromBam",
103 |     featureCountsOtherParams = list(read2pos = 3),
104 |     verbose = TRUE
105 |   )
106 | 
107 |   #- subset to segments with width < 11 nt
108 |   SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11, ]
109 | 
110 |   # differential expression analysis
111 |   dds <- dea(
112 |     SExp        = SExp_10,
113 |     design      = ~condition,
114 |     significanceLevel = 0.01,
115 |     verbose = TRUE
116 |   )
117 | 
118 |   #- extract DERs based on significance
119 |   DERs <- dds[SummarizedExperiment::mcols(dds)$DER, ]
120 |   ders_by_chrom[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
121 | }
122 | 
123 | ## clear memory cache
124 | gc()
125 | 
126 | out_DERs <- do.call(rbind, ders_by_chrom)
127 | 
128 | if (is.null(out_DERs) || nrow(out_DERs) == 0) {
129 |   # no DER on any chromosome: leave an empty output file instead of
130 |   # failing inside mutate()
131 |   message("No DERs detected; writing empty file ", output_file)
132 |   invisible(file.create(output_file))
133 | } else {
134 |   # the "_"-separated fields of featureId are recombined as f1:f2-f3
135 |   out_DERs <- mutate(out_DERs, derId = vapply(
136 |     strsplit(featureId, "_"),
137 |     function(l) paste0(l[1], ":", l[2], "-", l[3]),
138 |     character(1)
139 |   ))
140 |   out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj)
141 |   out <- subset(out, baseMean > 10)
142 | 
143 |   # NOTE(review): start/end are written as reported by rowRanges(); confirm
144 |   # whether downstream BED consumers expect 0-based starts.
145 |   write_tsv(out, output_file, col_names = FALSE)
146 | }


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v2/diffsegR_WT-NvsC_gmuct_5p.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## DiffSegR (v2 API): WT.N vs WT.C (reference), 5'-end coverage.
  4 | ## For each nuclear chromosome: build the experiment, compute coverage from
  5 | ## the BAMs, segment the per-base log2-FC, count reads per segment, keep
  6 | ## segments narrower than 11 nt, run dea(), and collect the rows flagged DER.
  7 | ## The pooled table (baseMean > 10) is written as headerless TSV.
  8 | 
  9 | ## Installation
 10 | #### Conda
 11 | #conda create --name diffseg
 12 | #conda install -n diffseg -c conda-forge r-tidyverse
 13 | #conda install -n diffseg -c bioconda bioconductor-deseq2
 14 | #conda install -n diffseg -c bioconda bioconductor-rsubread
 15 | #conda install -n diffseg -c bioconda bioconductor-rtracklayer
 16 | #conda install -n diffseg -c bioconda bioconductor-sparsematrixstats
 17 | #conda install -n diffseg -c bioconda bioconductor-delayedmatrixstats
 18 | #conda install -n diffseg -c conda-forge r-remotes
 19 | #conda activate diffseg
 20 | #### R
 21 | #remotes::install_github("sanssouci-org/sanssouci")
 22 | #remotes::install_github("aLiehrmann/DiffSegR")
 23 | 
 24 | library(tidyverse)
 25 | library(DiffSegR)
 26 | 
 27 | ## multi-threading options
 28 | nb_threads <- 10
 29 | nb_threads_locus <- 10
 30 | 
 31 | working_directory <- getwd()
 32 | output_file <- "WT-N_DERs_5p.bed"
 33 | 
 34 | #- create sample information table --------------------------------------------#
 35 | bam_files <- c(
 36 |   "S5-3N_Aligned.sortedByCoord.out.bam",
 37 |   "S7-4N_Aligned.sortedByCoord.out.bam",
 38 |   "S11-10N_Aligned.sortedByCoord.out.bam",
 39 |   "S6-3C_Aligned.sortedByCoord.out.bam",
 40 |   "S8-4C_Aligned.sortedByCoord.out.bam",
 41 |   "S12-10C_Aligned.sortedByCoord.out.bam"
 42 | )
 43 | 
 44 | sample_info <- data.frame(
 45 |   sample    = c("WT.N_1", "WT.N_2", "WT.N_3", "WT.C_1", "WT.C_2", "WT.C_3"),
 46 |   condition = rep(c("WT.N", "WT.C"), each = 3),
 47 |   replicate = rep(1:3, 2),
 48 |   # paths stay named by BAM file, as the former sapply() call produced;
 49 |   # data.frame() can use such names as the row names of the table
 50 |   bam       = setNames(file.path(working_directory, bam_files), bam_files),
 51 |   isPairedEnd = rep(TRUE, 6),
 52 |   strandSpecific = rep(0, 6)
 53 | )
 54 | 
 55 | #- display sample information table -------------------------------------------#
 56 | knitr::kable(sample_info, row.names = FALSE)
 57 | 
 58 | ## genome file (X1 = name, X2 = end coordinate); Mt and Pt are skipped
 59 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE)
 60 | genome <- subset(genome, X1 != "Mt" & X1 != "Pt")
 61 | 
 62 | ### run the comparison chromosome by chromosome
 63 | seqids <- unique(genome$X1)
 64 | # one pre-allocated slot per chromosome instead of growing a data frame
 65 | ders_by_chrom <- vector("list", length(seqids))
 66 | 
 67 | for (k in seq_along(seqids)) {
 68 |   seqid <- seqids[[k]]
 69 |   # not called `stop`, which would shadow the name of base::stop()
 70 |   chrom_end <- genome$X2[genome$X1 == seqid]
 71 | 
 72 |   ## import data on experiment
 73 |   data <- newExperiment(
 74 |     sampleInfo = sample_info,
 75 |     loci       = data.frame(seqid = seqid, chromStart = 1, chromEnd = chrom_end),
 76 |     referenceCondition = "WT.C",
 77 |     otherCondition = "WT.N",
 78 |     nbThreads  = nb_threads,
 79 |     nbThreadsByLocus = nb_threads_locus,
 80 |     coverage = working_directory
 81 |   )
 82 | 
 83 |   print(data)
 84 | 
 85 |   ## generate coverage profile from BAM
 86 |   coverage(data = data, coverageType = "fivePrime", verbose = TRUE)
 87 | 
 88 |   ## transform coverage profile into per-base log2-FC and perform
 89 |   ## changepoint detection to define segments
 90 |   features <- segmentationLFC(
 91 |     data  = data,
 92 |     alpha = 2,
 93 |     modelSelectionType = "yao",
 94 |     verbose = TRUE
 95 |   )
 96 | 
 97 |   ## Quantify expression of segments
 98 |   SExp <- counting(
 99 |     data = data,
100 |     features = features,
101 |     featureCountsType = "fromBam",
102 |     featureCountsOtherParams = list(read2pos = 5),
103 |     verbose = TRUE
104 |   )
105 | 
106 |   #- subset to segments with width < 11 nt
107 |   SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11, ]
108 | 
109 |   # differential expression analysis
110 |   dds <- dea(
111 |     SExp        = SExp_10,
112 |     design      = ~condition,
113 |     significanceLevel = 0.01,
114 |     verbose = TRUE,
115 |     predicate = NULL
116 |   )
117 | 
118 |   #- extract DERs based on significance
119 |   DERs <- dds[SummarizedExperiment::mcols(dds)$DER, ]
120 |   ders_by_chrom[[k]] <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
121 | }
122 | 
123 | ## clear memory cache
124 | gc()
125 | 
126 | out_DERs <- do.call(rbind, ders_by_chrom)
127 | 
128 | if (is.null(out_DERs) || nrow(out_DERs) == 0) {
129 |   # no DER on any chromosome: leave an empty output file instead of
130 |   # failing inside mutate()
131 |   message("No DERs detected; writing empty file ", output_file)
132 |   invisible(file.create(output_file))
133 | } else {
134 |   # the "_"-separated fields of featureId are recombined as f1:f2-f3
135 |   out_DERs <- mutate(out_DERs, derId = vapply(
136 |     strsplit(featureId, "_"),
137 |     function(l) paste0(l[1], ":", l[2], "-", l[3]),
138 |     character(1)
139 |   ))
140 |   out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, log2FoldChange, padj)
141 |   out <- subset(out, baseMean > 10)
142 | 
143 |   # NOTE(review): start/end are written as reported by rowRanges(); confirm
144 |   # whether downstream BED consumers expect 0-based starts.
145 |   write_tsv(out, output_file, col_names = FALSE)
146 | }


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v2/diffsegR_WT-Nvsbulk_gmuct_5p.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## Installation
  4 | #### Conda
  5 | #conda create --name diffseg
  6 | #conda install -n diffseg -c conda-forge r-tidyverse
  7 | #conda install -n diffseg -c bioconda bioconductor-deseq2
  8 | #conda install -n diffseg -c bioconda bioconductor-rsubread
  9 | #conda install -n diffseg -c bioconda bioconductor-rtracklayer
 10 | #conda install -n diffseg -c bioconda bioconductor-sparsematrixstats
 11 | #conda install -n diffseg -c bioconda bioconductor-delayedmatrixstats
 12 | #conda activate diffseg
 13 | #### R
 14 | #remotes::install_github("sanssouci-org/sanssouci")
 15 | #remotes::install_github("aLiehrmann/DiffSegR")
 16 | 
 17 | library(tidyverse)
 18 | library(DiffSegR)
 19 | 
 20 | ## thread counts handed to newExperiment() inside the chromosome loop
 21 | nb_threads <- 10
 22 | nb_threads_locus <- 10
 23 | 
 24 | working_directory <- getwd()
 25 | 
 26 | #- sample sheet: 3 WT.N, 2 BDG and 2 HMC libraries; BAMs resolved against cwd --#
 27 | bam_files <- c(
 28 | 	"S5-3N_Aligned.sortedByCoord.out.bam",
 29 | 	"S7-4N_Aligned.sortedByCoord.out.bam",
 30 | 	"S11-10N_Aligned.sortedByCoord.out.bam",
 31 | 	"WT-BDG_rep1_Aligned.sortedByCoord.out.bam",
 32 | 	"WT-BDG_rep2_Aligned.sortedByCoord.out.bam",
 33 | 	"WT-HMC_rep1_Aligned.sortedByCoord.out.bam",
 34 | 	"WT-HMC_rep2_Aligned.sortedByCoord.out.bam"
 35 | )
 36 | sample_info <- data.frame(
 37 | 	sample    = c("WT.N_1", "WT.N_2", "WT.N_3", "BDG_1", "BDG_2", "HMC_1", "HMC_2"),
 38 | 	condition = rep(c("WT.N", "BDG", "HMC"), times = c(3, 2, 2)),
 39 | 	replicate = c(1:3, 1:2, 1:2),
 40 | 	# full paths, named by file name (the same names sapply() over them would give)
 41 | 	bam       = setNames(file.path(working_directory, bam_files), bam_files),
 42 | 	isPairedEnd = rep(c(TRUE, FALSE), times = c(3, 4)),
 43 | 	strandSpecific = rep(0, 7)
 44 | )
 45 | #- print the sample sheet -----------------------------------------------------#
 46 | knitr::kable(sample_info, row.names = FALSE)
 47 | ## sequence names (X1) and lengths (X2); the Mt and Pt entries are dropped
 48 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE)
 49 | genome <- subset(genome, !(X1 == "Mt" | X1 == "Pt"))
 50 | 
 51 | ### Per-chromosome DiffSegR run (WT.N vs BDG); significant segments are pooled
 52 | out_DERs <- NULL
 53 | 
 54 | for (i in unique(genome$X1)) {
 55 | 
 56 | 	# chromosome length; not called `stop` so base::stop() is not masked
 57 | 	chrom_end <- genome$X2[genome$X1 == i]
 58 | 
 59 | 	## describe the experiment for the whole of chromosome i
 60 | 	data <- newExperiment(
 61 | 		sampleInfo = sample_info,
 62 | 		loci       = data.frame(seqid = i, chromStart = 1, chromEnd = chrom_end),
 63 | 		referenceCondition = "BDG",
 64 | 		otherCondition = "WT.N",
 65 | 		nbThreads  = nb_threads,
 66 | 		nbThreadsByLocus = nb_threads_locus,
 67 | 		coverage = working_directory
 68 | 	)
 69 | 
 70 | 	print(data)
 71 | 
 72 | 	# NOTE(review): data2 (WT.N vs HMC) is built and printed but never analysed below
 73 | 	data2 <- newExperiment(
 74 | 		sampleInfo = sample_info,
 75 | 		loci       = data.frame(seqid = i, chromStart = 1, chromEnd = chrom_end),
 76 | 		referenceCondition = "HMC",
 77 | 		otherCondition = "WT.N",
 78 | 		nbThreads  = nb_threads,
 79 | 		nbThreadsByLocus = nb_threads_locus,
 80 | 		coverage = working_directory
 81 | 	)
 82 | 	print(data2)
 83 | 
 84 | 	## generate 5' coverage profile from BAM
 85 | 	coverage(data = data, coverageType = "fivePrime", verbose = TRUE)
 86 | 
 87 | 	## transform coverage profile into per-base log2-FC and perform changepoint detection to define segments
 88 | 	features <- segmentationLFC(
 89 | 		data  = data,
 90 | 		alpha = 2,
 91 | 		modelSelectionType = "yao",
 92 | 		verbose = TRUE
 93 | 	)
 94 | 
 95 | 	## quantify expression of segments
 96 | 	SExp <- counting(
 97 | 		data = data,
 98 | 		features = features,
 99 | 		featureCountsType = "fromBam",
100 | 		featureCountsOtherParams = list(read2pos = 5),
101 | 		verbose = TRUE
102 | 	)
103 | 
104 | 	#- subset to segments with width < 11 nt ----------------------------------#
105 | 	SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,]
106 | 
107 | 	# differential expression analysis on the short segments only
108 | 	dds <- dea(
109 | 		SExp        = SExp_10,
110 | 		design      = ~condition,
111 | 		significanceLevel = 0.01,
112 | 		verbose = TRUE
113 | 	)
114 | 
115 | 	#- extract DERs based on significance; which() also drops any NA flag ------#
116 | 	DERs <- dds[which(SummarizedExperiment::mcols(dds)$DER),]
117 | 	DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
118 | 	DERs <- subset(DERs, baseMean > 10)
119 | 
120 | 	# append this chromosome's DERs to the pooled table
121 | 	out_DERs <- rbind(out_DERs, DERs)
122 | }
123 | 
124 | ## trigger garbage collection once the per-chromosome loop is done
125 | gc()
126 | 
127 | # derId: first three "_"-separated fields of featureId joined as field1:field2-field3;
128 | out_DERs <- mutate(out_DERs, derId = vapply(strsplit(featureId, "_"), function(l) paste0(l[1], ":", l[2], "-", l[3]), character(1)))  # vapply keeps derId character even with zero DERs
129 | out <- select(out_DERs, seqnames, start, end, derId, log2FoldChange, padj, baseMean)
130 | write_tsv(out, "WT-NvsBDG_5p.bed", col_names = FALSE)
131 | 
132 | 
133 | 


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v2/diffsegR_abh1-NvsC_gmuct_5p.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## Installation
  4 | #### Conda
  5 | #conda create --name diffseg
  6 | #conda install -n diffseg -c conda-forge r-tidyverse
  7 | #conda install -n diffseg -c bioconda bioconductor-deseq2
  8 | #conda install -n diffseg -c bioconda bioconductor-rsubread
  9 | #conda install -n diffseg -c bioconda bioconductor-rtracklayer
 10 | #conda install -n diffseg -c bioconda bioconductor-sparsematrixstats
 11 | #conda install -n diffseg -c bioconda bioconductor-delayedmatrixstats
 12 | #conda install -n diffseg -c conda-forge r-remotes
 13 | #conda activate diffseg
 14 | #### R
 15 | #remotes::install_github("sanssouci-org/sanssouci")
 16 | #remotes::install_github("aLiehrmann/DiffSegR")
 17 | 
 18 | library(tidyverse)
 19 | library(DiffSegR)
 20 | 
 21 | ## thread counts handed to newExperiment() inside the chromosome loop
 22 | nb_threads <- 10
 23 | nb_threads_locus <- 10
 24 | 
 25 | working_directory <- getwd()
 26 | 
 27 | #- sample sheet: three abh1.N and three abh1.C libraries; BAMs resolved in cwd -#
 28 | bam_files <- c(
 29 | 	"S15-5N_Aligned.sortedByCoord.out.bam",
 30 | 	"S9-20N_Aligned.sortedByCoord.out.bam",
 31 | 	"S24-34N_Aligned.sortedByCoord.out.bam",
 32 | 	"S16-5C_Aligned.sortedByCoord.out.bam",
 33 | 	"S10-20C_Aligned.sortedByCoord.out.bam",
 34 | 	"S25-34C_Aligned.sortedByCoord.out.bam"
 35 | )
 36 | sample_info <- data.frame(
 37 | 	sample    = c("abh1.N_1", "abh1.N_2", "abh1.N_3", "abh1.C_1", "abh1.C_2", "abh1.C_3"),
 38 | 	condition = c(rep("abh1.N", 3), rep("abh1.C", 3)),
 39 | 	replicate = c(1:3, 1:3),
 40 | 	# full paths, named by file name (the same names sapply() over them would give)
 41 | 	bam       = setNames(file.path(working_directory, bam_files), bam_files),
 42 | 	isPairedEnd = rep(TRUE, 6),
 43 | 	strandSpecific = rep(0, 6)
 44 | )
 45 | #- print the sample sheet -----------------------------------------------------#
 46 | knitr::kable(sample_info, row.names = FALSE)
 47 | ## sequence names (X1) and lengths (X2); the Mt and Pt entries are dropped
 48 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE)
 49 | genome <- subset(genome, !(X1 == "Mt" | X1 == "Pt"))
 50 | 
 51 | ### Per-chromosome DiffSegR run (abh1.N vs abh1.C); significant segments are pooled
 52 | out_DERs <- NULL
 53 | 
 54 | for (i in unique(genome$X1)) {
 55 | 
 56 | 	# chromosome length; not called `stop` so base::stop() is not masked
 57 | 	chrom_end <- genome$X2[genome$X1 == i]
 58 | 
 59 | 	## describe the experiment for the whole of chromosome i
 60 | 	data <- newExperiment(
 61 | 		sampleInfo = sample_info,
 62 | 		loci       = data.frame(seqid = i, chromStart = 1, chromEnd = chrom_end),
 63 | 		referenceCondition = "abh1.C",
 64 | 		otherCondition = "abh1.N",
 65 | 		nbThreads  = nb_threads,
 66 | 		nbThreadsByLocus = nb_threads_locus,
 67 | 		coverage = working_directory
 68 | 	)
 69 | 
 70 | 	print(data)
 71 | 
 72 | 	## generate 5' coverage profile from BAM
 73 | 	coverage(data = data, coverageType = "fivePrime", verbose = TRUE)
 74 | 
 75 | 	## transform coverage profile into per-base log2-FC and perform changepoint detection to define segments
 76 | 	features <- segmentationLFC(
 77 | 		data  = data,
 78 | 		alpha = 2,
 79 | 		modelSelectionType = "yao",
 80 | 		verbose = TRUE
 81 | 	)
 82 | 
 83 | 	## quantify expression of segments
 84 | 	SExp <- counting(
 85 | 		data = data,
 86 | 		features = features,
 87 | 		featureCountsType = "fromBam",
 88 | 		featureCountsOtherParams = list(read2pos = 5),
 89 | 		verbose = TRUE
 90 | 	)
 91 | 
 92 | 	#- subset to segments with width < 11 nt ----------------------------------#
 93 | 	SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,]
 94 | 
 95 | 	# differential expression analysis on the short segments only
 96 | 	dds <- dea(
 97 | 		SExp        = SExp_10,
 98 | 		design      = ~condition,
 99 | 		significanceLevel = 0.01,
100 | 		verbose = TRUE,
101 | 		predicate = NULL
102 | 	)
103 | 
104 | 	#- extract DERs based on significance; which() also drops any NA flag ------#
105 | 	DERs <- dds[which(SummarizedExperiment::mcols(dds)$DER),]
106 | 	DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
107 | 
108 | 	# append this chromosome's DERs to the pooled table
109 | 	out_DERs <- rbind(out_DERs, DERs)
110 | }
111 | 
112 | ## trigger garbage collection once the per-chromosome loop is done
113 | gc()
114 | 
115 | # derId: first three "_"-separated fields of featureId joined as field1:field2-field3;
116 | out_DERs <- mutate(out_DERs, derId = vapply(strsplit(featureId, "_"), function(l) paste0(l[1], ":", l[2], "-", l[3]), character(1)))  # vapply keeps derId character even with zero DERs
117 | out <- select(out_DERs, seqnames, start, end, derId, baseMean, baseVar, maxCooks, log2FoldChange, padj)
118 | out$cov <- sqrt(out$baseVar) / out$baseMean  # sqrt(baseVar) relative to baseMean
119 | out <- subset(out, baseMean > 10 & cov < 1)
120 | write_tsv(out, "abh1-N_DERs_5p.bed", col_names = FALSE)
121 | 
122 | 
123 | 


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v2/diffsegR_abh1-NvsWT-N_gmuct_3p.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## Installation
  4 | #### Conda
  5 | #conda create --name diffseg
  6 | #conda install -n diffseg -c conda-forge r-tidyverse
  7 | #conda install -n diffseg -c bioconda bioconductor-deseq2
  8 | #conda install -n diffseg -c bioconda bioconductor-rsubread
  9 | #conda install -n diffseg -c bioconda bioconductor-rtracklayer
 10 | #conda install -n diffseg -c bioconda bioconductor-sparsematrixstats
 11 | #conda install -n diffseg -c bioconda bioconductor-delayedmatrixstats
 12 | #conda install -n diffseg -c conda-forge r-remotes
 13 | #conda activate diffseg
 14 | #### R
 15 | #remotes::install_github("sanssouci-org/sanssouci")
 16 | #remotes::install_github("aLiehrmann/DiffSegR")
 17 | 
 18 | library(tidyverse)
 19 | library(DiffSegR)
 20 | 
 21 | ## thread counts handed to newExperiment() inside the chromosome loop
 22 | nb_threads <- 10
 23 | nb_threads_locus <- 10
 24 | 
 25 | working_directory <- getwd()
 26 | 
 27 | #- sample sheet: three abh1.N and three WT.N libraries; BAMs resolved in cwd ---#
 28 | bam_files <- c(
 29 | 	"S15-5N_Aligned.sortedByCoord.out.bam",
 30 | 	"S9-20N_Aligned.sortedByCoord.out.bam",
 31 | 	"S24-34N_Aligned.sortedByCoord.out.bam",
 32 | 	"S5-3N_Aligned.sortedByCoord.out.bam",
 33 | 	"S7-4N_Aligned.sortedByCoord.out.bam",
 34 | 	"S11-10N_Aligned.sortedByCoord.out.bam"
 35 | )
 36 | sample_info <- data.frame(
 37 | 	sample    = c("abh1.N_1", "abh1.N_2", "abh1.N_3", "WT.N_1", "WT.N_2", "WT.N_3"),
 38 | 	condition = rep(c("abh1.N", "WT.N"), each = 3),
 39 | 	replicate = rep(1:3, 2),
 40 | 	# full paths, named by file name (the same names sapply() over them would give)
 41 | 	bam       = setNames(file.path(working_directory, bam_files), bam_files),
 42 | 	isPairedEnd = rep(TRUE, 6),
 43 | 	strandSpecific = rep(0, 6)
 44 | )
 45 | #- print the sample sheet; then sequence names (X1) and lengths (X2), no Mt/Pt -#
 46 | knitr::kable(sample_info, row.names = FALSE)
 47 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE)
 48 | genome <- subset(genome, !(X1 == "Mt" | X1 == "Pt"))
 49 | 
 50 | ### Per-chromosome DiffSegR run (abh1.N vs WT.N, 3' ends); significant segments are pooled
 51 | out_DERs <- NULL
 52 | out_segments <- NULL  # NOTE(review): never filled in this script
 53 | 
 54 | for (i in unique(genome$X1)) {
 55 | 
 56 | 	# chromosome length; not called `stop` so base::stop() is not masked
 57 | 	chrom_end <- genome$X2[genome$X1 == i]
 58 | 
 59 | 	## describe the experiment for the whole of chromosome i
 60 | 	data <- newExperiment(
 61 | 		sampleInfo = sample_info,
 62 | 		loci       = data.frame(seqid = i, chromStart = 1, chromEnd = chrom_end),
 63 | 		referenceCondition = "WT.N",
 64 | 		otherCondition = "abh1.N",
 65 | 		nbThreads  = nb_threads,
 66 | 		nbThreadsByLocus = nb_threads_locus,
 67 | 		coverage = working_directory
 68 | 	)
 69 | 
 70 | 	print(data)
 71 | 
 72 | 	## generate 3' coverage profile from BAM
 73 | 	coverage(data = data, coverageType = "threePrime", verbose = TRUE)
 74 | 
 75 | 	## transform coverage profile into per-base log2-FC and perform changepoint detection to define segments
 76 | 	features <- segmentationLFC(
 77 | 		data  = data,
 78 | 		alpha = 2,
 79 | 		modelSelectionType = "yao",
 80 | 		verbose = TRUE
 81 | 	)
 82 | 
 83 | 	## quantify expression of segments
 84 | 	SExp <- counting(
 85 | 		data = data,
 86 | 		features = features,
 87 | 		featureCountsType = "fromBam",
 88 | 		featureCountsOtherParams = list(read2pos = 3),
 89 | 		verbose = TRUE
 90 | 	)
 91 | 
 92 | 	#- subset to segments with width < 11 nt ----------------------------------#
 93 | 	SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,]
 94 | 
 95 | 	# differential expression analysis on the short segments only
 96 | 	dds <- dea(
 97 | 		SExp    = SExp_10,
 98 | 		design  = ~condition,
 99 | 		significanceLevel = 0.01,
100 | 		verbose = TRUE
101 | 	)
102 | 
103 | 	#- extract DERs based on significance; which() also drops any NA flag ------#
104 | 	DERs <- dds[which(SummarizedExperiment::mcols(dds)$DER),]
105 | 	DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
106 | 	DERs <- subset(DERs, baseMean > 10)
107 | 
108 | 	# append this chromosome's DERs to the pooled table
109 | 	out_DERs <- rbind(out_DERs, DERs)
110 | }
111 | 
112 | ## trigger garbage collection once the per-chromosome loop is done
113 | gc()
114 | # derId: first three "_"-separated fields of featureId joined as field1:field2-field3;
115 | out_DERs <- mutate(out_DERs, derId = vapply(strsplit(featureId, "_"), function(l) paste0(l[1], ":", l[2], "-", l[3]), character(1)))  # vapply keeps derId character even with zero DERs
116 | out <- select(out_DERs, seqnames, start, end, derId, log2FoldChange, padj, baseMean)
117 | write_tsv(out, "abh1-NvsWT-N_DERs_3p.bed", col_names = FALSE)
118 | 
119 | 


--------------------------------------------------------------------------------
/project_workflows/diffsegR_v2/diffsegR_abh1-NvsWT-N_gmuct_5p.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## Installation
  4 | #### Conda
  5 | #conda create --name diffseg
  6 | #conda install -n diffseg -c conda-forge r-tidyverse
  7 | #conda install -n diffseg -c bioconda bioconductor-deseq2
  8 | #conda install -n diffseg -c bioconda bioconductor-rsubread
  9 | #conda install -n diffseg -c bioconda bioconductor-rtracklayer
 10 | #conda install -n diffseg -c bioconda bioconductor-sparsematrixstats
 11 | #conda install -n diffseg -c bioconda bioconductor-delayedmatrixstats
 12 | #conda install -n diffseg -c conda-forge r-remotes
 13 | #conda install -n diffseg bioconda::r-scatterplot3d 
 14 | #conda activate diffseg
 15 | #### R
 16 | #remotes::install_github("sanssouci-org/sanssouci")
 17 | #remotes::install_github("aLiehrmann/DiffSegR")
 18 | 
 19 | library(tidyverse)
 20 | library(DiffSegR)
 21 | 
 22 | ## thread counts handed to newExperiment() inside the chromosome loop
 23 | nb_threads <- 10
 24 | nb_threads_locus <- 10
 25 | 
 26 | working_directory <- getwd()
 27 | 
 28 | #- sample sheet: three abh1.N and three WT.N libraries; BAMs resolved in cwd ---#
 29 | bam_files <- c(
 30 | 	"S15-5N_Aligned.sortedByCoord.out.bam",
 31 | 	"S9-20N_Aligned.sortedByCoord.out.bam",
 32 | 	"S24-34N_Aligned.sortedByCoord.out.bam",
 33 | 	"S5-3N_Aligned.sortedByCoord.out.bam",
 34 | 	"S7-4N_Aligned.sortedByCoord.out.bam",
 35 | 	"S11-10N_Aligned.sortedByCoord.out.bam"
 36 | )
 37 | sample_info <- data.frame(
 38 | 	sample    = c("abh1.N_1", "abh1.N_2", "abh1.N_3", "WT.N_1", "WT.N_2", "WT.N_3"),
 39 | 	condition = rep(c("abh1.N", "WT.N"), each = 3),
 40 | 	replicate = rep(1:3, 2),
 41 | 	# full paths, named by file name (the same names sapply() over them would give)
 42 | 	bam       = setNames(file.path(working_directory, bam_files), bam_files),
 43 | 	isPairedEnd = rep(TRUE, 6),
 44 | 	strandSpecific = rep(0, 6)
 45 | )
 46 | #- print the sample sheet; then sequence names (X1) and lengths (X2), no Mt/Pt -#
 47 | knitr::kable(sample_info, row.names = FALSE)
 48 | genome <- read_tsv("~/ref_seqs/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.len", col_names = FALSE)
 49 | genome <- subset(genome, !(X1 == "Mt" | X1 == "Pt"))
 50 | 
 51 | ### Per-chromosome DiffSegR run (abh1.N vs WT.N, 5' ends); significant segments are pooled
 52 | out_DERs <- NULL
 53 | out_segments <- NULL  # NOTE(review): never filled in this script
 54 | 
 55 | for (i in unique(genome$X1)) {
 56 | 
 57 | 	# chromosome length; not called `stop` so base::stop() is not masked
 58 | 	chrom_end <- genome$X2[genome$X1 == i]
 59 | 
 60 | 	## describe the experiment for the whole of chromosome i
 61 | 	data <- newExperiment(
 62 | 		sampleInfo = sample_info,
 63 | 		loci       = data.frame(seqid = i, chromStart = 1, chromEnd = chrom_end),
 64 | 		referenceCondition = "WT.N",
 65 | 		otherCondition = "abh1.N",
 66 | 		nbThreads  = nb_threads,
 67 | 		nbThreadsByLocus = nb_threads_locus,
 68 | 		coverage = working_directory
 69 | 	)
 70 | 
 71 | 	print(data)
 72 | 
 73 | 	## generate 5' coverage profile from BAM
 74 | 	coverage(data = data, coverageType = "fivePrime", verbose = TRUE)
 75 | 
 76 | 	## transform coverage profile into per-base log2-FC and perform changepoint detection to define segments
 77 | 	features <- segmentationLFC(
 78 | 		data  = data,
 79 | 		alpha = 2,
 80 | 		modelSelectionType = "yao",
 81 | 		verbose = TRUE
 82 | 	)
 83 | 
 84 | 	## quantify expression of segments
 85 | 	SExp <- counting(
 86 | 		data = data,
 87 | 		features = features,
 88 | 		featureCountsType = "fromBam",
 89 | 		featureCountsOtherParams = list(read2pos = 5),
 90 | 		verbose = TRUE
 91 | 	)
 92 | 
 93 | 	#- subset to segments with width < 11 nt ----------------------------------#
 94 | 	SExp_10 <- SExp[as.data.frame(SummarizedExperiment::rowRanges(SExp))$width < 11,]
 95 | 
 96 | 	# differential expression analysis on the short segments only
 97 | 	dds <- dea(
 98 | 		SExp        = SExp_10,
 99 | 		design      = ~condition,
100 | 		significanceLevel = 0.01,
101 | 		verbose = TRUE
102 | 	)
103 | 
104 | 	#- extract DERs based on significance; which() also drops any NA flag ------#
105 | 	DERs <- dds[which(SummarizedExperiment::mcols(dds)$DER),]
106 | 	DERs <- as.data.frame(SummarizedExperiment::rowRanges(DERs))
107 | 	DERs <- subset(DERs, baseMean > 10)
108 | 
109 | 	# append this chromosome's DERs to the pooled table
110 | 	out_DERs <- rbind(out_DERs, DERs)
111 | }
112 | 
113 | ## trigger garbage collection once the per-chromosome loop is done
114 | gc()
115 | # derId: first three "_"-separated fields of featureId joined as field1:field2-field3;
116 | out_DERs <- mutate(out_DERs, derId = vapply(strsplit(featureId, "_"), function(l) paste0(l[1], ":", l[2], "-", l[3]), character(1)))  # vapply keeps derId character even with zero DERs
117 | out <- select(out_DERs, seqnames, start, end, derId, log2FoldChange, padj, baseMean)
118 | write_tsv(out, "abh1-NvsWT-N_DERs_5p.bed", col_names = FALSE)
119 | 
120 | 


--------------------------------------------------------------------------------
/project_workflows/lowD_5mC_dendrograms_281019.R:
--------------------------------------------------------------------------------
 1 | # making dendrograms for Eichten lowD brachy epigenomics
 2 | # libraries
 3 | library(tidyverse)
 4 | suppressPackageStartupMessages(library(dendextend))
 5 | library(pheatmap)
 6 | 
 7 | # sample metadata, reduced to the columns referenced in this script
 8 | meta <- read.delim("lowD_metadata.txt")
 9 | meta <- select(meta, SampleID, condition, PlantName, Accession, ClonalGroup)
10 | 
11 | # heatmap row annotation: accessions as row names, Group = first matching ClonalGroup
12 | my_grps <- data.frame(acc = unique(meta$Accession))
13 | my_grps <- column_to_rownames(mutate(my_grps, Group = meta$ClonalGroup[match(acc, meta$Accession)]), "acc")
14 | 
15 | # tiled 5mC input file (alternatives for the other contexts are listed below)
16 | i <- "CG_alltiles_merged_2017-05-05.txt"
17 | 
18 | ## CG = "CG_alltiles_merged_2017-05-05.txt"
19 | ## CHG = "CHG_alltiles_merged.2017-05-04.txt"
20 | # colnames(a)[4:ncol(a)] <- sapply(strsplit(colnames(a)[4:ncol(a)], ".fastq"), function(l) l[1])
21 | # colnames(a)[4:ncol(a)] <- sapply(strsplit(colnames(a)[4:ncol(a)], ".S"), function(l) l[2])
22 | # colnames(a)[4:ncol(a)] <- paste0("S",colnames(a)[4:ncol(a)])
23 | ## CHH = "CHH_alltiles_merged.2017-05-09.txt"
24 | 
25 | # long format: one methylation value per (V1, V2) row and sample, tagged with its accession
26 | a <- read.delim(i)
27 | a <- na.omit(gather(a, sample, met, -V1, -V2, -V3))
28 | #a <- mutate(a, condition = meta$condition[match(sample, meta$SampleID)])
29 | a <- mutate(a, acc = meta$Accession[match(sample, meta$SampleID)])
30 | a <- select(a, V1, V2, acc, met)
31 | 
32 | # average the samples of each accession within each (V1, V2) row
33 | a <- summarise(group_by(a, V1, V2, acc), avg_met = mean(met))
34 | 
35 | # memory allocation too great to pipe
36 | a <- spread(a, acc, avg_met)
37 | 
38 | ann_colors <- list(Group = c(
39 | 	Bd21="coral", Clone1="royalblue", Clone2="darkgoldenrod1", Clone3="darkolivegreen2",
40 | 	Clone4="darkorchid2", Clone5="forestgreen", Clone6="firebrick1", Clone7="pink",
41 | 	HYB1="coral4", HYB2="bisque2"
42 | ))
43 | 
44 | # accession-by-accession correlation (all columns after V1 and V2), drawn as a clustered heatmap
45 | cor_matrix <- as.matrix(cor(a[, 3:ncol(a)], use = 'pairwise.complete.obs'))
46 | pheatmap(
47 | 	cor_matrix,
48 | 	annotation_row = my_grps,
49 | 	annotation_colors = ann_colors,
50 | 	cutree_rows = 3, cutree_cols = 3,
51 | 	show_colnames = FALSE,
52 | 	fontsize_row = 5, border_color = NA
53 | )
54 | 
55 | dev.off()
56 | 
57 | 


--------------------------------------------------------------------------------
/project_workflows/lowD_SNP_dendrograms_281019.R:
--------------------------------------------------------------------------------
 1 | ## check out https://github.com/borevitzlab/brachy-genotyping-notes/blob/master/snprelate.Rmd
 2 | ## Produce SNP relationships of Turkish brachypodium lines for Eichten et al 2019
 3 | 
 4 | library(SNPRelate)
 5 | library(tidyverse)
 6 | 
 7 | metadata <- read.csv("brachy-metadata.csv")
 8 | lowd_names <- read.delim("lowD_GBS_sample_names.txt")
 9 | 
10 | # one-off VCF -> GDS conversion, kept for reference:
11 | #snpgdsVCF2GDS("freebayes~GBS~lowD.sorted.vcf.gz",
12 | #              "freebayes~GBS~lowD.gds")
13 | geno <- snpgdsOpen("freebayes~GBS~lowD.gds", readonly = TRUE, allow.duplicate = TRUE)
14 | samp <- snpgdsSummary(geno)$sample.id
15 | ## First pass: SNP selection with a 0.999 missing-rate threshold, autosome.only switched off
16 | snps <- snpgdsSelectSNP(geno, autosome.only = FALSE, missing.rate = 0.999)
17 | 
18 | ## ssp.filt: filter samples on missing rate, then SNPs on missing rate and MAF.
19 | ## Draws four diagnostic histograms and returns list(snps, samps, miss.samp).
20 | ssp.filt <- function(geno, samps, snps,  max.snp.miss.rate=0.99,
21 |                      max.samp.miss.rate=0.99, min.maf=0.001 ) {
22 |     miss.samp <- snpgdsSampMissRate(geno, snp.id = snps, sample.id = samps)
23 |     hist(miss.samp, breaks = 100, main = "Sample Missing Data (pre-filt)")
24 |     abline(v = max.samp.miss.rate, col = "blue", lwd = 2)
25 |     kept.samps <- samps[miss.samp <= max.samp.miss.rate]
26 | 
27 |     rate.freq <- snpgdsSNPRateFreq(geno, sample.id = kept.samps, snp.id = snps)
28 |     miss.snp <- rate.freq$MissingRate
29 |     hist(miss.snp, breaks = 100, main = "SNP missing data")
30 |     abline(v = max.snp.miss.rate, col = "blue", lwd = 2)
31 |     maf <- rate.freq$MinorFreq
32 |     hist(maf, breaks = 50, main = "SNP MAF")
33 |     abline(v = min.maf, lwd = 2, col = "blue")
34 | 
35 |     kept.snps <- snpgdsSelectSNP(geno, sample.id = kept.samps, snp.id = snps, maf = min.maf,
36 |                                  missing.rate = max.snp.miss.rate, autosome.only = FALSE)
37 | 
38 |     # per-sample missing rate again, now over the retained samples and SNPs
39 |     miss.samp <- snpgdsSampMissRate(geno, snp.id = kept.snps, sample.id = kept.samps)
40 |     hist(miss.samp, breaks = 100, main = "Sample Missing Data (post-filt)")
41 | 
42 |     print(paste("Num SNPs:", length(kept.snps)))
43 |     print(paste("Num Samples:", length(kept.samps)))
44 |     list(snps = kept.snps, samps = kept.samps, miss.samp = miss.samp)
45 | }
46 | ## ssp.geno: snpgdsIBS() over filt$samps / filt$snps; prints the per-row NA counts, returns the IBS result
47 | ssp.geno = function(geno, filt, num.thread=4) {
48 |     ibs = snpgdsIBS(geno, sample.id=filt$samps, snp.id=filt$snps, autosome.only=FALSE, num.thread=num.thread)
49 |     ibs.nacnt = rowSums(is.na(ibs$ibs))
50 |     print(table(ibs.nacnt))  # explicit print: a bare table() inside a function displays nothing
51 |     return(ibs)
52 | }
53 | 
54 | ## final filter thresholds; ssp.filt() also draws its diagnostic histograms, closed right after
55 | filt.dis = ssp.filt(geno, samp, snps, min.maf=0.01, max.samp.miss.rate = 0.995, max.snp.miss.rate=0.95)
56 | dev.off()
57 | ## dissimilarity -> hierarchical clustering; sample ids swapped for lowd_names$acc via lowd_names$anon
58 | dist <- snpgdsDiss(geno, sample.id=filt.dis$samps, snp.id=filt.dis$snps, autosome.only=FALSE)
59 | dist$sample.id <- paste(lowd_names$acc[match(dist$sample.id, lowd_names$anon)])
60 | hc.dis.plt = snpgdsHCluster(dist) %>% snpgdsCutTree(label.H=FALSE, label.Z=FALSE)
61 | pdf("Brachy_SNP_dendro.pdf", pointsize=4)
62 | snpgdsDrawTree(hc.dis.plt, leaflab="perpendicular", cex.lab=0.01)
63 | dev.off()
64 | snpgdsClose(geno)  # release the GDS file handle opened by snpgdsOpen()
65 | 


--------------------------------------------------------------------------------
/qPCR/standardize_format_LC480_qPCR_DG.R:
--------------------------------------------------------------------------------
 1 | ## convert LC480 output to standardized format for analysis
 2 | 
 3 | options(stringsAsFactors = FALSE)
 4 | library(tidyverse)
 5 | library(janitor)
 6 | 
 7 | setwd("C://Users/u4667515/Dropbox/Collab_Projects/Covid19/RawData_RSB_v1/")
 8 | outdir <- "../RawData_v2_standardised/"
 9 | 
10 | ## probe fluorescence files: keep the three probe channels, one row per well/cycle/channel
11 | fls <- dir(pattern = "Probe_LC480_raw-fluorescence")
12 | 
13 | for (fl in fls) {  # loops zero times when no file matches, unlike 1:length(fls)
14 |   a <- read_delim(fl, delim = '\t', skip = 1, col_names = TRUE) %>%
15 |     clean_names() %>%
16 |     select(sample_pos, cycle_number, x483_533, x523_568, x558_610) %>%
17 |     rename(well = sample_pos, cycle=cycle_number) %>%
18 |     gather(fluorescence_name, fluorescence, -well, -cycle) %>%
19 |     mutate(fluorescence_name = ifelse(fluorescence_name == "x483_533", yes = "FAM_483-533",
20 |                       no = ifelse(fluorescence_name == "x523_568", yes = "HEX_523-568", no ="Red_558-610")))
21 |   # output path given positionally so it binds to `path` (older readr) or `file` (newer readr)
22 |   write_delim(a, paste(outdir, fl, sep = '/'), delim = '\t')
23 | }
24 | 
25 | ## SYBR fluorescence files: single 483-533 channel, melt-curve readings removed
26 | fls <- dir(pattern = "SYBR_LC480_raw-fluorescence")
27 | 
28 | for (fl in fls) {  # loops zero times when no file matches, unlike 1:length(fls)
29 |   a <- read_delim(fl, delim = '\t', skip = 1, col_names = TRUE) %>%
30 |     clean_names() %>%
31 |     filter(!(cycle_number == 1 & temp != 59.90)) %>% ## remove melt curve values QC = plot(cycle_number ~ temp, a)
32 |     select(sample_pos, cycle_number, x483_533) %>%
33 |     rename(well = sample_pos, cycle=cycle_number) %>%
34 |     gather(fluorescence_name, fluorescence, -well, -cycle) %>%
35 |     mutate(fluorescence_name = ifelse(fluorescence_name == "x483_533", yes = "SYBR_483-533", no=""))
36 |   # output path given positionally so it binds to `path` (older readr) or `file` (newer readr)
37 |   write_delim(a, paste(outdir, fl, sep = '/'), delim = '\t')
38 | }
39 | 
40 | ## sample-sheet files: well, sample name, filter combination and target per row
41 | fls <- dir(pattern = "sample-sheet")
42 | 
43 | for (fl in fls) {  # loops zero times when no file matches, unlike 1:length(fls)
44 |   a <- read_delim(fl, delim = '\t', col_names = TRUE) %>%
45 |     clean_names() %>%
46 |     select(general_pos, general_sample_name, general_filt_comb, general_target_name) %>%
47 |     mutate(general_target_name = ifelse(!is.na(general_target_name), yes = general_target_name,
48 |              no= ifelse(general_filt_comb == "483_533", yes = "FAM_RdRP",
49 |                   no = ifelse(general_filt_comb == "523_568", yes = "VIC_human_RP", 
50 |                           no ="ROX_N")))) %>%
51 |     rename(well = general_pos, 
52 |            sample_name=general_sample_name, 
53 |            filter=general_filt_comb, 
54 |            probe_amplicon=general_target_name)
55 |   # output path given positionally so it binds to `path` (older readr) or `file` (newer readr)
56 |   write_delim(a, paste(outdir, fl, sep = '/'), delim = '\t')
57 | }
58 | 
59 | ## view output files to transfer to LabArchives
60 | dir(outdir)
61 | # nothing above draws a plot: close a device only if one is open (a bare dev.off() errors otherwise)
62 | if (!is.null(dev.list())) dev.off()
63 | rm(list=ls())  # NOTE(review): wipes the whole workspace; consider dropping when sourced interactively
64 | 
65 | 


--------------------------------------------------------------------------------
/screenrc:
--------------------------------------------------------------------------------
 1 | ## .screenrc file
 2 | ## move to home directory and rename ".screenrc"
 3 | 
 4 | # Get rid of the startup message
 5 | startup_message off
 6 | 
 7 | # Stop flashing when I get an alert
 8 | vbell off
 9 | 
10 | # Automatically detach if a disconnect occurs
11 | autodetach on
12 | 
13 | # Think of it as xterm and give me scrolling
14 | termcapinfo xterm-color|xterm|xterms|xs|rxvt ti@:te@
15 | 
16 | # Tabs along the bottom of the session
17 | caption always "%{= k}%-w%{= G}%n %t%{-}%+w %-= %{= G}%H"
18 | 
19 | # Start at Window 1
20 | bind c screen 1
21 | bind ^c screen 1
22 | bind 0 select 10
23 | screen 1
24 | 
25 | # Allow h and l to move between tabs as well
26 | bind h prev
27 | bind ^h prev
28 | bind l next
29 | bind ^l next
30 | 


--------------------------------------------------------------------------------