├── .gitattributes ├── .gitignore ├── assets ├── nf-core-meripseqpipe_logo.png ├── multiqc_config.yaml ├── email_template.txt ├── sendmail_template.txt └── email_template.html ├── conf ├── C2.config ├── docker.config ├── test_bam.config ├── test.config ├── test_mixed.config └── base.config ├── .github ├── markdownlint.yml ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── PULL_REQUEST_TEMPLATE.md ├── workflows │ ├── ci_aligners.yml │ ├── ci_methy_methods.yml │ ├── linting.yml │ ├── ci_peakcalling_tools.yml │ ├── branch.yml │ ├── ci_peakcalling_methods.yml │ ├── awstest.yml │ └── awsfulltest.yml └── CONTRIBUTING.md ├── CHANGELOG.md ├── bin ├── geneBody_coverage2.sh ├── DiffReport.rmd ├── MeTPeak.R ├── merge_peaks_by_rank.R ├── markdown_to_html.r ├── cufflinks.sh ├── MATK_quantification.sh ├── normalize_peaks.py ├── QC_Peaks_Report.rmd ├── generate_featurecount_mat.R ├── QNB_quantification.R ├── scrape_software_versions.py ├── m6Aprediction.sh ├── get_htseq_matrix.R ├── m6Am.R ├── intersec.pl ├── edgeR.R ├── bedtools_quantification.R ├── DESeq2.R ├── MATK_diffm6A.sh ├── create_IGV_js.sh ├── bedtools_diffm6A.R ├── QNB_diffm6A.R ├── bed_count.sh ├── DESeq2_quantification.R ├── markdown_to_html.py ├── meyer.py ├── merge_peaks_by_bedtools.sh ├── arranged_results.R ├── merge_peaks_by_mspc.sh ├── merge_peaks_by_rank.sh ├── GLM_DESeq2_DM.R ├── GLM_edgeR_DM.R ├── QC_Peaks_Report.R ├── MeTDiff_diffm6A.R ├── m6A_motif.meme ├── DiffReport.R └── m6A_annotate_forGTF_xingyang2.pl ├── docs ├── README.md ├── troubleshooting.md ├── configuration │ ├── reference_genomes.md │ └── adding_your_own.md └── output.md ├── environment.yml ├── LICENSE ├── Dockerfile ├── CODE_OF_CONDUCT.md ├── README.md ├── lib └── LikeletUtils.groovy └── nextflow.config /.gitattributes: -------------------------------------------------------------------------------- 1 | *.config linguist-language=nextflow 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .nextflow* 2 | work/ 3 | data/ 4 | results/ 5 | .DS_Store 6 | tests/ 7 | testing/ 8 | *.pyc 9 | -------------------------------------------------------------------------------- /assets/nf-core-meripseqpipe_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lazyky/meripseqpipe/HEAD/assets/nf-core-meripseqpipe_logo.png -------------------------------------------------------------------------------- /conf/C2.config: -------------------------------------------------------------------------------- 1 | // config file for run analysis in new cluster 2 2 | 3 | 4 | process.executor = 'pbs' 5 | 6 | params { 7 | // Defaults only, expecting to be overwritten 8 | max_memory = 128.GB 9 | max_cpus = 38 10 | max_time = 240.h 11 | } -------------------------------------------------------------------------------- /.github/markdownlint.yml: -------------------------------------------------------------------------------- 1 | # Markdownlint configuration file 2 | default: true, 3 | line-length: false 4 | no-multiple-blanks: 0 5 | blanks-around-headers: false 6 | blanks-around-lists: false 7 | header-increment: false 8 | no-duplicate-header: 9 | siblings_only: true 10 | -------------------------------------------------------------------------------- /conf/docker.config: -------------------------------------------------------------------------------- 1 | /* 2 | * 
------------------------------------------------- 3 | * nf-core/m6APipe Nextflow docker soft config file 4 | * ------------------------------------------------- 5 | */ 6 | 7 | process { 8 | container = 'kingzhuky/meripseqpipe:dev' 9 | } 10 | params { 11 | // Defaults only, expecting to be overwritten 12 | matk_jar = '/MATK-1.0.jar' 13 | } 14 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # nf-core/meripseqpipe: Changelog 2 | 3 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) 4 | and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 5 | 6 | ## v1.0dev - [date] 7 | 8 | Initial release of nf-core/meripseqpipe, created with the [nf-core](http://nf-co.re/) template. 9 | 10 | ### `Added` 11 | 12 | ### `Fixed` 13 | 14 | ### `Dependencies` 15 | 16 | ### `Deprecated` 17 | -------------------------------------------------------------------------------- /assets/multiqc_config.yaml: -------------------------------------------------------------------------------- 1 | report_comment: > 2 | This report has been generated by the nf-core/meripseqpipe 3 | analysis pipeline. For information about how to interpret these results, please see the 4 | documentation. 5 | report_section_order: 6 | software_versions: 7 | order: -1000 8 | nf-core-meripseqpipe-summary: 9 | order: -1001 10 | 11 | export_plots: true 12 | -------------------------------------------------------------------------------- /bin/geneBody_coverage2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #bash geneBody_coverage2.sh 3 | #$1 argv 1 : gtf file 4 | #$2 argv 2 : THREAD_NUM 5 | bed12_file=$1 6 | THREAD_NUM=$2 7 | ## Define a multi-threaded run channel 8 | mkfifo tmp 9 | exec 9<>tmp 10 | for ((i=1;i<=${THREAD_NUM:=1};i++)) 11 | do 12 | echo >&9 13 | done 14 | 15 | for bigwig_file in *.bigwig 16 | do 17 | read -u 9 18 | { 19 | geneBody_coverage2.py -i $bigwig_file -o ${bigwig_file%.bigwig*}.rseqc.txt -r ${bed12_file} 20 | echo >&9 21 | }& 22 | done 23 | wait 24 | echo "Calculate coverage of data is finish" -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # nf-core/meripseqpipe: Documentation 2 | 3 | The nf-core/meripseqpipe documentation is split into the following files: 4 | 5 | 1. [Installation](https://nf-co.re/usage/installation) 6 | 2. Pipeline configuration 7 | * [Local installation](https://nf-co.re/usage/local_installation) 8 | * [Adding your own system config](https://nf-co.re/usage/adding_own_config) 9 | * [Reference genomes](https://nf-co.re/usage/reference_genomes) 10 | 3. [Running the pipeline](usage.md) 11 | 4. [Output and how to interpret the results](output.md) 12 | 5. 
[Troubleshooting](https://nf-co.re/usage/troubleshooting) 13 | -------------------------------------------------------------------------------- /bin/DiffReport.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "DiffReport" 3 | author: "MeRIPseqPipe" 4 | output: html_document 5 | --- 6 | 7 | ```{r setup, include=FALSE} 8 | library(grid) 9 | library(pheatmap) 10 | ``` 11 | design.matrix,compare.list,heatmap_dm.list,heatmap_de.list,volcano_dm.list,ecdf.list,quadrant.list 12 | ## Heatmap 13 | Heatmap of differential expression analysis 14 | ```{r ,echo=FALSE} 15 | for (group in compare.list) { 16 | print(group) 17 | grid.newpage() 18 | print(heatmap_dm.list[[group]]) 19 | grid.newpage() 20 | print(heatmap_de.list[[group]]) 21 | print(volcano_dm.list[[group]]) 22 | print(quadrant.list[[group]]) 23 | print(ecdf.list[[group]]) 24 | } 25 | ``` 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | Hi there! 2 | 3 | Thanks for suggesting a new feature for the pipeline! Please delete this text and anything that's not relevant from the template below: 4 | 5 | #### Is your feature request related to a problem? Please describe. 6 | A clear and concise description of what the problem is. 7 | Ex. I'm always frustrated when [...] 8 | 9 | #### Describe the solution you'd like 10 | A clear and concise description of what you want to happen. 11 | 12 | #### Describe alternatives you've considered 13 | A clear and concise description of any alternative solutions or features you've considered. 14 | 15 | #### Additional context 16 | Add any other context about the feature request here. 17 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Many thanks to contributing to nf-core/meripseqpipe! 2 | 3 | Please fill in the appropriate checklist below (delete whatever is not relevant). These are the most common things requested on pull requests (PRs). 4 | 5 | ## PR checklist 6 | - [ ] This comment contains a description of changes (with reason) 7 | - [ ] If you've fixed a bug or added code that should be tested, add tests! 8 | - [ ] If necessary, also make a PR on the [nf-core/meripseqpipe branch on the nf-core/test-datasets repo]( https://github.com/nf-core/test-datasets/pull/new/nf-core/meripseqpipe) 9 | - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). 10 | - [ ] Make sure your code lints (`nf-core lint .`). 11 | - [ ] Documentation in `docs` is updated 12 | - [ ] `CHANGELOG.md` is updated 13 | - [ ] `README.md` is updated 14 | 15 | **Learn more about contributing:** https://github.com/nf-core/meripseqpipe/tree/master/.github/CONTRIBUTING.md 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | Hi there! 2 | 3 | Thanks for telling us about a problem with the pipeline. Please delete this text and anything that's not relevant from the template below: 4 | 5 | #### Describe the bug 6 | A clear and concise description of what the bug is. 7 | 8 | #### Steps to reproduce 9 | Steps to reproduce the behaviour: 10 | 1. Command line: `nextflow run ...` 11 | 2. 
See error: _Please provide your error message_ 12 | 13 | #### Expected behaviour 14 | A clear and concise description of what you expected to happen. 15 | 16 | #### System: 17 | - Hardware: [e.g. HPC, Desktop, Cloud...] 18 | - Executor: [e.g. slurm, local, awsbatch...] 19 | - OS: [e.g. CentOS Linux, macOS, Linux Mint...] 20 | - Version [e.g. 7, 10.13.6, 18.3...] 21 | 22 | #### Nextflow Installation: 23 | - Version: [e.g. 0.31.0] 24 | 25 | #### Container engine: 26 | - Engine: [e.g. Conda, Docker or Singularity] 27 | - version: [e.g. 1.0.0] 28 | - Image tag: [e.g. nfcore/meripseqpipe:1.0.0] 29 | 30 | #### Additional context 31 | Add any other context about the problem here. 32 | -------------------------------------------------------------------------------- /bin/MeTPeak.R: -------------------------------------------------------------------------------- 1 | ## Rscript MeTPeak.R eg. Rscript MeTPeak.R designfile.txt genes.gtf 10 2 | ### designfile: Sample_id, Input_filename, IP_filename, group.id 3 | ### flag_peakCallingbygroup: 1(group) 0(sample) 4 | library(MeTPeak) 5 | library(parallel) 6 | args <- commandArgs(T) 7 | input.bam.vec <- unlist(strsplit(args[1], split=',')) 8 | ip.bam.vec <- unlist(strsplit(args[2], split=',')) 9 | group.id <- args[3] 10 | gtf <- args[4] 11 | 12 | ##Running MeTPeak and rename the output name 13 | metpeak(GENE_ANNO_GTF = gtf, 14 | IP_BAM = ip.bam.vec, 15 | INPUT_BAM = input.bam.vec, 16 | EXPERIMENT_NAME = paste0( "metpeak_",group.id ) 17 | ) 18 | bed_name <- paste0( "metpeak_",group.id ,"/peak.xls") 19 | output_bed_name <- paste0("metpeak_group_",group.id,"_normalized.bed") #peak.bed 20 | bed12.to.bed6 <- paste0("awk 'BEGIN{OFS=\"\t\"}NR>1{print $1,$2,$3,$1\":\"$2\"-\"$3,$5,$6,$7,$8,$9,$10,$11,$12}' ", bed_name," | bed12ToBed6 -i | awk 'BEGIN{FS=\"\t\";OFS=\"\t\"}{print $1,$2,$3,$4,$5}'> ", output_bed_name) 21 | system(bed12.to.bed6) -------------------------------------------------------------------------------- /bin/merge_peaks_by_rank.R: -------------------------------------------------------------------------------- 1 | # required for merge_peaks_by_rank.sh 2 | library(RobustRankAggreg) 3 | args<-commandArgs(T) 4 | bedlist <- read.table(args[1],header = F,sep = "\t",stringsAsFactors = F, na.strings = "") 5 | len_of_bed <- as.numeric(args[2]) 6 | out_name <- as.character(args[3]) 7 | bedlist2 <- as.list(NULL) 8 | for (i in c(1:ncol(bedlist))){ 9 | if (TRUE %in% is.na(bedlist[,i])){ 10 | sub <- which(is.na(bedlist[,i])) 11 | bedlist2[[i]] <- bedlist[-sub,i] 12 | } 13 | else{ 14 | bedlist2[[i]] <-bedlist[,i] 15 | } 16 | } 17 | mergepeak <- aggregateRanks(glist = bedlist2, N = len_of_bed) 18 | sub <- which(as.numeric(mergepeak$Score)==1) 19 | mergepeak <- mergepeak[-sub,] 20 | merged.bed <- apply(mergepeak, 1 ,function(x){ 21 | peak.info <- as.vector(as.matrix(x)) 22 | peak.region = unlist(strsplit(strsplit(as.character(peak.info[1]),split = ":" )[[1]],split = "-")) 23 | x = c(peak.region,as.character(peak.info[1]),peak.info[2]) 24 | }) 25 | ## 26 | merged.bed <- t(merged.bed) 27 | write.table(merged.bed,file = out_name,sep = "\t",quote = FALSE,row.names = FALSE,col.names = FALSE) 28 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: meripseqpipe-1.0dev 2 | channels: 3 | - r 4 | - conda-forge 5 | - bioconda 6 | - anaconda 7 | - defaults 8 | dependencies: 9 | - openjdk=8.0.192 10 | - python=2.7.15 11 | - picard=2.21.6 12 | 
- fastqc=0.11.8 13 | - fastp=0.19.7 14 | - bedtools=2.27.1 15 | - ucsc-gtftogenepred=377 16 | - ucsc-fatotwobit=377 17 | - ucsc-facount=377 18 | - hisat2=2.1.0 19 | - bowtie2=2.2.5 20 | - bwa=0.7.17 21 | - star=2.6.1b 22 | - tophat=2.1.1 23 | - samtools=1.9 24 | - rseqc=2.6.4 25 | - macs2=2.1.2 26 | - meme=5.1.1 27 | - homer=4.9.1 28 | - dos2unix=7.4.1 29 | - r-base=3.5.1 30 | - bioconductor-edger=3.26.0 31 | - bioconductor-deseq2=1.22.1 32 | - igvtools=2.3.93 33 | - bioconductor-exomepeak=2.16.0 34 | - r-robustrankaggreg=1.1 35 | - perl=5.26.2 36 | - ucsc-genepredtobed=377 37 | - scipy=1.2.1 38 | - deeptools=3.1.3 39 | - ucsc-bigwigtowig=357 40 | - r-ggplot2=3.1.1 41 | - r-ggrepel=0.8.1 42 | - r-ggsci=2.9 43 | - r-pheatmap=1.0.12 44 | - r-dplyr=0.8.0.1 45 | - r-knitr=1.22 46 | - r-ggseqlogo=0.1 47 | - r-rmarkdown=1.10 48 | - subread=2.0.0 49 | - pandoc=2.7.3 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Kaiyu Zhu, Yu Sun, Xiaoqiong Bao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /conf/test_bam.config: -------------------------------------------------------------------------------- 1 | /* 2 | * ------------------------------------------------- 3 | * Nextflow config file for running tests 4 | * ------------------------------------------------- 5 | * Defines bundled input files and everything required 6 | * to run a fast and simple test. 
Use as follows: 7 | * nextflow run nf-core/meripseqpipe -profile test 8 | */ 9 | 10 | params { 11 | config_profile_name = 'Test profile' 12 | config_profile_description = 'Minimal test dataset to check pipeline function' 13 | // Limit resources so that this can run on Travis 14 | max_cpus = 2 15 | max_memory = 6.GB 16 | max_time = 48.h 17 | 18 | // Input data 19 | // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets 20 | // TODO nf-core: Give any required params for the test so that command line flags are not needed 21 | aligners = "none" 22 | comparefile = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/designfiles/comparefile.txt' 23 | designfile = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/designfiles/designfile_bam.tsv' 24 | fasta = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/reference/TEST.fa' 25 | gtf = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/reference/TEST.gtf' 26 | } -------------------------------------------------------------------------------- /conf/test.config: -------------------------------------------------------------------------------- 1 | /* 2 | * ------------------------------------------------- 3 | * Nextflow config file for running tests 4 | * ------------------------------------------------- 5 | * Defines bundled input files and everything required 6 | * to run a fast and simple test. Use as follows: 7 | * nextflow run nf-core/meripseqpipe -profile test, 8 | */ 9 | 10 | params { 11 | config_profile_name = 'Test profile' 12 | config_profile_description = 'Minimal test dataset to check pipeline function' 13 | // Limit resources so that this can run on GitHub Actions 14 | max_cpus = 2 15 | max_memory = 6.GB 16 | max_time = 48.h 17 | 18 | // Input data 19 | // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets 20 | // TODO nf-core: Give any required params for the test so that command line flags are not needed 21 | single_end = false 22 | comparefile = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/designfiles/comparefile.txt' 23 | designfile = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/designfiles/designfile_paired.tsv' 24 | fasta = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/reference/TEST.fa' 25 | gtf = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/reference/TEST.gtf' 26 | } -------------------------------------------------------------------------------- /conf/test_mixed.config: -------------------------------------------------------------------------------- 1 | /* 2 | * ------------------------------------------------- 3 | * Nextflow config file for running tests 4 | * ------------------------------------------------- 5 | * Defines bundled input files and everything required 6 | * to run a fast and simple test. 
Use as follows: 7 | * nextflow run nf-core/meripseqpipe -profile test, 8 | */ 9 | 10 | params { 11 | config_profile_name = 'Test profile' 12 | config_profile_description = 'Minimal test dataset to check pipeline function' 13 | // Limit resources so that this can run on GitHub Actions 14 | max_cpus = 2 15 | max_memory = 6.GB 16 | max_time = 48.h 17 | 18 | // Input data 19 | // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets 20 | // TODO nf-core: Give any required params for the test so that command line flags are not needed 21 | single_end = false 22 | comparefile = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/designfiles/comparefile.txt' 23 | designfile = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/designfiles/designfile_mixed.tsv' 24 | fasta = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/reference/TEST.fa' 25 | gtf = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/reference/TEST.gtf' 26 | } -------------------------------------------------------------------------------- /assets/email_template.txt: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------- 2 | ,--./,-. 3 | ___ __ __ __ ___ /,-._.--~\\ 4 | |\\ | |__ __ / ` / \\ |__) |__ } { 5 | | \\| | \\__, \\__/ | \\ |___ \\`-._,-`-, 6 | `._,._,' 7 | nf-core/meripseqpipe v${version} 8 | ---------------------------------------------------- 9 | 10 | Run Name: $runName 11 | 12 | <% if (success){ 13 | out << "## nf-core/meripseqpipe execution completed successfully! ##" 14 | } else { 15 | out << """#################################################### 16 | ## nf-core/meripseqpipe execution completed unsuccessfully! ## 17 | #################################################### 18 | The exit status of the task that caused the workflow execution to fail was: $exitStatus. 
19 | The full error message was: 20 | 21 | ${errorReport} 22 | """ 23 | } %> 24 | 25 | 26 | The workflow was completed at $dateComplete (duration: $duration) 27 | 28 | The command used to launch the workflow was as follows: 29 | 30 | $commandLine 31 | 32 | 33 | 34 | Pipeline Configuration: 35 | ----------------------- 36 | <% out << summary.collect{ k,v -> " - $k: $v" }.join("\n") %> 37 | 38 | -- 39 | nf-core/meripseqpipe 40 | https://github.com/nf-core/meripseqpipe 41 | -------------------------------------------------------------------------------- /bin/markdown_to_html.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Command line argument processing 4 | args = commandArgs(trailingOnly=TRUE) 5 | if (length(args) < 2) { 6 | stop("Usage: markdown_to_html.r <input.md> <output.html>", call.=FALSE) 7 | } 8 | markdown_fn <- args[1] 9 | output_fn <- args[2] 10 | 11 | # Load / install packages 12 | if (!require("markdown")) { 13 | install.packages("markdown", dependencies=TRUE, repos='http://cloud.r-project.org/') 14 | library("markdown") 15 | } 16 | 17 | base_css_fn <- getOption("markdown.HTML.stylesheet") 18 | base_css <- readChar(base_css_fn, file.info(base_css_fn)$size) 19 | custom_css <- paste(base_css, " 20 | body { 21 | padding: 3em; 22 | margin-right: 350px; 23 | max-width: 100%; 24 | } 25 | #toc { 26 | position: fixed; 27 | right: 20px; 28 | width: 300px; 29 | padding-top: 20px; 30 | overflow: scroll; 31 | height: calc(100% - 3em - 20px); 32 | } 33 | #toc_header { 34 | font-size: 1.8em; 35 | font-weight: bold; 36 | } 37 | #toc > ul { 38 | padding-left: 0; 39 | list-style-type: none; 40 | } 41 | #toc > ul ul { padding-left: 20px; } 42 | #toc > ul > li > a { display: none; } 43 | img { max-width: 800px; } 44 | ") 45 | 46 | markdownToHTML( 47 | file = markdown_fn, 48 | output = output_fn, 49 | stylesheet = custom_css, 50 | options = c('toc', 'base64_images', 'highlight_code') 51 | ) 52 | -------------------------------------------------------------------------------- /assets/sendmail_template.txt: -------------------------------------------------------------------------------- 1 | To: $email 2 | Subject: $subject 3 | Mime-Version: 1.0 4 | Content-Type: multipart/related;boundary="nfcoremimeboundary" 5 | 6 | --nfcoremimeboundary 7 | Content-Type: text/html; charset=utf-8 8 | 9 | $email_html 10 | 11 | --nfcoremimeboundary 12 | Content-Type: image/png;name="nf-core-meripseqpipe_logo.png" 13 | Content-Transfer-Encoding: base64 14 | Content-ID: <nfcorepipelinelogo> 15 | Content-Disposition: inline; filename="nf-core-meripseqpipe_logo.png" 16 | 17 | <% out << new File("$baseDir/assets/nf-core-meripseqpipe_logo.png"). 18 | bytes. 19 | encodeBase64(). 20 | toString(). 21 | tokenize( '\n' )*. 22 | toList()*. 23 | collate( 76 )*. 24 | collect { it.join() }. 25 | flatten(). 26 | join( '\n' ) %> 27 | 28 | <% 29 | if (mqcFile){ 30 | def mqcFileObj = new File("$mqcFile") 31 | if (mqcFileObj.length() < mqcMaxSize){ 32 | out << """ 33 | --nfcoremimeboundary 34 | Content-Type: text/html; name=\"multiqc_report\" 35 | Content-Transfer-Encoding: base64 36 | Content-ID: <mqcreport> 37 | Content-Disposition: attachment; filename=\"${mqcFileObj.getName()}\" 38 | 39 | ${mqcFileObj. 40 | bytes. 41 | encodeBase64(). 42 | toString(). 43 | tokenize( '\n' )*. 44 | toList()*. 45 | collate( 76 )*. 46 | collect { it.join() }. 47 | flatten(). 
48 | join( '\n' )} 49 | """ 50 | }} 51 | %> 52 | 53 | --nfcoremimeboundary-- 54 | -------------------------------------------------------------------------------- /bin/cufflinks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #bash cufflinks.sh 3 | #$1 argv 1 : designfile 4 | #$2 argv 2 : gtf file 5 | #$3 argv 3 : THREAD_NUM 6 | designfile=$1 7 | gtf_file=$2 8 | THREAD_NUM=$3 9 | 10 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 11 | tag=$(echo $group_list | awk '{OFS=",";ORS=""}{for(x=1;x> assembly_list.txt 26 | done 27 | cuffmerge -o ./merged_gtf -g ${gtf_file} -p ${THREAD_NUM} assembly_list.txt 28 | 29 | ## Run Cuffdiff for differential expression analysis 30 | cuffdiff -o cuffdiff\ 31 | -L $tag \ 32 | -p ${THREAD_NUM} \ 33 | --time-series --multi-read-correct \ 34 | --library-type fr-unstranded \ 35 | ./merged_gtf/merged.gtf ${bam_file_array} 36 | -------------------------------------------------------------------------------- /bin/MATK_quantification.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ## MATK_quantification.sh 3 | ## $1 argv 1 : matk_jar 4 | ## $2 argv 2 : gtf file 5 | ## $3 argv 3 : designfile 6 | ## $4 argv 4 : merge_bed_file 7 | matk_jar=$1 8 | gtf_file=$2 9 | designfile=$3 10 | merge_bed_file=$4 11 | THREAD_NUM=$5 12 | 13 | #Define a multi-threaded run channel 14 | mkfifo tmp 15 | exec 9<>tmp 16 | for ((i=1;i<=${THREAD_NUM:=1};i++)) 17 | do 18 | echo >&9 19 | done 20 | 21 | sample_list=$(awk 'BEGIN{FS=","}NR>1{print $1}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 22 | for sample_id in $sample_list 23 | do 24 | #read -u 9 25 | { 26 | ip_bam_file=$(ls ${sample_id}.ip*.bam) 27 | input_bam_file=$(ls ${sample_id}.input*.bam) 28 | java -jar $matk_jar -quantification \ 29 | -ip "$ip_bam_file" \ 30 | -input "$input_bam_file" \ 31 | -bed $merge_bed_file \ 32 | -gtf $gtf_file \ 33 | -out MATK_${sample_id}_quantification.bed 34 | echo $sample_id > tmp.quantification.$sample_id 35 | awk 'BEGIN{FS="\t"}{print $5}' MATK_${sample_id}_quantification.bed >> tmp.quantification.$sample_id 36 | awk 'BEGIN{FS="\t";print ""}NR>1{print $1":"$2"-"$3}' MATK_${sample_id}_quantification.bed > tmp.MATK.quantification 37 | #echo >&9 38 | } 39 | done 40 | wait 41 | ls tmp.quantification.* |xargs -iFILE sed -i '2d' FILE 42 | ls tmp.quantification.* |xargs paste tmp.MATK.quantification > MATK_quantification.matrix 43 | echo "MATK quantification done" -------------------------------------------------------------------------------- /.github/workflows/ci_aligners.yml: -------------------------------------------------------------------------------- 1 | name: nf-core CI (aligners) 2 | # This workflow is triggered on pushes and PRs to the repository. 
3 | # It runs the pipeline with the minimal test dataset to check that it completes without any syntax errors 4 | on: [push, pull_request] 5 | 6 | jobs: 7 | test: 8 | env: 9 | NXF_VER: ${{ matrix.nxf_ver }} 10 | NXF_ANSI_LOG: false 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | # Nextflow versions: check pipeline minimum and current latest 15 | nxf_ver: ['19.04.0', ''] 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Install Nextflow 19 | run: | 20 | wget -qO- get.nextflow.io | bash 21 | sudo mv nextflow /usr/local/bin/ 22 | - name: Pull docker image 23 | run: | 24 | docker pull kingzhuky/meripseqpipe:dev 25 | docker tag kingzhuky/meripseqpipe:dev kingzhuky/meripseqpipe:dev 26 | - name: Run pipeline with test data 27 | run: | 28 | # Run the pipeline with the test profile 29 | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --skip_peakCalling --skip_expression 30 | # Run, build reference genome with HISAT2 31 | nextflow run ${GITHUB_WORKSPACE} -profile test_mixed,docker --aligners hisat2 --skip_peakCalling --skip_expression -resume 32 | # Run, build reference genome with BWA 33 | nextflow run ${GITHUB_WORKSPACE} -profile test_mixed,docker --aligners bwa --skip_peakCalling --skip_expression -resume 34 | 35 | -------------------------------------------------------------------------------- /bin/normalize_peaks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jul 22 16:44:32 2019 4 | 5 | @author: zky 6 | """ 7 | from math import log 8 | import sys 9 | import numpy as np 10 | #from scipy import stats 11 | if len(sys.argv) <= 2: 12 | print("This script needs two parameters. For example,\ 13 | python normalize_peaks.py <input_bed_file> <output_bed_file>") 14 | sys.exit() 15 | input_bed_file = sys.argv[1] 16 | output_bed_file = sys.argv[2] 17 | 18 | def MaxMinNormalization(x,Max,Min): 19 | if Max != Min : 20 | x = 1e-20 + (x - Min)*(1-1e-20) / (Max - Min) 21 | return x 22 | #def Z_ScoreNormalization(x,mu,sigma): 23 | # x = (x - mu) / sigma; 24 | # return x 25 | with open(input_bed_file) as peaks_bed: 26 | pvalue_array = [] 27 | normalized_peaks = [] 28 | max_pvalue = min_pvalue = 0 29 | for line in peaks_bed: 30 | data = line.replace('\n','').replace('\r','').split('\t') 31 | pvalue = float(data[4]) 32 | pvalue_array.append(pvalue) 33 | normalized_peaks.append(data) 34 | if pvalue_array : 35 | max_pvalue = np.max(pvalue_array) 36 | min_pvalue = np.min(pvalue_array) 37 | # mu = np.average(pvalue_array) 38 | # sigma = np.std(pvalue_array) 39 | for data in normalized_peaks: 40 | data[4] = MaxMinNormalization(float(data[4]),max_pvalue,min_pvalue) 41 | data[4] = -log(data[4],10) 42 | with open(output_bed_file,'w') as output_file: 43 | for data in normalized_peaks: 44 | output_file.write('\t'.join(str(i) for i in data)) 45 | output_file.write('\n') 46 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nfcore/base:1.9 2 | LABEL authors="Kaiyu Zhu, Yu Sun, Xiaoqiong Bao" \ 3 | description="Docker image containing all software requirements for the MeRIPseqPipe pipeline" 4 | 5 | # Install the conda environment 6 | COPY environment.yml / 7 | RUN conda env create -f /environment.yml && conda clean -a 8 | # install multiqc in a separate environment 9 | RUN conda create -n multiqc -c conda-forge -c bioconda python=3.7.8 multiqc=1.7 && conda clean -a 10 | 11 | RUN conda env export --name meripseqpipe-1.0dev > 
meripseqpipe-1.0dev.yml 12 | ENV PATH /mspc:$PATH 13 | ENV PATH /opt/conda/bin:$PATH 14 | ENV PATH /opt/conda/envs/multiqc/bin/:$PATH 15 | ENV PATH /opt/conda/envs/meripseqpipe-1.0dev/bin:$PATH 16 | 17 | 18 | # install MATK 19 | RUN wget https://github.com/kingzhuky/MATK_backup/releases/download/v0.1dev/MATK-1.0.jar 20 | 21 | # install QNB 22 | RUN wget https://cran.r-project.org/src/contrib/Archive/QNB/QNB_1.1.11.tar.gz && \ 23 | R CMD INSTALL QNB_1.1.11.tar.gz && \ 24 | rm QNB_1.1.11.tar.gz 25 | 26 | # install MeTDiff 27 | RUN git clone https://github.com/compgenomics/MeTDiff.git && \ 28 | R CMD build MeTDiff/ && \ 29 | R CMD INSTALL MeTDiff_1.0.tar.gz && \ 30 | rm -rf MeTDiff* 31 | 32 | # install MeTPeak 33 | RUN git clone https://github.com/compgenomics/MeTPeak.git && \ 34 | R CMD build MeTPeak/ && \ 35 | R CMD INSTALL MeTPeak_1.0.0.tar.gz && \ 36 | rm -rf MeTPeak* 37 | 38 | # install MSPC 39 | RUN conda install -y unzip 40 | RUN wget -O mspc.zip "https://github.com/Genometric/MSPC/releases/download/v5.4.0/linux-x64.zip" && \ 41 | unzip mspc.zip -d mspc && \ 42 | chmod 775 mspc/mspc && \ 43 | rm mspc.zip 44 | -------------------------------------------------------------------------------- /.github/workflows/ci_methy_methods.yml: -------------------------------------------------------------------------------- 1 | name: nf-core CI (methylation_analysis_mode) 2 | # This workflow is triggered on pushes and PRs to the repository. 3 | # It runs the pipeline with the minimal test dataset to check that it completes without any syntax errors 4 | on: [push, pull_request] 5 | 6 | jobs: 7 | test: 8 | env: 9 | NXF_VER: ${{ matrix.nxf_ver }} 10 | NXF_ANSI_LOG: false 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | # Nextflow versions: check pipeline minimum and current latest 15 | nxf_ver: ['19.04.0', ''] 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Install Nextflow 19 | run: | 20 | wget -qO- get.nextflow.io | bash 21 | sudo mv nextflow /usr/local/bin/ 22 | - name: Pull docker image 23 | run: | 24 | docker pull kingzhuky/meripseqpipe:dev 25 | docker tag kingzhuky/meripseqpipe:dev kingzhuky/meripseqpipe:dev 26 | - name: Run pipeline with test data 27 | run: | 28 | # Run, test PeakCalling mode 'group' and Methylation Analysis mode 'QNB' 29 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_expression --methylation_analysis_mode QNB -resume 30 | # Run, test PeakMerged mode 'rank' and Methylation Analysis mode 'MATK' 31 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_expression --methylation_analysis_mode MATK -resume 32 | # Run, test one of PeakMerged mode 'bedtools' and Methylation Analysis mode 'DESeq2' 33 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --peakMerged_mode macs2 --skip_expression --methylation_analysis_mode DESeq2 -resume 34 | -------------------------------------------------------------------------------- /bin/QC_Peaks_Report.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "QC_Peaks_Report" 3 | author: "MeRIPseqPipe" 4 | output: html_document 5 | --- 6 | 7 | ```{r setup, include=TRUE, echo=FALSE} 8 | require(grid) 9 | ``` 10 | 11 | ## Peaks Distribution 12 | Compare different distribution of different PeakCalling tools. 
13 | ```{r Distribution, echo=FALSE} 14 | print(distribute.barplot) 15 | print(distribute.barplot.count) 16 | ## Curve 17 | for( sample in names(sample.plots.list) ){ 18 | print(sample) 19 | print(sample.plots.list[[sample]]) 20 | } 21 | print("merged peaks") 22 | print(merged.plot) 23 | ``` 24 | 25 | ## Peaks' motif 26 | 27 | Compare different motifs(top three) of different groups. 28 | 29 | ```{r motif, echo=FALSE} 30 | ggplot2.multiplot <- function(..., plotlist=NULL, cols=2) { 31 | # Make a list from the ... arguments and plotlist 32 | plots <- c(list(...), plotlist) 33 | numPlots = length(plots) 34 | 35 | # Make the panel 36 | plotCols = cols # Number of columns of plots 37 | plotRows = ceiling(numPlots/plotCols) # Number of rows needed, calculated from # of cols 38 | # Set up the page 39 | grid::grid.newpage() 40 | grid::pushViewport(grid::viewport(layout = grid::grid.layout(plotRows, plotCols))) 41 | vplayout <- function(x, y) 42 | grid::viewport(layout.pos.row = x, layout.pos.col = y,name = "abc") 43 | # Make each plot, in the correct location 44 | for (i in 1:numPlots) { 45 | curRow = ceiling(i/plotCols) 46 | curCol = (i-1) %% plotCols + 1 47 | print(plots[[i]], vp = vplayout(curRow, curCol)) 48 | } 49 | } 50 | 51 | for( peakfile in names(QC.motif.list) ){ 52 | print(peakfile) 53 | ggplot2.multiplot(plotlist = QC.motif.list[[peakfile]] ,cols = 1) 54 | } 55 | ``` 56 | -------------------------------------------------------------------------------- /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: nf-core linting 2 | # This workflow is triggered on pushes and PRs to the repository. 3 | # It runs the `nf-core lint` and markdown lint tests to ensure that the code meets the nf-core guidelines 4 | on: 5 | push: 6 | pull_request: 7 | release: 8 | types: [published] 9 | 10 | jobs: 11 | Markdown: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - uses: actions/setup-node@v1 16 | with: 17 | node-version: '10' 18 | - name: Install markdownlint 19 | run: npm install -g markdownlint-cli 20 | - name: Run Markdownlint 21 | run: markdownlint ${GITHUB_WORKSPACE} -c ${GITHUB_WORKSPACE}/.github/markdownlint.yml 22 | YAML: 23 | runs-on: ubuntu-latest 24 | steps: 25 | - uses: actions/checkout@v1 26 | - uses: actions/setup-node@v1 27 | with: 28 | node-version: '10' 29 | - name: Install yaml-lint 30 | run: npm install -g yaml-lint 31 | - name: Run yaml-lint 32 | run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml") 33 | # nf-core: 34 | # runs-on: ubuntu-latest 35 | # steps: 36 | # - uses: actions/checkout@v2 37 | # - name: Install Nextflow 38 | # run: | 39 | # wget -qO- get.nextflow.io | bash 40 | # sudo mv nextflow /usr/local/bin/ 41 | # - uses: actions/setup-python@v1 42 | # with: 43 | # python-version: '3.6' 44 | # architecture: 'x64' 45 | # - name: Install dependencies 46 | # run: | 47 | # python -m pip install --upgrade pip 48 | # pip install nf-core 49 | # - name: Run nf-core lint 50 | # run: nf-core lint ${GITHUB_WORKSPACE} 51 | -------------------------------------------------------------------------------- /bin/generate_featurecount_mat.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript get_htseq_matrix.R designfile THREAD_NUM eg. 
Rscript get_htseq_matrix.R designfile_single.txt 10 3 | ## designfile: filename, control_or_treated, input_or_ip, group(default 0 is CONTROL_SITUATION else are TREATED_SITUATION) 4 | 5 | library(parallel) 6 | library(data.table) 7 | args<-commandArgs(T) 8 | designfile <- args[1] 9 | THREAD_NUM <- as.numeric(args[2]) 10 | 11 | designtable <- read.csv(designfile,header = TRUE,stringsAsFactors=FALSE, colClasses = c("character")) 12 | #Generate gene count matrix 13 | fc.files <- list.files("./",pattern = ".txt$") 14 | mclapply(unique(designtable$Group),function(x){ 15 | group_id <- x 16 | group.input.count.mat <- NULL 17 | for(pc in grep(paste0(".input_",group_id,"[.]bam"),fc.files,value = TRUE)){ 18 | pc.exp <- fread(pc,sep = "\t")[,c(1,7)] 19 | if(is.null(group.input.count.mat)){ 20 | group.input.count.mat <- pc.exp 21 | }else{ 22 | group.input.count.mat <- merge(group.input.count.mat,pc.exp,by = c("Geneid")) 23 | } 24 | } 25 | #parsing samplenames 26 | output_pattern = paste0("htseq_group_",group_id) #添加aligner 27 | fwrite(group.input.count.mat, file = paste0(output_pattern,"_input.count"), sep = "\t") 28 | }, 29 | mc.cores = THREAD_NUM 30 | ) 31 | group.mat.list = grep("htseq",list.files(path = "./",pattern = "input.count"), value = T) 32 | expression.matrix <- NULL 33 | for( file in group.mat.list ){ 34 | tmp.expression.table <- as.matrix(read.table(file, header = TRUE, row.names = 1, check.names=F)) 35 | expression.matrix <- cbind(expression.matrix, tmp.expression.table) 36 | } 37 | colnames(expression.matrix) <- as.matrix(lapply(strsplit(colnames(expression.matrix),".input"), function(x){ x[1]})) 38 | write.table(expression.matrix,file = "expression.matrix",quote=F) 39 | -------------------------------------------------------------------------------- /.github/workflows/ci_peakcalling_tools.yml: -------------------------------------------------------------------------------- 1 | name: nf-core CI (peakcalling tools) 2 | # This workflow is triggered on pushes and PRs to the repository. 
3 | # It runs the pipeline with the minimal test dataset to check that it completes without any syntax errors 4 | on: [push, pull_request] 5 | 6 | jobs: 7 | test: 8 | env: 9 | NXF_VER: ${{ matrix.nxf_ver }} 10 | NXF_ANSI_LOG: false 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | # Nextflow versions: check pipeline minimum and current latest 15 | nxf_ver: ["19.04.0", ""] 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Install Nextflow 19 | run: | 20 | wget -qO- get.nextflow.io | bash 21 | sudo mv nextflow /usr/local/bin/ 22 | - name: Pull docker image 23 | run: | 24 | docker pull kingzhuky/meripseqpipe:dev 25 | docker tag kingzhuky/meripseqpipe:dev kingzhuky/meripseqpipe:dev 26 | - name: Run pipeline with test data 27 | run: | 28 | # Run, test PeakCalling mode 'rank' of one peakcalling tool 'meyer' 29 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_metpeak --skip_macs2 --skip_matk -resume 30 | # Run, test PeakCalling mode 'rank' of one peakcalling tool 'metpeak' 31 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_meyer --skip_macs2 --skip_matk -resume 32 | # Run, test PeakCalling mode 'rank' of one peakcalling tool 'matk' 33 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_metpeak --skip_macs2 --skip_meyer -resume 34 | # Run, test PeakCalling mode 'rank' of one peakcalling tool 'macs2' 35 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_metpeak --skip_meyer --skip_matk -resume 36 | -------------------------------------------------------------------------------- /.github/workflows/branch.yml: -------------------------------------------------------------------------------- 1 | name: nf-core branch protection 2 | # This workflow is triggered on PRs to the master branch of the repository 3 | # It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev` 4 | on: 5 | pull_request: 6 | branches: [master] 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | steps: 12 | # PRs to the nf-core repo master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches 13 | - name: Check PRs 14 | if: github.repository == 'nf-core/meripseqpipe' 15 | run: | 16 | { [[ $(git remote get-url origin) == *nf-core/meripseqpipe ]] && [[ ${GITHUB_HEAD_REF} = "dev" ]]; } || [[ ${GITHUB_HEAD_REF} == "patch" ]] 17 | 18 | # If the above check failed, post a comment on the PR explaining the failure 19 | # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets 20 | - name: Post PR comment 21 | if: failure() 22 | uses: mshick/add-pr-comment@v1 23 | with: 24 | message: | 25 | Hi @${{ github.event.pull_request.user.login }}, 26 | 27 | It looks like this pull-request has been made against the ${{github.event.pull_request.head.repo.full_name}} `master` branch. 28 | The `master` branch on nf-core repositories should always contain code from the latest release. 29 | Because of this, PRs to `master` are only allowed if they come from the ${{github.event.pull_request.head.repo.full_name}} `dev` branch. 30 | 31 | You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page. 32 | 33 | Thanks again for your contribution! 
34 | repo-token: ${{ secrets.GITHUB_TOKEN }} 35 | allow-repeats: false -------------------------------------------------------------------------------- /.github/workflows/ci_peakcalling_methods.yml: -------------------------------------------------------------------------------- 1 | name: nf-core CI (peakcalling methods) 2 | # This workflow is triggered on pushes and PRs to the repository. 3 | # It runs the pipeline with the minimal test dataset to check that it completes without any syntax errors 4 | on: [push, pull_request] 5 | 6 | jobs: 7 | test: 8 | env: 9 | NXF_VER: ${{ matrix.nxf_ver }} 10 | NXF_ANSI_LOG: false 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | # Nextflow versions: check pipeline minimum and current latest 15 | nxf_ver: ["19.04.0", ""] 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Install Nextflow 19 | run: | 20 | wget -qO- get.nextflow.io | bash 21 | sudo mv nextflow /usr/local/bin/ 22 | - name: Pull docker image 23 | run: | 24 | docker pull kingzhuky/meripseqpipe:dev 25 | docker tag kingzhuky/meripseqpipe:dev kingzhuky/meripseqpipe:dev 26 | - name: Run pipeline with test data 27 | run: | 28 | # Run, test PeakCalling mode 'group' of one peakcalling tool 'macs2' 29 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_metpeak --skip_meyer --skip_matk --peakCalling_mode group -resume 30 | # Run, test PeakCalling mode 'mspc' of one peakcalling tool 'macs2' 31 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_metpeak --skip_meyer --skip_matk --peakCalling_mode group --peakMerged_mode mspc -resume 32 | # Run, test PeakCalling mode 'macs2' of one peakcalling tool 'macs2' 33 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_metpeak --skip_meyer --skip_matk --peakMerged_mode macs2 -resume 34 | # Run, test PeakCalling mode 'rank' of four peakcalling tools 35 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker -resume -------------------------------------------------------------------------------- /bin/QNB_quantification.R: -------------------------------------------------------------------------------- 1 | # Rscript QNB_quantification.R designfile 2 | library("QNB") 3 | args <- commandArgs(T) 4 | designfile <- args[1] 5 | ## read designfile to get the name of samples 6 | designtable <- read.csv(designfile,head = TRUE,stringsAsFactors=FALSE, colClasses = c("character")) 7 | filelist = list.files(path = "./",pattern = ".count") 8 | rpkm_peaks_list <- NULL 9 | for(sample_id in designtable$Sample_ID){ 10 | ## generate the dataframe of peak count 11 | input.count <- c() 12 | input.names <- c() 13 | input.samples <- c() 14 | for(input in grep(sample_id, grep("[.]input",filelist,value = TRUE), value = T)){ 15 | input.exp <- read.table(input,header=T,sep="\t",row.names= NULL,quote = "") 16 | input.count <- cbind(input.count,input.exp[,5]) 17 | input.names <- input.exp[,4] #peaks name 18 | input.samples <- c(input.samples,input) #samples name 19 | } 20 | colnames(input.count) <- input.samples 21 | rownames(input.count) <- input.names 22 | ip.count <- c() 23 | ip.names <- c() 24 | ip.samples <- c() 25 | for(ip in grep(sample_id, grep("[.]ip",filelist,value = TRUE), value = T)){ 26 | ip.exp <- read.table(ip,header=T,sep="\t",row.names= NULL,quote = "") 27 | ip.count <- cbind(ip.count,ip.exp[,5]) 28 | ip.names <- ip.exp[,4] #peaks name 29 | ip.samples <- c(ip.samples,ip) #samples name 30 | } 31 | colnames(ip.count) <- ip.samples 32 | rownames(ip.count) <- ip.names 33 | ## Run the QNB to generate quantificative 
value per sample 34 | result <- qnbtest(ip.count, ip.count, input.count, input.count, mode="blind") 35 | sample_quantification <- as.matrix(result$p.treated) 36 | colnames(sample_quantification) <- sample_id 37 | rpkm_peaks_list <- cbind(rpkm_peaks_list,sample_quantification) 38 | rownames(rpkm_peaks_list) <- rownames(ip.count) 39 | } 40 | write.table(rpkm_peaks_list,sep = "\t",file = "QNB_quantification.matrix",quote = F) 41 | -------------------------------------------------------------------------------- /bin/scrape_software_versions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from collections import OrderedDict 4 | import re 5 | 6 | # TODO nf-core: Add additional regexes for new tools in process get_software_versions 7 | regexes = { 8 | 'nf-core/meripseqpipe': ['v_pipeline.txt', r"(\S+)"], 9 | 'Nextflow': ['v_nextflow.txt', r"(\S+)"], 10 | 'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"], 11 | 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], 12 | } 13 | results = OrderedDict() 14 | results['nf-core/meripseqpipe'] = 'N/A' 15 | results['Nextflow'] = 'N/A' 16 | results['FastQC'] = 'N/A' 17 | results['MultiQC'] = 'N/A' 18 | 19 | # Search each file using its regex 20 | for k, v in regexes.items(): 21 | try: 22 | with open(v[0]) as x: 23 | versions = x.read() 24 | match = re.search(v[1], versions) 25 | if match: 26 | results[k] = "v{}".format(match.group(1)) 27 | except IOError: 28 | results[k] = False 29 | 30 | # Remove software set to false in results 31 | for k in list(results): 32 | if not results[k]: 33 | del(results[k]) 34 | 35 | # Dump to YAML 36 | print (''' 37 | id: 'software_versions' 38 | section_name: 'nf-core/meripseqpipe Software Versions' 39 | section_href: 'https://github.com/nf-core/meripseqpipe' 40 | plot_type: 'html' 41 | description: 'are collected at run time from the software output.' 42 | data: | 43 |     <dl class="dl-horizontal"> 44 | ''') 45 | for k,v in results.items(): 46 | print("        <dt>{}</dt><dd><samp>{}</samp></dd>".format(k,v)) 47 | print ("    </dl>") 48 | 49 | # Write out regexes as csv file: 50 | with open('software_versions.csv', 'w') as f: 51 | for k,v in results.items(): 52 | f.write("{}\t{}\n".format(k,v)) 53 | -------------------------------------------------------------------------------- /docs/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # MeRIPseqPipe: Troubleshooting 2 | 3 | ## Input files not found 4 | 5 | If no files, only one input file, or only read one and not read two are picked up, then something is wrong with your input file declaration: 6 | 7 | 1. The path must be enclosed in quotes (`'` or `"`) 8 | 2. The path must have at least one `*` wildcard character. This applies even if you are only running one paired-end sample. 9 | 3. When using the pipeline with paired-end data, the path must use `{1,2}` or `{R1,R2}` notation to specify read pairs. 10 | 4. If you are running single-end data, make sure to specify `--singleEnd` 11 | 12 | If the pipeline can't find your files then you will get the following error: 13 | 14 | ```bash 15 | ERROR ~ Cannot find any reads matching: *{1,2}.fastq.gz 16 | ``` 17 | 18 | Note that if your sample name is "messy" then you have to be very particular with your glob specification. A file name like `L1-1-D-2h_S1_L002_R1_001.fastq.gz` can be difficult enough for a human to read. Specifying `*{1,2}*.gz` won't give you what you want, whilst `*{R1,R2}*.gz` will. 19 | 20 | ## Data organization 21 | 22 | The pipeline can't take a list of multiple input files - it takes a glob expression. If your input files are scattered in different paths then we recommend that you generate a directory with symlinked files. If running in paired-end mode please make sure that your files are sensibly named so that they can be properly paired. See the previous point. 23 | 24 | ## Extra resources and getting help 25 | 26 | If you still have an issue with running the pipeline then feel free to contact us. 27 | Have a look at the [pipeline website](https://github.com/nf-core/m6APipe) to find out how. 28 | 29 | If you have problems that are related to Nextflow and not our pipeline then check out the [Nextflow gitter channel](https://gitter.im/nextflow-io/nextflow) or the [google group](https://groups.google.com/forum/#!forum/nextflow). 30 | -------------------------------------------------------------------------------- /.github/workflows/awstest.yml: -------------------------------------------------------------------------------- 1 | name: nf-core AWS test 2 | # This workflow is triggered on push to the master branch. 
3 | # It runs the -profile 'test' on AWS batch 4 | 5 | on: 6 | push: 7 | branches: 8 | - master 9 | 10 | jobs: 11 | run-awstest: 12 | name: Run AWS tests 13 | if: github.repository == 'nf-core/meripseqpipe' 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Setup Miniconda 17 | uses: goanpeca/setup-miniconda@v1.0.2 18 | with: 19 | auto-update-conda: true 20 | python-version: 3.7 21 | - name: Install awscli 22 | run: conda install -c conda-forge awscli 23 | - name: Start AWS batch job 24 | # TODO nf-core: You can customise CI pipeline run tests as required 25 | # For example: adding multiple test runs with different parameters 26 | # Remember that you can parallelise this by using strategy.matrix 27 | env: 28 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 29 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 30 | TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }} 31 | AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }} 32 | AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} 33 | AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} 34 | run: | 35 | aws batch submit-job \ 36 | --region eu-west-1 \ 37 | --job-name nf-core-test \ 38 | --job-queue $AWS_JOB_QUEUE \ 39 | --job-definition $AWS_JOB_DEFINITION \ 40 | --container-overrides '{"command": ["nf-core/meripseqpipe", "-r '"${GITHUB_SHA}"' -profile test --outdir s3://'"${AWS_S3_BUCKET}"'/test/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/test/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}' 41 | -------------------------------------------------------------------------------- /.github/workflows/awsfulltest.yml: -------------------------------------------------------------------------------- 1 | name: nf-core AWS full size tests 2 | # This workflow is triggered on published releases. 
3 | # It runs the -profile 'test_full' on AWS batch 4 | 5 | on: 6 | release: 7 | types: [published] 8 | 9 | jobs: 10 | run-awstest: 11 | name: Run AWS full tests 12 | if: github.repository == 'nf-core/meripseqpipe' 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Setup Miniconda 16 | uses: goanpeca/setup-miniconda@v1.0.2 17 | with: 18 | auto-update-conda: true 19 | python-version: 3.7 20 | - name: Install awscli 21 | run: conda install -c conda-forge awscli 22 | - name: Start AWS batch job 23 | # TODO nf-core: You can customise AWS full pipeline tests as required 24 | # Add full size test data (but still relatively small datasets for few samples) 25 | # on the `test_full.config` test runs with only one set of parameters 26 | # Then specify `-profile test_full` instead of `-profile test` on the AWS batch command 27 | env: 28 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 29 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 30 | TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }} 31 | AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }} 32 | AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} 33 | AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} 34 | run: | 35 | aws batch submit-job \ 36 | --region eu-west-1 \ 37 | --job-name nf-core-test \ 38 | --job-queue $AWS_JOB_QUEUE \ 39 | --job-definition $AWS_JOB_DEFINITION \ 40 | --container-overrides '{"command": ["nf-core/meripseqpipe", "-r '"${GITHUB_SHA}"' -profile test --outdir s3://'"${AWS_S3_BUCKET}"'/test/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/test/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}' 41 | -------------------------------------------------------------------------------- /bin/m6Aprediction.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #bash m6Aprediction.sh 3 | ## $1 argv 1 : matk_jar 4 | ## $2 argv 2 : designfile 5 | ## $3 argv 3 : fasta file 6 | ## $4 argv 4 : gtf file 7 | matk_jar=$1 8 | designfile=$2 9 | fasta_file=$3 10 | gtf_file=$4 11 | 12 | ### check if the file matk.jar exists 13 | if [ ! -f "$matk_jar" ]; then 14 | echo "Cannot find matk.jar. Please check the param of matk_jar" 1>&2 15 | exit 1 16 | fi 17 | 18 | faToTwoBit ${fasta_file} ${fasta_file/.fa/.2bit} 19 | awk -F "\t" '$3=="gene"{print }' $gtf_file > tmp.$gtf_file 20 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 21 | for group_id in $group_list 22 | do 23 | { 24 | bedfile=$(ls *merged_group_${group_id}.bed) 25 | ip_bam_file_array=$(echo *.ip_${group_id}*.bam | awk '{OFS=",";ORS=""}{for(x=1;x tmp.m6A_sites_${group_id}.bed 36 | } 37 | done 38 | wait 39 | cat tmp.m6A_sites*.bed | sortBed | mergeBed -s -c 4,6,7,8 -o first,first,collapse,collapse > tmp.m6A_sites_merged.bed 40 | awk -v gap=25 '{print $1"\t"$2-gap"\t"$3+gap"\t*\t*\t"$5}' tmp.m6A_sites_merged.bed | bedtools getfasta -s -fi ${fasta_file} -bed - | awk '$0!~">"{print $0}' > tmp.m6A_sites_merged.fa 41 | paste tmp.m6A_sites_merged.bed tmp.m6A_sites_merged.fa > m6A_sites_merged.bed 42 | rm tmp.* 43 | echo "Prediction sites of m6A done" -------------------------------------------------------------------------------- /bin/get_htseq_matrix.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript get_htseq_matrix.R designfile THREAD_NUM eg. 
Rscript get_htseq_matrix.R designfile_single.txt 10 3 | ## designfile: filename, control_or_treated, input_or_ip, group(default 0 is CONTROL_SITUATION else are TREATED_SITUATION) 4 | 5 | library(parallel) 6 | args<-commandArgs(T) 7 | designfile <- args[1] 8 | THREAD_NUM <- as.numeric(args[2]) 9 | 10 | designtable <- read.csv(designfile,header = TRUE,stringsAsFactors=FALSE, colClasses = c("character")) 11 | #Generate gene count matrix 12 | htseq.files <- list.files("./",pattern = ".txt") 13 | mclapply(unique(designtable$Group),function(x){ 14 | group_id <- x 15 | trans.htseq.input.count <- c() 16 | pc.names <- c() 17 | pc.samples <- c() 18 | for(pc in grep(paste0(".input_",group_id,"[.]bam"),htseq.files,value = TRUE)){ 19 | pc.exp <- read.table(pc,header=F,sep="\t",row.names=1,quote = "") 20 | trans.htseq.input.count <- cbind(trans.htseq.input.count,pc.exp[,1]) 21 | pc.names <- rownames(pc.exp) #genes name 22 | pc.samples <- c(pc.samples,pc) #samples name 23 | } 24 | rownames(trans.htseq.input.count) <- pc.names 25 | trans.htseq.input.count <- as.matrix(trans.htseq.input.count[c(-nrow(trans.htseq.input.count):-(nrow(trans.htseq.input.count)-4)),]) 26 | colnames(trans.htseq.input.count) <- pc.samples 27 | #parsing samplenames 28 | output_pattern = paste0("htseq_group_",group_id) #添加aligner 29 | write.table(trans.htseq.input.count, file = paste0(output_pattern,"_input.count") , sep ="\t", row.names = TRUE, col.names = TRUE, quote = FALSE) 30 | }, 31 | mc.cores = THREAD_NUM 32 | ) 33 | htseq.filelist = grep("htseq",list.files(path = "./",pattern = "input.count"), value = T) 34 | expression.matrix <- NULL 35 | for( file in htseq.filelist ){ 36 | tmp.expression.table <- as.matrix(read.table(file, header = TRUE, row.names = 1, check.names=F)) 37 | expression.matrix <- cbind(expression.matrix, tmp.expression.table) 38 | } 39 | colnames(expression.matrix) <- as.matrix(lapply(strsplit(colnames(expression.matrix),".input"), function(x){ x[1]})) 40 | write.table(expression.matrix,file = "expression.matrix",quote=F) -------------------------------------------------------------------------------- /bin/m6Am.R: -------------------------------------------------------------------------------- 1 | #Find m6Am 5'UTR peaks 2 | anno_5UTR=overlap.anno[which(overlap.anno$Gene.site=="5UTR"),] 3 | 4 | gtf.temp=fread("/data/database/hg38/GENCODE/gencode.v25.annotation.gtf",sep="\t",skip = 5,data.table = F) 5 | gtf.temp=cbind(gtf.temp,Transcript.id=strsplit2(strsplit2(gtf.temp$V9,split = "transcript_id ")[,2],split = ";")[,1]) 6 | gtf.temp$Transcript.id=gsub("\"","",gtf.temp$Transcript.id) 7 | 8 | write.table(strsplit2(anno_5UTR$Peak.id,split=":|-"),"UTR5.peak.bed",row.names = F,col.names = F,quote = F,sep="\t") 9 | system("fastaFromBed -fi /data/database/hg38/genome.fa -bed UTR5.peak.bed -fo UTR5.peak.fa") 10 | system("/data/software/homer/bin/homer2 find -i UTR5.peak.fa -m /data/xingyang/m6A_zhengjian/BCA.motif -p 5 > /data/xingyang/m6A_zhengjian/analysis/BCA_peak_offset.txt") 11 | BCA_in_5UTR_offset=read.table("analysis/BCA_peak_offset.txt",header=F) 12 | 13 | anno_5UTR=overlap.anno[unique(BCA_in_5UTR_offset$V1),] 14 | anno_5UTR=merge(gtf.temp,anno_5UTR,by="Transcript.id",all.y=T) 15 | anno_5UTR=anno_5UTR[which(anno_5UTR$V3=="UTR"),] 16 | anno_5UTR$temp.start=anno_5UTR$V4-anno_5UTR$Start 17 | anno_5UTR$temp.end=anno_5UTR$V5-anno_5UTR$Start 18 | anno_5UTR[which(anno_5UTR$temp.start>0),"temp.start"]=1 19 | anno_5UTR[which(anno_5UTR$temp.start<0),"temp.start"]=(-1) 20 | 
anno_5UTR[which(anno_5UTR$temp.end>0),"temp.end"]=1 21 | anno_5UTR[which(anno_5UTR$temp.end<0),"temp.end"]=(-1) 22 | anno_5UTR=anno_5UTR[which((anno_5UTR$temp.start*anno_5UTR$temp.end)<=0),] 23 | anno_5UTR.bed=cbind(anno_5UTR$V1,anno_5UTR$V4,anno_5UTR$V5,anno_5UTR$Peak.id,".",anno_5UTR$V7) 24 | write.table(anno_5UTR.bed,"UTR5.peak.bed",row.names = F,col.names = F,quote = F,sep="\t") 25 | system("fastaFromBed -fi /data/database/hg38/genome.fa -bed UTR5.peak.bed -s -name -fo UTR5.peak.fa") 26 | 27 | temp.utr5=read.table("UTR5.peak.fa",sep="\n") 28 | temp.utr5=cbind(temp.utr5,substr(temp.utr5[,1],1,1)) 29 | 30 | i=2 31 | n=nrow(temp.utr5) 32 | temp.utr5=cbind(temp.utr5,type=NA) 33 | while(i<=n){ 34 | if(temp.utr5[i,2]=="A"){ 35 | temp.utr5[c(i-1,i),"type"]="m6Am" 36 | } 37 | i=i+2 38 | } 39 | m6Am=na.omit(temp.utr5) 40 | m6Am=m6Am[grep(">",m6Am$V1),] 41 | m6Am=gsub(">","",m6Am$V1) 42 | m6Am=strsplit2(m6Am,split="[(]")[,1] -------------------------------------------------------------------------------- /bin/intersec.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | use strict; 3 | use Getopt::Long; 4 | 5 | 6 | my $file1; 7 | my $column1; 8 | my $file2; 9 | my $column2; 10 | my $type; 11 | GetOptions( 12 | 'a=s' => \$file1, 13 | 'na=i' => \$column1, 14 | 'b=s' => \$file2, 15 | 'nb=i' => \$column2, 16 | 't=s' => \$type 17 | ); 18 | open FH1, $file1 or die "can not open $file1: $!"; 19 | 20 | my %id1; 21 | while(){ 22 | chomp; 23 | $_=~s/"//g; 24 | my @field=split /\s+/; 25 | if(!exists $id1{$field[$column1-1]}){ 26 | $id1{$field[$column1-1]}=$_; 27 | }else{ 28 | $id1{$field[$column1-1]}.="\n".$_; 29 | } 30 | } 31 | close(FH1); 32 | open FH2,$file2 or die "can not open $file2:$!"; 33 | my %id2; 34 | while(){ 35 | chomp; 36 | $_=~s/"//g; 37 | my @field=split /\s+/; 38 | if(!exists $id2{$field[$column2-1]}){ 39 | $id2{$field[$column2-1]}=$_; 40 | }else{ 41 | $id2{$field[$column2-1]}.="\n".$_; 42 | } 43 | } 44 | 45 | if($type eq "ua"){ 46 | foreach my $a (keys %id1){ 47 | if(!exists $id2{$a}){ 48 | print $id1{$a}."\n"; 49 | } 50 | } 51 | } 52 | 53 | if($type eq "ub"){ 54 | foreach my $b (keys %id2){ 55 | if(!exists $id1{$b}){ 56 | print $id2{$b}."\n"; 57 | } 58 | } 59 | } 60 | 61 | if($type eq "d"){ 62 | foreach my $b (keys %id2){ 63 | if(exists $id1{$b}){ 64 | print $b."\n"; 65 | } 66 | } 67 | 68 | } 69 | 70 | if($type eq "da"){ 71 | foreach my $b (keys %id2){ 72 | if(exists $id1{$b}){ 73 | print $id1{$b}."\n"; 74 | } 75 | } 76 | } 77 | if($type eq "db"){ 78 | foreach my $b (keys %id2){ 79 | if(exists $id1{$b}){ 80 | print $id2{$b}."\n"; 81 | } 82 | } 83 | } 84 | if($type eq "dab"){ 85 | foreach my $b (keys %id2){ 86 | if(exists $id1{$b}){ 87 | my @tmp1=split "\n",$id1{$b}; 88 | my @tmp2=split "\n",$id2{$b}; 89 | foreach my $t1 (@tmp1){ 90 | foreach my $t2 (@tmp2){ 91 | print $t1."\t".$t2."\n"; 92 | } 93 | } 94 | #print $id1{$b}."\t".$id2{$b}."\n"; 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /bin/edgeR.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript edgeR.R eg. 
Rscript edgeR.R designfile_single.txt T_vs_N 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | ### compare_str: Compairision design (eg: A_vs_B) 5 | library("edgeR") 6 | args<-commandArgs(T) 7 | designfile <- args[1] 8 | compare_str <- as.character(args[2]) 9 | 10 | designtable <- read.csv(designfile,head = TRUE,stringsAsFactors=FALSE, colClasses = c("character")) 11 | # Running edgeR by compare.file 12 | ## while there are only 2 groups, running edgeR without compare.file 13 | if(length(unique(designtable$Group)) < 2){ 14 | stop( "The count of Group is less than two, please check your designfile.") 15 | }else if( compare_str == "two_group" ){ 16 | # Running edgeR without compare_str beacause of only two groups 17 | ## Combine expression matrix 18 | group_id_1 <- unique(designtable$Group)[1] 19 | group_id_2 <- unique(designtable$Group)[2] 20 | }else{ 21 | # Running edgeR with compare_str 22 | ## Combine expression matrix 23 | group_id_1 <- strsplit(as.character(compare_str), "_vs_")[[1]][1] 24 | group_id_2 <- strsplit(as.character(compare_str), "_vs_")[[1]][2] 25 | } 26 | control_database = read.table(paste0("htseq_group_", group_id_1, "_input.count"), header = TRUE, row.names = 1) 27 | treated_database = read.table(paste0("htseq_group_", group_id_2, "_input.count"), header = TRUE, row.names = 1) 28 | combined_database <- cbind(control_database,treated_database) 29 | group <- factor(c(rep(group_id_1,ncol(control_database)), rep(group_id_2,ncol(treated_database)))) #setting factors 30 | y <- DGEList(counts=combined_database,group=group) 31 | rownames(y) <- rownames(combined_database) 32 | y <- calcNormFactors(y) 33 | design <- model.matrix(~group) 34 | y <- estimateDisp(y,design) 35 | #To perform likelihood ratio tests: 36 | fit <- glmFit(y,design) 37 | lrt <- glmLRT(fit,coef=2) 38 | topTags(lrt) 39 | ### set output_name 40 | lrt$table$padj <- p.adjust(lrt$table$PValue,"BH") 41 | lrt.res <- lrt$table[order(lrt$table$padj),] 42 | colnames(lrt.res) <- c("log2FoldChange","logCPM","LR","pvalue","padj") 43 | output_name <- paste0("edgeR_group_",group_id_1, "_",group_id_2) 44 | write.csv(combined_database, file = paste0(output_name,".matirx") ) 45 | write.csv(lrt.res, file = paste0(output_name, ".csv")) 46 | -------------------------------------------------------------------------------- /bin/bedtools_quantification.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript bedtools_quantification.R 3 | ### the content of bam_stat_summary_file: example.bam TOTAL_READS 4 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 5 | args <- commandArgs(T) 6 | designfile <- args[1] 7 | bam_stat_summary <- args[2] 8 | designtable <- read.csv(designfile,head = TRUE,stringsAsFactors=FALSE, colClasses = c("character")) 9 | bam_stat_table <- read.table(bam_stat_summary,row.names = 1) 10 | filelist =list.files(path = "./",pattern = ".count") 11 | ## Generate the quantificative value of peaks referred to RPKM 12 | rpkm_peaks_list <- NULL 13 | #rpkm_peaks_list1 <- NULL 14 | #rpkm_peaks_list2 <- NULL 15 | for(sample_id in designtable$Sample_ID){ 16 | input_count_file <- grep(paste0("[.]",sample_id,"[.]input"),filelist,value = TRUE) 17 | input_count_table <- read.table(file = input_count_file, sep = "\t", row.names = NULL,header = T) 18 | bam_stat_index = grep(paste0("^",sample_id,"[.]input"),rownames(bam_stat_table)) 19 | input_rpkm = apply(input_count_table,1,function(x) 
(as.numeric(x[5])/(as.numeric(x[3])-as.numeric(x[2]))*1000/bam_stat_table[bam_stat_index,]*1000000)) 20 | 21 | ip_count_file <- grep(paste0("[.]",sample_id,"[.]ip"),filelist,value = TRUE) 22 | ip_count_table <- read.table(file = ip_count_file, sep = "\t", row.names = NULL, header = T) 23 | bam_stat_index = grep(paste0("^",sample_id,"[.]ip"),rownames(bam_stat_table)) 24 | ip_rpkm = apply(ip_count_table,1,function(x) (as.numeric(x[5])/(as.numeric(x[3])-as.numeric(x[2]))*1000/bam_stat_table[bam_stat_index,]*1000000)) 25 | 26 | rpkm <- as.matrix((ip_rpkm+1)/(input_rpkm+1)) 27 | #rpkm1<- as.matrix((ip_rpkm)/(input_rpkm+1)) 28 | #rpkm2<- as.matrix((ip_rpkm)/(input_rpkm+ip_rpkm)) 29 | colnames(rpkm)[1] <- sample_id 30 | #colnames(rpkm1)[1] <- sample_id 31 | #colnames(rpkm2)[1] <- sample_id 32 | rpkm_peaks_list <- cbind(rpkm_peaks_list,rpkm) 33 | #rpkm_peaks_list1 <- cbind(rpkm_peaks_list1,rpkm1) 34 | #rpkm_peaks_list2 <- cbind(rpkm_peaks_list2,rpkm2) 35 | } 36 | rownames(rpkm_peaks_list) <- input_count_table[,4] 37 | #rownames(rpkm_peaks_list1) <- input_count_table[,4] 38 | #rownames(rpkm_peaks_list2) <- input_count_table[,4] 39 | write.table(rpkm_peaks_list,sep = "\t",file = "bedtools_quantification.matrix",quote = F) 40 | #write.table(rpkm_peaks_list1,sep = "\t",file = "bedtools_quantification.inputadd.matrix",quote = F) 41 | #write.table(rpkm_peaks_list2,sep = "\t",file = "bedtools_quantification.alladd.matrix",quote = F) 42 | -------------------------------------------------------------------------------- /docs/configuration/reference_genomes.md: -------------------------------------------------------------------------------- 1 | # nf-core/m6APipe: Reference Genomes Configuration 2 | 3 | The nf-core/m6APipe pipeline needs a reference genome for alignment and annotation. 4 | 5 | These paths can be supplied on the command line at run time (see the [usage docs](../usage.md)), 6 | but for convenience it's often better to save these paths in a nextflow config file. 7 | See below for instructions on how to do this. 8 | Read [Adding your own system](adding_your_own.md) to find out how to set up custom config files. 9 | 10 | ## Adding paths to a config file 11 | 12 | Specifying long paths every time you run the pipeline is a pain. 13 | To make this easier, the pipeline comes configured to understand reference genome keywords which correspond to preconfigured paths, meaning that you can just specify `--genome ID` when running the pipeline. 14 | 15 | Note that this genome key can also be specified in a config file if you always use the same genome. 16 | 17 | To use this system, add paths to your config file using the following template: 18 | 19 | ```nextflow 20 | params { 21 | genomes { 22 | 'YOUR-ID' { 23 | fasta = '/genome.fa' 24 | } 25 | 'OTHER-GENOME' { 26 | // [..] 27 | } 28 | } 29 | // Optional - default genome. Ignored if --genome 'OTHER-GENOME' specified on command line 30 | genome = 'YOUR-ID' 31 | } 32 | ``` 33 | 34 | You can add as many genomes as you like as long as they have unique IDs. 35 | 36 | ## illumina iGenomes 37 | 38 | To make the use of reference genomes easier, illumina has developed a centralised resource called [iGenomes](https://support.illumina.com/sequencing/sequencing_software/igenome.html). 39 | Multiple reference index types are held together with consistent structure for multiple genomes. 40 | 41 | We have put a copy of iGenomes up onto AWS S3 hosting and this pipeline is configured to use this by default. 
42 | The hosting fees for AWS iGenomes are currently kindly funded by a grant from Amazon. 43 | The pipeline will automatically download the required reference files when you run the pipeline. 44 | For more information about the AWS iGenomes, see 45 | 46 | Downloading the files takes time and bandwidth, so we recommend making a local copy of the iGenomes resource. 47 | Once downloaded, you can customise the variable `params.igenomes_base` in your custom configuration file to point to the reference location. 48 | For example: 49 | 50 | ```nextflow 51 | params.igenomes_base = '/path/to/data/igenomes/' 52 | ``` 53 | -------------------------------------------------------------------------------- /bin/DESeq2.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript DESeq2.R eg. Rscript DESeq2.R designfile_single.txt T_vs_N 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | ### compare_str: Compairision design (eg: A_vs_B) 5 | 6 | library(DESeq2) 7 | args<-commandArgs(T) 8 | designfile <- args[1] 9 | compare_str <- as.character(args[2]) 10 | 11 | designtable <- read.csv(designfile,head = TRUE,stringsAsFactors=FALSE, colClasses = c("character")) 12 | # Running DEseq2 by compare_str 13 | ## while there are only 2 groups, running DEseq2 without compare_str 14 | if( length(unique(designtable$Group)) < 2 ){ 15 | stop( "The count of Group is less than two, please check your designfile.") 16 | }else if( compare_str == "two_group" ){ 17 | # Running DESeq2 without compare_str beacause of only two groups 18 | ## Combine expression matrix 19 | group_id_1 <- unique(designtable$Group)[1] 20 | group_id_2 <- unique(designtable$Group)[2] 21 | }else{ 22 | # Running DESeq2 with compare_str 23 | ## Combine expression matrix 24 | group_id_1 <- strsplit(as.character(compare_str), "_vs_")[[1]][1] 25 | group_id_2 <- strsplit(as.character(compare_str), "_vs_")[[1]][2] 26 | } 27 | control_database = read.table(paste0("htseq_group_", group_id_1, "_input.count"), header = TRUE, row.names = 1, check.names = FALSE) 28 | treated_database = read.table(paste0("htseq_group_", group_id_2, "_input.count"), header = TRUE, row.names = 1, check.names = FALSE) 29 | combined_database <- cbind(control_database,treated_database) 30 | condition <- factor(c(rep(group_id_1,ncol(control_database)), rep(group_id_2,ncol(treated_database)))) #setting factors 31 | ### assign gene names 32 | colData <- data.frame(row.names=colnames(combined_database), group = condition) 33 | dds <- DESeqDataSetFromMatrix(countData = combined_database,colData = colData,design = ~ group) 34 | rownames(dds) <- rownames(combined_database) 35 | #dds <- dds[ rowSums(counts(dds)) > 1, ] 36 | dds <- DESeq(dds) 37 | ## FoldChange = group_id_2 / group_id_1 38 | res <- results(object = dds, contrast = c("group",group_id_2,group_id_1)) 39 | table(res$padj <0.05) 40 | res <- res[order(res$padj),] 41 | #resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE) 42 | #resdata2=resdata[resdata$log2FoldChange > 1|resdata$log2FoldChange < -1, ] 43 | ### set output_name 44 | output_name <- paste0("DESeq2_group_",group_id_1, "_",group_id_2) 45 | write.csv(res, file = paste0(output_name, ".csv")) 46 | #write.csv(resdata2,file = paste0(output_name, "_log2.csv"),row.names =FALSE) 47 | -------------------------------------------------------------------------------- /bin/MATK_diffm6A.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ## MATK_diffpeakCalling.sh 3 | ## $1 argv 1 : matk_jar 4 | ## $2 argv 2 : designfile 5 | ## $3 argv 3 : gtf file 6 | ## $4 argv 4 : compare_str 7 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 8 | ### compare_str: Compairision design (eg: A_vs_B) 9 | matk_jar=$1 10 | designfile=$2 11 | gtf_file=$3 12 | compare_str=$4 13 | merged_bed=$5 14 | 15 | # setting the function of Running the quantification mode of MATK by two names of groups 16 | function matk_diffm6a_by_two_id() 17 | { 18 | group_id_1=$1 19 | group_id_2=$2 20 | matk_jar=$3 21 | gtf_file=$4 22 | control_ip_bam_file_array=$(echo *ip_${group_id_1}*.bam | awk '{OFS=",";ORS=""}{for(x=1;x1{print $1":"$2"-"$3,$4,$5,$6,$7,log($8)/log(2),$9,$10}' tmp.${group_id_1}_${group_id_2}.txt > MATK_diffm6A_${group_id_1}_${group_id_2}.txt 37 | } 38 | 39 | if [ "$compare_str" != "two_group" ]; then 40 | # Running MATK quantification with compare_str 41 | group_id_1=$(echo $compare_str | awk 'BEGIN{FS="_vs_"}{print $1}') 42 | group_id_2=$(echo $compare_str | awk 'BEGIN{FS="_vs_"}{print $2}') 43 | matk_diffm6a_by_two_id $group_id_1 $group_id_2 $matk_jar $gtf_file 44 | else 45 | # Running MATK quantification without compare_str beacause of only two groups 46 | echo "no compare file" 47 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS="\t"}{print $0}') 48 | group_id_1=$(echo $group_list | awk 'BEGIN{FS="\t"}{print $1}') 49 | group_id_2=$(echo $group_list | awk 'BEGIN{FS="\t"}{print $2}') 50 | matk_diffm6a_by_two_id $group_id_1 $group_id_2 $matk_jar $gtf_file 51 | fi 52 | wait 53 | echo "diffMATK done" -------------------------------------------------------------------------------- /assets/email_template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | nf-core/meripseqpipe Pipeline Report 10 | 11 | 12 |
13 | 14 | 15 | 16 |

nf-core/meripseqpipe v${version}

17 |

Run Name: $runName

18 | 19 | <% if (!success){ 20 | out << """ 21 |
22 |

nf-core/meripseqpipe execution completed unsuccessfully!

23 |

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

24 |

The full error message was:

25 |
${errorReport}
26 |
27 | """ 28 | } else { 29 | out << """ 30 |
31 | nf-core/meripseqpipe execution completed successfully! 32 |
33 | """ 34 | } 35 | %> 36 | 37 |

The workflow was completed at $dateComplete (duration: $duration)

38 |

The command used to launch the workflow was as follows:

39 |
$commandLine
40 | 41 |

Pipeline Configuration:

42 | 43 | 44 | <% out << summary.collect{ k,v -> "" }.join("\n") %> 45 | 46 |
$k
$v
47 | 48 |

nf-core/meripseqpipe

49 |

https://github.com/nf-core/meripseqpipe

50 | 51 |
52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /bin/create_IGV_js.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | fasta=$1 3 | gtf=$2 4 | merged_peak_file=$3 5 | designfile=$4 6 | echo "Start to generate IGV.js" 7 | 8 | ## setting tmp files' name 9 | bedgraph_tracks_file=tmp.bedgraph.tracks 10 | peaks_tracks_file=tmp.peaks.tracks 11 | 12 | ## combined tracks of bedgraph 13 | sampleinfo_list=$(awk 'BEGIN{FS=","}NR>1{print $1","$4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 14 | for sample_group_id in ${sampleinfo_list} 15 | do 16 | { 17 | sample_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $1}') 18 | group_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $2}') 19 | bedgraph_input_file=$(ls ${sample_id}.input_*.igv.bedgraph) 20 | bedgraph_ip_file=$(ls ${sample_id}.ip_*.igv.bedgraph) 21 | cat >> ${bedgraph_tracks_file} << EOF 22 | { 23 | url: '${bedgraph_input_file}', 24 | name: '${sample_id}.input', 25 | color: 'rgb(200,0,0)', 26 | type: "wig", 27 | sourceType: "file", 28 | autoscaleGroup: 'group_${group_id}.${sample_id}' 29 | }, 30 | { 31 | url: '${bedgraph_ip_file}', 32 | name: '${sample_id}.ip', 33 | type: "wig", 34 | sourceType: "file", 35 | color: 'rgb(200,0,0)', 36 | autoscaleGroup: 'group_${group_id}.${sample_id}' 37 | }, 38 | EOF 39 | } 40 | done 41 | 42 | ## combined tracks of merged group peaks 43 | groups_peak_file=$(ls *_merged_group_*igv.bed) 44 | for peak_file in ${groups_peak_file} 45 | do 46 | { 47 | cat >> ${peaks_tracks_file} << EOF 48 | { 49 | type: "annotation", 50 | format: "bed", 51 | url: '${peak_file}', 52 | name: "${peak_file}" 53 | }, 54 | EOF 55 | } 56 | done 57 | 58 | ## combined tracks and allpeaks track 59 | cat ${bedgraph_tracks_file} ${peaks_tracks_file} > tmp.tracks 60 | cat >> tmp.tracks << EOF 61 | { 62 | type: "annotation", 63 | format: "bed", 64 | url: '${merged_peak_file}', 65 | name: "${merged_peak_file}" 66 | } 67 | EOF 68 | tracks_js=$(cat tmp.tracks) 69 | 70 | ## combined all info 71 | cat>igv.js< 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | ### compare_str: Compairision design (eg: A_vs_B) 5 | args <- commandArgs(T) 6 | designfile <- args[1] 7 | quantification_matrix_file <- args[2] 8 | compare_str <- as.character(args[3]) 9 | 10 | designtable <- read.csv(designfile, head = TRUE, stringsAsFactors=FALSE, colClasses = c("character")) 11 | quantification_matrix = read.table(quantification_matrix_file ,sep = "\t",header = T, row.names = 1) 12 | # generate design matrix 13 | design.matrix <- as.data.frame(designtable$Group) 14 | rownames(design.matrix) <- designtable$Sample_ID 15 | colnames(design.matrix) <- "Type" 16 | 17 | # Wilcoxon test for the vector of two groups 18 | row_wilcox <- function(design.matrix,group_id_1,group_id_2,x,test_mode=""){ 19 | group1 <- as.character(rownames(subset(design.matrix,Type==group_id_1))) 20 | group2 <- as.character(rownames(subset(design.matrix,Type==group_id_2))) 21 | if (test_mode=="paired"){ 22 | res_wix0 <- wilcox.test(x[which(rownames(design.matrix)%in%group1)],x[which(rownames(design.matrix)%in%group2)], paired = T) 23 | } else { 24 | res_wix0 <- wilcox.test(x[which(rownames(design.matrix)%in%group1)],x[which(rownames(design.matrix)%in%group2)]) 25 | } 26 | res_wix0$log2FC = log2(mean(x[which(rownames(design.matrix)%in%group2)])/mean(x[which(rownames(design.matrix)%in%group1)])) 27 | res_wix <- 
c(log2FC=res_wix0$log2FC,pvalue=res_wix0$p.value,statistic=res_wix0$statistic) 28 | return(res_wix) 29 | } 30 | 31 | # Get the information of groups from compare_str 32 | if(length(unique(design.matrix$Type)) < 2){ 33 | stop( "The count of Group is less than two, please check your designfile.") 34 | }else if( compare_str == "two_group" ){ 35 | # Get the information without compare_str beacause of only two groups 36 | group_id_1 <- unique(design.matrix$Type)[1] 37 | group_id_2 <- unique(design.matrix$Type)[2] 38 | }else{ 39 | # Running MeTDiff quantification with compare_str 40 | group_id_1 <- strsplit(as.character(compare_str), "_vs_")[[1]][1] 41 | group_id_2 <- strsplit(as.character(compare_str), "_vs_")[[1]][2] 42 | } 43 | cat("peak number ",dim(quantification_matrix)[1],"\n") 44 | 45 | # Run the Wilcoxon test for the quantifacative value of every peak 46 | test_mode="" 47 | res_wix_lst <- apply(na.omit(quantification_matrix[,3:ncol(quantification_matrix)]),1,function(x){row_wilcox(design.matrix,group_id_1,group_id_2,x,test_mode)}) 48 | res_wix_lst = as.data.frame(t(res_wix_lst)) 49 | res_wix_lst$padj = p.adjust(res_wix_lst$pvalue,method = "BH") 50 | res_wix_lst$BY = p.adjust(res_wix_lst$pvalue,method = "bonferroni") 51 | cat("DM peaks pvalue(0.05)",sum(res_wix_lst$pvalue <=0.05),"\n") 52 | cat("DM peaks FDR(0.05)",sum(res_wix_lst$padj <=0.05),"\n") 53 | output_name <- paste0("bedtools_diffm6A_",group_id_1, "_", group_id_2) 54 | write.table(res_wix_lst, file = paste0(output_name,".txt"), sep = "\t", quote = F) 55 | 56 | -------------------------------------------------------------------------------- /bin/QNB_diffm6A.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript QNB_diffm6A.R 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | ### compare_str: Compairision design (eg: A_vs_B) 5 | library(QNB) 6 | args <- commandArgs(T) 7 | designfile <- args[1] 8 | quantification_matrix_file <- args[2] 9 | compare_str <- as.character(args[3]) 10 | 11 | designtable <- read.csv(designfile,head = TRUE,stringsAsFactors=FALSE, colClasses = c("character")) 12 | # Running QNB quantification 13 | if(length(unique(designtable$Group)) < 2){ 14 | stop( "The count of Group is less than two, please check your designfile.") 15 | }else if( compare_str == "two_group" ){ 16 | # Running QNB quantification without compare_str beacause of only two groups 17 | group_id_1 <- unique(designtable$Group)[1] 18 | group_id_2 <- unique(designtable$Group)[2] 19 | }else{ 20 | # Running QNB quantification with compare_str 21 | group_id_1 <- strsplit(as.character(compare_str), "_vs_")[[1]][1] 22 | group_id_2 <- strsplit(as.character(compare_str), "_vs_")[[1]][2] 23 | } 24 | 25 | # combine the matrix by groups 26 | countlist <- NULL 27 | for(group_id in c(group_id_1,group_id_2)){ 28 | ## generate the list of input count 29 | input.count <- c() 30 | input.names <- c() 31 | input.samples <- c() 32 | for( input in grep(paste0(quantification_matrix_file,"[.]",group_id,"[.]"), list.files(pattern = "input.count"), value = T) ){ 33 | input.exp <- read.table(input,header=T,sep="\t",row.names= NULL,quote = "") 34 | input.count <- cbind(input.count,input.exp[,5]) 35 | input.names <- input.exp[,4] #peaks name 36 | input.samples <- c(input.samples,input) #samples name 37 | } 38 | colnames(input.count) <- input.samples 39 | rownames(input.count) <- input.names 40 | countlist[[paste0(group_id,"_input")]] <- input.count 41 | 42 | ## generate the list of ip count 
43 | ip.count <- c() 44 | ip.names <- c() 45 | ip.samples <- c() 46 | for( ip in grep(paste0(quantification_matrix_file,"[.]",group_id,"[.]"), list.files(pattern = "ip.count"), value = T) ){ 47 | ip.exp <- read.table(ip,header=T,sep="\t",row.names= NULL,quote = "") 48 | ip.count <- cbind(ip.count,ip.exp[,5]) 49 | ip.names <- ip.exp[,4] #peaks name 50 | ip.samples <- c(ip.samples,ip) #samples name 51 | } 52 | colnames(ip.count) <- ip.samples 53 | rownames(ip.count) <- ip.names 54 | countlist[[paste0(group_id,"_ip")]] <- ip.count 55 | } 56 | ## Run the QNB by using the count of peaks 57 | meth1 = countlist[[paste0(group_id_1,"_ip")]] 58 | meth2 = countlist[[paste0(group_id_2,"_ip")]] 59 | unmeth1 = countlist[[paste0(group_id_1,"_input")]] 60 | unmeth2 = countlist[[paste0(group_id_2,"_input")]] 61 | output_name <- paste0("QNB_diffm6A_",group_id_1, "_",group_id_2) 62 | dir.create(output_name) 63 | result <- qnbtest(meth1, meth2, unmeth1, unmeth2, mode="auto", output.dir = output_name) 64 | colnames(result) <- c("p.treated","p.control","log2FC","log2.OR","pvalue","qvalue","padj") 65 | write.table(result, file = paste0(output_name,".txt"), sep = "\t", quote = F) -------------------------------------------------------------------------------- /bin/bed_count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #bash bed_count.sh 3 | #$1 argv 1 : designfile 4 | #$2 argv 2 : THREAD_NUM 5 | #$3 argv 3 : merge_bed_file 6 | #$4 argv 4 : output_bam_stat_file 7 | designfile=$1 8 | THREAD_NUM=$2 9 | merge_bed_file=$3 10 | output_bam_stat_file=$4 11 | 12 | # Define a multi-threaded run channel 13 | mkfifo tmp 14 | exec 9<>tmp 15 | for ((i=1;i<=${THREAD_NUM:=1};i++)) 16 | do 17 | echo >&9 18 | done 19 | 20 | # Create the file about the summary of bam stat 21 | echo "Total_Reads" > $output_bam_stat_file 22 | awk '{ print $1"\t"$2"\t"$3"\t"$4}' ${merge_bed_file} > tmp.${merge_bed_file} 23 | 24 | sampleinfo_list=$(awk 'BEGIN{FS=","}NR>1{print $1","$4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 25 | for sample_group_id in ${sampleinfo_list} 26 | do 27 | read -u 9 28 | { 29 | sample_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $1}') 30 | group_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $2}') 31 | 32 | #Define the input/ip file of the sample 33 | input_bam_file=$(ls ${sample_id}.input*.bam | awk '{ORS=" "}{print $0}') 34 | ip_bam_file=$(ls ${sample_id}.ip*.bam | awk '{ORS=" "}{print $0}') 35 | 36 | ## Create files and print the name of samples && Get the Total reads of the samples of input and ip 37 | echo -e ${input_bam_file}"\t" | awk 'BEGIN{ORS=""}{print $0}' > ${sample_id}.bam_stat.txt 38 | samtools view -c ${input_bam_file} >> ${sample_id}.bam_stat.txt 39 | echo -e ${ip_bam_file}"\t" | awk 'BEGIN{ORS=""}{print $0}' >> ${sample_id}.bam_stat.txt 40 | samtools view -c ${ip_bam_file} >> ${sample_id}.bam_stat.txt 41 | 42 | ## Setting colnames of peaks input/ip count 43 | echo $input_bam_file \ 44 | | awk 'BEGIN{ORS=""}{print "chrom\tchromStart\tchromEND\tPeakName\t"}{for(x=1;x ${merge_bed_file}.${group_id}.${sample_id}.input.count 46 | echo ${ip_bam_file} \ 47 | | awk 'BEGIN{ORS=""}{print "chrom\tchromStart\tchromEND\tPeakName\t"}{for(x=1;x ${merge_bed_file}.${group_id}.${sample_id}.ip.count 49 | 50 | ## Count input/ip peaks 51 | 52 | bedtools multicov -bams ${input_bam_file} -bed tmp.${merge_bed_file} >> ${merge_bed_file}.${group_id}.${sample_id}.input.count 53 | bedtools multicov -bams ${ip_bam_file} -bed 
tmp.${merge_bed_file} >> ${merge_bed_file}.${group_id}.${sample_id}.ip.count 54 | echo >&9 55 | 56 | # awk -v bam="$input_bam" -v pre="$prefix" ' 57 | # {print " bedtools multicov -bams '${input_bam_file}' -bed tmp.'${merge_bed_file}' >> '${merge_bed_file}'.'${group_id}'.'${sample_id}'.input.count; \ 58 | # bedtools multicov -bams '${ip_bam_file}' -bed tmp.'${merge_bed_file}' >> '${merge_bed_file}'.'${group_id}'.'${sample_id}'.ip.count; \ 59 | # sortBed -i ./"pre".tmp/input/"$1".bed | intersectBed -a '${genomebin_dir}'"$1".bin25.bed -b - -sorted -c > ./"pre".tmp/input/"$1".bin25.txt"}' $chrName_file \ 60 | # | xargs -iCMD -P$THREAD_NUM bash -c CMD 61 | }& 62 | done 63 | wait 64 | cat *.bam_stat.txt >> $output_bam_stat_file 65 | wait 66 | echo "bedtools count done" -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # nf-core/meripseqpipe: Contributing Guidelines 2 | 3 | Hi there! Many thanks for taking an interest in improving nf-core/meripseqpipe. 4 | 5 | We try to manage the required tasks for nf-core/meripseqpipe using GitHub issues, you probably came to this page when creating one. Please use the pre-filled template to save time. 6 | 7 | However, don't be put off by this template - other more general issues and suggestions are welcome! Contributions to the code are even more welcome ;) 8 | 9 | > If you need help using or modifying nf-core/meripseqpipe then the best place to ask is on the pipeline channel on [Slack](https://nf-co.re/join/slack/). 10 | 11 | 12 | 13 | ## Contribution workflow 14 | If you'd like to write some code for nf-core/meripseqpipe, the standard workflow 15 | is as follows: 16 | 17 | 1. Check that there isn't already an issue about your idea in the 18 | [nf-core/meripseqpipe issues](https://github.com/nf-core/meripseqpipe/issues) to avoid 19 | duplicating work. 20 | * If there isn't one already, please create one so that others know you're working on this 21 | 2. Fork the [nf-core/meripseqpipe repository](https://github.com/nf-core/meripseqpipe) to your GitHub account 22 | 3. Make the necessary changes / additions within your forked repository 23 | 4. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged. 24 | 25 | If you're not used to this workflow with git, you can start with some [basic docs from GitHub](https://help.github.com/articles/fork-a-repo/) or even their [excellent interactive tutorial](https://try.github.io/). 26 | 27 | 28 | ## Tests 29 | When you create a pull request with changes, [Travis CI](https://travis-ci.org/) will run automatic tests. 30 | Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then. 31 | 32 | There are typically two types of tests that run: 33 | 34 | ### Lint Tests 35 | The nf-core has a [set of guidelines](https://nf-co.re/developers/guidelines) which all pipelines must adhere to. 36 | To enforce these and ensure that all pipelines stay in sync, we have developed a helper tool which runs checks on the pipeline code. This is in the [nf-core/tools repository](https://github.com/nf-core/tools) and once installed can be run locally with the `nf-core lint ` command. 37 | 38 | If any failures or warnings are encountered, please follow the listed URL for more documentation. 39 | 40 | ### Pipeline Tests 41 | Each nf-core pipeline should be set up with a minimal set of test-data. 
42 | Travis CI then runs the pipeline on this data to ensure that it exists successfully. 43 | If there are any failures then the automated tests fail. 44 | These tests are run both with the latest available version of Nextflow and also the minimum required version that is stated in the pipeline code. 45 | 46 | ## Getting help 47 | For further information/help, please consult the [nf-core/meripseqpipe documentation](https://github.com/nf-core/meripseqpipe#documentation) and don't hesitate to get in touch on the [nf-core/meripseqpipe pipeline channel](https://nfcore.slack.com/channels/nf-core/meripseqpipe) on [Slack](https://nf-co.re/join/slack/). 48 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team on [Slack](https://nf-co.re/join/slack/). 
The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /bin/DESeq2_quantification.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## generate DESeq2 logfc matrix 3 | anno.exp.m6a <- subset(m6a.anno.matrix,select= c(PeakRegion,ID)) 4 | final.count.table <- NULL 5 | htseq_list <- dir(pattern = "count",full.names = T) 6 | combined_htseq_count <- read.table(htseq_list[1], header = TRUE, row.names = 1, check.names = FALSE) 7 | for (file in htseq_list[-1]){ 8 | combined_htseq_count <- cbind(combined_htseq_count,read.table(file, header = TRUE, row.names = 1, check.names = FALSE)) 9 | } 10 | colnames(combined_htseq_count) <- unlist(lapply(strsplit(colnames(combined_htseq_count),split = "_"),FUN = function(x){x[1]})) 11 | combined_htseq_count$ID <- rownames(combined_htseq_count) 12 | final.count.table <- merge(anno.exp.m6a,combined_htseq_count,by= "ID") 13 | 14 | sample_id <- designtable$Sample_ID[1] 15 | ip_count_file <- grep(paste0("[.]",sample_id,"[.]ip"),count_filelist,value = TRUE) 16 | ip.matrix <- read.table(file = ip_count_file, sep = "\t", row.names = NULL, header = T, check.names=F) 17 | rownames.ip.matrix <- ip.matrix$PeakName 18 | ip.matrix <- subset(ip.matrix , select= 5) 19 | colnames(ip.matrix) <- paste0(sample_id,".ip") 20 | for(sample_id in designtable$Sample_ID[-1]){ 21 | ip_count_file <- grep(paste0("[.]",sample_id,"[.]ip"),count_filelist,value = TRUE) 22 | ip_count_table <- read.table(file = ip_count_file, sep = "\t", row.names = NULL, header = T, check.names=F) 23 | ip_count_table <- subset(ip_count_table , select= 5) 24 | colnames(ip_count_table) <- paste0(sample_id,".ip") 25 | ip.matrix <- cbind(ip.matrix,ip_count_table) 26 | } 27 | row.names(ip.matrix) <- rownames.ip.matrix 28 | ip.matrix$PeakRegion <- rownames(ip.matrix) 29 | final.count.table <- merge(final.count.table,ip.matrix,by= "PeakRegion") 30 | 31 | load("deseq2.Rdata") 32 | library("DESeq2") 33 | deseq2.count.table <- subset(final.count.table,select= colnames(final.count.table)[c(-1,-2)]) 34 | rownames(deseq2.count.table) <- final.count.table$PeakRegion 35 | coldata <- data.frame(row.names = colnames(deseq2.count.table),group = colnames(deseq2.count.table) ,sample = unlist(lapply(strsplit(colnames(deseq2.count.table),split = "[.]"),FUN = function(x){x[1]}))) 36 | final.deseq2.logfc.matrix <- subset(final.count.table, select = PeakRegion) 37 | for (sample_id in unique(coldata$sample)) { 38 | coldata.sample <- subset(coldata,sample == sample_id,group) 39 | coldata.sample$group <- unlist(lapply(strsplit(as.character(coldata.sample$group),split = "[.]"),function(x){x[2]})) 40 | coldata.sample$group <- factor(coldata.sample$group) 
41 | inf.dds <- DESeq2::DESeqDataSetFromMatrix(countData = deseq2.count.table[,rownames(coldata.sample)],colData = coldata.sample,design = ~group) 42 | inf.dds.LRT <- DESeq2::DESeq(inf.dds) 43 | head(deseq2.count.table[,rownames(coldata.sample)]) 44 | results <- DESeq2::results(inf.dds.LRT,constract=c("group","input","ip")) 45 | results <- data.frame(PeakRegion = rownames(results),sample_id = results$log2FoldChange) 46 | colnames(results)[2] <- sample_id 47 | final.deseq2.logfc.matrix <- merge(final.deseq2.logfc.matrix,results,by= "PeakRegion") 48 | } 49 | 2^head(final.deseq2.logfc.matrix) 50 | exp()final.deseq2.logfc.matrix 51 | rownames(final.deseq2.logfc.matrix) <- final.deseq2.logfc.matrix$PeakRegion 52 | final.deseq2.logfc.matrix <- final.deseq2.logfc.matrix[,-1] 53 | final.deseq2.logfc.matrix <- 2^(final.deseq2.logfc.matrix) 54 | save(final.deseq2.logfc.matrix,file = "deseq.quantification.matrix.RData") -------------------------------------------------------------------------------- /bin/markdown_to_html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import argparse 4 | import markdown 5 | import os 6 | import sys 7 | 8 | def convert_markdown(in_fn): 9 | input_md = open(in_fn, mode="r", encoding="utf-8").read() 10 | html = markdown.markdown( 11 | "[TOC]\n" + input_md, 12 | extensions = [ 13 | 'pymdownx.extra', 14 | 'pymdownx.b64', 15 | 'pymdownx.highlight', 16 | 'pymdownx.emoji', 17 | 'pymdownx.tilde', 18 | 'toc' 19 | ], 20 | extension_configs = { 21 | 'pymdownx.b64': { 22 | 'base_path': os.path.dirname(in_fn) 23 | }, 24 | 'pymdownx.highlight': { 25 | 'noclasses': True 26 | }, 27 | 'toc': { 28 | 'title': 'Table of Contents' 29 | } 30 | } 31 | ) 32 | return html 33 | 34 | def wrap_html(contents): 35 | header = """ 36 | 37 | 38 | 72 | 73 | 74 |
75 | """ 76 | footer = """ 77 |
78 | 79 | 80 | """ 81 | return header + contents + footer 82 | 83 | 84 | def parse_args(args=None): 85 | parser = argparse.ArgumentParser() 86 | parser.add_argument('mdfile', type=argparse.FileType('r'), nargs='?', 87 | help='File to convert. Defaults to stdin.') 88 | parser.add_argument('-o', '--out', type=argparse.FileType('w'), 89 | default=sys.stdout, 90 | help='Output file name. Defaults to stdout.') 91 | return parser.parse_args(args) 92 | 93 | def main(args=None): 94 | args = parse_args(args) 95 | converted_md = convert_markdown(args.mdfile.name) 96 | html = wrap_html(converted_md) 97 | args.out.write(html) 98 | 99 | if __name__ == '__main__': 100 | sys.exit(main()) 101 | -------------------------------------------------------------------------------- /bin/meyer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 23 18:53:30 2019 4 | 5 | @author: zky 6 | """ 7 | from sys import argv 8 | from math import log 9 | from scipy import stats 10 | input_bin25_file = argv[1] 11 | ip_bin25_file = argv[2] 12 | input_total_reads_count = int(argv[3]) 13 | ip_total_reads_count = int(argv[4]) 14 | peak_windows_number = int(argv[5]) 15 | output_ip_file = argv[6] 16 | def windows_fisher_test(input_count, ip_count, input_total_reads_count, ip_total_reads_count): 17 | """fisher test for the PeakCalling of meyer""" 18 | site_input_rest_reads_count = input_total_reads_count - int(input_count) 19 | site_ip_rest_reads_count = ip_total_reads_count - int(ip_count) 20 | ip_oddsratio, ip_pvalue = stats.fisher_exact([[input_count, ip_count], [input_total_reads_count, ip_total_reads_count]], 'less') 21 | input_oddsratio, input_pvalue = stats.fisher_exact([[input_count, ip_count], [site_input_rest_reads_count, site_ip_rest_reads_count]], 'greater') 22 | return input_pvalue,ip_pvalue 23 | 24 | def cluster_bin( bonferroni_filter_list ): 25 | bonferroni_peak = [] 26 | peak_line = [] 27 | idx = 0 28 | pre_end_position = 0 29 | for data in bonferroni_filter_list: 30 | distance = data[1] - pre_end_position 31 | if pre_end_position == 0 or distance > 0 : 32 | if peak_line : 33 | peak_region = peak_line[2] - peak_line[1] 34 | if peak_region >= 100 : 35 | bonferroni_peak.append([]) 36 | bonferroni_peak[idx] = peak_line 37 | idx += 1 38 | peak_line = [] 39 | peak_line = data[:] 40 | pre_end_position = data[2] 41 | else: 42 | peak_line[2] = data[2] 43 | pre_end_position = data[2] 44 | peak_line.append(data[3]) 45 | for data in bonferroni_peak: 46 | statistic, pval = stats.combine_pvalues(data[3:len(data)], method='fisher', weights=None) 47 | data[3] = pval 48 | del data[4:len(data)] 49 | return bonferroni_peak 50 | 51 | with open (input_bin25_file) as input_bin25,open (ip_bin25_file) as ip_bin25: 52 | """Generate the list of bonferroni_filter_windows""" 53 | ip_bonferroni_filter_list = [] 54 | ip_index = 0 55 | print ("Generate the list of bonferroni_filter_windows") 56 | while True: 57 | input_line = input_bin25.readline().rstrip("\n") 58 | ip_line = ip_bin25.readline().rstrip("\n") 59 | if input_line == '': 60 | break 61 | input_line_list = input_line.split("\t") 62 | ip_line_list = ip_line.split("\t") 63 | input_pvalue,ip_pvalue = windows_fisher_test(input_line_list[-1],ip_line_list[-1],input_total_reads_count,ip_total_reads_count) 64 | if (ip_pvalue < 0.05/peak_windows_number ): 65 | del ip_line_list[-1] 66 | ip_line_list.append(ip_pvalue) 67 | ip_line_list[1] = int(ip_line_list[1]) 68 | ip_line_list[2] = int(ip_line_list[2]) 69 
| ip_bonferroni_filter_list.append([]) 70 | ip_bonferroni_filter_list[ip_index] = ip_line_list 71 | ip_index += 1 72 | """Generate the list of bonferroni_filter_peaks""" 73 | print ("Generate the list of bonferroni_filter_peaks") 74 | ip_bonferroni_peak = cluster_bin(ip_bonferroni_filter_list[:]) 75 | """Write the list of bonferroni_filter_peaks""" 76 | print ("Write the list of bonferroni_filter_peaks") 77 | with open(output_ip_file,'w') as output_file: 78 | for data in ip_bonferroni_peak: 79 | output_file.write('\t'.join(str(i) for i in data)) 80 | output_file.write('\n') -------------------------------------------------------------------------------- /bin/merge_peaks_by_bedtools.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$1 argv 1 : designfile 3 | #$2 argv 2 : THREAD_NUM 4 | #$3 argv 3 : flag_peakCallingbygroup 5 | #$4 argv 4 : peakCalling_tools_count 6 | designfile=$1 7 | THREAD_NUM=$2 8 | flag_peakCallingbygroup=$3 9 | peakCalling_tools_count=$4 10 | peakCalling_tools_main=$5 11 | 12 | # Define a multi-threaded run channel 13 | mkfifo tmp 14 | exec 9<>tmp 15 | for ((i=1;i<=${THREAD_NUM:=1};i++)) 16 | do 17 | echo >&9 18 | done 19 | 20 | function mergebedByBedtools() 21 | { 22 | prefix_id=$1 23 | out_prefix=$2 24 | peakCalling_tools_main=$3 25 | cat ${peakCalling_tools_main}*${prefix_id}*normalized.bed | sortBed -i - | mergeBed -i - -c 4,5 -o count,mean > tmp.${prefix_id}_allPeaks.bed 26 | ls *${prefix_id}*normalized.bed | grep -v ${peakCalling_tools_main} | xargs -i cat {} | sortBed -i - | mergeBed -i - -c 4,5 -o count,mean > tmp.${prefix_id}_others_allPeaks.bed 27 | intersectBed -a tmp.${prefix_id}_allPeaks.bed -b tmp.${prefix_id}_others_allPeaks.bed -u | awk 'BEGIN{FS="\t";OFS="\t"}{print $1,$2,$3,$1":"$2"-"$3,$5}' > ${out_prefix}.bed 28 | } 29 | 30 | if [ $flag_peakCallingbygroup -gt 0 ]; then 31 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 32 | for group_id in $group_list 33 | do 34 | read -u 9 35 | { 36 | if [ $peakCalling_tools_count -gt 1 ]; then 37 | mergebedByBedtools ${group_id} bedtools_merged_group_${group_id} ${peakCalling_tools_main} 38 | else 39 | awk '{OFS="\t";$5=10^-$5;print }' *${group_id}*.bed | sortBed -i - > bedtools_merged_group_${group_id}.bed 40 | fi 41 | echo >&9 42 | }& 43 | done 44 | wait 45 | if [ $peakCalling_tools_count -gt 1 ]; then 46 | mergebedByBedtools "" bedtools_merged_allpeaks ${peakCalling_tools_main} 47 | else 48 | cat ${peakCalling_tools_main}_*_normalized.bed | sortBed -i - | mergeBed -i - -c 4,5 -o count,mean | awk 'BEGIN{FS="\t";OFS="\t"}{print $1,$2,$3,$1":"$2"-"$3,$5}' > bedtools_merged_allpeaks.bed 49 | fi 50 | else 51 | sampleinfo_list=$(awk 'BEGIN{FS=","}NR>1{print $1","$4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 52 | for sample_group_id in ${sampleinfo_list} 53 | do 54 | read -u 9 55 | { 56 | sample_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $1}') 57 | group_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $2}') 58 | ## Adding the information of group 59 | for samplefile in *_${sample_id}_normalized.bed 60 | do 61 | mv $samplefile ${samplefile/_normalized.bed/_${group_id}_normalized.bed} 62 | done 63 | echo >&9 64 | }& 65 | done 66 | wait 67 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 68 | for group_id in $group_list 69 | do 70 | read -u 9 71 | { 72 | if [ $peakCalling_tools_count -gt 1 ]; then 73 | mergebedByBedtools 
${group_id} bedtools_merged_group_${group_id} ${peakCalling_tools_main} 74 | else 75 | awk '{OFS="\t";$5=10^-$5;print }' *${group_id}*.bed | sortBed -i - > bedtools_merged_group_${group_id}.bed 76 | fi 77 | echo >&9 78 | }& 79 | done 80 | wait 81 | if [ $peakCalling_tools_count -gt 1 ]; then 82 | mergebedByBedtools "" bedtools_merged_allpeaks ${peakCalling_tools_main} 83 | else 84 | cat ${peakCalling_tools_main}_*_normalized.bed | sortBed -i - | mergeBed -i - -c 4,5 -o count,mean | awk 'BEGIN{FS="\t";OFS="\t"}{print $1,$2,$3,$1":"$2"-"$3,$5}' > bedtools_merged_allpeaks.bed 85 | fi 86 | fi 87 | echo "${peakCalling_tools_main} merged peaks done" -------------------------------------------------------------------------------- /bin/arranged_results.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript arranged_result.R 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | ### compare_str: Compairision design (eg: A_vs_B) 5 | args <- commandArgs(T) 6 | #args <- c("formatted_designfile.txt", "compare_info", "Wilcox-test", "edgeR", "rank") 7 | designfile <- args[1]#"formatted_designfile.txt" 8 | comparefile <- args[2]#"compare_info" 9 | diffm6A_mode <- args[3]#"QNB" 10 | rnaseq_mode <- args[4]#"DESeq2" 11 | peakMerged_mode <- args[5] 12 | options(stringsAsFactors = F) 13 | 14 | ## generate design matrix 15 | compare.list <- read.csv(comparefile,header = F, check.names=F) 16 | designtable <- read.csv(designfile, head = TRUE, colClasses = c("character"), check.names=F) 17 | design.matrix <- as.matrix(designtable$Group) 18 | rownames(design.matrix) <- designtable$Sample_ID 19 | colnames(design.matrix) <- "Type" 20 | 21 | ## generate peak Visualization 22 | annotation.file <- list.files(pattern = "merged_allpeaks.anno.txt") 23 | annotation.info <- read.table(annotation.file, header = F, sep = "\t", quote = "", check.names=F)[,c(4,15,11)] 24 | colnames(annotation.info) <- c("PeakRegion","ID","Gene_symbol") 25 | m6a.peaks.file <- list.files(pattern = "merged_allpeaks.bed$") 26 | m6a.peaks.table <- read.table(m6a.peaks.file, header = F, sep = "\t", quote = "", check.names=F) 27 | colnames(m6a.peaks.table) <- c("Chr","ChrStart","ChrEnd","PeakRegion","pvalue") 28 | m6a.peaks.table = merge(x = m6a.peaks.table,y = annotation.info,by = "PeakRegion",all.x = TRUE) 29 | m6a.sites.file <- list.files(pattern = "m6A_sites_merged.bed") 30 | m6a.sites.table <- read.table(m6a.sites.file, header = F, sep = "\t", quote = "", check.names=F) 31 | colnames(m6a.sites.table) <- c("Chr","ChrStart","ChrEnd","Gene_symbol&ID","Strand","Score","Group","Sequence") 32 | 33 | expression.matrix <- NULL 34 | diffexpression.list <- NULL 35 | if (rnaseq_mode != "none"){ 36 | ## generate expression matrix 37 | htseq.filelist = grep("htseq",list.files(path = "./",pattern = "input.count"), value = T) 38 | for( file in htseq.filelist ){ 39 | tmp.expression.table <- as.matrix(read.table(file, sep = "\t", header = TRUE, row.names = 1, check.names=F)) 40 | expression.matrix <- cbind(expression.matrix, tmp.expression.table) 41 | } 42 | colnames(expression.matrix) <- as.matrix(lapply(strsplit(colnames(expression.matrix),".input"), function(x){ x[1]})) 43 | 44 | ## generate diff_expression list 45 | diffexpression.filelist <- grep(rnaseq_mode,list.files(pattern = ".csv"), value = T) 46 | for( compare_str in compare.list ){ 47 | diffexpression.list[[compare_str]] <- read.csv(grep(sub("_vs_","_",compare_str), diffexpression.filelist, value = T), header = T, 
check.names=F) 48 | colnames(diffexpression.list[[compare_str]])[1] <- "ID" 49 | } 50 | } 51 | 52 | ## generate m6A matrix 53 | m6a.anno.matrix <- read.delim(file = grep("quantification.matrix",x = list.files(),value = T), header = T, sep = "\t", row.names = 1, check.names=F) 54 | m6a.anno.matrix <- cbind(PeakRegion = row.names(m6a.anno.matrix), m6a.anno.matrix) 55 | 56 | ## generate diffm6A list 57 | diffm6A.filelist <- grep("_diffm6A_",list.files(pattern = ".txt"), value = T) 58 | diffm6A.list <- NULL 59 | for( compare_str in compare.list ){ 60 | diffm6A.list[[compare_str]] <- read.table(grep(sub("_vs_","_",compare_str), diffm6A.filelist, value = T),header = T,row.names = 1, check.names=F) 61 | if( diffm6A_mode == "MATK" ){ 62 | diffm6A.list[[compare_str]]$padj = p.adjust(diffm6A.list[[compare_str]]$pvalue, method = "BH") 63 | diffm6A.list[[compare_str]] <- diffm6A.list[[compare_str]][,-seq(1,2)] 64 | } 65 | diffm6A.list[[compare_str]]$PeakRegion <- rownames(diffm6A.list[[compare_str]]) 66 | diffm6A.list[[compare_str]] <- merge(x = annotation.info,y = diffm6A.list[[compare_str]],by = "PeakRegion", all.y = TRUE) 67 | } 68 | 69 | ## save variable for m6Aviewer 70 | write.table(expression.matrix,file = "expression.matrix",quote=F) 71 | write.table(m6a.anno.matrix,file= "m6a.anno.matrix",quote=F) 72 | #write.table(diffm6A.list, file= "diffm6A.list") 73 | #write.table(diffm6A.anno.list, file = "diffm6A.anno.list") 74 | 75 | save(design.matrix, compare.list, 76 | m6a.peaks.table, m6a.sites.table, 77 | expression.matrix, m6a.anno.matrix, 78 | diffexpression.list, diffm6A.list, 79 | file = paste0(peakMerged_mode,"_",diffm6A_mode,"_",rnaseq_mode,"_arranged_results_",Sys.Date(),".m6APipe")) 80 | 81 | -------------------------------------------------------------------------------- /bin/merge_peaks_by_mspc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$1 argv 1 : designfile 3 | #$2 argv 2 : THREAD_NUM 4 | #$3 argv 3 : flag_peakCallingbygroup 5 | #$4 argv 4 : peakCalling_tools_count 6 | designfile=$1 7 | THREAD_NUM=$2 8 | flag_peakCallingbygroup=$3 9 | peakCalling_tools_count=$4 10 | out_dir=$5 11 | # Define a multi-threaded run channel 12 | mkfifo tmp 13 | exec 9<>tmp 14 | for ((i=1;i<=${THREAD_NUM:=1};i++)) 15 | do 16 | echo >&9 17 | done 18 | # Generate the diroectory of results: 19 | mkdir ${out_dir} 20 | 21 | # Define the function of MSPC runing for different situations 22 | function mergebedForBio() 23 | { 24 | prefix_id=$1 25 | out_prefix=$2 26 | bedfile_array=$(ls *_${prefix_id}_*.bed | awk '{ORS=" "}{print "-i",$0}') 27 | mspc -i $bedfile_array -r bio -s 1E-4 -w 1E-2 -o Bio_$prefix_id 28 | ln Bio_$prefix_id/ConsensusPeaks.bed ${out_prefix}.bed 29 | awk 'NR>1{OFS="\t";$5=10^-$5;print $1,$2,$3,$1":"$2"-"$3,$5}' Bio_$prefix_id/ConsensusPeaks.bed |sortBed -i - > ${out_dir}/${out_prefix}.bed 30 | } 31 | function mergebedForTec() 32 | { 33 | prefix_id=$1 34 | out_prefix=$2 35 | peakCalling_tools_count=$3 36 | bedfile_array=$(ls *_${prefix_id}_*.bed | awk '{ORS=" "}{print "-i",$0}') 37 | mspc -i $bedfile_array -r tec -s 1E-2 -w 1E-1 -o Tec_$prefix_id 38 | ln Tec_$prefix_id/ConsensusPeaks.bed ${out_prefix}.bed 39 | awk 'NR>1{OFS="\t";$5=10^-$5;print $1,$2,$3,$1":"$2"-"$3,$5}' Tec_$prefix_id/ConsensusPeaks.bed |sortBed -i - > ${out_dir}/${out_prefix}.bed 40 | } 41 | 42 | # Before merging peaks, normalize all peaks of different tools 43 | for bedfile in *.bed 44 | do 45 | read -u 9 46 | { 47 | mv $bedfile tmp.$bedfile 48 | python 
normalize_peaks.py tmp.$bedfile $bedfile 49 | rm tmp.$bedfile 50 | echo >&9 51 | }& 52 | done 53 | wait 54 | 55 | # if the number of peakcalling tools > 2 56 | if [ $flag_peakCallingbygroup -gt 0 ]; then 57 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 58 | for group_id in $group_list 59 | do 60 | read -u 9 61 | { 62 | if [ $peakCalling_tools_count -gt 1 ]; then 63 | mergebedForTec ${group_id} mspc_merged_group_${group_id} 64 | else 65 | ln *${group_id}*.bed mspc_merged_group_${group_id}.bed 66 | awk '{OFS="\t";$5=10^-$5;print }' *${group_id}*.bed |sortBed -i - > ${out_dir}/mspc_merged_group_${group_id}.bed 67 | fi 68 | echo >&9 69 | }& 70 | done 71 | wait 72 | mergebedForBio merged_group mspc_merged_allpeaks 73 | else 74 | sampleinfo_list=$(awk 'BEGIN{FS=","}NR>1{print $1","$4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 75 | for sample_group_id in ${sampleinfo_list} 76 | do 77 | read -u 9 78 | { 79 | sample_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $1}') 80 | group_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $2}') 81 | ## Adding the information of group 82 | for samplefile in *_${sample_id}_normalized.bed 83 | do 84 | mv $samplefile ${samplefile/_normalized.bed/}_${group_id}_normalized.bed 85 | done 86 | if [ $peakCalling_tools_count -gt 1 ]; then 87 | mergebedForTec ${sample_id} mspc_merged_sample_${group_id}_${sample_id} $peakCalling_tools_count 88 | else 89 | ln *${sample_id}*.bed mspc_merged_sample_${group_id}_${sample_id}.bed 90 | awk '{OFS="\t";$5=10^-$5;print }' *_${sample_id}_*normalized.bed |sortBed -i - > ${out_dir}/mspc_merged_sample_${group_id}_${sample_id}.bed 91 | fi 92 | echo >&9 93 | }& 94 | done 95 | wait 96 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 97 | for group_id in $group_list 98 | do 99 | read -u 9 100 | { 101 | mergebedForBio merged_sample_${group_id} mspc_merged_group_${group_id} 102 | echo >&9 103 | }& 104 | done 105 | wait 106 | #mergebedForBio merged_sample mspc_merged_allpeaks 107 | cat ${out_dir}/*_merged_sample_*.bed | sortBed -i - |mergeBed -i - -c 4,5 -o count,mean | awk '$4>1{print $1"\t"$2"\t"$3"\t"$1":"$2"-"$3"\t"$5}' > ${out_dir}/mspc_merged_allpeaks.bed 108 | fi 109 | judge_chr=$(cat *.bed |cut -f 1 |sort |uniq| awk '$0~"chr"{print "includeChr"}' |uniq) 110 | if [ "$judge_chr" != "includeChr" ]; then sed -i 's/chr//g' ${out_dir}/*.bed ;fi 111 | mv ${out_dir}/*.bed ./ 112 | exec 9<>- 113 | echo "MSPC merged peaks done" 114 | -------------------------------------------------------------------------------- /docs/configuration/adding_your_own.md: -------------------------------------------------------------------------------- 1 | # nf-core/m6APipe: Configuration for other clusters 2 | 3 | It is entirely possible to run this pipeline on other clusters, though you will need to set up your own config file so that the pipeline knows how to work with your cluster. 4 | 5 | > If you think that there are other people using the pipeline who would benefit from your configuration (eg. other common cluster setups), please let us know. We can add a new configuration and profile which can used by specifying `-profile ` when running the pipeline. 6 | 7 | If you are the only person to be running this pipeline, you can create your config file as `~/.nextflow/config` and it will be applied every time you run Nextflow. 
Alternatively, save the file anywhere and reference it when running the pipeline with `-c path/to/config` (see the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for more). 8 | 9 | A basic configuration comes with the pipeline, which runs by default (the `standard` config profile - see [`conf/base.config`](../conf/base.config)). This means that you only need to configure the specifics for your system and overwrite any defaults that you want to change. 10 | 11 | ## Cluster Environment 12 | 13 | By default, the pipeline uses the `local` Nextflow executor - in other words, all jobs are run in the login session. If you're using a simple server, this may be fine. If you're using a compute cluster, this is bad as all jobs will run on the head node. 14 | 15 | To specify your cluster environment, add the following line to your config file: 16 | 17 | ```nextflow 18 | process.executor = 'YOUR_SYSTEM_TYPE' 19 | ``` 20 | 21 | Many different cluster types are supported by Nextflow. For more information, please see the [Nextflow documentation](https://www.nextflow.io/docs/latest/executor.html). 22 | 23 | Note that you may need to specify cluster options, such as a project or queue. To do so, use the `clusterOptions` config option: 24 | 25 | ```nextflow 26 | process { 27 | executor = 'slurm' 28 | clusterOptions = '-A myproject' 29 | } 30 | ``` 31 | 32 | ## Software Requirements 33 | 34 | To run the pipeline, several software packages are required. How you satisfy these requirements is essentially up to you and depends on your system. If possible, we _highly_ recommend using either Docker or Singularity. 35 | 36 | Please see the [`installation documentation`](../installation.md) for how to run with the below as a one-off. These instructions are about configuring a config file for repeated use. 37 | 38 | ### Docker 39 | 40 | Docker is a great way to run nf-core/m6APipe, as it manages all software installations and allows the pipeline to be run in an identical software environment across a range of systems. 41 | 42 | Nextflow has [excellent integration](https://www.nextflow.io/docs/latest/docker.html) with Docker, and beyond installing the two tools, not much else is required - at run time, Nextflow will automatically fetch the [nfcore/m6APipe](https://hub.docker.com/r/nfcore/m6APipe/) image that we have created and hosted on Docker Hub. 43 | 44 | To add Docker support to your own config file, add the following: 45 | 46 | ```nextflow 47 | docker.enabled = true 48 | process.container = "nfcore/m6APipe" 49 | ``` 50 | 51 | Note that the Docker Hub organisation name annoyingly can't have a hyphen, so it is `nfcore` and not `nf-core`. 52 | 53 | ### Singularity image 54 | 55 | Many HPC environments are not able to run Docker due to security issues. 56 | [Singularity](http://singularity.lbl.gov/) is a tool designed to run on such HPC systems which is very similar to Docker. 57 | 58 | To specify singularity usage in your pipeline config file, add the following: 59 | 60 | ```nextflow 61 | singularity.enabled = true 62 | process.container = "shub://nf-core/m6APipe" 63 | ``` 64 | 65 | If you intend to run the pipeline offline, Nextflow will not be able to automatically download the Singularity image for you. 66 | Instead, you'll have to do this manually first: transfer the image file and then point the config to it. 
67 | 68 | First, pull the image file where you have an internet connection: 69 | 70 | ```bash 71 | singularity pull --name nf-core-m6APipe.simg shub://nf-core/m6APipe 72 | ``` 73 | 74 | Then transfer this file and point the config file to the image: 75 | 76 | ```nextflow 77 | singularity.enabled = true 78 | process.container = "/path/to/nf-core-m6APipe.simg" 79 | ``` 80 | 81 | ### Conda 82 | 83 | If you're not able to use Docker or Singularity, you can instead use conda to manage the software requirements. 84 | To use conda in your own config file, add the following: 85 | 86 | ```nextflow 87 | process.conda = "$baseDir/environment.yml" 88 | ``` 89 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MeRIPseqPipe 2 | 3 | **A MeRIP-seq analysis pipeline that integrates multiple alignment tools, peak calling tools, peak merging methods and methylation analysis methods.** 4 | 5 | [![Build Status](https://travis-ci.com/nf-core/meripseqpipe.svg?branch=master)](https://travis-ci.com/nf-core/meripseqpipe) 6 | [![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.04.0-brightgreen.svg)](https://www.nextflow.io/) 7 | 8 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/) 9 | [![Docker](https://img.shields.io/docker/automated/kingzhuky/meripseqpipe.svg)](https://hub.docker.com/r/kingzhuky/meripseqpipe) 10 | 11 | ## Introduction 12 | 13 | N6-methyladenosine (m6A) is the most prevalent modification in the mRNA of many eukaryotic species, including yeast, plants, flies, and mammals. In order to analyze m6A-seq data, we developed a user-friendly, integrated analysis pipeline called MeRIPseqPipe. It integrates ten main functional modules: data preprocessing, quality control, read mapping, peak calling, peak merging, motif searching, peak annotation, differential methylation analysis, differential expression analysis, and data visualization. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner, and it comes with Docker containers, making installation trivial and results highly reproducible. 14 | 15 | ## Quick Start 16 | 17 | i. Install [`nextflow`](https://nf-co.re/usage/installation) 18 | 19 | ii. Install one of [`docker`](https://docs.docker.com/engine/installation/) or [`conda`](https://conda.io/miniconda.html) 20 | 21 | iii. Download the pipeline and test it on a minimal dataset with a single command 22 | 23 | ```bash 24 | nextflow run path/to/meripseqpipe -profile test,<docker/conda> 25 | ``` 26 | 27 | > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile <institute>` in your command. This will enable either `docker` or `conda` and set the appropriate execution settings for your local compute environment. 28 | 29 | iv. Start running your own analysis! 30 | 31 | 32 | ```bash 33 | nextflow run path/to/meripseqpipe -profile <docker/conda> --designfile 'designfile.tsv' --comparefile 'comparefile.txt' --fasta path/to/genome_fasta.fa --gtf path/to/genome_annotation.gtf 34 | ``` 35 | 36 | See [usage docs](docs/usage.md) for all of the available options when running the pipeline. 
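The design file is a tab-separated table with one row per sample, and the compare file lists one comparison per line. As a rough sketch (the exact column layout is parsed in `lib/LikeletUtils.groovy`; all file names below are made up, and single-end libraries put `false` in the second FASTQ column of a pair):

```tsv
Sample_ID	input_fastq1	input_fastq2	ip_fastq1	ip_fastq2	Group
sampleA	A_input_R1.fq.gz	A_input_R2.fq.gz	A_ip_R1.fq.gz	A_ip_R2.fq.gz	control
sampleB	B_input_R1.fq.gz	false	B_ip_R1.fq.gz	false	treatment
```

The compare file then names each contrast in the `group1_vs_group2` form expected by the downstream R scripts, e.g. a single line reading `control_vs_treatment`.
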
37 | 38 | ## Documentation 39 | 40 | The nf-core/meripseqpipe pipeline comes with documentation about the pipeline, found in the `docs/` directory: 41 | 42 | 1. [Installation](https://nf-co.re/usage/installation) 43 | 2. Pipeline configuration 44 | * [Local installation](https://nf-co.re/usage/local_installation) 45 | * [Adding your own system config](https://nf-co.re/usage/adding_own_config) 46 | * [Reference genomes](https://nf-co.re/usage/reference_genomes) 47 | 3. [Running the pipeline](docs/usage.md) 48 | 4. [Output and how to interpret the results](docs/output.md) 49 | 5. [Troubleshooting](https://nf-co.re/usage/troubleshooting) 50 | 51 | 52 | 53 | ## Credits 54 | 55 | MeRIPseqPipe was originally written by Kaiyu Zhu, Yu Sun, Xiaoqiong Bao. 56 | 57 | ## Contributions and Support 58 | 59 | If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). 60 | 61 | For further information or help, don't hesitate to get in touch on [Slack](https://nfcore.slack.com/channels/meripseqpipe) (you can join with [this invite](https://nf-co.re/join/slack)). 62 | 63 | ## Citation 64 | 65 | 66 | 67 | 68 | You can cite the `nf-core` publication as follows: 69 | 70 | > **The nf-core framework for community-curated bioinformatics pipelines.** 71 | > 72 | > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. 73 | > 74 | > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). 75 | > ReadCube: [Full Access Link](https://rdcu.be/b1GjZ) 76 | -------------------------------------------------------------------------------- /bin/merge_peaks_by_rank.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$1 argv 1 : designfile 3 | #$2 argv 2 : THREAD_NUM 4 | #$3 argv 3 : flag_peakCallingbygroup 5 | #$4 argv 4 : peakCalling_tools_count 6 | designfile=$1 7 | THREAD_NUM=$2 8 | flag_peakCallingbygroup=$3 9 | peakCalling_tools_count=$4 10 | 11 | # Define a multi-threaded run channel 12 | mkfifo tmp 13 | exec 9<>tmp 14 | for ((i=1;i<=${THREAD_NUM:=1};i++)) 15 | do 16 | echo >&9 17 | done 18 | 19 | function SortTransferBed() 20 | { 21 | bed_file=$1 22 | bed_anno_file=$2 23 | outdir=$3 24 | ## sort bed by pvalue for rank merge && transfer the origin region of peaks into the bedtools merged region of peaks 25 | awk '{ print $1":"$2"-"$3,$5}' ${bed_file} | sort -k1,1 |join -a1 - ${bed_anno_file} | sort -k2,2 -n -r | awk '{print $3}' > ${outdir}/tmp.${bed_file}.location 26 | } 27 | function mergebedByRank() 28 | { 29 | prefix_id=$1 30 | out_prefix=$2 31 | mkdir tmp.${out_prefix} 32 | cat *_${prefix_id}_*.bed | awk '{print $1"\t"$2*1"\t"$3*1"\t"$1":"$2"-"$3}' > tmp.${out_prefix}/bedtools_${prefix_id}_all_peaks 33 | sortBed -i tmp.${out_prefix}/bedtools_${prefix_id}_all_peaks |mergeBed -i - -c 4,4 -o collapse,count | awk '{print $1"\t"$2"\t"$3"\t"$1":"$2"-"$3"\t"$4}' > tmp.${out_prefix}/bedtools_${prefix_id} 34 | awk -F "\t" '{print $4,$5}' tmp.${out_prefix}/bedtools_${prefix_id} | awk -F '[," "]+' '{for (i=2 ;i<=NF;i++) printf $i" "$1"\n" }' | sort -k1 | uniq > tmp.${out_prefix}/bed_anno_file 35 | for bedfile in *_${prefix_id}_*.bed 36 | do 37 | SortTransferBed $bedfile tmp.${out_prefix}/bed_anno_file tmp.${out_prefix} 38 | done 39 | paste -d "\t" tmp.${out_prefix}/tmp*location > ${out_prefix}.bedlist 40 | peak_number=$(wc -l tmp.${out_prefix}/bed_anno_file | cut -d 
" " -f 1) 41 | Rscript merge_peaks_by_rank.R ${out_prefix}.bedlist ${peak_number} ${out_prefix}.bed 42 | rm -rf tmp.${out_prefix} ${out_prefix}.bedlist 43 | } 44 | 45 | # Before merging peaks, normalize all peaks of different tools 46 | for bedfile in *.bed 47 | do 48 | read -u 9 49 | { 50 | mv $bedfile tmp.$bedfile 51 | python normalize_peaks.py tmp.$bedfile $bedfile 52 | rm tmp.$bedfile 53 | echo >&9 54 | }& 55 | done 56 | wait 57 | # if the number of peakcalling tools > 2 58 | if [ $flag_peakCallingbygroup -gt 0 ]; then 59 | echo "Start to merge different tools' result of every group" 60 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 61 | for group_id in $group_list 62 | do 63 | read -u 9 64 | { 65 | if [ $peakCalling_tools_count -gt 1 ]; then 66 | mergebedByRank ${group_id} rank_merged_group_${group_id} 67 | else 68 | awk '{OFS="\t";$5=10^-$5;print }' *${group_id}*.bed |sortBed -i - > rank_merged_group_${group_id}.bed 69 | fi 70 | echo >&9 71 | }& 72 | done 73 | wait 74 | mergebedByRank merged_group rank_merged_allpeaks 75 | else 76 | echo "Start to merge different tools' result of every sample" 77 | sampleinfo_list=$(awk 'BEGIN{FS=","}NR>1{print $1","$4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 78 | for sample_group_id in ${sampleinfo_list} 79 | do 80 | read -u 9 81 | { 82 | sample_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $1}') 83 | group_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $2}') 84 | ## Adding the information of group 85 | for samplefile in *_${sample_id}_normalized.bed 86 | do 87 | mv $samplefile ${samplefile/_normalized.bed/}_${group_id}_normalized.bed 88 | done 89 | if [ $peakCalling_tools_count -gt 1 ]; then 90 | mergebedByRank ${sample_id} rank_merged_sample_${group_id}_${sample_id} 91 | else 92 | awk '{OFS="\t";$5=10^-$5;print }' *${sample_id}*.bed |sortBed -i - > rank_merged_sample_${group_id}_${sample_id}.bed 93 | fi 94 | echo >&9 95 | }& 96 | done 97 | wait 98 | echo "Start to merge different samples' result of every group" 99 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 100 | for group_id in $group_list 101 | do 102 | read -u 9 103 | { 104 | mergebedByRank merged_sample_${group_id} rank_merged_group_${group_id} 105 | echo >&9 106 | }& 107 | done 108 | wait 109 | #mergebedByRank merged_sample rank_merged_allpeaks 110 | cat *_merged_sample_*.bed | sortBed -i - |mergeBed -i - -c 4,5 -o count,mean | awk '$4>1{print $1"\t"$2"\t"$3"\t"$1":"$2"-"$3"\t"$5}' > rank_merged_allpeaks.bed 111 | fi 112 | echo "Rank merged peaks done" 113 | 114 | 115 | -------------------------------------------------------------------------------- /bin/GLM_DESeq2_DM.R: -------------------------------------------------------------------------------- 1 | library(DESeq2) 2 | library(BiocParallel) 3 | 4 | #load data 5 | args <- commandArgs(T) 6 | #args <- c("formatted_designfile.txt","shGFPa549_vs_shMettl3a549", "10","bedtools_quantification.matrix","expression.matrix") 7 | designfile <- args[1] 8 | compare_str <- args[2] 9 | THREAD_NUM <- as.numeric(args[3]) 10 | annotation.file <- args[4] 11 | input.count.matrix.file <- args[5] 12 | register(MulticoreParam(THREAD_NUM)) 13 | 14 | designtable <- read.csv(designfile, head = TRUE, stringsAsFactors=FALSE, colClasses = c("character"), check.names=F) 15 | design.matrix <- as.data.frame(designtable$Group) 16 | rownames(design.matrix) <- designtable$Sample_ID 17 | colnames(design.matrix) <- "Condition" 
18 | 19 | # Get the information of groups from compare_str 20 | if(length(unique(design.matrix$Condition)) < 2){ 21 | stop( "The number of groups is less than two, please check your designfile.") 22 | }else if( compare_str == "two_group" ){ 23 | # Only two groups in the design, so compare_str is not needed 24 | group_id_1 <- unique(design.matrix$Condition)[1] 25 | group_id_2 <- unique(design.matrix$Condition)[2] 26 | }else{ 27 | # Parse the two group IDs from compare_str (e.g. A_vs_B) 28 | group_id_1 <- strsplit(as.character(compare_str), "_vs_")[[1]][1] 29 | group_id_2 <- strsplit(as.character(compare_str), "_vs_")[[1]][2] 30 | } 31 | design.matrix <- subset(design.matrix, Condition == group_id_1 | Condition == group_id_2 ) 32 | design.matrix$Condition <- factor(design.matrix$Condition, levels = c(group_id_1,group_id_2), labels = c("control","treatment")) 33 | filelist = list.files(path = ".",pattern = ".count",full.names = T) 34 | ## Generate the matrix of peak counts 35 | peaks.count.list <- NULL 36 | for(sample_id in rownames(design.matrix)){ 37 | input_count_file <- grep(paste0("[.]",sample_id,"[.]input"),filelist,value = TRUE) 38 | input_count_table <- read.table(file = input_count_file, sep = "\t", row.names = NULL,header = T) 39 | 40 | ip_count_file <- grep(paste0("[.]",sample_id,"[.]ip"),filelist,value = TRUE) 41 | ip_count_table <- read.table(file = ip_count_file, sep = "\t", row.names = NULL, header = T) 42 | rpkm <- cbind(input_count_table[,5],ip_count_table[,5]) 43 | colnames(rpkm) <- c(paste0(sample_id,".input"),paste0(sample_id,".ip")) 44 | peaks.count.list <- cbind(peaks.count.list,rpkm) 45 | } 46 | rownames(peaks.count.list) <- ip_count_table$PeakName 47 | 48 | ## generate design matrix 49 | design.matrix$m6A <- "input" 50 | design.matrix$sample_id <- paste0(rownames(design.matrix),".input") 51 | design.matrix_ip <- design.matrix 52 | design.matrix_ip$m6A <- "IP" 53 | design.matrix_ip$sample_id <- paste0(rownames(design.matrix_ip),".ip") 54 | design.matrix <- rbind(design.matrix,design.matrix_ip) 55 | rownames(design.matrix) <- design.matrix$sample_id 56 | design.matrix$m6A <- factor(design.matrix$m6A) 57 | design.matrix <- design.matrix[colnames(peaks.count.list),] 58 | 59 | run.deseq2 <- function(cnts,meta){ 60 | inf.dds <- DESeq2::DESeqDataSetFromMatrix(countData = cnts,colData = meta,design = ~Condition+m6A+Condition:m6A) 61 | inf.dds.LRT <- DESeq2::DESeq(inf.dds,betaPrior=FALSE, test="LRT", 62 | full=~Condition+m6A+Condition:m6A,reduced=~Condition+m6A) 63 | inf.dds.res <- DESeq2::results(inf.dds.LRT) 64 | results <- inf.dds.res 65 | colnames(results) <- c("baseMean", "log2FC", "lfcSE", "stat", "pvalue", "padj") 66 | return(results) 67 | } 68 | 69 | run.deseq2.4l2fc <- function(cnts,meta,label){ 70 | dds <- DESeq2::DESeqDataSetFromMatrix(cnts,meta,formula(~Condition)) 71 | dds$Condition <- factor(dds$Condition, levels=c('control','treatment')) 72 | gene.col2check <- meta$Condition 73 | dds$Condition <- droplevels(dds$Condition) 74 | gene.deseq <- DESeq2::DESeq(dds) 75 | gene.deseq <- DESeq2::results(gene.deseq) 76 | gene.results <- gene.deseq[,c("log2FoldChange","pvalue","padj")] 77 | colnames(gene.results) <- paste0(label,c(".l2fc",".p",".padj")) 78 | return(gene.results) 79 | } 80 | 81 | results <- run.deseq2(peaks.count.list,design.matrix) 82 | ip.peaks.count <- read.table(annotation.file,sep = "\t", row.names = 1, check.names = F) 83 | input.gene.count <- read.table(input.count.matrix.file,sep = " ",row.names = 1, check.names = F) 84 | colnames(input.gene.count) <- 
paste0(colnames(input.gene.count),".input") 85 | peaks.de <- run.deseq2.4l2fc(peaks.count.list[,rownames(design.matrix)[design.matrix$m6A == "IP"]], 86 | design.matrix[rownames(design.matrix)[design.matrix$m6A == "IP"],],'peak') 87 | gene.de <- run.deseq2.4l2fc(input.gene.count[,rownames(design.matrix)[design.matrix$m6A == "input"]], 88 | design.matrix[rownames(design.matrix)[design.matrix$m6A == "input"],],'gene') 89 | peaks.de$gene_id <- ip.peaks.count[rownames(peaks.de),"ID"] 90 | peaks.de$gene.l2fc <- gene.de[peaks.de$gene_id,]$gene.l2fc 91 | peaks.de$diff.l2fc <- peaks.de$peak.l2fc - peaks.de$gene.l2fc 92 | results$diff.l2fc <- peaks.de[rownames(results),]$diff.l2fc 93 | 94 | write.table(results,file = paste0("DESeq2_diffm6A_",group_id_1, "_",group_id_2,".txt") ,sep = "\t",quote = F) 95 | -------------------------------------------------------------------------------- /bin/GLM_edgeR_DM.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript GLM_edgeR_DM.R 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | ### compare_str: Comparison design (e.g. A_vs_B) 5 | 6 | #####GLM model### 7 | #edgeR 8 | library(edgeR) 9 | 10 | #load data 11 | args <- commandArgs(T) 12 | designfile <- args[1] 13 | compare_str <- args[2] 14 | annotation.file <- args[3] 15 | input.count.matrix.file <- args[4] 16 | 17 | designtable <- read.csv(designfile, head = TRUE, stringsAsFactors=FALSE, colClasses = c("character"), check.names=F) 18 | design.matrix <- as.data.frame(designtable$Group) 19 | rownames(design.matrix) <- designtable$Sample_ID 20 | colnames(design.matrix) <- "Condition" 21 | 22 | # Get the information of groups from compare_str 23 | if(length(unique(design.matrix$Condition)) < 2){ 24 | stop( "The number of groups is less than two, please check your designfile.") 25 | }else if( compare_str == "two_group" ){ 26 | # Only two groups in the design, so compare_str is not needed 27 | group_id_1 <- unique(design.matrix$Condition)[1] 28 | group_id_2 <- unique(design.matrix$Condition)[2] 29 | }else{ 30 | # Parse the two group IDs from compare_str (e.g. A_vs_B) 31 | group_id_1 <- strsplit(as.character(compare_str), "_vs_")[[1]][1] 32 | group_id_2 <- strsplit(as.character(compare_str), "_vs_")[[1]][2] 33 | } 34 | design.matrix <- subset(design.matrix, Condition == group_id_1 | Condition == group_id_2 ) 35 | design.matrix$Condition <- factor(design.matrix$Condition, levels = c(group_id_1,group_id_2), labels = c("control","treatment")) 36 | filelist = list.files(path = ".",pattern = ".count",full.names = T) 37 | ## Generate the matrix of peak counts 38 | peaks.count.list <- NULL 39 | for(sample_id in row.names(design.matrix)){ 40 | input_count_file <- grep(paste0("[.]",sample_id,"[.]input"),filelist,value = TRUE) 41 | input_count_table <- read.table(file = input_count_file, sep = "\t", row.names = NULL,header = T) 42 | 43 | ip_count_file <- grep(paste0("[.]",sample_id,"[.]ip"),filelist,value = TRUE) 44 | ip_count_table <- read.table(file = ip_count_file, sep = "\t", row.names = NULL, header = T) 45 | rpkm <- cbind(input_count_table[,5],ip_count_table[,5]) 46 | colnames(rpkm) <- c(paste0(sample_id,".input"),paste0(sample_id,".ip")) 47 | peaks.count.list <- cbind(peaks.count.list,rpkm) 48 | } 49 | rownames(peaks.count.list) <- ip_count_table$PeakName 50 | 51 | ## generate design matrix 52 | design.matrix$m6A <- "input" 53 | design.matrix$sample_id <- paste0(rownames(design.matrix),".input") 54 | design.matrix_ip <- design.matrix 55 | 
design.matrix_ip$m6A <- "IP" 56 | design.matrix_ip$sample_id <- paste0(rownames(design.matrix_ip),".ip") 57 | design.matrix <- rbind(design.matrix,design.matrix_ip) 58 | rownames(design.matrix) <- design.matrix$sample_id 59 | design.matrix$m6A <- factor(design.matrix$m6A) 60 | design.matrix <- design.matrix[colnames(peaks.count.list),] 61 | 62 | run.edger <- function(cnts,meta){ 63 | #add count filter? 64 | er.design <- model.matrix(~meta$Condition+meta$m6A+meta$Condition*meta$m6A) 65 | er.dgelist <- edgeR::DGEList(counts=cnts,group=meta$Condition) 66 | er.dgelist <- edgeR::estimateDisp(er.dgelist, design=er.design) 67 | er.fit <- edgeR::glmFit(er.dgelist, er.design) 68 | er.lrt <- edgeR::glmLRT(er.fit, coef=4) 69 | #hist(er.lrt$table$PValue) er.lrt$table$logFC, 70 | results <- er.lrt$table 71 | results$padj <- p.adjust(results$PValue,"BH") 72 | colnames(results) <- c("log2FC","logCPM","LR","pvalue","padj") 73 | return(results) 74 | } 75 | run.deseq2.4l2fc <- function(cnts,meta,label){ 76 | dds <- DESeq2::DESeqDataSetFromMatrix(cnts,meta,formula(~Condition)) 77 | dds$Condition <- factor(dds$Condition, levels=c('control','treatment')) 78 | gene.col2check <- meta$Condition 79 | dds$Condition <- droplevels(dds$Condition) 80 | gene.deseq <- DESeq2::DESeq(dds) 81 | gene.deseq <- DESeq2::results(gene.deseq) 82 | gene.results <- gene.deseq[,c("log2FoldChange","pvalue","padj")] 83 | colnames(gene.results) <- paste0(label,c(".l2fc",".p",".padj")) 84 | return(gene.results) 85 | } 86 | 87 | results <- run.edger(peaks.count.list,design.matrix) 88 | ip.peaks.count <- read.table(annotation.file,sep = "\t", row.names = 1, check.names = F) 89 | input.gene.count <- read.table(input.count.matrix.file,sep = " ",row.names = 1, check.names = F) 90 | colnames(input.gene.count) <- paste0(colnames(input.gene.count),".input") 91 | peaks.de <- run.deseq2.4l2fc(peaks.count.list[,rownames(design.matrix)[design.matrix$m6A == "IP"]], 92 | design.matrix[rownames(design.matrix)[design.matrix$m6A == "IP"],],'peak') 93 | gene.de <- run.deseq2.4l2fc(input.gene.count[,rownames(design.matrix)[design.matrix$m6A == "input"]], 94 | design.matrix[rownames(design.matrix)[design.matrix$m6A == "input"],],'gene') 95 | peaks.de$gene_id <- ip.peaks.count[rownames(peaks.de),"ID"] 96 | peaks.de$gene.l2fc <- gene.de[peaks.de$gene_id,]$gene.l2fc 97 | peaks.de$diff.l2fc <- peaks.de$peak.l2fc - peaks.de$gene.l2fc 98 | results$diff.l2fc <- peaks.de[rownames(results),]$diff.l2fc 99 | write.table(results,file = paste0("edgeR_diffm6A_",group_id_1, "_",group_id_2,".txt") ,sep = "\t",quote = F) 100 | 101 | -------------------------------------------------------------------------------- /docs/output.md: -------------------------------------------------------------------------------- 1 | # MeRIPseqPipe: Output 2 | 3 | This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. 
4 | 5 | 6 | 7 | ## Pipeline overview 8 | 9 | The pipeline is built using [Nextflow](https://www.nextflow.io/) 10 | and processes data using the following steps: 11 | 12 | - [MeRIPseqPipe: Output](#meripseqpipe-output) 13 | - [Pipeline overview](#pipeline-overview) 14 | - [Quality Control](#quality-control) 15 | - [Fastp](#fastp) 16 | - [FastQC](#fastqc) 17 | - [RSeQC](#rseqc) 18 | - [MultiQC](#multiqc) 19 | - [Align results](#align-results) 20 | - [STAR](#star) 21 | - [BWA](#bwa) 22 | - [TopHat2](#tophat2) 23 | - [HISAT2](#hisat2) 24 | - [SAMtools](#samtools) 25 | - [PeakCalling](#peakcalling) 26 | - [MeTPeak](#metpeak) 27 | - [MATK](#matk) 28 | - [Meyer](#meyer) 29 | - [MACS2](#macs2) 30 | - [PeakMerged](#peakmerged) 31 | - [RobustRankAggreg](#robustrankaggreg) 32 | - [MSPC](#mspc) 33 | - [BEDtools](#bedtools) 34 | - [M6A sites prediction](#m6a-sites-prediction) 35 | - [MATK](#matk) 36 | - [Differential Methylation Analysis](#differential-methylation-analysis) 37 | - [QNB](#qnb) 38 | - [MATK](#matk) 39 | - [DESeq2_DM](#deseq2_dm) 40 | - [edgeR_DM](#edger_dm) 41 | - [Differential Expression Analysis](#differential-expression-analysis) 42 | - [featureCounts](#featurecounts) 43 | - [DESeq2_DE](#deseq2_de) 44 | - [edgeR_DE](#edger_de) 45 | - [Reports](#reports) 46 | 47 | Several R packages are used for the downstream analysis. 48 | 49 | ## Quality Control 50 | 51 | **Output directory: `results/QC/`** 52 | 53 | ### Fastp 54 | 55 | [Fastp](https://github.com/OpenGene/fastp) 56 | 57 | **Output directory: `results/QC/fastp`** 58 | 59 | ### FastQC 60 | 61 | [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your reads. It provides information about the quality score distribution across your reads and the per base sequence content (%T/A/G/C), and reports adapter contamination and other overrepresented sequences. 62 | 63 | For further reading and documentation see the [FastQC help](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). 64 | 65 | > **NB:** The FastQC plots displayed in the MultiQC report show _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, check the fastp reports in the `results/QC/fastp` directory. 66 | 67 | **Output directory: `results/QC/fastqc`** 68 | 69 | - `sample_fastqc.html` 70 | - FastQC report, containing quality metrics for your untrimmed raw fastq files 71 | - `zips/sample_fastqc.zip` 72 | - zip file containing the FastQC report, tab-delimited data file and plot images 73 | 74 | ### RSeQC 75 | 76 | [RSeQC](http://rseqc.sourceforge.net/) 77 | 78 | **Output directory: `results/QC/RSeQC`** 79 | 80 | ### MultiQC 81 | 82 | [MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report, and further statistics are available within the report data directory. 83 | 84 | The pipeline has special steps which allow the software versions used to be reported in the MultiQC output for future traceability. 
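Should you want to rebuild the report by hand (for example after tweaking `assets/multiqc_config.yaml`), MultiQC can simply be rerun over the output directory; a minimal sketch, assuming the default `results/` location:

```bash
multiqc --config assets/multiqc_config.yaml results/
```
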
85 | 86 | **Output directory: `results/QC/multiqc`** 87 | 88 | - `Project_multiqc_report.html` 89 | - MultiQC report - a standalone HTML file that can be viewed in your web browser 90 | - `Project_multiqc_data/` 91 | - Directory containing parsed statistics from the different tools used in the pipeline 92 | 93 | For more information about how to use MultiQC reports, see [http://multiqc.info](http://multiqc.info) 94 | 95 | ## Align results 96 | 97 | **Output directory: `results/QC/multiqc`** 98 | 99 | ### STAR 100 | 101 | [STAR](https://github.com/alexdobin/STAR) 102 | 103 | ### BWA 104 | 105 | [BWA](https://github.com/lh3/bwa) 106 | 107 | ### TopHat2 108 | 109 | [TopHat2](https://ccb.jhu.edu/software/tophat/) 110 | 111 | ### HISAT2 112 | 113 | [HISAT2](https://ccb.jhu.edu/software/hisat2/) 114 | 115 | ## SAMtools 116 | 117 | [SAMtools](http://www.htslib.org/) 118 | 119 | ## PeakCalling 120 | 121 | ### MeTPeak 122 | 123 | [MeTPeak](https://github.com/compgenomics/MeTPeak) 124 | 125 | ### MATK 126 | 127 | [MATK](http://matk.renlab.org) 128 | 129 | ### Meyer 130 | 131 | [Meyer](http://matk.renlab.org) 132 | 133 | ### MACS2 134 | 135 | [MACS2](https://github.com/taoliu/MACS) 136 | 137 | ## PeakMerged 138 | 139 | ### RobustRankAggreg 140 | 141 | [RobustRankAggreg](https://cran.r-project.org/web/packages/RobustRankAggreg/index.html) 142 | 143 | ### MSPC 144 | 145 | [MSPC](https://github.com/Genometric/MSPC) 146 | 147 | ### BEDtools 148 | 149 | [BEDtools](https://bedtools.readthedocs.io/en/latest/index.html) 150 | 151 | ## M6A sites prediction 152 | 153 | [MATK](http://matk.renlab.org) 154 | 155 | ## Differential Methylation Analysis 156 | 157 | ### QNB 158 | 159 | [QNB](https://cran.r-project.org/src/contrib/Archive/QNB/) 160 | 161 | ### MATK 162 | [MATK](http://matk.renlab.org) 163 | 164 | ### DESeq2_DM 165 | 166 | [DESeq2](http://bioconductor.org/packages/DESeq2/) 167 | 168 | ### edgeR_DM 169 | 170 | [edgeR](http://bioconductor.org/packages/edgeR/) 171 | 172 | ## Differential Expression Analysis 173 | 174 | ### featureCounts 175 | 176 | [featureCounts](http://subread.sourceforge.net) 177 | 178 | ### DESeq2_DE 179 | 180 | [DESeq2](http://bioconductor.org/packages/DESeq2/) 181 | 182 | ### edgeR_DE 183 | 184 | [edgeR](http://bioconductor.org/packages/edgeR/) 185 | 186 | ## Reports 187 | -------------------------------------------------------------------------------- /lib/LikeletUtils.groovy: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env groovy 2 | import static nextflow.Nextflow.file 3 | import nextflow.Channel 4 | 5 | class LikeletUtils { 6 | 7 | // adjust command colors 8 | static String ANSI_RESET = "\u001B[0m" 9 | static String ANSI_BLACK = "\u001B[30m" 10 | static String ANSI_RED = "\u001B[31m" 11 | static String ANSI_GREEN = "\u001B[32m" 12 | static String ANSI_YELLOW = "\u001B[33m" 13 | static String ANSI_BLUE = "\u001B[34m" 14 | static String ANSI_PURPLE = "\u001B[35m" 15 | static String ANSI_CYAN = "\u001B[36m" 16 | static String ANSI_WHITE = "\u001B[37m" 17 | 18 | static def print_red = { str -> LikeletUtils.ANSI_RED + str + LikeletUtils.ANSI_RESET } 19 | static def print_black = { str -> LikeletUtils.ANSI_BLACK + str + LikeletUtils.ANSI_RESET } 20 | static def print_green = { str -> LikeletUtils.ANSI_GREEN + str + LikeletUtils.ANSI_RESET } 21 | static def print_yellow = { str -> LikeletUtils.ANSI_YELLOW + str + LikeletUtils.ANSI_RESET } 22 | static def print_blue = { str -> LikeletUtils.ANSI_BLUE + str + LikeletUtils.ANSI_RESET } 23 | static def print_cyan = { str -> 
LikeletUtils.ANSI_CYAN + str + LikeletUtils.ANSI_RESET } 24 | static def print_purple = { str -> LikeletUtils.ANSI_PURPLE + str + LikeletUtils.ANSI_RESET } 25 | static def print_white = { str -> LikeletUtils.ANSI_WHITE + str + LikeletUtils.ANSI_RESET } 26 | 27 | // Check if a row has the expected number of item, adjusted from Sarek 28 | static def checkNumberOfItem(row, number) { 29 | if (row.size() != number) exit 1, println("Malformed row in TSV file: ${row}, see --help for more information") 30 | return true 31 | } 32 | 33 | // Return status [0,1] 34 | // 0 == Normal, 1 == Tumor 35 | static def returnStatus(it) { 36 | if (!(it in [0, 1])) exit 1, println("Status is not recognized in TSV file: ${it}, see --help for more information") 37 | return it 38 | } 39 | 40 | // Return file if it exists 41 | static def returnFile(it) { 42 | if (!file(it).exists()) exit 1, println("Missing file in TSV file: ${it}, see --help for more information") 43 | return file(it) 44 | } 45 | 46 | static def sysucc_ascii() { 47 | print LikeletUtils.print_yellow(" ▄▄▄▄▄▄▄▄▄▄▄ ▄ ▄ ▄▄▄▄▄▄▄▄▄▄▄ ▄ ▄ ▄▄▄▄▄▄▄▄▄▄▄ ▄▄▄▄▄▄▄▄▄▄▄ \n") 48 | print LikeletUtils.print_yellow("▐░░░░░░░░░░░▌▐░▌ ▐░▌▐░░░░░░░░░░░▌▐░▌ ▐░▌▐░░░░░░░░░░░▌▐░░░░░░░░░░░▌\n") 49 | print LikeletUtils.print_yellow("▐░█▀▀▀▀▀▀▀▀▀ ▐░▌ ▐░▌▐░█▀▀▀▀▀▀▀▀▀ ▐░▌ ▐░▌▐░█▀▀▀▀▀▀▀▀▀ ▐░█▀▀▀▀▀▀▀▀▀ \n") 50 | print LikeletUtils.print_yellow("▐░▌ ▐░▌ ▐░▌▐░▌ ▐░▌ ▐░▌▐░▌ ▐░▌ \n") 51 | print LikeletUtils.print_yellow("▐░█▄▄▄▄▄▄▄▄▄ ▐░█▄▄▄▄▄▄▄█░▌▐░█▄▄▄▄▄▄▄▄▄ ▐░▌ ▐░▌▐░▌ ▐░▌ \n") 52 | print LikeletUtils.print_yellow("▐░░░░░░░░░░░▌▐░░░░░░░░░░░▌▐░░░░░░░░░░░▌▐░▌ ▐░▌▐░▌ ▐░▌ \n") 53 | print LikeletUtils.print_yellow(" ▀▀▀▀▀▀▀▀▀█░▌ ▀▀▀▀█░█▀▀▀▀ ▀▀▀▀▀▀▀▀▀█░▌▐░▌ ▐░▌▐░▌ ▐░▌ \n") 54 | print LikeletUtils.print_yellow(" ▐░▌ ▐░▌ ▐░▌▐░▌ ▐░▌▐░▌ ▐░▌ \n") 55 | print LikeletUtils.print_yellow(" ▄▄▄▄▄▄▄▄▄█░▌ ▐░▌ ▄▄▄▄▄▄▄▄▄█░▌▐░█▄▄▄▄▄▄▄█░▌▐░█▄▄▄▄▄▄▄▄▄ ▐░█▄▄▄▄▄▄▄▄▄ \n") 56 | print LikeletUtils.print_yellow("▐░░░░░░░░░░░▌ ▐░▌ ▐░░░░░░░░░░░▌▐░░░░░░░░░░░▌▐░░░░░░░░░░░▌▐░░░░░░░░░░░▌\n") 57 | print LikeletUtils.print_yellow(" ▀▀▀▀▀▀▀▀▀▀▀ ▀ ▀▀▀▀▀▀▀▀▀▀▀ ▀▀▀▀▀▀▀▀▀▀▀ ▀▀▀▀▀▀▀▀▀▀▀ ▀▀▀▀▀▀▀▀▀▀▀ \n") 58 | } 59 | // extrct fastq information from tsvFile 60 | static def extractData(tsvFile) { 61 | // Channeling the TSV file containing FASTQ. 
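        // Note: a literal "false" in a FASTQ-pair's second column marks that library
        // as single-end; ".gz" and ".bam" inputs are detected from the file suffix below.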
62 | // Format is: "Sample_ID  input_fastq1  input_fastq2  ip_fastq1  ip_fastq2  Group" (tab-separated, one header row) 63 | def inputData = Channel.from(tsvFile) 64 | .splitCsv(sep: '\t', skip: 1) 65 | .map { row -> 66 | LikeletUtils.checkNumberOfItem(row, 6) 67 | def idSample = row[0] 68 | def fastqFile1 = file(row[1]) 69 | def fastqFile2 = file(row[2]) 70 | def group = row[5] 71 | def input = true 72 | def gzip = false 73 | def readsSingle = false 74 | def filetype = "fastq" 75 | if (row[1].endsWith(".gz") == true ){ 76 | gzip = true 77 | }else if (row[1].endsWith(".bam") == true ){ 78 | filetype = "bam" 79 | } 80 | if (row[2].endsWith("false") == true){ 81 | readsSingle = true 82 | [idSample, [fastqFile1], readsSingle, gzip, input, group, filetype] 83 | } else { 84 | [idSample, [fastqFile1, fastqFile2], readsSingle, gzip, input, group, filetype] 85 | } 86 | } 87 | def ipData = Channel.from(tsvFile) 88 | .splitCsv(sep: '\t', skip: 1) 89 | .map { row -> 90 | LikeletUtils.checkNumberOfItem(row, 6) 91 | def idSample = row[0] 92 | def fastqFile1 = file(row[3]) 93 | def fastqFile2 = file(row[4]) 94 | def group = row[5] 95 | def input = false 96 | def gzip = false 97 | def readsSingle = false 98 | def filetype = "fastq" 99 | if (row[3].endsWith(".gz") == true){ 100 | gzip = true 101 | }else if (row[3].endsWith(".bam") == true){ 102 | filetype = "bam" 103 | } 104 | if (row[4].endsWith("false") == true){ 105 | readsSingle = true 106 | [idSample, [fastqFile1], readsSingle, gzip, input, group, filetype] 107 | } else { 108 | [idSample, [fastqFile1, fastqFile2], readsSingle, gzip, input, group, filetype] 109 | } 110 | } 111 | return inputData.mix(ipData) 112 | } 113 | static def addstringToalign(String str,int num){ 114 | if(str.length() < num) { 115 | def numSpace = num-str.length() 116 | 117 | numSpace.times{ 118 | str += ' ' 119 | } 120 | } 121 | str 122 | } 123 | } 124 | 125 | 126 | -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | /* 2 | * ------------------------------------------------- 3 | * nf-core/meripseqpipe Nextflow config file 4 | * ------------------------------------------------- 5 | * Default config options for all environments. 
6 | */ 7 | 8 | // Global default params, used in configs 9 | params { 10 | // Pipeline Options 11 | // Input files 12 | designfile = "$baseDir/designfile.tsv" 13 | comparefile = "$baseDir/comparefile.txt" 14 | 15 | // Setting main parameters of analysis mode 16 | stranded = "no" // "yes" OR "no" OR "reverse" 17 | mapq_cutoff = 20 // "255" means unique mapping reads 18 | aligners = "star" // "star" OR "bwa" OR "tophat2" OR "hisat2" OR "none" 19 | peakCalling_mode = "independence" // "group" OR "independence" 20 | peakMerged_mode = "rank" // "rank" OR "macs2" OR "MATK" OR "metpeak" OR "mspc" 21 | expression_analysis_mode = "DESeq2" // "DESeq2" OR "edgeR" OR "none" 22 | methylation_analysis_mode = "QNB" // "MATK" OR "QNB" OR "Wilcox-test" OR "MeTDiff" OR "edgeR" OR "DESeq2" 23 | 24 | // Reference genomes 25 | matk_jar = "$baseDir/MATK-1.0.jar" 26 | fasta = "/data1/zhuky/test-datasets/reference/TEST.fa" 27 | gtf = "/data1/zhuky/test-datasets/reference/TEST.gtf" 28 | rRNA_fasta = false 29 | tophat2_index = false 30 | hisat2_index = false 31 | bwa_index = false 32 | star_index = false 33 | 34 | // Other command line parameters 35 | peak_threshold = "medium" // "low" OR "medium" OR "high" 36 | saveReference = false 37 | outdir = "$baseDir/results" 38 | tracedir = "${params.outdir}/pipeline_info/" 39 | name = false 40 | multiqc_config = "$baseDir/assets/multiqc_config.yaml" 41 | email = false 42 | email_on_fail = false 43 | maxMultiqcEmailFileSize = 25.MB 44 | plaintext_email = false 45 | monochrome_logs = false 46 | help = false 47 | igenomes_base = 's3://ngi-igenomes/igenomes/' 48 | tracedir = "${params.outdir}/pipeline_info" 49 | igenomes_ignore = false 50 | custom_config_version = 'master' 51 | custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" 52 | hostnames = false 53 | config_profile_description = false 54 | config_profile_contact = false 55 | config_profile_url = false 56 | 57 | // Defaults only, expecting to be overwritten 58 | max_memory = 128.GB 59 | max_cpus = 16 60 | max_time = 240.h 61 | 62 | // skipping modes Options 63 | skip_sort = false 64 | skip_peakCalling = false 65 | skip_diffpeakCalling = false 66 | skip_annotation = false 67 | skip_m6Aprediction = false 68 | skip_qc = false 69 | skip_motif = false 70 | skip_filterrRNA = false 71 | 72 | // skipping tools Options 73 | // PeakCalling tools 74 | skip_metpeak = false 75 | skip_macs2 = false 76 | skip_matk = false 77 | skip_meyer = false 78 | // QC Steps 79 | skip_fastp = false 80 | skip_fastqc = false 81 | skip_rseqc = false 82 | skip_createbedgraph = true 83 | skip_genebody_coverage = true 84 | 85 | // ignore 86 | input = false 87 | readPaths = false 88 | single_end = false 89 | gzip = false 90 | } 91 | 92 | // Container slug. Stable releases should specify release tag! 93 | // Developmental code should specify :dev 94 | process.container = 'kingzhuky/meripseqpipe:dev' 95 | 96 | // Load base.config by default for all pipelines 97 | includeConfig 'conf/base.config' 98 | 99 | // Load nf-core custom profiles from different Institutions 100 | //try { 101 | // includeConfig "${params.custom_config_base}/nfcore_custom.config" 102 | //} catch (Exception e) { 103 | // System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") 104 | //} 105 | 106 | // Avoid this error: 107 | // WARNING: Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap. 
108 | // Testing this in nf-core after discussion here https://github.com/nf-core/tools/pull/351, once this is established and works well, nextflow might implement this behavior as new default. 109 | docker.runOptions = '-u \$(id -u):\$(id -g)' 110 | 111 | // Load igenomes.config if required 112 | if (!params.igenomes_ignore) { 113 | includeConfig 'conf/igenomes.config' 114 | } 115 | profiles { 116 | conda { process.conda = "$baseDir/environment.yml" } 117 | docker { 118 | params.matk_jar = "/MATK-1.0.jar" 119 | docker.enabled = true 120 | docker.runOptions = '-u $(id -u):$(id -g)' 121 | } 122 | C2 { includeConfig 'conf/C2.config'} 123 | debug { process.beforeScript = 'echo $HOSTNAME' } 124 | test { includeConfig 'conf/test.config' } 125 | test_mixed { includeConfig 'conf/test_mixed.config' } 126 | test_bam { includeConfig 'conf/test_bam.config' } 127 | none { 128 | // Don't load any config (for use with custom home configs) 129 | } 130 | } 131 | 132 | // Capture exit codes from upstream processes when piping 133 | process.shell = ['/bin/bash', '-euo', 'pipefail'] 134 | 135 | timeline { 136 | enabled = true 137 | file = "${params.tracedir}/execution_timeline.html" 138 | } 139 | report { 140 | enabled = true 141 | file = "${params.tracedir}/execution_report.html" 142 | } 143 | trace { 144 | enabled = true 145 | file = "${params.tracedir}/execution_trace.txt" 146 | } 147 | dag { 148 | enabled = true 149 | file = "${params.tracedir}/pipeline_dag.svg" 150 | } 151 | 152 | manifest { 153 | name = 'MeRIPseqPipe' 154 | author = 'Kaiyu Zhu, Yu Sun, Xiaoqiong Bao' 155 | homePage = 'https://github.com/canceromics/MeRIPseqPipe' 156 | description = 'MeRIP-seq analysis pipeline arranged multiple alignment tools, peakCalling tools, Merge Peaks\' methods and methylation analysis methods.' 157 | mainScript = 'main.nf' 158 | nextflowVersion = '>=19.04.0' 159 | version = '1.0dev' 160 | } 161 | 162 | // Function to ensure that resource requirements don't go beyond 163 | // a maximum limit 164 | def check_max(obj, type) { 165 | if (type == 'memory') { 166 | try { 167 | if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) 168 | return params.max_memory as nextflow.util.MemoryUnit 169 | else 170 | return obj 171 | } catch (all) { 172 | println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" 173 | return obj 174 | } 175 | } else if (type == 'time') { 176 | try { 177 | if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) 178 | return params.max_time as nextflow.util.Duration 179 | else 180 | return obj 181 | } catch (all) { 182 | println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" 183 | return obj 184 | } 185 | } else if (type == 'cpus') { 186 | try { 187 | return Math.min( obj, params.max_cpus as int ) 188 | } catch (all) { 189 | println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! 
Using default value: $obj" 190 | return obj 191 | } 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /bin/QC_Peaks_Report.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript QC_Peaks_Report.R 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | library(ggplot2) 5 | library(ggseqlogo) 6 | library(reshape2) 7 | options(stringsAsFactors = FALSE) 8 | 9 | args <- commandArgs(T) 10 | #args <- c("formatted_designfile.txt", "mspc", "group","QCPeaksPlot.RData") 11 | designfile <- args[1] #"formatted_designfile.txt" 12 | peakMerged.mode <- args[2]#rank 13 | peakCalling.mode <- args[3]#"group" 14 | output.Rdata <- args[4]#"QCPeaksPlot.RData" 15 | designtable <- read.csv(designfile, head = TRUE, colClasses = c("character")) 16 | 17 | ## Peaks Distribution 18 | pdf(file = paste0("distribution.plot_",peakMerged.mode,".pdf"),paper = "USr") 19 | PlotPeaksDitr <- function(files.list, suffix = "[.]anno[.]txt"){ 20 | distribute_df <- NULL 21 | for( file in files.list ){ 22 | anno.table <- read.table(file, header=F, sep="\t", quote="", stringsAsFactors = F)[,c(1,2,3,15,11,12,13,14,17)] 23 | colnames(anno.table) <- c("Chr","ChrStart","ChrEnd","ID","Gene_symbol","Coding","Location","Relative_distance","RNA_type") 24 | peak.freq = c(as.numeric(as.vector(anno.table[which(anno.table[,7]=="5UTR"),8])), 25 | as.numeric(as.vector(anno.table[which(anno.table[,7]=="CDS"),8]))+100, 26 | as.numeric(as.vector(anno.table[which(anno.table[,7]=="3UTR"),8]))+200) 27 | freq = data.frame(Freq = peak.freq, group = strsplit(file,suffix)[[1]]) 28 | distribute_df = rbind(distribute_df, freq) 29 | } 30 | ggplot(distribute_df, aes(x=Freq, colour = group))+ 31 | geom_line(stat = "density", size=1, adjust = 0.8)+ 32 | scale_x_continuous(breaks = c(50,150,250), labels = c("5'UTR", "CDS", "3'UTR"))+ #axis labels 33 | labs(y="m6A coding peak density",x="Region of gene")+ 34 | geom_vline(xintercept = c(100,200), linetype = "dashed")+ 35 | theme_bw()+ 36 | theme(panel.grid =element_blank(),#remove grid line 37 | axis.title.x = element_text(size = 20, angle = 0, face = "plain", colour = "black"), 38 | axis.title.y = element_text(size = 20, angle = 90, face = "plain", colour = "black"), 39 | axis.text.x = element_text(size = 15,colour = "black"), 40 | axis.text.y = element_text(size = 15,colour = "black"), 41 | aspect.ratio=1, 42 | axis.ticks.x = element_blank()) #remove ticks 43 | } 44 | 45 | anno.files.list <- dir(pattern = "[.]anno[.]txt") 46 | ### barplot 47 | total.distribute <- NULL 48 | for( file in anno.files.list ){ 49 | anno.table <- read.table(file, header=F, sep="\t", quote="", stringsAsFactors = F)[,c(1,2,3,15,11,12,13,14,17)] 50 | colnames(anno.table) <- c("Chr","ChrStart","ChrEnd","ID","Gene_symbol","Coding","Location","Relative_distance","RNA_type") 51 | anno.table[anno.table$Location == "CDS" & anno.table$Relative_distance >= 95,7] <- "Stop Codon" 52 | anno.table[anno.table$Location == "3UTR" & anno.table$Relative_distance <= 5,7] <- "Stop Codon" 53 | group.name <- strsplit(file,"[.]anno[.]txt")[[1]] 54 | freq = data.frame(Location = anno.table[anno.table$Coding == "coding",7], group = group.name) 55 | total.distribute = rbind(total.distribute, freq) 56 | } 57 | distribute.table <- melt(table(total.distribute)) 58 | distribute.table$Location <- factor(distribute.table$Location,levels = c("intron","3UTR","Stop Codon","CDS","5UTR")) 59 | col <- 
c('plum1','pink2','#58B2DC',"#51A8DD","#005CAF") 60 | distribute.barplot <- ggplot(distribute.table,aes(group, value, fill = Location)) + 61 | geom_bar(stat="identity",position = 'fill') + coord_flip() + 62 | ggtitle("Peaks Distribution") + 63 | scale_y_continuous(expand = c(0, 0)) + 64 | guides(fill = guide_legend(reverse = TRUE)) + 65 | scale_fill_brewer() + 66 | theme(panel.grid =element_blank(), #remove grid line 67 | title = element_text(size = 15, angle = 0, face = "plain", colour = "black"), 68 | axis.text.x = element_text(size = 12,colour = "black"), 69 | axis.text.y = element_text(size = 12,colour = "black"), 70 | panel.background = element_rect(fill = "transparent",colour = NA), 71 | axis.title = element_blank(), 72 | axis.ticks.x = element_blank()) #remove ticks 73 | distribute.barplot.count <- ggplot(distribute.table,aes(group, value, fill = Location)) + 74 | geom_bar(stat="identity") + coord_flip() + 75 | ggtitle("Peaks Distribution") + 76 | scale_y_continuous(expand = c(0, 0)) + 77 | guides(fill = guide_legend(reverse = TRUE)) + 78 | scale_fill_brewer() + 79 | theme(panel.grid =element_blank(), #remove grid line 80 | title = element_text(size = 15, angle = 0, face = "plain", colour = "black"), 81 | axis.text.x = element_text(size = 12,colour = "black"), 82 | axis.text.y = element_text(size = 12,colour = "black"), 83 | panel.background = element_rect(fill = "transparent",colour = NA), 84 | axis.title = element_blank(), 85 | axis.ticks.x = element_blank()) #remove ticks 86 | print(distribute.barplot) 87 | print(distribute.barplot.count) 88 | ### Curve 89 | sample.plots.list <- NULL 90 | sample.list <- if(peakCalling.mode == "group") designtable$Group else designtable$Sample_ID 91 | for( sample in sample.list ){ 92 | sample.files.list <- grep(paste0("_",sample,"_normalized"), anno.files.list, value = T) 93 | sample.plots.list[[sample]] <- PlotPeaksDitr(sample.files.list, "_normalized[.]anno[.]txt") 94 | print(sample.plots.list[[sample]]) 95 | } 96 | merged.files.list <- grep("merged", anno.files.list, value = T) 97 | merged.plot <- PlotPeaksDitr(merged.files.list) 98 | print(merged.plot) 99 | dev.off() 100 | 101 | ## Peaks' motif 102 | pdf(file = paste0("motif.plot_",peakMerged.mode,".pdf"),paper = "USr") 103 | ggplot2.multiplot <- function(..., plotlist=NULL, cols=2) { 104 | # Make a list from the ... 
arguments and plotlist 105 | plots <- c(list(...), plotlist) 106 | numPlots = length(plots) 107 | 108 | # Make the panel 109 | plotCols = cols # Number of columns of plots 110 | plotRows = ceiling(numPlots/plotCols) # Number of rows needed, calculated from # of cols 111 | 112 | # Set up the page 113 | grid::grid.newpage() 114 | grid::pushViewport(grid::viewport(layout = grid::grid.layout(plotRows, plotCols))) 115 | vplayout <- function(x, y) 116 | grid::viewport(layout.pos.row = x, layout.pos.col = y) 117 | 118 | # Make each plot, in the correct location 119 | for (i in 1:numPlots) { 120 | curRow = ceiling(i/plotCols) 121 | curCol = (i-1) %% plotCols + 1 122 | print(plots[[i]], vp = vplayout(curRow, curCol)) 123 | } 124 | 125 | } 126 | motif_plot <- function(motif, pval, rank){ 127 | ggplot()+ 128 | geom_logo(motif, method = "probability")+ 129 | annotate("text", x=ncol(motif)-0.5, y=1.5, label=paste0("p = ",pval),size = 5)+ 130 | ggtitle(rank)+ 131 | theme(plot.title = element_text(hjust = 0, size = 6))+ 132 | theme_logo() 133 | } 134 | QC.motif.filelist = dir(".",pattern = "motif[1,2,3].motif",recursive = TRUE) 135 | QC.motif.list <- NULL 136 | QC.motif.pvalue <- NULL 137 | motif.peakfiles <- unique(unlist(lapply(strsplit(QC.motif.filelist,"_homer/homerResults"), function(x){x[1]}))) 138 | for( peakfile.name in motif.peakfiles ){ 139 | group.motif.list <- NULL 140 | for (file in grep(paste0(peakfile.name, "_homer/homerResults"), QC.motif.filelist, value = T) ){ 141 | motif_matrix <- read.delim(file,header = F,sep = "\t", check.names=F) 142 | motif_pvalue <- strsplit(motif_matrix[1,6], split = ":")[[1]][4] 143 | motif_matrix <- motif_matrix[-1,c(-5,-6)] 144 | colnames(motif_matrix) <- c("A","C","G","T") 145 | rownames(motif_matrix) <- c(1:nrow(motif_matrix)) 146 | motif_matrix <- as.matrix(t(motif_matrix)) 147 | motif_name <- strsplit(strsplit(file,split = c("_homer/homerResults/"))[[1]][2],split = "[.]motif")[[1]][1] 148 | group.motif.list[[motif_name]] <- motif_plot(t(apply(motif_matrix, 1, function(x)as.numeric(x))), motif_pvalue, paste0(peakfile.name,"_",motif_name)) 149 | } 150 | QC.motif.list[[peakfile.name]] <- group.motif.list 151 | ggplot2.multiplot(plotlist = QC.motif.list[[peakfile.name]] ,cols = 1) 152 | } 153 | dev.off() 154 | save(distribute.barplot.count,distribute.barplot,sample.plots.list,merged.plot,QC.motif.list,file = output.Rdata) 155 | -------------------------------------------------------------------------------- /bin/MeTDiff_diffm6A.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript MeTDiff_diffm6A.R 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | ### compare_str: Compairision design (eg: A_vs_B) 5 | library(MeTDiff) 6 | args <- commandArgs(T) 7 | designfile <- args[1] 8 | compare_str <- args[2] 9 | 10 | .help.digamma <- function(xx,alpha){ 11 | Tm <- dim(xx) 12 | TT <- Tm[1] 13 | m <- Tm[2] 14 | 15 | res <- matrix(0,TT,1) 16 | for (ii in 1:m){ 17 | res <- res + digamma(xx[,ii] + alpha) 18 | } 19 | return(res) 20 | } 21 | 22 | .help.trigamma <- function(xx,alpha){ 23 | Tm <- dim(xx) 24 | if (is.null(Tm)){ 25 | TT <- length(xx) 26 | m <- 1 27 | }else{ 28 | TT <- Tm[1] 29 | m <- Tm[2] 30 | } 31 | res <- matrix(0,TT,1) 32 | for (ii in 1:m){ 33 | res <- res + trigamma(xx[,ii] + alpha) 34 | } 35 | return(res) 36 | } 37 | 38 | .help.postprob <- function(dxx,dyy,dnn,xx,yy,alpha,beta){ 39 | N <- length(alpha) 40 | Tm <- dim(xx) 41 | if (is.null(Tm)){ 42 | TT <- length(xx) 43 | m <- 
1 44 | }else{ 45 | TT <- Tm[1] 46 | m <- Tm[2] 47 | } 48 | res <- matrix(0,TT,N) 49 | 50 | for (ii in 1:m){ 51 | dnx <- as.matrix(dxx[,ii]) 52 | dny <- as.matrix(dyy[,ii]) 53 | dn <- as.matrix(dnn[,ii]) 54 | x <- as.matrix(xx[,ii]) 55 | y <- as.matrix(yy[,ii]) 56 | res <- res + (dn-dnx-dny) %*% matrix(1,1,N) + lgamma(x %*% matrix(1,1,N) + matrix(1,TT) %*% alpha) - 57 | lgamma(matrix(1,TT) %*% (alpha+beta) + (x+y) %*% matrix(1,1,N)) + 58 | lgamma(y %*% matrix(1,1,N) + matrix(1,TT) %*% beta) + lgamma(matrix(1,TT) %*% (alpha+beta)) - 59 | lgamma(matrix(1,TT) %*% alpha) - lgamma(matrix(1,TT) %*% beta) 60 | } 61 | res <- exp(res) 62 | } 63 | 64 | .help.factorial <- function(count){ 65 | #compute the log(count!) 66 | cm = max(count) 67 | if (is.null(ncol(count))){ 68 | D <- 1 69 | }else{ 70 | D <- ncol(count) 71 | } 72 | if(cm > 50000){ 73 | dnorm <- as.matrix(lgamma(data.matrix(count+1))) 74 | } 75 | else{ 76 | tmp <- cumsum(rbind(0,log(as.matrix(1:max(count))))) 77 | dnorm <- matrix(tmp[data.matrix(count+1)],ncol=D) 78 | } 79 | } 80 | 81 | .betabinomial.lh <- function(x,y,Nit=40,Npara=1e-9){ 82 | # x <- as.matrix(x[peak,]) 83 | # y <- as.matrix(y[peak,]) 84 | N <- 2 # number of states 85 | J <- matrix(0,N,1) 86 | H <- matrix(0,N,N) 87 | T <- nrow(x) 88 | IP_mean <- rowMeans(x) 89 | INPUT_mean <- rowMeans(y) 90 | nip = ncol(x) 91 | nin = ncol(y) 92 | # if the dimension for x and y does not match 93 | if (nip > nin) { 94 | avg_input <- round(matrix(rep(INPUT_mean,nip-nin),ncol=nip-nin)) 95 | y <- cbind(y,avg_input) 96 | } 97 | else if (nip < nin){ 98 | avg_ip <- matrix(rep(IP_mean,nin-nip),ncol=nin-nip) 99 | x <- cbind(x,avg_ip) 100 | } 101 | n <- x + y 102 | m <- ncol(x) 103 | rr <- x/n 104 | 105 | # use another method to initialize 106 | p1_e <- exp(sum( log(rr) )/(T*m)) 107 | p2_e <- exp(sum( log(1-rr)/(T*m) )) 108 | alpha <- 1/2 *(1-p2_e)/(1-p1_e-p2_e ) # to avoid 0 109 | beta <- 1/2 *(1-p1_e)/(1-p1_e-p2_e ) 110 | c = rbind(alpha,beta) 111 | # add break condition to avoid alpha is na 112 | if ( !any(is.finite(beta)) | !is.finite(alpha) | any(beta <= 0) | any(alpha<= 0) ){ 113 | return(list(logl=rnorm(1)*10000,alpha=c(1,1),beta=c(1,1))) 114 | } 115 | for (nit in 1:Nit){ 116 | J[1] <- T*digamma(sum(c))*m - sum( .help.digamma(as.matrix(n),sum(c)) ) + sum( .help.digamma(as.matrix(x),c[1]) ) - T*digamma(c[1])*m 117 | J[2] <- T*digamma(sum(c))*m - sum( .help.digamma(as.matrix(n),sum(c)) ) + sum( .help.digamma(as.matrix(y),c[2]) ) - T*digamma(c[2])*m 118 | H[1,1] <- T*trigamma(sum(c))*m - sum(.help.trigamma(as.matrix(n),sum(c))) + sum(.help.trigamma(as.matrix(x),c[1])) - T*trigamma(c[1])*m 119 | H[2,2] <- T*trigamma(sum(c))*m - sum(.help.trigamma(as.matrix(n),sum(c))) + sum(.help.trigamma(as.matrix(y),c[2])) - T*trigamma(c[2])*m 120 | H[1,2] <- T*trigamma(sum(c))*m - sum(.help.trigamma(as.matrix(n),sum(c))) 121 | H[2,1] <- H[1,2] 122 | eigvalue <- eigen(H)$values 123 | 124 | if ( (any(beta < Npara)) | (any(alpha < Npara)) 125 | | abs(eigvalue[1]/eigvalue[2]) > 1e12 | abs(eigvalue[1]/eigvalue[2]) < 1e-12 126 | | any(eigvalue==0) ){ break } 127 | 128 | # tmp_step <- -solve(H,tol=1e-20) %*% J 129 | tmp_step <- -solve(H, J) # using newton smoothing 130 | tmp <- c + tmp_step 131 | while(any(tmp <= 0)){ 132 | # warning(sprintf("Could not update the Newton step ...\n")) 133 | tmp_step <- tmp_step / 20 134 | tmp <- c + tmp_step 135 | } 136 | c <- tmp 137 | 138 | } 139 | # caculate the likelihood 140 | alpha <- c[1] 141 | beta <- c[2] 142 | dnx <- .help.factorial(x) 143 | dny <- .help.factorial(y) 144 | dn 
<- .help.factorial(n) 145 | prob <- .help.postprob(dnx,dny,dn,x,y,alpha,beta) 146 | return(list(logl=sum(log(prob)),alpha=alpha,beta=beta)) 147 | 148 | } 149 | 150 | # merge and compare two conditions 151 | diff.call.module <- function(meth1,unmeth1,meth2,unmeth2){ 152 | #x = untreated IP, y = untreated input, xx = treated IP, yy = treated input 153 | no_peak=length(meth1[,1]) #PEAK$loci2peak_merged[,1]) 154 | pvalues <- rep(1,no_peak) 155 | log.fc <- rep(0,no_peak) 156 | for (ipeak in 1:no_peak) { 157 | if (ipeak%%1000 == 0){print(ipeak)} 158 | x = t(as.array(meth1[ipeak,])) 159 | y = t(as.matrix(unmeth1[ipeak,])) 160 | xx = t(as.matrix(meth2[ipeak,])) 161 | yy = t(as.matrix(unmeth2[ipeak,])) 162 | xxx = cbind(x,xx) 163 | yyy = cbind(y,yy) 164 | #BBtest 165 | logl1 <- .betabinomial.lh(x,y+1) 166 | logl2 <- .betabinomial.lh(xx,yy+1) 167 | logl3 <- .betabinomial.lh(xxx,yyy+1) 168 | tst <- (logl1$logl+logl2$logl-logl3$logl)*2 169 | pvalues[ipeak] <- 1 - pchisq(tst,2) 170 | log.fc[ipeak] <- log2( (sum(xx)+1)/(1+sum(yy)) * (1+sum(y))/(1+sum(x)) ) 171 | 172 | } 173 | p <- pvalues 174 | fdr <- p.adjust(pvalues,method='fdr') 175 | 176 | DIFF <- list(fdr=fdr,pvalues=p,fc=log.fc) 177 | # result 178 | result =list() 179 | result$DIFF = DIFF 180 | return(result) 181 | 182 | } 183 | 184 | designtable <- read.csv(designfile, head = TRUE, stringsAsFactors=FALSE, colClasses = c("character"), check.names=F) 185 | design.matrix <- as.data.frame(designtable$Group) 186 | rownames(design.matrix) <- designtable$Sample_ID 187 | colnames(design.matrix) <- "Condition" 188 | 189 | # Get the information of groups from compare_str 190 | if(length(unique(design.matrix$Condition)) < 2){ 191 | stop( "The count of Group is less than two, please check your designfile.") 192 | }else if( compare_str == "two_group" ){ 193 | # Get the information without compare_str beacause of only two groups 194 | group_id_1 <- unique(design.matrix$Condition)[1] 195 | group_id_2 <- unique(design.matrix$Condition)[2] 196 | }else{ 197 | # Running MeTDiff quantification with compare_str 198 | group_id_1 <- strsplit(as.character(compare_str), "_vs_")[[1]][1] 199 | group_id_2 <- strsplit(as.character(compare_str), "_vs_")[[1]][2] 200 | } 201 | design.matrix <- subset(design.matrix, Condition == group_id_1 | Condition == group_id_2 ) 202 | design.matrix$Condition <- factor(design.matrix$Condition,labels = c("control","treatment")) 203 | filelist = list.files(path = ".",pattern = ".count",full.names = T) 204 | ## Generate the matrix of peaks count 205 | rpkm_peaks_list <- NULL 206 | for(sample_id in row.names(design.matrix)){ 207 | input_count_file <- grep(paste0("[.]",sample_id,"[.]input"),filelist,value = TRUE) 208 | input_count_table <- read.table(file = input_count_file, sep = "\t", row.names = NULL,header = T) 209 | 210 | ip_count_file <- grep(paste0("[.]",sample_id,"[.]ip"),filelist,value = TRUE) 211 | ip_count_table <- read.table(file = ip_count_file, sep = "\t", row.names = NULL, header = T) 212 | rpkm <- cbind(input_count_table[,5],ip_count_table[,5]) 213 | colnames(rpkm) <- c(paste0(sample_id,".input"),paste0(sample_id,".ip")) 214 | rpkm_peaks_list <- cbind(rpkm_peaks_list,rpkm) 215 | } 216 | rownames(rpkm_peaks_list) <- ip_count_table$PeakName 217 | 218 | ## generate design matrix 219 | design.matrix$m6A <- "input" 220 | design.matrix$sample_id <- paste0(rownames(design.matrix),".input") 221 | design.matrix_ip <- design.matrix 222 | design.matrix_ip$m6A <- "IP" 223 | design.matrix_ip$sample_id <- 
paste0(rownames(design.matrix_ip),".ip") 224 | design.matrix <- rbind(design.matrix,design.matrix_ip) 225 | rownames(design.matrix) <- design.matrix$sample_id 226 | design.matrix$m6A <- factor(design.matrix$m6A) 227 | design.matrix <- design.matrix[colnames(rpkm_peaks_list),] 228 | 229 | cnts <- rpkm_peaks_list 230 | meta <- design.matrix 231 | run.metdiff <- function(cnts,meta){ 232 | meth1 <- cnts[,which(meta$Condition == 'treatment' & meta$m6A == "IP")] 233 | meth2 <- cnts[,which(meta$Condition != 'treatment' & meta$m6A == "IP")] 234 | unmeth1 <- cnts[,which(meta$Condition == 'treatment' & meta$m6A == "input")] 235 | unmeth2 <- cnts[,which(meta$Condition != 'treatment' & meta$m6A == "input")] 236 | metdiff.result <- diff.call.module(meth1,unmeth1,meth2,unmeth2) 237 | results <- data.frame(log2FC= metdiff.result$DIFF$fc, pvalue = metdiff.result$DIFF$pvalues, padj = p.adjust(metdiff.result$DIFF$pvalues,"BH")) 238 | rownames(results) <- rownames(cnts) 239 | return(results) 240 | } 241 | results <- run.metdiff(rpkm_peaks_list,design.matrix) 242 | write.table(results,file = paste0("MeTDiff_diffm6A_",group_id_1, "_",group_id_2,".txt") ,sep = "\t",quote = F) 243 | -------------------------------------------------------------------------------- /bin/m6A_motif.meme: -------------------------------------------------------------------------------- 1 | MEME version 5.0.2 2 | 3 | ALPHABET= ACGT 4 | 5 | strands: + - 6 | 7 | Background letter frequencies (from unknown source): 8 | A 0.250 C 0.250 G 0.250 T 0.250 9 | 10 | MOTIF 1.1 AAACA 11 | 12 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 13 | 1.000000 0.000000 0.000000 0.000000 14 | 1.000000 0.000000 0.000000 0.000000 15 | 1.000000 0.000000 0.000000 0.000000 16 | 0.000000 1.000000 0.000000 0.000000 17 | 1.000000 0.000000 0.000000 0.000000 18 | 19 | 20 | MOTIF 1.2 AAACC 21 | 22 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 23 | 1.000000 0.000000 0.000000 0.000000 24 | 1.000000 0.000000 0.000000 0.000000 25 | 1.000000 0.000000 0.000000 0.000000 26 | 0.000000 1.000000 0.000000 0.000000 27 | 0.000000 1.000000 0.000000 0.000000 28 | 29 | 30 | MOTIF 1.3 AAACH 31 | 32 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 33 | 1.000000 0.000000 0.000000 0.000000 34 | 1.000000 0.000000 0.000000 0.000000 35 | 1.000000 0.000000 0.000000 0.000000 36 | 0.000000 1.000000 0.000000 0.000000 37 | 0.333333 0.333333 0.000000 0.333333 38 | 39 | 40 | MOTIF 1.4 AAACT 41 | 42 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 43 | 1.000000 0.000000 0.000000 0.000000 44 | 1.000000 0.000000 0.000000 0.000000 45 | 1.000000 0.000000 0.000000 0.000000 46 | 0.000000 1.000000 0.000000 0.000000 47 | 0.000000 0.000000 0.000000 1.000000 48 | 49 | 50 | MOTIF 1.5 AGACA 51 | 52 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 53 | 1.000000 0.000000 0.000000 0.000000 54 | 0.000000 0.000000 1.000000 0.000000 55 | 1.000000 0.000000 0.000000 0.000000 56 | 0.000000 1.000000 0.000000 0.000000 57 | 1.000000 0.000000 0.000000 0.000000 58 | 59 | 60 | MOTIF 1.6 AGACC 61 | 62 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 63 | 1.000000 0.000000 0.000000 0.000000 64 | 0.000000 0.000000 1.000000 0.000000 65 | 1.000000 0.000000 0.000000 0.000000 66 | 0.000000 1.000000 0.000000 0.000000 67 | 0.000000 1.000000 0.000000 0.000000 68 | 69 | 70 | MOTIF 1.7 AGACH 71 | 72 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 73 | 1.000000 0.000000 0.000000 0.000000 74 | 0.000000 
0.000000 1.000000 0.000000 75 | 1.000000 0.000000 0.000000 0.000000 76 | 0.000000 1.000000 0.000000 0.000000 77 | 0.333333 0.333333 0.000000 0.333333 78 | 79 | 80 | MOTIF 1.8 AGACT 81 | 82 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 83 | 1.000000 0.000000 0.000000 0.000000 84 | 0.000000 0.000000 1.000000 0.000000 85 | 1.000000 0.000000 0.000000 0.000000 86 | 0.000000 1.000000 0.000000 0.000000 87 | 0.000000 0.000000 0.000000 1.000000 88 | 89 | 90 | MOTIF 1.9 ARACA 91 | 92 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 93 | 1.000000 0.000000 0.000000 0.000000 94 | 0.500000 0.000000 0.500000 0.000000 95 | 1.000000 0.000000 0.000000 0.000000 96 | 0.000000 1.000000 0.000000 0.000000 97 | 1.000000 0.000000 0.000000 0.000000 98 | 99 | 100 | MOTIF 1.10 ARACC 101 | 102 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 103 | 1.000000 0.000000 0.000000 0.000000 104 | 0.500000 0.000000 0.500000 0.000000 105 | 1.000000 0.000000 0.000000 0.000000 106 | 0.000000 1.000000 0.000000 0.000000 107 | 0.000000 1.000000 0.000000 0.000000 108 | 109 | 110 | MOTIF 1.11 ARACH 111 | 112 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 113 | 1.000000 0.000000 0.000000 0.000000 114 | 0.500000 0.000000 0.500000 0.000000 115 | 1.000000 0.000000 0.000000 0.000000 116 | 0.000000 1.000000 0.000000 0.000000 117 | 0.333333 0.333333 0.000000 0.333333 118 | 119 | 120 | MOTIF 1.12 ARACT 121 | 122 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 123 | 1.000000 0.000000 0.000000 0.000000 124 | 0.500000 0.000000 0.500000 0.000000 125 | 1.000000 0.000000 0.000000 0.000000 126 | 0.000000 1.000000 0.000000 0.000000 127 | 0.000000 0.000000 0.000000 1.000000 128 | 129 | 130 | MOTIF 1.13 GAACA 131 | 132 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 133 | 0.000000 0.000000 1.000000 0.000000 134 | 1.000000 0.000000 0.000000 0.000000 135 | 1.000000 0.000000 0.000000 0.000000 136 | 0.000000 1.000000 0.000000 0.000000 137 | 1.000000 0.000000 0.000000 0.000000 138 | 139 | 140 | MOTIF 1.14 GAACC 141 | 142 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 143 | 0.000000 0.000000 1.000000 0.000000 144 | 1.000000 0.000000 0.000000 0.000000 145 | 1.000000 0.000000 0.000000 0.000000 146 | 0.000000 1.000000 0.000000 0.000000 147 | 0.000000 1.000000 0.000000 0.000000 148 | 149 | 150 | MOTIF 1.15 GAACH 151 | 152 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 153 | 0.000000 0.000000 1.000000 0.000000 154 | 1.000000 0.000000 0.000000 0.000000 155 | 1.000000 0.000000 0.000000 0.000000 156 | 0.000000 1.000000 0.000000 0.000000 157 | 0.333333 0.333333 0.000000 0.333333 158 | 159 | 160 | MOTIF 1.16 GAACT 161 | 162 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 163 | 0.000000 0.000000 1.000000 0.000000 164 | 1.000000 0.000000 0.000000 0.000000 165 | 1.000000 0.000000 0.000000 0.000000 166 | 0.000000 1.000000 0.000000 0.000000 167 | 0.000000 0.000000 0.000000 1.000000 168 | 169 | 170 | MOTIF 1.17 GGACA 171 | 172 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 173 | 0.000000 0.000000 1.000000 0.000000 174 | 0.000000 0.000000 1.000000 0.000000 175 | 1.000000 0.000000 0.000000 0.000000 176 | 0.000000 1.000000 0.000000 0.000000 177 | 1.000000 0.000000 0.000000 0.000000 178 | 179 | 180 | MOTIF 1.18 GGACC 181 | 182 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 183 | 0.000000 0.000000 1.000000 0.000000 184 | 0.000000 0.000000 1.000000 0.000000 185 | 
1.000000 0.000000 0.000000 0.000000 186 | 0.000000 1.000000 0.000000 0.000000 187 | 0.000000 1.000000 0.000000 0.000000 188 | 189 | 190 | MOTIF 1.19 GGACH 191 | 192 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 193 | 0.000000 0.000000 1.000000 0.000000 194 | 0.000000 0.000000 1.000000 0.000000 195 | 1.000000 0.000000 0.000000 0.000000 196 | 0.000000 1.000000 0.000000 0.000000 197 | 0.333333 0.333333 0.000000 0.333333 198 | 199 | 200 | MOTIF 1.20 GGACT 201 | 202 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 203 | 0.000000 0.000000 1.000000 0.000000 204 | 0.000000 0.000000 1.000000 0.000000 205 | 1.000000 0.000000 0.000000 0.000000 206 | 0.000000 1.000000 0.000000 0.000000 207 | 0.000000 0.000000 0.000000 1.000000 208 | 209 | 210 | MOTIF 1.21 GRACA 211 | 212 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 213 | 0.000000 0.000000 1.000000 0.000000 214 | 0.500000 0.000000 0.500000 0.000000 215 | 1.000000 0.000000 0.000000 0.000000 216 | 0.000000 1.000000 0.000000 0.000000 217 | 1.000000 0.000000 0.000000 0.000000 218 | 219 | 220 | MOTIF 1.22 GRACC 221 | 222 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 223 | 0.000000 0.000000 1.000000 0.000000 224 | 0.500000 0.000000 0.500000 0.000000 225 | 1.000000 0.000000 0.000000 0.000000 226 | 0.000000 1.000000 0.000000 0.000000 227 | 0.000000 1.000000 0.000000 0.000000 228 | 229 | 230 | MOTIF 1.23 GRACH 231 | 232 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 233 | 0.000000 0.000000 1.000000 0.000000 234 | 0.500000 0.000000 0.500000 0.000000 235 | 1.000000 0.000000 0.000000 0.000000 236 | 0.000000 1.000000 0.000000 0.000000 237 | 0.333333 0.333333 0.000000 0.333333 238 | 239 | 240 | MOTIF 1.24 GRACT 241 | 242 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 243 | 0.000000 0.000000 1.000000 0.000000 244 | 0.500000 0.000000 0.500000 0.000000 245 | 1.000000 0.000000 0.000000 0.000000 246 | 0.000000 1.000000 0.000000 0.000000 247 | 0.000000 0.000000 0.000000 1.000000 248 | 249 | 250 | MOTIF 1.25 RAACA 251 | 252 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 253 | 0.500000 0.000000 0.500000 0.000000 254 | 1.000000 0.000000 0.000000 0.000000 255 | 1.000000 0.000000 0.000000 0.000000 256 | 0.000000 1.000000 0.000000 0.000000 257 | 1.000000 0.000000 0.000000 0.000000 258 | 259 | 260 | MOTIF 1.26 RAACC 261 | 262 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 263 | 0.500000 0.000000 0.500000 0.000000 264 | 1.000000 0.000000 0.000000 0.000000 265 | 1.000000 0.000000 0.000000 0.000000 266 | 0.000000 1.000000 0.000000 0.000000 267 | 0.000000 1.000000 0.000000 0.000000 268 | 269 | 270 | MOTIF 1.27 RAACH 271 | 272 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 273 | 0.500000 0.000000 0.500000 0.000000 274 | 1.000000 0.000000 0.000000 0.000000 275 | 1.000000 0.000000 0.000000 0.000000 276 | 0.000000 1.000000 0.000000 0.000000 277 | 0.333333 0.333333 0.000000 0.333333 278 | 279 | 280 | MOTIF 1.28 RAACT 281 | 282 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 283 | 0.500000 0.000000 0.500000 0.000000 284 | 1.000000 0.000000 0.000000 0.000000 285 | 1.000000 0.000000 0.000000 0.000000 286 | 0.000000 1.000000 0.000000 0.000000 287 | 0.000000 0.000000 0.000000 1.000000 288 | 289 | 290 | MOTIF 1.29 RGACA 291 | 292 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 293 | 0.500000 0.000000 0.500000 0.000000 294 | 0.000000 0.000000 1.000000 0.000000 295 | 1.000000 
0.000000 0.000000 0.000000
296 | 0.000000 1.000000 0.000000 0.000000
297 | 1.000000 0.000000 0.000000 0.000000
298 | 
299 | 
300 | MOTIF 1.30 RGACC
301 | 
302 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000
303 | 0.500000 0.000000 0.500000 0.000000
304 | 0.000000 0.000000 1.000000 0.000000
305 | 1.000000 0.000000 0.000000 0.000000
306 | 0.000000 1.000000 0.000000 0.000000
307 | 0.000000 1.000000 0.000000 0.000000
308 | 
309 | 
310 | MOTIF 1.31 RGACH
311 | 
312 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000
313 | 0.500000 0.000000 0.500000 0.000000
314 | 0.000000 0.000000 1.000000 0.000000
315 | 1.000000 0.000000 0.000000 0.000000
316 | 0.000000 1.000000 0.000000 0.000000
317 | 0.333333 0.333333 0.000000 0.333333
318 | 
319 | 
320 | MOTIF 1.32 RGACT
321 | 
322 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000
323 | 0.500000 0.000000 0.500000 0.000000
324 | 0.000000 0.000000 1.000000 0.000000
325 | 1.000000 0.000000 0.000000 0.000000
326 | 0.000000 1.000000 0.000000 0.000000
327 | 0.000000 0.000000 0.000000 1.000000
328 | 
329 | 
330 | MOTIF 1.33 RRACA
331 | 
332 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000
333 | 0.500000 0.000000 0.500000 0.000000
334 | 0.500000 0.000000 0.500000 0.000000
335 | 1.000000 0.000000 0.000000 0.000000
336 | 0.000000 1.000000 0.000000 0.000000
337 | 1.000000 0.000000 0.000000 0.000000
338 | 
339 | 
340 | MOTIF 1.34 RRACC
341 | 
342 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000
343 | 0.500000 0.000000 0.500000 0.000000
344 | 0.500000 0.000000 0.500000 0.000000
345 | 1.000000 0.000000 0.000000 0.000000
346 | 0.000000 1.000000 0.000000 0.000000
347 | 0.000000 1.000000 0.000000 0.000000
348 | 
349 | 
350 | MOTIF 1.35 RRACH
351 | 
352 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000
353 | 0.500000 0.000000 0.500000 0.000000
354 | 0.500000 0.000000 0.500000 0.000000
355 | 1.000000 0.000000 0.000000 0.000000
356 | 0.000000 1.000000 0.000000 0.000000
357 | 0.333333 0.333333 0.000000 0.333333
358 | 
359 | 
360 | MOTIF 1.36 RRACT
361 | 
362 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000
363 | 0.500000 0.000000 0.500000 0.000000
364 | 0.500000 0.000000 0.500000 0.000000
365 | 1.000000 0.000000 0.000000 0.000000
366 | 0.000000 1.000000 0.000000 0.000000
367 | 0.000000 0.000000 0.000000 1.000000
368 | 
--------------------------------------------------------------------------------
/bin/DiffReport.R:
--------------------------------------------------------------------------------
1 | #!/bin/Rscript
2 | ## Rscript DiffReport.R <m6APipe_RData> <output_RData>
3 | ### m6APipe.data: arranged results file (*.m6APipe) produced by arranged_results.R
4 | ### compare_str: Comparison design (eg: A_vs_B)
5 | args <- commandArgs(T)
6 | #args <- c("macs2_MeTDiff_DESeq2_arranged_results_2019-12-11.m6APipe", "DiffReport.RData")
7 | m6APipe.data <- args[1] # the arranged-results RData file (*.m6APipe)
8 | output.Rdata <- args[2] # name of the output RData file
9 | 
10 | library(pheatmap)
11 | library(ggplot2)
12 | library(ggrepel)
13 | library(grid)
14 | library(reshape2)
15 | load(m6APipe.data)
16 | 
17 | draw_colnames_90 <- function (coln, ...) {
18 | m = length(coln)
19 | x = (1:m)/m - 1/2/m
20 | grid.text(coln, x = x, y = unit(0.96, "npc"), vjust = .5,
21 | hjust = 1, rot = 90, gp = gpar(...)) ## note: the defaults are 'hjust=0' and 'rot=270'
22 | }
23 | assignInNamespace(x="draw_colnames", value="draw_colnames_90",
24 | ns=asNamespace("pheatmap"))
25 | 
26 | heatmap_dm <- function(mat,coldt){
27 | pheatmap(mat, cluster_rows=FALSE, show_rownames=F, cluster_cols=FALSE, annotation_col=coldt,
28 | main = "Heatmap of Differential Methylation", scale = "row")
29 | }
30 | 
31 | heatmap_de <- function(mat, coldt){
32 | pheatmap(mat, cluster_rows=FALSE, show_rownames=F, cluster_cols=FALSE, annotation_col=coldt,
33 | color = colorRampPalette(c(rep('#1C2B6F',1),'black', rep('#E31E26',1)))(50),
34 | main = "Heatmap of Differential Expression", scale = "row")
35 | }
36 | 
37 | ECDF_plot <- function(df,value_var,group,plot_title="",test_result=""){
38 | if (test_result !=""){
39 | if (test_result$p.value <=0){
40 | test_anno = paste(test_result$method,"\n",names(test_result$statistic)," = ",signif(test_result$statistic, 3),
41 | "\nP Value < 2.2e-16",sep = "")
42 | } else {
43 | test_anno = paste(test_result$method,"\n",names(test_result$statistic)," = ",signif(test_result$statistic, 3),
44 | "\nP Value= ",signif(test_result$p.value,3),sep = "")
45 | }
46 | }else{
47 | test_anno = ""
48 | }
49 | p <- ggplot(df,aes(x=value_var,group=group,color=group))+theme_test()+
50 | stat_ecdf(size = 1)+theme(legend.position=c(0.85,.15))+
51 | annotate("text",x=-Inf,y=Inf,vjust=1.5,hjust=-.12,label=test_anno)+
52 | scale_y_continuous(expand = c(0,0))+scale_x_continuous(expand = c(0,0),limits = c(0,1.02))+
53 | labs(title= plot_title, y="Cumulative fraction" , x = "Peak intensity")+
54 | theme(plot.title = element_text(size = 15, angle = 0, face = "plain", colour = "black"),
55 | axis.title.x = element_text(size = 15, angle = 0, face = "plain", colour = "black"),
56 | axis.title.y = element_text(size = 15, angle = 90, face = "plain", colour = "black"),
57 | axis.text.x = element_text(size = 15, angle = 0, face = "plain", colour = "black"),
58 | axis.text.y = element_text(size = 15, angle = 0, face = "plain", colour = "black"))
59 | return(p)
60 | }
61 | 
62 | volcano_plot_dm = function(res, Sample_1 = "A", Sample_2 = "B", lfc = 0.58, pval = 0.05, groupname = ""){
63 | par(mar = c(5, 6, 5, 5))
64 | tab = data.frame(logFC = res$log2FC, negLogPval = -log10(res$pvalue))
65 | tab$gene_name = rownames(res)
66 | tab = na.omit(tab)
67 | tab$threshold <- "C"
68 | tab$threshold[tab$logFC >= lfc & tab$negLogPval > -log10(pval)] <- "B"
69 | tab$threshold[tab$logFC <=-lfc & tab$negLogPval > -log10(pval)] <- "A"
70 | #tab<-tab%>%mutate(threshold = ifelse(logFC >= lfc & negLogPval > -log10(pval) ,"B", ifelse(logFC<=-lfc & negLogPval > -log10(pval), "A", "C")))
71 | n_up = length(which(tab$threshold=="B"))
72 | n_down = length(which(tab$threshold=="A"))
73 | tab_order = tab[order(tab$negLogPval, decreasing = T),]
74 | ggplot(tab_order, aes(x=logFC, y=negLogPval)) +
75 | geom_point(aes(colour = threshold)) +
76 | scale_colour_manual(values = c("A"= "#619cff", "B"="#f8766d", "C"= "#c8c8c8"),
77 | labels=c(paste("Down: ", n_down, sep=""),paste("Up: ", n_up, sep = "") , "No sig"), name = NULL) +
78 | geom_hline(aes(yintercept=-log10(pval)), linetype="dashed") +
79 | geom_vline(aes(xintercept=-lfc), linetype="dashed") +
80 | geom_vline(aes(xintercept=lfc), linetype="dashed") +
81 | ggtitle(paste("Volcano Plot of Differential Methylation in", groupname))+
82 | xlab(expression(paste(Log[2], " fold change", sep = ""))) +
83 | ylab(expression(paste(-Log[10], " P value", sep = ""))) +
84 | theme_bw() +
85 | theme(legend.position = 'top',
86 | plot.title = element_text(hjust = 0.5))
87 | }
88 | 
89 | quadrant_plot <- function(quadrant.data, lfc = 0.58 , pval = 0.05, groupname = ""){
90 | quadrant.data$threshold <- "nosig"
91 | quadrant.data$threshold[quadrant.data$m6A >= lfc & quadrant.data$exp >= lfc &
92 | quadrant.data$m6A.p <= pval & quadrant.data$exp.p <= pval ] <- "Hyper-up"
93 | quadrant.data$threshold[quadrant.data$m6A >= lfc & quadrant.data$exp <= -lfc &
94 | quadrant.data$m6A.p <= pval & quadrant.data$exp.p <= pval ] <- "Hyper-down"
95 | quadrant.data$threshold[quadrant.data$m6A <= -lfc & quadrant.data$exp >= lfc &
96 | quadrant.data$m6A.p <= pval & quadrant.data$exp.p <= pval ] <- "Hypo-up"
97 | quadrant.data$threshold[quadrant.data$m6A <= -lfc & quadrant.data$exp <= -lfc &
98 | quadrant.data$m6A.p <= pval & quadrant.data$exp.p <= pval ] <- "Hypo-down"
99 | quadrant.data.length <- table(quadrant.data$threshold)
100 | quadrant.data <- na.omit(quadrant.data)
101 | quadrant.data$threshold <- factor(quadrant.data$threshold,levels = c("Hyper-up","Hyper-down","Hypo-up","Hypo-down","nosig"))
102 | ggplot()+
103 | geom_point(data = quadrant.data,
104 | aes_string(x= "exp" ,y="m6A", color="threshold"),size = 1)+
105 | geom_hline(yintercept = lfc,linetype="dashed")+
106 | geom_hline(yintercept = -lfc,linetype="dashed")+
107 | geom_vline(xintercept = lfc,linetype="dashed")+ylab("m6A log2 fold change")+
108 | geom_vline(xintercept = -lfc,linetype="dashed")+
109 | scale_x_continuous(limits = c(-5,5))+
110 | scale_y_continuous(limits = c(-5,5))+
111 | ggtitle(paste("Quadrant Plot between Methylation and Expression in", groupname))+
112 | scale_colour_manual(values = c("Hyper-up" = "#7DB9DE", "Hyper-down" = "#D75455",
113 | "Hypo-up" = "#7BA23F", "Hypo-down" = "#A35E47",
114 | "nosig"= "#c8c8c8" ),
115 | labels = c(paste("Hyper-up: ", quadrant.data.length["Hyper-up"], sep=""),
116 | paste("Hyper-down: ", quadrant.data.length["Hyper-down"], sep = ""),
117 | paste("Hypo-up: ", quadrant.data.length["Hypo-up"], sep=""),
118 | paste("Hypo-down: ", quadrant.data.length["Hypo-down"], sep = ""),
119 | "No sig"), name = NULL) +
120 | theme_classic() +
121 | theme(plot.title = element_text(hjust = 0.5))+
122 | geom_point(data = quadrant.data[quadrant.data$threshold!="nosig",],
123 | aes_string(x= "exp" ,y="m6A", color="threshold"),size = 1.5)
124 | 
125 | }
126 | matrixcluster <- function(matrixData, cluster_rows = TRUE, cluster_cols = TRUE, cmethod = "complete"){
127 | if(cluster_rows == TRUE){
128 | ht <- hclust(dist(matrixData), method = cmethod) # cluster the rows
129 | rowInd <- ht$order # store the clustered row order as rowInd
130 | }else{
131 | rowInd <- 1:nrow(matrixData)
132 | }
133 | 
134 | if(cluster_cols == TRUE){
135 | ht <- hclust(dist(t(matrixData)), method = cmethod) # transpose the matrix to cluster the original columns
136 | colInd <- ht$order # store the clustered column order as colInd
137 | }else{
138 | colInd <- 1:ncol(matrixData)
139 | }
140 | 
141 | matrixDataNew <-matrixData[rowInd,colInd] # reorder rows and columns according to the clustering result
142 | print(c("ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid"))
143 | return(matrixDataNew)
144 | }
145 | ## dm & de & ecdf
146 | heatmap_dm.list <- NULL
147 | heatmap_de.list <- NULL
148 | volcano_dm.list <- NULL
149 | ecdf.list <- NULL
150 | quadrant.list <- NULL
151 | ecdf.data <- NULL
152 | ecdf.group.data <- melt(m6a.anno.matrix,ID="PeakRegion")
153 | for( group in as.character(compare.list) ){
154 | group1 = strsplit(group, "_vs_")[[1]][1]
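## e.g. (hypothetical group names) a compare string "KO_vs_WT" splits into group1 = "KO" and group2 = "WT"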
155 | group2 = strsplit(group, "_vs_")[[1]][2] 156 | coldata = subset(as.data.frame(design.matrix), Type==group1|Type==group2) 157 | coldata$Type = as.factor(coldata$Type) 158 | ## dm 159 | dmres = diffm6A.list[[which(names(diffm6A.list)==group)]] 160 | dmg = subset(dmres, abs(log2FC) > 0.58 & pvalue < 0.05) 161 | if (nrow(dmg)<=1) dmg = dmres 162 | matrix.dm = m6a.anno.matrix[,-c(1:3)] 163 | dm_mat = matrix.dm[dmg$PeakRegion,rownames(coldata)] 164 | select <- dmg[order(dmg$log2FC, decreasing = TRUE),] 165 | dm_mat = log2(dm_mat+1) 166 | dm_mat = dm_mat[select$PeakRegion,] 167 | dm_mat = na.omit(dm_mat) 168 | dm_mat_new <- matrixcluster(dm_mat,cmethod = "single") 169 | heatmap_dm.list[[group]] <- heatmap_dm(dm_mat_new,coldata) 170 | ## de 171 | deres = diffexpression.list[[which(names(diffexpression.list)==group)]] 172 | deg = subset(deres, abs(log2FoldChange)> 0.58 & pvalue < 0.05) 173 | if (nrow(deg)<=1) deg = deres 174 | rownames(deg) = deg$ID 175 | de_mat = expression.matrix[row.names(deg),rownames(coldata)] 176 | select <- deg[order(deg$log2FoldChange, decreasing = TRUE), ] 177 | de_mat = log2(de_mat+1) 178 | de_mat = de_mat[rownames(select),] 179 | de_mat = na.omit(de_mat) 180 | de_mat_new <- matrixcluster(de_mat,cmethod = "single") 181 | heatmap_de.list[[group]] <- heatmap_de(de_mat_new,coldata) 182 | 183 | ## ecdf 184 | ecdf.group.data.tmp = subset(ecdf.group.data,variable %in% rownames(coldata)) 185 | ecdf.group.data.tmp$group <- group1 186 | ecdf.group.data.tmp$group[ecdf.group.data.tmp$variable %in% rownames(coldata)[coldata$Type == group2]] <- group2 187 | #ecdf.group.data.tmp <- ecdf.group.data.tmp%>%mutate(group = ifelse(variable %in% rownames(coldata)[coldata$Type == group1], group1, group2)) 188 | ecdf.group.data.tmp <- na.omit(ecdf.group.data.tmp) 189 | ecdf.list[[group]] <- ECDF_plot(ecdf.group.data.tmp,ecdf.group.data.tmp$value,ecdf.group.data.tmp$group) 190 | ecdf.data <- rbind(ecdf.data,data.frame(data = diffm6A.list[[group]]$log2FC , group = group)) 191 | ## volcano plot 192 | volcano_dm.list[[group]] <- volcano_plot_dm(diffm6A.list[[group]],groupname = group) 193 | ## quadrant plot 194 | diffm6a.results <- diffm6A.list[[group]] 195 | rownames(diffm6a.results) <- diffm6a.results$PeakRegion 196 | diffexp.results <- diffexpression.list[[group]] 197 | rownames(diffexp.results) <- diffexp.results$ID 198 | quadrant.data <- data.frame(row.names = rownames(diffm6a.results), 199 | m6A = diffm6a.results$log2FC, 200 | m6A.p = diffm6a.results$pvalue, 201 | exp = diffexp.results[diffm6a.results$ID,"log2FoldChange"], 202 | exp.p = diffexp.results[diffm6a.results$ID,"pvalue"]) 203 | quadrant.list[[group]] <-quadrant_plot(quadrant.data,lfc = 0.58,pval = 0.05,groupname = group) 204 | 205 | ## plot 206 | pdf(file = paste0("heatmap_dm_",group,".pdf"),paper = "USr") 207 | print(heatmap_dm.list[[group]]) 208 | dev.off() 209 | pdf(file = paste0("heatmap_de_",group,".pdf"),paper = "USr") 210 | print(heatmap_de.list[[group]]) 211 | dev.off() 212 | pdf(file = paste0("volcano_dm_",group,".pdf"),paper = "USr") 213 | print(volcano_dm.list[[group]]) 214 | dev.off() 215 | pdf(file = paste0("ecdf_",group,".pdf"),paper = "USr") 216 | print(ecdf.list[[group]]) 217 | dev.off() 218 | pdf(file = paste0("quadrant_",group,".pdf"),paper = "USr") 219 | print(quadrant.list[[group]]) 220 | dev.off() 221 | } 222 | ecdf.data <- na.omit(ecdf.data) 223 | ecdf.data <- ecdf.data[is.finite(ecdf.data$data),] 224 | ecdf.list[["combined"]] <- ECDF_plot(ecdf.data,ecdf.data$data,ecdf.data$group) 225 | pdf(file = 
paste0("ecdf_","combined.pdf"),paper = "USr")
226 | print(ecdf.list[["combined"]])
227 | dev.off()
228 | save(design.matrix,compare.list,heatmap_dm.list,heatmap_de.list,volcano_dm.list,ecdf.list,quadrant.list,file = output.Rdata)
229 | 
--------------------------------------------------------------------------------
/bin/m6A_annotate_forGTF_xingyang2.pl:
--------------------------------------------------------------------------------
1 | #! /usr/bin/perl -w
2 | #perl m6A_annotate_forGTF.pl /data1/database/hg38/GENCODE/gencode.v25.annotation.gtf macs2/merged_Peak.bed macs2/merged_Peak
3 | 
4 | use strict;
5 | use warnings;
6 | use FindBin qw($Bin);
7 | 
8 | if (@ARGV < 3) {
9 | print "
10 | usage: perl m6A_annotate_forGTF.pl <ref_gene_gtf> <peak_bed> <outPrefix>
11 | the program will create multiple files: .center .anno.txt .unanno.txt ...
12 | \n";
13 | exit;
14 | }
15 | 
16 | 
17 | my ($ref_gene_gtf, $peak_bed, $outPrefix) = @ARGV;
18 | 
19 | #make a gene type list
20 | my %GeneType; #see "1) GENES" in ftp://ftp.sanger.ac.uk/pub/gencode/_README_stats.txt and https://www.gencodegenes.org/stats/current.html
21 | foreach (qw (protein_coding)) {
22 | $GeneType{"$_"} = "mRNA"; }
23 | foreach (qw (3prime_overlapping_ncRNA antisense bidirectional_promoter_lncRNA known_ncrna lincRNA macro_lncRNA non_coding nonsense_mediated_decay non_stop_decay processed_transcript retained_intron sense_intronic sense_overlapping)) {
24 | $GeneType{"$_"} = "Long non-coding RNA"; }
25 | foreach (qw (miRNA misc_RNA Mt_rRNA Mt_tRNA ribozyme rRNA scaRNA scRNA snoRNA snRNA sRNA vaultRNA)) {
26 | $GeneType{"$_"} = "Others"; }
27 | foreach (qw (pseudogene polymorphic_pseudogene processed_pseudogene transcribed_processed_pseudogene transcribed_unitary_pseudogene transcribed_unprocessed_pseudogene unitary_pseudogene unprocessed_pseudogene)) {
28 | $GeneType{"$_"} = "Pseudogene"; }
29 | foreach (qw (IG_C_gene IG_D_gene IG_J_gene IG_V_gene IG_pseudogene IG_C_pseudogene IG_J_pseudogene IG_V_pseudogene TR_C_gene TR_D_gene TR_J_gene TR_V_gene TR_J_pseudogene TR_V_pseudogene processed_transcript TEC)) {
30 | $GeneType{"$_"} = "Others"; }
31 | 
32 | `awk '{print \$1"\t"int((\$2+\$3)/2)"\t"int((\$2+\$3)/2)+1"\t"\$1":"\$2"-"\$3}' $peak_bed > $outPrefix.peak_bed.center`;
33 | 
34 | #read all genes of the reference and save each transcript's information (exon & CDS).
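# Sketch of the layout built below (the gene/transcript IDs in this example are hypothetical):
#   $RefAllGene{$gene_id}{$transcript_id} holds chr/start/end/strand, gene_name,
#   transcript_type and the mapped Gene_Type, while its exon/CDS entries collect
#   [start, end, length] triples, e.g.
#   push @{$RefAllGene{"ENSG00000000001"}{"ENST00000000001"}{exon}}, [100, 200, 101];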
35 | my (%RefAllGene, %RefPickTran, %RefPickTran_final);
36 | open IN, $ref_gene_gtf or die;
37 | open RBED,">$outPrefix.tmp.refSeq.bed" or die;
38 | while (<IN>) {
39 | chomp;
40 | next if (/^\s*$|^\#/);
41 | my @w = split (/\t/);
42 | next if (@w < 9);
43 | my ($chr, $ftype, $start, $end, $strand, $features) = @w[0,2,3,4,6,8];
44 | next if ($ftype !~ /^exon$|^CDS$|^transcript$/);
45 | my ($gene_id, $transcript_id) = ("") x 2;
46 | $gene_id = $1 if ($features =~ /\bgene_id\s+\"([^\"]+)\";/);
47 | $transcript_id = $1 if ($features =~ /\btranscript_id\s+\"([^\"]+)\";/);
48 | if ($ftype eq "transcript") {
49 | my ($gene_name, $transcript_type) = ("") x 2;
50 | $gene_name = $1 if ($features =~ /\bgene_name\s+\"([^\"]+)\";/);
51 | $transcript_type = $1 if ($features =~ /\btranscript_type\s+\"([^\"]+)\";/);
52 | $RefAllGene{$gene_id}{$transcript_id}{chr} = $chr;
53 | $RefAllGene{$gene_id}{$transcript_id}{start} = $start;
54 | $RefAllGene{$gene_id}{$transcript_id}{end} = $end;
55 | $RefAllGene{$gene_id}{$transcript_id}{strand} = $strand;
56 | $RefAllGene{$gene_id}{$transcript_id}{gene_id} = $gene_id;
57 | $RefAllGene{$gene_id}{$transcript_id}{gene_name} = $gene_name;
58 | $RefAllGene{$gene_id}{$transcript_id}{transcript_type} = $transcript_type;
59 | $RefAllGene{$gene_id}{$transcript_id}{Gene_Type} = (exists $GeneType{$transcript_type}) ? $GeneType{$transcript_type} : "Unknown";
60 | print RBED "$RefAllGene{$gene_id}{$transcript_id}{chr}\t$RefAllGene{$gene_id}{$transcript_id}{start}\t$RefAllGene{$gene_id}{$transcript_id}{end}\t$transcript_id\t0\t$RefAllGene{$gene_id}{$transcript_id}{strand}\t$gene_id\n";
61 | }
62 | else { #$ftype =~ /^exon$|^CDS$/
63 | push @{$RefAllGene{$gene_id}{$transcript_id}{$ftype}}, [$start, $end, $end-$start+1];
64 | }
65 | }
66 | close RBED;
67 | close IN;
68 | 
69 | `intersectBed -a $outPrefix.peak_bed.center -b $outPrefix.tmp.refSeq.bed -wa -wb > $outPrefix.refSeq.all.bed`;
70 | 
71 | #delete exon if CDS exists. delete the shorter transcript if there are multiple transcripts in a gene.
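# In the selection below, every transcript overlapping a peak is scored by the
# summed length of its CDS records (falling back to exons for noncoding
# transcripts); CDS-bearing transcripts take priority over exon-only ones, and
# within the same feature type the transcript with the largest summed length wins.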
72 | open FH,"$outPrefix.refSeq.all.bed" or die;
73 | while (<FH>) {
74 | chomp;
75 | my @fields = split "\t";
76 | my ($peakid,$overlap_chr,$overlap_start,$overlap_end,$overlap_transid,$overlap_strand,$overlap_gene_id,$ppos) = @fields[3,4,5,6,7,9,10,1];
77 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{chr} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{chr};
78 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{start} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{start};
79 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{end} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{end};
80 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{strand} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{strand};
81 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{gene_id} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{gene_id};
82 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{gene_name} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{gene_name};
83 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{transcript_type} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{transcript_type};
84 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{Gene_Type} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{Gene_Type};
85 | %{$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}} = %{$RefAllGene{$overlap_gene_id}{$overlap_transid}};
86 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{chr} = $overlap_chr;
87 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{center_start} = $ppos;
88 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{center_end} = $ppos+1;
89 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{chr_trans} = $overlap_chr;
90 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{trans_start} = $overlap_start;
91 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{trans_end} = $overlap_end;
92 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{fuck_peakid} = $peakid;
93 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{zero} = 0;
94 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{strand} = $overlap_strand;
95 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{ppos} = $ppos;
96 | }
97 | print "\n";
98 | foreach my $peakid (keys %RefPickTran) {
99 | my ($cur_array, $cur_total_len, $cur_ftype);
100 | my ($longest_transcript_id, $longest_sum_len, $ftype, $longest_gene_id) = ("", 0, "","");
101 | foreach my $gene_id_temp (keys %{$RefPickTran{$peakid}}){
102 | foreach my $transcript_id_temp (keys %{$RefPickTran{$peakid}{$gene_id_temp}}){ $cur_total_len = 0; # reset per transcript, otherwise lengths accumulate across transcripts
103 | if (exists $RefPickTran{$peakid}{$gene_id_temp}{$transcript_id_temp}{CDS}) {
104 | $cur_array = $RefPickTran{$peakid}{$gene_id_temp}{$transcript_id_temp}{CDS};
105 | $cur_ftype = "CDS";
106 | } else {
107 | $cur_array = $RefPickTran{$peakid}{$gene_id_temp}{$transcript_id_temp}{exon};
108 | $cur_ftype = "exon";
109 | }
110 | foreach (@{$cur_array}) {
111 | $cur_total_len += $_->[2];
112 | }
113 | if ($cur_ftype eq $ftype and $cur_total_len > $longest_sum_len) {
114 | $longest_transcript_id = $transcript_id_temp;
115 | $longest_gene_id = $gene_id_temp;
116 | $longest_sum_len = $cur_total_len;
117 | $ftype = $cur_ftype;
118 | } else {
119 | if($cur_ftype ne $ftype and $cur_ftype eq "CDS"){
120 | $longest_transcript_id = $transcript_id_temp;
121 | $longest_gene_id = $gene_id_temp;
122 | $longest_sum_len = $cur_total_len;
123 | $ftype = $cur_ftype;
124 | } else {
125 | 
if($cur_ftype eq "exon" and $ftype eq ""){ 126 | $longest_transcript_id = $transcript_id_temp; 127 | $longest_gene_id = $gene_id_temp; 128 | $longest_sum_len = $cur_total_len; 129 | $ftype = $cur_ftype; 130 | } 131 | } 132 | } 133 | } 134 | } 135 | $RefPickTran_final{$peakid}{$longest_transcript_id} = $RefPickTran{$peakid}{$longest_gene_id}{$longest_transcript_id}; 136 | $RefPickTran_final{$peakid}{$longest_transcript_id}{gene_type} = $ftype; 137 | @{$RefPickTran_final{$peakid}{$longest_transcript_id}{exon}} = sort {$a->[0] <=> $b->[0]} @{$RefPickTran_final{$peakid}{$longest_transcript_id}{exon}}; 138 | if ($ftype eq "CDS") { 139 | @{$RefPickTran_final{$peakid}{$longest_transcript_id}{CDS}} = sort {$a->[0] <=> $b->[0]} @{$RefPickTran_final{$peakid}{$longest_transcript_id}{CDS}}; 140 | my ($total_tran_len, $total_cds_len, $utr5_len, $utr3_len) = (0, 0, 0, 0); 141 | my ($cds_start, $cds_end) = ($RefPickTran_final{$peakid}{$longest_transcript_id}{CDS}->[0][0], $RefPickTran_final{$peakid}{$longest_transcript_id}{CDS}->[-1][1]); 142 | foreach (@{$RefPickTran_final{$peakid}{$longest_transcript_id}{CDS}}) { 143 | $total_cds_len += $_->[2]; 144 | } 145 | foreach (@{$RefPickTran_final{$peakid}{$longest_transcript_id}{exon}}) { 146 | $total_tran_len += $_->[2]; 147 | my ($cur_start, $cur_end, $cur_len) = @{$_}; 148 | if ($cds_start > $cur_start) { 149 | if ($cds_start <= $cur_end) { $utr5_len += $cds_start - $cur_start; } 150 | else { $utr5_len += $cur_len; } 151 | } 152 | if ($cds_end < $cur_end) { 153 | if ($cds_end >= $cur_start) { $utr3_len += $cur_end - $cds_end; } 154 | else { $utr3_len += $cur_len; } 155 | } 156 | } 157 | $RefPickTran_final{$peakid}{$longest_transcript_id}{total_tran_len} = $total_tran_len; 158 | $RefPickTran_final{$peakid}{$longest_transcript_id}{total_cds_len} = $total_cds_len; 159 | $RefPickTran_final{$peakid}{$longest_transcript_id}{utr5_len} = $utr5_len; 160 | $RefPickTran_final{$peakid}{$longest_transcript_id}{utr3_len} = $utr3_len; 161 | } 162 | else { #$ftype eq "exon", that is not coding RNA 163 | my $total_tran_len; 164 | foreach (@{$RefPickTran_final{$peakid}{$longest_transcript_id}{exon}}) { $total_tran_len += $_->[2]; } 165 | $RefPickTran_final{$peakid}{$longest_transcript_id}{total_tran_len} = $total_tran_len; 166 | $RefPickTran_final{$peakid}{$longest_transcript_id}{total_cds_len} = 0; 167 | $RefPickTran_final{$peakid}{$longest_transcript_id}{utr5_len} = 0; 168 | $RefPickTran_final{$peakid}{$longest_transcript_id}{utr3_len} = 0; 169 | } 170 | } 171 | 172 | sub sort_2dArray { 173 | my $in_arr = $_[0]; 174 | my @out_arr = sort {$a->[0] <=> $b->[0]} @{$in_arr}; 175 | return \@out_arr; 176 | } 177 | 178 | 179 | #annotate 180 | my %p2t; 181 | foreach my $peak_id (keys %RefPickTran_final){ 182 | my $tran_id = (keys %{$RefPickTran_final{$peak_id}})[0]; 183 | my $ppos = $RefPickTran_final{$peak_id}{$tran_id}{ppos}; 184 | if(exists $p2t{$peak_id}){ 185 | if($p2t{$peak_id}{cdslen}==0){ 186 | if($RefPickTran_final{$peak_id}{$tran_id}{total_cds_len}<=$p2t{$peak_id}{cdslen}){ 187 | next; 188 | }else{ 189 | if($RefPickTran_final{$peak_id}{$tran_id}{total_tran_len}<=$p2t{$peak_id}{tlen}){ 190 | next; 191 | } 192 | } 193 | }else{ 194 | if($RefPickTran_final{$peak_id}{$tran_id}{total_cds_len}<=$p2t{$peak_id}{cdslen}){ 195 | next; 196 | } 197 | } 198 | } 199 | $p2t{$peak_id}{cdslen}= $RefPickTran_final{$peak_id}{$tran_id}{total_cds_len}; #$cdslen{$tran_id}; 200 | $p2t{$peak_id}{tlen}=$RefPickTran_final{$peak_id}{$tran_id}{total_tran_len}; 201 | 
$p2t{$peak_id}{gene}=$RefPickTran_final{$peak_id}{$tran_id}{gene_name}; 202 | $p2t{$peak_id}{ts}=$tran_id; 203 | $p2t{$peak_id}{ppos}=$ppos; 204 | $p2t{$peak_id}{intersect}= $RefPickTran_final{$peak_id}{$tran_id}{chr}."\t".$RefPickTran_final{$peak_id}{$tran_id}{center_start}."\t".$RefPickTran_final{$peak_id}{$tran_id}{center_end}."\t".$peak_id."\t".$RefPickTran_final{$peak_id}{$tran_id}{chr_trans}."\t".$RefPickTran_final{$peak_id}{$tran_id}{trans_start}."\t".$RefPickTran_final{$peak_id}{$tran_id}{trans_end}."\t".$tran_id."\t".$RefPickTran_final{$peak_id}{$tran_id}{zero}."\t".$RefPickTran_final{$peak_id}{$tran_id}{strand}; 205 | } 206 | 207 | open OUT, ">$outPrefix.anno.txt" || die; 208 | foreach my $peak_id (keys %p2t){ 209 | my $ppos=$p2t{$peak_id}{ppos}; 210 | my $tran_id=$p2t{$peak_id}{ts}; 211 | my %Tran = %{$RefPickTran_final{$peak_id}{$tran_id}}; 212 | my ($bin, $exon_sum_len, $segtype) = (0, 0, ""); 213 | my $cstatus = ($Tran{gene_type} eq "CDS") ? "coding" : "noncoding"; 214 | my @cur_array= @{$Tran{exon}}; 215 | my ($cds_start, $cds_end) = ($Tran{CDS}->[0][0], $Tran{CDS}->[-1][1]); 216 | for (my $i=0; $i<=$#cur_array; $i++) { 217 | my ($exon_start, $exon_end, $exon_len) = @{$cur_array[$i]}; 218 | $exon_sum_len += $exon_len; 219 | if ($ppos >= $exon_start && $ppos <= $exon_end) { 220 | if ($cstatus eq "noncoding") { 221 | # $bin = int (($ppos - $exon_start) / $exon_len * 100); # $bin is the percentage of ppos in each exon. 222 | $bin = int (($exon_sum_len - ($exon_end - $ppos)) / $Tran{total_tran_len} * 100); # $bin is the percentage of ppos in each transcript. 223 | $segtype = "exon"; 224 | } 225 | else { #$cstatus eq "coding" 226 | if ($ppos < $cds_start) { 227 | if ($Tran{utr5_len} == 0) {print join ("\t", $peak_id, $cds_start, $cds_end, $ppos, "\n");} 228 | $bin = int (($exon_sum_len - ($exon_end - $ppos)) / $Tran{utr5_len} * 100); 229 | $segtype = ($Tran{strand} eq "+") ? "5UTR" : "3UTR"; 230 | }elsif ($ppos > $cds_end) { 231 | $bin = int (($Tran{total_tran_len} - $exon_sum_len + ($exon_end - $ppos)) / $Tran{utr3_len} * 100); 232 | $bin = 100 - $bin; 233 | $segtype = ($Tran{strand} eq "+") ? "3UTR" : "5UTR"; 234 | }else { 235 | $bin = int (($exon_sum_len - ($exon_end - $ppos) - $Tran{utr5_len}) / $Tran{total_cds_len} * 100); 236 | $segtype="CDS"; 237 | } 238 | } 239 | last; 240 | } 241 | else { 242 | if ($i < $#cur_array) {# isn't the last one 243 | my $next_exon_start = $cur_array[$i+1]->[0]; 244 | if ($ppos > $exon_end && $ppos < $next_exon_start) { 245 | $bin = int (($ppos - $exon_end) / ($next_exon_start - $exon_end) * 100); 246 | $segtype = "intron"; 247 | last; 248 | } 249 | } 250 | } 251 | } #end: for (my $i=0; $i<=$#cur_array; $i++) 252 | if ($segtype) { #can find peak in the transcript 253 | $bin = 100 - $bin if ($Tran{strand} eq "-"); 254 | print OUT $p2t{$peak_id}{intersect}, "\t", join ("\t", $Tran{gene_name}, $cstatus, $segtype, $bin, $Tran{gene_id}, $Tran{transcript_type}, $Tran{Gene_Type}), "\n"; 255 | } 256 | } 257 | close OUT; 258 | 259 | print `perl $Bin/intersec.pl -a $outPrefix.peak_bed.center -na 4 -b $outPrefix.anno.txt -nb 4 -t ua > $outPrefix.unanno.txt`; 260 | 261 | __END__ 262 | --------------------------------------------------------------------------------