├── .gitattributes ├── .gitignore ├── assets ├── nf-core-meripseqpipe_logo.png ├── multiqc_config.yaml ├── email_template.txt ├── sendmail_template.txt └── email_template.html ├── conf ├── C2.config ├── docker.config ├── test_bam.config ├── test.config ├── test_mixed.config └── base.config ├── .github ├── markdownlint.yml ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── PULL_REQUEST_TEMPLATE.md ├── workflows │ ├── ci_aligners.yml │ ├── ci_methy_methods.yml │ ├── linting.yml │ ├── ci_peakcalling_tools.yml │ ├── branch.yml │ ├── ci_peakcalling_methods.yml │ ├── awstest.yml │ └── awsfulltest.yml └── CONTRIBUTING.md ├── CHANGELOG.md ├── bin ├── geneBody_coverage2.sh ├── DiffReport.rmd ├── MeTPeak.R ├── merge_peaks_by_rank.R ├── markdown_to_html.r ├── cufflinks.sh ├── MATK_quantification.sh ├── normalize_peaks.py ├── QC_Peaks_Report.rmd ├── generate_featurecount_mat.R ├── QNB_quantification.R ├── scrape_software_versions.py ├── m6Aprediction.sh ├── get_htseq_matrix.R ├── m6Am.R ├── intersec.pl ├── edgeR.R ├── bedtools_quantification.R ├── DESeq2.R ├── MATK_diffm6A.sh ├── create_IGV_js.sh ├── bedtools_diffm6A.R ├── QNB_diffm6A.R ├── bed_count.sh ├── DESeq2_quantification.R ├── markdown_to_html.py ├── meyer.py ├── merge_peaks_by_bedtools.sh ├── arranged_results.R ├── merge_peaks_by_mspc.sh ├── merge_peaks_by_rank.sh ├── GLM_DESeq2_DM.R ├── GLM_edgeR_DM.R ├── QC_Peaks_Report.R ├── MeTDiff_diffm6A.R ├── m6A_motif.meme ├── DiffReport.R └── m6A_annotate_forGTF_xingyang2.pl ├── docs ├── README.md ├── troubleshooting.md ├── configuration │ ├── reference_genomes.md │ └── adding_your_own.md └── output.md ├── environment.yml ├── LICENSE ├── Dockerfile ├── CODE_OF_CONDUCT.md ├── README.md ├── lib └── LikeletUtils.groovy └── nextflow.config /.gitattributes: -------------------------------------------------------------------------------- 1 | *.config linguist-language=nextflow 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .nextflow* 2 | work/ 3 | data/ 4 | results/ 5 | .DS_Store 6 | tests/ 7 | testing/ 8 | *.pyc 9 | -------------------------------------------------------------------------------- /assets/nf-core-meripseqpipe_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lazyky/meripseqpipe/HEAD/assets/nf-core-meripseqpipe_logo.png -------------------------------------------------------------------------------- /conf/C2.config: -------------------------------------------------------------------------------- 1 | // config file for run analysis in new cluster 2 2 | 3 | 4 | process.executor = 'pbs' 5 | 6 | params { 7 | // Defaults only, expecting to be overwritten 8 | max_memory = 128.GB 9 | max_cpus = 38 10 | max_time = 240.h 11 | } -------------------------------------------------------------------------------- /.github/markdownlint.yml: -------------------------------------------------------------------------------- 1 | # Markdownlint configuration file 2 | default: true, 3 | line-length: false 4 | no-multiple-blanks: 0 5 | blanks-around-headers: false 6 | blanks-around-lists: false 7 | header-increment: false 8 | no-duplicate-header: 9 | siblings_only: true 10 | -------------------------------------------------------------------------------- /conf/docker.config: -------------------------------------------------------------------------------- 1 | /* 2 | * 
------------------------------------------------- 3 | * nf-core/m6APipe Nextflow docker soft config file 4 | * ------------------------------------------------- 5 | */ 6 | 7 | process { 8 | container = 'kingzhuky/meripseqpipe:dev' 9 | } 10 | params { 11 | // Defaults only, expecting to be overwritten 12 | matk_jar = '/MATK-1.0.jar' 13 | } 14 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # nf-core/meripseqpipe: Changelog 2 | 3 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) 4 | and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 5 | 6 | ## v1.0dev - [date] 7 | 8 | Initial release of nf-core/meripseqpipe, created with the [nf-core](http://nf-co.re/) template. 9 | 10 | ### `Added` 11 | 12 | ### `Fixed` 13 | 14 | ### `Dependencies` 15 | 16 | ### `Deprecated` 17 | -------------------------------------------------------------------------------- /assets/multiqc_config.yaml: -------------------------------------------------------------------------------- 1 | report_comment: > 2 | This report has been generated by the nf-core/meripseqpipe 3 | analysis pipeline. For information about how to interpret these results, please see the 4 | documentation. 5 | report_section_order: 6 | software_versions: 7 | order: -1000 8 | nf-core-meripseqpipe-summary: 9 | order: -1001 10 | 11 | export_plots: true 12 | -------------------------------------------------------------------------------- /bin/geneBody_coverage2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #bash geneBody_coverage2.sh 3 | #$1 argv 1 : gtf file 4 | #$2 argv 2 : THREAD_NUM 5 | bed12_file=$1 6 | THREAD_NUM=$2 7 | ## Define a multi-threaded run channel 8 | mkfifo tmp 9 | exec 9<>tmp 10 | for ((i=1;i<=${THREAD_NUM:=1};i++)) 11 | do 12 | echo >&9 13 | done 14 | 15 | for bigwig_file in *.bigwig 16 | do 17 | read -u 9 18 | { 19 | geneBody_coverage2.py -i $bigwig_file -o ${bigwig_file%.bigwig*}.rseqc.txt -r ${bed12_file} 20 | echo >&9 21 | }& 22 | done 23 | wait 24 | echo "Calculate coverage of data is finish" -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # nf-core/meripseqpipe: Documentation 2 | 3 | The nf-core/meripseqpipe documentation is split into the following files: 4 | 5 | 1. [Installation](https://nf-co.re/usage/installation) 6 | 2. Pipeline configuration 7 | * [Local installation](https://nf-co.re/usage/local_installation) 8 | * [Adding your own system config](https://nf-co.re/usage/adding_own_config) 9 | * [Reference genomes](https://nf-co.re/usage/reference_genomes) 10 | 3. [Running the pipeline](usage.md) 11 | 4. [Output and how to interpret the results](output.md) 12 | 5. 
[Troubleshooting](https://nf-co.re/usage/troubleshooting) 13 | -------------------------------------------------------------------------------- /bin/DiffReport.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "DiffReport" 3 | author: "MeRIPseqPipe" 4 | output: html_document 5 | --- 6 | 7 | ```{r setup, include=FALSE} 8 | library(grid) 9 | library(pheatmap) 10 | ``` 11 | design.matrix,compare.list,heatmap_dm.list,heatmap_de.list,volcano_dm.list,ecdf.list,quadrant.list 12 | ## Heatmap 13 | Heatmap of differential expression analysis 14 | ```{r ,echo=FALSE} 15 | for (group in compare.list) { 16 | print(group) 17 | grid.newpage() 18 | print(heatmap_dm.list[[group]]) 19 | grid.newpage() 20 | print(heatmap_de.list[[group]]) 21 | print(volcano_dm.list[[group]]) 22 | print(quadrant.list[[group]]) 23 | print(ecdf.list[[group]]) 24 | } 25 | ``` 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | Hi there! 2 | 3 | Thanks for suggesting a new feature for the pipeline! Please delete this text and anything that's not relevant from the template below: 4 | 5 | #### Is your feature request related to a problem? Please describe. 6 | A clear and concise description of what the problem is. 7 | Ex. I'm always frustrated when [...] 8 | 9 | #### Describe the solution you'd like 10 | A clear and concise description of what you want to happen. 11 | 12 | #### Describe alternatives you've considered 13 | A clear and concise description of any alternative solutions or features you've considered. 14 | 15 | #### Additional context 16 | Add any other context about the feature request here. 17 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Many thanks to contributing to nf-core/meripseqpipe! 2 | 3 | Please fill in the appropriate checklist below (delete whatever is not relevant). These are the most common things requested on pull requests (PRs). 4 | 5 | ## PR checklist 6 | - [ ] This comment contains a description of changes (with reason) 7 | - [ ] If you've fixed a bug or added code that should be tested, add tests! 8 | - [ ] If necessary, also make a PR on the [nf-core/meripseqpipe branch on the nf-core/test-datasets repo]( https://github.com/nf-core/test-datasets/pull/new/nf-core/meripseqpipe) 9 | - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). 10 | - [ ] Make sure your code lints (`nf-core lint .`). 11 | - [ ] Documentation in `docs` is updated 12 | - [ ] `CHANGELOG.md` is updated 13 | - [ ] `README.md` is updated 14 | 15 | **Learn more about contributing:** https://github.com/nf-core/meripseqpipe/tree/master/.github/CONTRIBUTING.md 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | Hi there! 2 | 3 | Thanks for telling us about a problem with the pipeline. Please delete this text and anything that's not relevant from the template below: 4 | 5 | #### Describe the bug 6 | A clear and concise description of what the bug is. 7 | 8 | #### Steps to reproduce 9 | Steps to reproduce the behaviour: 10 | 1. Command line: `nextflow run ...` 11 | 2. 
See error: _Please provide your error message_ 12 | 13 | #### Expected behaviour 14 | A clear and concise description of what you expected to happen. 15 | 16 | #### System: 17 | - Hardware: [e.g. HPC, Desktop, Cloud...] 18 | - Executor: [e.g. slurm, local, awsbatch...] 19 | - OS: [e.g. CentOS Linux, macOS, Linux Mint...] 20 | - Version [e.g. 7, 10.13.6, 18.3...] 21 | 22 | #### Nextflow Installation: 23 | - Version: [e.g. 0.31.0] 24 | 25 | #### Container engine: 26 | - Engine: [e.g. Conda, Docker or Singularity] 27 | - version: [e.g. 1.0.0] 28 | - Image tag: [e.g. nfcore/meripseqpipe:1.0.0] 29 | 30 | #### Additional context 31 | Add any other context about the problem here. 32 | -------------------------------------------------------------------------------- /bin/MeTPeak.R: -------------------------------------------------------------------------------- 1 | ## Rscript MeTPeak.R eg. Rscript MeTPeak.R designfile.txt genes.gtf 10 2 | ### designfile: Sample_id, Input_filename, IP_filename, group.id 3 | ### flag_peakCallingbygroup: 1(group) 0(sample) 4 | library(MeTPeak) 5 | library(parallel) 6 | args <- commandArgs(T) 7 | input.bam.vec <- unlist(strsplit(args[1], split=',')) 8 | ip.bam.vec <- unlist(strsplit(args[2], split=',')) 9 | group.id <- args[3] 10 | gtf <- args[4] 11 | 12 | ##Running MeTPeak and rename the output name 13 | metpeak(GENE_ANNO_GTF = gtf, 14 | IP_BAM = ip.bam.vec, 15 | INPUT_BAM = input.bam.vec, 16 | EXPERIMENT_NAME = paste0( "metpeak_",group.id ) 17 | ) 18 | bed_name <- paste0( "metpeak_",group.id ,"/peak.xls") 19 | output_bed_name <- paste0("metpeak_group_",group.id,"_normalized.bed") #peak.bed 20 | bed12.to.bed6 <- paste0("awk 'BEGIN{OFS=\"\t\"}NR>1{print $1,$2,$3,$1\":\"$2\"-\"$3,$5,$6,$7,$8,$9,$10,$11,$12}' ", bed_name," | bed12ToBed6 -i | awk 'BEGIN{FS=\"\t\";OFS=\"\t\"}{print $1,$2,$3,$4,$5}'> ", output_bed_name) 21 | system(bed12.to.bed6) -------------------------------------------------------------------------------- /bin/merge_peaks_by_rank.R: -------------------------------------------------------------------------------- 1 | # required for merge_peaks_by_rank.sh 2 | library(RobustRankAggreg) 3 | args<-commandArgs(T) 4 | bedlist <- read.table(args[1],header = F,sep = "\t",stringsAsFactors = F, na.strings = "") 5 | len_of_bed <- as.numeric(args[2]) 6 | out_name <- as.character(args[3]) 7 | bedlist2 <- as.list(NULL) 8 | for (i in c(1:ncol(bedlist))){ 9 | if (TRUE %in% is.na(bedlist[,i])){ 10 | sub <- which(is.na(bedlist[,i])) 11 | bedlist2[[i]] <- bedlist[-sub,i] 12 | } 13 | else{ 14 | bedlist2[[i]] <-bedlist[,i] 15 | } 16 | } 17 | mergepeak <- aggregateRanks(glist = bedlist2, N = len_of_bed) 18 | sub <- which(as.numeric(mergepeak$Score)==1) 19 | mergepeak <- mergepeak[-sub,] 20 | merged.bed <- apply(mergepeak, 1 ,function(x){ 21 | peak.info <- as.vector(as.matrix(x)) 22 | peak.region = unlist(strsplit(strsplit(as.character(peak.info[1]),split = ":" )[[1]],split = "-")) 23 | x = c(peak.region,as.character(peak.info[1]),peak.info[2]) 24 | }) 25 | ## 26 | merged.bed <- t(merged.bed) 27 | write.table(merged.bed,file = out_name,sep = "\t",quote = FALSE,row.names = FALSE,col.names = FALSE) 28 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: meripseqpipe-1.0dev 2 | channels: 3 | - r 4 | - conda-forge 5 | - bioconda 6 | - anaconda 7 | - defaults 8 | dependencies: 9 | - openjdk=8.0.192 10 | - python=2.7.15 11 | - picard=2.21.6 12 | 
- fastqc=0.11.8 13 | - fastp=0.19.7 14 | - bedtools=2.27.1 15 | - ucsc-gtftogenepred=377 16 | - ucsc-fatotwobit=377 17 | - ucsc-facount=377 18 | - hisat2=2.1.0 19 | - bowtie2=2.2.5 20 | - bwa=0.7.17 21 | - star=2.6.1b 22 | - tophat=2.1.1 23 | - samtools=1.9 24 | - rseqc=2.6.4 25 | - macs2=2.1.2 26 | - meme=5.1.1 27 | - homer=4.9.1 28 | - dos2unix=7.4.1 29 | - r-base=3.5.1 30 | - bioconductor-edger=3.26.0 31 | - bioconductor-deseq2=1.22.1 32 | - igvtools=2.3.93 33 | - bioconductor-exomepeak=2.16.0 34 | - r-robustrankaggreg=1.1 35 | - perl=5.26.2 36 | - ucsc-genepredtobed=377 37 | - scipy=1.2.1 38 | - deeptools=3.1.3 39 | - ucsc-bigwigtowig=357 40 | - r-ggplot2=3.1.1 41 | - r-ggrepel=0.8.1 42 | - r-ggsci=2.9 43 | - r-pheatmap=1.0.12 44 | - r-dplyr=0.8.0.1 45 | - r-knitr=1.22 46 | - r-ggseqlogo=0.1 47 | - r-rmarkdown=1.10 48 | - subread=2.0.0 49 | - pandoc=2.7.3 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Kaiyu Zhu, Yu Sun, Xiaoqiong Bao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /conf/test_bam.config: -------------------------------------------------------------------------------- 1 | /* 2 | * ------------------------------------------------- 3 | * Nextflow config file for running tests 4 | * ------------------------------------------------- 5 | * Defines bundled input files and everything required 6 | * to run a fast and simple test. 
Use as follows: 7 | * nextflow run nf-core/meripseqpipe -profile test 8 | */ 9 | 10 | params { 11 | config_profile_name = 'Test profile' 12 | config_profile_description = 'Minimal test dataset to check pipeline function' 13 | // Limit resources so that this can run on Travis 14 | max_cpus = 2 15 | max_memory = 6.GB 16 | max_time = 48.h 17 | 18 | // Input data 19 | // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets 20 | // TODO nf-core: Give any required params for the test so that command line flags are not needed 21 | aligners = "none" 22 | comparefile = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/designfiles/comparefile.txt' 23 | designfile = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/designfiles/designfile_bam.tsv' 24 | fasta = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/reference/TEST.fa' 25 | gtf = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/reference/TEST.gtf' 26 | } -------------------------------------------------------------------------------- /conf/test.config: -------------------------------------------------------------------------------- 1 | /* 2 | * ------------------------------------------------- 3 | * Nextflow config file for running tests 4 | * ------------------------------------------------- 5 | * Defines bundled input files and everything required 6 | * to run a fast and simple test. Use as follows: 7 | * nextflow run nf-core/meripseqpipe -profile test, 8 | */ 9 | 10 | params { 11 | config_profile_name = 'Test profile' 12 | config_profile_description = 'Minimal test dataset to check pipeline function' 13 | // Limit resources so that this can run on GitHub Actions 14 | max_cpus = 2 15 | max_memory = 6.GB 16 | max_time = 48.h 17 | 18 | // Input data 19 | // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets 20 | // TODO nf-core: Give any required params for the test so that command line flags are not needed 21 | single_end = false 22 | comparefile = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/designfiles/comparefile.txt' 23 | designfile = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/designfiles/designfile_paired.tsv' 24 | fasta = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/reference/TEST.fa' 25 | gtf = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/reference/TEST.gtf' 26 | } -------------------------------------------------------------------------------- /conf/test_mixed.config: -------------------------------------------------------------------------------- 1 | /* 2 | * ------------------------------------------------- 3 | * Nextflow config file for running tests 4 | * ------------------------------------------------- 5 | * Defines bundled input files and everything required 6 | * to run a fast and simple test. 
Use as follows: 7 | * nextflow run nf-core/meripseqpipe -profile test, 8 | */ 9 | 10 | params { 11 | config_profile_name = 'Test profile' 12 | config_profile_description = 'Minimal test dataset to check pipeline function' 13 | // Limit resources so that this can run on GitHub Actions 14 | max_cpus = 2 15 | max_memory = 6.GB 16 | max_time = 48.h 17 | 18 | // Input data 19 | // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets 20 | // TODO nf-core: Give any required params for the test so that command line flags are not needed 21 | single_end = false 22 | comparefile = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/designfiles/comparefile.txt' 23 | designfile = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/designfiles/designfile_mixed.tsv' 24 | fasta = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/reference/TEST.fa' 25 | gtf = 'https://raw.githubusercontent.com/kingzhuky/test-datasets/meripseqpipe/reference/TEST.gtf' 26 | } -------------------------------------------------------------------------------- /assets/email_template.txt: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------- 2 | ,--./,-. 3 | ___ __ __ __ ___ /,-._.--~\\ 4 | |\\ | |__ __ / ` / \\ |__) |__ } { 5 | | \\| | \\__, \\__/ | \\ |___ \\`-._,-`-, 6 | `._,._,' 7 | nf-core/meripseqpipe v${version} 8 | ---------------------------------------------------- 9 | 10 | Run Name: $runName 11 | 12 | <% if (success){ 13 | out << "## nf-core/meripseqpipe execution completed successfully! ##" 14 | } else { 15 | out << """#################################################### 16 | ## nf-core/meripseqpipe execution completed unsuccessfully! ## 17 | #################################################### 18 | The exit status of the task that caused the workflow execution to fail was: $exitStatus. 
19 | The full error message was: 20 | 21 | ${errorReport} 22 | """ 23 | } %> 24 | 25 | 26 | The workflow was completed at $dateComplete (duration: $duration) 27 | 28 | The command used to launch the workflow was as follows: 29 | 30 | $commandLine 31 | 32 | 33 | 34 | Pipeline Configuration: 35 | ----------------------- 36 | <% out << summary.collect{ k,v -> " - $k: $v" }.join("\n") %> 37 | 38 | -- 39 | nf-core/meripseqpipe 40 | https://github.com/nf-core/meripseqpipe 41 | -------------------------------------------------------------------------------- /bin/markdown_to_html.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Command line argument processing 4 | args = commandArgs(trailingOnly=TRUE) 5 | if (length(args) < 2) { 6 | stop("Usage: markdown_to_html.r <input.md> <output.html>", call.=FALSE) 7 | } 8 | markdown_fn <- args[1] 9 | output_fn <- args[2] 10 | 11 | # Load / install packages 12 | if (!require("markdown")) { 13 | install.packages("markdown", dependencies=TRUE, repos='http://cloud.r-project.org/') 14 | library("markdown") 15 | } 16 | 17 | base_css_fn <- getOption("markdown.HTML.stylesheet") 18 | base_css <- readChar(base_css_fn, file.info(base_css_fn)$size) 19 | custom_css <- paste(base_css, " 20 | body { 21 | padding: 3em; 22 | margin-right: 350px; 23 | max-width: 100%; 24 | } 25 | #toc { 26 | position: fixed; 27 | right: 20px; 28 | width: 300px; 29 | padding-top: 20px; 30 | overflow: scroll; 31 | height: calc(100% - 3em - 20px); 32 | } 33 | #toc_header { 34 | font-size: 1.8em; 35 | font-weight: bold; 36 | } 37 | #toc > ul { 38 | padding-left: 0; 39 | list-style-type: none; 40 | } 41 | #toc > ul ul { padding-left: 20px; } 42 | #toc > ul > li > a { display: none; } 43 | img { max-width: 800px; } 44 | ") 45 | 46 | markdownToHTML( 47 | file = markdown_fn, 48 | output = output_fn, 49 | stylesheet = custom_css, 50 | options = c('toc', 'base64_images', 'highlight_code') 51 | ) 52 | -------------------------------------------------------------------------------- /assets/sendmail_template.txt: -------------------------------------------------------------------------------- 1 | To: $email 2 | Subject: $subject 3 | Mime-Version: 1.0 4 | Content-Type: multipart/related;boundary="nfcoremimeboundary" 5 | 6 | --nfcoremimeboundary 7 | Content-Type: text/html; charset=utf-8 8 | 9 | $email_html 10 | 11 | --nfcoremimeboundary 12 | Content-Type: image/png;name="nf-core-meripseqpipe_logo.png" 13 | Content-Transfer-Encoding: base64 14 | Content-ID: <nfcorepipelinelogo> 15 | Content-Disposition: inline; filename="nf-core-meripseqpipe_logo.png" 16 | 17 | <% out << new File("$baseDir/assets/nf-core-meripseqpipe_logo.png"). 18 | bytes. 19 | encodeBase64(). 20 | toString(). 21 | tokenize( '\n' )*. 22 | toList()*. 23 | collate( 76 )*. 24 | collect { it.join() }. 25 | flatten(). 26 | join( '\n' ) %> 27 | 28 | <% 29 | if (mqcFile){ 30 | def mqcFileObj = new File("$mqcFile") 31 | if (mqcFileObj.length() < mqcMaxSize){ 32 | out << """ 33 | --nfcoremimeboundary 34 | Content-Type: text/html; name=\"multiqc_report\" 35 | Content-Transfer-Encoding: base64 36 | Content-ID: <mqcreport> 37 | Content-Disposition: attachment; filename=\"${mqcFileObj.getName()}\" 38 | 39 | ${mqcFileObj. 40 | bytes. 41 | encodeBase64(). 42 | toString(). 43 | tokenize( '\n' )*. 44 | toList()*. 45 | collate( 76 )*. 46 | collect { it.join() }. 47 | flatten(). 
48 | join( '\n' )} 49 | """ 50 | }} 51 | %> 52 | 53 | --nfcoremimeboundary-- 54 | -------------------------------------------------------------------------------- /bin/cufflinks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #bash cufflinks.sh 3 | #$1 argv 1 : designfile 4 | #$2 argv 2 : gtf file 5 | #$3 argv 3 : THREAD_NUM 6 | designfile=$1 7 | gtf_file=$2 8 | THREAD_NUM=$3 9 | 10 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 11 | tag=$(echo $group_list | awk '{OFS=",";ORS=""}{for(x=1;x> assembly_list.txt 26 | done 27 | cuffmerge -o ./merged_gtf -g ${gtf_file} -p ${THREAD_NUM} assembly_list.txt 28 | 29 | ## Run Cuffdiff for differential expression analysis 30 | cuffdiff -o cuffdiff\ 31 | -L $tag \ 32 | -p ${THREAD_NUM} \ 33 | --time-series --multi-read-correct \ 34 | --library-type fr-unstranded \ 35 | ./merged_gtf/merged.gtf ${bam_file_array} 36 | -------------------------------------------------------------------------------- /bin/MATK_quantification.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ## MATK_quantification.sh 3 | ## $1 argv 1 : matk_jar 4 | ## $2 argv 2 : gtf file 5 | ## $3 argv 3 : designfile 6 | ## $4 argv 4 : merge_bed_file 7 | matk_jar=$1 8 | gtf_file=$2 9 | designfile=$3 10 | merge_bed_file=$4 11 | THREAD_NUM=$5 12 | 13 | #Define a multi-threaded run channel 14 | mkfifo tmp 15 | exec 9<>tmp 16 | for ((i=1;i<=${THREAD_NUM:=1};i++)) 17 | do 18 | echo >&9 19 | done 20 | 21 | sample_list=$(awk 'BEGIN{FS=","}NR>1{print $1}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 22 | for sample_id in $sample_list 23 | do 24 | #read -u 9 25 | { 26 | ip_bam_file=$(ls ${sample_id}.ip*.bam) 27 | input_bam_file=$(ls ${sample_id}.input*.bam) 28 | java -jar $matk_jar -quantification \ 29 | -ip "$ip_bam_file" \ 30 | -input "$input_bam_file" \ 31 | -bed $merge_bed_file \ 32 | -gtf $gtf_file \ 33 | -out MATK_${sample_id}_quantification.bed 34 | echo $sample_id > tmp.quantification.$sample_id 35 | awk 'BEGIN{FS="\t"}{print $5}' MATK_${sample_id}_quantification.bed >> tmp.quantification.$sample_id 36 | awk 'BEGIN{FS="\t";print ""}NR>1{print $1":"$2"-"$3}' MATK_${sample_id}_quantification.bed > tmp.MATK.quantification 37 | #echo >&9 38 | } 39 | done 40 | wait 41 | ls tmp.quantification.* |xargs -iFILE sed -i '2d' FILE 42 | ls tmp.quantification.* |xargs paste tmp.MATK.quantification > MATK_quantification.matrix 43 | echo "MATK quantification done" -------------------------------------------------------------------------------- /.github/workflows/ci_aligners.yml: -------------------------------------------------------------------------------- 1 | name: nf-core CI (aligners) 2 | # This workflow is triggered on pushes and PRs to the repository. 
3 | # It runs the pipeline with the minimal test dataset to check that it completes without any syntax errors 4 | on: [push, pull_request] 5 | 6 | jobs: 7 | test: 8 | env: 9 | NXF_VER: ${{ matrix.nxf_ver }} 10 | NXF_ANSI_LOG: false 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | # Nextflow versions: check pipeline minimum and current latest 15 | nxf_ver: ['19.04.0', ''] 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Install Nextflow 19 | run: | 20 | wget -qO- get.nextflow.io | bash 21 | sudo mv nextflow /usr/local/bin/ 22 | - name: Pull docker image 23 | run: | 24 | docker pull kingzhuky/meripseqpipe:dev 25 | docker tag kingzhuky/meripseqpipe:dev kingzhuky/meripseqpipe:dev 26 | - name: Run pipeline with test data 27 | run: | 28 | # Run the pipeline with the test profile 29 | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --skip_peakCalling --skip_expression 30 | # Run, build reference genome with HISAT2 31 | nextflow run ${GITHUB_WORKSPACE} -profile test_mixed,docker --aligners hisat2 --skip_peakCalling --skip_expression -resume 32 | # Run, build reference genome with BWA 33 | nextflow run ${GITHUB_WORKSPACE} -profile test_mixed,docker --aligners bwa --skip_peakCalling --skip_expression -resume 34 | 35 | -------------------------------------------------------------------------------- /bin/normalize_peaks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jul 22 16:44:32 2019 4 | 5 | @author: zky 6 | """ 7 | from math import log 8 | import sys 9 | import numpy as np 10 | #from scipy import stats 11 | if len(sys.argv) <= 2: 12 | print("This script needs two parameters. For example,\ 13 | python normalize_peaks.py <input_bed_file> <output_bed_file>") 14 | sys.exit() 15 | input_bed_file = sys.argv[1] 16 | output_bed_file = sys.argv[2] 17 | 18 | def MaxMinNormalization(x,Max,Min): 19 | if Max != Min : 20 | x = 1e-20 + (x - Min)*(1-1e-20) / (Max - Min) 21 | return x 22 | #def Z_ScoreNormalization(x,mu,sigma): 23 | # x = (x - mu) / sigma; 24 | # return x 25 | with open(input_bed_file) as peaks_bed: 26 | pvalue_array = [] 27 | normalized_peaks = [] 28 | max_pvalue = min_pvalue = 0 29 | for line in peaks_bed: 30 | data = line.replace('\n','').replace('\r','').split('\t') 31 | pvalue = float(data[4]) 32 | pvalue_array.append(pvalue) 33 | normalized_peaks.append(data) 34 | if pvalue_array : 35 | max_pvalue = np.max(pvalue_array) 36 | min_pvalue = np.min(pvalue_array) 37 | # mu = np.average(pvalue_array) 38 | # sigma = np.std(pvalue_array) 39 | for data in normalized_peaks: 40 | data[4] = MaxMinNormalization(float(data[4]),max_pvalue,min_pvalue) 41 | data[4] = -log(data[4],10) 42 | with open(output_bed_file,'w') as output_file: 43 | for data in normalized_peaks: 44 | output_file.write('\t'.join(str(i) for i in data)) 45 | output_file.write('\n') 46 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nfcore/base:1.9 2 | LABEL authors="Kaiyu Zhu, Yu Sun, Xiaoqiong Bao" \ 3 | description="Docker image containing all software requirements for the MeRIPseqPipe pipeline" 4 | 5 | # Install the conda environment 6 | COPY environment.yml / 7 | RUN conda env create -f /environment.yml && conda clean -a 8 | # install multiqc in a separate environment 9 | RUN conda create -n multiqc -c conda-forge -c bioconda python=3.7.8 multiqc=1.7 && conda clean -a 10 | 11 | RUN conda env export --name meripseqpipe-1.0dev > 
meripseqpipe-1.0dev.yml 12 | ENV PATH /mspc:$PATH 13 | ENV PATH /opt/conda/bin:$PATH 14 | ENV PATH /opt/conda/envs/multiqc/bin/:$PATH 15 | ENV PATH /opt/conda/envs/meripseqpipe-1.0dev/bin:$PATH 16 | 17 | 18 | # install MATK 19 | RUN wget https://github.com/kingzhuky/MATK_backup/releases/download/v0.1dev/MATK-1.0.jar 20 | 21 | # install QNB 22 | RUN wget https://cran.r-project.org/src/contrib/Archive/QNB/QNB_1.1.11.tar.gz && \ 23 | R CMD INSTALL QNB_1.1.11.tar.gz && \ 24 | rm QNB_1.1.11.tar.gz 25 | 26 | # install MeTDiff 27 | RUN git clone https://github.com/compgenomics/MeTDiff.git && \ 28 | R CMD build MeTDiff/ && \ 29 | R CMD INSTALL MeTDiff_1.0.tar.gz && \ 30 | rm -rf MeTDiff* 31 | 32 | # install MeTPeak 33 | RUN git clone https://github.com/compgenomics/MeTPeak.git && \ 34 | R CMD build MeTPeak/ && \ 35 | R CMD INSTALL MeTPeak_1.0.0.tar.gz && \ 36 | rm -rf MeTPeak* 37 | 38 | # install MSPC 39 | RUN conda install -y unzip 40 | RUN wget -O mspc.zip "https://github.com/Genometric/MSPC/releases/download/v5.4.0/linux-x64.zip" && \ 41 | unzip mspc.zip -d mspc && \ 42 | chmod 775 mspc/mspc && \ 43 | rm mspc.zip 44 | -------------------------------------------------------------------------------- /.github/workflows/ci_methy_methods.yml: -------------------------------------------------------------------------------- 1 | name: nf-core CI (methylation_analysis_mode) 2 | # This workflow is triggered on pushes and PRs to the repository. 3 | # It runs the pipeline with the minimal test dataset to check that it completes without any syntax errors 4 | on: [push, pull_request] 5 | 6 | jobs: 7 | test: 8 | env: 9 | NXF_VER: ${{ matrix.nxf_ver }} 10 | NXF_ANSI_LOG: false 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | # Nextflow versions: check pipeline minimum and current latest 15 | nxf_ver: ['19.04.0', ''] 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Install Nextflow 19 | run: | 20 | wget -qO- get.nextflow.io | bash 21 | sudo mv nextflow /usr/local/bin/ 22 | - name: Pull docker image 23 | run: | 24 | docker pull kingzhuky/meripseqpipe:dev 25 | docker tag kingzhuky/meripseqpipe:dev kingzhuky/meripseqpipe:dev 26 | - name: Run pipeline with test data 27 | run: | 28 | # Run, test PeakCalling mode 'group' and Methylation Analysis mode 'QNB' 29 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_expression --methylation_analysis_mode QNB -resume 30 | # Run, test PeakMerged mode 'rank' and Methylation Analysis mode 'MATK' 31 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_expression --methylation_analysis_mode MATK -resume 32 | # Run, test one of PeakMerged mode 'bedtools' and Methylation Analysis mode 'DESeq2' 33 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --peakMerged_mode macs2 --skip_expression --methylation_analysis_mode DESeq2 -resume 34 | -------------------------------------------------------------------------------- /bin/QC_Peaks_Report.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "QC_Peaks_Report" 3 | author: "MeRIPseqPipe" 4 | output: html_document 5 | --- 6 | 7 | ```{r setup, include=TRUE, echo=FALSE} 8 | require(grid) 9 | ``` 10 | 11 | ## Peaks Distribution 12 | Compare different distribution of different PeakCalling tools. 
13 | ```{r Distribution, echo=FALSE} 14 | print(distribute.barplot) 15 | print(distribute.barplot.count) 16 | ## Curve 17 | for( sample in names(sample.plots.list) ){ 18 | print(sample) 19 | print(sample.plots.list[[sample]]) 20 | } 21 | print("merged peaks") 22 | print(merged.plot) 23 | ``` 24 | 25 | ## Peaks' motif 26 | 27 | Compare different motifs(top three) of different groups. 28 | 29 | ```{r motif, echo=FALSE} 30 | ggplot2.multiplot <- function(..., plotlist=NULL, cols=2) { 31 | # Make a list from the ... arguments and plotlist 32 | plots <- c(list(...), plotlist) 33 | numPlots = length(plots) 34 | 35 | # Make the panel 36 | plotCols = cols # Number of columns of plots 37 | plotRows = ceiling(numPlots/plotCols) # Number of rows needed, calculated from # of cols 38 | # Set up the page 39 | grid::grid.newpage() 40 | grid::pushViewport(grid::viewport(layout = grid::grid.layout(plotRows, plotCols))) 41 | vplayout <- function(x, y) 42 | grid::viewport(layout.pos.row = x, layout.pos.col = y,name = "abc") 43 | # Make each plot, in the correct location 44 | for (i in 1:numPlots) { 45 | curRow = ceiling(i/plotCols) 46 | curCol = (i-1) %% plotCols + 1 47 | print(plots[[i]], vp = vplayout(curRow, curCol)) 48 | } 49 | } 50 | 51 | for( peakfile in names(QC.motif.list) ){ 52 | print(peakfile) 53 | ggplot2.multiplot(plotlist = QC.motif.list[[peakfile]] ,cols = 1) 54 | } 55 | ``` 56 | -------------------------------------------------------------------------------- /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: nf-core linting 2 | # This workflow is triggered on pushes and PRs to the repository. 3 | # It runs the `nf-core lint` and markdown lint tests to ensure that the code meets the nf-core guidelines 4 | on: 5 | push: 6 | pull_request: 7 | release: 8 | types: [published] 9 | 10 | jobs: 11 | Markdown: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - uses: actions/setup-node@v1 16 | with: 17 | node-version: '10' 18 | - name: Install markdownlint 19 | run: npm install -g markdownlint-cli 20 | - name: Run Markdownlint 21 | run: markdownlint ${GITHUB_WORKSPACE} -c ${GITHUB_WORKSPACE}/.github/markdownlint.yml 22 | YAML: 23 | runs-on: ubuntu-latest 24 | steps: 25 | - uses: actions/checkout@v1 26 | - uses: actions/setup-node@v1 27 | with: 28 | node-version: '10' 29 | - name: Install yaml-lint 30 | run: npm install -g yaml-lint 31 | - name: Run yaml-lint 32 | run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml") 33 | # nf-core: 34 | # runs-on: ubuntu-latest 35 | # steps: 36 | # - uses: actions/checkout@v2 37 | # - name: Install Nextflow 38 | # run: | 39 | # wget -qO- get.nextflow.io | bash 40 | # sudo mv nextflow /usr/local/bin/ 41 | # - uses: actions/setup-python@v1 42 | # with: 43 | # python-version: '3.6' 44 | # architecture: 'x64' 45 | # - name: Install dependencies 46 | # run: | 47 | # python -m pip install --upgrade pip 48 | # pip install nf-core 49 | # - name: Run nf-core lint 50 | # run: nf-core lint ${GITHUB_WORKSPACE} 51 | -------------------------------------------------------------------------------- /bin/generate_featurecount_mat.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript get_htseq_matrix.R designfile THREAD_NUM eg. 
Rscript get_htseq_matrix.R designfile_single.txt 10 3 | ## designfile: filename, control_or_treated, input_or_ip, group(default 0 is CONTROL_SITUATION else are TREATED_SITUATION) 4 | 5 | library(parallel) 6 | library(data.table) 7 | args<-commandArgs(T) 8 | designfile <- args[1] 9 | THREAD_NUM <- as.numeric(args[2]) 10 | 11 | designtable <- read.csv(designfile,header = TRUE,stringsAsFactors=FALSE, colClasses = c("character")) 12 | #Generate gene count matrix 13 | fc.files <- list.files("./",pattern = ".txt$") 14 | mclapply(unique(designtable$Group),function(x){ 15 | group_id <- x 16 | group.input.count.mat <- NULL 17 | for(pc in grep(paste0(".input_",group_id,"[.]bam"),fc.files,value = TRUE)){ 18 | pc.exp <- fread(pc,sep = "\t")[,c(1,7)] 19 | if(is.null(group.input.count.mat)){ 20 | group.input.count.mat <- pc.exp 21 | }else{ 22 | group.input.count.mat <- merge(group.input.count.mat,pc.exp,by = c("Geneid")) 23 | } 24 | } 25 | #parsing samplenames 26 | output_pattern = paste0("htseq_group_",group_id) #添加aligner 27 | fwrite(group.input.count.mat, file = paste0(output_pattern,"_input.count"), sep = "\t") 28 | }, 29 | mc.cores = THREAD_NUM 30 | ) 31 | group.mat.list = grep("htseq",list.files(path = "./",pattern = "input.count"), value = T) 32 | expression.matrix <- NULL 33 | for( file in group.mat.list ){ 34 | tmp.expression.table <- as.matrix(read.table(file, header = TRUE, row.names = 1, check.names=F)) 35 | expression.matrix <- cbind(expression.matrix, tmp.expression.table) 36 | } 37 | colnames(expression.matrix) <- as.matrix(lapply(strsplit(colnames(expression.matrix),".input"), function(x){ x[1]})) 38 | write.table(expression.matrix,file = "expression.matrix",quote=F) 39 | -------------------------------------------------------------------------------- /.github/workflows/ci_peakcalling_tools.yml: -------------------------------------------------------------------------------- 1 | name: nf-core CI (peakcalling tools) 2 | # This workflow is triggered on pushes and PRs to the repository. 
3 | # It runs the pipeline with the minimal test dataset to check that it completes without any syntax errors 4 | on: [push, pull_request] 5 | 6 | jobs: 7 | test: 8 | env: 9 | NXF_VER: ${{ matrix.nxf_ver }} 10 | NXF_ANSI_LOG: false 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | # Nextflow versions: check pipeline minimum and current latest 15 | nxf_ver: ["19.04.0", ""] 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Install Nextflow 19 | run: | 20 | wget -qO- get.nextflow.io | bash 21 | sudo mv nextflow /usr/local/bin/ 22 | - name: Pull docker image 23 | run: | 24 | docker pull kingzhuky/meripseqpipe:dev 25 | docker tag kingzhuky/meripseqpipe:dev kingzhuky/meripseqpipe:dev 26 | - name: Run pipeline with test data 27 | run: | 28 | # Run, test PeakCalling mode 'rank' of one peakcalling tool 'meyer' 29 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_metpeak --skip_macs2 --skip_matk -resume 30 | # Run, test PeakCalling mode 'rank' of one peakcalling tool 'metpeak' 31 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_meyer --skip_macs2 --skip_matk -resume 32 | # Run, test PeakCalling mode 'rank' of one peakcalling tool 'matk' 33 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_metpeak --skip_macs2 --skip_meyer -resume 34 | # Run, test PeakCalling mode 'rank' of one peakcalling tool 'macs2' 35 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_metpeak --skip_meyer --skip_matk -resume 36 | -------------------------------------------------------------------------------- /.github/workflows/branch.yml: -------------------------------------------------------------------------------- 1 | name: nf-core branch protection 2 | # This workflow is triggered on PRs to the master branch of the repository 3 | # It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev` 4 | on: 5 | pull_request: 6 | branches: [master] 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | steps: 12 | # PRs to the nf-core repo master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches 13 | - name: Check PRs 14 | if: github.repository == 'nf-core/meripseqpipe' 15 | run: | 16 | { [[ $(git remote get-url origin) == *nf-core/meripseqpipe ]] && [[ ${GITHUB_HEAD_REF} = "dev" ]]; } || [[ ${GITHUB_HEAD_REF} == "patch" ]] 17 | 18 | # If the above check failed, post a comment on the PR explaining the failure 19 | # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets 20 | - name: Post PR comment 21 | if: failure() 22 | uses: mshick/add-pr-comment@v1 23 | with: 24 | message: | 25 | Hi @${{ github.event.pull_request.user.login }}, 26 | 27 | It looks like this pull-request has been made against the ${{github.event.pull_request.head.repo.full_name}} `master` branch. 28 | The `master` branch on nf-core repositories should always contain code from the latest release. 29 | Because of this, PRs to `master` are only allowed if they come from the ${{github.event.pull_request.head.repo.full_name}} `dev` branch. 30 | 31 | You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page. 32 | 33 | Thanks again for your contribution! 
34 | repo-token: ${{ secrets.GITHUB_TOKEN }} 35 | allow-repeats: false -------------------------------------------------------------------------------- /.github/workflows/ci_peakcalling_methods.yml: -------------------------------------------------------------------------------- 1 | name: nf-core CI (peakcalling methods) 2 | # This workflow is triggered on pushes and PRs to the repository. 3 | # It runs the pipeline with the minimal test dataset to check that it completes without any syntax errors 4 | on: [push, pull_request] 5 | 6 | jobs: 7 | test: 8 | env: 9 | NXF_VER: ${{ matrix.nxf_ver }} 10 | NXF_ANSI_LOG: false 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | # Nextflow versions: check pipeline minimum and current latest 15 | nxf_ver: ["19.04.0", ""] 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Install Nextflow 19 | run: | 20 | wget -qO- get.nextflow.io | bash 21 | sudo mv nextflow /usr/local/bin/ 22 | - name: Pull docker image 23 | run: | 24 | docker pull kingzhuky/meripseqpipe:dev 25 | docker tag kingzhuky/meripseqpipe:dev kingzhuky/meripseqpipe:dev 26 | - name: Run pipeline with test data 27 | run: | 28 | # Run, test PeakCalling mode 'group' of one peakcalling tool 'macs2' 29 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_metpeak --skip_meyer --skip_matk --peakCalling_mode group -resume 30 | # Run, test PeakCalling mode 'mspc' of one peakcalling tool 'macs2' 31 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_metpeak --skip_meyer --skip_matk --peakCalling_mode group --peakMerged_mode mspc -resume 32 | # Run, test PeakCalling mode 'macs2' of one peakcalling tool 'macs2' 33 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker --skip_metpeak --skip_meyer --skip_matk --peakMerged_mode macs2 -resume 34 | # Run, test PeakCalling mode 'rank' of four peakcalling tools 35 | nextflow run ${GITHUB_WORKSPACE} -profile test_bam,docker -resume -------------------------------------------------------------------------------- /bin/QNB_quantification.R: -------------------------------------------------------------------------------- 1 | # Rscript QNB_quantification.R designfile 2 | library("QNB") 3 | args <- commandArgs(T) 4 | designfile <- args[1] 5 | ## read designfile to get the name of samples 6 | designtable <- read.csv(designfile,head = TRUE,stringsAsFactors=FALSE, colClasses = c("character")) 7 | filelist = list.files(path = "./",pattern = ".count") 8 | rpkm_peaks_list <- NULL 9 | for(sample_id in designtable$Sample_ID){ 10 | ## generate the dataframe of peak count 11 | input.count <- c() 12 | input.names <- c() 13 | input.samples <- c() 14 | for(input in grep(sample_id, grep("[.]input",filelist,value = TRUE), value = T)){ 15 | input.exp <- read.table(input,header=T,sep="\t",row.names= NULL,quote = "") 16 | input.count <- cbind(input.count,input.exp[,5]) 17 | input.names <- input.exp[,4] #peaks name 18 | input.samples <- c(input.samples,input) #samples name 19 | } 20 | colnames(input.count) <- input.samples 21 | rownames(input.count) <- input.names 22 | ip.count <- c() 23 | ip.names <- c() 24 | ip.samples <- c() 25 | for(ip in grep(sample_id, grep("[.]ip",filelist,value = TRUE), value = T)){ 26 | ip.exp <- read.table(ip,header=T,sep="\t",row.names= NULL,quote = "") 27 | ip.count <- cbind(ip.count,ip.exp[,5]) 28 | ip.names <- ip.exp[,4] #peaks name 29 | ip.samples <- c(ip.samples,ip) #samples name 30 | } 31 | colnames(ip.count) <- ip.samples 32 | rownames(ip.count) <- ip.names 33 | ## Run the QNB to generate quantificative 
value per sample 34 | result <- qnbtest(ip.count, ip.count, input.count, input.count, mode="blind") 35 | sample_quantification <- as.matrix(result$p.treated) 36 | colnames(sample_quantification) <- sample_id 37 | rpkm_peaks_list <- cbind(rpkm_peaks_list,sample_quantification) 38 | rownames(rpkm_peaks_list) <- rownames(ip.count) 39 | } 40 | write.table(rpkm_peaks_list,sep = "\t",file = "QNB_quantification.matrix",quote = F) 41 | -------------------------------------------------------------------------------- /bin/scrape_software_versions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from collections import OrderedDict 4 | import re 5 | 6 | # TODO nf-core: Add additional regexes for new tools in process get_software_versions 7 | regexes = { 8 | 'nf-core/meripseqpipe': ['v_pipeline.txt', r"(\S+)"], 9 | 'Nextflow': ['v_nextflow.txt', r"(\S+)"], 10 | 'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"], 11 | 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], 12 | } 13 | results = OrderedDict() 14 | results['nf-core/meripseqpipe'] = 'N/A' 15 | results['Nextflow'] = 'N/A' 16 | results['FastQC'] = 'N/A' 17 | results['MultiQC'] = 'N/A' 18 | 19 | # Search each file using its regex 20 | for k, v in regexes.items(): 21 | try: 22 | with open(v[0]) as x: 23 | versions = x.read() 24 | match = re.search(v[1], versions) 25 | if match: 26 | results[k] = "v{}".format(match.group(1)) 27 | except IOError: 28 | results[k] = False 29 | 30 | # Remove software set to false in results 31 | for k in list(results): 32 | if not results[k]: 33 | del(results[k]) 34 | 35 | # Dump to YAML 36 | print (''' 37 | id: 'software_versions' 38 | section_name: 'nf-core/meripseqpipe Software Versions' 39 | section_href: 'https://github.com/nf-core/meripseqpipe' 40 | plot_type: 'html' 41 | description: 'are collected at run time from the software output.' 42 | data: | 43 |     <dl class="dl-horizontal"> 44 | ''') 45 | for k,v in results.items(): 46 | print("        <dt>{}</dt><dd><samp>{}</samp></dd>".format(k,v)) 47 | print ("    </dl>") 48 | 49 | # Write out regexes as csv file: 50 | with open('software_versions.csv', 'w') as f: 51 | for k,v in results.items(): 52 | f.write("{}\t{}\n".format(k,v)) 53 | -------------------------------------------------------------------------------- /docs/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # MeRIPseqPipe: Troubleshooting 2 | 3 | ## Input files not found 4 | 5 | If no files, only one input file, or only read one and not read two are picked up, then something is wrong with your input file declaration: 6 | 7 | 1. The path must be enclosed in quotes (`'` or `"`) 8 | 2. The path must have at least one `*` wildcard character. This applies even if you are only running one paired-end sample. 9 | 3. When using the pipeline with paired-end data, the path must use `{1,2}` or `{R1,R2}` notation to specify read pairs. 10 | 4. If you are running single-end data, make sure to specify `--singleEnd` 11 | 12 | If the pipeline can't find your files then you will get the following error: 13 | 14 | ```bash 15 | ERROR ~ Cannot find any reads matching: *{1,2}.fastq.gz 16 | ``` 17 | 18 | Note that if your sample name is "messy" then you have to be very particular with your glob specification. A file name like `L1-1-D-2h_S1_L002_R1_001.fastq.gz` can be difficult enough for a human to read. Specifying `*{1,2}*.gz` won't give you what you want, whilst `*{R1,R2}*.gz` will. 19 | 20 | ## Data organization 21 | 22 | The pipeline can't take a list of multiple input files - it takes a glob expression. If your input files are scattered in different paths then we recommend that you generate a directory with symlinked files. If running in paired-end mode please make sure that your files are sensibly named so that they can be properly paired. See the previous point. 23 | 24 | ## Extra resources and getting help 25 | 26 | If you still have an issue with running the pipeline then feel free to contact us. 27 | Have a look at the [pipeline website](https://github.com/nf-core/m6APipe) to find out how. 28 | 29 | If you have problems that are related to Nextflow and not our pipeline then check out the [Nextflow gitter channel](https://gitter.im/nextflow-io/nextflow) or the [google group](https://groups.google.com/forum/#!forum/nextflow). 30 | -------------------------------------------------------------------------------- /.github/workflows/awstest.yml: -------------------------------------------------------------------------------- 1 | name: nf-core AWS test 2 | # This workflow is triggered on push to the master branch. 
3 | # It runs the -profile 'test' on AWS batch 4 | 5 | on: 6 | push: 7 | branches: 8 | - master 9 | 10 | jobs: 11 | run-awstest: 12 | name: Run AWS tests 13 | if: github.repository == 'nf-core/meripseqpipe' 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Setup Miniconda 17 | uses: goanpeca/setup-miniconda@v1.0.2 18 | with: 19 | auto-update-conda: true 20 | python-version: 3.7 21 | - name: Install awscli 22 | run: conda install -c conda-forge awscli 23 | - name: Start AWS batch job 24 | # TODO nf-core: You can customise CI pipeline run tests as required 25 | # For example: adding multiple test runs with different parameters 26 | # Remember that you can parallelise this by using strategy.matrix 27 | env: 28 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 29 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 30 | TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }} 31 | AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }} 32 | AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} 33 | AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} 34 | run: | 35 | aws batch submit-job \ 36 | --region eu-west-1 \ 37 | --job-name nf-core-test \ 38 | --job-queue $AWS_JOB_QUEUE \ 39 | --job-definition $AWS_JOB_DEFINITION \ 40 | --container-overrides '{"command": ["nf-core/meripseqpipe", "-r '"${GITHUB_SHA}"' -profile test --outdir s3://'"${AWS_S3_BUCKET}"'/test/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/test/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}' 41 | -------------------------------------------------------------------------------- /.github/workflows/awsfulltest.yml: -------------------------------------------------------------------------------- 1 | name: nf-core AWS full size tests 2 | # This workflow is triggered on published releases. 
3 | # It runs the -profile 'test_full' on AWS batch 4 | 5 | on: 6 | release: 7 | types: [published] 8 | 9 | jobs: 10 | run-awstest: 11 | name: Run AWS full tests 12 | if: github.repository == 'nf-core/meripseqpipe' 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Setup Miniconda 16 | uses: goanpeca/setup-miniconda@v1.0.2 17 | with: 18 | auto-update-conda: true 19 | python-version: 3.7 20 | - name: Install awscli 21 | run: conda install -c conda-forge awscli 22 | - name: Start AWS batch job 23 | # TODO nf-core: You can customise AWS full pipeline tests as required 24 | # Add full size test data (but still relatively small datasets for few samples) 25 | # on the `test_full.config` test runs with only one set of parameters 26 | # Then specify `-profile test_full` instead of `-profile test` on the AWS batch command 27 | env: 28 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 29 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 30 | TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }} 31 | AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }} 32 | AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} 33 | AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} 34 | run: | 35 | aws batch submit-job \ 36 | --region eu-west-1 \ 37 | --job-name nf-core-test \ 38 | --job-queue $AWS_JOB_QUEUE \ 39 | --job-definition $AWS_JOB_DEFINITION \ 40 | --container-overrides '{"command": ["nf-core/meripseqpipe", "-r '"${GITHUB_SHA}"' -profile test --outdir s3://'"${AWS_S3_BUCKET}"'/test/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/test/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}' 41 | -------------------------------------------------------------------------------- /bin/m6Aprediction.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #bash m6Aprediction.sh 3 | ## $1 argv 1 : matk_jar 4 | ## $2 argv 2 : designfile 5 | ## $3 argv 3 : fasta file 6 | ## $4 argv 4 : gtf file 7 | matk_jar=$1 8 | designfile=$2 9 | fasta_file=$3 10 | gtf_file=$4 11 | 12 | ### check if the file matk.jar exists 13 | if [ ! -f "$matk_jar" ]; then 14 | echo "Cannot find matk.jar. Please check the param of matk_jar" 1>&2 15 | exit 1 16 | fi 17 | 18 | faToTwoBit ${fasta_file} ${fasta_file/.fa/.2bit} 19 | awk -F "\t" '$3=="gene"{print }' $gtf_file > tmp.$gtf_file 20 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 21 | for group_id in $group_list 22 | do 23 | { 24 | bedfile=$(ls *merged_group_${group_id}.bed) 25 | ip_bam_file_array=$(echo *.ip_${group_id}*.bam | awk '{OFS=",";ORS=""}{for(x=1;x tmp.m6A_sites_${group_id}.bed 36 | } 37 | done 38 | wait 39 | cat tmp.m6A_sites*.bed | sortBed | mergeBed -s -c 4,6,7,8 -o first,first,collapse,collapse > tmp.m6A_sites_merged.bed 40 | awk -v gap=25 '{print $1"\t"$2-gap"\t"$3+gap"\t*\t*\t"$5}' tmp.m6A_sites_merged.bed | bedtools getfasta -s -fi ${fasta_file} -bed - | awk '$0!~">"{print $0}' > tmp.m6A_sites_merged.fa 41 | paste tmp.m6A_sites_merged.bed tmp.m6A_sites_merged.fa > m6A_sites_merged.bed 42 | rm tmp.* 43 | echo "Prediction sites of m6A done" -------------------------------------------------------------------------------- /bin/get_htseq_matrix.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript get_htseq_matrix.R designfile THREAD_NUM eg. 
Rscript get_htseq_matrix.R designfile_single.txt 10 3 | ## designfile: filename, control_or_treated, input_or_ip, group(default 0 is CONTROL_SITUATION else are TREATED_SITUATION) 4 | 5 | library(parallel) 6 | args<-commandArgs(T) 7 | designfile <- args[1] 8 | THREAD_NUM <- as.numeric(args[2]) 9 | 10 | designtable <- read.csv(designfile,header = TRUE,stringsAsFactors=FALSE, colClasses = c("character")) 11 | #Generate gene count matrix 12 | htseq.files <- list.files("./",pattern = ".txt") 13 | mclapply(unique(designtable$Group),function(x){ 14 | group_id <- x 15 | trans.htseq.input.count <- c() 16 | pc.names <- c() 17 | pc.samples <- c() 18 | for(pc in grep(paste0(".input_",group_id,"[.]bam"),htseq.files,value = TRUE)){ 19 | pc.exp <- read.table(pc,header=F,sep="\t",row.names=1,quote = "") 20 | trans.htseq.input.count <- cbind(trans.htseq.input.count,pc.exp[,1]) 21 | pc.names <- rownames(pc.exp) #genes name 22 | pc.samples <- c(pc.samples,pc) #samples name 23 | } 24 | rownames(trans.htseq.input.count) <- pc.names 25 | trans.htseq.input.count <- as.matrix(trans.htseq.input.count[c(-nrow(trans.htseq.input.count):-(nrow(trans.htseq.input.count)-4)),]) 26 | colnames(trans.htseq.input.count) <- pc.samples 27 | #parsing samplenames 28 | output_pattern = paste0("htseq_group_",group_id) #添加aligner 29 | write.table(trans.htseq.input.count, file = paste0(output_pattern,"_input.count") , sep ="\t", row.names = TRUE, col.names = TRUE, quote = FALSE) 30 | }, 31 | mc.cores = THREAD_NUM 32 | ) 33 | htseq.filelist = grep("htseq",list.files(path = "./",pattern = "input.count"), value = T) 34 | expression.matrix <- NULL 35 | for( file in htseq.filelist ){ 36 | tmp.expression.table <- as.matrix(read.table(file, header = TRUE, row.names = 1, check.names=F)) 37 | expression.matrix <- cbind(expression.matrix, tmp.expression.table) 38 | } 39 | colnames(expression.matrix) <- as.matrix(lapply(strsplit(colnames(expression.matrix),".input"), function(x){ x[1]})) 40 | write.table(expression.matrix,file = "expression.matrix",quote=F) -------------------------------------------------------------------------------- /bin/m6Am.R: -------------------------------------------------------------------------------- 1 | #Find m6Am 5'UTR peaks 2 | anno_5UTR=overlap.anno[which(overlap.anno$Gene.site=="5UTR"),] 3 | 4 | gtf.temp=fread("/data/database/hg38/GENCODE/gencode.v25.annotation.gtf",sep="\t",skip = 5,data.table = F) 5 | gtf.temp=cbind(gtf.temp,Transcript.id=strsplit2(strsplit2(gtf.temp$V9,split = "transcript_id ")[,2],split = ";")[,1]) 6 | gtf.temp$Transcript.id=gsub("\"","",gtf.temp$Transcript.id) 7 | 8 | write.table(strsplit2(anno_5UTR$Peak.id,split=":|-"),"UTR5.peak.bed",row.names = F,col.names = F,quote = F,sep="\t") 9 | system("fastaFromBed -fi /data/database/hg38/genome.fa -bed UTR5.peak.bed -fo UTR5.peak.fa") 10 | system("/data/software/homer/bin/homer2 find -i UTR5.peak.fa -m /data/xingyang/m6A_zhengjian/BCA.motif -p 5 > /data/xingyang/m6A_zhengjian/analysis/BCA_peak_offset.txt") 11 | BCA_in_5UTR_offset=read.table("analysis/BCA_peak_offset.txt",header=F) 12 | 13 | anno_5UTR=overlap.anno[unique(BCA_in_5UTR_offset$V1),] 14 | anno_5UTR=merge(gtf.temp,anno_5UTR,by="Transcript.id",all.y=T) 15 | anno_5UTR=anno_5UTR[which(anno_5UTR$V3=="UTR"),] 16 | anno_5UTR$temp.start=anno_5UTR$V4-anno_5UTR$Start 17 | anno_5UTR$temp.end=anno_5UTR$V5-anno_5UTR$Start 18 | anno_5UTR[which(anno_5UTR$temp.start>0),"temp.start"]=1 19 | anno_5UTR[which(anno_5UTR$temp.start<0),"temp.start"]=(-1) 20 | 
anno_5UTR[which(anno_5UTR$temp.end>0),"temp.end"]=1 21 | anno_5UTR[which(anno_5UTR$temp.end<0),"temp.end"]=(-1) 22 | anno_5UTR=anno_5UTR[which((anno_5UTR$temp.start*anno_5UTR$temp.end)<=0),] 23 | anno_5UTR.bed=cbind(anno_5UTR$V1,anno_5UTR$V4,anno_5UTR$V5,anno_5UTR$Peak.id,".",anno_5UTR$V7) 24 | write.table(anno_5UTR.bed,"UTR5.peak.bed",row.names = F,col.names = F,quote = F,sep="\t") 25 | system("fastaFromBed -fi /data/database/hg38/genome.fa -bed UTR5.peak.bed -s -name -fo UTR5.peak.fa") 26 | 27 | temp.utr5=read.table("UTR5.peak.fa",sep="\n") 28 | temp.utr5=cbind(temp.utr5,substr(temp.utr5[,1],1,1)) 29 | 30 | i=2 31 | n=nrow(temp.utr5) 32 | temp.utr5=cbind(temp.utr5,type=NA) 33 | while(i<=n){ 34 | if(temp.utr5[i,2]=="A"){ 35 | temp.utr5[c(i-1,i),"type"]="m6Am" 36 | } 37 | i=i+2 38 | } 39 | m6Am=na.omit(temp.utr5) 40 | m6Am=m6Am[grep(">",m6Am$V1),] 41 | m6Am=gsub(">","",m6Am$V1) 42 | m6Am=strsplit2(m6Am,split="[(]")[,1] -------------------------------------------------------------------------------- /bin/intersec.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | use strict; 3 | use Getopt::Long; 4 | 5 | 6 | my $file1; 7 | my $column1; 8 | my $file2; 9 | my $column2; 10 | my $type; 11 | GetOptions( 12 | 'a=s' => \$file1, 13 | 'na=i' => \$column1, 14 | 'b=s' => \$file2, 15 | 'nb=i' => \$column2, 16 | 't=s' => \$type 17 | ); 18 | open FH1, $file1 or die "can not open $file1: $!"; 19 | 20 | my %id1; 21 | while(){ 22 | chomp; 23 | $_=~s/"//g; 24 | my @field=split /\s+/; 25 | if(!exists $id1{$field[$column1-1]}){ 26 | $id1{$field[$column1-1]}=$_; 27 | }else{ 28 | $id1{$field[$column1-1]}.="\n".$_; 29 | } 30 | } 31 | close(FH1); 32 | open FH2,$file2 or die "can not open $file2:$!"; 33 | my %id2; 34 | while(){ 35 | chomp; 36 | $_=~s/"//g; 37 | my @field=split /\s+/; 38 | if(!exists $id2{$field[$column2-1]}){ 39 | $id2{$field[$column2-1]}=$_; 40 | }else{ 41 | $id2{$field[$column2-1]}.="\n".$_; 42 | } 43 | } 44 | 45 | if($type eq "ua"){ 46 | foreach my $a (keys %id1){ 47 | if(!exists $id2{$a}){ 48 | print $id1{$a}."\n"; 49 | } 50 | } 51 | } 52 | 53 | if($type eq "ub"){ 54 | foreach my $b (keys %id2){ 55 | if(!exists $id1{$b}){ 56 | print $id2{$b}."\n"; 57 | } 58 | } 59 | } 60 | 61 | if($type eq "d"){ 62 | foreach my $b (keys %id2){ 63 | if(exists $id1{$b}){ 64 | print $b."\n"; 65 | } 66 | } 67 | 68 | } 69 | 70 | if($type eq "da"){ 71 | foreach my $b (keys %id2){ 72 | if(exists $id1{$b}){ 73 | print $id1{$b}."\n"; 74 | } 75 | } 76 | } 77 | if($type eq "db"){ 78 | foreach my $b (keys %id2){ 79 | if(exists $id1{$b}){ 80 | print $id2{$b}."\n"; 81 | } 82 | } 83 | } 84 | if($type eq "dab"){ 85 | foreach my $b (keys %id2){ 86 | if(exists $id1{$b}){ 87 | my @tmp1=split "\n",$id1{$b}; 88 | my @tmp2=split "\n",$id2{$b}; 89 | foreach my $t1 (@tmp1){ 90 | foreach my $t2 (@tmp2){ 91 | print $t1."\t".$t2."\n"; 92 | } 93 | } 94 | #print $id1{$b}."\t".$id2{$b}."\n"; 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /bin/edgeR.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript edgeR.R eg. 
Rscript edgeR.R designfile_single.txt T_vs_N 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | ### compare_str: Compairision design (eg: A_vs_B) 5 | library("edgeR") 6 | args<-commandArgs(T) 7 | designfile <- args[1] 8 | compare_str <- as.character(args[2]) 9 | 10 | designtable <- read.csv(designfile,head = TRUE,stringsAsFactors=FALSE, colClasses = c("character")) 11 | # Running edgeR by compare.file 12 | ## while there are only 2 groups, running edgeR without compare.file 13 | if(length(unique(designtable$Group)) < 2){ 14 | stop( "The count of Group is less than two, please check your designfile.") 15 | }else if( compare_str == "two_group" ){ 16 | # Running edgeR without compare_str beacause of only two groups 17 | ## Combine expression matrix 18 | group_id_1 <- unique(designtable$Group)[1] 19 | group_id_2 <- unique(designtable$Group)[2] 20 | }else{ 21 | # Running edgeR with compare_str 22 | ## Combine expression matrix 23 | group_id_1 <- strsplit(as.character(compare_str), "_vs_")[[1]][1] 24 | group_id_2 <- strsplit(as.character(compare_str), "_vs_")[[1]][2] 25 | } 26 | control_database = read.table(paste0("htseq_group_", group_id_1, "_input.count"), header = TRUE, row.names = 1) 27 | treated_database = read.table(paste0("htseq_group_", group_id_2, "_input.count"), header = TRUE, row.names = 1) 28 | combined_database <- cbind(control_database,treated_database) 29 | group <- factor(c(rep(group_id_1,ncol(control_database)), rep(group_id_2,ncol(treated_database)))) #setting factors 30 | y <- DGEList(counts=combined_database,group=group) 31 | rownames(y) <- rownames(combined_database) 32 | y <- calcNormFactors(y) 33 | design <- model.matrix(~group) 34 | y <- estimateDisp(y,design) 35 | #To perform likelihood ratio tests: 36 | fit <- glmFit(y,design) 37 | lrt <- glmLRT(fit,coef=2) 38 | topTags(lrt) 39 | ### set output_name 40 | lrt$table$padj <- p.adjust(lrt$table$PValue,"BH") 41 | lrt.res <- lrt$table[order(lrt$table$padj),] 42 | colnames(lrt.res) <- c("log2FoldChange","logCPM","LR","pvalue","padj") 43 | output_name <- paste0("edgeR_group_",group_id_1, "_",group_id_2) 44 | write.csv(combined_database, file = paste0(output_name,".matirx") ) 45 | write.csv(lrt.res, file = paste0(output_name, ".csv")) 46 | -------------------------------------------------------------------------------- /bin/bedtools_quantification.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript bedtools_quantification.R 3 | ### the content of bam_stat_summary_file: example.bam TOTAL_READS 4 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 5 | args <- commandArgs(T) 6 | designfile <- args[1] 7 | bam_stat_summary <- args[2] 8 | designtable <- read.csv(designfile,head = TRUE,stringsAsFactors=FALSE, colClasses = c("character")) 9 | bam_stat_table <- read.table(bam_stat_summary,row.names = 1) 10 | filelist =list.files(path = "./",pattern = ".count") 11 | ## Generate the quantificative value of peaks referred to RPKM 12 | rpkm_peaks_list <- NULL 13 | #rpkm_peaks_list1 <- NULL 14 | #rpkm_peaks_list2 <- NULL 15 | for(sample_id in designtable$Sample_ID){ 16 | input_count_file <- grep(paste0("[.]",sample_id,"[.]input"),filelist,value = TRUE) 17 | input_count_table <- read.table(file = input_count_file, sep = "\t", row.names = NULL,header = T) 18 | bam_stat_index = grep(paste0("^",sample_id,"[.]input"),rownames(bam_stat_table)) 19 | input_rpkm = apply(input_count_table,1,function(x) 
(as.numeric(x[5])/(as.numeric(x[3])-as.numeric(x[2]))*1000/bam_stat_table[bam_stat_index,]*1000000)) 20 | 21 | ip_count_file <- grep(paste0("[.]",sample_id,"[.]ip"),filelist,value = TRUE) 22 | ip_count_table <- read.table(file = ip_count_file, sep = "\t", row.names = NULL, header = T) 23 | bam_stat_index = grep(paste0("^",sample_id,"[.]ip"),rownames(bam_stat_table)) 24 | ip_rpkm = apply(ip_count_table,1,function(x) (as.numeric(x[5])/(as.numeric(x[3])-as.numeric(x[2]))*1000/bam_stat_table[bam_stat_index,]*1000000)) 25 | 26 | rpkm <- as.matrix((ip_rpkm+1)/(input_rpkm+1)) 27 | #rpkm1<- as.matrix((ip_rpkm)/(input_rpkm+1)) 28 | #rpkm2<- as.matrix((ip_rpkm)/(input_rpkm+ip_rpkm)) 29 | colnames(rpkm)[1] <- sample_id 30 | #colnames(rpkm1)[1] <- sample_id 31 | #colnames(rpkm2)[1] <- sample_id 32 | rpkm_peaks_list <- cbind(rpkm_peaks_list,rpkm) 33 | #rpkm_peaks_list1 <- cbind(rpkm_peaks_list1,rpkm1) 34 | #rpkm_peaks_list2 <- cbind(rpkm_peaks_list2,rpkm2) 35 | } 36 | rownames(rpkm_peaks_list) <- input_count_table[,4] 37 | #rownames(rpkm_peaks_list1) <- input_count_table[,4] 38 | #rownames(rpkm_peaks_list2) <- input_count_table[,4] 39 | write.table(rpkm_peaks_list,sep = "\t",file = "bedtools_quantification.matrix",quote = F) 40 | #write.table(rpkm_peaks_list1,sep = "\t",file = "bedtools_quantification.inputadd.matrix",quote = F) 41 | #write.table(rpkm_peaks_list2,sep = "\t",file = "bedtools_quantification.alladd.matrix",quote = F) 42 | -------------------------------------------------------------------------------- /docs/configuration/reference_genomes.md: -------------------------------------------------------------------------------- 1 | # nf-core/m6APipe: Reference Genomes Configuration 2 | 3 | The nf-core/m6APipe pipeline needs a reference genome for alignment and annotation. 4 | 5 | These paths can be supplied on the command line at run time (see the [usage docs](../usage.md)), 6 | but for convenience it's often better to save these paths in a nextflow config file. 7 | See below for instructions on how to do this. 8 | Read [Adding your own system](adding_your_own.md) to find out how to set up custom config files. 9 | 10 | ## Adding paths to a config file 11 | 12 | Specifying long paths every time you run the pipeline is a pain. 13 | To make this easier, the pipeline comes configured to understand reference genome keywords which correspond to preconfigured paths, meaning that you can just specify `--genome ID` when running the pipeline. 14 | 15 | Note that this genome key can also be specified in a config file if you always use the same genome. 16 | 17 | To use this system, add paths to your config file using the following template: 18 | 19 | ```nextflow 20 | params { 21 | genomes { 22 | 'YOUR-ID' { 23 | fasta = '/genome.fa' 24 | } 25 | 'OTHER-GENOME' { 26 | // [..] 27 | } 28 | } 29 | // Optional - default genome. Ignored if --genome 'OTHER-GENOME' specified on command line 30 | genome = 'YOUR-ID' 31 | } 32 | ``` 33 | 34 | You can add as many genomes as you like as long as they have unique IDs. 35 | 36 | ## illumina iGenomes 37 | 38 | To make the use of reference genomes easier, illumina has developed a centralised resource called [iGenomes](https://support.illumina.com/sequencing/sequencing_software/igenome.html). 39 | Multiple reference index types are held together with consistent structure for multiple genomes. 40 | 41 | We have put a copy of iGenomes up onto AWS S3 hosting and this pipeline is configured to use this by default. 
42 | The hosting fees for AWS iGenomes are currently kindly funded by a grant from Amazon. 43 | The pipeline will automatically download the required reference files when you run the pipeline. 44 | For more information about the AWS iGenomes, see 45 | 46 | Downloading the files takes time and bandwidth, so we recommend making a local copy of the iGenomes resource. 47 | Once downloaded, you can customise the variable `params.igenomes_base` in your custom configuration file to point to the reference location. 48 | For example: 49 | 50 | ```nextflow 51 | params.igenomes_base = '/path/to/data/igenomes/' 52 | ``` 53 | -------------------------------------------------------------------------------- /bin/DESeq2.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript DESeq2.R eg. Rscript DESeq2.R designfile_single.txt T_vs_N 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | ### compare_str: Compairision design (eg: A_vs_B) 5 | 6 | library(DESeq2) 7 | args<-commandArgs(T) 8 | designfile <- args[1] 9 | compare_str <- as.character(args[2]) 10 | 11 | designtable <- read.csv(designfile,head = TRUE,stringsAsFactors=FALSE, colClasses = c("character")) 12 | # Running DEseq2 by compare_str 13 | ## while there are only 2 groups, running DEseq2 without compare_str 14 | if( length(unique(designtable$Group)) < 2 ){ 15 | stop( "The count of Group is less than two, please check your designfile.") 16 | }else if( compare_str == "two_group" ){ 17 | # Running DESeq2 without compare_str beacause of only two groups 18 | ## Combine expression matrix 19 | group_id_1 <- unique(designtable$Group)[1] 20 | group_id_2 <- unique(designtable$Group)[2] 21 | }else{ 22 | # Running DESeq2 with compare_str 23 | ## Combine expression matrix 24 | group_id_1 <- strsplit(as.character(compare_str), "_vs_")[[1]][1] 25 | group_id_2 <- strsplit(as.character(compare_str), "_vs_")[[1]][2] 26 | } 27 | control_database = read.table(paste0("htseq_group_", group_id_1, "_input.count"), header = TRUE, row.names = 1, check.names = FALSE) 28 | treated_database = read.table(paste0("htseq_group_", group_id_2, "_input.count"), header = TRUE, row.names = 1, check.names = FALSE) 29 | combined_database <- cbind(control_database,treated_database) 30 | condition <- factor(c(rep(group_id_1,ncol(control_database)), rep(group_id_2,ncol(treated_database)))) #setting factors 31 | ### assign gene names 32 | colData <- data.frame(row.names=colnames(combined_database), group = condition) 33 | dds <- DESeqDataSetFromMatrix(countData = combined_database,colData = colData,design = ~ group) 34 | rownames(dds) <- rownames(combined_database) 35 | #dds <- dds[ rowSums(counts(dds)) > 1, ] 36 | dds <- DESeq(dds) 37 | ## FoldChange = group_id_2 / group_id_1 38 | res <- results(object = dds, contrast = c("group",group_id_2,group_id_1)) 39 | table(res$padj <0.05) 40 | res <- res[order(res$padj),] 41 | #resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE) 42 | #resdata2=resdata[resdata$log2FoldChange > 1|resdata$log2FoldChange < -1, ] 43 | ### set output_name 44 | output_name <- paste0("DESeq2_group_",group_id_1, "_",group_id_2) 45 | write.csv(res, file = paste0(output_name, ".csv")) 46 | #write.csv(resdata2,file = paste0(output_name, "_log2.csv"),row.names =FALSE) 47 | -------------------------------------------------------------------------------- /bin/MATK_diffm6A.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ## MATK_diffpeakCalling.sh 3 | ## $1 argv 1 : matk_jar 4 | ## $2 argv 2 : designfile 5 | ## $3 argv 3 : gtf file 6 | ## $4 argv 4 : compare_str 7 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 8 | ### compare_str: Compairision design (eg: A_vs_B) 9 | matk_jar=$1 10 | designfile=$2 11 | gtf_file=$3 12 | compare_str=$4 13 | merged_bed=$5 14 | 15 | # setting the function of Running the quantification mode of MATK by two names of groups 16 | function matk_diffm6a_by_two_id() 17 | { 18 | group_id_1=$1 19 | group_id_2=$2 20 | matk_jar=$3 21 | gtf_file=$4 22 | control_ip_bam_file_array=$(echo *ip_${group_id_1}*.bam | awk '{OFS=",";ORS=""}{for(x=1;x1{print $1":"$2"-"$3,$4,$5,$6,$7,log($8)/log(2),$9,$10}' tmp.${group_id_1}_${group_id_2}.txt > MATK_diffm6A_${group_id_1}_${group_id_2}.txt 37 | } 38 | 39 | if [ "$compare_str" != "two_group" ]; then 40 | # Running MATK quantification with compare_str 41 | group_id_1=$(echo $compare_str | awk 'BEGIN{FS="_vs_"}{print $1}') 42 | group_id_2=$(echo $compare_str | awk 'BEGIN{FS="_vs_"}{print $2}') 43 | matk_diffm6a_by_two_id $group_id_1 $group_id_2 $matk_jar $gtf_file 44 | else 45 | # Running MATK quantification without compare_str beacause of only two groups 46 | echo "no compare file" 47 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS="\t"}{print $0}') 48 | group_id_1=$(echo $group_list | awk 'BEGIN{FS="\t"}{print $1}') 49 | group_id_2=$(echo $group_list | awk 'BEGIN{FS="\t"}{print $2}') 50 | matk_diffm6a_by_two_id $group_id_1 $group_id_2 $matk_jar $gtf_file 51 | fi 52 | wait 53 | echo "diffMATK done" -------------------------------------------------------------------------------- /assets/email_template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | nf-core/meripseqpipe Pipeline Report 10 | 11 | 12 |
13 | 14 | 15 | 16 |

nf-core/meripseqpipe v${version}

17 |

Run Name: $runName

18 | 19 | <% if (!success){ 20 | out << """ 21 |
22 |

nf-core/meripseqpipe execution completed unsuccessfully!

23 |

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

24 |

The full error message was:

25 |
${errorReport}
26 |
27 | """ 28 | } else { 29 | out << """ 30 |
31 | nf-core/meripseqpipe execution completed successfully! 32 |
33 | """ 34 | } 35 | %> 36 | 37 |

The workflow was completed at $dateComplete (duration: $duration)

38 |

The command used to launch the workflow was as follows:

39 |
$commandLine
40 | 41 |

Pipeline Configuration:

42 | 43 | 44 | <% out << summary.collect{ k,v -> "" }.join("\n") %> 45 | 46 |
$k
$v
47 | 48 |

nf-core/meripseqpipe

49 |

https://github.com/nf-core/meripseqpipe

50 | 51 |
52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /bin/create_IGV_js.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | fasta=$1 3 | gtf=$2 4 | merged_peak_file=$3 5 | designfile=$4 6 | echo "Start to generate IGV.js" 7 | 8 | ## setting tmp files' name 9 | bedgraph_tracks_file=tmp.bedgraph.tracks 10 | peaks_tracks_file=tmp.peaks.tracks 11 | 12 | ## combined tracks of bedgraph 13 | sampleinfo_list=$(awk 'BEGIN{FS=","}NR>1{print $1","$4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 14 | for sample_group_id in ${sampleinfo_list} 15 | do 16 | { 17 | sample_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $1}') 18 | group_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $2}') 19 | bedgraph_input_file=$(ls ${sample_id}.input_*.igv.bedgraph) 20 | bedgraph_ip_file=$(ls ${sample_id}.ip_*.igv.bedgraph) 21 | cat >> ${bedgraph_tracks_file} << EOF 22 | { 23 | url: '${bedgraph_input_file}', 24 | name: '${sample_id}.input', 25 | color: 'rgb(200,0,0)', 26 | type: "wig", 27 | sourceType: "file", 28 | autoscaleGroup: 'group_${group_id}.${sample_id}' 29 | }, 30 | { 31 | url: '${bedgraph_ip_file}', 32 | name: '${sample_id}.ip', 33 | type: "wig", 34 | sourceType: "file", 35 | color: 'rgb(200,0,0)', 36 | autoscaleGroup: 'group_${group_id}.${sample_id}' 37 | }, 38 | EOF 39 | } 40 | done 41 | 42 | ## combined tracks of merged group peaks 43 | groups_peak_file=$(ls *_merged_group_*igv.bed) 44 | for peak_file in ${groups_peak_file} 45 | do 46 | { 47 | cat >> ${peaks_tracks_file} << EOF 48 | { 49 | type: "annotation", 50 | format: "bed", 51 | url: '${peak_file}', 52 | name: "${peak_file}" 53 | }, 54 | EOF 55 | } 56 | done 57 | 58 | ## combined tracks and allpeaks track 59 | cat ${bedgraph_tracks_file} ${peaks_tracks_file} > tmp.tracks 60 | cat >> tmp.tracks << EOF 61 | { 62 | type: "annotation", 63 | format: "bed", 64 | url: '${merged_peak_file}', 65 | name: "${merged_peak_file}" 66 | } 67 | EOF 68 | tracks_js=$(cat tmp.tracks) 69 | 70 | ## combined all info 71 | cat>igv.js< 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | ### compare_str: Compairision design (eg: A_vs_B) 5 | args <- commandArgs(T) 6 | designfile <- args[1] 7 | quantification_matrix_file <- args[2] 8 | compare_str <- as.character(args[3]) 9 | 10 | designtable <- read.csv(designfile, head = TRUE, stringsAsFactors=FALSE, colClasses = c("character")) 11 | quantification_matrix = read.table(quantification_matrix_file ,sep = "\t",header = T, row.names = 1) 12 | # generate design matrix 13 | design.matrix <- as.data.frame(designtable$Group) 14 | rownames(design.matrix) <- designtable$Sample_ID 15 | colnames(design.matrix) <- "Type" 16 | 17 | # Wilcoxon test for the vector of two groups 18 | row_wilcox <- function(design.matrix,group_id_1,group_id_2,x,test_mode=""){ 19 | group1 <- as.character(rownames(subset(design.matrix,Type==group_id_1))) 20 | group2 <- as.character(rownames(subset(design.matrix,Type==group_id_2))) 21 | if (test_mode=="paired"){ 22 | res_wix0 <- wilcox.test(x[which(rownames(design.matrix)%in%group1)],x[which(rownames(design.matrix)%in%group2)], paired = T) 23 | } else { 24 | res_wix0 <- wilcox.test(x[which(rownames(design.matrix)%in%group1)],x[which(rownames(design.matrix)%in%group2)]) 25 | } 26 | res_wix0$log2FC = log2(mean(x[which(rownames(design.matrix)%in%group2)])/mean(x[which(rownames(design.matrix)%in%group1)])) 27 | res_wix <- 
c(log2FC=res_wix0$log2FC,pvalue=res_wix0$p.value,statistic=res_wix0$statistic) 28 | return(res_wix) 29 | } 30 | 31 | # Get the information of groups from compare_str 32 | if(length(unique(design.matrix$Type)) < 2){ 33 | stop( "The count of Group is less than two, please check your designfile.") 34 | }else if( compare_str == "two_group" ){ 35 | # Get the information without compare_str beacause of only two groups 36 | group_id_1 <- unique(design.matrix$Type)[1] 37 | group_id_2 <- unique(design.matrix$Type)[2] 38 | }else{ 39 | # Running MeTDiff quantification with compare_str 40 | group_id_1 <- strsplit(as.character(compare_str), "_vs_")[[1]][1] 41 | group_id_2 <- strsplit(as.character(compare_str), "_vs_")[[1]][2] 42 | } 43 | cat("peak number ",dim(quantification_matrix)[1],"\n") 44 | 45 | # Run the Wilcoxon test for the quantifacative value of every peak 46 | test_mode="" 47 | res_wix_lst <- apply(na.omit(quantification_matrix[,3:ncol(quantification_matrix)]),1,function(x){row_wilcox(design.matrix,group_id_1,group_id_2,x,test_mode)}) 48 | res_wix_lst = as.data.frame(t(res_wix_lst)) 49 | res_wix_lst$padj = p.adjust(res_wix_lst$pvalue,method = "BH") 50 | res_wix_lst$BY = p.adjust(res_wix_lst$pvalue,method = "bonferroni") 51 | cat("DM peaks pvalue(0.05)",sum(res_wix_lst$pvalue <=0.05),"\n") 52 | cat("DM peaks FDR(0.05)",sum(res_wix_lst$padj <=0.05),"\n") 53 | output_name <- paste0("bedtools_diffm6A_",group_id_1, "_", group_id_2) 54 | write.table(res_wix_lst, file = paste0(output_name,".txt"), sep = "\t", quote = F) 55 | 56 | -------------------------------------------------------------------------------- /bin/QNB_diffm6A.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript QNB_diffm6A.R 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | ### compare_str: Compairision design (eg: A_vs_B) 5 | library(QNB) 6 | args <- commandArgs(T) 7 | designfile <- args[1] 8 | quantification_matrix_file <- args[2] 9 | compare_str <- as.character(args[3]) 10 | 11 | designtable <- read.csv(designfile,head = TRUE,stringsAsFactors=FALSE, colClasses = c("character")) 12 | # Running QNB quantification 13 | if(length(unique(designtable$Group)) < 2){ 14 | stop( "The count of Group is less than two, please check your designfile.") 15 | }else if( compare_str == "two_group" ){ 16 | # Running QNB quantification without compare_str beacause of only two groups 17 | group_id_1 <- unique(designtable$Group)[1] 18 | group_id_2 <- unique(designtable$Group)[2] 19 | }else{ 20 | # Running QNB quantification with compare_str 21 | group_id_1 <- strsplit(as.character(compare_str), "_vs_")[[1]][1] 22 | group_id_2 <- strsplit(as.character(compare_str), "_vs_")[[1]][2] 23 | } 24 | 25 | # combine the matrix by groups 26 | countlist <- NULL 27 | for(group_id in c(group_id_1,group_id_2)){ 28 | ## generate the list of input count 29 | input.count <- c() 30 | input.names <- c() 31 | input.samples <- c() 32 | for( input in grep(paste0(quantification_matrix_file,"[.]",group_id,"[.]"), list.files(pattern = "input.count"), value = T) ){ 33 | input.exp <- read.table(input,header=T,sep="\t",row.names= NULL,quote = "") 34 | input.count <- cbind(input.count,input.exp[,5]) 35 | input.names <- input.exp[,4] #peaks name 36 | input.samples <- c(input.samples,input) #samples name 37 | } 38 | colnames(input.count) <- input.samples 39 | rownames(input.count) <- input.names 40 | countlist[[paste0(group_id,"_input")]] <- input.count 41 | 42 | ## generate the list of ip count 
43 | ip.count <- c() 44 | ip.names <- c() 45 | ip.samples <- c() 46 | for( ip in grep(paste0(quantification_matrix_file,"[.]",group_id,"[.]"), list.files(pattern = "ip.count"), value = T) ){ 47 | ip.exp <- read.table(ip,header=T,sep="\t",row.names= NULL,quote = "") 48 | ip.count <- cbind(ip.count,ip.exp[,5]) 49 | ip.names <- ip.exp[,4] #peaks name 50 | ip.samples <- c(ip.samples,ip) #samples name 51 | } 52 | colnames(ip.count) <- ip.samples 53 | rownames(ip.count) <- ip.names 54 | countlist[[paste0(group_id,"_ip")]] <- ip.count 55 | } 56 | ## Run the QNB by using the count of peaks 57 | meth1 = countlist[[paste0(group_id_1,"_ip")]] 58 | meth2 = countlist[[paste0(group_id_2,"_ip")]] 59 | unmeth1 = countlist[[paste0(group_id_1,"_input")]] 60 | unmeth2 = countlist[[paste0(group_id_2,"_input")]] 61 | output_name <- paste0("QNB_diffm6A_",group_id_1, "_",group_id_2) 62 | dir.create(output_name) 63 | result <- qnbtest(meth1, meth2, unmeth1, unmeth2, mode="auto", output.dir = output_name) 64 | colnames(result) <- c("p.treated","p.control","log2FC","log2.OR","pvalue","qvalue","padj") 65 | write.table(result, file = paste0(output_name,".txt"), sep = "\t", quote = F) -------------------------------------------------------------------------------- /bin/bed_count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #bash bed_count.sh 3 | #$1 argv 1 : designfile 4 | #$2 argv 2 : THREAD_NUM 5 | #$3 argv 3 : merge_bed_file 6 | #$4 argv 4 : output_bam_stat_file 7 | designfile=$1 8 | THREAD_NUM=$2 9 | merge_bed_file=$3 10 | output_bam_stat_file=$4 11 | 12 | # Define a multi-threaded run channel 13 | mkfifo tmp 14 | exec 9<>tmp 15 | for ((i=1;i<=${THREAD_NUM:=1};i++)) 16 | do 17 | echo >&9 18 | done 19 | 20 | # Create the file about the summary of bam stat 21 | echo "Total_Reads" > $output_bam_stat_file 22 | awk '{ print $1"\t"$2"\t"$3"\t"$4}' ${merge_bed_file} > tmp.${merge_bed_file} 23 | 24 | sampleinfo_list=$(awk 'BEGIN{FS=","}NR>1{print $1","$4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 25 | for sample_group_id in ${sampleinfo_list} 26 | do 27 | read -u 9 28 | { 29 | sample_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $1}') 30 | group_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $2}') 31 | 32 | #Define the input/ip file of the sample 33 | input_bam_file=$(ls ${sample_id}.input*.bam | awk '{ORS=" "}{print $0}') 34 | ip_bam_file=$(ls ${sample_id}.ip*.bam | awk '{ORS=" "}{print $0}') 35 | 36 | ## Create files and print the name of samples && Get the Total reads of the samples of input and ip 37 | echo -e ${input_bam_file}"\t" | awk 'BEGIN{ORS=""}{print $0}' > ${sample_id}.bam_stat.txt 38 | samtools view -c ${input_bam_file} >> ${sample_id}.bam_stat.txt 39 | echo -e ${ip_bam_file}"\t" | awk 'BEGIN{ORS=""}{print $0}' >> ${sample_id}.bam_stat.txt 40 | samtools view -c ${ip_bam_file} >> ${sample_id}.bam_stat.txt 41 | 42 | ## Setting colnames of peaks input/ip count 43 | echo $input_bam_file \ 44 | | awk 'BEGIN{ORS=""}{print "chrom\tchromStart\tchromEND\tPeakName\t"}{for(x=1;x ${merge_bed_file}.${group_id}.${sample_id}.input.count 46 | echo ${ip_bam_file} \ 47 | | awk 'BEGIN{ORS=""}{print "chrom\tchromStart\tchromEND\tPeakName\t"}{for(x=1;x ${merge_bed_file}.${group_id}.${sample_id}.ip.count 49 | 50 | ## Count input/ip peaks 51 | 52 | bedtools multicov -bams ${input_bam_file} -bed tmp.${merge_bed_file} >> ${merge_bed_file}.${group_id}.${sample_id}.input.count 53 | bedtools multicov -bams ${ip_bam_file} -bed 
tmp.${merge_bed_file} >> ${merge_bed_file}.${group_id}.${sample_id}.ip.count 54 | echo >&9 55 | 56 | # awk -v bam="$input_bam" -v pre="$prefix" ' 57 | # {print " bedtools multicov -bams '${input_bam_file}' -bed tmp.'${merge_bed_file}' >> '${merge_bed_file}'.'${group_id}'.'${sample_id}'.input.count; \ 58 | # bedtools multicov -bams '${ip_bam_file}' -bed tmp.'${merge_bed_file}' >> '${merge_bed_file}'.'${group_id}'.'${sample_id}'.ip.count; \ 59 | # sortBed -i ./"pre".tmp/input/"$1".bed | intersectBed -a '${genomebin_dir}'"$1".bin25.bed -b - -sorted -c > ./"pre".tmp/input/"$1".bin25.txt"}' $chrName_file \ 60 | # | xargs -iCMD -P$THREAD_NUM bash -c CMD 61 | }& 62 | done 63 | wait 64 | cat *.bam_stat.txt >> $output_bam_stat_file 65 | wait 66 | echo "bedtools count done" -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # nf-core/meripseqpipe: Contributing Guidelines 2 | 3 | Hi there! Many thanks for taking an interest in improving nf-core/meripseqpipe. 4 | 5 | We try to manage the required tasks for nf-core/meripseqpipe using GitHub issues, you probably came to this page when creating one. Please use the pre-filled template to save time. 6 | 7 | However, don't be put off by this template - other more general issues and suggestions are welcome! Contributions to the code are even more welcome ;) 8 | 9 | > If you need help using or modifying nf-core/meripseqpipe then the best place to ask is on the pipeline channel on [Slack](https://nf-co.re/join/slack/). 10 | 11 | 12 | 13 | ## Contribution workflow 14 | If you'd like to write some code for nf-core/meripseqpipe, the standard workflow 15 | is as follows: 16 | 17 | 1. Check that there isn't already an issue about your idea in the 18 | [nf-core/meripseqpipe issues](https://github.com/nf-core/meripseqpipe/issues) to avoid 19 | duplicating work. 20 | * If there isn't one already, please create one so that others know you're working on this 21 | 2. Fork the [nf-core/meripseqpipe repository](https://github.com/nf-core/meripseqpipe) to your GitHub account 22 | 3. Make the necessary changes / additions within your forked repository 23 | 4. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged. 24 | 25 | If you're not used to this workflow with git, you can start with some [basic docs from GitHub](https://help.github.com/articles/fork-a-repo/) or even their [excellent interactive tutorial](https://try.github.io/). 26 | 27 | 28 | ## Tests 29 | When you create a pull request with changes, [Travis CI](https://travis-ci.org/) will run automatic tests. 30 | Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then. 31 | 32 | There are typically two types of tests that run: 33 | 34 | ### Lint Tests 35 | The nf-core has a [set of guidelines](https://nf-co.re/developers/guidelines) which all pipelines must adhere to. 36 | To enforce these and ensure that all pipelines stay in sync, we have developed a helper tool which runs checks on the pipeline code. This is in the [nf-core/tools repository](https://github.com/nf-core/tools) and once installed can be run locally with the `nf-core lint ` command. 37 | 38 | If any failures or warnings are encountered, please follow the listed URL for more documentation. 39 | 40 | ### Pipeline Tests 41 | Each nf-core pipeline should be set up with a minimal set of test-data. 
42 | Travis CI then runs the pipeline on this data to ensure that it exists successfully. 43 | If there are any failures then the automated tests fail. 44 | These tests are run both with the latest available version of Nextflow and also the minimum required version that is stated in the pipeline code. 45 | 46 | ## Getting help 47 | For further information/help, please consult the [nf-core/meripseqpipe documentation](https://github.com/nf-core/meripseqpipe#documentation) and don't hesitate to get in touch on the [nf-core/meripseqpipe pipeline channel](https://nfcore.slack.com/channels/nf-core/meripseqpipe) on [Slack](https://nf-co.re/join/slack/). 48 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team on [Slack](https://nf-co.re/join/slack/). 
The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /bin/DESeq2_quantification.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## generate DESeq2 logfc matrix 3 | anno.exp.m6a <- subset(m6a.anno.matrix,select= c(PeakRegion,ID)) 4 | final.count.table <- NULL 5 | htseq_list <- dir(pattern = "count",full.names = T) 6 | combined_htseq_count <- read.table(htseq_list[1], header = TRUE, row.names = 1, check.names = FALSE) 7 | for (file in htseq_list[-1]){ 8 | combined_htseq_count <- cbind(combined_htseq_count,read.table(file, header = TRUE, row.names = 1, check.names = FALSE)) 9 | } 10 | colnames(combined_htseq_count) <- unlist(lapply(strsplit(colnames(combined_htseq_count),split = "_"),FUN = function(x){x[1]})) 11 | combined_htseq_count$ID <- rownames(combined_htseq_count) 12 | final.count.table <- merge(anno.exp.m6a,combined_htseq_count,by= "ID") 13 | 14 | sample_id <- designtable$Sample_ID[1] 15 | ip_count_file <- grep(paste0("[.]",sample_id,"[.]ip"),count_filelist,value = TRUE) 16 | ip.matrix <- read.table(file = ip_count_file, sep = "\t", row.names = NULL, header = T, check.names=F) 17 | rownames.ip.matrix <- ip.matrix$PeakName 18 | ip.matrix <- subset(ip.matrix , select= 5) 19 | colnames(ip.matrix) <- paste0(sample_id,".ip") 20 | for(sample_id in designtable$Sample_ID[-1]){ 21 | ip_count_file <- grep(paste0("[.]",sample_id,"[.]ip"),count_filelist,value = TRUE) 22 | ip_count_table <- read.table(file = ip_count_file, sep = "\t", row.names = NULL, header = T, check.names=F) 23 | ip_count_table <- subset(ip_count_table , select= 5) 24 | colnames(ip_count_table) <- paste0(sample_id,".ip") 25 | ip.matrix <- cbind(ip.matrix,ip_count_table) 26 | } 27 | row.names(ip.matrix) <- rownames.ip.matrix 28 | ip.matrix$PeakRegion <- rownames(ip.matrix) 29 | final.count.table <- merge(final.count.table,ip.matrix,by= "PeakRegion") 30 | 31 | load("deseq2.Rdata") 32 | library("DESeq2") 33 | deseq2.count.table <- subset(final.count.table,select= colnames(final.count.table)[c(-1,-2)]) 34 | rownames(deseq2.count.table) <- final.count.table$PeakRegion 35 | coldata <- data.frame(row.names = colnames(deseq2.count.table),group = colnames(deseq2.count.table) ,sample = unlist(lapply(strsplit(colnames(deseq2.count.table),split = "[.]"),FUN = function(x){x[1]}))) 36 | final.deseq2.logfc.matrix <- subset(final.count.table, select = PeakRegion) 37 | for (sample_id in unique(coldata$sample)) { 38 | coldata.sample <- subset(coldata,sample == sample_id,group) 39 | coldata.sample$group <- unlist(lapply(strsplit(as.character(coldata.sample$group),split = "[.]"),function(x){x[2]})) 40 | coldata.sample$group <- factor(coldata.sample$group) 
41 | inf.dds <- DESeq2::DESeqDataSetFromMatrix(countData = deseq2.count.table[,rownames(coldata.sample)],colData = coldata.sample,design = ~group) 42 | inf.dds.LRT <- DESeq2::DESeq(inf.dds) 43 | head(deseq2.count.table[,rownames(coldata.sample)]) 44 | results <- DESeq2::results(inf.dds.LRT,constract=c("group","input","ip")) 45 | results <- data.frame(PeakRegion = rownames(results),sample_id = results$log2FoldChange) 46 | colnames(results)[2] <- sample_id 47 | final.deseq2.logfc.matrix <- merge(final.deseq2.logfc.matrix,results,by= "PeakRegion") 48 | } 49 | 2^head(final.deseq2.logfc.matrix) 50 | exp()final.deseq2.logfc.matrix 51 | rownames(final.deseq2.logfc.matrix) <- final.deseq2.logfc.matrix$PeakRegion 52 | final.deseq2.logfc.matrix <- final.deseq2.logfc.matrix[,-1] 53 | final.deseq2.logfc.matrix <- 2^(final.deseq2.logfc.matrix) 54 | save(final.deseq2.logfc.matrix,file = "deseq.quantification.matrix.RData") -------------------------------------------------------------------------------- /bin/markdown_to_html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import argparse 4 | import markdown 5 | import os 6 | import sys 7 | 8 | def convert_markdown(in_fn): 9 | input_md = open(in_fn, mode="r", encoding="utf-8").read() 10 | html = markdown.markdown( 11 | "[TOC]\n" + input_md, 12 | extensions = [ 13 | 'pymdownx.extra', 14 | 'pymdownx.b64', 15 | 'pymdownx.highlight', 16 | 'pymdownx.emoji', 17 | 'pymdownx.tilde', 18 | 'toc' 19 | ], 20 | extension_configs = { 21 | 'pymdownx.b64': { 22 | 'base_path': os.path.dirname(in_fn) 23 | }, 24 | 'pymdownx.highlight': { 25 | 'noclasses': True 26 | }, 27 | 'toc': { 28 | 'title': 'Table of Contents' 29 | } 30 | } 31 | ) 32 | return html 33 | 34 | def wrap_html(contents): 35 | header = """ 36 | 37 | 38 | 72 | 73 | 74 |
75 | """ 76 | footer = """ 77 |
78 | 79 | 80 | """ 81 | return header + contents + footer 82 | 83 | 84 | def parse_args(args=None): 85 | parser = argparse.ArgumentParser() 86 | parser.add_argument('mdfile', type=argparse.FileType('r'), nargs='?', 87 | help='File to convert. Defaults to stdin.') 88 | parser.add_argument('-o', '--out', type=argparse.FileType('w'), 89 | default=sys.stdout, 90 | help='Output file name. Defaults to stdout.') 91 | return parser.parse_args(args) 92 | 93 | def main(args=None): 94 | args = parse_args(args) 95 | converted_md = convert_markdown(args.mdfile.name) 96 | html = wrap_html(converted_md) 97 | args.out.write(html) 98 | 99 | if __name__ == '__main__': 100 | sys.exit(main()) 101 | -------------------------------------------------------------------------------- /bin/meyer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 23 18:53:30 2019 4 | 5 | @author: zky 6 | """ 7 | from sys import argv 8 | from math import log 9 | from scipy import stats 10 | input_bin25_file = argv[1] 11 | ip_bin25_file = argv[2] 12 | input_total_reads_count = int(argv[3]) 13 | ip_total_reads_count = int(argv[4]) 14 | peak_windows_number = int(argv[5]) 15 | output_ip_file = argv[6] 16 | def windows_fisher_test(input_count, ip_count, input_total_reads_count, ip_total_reads_count): 17 | """fisher test for the PeakCalling of meyer""" 18 | site_input_rest_reads_count = input_total_reads_count - int(input_count) 19 | site_ip_rest_reads_count = ip_total_reads_count - int(ip_count) 20 | ip_oddsratio, ip_pvalue = stats.fisher_exact([[input_count, ip_count], [input_total_reads_count, ip_total_reads_count]], 'less') 21 | input_oddsratio, input_pvalue = stats.fisher_exact([[input_count, ip_count], [site_input_rest_reads_count, site_ip_rest_reads_count]], 'greater') 22 | return input_pvalue,ip_pvalue 23 | 24 | def cluster_bin( bonferroni_filter_list ): 25 | bonferroni_peak = [] 26 | peak_line = [] 27 | idx = 0 28 | pre_end_position = 0 29 | for data in bonferroni_filter_list: 30 | distance = data[1] - pre_end_position 31 | if pre_end_position == 0 or distance > 0 : 32 | if peak_line : 33 | peak_region = peak_line[2] - peak_line[1] 34 | if peak_region >= 100 : 35 | bonferroni_peak.append([]) 36 | bonferroni_peak[idx] = peak_line 37 | idx += 1 38 | peak_line = [] 39 | peak_line = data[:] 40 | pre_end_position = data[2] 41 | else: 42 | peak_line[2] = data[2] 43 | pre_end_position = data[2] 44 | peak_line.append(data[3]) 45 | for data in bonferroni_peak: 46 | statistic, pval = stats.combine_pvalues(data[3:len(data)], method='fisher', weights=None) 47 | data[3] = pval 48 | del data[4:len(data)] 49 | return bonferroni_peak 50 | 51 | with open (input_bin25_file) as input_bin25,open (ip_bin25_file) as ip_bin25: 52 | """Generate the list of bonferroni_filter_windows""" 53 | ip_bonferroni_filter_list = [] 54 | ip_index = 0 55 | print ("Generate the list of bonferroni_filter_windows") 56 | while True: 57 | input_line = input_bin25.readline().rstrip("\n") 58 | ip_line = ip_bin25.readline().rstrip("\n") 59 | if input_line == '': 60 | break 61 | input_line_list = input_line.split("\t") 62 | ip_line_list = ip_line.split("\t") 63 | input_pvalue,ip_pvalue = windows_fisher_test(input_line_list[-1],ip_line_list[-1],input_total_reads_count,ip_total_reads_count) 64 | if (ip_pvalue < 0.05/peak_windows_number ): 65 | del ip_line_list[-1] 66 | ip_line_list.append(ip_pvalue) 67 | ip_line_list[1] = int(ip_line_list[1]) 68 | ip_line_list[2] = int(ip_line_list[2]) 69 
| ip_bonferroni_filter_list.append([]) 70 | ip_bonferroni_filter_list[ip_index] = ip_line_list 71 | ip_index += 1 72 | """Generate the list of bonferroni_filter_peaks""" 73 | print ("Generate the list of bonferroni_filter_peaks") 74 | ip_bonferroni_peak = cluster_bin(ip_bonferroni_filter_list[:]) 75 | """Write the list of bonferroni_filter_peaks""" 76 | print ("Write the list of bonferroni_filter_peaks") 77 | with open(output_ip_file,'w') as output_file: 78 | for data in ip_bonferroni_peak: 79 | output_file.write('\t'.join(str(i) for i in data)) 80 | output_file.write('\n') -------------------------------------------------------------------------------- /bin/merge_peaks_by_bedtools.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$1 argv 1 : designfile 3 | #$2 argv 2 : THREAD_NUM 4 | #$3 argv 3 : flag_peakCallingbygroup 5 | #$4 argv 4 : peakCalling_tools_count 6 | designfile=$1 7 | THREAD_NUM=$2 8 | flag_peakCallingbygroup=$3 9 | peakCalling_tools_count=$4 10 | peakCalling_tools_main=$5 11 | 12 | # Define a multi-threaded run channel 13 | mkfifo tmp 14 | exec 9<>tmp 15 | for ((i=1;i<=${THREAD_NUM:=1};i++)) 16 | do 17 | echo >&9 18 | done 19 | 20 | function mergebedByBedtools() 21 | { 22 | prefix_id=$1 23 | out_prefix=$2 24 | peakCalling_tools_main=$3 25 | cat ${peakCalling_tools_main}*${prefix_id}*normalized.bed | sortBed -i - | mergeBed -i - -c 4,5 -o count,mean > tmp.${prefix_id}_allPeaks.bed 26 | ls *${prefix_id}*normalized.bed | grep -v ${peakCalling_tools_main} | xargs -i cat {} | sortBed -i - | mergeBed -i - -c 4,5 -o count,mean > tmp.${prefix_id}_others_allPeaks.bed 27 | intersectBed -a tmp.${prefix_id}_allPeaks.bed -b tmp.${prefix_id}_others_allPeaks.bed -u | awk 'BEGIN{FS="\t";OFS="\t"}{print $1,$2,$3,$1":"$2"-"$3,$5}' > ${out_prefix}.bed 28 | } 29 | 30 | if [ $flag_peakCallingbygroup -gt 0 ]; then 31 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 32 | for group_id in $group_list 33 | do 34 | read -u 9 35 | { 36 | if [ $peakCalling_tools_count -gt 1 ]; then 37 | mergebedByBedtools ${group_id} bedtools_merged_group_${group_id} ${peakCalling_tools_main} 38 | else 39 | awk '{OFS="\t";$5=10^-$5;print }' *${group_id}*.bed | sortBed -i - > bedtools_merged_group_${group_id}.bed 40 | fi 41 | echo >&9 42 | }& 43 | done 44 | wait 45 | if [ $peakCalling_tools_count -gt 1 ]; then 46 | mergebedByBedtools "" bedtools_merged_allpeaks ${peakCalling_tools_main} 47 | else 48 | cat ${peakCalling_tools_main}_*_normalized.bed | sortBed -i - | mergeBed -i - -c 4,5 -o count,mean | awk 'BEGIN{FS="\t";OFS="\t"}{print $1,$2,$3,$1":"$2"-"$3,$5}' > bedtools_merged_allpeaks.bed 49 | fi 50 | else 51 | sampleinfo_list=$(awk 'BEGIN{FS=","}NR>1{print $1","$4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 52 | for sample_group_id in ${sampleinfo_list} 53 | do 54 | read -u 9 55 | { 56 | sample_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $1}') 57 | group_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $2}') 58 | ## Adding the information of group 59 | for samplefile in *_${sample_id}_normalized.bed 60 | do 61 | mv $samplefile ${samplefile/_normalized.bed/_${group_id}_normalized.bed} 62 | done 63 | echo >&9 64 | }& 65 | done 66 | wait 67 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 68 | for group_id in $group_list 69 | do 70 | read -u 9 71 | { 72 | if [ $peakCalling_tools_count -gt 1 ]; then 73 | mergebedByBedtools 
${group_id} bedtools_merged_group_${group_id} ${peakCalling_tools_main} 74 | else 75 | awk '{OFS="\t";$5=10^-$5;print }' *${group_id}*.bed | sortBed -i - > bedtools_merged_group_${group_id}.bed 76 | fi 77 | echo >&9 78 | }& 79 | done 80 | wait 81 | if [ $peakCalling_tools_count -gt 1 ]; then 82 | mergebedByBedtools "" bedtools_merged_allpeaks ${peakCalling_tools_main} 83 | else 84 | cat ${peakCalling_tools_main}_*_normalized.bed | sortBed -i - | mergeBed -i - -c 4,5 -o count,mean | awk 'BEGIN{FS="\t";OFS="\t"}{print $1,$2,$3,$1":"$2"-"$3,$5}' > bedtools_merged_allpeaks.bed 85 | fi 86 | fi 87 | echo "${peakCalling_tools_main} merged peaks done" -------------------------------------------------------------------------------- /bin/arranged_results.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript arranged_result.R 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | ### compare_str: Compairision design (eg: A_vs_B) 5 | args <- commandArgs(T) 6 | #args <- c("formatted_designfile.txt", "compare_info", "Wilcox-test", "edgeR", "rank") 7 | designfile <- args[1]#"formatted_designfile.txt" 8 | comparefile <- args[2]#"compare_info" 9 | diffm6A_mode <- args[3]#"QNB" 10 | rnaseq_mode <- args[4]#"DESeq2" 11 | peakMerged_mode <- args[5] 12 | options(stringsAsFactors = F) 13 | 14 | ## generate design matrix 15 | compare.list <- read.csv(comparefile,header = F, check.names=F) 16 | designtable <- read.csv(designfile, head = TRUE, colClasses = c("character"), check.names=F) 17 | design.matrix <- as.matrix(designtable$Group) 18 | rownames(design.matrix) <- designtable$Sample_ID 19 | colnames(design.matrix) <- "Type" 20 | 21 | ## generate peak Visualization 22 | annotation.file <- list.files(pattern = "merged_allpeaks.anno.txt") 23 | annotation.info <- read.table(annotation.file, header = F, sep = "\t", quote = "", check.names=F)[,c(4,15,11)] 24 | colnames(annotation.info) <- c("PeakRegion","ID","Gene_symbol") 25 | m6a.peaks.file <- list.files(pattern = "merged_allpeaks.bed$") 26 | m6a.peaks.table <- read.table(m6a.peaks.file, header = F, sep = "\t", quote = "", check.names=F) 27 | colnames(m6a.peaks.table) <- c("Chr","ChrStart","ChrEnd","PeakRegion","pvalue") 28 | m6a.peaks.table = merge(x = m6a.peaks.table,y = annotation.info,by = "PeakRegion",all.x = TRUE) 29 | m6a.sites.file <- list.files(pattern = "m6A_sites_merged.bed") 30 | m6a.sites.table <- read.table(m6a.sites.file, header = F, sep = "\t", quote = "", check.names=F) 31 | colnames(m6a.sites.table) <- c("Chr","ChrStart","ChrEnd","Gene_symbol&ID","Strand","Score","Group","Sequence") 32 | 33 | expression.matrix <- NULL 34 | diffexpression.list <- NULL 35 | if (rnaseq_mode != "none"){ 36 | ## generate expression matrix 37 | htseq.filelist = grep("htseq",list.files(path = "./",pattern = "input.count"), value = T) 38 | for( file in htseq.filelist ){ 39 | tmp.expression.table <- as.matrix(read.table(file, sep = "\t", header = TRUE, row.names = 1, check.names=F)) 40 | expression.matrix <- cbind(expression.matrix, tmp.expression.table) 41 | } 42 | colnames(expression.matrix) <- as.matrix(lapply(strsplit(colnames(expression.matrix),".input"), function(x){ x[1]})) 43 | 44 | ## generate diff_expression list 45 | diffexpression.filelist <- grep(rnaseq_mode,list.files(pattern = ".csv"), value = T) 46 | for( compare_str in compare.list ){ 47 | diffexpression.list[[compare_str]] <- read.csv(grep(sub("_vs_","_",compare_str), diffexpression.filelist, value = T), header = T, 
check.names=F) 48 | colnames(diffexpression.list[[compare_str]])[1] <- "ID" 49 | } 50 | } 51 | 52 | ## generate m6A matrix 53 | m6a.anno.matrix <- read.delim(file = grep("quantification.matrix",x = list.files(),value = T), header = T, sep = "\t", row.names = 1, check.names=F) 54 | m6a.anno.matrix <- cbind(PeakRegion = row.names(m6a.anno.matrix), m6a.anno.matrix) 55 | 56 | ## generate diffm6A list 57 | diffm6A.filelist <- grep("_diffm6A_",list.files(pattern = ".txt"), value = T) 58 | diffm6A.list <- NULL 59 | for( compare_str in compare.list ){ 60 | diffm6A.list[[compare_str]] <- read.table(grep(sub("_vs_","_",compare_str), diffm6A.filelist, value = T),header = T,row.names = 1, check.names=F) 61 | if( diffm6A_mode == "MATK" ){ 62 | diffm6A.list[[compare_str]]$padj = p.adjust(diffm6A.list[[compare_str]]$pvalue, method = "BH") 63 | diffm6A.list[[compare_str]] <- diffm6A.list[[compare_str]][,-seq(1,2)] 64 | } 65 | diffm6A.list[[compare_str]]$PeakRegion <- rownames(diffm6A.list[[compare_str]]) 66 | diffm6A.list[[compare_str]] <- merge(x = annotation.info,y = diffm6A.list[[compare_str]],by = "PeakRegion", all.y = TRUE) 67 | } 68 | 69 | ## save variable for m6Aviewer 70 | write.table(expression.matrix,file = "expression.matrix",quote=F) 71 | write.table(m6a.anno.matrix,file= "m6a.anno.matrix",quote=F) 72 | #write.table(diffm6A.list, file= "diffm6A.list") 73 | #write.table(diffm6A.anno.list, file = "diffm6A.anno.list") 74 | 75 | save(design.matrix, compare.list, 76 | m6a.peaks.table, m6a.sites.table, 77 | expression.matrix, m6a.anno.matrix, 78 | diffexpression.list, diffm6A.list, 79 | file = paste0(peakMerged_mode,"_",diffm6A_mode,"_",rnaseq_mode,"_arranged_results_",Sys.Date(),".m6APipe")) 80 | 81 | -------------------------------------------------------------------------------- /bin/merge_peaks_by_mspc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$1 argv 1 : designfile 3 | #$2 argv 2 : THREAD_NUM 4 | #$3 argv 3 : flag_peakCallingbygroup 5 | #$4 argv 4 : peakCalling_tools_count 6 | designfile=$1 7 | THREAD_NUM=$2 8 | flag_peakCallingbygroup=$3 9 | peakCalling_tools_count=$4 10 | out_dir=$5 11 | # Define a multi-threaded run channel 12 | mkfifo tmp 13 | exec 9<>tmp 14 | for ((i=1;i<=${THREAD_NUM:=1};i++)) 15 | do 16 | echo >&9 17 | done 18 | # Generate the diroectory of results: 19 | mkdir ${out_dir} 20 | 21 | # Define the function of MSPC runing for different situations 22 | function mergebedForBio() 23 | { 24 | prefix_id=$1 25 | out_prefix=$2 26 | bedfile_array=$(ls *_${prefix_id}_*.bed | awk '{ORS=" "}{print "-i",$0}') 27 | mspc -i $bedfile_array -r bio -s 1E-4 -w 1E-2 -o Bio_$prefix_id 28 | ln Bio_$prefix_id/ConsensusPeaks.bed ${out_prefix}.bed 29 | awk 'NR>1{OFS="\t";$5=10^-$5;print $1,$2,$3,$1":"$2"-"$3,$5}' Bio_$prefix_id/ConsensusPeaks.bed |sortBed -i - > ${out_dir}/${out_prefix}.bed 30 | } 31 | function mergebedForTec() 32 | { 33 | prefix_id=$1 34 | out_prefix=$2 35 | peakCalling_tools_count=$3 36 | bedfile_array=$(ls *_${prefix_id}_*.bed | awk '{ORS=" "}{print "-i",$0}') 37 | mspc -i $bedfile_array -r tec -s 1E-2 -w 1E-1 -o Tec_$prefix_id 38 | ln Tec_$prefix_id/ConsensusPeaks.bed ${out_prefix}.bed 39 | awk 'NR>1{OFS="\t";$5=10^-$5;print $1,$2,$3,$1":"$2"-"$3,$5}' Tec_$prefix_id/ConsensusPeaks.bed |sortBed -i - > ${out_dir}/${out_prefix}.bed 40 | } 41 | 42 | # Before merging peaks, normalize all peaks of different tools 43 | for bedfile in *.bed 44 | do 45 | read -u 9 46 | { 47 | mv $bedfile tmp.$bedfile 48 | python 
normalize_peaks.py tmp.$bedfile $bedfile 49 | rm tmp.$bedfile 50 | echo >&9 51 | }& 52 | done 53 | wait 54 | 55 | # if the number of peakcalling tools > 2 56 | if [ $flag_peakCallingbygroup -gt 0 ]; then 57 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 58 | for group_id in $group_list 59 | do 60 | read -u 9 61 | { 62 | if [ $peakCalling_tools_count -gt 1 ]; then 63 | mergebedForTec ${group_id} mspc_merged_group_${group_id} 64 | else 65 | ln *${group_id}*.bed mspc_merged_group_${group_id}.bed 66 | awk '{OFS="\t";$5=10^-$5;print }' *${group_id}*.bed |sortBed -i - > ${out_dir}/mspc_merged_group_${group_id}.bed 67 | fi 68 | echo >&9 69 | }& 70 | done 71 | wait 72 | mergebedForBio merged_group mspc_merged_allpeaks 73 | else 74 | sampleinfo_list=$(awk 'BEGIN{FS=","}NR>1{print $1","$4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 75 | for sample_group_id in ${sampleinfo_list} 76 | do 77 | read -u 9 78 | { 79 | sample_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $1}') 80 | group_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $2}') 81 | ## Adding the information of group 82 | for samplefile in *_${sample_id}_normalized.bed 83 | do 84 | mv $samplefile ${samplefile/_normalized.bed/}_${group_id}_normalized.bed 85 | done 86 | if [ $peakCalling_tools_count -gt 1 ]; then 87 | mergebedForTec ${sample_id} mspc_merged_sample_${group_id}_${sample_id} $peakCalling_tools_count 88 | else 89 | ln *${sample_id}*.bed mspc_merged_sample_${group_id}_${sample_id}.bed 90 | awk '{OFS="\t";$5=10^-$5;print }' *_${sample_id}_*normalized.bed |sortBed -i - > ${out_dir}/mspc_merged_sample_${group_id}_${sample_id}.bed 91 | fi 92 | echo >&9 93 | }& 94 | done 95 | wait 96 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 97 | for group_id in $group_list 98 | do 99 | read -u 9 100 | { 101 | mergebedForBio merged_sample_${group_id} mspc_merged_group_${group_id} 102 | echo >&9 103 | }& 104 | done 105 | wait 106 | #mergebedForBio merged_sample mspc_merged_allpeaks 107 | cat ${out_dir}/*_merged_sample_*.bed | sortBed -i - |mergeBed -i - -c 4,5 -o count,mean | awk '$4>1{print $1"\t"$2"\t"$3"\t"$1":"$2"-"$3"\t"$5}' > ${out_dir}/mspc_merged_allpeaks.bed 108 | fi 109 | judge_chr=$(cat *.bed |cut -f 1 |sort |uniq| awk '$0~"chr"{print "includeChr"}' |uniq) 110 | if [ "$judge_chr" != "includeChr" ]; then sed -i 's/chr//g' ${out_dir}/*.bed ;fi 111 | mv ${out_dir}/*.bed ./ 112 | exec 9<>- 113 | echo "MSPC merged peaks done" 114 | -------------------------------------------------------------------------------- /docs/configuration/adding_your_own.md: -------------------------------------------------------------------------------- 1 | # nf-core/m6APipe: Configuration for other clusters 2 | 3 | It is entirely possible to run this pipeline on other clusters, though you will need to set up your own config file so that the pipeline knows how to work with your cluster. 4 | 5 | > If you think that there are other people using the pipeline who would benefit from your configuration (eg. other common cluster setups), please let us know. We can add a new configuration and profile which can used by specifying `-profile ` when running the pipeline. 6 | 7 | If you are the only person to be running this pipeline, you can create your config file as `~/.nextflow/config` and it will be applied every time you run Nextflow. 
Alternatively, save the file anywhere and reference it when running the pipeline with `-c path/to/config` (see the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for more). 8 | 9 | A basic configuration comes with the pipeline, which runs by default (the `standard` config profile - see [`conf/base.config`](../conf/base.config)). This means that you only need to configure the specifics for your system and overwrite any defaults that you want to change. 10 | 11 | ## Cluster Environment 12 | 13 | By default, the pipeline uses the `local` Nextflow executor - in other words, all jobs are run in the login session. If you're using a simple server, this may be fine. If you're using a compute cluster, this is bad as all jobs will run on the head node. 14 | 15 | To specify your cluster environment, add the following line to your config file: 16 | 17 | ```nextflow 18 | process.executor = 'YOUR_SYSTEM_TYPE' 19 | ``` 20 | 21 | Many different cluster types are supported by Nextflow. For more information, please see the [Nextflow documentation](https://www.nextflow.io/docs/latest/executor.html). 22 | 23 | Note that you may need to specify cluster options, such as a project or queue. To do so, use the `clusterOptions` config option: 24 | 25 | ```nextflow 26 | process { 27 | executor = 'slurm' 28 | clusterOptions = '-A myproject' 29 | } 30 | ``` 31 | 32 | ## Software Requirements 33 | 34 | To run the pipeline, several software packages are required. How you satisfy these requirements is essentially up to you and depends on your system. If possible, we _highly_ recommend using either Docker or Singularity. 35 | 36 | Please see the [`installation documentation`](../installation.md) for how to run with the below as a one-off. These instructions are about configuring a config file for repeated use. 37 | 38 | ### Docker 39 | 40 | Docker is a great way to run nf-core/m6APipe, as it manages all software installations and allows the pipeline to be run in an identical software environment across a range of systems. 41 | 42 | Nextflow has [excellent integration](https://www.nextflow.io/docs/latest/docker.html) with Docker, and beyond installing the two tools, not much else is required - at run time, Nextflow will automatically fetch the [nfcore/m6APipe](https://hub.docker.com/r/nfcore/m6APipe/) image that we have created and hosted on Docker Hub. 43 | 44 | To add Docker support to your own config file, add the following: 45 | 46 | ```nextflow 47 | docker.enabled = true 48 | process.container = "nfcore/m6APipe" 49 | ``` 50 | 51 | Note that the Docker Hub organisation name annoyingly can't have a hyphen, so it is `nfcore` and not `nf-core`. 52 | 53 | ### Singularity image 54 | 55 | Many HPC environments are not able to run Docker due to security issues. 56 | [Singularity](http://singularity.lbl.gov/) is a tool designed to run on such HPC systems which is very similar to Docker. 57 | 58 | To specify singularity usage in your pipeline config file, add the following: 59 | 60 | ```nextflow 61 | singularity.enabled = true 62 | process.container = "shub://nf-core/m6APipe" 63 | ``` 64 | 65 | If you intend to run the pipeline offline, Nextflow will not be able to automatically download the Singularity image for you. 66 | Instead, you'll have to do this manually first: transfer the image file and then point the config to it. 
67 | 68 | First, pull the image file where you have an internet connection: 69 | 70 | ```bash 71 | singularity pull --name nf-core-m6APipe.simg shub://nf-core/m6APipe 72 | ``` 73 | 74 | Then transfer this file and point the config file to the image: 75 | 76 | ```nextflow 77 | singularity.enabled = true 78 | process.container = "/path/to/nf-core-m6APipe.simg" 79 | ``` 80 | 81 | ### Conda 82 | 83 | If you're not able to use Docker or Singularity, you can instead use conda to manage the software requirements. 84 | To use conda in your own config file, add the following: 85 | 86 | ```nextflow 87 | process.conda = "$baseDir/environment.yml" 88 | ``` 89 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MeRIPseqPipe 2 | 3 | **A MeRIP-seq analysis pipeline that integrates multiple alignment tools, peak calling tools, peak merging methods and methylation analysis methods.** 4 | 5 | [![Build Status](https://travis-ci.com/nf-core/meripseqpipe.svg?branch=master)](https://travis-ci.com/nf-core/meripseqpipe) 6 | [![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.04.0-brightgreen.svg)](https://www.nextflow.io/) 7 | 8 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/) 9 | [![Docker](https://img.shields.io/docker/automated/kingzhuky/meripseqpipe.svg)](https://hub.docker.com/r/kingzhuky/meripseqpipe) 10 | 11 | ## Introduction 12 | 13 | N6-methyladenosine (m6A) is the most prevalent modification in the mRNA of many eukaryotic species, including yeast, plants, flies, and mammals. In order to analyze m6A-seq data, we developed a user-friendly, integrated analysis pipeline called MeRIPseqPipe. It integrates ten main functional modules: data preprocessing, quality control, read mapping, peak calling, peak merging, motif searching, peak annotation, differential methylation analysis, differential expression analysis, and data visualization. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner, and it comes with Docker containers, making installation trivial and results highly reproducible. 14 | 15 | ## Quick Start 16 | 17 | i. Install [`nextflow`](https://nf-co.re/usage/installation) 18 | 19 | ii. Install one of [`docker`](https://docs.docker.com/engine/installation/) or [`conda`](https://conda.io/miniconda.html) 20 | 21 | iii. Download the pipeline and test it on a minimal dataset with a single command 22 | 23 | ```bash 24 | nextflow run path/to/meripseqpipe -profile test,<docker/conda> 25 | ``` 26 | 27 | > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile <institute>` in your command. This will enable either `docker` or `conda` and set the appropriate execution settings for your local compute environment. 28 | 29 | iv. Start running your own analysis! 30 | 31 | 32 | ```bash 33 | nextflow run path/to/meripseqpipe -profile <docker/conda> --designfile 'designfile.tsv' --comparefile 'comparefile.txt' --fasta path/to/genome_fasta.fa --gtf path/to/genome_annotation.gtf 34 | ``` 35 | 36 | See [usage docs](docs/usage.md) for all of the available options when running the pipeline. 
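The design file is a tab-separated table with one row per sample, and the compare file lists one comparison per line. As a rough sketch (the exact column layout is parsed in `lib/LikeletUtils.groovy`; all file names below are made up, and single-end libraries put `false` in the second FASTQ column of a pair):

```tsv
Sample_ID	input_fastq1	input_fastq2	ip_fastq1	ip_fastq2	Group
sampleA	A_input_R1.fq.gz	A_input_R2.fq.gz	A_ip_R1.fq.gz	A_ip_R2.fq.gz	control
sampleB	B_input_R1.fq.gz	false	B_ip_R1.fq.gz	false	treatment
```

The compare file then names each contrast in the `group1_vs_group2` form expected by the downstream R scripts, e.g. a single line reading `control_vs_treatment`.
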
37 | 38 | ## Documentation 39 | 40 | The nf-core/meripseqpipe pipeline comes with documentation about the pipeline, found in the `docs/` directory: 41 | 42 | 1. [Installation](https://nf-co.re/usage/installation) 43 | 2. Pipeline configuration 44 | * [Local installation](https://nf-co.re/usage/local_installation) 45 | * [Adding your own system config](https://nf-co.re/usage/adding_own_config) 46 | * [Reference genomes](https://nf-co.re/usage/reference_genomes) 47 | 3. [Running the pipeline](docs/usage.md) 48 | 4. [Output and how to interpret the results](docs/output.md) 49 | 5. [Troubleshooting](https://nf-co.re/usage/troubleshooting) 50 | 51 | 52 | 53 | ## Credits 54 | 55 | MeRIPseqPipe was originally written by Kaiyu Zhu, Yu Sun, Xiaoqiong Bao. 56 | 57 | ## Contributions and Support 58 | 59 | If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). 60 | 61 | For further information or help, don't hesitate to get in touch on [Slack](https://nfcore.slack.com/channels/meripseqpipe) (you can join with [this invite](https://nf-co.re/join/slack)). 62 | 63 | ## Citation 64 | 65 | 66 | 67 | 68 | You can cite the `nf-core` publication as follows: 69 | 70 | > **The nf-core framework for community-curated bioinformatics pipelines.** 71 | > 72 | > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. 73 | > 74 | > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). 75 | > ReadCube: [Full Access Link](https://rdcu.be/b1GjZ) 76 | -------------------------------------------------------------------------------- /bin/merge_peaks_by_rank.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$1 argv 1 : designfile 3 | #$2 argv 2 : THREAD_NUM 4 | #$3 argv 3 : flag_peakCallingbygroup 5 | #$4 argv 4 : peakCalling_tools_count 6 | designfile=$1 7 | THREAD_NUM=$2 8 | flag_peakCallingbygroup=$3 9 | peakCalling_tools_count=$4 10 | 11 | # Define a multi-threaded run channel 12 | mkfifo tmp 13 | exec 9<>tmp 14 | for ((i=1;i<=${THREAD_NUM:=1};i++)) 15 | do 16 | echo >&9 17 | done 18 | 19 | function SortTransferBed() 20 | { 21 | bed_file=$1 22 | bed_anno_file=$2 23 | outdir=$3 24 | ## sort bed by pvalue for rank merge && transfer the origin region of peaks into the bedtools merged region of peaks 25 | awk '{ print $1":"$2"-"$3,$5}' ${bed_file} | sort -k1,1 |join -a1 - ${bed_anno_file} | sort -k2,2 -n -r | awk '{print $3}' > ${outdir}/tmp.${bed_file}.location 26 | } 27 | function mergebedByRank() 28 | { 29 | prefix_id=$1 30 | out_prefix=$2 31 | mkdir tmp.${out_prefix} 32 | cat *_${prefix_id}_*.bed | awk '{print $1"\t"$2*1"\t"$3*1"\t"$1":"$2"-"$3}' > tmp.${out_prefix}/bedtools_${prefix_id}_all_peaks 33 | sortBed -i tmp.${out_prefix}/bedtools_${prefix_id}_all_peaks |mergeBed -i - -c 4,4 -o collapse,count | awk '{print $1"\t"$2"\t"$3"\t"$1":"$2"-"$3"\t"$4}' > tmp.${out_prefix}/bedtools_${prefix_id} 34 | awk -F "\t" '{print $4,$5}' tmp.${out_prefix}/bedtools_${prefix_id} | awk -F '[," "]+' '{for (i=2 ;i<=NF;i++) printf $i" "$1"\n" }' | sort -k1 | uniq > tmp.${out_prefix}/bed_anno_file 35 | for bedfile in *_${prefix_id}_*.bed 36 | do 37 | SortTransferBed $bedfile tmp.${out_prefix}/bed_anno_file tmp.${out_prefix} 38 | done 39 | paste -d "\t" tmp.${out_prefix}/tmp*location > ${out_prefix}.bedlist 40 | peak_number=$(wc -l tmp.${out_prefix}/bed_anno_file | cut -d 
" " -f 1) 41 | Rscript merge_peaks_by_rank.R ${out_prefix}.bedlist ${peak_number} ${out_prefix}.bed 42 | rm -rf tmp.${out_prefix} ${out_prefix}.bedlist 43 | } 44 | 45 | # Before merging peaks, normalize all peaks of different tools 46 | for bedfile in *.bed 47 | do 48 | read -u 9 49 | { 50 | mv $bedfile tmp.$bedfile 51 | python normalize_peaks.py tmp.$bedfile $bedfile 52 | rm tmp.$bedfile 53 | echo >&9 54 | }& 55 | done 56 | wait 57 | # if the number of peakcalling tools > 2 58 | if [ $flag_peakCallingbygroup -gt 0 ]; then 59 | echo "Start to merge different tools' result of every group" 60 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 61 | for group_id in $group_list 62 | do 63 | read -u 9 64 | { 65 | if [ $peakCalling_tools_count -gt 1 ]; then 66 | mergebedByRank ${group_id} rank_merged_group_${group_id} 67 | else 68 | awk '{OFS="\t";$5=10^-$5;print }' *${group_id}*.bed |sortBed -i - > rank_merged_group_${group_id}.bed 69 | fi 70 | echo >&9 71 | }& 72 | done 73 | wait 74 | mergebedByRank merged_group rank_merged_allpeaks 75 | else 76 | echo "Start to merge different tools' result of every sample" 77 | sampleinfo_list=$(awk 'BEGIN{FS=","}NR>1{print $1","$4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 78 | for sample_group_id in ${sampleinfo_list} 79 | do 80 | read -u 9 81 | { 82 | sample_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $1}') 83 | group_id=$(echo ${sample_group_id} | awk 'BEGIN{FS=","}{print $2}') 84 | ## Adding the information of group 85 | for samplefile in *_${sample_id}_normalized.bed 86 | do 87 | mv $samplefile ${samplefile/_normalized.bed/}_${group_id}_normalized.bed 88 | done 89 | if [ $peakCalling_tools_count -gt 1 ]; then 90 | mergebedByRank ${sample_id} rank_merged_sample_${group_id}_${sample_id} 91 | else 92 | awk '{OFS="\t";$5=10^-$5;print }' *${sample_id}*.bed |sortBed -i - > rank_merged_sample_${group_id}_${sample_id}.bed 93 | fi 94 | echo >&9 95 | }& 96 | done 97 | wait 98 | echo "Start to merge different samples' result of every group" 99 | group_list=$(awk 'BEGIN{FS=","}NR>1{print $4}' $designfile |sort|uniq|awk 'BEGIN{ORS=" "}{print $0}') 100 | for group_id in $group_list 101 | do 102 | read -u 9 103 | { 104 | mergebedByRank merged_sample_${group_id} rank_merged_group_${group_id} 105 | echo >&9 106 | }& 107 | done 108 | wait 109 | #mergebedByRank merged_sample rank_merged_allpeaks 110 | cat *_merged_sample_*.bed | sortBed -i - |mergeBed -i - -c 4,5 -o count,mean | awk '$4>1{print $1"\t"$2"\t"$3"\t"$1":"$2"-"$3"\t"$5}' > rank_merged_allpeaks.bed 111 | fi 112 | echo "Rank merged peaks done" 113 | 114 | 115 | -------------------------------------------------------------------------------- /bin/GLM_DESeq2_DM.R: -------------------------------------------------------------------------------- 1 | library(DESeq2) 2 | library(BiocParallel) 3 | 4 | #load data 5 | args <- commandArgs(T) 6 | #args <- c("formatted_designfile.txt","shGFPa549_vs_shMettl3a549", "10","bedtools_quantification.matrix","expression.matrix") 7 | designfile <- args[1] 8 | compare_str <- args[2] 9 | THREAD_NUM <- as.numeric(args[3]) 10 | annotation.file <- args[4] 11 | input.count.matrix.file <- args[5] 12 | register(MulticoreParam(THREAD_NUM)) 13 | 14 | designtable <- read.csv(designfile, head = TRUE, stringsAsFactors=FALSE, colClasses = c("character"), check.names=F) 15 | design.matrix <- as.data.frame(designtable$Group) 16 | rownames(design.matrix) <- designtable$Sample_ID 17 | colnames(design.matrix) <- "Condition" 
18 | 19 | # Get the information of groups from compare_str 20 | if(length(unique(design.matrix$Condition)) < 2){ 21 | stop( "The number of groups is less than two, please check your designfile.") 22 | }else if( compare_str == "two_group" ){ 23 | # Only two groups in the design, so compare_str is not needed 24 | group_id_1 <- unique(design.matrix$Condition)[1] 25 | group_id_2 <- unique(design.matrix$Condition)[2] 26 | }else{ 27 | # Parse the two group IDs from compare_str (e.g. A_vs_B) 28 | group_id_1 <- strsplit(as.character(compare_str), "_vs_")[[1]][1] 29 | group_id_2 <- strsplit(as.character(compare_str), "_vs_")[[1]][2] 30 | } 31 | design.matrix <- subset(design.matrix, Condition == group_id_1 | Condition == group_id_2 ) 32 | design.matrix$Condition <- factor(design.matrix$Condition, levels = c(group_id_1,group_id_2), labels = c("control","treatment")) 33 | filelist = list.files(path = ".",pattern = ".count",full.names = T) 34 | ## Generate the matrix of peak counts 35 | peaks.count.list <- NULL 36 | for(sample_id in rownames(design.matrix)){ 37 | input_count_file <- grep(paste0("[.]",sample_id,"[.]input"),filelist,value = TRUE) 38 | input_count_table <- read.table(file = input_count_file, sep = "\t", row.names = NULL,header = T) 39 | 40 | ip_count_file <- grep(paste0("[.]",sample_id,"[.]ip"),filelist,value = TRUE) 41 | ip_count_table <- read.table(file = ip_count_file, sep = "\t", row.names = NULL, header = T) 42 | rpkm <- cbind(input_count_table[,5],ip_count_table[,5]) 43 | colnames(rpkm) <- c(paste0(sample_id,".input"),paste0(sample_id,".ip")) 44 | peaks.count.list <- cbind(peaks.count.list,rpkm) 45 | } 46 | rownames(peaks.count.list) <- ip_count_table$PeakName 47 | 48 | ## generate design matrix 49 | design.matrix$m6A <- "input" 50 | design.matrix$sample_id <- paste0(rownames(design.matrix),".input") 51 | design.matrix_ip <- design.matrix 52 | design.matrix_ip$m6A <- "IP" 53 | design.matrix_ip$sample_id <- paste0(rownames(design.matrix_ip),".ip") 54 | design.matrix <- rbind(design.matrix,design.matrix_ip) 55 | rownames(design.matrix) <- design.matrix$sample_id 56 | design.matrix$m6A <- factor(design.matrix$m6A) 57 | design.matrix <- design.matrix[colnames(peaks.count.list),] 58 | 59 | run.deseq2 <- function(cnts,meta){ 60 | inf.dds <- DESeq2::DESeqDataSetFromMatrix(countData = cnts,colData = meta,design = ~Condition+m6A+Condition:m6A) 61 | inf.dds.LRT <- DESeq2::DESeq(inf.dds,betaPrior=FALSE, test="LRT", 62 | full=~Condition+m6A+Condition:m6A,reduced=~Condition+m6A) 63 | inf.dds.res <- DESeq2::results(inf.dds.LRT) 64 | results <- inf.dds.res 65 | colnames(results) <- c("baseMean", "log2FC", "lfcSE", "stat", "pvalue", "padj") 66 | return(results) 67 | } 68 | 69 | run.deseq2.4l2fc <- function(cnts,meta,label){ 70 | dds <- DESeq2::DESeqDataSetFromMatrix(cnts,meta,formula(~Condition)) 71 | dds$Condition <- factor(dds$Condition, levels=c('control','treatment')) 72 | gene.col2check <- meta$Condition 73 | dds$Condition <- droplevels(dds$Condition) 74 | gene.deseq <- DESeq2::DESeq(dds) 75 | gene.deseq <- DESeq2::results(gene.deseq) 76 | gene.results <- gene.deseq[,c("log2FoldChange","pvalue","padj")] 77 | colnames(gene.results) <- paste0(label,c(".l2fc",".p",".padj")) 78 | return(gene.results) 79 | } 80 | 81 | results <- run.deseq2(peaks.count.list,design.matrix) 82 | ip.peaks.count <- read.table(annotation.file,sep = "\t", row.names = 1, check.names = F) 83 | input.gene.count <- read.table(input.count.matrix.file,sep = " ",row.names = 1, check.names = F) 84 | colnames(input.gene.count) <- 
paste0(colnames(input.gene.count),".input") 85 | peaks.de <- run.deseq2.4l2fc(peaks.count.list[,rownames(design.matrix)[design.matrix$m6A == "IP"]], 86 | design.matrix[rownames(design.matrix)[design.matrix$m6A == "IP"],],'peak') 87 | gene.de <- run.deseq2.4l2fc(input.gene.count[,rownames(design.matrix)[design.matrix$m6A == "input"]], 88 | design.matrix[rownames(design.matrix)[design.matrix$m6A == "input"],],'gene') 89 | peaks.de$gene_id <- ip.peaks.count[rownames(peaks.de),"ID"] 90 | peaks.de$gene.l2fc <- gene.de[peaks.de$gene_id,]$gene.l2fc 91 | peaks.de$diff.l2fc <- peaks.de$peak.l2fc - peaks.de$gene.l2fc 92 | results$diff.l2fc <- peaks.de[rownames(results),]$diff.l2fc 93 | 94 | write.table(results,file = paste0("DESeq2_diffm6A_",group_id_1, "_",group_id_2,".txt") ,sep = "\t",quote = F) 95 | -------------------------------------------------------------------------------- /bin/GLM_edgeR_DM.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript GLM_edgeR_DM.R 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | ### compare_str: Comparison design (e.g. A_vs_B) 5 | 6 | #####GLM model### 7 | #edgeR 8 | library(edgeR) 9 | 10 | #load data 11 | args <- commandArgs(T) 12 | designfile <- args[1] 13 | compare_str <- args[2] 14 | annotation.file <- args[3] 15 | input.count.matrix.file <- args[4] 16 | 17 | designtable <- read.csv(designfile, head = TRUE, stringsAsFactors=FALSE, colClasses = c("character"), check.names=F) 18 | design.matrix <- as.data.frame(designtable$Group) 19 | rownames(design.matrix) <- designtable$Sample_ID 20 | colnames(design.matrix) <- "Condition" 21 | 22 | # Get the information of groups from compare_str 23 | if(length(unique(design.matrix$Condition)) < 2){ 24 | stop( "The number of groups is less than two, please check your designfile.") 25 | }else if( compare_str == "two_group" ){ 26 | # Only two groups in the design, so compare_str is not needed 27 | group_id_1 <- unique(design.matrix$Condition)[1] 28 | group_id_2 <- unique(design.matrix$Condition)[2] 29 | }else{ 30 | # Parse the two group IDs from compare_str (e.g. A_vs_B) 31 | group_id_1 <- strsplit(as.character(compare_str), "_vs_")[[1]][1] 32 | group_id_2 <- strsplit(as.character(compare_str), "_vs_")[[1]][2] 33 | } 34 | design.matrix <- subset(design.matrix, Condition == group_id_1 | Condition == group_id_2 ) 35 | design.matrix$Condition <- factor(design.matrix$Condition, levels = c(group_id_1,group_id_2), labels = c("control","treatment")) 36 | filelist = list.files(path = ".",pattern = ".count",full.names = T) 37 | ## Generate the matrix of peak counts 38 | peaks.count.list <- NULL 39 | for(sample_id in row.names(design.matrix)){ 40 | input_count_file <- grep(paste0("[.]",sample_id,"[.]input"),filelist,value = TRUE) 41 | input_count_table <- read.table(file = input_count_file, sep = "\t", row.names = NULL,header = T) 42 | 43 | ip_count_file <- grep(paste0("[.]",sample_id,"[.]ip"),filelist,value = TRUE) 44 | ip_count_table <- read.table(file = ip_count_file, sep = "\t", row.names = NULL, header = T) 45 | rpkm <- cbind(input_count_table[,5],ip_count_table[,5]) 46 | colnames(rpkm) <- c(paste0(sample_id,".input"),paste0(sample_id,".ip")) 47 | peaks.count.list <- cbind(peaks.count.list,rpkm) 48 | } 49 | rownames(peaks.count.list) <- ip_count_table$PeakName 50 | 51 | ## generate design matrix 52 | design.matrix$m6A <- "input" 53 | design.matrix$sample_id <- paste0(rownames(design.matrix),".input") 54 | design.matrix_ip <- design.matrix 55 | 
design.matrix_ip$m6A <- "IP" 56 | design.matrix_ip$sample_id <- paste0(rownames(design.matrix_ip),".ip") 57 | design.matrix <- rbind(design.matrix,design.matrix_ip) 58 | rownames(design.matrix) <- design.matrix$sample_id 59 | design.matrix$m6A <- factor(design.matrix$m6A) 60 | design.matrix <- design.matrix[colnames(peaks.count.list),] 61 | 62 | run.edger <- function(cnts,meta){ 63 | #add count filter? 64 | er.design <- model.matrix(~meta$Condition+meta$m6A+meta$Condition*meta$m6A) 65 | er.dgelist <- edgeR::DGEList(counts=cnts,group=meta$Condition) 66 | er.dgelist <- edgeR::estimateDisp(er.dgelist, design=er.design) 67 | er.fit <- edgeR::glmFit(er.dgelist, er.design) 68 | er.lrt <- edgeR::glmLRT(er.fit, coef=4) 69 | #hist(er.lrt$table$PValue) er.lrt$table$logFC, 70 | results <- er.lrt$table 71 | results$padj <- p.adjust(results$PValue,"BH") 72 | colnames(results) <- c("log2FC","logCPM","LR","pvalue","padj") 73 | return(results) 74 | } 75 | run.deseq2.4l2fc <- function(cnts,meta,label){ 76 | dds <- DESeq2::DESeqDataSetFromMatrix(cnts,meta,formula(~Condition)) 77 | dds$Condition <- factor(dds$Condition, levels=c('control','treatment')) 78 | gene.col2check <- meta$Condition 79 | dds$Condition <- droplevels(dds$Condition) 80 | gene.deseq <- DESeq2::DESeq(dds) 81 | gene.deseq <- DESeq2::results(gene.deseq) 82 | gene.results <- gene.deseq[,c("log2FoldChange","pvalue","padj")] 83 | colnames(gene.results) <- paste0(label,c(".l2fc",".p",".padj")) 84 | return(gene.results) 85 | } 86 | 87 | results <- run.edger(peaks.count.list,design.matrix) 88 | ip.peaks.count <- read.table(annotation.file,sep = "\t", row.names = 1, check.names = F) 89 | input.gene.count <- read.table(input.count.matrix.file,sep = " ",row.names = 1, check.names = F) 90 | colnames(input.gene.count) <- paste0(colnames(input.gene.count),".input") 91 | peaks.de <- run.deseq2.4l2fc(peaks.count.list[,rownames(design.matrix)[design.matrix$m6A == "IP"]], 92 | design.matrix[rownames(design.matrix)[design.matrix$m6A == "IP"],],'peak') 93 | gene.de <- run.deseq2.4l2fc(input.gene.count[,rownames(design.matrix)[design.matrix$m6A == "input"]], 94 | design.matrix[rownames(design.matrix)[design.matrix$m6A == "input"],],'gene') 95 | peaks.de$gene_id <- ip.peaks.count[rownames(peaks.de),"ID"] 96 | peaks.de$gene.l2fc <- gene.de[peaks.de$gene_id,]$gene.l2fc 97 | peaks.de$diff.l2fc <- peaks.de$peak.l2fc - peaks.de$gene.l2fc 98 | results$diff.l2fc <- peaks.de[rownames(results),]$diff.l2fc 99 | write.table(results,file = paste0("edgeR_diffm6A_",group_id_1, "_",group_id_2,".txt") ,sep = "\t",quote = F) 100 | 101 | -------------------------------------------------------------------------------- /docs/output.md: -------------------------------------------------------------------------------- 1 | # MeRIPseqPipe: Output 2 | 3 | This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. 
4 | 5 | 6 | 7 | ## Pipeline overview 8 | 9 | The pipeline is built using [Nextflow](https://www.nextflow.io/) 10 | and processes data using the following steps: 11 | 12 | - [MeRIPseqPipe: Output](#meripseqpipe-output) 13 | - [Pipeline overview](#pipeline-overview) 14 | - [Quality Control](#quality-control) 15 | - [Fastp](#fastp) 16 | - [FastQC](#fastqc) 17 | - [RSeQC](#rseqc) 18 | - [MultiQC](#multiqc) 19 | - [Align results](#align-results) 20 | - [STAR](#star) 21 | - [BWA](#bwa) 22 | - [TopHat2](#tophat2) 23 | - [HISAT2](#hisat2) 24 | - [SAMtools](#samtools) 25 | - [PeakCalling](#peakcalling) 26 | - [MeTPeak](#metpeak) 27 | - [MATK](#matk) 28 | - [Meyer](#meyer) 29 | - [MACS2](#macs2) 30 | - [PeakMerged](#peakmerged) 31 | - [RobustRankAggreg](#robustrankaggreg) 32 | - [MSPC](#mspc) 33 | - [BEDtools](#bedtools) 34 | - [M6A sites prediction](#m6a-sites-prediction) 35 | - [MATK](#matk) 36 | - [Differential Methylation Analysis](#differential-methylation-analysis) 37 | - [QNB](#qnb) 38 | - [MATK](#matk) 39 | - [DESeq2_DM](#deseq2_dm) 40 | - [edgeR_DM](#edger_dm) 41 | - [Differential Expression Analysis](#differential-expression-analysis) 42 | - [featureCounts](#featurecounts) 43 | - [DESeq2_DE](#deseq2_de) 44 | - [edgeR_DE](#edger_de) 45 | - [Reports](#reports) 46 | 47 | Several R packages are used for the downstream analysis. 48 | 49 | ## Quality Control 50 | 51 | **Output directory: `results/QC/`** 52 | 53 | ### Fastp 54 | 55 | [Fastp](https://github.com/OpenGene/fastp) 56 | 57 | **Output directory: `results/QC/fastp`** 58 | 59 | ### FastQC 60 | 61 | [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your reads. It provides information about the quality score distribution across your reads and the per base sequence content (%T/A/G/C), and reports adapter contamination and other overrepresented sequences. 62 | 63 | For further reading and documentation see the [FastQC help](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). 64 | 65 | > **NB:** The FastQC plots displayed in the MultiQC report show _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, check the fastp reports in the `results/QC/fastp` directory. 66 | 67 | **Output directory: `results/QC/fastqc`** 68 | 69 | - `sample_fastqc.html` 70 | - FastQC report, containing quality metrics for your untrimmed raw fastq files 71 | - `zips/sample_fastqc.zip` 72 | - zip file containing the FastQC report, tab-delimited data file and plot images 73 | 74 | ### RSeQC 75 | 76 | [RSeQC](http://rseqc.sourceforge.net/) 77 | 78 | **Output directory: `results/QC/RSeQC`** 79 | 80 | ### MultiQC 81 | 82 | [MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report, and further statistics are available within the report data directory. 83 | 84 | The pipeline has special steps which allow the software versions used to be reported in the MultiQC output for future traceability. 
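Should you want to rebuild the report by hand (for example after tweaking `assets/multiqc_config.yaml`), MultiQC can simply be rerun over the output directory; a minimal sketch, assuming the default `results/` location:

```bash
multiqc --config assets/multiqc_config.yaml results/
```
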
85 | 86 | **Output directory: `results/QC/multiqc`** 87 | 88 | - `Project_multiqc_report.html` 89 | - MultiQC report - a standalone HTML file that can be viewed in your web browser 90 | - `Project_multiqc_data/` 91 | - Directory containing parsed statistics from the different tools used in the pipeline 92 | 93 | For more information about how to use MultiQC reports, see [http://multiqc.info](http://multiqc.info) 94 | 95 | ## Align results 96 | 97 | **Output directory: `results/QC/multiqc`** 98 | 99 | ### STAR 100 | 101 | [STAR](https://github.com/alexdobin/STAR) 102 | 103 | ### BWA 104 | 105 | [BWA](https://github.com/lh3/bwa) 106 | 107 | ### TopHat2 108 | 109 | [TopHat2](https://ccb.jhu.edu/software/tophat/) 110 | 111 | ### HISAT2 112 | 113 | [HISAT2](https://ccb.jhu.edu/software/hisat2/) 114 | 115 | ## SAMtools 116 | 117 | [SAMtools](http://www.htslib.org/) 118 | 119 | ## PeakCalling 120 | 121 | ### MeTPeak 122 | 123 | [MeTPeak](https://github.com/compgenomics/MeTPeak) 124 | 125 | ### MATK 126 | 127 | [MATK](http://matk.renlab.org) 128 | 129 | ### Meyer 130 | 131 | [Meyer](http://matk.renlab.org) 132 | 133 | ### MACS2 134 | 135 | [MACS2](https://github.com/taoliu/MACS) 136 | 137 | ## PeakMerged 138 | 139 | ### RobustRankAggreg 140 | 141 | [RobustRankAggreg](https://cran.r-project.org/web/packages/RobustRankAggreg/index.html) 142 | 143 | ### MSPC 144 | 145 | [MSPC](https://github.com/Genometric/MSPC) 146 | 147 | ### BEDtools 148 | 149 | [BEDtools](https://bedtools.readthedocs.io/en/latest/index.html) 150 | 151 | ## M6A sites prediction 152 | 153 | [MATK](http://matk.renlab.org) 154 | 155 | ## Differential Methylation Analysis 156 | 157 | ### QNB 158 | 159 | [QNB](https://cran.r-project.org/src/contrib/Archive/QNB/) 160 | 161 | ### MATK 162 | [MATK](http://matk.renlab.org) 163 | 164 | ### DESeq2_DM 165 | 166 | [DESeq2](http://bioconductor.org/packages/DESeq2/) 167 | 168 | ### edgeR_DM 169 | 170 | [edgeR](http://bioconductor.org/packages/edgeR/) 171 | 172 | ## Differential Expression Analysis 173 | 174 | ### featureCounts 175 | 176 | [featureCounts](http://subread.sourceforge.net) 177 | 178 | ### DESeq2_DE 179 | 180 | [DESeq2](http://bioconductor.org/packages/DESeq2/) 181 | 182 | ### edgeR_DE 183 | 184 | [edgeR](http://bioconductor.org/packages/edgeR/) 185 | 186 | ## Reports 187 | -------------------------------------------------------------------------------- /lib/LikeletUtils.groovy: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env groovy 2 | import static nextflow.Nextflow.file 3 | import nextflow.Channel 4 | 5 | class LikeletUtils { 6 | 7 | // adjust command colors 8 | static String ANSI_RESET = "\u001B[0m" 9 | static String ANSI_BLACK = "\u001B[30m" 10 | static String ANSI_RED = "\u001B[31m" 11 | static String ANSI_GREEN = "\u001B[32m" 12 | static String ANSI_YELLOW = "\u001B[33m" 13 | static String ANSI_BLUE = "\u001B[34m" 14 | static String ANSI_PURPLE = "\u001B[35m" 15 | static String ANSI_CYAN = "\u001B[36m" 16 | static String ANSI_WHITE = "\u001B[37m" 17 | 18 | static def print_red = { str -> LikeletUtils.ANSI_RED + str + LikeletUtils.ANSI_RESET } 19 | static def print_black = { str -> LikeletUtils.ANSI_BLACK + str + LikeletUtils.ANSI_RESET } 20 | static def print_green = { str -> LikeletUtils.ANSI_GREEN + str + LikeletUtils.ANSI_RESET } 21 | static def print_yellow = { str -> LikeletUtils.ANSI_YELLOW + str + LikeletUtils.ANSI_RESET } 22 | static def print_blue = { str -> LikeletUtils.ANSI_BLUE + str + LikeletUtils.ANSI_RESET } 23 | static def print_cyan = { str -> 
LikeletUtils.ANSI_CYAN + str + LikeletUtils.ANSI_RESET } 24 | static def print_purple = { str -> LikeletUtils.ANSI_PURPLE + str + LikeletUtils.ANSI_RESET } 25 | static def print_white = { str -> LikeletUtils.ANSI_WHITE + str + LikeletUtils.ANSI_RESET } 26 | 27 | // Check if a row has the expected number of item, adjusted from Sarek 28 | static def checkNumberOfItem(row, number) { 29 | if (row.size() != number) exit 1, println("Malformed row in TSV file: ${row}, see --help for more information") 30 | return true 31 | } 32 | 33 | // Return status [0,1] 34 | // 0 == Normal, 1 == Tumor 35 | static def returnStatus(it) { 36 | if (!(it in [0, 1])) exit 1, println("Status is not recognized in TSV file: ${it}, see --help for more information") 37 | return it 38 | } 39 | 40 | // Return file if it exists 41 | static def returnFile(it) { 42 | if (!file(it).exists()) exit 1, println("Missing file in TSV file: ${it}, see --help for more information") 43 | return file(it) 44 | } 45 | 46 | static def sysucc_ascii() { 47 | print LikeletUtils.print_yellow(" ▄▄▄▄▄▄▄▄▄▄▄ ▄ ▄ ▄▄▄▄▄▄▄▄▄▄▄ ▄ ▄ ▄▄▄▄▄▄▄▄▄▄▄ ▄▄▄▄▄▄▄▄▄▄▄ \n") 48 | print LikeletUtils.print_yellow("▐░░░░░░░░░░░▌▐░▌ ▐░▌▐░░░░░░░░░░░▌▐░▌ ▐░▌▐░░░░░░░░░░░▌▐░░░░░░░░░░░▌\n") 49 | print LikeletUtils.print_yellow("▐░█▀▀▀▀▀▀▀▀▀ ▐░▌ ▐░▌▐░█▀▀▀▀▀▀▀▀▀ ▐░▌ ▐░▌▐░█▀▀▀▀▀▀▀▀▀ ▐░█▀▀▀▀▀▀▀▀▀ \n") 50 | print LikeletUtils.print_yellow("▐░▌ ▐░▌ ▐░▌▐░▌ ▐░▌ ▐░▌▐░▌ ▐░▌ \n") 51 | print LikeletUtils.print_yellow("▐░█▄▄▄▄▄▄▄▄▄ ▐░█▄▄▄▄▄▄▄█░▌▐░█▄▄▄▄▄▄▄▄▄ ▐░▌ ▐░▌▐░▌ ▐░▌ \n") 52 | print LikeletUtils.print_yellow("▐░░░░░░░░░░░▌▐░░░░░░░░░░░▌▐░░░░░░░░░░░▌▐░▌ ▐░▌▐░▌ ▐░▌ \n") 53 | print LikeletUtils.print_yellow(" ▀▀▀▀▀▀▀▀▀█░▌ ▀▀▀▀█░█▀▀▀▀ ▀▀▀▀▀▀▀▀▀█░▌▐░▌ ▐░▌▐░▌ ▐░▌ \n") 54 | print LikeletUtils.print_yellow(" ▐░▌ ▐░▌ ▐░▌▐░▌ ▐░▌▐░▌ ▐░▌ \n") 55 | print LikeletUtils.print_yellow(" ▄▄▄▄▄▄▄▄▄█░▌ ▐░▌ ▄▄▄▄▄▄▄▄▄█░▌▐░█▄▄▄▄▄▄▄█░▌▐░█▄▄▄▄▄▄▄▄▄ ▐░█▄▄▄▄▄▄▄▄▄ \n") 56 | print LikeletUtils.print_yellow("▐░░░░░░░░░░░▌ ▐░▌ ▐░░░░░░░░░░░▌▐░░░░░░░░░░░▌▐░░░░░░░░░░░▌▐░░░░░░░░░░░▌\n") 57 | print LikeletUtils.print_yellow(" ▀▀▀▀▀▀▀▀▀▀▀ ▀ ▀▀▀▀▀▀▀▀▀▀▀ ▀▀▀▀▀▀▀▀▀▀▀ ▀▀▀▀▀▀▀▀▀▀▀ ▀▀▀▀▀▀▀▀▀▀▀ \n") 58 | } 59 | // extrct fastq information from tsvFile 60 | static def extractData(tsvFile) { 61 | // Channeling the TSV file containing FASTQ. 
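        // Note: a literal "false" in a FASTQ-pair's second column marks that library
        // as single-end; ".gz" and ".bam" inputs are detected from the file suffix below.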
62 | // Format is: "Sample_ID  input_fastq1  input_fastq2  ip_fastq1  ip_fastq2  Group" (tab-separated, one header row) 63 | def inputData = Channel.from(tsvFile) 64 | .splitCsv(sep: '\t', skip: 1) 65 | .map { row -> 66 | LikeletUtils.checkNumberOfItem(row, 6) 67 | def idSample = row[0] 68 | def fastqFile1 = file(row[1]) 69 | def fastqFile2 = file(row[2]) 70 | def group = row[5] 71 | def input = true 72 | def gzip = false 73 | def readsSingle = false 74 | def filetype = "fastq" 75 | if (row[1].endsWith(".gz") == true ){ 76 | gzip = true 77 | }else if (row[1].endsWith(".bam") == true ){ 78 | filetype = "bam" 79 | } 80 | if (row[2].endsWith("false") == true){ 81 | readsSingle = true 82 | [idSample, [fastqFile1], readsSingle, gzip, input, group, filetype] 83 | } else { 84 | [idSample, [fastqFile1, fastqFile2], readsSingle, gzip, input, group, filetype] 85 | } 86 | } 87 | def ipData = Channel.from(tsvFile) 88 | .splitCsv(sep: '\t', skip: 1) 89 | .map { row -> 90 | LikeletUtils.checkNumberOfItem(row, 6) 91 | def idSample = row[0] 92 | def fastqFile1 = file(row[3]) 93 | def fastqFile2 = file(row[4]) 94 | def group = row[5] 95 | def input = false 96 | def gzip = false 97 | def readsSingle = false 98 | def filetype = "fastq" 99 | if (row[3].endsWith(".gz") == true){ 100 | gzip = true 101 | }else if (row[3].endsWith(".bam") == true){ 102 | filetype = "bam" 103 | } 104 | if (row[4].endsWith("false") == true){ 105 | readsSingle = true 106 | [idSample, [fastqFile1], readsSingle, gzip, input, group, filetype] 107 | } else { 108 | [idSample, [fastqFile1, fastqFile2], readsSingle, gzip, input, group, filetype] 109 | } 110 | } 111 | return inputData.mix(ipData) 112 | } 113 | static def addstringToalign(String str,int num){ 114 | if(str.length() < num) { 115 | def numSpace = num-str.length() 116 | 117 | numSpace.times{ 118 | str += ' ' 119 | } 120 | } 121 | str 122 | } 123 | } 124 | 125 | 126 | -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | /* 2 | * ------------------------------------------------- 3 | * nf-core/meripseqpipe Nextflow config file 4 | * ------------------------------------------------- 5 | * Default config options for all environments. 
6 | */ 7 | 8 | // Global default params, used in configs 9 | params { 10 | // Pipeline Options 11 | // Input files 12 | designfile = "$baseDir/designfile.tsv" 13 | comparefile = "$baseDir/comparefile.txt" 14 | 15 | // Setting main parameters of analysis mode 16 | stranded = "no" // "yes" OR "no" OR "reverse" 17 | mapq_cutoff = 20 // "255" means unique mapping reads 18 | aligners = "star" // "star" OR "bwa" OR "tophat2" OR "hisat2" OR "none" 19 | peakCalling_mode = "independence" // "group" OR "independence" 20 | peakMerged_mode = "rank" // "rank" OR "macs2" OR "MATK" OR "metpeak" OR "mspc" 21 | expression_analysis_mode = "DESeq2" // "DESeq2" OR "edgeR" OR "none" 22 | methylation_analysis_mode = "QNB" // "MATK" OR "QNB" OR "Wilcox-test" OR "MeTDiff" OR "edgeR" OR "DESeq2" 23 | 24 | // Reference genomes 25 | matk_jar = "$baseDir/MATK-1.0.jar" 26 | fasta = "/data1/zhuky/test-datasets/reference/TEST.fa" 27 | gtf = "/data1/zhuky/test-datasets/reference/TEST.gtf" 28 | rRNA_fasta = false 29 | tophat2_index = false 30 | hisat2_index = false 31 | bwa_index = false 32 | star_index = false 33 | 34 | // Other command line parameters 35 | peak_threshold = "medium" // "low" OR "medium" OR "high" 36 | saveReference = false 37 | outdir = "$baseDir/results" 38 | tracedir = "${params.outdir}/pipeline_info/" 39 | name = false 40 | multiqc_config = "$baseDir/assets/multiqc_config.yaml" 41 | email = false 42 | email_on_fail = false 43 | maxMultiqcEmailFileSize = 25.MB 44 | plaintext_email = false 45 | monochrome_logs = false 46 | help = false 47 | igenomes_base = 's3://ngi-igenomes/igenomes/' 48 | tracedir = "${params.outdir}/pipeline_info" 49 | igenomes_ignore = false 50 | custom_config_version = 'master' 51 | custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" 52 | hostnames = false 53 | config_profile_description = false 54 | config_profile_contact = false 55 | config_profile_url = false 56 | 57 | // Defaults only, expecting to be overwritten 58 | max_memory = 128.GB 59 | max_cpus = 16 60 | max_time = 240.h 61 | 62 | // skipping modes Options 63 | skip_sort = false 64 | skip_peakCalling = false 65 | skip_diffpeakCalling = false 66 | skip_annotation = false 67 | skip_m6Aprediction = false 68 | skip_qc = false 69 | skip_motif = false 70 | skip_filterrRNA = false 71 | 72 | // skipping tools Options 73 | // PeakCalling tools 74 | skip_metpeak = false 75 | skip_macs2 = false 76 | skip_matk = false 77 | skip_meyer = false 78 | // QC Steps 79 | skip_fastp = false 80 | skip_fastqc = false 81 | skip_rseqc = false 82 | skip_createbedgraph = true 83 | skip_genebody_coverage = true 84 | 85 | // ignore 86 | input = false 87 | readPaths = false 88 | single_end = false 89 | gzip = false 90 | } 91 | 92 | // Container slug. Stable releases should specify release tag! 93 | // Developmental code should specify :dev 94 | process.container = 'kingzhuky/meripseqpipe:dev' 95 | 96 | // Load base.config by default for all pipelines 97 | includeConfig 'conf/base.config' 98 | 99 | // Load nf-core custom profiles from different Institutions 100 | //try { 101 | // includeConfig "${params.custom_config_base}/nfcore_custom.config" 102 | //} catch (Exception e) { 103 | // System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") 104 | //} 105 | 106 | // Avoid this error: 107 | // WARNING: Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap. 
108 | // Testing this in nf-core after discussion here https://github.com/nf-core/tools/pull/351, once this is established and works well, nextflow might implement this behavior as new default. 109 | docker.runOptions = '-u \$(id -u):\$(id -g)' 110 | 111 | // Load igenomes.config if required 112 | if (!params.igenomes_ignore) { 113 | includeConfig 'conf/igenomes.config' 114 | } 115 | profiles { 116 | conda { process.conda = "$baseDir/environment.yml" } 117 | docker { 118 | params.matk_jar = "/MATK-1.0.jar" 119 | docker.enabled = true 120 | docker.runOptions = '-u $(id -u):$(id -g)' 121 | } 122 | C2 { includeConfig 'conf/C2.config'} 123 | debug { process.beforeScript = 'echo $HOSTNAME' } 124 | test { includeConfig 'conf/test.config' } 125 | test_mixed { includeConfig 'conf/test_mixed.config' } 126 | test_bam { includeConfig 'conf/test_bam.config' } 127 | none { 128 | // Don't load any config (for use with custom home configs) 129 | } 130 | } 131 | 132 | // Capture exit codes from upstream processes when piping 133 | process.shell = ['/bin/bash', '-euo', 'pipefail'] 134 | 135 | timeline { 136 | enabled = true 137 | file = "${params.tracedir}/execution_timeline.html" 138 | } 139 | report { 140 | enabled = true 141 | file = "${params.tracedir}/execution_report.html" 142 | } 143 | trace { 144 | enabled = true 145 | file = "${params.tracedir}/execution_trace.txt" 146 | } 147 | dag { 148 | enabled = true 149 | file = "${params.tracedir}/pipeline_dag.svg" 150 | } 151 | 152 | manifest { 153 | name = 'MeRIPseqPipe' 154 | author = 'Kaiyu Zhu, Yu Sun, Xiaoqiong Bao' 155 | homePage = 'https://github.com/canceromics/MeRIPseqPipe' 156 | description = 'MeRIP-seq analysis pipeline arranged multiple alignment tools, peakCalling tools, Merge Peaks\' methods and methylation analysis methods.' 157 | mainScript = 'main.nf' 158 | nextflowVersion = '>=19.04.0' 159 | version = '1.0dev' 160 | } 161 | 162 | // Function to ensure that resource requirements don't go beyond 163 | // a maximum limit 164 | def check_max(obj, type) { 165 | if (type == 'memory') { 166 | try { 167 | if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) 168 | return params.max_memory as nextflow.util.MemoryUnit 169 | else 170 | return obj 171 | } catch (all) { 172 | println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" 173 | return obj 174 | } 175 | } else if (type == 'time') { 176 | try { 177 | if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) 178 | return params.max_time as nextflow.util.Duration 179 | else 180 | return obj 181 | } catch (all) { 182 | println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" 183 | return obj 184 | } 185 | } else if (type == 'cpus') { 186 | try { 187 | return Math.min( obj, params.max_cpus as int ) 188 | } catch (all) { 189 | println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! 
Using default value: $obj" 190 | return obj 191 | } 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /bin/QC_Peaks_Report.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript QC_Peaks_Report.R 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | library(ggplot2) 5 | library(ggseqlogo) 6 | library(reshape2) 7 | options(stringsAsFactors = FALSE) 8 | 9 | args <- commandArgs(T) 10 | #args <- c("formatted_designfile.txt", "mspc", "group","QCPeaksPlot.RData") 11 | designfile <- args[1] #"formatted_designfile.txt" 12 | peakMerged.mode <- args[2]#rank 13 | peakCalling.mode <- args[3]#"group" 14 | output.Rdata <- args[4]#"QCPeaksPlot.RData" 15 | designtable <- read.csv(designfile, head = TRUE, colClasses = c("character")) 16 | 17 | ## Peaks Distribution 18 | pdf(file = paste0("distribution.plot_",peakMerged.mode,".pdf"),paper = "USr") 19 | PlotPeaksDitr <- function(files.list, suffix = "[.]anno[.]txt"){ 20 | distribute_df <- NULL 21 | for( file in files.list ){ 22 | anno.table <- read.table(file, header=F, sep="\t", quote="", stringsAsFactors = F)[,c(1,2,3,15,11,12,13,14,17)] 23 | colnames(anno.table) <- c("Chr","ChrStart","ChrEnd","ID","Gene_symbol","Coding","Location","Relative_distance","RNA_type") 24 | peak.freq = c(as.numeric(as.vector(anno.table[which(anno.table[,7]=="5UTR"),8])), 25 | as.numeric(as.vector(anno.table[which(anno.table[,7]=="CDS"),8]))+100, 26 | as.numeric(as.vector(anno.table[which(anno.table[,7]=="3UTR"),8]))+200) 27 | freq = data.frame(Freq = peak.freq, group = strsplit(file,suffix)[[1]]) 28 | distribute_df = rbind(distribute_df, freq) 29 | } 30 | ggplot(distribute_df, aes(x=Freq, colour = group))+ 31 | geom_line(stat = "density", size=1, adjust = 0.8)+ 32 | scale_x_continuous(breaks = c(50,150,250), labels = c("5'UTR", "CDS", "3'UTR"))+ #axis labels 33 | labs(y="m6A coding peak density",x="Region of gene")+ 34 | geom_vline(xintercept = c(100,200), linetype = "dashed")+ 35 | theme_bw()+ 36 | theme(panel.grid =element_blank(),#remove grid line 37 | axis.title.x = element_text(size = 20, angle = 0, face = "plain", colour = "black"), 38 | axis.title.y = element_text(size = 20, angle = 90, face = "plain", colour = "black"), 39 | axis.text.x = element_text(size = 15,colour = "black"), 40 | axis.text.y = element_text(size = 15,colour = "black"), 41 | aspect.ratio=1, 42 | axis.ticks.x = element_blank()) #remove ticks 43 | } 44 | 45 | anno.files.list <- dir(pattern = "[.]anno[.]txt") 46 | ### barplot 47 | total.distribute <- NULL 48 | for( file in anno.files.list ){ 49 | anno.table <- read.table(file, header=F, sep="\t", quote="", stringsAsFactors = F)[,c(1,2,3,15,11,12,13,14,17)] 50 | colnames(anno.table) <- c("Chr","ChrStart","ChrEnd","ID","Gene_symbol","Coding","Location","Relative_distance","RNA_type") 51 | anno.table[anno.table$Location == "CDS" & anno.table$Relative_distance >= 95,7] <- "Stop Codon" 52 | anno.table[anno.table$Location == "3UTR" & anno.table$Relative_distance <= 5,7] <- "Stop Codon" 53 | group.name <- strsplit(file,"[.]anno[.]txt")[[1]] 54 | freq = data.frame(Location = anno.table[anno.table$Coding == "coding",7], group = group.name) 55 | total.distribute = rbind(total.distribute, freq) 56 | } 57 | distribute.table <- melt(table(total.distribute)) 58 | distribute.table$Location <- factor(distribute.table$Location,levels = c("intron","3UTR","Stop Codon","CDS","5UTR")) 59 | col <- 
c('plum1','pink2','#58B2DC',"#51A8DD","#005CAF") 60 | distribute.barplot <- ggplot(distribute.table,aes(group, value, fill = Location)) + 61 | geom_bar(stat="identity",position = 'fill') + coord_flip() + 62 | ggtitle("Peaks Distribution") + 63 | scale_y_continuous(expand = c(0, 0)) + 64 | guides(fill = guide_legend(reverse = TRUE)) + 65 | scale_fill_brewer() + 66 | theme(panel.grid =element_blank(), #remove grid line 67 | title = element_text(size = 15, angle = 0, face = "plain", colour = "black"), 68 | axis.text.x = element_text(size = 12,colour = "black"), 69 | axis.text.y = element_text(size = 12,colour = "black"), 70 | panel.background = element_rect(fill = "transparent",colour = NA), 71 | axis.title = element_blank(), 72 | axis.ticks.x = element_blank()) #remove ticks 73 | distribute.barplot.count <- ggplot(distribute.table,aes(group, value, fill = Location)) + 74 | geom_bar(stat="identity") + coord_flip() + 75 | ggtitle("Peaks Distribution") + 76 | scale_y_continuous(expand = c(0, 0)) + 77 | guides(fill = guide_legend(reverse = TRUE)) + 78 | scale_fill_brewer() + 79 | theme(panel.grid =element_blank(), #remove grid line 80 | title = element_text(size = 15, angle = 0, face = "plain", colour = "black"), 81 | axis.text.x = element_text(size = 12,colour = "black"), 82 | axis.text.y = element_text(size = 12,colour = "black"), 83 | panel.background = element_rect(fill = "transparent",colour = NA), 84 | axis.title = element_blank(), 85 | axis.ticks.x = element_blank()) #remove ticks 86 | print(distribute.barplot) 87 | print(distribute.barplot.count) 88 | ### Curve 89 | sample.plots.list <- NULL 90 | sample.list <- if(peakCalling.mode == "group") designtable$Group else designtable$Sample_ID 91 | for( sample in sample.list ){ 92 | sample.files.list <- grep(paste0("_",sample,"_normalized"), anno.files.list, value = T) 93 | sample.plots.list[[sample]] <- PlotPeaksDitr(sample.files.list, "_normalized[.]anno[.]txt") 94 | print(sample.plots.list[[sample]]) 95 | } 96 | merged.files.list <- grep("merged", anno.files.list, value = T) 97 | merged.plot <- PlotPeaksDitr(merged.files.list) 98 | print(merged.plot) 99 | dev.off() 100 | 101 | ## Peaks' motif 102 | pdf(file = paste0("motif.plot_",peakMerged.mode,".pdf"),paper = "USr") 103 | ggplot2.multiplot <- function(..., plotlist=NULL, cols=2) { 104 | # Make a list from the ... 
arguments and plotlist 105 | plots <- c(list(...), plotlist) 106 | numPlots = length(plots) 107 | 108 | # Make the panel 109 | plotCols = cols # Number of columns of plots 110 | plotRows = ceiling(numPlots/plotCols) # Number of rows needed, calculated from # of cols 111 | 112 | # Set up the page 113 | grid::grid.newpage() 114 | grid::pushViewport(grid::viewport(layout = grid::grid.layout(plotRows, plotCols))) 115 | vplayout <- function(x, y) 116 | grid::viewport(layout.pos.row = x, layout.pos.col = y) 117 | 118 | # Make each plot, in the correct location 119 | for (i in 1:numPlots) { 120 | curRow = ceiling(i/plotCols) 121 | curCol = (i-1) %% plotCols + 1 122 | print(plots[[i]], vp = vplayout(curRow, curCol)) 123 | } 124 | 125 | } 126 | motif_plot <- function(motif, pval, rank){ 127 | ggplot()+ 128 | geom_logo(motif, method = "probability")+ 129 | annotate("text", x=ncol(motif)-0.5, y=1.5, label=paste0("p = ",pval),size = 5)+ 130 | ggtitle(rank)+ 131 | theme(plot.title = element_text(hjust = 0, size = 6))+ 132 | theme_logo() 133 | } 134 | QC.motif.filelist = dir(".",pattern = "motif[1,2,3].motif",recursive = TRUE) 135 | QC.motif.list <- NULL 136 | QC.motif.pvalue <- NULL 137 | motif.peakfiles <- unique(unlist(lapply(strsplit(QC.motif.filelist,"_homer/homerResults"), function(x){x[1]}))) 138 | for( peakfile.name in motif.peakfiles ){ 139 | group.motif.list <- NULL 140 | for (file in grep(paste0(peakfile.name, "_homer/homerResults"), QC.motif.filelist, value = T) ){ 141 | motif_matrix <- read.delim(file,header = F,sep = "\t", check.names=F) 142 | motif_pvalue <- strsplit(motif_matrix[1,6], split = ":")[[1]][4] 143 | motif_matrix <- motif_matrix[-1,c(-5,-6)] 144 | colnames(motif_matrix) <- c("A","C","G","T") 145 | rownames(motif_matrix) <- c(1:nrow(motif_matrix)) 146 | motif_matrix <- as.matrix(t(motif_matrix)) 147 | motif_name <- strsplit(strsplit(file,split = c("_homer/homerResults/"))[[1]][2],split = "[.]motif")[[1]][1] 148 | group.motif.list[[motif_name]] <- motif_plot(t(apply(motif_matrix, 1, function(x)as.numeric(x))), motif_pvalue, paste0(peakfile.name,"_",motif_name)) 149 | } 150 | QC.motif.list[[peakfile.name]] <- group.motif.list 151 | ggplot2.multiplot(plotlist = QC.motif.list[[peakfile.name]] ,cols = 1) 152 | } 153 | dev.off() 154 | save(distribute.barplot.count,distribute.barplot,sample.plots.list,merged.plot,QC.motif.list,file = output.Rdata) 155 | -------------------------------------------------------------------------------- /bin/MeTDiff_diffm6A.R: -------------------------------------------------------------------------------- 1 | #!/bin/Rscript 2 | ## Rscript MeTDiff_diffm6A.R 3 | ### designfile: Sample_id, Input_filename, IP_filename, group_id 4 | ### compare_str: Compairision design (eg: A_vs_B) 5 | library(MeTDiff) 6 | args <- commandArgs(T) 7 | designfile <- args[1] 8 | compare_str <- args[2] 9 | 10 | .help.digamma <- function(xx,alpha){ 11 | Tm <- dim(xx) 12 | TT <- Tm[1] 13 | m <- Tm[2] 14 | 15 | res <- matrix(0,TT,1) 16 | for (ii in 1:m){ 17 | res <- res + digamma(xx[,ii] + alpha) 18 | } 19 | return(res) 20 | } 21 | 22 | .help.trigamma <- function(xx,alpha){ 23 | Tm <- dim(xx) 24 | if (is.null(Tm)){ 25 | TT <- length(xx) 26 | m <- 1 27 | }else{ 28 | TT <- Tm[1] 29 | m <- Tm[2] 30 | } 31 | res <- matrix(0,TT,1) 32 | for (ii in 1:m){ 33 | res <- res + trigamma(xx[,ii] + alpha) 34 | } 35 | return(res) 36 | } 37 | 38 | .help.postprob <- function(dxx,dyy,dnn,xx,yy,alpha,beta){ 39 | N <- length(alpha) 40 | Tm <- dim(xx) 41 | if (is.null(Tm)){ 42 | TT <- length(xx) 43 | m <- 
1 44 | }else{ 45 | TT <- Tm[1] 46 | m <- Tm[2] 47 | } 48 | res <- matrix(0,TT,N) 49 | 50 | for (ii in 1:m){ 51 | dnx <- as.matrix(dxx[,ii]) 52 | dny <- as.matrix(dyy[,ii]) 53 | dn <- as.matrix(dnn[,ii]) 54 | x <- as.matrix(xx[,ii]) 55 | y <- as.matrix(yy[,ii]) 56 | res <- res + (dn-dnx-dny) %*% matrix(1,1,N) + lgamma(x %*% matrix(1,1,N) + matrix(1,TT) %*% alpha) - 57 | lgamma(matrix(1,TT) %*% (alpha+beta) + (x+y) %*% matrix(1,1,N)) + 58 | lgamma(y %*% matrix(1,1,N) + matrix(1,TT) %*% beta) + lgamma(matrix(1,TT) %*% (alpha+beta)) - 59 | lgamma(matrix(1,TT) %*% alpha) - lgamma(matrix(1,TT) %*% beta) 60 | } 61 | res <- exp(res) 62 | } 63 | 64 | .help.factorial <- function(count){ 65 | #compute the log(count!) 66 | cm = max(count) 67 | if (is.null(ncol(count))){ 68 | D <- 1 69 | }else{ 70 | D <- ncol(count) 71 | } 72 | if(cm > 50000){ 73 | dnorm <- as.matrix(lgamma(data.matrix(count+1))) 74 | } 75 | else{ 76 | tmp <- cumsum(rbind(0,log(as.matrix(1:max(count))))) 77 | dnorm <- matrix(tmp[data.matrix(count+1)],ncol=D) 78 | } 79 | } 80 | 81 | .betabinomial.lh <- function(x,y,Nit=40,Npara=1e-9){ 82 | # x <- as.matrix(x[peak,]) 83 | # y <- as.matrix(y[peak,]) 84 | N <- 2 # number of states 85 | J <- matrix(0,N,1) 86 | H <- matrix(0,N,N) 87 | T <- nrow(x) 88 | IP_mean <- rowMeans(x) 89 | INPUT_mean <- rowMeans(y) 90 | nip = ncol(x) 91 | nin = ncol(y) 92 | # if the dimension for x and y does not match 93 | if (nip > nin) { 94 | avg_input <- round(matrix(rep(INPUT_mean,nip-nin),ncol=nip-nin)) 95 | y <- cbind(y,avg_input) 96 | } 97 | else if (nip < nin){ 98 | avg_ip <- matrix(rep(IP_mean,nin-nip),ncol=nin-nip) 99 | x <- cbind(x,avg_ip) 100 | } 101 | n <- x + y 102 | m <- ncol(x) 103 | rr <- x/n 104 | 105 | # use another method to initialize 106 | p1_e <- exp(sum( log(rr) )/(T*m)) 107 | p2_e <- exp(sum( log(1-rr)/(T*m) )) 108 | alpha <- 1/2 *(1-p2_e)/(1-p1_e-p2_e ) # to avoid 0 109 | beta <- 1/2 *(1-p1_e)/(1-p1_e-p2_e ) 110 | c = rbind(alpha,beta) 111 | # add break condition to avoid alpha is na 112 | if ( !any(is.finite(beta)) | !is.finite(alpha) | any(beta <= 0) | any(alpha<= 0) ){ 113 | return(list(logl=rnorm(1)*10000,alpha=c(1,1),beta=c(1,1))) 114 | } 115 | for (nit in 1:Nit){ 116 | J[1] <- T*digamma(sum(c))*m - sum( .help.digamma(as.matrix(n),sum(c)) ) + sum( .help.digamma(as.matrix(x),c[1]) ) - T*digamma(c[1])*m 117 | J[2] <- T*digamma(sum(c))*m - sum( .help.digamma(as.matrix(n),sum(c)) ) + sum( .help.digamma(as.matrix(y),c[2]) ) - T*digamma(c[2])*m 118 | H[1,1] <- T*trigamma(sum(c))*m - sum(.help.trigamma(as.matrix(n),sum(c))) + sum(.help.trigamma(as.matrix(x),c[1])) - T*trigamma(c[1])*m 119 | H[2,2] <- T*trigamma(sum(c))*m - sum(.help.trigamma(as.matrix(n),sum(c))) + sum(.help.trigamma(as.matrix(y),c[2])) - T*trigamma(c[2])*m 120 | H[1,2] <- T*trigamma(sum(c))*m - sum(.help.trigamma(as.matrix(n),sum(c))) 121 | H[2,1] <- H[1,2] 122 | eigvalue <- eigen(H)$values 123 | 124 | if ( (any(beta < Npara)) | (any(alpha < Npara)) 125 | | abs(eigvalue[1]/eigvalue[2]) > 1e12 | abs(eigvalue[1]/eigvalue[2]) < 1e-12 126 | | any(eigvalue==0) ){ break } 127 | 128 | # tmp_step <- -solve(H,tol=1e-20) %*% J 129 | tmp_step <- -solve(H, J) # using newton smoothing 130 | tmp <- c + tmp_step 131 | while(any(tmp <= 0)){ 132 | # warning(sprintf("Could not update the Newton step ...\n")) 133 | tmp_step <- tmp_step / 20 134 | tmp <- c + tmp_step 135 | } 136 | c <- tmp 137 | 138 | } 139 | # caculate the likelihood 140 | alpha <- c[1] 141 | beta <- c[2] 142 | dnx <- .help.factorial(x) 143 | dny <- .help.factorial(y) 144 | dn 
<- .help.factorial(n) 145 | prob <- .help.postprob(dnx,dny,dn,x,y,alpha,beta) 146 | return(list(logl=sum(log(prob)),alpha=alpha,beta=beta)) 147 | 148 | } 149 | 150 | # merge and compare two conditions 151 | diff.call.module <- function(meth1,unmeth1,meth2,unmeth2){ 152 | #x = untreated IP, y = untreated input, xx = treated IP, yy = treated input 153 | no_peak=length(meth1[,1]) #PEAK$loci2peak_merged[,1]) 154 | pvalues <- rep(1,no_peak) 155 | log.fc <- rep(0,no_peak) 156 | for (ipeak in 1:no_peak) { 157 | if (ipeak%%1000 == 0){print(ipeak)} 158 | x = t(as.array(meth1[ipeak,])) 159 | y = t(as.matrix(unmeth1[ipeak,])) 160 | xx = t(as.matrix(meth2[ipeak,])) 161 | yy = t(as.matrix(unmeth2[ipeak,])) 162 | xxx = cbind(x,xx) 163 | yyy = cbind(y,yy) 164 | #BBtest 165 | logl1 <- .betabinomial.lh(x,y+1) 166 | logl2 <- .betabinomial.lh(xx,yy+1) 167 | logl3 <- .betabinomial.lh(xxx,yyy+1) 168 | tst <- (logl1$logl+logl2$logl-logl3$logl)*2 169 | pvalues[ipeak] <- 1 - pchisq(tst,2) 170 | log.fc[ipeak] <- log2( (sum(xx)+1)/(1+sum(yy)) * (1+sum(y))/(1+sum(x)) ) 171 | 172 | } 173 | p <- pvalues 174 | fdr <- p.adjust(pvalues,method='fdr') 175 | 176 | DIFF <- list(fdr=fdr,pvalues=p,fc=log.fc) 177 | # result 178 | result =list() 179 | result$DIFF = DIFF 180 | return(result) 181 | 182 | } 183 | 184 | designtable <- read.csv(designfile, head = TRUE, stringsAsFactors=FALSE, colClasses = c("character"), check.names=F) 185 | design.matrix <- as.data.frame(designtable$Group) 186 | rownames(design.matrix) <- designtable$Sample_ID 187 | colnames(design.matrix) <- "Condition" 188 | 189 | # Get the information of groups from compare_str 190 | if(length(unique(design.matrix$Condition)) < 2){ 191 | stop( "The count of Group is less than two, please check your designfile.") 192 | }else if( compare_str == "two_group" ){ 193 | # Get the information without compare_str beacause of only two groups 194 | group_id_1 <- unique(design.matrix$Condition)[1] 195 | group_id_2 <- unique(design.matrix$Condition)[2] 196 | }else{ 197 | # Running MeTDiff quantification with compare_str 198 | group_id_1 <- strsplit(as.character(compare_str), "_vs_")[[1]][1] 199 | group_id_2 <- strsplit(as.character(compare_str), "_vs_")[[1]][2] 200 | } 201 | design.matrix <- subset(design.matrix, Condition == group_id_1 | Condition == group_id_2 ) 202 | design.matrix$Condition <- factor(design.matrix$Condition,labels = c("control","treatment")) 203 | filelist = list.files(path = ".",pattern = ".count",full.names = T) 204 | ## Generate the matrix of peaks count 205 | rpkm_peaks_list <- NULL 206 | for(sample_id in row.names(design.matrix)){ 207 | input_count_file <- grep(paste0("[.]",sample_id,"[.]input"),filelist,value = TRUE) 208 | input_count_table <- read.table(file = input_count_file, sep = "\t", row.names = NULL,header = T) 209 | 210 | ip_count_file <- grep(paste0("[.]",sample_id,"[.]ip"),filelist,value = TRUE) 211 | ip_count_table <- read.table(file = ip_count_file, sep = "\t", row.names = NULL, header = T) 212 | rpkm <- cbind(input_count_table[,5],ip_count_table[,5]) 213 | colnames(rpkm) <- c(paste0(sample_id,".input"),paste0(sample_id,".ip")) 214 | rpkm_peaks_list <- cbind(rpkm_peaks_list,rpkm) 215 | } 216 | rownames(rpkm_peaks_list) <- ip_count_table$PeakName 217 | 218 | ## generate design matrix 219 | design.matrix$m6A <- "input" 220 | design.matrix$sample_id <- paste0(rownames(design.matrix),".input") 221 | design.matrix_ip <- design.matrix 222 | design.matrix_ip$m6A <- "IP" 223 | design.matrix_ip$sample_id <- 
paste0(rownames(design.matrix_ip),".ip") 224 | design.matrix <- rbind(design.matrix,design.matrix_ip) 225 | rownames(design.matrix) <- design.matrix$sample_id 226 | design.matrix$m6A <- factor(design.matrix$m6A) 227 | design.matrix <- design.matrix[colnames(rpkm_peaks_list),] 228 | 229 | cnts <- rpkm_peaks_list 230 | meta <- design.matrix 231 | run.metdiff <- function(cnts,meta){ 232 | meth1 <- cnts[,which(meta$Condition == 'treatment' & meta$m6A == "IP")] 233 | meth2 <- cnts[,which(meta$Condition != 'treatment' & meta$m6A == "IP")] 234 | unmeth1 <- cnts[,which(meta$Condition == 'treatment' & meta$m6A == "input")] 235 | unmeth2 <- cnts[,which(meta$Condition != 'treatment' & meta$m6A == "input")] 236 | metdiff.result <- diff.call.module(meth1,unmeth1,meth2,unmeth2) 237 | results <- data.frame(log2FC= metdiff.result$DIFF$fc, pvalue = metdiff.result$DIFF$pvalues, padj = p.adjust(metdiff.result$DIFF$pvalues,"BH")) 238 | rownames(results) <- rownames(cnts) 239 | return(results) 240 | } 241 | results <- run.metdiff(rpkm_peaks_list,design.matrix) 242 | write.table(results,file = paste0("MeTDiff_diffm6A_",group_id_1, "_",group_id_2,".txt") ,sep = "\t",quote = F) 243 | -------------------------------------------------------------------------------- /bin/m6A_motif.meme: -------------------------------------------------------------------------------- 1 | MEME version 5.0.2 2 | 3 | ALPHABET= ACGT 4 | 5 | strands: + - 6 | 7 | Background letter frequencies (from unknown source): 8 | A 0.250 C 0.250 G 0.250 T 0.250 9 | 10 | MOTIF 1.1 AAACA 11 | 12 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 13 | 1.000000 0.000000 0.000000 0.000000 14 | 1.000000 0.000000 0.000000 0.000000 15 | 1.000000 0.000000 0.000000 0.000000 16 | 0.000000 1.000000 0.000000 0.000000 17 | 1.000000 0.000000 0.000000 0.000000 18 | 19 | 20 | MOTIF 1.2 AAACC 21 | 22 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 23 | 1.000000 0.000000 0.000000 0.000000 24 | 1.000000 0.000000 0.000000 0.000000 25 | 1.000000 0.000000 0.000000 0.000000 26 | 0.000000 1.000000 0.000000 0.000000 27 | 0.000000 1.000000 0.000000 0.000000 28 | 29 | 30 | MOTIF 1.3 AAACH 31 | 32 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 33 | 1.000000 0.000000 0.000000 0.000000 34 | 1.000000 0.000000 0.000000 0.000000 35 | 1.000000 0.000000 0.000000 0.000000 36 | 0.000000 1.000000 0.000000 0.000000 37 | 0.333333 0.333333 0.000000 0.333333 38 | 39 | 40 | MOTIF 1.4 AAACT 41 | 42 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 43 | 1.000000 0.000000 0.000000 0.000000 44 | 1.000000 0.000000 0.000000 0.000000 45 | 1.000000 0.000000 0.000000 0.000000 46 | 0.000000 1.000000 0.000000 0.000000 47 | 0.000000 0.000000 0.000000 1.000000 48 | 49 | 50 | MOTIF 1.5 AGACA 51 | 52 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 53 | 1.000000 0.000000 0.000000 0.000000 54 | 0.000000 0.000000 1.000000 0.000000 55 | 1.000000 0.000000 0.000000 0.000000 56 | 0.000000 1.000000 0.000000 0.000000 57 | 1.000000 0.000000 0.000000 0.000000 58 | 59 | 60 | MOTIF 1.6 AGACC 61 | 62 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 63 | 1.000000 0.000000 0.000000 0.000000 64 | 0.000000 0.000000 1.000000 0.000000 65 | 1.000000 0.000000 0.000000 0.000000 66 | 0.000000 1.000000 0.000000 0.000000 67 | 0.000000 1.000000 0.000000 0.000000 68 | 69 | 70 | MOTIF 1.7 AGACH 71 | 72 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 73 | 1.000000 0.000000 0.000000 0.000000 74 | 0.000000 
0.000000 1.000000 0.000000 75 | 1.000000 0.000000 0.000000 0.000000 76 | 0.000000 1.000000 0.000000 0.000000 77 | 0.333333 0.333333 0.000000 0.333333 78 | 79 | 80 | MOTIF 1.8 AGACT 81 | 82 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 83 | 1.000000 0.000000 0.000000 0.000000 84 | 0.000000 0.000000 1.000000 0.000000 85 | 1.000000 0.000000 0.000000 0.000000 86 | 0.000000 1.000000 0.000000 0.000000 87 | 0.000000 0.000000 0.000000 1.000000 88 | 89 | 90 | MOTIF 1.9 ARACA 91 | 92 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 93 | 1.000000 0.000000 0.000000 0.000000 94 | 0.500000 0.000000 0.500000 0.000000 95 | 1.000000 0.000000 0.000000 0.000000 96 | 0.000000 1.000000 0.000000 0.000000 97 | 1.000000 0.000000 0.000000 0.000000 98 | 99 | 100 | MOTIF 1.10 ARACC 101 | 102 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 103 | 1.000000 0.000000 0.000000 0.000000 104 | 0.500000 0.000000 0.500000 0.000000 105 | 1.000000 0.000000 0.000000 0.000000 106 | 0.000000 1.000000 0.000000 0.000000 107 | 0.000000 1.000000 0.000000 0.000000 108 | 109 | 110 | MOTIF 1.11 ARACH 111 | 112 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 113 | 1.000000 0.000000 0.000000 0.000000 114 | 0.500000 0.000000 0.500000 0.000000 115 | 1.000000 0.000000 0.000000 0.000000 116 | 0.000000 1.000000 0.000000 0.000000 117 | 0.333333 0.333333 0.000000 0.333333 118 | 119 | 120 | MOTIF 1.12 ARACT 121 | 122 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 123 | 1.000000 0.000000 0.000000 0.000000 124 | 0.500000 0.000000 0.500000 0.000000 125 | 1.000000 0.000000 0.000000 0.000000 126 | 0.000000 1.000000 0.000000 0.000000 127 | 0.000000 0.000000 0.000000 1.000000 128 | 129 | 130 | MOTIF 1.13 GAACA 131 | 132 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 133 | 0.000000 0.000000 1.000000 0.000000 134 | 1.000000 0.000000 0.000000 0.000000 135 | 1.000000 0.000000 0.000000 0.000000 136 | 0.000000 1.000000 0.000000 0.000000 137 | 1.000000 0.000000 0.000000 0.000000 138 | 139 | 140 | MOTIF 1.14 GAACC 141 | 142 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 143 | 0.000000 0.000000 1.000000 0.000000 144 | 1.000000 0.000000 0.000000 0.000000 145 | 1.000000 0.000000 0.000000 0.000000 146 | 0.000000 1.000000 0.000000 0.000000 147 | 0.000000 1.000000 0.000000 0.000000 148 | 149 | 150 | MOTIF 1.15 GAACH 151 | 152 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 153 | 0.000000 0.000000 1.000000 0.000000 154 | 1.000000 0.000000 0.000000 0.000000 155 | 1.000000 0.000000 0.000000 0.000000 156 | 0.000000 1.000000 0.000000 0.000000 157 | 0.333333 0.333333 0.000000 0.333333 158 | 159 | 160 | MOTIF 1.16 GAACT 161 | 162 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 163 | 0.000000 0.000000 1.000000 0.000000 164 | 1.000000 0.000000 0.000000 0.000000 165 | 1.000000 0.000000 0.000000 0.000000 166 | 0.000000 1.000000 0.000000 0.000000 167 | 0.000000 0.000000 0.000000 1.000000 168 | 169 | 170 | MOTIF 1.17 GGACA 171 | 172 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 173 | 0.000000 0.000000 1.000000 0.000000 174 | 0.000000 0.000000 1.000000 0.000000 175 | 1.000000 0.000000 0.000000 0.000000 176 | 0.000000 1.000000 0.000000 0.000000 177 | 1.000000 0.000000 0.000000 0.000000 178 | 179 | 180 | MOTIF 1.18 GGACC 181 | 182 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 183 | 0.000000 0.000000 1.000000 0.000000 184 | 0.000000 0.000000 1.000000 0.000000 185 | 
1.000000 0.000000 0.000000 0.000000 186 | 0.000000 1.000000 0.000000 0.000000 187 | 0.000000 1.000000 0.000000 0.000000 188 | 189 | 190 | MOTIF 1.19 GGACH 191 | 192 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 193 | 0.000000 0.000000 1.000000 0.000000 194 | 0.000000 0.000000 1.000000 0.000000 195 | 1.000000 0.000000 0.000000 0.000000 196 | 0.000000 1.000000 0.000000 0.000000 197 | 0.333333 0.333333 0.000000 0.333333 198 | 199 | 200 | MOTIF 1.20 GGACT 201 | 202 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 203 | 0.000000 0.000000 1.000000 0.000000 204 | 0.000000 0.000000 1.000000 0.000000 205 | 1.000000 0.000000 0.000000 0.000000 206 | 0.000000 1.000000 0.000000 0.000000 207 | 0.000000 0.000000 0.000000 1.000000 208 | 209 | 210 | MOTIF 1.21 GRACA 211 | 212 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 213 | 0.000000 0.000000 1.000000 0.000000 214 | 0.500000 0.000000 0.500000 0.000000 215 | 1.000000 0.000000 0.000000 0.000000 216 | 0.000000 1.000000 0.000000 0.000000 217 | 1.000000 0.000000 0.000000 0.000000 218 | 219 | 220 | MOTIF 1.22 GRACC 221 | 222 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 223 | 0.000000 0.000000 1.000000 0.000000 224 | 0.500000 0.000000 0.500000 0.000000 225 | 1.000000 0.000000 0.000000 0.000000 226 | 0.000000 1.000000 0.000000 0.000000 227 | 0.000000 1.000000 0.000000 0.000000 228 | 229 | 230 | MOTIF 1.23 GRACH 231 | 232 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 233 | 0.000000 0.000000 1.000000 0.000000 234 | 0.500000 0.000000 0.500000 0.000000 235 | 1.000000 0.000000 0.000000 0.000000 236 | 0.000000 1.000000 0.000000 0.000000 237 | 0.333333 0.333333 0.000000 0.333333 238 | 239 | 240 | MOTIF 1.24 GRACT 241 | 242 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 243 | 0.000000 0.000000 1.000000 0.000000 244 | 0.500000 0.000000 0.500000 0.000000 245 | 1.000000 0.000000 0.000000 0.000000 246 | 0.000000 1.000000 0.000000 0.000000 247 | 0.000000 0.000000 0.000000 1.000000 248 | 249 | 250 | MOTIF 1.25 RAACA 251 | 252 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 253 | 0.500000 0.000000 0.500000 0.000000 254 | 1.000000 0.000000 0.000000 0.000000 255 | 1.000000 0.000000 0.000000 0.000000 256 | 0.000000 1.000000 0.000000 0.000000 257 | 1.000000 0.000000 0.000000 0.000000 258 | 259 | 260 | MOTIF 1.26 RAACC 261 | 262 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 263 | 0.500000 0.000000 0.500000 0.000000 264 | 1.000000 0.000000 0.000000 0.000000 265 | 1.000000 0.000000 0.000000 0.000000 266 | 0.000000 1.000000 0.000000 0.000000 267 | 0.000000 1.000000 0.000000 0.000000 268 | 269 | 270 | MOTIF 1.27 RAACH 271 | 272 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 273 | 0.500000 0.000000 0.500000 0.000000 274 | 1.000000 0.000000 0.000000 0.000000 275 | 1.000000 0.000000 0.000000 0.000000 276 | 0.000000 1.000000 0.000000 0.000000 277 | 0.333333 0.333333 0.000000 0.333333 278 | 279 | 280 | MOTIF 1.28 RAACT 281 | 282 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 283 | 0.500000 0.000000 0.500000 0.000000 284 | 1.000000 0.000000 0.000000 0.000000 285 | 1.000000 0.000000 0.000000 0.000000 286 | 0.000000 1.000000 0.000000 0.000000 287 | 0.000000 0.000000 0.000000 1.000000 288 | 289 | 290 | MOTIF 1.29 RGACA 291 | 292 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000 293 | 0.500000 0.000000 0.500000 0.000000 294 | 0.000000 0.000000 1.000000 0.000000 295 | 1.000000 
0.000000 0.000000 0.000000
296 | 0.000000 1.000000 0.000000 0.000000
297 | 1.000000 0.000000 0.000000 0.000000
298 | 
299 | 
300 | MOTIF 1.30 RGACC
301 | 
302 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000
303 | 0.500000 0.000000 0.500000 0.000000
304 | 0.000000 0.000000 1.000000 0.000000
305 | 1.000000 0.000000 0.000000 0.000000
306 | 0.000000 1.000000 0.000000 0.000000
307 | 0.000000 1.000000 0.000000 0.000000
308 | 
309 | 
310 | MOTIF 1.31 RGACH
311 | 
312 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000
313 | 0.500000 0.000000 0.500000 0.000000
314 | 0.000000 0.000000 1.000000 0.000000
315 | 1.000000 0.000000 0.000000 0.000000
316 | 0.000000 1.000000 0.000000 0.000000
317 | 0.333333 0.333333 0.000000 0.333333
318 | 
319 | 
320 | MOTIF 1.32 RGACT
321 | 
322 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000
323 | 0.500000 0.000000 0.500000 0.000000
324 | 0.000000 0.000000 1.000000 0.000000
325 | 1.000000 0.000000 0.000000 0.000000
326 | 0.000000 1.000000 0.000000 0.000000
327 | 0.000000 0.000000 0.000000 1.000000
328 | 
329 | 
330 | MOTIF 1.33 RRACA
331 | 
332 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000
333 | 0.500000 0.000000 0.500000 0.000000
334 | 0.500000 0.000000 0.500000 0.000000
335 | 1.000000 0.000000 0.000000 0.000000
336 | 0.000000 1.000000 0.000000 0.000000
337 | 1.000000 0.000000 0.000000 0.000000
338 | 
339 | 
340 | MOTIF 1.34 RRACC
341 | 
342 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000
343 | 0.500000 0.000000 0.500000 0.000000
344 | 0.500000 0.000000 0.500000 0.000000
345 | 1.000000 0.000000 0.000000 0.000000
346 | 0.000000 1.000000 0.000000 0.000000
347 | 0.000000 1.000000 0.000000 0.000000
348 | 
349 | 
350 | MOTIF 1.35 RRACH
351 | 
352 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000
353 | 0.500000 0.000000 0.500000 0.000000
354 | 0.500000 0.000000 0.500000 0.000000
355 | 1.000000 0.000000 0.000000 0.000000
356 | 0.000000 1.000000 0.000000 0.000000
357 | 0.333333 0.333333 0.000000 0.333333
358 | 
359 | 
360 | MOTIF 1.36 RRACT
361 | 
362 | letter-probability matrix: alength= 4 w= 5 nsites= 6 E= 0.0e+000
363 | 0.500000 0.000000 0.500000 0.000000
364 | 0.500000 0.000000 0.500000 0.000000
365 | 1.000000 0.000000 0.000000 0.000000
366 | 0.000000 1.000000 0.000000 0.000000
367 | 0.000000 0.000000 0.000000 1.000000
368 | 
--------------------------------------------------------------------------------
/bin/DiffReport.R:
--------------------------------------------------------------------------------
1 | #!/bin/Rscript
2 | ## Rscript DiffReport.R <m6APipe_RData> <output_RData>
3 | ### m6APipe.data: arranged results file (*.m6APipe) produced by arranged_results.R
4 | ### compare_str: Comparison design (eg: A_vs_B)
5 | args <- commandArgs(T)
6 | #args <- c("macs2_MeTDiff_DESeq2_arranged_results_2019-12-11.m6APipe", "DiffReport.RData")
7 | m6APipe.data <- args[1] # the arranged-results RData file (*.m6APipe)
8 | output.Rdata <- args[2] # name of the output RData file
9 | 
10 | library(pheatmap)
11 | library(ggplot2)
12 | library(ggrepel)
13 | library(grid)
14 | library(reshape2)
15 | load(m6APipe.data)
16 | 
17 | draw_colnames_90 <- function (coln, ...) {
18 | m = length(coln)
19 | x = (1:m)/m - 1/2/m
20 | grid.text(coln, x = x, y = unit(0.96, "npc"), vjust = .5,
21 | hjust = 1, rot = 90, gp = gpar(...)) ## note: the defaults are 'hjust=0' and 'rot=270'
22 | }
23 | assignInNamespace(x="draw_colnames", value="draw_colnames_90",
24 | ns=asNamespace("pheatmap"))
25 | 
26 | heatmap_dm <- function(mat,coldt){
27 | pheatmap(mat, cluster_rows=FALSE, show_rownames=F, cluster_cols=FALSE, annotation_col=coldt,
28 | main = "Heatmap of Differential Methylation", scale = "row")
29 | }
30 | 
31 | heatmap_de <- function(mat, coldt){
32 | pheatmap(mat, cluster_rows=FALSE, show_rownames=F, cluster_cols=FALSE, annotation_col=coldt,
33 | color = colorRampPalette(c(rep('#1C2B6F',1),'black', rep('#E31E26',1)))(50),
34 | main = "Heatmap of Differential Expression", scale = "row")
35 | }
36 | 
37 | ECDF_plot <- function(df,value_var,group,plot_title="",test_result=""){
38 | if (test_result !=""){
39 | if (test_result$p.value <=0){
40 | test_anno = paste(test_result$method,"\n",names(test_result$statistic)," = ",signif(test_result$statistic, 3),
41 | "\nP Value < 2.2e-16",sep = "")
42 | } else {
43 | test_anno = paste(test_result$method,"\n",names(test_result$statistic)," = ",signif(test_result$statistic, 3),
44 | "\nP Value= ",signif(test_result$p.value,3),sep = "")
45 | }
46 | }else{
47 | test_anno = ""
48 | }
49 | p <- ggplot(df,aes(x=value_var,group=group,color=group))+theme_test()+
50 | stat_ecdf(size = 1)+theme(legend.position=c(0.85,.15))+
51 | annotate("text",x=-Inf,y=Inf,vjust=1.5,hjust=-.12,label=test_anno)+
52 | scale_y_continuous(expand = c(0,0))+scale_x_continuous(expand = c(0,0),limits = c(0,1.02))+
53 | labs(title= plot_title, y="Cumulative fraction" , x = "Peak intensity")+
54 | theme(plot.title = element_text(size = 15, angle = 0, face = "plain", colour = "black"),
55 | axis.title.x = element_text(size = 15, angle = 0, face = "plain", colour = "black"),
56 | axis.title.y = element_text(size = 15, angle = 90, face = "plain", colour = "black"),
57 | axis.text.x = element_text(size = 15, angle = 0, face = "plain", colour = "black"),
58 | axis.text.y = element_text(size = 15, angle = 0, face = "plain", colour = "black"))
59 | return(p)
60 | }
61 | 
62 | volcano_plot_dm = function(res, Sample_1 = "A", Sample_2 = "B", lfc = 0.58, pval = 0.05, groupname = ""){
63 | par(mar = c(5, 6, 5, 5))
64 | tab = data.frame(logFC = res$log2FC, negLogPval = -log10(res$pvalue))
65 | tab$gene_name = rownames(res)
66 | tab = na.omit(tab)
67 | tab$threshold <- "C"
68 | tab$threshold[tab$logFC >= lfc & tab$negLogPval > -log10(pval)] <- "B"
69 | tab$threshold[tab$logFC <=-lfc & tab$negLogPval > -log10(pval)] <- "A"
70 | #tab<-tab%>%mutate(threshold = ifelse(logFC >= lfc & negLogPval > -log10(pval) ,"B", ifelse(logFC<=-lfc & negLogPval > -log10(pval), "A", "C")))
71 | n_up = length(which(tab$threshold=="B"))
72 | n_down = length(which(tab$threshold=="A"))
73 | tab_order = tab[order(tab$negLogPval, decreasing = T),]
74 | ggplot(tab_order, aes(x=logFC, y=negLogPval)) +
75 | geom_point(aes(colour = threshold)) +
76 | scale_colour_manual(values = c("A"= "#619cff", "B"="#f8766d", "C"= "#c8c8c8"),
77 | labels=c(paste("Down: ", n_down, sep=""),paste("Up: ", n_up, sep = "") , "No sig"), name = NULL) +
78 | geom_hline(aes(yintercept=-log10(pval)), linetype="dashed") +
79 | geom_vline(aes(xintercept=-lfc), linetype="dashed") +
80 | geom_vline(aes(xintercept=lfc), linetype="dashed") +
81 | ggtitle(paste("Volcano Plot of Differential Methylation in", groupname))+
82 | xlab(expression(paste(Log[2], " fold change", sep = ""))) +
83 | ylab(expression(paste(-Log[10], " P value", sep = ""))) +
84 | theme_bw() +
85 | theme(legend.position = 'top',
86 | plot.title = element_text(hjust = 0.5))
87 | }
88 | 
89 | quadrant_plot <- function(quadrant.data, lfc = 0.58 , pval = 0.05, groupname = ""){
90 | quadrant.data$threshold <- "nosig"
91 | quadrant.data$threshold[quadrant.data$m6A >= lfc & quadrant.data$exp >= lfc &
92 | quadrant.data$m6A.p <= pval & quadrant.data$exp.p <= pval ] <- "Hyper-up"
93 | quadrant.data$threshold[quadrant.data$m6A >= lfc & quadrant.data$exp <= -lfc &
94 | quadrant.data$m6A.p <= pval & quadrant.data$exp.p <= pval ] <- "Hyper-down"
95 | quadrant.data$threshold[quadrant.data$m6A <= -lfc & quadrant.data$exp >= lfc &
96 | quadrant.data$m6A.p <= pval & quadrant.data$exp.p <= pval ] <- "Hypo-up"
97 | quadrant.data$threshold[quadrant.data$m6A <= -lfc & quadrant.data$exp <= -lfc &
98 | quadrant.data$m6A.p <= pval & quadrant.data$exp.p <= pval ] <- "Hypo-down"
99 | quadrant.data.length <- table(quadrant.data$threshold)
100 | quadrant.data <- na.omit(quadrant.data)
101 | quadrant.data$threshold <- factor(quadrant.data$threshold,levels = c("Hyper-up","Hyper-down","Hypo-up","Hypo-down","nosig"))
102 | ggplot()+
103 | geom_point(data = quadrant.data,
104 | aes_string(x= "exp" ,y="m6A", color="threshold"),size = 1)+
105 | geom_hline(yintercept = lfc,linetype="dashed")+
106 | geom_hline(yintercept = -lfc,linetype="dashed")+
107 | geom_vline(xintercept = lfc,linetype="dashed")+ylab("m6A log2 fold change")+
108 | geom_vline(xintercept = -lfc,linetype="dashed")+
109 | scale_x_continuous(limits = c(-5,5))+
110 | scale_y_continuous(limits = c(-5,5))+
111 | ggtitle(paste("Quadrant Plot between Methylation and Expression in", groupname))+
112 | scale_colour_manual(values = c("Hyper-up" = "#7DB9DE", "Hyper-down" = "#D75455",
113 | "Hypo-up" = "#7BA23F", "Hypo-down" = "#A35E47",
114 | "nosig"= "#c8c8c8" ),
115 | labels = c(paste("Hyper-up: ", quadrant.data.length["Hyper-up"], sep=""),
116 | paste("Hyper-down: ", quadrant.data.length["Hyper-down"], sep = ""),
117 | paste("Hypo-up: ", quadrant.data.length["Hypo-up"], sep=""),
118 | paste("Hypo-down: ", quadrant.data.length["Hypo-down"], sep = ""),
119 | "No sig"), name = NULL) +
120 | theme_classic() +
121 | theme(plot.title = element_text(hjust = 0.5))+
122 | geom_point(data = quadrant.data[quadrant.data$threshold!="nosig",],
123 | aes_string(x= "exp" ,y="m6A", color="threshold"),size = 1.5)
124 | 
125 | }
126 | matrixcluster <- function(matrixData, cluster_rows = TRUE, cluster_cols = TRUE, cmethod = "complete"){
127 | if(cluster_rows == TRUE){
128 | ht <- hclust(dist(matrixData), method = cmethod) # cluster the rows
129 | rowInd <- ht$order # store the clustered row order as rowInd
130 | }else{
131 | rowInd <- 1:nrow(matrixData)
132 | }
133 | 
134 | if(cluster_cols == TRUE){
135 | ht <- hclust(dist(t(matrixData)), method = cmethod) # transpose the matrix to cluster the original columns
136 | colInd <- ht$order # store the clustered column order as colInd
137 | }else{
138 | colInd <- 1:ncol(matrixData)
139 | }
140 | 
141 | matrixDataNew <-matrixData[rowInd,colInd] # reorder rows and columns according to the clustering result
142 | print(c("ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid"))
143 | return(matrixDataNew)
144 | }
145 | ## dm & de & ecdf
146 | heatmap_dm.list <- NULL
147 | heatmap_de.list <- NULL
148 | volcano_dm.list <- NULL
149 | ecdf.list <- NULL
150 | quadrant.list <- NULL
151 | ecdf.data <- NULL
152 | ecdf.group.data <- melt(m6a.anno.matrix,ID="PeakRegion")
153 | for( group in as.character(compare.list) ){
154 | group1 = strsplit(group, "_vs_")[[1]][1]
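## e.g. (hypothetical group names) a compare string "KO_vs_WT" splits into group1 = "KO" and group2 = "WT"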
155 | group2 = strsplit(group, "_vs_")[[1]][2] 156 | coldata = subset(as.data.frame(design.matrix), Type==group1|Type==group2) 157 | coldata$Type = as.factor(coldata$Type) 158 | ## dm 159 | dmres = diffm6A.list[[which(names(diffm6A.list)==group)]] 160 | dmg = subset(dmres, abs(log2FC) > 0.58 & pvalue < 0.05) 161 | if (nrow(dmg)<=1) dmg = dmres 162 | matrix.dm = m6a.anno.matrix[,-c(1:3)] 163 | dm_mat = matrix.dm[dmg$PeakRegion,rownames(coldata)] 164 | select <- dmg[order(dmg$log2FC, decreasing = TRUE),] 165 | dm_mat = log2(dm_mat+1) 166 | dm_mat = dm_mat[select$PeakRegion,] 167 | dm_mat = na.omit(dm_mat) 168 | dm_mat_new <- matrixcluster(dm_mat,cmethod = "single") 169 | heatmap_dm.list[[group]] <- heatmap_dm(dm_mat_new,coldata) 170 | ## de 171 | deres = diffexpression.list[[which(names(diffexpression.list)==group)]] 172 | deg = subset(deres, abs(log2FoldChange)> 0.58 & pvalue < 0.05) 173 | if (nrow(deg)<=1) deg = deres 174 | rownames(deg) = deg$ID 175 | de_mat = expression.matrix[row.names(deg),rownames(coldata)] 176 | select <- deg[order(deg$log2FoldChange, decreasing = TRUE), ] 177 | de_mat = log2(de_mat+1) 178 | de_mat = de_mat[rownames(select),] 179 | de_mat = na.omit(de_mat) 180 | de_mat_new <- matrixcluster(de_mat,cmethod = "single") 181 | heatmap_de.list[[group]] <- heatmap_de(de_mat_new,coldata) 182 | 183 | ## ecdf 184 | ecdf.group.data.tmp = subset(ecdf.group.data,variable %in% rownames(coldata)) 185 | ecdf.group.data.tmp$group <- group1 186 | ecdf.group.data.tmp$group[ecdf.group.data.tmp$variable %in% rownames(coldata)[coldata$Type == group2]] <- group2 187 | #ecdf.group.data.tmp <- ecdf.group.data.tmp%>%mutate(group = ifelse(variable %in% rownames(coldata)[coldata$Type == group1], group1, group2)) 188 | ecdf.group.data.tmp <- na.omit(ecdf.group.data.tmp) 189 | ecdf.list[[group]] <- ECDF_plot(ecdf.group.data.tmp,ecdf.group.data.tmp$value,ecdf.group.data.tmp$group) 190 | ecdf.data <- rbind(ecdf.data,data.frame(data = diffm6A.list[[group]]$log2FC , group = group)) 191 | ## volcano plot 192 | volcano_dm.list[[group]] <- volcano_plot_dm(diffm6A.list[[group]],groupname = group) 193 | ## quadrant plot 194 | diffm6a.results <- diffm6A.list[[group]] 195 | rownames(diffm6a.results) <- diffm6a.results$PeakRegion 196 | diffexp.results <- diffexpression.list[[group]] 197 | rownames(diffexp.results) <- diffexp.results$ID 198 | quadrant.data <- data.frame(row.names = rownames(diffm6a.results), 199 | m6A = diffm6a.results$log2FC, 200 | m6A.p = diffm6a.results$pvalue, 201 | exp = diffexp.results[diffm6a.results$ID,"log2FoldChange"], 202 | exp.p = diffexp.results[diffm6a.results$ID,"pvalue"]) 203 | quadrant.list[[group]] <-quadrant_plot(quadrant.data,lfc = 0.58,pval = 0.05,groupname = group) 204 | 205 | ## plot 206 | pdf(file = paste0("heatmap_dm_",group,".pdf"),paper = "USr") 207 | print(heatmap_dm.list[[group]]) 208 | dev.off() 209 | pdf(file = paste0("heatmap_de_",group,".pdf"),paper = "USr") 210 | print(heatmap_de.list[[group]]) 211 | dev.off() 212 | pdf(file = paste0("volcano_dm_",group,".pdf"),paper = "USr") 213 | print(volcano_dm.list[[group]]) 214 | dev.off() 215 | pdf(file = paste0("ecdf_",group,".pdf"),paper = "USr") 216 | print(ecdf.list[[group]]) 217 | dev.off() 218 | pdf(file = paste0("quadrant_",group,".pdf"),paper = "USr") 219 | print(quadrant.list[[group]]) 220 | dev.off() 221 | } 222 | ecdf.data <- na.omit(ecdf.data) 223 | ecdf.data <- ecdf.data[is.finite(ecdf.data$data),] 224 | ecdf.list[["combined"]] <- ECDF_plot(ecdf.data,ecdf.data$data,ecdf.data$group) 225 | pdf(file = 
paste0("ecdf_","combined.pdf"),paper = "USr")
226 | print(ecdf.list[["combined"]])
227 | dev.off()
228 | save(design.matrix,compare.list,heatmap_dm.list,heatmap_de.list,volcano_dm.list,ecdf.list,quadrant.list,file = output.Rdata)
229 | 
--------------------------------------------------------------------------------
/bin/m6A_annotate_forGTF_xingyang2.pl:
--------------------------------------------------------------------------------
1 | #! /usr/bin/perl -w
2 | #perl m6A_annotate_forGTF.pl /data1/database/hg38/GENCODE/gencode.v25.annotation.gtf macs2/merged_Peak.bed macs2/merged_Peak
3 | 
4 | use strict;
5 | use warnings;
6 | use FindBin qw($Bin);
7 | 
8 | if (@ARGV < 3) {
9 | print "
10 | usage: perl m6A_annotate_forGTF.pl <ref_gene_gtf> <peak_bed> <outPrefix>
11 | the program will create multiple files: .center .anno.txt .unanno.txt ...
12 | \n";
13 | exit;
14 | }
15 | 
16 | 
17 | my ($ref_gene_gtf, $peak_bed, $outPrefix) = @ARGV;
18 | 
19 | #make a gene type list
20 | my %GeneType; #see "1) GENES" in ftp://ftp.sanger.ac.uk/pub/gencode/_README_stats.txt and https://www.gencodegenes.org/stats/current.html
21 | foreach (qw (protein_coding)) {
22 | $GeneType{"$_"} = "mRNA"; }
23 | foreach (qw (3prime_overlapping_ncRNA antisense bidirectional_promoter_lncRNA known_ncrna lincRNA macro_lncRNA non_coding nonsense_mediated_decay non_stop_decay processed_transcript retained_intron sense_intronic sense_overlapping)) {
24 | $GeneType{"$_"} = "Long non-coding RNA"; }
25 | foreach (qw (miRNA misc_RNA Mt_rRNA Mt_tRNA ribozyme rRNA scaRNA scRNA snoRNA snRNA sRNA vaultRNA)) {
26 | $GeneType{"$_"} = "Others"; }
27 | foreach (qw (pseudogene polymorphic_pseudogene processed_pseudogene transcribed_processed_pseudogene transcribed_unitary_pseudogene transcribed_unprocessed_pseudogene unitary_pseudogene unprocessed_pseudogene)) {
28 | $GeneType{"$_"} = "Pseudogene"; }
29 | foreach (qw (IG_C_gene IG_D_gene IG_J_gene IG_V_gene IG_pseudogene IG_C_pseudogene IG_J_pseudogene IG_V_pseudogene TR_C_gene TR_D_gene TR_J_gene TR_V_gene TR_J_pseudogene TR_V_pseudogene processed_transcript TEC)) {
30 | $GeneType{"$_"} = "Others"; }
31 | 
32 | `awk '{print \$1"\t"int((\$2+\$3)/2)"\t"int((\$2+\$3)/2)+1"\t"\$1":"\$2"-"\$3}' $peak_bed > $outPrefix.peak_bed.center`;
33 | 
34 | #read all genes of the reference and save each transcript's information (exon & CDS).
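# Sketch of the layout built below (the gene/transcript IDs in this example are hypothetical):
#   $RefAllGene{$gene_id}{$transcript_id} holds chr/start/end/strand, gene_name,
#   transcript_type and the mapped Gene_Type, while its exon/CDS entries collect
#   [start, end, length] triples, e.g.
#   push @{$RefAllGene{"ENSG00000000001"}{"ENST00000000001"}{exon}}, [100, 200, 101];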
35 | my (%RefAllGene, %RefPickTran, %RefPickTran_final);
36 | open IN, $ref_gene_gtf or die;
37 | open RBED,">$outPrefix.tmp.refSeq.bed" or die;
38 | while (<IN>) {
39 | chomp;
40 | next if (/^\s*$|^\#/);
41 | my @w = split (/\t/);
42 | next if (@w < 9);
43 | my ($chr, $ftype, $start, $end, $strand, $features) = @w[0,2,3,4,6,8];
44 | next if ($ftype !~ /^exon$|^CDS$|^transcript$/);
45 | my ($gene_id, $transcript_id) = ("") x 2;
46 | $gene_id = $1 if ($features =~ /\bgene_id\s+\"([^\"]+)\";/);
47 | $transcript_id = $1 if ($features =~ /\btranscript_id\s+\"([^\"]+)\";/);
48 | if ($ftype eq "transcript") {
49 | my ($gene_name, $transcript_type) = ("") x 2;
50 | $gene_name = $1 if ($features =~ /\bgene_name\s+\"([^\"]+)\";/);
51 | $transcript_type = $1 if ($features =~ /\btranscript_type\s+\"([^\"]+)\";/);
52 | $RefAllGene{$gene_id}{$transcript_id}{chr} = $chr;
53 | $RefAllGene{$gene_id}{$transcript_id}{start} = $start;
54 | $RefAllGene{$gene_id}{$transcript_id}{end} = $end;
55 | $RefAllGene{$gene_id}{$transcript_id}{strand} = $strand;
56 | $RefAllGene{$gene_id}{$transcript_id}{gene_id} = $gene_id;
57 | $RefAllGene{$gene_id}{$transcript_id}{gene_name} = $gene_name;
58 | $RefAllGene{$gene_id}{$transcript_id}{transcript_type} = $transcript_type;
59 | $RefAllGene{$gene_id}{$transcript_id}{Gene_Type} = (exists $GeneType{$transcript_type}) ? $GeneType{$transcript_type} : "Unknown";
60 | print RBED "$RefAllGene{$gene_id}{$transcript_id}{chr}\t$RefAllGene{$gene_id}{$transcript_id}{start}\t$RefAllGene{$gene_id}{$transcript_id}{end}\t$transcript_id\t0\t$RefAllGene{$gene_id}{$transcript_id}{strand}\t$gene_id\n";
61 | }
62 | else { #$ftype =~ /^exon$|^CDS$/
63 | push @{$RefAllGene{$gene_id}{$transcript_id}{$ftype}}, [$start, $end, $end-$start+1];
64 | }
65 | }
66 | close RBED;
67 | close IN;
68 | 
69 | `intersectBed -a $outPrefix.peak_bed.center -b $outPrefix.tmp.refSeq.bed -wa -wb > $outPrefix.refSeq.all.bed`;
70 | 
71 | #delete exon if CDS exists. delete the shorter transcript if there are multiple transcripts in a gene.
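# In the selection below, every transcript overlapping a peak is scored by the
# summed length of its CDS records (falling back to exons for noncoding
# transcripts); CDS-bearing transcripts take priority over exon-only ones, and
# within the same feature type the transcript with the largest summed length wins.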
72 | open FH,"$outPrefix.refSeq.all.bed" or die;
73 | while (<FH>) {
74 | chomp;
75 | my @fields = split "\t";
76 | my ($peakid,$overlap_chr,$overlap_start,$overlap_end,$overlap_transid,$overlap_strand,$overlap_gene_id,$ppos) = @fields[3,4,5,6,7,9,10,1];
77 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{chr} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{chr};
78 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{start} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{start};
79 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{end} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{end};
80 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{strand} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{strand};
81 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{gene_id} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{gene_id};
82 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{gene_name} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{gene_name};
83 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{transcript_type} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{transcript_type};
84 | #$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{Gene_Type} = $RefAllGene{$overlap_gene_id}{$overlap_transid}{Gene_Type};
85 | %{$RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}} = %{$RefAllGene{$overlap_gene_id}{$overlap_transid}};
86 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{chr} = $overlap_chr;
87 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{center_start} = $ppos;
88 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{center_end} = $ppos+1;
89 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{chr_trans} = $overlap_chr;
90 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{trans_start} = $overlap_start;
91 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{trans_end} = $overlap_end;
92 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{fuck_peakid} = $peakid;
93 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{zero} = 0;
94 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{strand} = $overlap_strand;
95 | $RefPickTran{$peakid}{$overlap_gene_id}{$overlap_transid}{ppos} = $ppos;
96 | }
97 | print "\n";
98 | foreach my $peakid (keys %RefPickTran) {
99 | my ($cur_array, $cur_total_len, $cur_ftype);
100 | my ($longest_transcript_id, $longest_sum_len, $ftype, $longest_gene_id) = ("", 0, "","");
101 | foreach my $gene_id_temp (keys %{$RefPickTran{$peakid}}){
102 | foreach my $transcript_id_temp (keys %{$RefPickTran{$peakid}{$gene_id_temp}}){ $cur_total_len = 0; # reset per transcript, otherwise lengths accumulate across transcripts
103 | if (exists $RefPickTran{$peakid}{$gene_id_temp}{$transcript_id_temp}{CDS}) {
104 | $cur_array = $RefPickTran{$peakid}{$gene_id_temp}{$transcript_id_temp}{CDS};
105 | $cur_ftype = "CDS";
106 | } else {
107 | $cur_array = $RefPickTran{$peakid}{$gene_id_temp}{$transcript_id_temp}{exon};
108 | $cur_ftype = "exon";
109 | }
110 | foreach (@{$cur_array}) {
111 | $cur_total_len += $_->[2];
112 | }
113 | if ($cur_ftype eq $ftype and $cur_total_len > $longest_sum_len) {
114 | $longest_transcript_id = $transcript_id_temp;
115 | $longest_gene_id = $gene_id_temp;
116 | $longest_sum_len = $cur_total_len;
117 | $ftype = $cur_ftype;
118 | } else {
119 | if($cur_ftype ne $ftype and $cur_ftype eq "CDS"){
120 | $longest_transcript_id = $transcript_id_temp;
121 | $longest_gene_id = $gene_id_temp;
122 | $longest_sum_len = $cur_total_len;
123 | $ftype = $cur_ftype;
124 | } else {
125 | 
if($cur_ftype eq "exon" and $ftype eq ""){ 126 | $longest_transcript_id = $transcript_id_temp; 127 | $longest_gene_id = $gene_id_temp; 128 | $longest_sum_len = $cur_total_len; 129 | $ftype = $cur_ftype; 130 | } 131 | } 132 | } 133 | } 134 | } 135 | $RefPickTran_final{$peakid}{$longest_transcript_id} = $RefPickTran{$peakid}{$longest_gene_id}{$longest_transcript_id}; 136 | $RefPickTran_final{$peakid}{$longest_transcript_id}{gene_type} = $ftype; 137 | @{$RefPickTran_final{$peakid}{$longest_transcript_id}{exon}} = sort {$a->[0] <=> $b->[0]} @{$RefPickTran_final{$peakid}{$longest_transcript_id}{exon}}; 138 | if ($ftype eq "CDS") { 139 | @{$RefPickTran_final{$peakid}{$longest_transcript_id}{CDS}} = sort {$a->[0] <=> $b->[0]} @{$RefPickTran_final{$peakid}{$longest_transcript_id}{CDS}}; 140 | my ($total_tran_len, $total_cds_len, $utr5_len, $utr3_len) = (0, 0, 0, 0); 141 | my ($cds_start, $cds_end) = ($RefPickTran_final{$peakid}{$longest_transcript_id}{CDS}->[0][0], $RefPickTran_final{$peakid}{$longest_transcript_id}{CDS}->[-1][1]); 142 | foreach (@{$RefPickTran_final{$peakid}{$longest_transcript_id}{CDS}}) { 143 | $total_cds_len += $_->[2]; 144 | } 145 | foreach (@{$RefPickTran_final{$peakid}{$longest_transcript_id}{exon}}) { 146 | $total_tran_len += $_->[2]; 147 | my ($cur_start, $cur_end, $cur_len) = @{$_}; 148 | if ($cds_start > $cur_start) { 149 | if ($cds_start <= $cur_end) { $utr5_len += $cds_start - $cur_start; } 150 | else { $utr5_len += $cur_len; } 151 | } 152 | if ($cds_end < $cur_end) { 153 | if ($cds_end >= $cur_start) { $utr3_len += $cur_end - $cds_end; } 154 | else { $utr3_len += $cur_len; } 155 | } 156 | } 157 | $RefPickTran_final{$peakid}{$longest_transcript_id}{total_tran_len} = $total_tran_len; 158 | $RefPickTran_final{$peakid}{$longest_transcript_id}{total_cds_len} = $total_cds_len; 159 | $RefPickTran_final{$peakid}{$longest_transcript_id}{utr5_len} = $utr5_len; 160 | $RefPickTran_final{$peakid}{$longest_transcript_id}{utr3_len} = $utr3_len; 161 | } 162 | else { #$ftype eq "exon", that is not coding RNA 163 | my $total_tran_len; 164 | foreach (@{$RefPickTran_final{$peakid}{$longest_transcript_id}{exon}}) { $total_tran_len += $_->[2]; } 165 | $RefPickTran_final{$peakid}{$longest_transcript_id}{total_tran_len} = $total_tran_len; 166 | $RefPickTran_final{$peakid}{$longest_transcript_id}{total_cds_len} = 0; 167 | $RefPickTran_final{$peakid}{$longest_transcript_id}{utr5_len} = 0; 168 | $RefPickTran_final{$peakid}{$longest_transcript_id}{utr3_len} = 0; 169 | } 170 | } 171 | 172 | sub sort_2dArray { 173 | my $in_arr = $_[0]; 174 | my @out_arr = sort {$a->[0] <=> $b->[0]} @{$in_arr}; 175 | return \@out_arr; 176 | } 177 | 178 | 179 | #annotate 180 | my %p2t; 181 | foreach my $peak_id (keys %RefPickTran_final){ 182 | my $tran_id = (keys %{$RefPickTran_final{$peak_id}})[0]; 183 | my $ppos = $RefPickTran_final{$peak_id}{$tran_id}{ppos}; 184 | if(exists $p2t{$peak_id}){ 185 | if($p2t{$peak_id}{cdslen}==0){ 186 | if($RefPickTran_final{$peak_id}{$tran_id}{total_cds_len}<=$p2t{$peak_id}{cdslen}){ 187 | next; 188 | }else{ 189 | if($RefPickTran_final{$peak_id}{$tran_id}{total_tran_len}<=$p2t{$peak_id}{tlen}){ 190 | next; 191 | } 192 | } 193 | }else{ 194 | if($RefPickTran_final{$peak_id}{$tran_id}{total_cds_len}<=$p2t{$peak_id}{cdslen}){ 195 | next; 196 | } 197 | } 198 | } 199 | $p2t{$peak_id}{cdslen}= $RefPickTran_final{$peak_id}{$tran_id}{total_cds_len}; #$cdslen{$tran_id}; 200 | $p2t{$peak_id}{tlen}=$RefPickTran_final{$peak_id}{$tran_id}{total_tran_len}; 201 | 
$p2t{$peak_id}{gene}=$RefPickTran_final{$peak_id}{$tran_id}{gene_name}; 202 | $p2t{$peak_id}{ts}=$tran_id; 203 | $p2t{$peak_id}{ppos}=$ppos; 204 | $p2t{$peak_id}{intersect}= $RefPickTran_final{$peak_id}{$tran_id}{chr}."\t".$RefPickTran_final{$peak_id}{$tran_id}{center_start}."\t".$RefPickTran_final{$peak_id}{$tran_id}{center_end}."\t".$peak_id."\t".$RefPickTran_final{$peak_id}{$tran_id}{chr_trans}."\t".$RefPickTran_final{$peak_id}{$tran_id}{trans_start}."\t".$RefPickTran_final{$peak_id}{$tran_id}{trans_end}."\t".$tran_id."\t".$RefPickTran_final{$peak_id}{$tran_id}{zero}."\t".$RefPickTran_final{$peak_id}{$tran_id}{strand}; 205 | } 206 | 207 | open OUT, ">$outPrefix.anno.txt" || die; 208 | foreach my $peak_id (keys %p2t){ 209 | my $ppos=$p2t{$peak_id}{ppos}; 210 | my $tran_id=$p2t{$peak_id}{ts}; 211 | my %Tran = %{$RefPickTran_final{$peak_id}{$tran_id}}; 212 | my ($bin, $exon_sum_len, $segtype) = (0, 0, ""); 213 | my $cstatus = ($Tran{gene_type} eq "CDS") ? "coding" : "noncoding"; 214 | my @cur_array= @{$Tran{exon}}; 215 | my ($cds_start, $cds_end) = ($Tran{CDS}->[0][0], $Tran{CDS}->[-1][1]); 216 | for (my $i=0; $i<=$#cur_array; $i++) { 217 | my ($exon_start, $exon_end, $exon_len) = @{$cur_array[$i]}; 218 | $exon_sum_len += $exon_len; 219 | if ($ppos >= $exon_start && $ppos <= $exon_end) { 220 | if ($cstatus eq "noncoding") { 221 | # $bin = int (($ppos - $exon_start) / $exon_len * 100); # $bin is the percentage of ppos in each exon. 222 | $bin = int (($exon_sum_len - ($exon_end - $ppos)) / $Tran{total_tran_len} * 100); # $bin is the percentage of ppos in each transcript. 223 | $segtype = "exon"; 224 | } 225 | else { #$cstatus eq "coding" 226 | if ($ppos < $cds_start) { 227 | if ($Tran{utr5_len} == 0) {print join ("\t", $peak_id, $cds_start, $cds_end, $ppos, "\n");} 228 | $bin = int (($exon_sum_len - ($exon_end - $ppos)) / $Tran{utr5_len} * 100); 229 | $segtype = ($Tran{strand} eq "+") ? "5UTR" : "3UTR"; 230 | }elsif ($ppos > $cds_end) { 231 | $bin = int (($Tran{total_tran_len} - $exon_sum_len + ($exon_end - $ppos)) / $Tran{utr3_len} * 100); 232 | $bin = 100 - $bin; 233 | $segtype = ($Tran{strand} eq "+") ? "3UTR" : "5UTR"; 234 | }else { 235 | $bin = int (($exon_sum_len - ($exon_end - $ppos) - $Tran{utr5_len}) / $Tran{total_cds_len} * 100); 236 | $segtype="CDS"; 237 | } 238 | } 239 | last; 240 | } 241 | else { 242 | if ($i < $#cur_array) {# isn't the last one 243 | my $next_exon_start = $cur_array[$i+1]->[0]; 244 | if ($ppos > $exon_end && $ppos < $next_exon_start) { 245 | $bin = int (($ppos - $exon_end) / ($next_exon_start - $exon_end) * 100); 246 | $segtype = "intron"; 247 | last; 248 | } 249 | } 250 | } 251 | } #end: for (my $i=0; $i<=$#cur_array; $i++) 252 | if ($segtype) { #can find peak in the transcript 253 | $bin = 100 - $bin if ($Tran{strand} eq "-"); 254 | print OUT $p2t{$peak_id}{intersect}, "\t", join ("\t", $Tran{gene_name}, $cstatus, $segtype, $bin, $Tran{gene_id}, $Tran{transcript_type}, $Tran{Gene_Type}), "\n"; 255 | } 256 | } 257 | close OUT; 258 | 259 | print `perl $Bin/intersec.pl -a $outPrefix.peak_bed.center -na 4 -b $outPrefix.anno.txt -nb 4 -t ua > $outPrefix.unanno.txt`; 260 | 261 | __END__ 262 | --------------------------------------------------------------------------------