├── LICENSE
├── README.md
├── bin
    ├── SlurmEasy
    ├── mggFilter
    ├── mggNucmer
    └── mggPlot
└── docs
    ├── docs.txt
    └── plot.png


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Gautier RICHARD
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # mummer-ggplot
  2 | Nucmer from Mummer3 followed by ggplot for nicer and more customisable genome alignment plots.
  3 | 
  4 | ## Installation
  5 | 
  6 | Install conda if it's not already installed, get your distribution here: https://docs.conda.io/en/latest/miniconda.html . For a user on a UNIX-based HPC/cluster, the following can be done, please answer 'yes' to every question. You need to close the current window and open a new one after the installation is done:
  7 | 
  8 | ```
  9 | cd ~
 10 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
 11 | chmod +x Miniconda3-latest-Linux-x86_64.sh
 12 | ./Miniconda3-latest-Linux-x86_64.sh
 13 | ```
 14 | 
 15 | Create a conda virtual environment for mummer-ggplot
 16 | 
 17 | ```
 18 | conda create -n mummer-ggplot -c bioconda -c conda-forge mummer=3.23 deepstats
 19 | ```
 20 | 
 21 | Clone this repository and add the bin folder to your PATH
 22 | 
 23 | ```
 24 | git clone https://github.com/gtrichard/mummer-ggplot/
 25 | cd mummer-ggplot/bin
 26 | mggpath=$(pwd)
 27 | to_export="export PATH=\$PATH:$mggpath"
 28 | echo "$to_export" >> ~/.bashrc
 29 | ```
 30 | 
 31 | To update the tool simply go to the cloned repository location and type `git pull`
 32 | 
 33 | mggNucmer uses SlurmEasy, taken from https://github.com/dpryan79/Misc/blob/master/MPIIE_internal/SlurmEasy . The shebang line must be changed to your python executable.
 34 | 
 35 | You may need to configure it for your cluster (mainly QUEUE line 22, QUEUES line 28 and MAINTEMP line 30).
 36 | 
 37 | ## Example output
 38 | 
 39 | ![plot](./docs/plot.png?raw=true "Example plot")
 40 | 
 41 | ## Tools
 42 | 
 43 | ### mggNucmer
 44 | 
 45 | ```
 46 | usage: mggNucmer --input  fastas/ --output deltas/ --reference ref_fasta --chromosomes chr1 chr2 chr3
 47 | mggNucmer launches nucmer from mummer3 on several chromosomes of multiple
 48 | genomes at once. It takes a folder with fasta files as input and outputs delta
 49 | files in a target folder (mummer format). This tool should be launched
 50 | directly, not with sbatch. It uses SlurmEasy (and thus the slurm job
 51 | scheduler) to launch jobs.
 52 | 
 53 | optional arguments:
 54 |   -h, --help            show this help message and exit
 55 |   --input FOLDER, -i FOLDER
 56 |                         Input folder with fasta files. All files finishing by
 57 |                         '.fa' or '.fasta' will be used. They must be named
 58 |                         using the following convention:
 59 |                         'genome_name.chromosome.fa', for instance 'dm3.2L.fa'.
 60 |   --reference FOLDER, -r FOLDER
 61 |                         Folder path containing all the reference genome '.fa'
 62 |                         or '.fasta' files. This reference will be aligned to
 63 |                         all fasta files stored in the input directory,
 64 |                         chromosome by chromosome (with matching names). Fasta
 65 |                         files must be named using the following convention:
 66 |                         'genome_name.chromosome.fa', for instance 'dm3.2L.fa'.
 67 |   --chromosomes CHROMOSOMES [CHROMOSOMES ...], -c CHROMOSOMES [CHROMOSOMES ...]
 68 |                         Here you can define the chromosome names you want the
 69 |                         program to run on. Simply put the chromosome names one
 70 |                         after the other, separated by a space, for instance
 71 |                         `-c 2R 2L 3L 3R`.
 72 |   --output FOLDER, -o FOLDER
 73 |                         Output folder name where delta files will be stored.
 74 |   --cpu INT, -p INT     Number of CPU to use. Default: 2
 75 |   --mem INT, -m INT     Memory to use per CPU. It can be expressed in Gb or
 76 |                         Mb: 2G or 2000M. This is multiplied by --cpu. Default:
 77 |                         2G
 78 | 
 79 | ```
 80 | 
 81 | 
 82 | ### mggFilter
 83 | 
 84 | ```
 85 | usage: mggFilter --input  deltas/ --identity 98 --length 1000
 86 | 
 87 | mggFilter launches delta-filter from mummer3 on several chromosomes of multiple
 88 | genomes at once. It takes a folder with delta files as input and outputs
 89 | delta.filtered files in the same folder (mummer format). This tool should be
 90 | launched by sbatch or SlurmEasy, not directly (opposite of mggNucmer).
 91 | 
 92 | optional arguments:
 93 |   -h, --help            show this help message and exit
 94 |   --input FOLDER, -i FOLDER
 95 |                         Input folder with delta files. All files finishing by
 96 |                         '.delta' will be used. They must be named using the
 97 |                         following convention:
 98 |                         'genome_name.chromosome.fa.delta', for instance
 99 |                         'dm3.2L.fa.delta'.
100 |   --identity INT, -d INT
101 |                         Minimum identity of the alignments found by nucmer
102 |                         between reference and query to be kept in the filtered
103 |                         delta file. Check delta-filter documentation from
104 |                         Mummer for more information.
105 |   --length INT, -l INT  Minimum length of the alignments found by nucmer
106 |                         between reference and query to be kept in the filtered
107 |                         delta file. Check delta-filter documentation from
108 |                         Mummer for more information.
109 | ```
110 | 
111 | ### mggPlot
112 | 
113 | ```
114 | usage: mggPlot --input  deltas/ --output plot.pdf --ncol 12 --width 19 --height 15.7
115 | 
116 | mggPlot takes a folder as input and collects all delta.filtered files to plot
117 | them in an automatic manner. This tool can be executed via sbatch or with
118 | SlurmEasy.
119 | 
120 | optional arguments:
121 |   -h, --help            show this help message and exit
122 |   --input FOLDER, -i FOLDER
123 |                         Input folder with delta.filtered files. All files
124 |                         finishing by '.delta.filtered' will be used. They must
125 |                         be named using the following convention:
126 |                         'genome_name.chromosome.fa.delta.filtered', for
127 |                         instance 'dm3.2L.fa.delta.filtered'.
128 |   --output FILENAME, -o FILENAME
129 |                         Output file name for the PDF file.
130 |   --width FLOAT         Plot width. 15.7 inch by default.
131 |   --height FLOAT        Plot height. 17 inch by default.
132 |   --ncol INT            Number of columns for the plot.
133 |   --nrow INT            Number of rows for the plot.
134 | ```
135 | 
136 | 
137 | ## Tutorial
138 | 
139 | Create a test directory next to the mummer-ggplot git repository that you cloned in the installation steps.
140 | 
141 | ```
142 | mkdir mummer-ggplot_test
143 | cd mummer-ggplot_test
144 | ```
145 | 
146 | Activate the conda virtual environment.
147 | 
148 | ```
149 | conda activate mummer-ggplot
150 | ```
151 | 
152 | Execute the first step: nucmer. It will launch jobs on a slurm queue.
153 | 
154 | ```
155 | mggNucmer --input ../mummer-ggplot/fasta/ --reference ../mummer-ggplot/fasta/reference/ --chromosomes chr1 chr2 chr3 --output mggNucmer-out
156 | ```
157 | 
158 | Execute the delta-filter step.
159 | 
160 | ```
161 | mggFilter --input mggNucmer-out --identity 98 --length 1000
162 | ```
163 | 
164 | Execute the plotting step.
165 | 
166 | ```
167 | mggPlot --input mggNucmer-out --output mggPlot-out.pdf
168 | ```
169 | 
170 | Check your plots :)
171 | 


--------------------------------------------------------------------------------
/bin/SlurmEasy:
--------------------------------------------------------------------------------
  1 | #!~/miniconda3/bin/python3.7
  2 | 
  3 | __description__="""
  4 | SlurmEasy-0.1.1 - A wrapper around sbatch for submitting jobs to slurm.
  5 | 
  6 | I've attempted as much as possible to copy over the functionality and options from SGEeasy.
  7 | 
  8 | TODO: dependency, start delay/time, time limit
  9 | 
 10 | Devon Ryan
 11 | """
 12 | 
 13 | import argparse
 14 | import os
 15 | import datetime
 16 | import subprocess
 17 | import sys
 18 | import pwd
 19 | 
# Timestamp taken once at start-up; main() uses it in the log and job-script
# file names.
now = datetime.datetime.now()

# Defaults for the command-line options declared in parse_args().
# The trailing comments name the matching sbatch flag.
NTHREADS = 16 # -c or -S?
QUEUE = "genouest" # -p
EMAIL_TYPE = "ALL" # --mail-type=...
WORKING_DIR = os.getcwd() # -D
NAME = pwd.getpwuid(os.getuid())[0] # -J
MEM = "1000" # --mem-per-cpu=... 3541 5781
CMD_TO_LOG = False # default for -c/--cmd_to_log
QUEUES = ['genouest', 'tiny'] # values accepted by -q/--queuename
ENV_STRING = "export LC_ALL=en_US.UTF-8; export LANG=en_US.UTF-8; export LANGUAGE=en_US.UTF-8;" # default for --env_string
MAINTEMP = "/tmp/" # default for --tempdir
 32 | 
 33 | # Could set a custom stdout and stderr with -o and -e
 34 | 
 35 | def parse_args():
 36 |     parser = argparse.ArgumentParser(prog="SlurmEasy")
 37 | 
 38 |     #These are optional and most have defaults
 39 |     parser.add_argument("-t", "--threads", dest="threads", metavar="INT", help="maximum number of threads (default: '{}')".format(NTHREADS), type=int, default=NTHREADS)
 40 |     parser.add_argument("-l", "--log", dest="log_dir", metavar="logdir", help="directory for writing stdout and stderr log files (default: '{}')".format(WORKING_DIR), type=str, default=WORKING_DIR)
 41 |     parser.add_argument("--no_log", dest="no_log", action="store_true", default=False, help="Do not write log files.")
 42 |     parser.add_argument("-D", "--working_dir", dest="working_dir", metavar="workingdir", help="The working directory, in case you're not already in it and your commands use relative paths (default: '{}')".format(WORKING_DIR), type=str, default=WORKING_DIR)
 43 |     parser.add_argument("--tempdir", dest="tempdir", metavar="tempdir", help="User temp directory (default: '{}/tmp.XXXXXXXXXX')".format(MAINTEMP), type=str, default=MAINTEMP)
 44 |     parser.add_argument("-d", "--dependency", dest="dependency", metavar="INT", help="A job number to use as a dependency. Slurm will then wait until it's completed before starting this one. N.B., an integer is only labeled as being optional here to make integration with snakemake simpler.", type=int, nargs='?', const=False)
 45 |     parser.add_argument("-c", "--cmd_to_log", dest="cmd_to_log", action="store_true", help="echo bash commands to stdout (*.o) LOG file (default: '{}')".format(CMD_TO_LOG), default=CMD_TO_LOG)
 46 |     parser.add_argument("-n", "--name", dest="name", help="job name, which will also affect the log file names (default: '{}')".format(NAME), type=str, default=NAME)
 47 |     parser.add_argument("-m", "--mem-per-cpu", dest="mem", metavar="MEMORY", help="maximum memory per core in MB if not otherwise specified (e.g. '6G', default: '{}')".format(MEM), type=str, default=MEM)
 48 |     parser.add_argument("-M", "--email", dest="email", help="Send an email to this address once the job has finished (see also --emailType) (default: ''). Note that you can specify this without an email address and {}@ie-freiburg.mpg.de will automatically be used.".format(NAME), type=str, const=True, default=False, nargs='?')
 49 |     parser.add_argument("--env_string", dest="env_string", help="Environment settings (default: '{}')".format(ENV_STRING), type=str, default=ENV_STRING)
 50 |     parser.add_argument("--emailType", dest="emailType", help="send email once the job has hit this state: BEGIN, END, FAIL or ALL finished (default: '{}')".format(EMAIL_TYPE), type=str, default=EMAIL_TYPE, choices=['BEGIN', 'END', 'FAIL', 'ALL'])
 51 |     parser.add_argument("-q", "--queuename", dest="partition", help="The queue/partition to use (default: '{}')".format(QUEUE), type=str, default=QUEUE, choices=QUEUES)
 52 |     parser.add_argument("-k", "--keepscript", dest="keep_script", action="store_true", default=False, help="keep auto-generated job script (*.slurm.sh)")
 53 |     parser.add_argument("-w", "--nodelist", default=None, help="If specified, a comma separated list of nodes that can be used, e.g. 'deep14,deep15'")
 54 |     parser.add_argument("-x", "--exclude", default=None, help="If specified, a comma separated list of nodes that can not be used. For example: deep9,deep10")
 55 |     parser.add_argument("-v", "--verbose", dest="verbose", action="store_true", default=False, help="verbose output")
 56 | 
 57 |     #Required arguments
 58 |     parser.add_argument('cmd', metavar="COMMANDS", help="Commands (in quotes!) to be run. This can also be a script.")
 59 | 
 60 |     args = parser.parse_args()
 61 | 
 62 |     return args
 63 | 
 64 | def main():
 65 |     args = parse_args()
 66 | 
 67 |     #Generate a shell script, line by line
 68 |     script = ["#!/bin/bash", "#SBATCH --ntasks-per-node=1"]
 69 | 
 70 |     # -w
 71 |     script.append("#SBATCH -D {}".format(args.working_dir))
 72 | 
 73 |     # -t
 74 |     if args.threads > 1:
 75 |         script.append("#SBATCH -c {}".format(args.threads))
 76 | 
 77 |     # -n
 78 |     if args.name:
 79 |         script.append("#SBATCH -J {}".format(args.name))
 80 | 
 81 |     # -w
 82 |     if args.nodelist:
 83 |         script.append("#SBATCH --nodelist={}".format(args.nodelist))
 84 | 
 85 |     # -x
 86 |     if args.exclude:
 87 |         script.append("#SBATCH --exclude={}".format(args.exclude))
 88 |         
 89 |     # -m
 90 |     if args.mem:
 91 |         if args.partition == 'mono' and args.mem == "5781":
 92 |             # The mono queue only uses the newer nodes, so have it default to a lower per-core memory allocation so more jobs will fit
 93 |             args.mem = "3541"
 94 |         if args.threads > 64 and args.mem == "5781":
 95 |             args.mem = "3541"
 96 |         script.append("#SBATCH --mem-per-cpu={}".format(args.mem))
 97 | 
 98 |     # -M and --emailType
 99 |     if args.email is not False:
100 |         if args.email is True:
101 |             script.append("#SBATCH --mail-user={}@ie-freiburg.mpg.de".format(NAME))
102 |         else:
103 |             script.append("#SBATCH --mail-user={}".format(args.email))
104 |         script.append("#SBATCH --mail-type={}".format(args.emailType))
105 | 
106 |     # -q
107 |     script.append("#SBATCH -p {}".format(args.partition))
108 | 
109 |     # -l
110 |     if args.no_log:
111 |         script.append("#SBATCH -o /dev/null")
112 |         script.append("#SBATCH -e /dev/null")
113 |     else:
114 |         script.append("#SBATCH -o {}/{}.{}.%j.out".format(args.log_dir, args.name, now.strftime('%Y%m%d_%H%M%S')))
115 |         script.append("#SBATCH -e {}/{}.{}.%j.err".format(args.log_dir, args.name, now.strftime('%Y%m%d_%H%M%S')))
116 |     try:
117 |         os.makedirs(args.log_dir)
118 |     except OSError as exc:
119 |         # This will only throw an error if the user lacks permissions or there's a similar error
120 |         if type(exc) == FileExistsError:
121 |             pass
122 |         else:
123 |             raise
124 | 
125 |     # -d
126 |     if args.dependency:
127 |         script.append("#SBATCH -d {}".format(args.dependency))
128 | 
129 |     # Environment variables
130 |     if MAINTEMP == args.tempdir:
131 |         script.append("""
132 | TMPDIR=%s/
133 | scratch=$(mktemp -p $TMPDIR -d -t tmp.XXXXXXXXXX)""" % MAINTEMP)
134 |     else:
135 |         script.append("""
136 | TMPDIR=%s/
137 | mkdir $TMPDIR
138 | scratch=$TMPDIR""" % args.tempdir)
139 | 
140 |     script.append("""
141 | export TMPDIR=$scratch/
142 | export TMP=$scratch/
143 | export TEMP=$scratch/
144 | %s
145 | function cleanup {
146 | cd /
147 |     rm -rf $scratch
148 | }
149 | trap cleanup SIGTERM SIGKILL SIGUSR2""" % args.env_string)
150 | 
151 | 
152 |     # -c echo the command
153 |     if args.cmd_to_log:
154 |         script.append("echo \"{}\"".format(args.cmd))
155 | 
156 |     # Add the command
157 |     script.append("srun {}".format(args.cmd))
158 |     script.append("cleanup")
159 | 
160 |     # -v
161 |     if args.verbose:
162 |         print("\n".join(script))
163 | 
164 |     ofname="{}/{}.{}.slurm.sh".format(args.log_dir, args.name, now.strftime('%Y%m%d_%H%M%S'))
165 |     outfile = open(ofname, "w")
166 |     for line in script:
167 |         outfile.write("{}\n".format(line))
168 |     outfile.close()
169 | 
170 |     cmd2 = ["sbatch", ofname]
171 |     sp = subprocess.Popen(cmd2, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
172 |     stdout, stderr = sp.communicate()
173 | 
174 |     # This is so snakemake can figure out dependencies
175 |     stdout = stdout.decode()
176 |     stderr = stderr.decode()
177 |     if stdout.startswith("Submitted batch"):
178 |         stdout = stdout[20:]
179 | 
180 |     # In python3, this will print b'' if empty!
181 |     if len(stderr):
182 |         sys.stderr.write(stderr)
183 |     if len(stdout):
184 |         sys.stdout.write(stdout)
185 | 
186 |     if not args.keep_script:
187 |         os.remove(ofname)
188 |         if args.verbose:
189 |             print("Removing sbatch script ({})".format(ofname))
190 |     return(sp.returncode)
191 | 
if __name__ == "__main__":
    # main() returns sbatch's return code; hand it to the shell so that a
    # failed submission is visible to the caller instead of always exiting 0.
    sys.exit(main())
194 | 
195 | 


--------------------------------------------------------------------------------
/bin/mggFilter:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # LAUNCHES DELTA-FILTER ON ALL THE DELTA FILES OF A FOLDER
 3 | 
 4 | 
 5 | #### LIBRARIES LOADING ####
 6 | 
 7 | library( 'argparse' )
 8 | 
 9 | options( show.error.locations = TRUE )
10 | 
11 | 
12 | #### ARGUMENTS PARSING ####
13 | 
parser <- ArgumentParser(description= "mggFilter launches delta-filter from mummer3 on several chromosomes of multiple genomes at once. It takes a folder with delta files as input and outputs delta.filtered files in the same folder (mummer format). This tool should be launched by sbatch or SlurmEasy, not directly (opposite of mggNucmer).",
                         usage = "mggFilter --input  deltas/ --identity 98 --length 1000")

parser$add_argument("--input","-i", type="character", default=NULL, action="store", dest="input", metavar="FOLDER",
                    help="Input folder with delta files. All files finishing by '.delta' will be used. They must be named using the following convention: 'genome_name.chromosome.fa.delta', for instance 'dm3.2L.fa.delta'.")

parser$add_argument("--identity","-d", type="integer", action="store", dest="identity", metavar="INT", default=98,
                    help="Minimum identity of the alignments found by nucmer between reference and query to be kept in the filtered delta file. Check delta-filter documentation from Mummer for more information.")

parser$add_argument("--length","-l", type="integer", action="store", dest="length", metavar="INT", default=1000,
                    help="Minimum length of the alignments found by nucmer between reference and query to be kept in the filtered delta file. Check delta-filter documentation from Mummer for more information.")

args <- parser$parse_args()

#### SANITY CHECK ####

if ( is.null( args$input ) ) {
  parser$print_help()
  stop( "An input folder must be supplied.", call.=FALSE )
}

if ( !dir.exists( args$input ) ) {
  stop( "The input folder does not exist: ", args$input, call.=FALSE )
}

#### SET WD AND GET ABS PATHS ####

args$input <- normalizePath( args$input )

#### DELTA FILTER LAUNCHING ####

search_pattern <- "\\.delta$"
inputFiles <- list.files(path = args$input, pattern = search_pattern)

# Without this guard an empty folder would still enter the loop
# (1:length(x) counts 1, 0) and run delta-filter on non-existent files.
if (length(inputFiles) == 0) {
  stop( "No '.delta' file found in ", args$input, call.=FALSE )
}

for (deltaFile in file.path(args$input, inputFiles)) {
  # shQuote() keeps paths containing spaces or shell metacharacters intact
  command <- paste("delta-filter -i", args$identity, "-l", args$length,
                   shQuote(deltaFile), ">", shQuote(paste0(deltaFile, ".filtered")))
  cat("Doing ", command, "\n")
  status <- system(command)
  if (status != 0) {
    warning( "delta-filter exited with status ", status, " for ", deltaFile, call.=FALSE )
  }
}
49 | 


--------------------------------------------------------------------------------
/bin/mggNucmer:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # LAUNCHES NUCMER ON SEVERAL CHROMOSOMES OF MULTIPLE GENOMES
 3 | 
 4 | 
 5 | #### LIBRARIES LOADING ####
 6 | 
 7 | library( 'argparse' )
 8 | 
 9 | #options( show.error.locations = TRUE )
10 | 
11 | 
12 | #### ARGUMENTS PARSING ####
13 | 
parser <- ArgumentParser(description= "mggNucmer launches nucmer from mummer3 on several chromosomes of multiple genomes at once. It takes a folder with fasta files as input and outputs delta files in a target folder (mummer format). This tool should be launched directly, not with sbatch. It uses SlurmEasy (and thus the slurm job scheduler) to launch jobs.",
                         usage = "mggNucmer --input  fastas/ --output deltas/")

parser$add_argument("--input","-i", type="character", default=NULL, action="store", dest="input", metavar="FOLDER",
                    help="Input folder with fasta files. All files finishing by '.fa' or '.fasta' will be used. They must be named using the following convention: 'genome_name.chromosome.fa', for instance 'dm3.2L.fa'." )

parser$add_argument("--reference","-r", type="character", default=NULL, action="store", dest="reference", metavar="FOLDER",
                    help="Folder path containing all the reference genome '.fa' or '.fasta' files. This reference will be aligned to all fasta files stored in the input directory, chromosome by chromosome (with matching names). Fasta files must be named using the following convention: 'genome_name.chromosome.fa', for instance 'dm3.2L.fa'.")

parser$add_argument("--chromosomes","-c", type="character",action="store", nargs='+', dest="chromosomes", default=NULL,
                    help="Here you can define the chromosome names you want the program to run on. Simply put the chromosome names one after the other, separated by a space, for instance `-c 2R 2L 3L 3R`." )

parser$add_argument("--output", "-o", type="character", default=NULL, action="store", dest="output", metavar="FOLDER",
                    help="Output folder name where delta files will be stored." )

parser$add_argument("--cpu", "-p", type="integer", default=2, action="store", dest="cpu", metavar="INT",
                    help="Number of CPU to use. Default: 2" )

parser$add_argument("--mem", "-m", type="character", default="2G", action="store", dest="mem", metavar="INT",
                    help="Memory to use per CPU. It can be expressed in Gb or Mb: 2G or 2000M. This is multiplied by --cpu. Default: 2G" )

args <- parser$parse_args()


#### SANITY CHECK ####

if ( is.null( args$input ) ) {
  parser$print_help()
  stop( "An input folder must be supplied.", call.=FALSE )
}

if ( is.null( args$reference ) ) {
  parser$print_help()
  stop( "A reference genome folder must be supplied.", call.=FALSE )
}

if ( is.null( args$chromosomes ) ) {
  parser$print_help()
  stop( "Some chromosome names matching both the reference and input fasta files must be supplied.", call.=FALSE )
}

if ( is.null( args$output ) ) {
  parser$print_help()
  stop( "An output folder must be supplied.", call.=FALSE )
}


#### SET WD AND GET ABS PATHS ####

args$input <- normalizePath( args$input )
args$reference <- normalizePath( args$reference )

# An already existing output folder is fine; nested folders are created too.
dir.create(path = args$output, showWarnings = FALSE, recursive = TRUE)
if ( !dir.exists( args$output ) ) {
  stop( "The output folder could not be created: ", args$output, call.=FALSE )
}
args$output<- normalizePath( args$output )

#### NUCMER LAUNCHING ####

for (chr in args$chromosomes){
  # Files named '<genome>.<chr>.fa' or '<genome>.<chr>.fasta'
  search_pattern <- paste0("\\.",chr,"\\.fa$|\\.",chr,"\\.fasta$")
  inputFiles <- list.files(path = args$input,pattern = search_pattern)
  referenceFile <- list.files(path = args$reference,pattern = search_pattern)

  # Exactly one reference fasta is needed per chromosome, otherwise the
  # nucmer command line cannot be built unambiguously.
  if (length(referenceFile) != 1) {
    warning( "Chromosome ", chr, " skipped: expected exactly one reference fasta in ",
             args$reference, " but found ", length(referenceFile), ".", call.=FALSE )
    next
  }
  if (length(inputFiles) == 0) {
    warning( "Chromosome ", chr, " skipped: no matching fasta found in ", args$input, ".", call.=FALSE )
    next
  }

  for (inputFile in inputFiles){
    # Paths are quoted for the job script; the whole nucmer command is then
    # quoted once more so SlurmEasy receives it as a single argument.
    nucmerCommand <- paste("nucmer",
                           shQuote(file.path(args$reference, referenceFile)),
                           shQuote(file.path(args$input, inputFile)),
                           "-p", shQuote(file.path(args$output, basename(inputFile))))
    command <- paste("SlurmEasy -t", args$cpu, "--mem-per-cpu", shQuote(args$mem),
                     "-n mggNucmer", shQuote(nucmerCommand))
    cat("submitting", command, "under ID ")
    system(command)
    cat("\n")
  }
}
83 | 


--------------------------------------------------------------------------------
/bin/mggPlot:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | # PLOTS ALL THE DELTA.FILTERED FILES OF A FOLDER WITH GGPLOT2
  3 | 
  4 | 
  5 | #### LIBRARIES LOADING ####
  6 | 
  7 | library( 'argparse' )
  8 | library(dplyr)
  9 | library(magrittr)
 10 | #library(GenomicRanges)
 11 | #library(knitr)
 12 | library(ggplot2)
 13 | library(tidyr)
 14 | library(stringr)
 15 | 
 16 | 
 17 | #options( show.error.locations = TRUE )
 18 | 
 19 | 
 20 | #### ARGUMENTS PARSING ####
 21 | 
 22 | parser <- ArgumentParser(description= "mggPlot takes a folder as input and collects all delta.filtered files to plot them in an automatic manner. This tool can be executed via sbatch or with SlurmEasy.",
 23 |                          usage = "mggPlot --input  deltas/ --output plot.pdf --ncol 12 --width 19 --height 15.7")
 24 | 
 25 | parser$add_argument("--input","-i", type="character", default=NULL, action="store", dest="input", metavar="FOLDER",
 26 |                     help="Input folder with delta.filtered files. All files finishing by '.delta.filtered' will be used. They must be named using the following convention: 'genome_name.chromosome.fa.delta.filtered', for instance 'dm3.2L.fa.delta.filtered'." )
 27 | 
 28 | parser$add_argument("--output","-o", type="character", default=NULL, action="store", dest="output", metavar="FILENAME",
 29 |                     help="Output file name for the PDF file.")
 30 | 
 31 | parser$add_argument("--width", type="double", action="store", dest="width", metavar="FLOAT", default=15.7,
 32 |                     help="Plot width. 15.7 inch by default.")
 33 | 
 34 | parser$add_argument("--height", type="double",action="store", dest="height", metavar="FLOAT", default=17,
 35 |                     help="Plot height. 17 inch by default.")
 36 | 
 37 | parser$add_argument("--ncol", type="integer",action="store", dest="ncol", metavar="INT", default=NULL,
 38 |                     help="Number of columns for the plot.")
 39 | 
 40 | parser$add_argument("--nrow", type="integer",action="store", dest="nrow", metavar="INT", default=NULL,
 41 |                     help="Number of rows for the plot.")
 42 | 
 43 | args <- parser$parse_args()
 44 | 
 45 | #### SANITY CHECK ####
 46 | 
 47 | if ( is.null( args$input ) ) {
 48 |   parser$print_help()
 49 |   stop( "An input folder must be supplied.", call.=FALSE )
 50 | }
 51 | 
 52 | if ( is.null( args$output ) ) {
 53 |   parser$print_help()
 54 |   stop( "An output name must be supplied.", call.=FALSE )
 55 | }
 56 | 
 57 | 
 58 | #### FUNCTIONS ####
 59 | 
 60 | readDelta <- function(deltafile){
 61 |   lines = scan(deltafile, 'a', sep='\n', quiet=TRUE)
 62 |   lines = lines[-1]
 63 |   lines.l = strsplit(lines, ' ')
 64 |   lines.len = lapply(lines.l, length) %>% as.numeric
 65 |   lines.l = lines.l[lines.len != 1]
 66 |   lines.len = lines.len[lines.len != 1]
 67 |   head.pos = which(lines.len == 4)
 68 |   head.id = rep(head.pos, c(head.pos[-1], length(lines.l)+1)-head.pos)
 69 |   mat = matrix(as.numeric(unlist(lines.l[lines.len==7])), 7)
 70 |   res = as.data.frame(t(mat[1:5,]))
 71 |   colnames(res) = c('rs','re','qs','qe','error')
 72 |   res$qid = unlist(lapply(lines.l[head.id[lines.len==7]], '[', 2))
 73 |   res$rid = unlist(lapply(lines.l[head.id[lines.len==7]], '[', 1)) %>% gsub('^>', '', .)
 74 |   res$strand = ifelse(res$qe-res$qs > 0, '+', '-')
 75 |   res
 76 | }
 77 | 
 78 | 
 79 | #### READ FILES FROM FOLDER ####
 80 | 
 81 | args$input <- normalizePath( args$input )
 82 | search_pattern <- paste0("\\.delta.filtered$")
 83 | 
 84 | deltasFiles <- list.files( path = args$input, pattern = search_pattern )
 85 | deltas <- list()
 86 | 
 87 | for ( i in 1:length(deltasFiles)){
 88 |   deltas[[i]] <- readDelta(paste0(args$input,"/",deltasFiles[i]))
 89 | }
 90 | 
 91 | names(deltas) <- sub("\\.*fa.*", "", deltasFiles)
 92 | names(deltas) <- sub("\\genome.", "", names(deltas))
 93 | 
 94 | for ( i in 1:length(deltasFiles)){
 95 |   deltas[[i]]["genome"] <- sub("\\..*", "", names(deltas))[i]
 96 |   deltas[[i]]["chromosome"] <- sub('.*C', 'C' ,sub(".*A", "A", names(deltas)))[i]
 97 | }
 98 | 
 99 | nucmer_data <- do.call(rbind, deltas)
100 | 
101 | 
102 | #### PLOT ####
103 | 
# Axis ticks every 10 Mb from 0 to 90 Mb, labelled in Mb
mb_breaks <- seq(0, 9e+07, by = 1e+07)
mb_labels <- as.character(seq(0, 90, by = 10))

# One facet per chromosome/genome pair, points coloured by strand
dotplot_theme <- theme(
  legend.position = "none",
  strip.background = element_blank(),
  strip.text.x = element_text(margin = margin(b = 0.85)),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  axis.title.y = element_text(margin = margin(0, 10, 0, 0)),
  axis.title.x = element_text(margin = margin(10, 0, 0, 0)),
  panel.spacing.x = unit(0.5, "cm")
)

dotplot <- ggplot(nucmer_data, aes(y = rs, yend = re, x = qs, xend = qe, colour = strand)) +
  geom_point(alpha = .15, size = 0.3, pch = 1) +
  scale_color_manual(values = c("red", "blue")) +
  facet_wrap(chromosome ~ genome, scales = "free", strip.position = "top",
             ncol = args$ncol, nrow = args$nrow) +
  xlab('Other B. napus genome assemblies (in Mb)') +
  ylab('B. napus Darmor v9 genome assembly (in Mb)') +
  scale_x_continuous(breaks = mb_breaks, labels = mb_labels) +
  scale_y_continuous(breaks = mb_breaks, labels = mb_labels) +
  theme_bw() +
  dotplot_theme

# Draw the plot into the PDF device, then close it
pdf(args$output, width = args$width, height = args$height)
print(dotplot)
dev.off()
123 | 


--------------------------------------------------------------------------------
/docs/docs.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/docs/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gtrichard/mummer-ggplot/2e2647db207d64f968a94669cf623e15e1a7c3fd/docs/plot.png


--------------------------------------------------------------------------------