├── .gitignore ├── Dockerfiles ├── RepeatMasker-onbuild │ ├── Dockerfile │ └── README.md ├── RepeatMasker │ ├── Dockerfile │ └── README.md ├── augustus │ ├── Dockerfile │ └── fgram_base │ │ ├── fgram_base_exon_probs.pbl │ │ ├── fgram_base_igenic_probs.pbl │ │ ├── fgram_base_intron_probs.pbl │ │ ├── fgram_base_metapars.cfg │ │ ├── fgram_base_parameters.cfg │ │ └── fgram_base_weightmatrix.txt ├── basics │ └── Dockerfile ├── bedtools │ └── Dockerfile ├── bioruby │ └── Dockerfile ├── busco │ ├── Dockerfile │ └── README.md ├── chado-helper │ ├── Dockerfile │ └── README.md ├── codingquarry │ └── Dockerfile ├── cufflinks │ └── Dockerfile ├── emboss │ └── Dockerfile ├── gff2gb │ └── Dockerfile ├── hhblits-fungi │ └── Dockerfile ├── hhblits-transposon │ └── Dockerfile ├── interproscan │ ├── Dockerfile │ └── interproscan.properties ├── jamg │ └── Dockerfile ├── ncbi-blast │ └── Dockerfile ├── pasa │ ├── Dockerfile │ ├── Dockerfile-pasaweb │ └── conf.txt ├── pfam │ └── Dockerfile ├── proteinortho │ └── Dockerfile ├── tophat │ └── Dockerfile └── trinity │ └── Dockerfile ├── LICENSE ├── README.md ├── annotate ├── bin ├── GG_trinity_accession_incrementer.rb ├── augustus_RNAseq_hints.pl ├── bed12_to_augustus_junction_hints.pl ├── exonerate_to_genbank.rb ├── fullerCegmaGFF.rb ├── gff2gb ├── gff_transpose.rb ├── gff_transpose.rb~ ├── parse_hhr.rb ├── pfam_to_gff3.rb ├── rename-codons ├── rename-fasta ├── rename-gff-ids └── trim_fasta_all.pl ├── complete.nf ├── genemark-annotate.nf ├── main.nf ├── nextflow.config ├── proteinortho.nf └── simple-annotate.nf /.gitignore: -------------------------------------------------------------------------------- 1 | Dockerfiles/RepeatMasker/repeatmaskerlibraries*.tar.gz 2 | .nextflow.* 3 | work 4 | #* 5 | -------------------------------------------------------------------------------- /Dockerfiles/RepeatMasker-onbuild/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -qqy \ 6 | wget \ 7 | hmmer \ 8 | unzip \ 9 | build-essential 10 | 11 | # Install TRF (for RepeatScout) 12 | WORKDIR /usr/local/bin 13 | RUN wget http://tandem.bu.edu/trf/downloads/trf407b.linux64 && mv trf*.linux64 trf && chmod +x trf 14 | 15 | # Basic workdir 16 | WORKDIR /usr/local 17 | 18 | # Install nseg (for RepeatScout) 19 | RUN mkdir nseg && \ 20 | cd nseg && \ 21 | wget ftp://ftp.ncbi.nih.gov/pub/seg/nseg/* && \ 22 | make && \ 23 | mv nseg ../bin && \ 24 | mv nmerge ../bin 25 | 26 | # Install RepeatScout 27 | RUN wget http://bix.ucsd.edu/repeatscout/RepeatScout-1.0.5.tar.gz && \ 28 | tar -xvf RepeatScout* && \ 29 | rm RepeatScout*.tar.gz && \ 30 | mv RepeatScout* RepeatScout && \ 31 | cd RepeatScout && \ 32 | make 33 | 34 | # Install RMBlast 35 | RUN wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/rmblast/2.2.28/ncbi-rmblastn-2.2.28-x64-linux.tar.gz && \ 36 | tar -xzvf ncbi-rmblastn* && \ 37 | rm ncbi-rmblastn*.tar.gz && \ 38 | mv ncbi-rmblastn*/bin/rmblastn bin && \ 39 | rm -rf ncbi-rmblastn 40 | 41 | # Install Blast+ 42 | RUN wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.6.0/ncbi-blast-2.6.0+-x64-linux.tar.gz && \ 43 | tar -xzvf ncbi-blast* && \ 44 | find ncbi-blast* -type f -executable -exec mv {} bin \; && \ 45 | rm -rf ncbi-blast* 46 | 47 | # Install RepeatMasker 48 | RUN wget http://www.repeatmasker.org/RepeatMasker-open-4-0-7.tar.gz \ 49 | && tar -xzvf RepeatMasker-open*.tar.gz \ 50 | && rm -f RepeatMasker-open*.tar.gz \ 51 | && perl -0p -e 
's/\/usr\/local\/hmmer/\/usr\/bin/g;' \ 52 | -e 's/\/usr\/local\/rmblast/\/usr\/local\/bin/g;' \ 53 | -e 's/DEFAULT_SEARCH_ENGINE = "crossmatch"/DEFAULT_SEARCH_ENGINE = "ncbi"/g;' \ 54 | -e 's/TRF_PRGM = ""/TRF_PRGM = "\/usr\/local\/bin\/trf"/g;' RepeatMasker/RepeatMaskerConfig.tmpl > RepeatMasker/RepeatMaskerConfig.pm 55 | 56 | # Fix RepeatMasker's strange shebang lines 57 | RUN cd /usr/local/RepeatMasker \ 58 | && perl -i -0pe 's/^#\!.*perl.*/#\!\/usr\/bin\/env perl/g' \ 59 | RepeatMasker \ 60 | DateRepeats \ 61 | ProcessRepeats \ 62 | RepeatProteinMask \ 63 | DupMasker \ 64 | util/queryRepeatDatabase.pl \ 65 | util/queryTaxonomyDatabase.pl \ 66 | util/rmOutToGFF3.pl \ 67 | util/rmToUCSCTables.pl 68 | 69 | # Install RIPcal 70 | RUN wget http://downloads.sourceforge.net/project/ripcal/RIPCAL/RIPCAL_2.0/ripcal2_install.zip \ 71 | && unzip ripcal*.zip \ 72 | && rm ripcal*.zip \ 73 | && mv ripcal* ripcal \ 74 | && cd ripcal \ 75 | && chmod +x perl/* 76 | 77 | # Install RECON 78 | RUN wget http://www.repeatmasker.org/RepeatModeler/RECON-1.08.tar.gz \ 79 | && tar -xvf RECON* \ 80 | && rm RECON*.tar.gz \ 81 | && mv RECON* recon \ 82 | && cd recon/src \ 83 | && make \ 84 | && make install \ 85 | && perl -i -0pe 's/\$path = "";/\$path = "\/usr\/local\/RECON-1.08\/bin";/g' ../scripts/\recon.pl 86 | 87 | # Install RepeatModeler deps 88 | RUN apt-get install -qqy libjson-perl liburi-perl liblwp-useragent-determined-perl 89 | 90 | # Install RepeatModeler 91 | RUN wget http://www.repeatmasker.org/RepeatModeler/RepeatModeler-open-1.0.10.tar.gz \ 92 | && tar -xvf RepeatModeler-*.tar.gz \ 93 | && rm RepeatModeler-*.tar.gz \ 94 | && mv RepeatModeler-*/ RepeatModeler \ 95 | && cd RepeatModeler \ 96 | && perl -i -0pe 's/^#\!.*/#\!\/usr\/bin\/env perl/g' \ 97 | configure \ 98 | BuildDatabase \ 99 | Refiner \ 100 | RepeatClassifier \ 101 | RepeatModeler \ 102 | TRFMask \ 103 | util/dfamConsensusTool.pl \ 104 | util/renameIds.pl \ 105 | util/viewMSA.pl \ 106 | && cat RepModelConfig.pm.tmpl \ 107 | | perl -p -e 's/\$RMBLAST_DIR +=.*;$/\$RMBLAST_DIR = "\/usr\/local\/bin";/g' \ 108 | | perl -p -e 's/\$RECON_DIR +=.*;$/\$RECON_DIR = "\/usr\/local\/recon\/bin";/g' \ 109 | | perl -p -e 's/\$RSCOUT_DIR +=.*;$/\$RSCOUT_DIR = "\/usr\/local\/RepeatScout";/g' \ 110 | > RepModelConfig.pm 111 | 112 | # I can't bundle the girinst RepBase libraries with the docker image, 113 | # so you'll need to get them yourself. 
Download them from 114 | # http://www.girinst.org/server/RepBase/protected/repeatmaskerlibraries/RepBaseRepeatMaskerEdition-20170127.tar.gz 115 | 116 | ONBUILD WORKDIR /usr/local/RepeatMasker 117 | ONBUILD ADD repeatmaskerlibraries.tar.gz /usr/local/RepeatMasker 118 | ONBUILD RUN cd /usr/local/RepeatMasker && util/buildRMLibFromEMBL.pl Libraries/RMRBSeqs.embl > Libraries/RepeatMasker.lib \ 119 | && makeblastdb -dbtype nucl -in Libraries/RepeatMasker.lib > /dev/null 2>&1 \ 120 | && makeblastdb -dbtype prot -in Libraries/RepeatPeps.lib > /dev/null 2>&1 121 | 122 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/RepeatMasker:/usr/local/RepeatScout:/usr/local/recon/bin:/usr/local/RepeatModeler 123 | #ENTRYPOINT ["/usr/local/RepeatMasker/RepeatMasker"] 124 | -------------------------------------------------------------------------------- /Dockerfiles/RepeatMasker-onbuild/README.md: -------------------------------------------------------------------------------- 1 | # RepeatMasker-onbuild Container 2 | 3 | This simple container is designed to make it easier to run 4 | RepeatMasker on new machines. If you have your own permanent 5 | [big-ass-server](http://jermdemo.blogspot.ca/2011/06/big-ass-servers-and-myths-of-clusters.html), 6 | you might want to simply install the software as usual and that's very 7 | sensible. 8 | 9 | There are also plenty of situations where you might want to use a 10 | container: 11 | 12 | * You are using compute resources on EC2 or GCE and you don't want to 13 | make a new disk image for each step of the annotation pipeline (and 14 | you don't want the hassle of cloud orchestration tools and scripts). 15 | * A container described by a Dockerfile also provides complete 16 | documentation of how the results were generated, which makes 17 | replication a little easier. 18 | * You are using a [docker-aware pipeline](http://nextflow.io/). 19 | 20 | ## What Does the Image Contain? 21 | 22 | This image contains the RepeatMasker binary and its prerequisites 23 | hmmer, rmblast, blast+ and trf. It *does not* contain the RepBase 24 | database. You will need to register and download this yourself and then 25 | build a new image based on this one. It also does not contain the 26 | search engines Cross_Match and ABBlast/WUBlast because of licensing 27 | restrictions. 28 | 29 | ## Running RepeatMasker from inside a container 30 | 31 | You'll need a copy of the latest 32 | [Repbase-derived RepeatMasker libraries](http://www.girinst.org/server/RepBase/index.php) 33 | (requires 34 | [free registration](http://www.girinst.org/accountservices/register.php)), 35 | renamed as `repeatmaskerlibraries.tar.gz`. We then create a new 36 | Dockerfile and generate our new image: 37 | 38 | ```sh 39 | wget --user your_username \ 40 | --password 12345 \ 41 | -O repeatmaskerlibraries.tar.gz \ 42 | http://www.girinst.org/server/RepBase/protected/repeatmaskerlibraries/RepBaseRepeatMaskerEdition-20170127.tar.gz 43 | echo "FROM robsyme/repeatmasker-onbuild" > Dockerfile 44 | docker build -t myrepeatmasker . 45 | ``` 46 | 47 | We can then run RepeatMasker: 48 | 49 | ```sh 50 | docker run -v $PWD:/in -w /in myrepeatmasker RepeatMasker scaffolds.fasta 51 | ``` 52 | 53 | This runs the container, mounting the host's current directory (and 54 | all subdirectories) inside the container at `/in` (`-v $PWD:/in`). The `-w 55 | /in` argument ensures that the command is run from this new 56 | directory. We then specify that we want to use the `myrepeatmasker` 57 | image we just created.
Inside the container, the command `RepeatMasker 58 | scaffolds.fasta` is run. 59 | 60 | ## Modifying the container 61 | 62 | You are free to modify the container (perhaps you really want to use 63 | Cross_Match, for example). Simply clone this repository (`git clone 64 | https://github.com/robsyme/nextflow-annotate.git`) and modify the 65 | Dockerfile before building. 66 | -------------------------------------------------------------------------------- /Dockerfiles/RepeatMasker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM robsyme/repeatmasker-onbuild:latest 2 | 3 | MAINTAINER Rob Syme 4 | 5 | -------------------------------------------------------------------------------- /Dockerfiles/RepeatMasker/README.md: -------------------------------------------------------------------------------- 1 | # RepeatMasker Container 2 | 3 | The RepBase licence prohibits distribution of the libraries, so we 4 | need a two-step process to build the final docker image. The first 5 | step is the installation of the dependencies. This has already been 6 | done inside the `robsyme/repeatmasker-onbuild` image. 7 | 8 | The second step is to download and install the RepBase libraries. The 9 | repeatmasker-onbuild image takes care of the installation. It only 10 | requires that you download the RepBase libraries to a file named 11 | 'repeatmaskerlibraries.tar.gz' next to the Dockerfile (in this 12 | directory, for example). 13 | 14 | The Dockerfile is minimal, containing only: 15 | 16 | ``` 17 | FROM robsyme/repeatmasker-onbuild 18 | ``` 19 | 20 | If you have this tiny Dockerfile and the RepBase libraries, you can 21 | build and use your docker image with: 22 | 23 | ```sh 24 | docker build -t myrepeatmasker . 25 | cd /path/to/data 26 | docker run -v $PWD:/in -w /in myrepeatmasker RepeatMasker scaffolds.fasta 27 | ``` 28 | 29 | Note that only the current directory (and its children) is mounted 30 | inside the container, so you need to ensure that your scaffolds file 31 | is in the current path tree. 32 | -------------------------------------------------------------------------------- /Dockerfiles/augustus/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update \ 6 | && apt-get install -qqy \ 7 | build-essential \ 8 | libbamtools-dev \ 9 | libboost-graph-dev \ 10 | libboost-iostreams-dev \ 11 | libgsl-dev \ 12 | liblpsolve55-dev \ 13 | libsqlite3-dev \ 14 | libsuitesparse-dev \ 15 | wget \ 16 | zlib1g-dev 17 | 18 | WORKDIR /usr/local 19 | 20 | # Install Augustus 21 | RUN wget http://bioinf.uni-greifswald.de/augustus/binaries/augustus.current.tar.gz \ 22 | && tar -xvf augustus*.tar.gz \ 23 | && rm augustus*.tar.gz \ 24 | && cd augustus \ 25 | && echo "COMPGENEPRED = true" >> common.mk \ 26 | && make \ 27 | && make install 28 | 29 | ENV AUGUSTUS_CONFIG_PATH /usr/local/augustus/config 30 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/augustus/bin:/usr/local/augustus/scripts 31 | -------------------------------------------------------------------------------- /Dockerfiles/augustus/fgram_base/fgram_base_metapars.cfg: -------------------------------------------------------------------------------- 1 | # This file contains the list of meta parameters which are subject to optimization. 2 | # All other parameters are chosen as given in the species parameter file.
The order 3 | # of the parameters determines the order in the optimisation process. 4 | # For each parameter the range of possible values is specified after the parameter 5 | # name and at least one white space. 6 | # 3 cases are possible for the range: 7 | # - an explicit list is given, e.g. protein "on" "off" 8 | # - it is an integer range, e.g. window_size "1"-"5" 9 | # - it is a range of floating point numbers, e.g. pseudocount "0.3"_"1.8" 10 | # 11 | # 12 | # Mario Stanke, 19.12.2006 13 | # 14 | 15 | /Constant/dss_end "1"-"4" 16 | /Constant/dss_start "1"-"3" 17 | /Constant/ass_start "1"-"3" 18 | /Constant/ass_end "0"-"4" 19 | /Constant/ass_upwindow_size "1"-"50" 20 | /IntronModel/d "100"-"950" 21 | /IntronModel/ass_motif_memory "0"-"3" 22 | /IntronModel/ass_motif_radius "0"-"4" 23 | /ExonModel/tis_motif_memory "0"-"3" 24 | /ExonModel/tis_motif_radius "0"-"3" 25 | /Constant/trans_init_window "0"-"25" 26 | /Constant/init_coding_len "0"-"18" 27 | /ExonModel/patpseudocount "0.5"_"5" 28 | /ExonModel/etpseudocount "0"-"10" 29 | /ExonModel/etorder "0"-"3" 30 | /Constant/intterm_coding_len "0"-"13" 31 | /ExonModel/slope_of_bandwidth "0.05"_"0.6" 32 | /ExonModel/minwindowcount "1"-"15" 33 | /IGenicModel/patpseudocount "0.5"_"7" 34 | /IntronModel/patpseudocount "0.5"_"7" 35 | /IntronModel/slope_of_bandwidth "0.05"_"0.6" 36 | /IntronModel/minwindowcount "1"-"8" 37 | /IntronModel/asspseudocount "0.0005"_"0.03" 38 | /IntronModel/dsspseudocount "0.0002"_"0.04" 39 | /IntronModel/dssneighborfactor "0.0001"_"0.01" 40 | /ExonModel/minPatSum "100"_"600" 41 | /Constant/probNinCoding "0.15"_".25" 42 | /Constant/decomp_num_steps "1"-"5" 43 | # comment parameters out that you do not want to be subject of optimization 44 | #/IGenicModel/k "4" "3" "5" 45 | #/IntronModel/k "4" "3" "5" 46 | #/ExonModel/k "4" "3" "5" 47 | -------------------------------------------------------------------------------- /Dockerfiles/augustus/fgram_base/fgram_base_parameters.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # fgram_base parameters. 
3 | # 4 | # date : 19.12.2006 5 | # 6 | 7 | # 8 | # Properties for augustus 9 | #------------------------------------ 10 | /augustus/verbosity 3 # 0-3, 0: only print the neccessary 11 | maxDNAPieceSize 200000 # maximum segment that is predicted in one piece 12 | stopCodonExcludedFromCDS true # make this 'true' if the CDS includes the stop codon (training and prediction) 13 | 14 | # gff output options: 15 | protein on # output predicted protein sequence 16 | codingseq off # output the coding sequence 17 | cds on # output 'cds' as feature for exons 18 | start on # output start codons (translation start) 19 | stop on # output stop codons (translation stop) 20 | introns on # output introns 21 | tss on # output transcription start site 22 | tts on # output transcription termination site 23 | print_utr off # output 5'UTR and 3'UTR lines in addition to exon lines 24 | 25 | checkExAcc off # internal parameter for extrinsic accuracy 26 | 27 | # alternative transcripts and posterior probabilities 28 | sample 100 # the number of sampling iterations 29 | alternatives-from-sampling false # output alternative transcripts 30 | minexonintronprob 0.08 # minimal posterior probability of all (coding) exons 31 | minmeanexonintronprob 0.4 # minimal geometric mean of the posterior probs of introns and exons 32 | maxtracks -1 # maximum number of reported transcripts per gene (-1: no limit) 33 | keep_viterbi true # set to true if all Viterbi transcripts should be reported 34 | uniqueCDS true # don't report transcripts that differ only in the UTR 35 | UTR off # predict untranslated regions 36 | 37 | # 38 | # 39 | # The rest of the file contains mainly meta parameters used for training. 40 | # 41 | 42 | # global constants 43 | # ---------------------------- 44 | 45 | /Constant/trans_init_window 6 46 | /Constant/ass_upwindow_size 25 47 | /Constant/ass_start 3 48 | /Constant/ass_end 3 49 | /Constant/dss_start 2 50 | /Constant/dss_end 4 51 | /Constant/init_coding_len 18 52 | /Constant/intterm_coding_len 13 53 | /Constant/tss_upwindow_size 45 54 | /Constant/decomp_num_at 1 55 | /Constant/decomp_num_gc 1 56 | /Constant/gc_range_min 0.32 # This range has an effect only when decomp_num_steps>1. 
57 | /Constant/gc_range_max 0.73 # States the minimal and maximal percentage of c or g 58 | /Constant/decomp_num_steps 3 59 | /Constant/min_coding_len 201 # no gene with a coding sequence shorter than this is predicted 60 | /Constant/probNinCoding 0.23 61 | /Constant/amberprob 0.33 # Prob(stop codon = tag), if 0 tag is assumed to code for amino acid 62 | /Constant/ochreprob 0.33 # Prob(stop codon = taa), if 0 taa is assumed to code for amino acid 63 | /Constant/opalprob 0.34 # Prob(stop codon = tga), if 0 tga is assumed to code for amino acid 64 | /Constant/subopt_transcript_threshold 0.7 65 | /Constant/almost_identical_maxdiff 10 66 | 67 | # type of weighing, one of 1 = equalWeights, 2 = gcContentClasses, 3 = multiNormalKernel 68 | /BaseCount/weighingType 3 69 | # file with the weight matrix (only for multiNormalKernel type weighing) 70 | /BaseCount/weightMatrixFile fgram_base_weightmatrix.txt # change this to your species if at all neccessary 71 | 72 | # Properties for IGenicModel 73 | # ---------------------------- 74 | /IGenicModel/verbosity 0 75 | /IGenicModel/infile fgram_base_igenic_probs.pbl # change this and the other five filenames *_probs.pbl below to your species 76 | /IGenicModel/outfile fgram_base_igenic_probs.pbl 77 | /IGenicModel/patpseudocount 5.0 78 | /IGenicModel/k 4 # order of the Markov chain for content model, keep equal to /ExonModel/k 79 | 80 | # Properties for ExonModel 81 | # ---------------------------- 82 | /ExonModel/verbosity 3 83 | /ExonModel/infile fgram_base_exon_probs.pbl 84 | /ExonModel/outfile fgram_base_exon_probs.pbl 85 | /ExonModel/patpseudocount 0.5 86 | /ExonModel/minPatSum 233.3 87 | /ExonModel/k 4 # order of the Markov chain for content model 88 | /ExonModel/etorder 2 89 | /ExonModel/etpseudocount 3 90 | /ExonModel/exonlengthD 2000 # beyond this the distribution is geometric 91 | /ExonModel/maxexonlength 15000 92 | /ExonModel/slope_of_bandwidth 0.325 93 | /ExonModel/minwindowcount 8 94 | /ExonModel/tis_motif_memory 3 95 | /ExonModel/tis_motif_radius 0 96 | 97 | # Properties for IntronModel 98 | # ---------------------------- 99 | /IntronModel/verbosity 0 100 | /IntronModel/infile fgram_base_intron_probs.pbl 101 | /IntronModel/outfile fgram_base_intron_probs.pbl 102 | /IntronModel/patpseudocount 5.0 103 | /IntronModel/k 4 # order of the Markov chain for content model, keep equal to /ExonModel/k 104 | /IntronModel/slope_of_bandwidth 0.4 105 | /IntronModel/minwindowcount 4 106 | /IntronModel/asspseudocount 0.01525 107 | /IntronModel/dsspseudocount 0.0005 108 | /IntronModel/dssneighborfactor 0.007525 109 | #/IntronModel/splicefile fgram_base_splicefile.txt # this optional file contains additional windows around splice sites for training, uncomment if you have one 110 | /IntronModel/sf_with_motif false # if true the splice file is also used to train the branch point region 111 | /IntronModel/d 100 112 | /IntronModel/ass_motif_memory 1 113 | /IntronModel/ass_motif_radius 0 114 | 115 | # Properties for UtrModel 116 | # ---------------------------- 117 | /UtrModel/verbosity 3 118 | /UtrModel/infile fgram_base_utr_probs.pbl 119 | /UtrModel/outfile fgram_base_utr_probs.pbl 120 | /UtrModel/k 4 121 | /UtrModel/utr5patternweight 0 122 | /UtrModel/utr3patternweight 1.0 123 | /UtrModel/patpseudocount 1 124 | /UtrModel/tssup_k 0 125 | /UtrModel/tssup_patpseudocount 1 126 | /UtrModel/slope_of_bandwidth 0.2375 127 | /UtrModel/minwindowcount 3 128 | /UtrModel/exonlengthD 800 129 | /UtrModel/maxexonlength 1800 130 | /UtrModel/max3singlelength 1800 131 | 
/UtrModel/max3termlength 1800 132 | /UtrModel/tss_start 8 133 | /UtrModel/tss_end 5 134 | /UtrModel/tata_start 2 135 | /UtrModel/tata_end 10 136 | /UtrModel/tata_pseudocount 2 137 | /UtrModel/d_tss_tata_min 26 # minimal distance between start of tata box (if existent) and tss 138 | /UtrModel/d_tss_tata_max 37 # maximal distance between start of tata box (if existent) and tss 139 | /UtrModel/d_polyasig_cleavage 14 # the transcription end is predicted this many bases after the polyadenylation signal 140 | /UtrModel/d_polya_cleavage_min 7 141 | /UtrModel/d_polya_cleavage_max 17 142 | /UtrModel/prob_polya 0.4 143 | -------------------------------------------------------------------------------- /Dockerfiles/augustus/fgram_base/fgram_base_weightmatrix.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file contains a matrix used for weighing the training sequences 3 | # when given an input sequence. Let z = (da, dc, dg, dt) be the vector 4 | # containing the differences in the relative nucleotide frequencies of 5 | # two sequences, the input sequence and a training sequence. 6 | # Then the training sequence has weight proportional to 7 | # 8 | # exp ( - z M z^t) 9 | # 10 | # with M being the matrix specified below. 11 | # If M is nonsingular, then (apart from a two normalizing factors) M 12 | # is the inverse of the covariance matrix of a multinormal 13 | # distribution - the kernel for the estimation. 14 | 15 | 16 | # this matrix is gc-content only, i.e. 17 | # weight = 10 * exp (-200 * (dc + dg))^2) 18 | # in particular weight <= 10 19 | 0 0 0 0 20 | 0 200 0 0 21 | 0 0 200 0 22 | 0 0 0 0 23 | 24 | -------------------------------------------------------------------------------- /Dockerfiles/basics/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -qqy build-essential python ruby curl htop wget htop 6 | 7 | WORKDIR /opt 8 | 9 | # Samtools 10 | RUN apt-get install -qqy zlib1g-dev libncurses5-dev 11 | ADD http://downloads.sourceforge.net/project/samtools/samtools/1.2/samtools-1.2.tar.bz2 ./ 12 | RUN tar -xvf *.tar.bz2 && rm *.tar.bz2 && mv samtools* samtools \ 13 | && cd samtools && make 14 | 15 | # NCBI-blast 16 | RUN apt-get install -qqy ncbi-blast+ 17 | 18 | # Bioruby 19 | RUN gem install bio 20 | 21 | # Emboss 22 | RUN apt-get install -qqy emboss 23 | 24 | # HMMER 25 | RUN apt-get install -qqy hmmer 26 | 27 | # Bedtools 28 | RUN apt-get install -qqy bedtools 29 | 30 | # Genome tools 31 | WORKDIR /opt 32 | RUN apt-get install -qqy libcairo2-dev libpango1.0-dev 33 | ADD http://genometools.org/pub/genometools-1.5.6.tar.gz ./ 34 | RUN tar -xvf genometools-* && rm -f *.tar.gz && mv genometools* genometools 35 | RUN cd genometools && make 64bit=yes opt=yes universal=no && sudo make install 36 | 37 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/augustus/bin:/opt/tophat:/opt/samtools 38 | -------------------------------------------------------------------------------- /Dockerfiles/bedtools/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update -qq && apt-get install -qqy bedtools samtools 6 | -------------------------------------------------------------------------------- /Dockerfiles/bioruby/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM ubuntu:15.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install ruby2.1 -qqy 6 | RUN gem install --no-document --version 1.4.3 bio 7 | 8 | -------------------------------------------------------------------------------- /Dockerfiles/busco/Dockerfile: -------------------------------------------------------------------------------- 1 | from robsyme/augustus:3.0.3 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -yqq python ncbi-blast+ hmmer emboss 6 | 7 | #Busco 8 | RUN mkdir /opt/busco 9 | WORKDIR /opt/busco 10 | ADD http://busco.ezlab.org/files/BUSCO_v1.0.tar.gz /opt/busco/ 11 | RUN tar -xzvf BUSCO_v1.0.tar.gz \ 12 | && rm *.tar.gz \ 13 | && sed -i 's/^#!\/bin\/python/#!\/usr\/bin\/env python/' BUSCO_v1.0.py \ 14 | && chmod +x BUSCO_v1.0.py \ 15 | && ln -s BUSCO_v1.0.py busco 16 | ADD http://busco.ezlab.org/files/fungi_buscos.tar.gz /opt/busco/lineages/ 17 | RUN cd /opt/busco/lineages/ && tar -xzf *.tar.gz 18 | 19 | # Genometools 20 | WORKDIR /opt/gt 21 | ADD http://genometools.org/pub/binary_distributions/gt-1.5.7-Linux_x86_64-64bit-barebone.tar.gz /opt/gt/ 22 | RUN tar -xvf *.tar.gz && rm *.tar.gz && ln -s gt* current 23 | 24 | #Samtools 25 | RUN apt-get install -qqy samtools 26 | 27 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/augustus/bin:/opt/augustus/scripts:/opt/busco:/opt/gt/current/bin 28 | 29 | ENTRYPOINT ["/bin/bash"] 30 | -------------------------------------------------------------------------------- /Dockerfiles/busco/README.md: -------------------------------------------------------------------------------- 1 | # BUSCO in docker 2 | 3 | This is a repository that contains the 4 | [BUSCO](http://busco.ezlab.org/) software for 'assessing genome 5 | assembly and annotation completeness with single-copy orthologs'. It 6 | contains preconfigured installations of the BUSCO prerequisites, 7 | including Augustus 3.0, hmmer, ncbi-blast+, and emboss. 8 | 9 | ## Using the container 10 | 11 | If I have a fungal genome `scaffolds.fasta` in the current directory, I 12 | can run busco by first downloading the profiles (in my case fungi): 13 | 14 | wget http://busco.ezlab.org/files/fungi_buscos.tar.gz 15 | tar -xzvf fungi_buscos.tar.gz && rm fungi_buscos.tar.gz 16 | 17 | I can then run the busco docker container: 18 | 19 | docker run --rm -v $PWD:/in -w /in robsyme/busco \ 20 | busco -in scaffolds.fasta -o my_run --lineage fungi 21 | 22 | I might consider bundling the profiles into lineage-specific docker 23 | images, but busco unhelpfully prepends a '`.`' to the lineage path, so 24 | I would have to create a runner script that links in the profile 25 | folder into the current working directory, which is a bit messy. For 26 | the moment, I will leave profile management to the user.
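If you do want to hide the profile-staging step behind a script, something along the following lines would do it. This is only an untested sketch: the script name `run_busco.sh`, the `/path/to/fungi` location, and the positional-argument handling are placeholders, and it copies rather than symlinks the profiles, since a symlink pointing outside the bind-mounted directory would not resolve inside the container.

```sh
#!/bin/sh
# run_busco.sh (hypothetical sketch): stage the lineage profiles in the current
# directory, then run the containerised BUSCO from that directory.
LINEAGE_DIR=/path/to/fungi        # placeholder: wherever fungi_buscos.tar.gz was extracted
ASSEMBLY=${1:-scaffolds.fasta}    # assembly fasta; defaults to scaffolds.fasta
cp -r "$LINEAGE_DIR" ./fungi
docker run --rm -v "$PWD":/in -w /in robsyme/busco \
    busco -in "$ASSEMBLY" -o my_run --lineage fungi
```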
27 | -------------------------------------------------------------------------------- /Dockerfiles/chado-helper/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | RUN apt-get update && apt-get install -qqy postgresql-client wget build-essential 4 | 5 | WORKDIR /opt 6 | ADD http://downloads.sourceforge.net/project/gmod/gmod/chado-1.23/chado-1.23.tar.gz /opt/ 7 | RUN tar -xzvf *.tar.gz && rm *.tar.gz && mv chado-1.23 chado 8 | 9 | WORKDIR /opt/chado 10 | ENV GMOD_ROOT /usr/local/gmod 11 | ENV CHADO_DB_USERNAME chadouser 12 | ENV CHADO_DB_NAME chado 13 | ENV CHADO_DB_HOST chado 14 | 15 | # Perl bits 16 | RUN apt-get install -qqy libtemplate-perl libxml-simple-perl libdbi-perl libgo-perl libdbd-pg-perl libdbix-dbstag-perl libsql-translator-perl bioperl 17 | RUN sed -i 's/stag-storenode.pl/stag-storenode/' lib/Bio/Chado/Builder.pm 18 | RUN perl Makefile.PL && make && make install 19 | -------------------------------------------------------------------------------- /Dockerfiles/chado-helper/README.md: -------------------------------------------------------------------------------- 1 | # Chado Loading Helper 2 | 3 | This docker image is to help get a new chado database up and running 4 | quickly. 5 | 6 | ## Steps 7 | 8 | ### Create a new database container and user 'chadouser' 9 | 10 | ```sh 11 | docker run -d --name db postgres 12 | ``` 13 | 14 | Now that you've got a blank database, we'll create a new user 15 | 'chadouser' inside that database: 16 | 17 | ```sh 18 | docker run --rm --link db:chado postgres \ 19 | createuser \ 20 | --host chado \ 21 | --username postgres \ 22 | --createdb \ 23 | --echo \ 24 | --login \ 25 | chadouser 26 | ``` 27 | And create the `chado` database: 28 | 29 | ```sh 30 | docker run --rm --link db:chado postgres \ 31 | createdb \ 32 | -h chado \ 33 | -U chadouser 34 | chado 35 | ``` 36 | 37 | ### Load the basic schema 38 | 39 | ```sh 40 | docker run --rm --link db:chado robsyme/chado-helper make load_schema 41 | docker run --rm --link db:chado robsyme/chado-helper make prepdb 42 | ``` 43 | 44 | ### Load the ontologies 45 | 46 | This step is interactive so that you can specify which ontologies you 47 | wish to load 48 | 49 | ```sh 50 | docker run --rm --link db:chado --interactive --tty robsyme/chado-helper make ontologies 51 | ``` 52 | 53 | ### Backup the sql 54 | 55 | Now is probably a good time to take a snapshot of the database so that 56 | you can get back to a clean slate if needed. 
You can dump the sql to 57 | the current directory using: 58 | 59 | ```sh 60 | docker run --rm --link db:chado -v $PWD:/dump postgres \ 61 | pg_dump \ 62 | -h chado \ 63 | -U postgres \ 64 | -f /dump/chado_dump.sql\ 65 | chado 66 | ``` 67 | -------------------------------------------------------------------------------- /Dockerfiles/codingquarry/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -yqq build-essential python 6 | 7 | WORKDIR /opt 8 | ADD http://downloads.sourceforge.net/project/codingquarry/CodingQuarry_v1.2.tar.gz ./ 9 | RUN apt-get install -qqy libopenmpi-dev 10 | RUN tar -xzvf *.tar.gz && rm *.tar.gz && mv CodingQuarry* CodingQuarry && cd CodingQuarry && make 11 | ENV QUARRY_PATH /opt/CodingQuarry/QuarryFiles 12 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/CodingQuarry 13 | -------------------------------------------------------------------------------- /Dockerfiles/cufflinks/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM robsyme/tophat 2 | 3 | MAINTAINER Rob Syme 4 | 5 | WORKDIR /opt 6 | ADD http://cole-trapnell-lab.github.io/cufflinks/assets/downloads/cufflinks-2.2.1.Linux_x86_64.tar.gz ./ 7 | RUN tar -xzvf *.tar.gz && rm *.tar.gz && mv cufflinks* cufflinks 8 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/augustus/bin:/opt/cufflinks 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Dockerfiles/emboss/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update 6 | RUN apt-get install -qqy emboss 7 | -------------------------------------------------------------------------------- /Dockerfiles/gff2gb/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -qqy wget python python-biopython 6 | 7 | RUN wget https://bootstrap.pypa.io/get-pip.py && python get-pip.py && pip install bcbio-gff 8 | 9 | ADD https://raw.githubusercontent.com/chapmanb/bcbb/master/gff/Scripts/gff/gff_to_genbank.py /usr/local/bin/ 10 | RUN chmod +x /usr/local/bin/gff_to_genbank.py 11 | 12 | CMD ["/usr/local/bin/gff_to_genbank.py"] 13 | -------------------------------------------------------------------------------- /Dockerfiles/hhblits-fungi/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | ENV DEBIAN_FRONTEND noninteractive 5 | 6 | RUN apt-get update && apt-get install -qqy hhsuite ffindex samtools 7 | RUN mkdir /databases 8 | WORKDIR /databases 9 | 10 | # One of two options here - either download it during docker build 11 | ADD http://downloads.sourceforge.net/project/jamg/databases/fungal_50kclus.tar.bz2 . 12 | # ... or download it yourself next to this Dockerfile and then docker build. 13 | #ADD fungal_50kclus.tar.bz2 . 
14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /Dockerfiles/hhblits-transposon/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | ENV DEBIAN_FRONTEND noninteractive 5 | 6 | RUN apt-get update && apt-get install -qqy hhsuite ffindex samtools 7 | RUN mkdir /databases 8 | WORKDIR /databases 9 | 10 | # One of two options here - either download it during docker build 11 | ADD http://downloads.sourceforge.net/project/jamg/databases/transposons.hhblits.tar.bz2 . 12 | # ... or download it yourself next to this Dockerfile and then docker build. 13 | #ADD transposons.hhblits.tar.bz2 . 14 | RUN tar -xvf transposons.hhblits.tar.bz2 15 | -------------------------------------------------------------------------------- /Dockerfiles/interproscan/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -y default-jre wget coreutils 6 | RUN mkdir -p /opt/interproscan && \ 7 | cd /opt/interproscan && \ 8 | wget ftp://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.7-48.0/interproscan-5.7-48.0-64-bit.tar.gz* && \ 9 | md5sum -c interproscan*.md5 && \ 10 | rm *.md5 && \ 11 | tar -pxvzf interproscan*.tar.gz && \ 12 | rm *.tar.gz 13 | RUN ln -s /opt/interproscan/interproscan-5.7-48.0 /opt/interproscan/current 14 | WORKDIR /opt/interproscan/current 15 | RUN apt-get install -qqy ncoils blast2 16 | ADD interproscan.properties /opt/interproscan/current/ 17 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/interproscan/current 18 | -------------------------------------------------------------------------------- /Dockerfiles/interproscan/interproscan.properties: -------------------------------------------------------------------------------- 1 | # This is the InterProScan configuration file 2 | 3 | ## 4 | ## Temporary files and directory 5 | ## 6 | # The text [UNIQUE], if present, will be replaced by a value unique to your running instance 7 | 8 | # Temporary files used by the analyses will be placed in directories here: 9 | temporary.file.directory.suffix=[UNIQUE] 10 | temporary.file.directory=temp/${temporary.file.directory.suffix} 11 | 12 | ## 13 | ## H2 database 14 | ## 15 | # The H2 database is copied by the standalone version of interproscan 16 | i5.h2.database.original.location=work/template/interpro.zip 17 | # LOCK_TIMEOUT: Sets the lock timeout (in milliseconds) for the current session 18 | i5.database.connection.url=jdbc:h2:mem:interpro;LOCK_TIMEOUT=10000000 19 | 20 | ## 21 | ## binary paths 22 | ## 23 | # Configure the version of perl to use when running member databases perl binaries 24 | perl.command=perl 25 | 26 | # Binary file locations 27 | binary.hmmer3.path=bin/hmmer/hmmer3/3.1b1 28 | binary.hmmer3.hmmscan.path=bin/hmmer/hmmer3/3.1b1/hmmscan 29 | binary.hmmer3.hmmsearch.path=bin/hmmer/hmmer3/3.1b1/hmmsearch 30 | binary.hmmer2.hmmsearch.path=bin/hmmer/hmmer2/2.3.2/hmmsearch 31 | binary.hmmer2.hmmpfam.path=bin/hmmer/hmmer2/2.3.2/hmmpfam 32 | binary.fingerprintscan.path=bin/prints/fingerPRINTScan 33 | binary.coils.path=/usr/bin/ncoils 34 | domainfinder3.path=bin/gene3d/DomainFinder3 35 | binary.prodom.2006.1.prodomblast3i.pl.path=bin/prodom/2006.1/ProDomBlast3i.pl 36 | # Note: Correct prosite binary distribution for your platform can be downloaded: ftp://ftp.expasy.org/databases/prosite/tools/ps_scan/ 37 
| binary.prosite.psscan.pl.path=bin/prosite/ps_scan.pl 38 | binary.prosite.pfscan.path=bin/prosite/pfscan 39 | binary.panther.path=bin/panther/7.0/pantherScore.pl 40 | binary.panther.perl.lib.dir=bin/panther/7.0/lib 41 | binary.superfamily.1.75.ass3.pl.path=bin/superfamily/1.75/ass3_single_threaded.pl 42 | binary.pirsf.pl.path=bin/pirsf/2.85/pirsf.pl 43 | binary.blastall.2.2.6.path=/usr/bin/blastall 44 | binary.blast.2.2.19.path=bin/blast/2.2.19 45 | binary.getorf.path=bin/nucleotide/getorf 46 | # Note: SignalP binary not distributed with InterProScan 5, please install separately e.g. in bin/signalp/4.0/signalp 47 | binary.signalp.4.0.path= 48 | # Note: TMHMM binary not distributed with InterProScan 5, please install separately e.g. in bin/tmhmm/2.0c/decodeanhmm 49 | binary.tmhmm.path= 50 | # Note: Phobius binary not distributed with InterProScan 5, please install separately e.g. in bin/phobius/1.01/phobius.pl 51 | binary.phobius.pl.path.1.01= 52 | 53 | ## 54 | ## Member database model / data file locations (alphabetically sorted) 55 | ## 56 | # Member database model / data file locations (alphabetically sorted) 57 | coils.new_coil.mat.path.2.2=data/coils/2.2/new_coil.mat 58 | gene3d.hmm.path.3.5.0=data/gene3d/3.5.0/gene3d_classified.hmm 59 | gene3d.model2sf_map.path.3.5.0=data/gene3d/3.5.0/model_to_family_map.csv 60 | hamap.profile.models.path.201311.27=data/hamap/201311.27/hamap.prf 61 | # It is IMPORTANT to set this temporary directory to a directory on LOCAL disk - 62 | # network IO will slow the panther analysis down considerably. 63 | panther.temporary.file.directory=/tmp/ 64 | panther.models.dir.9.0=data/panther/9.0/model 65 | Pfam-A.hmm.path.27.0=data/pfam/27.0/Pfam-A.hmm 66 | Pfam-A.seed.path.27.0=data/pfam/27.0/Pfam-A.seed 67 | Pfam-A.hmm.path.26.0=data/pfam/26.0/Pfam-A.hmm 68 | Pfam-A.seed.path.26.0=data/pfam/26.0/Pfam-A.seed 69 | Pfam-C.path.27.0=data/pfam/27.0/Pfam-C 70 | #Version 2.84 71 | pirsf.hmm.bin.path.2.84=data/pirsf/2.84/sf_hmm.bin 72 | pirsf.hmm.subf.bin.path.2.84=data/pirsf/2.84/sf_hmm_subf.bin 73 | pirsf.hmm.path.2.84=data/pirsf/2.84/sf_hmm 74 | pirsf.hmm.subf.path.2.84=data/pirsf/2.84/sf_hmm_subf 75 | pirsf.dat.path.2.84=data/pirsf/2.84/pirsf.dat 76 | pirsf.sf.tb.path.2.84=data/pirsf/2.84/sf.tb 77 | pirsf.sf.seq.path.2.84=data/pirsf/2.84/sf.seq 78 | 79 | prints.kdat.path.42.0=data/prints/42.0/prints42_0.kdat 80 | prints.pval.path.42.0=data/prints/42.0/prints.pval 81 | prints.hierarchy.path.42.0=data/prints/42.0/FingerPRINTShierarchy.db 82 | prodom.ipr.path.2006.1=data/prodom/2006.1/prodom.ipr 83 | prosite.models.path.20.97=data/prosite/20.97/prosite.dat 84 | prosite.evaluator.models.path.20.97=data/prosite/20.97/evaluator.dat 85 | signalp.4.0.perl.library.dir=bin/signalp/4.0/lib 86 | # Note: Smart overlapping and threshold files not distributed with InterProScan 5, please install separately e.g. 
in data/smart/6.2 87 | smart.hmm.path.6.2=data/smart/6.2/smart.HMMs 88 | smart.hmm.bin.path.6.2=data/smart/6.2/smart.HMMs.bin 89 | smart.overlapping.path.6.2= 90 | smart.threshold.path.6.2= 91 | superfamily.hmm.path.3.0=data/superfamily/1.75/hmmlib_1.75 92 | superfamily.self.hits.path.1.75=data/superfamily/1.75/self_hits.tab 93 | superfamily.cla.path.1.75=data/superfamily/1.75/dir.cla.scop.txt_1.75 94 | superfamily.model.tab.path.1.75=data/superfamily/1.75/model.tab 95 | superfamily.pdbj95d.path.1.75=data/superfamily/1.75/pdbj95d 96 | tigrfam.hmm.path.13.0=data/tigrfam/13.0/TIGRFAMs_13.0_HMM.LIB 97 | # Note: TMHMM model files not distributed with InterProScan 5, please install separately e.g. in data/tmhmm/2.0/TMHMM2.0.model 98 | tmhmm.model.path= 99 | 100 | ## 101 | ## cpu options for parallel processing 102 | ## 103 | 104 | #hmmer cpu options for the different jobs 105 | hmmer3.hmmsearch.cpu.switch.pfama=--cpu 4 106 | hmmer3.hmmsearch.cpu.switch.tigrfam=--cpu 4 107 | hmmer3.hmmsearch.cpu.switch.gene3d=--cpu 4 108 | hmmer3.hmmsearch.cpu.switch.superfamily=--cpu 4 109 | 110 | hmmer2.hmmpfam.cpu.switch.smart=--cpu 3 111 | hmmer2.hmmpfam.cpu.switch.pirsf=--cpu 4 112 | 113 | #blastall cpu options 114 | blastall.cpu.switch.pirsf=-a 4 115 | 116 | #panther binary cpu options (for blastall and hmmsearch) 117 | panther.binary.cpu.switch=-c 4 118 | 119 | #pirsf binary cpu options (for hmmscan) 120 | pirsf.pl.binary.cpu.switch=-cpu 4 121 | 122 | 123 | ## 124 | ## max number of proteins per analysis batch 125 | ## 126 | # These values control the maximum number of proteins put through 127 | # an analysis in one go - different algorithms have different optimum values. 128 | # Note that if you suffer from out of memory errors, reducing these values 129 | # will almost certainly help, but may reduce the speed of analysis. 130 | analysis.max.sequence.count.TMHMM=100 131 | analysis.max.sequence.count.PANTHER=100 132 | analysis.max.sequence.count.SMART=50 133 | analysis.max.sequence.count.TIGRFAM_9=50 134 | analysis.max.sequence.count.TIGRFAM_10=100 135 | analysis.max.sequence.count.GENE3D=50 136 | analysis.max.sequence.count.PRINTS=100 137 | analysis.max.sequence.count.PROSITE_PROFILES=100 138 | analysis.max.sequence.count.PROSITE_PATTERNS=100 139 | analysis.max.sequence.count.PIRSF=50 140 | analysis.max.sequence.count.PRODOM=100 141 | analysis.max.sequence.count.SSF=50 142 | analysis.max.sequence.count.HAMAP=100 143 | analysis.max.sequence.count.PFAM_A=100 144 | analysis.max.sequence.count.COILS=100 145 | analysis.max.sequence.count.PHOBIUS=100 146 | analysis.max.sequence.count.SIGNALP=100 147 | 148 | ## 149 | ## General settings 150 | ## 151 | 152 | # If multiple hosts are sharing the same file system, a delay may be required to 153 | # avoid stale NFS handles 154 | # nfs.delay.milliseconds=0 155 | 156 | # Instructs I5 to completely clean up after itself - leave set to true. 157 | delete.temporary.directory.on.completion=true 158 | 159 | ## 160 | ## Broker TCP Connection 161 | ## 162 | 163 | # A list of TCP ports that should not be used for messaging. (Apart from this, only ports > 1024 and < 65535 will be used.) 164 | tcp.port.exclusion.list=3879,3878,3881,3882 165 | 166 | ## 167 | ## precalculated match lookup service 168 | ## 169 | # By default, if the sequence already has matches available from the EBI, this service will look them 170 | # up for you. Note - at present it will always return all the available matches, ignoring any -appl options 171 | # set on the command line. 
172 | precalculated.match.lookup.service.url=http://www.ebi.ac.uk/interpro/match-lookup 173 | 174 | #proxy set up 175 | precalculated.match.lookup.service.proxy.host= 176 | precalculated.match.lookup.service.proxy.port=3128 177 | 178 | 179 | ## 180 | ## getorf configuration for nucleic acid sequences 181 | ## 182 | # the following are roughly the times getorf takes to find sequences of open reading frames (ORFs) in n nucleotide sequences 183 | #number of sequences -> approx. time it takes in our tests 184 | # 600000 -> 10 minutes 185 | # 3600000 -> 1 hour 186 | # 7200000 -> 2 hours 187 | # 43200000 -> 12 hours 188 | 189 | # JOB: jobLoadNucleicAcidSequence 190 | getorf.minsize=50 191 | 192 | ## 193 | ## Output format 194 | ## 195 | # TRUE by default, which means all generated graphical output documents (only SVG at the moment) will be archived (using the Linux command tar). 196 | # This simple switch allows you to switch the archive mode off (simply set it to FALSE). 197 | archiveSVGOutput=true 198 | 199 | ## 200 | ## Master/Stand alone embedded workers 201 | ## 202 | 203 | # Set the number of embedded workers to the number of processors that you would like to employ 204 | # on the machine you are using to run InterProScan. 205 | #number of embedded workers a master process can have 206 | number.of.embedded.workers=1 207 | maxnumber.of.embedded.workers=2 208 | 209 | ## 210 | ## Distributed mode (Cluster mode) 211 | ## 212 | 213 | #grid name 214 | grid.name=lsf 215 | #grid.name=other-cluster 216 | 217 | #project name for this run - use user.digest 218 | user.digest=i5GridRun 219 | 220 | #grid jobs limit : number of jobs you are allowed to run on the cluster 221 | grid.jobs.limit=1000 222 | 223 | 224 | #time between each bjobs or qstat command to check the status of jobs on the cluster 225 | grid.check.interval.seconds=120 226 | 227 | #allow master interproscan to run binaries () 228 | master.can.run.binaries=true 229 | 230 | #deal with unknown step states 231 | recover.unknown.step.state=false 232 | 233 | #Grid submission commands (e.g. LSF bsub or SGE qsub) for starting remote workers 234 | #commands the master uses to start new remote workers 235 | grid.master.submit.command=bsub -q QUEUE_NAME 236 | grid.master.submit.high.memory.command=bsub -q QUEUE_NAME -M 8192 237 | 238 | #commands a worker uses to start new remote workers 239 | grid.worker.submit.command=bsub -q QUEUE_NAME 240 | grid.worker.submit.high.memory.command=bsub -q QUEUE_NAME -M 8192 241 | 242 | # command to start a new worker (new jvm) 243 | worker.command=java -Xms32m -Xmx2048m -jar interproscan-5.jar 244 | # This may be identical to the worker.command argument above, however you may choose to select 245 | # a machine with a much larger available memory, for use when a StepExecution fails. 246 | worker.high.memory.command=java -Xms32m -Xmx2048m -jar interproscan-5.jar 247 | 248 | # Set the number of embedded workers to the number of processors that you would like to employ 249 | # on the node machine on which the worker will run. 250 | #number of embedded workers in a remote worker 251 | worker.number.of.embedded.workers=1 252 | worker.maxnumber.of.embedded.workers=4 253 | 254 | # max number of connections to the master 255 | master.maxconsumers=64 256 | 257 | #number of connections to the worker 258 | worker.maxconsumers=32 259 | 260 | #throttled network? 
261 | grid.throttle=true 262 | 263 | # max number of jobs a tier 1 worker is allowed on its queue 264 | worker.maxunfinished.jobs=64 265 | 266 | #network tier depth 267 | max.tier.depth=1 268 | 269 | # Active MQ JMS broker temporary data directory 270 | jms.broker.temp.directory=activemq-data/localhost/tmp_storage 271 | -------------------------------------------------------------------------------- /Dockerfiles/jamg/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -qqy build-essential cdbfasta ncbi-blast+ snap git 6 | 7 | # Insall Augustus 8 | ADD http://bioinf.uni-greifswald.de/augustus/binaries/augustus-3.1.tar.gz /opt/ 9 | RUN cd /opt && \ 10 | tar -xzvf augustus* && \ 11 | rm -rf *.tar.gz && \ 12 | mv augustus* augustus && \ 13 | cd augustus && \ 14 | make 15 | 16 | ENV AUGUSTUS_CONFIG_PATH /opt/augustus/config 17 | 18 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/augustus/bin:/opt/augustus/scripts 19 | 20 | WORKDIR /opt 21 | RUN apt-get install zlib1g-dev wget 22 | RUN git clone https://github.com/genomecuration/JAMg.git jamg 23 | # && cd jamg \ 24 | # && make all 25 | 26 | 27 | # gmap 28 | 29 | # augustus 30 | 31 | # gff2gbSmallDNA.pl 32 | 33 | # etraining 34 | 35 | # filterGenes.pl 36 | -------------------------------------------------------------------------------- /Dockerfiles/ncbi-blast/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -qqy ncbi-blast+ 6 | -------------------------------------------------------------------------------- /Dockerfiles/pasa/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update -qq && apt-get install -qqy build-essential 6 | 7 | # Install Gmap 8 | WORKDIR /opt 9 | ADD http://research-pub.gene.com/gmap/src/gmap-gsnap-2015-07-23.tar.gz ./ 10 | RUN tar -xvf gmap*.tar.gz && rm gmap*.tar.gz && mv gmap* gmap && cd gmap && ./configure && make 11 | RUN cd gmap && make install 12 | 13 | # Install Fasta aligner 14 | RUN apt-get install -qqy zlib1g-dev 15 | ADD http://faculty.virginia.edu/wrpearson/fasta/fasta36/fasta-36.3.8.tar.gz ./ 16 | RUN tar -xvf fasta*.tar.gz && rm fasta*.tar.gz && mv fasta* fasta && cd fasta/src && make -f ../make/Makefile.linux64 17 | 18 | # Install blat aligner 19 | RUN apt-get install -qqy unzip libpng-dev 20 | ENV MACHTYPE=x86_64 21 | RUN mkdir -p ~/bin/$MACHTYPE 22 | ADD http://hgwdev.cse.ucsc.edu/~kent/src/blatSrc35.zip ./ 23 | RUN unzip blat* && rm *.zip && mv blat* blat && cd blat && make 24 | 25 | # Install DBD::mysql, etc 26 | RUN apt-get install -qqy liburi-escape-xs-perl liburi-perl mysql-client libdbd-mysql-perl 27 | 28 | # Install PASA 29 | ADD https://github.com/PASApipeline/PASApipeline/archive/v2.0.2.tar.gz ./ 30 | RUN tar -xvf *.tar.gz && rm *.tar.gz && mv PASA* pasa && cd pasa && make 31 | ADD conf.txt /opt/pasa/pasa_conf/ 32 | ENV PASAHOME=/opt/pasa 33 | 34 | # Final PATH 35 | ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/fasta/bin:/root/bin/$MACHTYPE:/opt/blat/:/opt/fasta/bin:$PASAHOME/bin:$PASAHOME/scripts:/opt/seqclean 36 | 37 | 38 | -------------------------------------------------------------------------------- /Dockerfiles/pasa/Dockerfile-pasaweb: 
-------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update -qq 6 | 7 | # Install DBD::mysql and apache 8 | RUN apt-get install -qqy liburi-escape-xs-perl liburi-perl mysql-client libdbd-mysql-perl build-essential zlib1g-dev libgd-perl apache2 libgd-graph-perl 9 | 10 | # Install PASA 11 | WORKDIR /usr/lib/cgi-bin 12 | ADD https://github.com/PASApipeline/PASApipeline/archive/v2.0.2.tar.gz ./ 13 | RUN tar -xvf *.tar.gz && rm *.tar.gz && mv PASA* pasa && cd pasa && make && chmod -R 755 . 14 | ADD conf.txt /usr/lib/cgi-bin/pasa/pasa_conf/ 15 | ENV PASAHOME=/usr/lib/cgi-bin/pasa 16 | 17 | # Final PATH 18 | ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PASAHOME/bin 19 | 20 | ENV APACHE_RUN_USER www-data 21 | ENV APACHE_RUN_GROUP www-data 22 | ENV APACHE_PID_FILE /var/run/apache2/apache2.pid 23 | ENV APACHE_RUN_DIR /var/run/apache2 24 | ENV APACHE_LOCK_DIR /var/lock/apache2 25 | ENV APACHE_LOG_DIR /var/log/apache2 26 | 27 | RUN a2enmod cgi 28 | 29 | EXPOSE 80 30 | 31 | CMD ["apache2", "-DFOREGROUND"] 32 | 33 | 34 | -------------------------------------------------------------------------------- /Dockerfiles/pasa/conf.txt: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | ## PASA admin settings ############## 4 | ##################################### 5 | 6 | #emails sent to admin on job launch, success, and failure 7 | PASA_ADMIN_EMAIL=bhaas@tigr.org 8 | 9 | # database to manage pasa jobs; required for daemon-based processing. 10 | PASA_ADMIN_DB=PASA2_admin_06152006_devel 11 | 12 | # At TIGR, we setup the PASA mysql databases under separate project quotas in different areas of the 13 | # file system, and then provide a symlink under the mysql data or lib area. 14 | # see below for info on setting up hooks. By default, simply keep the value below at false. 15 | USE_PASA_DB_SETUP_HOOK=false 16 | 17 | 18 | ##################################### 19 | ## MySQL settings: ################## 20 | ##################################### 21 | 22 | # server actively running MySQL 23 | MYSQLSERVER=db 24 | 25 | # read-only username and password 26 | MYSQL_RO_USER=root 27 | MYSQL_RO_PASSWORD=password 28 | 29 | # read-write username and password 30 | MYSQL_RW_USER=root 31 | MYSQL_RW_PASSWORD=password 32 | 33 | 34 | ############################################ 35 | # Web browser navigation settings: ######### 36 | ############################################ 37 | 38 | BASE_PASA_URL=http://bhaas-lx:8080/cgi-bin/ 39 | 40 | 41 | ############################################# 42 | ## Transcript Sequence Cleaning ############# 43 | ############################################# 44 | VECTOR_DB=/usr/local/db/vector/UniVec 45 | 46 | 47 | 48 | 49 | ############################################# 50 | ## Hooks #################################### 51 | ############################################# 52 | # Hooks are provided to allow custom code to be called for the following 53 | # routines. 54 | # The methods must be fully qualified with their module name as static methods. 55 | # Before calling the method, the HOOK_PERL_LIBS path listing is added to the Perl Lib path. 56 | # The first parameter value provided to the methods is the PASA_conf hash reference, that provides 57 | # the key value pairs for all entries in this conf file. 
58 | # A single custom parameter value can be added as the value to the 59 | # {Package::method}~EXTRA_PARAM key (see example below). This single param 60 | # can be packed with any delimiter so that the user can encode several attributes. 61 | # the special variable __PASAHOME__ can be used and will be replaced by the PASA installation directory value. 62 | # To access the hooks, use the &Pasa_conf::call_hook() method. 63 | 64 | 65 | # comma delimit paths to be added to the perl lib path so the hook modules can be found. 66 | HOOK_PERL_LIBS=__PASAHOME__/SAMPLE_HOOKS 67 | 68 | ############ 69 | # hook that relocates the mysql database to our filesystem 70 | ##### (no such sillyness here): HOOK_PASA_DB_SETUP=Tigr_hook_routines::copy_template_db_and_symlink 71 | ##### encode some extra info in the extra parameter available: 72 | ##### Tigr_hook_routines::copy_template_db_and_symlink~EXTRA_PARAM=/export/home/software/mysql/data 73 | 74 | 75 | ############ 76 | # hook that commits updated gene structures to the annotation database 77 | HOOK_GENE_STRUCTURE_UPDATER=Sample_annot_updater::get_updater_obj 78 | 79 | 80 | ############ 81 | # hook that loads the latest gene structure annotations from an external source 82 | # ie. from gff files or from a relational database. 83 | # you build your adapter based on your data format preference. 84 | HOOK_EXISTING_GENE_ANNOTATION_LOADER=GFF3::GFF3_annot_retriever::get_annot_retriever 85 | -------------------------------------------------------------------------------- /Dockerfiles/pfam/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -yqq hmmer unzip wget 6 | 7 | WORKDIR /opt 8 | RUN wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam28.0/Pfam-A.hmm.gz && gunzip *.gz && hmmpress Pfam-A.hmm && rm Pfam-A.hmm 9 | 10 | -------------------------------------------------------------------------------- /Dockerfiles/proteinortho/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -qqy build-essential ncbi-blast+ python perl tree 6 | 7 | ADD http://www.bioinf.uni-leipzig.de/Software/proteinortho/proteinortho_v5.11.tar.gz /opt/ 8 | RUN cd /opt && \ 9 | tar -xzvf proteinortho_*.tar.gz && \ 10 | rm -rf *.tar.gz && \ 11 | mv proteinortho_v5.11 proteinortho 12 | RUN cd /usr/local/bin && find /opt/proteinortho -type f -executable | xargs -I{} ln -s {} . 
13 | 14 | CMD ["/opt/proteinortho/proteinortho5.pl"] 15 | -------------------------------------------------------------------------------- /Dockerfiles/tophat/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -qqy python 6 | 7 | WORKDIR /opt 8 | ADD https://ccb.jhu.edu/software/tophat/downloads/tophat-2.1.0.Linux_x86_64.tar.gz ./ 9 | RUN tar -xzvf *.tar.gz && rm *.tar.gz && mv tophat* tophat 10 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/augustus/bin:/opt/tophat 11 | -------------------------------------------------------------------------------- /Dockerfiles/trinity/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update -qq && apt-get install -qqy build-essential zlib1g-dev libncurses5-dev 6 | 7 | WORKDIR /opt/ 8 | ADD http://downloads.sourceforge.net/project/samtools/samtools/0.1.19/samtools-0.1.19.tar.bz2 /opt/ 9 | RUN tar -xvf samtools* && rm *.bz2 && mv samtools* samtools && cd samtools && make 10 | 11 | RUN apt-get install -qqy unzip 12 | ADD http://downloads.sourceforge.net/project/bowtie-bio/bowtie/1.1.2/bowtie-1.1.2-linux-x86_64.zip /opt/ 13 | RUN unzip bowtie* && rm *.zip && mv bowtie* bowtie 14 | 15 | 16 | RUN apt-get install -qqy curl openjdk-7-jre 17 | ADD https://github.com/trinityrnaseq/trinityrnaseq/archive/v2.0.6.tar.gz /opt/ 18 | RUN tar -xvf *.tar.gz && rm *.tar.gz && mv trinity* trinity && cd trinity && make 19 | 20 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/trinity:/opt/samtools:/opt/bowtie 21 | 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Robert Syme 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nextflow-annotate 2 | 3 | This is a push to gather together some tools that are helpful for 4 | genome annotation, and serve as a forkable, version-controlled, 5 | reusable, and citable record of our pipeline. 
The steps use nextflow 6 | as a workflow engine so we can abstract the individual steps from 7 | their execution environment (SGE, MPI or simple local multithreading). 8 | 9 | This is not a push-button solution, but it can serve as a starting 10 | point for annotating your new genome. 11 | 12 | ## Prerequisites 13 | 14 | The minimum prerequisites are [docker](http://docker.io) and 15 | [nextflow](http://nextflow.io), and a fasta file (henceforth 16 | `scaffolds.fasta`) of your genome assembly. 17 | 18 | Some steps require software or data with licences that restrict 19 | distribution, but I've kept them to a minimum and will make it clear 20 | when those pieces are necessary. 21 | 22 | ## Steps 23 | 24 | Each of these steps corresponds to one of the nextflow recipes 25 | provided by this repository. 26 | 27 | ### Transposon Identification 28 | 29 | Taking cues from [jamg](http://jamg.sourceforge.net), we transcribe 30 | all of the open reading frames and then use hhblit to match against a 31 | database of known transposons. A GFF file is produced that describes 32 | to position of the transposons that we find. 33 | 34 | This uses two docker images, which will be pulled automatically from 35 | the docker registry as needed. 36 | 37 | ### Finding Repeats 38 | Repeats are an important part of the final genome annotation. I 39 | recommend a two-step process: 40 | 41 | 1. Find denovo repeats with RepeatScout. 42 | 2. Use the RepeatScout output in conjuctions with the latest RepBase 43 | library as input to RepeatMasker 44 | 45 | I've taken care of the RepeatScout and RepeatMasker installation by 46 | bundling them as docker images. The only hiccup is that RepBase 47 | requires registration. 48 | -------------------------------------------------------------------------------- /annotate: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | params.genomes = 'data/**/tmp.fasta' 4 | genomes = Channel.fromPath(params.genomes) 5 | 6 | params.homologyProbabilityCutoff = 70 7 | params.evalueCutoff = 1e-3 8 | params.pvalueCutoff = 1e-6 9 | params.scoreCutoff = 100 10 | params.alignmentLengthCutoff = 50 11 | params.templateLengthCutoff = 30 12 | 13 | def toFasta(defLine, sequence, width=80) { 14 | return (">" + defLine + "\n" + wrapString(sequence, width) + "\n") 15 | } 16 | 17 | def wrapString(text, width=80) { 18 | def out = [] 19 | while(text.length() > width) { 20 | out += text[0..(width-1)] 21 | text = text[width..-1] 22 | } 23 | out += text 24 | return out.join("\n") 25 | } 26 | 27 | class HHRHit { 28 | float probability 29 | float evalue 30 | float pvalue 31 | float score 32 | float structureScore 33 | String queryName 34 | String queryStart 35 | String queryEnd 36 | String revString 37 | String strainName 38 | String description 39 | Integer alignmentLength 40 | Integer aaStart 41 | Integer aaStop 42 | Integer hitStart 43 | Integer hitStop 44 | Integer templateSize 45 | 46 | HHRHit(String result) { 47 | // Pull out the query information. It will look something like: 48 | // Query Scaffold_1_318 [160485 - 161177] Length:352063 [Ascochyta_fabae_Af1] 49 | (queryName, queryStart, queryEnd, revString, strainName) = (result =~ (/Query\s+(\S+)_\d+ \[(\d+) - (\d+)\] (\(REVERSE SENSE\))?.* \[(.*)\]/))[0][1..-1] 50 | 51 | // Find the top hit. 
The line will look something like: 52 | // 1 GB:CAA29181 ORF 1 (LINE-elemen 99.9 1.4E-28 4.1E-32 243.9 0.0 61 2-69 1216-1281(1650) 53 | def hitData = (result =~ /\s+1\s+(.{30})\s+(\d+\.?\d*)\s+(\d+\.?\d*E?-?\d*)\s+(\d+\.?\d*E?-?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+)\s+(\d+)-(\d+)\s+(\d+)-(\d+)\s*\((\d+)\)/) 54 | if (hitData.size() == 0) { 55 | // It's likely that many of the orfs won't have any hits. In 56 | // this case, just return a 'hit' with score zero 57 | score = 0 58 | structureScore = 0 59 | } else { 60 | // If we do find a hit, return a new HHRHit instance. 61 | description = hitData[0][1] 62 | probability = hitData[0][2].toFloat() 63 | evalue = hitData[0][3].toFloat() 64 | pvalue = hitData[0][4].toFloat() 65 | score = hitData[0][5].toFloat() 66 | structureScore = hitData[0][6].toFloat() 67 | alignmentLength = hitData[0][7].toInteger() 68 | aaStart = hitData[0][8].toInteger() 69 | aaStop = hitData[0][9].toInteger() 70 | hitStart = hitData[0][10].toInteger() 71 | hitStop = hitData[0][11].toInteger() 72 | templateSize = hitData[0][12].toInteger() 73 | } 74 | } 75 | 76 | String toString() { 77 | def out = [] 78 | out.push "Query: " + queryName + " (" + queryStart + "-" + queryEnd + ") [" + strainName + "]" 79 | out.push " Description: '" + description + "'" 80 | out.push " Probability: " + probability 81 | out.push " E-value: " + evalue 82 | out.push " P-value: " + pvalue 83 | out.push " Score: " + score 84 | return out.join("\n") 85 | } 86 | 87 | String toGFF3() { 88 | def hitID = description.split()[0] 89 | def uid = "${hitID}.s${hitStart}.e${hitStop}" 90 | def out = [] 91 | out.push queryName 92 | out.push 'hhblits' 93 | out.push 'protein_match' 94 | out.push queryStart + 3 * aaStart - 1 95 | out.push queryStart + 3 * aaStop - 1 96 | out.push score 97 | out.push revString ? "-" : "+" 98 | out.push "." 99 | out.push "ID=${uid};Name=${uid};Target=${hitID} $hitStart $hitStop" 100 | println "DONE: ${out.join('\t')}" 101 | out.join("\t") 102 | } 103 | 104 | String toHints() { 105 | def out = [] 106 | out.push queryName 107 | out.push 'hhblits' 108 | out.push "nonexonpart" 109 | out.push queryStart + 3 * aaStart - 1 110 | out.push queryStart + 3 * aaStop - 1 111 | out.push score 112 | out.push revString ? "-" : "+" 113 | out.push "." 114 | out.push "source=RM;grp=${description.split()[0]};pri=6" 115 | out.join("\t") 116 | } 117 | 118 | String toGeneID() { 119 | def out = [] 120 | out.push queryName 121 | out.push 'hhblits' 122 | out.push 'sr' 123 | out.push queryStart + 3 * aaStart - 1 124 | out.push queryStart + 3 * aaStop - 1 125 | out.push score 126 | out.push revString ? "-" : "+" 127 | out.push '.' 
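        // Note: geneid evidence lines carry no ninth (attributes) column, so the
        // record is joined immediately after the frame field pushed above.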
128 | out.join("\t") 129 | } 130 | } 131 | 132 | process cleanGenome { 133 | input: 134 | val genome from genomes 135 | 136 | output: 137 | set name, stdout into cleanGenomes 138 | 139 | script: 140 | name = genome.getParent().getBaseName() 141 | 142 | """ 143 | awk '/^>/ && !/[.*]/ {print(\$0, "[$name]")} /^>/ && /[.*]/ {print \$0} /^[^>]/ {print(toupper(\$0))}' '$genome' 144 | sed -ie "s/\015//" "$genome" 145 | """ 146 | } 147 | 148 | process getorf { 149 | container 'robsyme/emboss' 150 | 151 | input: 152 | set name, 'maskedGenome' from cleanGenomes 153 | 154 | output: 155 | file 'orfs.aa.fasta' into orfFiles 156 | 157 | """ 158 | getorf -sequence $maskedGenome -outseq orfs.aa.fasta -minsize 150 -find 1 159 | """ 160 | } 161 | 162 | cleanOrfs = orfFiles.splitFasta(record: [header: true, seqString: true]) 163 | .filter { record -> 164 | xCount = record.seqString.count('X') 165 | length = record.seqString.size() 166 | xCount / length < 0.3 167 | } 168 | .map { record -> 169 | record.seqString = record.seqString.replaceAll('X','') 170 | return toFasta(record.header, record.seqString) 171 | } 172 | 173 | process hhblit { 174 | container 'robsyme/hhblits' 175 | 176 | input: 177 | file 'orf.fasta' from cleanOrfs 178 | 179 | output: 180 | stdout into hhblitOutput 181 | 182 | """ 183 | hhblits -i orf.fasta -d /databases/transposons -maxmem 5 -cpu 1 -o stdout -e 1E-5 -E 1E-5 -id 80 -p 80 -z 0 -b 0 -B 3 -Z 3 -n 1 -mact 0.5 -v 0 184 | """ 185 | } 186 | 187 | transposonGFFLines = Channel.create() 188 | transposonHintLines = Channel.create() 189 | transposonGeneIDLines = Channel.create() 190 | 191 | hhblitOutput 192 | .map { String result -> new HHRHit(result) } 193 | .filter { it.score > 0 } 194 | .filter { it.probability > params.homologyProbabilityCutoff } 195 | .filter { it.evalue < params.evalueCutoff } 196 | .filter { it.pvalue < params.pvalueCutoff } 197 | .filter { it.alignmentLength > params.alignmentLengthCutoff } 198 | .filter { it.templateSize > params.templateLengthCutoff } 199 | .separate(transposonGFFLines, transposonHintLines, transposonGeneIDLines) { [ it.toGFF3(), it.toHints(), it.toGeneID() ] } 200 | 201 | transposonGFF = transposonGFFLines.collectFile(name: 'transposon_hits.gff3') 202 | 203 | process sortTransposonHits { 204 | input: 205 | file 'gff' from transposonGFF 206 | 207 | output: 208 | stdout into transposonSortedGFF 209 | 210 | """ 211 | sort -nk 4,4 $gff | sort -sk 1,1 212 | """ 213 | } 214 | 215 | transposonSortedGFF.subscribe { 216 | it.moveTo('./') 217 | } 218 | -------------------------------------------------------------------------------- /bin/GG_trinity_accession_incrementer.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | counter = 0 3 | while ARGF.gets 4 | if $_ =~ /^>GG\d\+\|(.*)\n/ 5 | puts ">GG%d|%s" % [counter += 1, $1] 6 | else 7 | puts $_ 8 | end 9 | end 10 | -------------------------------------------------------------------------------- /bin/augustus_RNAseq_hints.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | =pod 4 | 5 | =head1 TODO 6 | 7 | merge all rnseq hints at the end. maintain grp 8 | run test with 100 genes, not 390+ 9 | 10 | 11 | =head1 NAME 12 | 13 | augustus_RNAseq_hints.pl 14 | 15 | =head1 USAGE 16 | 17 | Create hint files for Augustus using RNASeq/EST. 
One is junction reads (excellent for introns), the other is RNASeq/EST coverage 18 | 19 | Mandatory options: 20 | 21 | -bam|in s The input BAM file (co-ordinate sorted). 22 | -genome|fasta s The genome assembly FASTA file. 23 | 24 | Other options: 25 | 26 | -strandness i If RNAseq is directional, provide direction: 0 for unknown (default); or 1 for + strand; -1 for - strand 27 | -min_score i Minimum score for parsing (defaults to 20) 28 | -window i Window size for coverage graph (defaults to 50) 29 | -background_fold i Background (defaults to 4), see perldoc 30 | -no_hints Don't create hints file for Augustus, just process junction reads 31 | 32 | =head1 DESCRIPTION 33 | 34 | Background: The problem of getting the intron boundary correct is that rnaseq doesn't go to 0 at the intron, but continues at a background level. 35 | For that reason, stop if it is -background_fold times lower than a previous 'good' value 36 | 37 | 38 | =head1 AUTHORS 39 | 40 | Alexie Papanicolaou 41 | 42 | CSIRO Ecosystem Sciences 43 | alexie@butterflybase.org 44 | 45 | =head1 DISCLAIMER & LICENSE 46 | 47 | Copyright 2012-2014 the Commonwealth Scientific and Industrial Research Organization. 48 | See LICENSE file for license info 49 | It is provided "as is" without warranty of any kind. 50 | 51 | =cut 52 | 53 | use strict; 54 | use warnings; 55 | use Data::Dumper; 56 | use Getopt::Long; 57 | use List::Util qw(sum); 58 | use Pod::Usage; 59 | use File::Basename; 60 | use FindBin qw($RealBin); 61 | use lib ("$RealBin/../PerlLib"); 62 | $ENV{PATH} .= ":$RealBin:$RealBin/../3rd_party/bin/"; 63 | 64 | my ( $samtools_exec, $bedtools_exec, $bed_to_aug_script ) = &check_program( 'samtools', 'bedtools','bed12_to_augustus_junction_hints.pl' ); 65 | 66 | 67 | 68 | #Options 69 | my ( @bamfiles, $genome, $help,$no_hints ); 70 | my $window = 50; 71 | my $min_score = 20; 72 | my $strandness = int(0); 73 | my $background_level = 4; 74 | pod2usage $! unless &GetOptions( 75 | 'help' => \$help, 76 | 'bam|in:s{,}' => \@bamfiles, 77 | 'genome|fasta:s' => \$genome, 78 | 'min_score:i' => \$min_score, 79 | 'strandness:i' => \$strandness, 80 | 'window:i' => \$window, 81 | 'background_fold:i' => \$background_level, 82 | 'nohints|no_hints' => \$no_hints 83 | ); 84 | 85 | pod2usage if $help; 86 | 87 | pod2usage "Cannot find the BAM or genome FASTA file\n" 88 | unless $bamfiles[0] 89 | && -s $bamfiles[0] 90 | && $genome 91 | && ( -s $genome || -s $genome . '.fai' ); 92 | 93 | my $strand; 94 | if ( !$strandness || $strandness == 0 ) { 95 | $strand = '.'; 96 | } 97 | elsif ( $strandness > 0 ) { 98 | $strand = '+'; 99 | } 100 | elsif ( $strandness < 1 ) { 101 | $strand = '-'; 102 | } 103 | else { 104 | die; 105 | } 106 | 107 | my $master_bamfile; 108 | if (scalar(@bamfiles == 1)){ 109 | $master_bamfile = $bamfiles[0]; 110 | }else{ 111 | foreach my $bamfile (@bamfiles){ 112 | die "Cannot find $bamfile\n" unless -s $bamfile; 113 | } 114 | $master_bamfile = 'master_bamfile.bam'; 115 | &process_cmd("$samtools_exec merge -r $master_bamfile ".join(" ",@bamfiles)) unless -s $master_bamfile; 116 | } 117 | 118 | &process_cmd("$samtools_exec faidx $genome") unless -s $genome . '.fai'; 119 | die "Cannot index genome $genome\n" unless -s $genome . 
'.fai'; 120 | 121 | unless (-e "$master_bamfile.junctions.completed"){ 122 | &process_cmd("$samtools_exec rmdup -S $master_bamfile - | $bedtools_exec bamtobed -bed12 | $bed_to_aug_script -prio 7 -out $master_bamfile.junctions.bed > $master_bamfile.junctions.hints" ); 123 | # For JBrowse 124 | &process_cmd("$bedtools_exec bedtobam -bed12 -g $genome.fai -i $master_bamfile.junctions.bed| $samtools_exec sort -m 1073741824 - $master_bamfile.junctions"); 125 | &process_cmd("$samtools_exec index $master_bamfile.junctions.bam"); 126 | # For Augustus 127 | &only_keep_intronic("$master_bamfile.junctions.hints"); 128 | &touch("$master_bamfile.junctions.completed"); 129 | } 130 | 131 | unless (-e "$master_bamfile.coverage.bg.completed"){ 132 | # For JBrowse 133 | &process_cmd("$bedtools_exec genomecov -split -bg -g $genome.fai -ibam $master_bamfile| sort -S 1G -k1,1 -k2,2n > $master_bamfile.coverage.bg"); 134 | &process_cmd("bedGraphToBigWig $master_bamfile.coverage.bg $genome.fai $master_bamfile.coverage.bw") if `which bedGraphToBigWig`; 135 | &touch("$master_bamfile.coverage.bg.completed"); 136 | } 137 | 138 | unless (-e "$master_bamfile.coverage.hints.completed" && !$no_hints){ 139 | &bg2hints("$master_bamfile.coverage.bg") ; 140 | &merge_hints("$master_bamfile.coverage.hints"); 141 | &touch("$master_bamfile.coverage.hints.completed"); 142 | } 143 | 144 | if ( -e "$master_bamfile.junctions.completed" 145 | && -e "$master_bamfile.coverage.hints.completed" ) 146 | { 147 | unless (-e "$master_bamfile.rnaseq.completed"){ 148 | my $augustus_script_exec = $RealBin.'/../3rd_party/augustus/scripts/join_mult_hints.pl'; 149 | if (-s $augustus_script_exec){ 150 | &process_cmd("cat $master_bamfile.junctions.hints.intronic $master_bamfile.coverage.hints| sort -S 1G -n -k 4,4 | sort -S 1G -s -n -k 5,5 | sort -S 1G -s -n -k 3,3 | sort -S 1G -s -k 1,1| $augustus_script_exec > $master_bamfile.rnaseq.hints" ); 151 | &touch("$master_bamfile.rnaseq.completed"); 152 | } 153 | } 154 | print "Done!\n"; 155 | } 156 | elsif (!$no_hints) { 157 | die "Something went wrong....\n"; 158 | }else{ 159 | print "Done, no hints were processed as requested\n"; 160 | } 161 | ### 162 | sub check_program() { 163 | my @paths; 164 | foreach my $prog (@_) { 165 | my $path = `which $prog`; 166 | pod2usage "Error, path to a required program ($prog) cannot be found\n\n" 167 | unless $path =~ /^\//; 168 | chomp($path); 169 | $path = readlink($path) if -l $path; 170 | push( @paths, $path ); 171 | } 172 | return @paths; 173 | } 174 | ### 175 | sub process_cmd { 176 | my ($cmd) = @_; 177 | print "CMD: $cmd\n"; 178 | my $ret = system($cmd); 179 | if ( $ret && $ret != 256 ) { 180 | die "Error, cmd died with ret $ret\n"; 181 | } 182 | return $ret; 183 | } 184 | 185 | sub bg2hints() { 186 | my $bg = shift; 187 | my $outfile = $bg; 188 | $outfile =~ s/.bg$/.hints/; 189 | open( IN, $bg ); 190 | my ( @array, %area ); 191 | while ( my $ln = ) { 192 | chomp($ln); 193 | my @data = split( "\t", $ln ); 194 | next unless $data[3] >= $min_score; 195 | # store data in an array 196 | for ( my $i = $data[1] ; $i <= $data[2] ; $i++ ) { 197 | # co-ords in bg are 0-based; hints/gff is 1-based 198 | $area{ $data[0] }{$i+1} = $data[3]; 199 | } 200 | } 201 | 202 | # print final area 203 | #TODO: NB this is still wrong. 
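# (descriptive note) The loop below walks each reference in ~$window bp slices,
# ends a slice early where coverage changes more than $background_level-fold
# between adjacent positions, and emits the slice's median depth (if it is at
# least $min_score) as an Augustus 'exonpart' hint with src=R and priority 4.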
204 | #~/workspace/transcripts4community/jamg/test_suite 205 | #rm -f gsnap.drosoph_50M_vs_droso_opt_temp.concordant_uniq.bam.coverage.hints.completed gsnap.drosoph_50M_vs_droso_opt_temp.concordant_uniq.bam.coverage.hints ; ../bin/augustus_RNAseq_hints.pl -dir ../3rd_party/augustus.2.7 -bam gsnap.drosoph_50M_vs_droso_opt_temp.concordant_uniq.bam -genome optimization.fasta; less gsnap.drosoph_50M_vs_droso_opt_temp.concordant_uniq.bam.coverage.hints 206 | open( OUT, ">$outfile" ); 207 | foreach my $ref ( sort { $a cmp $b } keys %area ) { 208 | my @coords = sort { $a <=> $b } ( keys %{ $area{$ref} } ); 209 | for ( my $i = $coords[0] ; $i < @coords ; $i++ ) { 210 | next if ( !$area{$ref}{$i} ); 211 | my $k = $i + $window; 212 | $k-- while ( !$area{$ref}{$k} ); 213 | next if $k == $i; 214 | my @splice; 215 | 216 | for ( my $v = $i ; $v <= $k ; $v++ ) { 217 | my $level = $area{$ref}{$v}; 218 | my $previous_level = $v eq $i ? int(0) : $area{$ref}{$v-1}; 219 | my $next_level = $v eq $k ? 1e6 : $area{$ref}{$v+1}; 220 | # the problem of getting the intron boundary correct is that 221 | # rnaseq doesn't go to 0 at the intron, but continues at a 222 | # background level. stop if it is 4 times lower than a previous 'good' value 223 | if ( 224 | !$level || 225 | ( $previous_level && ( $previous_level > ( $level * $background_level ) )) 226 | || $next_level && ($level > ( $next_level * $background_level )) 227 | ) 228 | { 229 | $k = $v - 1; 230 | last; 231 | } 232 | push( @splice, $level ); 233 | } 234 | # next if scalar(@splice) < ( $window / 2 ); 235 | my $median = &median( \@splice ); 236 | $median = $splice[0] if !$median; 237 | next unless $median && $median >= $min_score; 238 | print OUT $ref 239 | . "\tRNASeq\texonpart\t" 240 | . $i . "\t" 241 | . $k . "\t" 242 | . $median 243 | . "\t$strand\t.\tsrc=R;pri=4\n"; 244 | $i += $window ; 245 | } 246 | } 247 | 248 | close OUT; 249 | close IN; 250 | return $outfile; 251 | } 252 | 253 | sub only_keep_intronic(){ 254 | my $file = shift; 255 | my %hash; 256 | open (IN,$file); 257 | while (my $ln=){ 258 | next unless $ln=~/\tintron\t/; 259 | if ($ln=~/grp=([^;]+)/){ 260 | $hash{$1}++; 261 | } 262 | } 263 | close IN; 264 | open (IN,$file); 265 | open (OUT,">".$file.".intronic"); 266 | while (my $ln=){ 267 | if ($ln=~/\tintron\t/){ 268 | print OUT $ln ; 269 | } 270 | elsif ($ln=~/grp=([^;]+)/){ 271 | print OUT $ln if $hash{$1}; 272 | } 273 | } 274 | close IN; 275 | close OUT; 276 | } 277 | 278 | sub merge_hints(){ 279 | my $file = shift; 280 | open (IN,$file); 281 | open (OUT,">$file.merged"); 282 | my (@current_line,@previous_line); 283 | while () { 284 | @current_line = split /\t/; 285 | if (!@previous_line){ 286 | @previous_line = @current_line; 287 | }elsif(($current_line[0] eq $previous_line[0]) && ($current_line[2] eq $previous_line[2]) && 288 | (($current_line[3] >= $previous_line[3]) && ($current_line[4] <= $previous_line[4])) 289 | && ($current_line[6] eq $previous_line[6])){ 290 | # update previous_line by adding current to it 291 | chomp($previous_line[8]); 292 | $previous_line[8] =~ s/(grp=[^;]*);*//; 293 | my $grp = $1; 294 | $grp .= ';' if $grp; 295 | $grp = '' if !$grp; 296 | my ($lm,$m)=(1,1); 297 | if ($previous_line[8] =~ /mult=(\d+);/){ 298 | $lm = $1; 299 | $previous_line[8] =~ s/mult=\d+;//; 300 | } 301 | if ($current_line[8] =~ /mult=(\d+);/){ 302 | $m = $1; 303 | } 304 | $previous_line[8] = "mult=" . ($lm+$m) . ";$grp" . 
$previous_line[8]."\n"; 305 | 306 | }elsif ( 307 | !(($current_line[0] eq $previous_line[0]) && ($current_line[2] eq $previous_line[2]) && ($current_line[3] == $previous_line[3]) && ($current_line[4] == $previous_line[4]) && ($current_line[6] eq $previous_line[6])) 308 | ){ 309 | print OUT join("\t",@previous_line); 310 | @previous_line = @current_line; 311 | } 312 | 313 | else { 314 | # update previous_line by adding current to it 315 | chomp($previous_line[8]); 316 | $previous_line[8] =~ s/(grp=[^;]*);*//; 317 | my $grp = $1; 318 | $grp .= ';' if $grp; 319 | $grp = '' if !$grp; 320 | my ($lm,$m)=(1,1); 321 | if ($previous_line[8] =~ /mult=(\d+);/){ 322 | $lm = $1; 323 | $previous_line[8] =~ s/mult=\d+;//; 324 | } 325 | if ($current_line[8] =~ /mult=(\d+);/){ 326 | $m = $1; 327 | } 328 | $previous_line[8] = "mult=" . ($lm+$m) . ";$grp" . $previous_line[8]."\n"; 329 | } 330 | } 331 | print OUT join("\t",@previous_line) if (@previous_line); 332 | close IN; 333 | close OUT; 334 | unlink($file); 335 | rename($file.'.merged',$file); 336 | } 337 | 338 | sub touch() { 339 | my $file = shift; 340 | system("touch $file"); 341 | } 342 | 343 | sub mean() { 344 | return sum(@_) / @_; 345 | } 346 | 347 | sub median() { 348 | my $array_ref = shift; 349 | my @sorted = sort { $a <=> $b } @{$array_ref}; 350 | return $sorted[ int( @sorted / 2 ) ]; 351 | } 352 | -------------------------------------------------------------------------------- /bin/bed12_to_augustus_junction_hints.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | =pod 4 | 5 | =head1 USAGE 6 | 7 | This script will create a hints file for Augustus using junction reads. Junction reads are important because they annotate the introns. 8 | Give a bed12 file of junction reads (reduced with samtools dedup if possible) to get intron/exon boundary hints. See bedtools bamtobed to create the bed12 9 | 10 | example 11 | samtools rmdup -S SRR836188.coordSorted.bam - | bedtools bamtobed -bed12 | bed12_to_augustus_junction_hints.pl| ~/software/augustus/scripts/join_mult_hints.pl 12 | 13 | Options: 14 | 15 | -help This! 16 | -exon_min :i Minimum exon size (def. 50bp) 17 | -score_min :i Minimum score (def. 30) 18 | -max_exons :i Maximum number of exons that a single can span (def. 3) 19 | -min_match :i Number of min bases for each side of gap (def 20) 20 | -strandness :i If RNAseq is directional, provide direction: 0 for unknown (default); or 1 for + strand; -1 for - strand 21 | 22 | =head1 FORMATS 23 | 24 | BED12 input format 25 | 26 | 1 chrom - The name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold (e.g. scaffold10671). 27 | 2 chromStart - The starting position of the feature in the chromosome or scaffold. 28 | NB The first base in a chromosome is numbered 0. 29 | 3 chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. 30 | 4 name - Defines the name of the BED line. This label is displayed to the left of the BED line in the Genome Browser window when the track is open to full display mode or directly to the left of the item in pack mode. 31 | 5 score - A score between 0 and 1000. 
If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). This table shows the Genome Browser's translation of BED score values into shades of gray: 32 | 6 strand - Defines the strand - either '+' or '-'. 33 | 7 thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays). 34 | 8 thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays). 35 | 9 itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser. 36 | 10 blockCount - The number of blocks (exons) in the BED line. 37 | 11 blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. 38 | 12 blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. 39 | 40 | IN example 41 | scaffold_0 83 514 USI-EAS034_0010:2:97:6859:21372#0/1 40 + 83 514 255,0,0 2 119,6 0,425 42 | 43 | OUT format 44 | GFF3 with exonpart and intronpart 45 | scaffold_0 RNASeq intronpart 1262 1414 . - . src=JR;pri=5;grp=readname 46 | 47 | =head1 AUTHORS 48 | 49 | Alexie Papanicolaou 50 | 51 | CSIRO Ecosystem Sciences 52 | alexie@butterflybase.org 53 | 54 | =head1 DISCLAIMER & LICENSE 55 | 56 | Copyright 2012-2014 the Commonwealth Scientific and Industrial Research Organization. 57 | See LICENSE file for license info 58 | It is provided "as is" without warranty of any kind. 59 | 60 | 61 | =cut 62 | 63 | use strict; 64 | use warnings; 65 | use Pod::Usage; 66 | use Getopt::Long; 67 | use FindBin qw($RealBin); 68 | use lib ("$RealBin/../PerlLib"); 69 | $ENV{PATH} .= ":$RealBin:$RealBin/../3rd_party/bin/"; 70 | 71 | my $min_exon_size = 50; 72 | my $min_score = 30; 73 | my $max_exons = 3; 74 | my $min_match = 20; 75 | my ($help); 76 | my $priority = 5; 77 | my $strandness = int(0); 78 | my $bed_outfile = 'junctions.bed'; 79 | pod2usage $! unless &GetOptions( 80 | 'help' => \$help, 81 | 'exon_min:i' => \$min_exon_size, 82 | 'score_min:i' => \$min_score, 83 | 'max_exons:i' => \$max_exons, 84 | 'min_match:i' => \$min_match, 85 | 'outfile:s' => \$bed_outfile, 86 | 'priority:i' => \$priority, 87 | 'strandness:i' => \$strandness 88 | ); 89 | 90 | pod2usage if $help; 91 | 92 | my $strand; 93 | if (!$strandness || $strandness == 0 ){ 94 | $strand = '.'; 95 | }elsif ($strandness > 0){ 96 | $strand = '+'; 97 | }elsif ($strandness < 1){ 98 | $strand = '-'; 99 | }else{ 100 | die; 101 | } 102 | 103 | open( BEDJUNCTIONS, ">$bed_outfile" ); 104 | 105 | OUTER: while ( my $ln = ) { 106 | chomp($ln); 107 | my @data = split( "\t", $ln ); 108 | 109 | # too many blocks - i.e. too many exons are being linked... biologically impossible?! 
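	# ($data[9] is BED12 column 10, blockCount, using 0-based array indexing)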
110 | next if $data[9] > $max_exons; 111 | 112 | #too low score 113 | next if $data[4] < $min_score; 114 | 115 | # numbering from 1 116 | $data[1]++; 117 | $data[2]++; 118 | 119 | #remove any /1 /2 from read name 120 | $data[3] =~ s/\/[0-2]$//; 121 | my @blockSizes = split( ",", $data[10] ); 122 | my @blockStarts = split( ",", $data[11] ); 123 | die unless scalar(@blockSizes) == scalar(@blockStarts); 124 | for ( my $i = 0 ; $i < @blockStarts ; $i++ ) { 125 | next OUTER if $blockSizes[$i] < $min_match; 126 | $blockStarts[$i] += $data[1]; 127 | } 128 | if ( scalar(@blockSizes) == 1 ) { 129 | 130 | # no intron 131 | my $type = 'exonpart'; 132 | my $start = $data[1]; 133 | my $stop = $data[2]; 134 | print $data[0] 135 | . "\tRNASeq\t" 136 | . $type . "\t" 137 | . $start . "\t" 138 | . $stop . "\t" 139 | . $data[4] 140 | . "\t$strand\t.\tsrc=JR;pri=$priority;grp=" 141 | . $data[3] . ";\n"; 142 | } 143 | else { 144 | print BEDJUNCTIONS $ln . "\n"; 145 | 146 | #exons first 147 | for ( my $i = 0 ; $i < scalar(@blockStarts) ; $i++ ) { 148 | my $type = 'exonpart'; 149 | my $start = $blockStarts[$i]; 150 | my $stop = $start + $blockSizes[$i] - 1; 151 | print $data[0] 152 | . "\tRNASeq\t" 153 | . $type . "\t" 154 | . $start . "\t" 155 | . $stop . "\t" 156 | . $data[4] 157 | . "\t$strand\t.\tsrc=JR;pri=$priority;grp=" 158 | . $data[3] . ";\n"; 159 | } 160 | 161 | #introns 162 | for ( my $i = 1 ; $i < scalar(@blockStarts) ; $i++ ) { 163 | my $type = 'intron'; 164 | my $start = ( $blockStarts[ $i - 1 ] + $blockSizes[ $i - 1 ] - 1 ) + 1; 165 | my $stop = $blockStarts[$i] - 1; 166 | print $data[0] 167 | . "\tRNASeq\t" 168 | . $type . "\t" 169 | . $start . "\t" 170 | . $stop . "\t" 171 | . $data[4] 172 | . "\t$strand\t.\tsrc=JR;pri=$priority;grp=" 173 | . $data[3] . ";\n"; 174 | } 175 | } 176 | } 177 | close BEDJUNCTIONS; 178 | -------------------------------------------------------------------------------- /bin/exonerate_to_genbank.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'pp' 3 | require 'bio' 4 | require 'optparse' 5 | 6 | options = {} 7 | OptionParser.new do |opts| 8 | opts.banner = "Usage: exonerage_to_genbank.rb [options]" 9 | 10 | opts.on("-f", "--fasta genome", "Genome fasta file") do |filename| 11 | options[:fasta] = filename 12 | #TODO: Check for existance of file. 13 | end 14 | end.parse! 15 | 16 | def to_locations(match) 17 | puts match.captures.join("\t") 18 | pos = match[:target_start].to_i 19 | match[:vulgar] 20 | .split 21 | .each_slice(3) 22 | .chunk{ |type, q, t| case type; when /[MS]/; :coding; when /[5I3]/; :intron; else; :other; end} 23 | .map{ |cls, a| [cls, a.map{|type, q, t| t.to_i}.inject(:+)] } 24 | .each{|a| p a} 25 | end 26 | 27 | genome = Hash[Bio::FlatFile.open(options[:fasta]).map{|entry| [entry.entry_id,entry.naseq] }] 28 | genes = Hash.new{|h,k| h[k]=[]} 29 | 30 | while ARGF.gets 31 | next unless $_ =~ (/vulgar: (?\S+) (?\d+) (?\d+) (?.) (?\S+) (?\d+) (?\d+) (?.) (?\d+) (?.*)\n/) 32 | to_locations($~) 33 | end 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /bin/fullerCegmaGFF.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | def generate_transcript_line(id, lines) 4 | minPos = lines.min_by{|line| line[3].to_i}[3] 5 | maxPos = lines.max_by{|line| line[4].to_i}[4] 6 | f = lines.first 7 | [] << f[0] << f[1] << "mRNA" << minPos << maxPos << "." << f[6] << "." 
<< "ID=t.#{id}" 8 | end 9 | 10 | def adjust_attributes(id, line) 11 | line[8] = "ID=c.#{id};Parent=t.#{id}" 12 | line 13 | end 14 | 15 | # Read all lines of the GFF 16 | lines = ARGF 17 | .map do |line| 18 | split = line.chomp.split("\t") 19 | split[2] = "CDS" 20 | split[3] = split[3].to_i 21 | split[4] = split[4].to_i 22 | split 23 | end 24 | 25 | ## Can we sort numerically rather than alphabetically? 26 | 27 | # First we find the longest common prefix for the chromosome/scaffold names 28 | items = lines.map{|line| line[0]}.uniq 29 | prefix = '' 30 | min, max = items.sort.values_at(0, -1) 31 | min.split(//).each_with_index do |c, i| 32 | break if c != max[i, 1] 33 | prefix << c 34 | end 35 | 36 | # Then make a regular expression that matches the common prefix and then some digits 37 | re = Regexp.new(prefix << "\\d+$") 38 | 39 | # If *all* of the chromosome/scaffold names match the regular 40 | # expression, we sort on the trailing digits. Otherwise we sort alphabetically 41 | sort_alphabetical = lambda {|line| [line[0][0], line[3][0]]} 42 | sort_numeric = lambda {|line| [line[0][0].match(/\d+$/)[-1].to_i, line[0][3]]} 43 | match_method = items.all?{|item| item =~ re} ? sort_numeric : sort_alphabetical 44 | 45 | lines.sort_by{|split| split[8]} 46 | .chunk{|line| line[8]} 47 | .map do |id, lines| 48 | transcript = generate_transcript_line(id, lines) 49 | lines 50 | .map{|line| adjust_attributes(id, line)} 51 | .unshift(transcript) 52 | end 53 | .sort_by(&match_method) 54 | .each do |a| 55 | puts a.map{|line| line.join("\t")} 56 | end 57 | -------------------------------------------------------------------------------- /bin/gff2gb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Convert a GFF and associated FASTA file into GenBank format. 3 | 4 | Usage: 5 | gff_to_genbank.py 6 | """ 7 | import sys 8 | import os 9 | 10 | from Bio import SeqIO 11 | from Bio.Alphabet import generic_dna 12 | from Bio import Seq 13 | 14 | from BCBio import GFF 15 | 16 | def main(gff_file, fasta_file): 17 | out_file = "%s.gb" % os.path.splitext(gff_file)[0] 18 | fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna)) 19 | gff_iter = GFF.parse(gff_file, fasta_input) 20 | SeqIO.write(_check_gff(_fix_ncbi_id(gff_iter)), out_file, "genbank") 21 | 22 | def _fix_ncbi_id(fasta_iter): 23 | """GenBank identifiers can only be 16 characters; try to shorten NCBI. 24 | """ 25 | for rec in fasta_iter: 26 | if len(rec.name) > 16 and rec.name.find("|") > 0: 27 | new_id = [x for x in rec.name.split("|") if x][-1] 28 | print "Warning: shortening NCBI name %s to %s" % (rec.id, new_id) 29 | rec.id = new_id 30 | rec.name = new_id 31 | yield rec 32 | 33 | def _check_gff(gff_iterator): 34 | """Check GFF files before feeding to SeqIO to be sure they have sequences. 35 | """ 36 | for rec in gff_iterator: 37 | if isinstance(rec.seq, Seq.UnknownSeq): 38 | print "Warning: FASTA sequence not found for '%s' in GFF file" % ( 39 | rec.id) 40 | rec.seq.alphabet = generic_dna 41 | yield _flatten_features(rec) 42 | 43 | def _flatten_features(rec): 44 | """Make sub_features in an input rec flat for output. 45 | 46 | GenBank does not handle nested features, so we want to make 47 | everything top level. 
48 | """ 49 | out = [] 50 | for f in rec.features: 51 | cur = [f] 52 | while len(cur) > 0: 53 | nextf = [] 54 | for curf in cur: 55 | out.append(curf) 56 | if len(curf.sub_features) > 0: 57 | nextf.extend(curf.sub_features) 58 | cur = nextf 59 | rec.features = out 60 | return rec 61 | 62 | if __name__ == "__main__": 63 | main(*sys.argv[1:]) 64 | -------------------------------------------------------------------------------- /bin/gff_transpose.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'optparse' 4 | require 'ostruct' 5 | require 'pathname' 6 | require 'set' 7 | require 'bio' 8 | require 'pp' 9 | 10 | class Options 11 | def self.parse(args) 12 | options = OpenStruct.new 13 | opts = OptionParser.new do |opts| 14 | opts.banner = "Usage: #{$0} [options] --from GFF --to GFF" 15 | opts.separator "" 16 | opts.separator "Specific options:" 17 | 18 | opts.on("-f", "--from FILENAME (Required)", "GFF file with features to transpose") do |filename| 19 | path = Pathname.new(filename) 20 | if path.exist? 21 | options.from = path 22 | else 23 | $stderr.puts("ERROR: Could not find the file #{filename}") 24 | $stderr.puts opts.banner 25 | exit(1) 26 | end 27 | end 28 | 29 | opts.on("-t", "--to FILENAME (Required)", "GFF file describing where the proteins are in nucleotide coorinates") do |filename| 30 | path = Pathname.new(filename) 31 | if path.exist? 32 | options.to = path 33 | else 34 | $stderr.puts("ERROR: Could not find the file #{filename}") 35 | $stderr.puts opts.banner 36 | exit(1) 37 | end 38 | end 39 | 40 | end 41 | opts.parse!(args) 42 | 43 | unless options.from 44 | $stderr.puts "Error: No *TO* GFF3 file supplied\n" 45 | $stderr.puts opts.banner 46 | exit(1) 47 | end 48 | 49 | unless options.to 50 | $stderr.puts "Error: No *FROM* GFF3 file supplied\n" 51 | $stderr.puts opts.banner 52 | exit(1) 53 | end 54 | 55 | options 56 | end 57 | end 58 | options = Options.parse(ARGV) 59 | 60 | records_lookup = Bio::GFF::GFF3.new(File.read(options.to)) 61 | .records 62 | .find_all{ |record| record.feature == "exon" } 63 | .to_set 64 | .classify{ |record| Hash[record.attributes]["Parent"].gsub('mRNA', 'exon_') } 65 | 66 | File.open(options.from).take_while{ |line| line !~ /FASTA/ }.each do |line| 67 | next if line =~ /^#/ 68 | 69 | seqid, source, type, hit_start, hit_stop, score, strand, phase, attributes = line.chomp.split("\t") 70 | hit_start = hit_start.to_i 71 | hit_stop = hit_stop.to_i 72 | 73 | begin 74 | records = records_lookup[seqid] 75 | .sort_by{ |record| record.start } 76 | .map{ |record| record.strand = "+" unless record.strand; record } 77 | rescue 78 | $stderr.puts "\n\nCould not find lookup for '#{seqid}'" 79 | exit(1) 80 | end 81 | 82 | exon_length = records.inject(0) do |mem, record| 83 | mem += record.end - record.start + 1 84 | end 85 | 86 | ranges = case [records.first.strand,strand].join 87 | when "++" 88 | [Range.new(hit_start - 1, hit_stop)] 89 | when "-+" 90 | [Range.new(exon_length - hit_stop, exon_length - hit_start)] 91 | when "+-" 92 | [Range.new(hit_start - 1, hit_stop - 1)] 93 | when "--" 94 | [Range.new(exon_length - hit_stop + 1, exon_length - hit_start)] 95 | end 96 | .map!{ |range| Range.new(range.first + records.first.start, range.last + records.first.start)} 97 | 98 | records 99 | .each_cons(2) 100 | .map{ |a, b| Range.new(a.end + 1, b.start - 1) } 101 | .reduce(ranges) do |mem, intron| 102 | size = intron.last - intron.first + 1 103 | mem.flat_map do |range| 104 | # Is there an overlap 
between this range and an intron? 105 | if (range.first <= intron.last) and (intron.first <= range.last) 106 | # If we introduce the intron, does the shifted range still overlap the intron location? 107 | if range.first + size <= intron.last 108 | # If so, make the new intron 109 | [Range.new(range.first, intron.first - 1), Range.new(intron.last + 1, intron.last + range.last - intron.first + 1)] 110 | else 111 | # If not, we can just move the region to the right by the intron size 112 | Range.new(range.first + size, range.last + size) 113 | end 114 | elsif intron.last < range.first 115 | # If there is no overlap and the region is still to the right, we move it right. 116 | Range.new(range.first + size, range.last + size) 117 | else 118 | range 119 | end 120 | end 121 | end.each do |range| 122 | next if source == "." 123 | puts [ 124 | records.first.seqname, 125 | source, 126 | type, 127 | range.first, 128 | range.last, 129 | score, 130 | records.first.strand == strand ? "+" : "-", 131 | ".", 132 | attributes 133 | ].join("\t") 134 | end 135 | end 136 | -------------------------------------------------------------------------------- /bin/gff_transpose.rb~: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'optparse' 4 | require 'ostruct' 5 | require 'pathname' 6 | require 'set' 7 | require 'bio' 8 | 9 | 10 | class Options 11 | def self.parse(args) 12 | options = OpenStruct.new 13 | opts = OptionParser.new do |opts| 14 | opts.banner = "Usage: #{$0} [options] --from GFF --to GFF" 15 | opts.separator "" 16 | opts.separator "Specific options:" 17 | 18 | opts.on("-f", "--from FILENAME (Required)", "GFF file with features to transpose") do |filename| 19 | path = Pathname.new(filename) 20 | if path.exist? 21 | options.from = path 22 | else 23 | $stderr.puts("ERROR: Could not find the file #{filename}") 24 | $stderr.puts opts.banner 25 | exit(1) 26 | end 27 | end 28 | 29 | opts.on("-t", "--to FILENAME (Required)", "GFF file describing where the proteins are in nucleotide coorinates") do |filename| 30 | path = Pathname.new(filename) 31 | if path.exist? 
32 | options.to = path 33 | else 34 | $stderr.puts("ERROR: Could not find the file #{filename}") 35 | $stderr.puts opts.banner 36 | exit(1) 37 | end 38 | end 39 | 40 | end 41 | opts.parse!(args) 42 | 43 | unless options.from 44 | $stderr.puts "Error: No *TO* GFF3 file supplied\n" 45 | $stderr.puts opts.banner 46 | exit(1) 47 | end 48 | 49 | unless options.to 50 | $stderr.puts "Error: No *FROM* GFF3 file supplied\n" 51 | $stderr.puts opts.banner 52 | exit(1) 53 | end 54 | 55 | options 56 | end 57 | end 58 | options = Options.parse(ARGV) 59 | 60 | records_lookup = Bio::GFF::GFF3.new(File.read(options.to)) 61 | .records 62 | .find_all{ |record| record.feature == "exon" } 63 | .to_set 64 | .classify{ |record| Hash[record.attributes]["Parent"].gsub('mRNA', 'exon_') } 65 | 66 | File.open(options.from).take_while{ |line| line !~ /FASTA/ }.each do |line| 67 | next if line =~ /^#/ 68 | 69 | seqid, source, type, hit_start, hit_stop, score, strand, phase, attributes = line.split("\t") 70 | hit_start = hit_start.to_i 71 | hit_stop = hit_stop.to_i 72 | 73 | begin 74 | records = records_lookup[seqid].sort_by{ |record| record.start } 75 | rescue 76 | $stderr.puts "\n\nCould not find lookup for '#{seqid}'" 77 | exit(1) 78 | end 79 | 80 | protein_length = records.inject(0) do |mem, record| 81 | mem += record.end - record.start + 1 82 | end / 3 - 1 83 | 84 | begin 85 | strand = records.first.strand 86 | rescue 87 | $stderr.puts "Could not find a match to this line:" 88 | $stderr.puts line.chomp 89 | exit(1) 90 | end 91 | 92 | ranges = case records.first.strand 93 | when "+" 94 | [Range.new(hit_start - 1, hit_stop)] 95 | when "-" 96 | [Range.new(protein_length - hit_stop + 1, protein_length - hit_start + 2)] 97 | end 98 | .map!{ |range| Range.new((range.first) * 3, (range.last) * 3 - 1)} 99 | .map!{ |range| Range.new(range.first + records.first.start, range.last + records.first.start)} 100 | 101 | records 102 | .each_cons(2) 103 | .map{ |a, b| Range.new(a.end + 1, b.start - 1) } 104 | .reduce(ranges) do |mem, intron| 105 | size = intron.last - intron.first + 1 106 | mem.flat_map do |range| 107 | # Is there an overlap between this range and an intron? 108 | if (range.first <= intron.last) and (intron.first <= range.last) 109 | # If we introduce the intron, does the shifted range still overlap the intron location? 110 | if range.first + size <= intron.last 111 | # If so, make the new intron 112 | [Range.new(range.first, intron.first - 1), Range.new(intron.last + 1, intron.last + range.last - intron.first + 1)] 113 | else 114 | # If not, we can just move the region to the right by the intron size 115 | Range.new(range.first + size, range.last + size) 116 | end 117 | elsif intron.last < range.first 118 | # If there is no overlap and the region is still to the right, we move it right. 119 | Range.new(range.first + size, range.last + size) 120 | else 121 | range 122 | end 123 | end 124 | end.each do |range| 125 | next if source == "." 
126 | puts [ 127 | records.first.seqname, 128 | source, 129 | type, 130 | range.first, 131 | range.last, 132 | score, 133 | records.first.strand, 134 | ".", 135 | attributes 136 | ].join("\t") 137 | end 138 | end 139 | -------------------------------------------------------------------------------- /bin/parse_hhr.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'optparse' 3 | 4 | options = {} 5 | options[:homology_prob_cut] = 70 6 | options[:eval_cut] = 0.001 7 | options[:pval_cut] = 0.000001 8 | options[:score_cut] = 100 9 | options[:align_col_cut] = 50 10 | options[:template_aln_size_cut] = 30 11 | options[:repeat] = false 12 | 13 | OptionParser.new do |opts| 14 | opts.banner = "Usage: parse_hhr.rb [options] input.hhr" 15 | 16 | opts.on("-o [N]", Float, "--homology_cutoff", "Minimum homology probability (70)") do |f| 17 | options[:homology_prob_cut] = f 18 | end 19 | 20 | opts.on("-e [N]", Float, "--evalue_cutoff", "Maximum evalue (1e-3)") do |f| 21 | options[:eval_cut] = f 22 | end 23 | 24 | opts.on("-p [N]", Float, "--pvalue_cutoff", "Maximum pvalue (1e-6)") do |f| 25 | options[:pval_cut] = f 26 | end 27 | 28 | opts.on("-s [N]", Float, "--score_cutoff", "Minimum score (100)") do |f| 29 | options[:pval_cut] = f 30 | end 31 | 32 | opts.on("-a [N]", Float, "--align_length_cutoff", "Minimum length of amino acids in match for query (50)") do |f| 33 | options[:align_col_cut] = f 34 | end 35 | 36 | opts.on("-t [N]", Float, "--template_length_cutoff", "Minimum length of amino acids in match for template (30)") do |f| 37 | options[:template_aln_size_cut] = f 38 | end 39 | 40 | opts.on("-r", "--repeat", "Input file is generated from repeat sequence rather than coding sequence") do |r| 41 | options[:repeat] = r 42 | end 43 | 44 | opts.on("-h", "--help", "Show this message") do 45 | puts opts 46 | exit 47 | end 48 | end.parse! 49 | 50 | infile = File.open(ARGV.shift) 51 | 52 | homology_prob_cut = 70 53 | 54 | gff3 = File.open('out.gff3', 'w') 55 | hints = File.open('out.hints', 'w') 56 | geneid = File.open('out.geneid', 'w') 57 | glimmer = File.open('out.glimmer', 'w') 58 | 59 | uid_counter = Hash.new(0) 60 | 61 | while infile.gets 62 | p $_ 63 | case $_ 64 | when /^\W*Query\s+(?\S+)_(?\d+) \[(?\d+) - (?\d+)\](? \(REVERSE SENSE\))?/ 65 | p $~ 66 | scaffold_id = $~[:scaffold_id] 67 | scaffold_hit_num = $~[:scaffold_hit_num].to_i 68 | reverse = ! $~[:rev].nil? 69 | orf_start = $~[:orf_start].to_i 70 | orf_stop = $~[:orf_end].to_i 71 | when /^\s*1 (?.{30})\s+(?\d+\.?\d*)\s+(?\d+\.?\d*E?-?\d*)\s+(?\d+\.?\d*E?-?\d*)\s+(?\d+\.?\d*)\s+(?\d+\.?\d*)\s+(?\d+)\s+(?\d+)-(?\d+)\s+(?\d+)-(?\d+)\s+\((?\d+)\)/ 72 | next if options[:homology_prob_cut] > $~[:prob].to_f 73 | next if options[:eval_cut] < $~[:evalue].to_f 74 | next if options[:pval_cut] < $~[:pvalue].to_f 75 | next if options[:score_cut] > $~[:score].to_f 76 | next if options[:align_col_cut] > $~[:alignment_length].to_i 77 | next if options[:template_aln_size_cut] > ($~[:hit_start].to_i - $~[:hit_stop].to_i).abs 78 | 79 | p $~ 80 | hit_id = $~[:hit_desc].split.first 81 | hit_desc = $~[:hit_desc].split[1..-1].join(' ') 82 | 83 | uid = "%s.s%s.e%s" % [hit_id, $~[:hit_start], $~[:hit_stop]] 84 | hit_count = uid_counter[uid] += 1 85 | uid += ".n%d" % hit_count 86 | 87 | strand = reverse ? '-' : '+' 88 | gff_start = reverse ? (orf_start - (3 * $~[:aa_start].to_i)) : (orf_start + (3 * $~[:aa_start].to_i)) 89 | gff_end = reverse ? 
(orf_start - (3 * $~[:aa_stop].to_i) + 1) : (orf_start + (3 * $~[:aa_stop].to_i) - 1) 90 | type = options[:repeat] ? 'nonexonpart' : 'CDSpart' 91 | 92 | attributes = {} 93 | attributes[:ID] = uid 94 | attributes[:Name] = hit_id + "(%s)" % hit_desc 95 | attributes[:Target] = "%s %s %s [+]" % [hit_id, $~[:hit_start], $~[:hit_stop]] 96 | 97 | gff3.puts [scaffold_id, 'hhblits', 'protein_match', gff_start, gff_end, $~[:score], strand, '.', attributes.map{|a| a.join('=')}.join(";")].join("\t") 98 | 99 | attributes = {} 100 | attributes[:src] = options[:repeat] ? 'RM' : 'HU' 101 | attributes[:grp] = hit_id 102 | attributes[:pri] = options[:repeat] ? 6 : 5 103 | hints.puts [scaffold_id, 'protein_match', type, gff_start, gff_end, $~[:score], strand, '.', attributes.map{|a| a.join('=')}.join(";")].join("\t") 104 | 105 | geneid.puts [scaffold_id, 'hhblits', 'sr', gff_start, gff_end, $~[:score], strand, '.'].join("\t") 106 | 107 | if reverse 108 | glimmer.puts [scaffold_id, gff_end, gff_start, $~[:score], $~[:evalue], "\n\n"].join(" ") 109 | else 110 | glimmer.puts [scaffold_id, gff_start, gff_end, $~[:score], $~[:evalue], "\n\n"].join(" ") 111 | end 112 | end 113 | end 114 | -------------------------------------------------------------------------------- /bin/pfam_to_gff3.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | seqid = "dummy_seqid" 4 | orfstart = -1 5 | orfstop = -1 6 | strand = "?" 7 | domainid = "domainid" 8 | domain_description = "domain description" 9 | domain_num = 0 10 | hit_num = 0 11 | num = '\d*\.?\d+([eE][-+]?\d+)?' 12 | 13 | while ARGF.gets 14 | case $_ 15 | when /^#/ 16 | next 17 | when /^Query:\s+(?\S+)_(?\d+)\s+\[L=\d+\]/ 18 | seqid = $~[:seqid] 19 | orfid = $~[:seqid] + "_" + $~[:orfnum] 20 | when /Description: \[(?\d+) - (?\d+)\] (?\(REVERSE SENSE\))?/ 21 | orfstart = $~[:orfstart].to_i 22 | orfend = $~[:orfend].to_i 23 | strand = $~[:reverse] ? "-" : "+" 24 | domain_num = 0 25 | when /^>>\s+(?\S+)\s+(?.*)\n/ 26 | domainid = $~[:domainid] 27 | domain_description = $~[:domain_description] 28 | domain_num += 1 29 | when /\s+(?\d+)\s+[\?\!]\s+(?#{num})\s+(?#{num})\s+(?#{num})\s+(?#{num})\s+(?\d+)\s+(?\d+)\s+[\.\[][\.\]]\s+\d+\s+\d+\s+[\.\[][\.\]]\s+(?\d+)\s+(?\d+)/ 30 | domain_from = $~[:domain_start].to_i 31 | domain_to = $~[:domain_end].to_i 32 | hit_num = $~[:hit_num] 33 | domain_position_left = 0 34 | domain_position_right = 0 35 | if strand == "+" 36 | domain_position_left = orfstart + 3 * domain_from 37 | domain_position_right = orfstart + 3 * domain_to - 1 38 | else 39 | domain_position_right = orfstart - 3 * (domain_from - 1) 40 | domain_position_left = orfstart - 3 * (domain_to - 1) + 1 41 | end 42 | 43 | # Output a new gff annotation 44 | out = [] 45 | out << seqid 46 | out << 'pfam' 47 | out << 'protein_hmm_match' 48 | out << domain_position_left 49 | out << domain_position_right 50 | out << $~[:score] 51 | out << strand 52 | out << "." 
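        # Columns 1-8 (seqid through phase) are assembled above; the GFF3
        # attribute string for column 9 is built from the parsed domain hit below.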
53 | attributes = {} 54 | attributes[:Target] = domainid 55 | attributes[:description] = domain_description 56 | attributes[:exon_id] = seqid 57 | attributes[:orf_id] = orfid 58 | attributes[:ID] = orfid + "_" + domain_num.to_s + hit_num 59 | out << attributes.map{|p| p.join('=')}.join(';') 60 | puts out.join("\t") 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /bin/rename-codons: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | $strain_id = ARGV.shift 3 | 4 | def gene(id_count) 5 | sprintf("%s_G%05d", $strain_id, id_count) 6 | end 7 | 8 | def mrna(id_count) 9 | sprintf("%s_R%05d", $strain_id, id_count) 10 | end 11 | 12 | def cds(id_count) 13 | sprintf("%s_C%05d", $strain_id, id_count) 14 | end 15 | 16 | def start_codon(id_count) 17 | sprintf("%s_START_%05d", $strain_id, id_count) 18 | end 19 | 20 | def stop_codon(id_count) 21 | sprintf("%s_STOP_%05d", $strain_id, id_count) 22 | end 23 | 24 | reached_fasta = false 25 | 26 | ARGF.each do |line| 27 | if line =~ /##FASTA/ 28 | reached_fasta = true 29 | end 30 | if line !~ /^\S+\t\S+\t\S+_codon/ || reached_fasta 31 | next 32 | end 33 | 34 | split = line.chomp.split("\t") 35 | attributes = Hash[split[8].split(";").map{|pair| pair.split(" ")}] 36 | id_count = attributes["gene_id"].match(/^"(\d+)_/)[1] 37 | attributes.delete("gene_id") 38 | attributes.delete("transcript_id") 39 | attributes.delete("gene_name") 40 | attributes.delete("transcript_name") 41 | 42 | case split[2].downcase 43 | when "start_codon" 44 | attributes["ID"] = start_codon(id_count) 45 | attributes["Parent"] = gene(id_count) 46 | when "stop_codon" 47 | attributes["ID"] = stop_codon(id_count) 48 | attributes["Parent"] = gene(id_count) 49 | end 50 | 51 | split[8] = attributes.map{|attribute| attribute.join("=")}.join(";") 52 | puts split.join("\t") 53 | end 54 | -------------------------------------------------------------------------------- /bin/rename-fasta: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # How many sequences are there in the file? 4 | fasta = File.open(ARGV.shift, 'r') 5 | sequence_count = fasta. 6 | each_line. 7 | find_all { |line| line =~ /^>/ }. 8 | count 9 | 10 | # An optional second argument gives the name of the strain 11 | # we replace white space with underscores. 12 | strain_name = if ARGV.empty? 
13 | "sequence" 14 | else 15 | ARGV.shift.gsub(/\s+/, "_") 16 | end 17 | 18 | # The sprintf format string, which will end up looking like: 19 | # ">strain_name_%05d" 20 | format_string = ">#{strain_name}_%0#{Math.log10(sequence_count).ceil}d\n" 21 | 22 | # Read through each line, replacing the fasta headers 23 | count = 0 24 | fasta.rewind 25 | sequence = "" 26 | fasta.each_line do |line| 27 | if line =~ /^>/ 28 | sequence.chars.each_slice(80) { |a| puts a.join } if sequence != "" 29 | printf(format_string, count+=1) 30 | sequence = "" 31 | else 32 | sequence << line.chomp 33 | end 34 | end 35 | 36 | sequence.chars.each_slice(80) { |a| puts a.join } 37 | -------------------------------------------------------------------------------- /bin/rename-gff-ids: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | $strain_id = ARGV.shift 3 | 4 | def gene(id_count) 5 | sprintf("%s_G%05d", $strain_id, id_count) 6 | end 7 | 8 | def mrna(id_count) 9 | sprintf("%s_R%05d", $strain_id, id_count) 10 | end 11 | 12 | def cds(id_count) 13 | sprintf("%s_C%05d", $strain_id, id_count) 14 | end 15 | 16 | reached_fasta = false 17 | 18 | ARGF.each do |line| 19 | if line =~ /##FASTA/ 20 | reached_fasta = true 21 | end 22 | if line =~ /^#/ || reached_fasta 23 | puts line 24 | next 25 | end 26 | 27 | split = line.chomp.split("\t") 28 | attributes = Hash[split[8].split(";").map{|pair| pair.split("=")}] 29 | attributes.delete("gene_id") 30 | attributes.delete("transcript_id") 31 | 32 | case split[2].downcase 33 | when "gene" 34 | id_count = attributes["ID"].match(/\d+$/)[0] 35 | attributes["ID"] = gene(id_count) 36 | attributes["Name"] = gene(id_count) 37 | when "mrna" 38 | id_count = attributes["ID"].match(/\d+$/)[0] 39 | attributes["ID"] = mrna(id_count) 40 | attributes["Parent"] = gene(id_count) 41 | when "cds" 42 | id_count = attributes["Parent"].match(/\d+$/)[0] 43 | attributes["ID"] = cds(id_count) 44 | attributes["Parent"] = mrna(id_count) 45 | end 46 | 47 | split[8] = attributes.map{|attribute| attribute.join("=")}.join(";") 48 | puts split.join("\t") 49 | end 50 | -------------------------------------------------------------------------------- /bin/trim_fasta_all.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | package trim_fasta_all; 3 | use strict; 4 | use warnings; 5 | 6 | use Data::Dumper; 7 | our $VERSION = '1.0'; 8 | 9 | #04MAR11: Added GC/AT ratio check as ratio cutoff 10 | 11 | =head1 NAME 12 | 13 | trim_fasta_all.pl - removes sequences from a FASTA file 14 | 15 | =head1 VERSION 16 | 17 | Version 0.2 18 | 19 | =head1 SYNOPSIS 20 | 21 | trim_fasta_all.pl [options] 22 | 23 | removes sequences from a FASTA file. See perldoc for more info. 24 | 25 | 'i|fa|fasta=s' => FASTA file to trim. You can also give multiples as arguments without any -i/-fa option. 26 | 'outfile:s' => Optionally, the name of the trimmed outfile 27 | 'blastfile:s' => BLASTFILE to retrieve sequences from 28 | 'blastquery' => grab BLAST queries 29 | 'blasthit' => grab BLAST hits 30 | 'evalue=s' => Evalue cut-off for blastfile (currently broken) 31 | 'c|character=s' => Characters to look for. If present, remove sequence. 
32 | 'le|length=i' => Number of minimum characters 33 | 'p|proportion' => Discard sequences for which a mononucleotide frequency exceeds this proportion 34 | 'ratio' => Discard sequences for which the GC or AT frequency exceeds this ratio 35 | 'x' => Do not include the Xx characters when calculating size of sequence 36 | xdiscard => Discard if these many Xs 37 | 'npl' => Do not include these characters when calculating size: NPLnpl 38 | 'lc|lowercase' => Do not include lowercase characters when calculating size of sequence (e.g. to not include low quality bases) 39 | 'id|idfile=s' => A second FASTA file containing IDs to remove from FASTA file. Alternatively a text file with one ID per line 40 | 'descr' => For above: search description line instead of primary id. 41 | 'ci' => Case insensitivity for above two options 42 | 'invert' => Invert match (invert output filenames) 43 | 'log' => Keep a log file 44 | 'df' => Do not write discarded sequences (less IO) 45 | 'solq' => Input is FASTQ (Solexa 1.3-1.4) 46 | 'sanq' => Input is FASTQ (Sanger) 47 | 'casava18' => Input is Fastq from Casava 1.8 48 | 'single' => Entire output sequence/quality is in a single line (no BioPerl; good for parsing) 49 | 'ghash' => Use a Glib hash table (less memory, slower) 50 | 51 | =head1 DESCRIPTION 52 | 53 | Processes file (-fa) when certain character(s) are present (-c); or a list of IDs is provided (-id); or a certain length-cut off is not satisfied (-le); or a proportion of nucleotide frequence can be specified (-p) instead. The -log option produces a log file reporting what happened to each sequence 54 | The option to not include Xs and/or NPLs and/or lower-case characters in the cut-off calculation is forced with -x and/or -npl and/or -lc respectively. 55 | Uses BioPerl. A disk-friendly function (-df) prevents the FASTA file of discarded sequences of being written. 56 | 57 | =head1 AUTHORS 58 | 59 | Alexie Papanicolaou 1 2 60 | 61 | 1 Max Planck Institute for Chemical Ecology, Germany 62 | 2 Centre for Ecology and Conservation, University of Exeter, UK 63 | alexie@butterflybase.org 64 | 65 | =head1 DISCLAIMER & LICENSE 66 | 67 | This software is released under the GNU General Public License version 3 (GPLv3). 68 | It is provided "as is" without warranty of any kind. 69 | You can find the terms and conditions at http://www.opensource.org/licenses/gpl-3.0.html. 70 | Please note that incorporating the whole software or parts of its code in proprietary software 71 | is prohibited under the current license. 72 | 73 | =head1 BUGS & LIMITATIONS 74 | 75 | None known so far. 
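
=head1 EXAMPLE

An illustrative invocation using only the options documented above (the file
name is a placeholder):

 trim_fasta_all.pl -i sequences.fasta -le 200 -x -log

This removes sequences shorter than 200 characters (with Xs excluded from the
length calculation, per -x) and records what happened to each sequence in
sequences.fasta.trim.log.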
76 | 77 | =cut 78 | use Bio::SeqIO; 79 | use Bio::SearchIO; 80 | use Getopt::Long; 81 | #use Tie::GHash; 82 | use Pod::Usage; 83 | $| = 1; 84 | my ( 85 | $character, @infiles, $length_cutoff, $xmask, $xdiscard, 86 | $nplmask, $ci, $blastfile, $evalue_cutoff, 87 | $lcmask, $prop_cutoff, @idfiles, $log, 88 | $logfile, $invert, $sangerfastq, $blast_hit,$blast_query, 89 | $user_outfile, $df, %ids, $help, 90 | $convert2uc, $descr_flag, $solexafastq, $search_accession, 91 | $seq_search, $single_line, $ratio_cutoff, $ghash, $overwrite, $casava 92 | ); 93 | &GetOptions( 94 | 'i|fa|fasta=s{,}' => \@infiles, 95 | 'blastfile=s' => \$blastfile, 96 | #'evalue=s' => \$evalue_cutoff, 97 | 'c|character=s' => \$character, 98 | 'le|length=i' => \$length_cutoff, 99 | 'p|proportion=f' => \$prop_cutoff, 100 | 'ratio=f' => \$ratio_cutoff, 101 | 'x' => \$xmask, 102 | 'uc|uppercase' => \$convert2uc, 103 | 'npl' => \$nplmask, 104 | 'lc|lowercase' => \$lcmask, 105 | 'ids|idfile=s{,}' => \@idfiles, 106 | 'description' => \$descr_flag, 107 | 'invert' => \$invert, 108 | 'ci' => \$ci, 109 | 'log' => \$log, 110 | 'df' => \$df, 111 | 'h|help' => \$help, 112 | 'solq' => \$solexafastq, 113 | 'sanq' => \$sangerfastq, 114 | 'seq' => \$seq_search, 115 | 'outfile:s' => \$user_outfile, 116 | 'single' => \$single_line, 117 | 'blastquery' => \$blast_query, 118 | 'blasthit' => \$blast_hit, 119 | 'ghash' => \$ghash, 120 | 'overwrite' => \$overwrite, 121 | 'casava18'=>\$casava, 122 | 'xdiscard:i' => \$xdiscard, 123 | #'accessions'=> \$search_accession, 124 | ); 125 | if ($help) { pod2usage; } 126 | @infiles = @ARGV if !@infiles; 127 | unless (@infiles) { 128 | print "Failed to provide or find input file\n"; 129 | pod2usage; 130 | } 131 | tie %ids,'Tie::GHash' if $ghash; 132 | 133 | unless ( $character 134 | || $length_cutoff 135 | || $prop_cutoff || $ratio_cutoff || $xdiscard 136 | || @idfiles 137 | || $blastfile ) 138 | { 139 | die("Nothing to do!\n"); 140 | } 141 | unless ($evalue_cutoff) { $evalue_cutoff = 1; } 142 | my $counter = int(0); 143 | if ($casava){ 144 | $sangerfastq=1; 145 | undef($solexafastq); 146 | } 147 | foreach my $idfile (@idfiles) { 148 | if ( $idfile && -s $idfile ) { 149 | my $pattern; 150 | if ($descr_flag) { $pattern = '^\s*\S+\s+(.+)$'; } 151 | else { $pattern = '^[>@]?\s*(\S+)\s*'; } 152 | my @test_lines = `head $idfile`; 153 | foreach my $test (@test_lines) { 154 | if ( $test =~ /^>/ ) { $pattern = "Bio::SeqIO"; } 155 | } 156 | # my $number = `wc -l < $idfile`; 157 | # chomp($number); 158 | # $number /= 2 if $pattern eq "Bio::SeqIO"; 159 | print "Building hash from $idfile with $pattern\n"; 160 | my $flag; 161 | 162 | if ( $pattern eq "Bio::SeqIO" ) { 163 | my $id_obj = new Bio::SeqIO( -file => $idfile, -format => "fasta" ); 164 | while ( my $object = $id_obj->next_seq() ) { 165 | $counter+=length($object->seq().$object->description().' '.$object->id()) if $object->seq(); 166 | $counter+=length($object->description().' 
'.$object->id()) if !$object->seq(); 167 | if ($seq_search) { $ids{ $object->seq() } = 1; } 168 | elsif ($descr_flag) { $ids{ $object->description() } = 1; } 169 | else { $ids{ $object->id() } = 1; } 170 | $flag = 1 if !$flag; 171 | } 172 | } else { 173 | open( IN, $idfile ) || die(); 174 | while ( my $line = <IN> ) { 175 | $counter+=length($line); 176 | if ($ci) { 177 | if ( $line =~ /$pattern/i ) { 178 | $ids{$1} = 1; 179 | $flag = 1 if !$flag; 180 | } 181 | } else { 182 | if ( $line =~ /$pattern/ ) { 183 | $ids{$1} = 1; 184 | $flag = 1 if !$flag; 185 | } 186 | } 187 | } 188 | close(IN); 189 | } 190 | if ( !$flag ) { die "Failed to get list of IDs to extract...\n"; } 191 | else { 192 | print "Hash presence of $idfile verified\n"; 193 | } 194 | } elsif ($idfile) { 195 | warn "File $idfile is empty or does not exist!\n"; 196 | } 197 | } 198 | if ( $blastfile && -s $blastfile ) { 199 | if ($blast_hit){ 200 | print "Building HASH for queries and hits from $blastfile...\n"; 201 | my @blast_hits = `grep '^>' $blastfile`; 202 | chomp(@blast_hits); 203 | foreach my $blast (@blast_hits) { 204 | #next if $blast=~/^Sbjct|^Query|^Number|^Matrix:|^Gap penalties|^Length|^Database|^BLASTN|^Jinghui|^Database|^programs/i; 205 | $counter++; 206 | $blast =~ /^>(\S+)/; 207 | $ids{$1} = 1; 208 | } 209 | print "Found $counter significant results\n"; 210 | }elsif($blast_query){ 211 | print "Building HASH for queries from $blastfile...\n"; 212 | my @blast_queries = `grep -B 18 '^Sequences producing' $blastfile |grep '^Query='`; 213 | 214 | chomp(@blast_queries); 215 | foreach (@blast_queries) { 216 | next if $_=~/^Sbjct|^Query|^Number|^Matrix:|^Gap penalties|^Length/i; 217 | $counter++; 218 | $_ =~ s/^Query=\s+//; 219 | $ids{$_} = 1; 220 | } 221 | print "Found $counter significant results\n"; 222 | }else{ 223 | die "Please provide -blasthit and/or -blastquery\n"; 224 | } 225 | } 226 | foreach my $file (@infiles) { 227 | &process($file); 228 | } 229 | ##################################################### 230 | sub process ($) { 231 | my $fastafile = shift; 232 | my $fsize = -s $fastafile; 233 | my ( $filein, $fileout, $fileout2); 234 | my $fastafiletrim = "$fastafile.trim"; 235 | $fastafiletrim = $user_outfile if $user_outfile; 236 | my $fastafilediscard = "$fastafile.discard"; 237 | print "Processing... $fastafile as $fastafiletrim && $fastafilediscard\n"; 238 | $fastafilediscard = $user_outfile .
".discard" if $user_outfile; 239 | if (!-s $fastafile){ 240 | warn "File not found, skipping\n"; 241 | return; 242 | }if (-s $fastafiletrim){ 243 | warn "Output file $fastafiletrim already exists\n"; 244 | return unless $overwrite; 245 | } 246 | if ($solexafastq) { 247 | if ($single_line){ 248 | open( IN, $fastafile ) if $single_line; 249 | open( OUT1, ">$fastafiletrim" ) if $single_line; 250 | open( OUT2, ">$fastafilediscard" ) if $single_line; 251 | }else{ 252 | $filein = new Bio::SeqIO( -file => $fastafile, -format => "fastq-solexa" ); 253 | $fileout = new Bio::SeqIO( -file => ">$fastafiletrim", -format => "fastq-solexa" ); 254 | $fileout2 = new Bio::SeqIO( 255 | -file => ">$fastafilediscard", 256 | -format => "fastq-solexa" 257 | ); 258 | } 259 | } elsif ($sangerfastq) { 260 | if ($single_line){ 261 | open( IN, $fastafile ); 262 | open( OUT1, ">$fastafiletrim" ); 263 | open( OUT2, ">$fastafilediscard" ); 264 | }else{ 265 | $filein = new Bio::SeqIO( -file => $fastafile, -format => "fastq" ); 266 | $fileout = new Bio::SeqIO( -file => ">$fastafiletrim", -format => "fastq" ); 267 | $fileout2 = new Bio::SeqIO( -file => ">$fastafilediscard", -format => "fastq" ); 268 | } 269 | } else { 270 | if ($single_line){ 271 | open( IN, $fastafile ) ||die("Cannot open $fastafile\n"); 272 | open( OUT1, ">$fastafiletrim" ); 273 | open( OUT2, ">$fastafilediscard" ); 274 | }else{ 275 | $filein = new Bio::SeqIO( -file => $fastafile, -format => "fasta" ); 276 | $fileout = new Bio::SeqIO( -file => ">$fastafiletrim", -format => "fasta" ); 277 | $fileout2 = new Bio::SeqIO( -file => ">$fastafilediscard", -format => "fasta" ); 278 | } 279 | } 280 | if ($log) { 281 | $logfile = $fastafile . ".trim.log"; 282 | open( LOG, ">$logfile" ); 283 | } 284 | my ( $empty, $discard, $trim ); 285 | $counter = 0; 286 | if ($single_line){ 287 | print "Processing as single line FASTA/Q\n"; 288 | }else{ 289 | my $number=($sangerfastq || $solexafastq) ? `grep -c "^@" $fastafile` : `grep -c "^>" $fastafile`; 290 | chomp($number); 291 | print "$number sequences\n"; 292 | } 293 | my $errors = int(0); 294 | while ( my $object = $single_line ? <IN> : $filein->next_seq() ) { 295 | next if !$object; 296 | $counter=$single_line ? $counter+length($object) : $counter+1; 297 | next if $single_line && $object=~/^\s*$/; 298 | my ( $id, $sequence, $description, $qual, $prefix); 299 | if ($single_line) { 300 | chomp($object); 301 | $object =~ /^(\S)(\S+)\s*(.*)/; 302 | $prefix = $1; 303 | $id = $2; 304 | $description = $3; 305 | if (($casava) && $description=~/(\d)\:[A-Z]\:/){ 306 | $id.='/'.$1; 307 | } 308 | $sequence = <IN>; 309 | $counter+=length($sequence); 310 | chomp($sequence); 311 | my $ok = ($prefix eq '>'||$prefix eq '@' || $prefix eq '+') ? 1 : int(0); 312 | while ($ok != 1){ 313 | $errors++; 314 | warn "Sequence $counter has a header which starts with $1. This does not seem to be right...\n$object\n$sequence\n\nSkipping...\n"; 315 | die "\nToo many errors found\n" if $errors > 20; 316 | $object = $sequence; 317 | chomp($object); 318 | $sequence = <IN>; 319 | $object =~ /^(\S)(\S+)\s*(\S*)/; 320 | $prefix = $1; 321 | $id = $2; 322 | $description = $3; 323 | $ok = ($prefix eq '>'||$prefix eq '@' || $prefix eq '+') ? 1 : int(0); 324 | } 325 | if ( $solexafastq || $sangerfastq ) { 326 | $qual = <IN> . <IN>; 327 | $counter+=length($qual); 328 | chomp($qual); 329 | } 330 | } else { 331 | $id = $object->id(); 332 | $sequence = $object->seq() if ($seq_search); 333 | $description = $object->description() ?
$object->description() : ''; 334 | } 335 | 336 | # trim if given an ID file 337 | if ( @idfiles || $blastfile ) { 338 | if ( $sequence && $seq_search ) { 339 | if ( $ids{$sequence} ) { 340 | unless ( $df && !$invert ) { 341 | if ($single_line) { 342 | if ($qual) { 343 | print OUT2 "@" . "$id\n$sequence\n$qual\n"; 344 | } else { 345 | print OUT2 ">$id"; 346 | print OUT2 " $description" if $description; 347 | print OUT2 "\n"; 348 | print OUT2 "$sequence\n"; 349 | } 350 | } else { 351 | $fileout2->write_seq($object); 352 | } 353 | } 354 | $discard++; 355 | if ($log) { 356 | print LOG "Sequence $id discarded because the Sequence was found in idfiles\n"; 357 | } 358 | #DO get it more than once 359 | #delete($ids{$sequence}); 360 | next; 361 | } else { 362 | next; 363 | } 364 | } elsif ( exists $ids{$id} && $ids{$id}==1) { 365 | unless ( $df && !$invert ) { 366 | if ($single_line) { 367 | if ($qual) { 368 | print OUT2 "@" . "$id\n$sequence\n$qual\n"; 369 | } else { 370 | print OUT2 ">$id"; 371 | print OUT2 " $description" if $description; 372 | print OUT2 "\n"; 373 | print OUT2 "$sequence\n"; 374 | } 375 | } else { 376 | $fileout2->write_seq($object); 377 | } 378 | } 379 | $discard++; 380 | if ($log) { 381 | print LOG "Sequence $id discarded because the ID was found in idfiles\n"; 382 | } 383 | 384 | #make sure we don't get it twice 385 | $ids{$id}=2; 386 | next; 387 | } elsif ( exists $ids{$id}) { 388 | next; 389 | # if id exists multiple times don't write it in any file. 390 | } elsif ( exists $ids{ $id . ' ' . $description } && $ids{ $id . ' ' . $description }==1) { 391 | unless ( $df && !$invert ) { 392 | if ($single_line) { 393 | if ($qual) { 394 | print OUT2 "@" 395 | . $id 396 | . $description 397 | . "\n$sequence\n$qual\n"; 398 | } else { 399 | print OUT2 ">" 400 | . $id 401 | . $description 402 | . "\n$sequence\n"; 403 | } 404 | } else { 405 | $fileout2->write_seq($object); 406 | } 407 | } 408 | $discard++; 409 | if ($log) { 410 | print LOG "Sequence $id.$description discarded because the ID was found in idfiles\n"; 411 | } 412 | 413 | #make sure we don't get it twice 414 | $ids{ $id . ' ' . $description } =2; 415 | next; 416 | } elsif ( exists $ids{ $id . ' ' . $description }) { 417 | next; 418 | } 419 | } 420 | $sequence = $object->seq() if !$sequence; 421 | if ($sequence) { 422 | my $seq2 = $sequence; 423 | if ($xmask) { $seq2 =~ s/[X]//ig; } 424 | if ($nplmask) { $seq2 =~ s/[NPL]//ig; } 425 | if ($lcmask) { $seq2 =~ s/[a-z]//g; } 426 | my $length = length($seq2); 427 | 428 | # trim if given a character(s) 429 | if ($character) { 430 | if ( $sequence =~ /[$character]/ ) { 431 | unless ( $df && !$invert ) { 432 | if ($single_line) { 433 | if ($qual) { 434 | print OUT2 "@" . "$id\n$sequence\n$qual\n"; 435 | } else { 436 | print OUT2 ">$id $description\n$sequence\n"; 437 | 438 | } 439 | } else { 440 | $fileout2->write_seq($object); 441 | } 442 | } 443 | $discard++; 444 | if ($log) { 445 | print LOG 446 | "Sequence $id discarded because character $character was found\n"; 447 | } 448 | next; 449 | } 450 | } 451 | 452 | #trim if given a length cutoff 453 | if ($length_cutoff) { 454 | if ( !$length || $length < $length_cutoff ) { 455 | unless ( $df && !$invert ) { 456 | if ($single_line) { 457 | if ($qual) { 458 | print OUT2 "@" . 
"$id\n$sequence\n$qual\n"; 459 | } else { 460 | print OUT2 ">$id $description\n$sequence\n"; 461 | } 462 | } else { 463 | $fileout2->write_seq($object); 464 | } 465 | } 466 | $discard++; 467 | if ($log) { 468 | print LOG "Sequence $id discarded because length $length was smaller than cutoff $length_cutoff\n"; 469 | } 470 | next; 471 | } 472 | } 473 | # trim if xdiscard 474 | if ($xdiscard){ 475 | my $Xs = ( $sequence =~ tr/X// ); 476 | if ($Xs >= $xdiscard){ 477 | unless ( $df && !$invert ) { 478 | if ($single_line) { 479 | if ($qual) { 480 | print OUT2 "@" . "$id\n$sequence\n$qual\n"; 481 | } else { 482 | print OUT2 ">$id $description\n$sequence\n"; 483 | } 484 | } else { 485 | $fileout2->write_seq($object); 486 | } 487 | } 488 | $discard++; 489 | print LOG "Sequence $id discarded more Xs ($Xs) than allowed ($xdiscard).\n" if $log; 490 | next; 491 | } 492 | } 493 | #trim if given a proportion of A/T/C/G 494 | if ($prop_cutoff || $ratio_cutoff) { 495 | my $As = ( $sequence =~ tr/A// ); 496 | my $Ts = ( $sequence =~ tr/T// ); 497 | my $Cs = ( $sequence =~ tr/C// ); 498 | my $Gs = ( $sequence =~ tr/G// ); 499 | my $Xs = ( $sequence =~ tr/X// ); 500 | my $Ns = ( $sequence =~ tr/N// ); 501 | my $propA = ( $As / $length ); 502 | my $propT = ( $Ts / $length ); 503 | my $propC = ( $Cs / $length ); 504 | my $propG = ( $Gs / $length ); 505 | my $propX = ( $Xs / $length ); 506 | my $propN = ( $Ns / $length ); 507 | my $GCratio = $propG + $propC if $ratio_cutoff; 508 | my $ATratio = 1 - $GCratio if $ratio_cutoff; 509 | if ( $prop_cutoff &&( 510 | $propA > $prop_cutoff 511 | || $propT > $prop_cutoff 512 | || $propX > $prop_cutoff 513 | || $propN > $prop_cutoff 514 | || $propG > $prop_cutoff 515 | || $propC > $prop_cutoff ) 516 | || $ratio_cutoff && ( 517 | $ATratio > $ratio_cutoff 518 | || $GCratio > $ratio_cutoff ) 519 | ) 520 | { 521 | 522 | unless ( $df && !$invert ) { 523 | if ($single_line) { 524 | if ($qual) { 525 | print OUT2 "@" . "$id\n$sequence\n$qual\n"; 526 | } else { 527 | print OUT2 ">$id $description\n$sequence\n"; 528 | } 529 | } else { 530 | $fileout2->write_seq($object); 531 | } 532 | } 533 | $discard++; 534 | if ($log) { 535 | print LOG "Sequence $id discarded because of one nucleotide proportion (A:$propA; T:$propT; G:$propG; C:$propC higher than cutoff $prop_cutoff or GC/AT higher than $ratio_cutoff\n" if $ratio_cutoff && $prop_cutoff; 536 | print LOG "Sequence $id discarded because of GC/AT proportion (A:$propA; T:$propT; G:$propG; C:$propC) higher than $ratio_cutoff\n" if $ratio_cutoff; 537 | print LOG "Sequence $id discarded because of one nucleotide proportion (A:$propA; T:$propT; G:$propG; C:$propC higher than cutoff $prop_cutoff\n" if $prop_cutoff; 538 | } 539 | next; 540 | } 541 | } 542 | 543 | #next has taken care of discards. 544 | $trim++; 545 | if ($convert2uc) { 546 | $object->seq( uc($sequence) ) if !$single_line; 547 | $sequence = uc($sequence) if $single_line; 548 | } 549 | unless ( $df && $invert ) { 550 | if ($single_line) { 551 | if ($qual) { 552 | print OUT1 "@" . 
"$id\n$sequence\n$qual\n"; 553 | } else { 554 | print OUT1 ">$id $description\n$sequence\n"; 555 | } 556 | } else { 557 | $fileout->write_seq($object); 558 | } 559 | } 560 | } #end if $sequence 561 | else { 562 | $empty++; 563 | if ($log) { 564 | print LOG "Sequence $id discard because it was empty\n"; 565 | } 566 | next; 567 | } 568 | } 569 | if ( !$empty ) { $empty = int(0); } 570 | if ( !$discard ) { $discard = int(0); } 571 | if ( !$trim ) { $trim = int(0); } 572 | if ($invert) { 573 | system("mv -i $fastafiletrim tmpfile"); 574 | system("mv $fastafilediscard $fastafiletrim"); 575 | system("mv tmpfile $fastafilediscard"); 576 | my $temp = $trim; 577 | $trim = $discard; 578 | $discard = $temp; 579 | } 580 | unless ( -s "$fastafilediscard" ) { unlink "$fastafilediscard"; } 581 | if ($log) { print LOG "FASTA $fastafile contained ".($empty+$discard+$trim)." sequences\n"; } 582 | print "\nDone, $empty were empty and an additional $discard were discarded. Kept $trim as $fastafiletrim\n"; 583 | if ($log) { print LOG "\n$empty were empty and an additional $discard were discarded. Kept $trim as $fastafiletrim\n"; 584 | } 585 | close(LOG); 586 | } 587 | print "\n"; 588 | -------------------------------------------------------------------------------- /complete.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | params.reference = 'data/genome.fasta' 4 | params.bamfiles = 'data/bams/*.bam' 5 | params.species = 'fungi' 6 | params.minscaffoldsize = 1000 7 | params.maxintronlength = 1000 8 | params.minintronlength = 10 9 | params.cufflinks_overlap_radius = 10 10 | params.cufflinks_pre_mrna_fraction = 0.25 11 | params.cufflinks_min_isoform_fraction = 0.15 12 | 13 | // Remove small contigs. 14 | process remove_small_scaffolds { 15 | container 'genomicpariscentre/bioperl:1.6.924' 16 | 17 | input: 18 | file 'ref.fasta' from file(params.reference) 19 | 20 | output: 21 | file 'ref_trimmed.fasta' into ref_trimmed_for_filter_mito 22 | 23 | "trim_fasta_all.pl -i ref.fasta -out ref_trimmed.fasta -length ${params.minscaffoldsize}" 24 | } 25 | 26 | // We want to remove any scaffolds that show matches to some known 27 | // mitochondrial sequence. For the moment, the process includes a 28 | // download of the P. nodorum mitochondrial sequence. To make the 29 | // search more comprehensive, simply append other sequences ot the 30 | // 'mitorhondrial.fasta' input file. For the moment, we exclude 31 | // sequences that have mitochondrial blast hits to more than 20% of 32 | // their length. 
33 | process filter_mitochondrial { 34 | container 'robsyme/basics' 35 | 36 | input: 37 | file 'ref_trimmed.fasta' from ref_trimmed_for_filter_mito 38 | 39 | output: 40 | file 'nuclear_genome.fasta' into scaffolds_for_repeatmasker 41 | file 'nuclear_genome.fasta' into scaffolds_for_gff2gb 42 | file 'mitochondrial_genome.fasta' into scaffolds_mitochondrial 43 | 44 | """ 45 | curl 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=NC_009746&rettype=fasta&retmode=text' >> mitochondrial.fasta 46 | 47 | makeblastdb -in mitochondrial.fasta -input_type fasta -dbtype nucl 48 | samtools faidx ref_trimmed.fasta 49 | 50 | blastn -query ref_trimmed.fasta -db mitochondrial.fasta -evalue 1 -outfmt '6 qseqid qstart qend qlen' -max_target_seqs 1 \ 51 | | awk 'BEGIN{OFS=\"\\t\"} {print \$1, \$2-1, \$3, \"hit_id_\" idcount++, \$4}' \ 52 | | sort -k1,1 -k2,2n \ 53 | | bedtools merge -i - -c 5 -o mean \ 54 | | bedtools complement -i - -g ref_trimmed.fasta.fai \ 55 | | bedtools genomecov -max 1 -i - -g ref_trimmed.fasta.fai \ 56 | | grep -v '^genome' \ 57 | | tee coverage.txt \ 58 | | awk '\$2 > 0 && \$5 > 0.8 {print \$1}' \ 59 | | xargs samtools faidx ref_trimmed.fasta \ 60 | > nuclear_genome.fasta 61 | 62 | awk '\$2 > 0 && \$5 <= 0.8 {print \$1}' coverage.txt \ 63 | | xargs samtools faidx ref_trimmed.fasta \ 64 | > mitochondrial_genome.fasta 65 | """ 66 | } 67 | 68 | // It's important to mask repetitive sequence before running automated 69 | // gene calling software. Here we use repeatmasker and the Repbase 70 | // database to identify and mask repetitive sequence in the nuclear 71 | // genome. 72 | process repeatmasker { 73 | container 'registry.robsyme.com/repeatmasker' 74 | 75 | input: 76 | file 'ref.fasta' from scaffolds_for_repeatmasker 77 | 78 | output: 79 | file 'ref.fasta.masked' into ref_masked_for_codingquarry 80 | 81 | "RepeatMasker -qq -frag 5000000 -gff -species ${params.species} -no_is ref.fasta" 82 | } 83 | 84 | // The user can supply many bam files from many conditions. For the 85 | // purposes of gene calling, I'm going to merge them into one file for 86 | // ease of handling. Differentiating conditions is of no use to this 87 | // pipeline. 88 | process merge_bams { 89 | container 'robsyme/basics' 90 | 91 | input: 92 | file '*.bam' from Channel.fromPath(params.bamfiles).toList() 93 | 94 | output: 95 | file 'merged.bam' into bam_for_cufflinks 96 | 97 | 'samtools merge merged.bam *.bam' 98 | } 99 | 100 | // We would like to identify potential transcripts using cufflinks 101 | process cufflinks { 102 | container 'robsyme/cufflinks' 103 | 104 | input: 105 | file 'merged.bam' from bam_for_cufflinks 106 | 107 | output: 108 | file 'transcripts.gtf' into transcripts_gtf_for_codingquarry 109 | file 'transcripts.gtf' into transcripts_gtf_for_orf_extraction 110 | 111 | "cufflinks --overlap-radius ${params.cufflinks_overlap_radius} --pre-mrna-fraction ${params.cufflinks_pre_mrna_fraction} --min-isoform-fraction ${params.cufflinks_min_isoform_fraction} --min-intron-length ${params.minintronlength} --max-intron-length ${params.maxintronlength} merged.bam" 112 | } 113 | 114 | // The CodingQuarry denovo gene predictor uses intron/exon boundary 115 | // information to improve the accuracy of gene annotation. 
116 | process codingquarry { 117 | container 'robsyme/codingquarry:1.2' 118 | 119 | input: 120 | file 'ref.fasta' from ref_masked_for_codingquarry 121 | file 'transcripts.gtf' from transcripts_gtf_for_codingquarry 122 | 123 | output: 124 | file 'out/PredictedPass.gff3' into codingquarry_gff_for_gff2gb 125 | 126 | ''' 127 | CufflinksGTF_to_CodingQuarryGFF3.py transcripts.gtf > transcripts.gff 128 | CodingQuarry -f ref.fasta -t transcripts.gff 129 | ''' 130 | } 131 | 132 | process extract_cufflinks_transcripts { 133 | container 'robsyme/basics:0.7' 134 | 135 | input: 136 | file 'ref.fasta' from file(params.reference) 137 | file 'transcripts.gtf' from transcripts_gtf_for_orf_extraction 138 | 139 | output: 140 | file 'transcripts.fasta' into cufflinks_transcripts 141 | file 'transcripts.gff3' into cufflinks_transcripts_gff 142 | 143 | """ 144 | gt gtf_to_gff3 -tidy transcripts.gtf > transcripts_unsorted.gff3 145 | gt gff3 -sort -tidy transcripts_unsorted.gff3 > transcripts.gff3 146 | gt extractfeat -type exon -join -seqfile ref.fasta -matchdescstart transcripts.gff3 > transcripts.fasta 147 | """ 148 | } 149 | 150 | // Generate a fasta file of open reading frames. 151 | process identify_orfs { 152 | container 'robsyme/emboss:6.6.0' 153 | 154 | input: 155 | file 'transcripts.fasta' from cufflinks_transcripts 156 | 157 | output: 158 | file 'transcript_orfs.fasta' into orfs_fasta 159 | 160 | "getorf -sequence transcripts.fasta -outseq transcript_orfs.fasta -minsize 100 -find 0" 161 | } 162 | 163 | process find_pfam_domains_in_transcript_orfs { 164 | container 'robsyme/pfam:28.0' 165 | 166 | input: 167 | file 'orfs.fasta' from orfs_fasta.splitFasta(by: 1000) 168 | 169 | output: 170 | file 'orf.domains' into transcript_orf_domains 171 | 172 | """ 173 | hmmscan -E 1e-5 -o orf.domains /opt/Pfam-A.hmm orfs.fasta 174 | """ 175 | } 176 | 177 | process pfam_output_to_gff { 178 | container 'robsyme/basics:0.7' 179 | 180 | input: 181 | file 'orf.domains' from transcript_orf_domains 182 | file 'transcripts.gff3' from cufflinks_transcripts_gff 183 | 184 | output: 185 | file 'domains.gff3' into pfam_gff_hints 186 | 187 | """ 188 | pfam_to_gff3.rb < orf.domains > orf_domains.gff3 189 | gff_transpose.rb --from orf_domains.gff3 --to transcripts.gff3 > domains.gff3 190 | """ 191 | } 192 | 193 | // The training set for augustus requires that we supply short 194 | // snippets of 'golden' genes which are used for training. Everything 195 | // that is *not* identified as coding sequence is assumed to be 196 | // non-coding. Here we extract each of the genes +/- 200 bp into 197 | // their own genbank file. In cases where genes are separated by less 198 | // than 200 bp, some coding sequence will be included in the neighbor, 199 | // and will be interpreted as 'non-coding' sequence by the augustus 200 | // training algorithm.
A more sensible approach would be to divide the 201 | process gff_to_genbank { 202 | container 'robsyme/augustus:3.1' 203 | 204 | input: 205 | file 'genome.fasta' from scaffolds_for_gff2gb 206 | file 'full_length_genes.gff' from codingquarry_gff_for_gff2gb 207 | 208 | output: 209 | file 'out.gb' into golden_genbank_for_training 210 | 211 | script: 212 | "gff2gbSmallDNA.pl full_length_genes.gff genome.fasta 200 out.gb" 213 | } 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /genemark-annotate.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | params.genome = '**/scaffolds.fasta' 4 | 5 | (strainNames, genomes) = Channel.fromPath(params.genome).separate(2) { path -> [path.getParent().getBaseName(), path] }; 6 | nameAndSequence = strainNames.merge( genomes ) {name, file -> [name, file]} 7 | 8 | process cleanGenome { 9 | input: 10 | set strainName, 'raw.fasta' from nameAndSequence 11 | 12 | output: 13 | set strainName, 'genome.fasta' into cleanGenome 14 | 15 | """ 16 | rename-fasta raw.fasta "${strainName}_scaffold" > genome.fasta 17 | """ 18 | } 19 | 20 | process trainAndCallGenes { 21 | input: 22 | set strainName, 'genome.fasta' from cleanGenome 23 | 24 | output: 25 | set strainName, 'genemark.gtf', 'genome.fasta' into basicGTF 26 | 27 | """ 28 | gmes_petap.pl --ES --fungus --sequence genome.fasta 29 | """ 30 | } 31 | 32 | process gtfToGFF3 { 33 | input: 34 | set strainName, 'genemark.gtf', 'genome.fasta' from basicGTF 35 | 36 | output: 37 | set strainName, 'out.gff3.gz' into renamedAnnotations 38 | 39 | """ 40 | gt gtf_to_gff3 -tidy genemark.gtf \ 41 | | gt gff3 -sort -tidy \ 42 | | rename-gff-ids $strainName > out.gff3 43 | rename-codons $strainName genemark.gtf >> out.gff3 44 | sort -k1,1 -k4,4n out.gff3 > tmp && mv tmp out.gff3 45 | echo "##FASTA" >> out.gff3 46 | awk '/^>/ {print \$0, "[${strainName}]"} !/^>/ {print \$0}' genome.fasta >> out.gff3 47 | gzip --best out.gff3 48 | """ 49 | } 50 | 51 | renamedAnnotations.subscribe { strainName, gff -> 52 | gff.copyTo("${strainName}.gff3.gz") 53 | } 54 | -------------------------------------------------------------------------------- /main.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | params.reference = 'data/genome.fasta' 4 | params.scaffoldmin = 1000 // Minimum scaffold size to consider 5 | params.minsize = 100 // Minimum exon size 6 | params.species = 'fungi' // Name of species passed to RepeatMasker 7 | params.maxintronlength = 500 // Maximum intron length 8 | params.minintronlength = 10 // Minimum intron length 9 | params.bamfiles = 'data/bams/*.bam' 10 | params.pasaconf = 'conf/alignAssembly.conf' // Pasa configuration file to set db name etc. 11 | params.reads = 'data/reads/all.fastq' 12 | reference_raw = file(params.reference) 13 | 14 | // Remove small scaffolds from analysis.
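// As a sketch (hypothetical paths), the scaffold size cut-off and the other defaults above
// can be overridden on the command line when launching the workflow, e.g.:
//   nextflow run main.nf --reference data/mygenome.fasta --bamfiles 'data/bams/*.bam' --scaffoldmin 5000 --species fungi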
15 | process remove_small_scaffolds { 16 | container 'genomicpariscentre/bioperl:1.6.924' 17 | 18 | input: 19 | file 'ref.fasta' from reference_raw 20 | 21 | output: 22 | file 'ref_trimmed.fasta' into ref_trimmed_for_orfs 23 | file 'ref_trimmed.fasta' into ref_trimmed_for_masking 24 | file 'ref_trimmed.fasta' into ref_trimmed_for_softmasking 25 | file 'ref_trimmed.fasta' into ref_trimmed_for_trinity 26 | file 'ref_trimmed.fasta' into ref_trimmed_for_bamtohints 27 | file 'ref_trimmed.fasta' into ref_trimmed_for_pasa 28 | file 'ref_trimmed.fasta' into ref_trimmed_for_gff2gb 29 | file 'ref_trimmed.fasta' into ref_trimmed_for_busco 30 | file 'ref_trimmed.fasta' into ref_trimmed_for_cufflinks 31 | 32 | "trim_fasta_all.pl -i ref.fasta -out ref_trimmed.fasta -length ${params.scaffoldmin}" 33 | } 34 | 35 | process busco { 36 | container 'robsyme/busco' 37 | 38 | input: 39 | file 'ref.fasta' from ref_trimmed_for_busco 40 | 41 | output: 42 | stdout into debug 43 | 44 | "ln -s /opt/busco/lineages/fungi . && busco -in ref.fasta -o custom --lineage fungi" 45 | } 46 | 47 | // Generate a fasta file of open reading frames. 48 | process identify_orfs { 49 | container 'robsyme/emboss' 50 | 51 | input: 52 | file 'ref.fasta' from ref_trimmed_for_orfs 53 | 54 | output: 55 | file 'ref_exons.aa' into orfs_fasta 56 | 57 | "getorf -sequence ref.fasta -outseq ref_exons.aa -minsize 300 -find 0" 58 | } 59 | 60 | // We want to remove ORFs with a high percentage of Xs. Notice that 61 | // the large orfs file is split into pieces containing 1000 fasta 62 | // entries each. 63 | process remove_Xs { 64 | container 'robsyme/bioruby' 65 | 66 | input: 67 | file 'orfs.fasta' from orfs_fasta.splitFasta( by: 5000 ) 68 | 69 | output: 70 | stdout into clean_orfs_for_transposons 71 | stdout into clean_orfs_for_fungi 72 | 73 | """ 74 | #!/usr/bin/env ruby 75 | 76 | require 'bio' 77 | Bio::FlatFile.open('orfs.fasta').each do |entry| 78 | next if entry.length < (${params.minsize} / 3) 79 | x_percentage = entry.seq.composition['X'] / entry.length.to_f 80 | puts entry if x_percentage < 0.3 81 | end 82 | """ 83 | } 84 | 85 | // Run HHblits to identify potential transposons in the (cleaned) open 86 | // reading frames from the 'identify_orfs' step. We run a hhblits 87 | // process for each open reading frame. 
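// A note on the batching used in the two hhblits processes below: csplit writes each record
// of the incoming chunk to files named with its default 'xx' prefix (xx00, xx01, ...), and the
// for loop then runs one hhblits search per ORF file against the database bundled in the container.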
88 | process hhblits_transposon { 89 | container 'robsyme/hhblits-transposon' 90 | 91 | input: 92 | file 'orfs.fasta' from clean_orfs_for_transposons.splitFasta( by: 500 ) 93 | 94 | output: 95 | stdout into hhblits_transposon 96 | 97 | """ 98 | csplit --elide-empty-files --quiet orfs.fasta '/^>/' '{*}' 99 | for orf in xx*; do 100 | hhblits -i \$orf -o stdout -d /databases/transposons -e 1e-5 -E 1e-5 -id 80 -n 2 101 | done 102 | """ 103 | } 104 | 105 | process hhblits_fungi { 106 | container 'robsyme/hhblits-fungi' 107 | 108 | input: 109 | file 'orfs.fasta' from clean_orfs_for_fungi.splitFasta( by: 500 ) 110 | 111 | output: 112 | stdout into hhblits_fungi 113 | 114 | """ 115 | csplit --elide-empty-files --quiet orfs.fasta '/^>/' '{*}' 116 | for orf in xx*; do 117 | hhblits -i \$orf -o stdout -d /databases/fungal_50kclus -e 1e-5 -E 1e-5 -id 80 -n 2 118 | done 119 | """ 120 | } 121 | 122 | //Look at a hhblits output file and generate a gff file of the matches 123 | process parse_transposon_hhr { 124 | cache 'deep' 125 | 126 | input: 127 | file 'search.hhr' from hhblits_transposon.collectFile() 128 | 129 | output: 130 | file 'out.gff3' into hhblits_transposon_gff 131 | 132 | """ 133 | parse_hhr.rb \ 134 | --homology_cutoff 70 \ 135 | --evalue_cutoff 1e-3 \ 136 | --pvalue_cutoff 1e-5 \ 137 | --score_cutoff 100 \ 138 | --align_length_cutoff 50 \ 139 | --template_length_cutoff 30 \ 140 | --repeat \ 141 | search.hhr 142 | """ 143 | } 144 | 145 | process parse_fungi_hhr { 146 | cache 'deep' 147 | 148 | input: 149 | file 'search.hhr' from hhblits_fungi.collectFile() 150 | 151 | output: 152 | file 'out.gff3' into hhblits_fungi_gff 153 | 154 | """ 155 | parse_hhr.rb \ 156 | --homology_cutoff 70 \ 157 | --evalue_cutoff 1e-3 \ 158 | --pvalue_cutoff 1e-5 \ 159 | --score_cutoff 100 \ 160 | --align_length_cutoff 50 \ 161 | --template_length_cutoff 30 \ 162 | search.hhr 163 | """ 164 | } 165 | 166 | process repeatmasker { 167 | container 'repeatmasker' 168 | 169 | input: 170 | file 'ref.fasta' from ref_trimmed_for_masking 171 | 172 | output: 173 | file 'ref.fasta.out.gff' into repeats_gff_for_hints 174 | file 'ref.fasta.out.gff' into repeats_gff_for_softmasking 175 | file 'ref.fasta.masked' into ref_masked_for_golden 176 | file 'ref.fasta.masked' into ref_masked_for_codingquarry 177 | 178 | "RepeatMasker -qq -frag 5000000 -gff -species ${params.species} -no_is ref.fasta" 179 | } 180 | 181 | process repeatmasker_gff_to_hints { 182 | container 'robsyme/bioruby' 183 | 184 | input: 185 | file 'repeats.gff' from repeats_gff_for_hints 186 | 187 | output: 188 | stdout into repeat_hints 189 | 190 | ''' 191 | #!/usr/bin/env ruby 192 | repeats = File.open("repeats.gff", "r") 193 | 194 | while repeats.gets 195 | next if $_ =~ /^#/ 196 | split = $_.split("\t") 197 | split[2] = "nonexonpart" 198 | split[8] = "src=RM;pri=6" 199 | puts split.join("\t") 200 | end 201 | ''' 202 | } 203 | 204 | process softMaskReference { 205 | container 'robsyme/bedtools' 206 | 207 | input: 208 | file 'ref.fasta' from ref_trimmed_for_softmasking 209 | file 'repeats.gff' from repeats_gff_for_softmasking 210 | 211 | output: 212 | file 'ref_softmasked.fasta' into ref_softmasked_for_golden 213 | 214 | "maskFastaFromBed -soft -fi ref.fasta -fo ref_softmasked.fasta -bed repeats.gff" 215 | } 216 | 217 | process merge_bams { 218 | input: 219 | file '*.bam' from Channel.fromPath(params.bamfiles).toList() 220 | 221 | output: 222 | file 'merged.bam' into mapped_reads 223 | file 'merged.bam' into mapped_reads_for_bamtohints 224 | file 'merged.bam' into 
mapped_reads_for_cufflinks 225 | 226 | "samtools merge merged.bam *.bam" 227 | } 228 | 229 | process cufflinks { 230 | container 'robsyme/cufflinks' 231 | 232 | input: 233 | file 'merged.bam' from mapped_reads_for_cufflinks 234 | 235 | output: 236 | file 'transcripts.gtf' into transcriptwtranscripts_gtf_for_codingquarry 237 | 238 | "cufflinks --max-intron-length ${params.maxintronlength} --min-intron-length ${params.minintronlength} merged.bam" 239 | } 240 | 241 | process codingquarry { 242 | container 'robsyme/codingquarry' 243 | 244 | input: 245 | file 'ref.fasta' from ref_masked_for_codingquarry 246 | file 'transcripts.gtf' from transcriptwtranscripts_gtf_for_codingquarry 247 | 248 | output: 249 | file 'out/PredictedPass.gff3' into codingquarry_gff 250 | 251 | ''' 252 | CufflinksGTF_to_CodingQuarryGFF3.py transcripts.gtf > transcripts.gff 253 | CodingQuarry -f ref.fasta -t transcripts.gff 254 | ''' 255 | } 256 | 257 | process split_bams_by_scaffold { 258 | input: 259 | file 'merged.bam' from mapped_reads 260 | 261 | output: 262 | file '*.bam' into split_bams 263 | 264 | """ 265 | samtools index merged.bam && \ 266 | samtools idxstats merged.bam \ 267 | | awk '\$3 > 0 && \$2 > ${params.scaffoldmin} {print \$1}' \ 268 | | xargs -n1 -I{} samtools view -b -o {}.bam merged.bam {} 269 | """ 270 | } 271 | 272 | process genome_guided_trinity { 273 | container 'robsyme/trinity' 274 | 275 | input: 276 | set 'ref.fasta', 'single.bam' from ref_trimmed_for_trinity.spread(split_bams) 277 | 278 | output: 279 | file 'trinity_out_dir/Trinity-GG.fasta' into genome_guided_trinity_split 280 | 281 | "Trinity --genome_guided_bam single.bam --genome_guided_max_intron ${params.maxintronlength} --max_memory 2G --jaccard_clip --CPU 1 --full_cleanup" 282 | } 283 | 284 | process collate_genome_guided_transcripts { 285 | input: 286 | stdin genome_guided_trinity_split.collectFile().map{ it.text } 287 | 288 | output: 289 | stdout into genome_guided_trinity 290 | 291 | ''' 292 | #!/usr/bin/awk -f 293 | /^>/ { 294 | sub(/>GG[0-9]+/, ">GG" count++) 295 | print 296 | } 297 | 298 | /^[^>]/ { 299 | print $0 300 | } 301 | ''' 302 | } 303 | 304 | process denovo_trinity { 305 | container 'robsyme/trinity' 306 | 307 | input: 308 | file 'reads.fastq' from file(params.reads) 309 | 310 | output: 311 | file 'trinity_out_dir.Trinity.fasta' into denovo_trinity 312 | 313 | "Trinity --seqType fq --single reads.fastq --max_memory 2G --CPU 2 --jaccard_clip --full_cleanup" 314 | } 315 | 316 | process bam_to_hints { 317 | container 'robsyme/bedtools' 318 | 319 | input: 320 | file 'ref.fasta' from ref_trimmed_for_bamtohints 321 | file 'all.bam' from mapped_reads_for_bamtohints 322 | 323 | output: 324 | file 'all.bam.junctions.hints' into augustus_hints 325 | 326 | "augustus_RNAseq_hints.pl --genome ref.fasta --bam all.bam" 327 | } 328 | 329 | 330 | // Note that I had to start a separate mysql docker container: docker 331 | // run --name pasadb -e MYSQL_ROOT_PASSWORD=password -e MYSQL_DATABASE=pasa -e MYSQL_USER=pasauser -e MYSQL_PASSWORD=password mysql 332 | process pasa { 333 | container 'robsyme/pasa' 334 | 335 | input: 336 | file 'GG_raw.fasta' from genome_guided_trinity 337 | file 'DN_raw.fasta' from denovo_trinity 338 | file 'ref.fasta' from ref_trimmed_for_pasa 339 | file 'alignAssembly.config' from file(params.pasaconf) 340 | 341 | output: 342 | file '*.assemblies.fasta.transdecoder.pep' into pasa_cds_for_golden 343 | file '*.assemblies.fasta.transdecoder.genome.gff3' into pasa_gff_for_fl 344 | file 
'*.assemblies.fasta.transdecoder.pep' into pasa_cds_for_fl 345 | file 'ref.fasta' into reference_genome 346 | 347 | """ 348 | grep '^>' DN_raw.fasta \ 349 | | awk '{print(substr(\$1, 2))}' \ 350 | > DN_raw.list 351 | 352 | cat DN_raw.fasta GG_raw.fasta > transcripts.fasta 353 | 354 | /opt/pasa/scripts/Launch_PASA_pipeline.pl \ 355 | -c alignAssembly.config \ 356 | --MAX_INTRON_LENGTH ${params.maxintronlength} \ 357 | --stringent_alignment_overlap 30.0 \ 358 | -C \ 359 | -r \ 360 | -R \ 361 | -g ref.fasta \ 362 | -t transcripts.fasta \ 363 | --TDN DN_raw.list \ 364 | --ALIGNERS blat,gmap \ 365 | --TRANSDECODER \ 366 | --CPU 2 367 | 368 | /opt/pasa/scripts/build_comprehensive_transcriptome.dbi \ 369 | -c alignAssembly.config \ 370 | -t transcripts.fasta \ 371 | --min_per_ID 95 \ 372 | --min_per_aligned 30 373 | 374 | /opt/pasa/scripts/pasa_asmbls_to_training_set.dbi \ 375 | --pasa_transcripts_fasta *.assemblies.fasta \ 376 | --pasa_transcripts_gff3 *.pasa_assemblies.gff3 377 | """ 378 | } 379 | 380 | // Pull out the full-length transcripts identified by pasa (and Transdecoder) 381 | process find_full_length_proteins { 382 | container 'robsyme/bioruby' 383 | 384 | input: 385 | stdin pasa_cds_for_golden.map{ it.text } 386 | 387 | output: 388 | stdout into full_pasa_pep_fasta 389 | 390 | """ 391 | #!/usr/bin/env ruby 392 | require 'bio' 393 | 394 | Bio::FlatFile.auto(ARGF).each do |entry| 395 | puts entry if entry.definition =~ /type:complete/ 396 | end 397 | """ 398 | } 399 | 400 | process exclude_partial_genes_from_gff { 401 | container 'robsyme/bioruby' 402 | 403 | input: 404 | file 'hits.gff3' from pasa_gff_for_fl 405 | file 'peptide.fasta' from pasa_cds_for_fl 406 | 407 | output: 408 | stdout into full_length_gff 409 | 410 | ''' 411 | #!/usr/bin/env ruby 412 | require "bio" 413 | require "set" 414 | 415 | full_length_ids = Bio::FlatFile 416 | .open("peptide.fasta") 417 | .find_all{ |entry| entry.definition =~ /type:complete/ } 418 | .map{ |entry| entry.entry_id } 419 | .to_set 420 | 421 | File.open("hits.gff3").each do |line| 422 | next unless line =~ /ID=(cds.)?([^\\|]+)\\|/ 423 | next unless full_length_ids.include?($2) 424 | scaffold_name = line.split("\t").first 425 | puts line 426 | end 427 | ''' 428 | } 429 | 430 | // The input to Augustus training requires that we provide the 431 | // 'golden' annotations as a genbank format, but it's not just any 432 | // genbank format, there are some restrictions. 433 | // 434 | // For the best results, we should remove proteins that are too 435 | // similar. Augusutus will also assume that all nucleotides not 436 | // annotated as coding sequence are non-coding sequence, so we need to 437 | // trim the output to the coding sequence += a small margin either 438 | // side. Note that this is not simply a conversion of gff to genbank. 
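// In the gff2gbSmallDNA.pl call below, the third argument (1000) is the maximum amount of
// gene-flanking DNA to export with each gene, so each training locus should carry up to
// 1 kb of genomic context on either side.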
439 | process gff_to_genbank { 440 | container 'robsyme/augustus' 441 | 442 | input: 443 | file 'genome.fasta' from ref_trimmed_for_gff2gb 444 | file 'full_length_genes.gff' from full_length_gff 445 | 446 | output: 447 | file 'out.gb' into golden_genbank_for_training 448 | 449 | "gff2gbSmallDNA.pl full_length_genes.gff genome.fasta 1000 out.gb" 450 | } 451 | 452 | process train_augustus { 453 | container 'robsyme/augustus' 454 | 455 | input: 456 | file 'custom.gb' from golden_genbank_for_training 457 | 458 | output: 459 | file 'custom.tar.gz' into augustus_trained_parameters 460 | 461 | """ 462 | mkdir -p /opt/augustus/config/species/custom/ 463 | cp /opt/augustus/config/species/generic/generic_parameters.cfg /opt/augustus/config/species/custom/custom_parameters.cfg 464 | cp /opt/augustus/config/species/generic/generic_weightmatrix.txt /opt/augustus/config/species/custom/ 465 | /opt/augustus/bin/etraining --species=custom custom.gb 466 | /opt/augustus/scripts/optimize_augustus.pl --species=custom custom.gb 467 | tar -czvf custom.tar.gz /opt/augustus/config/species/custom 468 | """ 469 | } 470 | 471 | debug.subscribe{ println("DEBUG: $it") } 472 | 473 | 474 | // TODO: Evaluate whether it is at all helpful to supply cufflinks gtf as 'exonpart' hints to augustus. The problem with cufflinks is the concatentation of overlapping transcripts. When those transcripts are from opposite directions, supplying a stranded hint to augustus may prevent the annotation of one of genes that form the fused transcript. 475 | // TODO: Perhaps I can do ORF detection on the cufflinks transcripts and then run those ORFs through pfam and signalP detected domains can be converted into hints for augustus. 476 | 477 | -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | manifest { 2 | homePage = 'http://github.com/robsyme/nextflow-annotation' 3 | description = 'Fungal genome annotation workflow' 4 | mainScript = 'annotate.nf' 5 | } 6 | -------------------------------------------------------------------------------- /proteinortho.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | (f1, f2, f3) = Channel.fromPath(params.fasta).separate(3){ [it,it,it] } 4 | 5 | params.cpus = 1 6 | params.outdir = 'proteinortho_out' 7 | outdir = file(params.outdir) 8 | outdir.mkdirs() 9 | 10 | 11 | process indexGenomes { 12 | container 'robsyme/proteinortho' 13 | storeDir outdir 14 | 15 | input: 16 | file '*' from f1.toList() 17 | 18 | output: 19 | file '*' into db1 20 | file '*' into db2 21 | 22 | "proteinortho5.pl -step=1 *.fasta" 23 | } 24 | 25 | def list = [] 26 | f2.eachWithIndex{ unit, idx -> list.add(idx) } 27 | 28 | process runBlasts { 29 | container 'robsyme/proteinortho' 30 | storeDir outdir 31 | 32 | input: 33 | file '*' from db1 34 | file "*" from f2.toList() 35 | each index from list[0..-3] 36 | 37 | output: 38 | file 'myproject.*' into blastresults 39 | 40 | "proteinortho5.pl -verbose -step=2 -startat=$index -stopat=$index -cpus=${params.cpus} *.fasta" 41 | } 42 | 43 | process performClustering { 44 | container 'robsyme/proteinortho' 45 | storeDir outdir 46 | 47 | input: 48 | file '*' from blastresults 49 | file '*' from db2 50 | file '*' from f3.toList() 51 | 52 | output: 53 | file 'myproject.*' into proteinortho_out 54 | 55 | "proteinortho5.pl -step=3 -singles -verbose *.fasta" 56 | } 57 | 58 | 
proteinortho_out.flatten().subscribe{ println("Proteinortho output file: $it") } 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /simple-annotate.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | genome = file(params.genome) 4 | cegmaFile = file(params.cegma) 5 | strainName = genome.getParent().getBaseName() 6 | outFilename = params.out 7 | 8 | process cleanGenome { 9 | input: 10 | genome 11 | 12 | output: 13 | stdout into cleanGenomes 14 | 15 | script: 16 | ''' 17 | awk '/^>/ && !/[.*]/ {print(\$0, "[$strainName]")} /^>/ && /[.*]/ {print \$0} /^[^>]/ {print(toupper(\$0))}' '$genome' | sed "s/\015//" 18 | ''' 19 | } 20 | 21 | (fastaForGFF, fastaForAug) = cleanGenomes.separate(2){ [it, it] } 22 | 23 | process cegmaGFFtoFullerGFF { 24 | input: 25 | file 'cegmaFile' from cegmaFile 26 | 27 | output: 28 | stdout fullGFF 29 | 30 | ''' 31 | fullerCegmaGFF.rb $cegmaFile 32 | ''' 33 | } 34 | 35 | process cegmaGFFToGenbank { 36 | container 'robsyme/augustus' 37 | 38 | input: 39 | file gff from fullGFF 40 | file fasta from fastaForGFF 41 | 42 | output: 43 | file 'out.gb' into trainingGenbank 44 | 45 | ''' 46 | gff2gbSmallDNA.pl $gff $fasta 5000 out.gb 47 | ''' 48 | } 49 | 50 | process trainAndCallGenes { 51 | container 'robsyme/augustus' 52 | 53 | input: 54 | file trainingGenbank 55 | file genome from fastaForAug 56 | 57 | output: 58 | file 'out.txt' into trainedFile 59 | 60 | ''' 61 | optimize_augustus.pl --species=fusarium_graminearum $trainingGenbank 62 | etraining --species=fusarium_graminearum $trainingGenbank 63 | augustus --species=fusarium_graminearum --gff3=on $genome > out.txt 64 | ''' 65 | } 66 | 67 | trainedFile.subscribe { trained -> 68 | trained.copyTo(outFilename) 69 | } 70 | 71 | --------------------------------------------------------------------------------
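As a usage sketch for simple-annotate.nf (hypothetical paths), the genome, the CEGMA GFF and the output filename are supplied as parameters, with the strain name taken from the genome's parent directory, e.g.:

 nextflow run simple-annotate.nf --genome strains/strainA/scaffolds.fasta --cegma strains/strainA/cegma.gff --out strainA.gff3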