├── .gitignore ├── Dockerfiles ├── RepeatMasker-onbuild │ ├── Dockerfile │ └── README.md ├── RepeatMasker │ ├── Dockerfile │ └── README.md ├── augustus │ ├── Dockerfile │ └── fgram_base │ │ ├── fgram_base_exon_probs.pbl │ │ ├── fgram_base_igenic_probs.pbl │ │ ├── fgram_base_intron_probs.pbl │ │ ├── fgram_base_metapars.cfg │ │ ├── fgram_base_parameters.cfg │ │ └── fgram_base_weightmatrix.txt ├── basics │ └── Dockerfile ├── bedtools │ └── Dockerfile ├── bioruby │ └── Dockerfile ├── busco │ ├── Dockerfile │ └── README.md ├── chado-helper │ ├── Dockerfile │ └── README.md ├── codingquarry │ └── Dockerfile ├── cufflinks │ └── Dockerfile ├── emboss │ └── Dockerfile ├── gff2gb │ └── Dockerfile ├── hhblits-fungi │ └── Dockerfile ├── hhblits-transposon │ └── Dockerfile ├── interproscan │ ├── Dockerfile │ └── interproscan.properties ├── jamg │ └── Dockerfile ├── ncbi-blast │ └── Dockerfile ├── pasa │ ├── Dockerfile │ ├── Dockerfile-pasaweb │ └── conf.txt ├── pfam │ └── Dockerfile ├── proteinortho │ └── Dockerfile ├── tophat │ └── Dockerfile └── trinity │ └── Dockerfile ├── LICENSE ├── README.md ├── annotate ├── bin ├── GG_trinity_accession_incrementer.rb ├── augustus_RNAseq_hints.pl ├── bed12_to_augustus_junction_hints.pl ├── exonerate_to_genbank.rb ├── fullerCegmaGFF.rb ├── gff2gb ├── gff_transpose.rb ├── gff_transpose.rb~ ├── parse_hhr.rb ├── pfam_to_gff3.rb ├── rename-codons ├── rename-fasta ├── rename-gff-ids └── trim_fasta_all.pl ├── complete.nf ├── genemark-annotate.nf ├── main.nf ├── nextflow.config ├── proteinortho.nf └── simple-annotate.nf /.gitignore: -------------------------------------------------------------------------------- 1 | Dockerfiles/RepeatMasker/repeatmaskerlibraries*.tar.gz 2 | .nextflow.* 3 | work 4 | #* 5 | -------------------------------------------------------------------------------- /Dockerfiles/RepeatMasker-onbuild/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -qqy \ 6 | wget \ 7 | hmmer \ 8 | unzip \ 9 | build-essential 10 | 11 | # Install TRF (for RepeatScout) 12 | WORKDIR /usr/local/bin 13 | RUN wget http://tandem.bu.edu/trf/downloads/trf407b.linux64 && mv trf*.linux64 trf && chmod +x trf 14 | 15 | # Basic workdir 16 | WORKDIR /usr/local 17 | 18 | # Install nseg (for RepeatScout) 19 | RUN mkdir nseg && \ 20 | cd nseg && \ 21 | wget ftp://ftp.ncbi.nih.gov/pub/seg/nseg/* && \ 22 | make && \ 23 | mv nseg ../bin && \ 24 | mv nmerge ../bin 25 | 26 | # Install RepeatScout 27 | RUN wget http://bix.ucsd.edu/repeatscout/RepeatScout-1.0.5.tar.gz && \ 28 | tar -xvf RepeatScout* && \ 29 | rm RepeatScout*.tar.gz && \ 30 | mv RepeatScout* RepeatScout && \ 31 | cd RepeatScout && \ 32 | make 33 | 34 | # Install RMBlast 35 | RUN wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/rmblast/2.2.28/ncbi-rmblastn-2.2.28-x64-linux.tar.gz && \ 36 | tar -xzvf ncbi-rmblastn* && \ 37 | rm ncbi-rmblastn*.tar.gz && \ 38 | mv ncbi-rmblastn*/bin/rmblastn bin && \ 39 | rm -rf ncbi-rmblastn 40 | 41 | # Install Blast+ 42 | RUN wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.6.0/ncbi-blast-2.6.0+-x64-linux.tar.gz && \ 43 | tar -xzvf ncbi-blast* && \ 44 | find ncbi-blast* -type f -executable -exec mv {} bin \; && \ 45 | rm -rf ncbi-blast* 46 | 47 | # Install RepeatMasker 48 | RUN wget http://www.repeatmasker.org/RepeatMasker-open-4-0-7.tar.gz \ 49 | && tar -xzvf RepeatMasker-open*.tar.gz \ 50 | && rm -f RepeatMasker-open*.tar.gz \ 51 | && perl -0p -e 
's/\/usr\/local\/hmmer/\/usr\/bin/g;' \ 52 | -e 's/\/usr\/local\/rmblast/\/usr\/local\/bin/g;' \ 53 | -e 's/DEFAULT_SEARCH_ENGINE = "crossmatch"/DEFAULT_SEARCH_ENGINE = "ncbi"/g;' \ 54 | -e 's/TRF_PRGM = ""/TRF_PRGM = "\/usr\/local\/bin\/trf"/g;' RepeatMasker/RepeatMaskerConfig.tmpl > RepeatMasker/RepeatMaskerConfig.pm 55 | 56 | # Fix RepeatMasker's strange shebang lines 57 | RUN cd /usr/local/RepeatMasker \ 58 | && perl -i -0pe 's/^#\!.*perl.*/#\!\/usr\/bin\/env perl/g' \ 59 | RepeatMasker \ 60 | DateRepeats \ 61 | ProcessRepeats \ 62 | RepeatProteinMask \ 63 | DupMasker \ 64 | util/queryRepeatDatabase.pl \ 65 | util/queryTaxonomyDatabase.pl \ 66 | util/rmOutToGFF3.pl \ 67 | util/rmToUCSCTables.pl 68 | 69 | # Install RIPcal 70 | RUN wget http://downloads.sourceforge.net/project/ripcal/RIPCAL/RIPCAL_2.0/ripcal2_install.zip \ 71 | && unzip ripcal*.zip \ 72 | && rm ripcal*.zip \ 73 | && mv ripcal* ripcal \ 74 | && cd ripcal \ 75 | && chmod +x perl/* 76 | 77 | # Install RECON 78 | RUN wget http://www.repeatmasker.org/RepeatModeler/RECON-1.08.tar.gz \ 79 | && tar -xvf RECON* \ 80 | && rm RECON*.tar.gz \ 81 | && mv RECON* recon \ 82 | && cd recon/src \ 83 | && make \ 84 | && make install \ 85 | && perl -i -0pe 's/\$path = "";/\$path = "\/usr\/local\/RECON-1.08\/bin";/g' ../scripts/\recon.pl 86 | 87 | # Install RepeatModeler deps 88 | RUN apt-get install -qqy libjson-perl liburi-perl liblwp-useragent-determined-perl 89 | 90 | # Install RepeatModeler 91 | RUN wget http://www.repeatmasker.org/RepeatModeler/RepeatModeler-open-1.0.10.tar.gz \ 92 | && tar -xvf RepeatModeler-*.tar.gz \ 93 | && rm RepeatModeler-*.tar.gz \ 94 | && mv RepeatModeler-*/ RepeatModeler \ 95 | && cd RepeatModeler \ 96 | && perl -i -0pe 's/^#\!.*/#\!\/usr\/bin\/env perl/g' \ 97 | configure \ 98 | BuildDatabase \ 99 | Refiner \ 100 | RepeatClassifier \ 101 | RepeatModeler \ 102 | TRFMask \ 103 | util/dfamConsensusTool.pl \ 104 | util/renameIds.pl \ 105 | util/viewMSA.pl \ 106 | && cat RepModelConfig.pm.tmpl \ 107 | | perl -p -e 's/\$RMBLAST_DIR +=.*;$/\$RMBLAST_DIR = "\/usr\/local\/bin";/g' \ 108 | | perl -p -e 's/\$RECON_DIR +=.*;$/\$RECON_DIR = "\/usr\/local\/recon\/bin";/g' \ 109 | | perl -p -e 's/\$RSCOUT_DIR +=.*;$/\$RSCOUT_DIR = "\/usr\/local\/RepeatScout";/g' \ 110 | > RepModelConfig.pm 111 | 112 | # I can't bundle the girinst RepBase libraries with the docker image, 113 | # so you'll need to get them yourself. 
Download them from 114 | # http://www.girinst.org/server/RepBase/protected/repeatmaskerlibraries/RepBaseRepeatMaskerEdition-20170127.tar.gz 115 | 116 | ONBUILD WORKDIR /usr/local/RepeatMasker 117 | ONBUILD ADD repeatmaskerlibraries.tar.gz /usr/local/RepeatMasker 118 | ONBUILD RUN cd /usr/local/RepeatMasker && util/buildRMLibFromEMBL.pl Libraries/RMRBSeqs.embl > Libraries/RepeatMasker.lib \ 119 | && makeblastdb -dbtype nucl -in Libraries/RepeatMasker.lib > /dev/null 2>&1 \ 120 | && makeblastdb -dbtype prot -in Libraries/RepeatPeps.lib > /dev/null 2>&1 121 | 122 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/RepeatMasker:/usr/local/RepeatScout:/usr/local/recon/bin:/usr/local/RepeatModeler 123 | #ENTRYPOINT ["/usr/local/RepeatMasker/RepeatMasker"] 124 | -------------------------------------------------------------------------------- /Dockerfiles/RepeatMasker-onbuild/README.md: -------------------------------------------------------------------------------- 1 | # RepeatMasker-onbuild Container 2 | 3 | This simple container is designed to make it easier to run 4 | RepeatMasker on new machines. If you have your own permanent 5 | [big-ass-server](http://jermdemo.blogspot.ca/2011/06/big-ass-servers-and-myths-of-clusters.html), 6 | you might want to simply install the software as usual and that's very 7 | sensible. 8 | 9 | There are also plenty of situations where you might want to use a 10 | container: 11 | 12 | * You are using compute resources on EC2 or GCE and you don't want to 13 | make a new disk image for each step of the annotation pipeline (and 14 | you don't want the hassle of cloud orchestration tools and scripts). 15 | * A container described by a Dockerfile also provides complete 16 | documentation of how the results were generated, which makes 17 | replication a little easier. 18 | * You are using a [docker-aware pipeline](http://nextflow.io/). 19 | 20 | ## What Does the Image Contain? 21 | 22 | This image contains the RepeatMasker binary and its prerequisites 23 | hmmer, rmblast, blast+ and trf. It *does not* contain the RepBase 24 | database. You will need to register and download this yourself and then 25 | build a new image based on this one. It also does not contain the 26 | search engines Cross_Match and ABBlast/WUBlast because of licensing 27 | restrictions. 28 | 29 | ## Running RepeatMasker from inside a container 30 | 31 | You'll need a copy of the latest 32 | [Repbase-derived RepeatMasker libraries](http://www.girinst.org/server/RepBase/index.php) 33 | (requires 34 | [free registration](http://www.girinst.org/accountservices/register.php)), 35 | renamed as `repeatmaskerlibraries.tar.gz`. We then create a new 36 | Dockerfile and generate our new image: 37 | 38 | ```sh 39 | wget --user your_username \ 40 | --password 12345 \ 41 | -O repeatmaskerlibraries.tar.gz \ 42 | http://www.girinst.org/server/RepBase/protected/repeatmaskerlibraries/RepBaseRepeatMaskerEdition-20170127.tar.gz 43 | echo "FROM robsyme/repeatmasker-onbuild" > Dockerfile 44 | docker build -t myrepeatmasker . 45 | ``` 46 | 47 | We can then run RepeatMasker: 48 | 49 | ```sh 50 | docker run -v $PWD:/in -w /in myrepeatmasker RepeatMasker scaffolds.fasta 51 | ``` 52 | 53 | This runs the container, mounting the host's current directory (and 54 | all subdirectories) inside the container at `/in` (`-v $PWD:/in`). The `-w 55 | /in` argument ensures that the command is run from this new 56 | directory. We then specify that we want to use the `myrepeatmasker` 57 | image we just created.
Inside the container, the command `RepeatMasker 58 | scaffolds.fasta` is run. 59 | 60 | ## Modifying the container 61 | 62 | You are free to modify the container (perhaps you really want to use 63 | Cross_Match, for example). Simply clone this repository (`git clone 64 | https://github.com/robsyme/nextflow-annotate.git`) and modify the 65 | Dockerfile before building. 66 | -------------------------------------------------------------------------------- /Dockerfiles/RepeatMasker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM robsyme/repeatmasker-onbuild:latest 2 | 3 | MAINTAINER Rob Syme 4 | 5 | -------------------------------------------------------------------------------- /Dockerfiles/RepeatMasker/README.md: -------------------------------------------------------------------------------- 1 | # RepeatMasker Container 2 | 3 | The RepBase licence prohibits distribution of the libraries, so we 4 | need a two-step process to build the final docker image. The first 5 | step is the installation of the dependencies. This has already been 6 | done inside the `robsyme/repeatmasker-onbuild` image. 7 | 8 | The second step is to download and install the RepBase libraries. The 9 | repeatmasker-onbuild image takes care of the installation. It only 10 | requires that you download the RepBase libraries to a file named 11 | 'repeatmaskerlibraries.tar.gz' next to the Dockerfile (in this 12 | directory, for example). 13 | 14 | The Dockerfile is minimal, containing only: 15 | 16 | ``` 17 | FROM robsyme/repeatmasker-onbuild 18 | ``` 19 | 20 | If you have this tiny Dockerfile and the RepBase libraries, you can 21 | build and use your docker image with: 22 | 23 | ```sh 24 | docker build -t myrepeatmasker . 25 | cd /path/to/data 26 | docker run -v $PWD:/in -w /in myrepeatmasker RepeatMasker scaffolds.fasta 27 | ``` 28 | 29 | Note that only the current directory (and its children) is mounted 30 | inside the container, so you need to ensure that your scaffolds file 31 | is in the current path tree. 32 | -------------------------------------------------------------------------------- /Dockerfiles/augustus/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update \ 6 | && apt-get install -qqy \ 7 | build-essential \ 8 | libbamtools-dev \ 9 | libboost-graph-dev \ 10 | libboost-iostreams-dev \ 11 | libgsl-dev \ 12 | liblpsolve55-dev \ 13 | libsqlite3-dev \ 14 | libsuitesparse-dev \ 15 | wget \ 16 | zlib1g-dev 17 | 18 | WORKDIR /usr/local 19 | 20 | # Install Augustus 21 | RUN wget http://bioinf.uni-greifswald.de/augustus/binaries/augustus.current.tar.gz \ 22 | && tar -xvf augustus*.tar.gz \ 23 | && rm augustus*.tar.gz \ 24 | && cd augustus \ 25 | && echo "COMPGENEPRED = true" >> common.mk \ 26 | && make \ 27 | && make install 28 | 29 | ENV AUGUSTUS_CONFIG_PATH /usr/local/augustus/config 30 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/augustus/bin:/usr/local/augustus/scripts 31 | -------------------------------------------------------------------------------- /Dockerfiles/augustus/fgram_base/fgram_base_metapars.cfg: -------------------------------------------------------------------------------- 1 | # This file contains the list of meta parameters which are subject to optimization. 2 | # All other parameters are chosen as given in the species parameter file.
The order 3 | # of the parameters determines the order in the optimisation process. 4 | # For each parameter the range of possible values is specified after the parameter 5 | # name and at least one white space. 6 | # 3 cases are possible for the range: 7 | # - an explicit list is given, e.g. protein "on" "off" 8 | # - it is an integer range, e.g. window_size "1"-"5" 9 | # - it is a range of floating point numbers, e.g. pseudocount "0.3"_"1.8" 10 | # 11 | # 12 | # Mario Stanke, 19.12.2006 13 | # 14 | 15 | /Constant/dss_end "1"-"4" 16 | /Constant/dss_start "1"-"3" 17 | /Constant/ass_start "1"-"3" 18 | /Constant/ass_end "0"-"4" 19 | /Constant/ass_upwindow_size "1"-"50" 20 | /IntronModel/d "100"-"950" 21 | /IntronModel/ass_motif_memory "0"-"3" 22 | /IntronModel/ass_motif_radius "0"-"4" 23 | /ExonModel/tis_motif_memory "0"-"3" 24 | /ExonModel/tis_motif_radius "0"-"3" 25 | /Constant/trans_init_window "0"-"25" 26 | /Constant/init_coding_len "0"-"18" 27 | /ExonModel/patpseudocount "0.5"_"5" 28 | /ExonModel/etpseudocount "0"-"10" 29 | /ExonModel/etorder "0"-"3" 30 | /Constant/intterm_coding_len "0"-"13" 31 | /ExonModel/slope_of_bandwidth "0.05"_"0.6" 32 | /ExonModel/minwindowcount "1"-"15" 33 | /IGenicModel/patpseudocount "0.5"_"7" 34 | /IntronModel/patpseudocount "0.5"_"7" 35 | /IntronModel/slope_of_bandwidth "0.05"_"0.6" 36 | /IntronModel/minwindowcount "1"-"8" 37 | /IntronModel/asspseudocount "0.0005"_"0.03" 38 | /IntronModel/dsspseudocount "0.0002"_"0.04" 39 | /IntronModel/dssneighborfactor "0.0001"_"0.01" 40 | /ExonModel/minPatSum "100"_"600" 41 | /Constant/probNinCoding "0.15"_".25" 42 | /Constant/decomp_num_steps "1"-"5" 43 | # comment parameters out that you do not want to be subject of optimization 44 | #/IGenicModel/k "4" "3" "5" 45 | #/IntronModel/k "4" "3" "5" 46 | #/ExonModel/k "4" "3" "5" 47 | -------------------------------------------------------------------------------- /Dockerfiles/augustus/fgram_base/fgram_base_parameters.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # fgram_base parameters. 
3 | # 4 | # date : 19.12.2006 5 | # 6 | 7 | # 8 | # Properties for augustus 9 | #------------------------------------ 10 | /augustus/verbosity 3 # 0-3, 0: only print the neccessary 11 | maxDNAPieceSize 200000 # maximum segment that is predicted in one piece 12 | stopCodonExcludedFromCDS true # make this 'true' if the CDS includes the stop codon (training and prediction) 13 | 14 | # gff output options: 15 | protein on # output predicted protein sequence 16 | codingseq off # output the coding sequence 17 | cds on # output 'cds' as feature for exons 18 | start on # output start codons (translation start) 19 | stop on # output stop codons (translation stop) 20 | introns on # output introns 21 | tss on # output transcription start site 22 | tts on # output transcription termination site 23 | print_utr off # output 5'UTR and 3'UTR lines in addition to exon lines 24 | 25 | checkExAcc off # internal parameter for extrinsic accuracy 26 | 27 | # alternative transcripts and posterior probabilities 28 | sample 100 # the number of sampling iterations 29 | alternatives-from-sampling false # output alternative transcripts 30 | minexonintronprob 0.08 # minimal posterior probability of all (coding) exons 31 | minmeanexonintronprob 0.4 # minimal geometric mean of the posterior probs of introns and exons 32 | maxtracks -1 # maximum number of reported transcripts per gene (-1: no limit) 33 | keep_viterbi true # set to true if all Viterbi transcripts should be reported 34 | uniqueCDS true # don't report transcripts that differ only in the UTR 35 | UTR off # predict untranslated regions 36 | 37 | # 38 | # 39 | # The rest of the file contains mainly meta parameters used for training. 40 | # 41 | 42 | # global constants 43 | # ---------------------------- 44 | 45 | /Constant/trans_init_window 6 46 | /Constant/ass_upwindow_size 25 47 | /Constant/ass_start 3 48 | /Constant/ass_end 3 49 | /Constant/dss_start 2 50 | /Constant/dss_end 4 51 | /Constant/init_coding_len 18 52 | /Constant/intterm_coding_len 13 53 | /Constant/tss_upwindow_size 45 54 | /Constant/decomp_num_at 1 55 | /Constant/decomp_num_gc 1 56 | /Constant/gc_range_min 0.32 # This range has an effect only when decomp_num_steps>1. 
57 | /Constant/gc_range_max 0.73 # States the minimal and maximal percentage of c or g 58 | /Constant/decomp_num_steps 3 59 | /Constant/min_coding_len 201 # no gene with a coding sequence shorter than this is predicted 60 | /Constant/probNinCoding 0.23 61 | /Constant/amberprob 0.33 # Prob(stop codon = tag), if 0 tag is assumed to code for amino acid 62 | /Constant/ochreprob 0.33 # Prob(stop codon = taa), if 0 taa is assumed to code for amino acid 63 | /Constant/opalprob 0.34 # Prob(stop codon = tga), if 0 tga is assumed to code for amino acid 64 | /Constant/subopt_transcript_threshold 0.7 65 | /Constant/almost_identical_maxdiff 10 66 | 67 | # type of weighing, one of 1 = equalWeights, 2 = gcContentClasses, 3 = multiNormalKernel 68 | /BaseCount/weighingType 3 69 | # file with the weight matrix (only for multiNormalKernel type weighing) 70 | /BaseCount/weightMatrixFile fgram_base_weightmatrix.txt # change this to your species if at all neccessary 71 | 72 | # Properties for IGenicModel 73 | # ---------------------------- 74 | /IGenicModel/verbosity 0 75 | /IGenicModel/infile fgram_base_igenic_probs.pbl # change this and the other five filenames *_probs.pbl below to your species 76 | /IGenicModel/outfile fgram_base_igenic_probs.pbl 77 | /IGenicModel/patpseudocount 5.0 78 | /IGenicModel/k 4 # order of the Markov chain for content model, keep equal to /ExonModel/k 79 | 80 | # Properties for ExonModel 81 | # ---------------------------- 82 | /ExonModel/verbosity 3 83 | /ExonModel/infile fgram_base_exon_probs.pbl 84 | /ExonModel/outfile fgram_base_exon_probs.pbl 85 | /ExonModel/patpseudocount 0.5 86 | /ExonModel/minPatSum 233.3 87 | /ExonModel/k 4 # order of the Markov chain for content model 88 | /ExonModel/etorder 2 89 | /ExonModel/etpseudocount 3 90 | /ExonModel/exonlengthD 2000 # beyond this the distribution is geometric 91 | /ExonModel/maxexonlength 15000 92 | /ExonModel/slope_of_bandwidth 0.325 93 | /ExonModel/minwindowcount 8 94 | /ExonModel/tis_motif_memory 3 95 | /ExonModel/tis_motif_radius 0 96 | 97 | # Properties for IntronModel 98 | # ---------------------------- 99 | /IntronModel/verbosity 0 100 | /IntronModel/infile fgram_base_intron_probs.pbl 101 | /IntronModel/outfile fgram_base_intron_probs.pbl 102 | /IntronModel/patpseudocount 5.0 103 | /IntronModel/k 4 # order of the Markov chain for content model, keep equal to /ExonModel/k 104 | /IntronModel/slope_of_bandwidth 0.4 105 | /IntronModel/minwindowcount 4 106 | /IntronModel/asspseudocount 0.01525 107 | /IntronModel/dsspseudocount 0.0005 108 | /IntronModel/dssneighborfactor 0.007525 109 | #/IntronModel/splicefile fgram_base_splicefile.txt # this optional file contains additional windows around splice sites for training, uncomment if you have one 110 | /IntronModel/sf_with_motif false # if true the splice file is also used to train the branch point region 111 | /IntronModel/d 100 112 | /IntronModel/ass_motif_memory 1 113 | /IntronModel/ass_motif_radius 0 114 | 115 | # Properties for UtrModel 116 | # ---------------------------- 117 | /UtrModel/verbosity 3 118 | /UtrModel/infile fgram_base_utr_probs.pbl 119 | /UtrModel/outfile fgram_base_utr_probs.pbl 120 | /UtrModel/k 4 121 | /UtrModel/utr5patternweight 0 122 | /UtrModel/utr3patternweight 1.0 123 | /UtrModel/patpseudocount 1 124 | /UtrModel/tssup_k 0 125 | /UtrModel/tssup_patpseudocount 1 126 | /UtrModel/slope_of_bandwidth 0.2375 127 | /UtrModel/minwindowcount 3 128 | /UtrModel/exonlengthD 800 129 | /UtrModel/maxexonlength 1800 130 | /UtrModel/max3singlelength 1800 131 | 
/UtrModel/max3termlength 1800 132 | /UtrModel/tss_start 8 133 | /UtrModel/tss_end 5 134 | /UtrModel/tata_start 2 135 | /UtrModel/tata_end 10 136 | /UtrModel/tata_pseudocount 2 137 | /UtrModel/d_tss_tata_min 26 # minimal distance between start of tata box (if existent) and tss 138 | /UtrModel/d_tss_tata_max 37 # maximal distance between start of tata box (if existent) and tss 139 | /UtrModel/d_polyasig_cleavage 14 # the transcription end is predicted this many bases after the polyadenylation signal 140 | /UtrModel/d_polya_cleavage_min 7 141 | /UtrModel/d_polya_cleavage_max 17 142 | /UtrModel/prob_polya 0.4 143 | -------------------------------------------------------------------------------- /Dockerfiles/augustus/fgram_base/fgram_base_weightmatrix.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file contains a matrix used for weighing the training sequences 3 | # when given an input sequence. Let z = (da, dc, dg, dt) be the vector 4 | # containing the differences in the relative nucleotide frequencies of 5 | # two sequences, the input sequence and a training sequence. 6 | # Then the training sequence has weight proportional to 7 | # 8 | # exp ( - z M z^t) 9 | # 10 | # with M being the matrix specified below. 11 | # If M is nonsingular, then (apart from a two normalizing factors) M 12 | # is the inverse of the covariance matrix of a multinormal 13 | # distribution - the kernel for the estimation. 14 | 15 | 16 | # this matrix is gc-content only, i.e. 17 | # weight = 10 * exp (-200 * (dc + dg))^2) 18 | # in particular weight <= 10 19 | 0 0 0 0 20 | 0 200 0 0 21 | 0 0 200 0 22 | 0 0 0 0 23 | 24 | -------------------------------------------------------------------------------- /Dockerfiles/basics/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -qqy build-essential python ruby curl htop wget htop 6 | 7 | WORKDIR /opt 8 | 9 | # Samtools 10 | RUN apt-get install -qqy zlib1g-dev libncurses5-dev 11 | ADD http://downloads.sourceforge.net/project/samtools/samtools/1.2/samtools-1.2.tar.bz2 ./ 12 | RUN tar -xvf *.tar.bz2 && rm *.tar.bz2 && mv samtools* samtools \ 13 | && cd samtools && make 14 | 15 | # NCBI-blast 16 | RUN apt-get install -qqy ncbi-blast+ 17 | 18 | # Bioruby 19 | RUN gem install bio 20 | 21 | # Emboss 22 | RUN apt-get install -qqy emboss 23 | 24 | # HMMER 25 | RUN apt-get install -qqy hmmer 26 | 27 | # Bedtools 28 | RUN apt-get install -qqy bedtools 29 | 30 | # Genome tools 31 | WORKDIR /opt 32 | RUN apt-get install -qqy libcairo2-dev libpango1.0-dev 33 | ADD http://genometools.org/pub/genometools-1.5.6.tar.gz ./ 34 | RUN tar -xvf genometools-* && rm -f *.tar.gz && mv genometools* genometools 35 | RUN cd genometools && make 64bit=yes opt=yes universal=no && sudo make install 36 | 37 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/augustus/bin:/opt/tophat:/opt/samtools 38 | -------------------------------------------------------------------------------- /Dockerfiles/bedtools/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update -qq && apt-get install -qqy bedtools samtools 6 | -------------------------------------------------------------------------------- /Dockerfiles/bioruby/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM ubuntu:15.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install ruby2.1 -qqy 6 | RUN gem install --no-document --version 1.4.3 bio 7 | 8 | -------------------------------------------------------------------------------- /Dockerfiles/busco/Dockerfile: -------------------------------------------------------------------------------- 1 | from robsyme/augustus:3.0.3 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -yqq python ncbi-blast+ hmmer emboss 6 | 7 | #Busco 8 | RUN mkdir /opt/busco 9 | WORKDIR /opt/busco 10 | ADD http://busco.ezlab.org/files/BUSCO_v1.0.tar.gz /opt/busco/ 11 | RUN tar -xzvf BUSCO_v1.0.tar.gz \ 12 | && rm *.tar.gz \ 13 | && sed -i 's/^#!\/bin\/python/#!\/usr\/bin\/env python/' BUSCO_v1.0.py \ 14 | && chmod +x BUSCO_v1.0.py \ 15 | && ln -s BUSCO_v1.0.py busco 16 | ADD http://busco.ezlab.org/files/fungi_buscos.tar.gz /opt/busco/lineages/ 17 | RUN cd /opt/busco/lineages/ && tar -xzf *.tar.gz 18 | 19 | # Genometools 20 | WORKDIR /opt/gt 21 | ADD http://genometools.org/pub/binary_distributions/gt-1.5.7-Linux_x86_64-64bit-barebone.tar.gz /opt/gt/ 22 | RUN tar -xvf *.tar.gz && rm *.tar.gz && ln -s gt* current 23 | 24 | #Samtools 25 | RUN apt-get install -qqy samtools 26 | 27 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/augustus/bin:/opt/augustus/scripts:/opt/busco:/opt/gt/current/bin 28 | 29 | ENTRYPOINT ["/bin/bash"] 30 | -------------------------------------------------------------------------------- /Dockerfiles/busco/README.md: -------------------------------------------------------------------------------- 1 | # BUSCO in docker 2 | 3 | This is a repository that contains the 4 | [BUSCO](http://busco.ezlab.org/) software for 'assessing genome 5 | assembly and annotation completeness with single-copy orthologs'. It 6 | contains preconfigured installations of the BUSCO prerequisites, 7 | including Augustus 3.0, hmmer, ncbi-blast+, and emboss. 8 | 9 | ## Using the container 10 | 11 | If I have a fungal genome `scaffolds.fasta` in the current directory, I 12 | can run busco by first downloading the profiles (in my case fungi): 13 | 14 | wget http://busco.ezlab.org/files/fungi_buscos.tar.gz 15 | tar -xzvf fungi_buscos.tar.gz && rm fungi_buscos.tar.gz 16 | 17 | I can then run the busco docker container: 18 | 19 | docker run --rm -v $PWD:/in -w /in robsyme/busco \ 20 | busco -in scaffolds.fasta -o my_run --lineage fungi 21 | 22 | I might consider bundling the profiles into lineage-specific docker 23 | images, but busco unhelpfully prepends a '`.`' to the lineage path, so 24 | I would have to create a runner script that links in the profile 25 | folder into the current working directory, which is a bit messy. For 26 | the moment, I will leave profile management to the user.
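If you do want to hide the profile-staging step behind a script, something along the following lines would do it. This is only an untested sketch: the script name `run_busco.sh`, the `/path/to/fungi` location, and the positional-argument handling are placeholders, and it copies rather than symlinks the profiles, since a symlink pointing outside the bind-mounted directory would not resolve inside the container.

```sh
#!/bin/sh
# run_busco.sh (hypothetical sketch): stage the lineage profiles in the current
# directory, then run the containerised BUSCO from that directory.
LINEAGE_DIR=/path/to/fungi        # placeholder: wherever fungi_buscos.tar.gz was extracted
ASSEMBLY=${1:-scaffolds.fasta}    # assembly fasta; defaults to scaffolds.fasta
cp -r "$LINEAGE_DIR" ./fungi
docker run --rm -v "$PWD":/in -w /in robsyme/busco \
    busco -in "$ASSEMBLY" -o my_run --lineage fungi
```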
27 | -------------------------------------------------------------------------------- /Dockerfiles/chado-helper/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | RUN apt-get update && apt-get install -qqy postgresql-client wget build-essential 4 | 5 | WORKDIR /opt 6 | ADD http://downloads.sourceforge.net/project/gmod/gmod/chado-1.23/chado-1.23.tar.gz /opt/ 7 | RUN tar -xzvf *.tar.gz && rm *.tar.gz && mv chado-1.23 chado 8 | 9 | WORKDIR /opt/chado 10 | ENV GMOD_ROOT /usr/local/gmod 11 | ENV CHADO_DB_USERNAME chadouser 12 | ENV CHADO_DB_NAME chado 13 | ENV CHADO_DB_HOST chado 14 | 15 | # Perl bits 16 | RUN apt-get install -qqy libtemplate-perl libxml-simple-perl libdbi-perl libgo-perl libdbd-pg-perl libdbix-dbstag-perl libsql-translator-perl bioperl 17 | RUN sed -i 's/stag-storenode.pl/stag-storenode/' lib/Bio/Chado/Builder.pm 18 | RUN perl Makefile.PL && make && make install 19 | -------------------------------------------------------------------------------- /Dockerfiles/chado-helper/README.md: -------------------------------------------------------------------------------- 1 | # Chado Loading Helper 2 | 3 | This docker image is to help get a new chado database up and running 4 | quickly. 5 | 6 | ## Steps 7 | 8 | ### Create a new database container and user 'chadouser' 9 | 10 | ```sh 11 | docker run -d --name db postgres 12 | ``` 13 | 14 | Now that you've got a blank database, we'll create a new user 15 | 'chadouser' inside that database: 16 | 17 | ```sh 18 | docker run --rm --link db:chado postgres \ 19 | createuser \ 20 | --host chado \ 21 | --username postgres \ 22 | --createdb \ 23 | --echo \ 24 | --login \ 25 | chadouser 26 | ``` 27 | And create the `chado` database: 28 | 29 | ```sh 30 | docker run --rm --link db:chado postgres \ 31 | createdb \ 32 | -h chado \ 33 | -U chadouser 34 | chado 35 | ``` 36 | 37 | ### Load the basic schema 38 | 39 | ```sh 40 | docker run --rm --link db:chado robsyme/chado-helper make load_schema 41 | docker run --rm --link db:chado robsyme/chado-helper make prepdb 42 | ``` 43 | 44 | ### Load the ontologies 45 | 46 | This step is interactive so that you can specify which ontologies you 47 | wish to load 48 | 49 | ```sh 50 | docker run --rm --link db:chado --interactive --tty robsyme/chado-helper make ontologies 51 | ``` 52 | 53 | ### Backup the sql 54 | 55 | Now is probably a good time to take a snapshot of the database so that 56 | you can get back to a clean slate if needed. 
You can dump the sql to 57 | the current directory using: 58 | 59 | ```sh 60 | docker run --rm --link db:chado -v $PWD:/dump postgres \ 61 | pg_dump \ 62 | -h chado \ 63 | -U postgres \ 64 | -f /dump/chado_dump.sql\ 65 | chado 66 | ``` 67 | -------------------------------------------------------------------------------- /Dockerfiles/codingquarry/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -yqq build-essential python 6 | 7 | WORKDIR /opt 8 | ADD http://downloads.sourceforge.net/project/codingquarry/CodingQuarry_v1.2.tar.gz ./ 9 | RUN apt-get install -qqy libopenmpi-dev 10 | RUN tar -xzvf *.tar.gz && rm *.tar.gz && mv CodingQuarry* CodingQuarry && cd CodingQuarry && make 11 | ENV QUARRY_PATH /opt/CodingQuarry/QuarryFiles 12 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/CodingQuarry 13 | -------------------------------------------------------------------------------- /Dockerfiles/cufflinks/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM robsyme/tophat 2 | 3 | MAINTAINER Rob Syme 4 | 5 | WORKDIR /opt 6 | ADD http://cole-trapnell-lab.github.io/cufflinks/assets/downloads/cufflinks-2.2.1.Linux_x86_64.tar.gz ./ 7 | RUN tar -xzvf *.tar.gz && rm *.tar.gz && mv cufflinks* cufflinks 8 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/augustus/bin:/opt/cufflinks 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Dockerfiles/emboss/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update 6 | RUN apt-get install -qqy emboss 7 | -------------------------------------------------------------------------------- /Dockerfiles/gff2gb/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -qqy wget python python-biopython 6 | 7 | RUN wget https://bootstrap.pypa.io/get-pip.py && python get-pip.py && pip install bcbio-gff 8 | 9 | ADD https://raw.githubusercontent.com/chapmanb/bcbb/master/gff/Scripts/gff/gff_to_genbank.py /usr/local/bin/ 10 | RUN chmod +x /usr/local/bin/gff_to_genbank.py 11 | 12 | CMD ["/usr/local/bin/gff_to_genbank.py"] 13 | -------------------------------------------------------------------------------- /Dockerfiles/hhblits-fungi/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | ENV DEBIAN_FRONTEND noninteractive 5 | 6 | RUN apt-get update && apt-get install -qqy hhsuite ffindex samtools 7 | RUN mkdir /databases 8 | WORKDIR /databases 9 | 10 | # One of two options here - either download it during docker build 11 | ADD http://downloads.sourceforge.net/project/jamg/databases/fungal_50kclus.tar.bz2 . 12 | # ... or download it yourself next to this Dockerfile and then docker build. 13 | #ADD fungal_50kclus.tar.bz2 . 
14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /Dockerfiles/hhblits-transposon/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | ENV DEBIAN_FRONTEND noninteractive 5 | 6 | RUN apt-get update && apt-get install -qqy hhsuite ffindex samtools 7 | RUN mkdir /databases 8 | WORKDIR /databases 9 | 10 | # One of two options here - either download it during docker build 11 | ADD http://downloads.sourceforge.net/project/jamg/databases/transposons.hhblits.tar.bz2 . 12 | # ... or download it yourself next to this Dockerfile and then docker build. 13 | #ADD transposons.hhblits.tar.bz2 . 14 | RUN tar -xvf transposons.hhblits.tar.bz2 15 | -------------------------------------------------------------------------------- /Dockerfiles/interproscan/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -y default-jre wget coreutils 6 | RUN mkdir -p /opt/interproscan && \ 7 | cd /opt/interproscan && \ 8 | wget ftp://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.7-48.0/interproscan-5.7-48.0-64-bit.tar.gz* && \ 9 | md5sum -c interproscan*.md5 && \ 10 | rm *.md5 && \ 11 | tar -pxvzf interproscan*.tar.gz && \ 12 | rm *.tar.gz 13 | RUN ln -s /opt/interproscan/interproscan-5.7-48.0 /opt/interproscan/current 14 | WORKDIR /opt/interproscan/current 15 | RUN apt-get install -qqy ncoils blast2 16 | ADD interproscan.properties /opt/interproscan/current/ 17 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/interproscan/current 18 | -------------------------------------------------------------------------------- /Dockerfiles/interproscan/interproscan.properties: -------------------------------------------------------------------------------- 1 | # This is the InterProScan configuration file 2 | 3 | ## 4 | ## Temporary files and directory 5 | ## 6 | # The text [UNIQUE], if present, will be replaced by a value unique to your running instance 7 | 8 | # Temporary files used by the analyses will be placed in directories here: 9 | temporary.file.directory.suffix=[UNIQUE] 10 | temporary.file.directory=temp/${temporary.file.directory.suffix} 11 | 12 | ## 13 | ## H2 database 14 | ## 15 | # The H2 database is copied by the standalone version of interproscan 16 | i5.h2.database.original.location=work/template/interpro.zip 17 | # LOCK_TIMEOUT: Sets the lock timeout (in milliseconds) for the current session 18 | i5.database.connection.url=jdbc:h2:mem:interpro;LOCK_TIMEOUT=10000000 19 | 20 | ## 21 | ## binary paths 22 | ## 23 | # Configure the version of perl to use when running member databases perl binaries 24 | perl.command=perl 25 | 26 | # Binary file locations 27 | binary.hmmer3.path=bin/hmmer/hmmer3/3.1b1 28 | binary.hmmer3.hmmscan.path=bin/hmmer/hmmer3/3.1b1/hmmscan 29 | binary.hmmer3.hmmsearch.path=bin/hmmer/hmmer3/3.1b1/hmmsearch 30 | binary.hmmer2.hmmsearch.path=bin/hmmer/hmmer2/2.3.2/hmmsearch 31 | binary.hmmer2.hmmpfam.path=bin/hmmer/hmmer2/2.3.2/hmmpfam 32 | binary.fingerprintscan.path=bin/prints/fingerPRINTScan 33 | binary.coils.path=/usr/bin/ncoils 34 | domainfinder3.path=bin/gene3d/DomainFinder3 35 | binary.prodom.2006.1.prodomblast3i.pl.path=bin/prodom/2006.1/ProDomBlast3i.pl 36 | # Note: Correct prosite binary distribution for your platform can be downloaded: ftp://ftp.expasy.org/databases/prosite/tools/ps_scan/ 37 
| binary.prosite.psscan.pl.path=bin/prosite/ps_scan.pl 38 | binary.prosite.pfscan.path=bin/prosite/pfscan 39 | binary.panther.path=bin/panther/7.0/pantherScore.pl 40 | binary.panther.perl.lib.dir=bin/panther/7.0/lib 41 | binary.superfamily.1.75.ass3.pl.path=bin/superfamily/1.75/ass3_single_threaded.pl 42 | binary.pirsf.pl.path=bin/pirsf/2.85/pirsf.pl 43 | binary.blastall.2.2.6.path=/usr/bin/blastall 44 | binary.blast.2.2.19.path=bin/blast/2.2.19 45 | binary.getorf.path=bin/nucleotide/getorf 46 | # Note: SignalP binary not distributed with InterProScan 5, please install separately e.g. in bin/signalp/4.0/signalp 47 | binary.signalp.4.0.path= 48 | # Note: TMHMM binary not distributed with InterProScan 5, please install separately e.g. in bin/tmhmm/2.0c/decodeanhmm 49 | binary.tmhmm.path= 50 | # Note: Phobius binary not distributed with InterProScan 5, please install separately e.g. in bin/phobius/1.01/phobius.pl 51 | binary.phobius.pl.path.1.01= 52 | 53 | ## 54 | ## Member database model / data file locations (alphabetically sorted) 55 | ## 56 | # Member database model / data file locations (alphabetically sorted) 57 | coils.new_coil.mat.path.2.2=data/coils/2.2/new_coil.mat 58 | gene3d.hmm.path.3.5.0=data/gene3d/3.5.0/gene3d_classified.hmm 59 | gene3d.model2sf_map.path.3.5.0=data/gene3d/3.5.0/model_to_family_map.csv 60 | hamap.profile.models.path.201311.27=data/hamap/201311.27/hamap.prf 61 | # It is IMPORTANT to set this temporary directory to a directory on LOCAL disk - 62 | # network IO will slow the panther analysis down considerably. 63 | panther.temporary.file.directory=/tmp/ 64 | panther.models.dir.9.0=data/panther/9.0/model 65 | Pfam-A.hmm.path.27.0=data/pfam/27.0/Pfam-A.hmm 66 | Pfam-A.seed.path.27.0=data/pfam/27.0/Pfam-A.seed 67 | Pfam-A.hmm.path.26.0=data/pfam/26.0/Pfam-A.hmm 68 | Pfam-A.seed.path.26.0=data/pfam/26.0/Pfam-A.seed 69 | Pfam-C.path.27.0=data/pfam/27.0/Pfam-C 70 | #Version 2.84 71 | pirsf.hmm.bin.path.2.84=data/pirsf/2.84/sf_hmm.bin 72 | pirsf.hmm.subf.bin.path.2.84=data/pirsf/2.84/sf_hmm_subf.bin 73 | pirsf.hmm.path.2.84=data/pirsf/2.84/sf_hmm 74 | pirsf.hmm.subf.path.2.84=data/pirsf/2.84/sf_hmm_subf 75 | pirsf.dat.path.2.84=data/pirsf/2.84/pirsf.dat 76 | pirsf.sf.tb.path.2.84=data/pirsf/2.84/sf.tb 77 | pirsf.sf.seq.path.2.84=data/pirsf/2.84/sf.seq 78 | 79 | prints.kdat.path.42.0=data/prints/42.0/prints42_0.kdat 80 | prints.pval.path.42.0=data/prints/42.0/prints.pval 81 | prints.hierarchy.path.42.0=data/prints/42.0/FingerPRINTShierarchy.db 82 | prodom.ipr.path.2006.1=data/prodom/2006.1/prodom.ipr 83 | prosite.models.path.20.97=data/prosite/20.97/prosite.dat 84 | prosite.evaluator.models.path.20.97=data/prosite/20.97/evaluator.dat 85 | signalp.4.0.perl.library.dir=bin/signalp/4.0/lib 86 | # Note: Smart overlapping and threshold files not distributed with InterProScan 5, please install separately e.g. 
in data/smart/6.2 87 | smart.hmm.path.6.2=data/smart/6.2/smart.HMMs 88 | smart.hmm.bin.path.6.2=data/smart/6.2/smart.HMMs.bin 89 | smart.overlapping.path.6.2= 90 | smart.threshold.path.6.2= 91 | superfamily.hmm.path.3.0=data/superfamily/1.75/hmmlib_1.75 92 | superfamily.self.hits.path.1.75=data/superfamily/1.75/self_hits.tab 93 | superfamily.cla.path.1.75=data/superfamily/1.75/dir.cla.scop.txt_1.75 94 | superfamily.model.tab.path.1.75=data/superfamily/1.75/model.tab 95 | superfamily.pdbj95d.path.1.75=data/superfamily/1.75/pdbj95d 96 | tigrfam.hmm.path.13.0=data/tigrfam/13.0/TIGRFAMs_13.0_HMM.LIB 97 | # Note: TMHMM model files not distributed with InterProScan 5, please install separately e.g. in data/tmhmm/2.0/TMHMM2.0.model 98 | tmhmm.model.path= 99 | 100 | ## 101 | ## cpu options for parallel processing 102 | ## 103 | 104 | #hmmer cpu options for the different jobs 105 | hmmer3.hmmsearch.cpu.switch.pfama=--cpu 4 106 | hmmer3.hmmsearch.cpu.switch.tigrfam=--cpu 4 107 | hmmer3.hmmsearch.cpu.switch.gene3d=--cpu 4 108 | hmmer3.hmmsearch.cpu.switch.superfamily=--cpu 4 109 | 110 | hmmer2.hmmpfam.cpu.switch.smart=--cpu 3 111 | hmmer2.hmmpfam.cpu.switch.pirsf=--cpu 4 112 | 113 | #blastall cpu options 114 | blastall.cpu.switch.pirsf=-a 4 115 | 116 | #panther binary cpu options (for blastall and hmmsearch) 117 | panther.binary.cpu.switch=-c 4 118 | 119 | #pirsf binary cpu options (for hmmscan) 120 | pirsf.pl.binary.cpu.switch=-cpu 4 121 | 122 | 123 | ## 124 | ## max number of proteins per analysis batch 125 | ## 126 | # These values control the maximum number of proteins put through 127 | # an analysis in one go - different algorithms have different optimum values. 128 | # Note that if you suffer from out of memory errors, reducing these values 129 | # will almost certainly help, but may reduce the speed of analysis. 130 | analysis.max.sequence.count.TMHMM=100 131 | analysis.max.sequence.count.PANTHER=100 132 | analysis.max.sequence.count.SMART=50 133 | analysis.max.sequence.count.TIGRFAM_9=50 134 | analysis.max.sequence.count.TIGRFAM_10=100 135 | analysis.max.sequence.count.GENE3D=50 136 | analysis.max.sequence.count.PRINTS=100 137 | analysis.max.sequence.count.PROSITE_PROFILES=100 138 | analysis.max.sequence.count.PROSITE_PATTERNS=100 139 | analysis.max.sequence.count.PIRSF=50 140 | analysis.max.sequence.count.PRODOM=100 141 | analysis.max.sequence.count.SSF=50 142 | analysis.max.sequence.count.HAMAP=100 143 | analysis.max.sequence.count.PFAM_A=100 144 | analysis.max.sequence.count.COILS=100 145 | analysis.max.sequence.count.PHOBIUS=100 146 | analysis.max.sequence.count.SIGNALP=100 147 | 148 | ## 149 | ## General settings 150 | ## 151 | 152 | # If multiple hosts are sharing the same file system, a delay may be required to 153 | # avoid stale NFS handles 154 | # nfs.delay.milliseconds=0 155 | 156 | # Instructs I5 to completely clean up after itself - leave set to true. 157 | delete.temporary.directory.on.completion=true 158 | 159 | ## 160 | ## Broker TCP Connection 161 | ## 162 | 163 | # A list of TCP ports that should not be used for messaging. (Apart from this, only ports > 1024 and < 65535 will be used.) 164 | tcp.port.exclusion.list=3879,3878,3881,3882 165 | 166 | ## 167 | ## precalculated match lookup service 168 | ## 169 | # By default, if the sequence already has matches available from the EBI, this service will look them 170 | # up for you. Note - at present it will always return all the available matches, ignoring any -appl options 171 | # set on the command line. 
172 | precalculated.match.lookup.service.url=http://www.ebi.ac.uk/interpro/match-lookup 173 | 174 | #proxy set up 175 | precalculated.match.lookup.service.proxy.host= 176 | precalculated.match.lookup.service.proxy.port=3128 177 | 178 | 179 | ## 180 | ## getorf configuration for nucleic acid sequences 181 | ## 182 | # the following are roughly the times getorf takes to find sequences of open reading frames (ORFs) in n nucleotide sequences 183 | #number of sequences -> approx. time it takes in our tests 184 | # 600000 -> 10 minutes 185 | # 3600000 -> 1 hour 186 | # 7200000 -> 2 hours 187 | # 43200000 -> 12 hours 188 | 189 | # JOB: jobLoadNucleicAcidSequence 190 | getorf.minsize=50 191 | 192 | ## 193 | ## Output format 194 | ## 195 | # TRUE by default, which means all generated graphical output documents (only SVG at the moment) will be archived (using the Linux command tar). 196 | # This simple switch allows you to switch the archive mode off (simply set it to FALSE). 197 | archiveSVGOutput=true 198 | 199 | ## 200 | ## Master/Stand alone embedded workers 201 | ## 202 | 203 | # Set the number of embedded workers to the number of processors that you would like to employ 204 | # on the machine you are using to run InterProScan. 205 | #number of embedded workers a master process can have 206 | number.of.embedded.workers=1 207 | maxnumber.of.embedded.workers=2 208 | 209 | ## 210 | ## Distributed mode (Cluster mode) 211 | ## 212 | 213 | #grid name 214 | grid.name=lsf 215 | #grid.name=other-cluster 216 | 217 | #project name for this run - use user.digest 218 | user.digest=i5GridRun 219 | 220 | #grid jobs limit : number of jobs you are allowed to run on the cluster 221 | grid.jobs.limit=1000 222 | 223 | 224 | #time between each bjobs or qstat command to check the status of jobs on the cluster 225 | grid.check.interval.seconds=120 226 | 227 | #allow master interproscan to run binaries () 228 | master.can.run.binaries=true 229 | 230 | #deal with unknown step states 231 | recover.unknown.step.state=false 232 | 233 | #Grid submission commands (e.g. LSF bsub or SGE qsub) for starting remote workers 234 | #commands the master uses to start new remote workers 235 | grid.master.submit.command=bsub -q QUEUE_NAME 236 | grid.master.submit.high.memory.command=bsub -q QUEUE_NAME -M 8192 237 | 238 | #commands a worker uses to start new remote workers 239 | grid.worker.submit.command=bsub -q QUEUE_NAME 240 | grid.worker.submit.high.memory.command=bsub -q QUEUE_NAME -M 8192 241 | 242 | # command to start a new worker (new jvm) 243 | worker.command=java -Xms32m -Xmx2048m -jar interproscan-5.jar 244 | # This may be identical to the worker.command argument above, however you may choose to select 245 | # a machine with a much larger available memory, for use when a StepExecution fails. 246 | worker.high.memory.command=java -Xms32m -Xmx2048m -jar interproscan-5.jar 247 | 248 | # Set the number of embedded workers to the number of processors that you would like to employ 249 | # on the node machine on which the worker will run. 250 | #number of embedded workers in a remote worker 251 | worker.number.of.embedded.workers=1 252 | worker.maxnumber.of.embedded.workers=4 253 | 254 | # max number of connections to the master 255 | master.maxconsumers=64 256 | 257 | #number of connections to the worker 258 | worker.maxconsumers=32 259 | 260 | #throttled network? 
261 | grid.throttle=true 262 | 263 | # max number of jobs a tier 1 worker is allowed on its queue 264 | worker.maxunfinished.jobs=64 265 | 266 | #network tier depth 267 | max.tier.depth=1 268 | 269 | # Active MQ JMS broker temporary data directory 270 | jms.broker.temp.directory=activemq-data/localhost/tmp_storage 271 | -------------------------------------------------------------------------------- /Dockerfiles/jamg/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -qqy build-essential cdbfasta ncbi-blast+ snap git 6 | 7 | # Insall Augustus 8 | ADD http://bioinf.uni-greifswald.de/augustus/binaries/augustus-3.1.tar.gz /opt/ 9 | RUN cd /opt && \ 10 | tar -xzvf augustus* && \ 11 | rm -rf *.tar.gz && \ 12 | mv augustus* augustus && \ 13 | cd augustus && \ 14 | make 15 | 16 | ENV AUGUSTUS_CONFIG_PATH /opt/augustus/config 17 | 18 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/augustus/bin:/opt/augustus/scripts 19 | 20 | WORKDIR /opt 21 | RUN apt-get install zlib1g-dev wget 22 | RUN git clone https://github.com/genomecuration/JAMg.git jamg 23 | # && cd jamg \ 24 | # && make all 25 | 26 | 27 | # gmap 28 | 29 | # augustus 30 | 31 | # gff2gbSmallDNA.pl 32 | 33 | # etraining 34 | 35 | # filterGenes.pl 36 | -------------------------------------------------------------------------------- /Dockerfiles/ncbi-blast/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -qqy ncbi-blast+ 6 | -------------------------------------------------------------------------------- /Dockerfiles/pasa/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update -qq && apt-get install -qqy build-essential 6 | 7 | # Install Gmap 8 | WORKDIR /opt 9 | ADD http://research-pub.gene.com/gmap/src/gmap-gsnap-2015-07-23.tar.gz ./ 10 | RUN tar -xvf gmap*.tar.gz && rm gmap*.tar.gz && mv gmap* gmap && cd gmap && ./configure && make 11 | RUN cd gmap && make install 12 | 13 | # Install Fasta aligner 14 | RUN apt-get install -qqy zlib1g-dev 15 | ADD http://faculty.virginia.edu/wrpearson/fasta/fasta36/fasta-36.3.8.tar.gz ./ 16 | RUN tar -xvf fasta*.tar.gz && rm fasta*.tar.gz && mv fasta* fasta && cd fasta/src && make -f ../make/Makefile.linux64 17 | 18 | # Install blat aligner 19 | RUN apt-get install -qqy unzip libpng-dev 20 | ENV MACHTYPE=x86_64 21 | RUN mkdir -p ~/bin/$MACHTYPE 22 | ADD http://hgwdev.cse.ucsc.edu/~kent/src/blatSrc35.zip ./ 23 | RUN unzip blat* && rm *.zip && mv blat* blat && cd blat && make 24 | 25 | # Install DBD::mysql, etc 26 | RUN apt-get install -qqy liburi-escape-xs-perl liburi-perl mysql-client libdbd-mysql-perl 27 | 28 | # Install PASA 29 | ADD https://github.com/PASApipeline/PASApipeline/archive/v2.0.2.tar.gz ./ 30 | RUN tar -xvf *.tar.gz && rm *.tar.gz && mv PASA* pasa && cd pasa && make 31 | ADD conf.txt /opt/pasa/pasa_conf/ 32 | ENV PASAHOME=/opt/pasa 33 | 34 | # Final PATH 35 | ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/fasta/bin:/root/bin/$MACHTYPE:/opt/blat/:/opt/fasta/bin:$PASAHOME/bin:$PASAHOME/scripts:/opt/seqclean 36 | 37 | 38 | -------------------------------------------------------------------------------- /Dockerfiles/pasa/Dockerfile-pasaweb: 
-------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update -qq 6 | 7 | # Install DBD::mysql and apache 8 | RUN apt-get install -qqy liburi-escape-xs-perl liburi-perl mysql-client libdbd-mysql-perl build-essential zlib1g-dev libgd-perl apache2 libgd-graph-perl 9 | 10 | # Install PASA 11 | WORKDIR /usr/lib/cgi-bin 12 | ADD https://github.com/PASApipeline/PASApipeline/archive/v2.0.2.tar.gz ./ 13 | RUN tar -xvf *.tar.gz && rm *.tar.gz && mv PASA* pasa && cd pasa && make && chmod -R 755 . 14 | ADD conf.txt /usr/lib/cgi-bin/pasa/pasa_conf/ 15 | ENV PASAHOME=/usr/lib/cgi-bin/pasa 16 | 17 | # Final PATH 18 | ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PASAHOME/bin 19 | 20 | ENV APACHE_RUN_USER www-data 21 | ENV APACHE_RUN_GROUP www-data 22 | ENV APACHE_PID_FILE /var/run/apache2/apache2.pid 23 | ENV APACHE_RUN_DIR /var/run/apache2 24 | ENV APACHE_LOCK_DIR /var/lock/apache2 25 | ENV APACHE_LOG_DIR /var/log/apache2 26 | 27 | RUN a2enmod cgi 28 | 29 | EXPOSE 80 30 | 31 | CMD ["apache2", "-DFOREGROUND"] 32 | 33 | 34 | -------------------------------------------------------------------------------- /Dockerfiles/pasa/conf.txt: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | ## PASA admin settings ############## 4 | ##################################### 5 | 6 | #emails sent to admin on job launch, success, and failure 7 | PASA_ADMIN_EMAIL=bhaas@tigr.org 8 | 9 | # database to manage pasa jobs; required for daemon-based processing. 10 | PASA_ADMIN_DB=PASA2_admin_06152006_devel 11 | 12 | # At TIGR, we setup the PASA mysql databases under separate project quotas in different areas of the 13 | # file system, and then provide a symlink under the mysql data or lib area. 14 | # see below for info on setting up hooks. By default, simply keep the value below at false. 15 | USE_PASA_DB_SETUP_HOOK=false 16 | 17 | 18 | ##################################### 19 | ## MySQL settings: ################## 20 | ##################################### 21 | 22 | # server actively running MySQL 23 | MYSQLSERVER=db 24 | 25 | # read-only username and password 26 | MYSQL_RO_USER=root 27 | MYSQL_RO_PASSWORD=password 28 | 29 | # read-write username and password 30 | MYSQL_RW_USER=root 31 | MYSQL_RW_PASSWORD=password 32 | 33 | 34 | ############################################ 35 | # Web browser navigation settings: ######### 36 | ############################################ 37 | 38 | BASE_PASA_URL=http://bhaas-lx:8080/cgi-bin/ 39 | 40 | 41 | ############################################# 42 | ## Transcript Sequence Cleaning ############# 43 | ############################################# 44 | VECTOR_DB=/usr/local/db/vector/UniVec 45 | 46 | 47 | 48 | 49 | ############################################# 50 | ## Hooks #################################### 51 | ############################################# 52 | # Hooks are provided to allow custom code to be called for the following 53 | # routines. 54 | # The methods must be fully qualified with their module name as static methods. 55 | # Before calling the method, the HOOK_PERL_LIBS path listing is added to the Perl Lib path. 56 | # The first parameter value provided to the methods is the PASA_conf hash reference, that provides 57 | # the key value pairs for all entries in this conf file. 
58 | # A single custom parameter value can be added as the value to the 59 | # {Package::method}~EXTRA_PARAM key (see example below). This single param 60 | # can be packed with any delimiter so that the user can encode several attributes. 61 | # the special variable __PASAHOME__ can be used and will be replaced by the PASA installation directory value. 62 | # To access the hooks, use the &Pasa_conf::call_hook() method. 63 | 64 | 65 | # comma delimit paths to be added to the perl lib path so the hook modules can be found. 66 | HOOK_PERL_LIBS=__PASAHOME__/SAMPLE_HOOKS 67 | 68 | ############ 69 | # hook that relocates the mysql database to our filesystem 70 | ##### (no such sillyness here): HOOK_PASA_DB_SETUP=Tigr_hook_routines::copy_template_db_and_symlink 71 | ##### encode some extra info in the extra parameter available: 72 | ##### Tigr_hook_routines::copy_template_db_and_symlink~EXTRA_PARAM=/export/home/software/mysql/data 73 | 74 | 75 | ############ 76 | # hook that commits updated gene structures to the annotation database 77 | HOOK_GENE_STRUCTURE_UPDATER=Sample_annot_updater::get_updater_obj 78 | 79 | 80 | ############ 81 | # hook that loads the latest gene structure annotations from an external source 82 | # ie. from gff files or from a relational database. 83 | # you build your adapter based on your data format preference. 84 | HOOK_EXISTING_GENE_ANNOTATION_LOADER=GFF3::GFF3_annot_retriever::get_annot_retriever 85 | -------------------------------------------------------------------------------- /Dockerfiles/pfam/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -yqq hmmer unzip wget 6 | 7 | WORKDIR /opt 8 | RUN wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam28.0/Pfam-A.hmm.gz && gunzip *.gz && hmmpress Pfam-A.hmm && rm Pfam-A.hmm 9 | 10 | -------------------------------------------------------------------------------- /Dockerfiles/proteinortho/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -qqy build-essential ncbi-blast+ python perl tree 6 | 7 | ADD http://www.bioinf.uni-leipzig.de/Software/proteinortho/proteinortho_v5.11.tar.gz /opt/ 8 | RUN cd /opt && \ 9 | tar -xzvf proteinortho_*.tar.gz && \ 10 | rm -rf *.tar.gz && \ 11 | mv proteinortho_v5.11 proteinortho 12 | RUN cd /usr/local/bin && find /opt/proteinortho -type f -executable | xargs -I{} ln -s {} . 
13 | 14 | CMD ["/opt/proteinortho/proteinortho5.pl"] 15 | -------------------------------------------------------------------------------- /Dockerfiles/tophat/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update && apt-get install -qqy python 6 | 7 | WORKDIR /opt 8 | ADD https://ccb.jhu.edu/software/tophat/downloads/tophat-2.1.0.Linux_x86_64.tar.gz ./ 9 | RUN tar -xzvf *.tar.gz && rm *.tar.gz && mv tophat* tophat 10 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/augustus/bin:/opt/tophat 11 | -------------------------------------------------------------------------------- /Dockerfiles/trinity/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Rob Syme 4 | 5 | RUN apt-get update -qq && apt-get install -qqy build-essential zlib1g-dev libncurses5-dev 6 | 7 | WORKDIR /opt/ 8 | ADD http://downloads.sourceforge.net/project/samtools/samtools/0.1.19/samtools-0.1.19.tar.bz2 /opt/ 9 | RUN tar -xvf samtools* && rm *.bz2 && mv samtools* samtools && cd samtools && make 10 | 11 | RUN apt-get install -qqy unzip 12 | ADD http://downloads.sourceforge.net/project/bowtie-bio/bowtie/1.1.2/bowtie-1.1.2-linux-x86_64.zip /opt/ 13 | RUN unzip bowtie* && rm *.zip && mv bowtie* bowtie 14 | 15 | 16 | RUN apt-get install -qqy curl openjdk-7-jre 17 | ADD https://github.com/trinityrnaseq/trinityrnaseq/archive/v2.0.6.tar.gz /opt/ 18 | RUN tar -xvf *.tar.gz && rm *.tar.gz && mv trinity* trinity && cd trinity && make 19 | 20 | ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/trinity:/opt/samtools:/opt/bowtie 21 | 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Robert Syme 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nextflow-annotate 2 | 3 | This is a push to gather together some tools that are helpful for 4 | genome annotation, and serve as a forkable, version-controlled, 5 | reusable, and citable record of our pipeline. 
The steps use nextflow 6 | as a workflow engine so we can abstract the individual steps from 7 | their execution environment (SGE, MPI or simple local multithreading). 8 | 9 | This is not a push-button solution, but it can serve as a starting 10 | point for annotating your new genome. 11 | 12 | ## Prerequisites 13 | 14 | The minimum prerequisites are [docker](http://docker.io) and 15 | [nextflow](http://nextflow.io), and a fasta file (henceforth 16 | `scaffolds.fasta`) of your genome assembly. 17 | 18 | Some steps require software or data with licences that restrict 19 | distribution, but I've kept them to a minimum and will make it clear 20 | when those pieces are necessary. 21 | 22 | ## Steps 23 | 24 | Each of these steps corresponds to one of the nextflow recipes 25 | provided by this repository. 26 | 27 | ### Transposon Identification 28 | 29 | Taking cues from [jamg](http://jamg.sourceforge.net), we transcribe 30 | all of the open reading frames and then use hhblit to match against a 31 | database of known transposons. A GFF file is produced that describes 32 | to position of the transposons that we find. 33 | 34 | This uses two docker images, which will be pulled automatically from 35 | the docker registry as needed. 36 | 37 | ### Finding Repeats 38 | Repeats are an important part of the final genome annotation. I 39 | recommend a two-step process: 40 | 41 | 1. Find denovo repeats with RepeatScout. 42 | 2. Use the RepeatScout output in conjuctions with the latest RepBase 43 | library as input to RepeatMasker 44 | 45 | I've taken care of the RepeatScout and RepeatMasker installation by 46 | bundling them as docker images. The only hiccup is that RepBase 47 | requires registration. 48 | -------------------------------------------------------------------------------- /annotate: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | params.genomes = 'data/**/tmp.fasta' 4 | genomes = Channel.fromPath(params.genomes) 5 | 6 | params.homologyProbabilityCutoff = 70 7 | params.evalueCutoff = 1e-3 8 | params.pvalueCutoff = 1e-6 9 | params.scoreCutoff = 100 10 | params.alignmentLengthCutoff = 50 11 | params.templateLengthCutoff = 30 12 | 13 | def toFasta(defLine, sequence, width=80) { 14 | return (">" + defLine + "\n" + wrapString(sequence, width) + "\n") 15 | } 16 | 17 | def wrapString(text, width=80) { 18 | def out = [] 19 | while(text.length() > width) { 20 | out += text[0..(width-1)] 21 | text = text[width..-1] 22 | } 23 | out += text 24 | return out.join("\n") 25 | } 26 | 27 | class HHRHit { 28 | float probability 29 | float evalue 30 | float pvalue 31 | float score 32 | float structureScore 33 | String queryName 34 | String queryStart 35 | String queryEnd 36 | String revString 37 | String strainName 38 | String description 39 | Integer alignmentLength 40 | Integer aaStart 41 | Integer aaStop 42 | Integer hitStart 43 | Integer hitStop 44 | Integer templateSize 45 | 46 | HHRHit(String result) { 47 | // Pull out the query information. It will look something like: 48 | // Query Scaffold_1_318 [160485 - 161177] Length:352063 [Ascochyta_fabae_Af1] 49 | (queryName, queryStart, queryEnd, revString, strainName) = (result =~ (/Query\s+(\S+)_\d+ \[(\d+) - (\d+)\] (\(REVERSE SENSE\))?.* \[(.*)\]/))[0][1..-1] 50 | 51 | // Find the top hit. 
The line will look something like: 52 | // 1 GB:CAA29181 ORF 1 (LINE-elemen 99.9 1.4E-28 4.1E-32 243.9 0.0 61 2-69 1216-1281(1650) 53 | def hitData = (result =~ /\s+1\s+(.{30})\s+(\d+\.?\d*)\s+(\d+\.?\d*E?-?\d*)\s+(\d+\.?\d*E?-?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+)\s+(\d+)-(\d+)\s+(\d+)-(\d+)\s*\((\d+)\)/) 54 | if (hitData.size() == 0) { 55 | // It's likely that many of the orfs won't have any hits. In 56 | // this case, just return a 'hit' with score zero 57 | score = 0 58 | structureScore = 0 59 | } else { 60 | // If we do find a hit, return a new HHRHit instance. 61 | description = hitData[0][1] 62 | probability = hitData[0][2].toFloat() 63 | evalue = hitData[0][3].toFloat() 64 | pvalue = hitData[0][4].toFloat() 65 | score = hitData[0][5].toFloat() 66 | structureScore = hitData[0][6].toFloat() 67 | alignmentLength = hitData[0][7].toInteger() 68 | aaStart = hitData[0][8].toInteger() 69 | aaStop = hitData[0][9].toInteger() 70 | hitStart = hitData[0][10].toInteger() 71 | hitStop = hitData[0][11].toInteger() 72 | templateSize = hitData[0][12].toInteger() 73 | } 74 | } 75 | 76 | String toString() { 77 | def out = [] 78 | out.push "Query: " + queryName + " (" + queryStart + "-" + queryEnd + ") [" + strainName + "]" 79 | out.push " Description: '" + description + "'" 80 | out.push " Probability: " + probability 81 | out.push " E-value: " + evalue 82 | out.push " P-value: " + pvalue 83 | out.push " Score: " + score 84 | return out.join("\n") 85 | } 86 | 87 | String toGFF3() { 88 | def hitID = description.split()[0] 89 | def uid = "${hitID}.s${hitStart}.e${hitStop}" 90 | def out = [] 91 | out.push queryName 92 | out.push 'hhblits' 93 | out.push 'protein_match' 94 | out.push queryStart + 3 * aaStart - 1 95 | out.push queryStart + 3 * aaStop - 1 96 | out.push score 97 | out.push revString ? "-" : "+" 98 | out.push "." 99 | out.push "ID=${uid};Name=${uid};Target=${hitID} $hitStart $hitStop" 100 | println "DONE: ${out.join('\t')}" 101 | out.join("\t") 102 | } 103 | 104 | String toHints() { 105 | def out = [] 106 | out.push queryName 107 | out.push 'hhblits' 108 | out.push "nonexonpart" 109 | out.push queryStart + 3 * aaStart - 1 110 | out.push queryStart + 3 * aaStop - 1 111 | out.push score 112 | out.push revString ? "-" : "+" 113 | out.push "." 114 | out.push "source=RM;grp=${description.split()[0]};pri=6" 115 | out.join("\t") 116 | } 117 | 118 | String toGeneID() { 119 | def out = [] 120 | out.push queryName 121 | out.push 'hhblits' 122 | out.push 'sr' 123 | out.push queryStart + 3 * aaStart - 1 124 | out.push queryStart + 3 * aaStop - 1 125 | out.push score 126 | out.push revString ? "-" : "+" 127 | out.push '.' 
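        // Note: geneid evidence lines carry no ninth (attributes) column, so the
        // record is joined immediately after the frame field pushed above.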
128 | out.join("\t") 129 | } 130 | } 131 | 132 | process cleanGenome { 133 | input: 134 | val genome from genomes 135 | 136 | output: 137 | set name, stdout into cleanGenomes 138 | 139 | script: 140 | name = genome.getParent().getBaseName() 141 | 142 | """ 143 | awk '/^>/ && !/[.*]/ {print(\$0, "[$name]")} /^>/ && /[.*]/ {print \$0} /^[^>]/ {print(toupper(\$0))}' '$genome' 144 | sed -ie "s/\015//" "$genome" 145 | """ 146 | } 147 | 148 | process getorf { 149 | container 'robsyme/emboss' 150 | 151 | input: 152 | set name, 'maskedGenome' from cleanGenomes 153 | 154 | output: 155 | file 'orfs.aa.fasta' into orfFiles 156 | 157 | """ 158 | getorf -sequence $maskedGenome -outseq orfs.aa.fasta -minsize 150 -find 1 159 | """ 160 | } 161 | 162 | cleanOrfs = orfFiles.splitFasta(record: [header: true, seqString: true]) 163 | .filter { record -> 164 | xCount = record.seqString.count('X') 165 | length = record.seqString.size() 166 | xCount / length < 0.3 167 | } 168 | .map { record -> 169 | record.seqString = record.seqString.replaceAll('X','') 170 | return toFasta(record.header, record.seqString) 171 | } 172 | 173 | process hhblit { 174 | container 'robsyme/hhblits' 175 | 176 | input: 177 | file 'orf.fasta' from cleanOrfs 178 | 179 | output: 180 | stdout into hhblitOutput 181 | 182 | """ 183 | hhblits -i orf.fasta -d /databases/transposons -maxmem 5 -cpu 1 -o stdout -e 1E-5 -E 1E-5 -id 80 -p 80 -z 0 -b 0 -B 3 -Z 3 -n 1 -mact 0.5 -v 0 184 | """ 185 | } 186 | 187 | transposonGFFLines = Channel.create() 188 | transposonHintLines = Channel.create() 189 | transposonGeneIDLines = Channel.create() 190 | 191 | hhblitOutput 192 | .map { String result -> new HHRHit(result) } 193 | .filter { it.score > 0 } 194 | .filter { it.probability > params.homologyProbabilityCutoff } 195 | .filter { it.evalue < params.evalueCutoff } 196 | .filter { it.pvalue < params.pvalueCutoff } 197 | .filter { it.alignmentLength > params.alignmentLengthCutoff } 198 | .filter { it.templateSize > params.templateLengthCutoff } 199 | .separate(transposonGFFLines, transposonHintLines, transposonGeneIDLines) { [ it.toGFF3(), it.toHints(), it.toGeneID() ] } 200 | 201 | transposonGFF = transposonGFFLines.collectFile(name: 'transposon_hits.gff3') 202 | 203 | process sortTransposonHits { 204 | input: 205 | file 'gff' from transposonGFF 206 | 207 | output: 208 | stdout into transposonSortedGFF 209 | 210 | """ 211 | sort -nk 4,4 $gff | sort -sk 1,1 212 | """ 213 | } 214 | 215 | transposonSortedGFF.subscribe { 216 | it.moveTo('./') 217 | } 218 | -------------------------------------------------------------------------------- /bin/GG_trinity_accession_incrementer.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | counter = 0 3 | while ARGF.gets 4 | if $_ =~ /^>GG\d\+\|(.*)\n/ 5 | puts ">GG%d|%s" % [counter += 1, $1] 6 | else 7 | puts $_ 8 | end 9 | end 10 | -------------------------------------------------------------------------------- /bin/augustus_RNAseq_hints.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | =pod 4 | 5 | =head1 TODO 6 | 7 | merge all rnseq hints at the end. maintain grp 8 | run test with 100 genes, not 390+ 9 | 10 | 11 | =head1 NAME 12 | 13 | augustus_RNAseq_hints.pl 14 | 15 | =head1 USAGE 16 | 17 | Create hint files for Augustus using RNASeq/EST. 
One is junction reads (excellent for introns), the other is RNASeq/EST coverage 18 | 19 | Mandatory options: 20 | 21 | -bam|in s The input BAM file (co-ordinate sorted). 22 | -genome|fasta s The genome assembly FASTA file. 23 | 24 | Other options: 25 | 26 | -strandness i If RNAseq is directional, provide direction: 0 for unknown (default); or 1 for + strand; -1 for - strand 27 | -min_score i Minimum score for parsing (defaults to 20) 28 | -window i Window size for coverage graph (defaults to 50) 29 | -background_fold i Background (defaults to 4), see perldoc 30 | -no_hints Don't create hints file for Augustus, just process junction reads 31 | 32 | =head1 DESCRIPTION 33 | 34 | Background: The problem of getting the intron boundary correct is that rnaseq doesn't go to 0 at the intron, but continues at a background level. 35 | For that reason, stop if it is -background_fold times lower than a previous 'good' value 36 | 37 | 38 | =head1 AUTHORS 39 | 40 | Alexie Papanicolaou 41 | 42 | CSIRO Ecosystem Sciences 43 | alexie@butterflybase.org 44 | 45 | =head1 DISCLAIMER & LICENSE 46 | 47 | Copyright 2012-2014 the Commonwealth Scientific and Industrial Research Organization. 48 | See LICENSE file for license info 49 | It is provided "as is" without warranty of any kind. 50 | 51 | =cut 52 | 53 | use strict; 54 | use warnings; 55 | use Data::Dumper; 56 | use Getopt::Long; 57 | use List::Util qw(sum); 58 | use Pod::Usage; 59 | use File::Basename; 60 | use FindBin qw($RealBin); 61 | use lib ("$RealBin/../PerlLib"); 62 | $ENV{PATH} .= ":$RealBin:$RealBin/../3rd_party/bin/"; 63 | 64 | my ( $samtools_exec, $bedtools_exec, $bed_to_aug_script ) = &check_program( 'samtools', 'bedtools','bed12_to_augustus_junction_hints.pl' ); 65 | 66 | 67 | 68 | #Options 69 | my ( @bamfiles, $genome, $help,$no_hints ); 70 | my $window = 50; 71 | my $min_score = 20; 72 | my $strandness = int(0); 73 | my $background_level = 4; 74 | pod2usage $! unless &GetOptions( 75 | 'help' => \$help, 76 | 'bam|in:s{,}' => \@bamfiles, 77 | 'genome|fasta:s' => \$genome, 78 | 'min_score:i' => \$min_score, 79 | 'strandness:i' => \$strandness, 80 | 'window:i' => \$window, 81 | 'background_fold:i' => \$background_level, 82 | 'nohints|no_hints' => \$no_hints 83 | ); 84 | 85 | pod2usage if $help; 86 | 87 | pod2usage "Cannot find the BAM or genome FASTA file\n" 88 | unless $bamfiles[0] 89 | && -s $bamfiles[0] 90 | && $genome 91 | && ( -s $genome || -s $genome . '.fai' ); 92 | 93 | my $strand; 94 | if ( !$strandness || $strandness == 0 ) { 95 | $strand = '.'; 96 | } 97 | elsif ( $strandness > 0 ) { 98 | $strand = '+'; 99 | } 100 | elsif ( $strandness < 1 ) { 101 | $strand = '-'; 102 | } 103 | else { 104 | die; 105 | } 106 | 107 | my $master_bamfile; 108 | if (scalar(@bamfiles == 1)){ 109 | $master_bamfile = $bamfiles[0]; 110 | }else{ 111 | foreach my $bamfile (@bamfiles){ 112 | die "Cannot find $bamfile\n" unless -s $bamfile; 113 | } 114 | $master_bamfile = 'master_bamfile.bam'; 115 | &process_cmd("$samtools_exec merge -r $master_bamfile ".join(" ",@bamfiles)) unless -s $master_bamfile; 116 | } 117 | 118 | &process_cmd("$samtools_exec faidx $genome") unless -s $genome . '.fai'; 119 | die "Cannot index genome $genome\n" unless -s $genome . 
'.fai'; 120 | 121 | unless (-e "$master_bamfile.junctions.completed"){ 122 | &process_cmd("$samtools_exec rmdup -S $master_bamfile - | $bedtools_exec bamtobed -bed12 | $bed_to_aug_script -prio 7 -out $master_bamfile.junctions.bed > $master_bamfile.junctions.hints" ); 123 | # For JBrowse 124 | &process_cmd("$bedtools_exec bedtobam -bed12 -g $genome.fai -i $master_bamfile.junctions.bed| $samtools_exec sort -m 1073741824 - $master_bamfile.junctions"); 125 | &process_cmd("$samtools_exec index $master_bamfile.junctions.bam"); 126 | # For Augustus 127 | &only_keep_intronic("$master_bamfile.junctions.hints"); 128 | &touch("$master_bamfile.junctions.completed"); 129 | } 130 | 131 | unless (-e "$master_bamfile.coverage.bg.completed"){ 132 | # For JBrowse 133 | &process_cmd("$bedtools_exec genomecov -split -bg -g $genome.fai -ibam $master_bamfile| sort -S 1G -k1,1 -k2,2n > $master_bamfile.coverage.bg"); 134 | &process_cmd("bedGraphToBigWig $master_bamfile.coverage.bg $genome.fai $master_bamfile.coverage.bw") if `which bedGraphToBigWig`; 135 | &touch("$master_bamfile.coverage.bg.completed"); 136 | } 137 | 138 | unless (-e "$master_bamfile.coverage.hints.completed" && !$no_hints){ 139 | &bg2hints("$master_bamfile.coverage.bg") ; 140 | &merge_hints("$master_bamfile.coverage.hints"); 141 | &touch("$master_bamfile.coverage.hints.completed"); 142 | } 143 | 144 | if ( -e "$master_bamfile.junctions.completed" 145 | && -e "$master_bamfile.coverage.hints.completed" ) 146 | { 147 | unless (-e "$master_bamfile.rnaseq.completed"){ 148 | my $augustus_script_exec = $RealBin.'/../3rd_party/augustus/scripts/join_mult_hints.pl'; 149 | if (-s $augustus_script_exec){ 150 | &process_cmd("cat $master_bamfile.junctions.hints.intronic $master_bamfile.coverage.hints| sort -S 1G -n -k 4,4 | sort -S 1G -s -n -k 5,5 | sort -S 1G -s -n -k 3,3 | sort -S 1G -s -k 1,1| $augustus_script_exec > $master_bamfile.rnaseq.hints" ); 151 | &touch("$master_bamfile.rnaseq.completed"); 152 | } 153 | } 154 | print "Done!\n"; 155 | } 156 | elsif (!$no_hints) { 157 | die "Something went wrong....\n"; 158 | }else{ 159 | print "Done, no hints were processed as requested\n"; 160 | } 161 | ### 162 | sub check_program() { 163 | my @paths; 164 | foreach my $prog (@_) { 165 | my $path = `which $prog`; 166 | pod2usage "Error, path to a required program ($prog) cannot be found\n\n" 167 | unless $path =~ /^\//; 168 | chomp($path); 169 | $path = readlink($path) if -l $path; 170 | push( @paths, $path ); 171 | } 172 | return @paths; 173 | } 174 | ### 175 | sub process_cmd { 176 | my ($cmd) = @_; 177 | print "CMD: $cmd\n"; 178 | my $ret = system($cmd); 179 | if ( $ret && $ret != 256 ) { 180 | die "Error, cmd died with ret $ret\n"; 181 | } 182 | return $ret; 183 | } 184 | 185 | sub bg2hints() { 186 | my $bg = shift; 187 | my $outfile = $bg; 188 | $outfile =~ s/.bg$/.hints/; 189 | open( IN, $bg ); 190 | my ( @array, %area ); 191 | while ( my $ln = ) { 192 | chomp($ln); 193 | my @data = split( "\t", $ln ); 194 | next unless $data[3] >= $min_score; 195 | # store data in an array 196 | for ( my $i = $data[1] ; $i <= $data[2] ; $i++ ) { 197 | # co-ords in bg are 0-based; hints/gff is 1-based 198 | $area{ $data[0] }{$i+1} = $data[3]; 199 | } 200 | } 201 | 202 | # print final area 203 | #TODO: NB this is still wrong. 
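# (descriptive note) The loop below walks each reference in ~$window bp slices,
# ends a slice early where coverage changes more than $background_level-fold
# between adjacent positions, and emits the slice's median depth (if it is at
# least $min_score) as an Augustus 'exonpart' hint with src=R and priority 4.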
204 | #~/workspace/transcripts4community/jamg/test_suite 205 | #rm -f gsnap.drosoph_50M_vs_droso_opt_temp.concordant_uniq.bam.coverage.hints.completed gsnap.drosoph_50M_vs_droso_opt_temp.concordant_uniq.bam.coverage.hints ; ../bin/augustus_RNAseq_hints.pl -dir ../3rd_party/augustus.2.7 -bam gsnap.drosoph_50M_vs_droso_opt_temp.concordant_uniq.bam -genome optimization.fasta; less gsnap.drosoph_50M_vs_droso_opt_temp.concordant_uniq.bam.coverage.hints 206 | open( OUT, ">$outfile" ); 207 | foreach my $ref ( sort { $a cmp $b } keys %area ) { 208 | my @coords = sort { $a <=> $b } ( keys %{ $area{$ref} } ); 209 | for ( my $i = $coords[0] ; $i < @coords ; $i++ ) { 210 | next if ( !$area{$ref}{$i} ); 211 | my $k = $i + $window; 212 | $k-- while ( !$area{$ref}{$k} ); 213 | next if $k == $i; 214 | my @splice; 215 | 216 | for ( my $v = $i ; $v <= $k ; $v++ ) { 217 | my $level = $area{$ref}{$v}; 218 | my $previous_level = $v eq $i ? int(0) : $area{$ref}{$v-1}; 219 | my $next_level = $v eq $k ? 1e6 : $area{$ref}{$v+1}; 220 | # the problem of getting the intron boundary correct is that 221 | # rnaseq doesn't go to 0 at the intron, but continues at a 222 | # background level. stop if it is 4 times lower than a previous 'good' value 223 | if ( 224 | !$level || 225 | ( $previous_level && ( $previous_level > ( $level * $background_level ) )) 226 | || $next_level && ($level > ( $next_level * $background_level )) 227 | ) 228 | { 229 | $k = $v - 1; 230 | last; 231 | } 232 | push( @splice, $level ); 233 | } 234 | # next if scalar(@splice) < ( $window / 2 ); 235 | my $median = &median( \@splice ); 236 | $median = $splice[0] if !$median; 237 | next unless $median && $median >= $min_score; 238 | print OUT $ref 239 | . "\tRNASeq\texonpart\t" 240 | . $i . "\t" 241 | . $k . "\t" 242 | . $median 243 | . "\t$strand\t.\tsrc=R;pri=4\n"; 244 | $i += $window ; 245 | } 246 | } 247 | 248 | close OUT; 249 | close IN; 250 | return $outfile; 251 | } 252 | 253 | sub only_keep_intronic(){ 254 | my $file = shift; 255 | my %hash; 256 | open (IN,$file); 257 | while (my $ln=){ 258 | next unless $ln=~/\tintron\t/; 259 | if ($ln=~/grp=([^;]+)/){ 260 | $hash{$1}++; 261 | } 262 | } 263 | close IN; 264 | open (IN,$file); 265 | open (OUT,">".$file.".intronic"); 266 | while (my $ln=){ 267 | if ($ln=~/\tintron\t/){ 268 | print OUT $ln ; 269 | } 270 | elsif ($ln=~/grp=([^;]+)/){ 271 | print OUT $ln if $hash{$1}; 272 | } 273 | } 274 | close IN; 275 | close OUT; 276 | } 277 | 278 | sub merge_hints(){ 279 | my $file = shift; 280 | open (IN,$file); 281 | open (OUT,">$file.merged"); 282 | my (@current_line,@previous_line); 283 | while () { 284 | @current_line = split /\t/; 285 | if (!@previous_line){ 286 | @previous_line = @current_line; 287 | }elsif(($current_line[0] eq $previous_line[0]) && ($current_line[2] eq $previous_line[2]) && 288 | (($current_line[3] >= $previous_line[3]) && ($current_line[4] <= $previous_line[4])) 289 | && ($current_line[6] eq $previous_line[6])){ 290 | # update previous_line by adding current to it 291 | chomp($previous_line[8]); 292 | $previous_line[8] =~ s/(grp=[^;]*);*//; 293 | my $grp = $1; 294 | $grp .= ';' if $grp; 295 | $grp = '' if !$grp; 296 | my ($lm,$m)=(1,1); 297 | if ($previous_line[8] =~ /mult=(\d+);/){ 298 | $lm = $1; 299 | $previous_line[8] =~ s/mult=\d+;//; 300 | } 301 | if ($current_line[8] =~ /mult=(\d+);/){ 302 | $m = $1; 303 | } 304 | $previous_line[8] = "mult=" . ($lm+$m) . ";$grp" . 
$previous_line[8]."\n"; 305 | 306 | }elsif ( 307 | !(($current_line[0] eq $previous_line[0]) && ($current_line[2] eq $previous_line[2]) && ($current_line[3] == $previous_line[3]) && ($current_line[4] == $previous_line[4]) && ($current_line[6] eq $previous_line[6])) 308 | ){ 309 | print OUT join("\t",@previous_line); 310 | @previous_line = @current_line; 311 | } 312 | 313 | else { 314 | # update previous_line by adding current to it 315 | chomp($previous_line[8]); 316 | $previous_line[8] =~ s/(grp=[^;]*);*//; 317 | my $grp = $1; 318 | $grp .= ';' if $grp; 319 | $grp = '' if !$grp; 320 | my ($lm,$m)=(1,1); 321 | if ($previous_line[8] =~ /mult=(\d+);/){ 322 | $lm = $1; 323 | $previous_line[8] =~ s/mult=\d+;//; 324 | } 325 | if ($current_line[8] =~ /mult=(\d+);/){ 326 | $m = $1; 327 | } 328 | $previous_line[8] = "mult=" . ($lm+$m) . ";$grp" . $previous_line[8]."\n"; 329 | } 330 | } 331 | print OUT join("\t",@previous_line) if (@previous_line); 332 | close IN; 333 | close OUT; 334 | unlink($file); 335 | rename($file.'.merged',$file); 336 | } 337 | 338 | sub touch() { 339 | my $file = shift; 340 | system("touch $file"); 341 | } 342 | 343 | sub mean() { 344 | return sum(@_) / @_; 345 | } 346 | 347 | sub median() { 348 | my $array_ref = shift; 349 | my @sorted = sort { $a <=> $b } @{$array_ref}; 350 | return $sorted[ int( @sorted / 2 ) ]; 351 | } 352 | -------------------------------------------------------------------------------- /bin/bed12_to_augustus_junction_hints.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | =pod 4 | 5 | =head1 USAGE 6 | 7 | This script will create a hints file for Augustus using junction reads. Junction reads are important because they annotate the introns. 8 | Give a bed12 file of junction reads (reduced with samtools dedup if possible) to get intron/exon boundary hints. See bedtools bamtobed to create the bed12 9 | 10 | example 11 | samtools rmdup -S SRR836188.coordSorted.bam - | bedtools bamtobed -bed12 | bed12_to_augustus_junction_hints.pl| ~/software/augustus/scripts/join_mult_hints.pl 12 | 13 | Options: 14 | 15 | -help This! 16 | -exon_min :i Minimum exon size (def. 50bp) 17 | -score_min :i Minimum score (def. 30) 18 | -max_exons :i Maximum number of exons that a single can span (def. 3) 19 | -min_match :i Number of min bases for each side of gap (def 20) 20 | -strandness :i If RNAseq is directional, provide direction: 0 for unknown (default); or 1 for + strand; -1 for - strand 21 | 22 | =head1 FORMATS 23 | 24 | BED12 input format 25 | 26 | 1 chrom - The name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold (e.g. scaffold10671). 27 | 2 chromStart - The starting position of the feature in the chromosome or scaffold. 28 | NB The first base in a chromosome is numbered 0. 29 | 3 chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. 30 | 4 name - Defines the name of the BED line. This label is displayed to the left of the BED line in the Genome Browser window when the track is open to full display mode or directly to the left of the item in pack mode. 31 | 5 score - A score between 0 and 1000. 
If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). This table shows the Genome Browser's translation of BED score values into shades of gray: 32 | 6 strand - Defines the strand - either '+' or '-'. 33 | 7 thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays). 34 | 8 thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays). 35 | 9 itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser. 36 | 10 blockCount - The number of blocks (exons) in the BED line. 37 | 11 blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. 38 | 12 blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. 39 | 40 | IN example 41 | scaffold_0 83 514 USI-EAS034_0010:2:97:6859:21372#0/1 40 + 83 514 255,0,0 2 119,6 0,425 42 | 43 | OUT format 44 | GFF3 with exonpart and intronpart 45 | scaffold_0 RNASeq intronpart 1262 1414 . - . src=JR;pri=5;grp=readname 46 | 47 | =head1 AUTHORS 48 | 49 | Alexie Papanicolaou 50 | 51 | CSIRO Ecosystem Sciences 52 | alexie@butterflybase.org 53 | 54 | =head1 DISCLAIMER & LICENSE 55 | 56 | Copyright 2012-2014 the Commonwealth Scientific and Industrial Research Organization. 57 | See LICENSE file for license info 58 | It is provided "as is" without warranty of any kind. 59 | 60 | 61 | =cut 62 | 63 | use strict; 64 | use warnings; 65 | use Pod::Usage; 66 | use Getopt::Long; 67 | use FindBin qw($RealBin); 68 | use lib ("$RealBin/../PerlLib"); 69 | $ENV{PATH} .= ":$RealBin:$RealBin/../3rd_party/bin/"; 70 | 71 | my $min_exon_size = 50; 72 | my $min_score = 30; 73 | my $max_exons = 3; 74 | my $min_match = 20; 75 | my ($help); 76 | my $priority = 5; 77 | my $strandness = int(0); 78 | my $bed_outfile = 'junctions.bed'; 79 | pod2usage $! unless &GetOptions( 80 | 'help' => \$help, 81 | 'exon_min:i' => \$min_exon_size, 82 | 'score_min:i' => \$min_score, 83 | 'max_exons:i' => \$max_exons, 84 | 'min_match:i' => \$min_match, 85 | 'outfile:s' => \$bed_outfile, 86 | 'priority:i' => \$priority, 87 | 'strandness:i' => \$strandness 88 | ); 89 | 90 | pod2usage if $help; 91 | 92 | my $strand; 93 | if (!$strandness || $strandness == 0 ){ 94 | $strand = '.'; 95 | }elsif ($strandness > 0){ 96 | $strand = '+'; 97 | }elsif ($strandness < 1){ 98 | $strand = '-'; 99 | }else{ 100 | die; 101 | } 102 | 103 | open( BEDJUNCTIONS, ">$bed_outfile" ); 104 | 105 | OUTER: while ( my $ln = ) { 106 | chomp($ln); 107 | my @data = split( "\t", $ln ); 108 | 109 | # too many blocks - i.e. too many exons are being linked... biologically impossible?! 
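	# ($data[9] is BED12 column 10, blockCount, using 0-based array indexing)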
110 | next if $data[9] > $max_exons; 111 | 112 | #too low score 113 | next if $data[4] < $min_score; 114 | 115 | # numbering from 1 116 | $data[1]++; 117 | $data[2]++; 118 | 119 | #remove any /1 /2 from read name 120 | $data[3] =~ s/\/[0-2]$//; 121 | my @blockSizes = split( ",", $data[10] ); 122 | my @blockStarts = split( ",", $data[11] ); 123 | die unless scalar(@blockSizes) == scalar(@blockStarts); 124 | for ( my $i = 0 ; $i < @blockStarts ; $i++ ) { 125 | next OUTER if $blockSizes[$i] < $min_match; 126 | $blockStarts[$i] += $data[1]; 127 | } 128 | if ( scalar(@blockSizes) == 1 ) { 129 | 130 | # no intron 131 | my $type = 'exonpart'; 132 | my $start = $data[1]; 133 | my $stop = $data[2]; 134 | print $data[0] 135 | . "\tRNASeq\t" 136 | . $type . "\t" 137 | . $start . "\t" 138 | . $stop . "\t" 139 | . $data[4] 140 | . "\t$strand\t.\tsrc=JR;pri=$priority;grp=" 141 | . $data[3] . ";\n"; 142 | } 143 | else { 144 | print BEDJUNCTIONS $ln . "\n"; 145 | 146 | #exons first 147 | for ( my $i = 0 ; $i < scalar(@blockStarts) ; $i++ ) { 148 | my $type = 'exonpart'; 149 | my $start = $blockStarts[$i]; 150 | my $stop = $start + $blockSizes[$i] - 1; 151 | print $data[0] 152 | . "\tRNASeq\t" 153 | . $type . "\t" 154 | . $start . "\t" 155 | . $stop . "\t" 156 | . $data[4] 157 | . "\t$strand\t.\tsrc=JR;pri=$priority;grp=" 158 | . $data[3] . ";\n"; 159 | } 160 | 161 | #introns 162 | for ( my $i = 1 ; $i < scalar(@blockStarts) ; $i++ ) { 163 | my $type = 'intron'; 164 | my $start = ( $blockStarts[ $i - 1 ] + $blockSizes[ $i - 1 ] - 1 ) + 1; 165 | my $stop = $blockStarts[$i] - 1; 166 | print $data[0] 167 | . "\tRNASeq\t" 168 | . $type . "\t" 169 | . $start . "\t" 170 | . $stop . "\t" 171 | . $data[4] 172 | . "\t$strand\t.\tsrc=JR;pri=$priority;grp=" 173 | . $data[3] . ";\n"; 174 | } 175 | } 176 | } 177 | close BEDJUNCTIONS; 178 | -------------------------------------------------------------------------------- /bin/exonerate_to_genbank.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'pp' 3 | require 'bio' 4 | require 'optparse' 5 | 6 | options = {} 7 | OptionParser.new do |opts| 8 | opts.banner = "Usage: exonerage_to_genbank.rb [options]" 9 | 10 | opts.on("-f", "--fasta genome", "Genome fasta file") do |filename| 11 | options[:fasta] = filename 12 | #TODO: Check for existance of file. 13 | end 14 | end.parse! 15 | 16 | def to_locations(match) 17 | puts match.captures.join("\t") 18 | pos = match[:target_start].to_i 19 | match[:vulgar] 20 | .split 21 | .each_slice(3) 22 | .chunk{ |type, q, t| case type; when /[MS]/; :coding; when /[5I3]/; :intron; else; :other; end} 23 | .map{ |cls, a| [cls, a.map{|type, q, t| t.to_i}.inject(:+)] } 24 | .each{|a| p a} 25 | end 26 | 27 | genome = Hash[Bio::FlatFile.open(options[:fasta]).map{|entry| [entry.entry_id,entry.naseq] }] 28 | genes = Hash.new{|h,k| h[k]=[]} 29 | 30 | while ARGF.gets 31 | next unless $_ =~ (/vulgar: (?\S+) (?\d+) (?\d+) (?.) (?\S+) (?\d+) (?\d+) (?.) (?\d+) (?.*)\n/) 32 | to_locations($~) 33 | end 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /bin/fullerCegmaGFF.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | def generate_transcript_line(id, lines) 4 | minPos = lines.min_by{|line| line[3].to_i}[3] 5 | maxPos = lines.max_by{|line| line[4].to_i}[4] 6 | f = lines.first 7 | [] << f[0] << f[1] << "mRNA" << minPos << maxPos << "." << f[6] << "." 
<< "ID=t.#{id}" 8 | end 9 | 10 | def adjust_attributes(id, line) 11 | line[8] = "ID=c.#{id};Parent=t.#{id}" 12 | line 13 | end 14 | 15 | # Read all lines of the GFF 16 | lines = ARGF 17 | .map do |line| 18 | split = line.chomp.split("\t") 19 | split[2] = "CDS" 20 | split[3] = split[3].to_i 21 | split[4] = split[4].to_i 22 | split 23 | end 24 | 25 | ## Can we sort numerically rather than alphabetically? 26 | 27 | # First we find the longest common prefix for the chromosome/scaffold names 28 | items = lines.map{|line| line[0]}.uniq 29 | prefix = '' 30 | min, max = items.sort.values_at(0, -1) 31 | min.split(//).each_with_index do |c, i| 32 | break if c != max[i, 1] 33 | prefix << c 34 | end 35 | 36 | # Then make a regular expression that matches the common prefix and then some digits 37 | re = Regexp.new(prefix << "\\d+$") 38 | 39 | # If *all* of the chromosome/scaffold names match the regular 40 | # expression, we sort on the trailing digits. Otherwise we sort alphabetically 41 | sort_alphabetical = lambda {|line| [line[0][0], line[3][0]]} 42 | sort_numeric = lambda {|line| [line[0][0].match(/\d+$/)[-1].to_i, line[0][3]]} 43 | match_method = items.all?{|item| item =~ re} ? sort_numeric : sort_alphabetical 44 | 45 | lines.sort_by{|split| split[8]} 46 | .chunk{|line| line[8]} 47 | .map do |id, lines| 48 | transcript = generate_transcript_line(id, lines) 49 | lines 50 | .map{|line| adjust_attributes(id, line)} 51 | .unshift(transcript) 52 | end 53 | .sort_by(&match_method) 54 | .each do |a| 55 | puts a.map{|line| line.join("\t")} 56 | end 57 | -------------------------------------------------------------------------------- /bin/gff2gb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Convert a GFF and associated FASTA file into GenBank format. 3 | 4 | Usage: 5 | gff_to_genbank.py 6 | """ 7 | import sys 8 | import os 9 | 10 | from Bio import SeqIO 11 | from Bio.Alphabet import generic_dna 12 | from Bio import Seq 13 | 14 | from BCBio import GFF 15 | 16 | def main(gff_file, fasta_file): 17 | out_file = "%s.gb" % os.path.splitext(gff_file)[0] 18 | fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna)) 19 | gff_iter = GFF.parse(gff_file, fasta_input) 20 | SeqIO.write(_check_gff(_fix_ncbi_id(gff_iter)), out_file, "genbank") 21 | 22 | def _fix_ncbi_id(fasta_iter): 23 | """GenBank identifiers can only be 16 characters; try to shorten NCBI. 24 | """ 25 | for rec in fasta_iter: 26 | if len(rec.name) > 16 and rec.name.find("|") > 0: 27 | new_id = [x for x in rec.name.split("|") if x][-1] 28 | print "Warning: shortening NCBI name %s to %s" % (rec.id, new_id) 29 | rec.id = new_id 30 | rec.name = new_id 31 | yield rec 32 | 33 | def _check_gff(gff_iterator): 34 | """Check GFF files before feeding to SeqIO to be sure they have sequences. 35 | """ 36 | for rec in gff_iterator: 37 | if isinstance(rec.seq, Seq.UnknownSeq): 38 | print "Warning: FASTA sequence not found for '%s' in GFF file" % ( 39 | rec.id) 40 | rec.seq.alphabet = generic_dna 41 | yield _flatten_features(rec) 42 | 43 | def _flatten_features(rec): 44 | """Make sub_features in an input rec flat for output. 45 | 46 | GenBank does not handle nested features, so we want to make 47 | everything top level. 
48 | """ 49 | out = [] 50 | for f in rec.features: 51 | cur = [f] 52 | while len(cur) > 0: 53 | nextf = [] 54 | for curf in cur: 55 | out.append(curf) 56 | if len(curf.sub_features) > 0: 57 | nextf.extend(curf.sub_features) 58 | cur = nextf 59 | rec.features = out 60 | return rec 61 | 62 | if __name__ == "__main__": 63 | main(*sys.argv[1:]) 64 | -------------------------------------------------------------------------------- /bin/gff_transpose.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'optparse' 4 | require 'ostruct' 5 | require 'pathname' 6 | require 'set' 7 | require 'bio' 8 | require 'pp' 9 | 10 | class Options 11 | def self.parse(args) 12 | options = OpenStruct.new 13 | opts = OptionParser.new do |opts| 14 | opts.banner = "Usage: #{$0} [options] --from GFF --to GFF" 15 | opts.separator "" 16 | opts.separator "Specific options:" 17 | 18 | opts.on("-f", "--from FILENAME (Required)", "GFF file with features to transpose") do |filename| 19 | path = Pathname.new(filename) 20 | if path.exist? 21 | options.from = path 22 | else 23 | $stderr.puts("ERROR: Could not find the file #{filename}") 24 | $stderr.puts opts.banner 25 | exit(1) 26 | end 27 | end 28 | 29 | opts.on("-t", "--to FILENAME (Required)", "GFF file describing where the proteins are in nucleotide coorinates") do |filename| 30 | path = Pathname.new(filename) 31 | if path.exist? 32 | options.to = path 33 | else 34 | $stderr.puts("ERROR: Could not find the file #{filename}") 35 | $stderr.puts opts.banner 36 | exit(1) 37 | end 38 | end 39 | 40 | end 41 | opts.parse!(args) 42 | 43 | unless options.from 44 | $stderr.puts "Error: No *TO* GFF3 file supplied\n" 45 | $stderr.puts opts.banner 46 | exit(1) 47 | end 48 | 49 | unless options.to 50 | $stderr.puts "Error: No *FROM* GFF3 file supplied\n" 51 | $stderr.puts opts.banner 52 | exit(1) 53 | end 54 | 55 | options 56 | end 57 | end 58 | options = Options.parse(ARGV) 59 | 60 | records_lookup = Bio::GFF::GFF3.new(File.read(options.to)) 61 | .records 62 | .find_all{ |record| record.feature == "exon" } 63 | .to_set 64 | .classify{ |record| Hash[record.attributes]["Parent"].gsub('mRNA', 'exon_') } 65 | 66 | File.open(options.from).take_while{ |line| line !~ /FASTA/ }.each do |line| 67 | next if line =~ /^#/ 68 | 69 | seqid, source, type, hit_start, hit_stop, score, strand, phase, attributes = line.chomp.split("\t") 70 | hit_start = hit_start.to_i 71 | hit_stop = hit_stop.to_i 72 | 73 | begin 74 | records = records_lookup[seqid] 75 | .sort_by{ |record| record.start } 76 | .map{ |record| record.strand = "+" unless record.strand; record } 77 | rescue 78 | $stderr.puts "\n\nCould not find lookup for '#{seqid}'" 79 | exit(1) 80 | end 81 | 82 | exon_length = records.inject(0) do |mem, record| 83 | mem += record.end - record.start + 1 84 | end 85 | 86 | ranges = case [records.first.strand,strand].join 87 | when "++" 88 | [Range.new(hit_start - 1, hit_stop)] 89 | when "-+" 90 | [Range.new(exon_length - hit_stop, exon_length - hit_start)] 91 | when "+-" 92 | [Range.new(hit_start - 1, hit_stop - 1)] 93 | when "--" 94 | [Range.new(exon_length - hit_stop + 1, exon_length - hit_start)] 95 | end 96 | .map!{ |range| Range.new(range.first + records.first.start, range.last + records.first.start)} 97 | 98 | records 99 | .each_cons(2) 100 | .map{ |a, b| Range.new(a.end + 1, b.start - 1) } 101 | .reduce(ranges) do |mem, intron| 102 | size = intron.last - intron.first + 1 103 | mem.flat_map do |range| 104 | # Is there an overlap 
between this range and an intron? 105 | if (range.first <= intron.last) and (intron.first <= range.last) 106 | # If we introduce the intron, does the shifted range still overlap the intron location? 107 | if range.first + size <= intron.last 108 | # If so, make the new intron 109 | [Range.new(range.first, intron.first - 1), Range.new(intron.last + 1, intron.last + range.last - intron.first + 1)] 110 | else 111 | # If not, we can just move the region to the right by the intron size 112 | Range.new(range.first + size, range.last + size) 113 | end 114 | elsif intron.last < range.first 115 | # If there is no overlap and the region is still to the right, we move it right. 116 | Range.new(range.first + size, range.last + size) 117 | else 118 | range 119 | end 120 | end 121 | end.each do |range| 122 | next if source == "." 123 | puts [ 124 | records.first.seqname, 125 | source, 126 | type, 127 | range.first, 128 | range.last, 129 | score, 130 | records.first.strand == strand ? "+" : "-", 131 | ".", 132 | attributes 133 | ].join("\t") 134 | end 135 | end 136 | -------------------------------------------------------------------------------- /bin/gff_transpose.rb~: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'optparse' 4 | require 'ostruct' 5 | require 'pathname' 6 | require 'set' 7 | require 'bio' 8 | 9 | 10 | class Options 11 | def self.parse(args) 12 | options = OpenStruct.new 13 | opts = OptionParser.new do |opts| 14 | opts.banner = "Usage: #{$0} [options] --from GFF --to GFF" 15 | opts.separator "" 16 | opts.separator "Specific options:" 17 | 18 | opts.on("-f", "--from FILENAME (Required)", "GFF file with features to transpose") do |filename| 19 | path = Pathname.new(filename) 20 | if path.exist? 21 | options.from = path 22 | else 23 | $stderr.puts("ERROR: Could not find the file #{filename}") 24 | $stderr.puts opts.banner 25 | exit(1) 26 | end 27 | end 28 | 29 | opts.on("-t", "--to FILENAME (Required)", "GFF file describing where the proteins are in nucleotide coorinates") do |filename| 30 | path = Pathname.new(filename) 31 | if path.exist? 
32 | options.to = path 33 | else 34 | $stderr.puts("ERROR: Could not find the file #{filename}") 35 | $stderr.puts opts.banner 36 | exit(1) 37 | end 38 | end 39 | 40 | end 41 | opts.parse!(args) 42 | 43 | unless options.from 44 | $stderr.puts "Error: No *TO* GFF3 file supplied\n" 45 | $stderr.puts opts.banner 46 | exit(1) 47 | end 48 | 49 | unless options.to 50 | $stderr.puts "Error: No *FROM* GFF3 file supplied\n" 51 | $stderr.puts opts.banner 52 | exit(1) 53 | end 54 | 55 | options 56 | end 57 | end 58 | options = Options.parse(ARGV) 59 | 60 | records_lookup = Bio::GFF::GFF3.new(File.read(options.to)) 61 | .records 62 | .find_all{ |record| record.feature == "exon" } 63 | .to_set 64 | .classify{ |record| Hash[record.attributes]["Parent"].gsub('mRNA', 'exon_') } 65 | 66 | File.open(options.from).take_while{ |line| line !~ /FASTA/ }.each do |line| 67 | next if line =~ /^#/ 68 | 69 | seqid, source, type, hit_start, hit_stop, score, strand, phase, attributes = line.split("\t") 70 | hit_start = hit_start.to_i 71 | hit_stop = hit_stop.to_i 72 | 73 | begin 74 | records = records_lookup[seqid].sort_by{ |record| record.start } 75 | rescue 76 | $stderr.puts "\n\nCould not find lookup for '#{seqid}'" 77 | exit(1) 78 | end 79 | 80 | protein_length = records.inject(0) do |mem, record| 81 | mem += record.end - record.start + 1 82 | end / 3 - 1 83 | 84 | begin 85 | strand = records.first.strand 86 | rescue 87 | $stderr.puts "Could not find a match to this line:" 88 | $stderr.puts line.chomp 89 | exit(1) 90 | end 91 | 92 | ranges = case records.first.strand 93 | when "+" 94 | [Range.new(hit_start - 1, hit_stop)] 95 | when "-" 96 | [Range.new(protein_length - hit_stop + 1, protein_length - hit_start + 2)] 97 | end 98 | .map!{ |range| Range.new((range.first) * 3, (range.last) * 3 - 1)} 99 | .map!{ |range| Range.new(range.first + records.first.start, range.last + records.first.start)} 100 | 101 | records 102 | .each_cons(2) 103 | .map{ |a, b| Range.new(a.end + 1, b.start - 1) } 104 | .reduce(ranges) do |mem, intron| 105 | size = intron.last - intron.first + 1 106 | mem.flat_map do |range| 107 | # Is there an overlap between this range and an intron? 108 | if (range.first <= intron.last) and (intron.first <= range.last) 109 | # If we introduce the intron, does the shifted range still overlap the intron location? 110 | if range.first + size <= intron.last 111 | # If so, make the new intron 112 | [Range.new(range.first, intron.first - 1), Range.new(intron.last + 1, intron.last + range.last - intron.first + 1)] 113 | else 114 | # If not, we can just move the region to the right by the intron size 115 | Range.new(range.first + size, range.last + size) 116 | end 117 | elsif intron.last < range.first 118 | # If there is no overlap and the region is still to the right, we move it right. 119 | Range.new(range.first + size, range.last + size) 120 | else 121 | range 122 | end 123 | end 124 | end.each do |range| 125 | next if source == "." 
126 | puts [ 127 | records.first.seqname, 128 | source, 129 | type, 130 | range.first, 131 | range.last, 132 | score, 133 | records.first.strand, 134 | ".", 135 | attributes 136 | ].join("\t") 137 | end 138 | end 139 | -------------------------------------------------------------------------------- /bin/parse_hhr.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'optparse' 3 | 4 | options = {} 5 | options[:homology_prob_cut] = 70 6 | options[:eval_cut] = 0.001 7 | options[:pval_cut] = 0.000001 8 | options[:score_cut] = 100 9 | options[:align_col_cut] = 50 10 | options[:template_aln_size_cut] = 30 11 | options[:repeat] = false 12 | 13 | OptionParser.new do |opts| 14 | opts.banner = "Usage: parse_hhr.rb [options] input.hhr" 15 | 16 | opts.on("-o [N]", Float, "--homology_cutoff", "Minimum homology probability (70)") do |f| 17 | options[:homology_prob_cut] = f 18 | end 19 | 20 | opts.on("-e [N]", Float, "--evalue_cutoff", "Maximum evalue (1e-3)") do |f| 21 | options[:eval_cut] = f 22 | end 23 | 24 | opts.on("-p [N]", Float, "--pvalue_cutoff", "Maximum pvalue (1e-6)") do |f| 25 | options[:pval_cut] = f 26 | end 27 | 28 | opts.on("-s [N]", Float, "--score_cutoff", "Minimum score (100)") do |f| 29 | options[:pval_cut] = f 30 | end 31 | 32 | opts.on("-a [N]", Float, "--align_length_cutoff", "Minimum length of amino acids in match for query (50)") do |f| 33 | options[:align_col_cut] = f 34 | end 35 | 36 | opts.on("-t [N]", Float, "--template_length_cutoff", "Minimum length of amino acids in match for template (30)") do |f| 37 | options[:template_aln_size_cut] = f 38 | end 39 | 40 | opts.on("-r", "--repeat", "Input file is generated from repeat sequence rather than coding sequence") do |r| 41 | options[:repeat] = r 42 | end 43 | 44 | opts.on("-h", "--help", "Show this message") do 45 | puts opts 46 | exit 47 | end 48 | end.parse! 49 | 50 | infile = File.open(ARGV.shift) 51 | 52 | homology_prob_cut = 70 53 | 54 | gff3 = File.open('out.gff3', 'w') 55 | hints = File.open('out.hints', 'w') 56 | geneid = File.open('out.geneid', 'w') 57 | glimmer = File.open('out.glimmer', 'w') 58 | 59 | uid_counter = Hash.new(0) 60 | 61 | while infile.gets 62 | p $_ 63 | case $_ 64 | when /^\W*Query\s+(?\S+)_(?\d+) \[(?\d+) - (?\d+)\](? \(REVERSE SENSE\))?/ 65 | p $~ 66 | scaffold_id = $~[:scaffold_id] 67 | scaffold_hit_num = $~[:scaffold_hit_num].to_i 68 | reverse = ! $~[:rev].nil? 69 | orf_start = $~[:orf_start].to_i 70 | orf_stop = $~[:orf_end].to_i 71 | when /^\s*1 (?.{30})\s+(?\d+\.?\d*)\s+(?\d+\.?\d*E?-?\d*)\s+(?\d+\.?\d*E?-?\d*)\s+(?\d+\.?\d*)\s+(?\d+\.?\d*)\s+(?\d+)\s+(?\d+)-(?\d+)\s+(?\d+)-(?\d+)\s+\((?\d+)\)/ 72 | next if options[:homology_prob_cut] > $~[:prob].to_f 73 | next if options[:eval_cut] < $~[:evalue].to_f 74 | next if options[:pval_cut] < $~[:pvalue].to_f 75 | next if options[:score_cut] > $~[:score].to_f 76 | next if options[:align_col_cut] > $~[:alignment_length].to_i 77 | next if options[:template_aln_size_cut] > ($~[:hit_start].to_i - $~[:hit_stop].to_i).abs 78 | 79 | p $~ 80 | hit_id = $~[:hit_desc].split.first 81 | hit_desc = $~[:hit_desc].split[1..-1].join(' ') 82 | 83 | uid = "%s.s%s.e%s" % [hit_id, $~[:hit_start], $~[:hit_stop]] 84 | hit_count = uid_counter[uid] += 1 85 | uid += ".n%d" % hit_count 86 | 87 | strand = reverse ? '-' : '+' 88 | gff_start = reverse ? (orf_start - (3 * $~[:aa_start].to_i)) : (orf_start + (3 * $~[:aa_start].to_i)) 89 | gff_end = reverse ? 
(orf_start - (3 * $~[:aa_stop].to_i) + 1) : (orf_start + (3 * $~[:aa_stop].to_i) - 1) 90 | type = options[:repeat] ? 'nonexonpart' : 'CDSpart' 91 | 92 | attributes = {} 93 | attributes[:ID] = uid 94 | attributes[:Name] = hit_id + "(%s)" % hit_desc 95 | attributes[:Target] = "%s %s %s [+]" % [hit_id, $~[:hit_start], $~[:hit_stop]] 96 | 97 | gff3.puts [scaffold_id, 'hhblits', 'protein_match', gff_start, gff_end, $~[:score], strand, '.', attributes.map{|a| a.join('=')}.join(";")].join("\t") 98 | 99 | attributes = {} 100 | attributes[:src] = options[:repeat] ? 'RM' : 'HU' 101 | attributes[:grp] = hit_id 102 | attributes[:pri] = options[:repeat] ? 6 : 5 103 | hints.puts [scaffold_id, 'protein_match', type, gff_start, gff_end, $~[:score], strand, '.', attributes.map{|a| a.join('=')}.join(";")].join("\t") 104 | 105 | geneid.puts [scaffold_id, 'hhblits', 'sr', gff_start, gff_end, $~[:score], strand, '.'].join("\t") 106 | 107 | if reverse 108 | glimmer.puts [scaffold_id, gff_end, gff_start, $~[:score], $~[:evalue], "\n\n"].join(" ") 109 | else 110 | glimmer.puts [scaffold_id, gff_start, gff_end, $~[:score], $~[:evalue], "\n\n"].join(" ") 111 | end 112 | end 113 | end 114 | -------------------------------------------------------------------------------- /bin/pfam_to_gff3.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | seqid = "dummy_seqid" 4 | orfstart = -1 5 | orfstop = -1 6 | strand = "?" 7 | domainid = "domainid" 8 | domain_description = "domain description" 9 | domain_num = 0 10 | hit_num = 0 11 | num = '\d*\.?\d+([eE][-+]?\d+)?' 12 | 13 | while ARGF.gets 14 | case $_ 15 | when /^#/ 16 | next 17 | when /^Query:\s+(?\S+)_(?\d+)\s+\[L=\d+\]/ 18 | seqid = $~[:seqid] 19 | orfid = $~[:seqid] + "_" + $~[:orfnum] 20 | when /Description: \[(?\d+) - (?\d+)\] (?\(REVERSE SENSE\))?/ 21 | orfstart = $~[:orfstart].to_i 22 | orfend = $~[:orfend].to_i 23 | strand = $~[:reverse] ? "-" : "+" 24 | domain_num = 0 25 | when /^>>\s+(?\S+)\s+(?.*)\n/ 26 | domainid = $~[:domainid] 27 | domain_description = $~[:domain_description] 28 | domain_num += 1 29 | when /\s+(?\d+)\s+[\?\!]\s+(?#{num})\s+(?#{num})\s+(?#{num})\s+(?#{num})\s+(?\d+)\s+(?\d+)\s+[\.\[][\.\]]\s+\d+\s+\d+\s+[\.\[][\.\]]\s+(?\d+)\s+(?\d+)/ 30 | domain_from = $~[:domain_start].to_i 31 | domain_to = $~[:domain_end].to_i 32 | hit_num = $~[:hit_num] 33 | domain_position_left = 0 34 | domain_position_right = 0 35 | if strand == "+" 36 | domain_position_left = orfstart + 3 * domain_from 37 | domain_position_right = orfstart + 3 * domain_to - 1 38 | else 39 | domain_position_right = orfstart - 3 * (domain_from - 1) 40 | domain_position_left = orfstart - 3 * (domain_to - 1) + 1 41 | end 42 | 43 | # Output a new gff annotation 44 | out = [] 45 | out << seqid 46 | out << 'pfam' 47 | out << 'protein_hmm_match' 48 | out << domain_position_left 49 | out << domain_position_right 50 | out << $~[:score] 51 | out << strand 52 | out << "." 
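        # Columns 1-8 (seqid through phase) are assembled above; the GFF3
        # attribute string for column 9 is built from the parsed domain hit below.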
53 | attributes = {} 54 | attributes[:Target] = domainid 55 | attributes[:description] = domain_description 56 | attributes[:exon_id] = seqid 57 | attributes[:orf_id] = orfid 58 | attributes[:ID] = orfid + "_" + domain_num.to_s + hit_num 59 | out << attributes.map{|p| p.join('=')}.join(';') 60 | puts out.join("\t") 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /bin/rename-codons: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | $strain_id = ARGV.shift 3 | 4 | def gene(id_count) 5 | sprintf("%s_G%05d", $strain_id, id_count) 6 | end 7 | 8 | def mrna(id_count) 9 | sprintf("%s_R%05d", $strain_id, id_count) 10 | end 11 | 12 | def cds(id_count) 13 | sprintf("%s_C%05d", $strain_id, id_count) 14 | end 15 | 16 | def start_codon(id_count) 17 | sprintf("%s_START_%05d", $strain_id, id_count) 18 | end 19 | 20 | def stop_codon(id_count) 21 | sprintf("%s_STOP_%05d", $strain_id, id_count) 22 | end 23 | 24 | reached_fasta = false 25 | 26 | ARGF.each do |line| 27 | if line =~ /##FASTA/ 28 | reached_fasta = true 29 | end 30 | if line !~ /^\S+\t\S+\t\S+_codon/ || reached_fasta 31 | next 32 | end 33 | 34 | split = line.chomp.split("\t") 35 | attributes = Hash[split[8].split(";").map{|pair| pair.split(" ")}] 36 | id_count = attributes["gene_id"].match(/^"(\d+)_/)[1] 37 | attributes.delete("gene_id") 38 | attributes.delete("transcript_id") 39 | attributes.delete("gene_name") 40 | attributes.delete("transcript_name") 41 | 42 | case split[2].downcase 43 | when "start_codon" 44 | attributes["ID"] = start_codon(id_count) 45 | attributes["Parent"] = gene(id_count) 46 | when "stop_codon" 47 | attributes["ID"] = stop_codon(id_count) 48 | attributes["Parent"] = gene(id_count) 49 | end 50 | 51 | split[8] = attributes.map{|attribute| attribute.join("=")}.join(";") 52 | puts split.join("\t") 53 | end 54 | -------------------------------------------------------------------------------- /bin/rename-fasta: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # How many sequences are there in the file? 4 | fasta = File.open(ARGV.shift, 'r') 5 | sequence_count = fasta. 6 | each_line. 7 | find_all { |line| line =~ /^>/ }. 8 | count 9 | 10 | # An optional second argument gives the name of the strain 11 | # we replace white space with underscores. 12 | strain_name = if ARGV.empty? 
13 | "sequence" 14 | else 15 | ARGV.shift.gsub(/\s+/, "_") 16 | end 17 | 18 | # The sprintf format string, which will end up looking like: 19 | # ">strain_name_%05d" 20 | format_string = ">#{strain_name}_%0#{Math.log10(sequence_count).ceil}d\n" 21 | 22 | # Read through each line, replacing the fasta headers 23 | count = 0 24 | fasta.rewind 25 | sequence = "" 26 | fasta.each_line do |line| 27 | if line =~ /^>/ 28 | sequence.chars.each_slice(80) { |a| puts a.join } if sequence != "" 29 | printf(format_string, count+=1) 30 | sequence = "" 31 | else 32 | sequence << line.chomp 33 | end 34 | end 35 | 36 | sequence.chars.each_slice(80) { |a| puts a.join } 37 | -------------------------------------------------------------------------------- /bin/rename-gff-ids: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | $strain_id = ARGV.shift 3 | 4 | def gene(id_count) 5 | sprintf("%s_G%05d", $strain_id, id_count) 6 | end 7 | 8 | def mrna(id_count) 9 | sprintf("%s_R%05d", $strain_id, id_count) 10 | end 11 | 12 | def cds(id_count) 13 | sprintf("%s_C%05d", $strain_id, id_count) 14 | end 15 | 16 | reached_fasta = false 17 | 18 | ARGF.each do |line| 19 | if line =~ /##FASTA/ 20 | reached_fasta = true 21 | end 22 | if line =~ /^#/ || reached_fasta 23 | puts line 24 | next 25 | end 26 | 27 | split = line.chomp.split("\t") 28 | attributes = Hash[split[8].split(";").map{|pair| pair.split("=")}] 29 | attributes.delete("gene_id") 30 | attributes.delete("transcript_id") 31 | 32 | case split[2].downcase 33 | when "gene" 34 | id_count = attributes["ID"].match(/\d+$/)[0] 35 | attributes["ID"] = gene(id_count) 36 | attributes["Name"] = gene(id_count) 37 | when "mrna" 38 | id_count = attributes["ID"].match(/\d+$/)[0] 39 | attributes["ID"] = mrna(id_count) 40 | attributes["Parent"] = gene(id_count) 41 | when "cds" 42 | id_count = attributes["Parent"].match(/\d+$/)[0] 43 | attributes["ID"] = cds(id_count) 44 | attributes["Parent"] = mrna(id_count) 45 | end 46 | 47 | split[8] = attributes.map{|attribute| attribute.join("=")}.join(";") 48 | puts split.join("\t") 49 | end 50 | -------------------------------------------------------------------------------- /bin/trim_fasta_all.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | package trim_fasta_all; 3 | use strict; 4 | use warnings; 5 | 6 | use Data::Dumper; 7 | our $VERSION = '1.0'; 8 | 9 | #04MAR11: Added GC/AT ratio check as ratio cutoff 10 | 11 | =head1 NAME 12 | 13 | trim_fasta_all.pl - removes sequences from a FASTA file 14 | 15 | =head1 VERSION 16 | 17 | Version 0.2 18 | 19 | =head1 SYNOPSIS 20 | 21 | trim_fasta_all.pl [options] 22 | 23 | removes sequences from a FASTA file. See perldoc for more info. 24 | 25 | 'i|fa|fasta=s' => FASTA file to trim. You can also give multiples as arguments without any -i/-fa option. 26 | 'outfile:s' => Optionally, the name of the trimmed outfile 27 | 'blastfile:s' => BLASTFILE to retrieve sequences from 28 | 'blastquery' => grab BLAST queries 29 | 'blasthit' => grab BLAST hits 30 | 'evalue=s' => Evalue cut-off for blastfile (currently broken) 31 | 'c|character=s' => Characters to look for. If present, remove sequence. 
32 | 'le|length=i' => Number of minimum characters 33 | 'p|proportion' => Discard sequences for which a mononucleotide frequency exceeds this proportion 34 | 'ratio' => Discard sequences for which the GC or AT frequency exceeds this ratio 35 | 'x' => Do not include the Xx characters when calculating size of sequence 36 | xdiscard => Discard if these many Xs 37 | 'npl' => Do not include these characters when calculating size: NPLnpl 38 | 'lc|lowercase' => Do not include lowercase characters when calculating size of sequence (e.g. to not include low quality bases) 39 | 'id|idfile=s' => A second FASTA file containing IDs to remove from FASTA file. Alternatively a text file with one ID per line 40 | 'descr' => For above: search description line instead of primary id. 41 | 'ci' => Case insensitivity for above two options 42 | 'invert' => Invert match (invert output filenames) 43 | 'log' => Keep a log file 44 | 'df' => Do not write discarded sequences (less IO) 45 | 'solq' => Input is FASTQ (Solexa 1.3-1.4) 46 | 'sanq' => Input is FASTQ (Sanger) 47 | 'casava18' => Input is Fastq from Casava 1.8 48 | 'single' => Entire output sequence/quality is in a single line (no BioPerl; good for parsing) 49 | 'ghash' => Use a Glib hash table (less memory, slower) 50 | 51 | =head1 DESCRIPTION 52 | 53 | Processes file (-fa) when certain character(s) are present (-c); or a list of IDs is provided (-id); or a certain length-cut off is not satisfied (-le); or a proportion of nucleotide frequence can be specified (-p) instead. The -log option produces a log file reporting what happened to each sequence 54 | The option to not include Xs and/or NPLs and/or lower-case characters in the cut-off calculation is forced with -x and/or -npl and/or -lc respectively. 55 | Uses BioPerl. A disk-friendly function (-df) prevents the FASTA file of discarded sequences of being written. 56 | 57 | =head1 AUTHORS 58 | 59 | Alexie Papanicolaou 1 2 60 | 61 | 1 Max Planck Institute for Chemical Ecology, Germany 62 | 2 Centre for Ecology and Conservation, University of Exeter, UK 63 | alexie@butterflybase.org 64 | 65 | =head1 DISCLAIMER & LICENSE 66 | 67 | This software is released under the GNU General Public License version 3 (GPLv3). 68 | It is provided "as is" without warranty of any kind. 69 | You can find the terms and conditions at http://www.opensource.org/licenses/gpl-3.0.html. 70 | Please note that incorporating the whole software or parts of its code in proprietary software 71 | is prohibited under the current license. 72 | 73 | =head1 BUGS & LIMITATIONS 74 | 75 | None known so far. 
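
=head1 EXAMPLE

An illustrative invocation using only the options documented above (the file
name is a placeholder):

 trim_fasta_all.pl -i sequences.fasta -le 200 -x -log

This removes sequences shorter than 200 characters (with Xs excluded from the
length calculation, per -x) and records what happened to each sequence in
sequences.fasta.trim.log.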
76 | 77 | =cut 78 | use Bio::SeqIO; 79 | use Bio::SearchIO; 80 | use Getopt::Long; 81 | #use Tie::GHash; 82 | use Pod::Usage; 83 | $| = 1; 84 | my ( 85 | $character, @infiles, $length_cutoff, $xmask, $xdiscard, 86 | $nplmask, $ci, $blastfile, $evalue_cutoff, 87 | $lcmask, $prop_cutoff, @idfiles, $log, 88 | $logfile, $invert, $sangerfastq, $blast_hit,$blast_query, 89 | $user_outfile, $df, %ids, $help, 90 | $convert2uc, $descr_flag, $solexafastq, $search_accession, 91 | $seq_search, $single_line, $ratio_cutoff, $ghash, $overwrite, $casava 92 | ); 93 | &GetOptions( 94 | 'i|fa|fasta=s{,}' => \@infiles, 95 | 'blastfile=s' => \$blastfile, 96 | #'evalue=s' => \$evalue_cutoff, 97 | 'c|character=s' => \$character, 98 | 'le|length=i' => \$length_cutoff, 99 | 'p|proportion=f' => \$prop_cutoff, 100 | 'ratio=f' => \$ratio_cutoff, 101 | 'x' => \$xmask, 102 | 'uc|uppercase' => \$convert2uc, 103 | 'npl' => \$nplmask, 104 | 'lc|lowercase' => \$lcmask, 105 | 'ids|idfile=s{,}' => \@idfiles, 106 | 'description' => \$descr_flag, 107 | 'invert' => \$invert, 108 | 'ci' => \$ci, 109 | 'log' => \$log, 110 | 'df' => \$df, 111 | 'h|help' => \$help, 112 | 'solq' => \$solexafastq, 113 | 'sanq' => \$sangerfastq, 114 | 'seq' => \$seq_search, 115 | 'outfile:s' => \$user_outfile, 116 | 'single' => \$single_line, 117 | 'blastquery' => \$blast_query, 118 | 'blasthit' => \$blast_hit, 119 | 'ghash' => \$ghash, 120 | 'overwrite' => \$overwrite, 121 | 'casava18'=>\$casava, 122 | 'xdiscard:i' => \$xdiscard, 123 | #'accessions'=> \$search_accession, 124 | ); 125 | if ($help) { pod2usage; } 126 | @infiles = @ARGV if !@infiles; 127 | unless (@infiles) { 128 | print "Failed to provide or find input file\n"; 129 | pod2usage; 130 | } 131 | tie %ids,'Tie::GHash' if $ghash; 132 | 133 | unless ( $character 134 | || $length_cutoff 135 | || $prop_cutoff || $ratio_cutoff || $xdiscard 136 | || @idfiles 137 | || $blastfile ) 138 | { 139 | die("Nothing to do!\n"); 140 | } 141 | unless ($evalue_cutoff) { $evalue_cutoff = 1; } 142 | my $counter = int(0); 143 | if ($casava){ 144 | $sangerfastq=1; 145 | undef($solexafastq); 146 | } 147 | foreach my $idfile (@idfiles) { 148 | if ( $idfile && -s $idfile ) { 149 | my $pattern; 150 | if ($descr_flag) { $pattern = '^\s*\S+\s+(.+)$'; } 151 | else { $pattern = '^[>@]?\s*(\S+)\s*'; } 152 | my @test_lines = `head $idfile`; 153 | foreach my $test (@test_lines) { 154 | if ( $test =~ /^>/ ) { $pattern = "Bio::SeqIO"; } 155 | } 156 | # my $number = `wc -l < $idfile`; 157 | # chomp($number); 158 | # $number /= 2 if $pattern eq "Bio::SeqIO"; 159 | print "Building hash from $idfile with $pattern\n"; 160 | my $flag; 161 | 162 | if ( $pattern eq "Bio::SeqIO" ) { 163 | my $id_obj = new Bio::SeqIO( -file => $idfile, -format => "fasta" ); 164 | while ( my $object = $id_obj->next_seq() ) { 165 | $counter+=length($object->seq().$object->description().' '.$object->id()) if $object->seq(); 166 | $counter+=length($object->description().' 
'.$object->id()) if !$object->seq(); 167 | if ($seq_search) { $ids{ $object->seq() } = 1; } 168 | elsif ($descr_flag) { $ids{ $object->description() } = 1; } 169 | else { $ids{ $object->id() } = 1; } 170 | $flag = 1 if !$flag; 171 | } 172 | } else { 173 | open( IN, $idfile ) || die(); 174 | while ( my $line = <IN> ) { 175 | $counter+=length($line); 176 | if ($ci) { 177 | if ( $line =~ /$pattern/i ) { 178 | $ids{$1} = 1; 179 | $flag = 1 if !$flag; 180 | } 181 | } else { 182 | if ( $line =~ /$pattern/ ) { 183 | $ids{$1} = 1; 184 | $flag = 1 if !$flag; 185 | } 186 | } 187 | } 188 | close(IN); 189 | } 190 | if ( !$flag ) { die "Failed to get list of IDs to extract...\n"; } 191 | else { 192 | print "Hash presence of $idfile verified\n"; 193 | } 194 | } elsif ($idfile) { 195 | warn "File $idfile is empty or does not exist!\n"; 196 | } 197 | } 198 | if ( $blastfile && -s $blastfile ) { 199 | if ($blast_hit){ 200 | print "Building HASH for queries and hits from $blastfile...\n"; 201 | my @blast_hits = `grep '^>' $blastfile`; 202 | chomp(@blast_hits); 203 | foreach my $blast (@blast_hits) { 204 | #next if $blast=~/^Sbjct|^Query|^Number|^Matrix:|^Gap penalties|^Length|^Database|^BLASTN|^Jinghui|^Database|^programs/i; 205 | $counter++; 206 | $blast =~ /^>(\S+)/; 207 | $ids{$1} = 1; 208 | } 209 | print "Found $counter significant results\n"; 210 | }elsif($blast_query){ 211 | print "Building HASH for queries from $blastfile...\n"; 212 | my @blast_queries = `grep -B 18 '^Sequences producing' $blastfile |grep '^Query='`; 213 | 214 | chomp(@blast_queries); 215 | foreach (@blast_queries) { 216 | next if $_=~/^Sbjct|^Query|^Number|^Matrix:|^Gap penalties|^Length/i; 217 | $counter++; 218 | $_ =~ s/^Query=\s+//; 219 | $ids{$_} = 1; 220 | } 221 | print "Found $counter significant results\n"; 222 | }else{ 223 | die "Please provide -blasthit and/or -blastquery\n"; 224 | } 225 | } 226 | foreach my $file (@infiles) { 227 | &process($file); 228 | } 229 | ##################################################### 230 | sub process ($) { 231 | my $fastafile = shift; 232 | my $fsize = -s $fastafile; 233 | my ( $filein, $fileout, $fileout2); 234 | my $fastafiletrim = "$fastafile.trim"; 235 | $fastafiletrim = $user_outfile if $user_outfile; 236 | my $fastafilediscard = "$fastafile.discard"; 237 | print "Processing... $fastafile as $fastafiletrim && $fastafilediscard\n"; 238 | $fastafilediscard = $user_outfile .
".discard" if $user_outfile; 239 | if (!-s $fastafile){ 240 | warn "File not found, skipping\n"; 241 | return; 242 | }if (-s $fastafiletrim){ 243 | warn "Output file $fastafiletrim already exists\n"; 244 | return unless $overwrite; 245 | } 246 | if ($solexafastq) { 247 | if ($single_line){ 248 | open( IN, $fastafile ) if $single_line; 249 | open( OUT1, ">$fastafiletrim" ) if $single_line; 250 | open( OUT2, ">$fastafilediscard" ) if $single_line; 251 | }else{ 252 | $filein = new Bio::SeqIO( -file => $fastafile, -format => "fastq-solexa" ); 253 | $fileout = new Bio::SeqIO( -file => ">$fastafiletrim", -format => "fastq-solexa" ); 254 | $fileout2 = new Bio::SeqIO( 255 | -file => ">$fastafilediscard", 256 | -format => "fastq-solexa" 257 | ); 258 | } 259 | } elsif ($sangerfastq) { 260 | if ($single_line){ 261 | open( IN, $fastafile ); 262 | open( OUT1, ">$fastafiletrim" ); 263 | open( OUT2, ">$fastafilediscard" ); 264 | }else{ 265 | $filein = new Bio::SeqIO( -file => $fastafile, -format => "fastq" ); 266 | $fileout = new Bio::SeqIO( -file => ">$fastafiletrim", -format => "fastq" ); 267 | $fileout2 = new Bio::SeqIO( -file => ">$fastafilediscard", -format => "fastq" ); 268 | } 269 | } else { 270 | if ($single_line){ 271 | open( IN, $fastafile ) ||die("Cannot open $fastafile\n"); 272 | open( OUT1, ">$fastafiletrim" ); 273 | open( OUT2, ">$fastafilediscard" ); 274 | }else{ 275 | $filein = new Bio::SeqIO( -file => $fastafile, -format => "fasta" ); 276 | $fileout = new Bio::SeqIO( -file => ">$fastafiletrim", -format => "fasta" ); 277 | $fileout2 = new Bio::SeqIO( -file => ">$fastafilediscard", -format => "fasta" ); 278 | } 279 | } 280 | if ($log) { 281 | $logfile = $fastafile . ".trim.log"; 282 | open( LOG, ">$logfile" ); 283 | } 284 | my ( $empty, $discard, $trim ); 285 | $counter = 0; 286 | if ($single_line){ 287 | print "Processing as single line FASTA/Q\n"; 288 | }else{ 289 | my $number=($sangerfastq || $solexafastq) ? `grep -c "^@" $fastafile` : `grep -c "^>" $fastafile`; 290 | chomp($number); 291 | print "$number sequences\n"; 292 | } 293 | my $errors = int(0); 294 | while ( my $object = $single_line ? <IN> : $filein->next_seq() ) { 295 | next if !$object; 296 | $counter=$single_line ? $counter+length($object) : $counter+1; 297 | next if $single_line && $object=~/^\s*$/; 298 | my ( $id, $sequence, $description, $qual, $prefix); 299 | if ($single_line) { 300 | chomp($object); 301 | $object =~ /^(\S)(\S+)\s*(.*)/; 302 | $prefix = $1; 303 | $id = $2; 304 | $description = $3; 305 | if (($casava) && $description=~/(\d)\:[A-Z]\:/){ 306 | $id.='/'.$1; 307 | } 308 | $sequence = <IN>; 309 | $counter+=length($sequence); 310 | chomp($sequence); 311 | my $ok = ($prefix eq '>'||$prefix eq '@' || $prefix eq '+') ? 1 : int(0); 312 | while ($ok != 1){ 313 | $errors++; 314 | warn "Sequence $counter has a header which starts with $1. This does not seem to be right...\n$object\n$sequence\n\nSkipping...\n"; 315 | die "\nToo many errors found\n" if $errors > 20; 316 | $object = $sequence; 317 | chomp($object); 318 | $sequence = <IN>; 319 | $object =~ /^(\S)(\S+)\s*(\S*)/; 320 | $prefix = $1; 321 | $id = $2; 322 | $description = $3; 323 | $ok = ($prefix eq '>'||$prefix eq '@' || $prefix eq '+') ? 1 : int(0); 324 | } 325 | if ( $solexafastq || $sangerfastq ) { 326 | $qual = <IN> . <IN>; 327 | $counter+=length($qual); 328 | chomp($qual); 329 | } 330 | } else { 331 | $id = $object->id(); 332 | $sequence = $object->seq() if ($seq_search); 333 | $description = $object->description() ?
$object->description() : ''; 334 | } 335 | 336 | # trim if given an ID file 337 | if ( @idfiles || $blastfile ) { 338 | if ( $sequence && $seq_search ) { 339 | if ( $ids{$sequence} ) { 340 | unless ( $df && !$invert ) { 341 | if ($single_line) { 342 | if ($qual) { 343 | print OUT2 "@" . "$id\n$sequence\n$qual\n"; 344 | } else { 345 | print OUT2 ">$id"; 346 | print OUT2 " $description" if $description; 347 | print OUT2 "\n"; 348 | print OUT2 "$sequence\n"; 349 | } 350 | } else { 351 | $fileout2->write_seq($object); 352 | } 353 | } 354 | $discard++; 355 | if ($log) { 356 | print LOG "Sequence $id discarded because the Sequence was found in idfiles\n"; 357 | } 358 | #DO get it more than once 359 | #delete($ids{$sequence}); 360 | next; 361 | } else { 362 | next; 363 | } 364 | } elsif ( exists $ids{$id} && $ids{$id}==1) { 365 | unless ( $df && !$invert ) { 366 | if ($single_line) { 367 | if ($qual) { 368 | print OUT2 "@" . "$id\n$sequence\n$qual\n"; 369 | } else { 370 | print OUT2 ">$id"; 371 | print OUT2 " $description" if $description; 372 | print OUT2 "\n"; 373 | print OUT2 "$sequence\n"; 374 | } 375 | } else { 376 | $fileout2->write_seq($object); 377 | } 378 | } 379 | $discard++; 380 | if ($log) { 381 | print LOG "Sequence $id discarded because the ID was found in idfiles\n"; 382 | } 383 | 384 | #make sure we don't get it twice 385 | $ids{$id}=2; 386 | next; 387 | } elsif ( exists $ids{$id}) { 388 | next; 389 | # if id exists multiple times don't write it in any file. 390 | } elsif ( exists $ids{ $id . ' ' . $description } && $ids{ $id . ' ' . $description }==1) { 391 | unless ( $df && !$invert ) { 392 | if ($single_line) { 393 | if ($qual) { 394 | print OUT2 "@" 395 | . $id 396 | . $description 397 | . "\n$sequence\n$qual\n"; 398 | } else { 399 | print OUT2 ">" 400 | . $id 401 | . $description 402 | . "\n$sequence\n"; 403 | } 404 | } else { 405 | $fileout2->write_seq($object); 406 | } 407 | } 408 | $discard++; 409 | if ($log) { 410 | print LOG "Sequence $id.$description discarded because the ID was found in idfiles\n"; 411 | } 412 | 413 | #make sure we don't get it twice 414 | $ids{ $id . ' ' . $description } =2; 415 | next; 416 | } elsif ( exists $ids{ $id . ' ' . $description }) { 417 | next; 418 | } 419 | } 420 | $sequence = $object->seq() if !$sequence; 421 | if ($sequence) { 422 | my $seq2 = $sequence; 423 | if ($xmask) { $seq2 =~ s/[X]//ig; } 424 | if ($nplmask) { $seq2 =~ s/[NPL]//ig; } 425 | if ($lcmask) { $seq2 =~ s/[a-z]//g; } 426 | my $length = length($seq2); 427 | 428 | # trim if given a character(s) 429 | if ($character) { 430 | if ( $sequence =~ /[$character]/ ) { 431 | unless ( $df && !$invert ) { 432 | if ($single_line) { 433 | if ($qual) { 434 | print OUT2 "@" . "$id\n$sequence\n$qual\n"; 435 | } else { 436 | print OUT2 ">$id $description\n$sequence\n"; 437 | 438 | } 439 | } else { 440 | $fileout2->write_seq($object); 441 | } 442 | } 443 | $discard++; 444 | if ($log) { 445 | print LOG 446 | "Sequence $id discarded because character $character was found\n"; 447 | } 448 | next; 449 | } 450 | } 451 | 452 | #trim if given a length cutoff 453 | if ($length_cutoff) { 454 | if ( !$length || $length < $length_cutoff ) { 455 | unless ( $df && !$invert ) { 456 | if ($single_line) { 457 | if ($qual) { 458 | print OUT2 "@" . 
"$id\n$sequence\n$qual\n"; 459 | } else { 460 | print OUT2 ">$id $description\n$sequence\n"; 461 | } 462 | } else { 463 | $fileout2->write_seq($object); 464 | } 465 | } 466 | $discard++; 467 | if ($log) { 468 | print LOG "Sequence $id discarded because length $length was smaller than cutoff $length_cutoff\n"; 469 | } 470 | next; 471 | } 472 | } 473 | # trim if xdiscard 474 | if ($xdiscard){ 475 | my $Xs = ( $sequence =~ tr/X// ); 476 | if ($Xs >= $xdiscard){ 477 | unless ( $df && !$invert ) { 478 | if ($single_line) { 479 | if ($qual) { 480 | print OUT2 "@" . "$id\n$sequence\n$qual\n"; 481 | } else { 482 | print OUT2 ">$id $description\n$sequence\n"; 483 | } 484 | } else { 485 | $fileout2->write_seq($object); 486 | } 487 | } 488 | $discard++; 489 | print LOG "Sequence $id discarded more Xs ($Xs) than allowed ($xdiscard).\n" if $log; 490 | next; 491 | } 492 | } 493 | #trim if given a proportion of A/T/C/G 494 | if ($prop_cutoff || $ratio_cutoff) { 495 | my $As = ( $sequence =~ tr/A// ); 496 | my $Ts = ( $sequence =~ tr/T// ); 497 | my $Cs = ( $sequence =~ tr/C// ); 498 | my $Gs = ( $sequence =~ tr/G// ); 499 | my $Xs = ( $sequence =~ tr/X// ); 500 | my $Ns = ( $sequence =~ tr/N// ); 501 | my $propA = ( $As / $length ); 502 | my $propT = ( $Ts / $length ); 503 | my $propC = ( $Cs / $length ); 504 | my $propG = ( $Gs / $length ); 505 | my $propX = ( $Xs / $length ); 506 | my $propN = ( $Ns / $length ); 507 | my $GCratio = $propG + $propC if $ratio_cutoff; 508 | my $ATratio = 1 - $GCratio if $ratio_cutoff; 509 | if ( $prop_cutoff &&( 510 | $propA > $prop_cutoff 511 | || $propT > $prop_cutoff 512 | || $propX > $prop_cutoff 513 | || $propN > $prop_cutoff 514 | || $propG > $prop_cutoff 515 | || $propC > $prop_cutoff ) 516 | || $ratio_cutoff && ( 517 | $ATratio > $ratio_cutoff 518 | || $GCratio > $ratio_cutoff ) 519 | ) 520 | { 521 | 522 | unless ( $df && !$invert ) { 523 | if ($single_line) { 524 | if ($qual) { 525 | print OUT2 "@" . "$id\n$sequence\n$qual\n"; 526 | } else { 527 | print OUT2 ">$id $description\n$sequence\n"; 528 | } 529 | } else { 530 | $fileout2->write_seq($object); 531 | } 532 | } 533 | $discard++; 534 | if ($log) { 535 | print LOG "Sequence $id discarded because of one nucleotide proportion (A:$propA; T:$propT; G:$propG; C:$propC higher than cutoff $prop_cutoff or GC/AT higher than $ratio_cutoff\n" if $ratio_cutoff && $prop_cutoff; 536 | print LOG "Sequence $id discarded because of GC/AT proportion (A:$propA; T:$propT; G:$propG; C:$propC) higher than $ratio_cutoff\n" if $ratio_cutoff; 537 | print LOG "Sequence $id discarded because of one nucleotide proportion (A:$propA; T:$propT; G:$propG; C:$propC higher than cutoff $prop_cutoff\n" if $prop_cutoff; 538 | } 539 | next; 540 | } 541 | } 542 | 543 | #next has taken care of discards. 544 | $trim++; 545 | if ($convert2uc) { 546 | $object->seq( uc($sequence) ) if !$single_line; 547 | $sequence = uc($sequence) if $single_line; 548 | } 549 | unless ( $df && $invert ) { 550 | if ($single_line) { 551 | if ($qual) { 552 | print OUT1 "@" . 
"$id\n$sequence\n$qual\n"; 553 | } else { 554 | print OUT1 ">$id $description\n$sequence\n"; 555 | } 556 | } else { 557 | $fileout->write_seq($object); 558 | } 559 | } 560 | } #end if $sequence 561 | else { 562 | $empty++; 563 | if ($log) { 564 | print LOG "Sequence $id discard because it was empty\n"; 565 | } 566 | next; 567 | } 568 | } 569 | if ( !$empty ) { $empty = int(0); } 570 | if ( !$discard ) { $discard = int(0); } 571 | if ( !$trim ) { $trim = int(0); } 572 | if ($invert) { 573 | system("mv -i $fastafiletrim tmpfile"); 574 | system("mv $fastafilediscard $fastafiletrim"); 575 | system("mv tmpfile $fastafilediscard"); 576 | my $temp = $trim; 577 | $trim = $discard; 578 | $discard = $temp; 579 | } 580 | unless ( -s "$fastafilediscard" ) { unlink "$fastafilediscard"; } 581 | if ($log) { print LOG "FASTA $fastafile contained ".($empty+$discard+$trim)." sequences\n"; } 582 | print "\nDone, $empty were empty and an additional $discard were discarded. Kept $trim as $fastafiletrim\n"; 583 | if ($log) { print LOG "\n$empty were empty and an additional $discard were discarded. Kept $trim as $fastafiletrim\n"; 584 | } 585 | close(LOG); 586 | } 587 | print "\n"; 588 | -------------------------------------------------------------------------------- /complete.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | params.reference = 'data/genome.fasta' 4 | params.bamfiles = 'data/bams/*.bam' 5 | params.species = 'fungi' 6 | params.minscaffoldsize = 1000 7 | params.maxintronlength = 1000 8 | params.minintronlength = 10 9 | params.cufflinks_overlap_radius = 10 10 | params.cufflinks_pre_mrna_fraction = 0.25 11 | params.cufflinks_min_isoform_fraction = 0.15 12 | 13 | // Remove small contigs. 14 | process remove_small_scaffolds { 15 | container 'genomicpariscentre/bioperl:1.6.924' 16 | 17 | input: 18 | file 'ref.fasta' from file(params.reference) 19 | 20 | output: 21 | file 'ref_trimmed.fasta' into ref_trimmed_for_filter_mito 22 | 23 | "trim_fasta_all.pl -i ref.fasta -out ref_trimmed.fasta -length ${params.minscaffoldsize}" 24 | } 25 | 26 | // We want to remove any scaffolds that show matches to some known 27 | // mitochondrial sequence. For the moment, the process includes a 28 | // download of the P. nodorum mitochondrial sequence. To make the 29 | // search more comprehensive, simply append other sequences ot the 30 | // 'mitorhondrial.fasta' input file. For the moment, we exclude 31 | // sequences that have mitochondrial blast hits to more than 20% of 32 | // their length. 
33 | process filter_mitochondrial { 34 | container 'robsyme/basics' 35 | 36 | input: 37 | file 'ref_trimmed.fasta' from ref_trimmed_for_filter_mito 38 | 39 | output: 40 | file 'nuclear_genome.fasta' into scaffolds_for_repeatmasker 41 | file 'nuclear_genome.fasta' into scaffolds_for_gff2gb 42 | file 'mitochondrial_genome.fasta' into scaffolds_mitochondrial 43 | 44 | """ 45 | curl 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=NC_009746&rettype=fasta&retmode=text' >> mitochondrial.fasta 46 | 47 | makeblastdb -in mitochondrial.fasta -input_type fasta -dbtype nucl 48 | samtools faidx ref_trimmed.fasta 49 | 50 | blastn -query ref_trimmed.fasta -db mitochondrial.fasta -evalue 1 -outfmt '6 qseqid qstart qend qlen' -max_target_seqs 1 \ 51 | | awk 'BEGIN{OFS=\"\\t\"} {print \$1, \$2-1, \$3, \"hit_id_\" idcount++, \$4}' \ 52 | | sort -k1,1 -k2,2n \ 53 | | bedtools merge -i - -c 5 -o mean \ 54 | | bedtools complement -i - -g ref_trimmed.fasta.fai \ 55 | | bedtools genomecov -max 1 -i - -g ref_trimmed.fasta.fai \ 56 | | grep -v '^genome' \ 57 | | tee coverage.txt \ 58 | | awk '\$2 > 0 && \$5 > 0.8 {print \$1}' \ 59 | | xargs samtools faidx ref_trimmed.fasta \ 60 | > nuclear_genome.fasta 61 | 62 | awk '\$2 > 0 && \$5 <= 0.8 {print \$1}' coverage.txt \ 63 | | xargs samtools faidx ref_trimmed.fasta \ 64 | > mitochondrial_genome.fasta 65 | """ 66 | } 67 | 68 | // It's important to mask repetitive sequence before running automated 69 | // gene calling software. Here we use repeatmasker and the Repbase 70 | // database to identify and mask repetitive sequence in the nuclear 71 | // genome. 72 | process repeatmasker { 73 | container 'registry.robsyme.com/repeatmasker' 74 | 75 | input: 76 | file 'ref.fasta' from scaffolds_for_repeatmasker 77 | 78 | output: 79 | file 'ref.fasta.masked' into ref_masked_for_codingquarry 80 | 81 | "RepeatMasker -qq -frag 5000000 -gff -species ${params.species} -no_is ref.fasta" 82 | } 83 | 84 | // The user can supply many bam files from many conditions. For the 85 | // purposes of gene calling, I'm going to merge them into one file for 86 | // ease of handling. Differentiating conditions is of no use to this 87 | // pipeline. 88 | process merge_bams { 89 | container 'robsyme/basics' 90 | 91 | input: 92 | file '*.bam' from Channel.fromPath(params.bamfiles).toList() 93 | 94 | output: 95 | file 'merged.bam' into bam_for_cufflinks 96 | 97 | 'samtools merge merged.bam *.bam' 98 | } 99 | 100 | // We would like to identify potential transcripts using cufflinks 101 | process cufflinks { 102 | container 'robsyme/cufflinks' 103 | 104 | input: 105 | file 'merged.bam' from bam_for_cufflinks 106 | 107 | output: 108 | file 'transcripts.gtf' into transcripts_gtf_for_codingquarry 109 | file 'transcripts.gtf' into transcripts_gtf_for_orf_extraction 110 | 111 | "cufflinks --overlap-radius ${params.cufflinks_overlap_radius} --pre-mrna-fraction ${params.cufflinks_pre_mrna_fraction} --min-isoform-fraction ${params.cufflinks_min_isoform_fraction} --min-intron-length ${params.minintronlength} --max-intron-length ${params.maxintronlength} merged.bam" 112 | } 113 | 114 | // The CodingQuarry denovo gene predictor uses intron/exon boundary 115 | // information to improve the accuracy of gene annotation. 
116 | process codingquarry { 117 | container 'robsyme/codingquarry:1.2' 118 | 119 | input: 120 | file 'ref.fasta' from ref_masked_for_codingquarry 121 | file 'transcripts.gtf' from transcripts_gtf_for_codingquarry 122 | 123 | output: 124 | file 'out/PredictedPass.gff3' into codingquarry_gff_for_gff2gb 125 | 126 | ''' 127 | CufflinksGTF_to_CodingQuarryGFF3.py transcripts.gtf > transcripts.gff 128 | CodingQuarry -f ref.fasta -t transcripts.gff 129 | ''' 130 | } 131 | 132 | process extract_cufflinks_transcripts { 133 | container 'robsyme/basics:0.7' 134 | 135 | input: 136 | file 'ref.fasta' from file(params.reference) 137 | file 'transcripts.gtf' from transcripts_gtf_for_orf_extraction 138 | 139 | output: 140 | file 'transcripts.fasta' into cufflinks_transcripts 141 | file 'transcripts.gff3' into cufflinks_transcripts_gff 142 | 143 | """ 144 | gt gtf_to_gff3 -tidy transcripts.gtf > transcripts_unsorted.gff3 145 | gt gff3 -sort -tidy transcripts_unsorted.gff3 > transcripts.gff3 146 | gt extractfeat -type exon -join -seqfile ref.fasta -matchdescstart transcripts.gff3 > transcripts.fasta 147 | """ 148 | } 149 | 150 | // Generate a fasta file of open reading frames. 151 | process identify_orfs { 152 | container 'robsyme/emboss:6.6.0' 153 | 154 | input: 155 | file 'transcripts.fasta' from cufflinks_transcripts 156 | 157 | output: 158 | file 'transcript_orfs.fasta' into orfs_fasta 159 | 160 | "getorf -sequence transcripts.fasta -outseq transcript_orfs.fasta -minsize 100 -find 0" 161 | } 162 | 163 | process find_pfam_domains_in_transcript_orfs { 164 | container 'robsyme/pfam:28.0' 165 | 166 | input: 167 | file 'orfs.fasta' from orfs_fasta.splitFasta(by: 1000) 168 | 169 | output: 170 | file 'orf.domains' into transcript_orf_domains 171 | 172 | """ 173 | hmmscan -E 1e-5 -o orf.domains /opt/Pfam-A.hmm orfs.fasta 174 | """ 175 | } 176 | 177 | process pfam_output_to_gff { 178 | container 'robsyme/basics:0.7' 179 | 180 | input: 181 | file 'orf.domains' from transcript_orf_domains 182 | file 'transcripts.gff3' from cufflinks_transcripts_gff 183 | 184 | output: 185 | file 'domains.gff3' into pfam_gff_hints 186 | 187 | """ 188 | pfam_to_gff3.rb < orf.domains > orf_domains.gff3 189 | gff_transpose.rb --from orf_domains.gff3 --to transcripts.gff3 > domains.gff3 190 | """ 191 | } 192 | 193 | // The training set for augustus requires that we supply short 194 | // snippets of 'golden' genes which are used for training. Everything 195 | // that is *not* identified as coding sequence is assumed to be 196 | // non-coding. Here we extract each of the genes +/- 200 bp into 197 | // their own genbank file. In cases where genes are separated by less 198 | // than 200 bp, some coding sequence will be included in the neighbor, 199 | // and will be interpreted as 'non-coding' sequence by the augustus 200 | // training algorithm.
A more sensible approach would be to divide the 201 | process gff_to_genbank { 202 | container 'robsyme/augustus:3.1' 203 | 204 | input: 205 | file 'genome.fasta' from scaffolds_for_gff2gb 206 | file 'full_length_genes.gff' from codingquarry_gff_for_gff2gb 207 | 208 | output: 209 | file 'out.gb' into golden_genbank_for_training 210 | 211 | script: 212 | "gff2gbSmallDNA.pl full_length_genes.gff genome.fasta 200 out.gb" 213 | } 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /genemark-annotate.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | params.genome = '**/scaffolds.fasta' 4 | 5 | (strainNames, genomes) = Channel.fromPath(params.genome).separate(2) { path -> [path.getParent().getBaseName(), path] }; 6 | nameAndSequence = strainNames.merge( genomes ) {name, file -> [name, file]} 7 | 8 | process cleanGenome { 9 | input: 10 | set strainName, 'raw.fasta' from nameAndSequence 11 | 12 | output: 13 | set strainName, 'genome.fasta' into cleanGenome 14 | 15 | """ 16 | rename-fasta raw.fasta "${strainName}_scaffold" > genome.fasta 17 | """ 18 | } 19 | 20 | process trainAndCallGenes { 21 | input: 22 | set strainName, 'genome.fasta' from cleanGenome 23 | 24 | output: 25 | set strainName, 'genemark.gtf', 'genome.fasta' into basicGTF 26 | 27 | """ 28 | gmes_petap.pl --ES --fungus --sequence genome.fasta 29 | """ 30 | } 31 | 32 | process gtfToGFF3 { 33 | input: 34 | set strainName, 'genemark.gtf', 'genome.fasta' from basicGTF 35 | 36 | output: 37 | set strainName, 'out.gff3.gz' into renamedAnnotations 38 | 39 | """ 40 | gt gtf_to_gff3 -tidy genemark.gtf \ 41 | | gt gff3 -sort -tidy \ 42 | | rename-gff-ids $strainName > out.gff3 43 | rename-codons $strainName genemark.gtf >> out.gff3 44 | sort -k1,1 -k4,4n out.gff3 > tmp && mv tmp out.gff3 45 | echo "##FASTA" >> out.gff3 46 | awk '/^>/ {print \$0, "[${strainName}]"} !/^>/ {print \$0}' genome.fasta >> out.gff3 47 | gzip --best out.gff3 48 | """ 49 | } 50 | 51 | renamedAnnotations.subscribe { strainName, gff -> 52 | gff.copyTo("${strainName}.gff3.gz") 53 | } 54 | -------------------------------------------------------------------------------- /main.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | params.reference = 'data/genome.fasta' 4 | params.scaffoldmin = 1000 // Minimum scaffold size to consider 5 | params.minsize = 100 // Minimum exon size 6 | params.species = 'fungi' // Name of species passed to RepeatMasker 7 | params.maxintronlength = 500 // Maximum intron length 8 | params.minintronlength = 10 // Minimum intron length 9 | params.bamfiles = 'data/bams/*.bam' 10 | params.pasaconf = 'conf/alignAssembly.conf' // Pasa configuration file to set db name etc. 11 | params.reads = 'data/reads/all.fastq' 12 | reference_raw = file(params.reference) 13 | 14 | // Remove small scaffolds from analysis.
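// As a sketch (hypothetical paths), the scaffold size cut-off and the other defaults above
// can be overridden on the command line when launching the workflow, e.g.:
//   nextflow run main.nf --reference data/mygenome.fasta --bamfiles 'data/bams/*.bam' --scaffoldmin 5000 --species fungi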
15 | process remove_small_scaffolds { 16 | container 'genomicpariscentre/bioperl:1.6.924' 17 | 18 | input: 19 | file 'ref.fasta' from reference_raw 20 | 21 | output: 22 | file 'ref_trimmed.fasta' into ref_trimmed_for_orfs 23 | file 'ref_trimmed.fasta' into ref_trimmed_for_masking 24 | file 'ref_trimmed.fasta' into ref_trimmed_for_softmasking 25 | file 'ref_trimmed.fasta' into ref_trimmed_for_trinity 26 | file 'ref_trimmed.fasta' into ref_trimmed_for_bamtohints 27 | file 'ref_trimmed.fasta' into ref_trimmed_for_pasa 28 | file 'ref_trimmed.fasta' into ref_trimmed_for_gff2gb 29 | file 'ref_trimmed.fasta' into ref_trimmed_for_busco 30 | file 'ref_trimmed.fasta' into ref_trimmed_for_cufflinks 31 | 32 | "trim_fasta_all.pl -i ref.fasta -out ref_trimmed.fasta -length ${params.scaffoldmin}" 33 | } 34 | 35 | process busco { 36 | container 'robsyme/busco' 37 | 38 | input: 39 | file 'ref.fasta' from ref_trimmed_for_busco 40 | 41 | output: 42 | stdout into debug 43 | 44 | "ln -s /opt/busco/lineages/fungi . && busco -in ref.fasta -o custom --lineage fungi" 45 | } 46 | 47 | // Generate a fasta file of open reading frames. 48 | process identify_orfs { 49 | container 'robsyme/emboss' 50 | 51 | input: 52 | file 'ref.fasta' from ref_trimmed_for_orfs 53 | 54 | output: 55 | file 'ref_exons.aa' into orfs_fasta 56 | 57 | "getorf -sequence ref.fasta -outseq ref_exons.aa -minsize 300 -find 0" 58 | } 59 | 60 | // We want to remove ORFs with a high percentage of Xs. Notice that 61 | // the large orfs file is split into pieces containing 1000 fasta 62 | // entries each. 63 | process remove_Xs { 64 | container 'robsyme/bioruby' 65 | 66 | input: 67 | file 'orfs.fasta' from orfs_fasta.splitFasta( by: 5000 ) 68 | 69 | output: 70 | stdout into clean_orfs_for_transposons 71 | stdout into clean_orfs_for_fungi 72 | 73 | """ 74 | #!/usr/bin/env ruby 75 | 76 | require 'bio' 77 | Bio::FlatFile.open('orfs.fasta').each do |entry| 78 | next if entry.length < (${params.minsize} / 3) 79 | x_percentage = entry.seq.composition['X'] / entry.length.to_f 80 | puts entry if x_percentage < 0.3 81 | end 82 | """ 83 | } 84 | 85 | // Run HHblits to identify potential transposons in the (cleaned) open 86 | // reading frames from the 'identify_orfs' step. We run a hhblits 87 | // process for each open reading frame. 
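// A note on the batching used in the two hhblits processes below: csplit writes each record
// of the incoming chunk to files named with its default 'xx' prefix (xx00, xx01, ...), and the
// for loop then runs one hhblits search per ORF file against the database bundled in the container.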
88 | process hhblits_transposon { 89 | container 'robsyme/hhblits-transposon' 90 | 91 | input: 92 | file 'orfs.fasta' from clean_orfs_for_transposons.splitFasta( by: 500 ) 93 | 94 | output: 95 | stdout into hhblits_transposon 96 | 97 | """ 98 | csplit --elide-empty-files --quiet orfs.fasta '/^>/' '{*}' 99 | for orf in xx*; do 100 | hhblits -i \$orf -o stdout -d /databases/transposons -e 1e-5 -E 1e-5 -id 80 -n 2 101 | done 102 | """ 103 | } 104 | 105 | process hhblits_fungi { 106 | container 'robsyme/hhblits-fungi' 107 | 108 | input: 109 | file 'orfs.fasta' from clean_orfs_for_fungi.splitFasta( by: 500 ) 110 | 111 | output: 112 | stdout into hhblits_fungi 113 | 114 | """ 115 | csplit --elide-empty-files --quiet orfs.fasta '/^>/' '{*}' 116 | for orf in xx*; do 117 | hhblits -i \$orf -o stdout -d /databases/fungal_50kclus -e 1e-5 -E 1e-5 -id 80 -n 2 118 | done 119 | """ 120 | } 121 | 122 | //Look at a hhblits output file and generate a gff file of the matches 123 | process parse_transposon_hhr { 124 | cache 'deep' 125 | 126 | input: 127 | file 'search.hhr' from hhblits_transposon.collectFile() 128 | 129 | output: 130 | file 'out.gff3' into hhblits_transposon_gff 131 | 132 | """ 133 | parse_hhr.rb \ 134 | --homology_cutoff 70 \ 135 | --evalue_cutoff 1e-3 \ 136 | --pvalue_cutoff 1e-5 \ 137 | --score_cutoff 100 \ 138 | --align_length_cutoff 50 \ 139 | --template_length_cutoff 30 \ 140 | --repeat \ 141 | search.hhr 142 | """ 143 | } 144 | 145 | process parse_fungi_hhr { 146 | cache 'deep' 147 | 148 | input: 149 | file 'search.hhr' from hhblits_fungi.collectFile() 150 | 151 | output: 152 | file 'out.gff3' into hhblits_fungi_gff 153 | 154 | """ 155 | parse_hhr.rb \ 156 | --homology_cutoff 70 \ 157 | --evalue_cutoff 1e-3 \ 158 | --pvalue_cutoff 1e-5 \ 159 | --score_cutoff 100 \ 160 | --align_length_cutoff 50 \ 161 | --template_length_cutoff 30 \ 162 | search.hhr 163 | """ 164 | } 165 | 166 | process repeatmasker { 167 | container 'repeatmasker' 168 | 169 | input: 170 | file 'ref.fasta' from ref_trimmed_for_masking 171 | 172 | output: 173 | file 'ref.fasta.out.gff' into repeats_gff_for_hints 174 | file 'ref.fasta.out.gff' into repeats_gff_for_softmasking 175 | file 'ref.fasta.masked' into ref_masked_for_golden 176 | file 'ref.fasta.masked' into ref_masked_for_codingquarry 177 | 178 | "RepeatMasker -qq -frag 5000000 -gff -species ${params.species} -no_is ref.fasta" 179 | } 180 | 181 | process repeatmasker_gff_to_hints { 182 | container 'robsyme/bioruby' 183 | 184 | input: 185 | file 'repeats.gff' from repeats_gff_for_hints 186 | 187 | output: 188 | stdout into repeat_hints 189 | 190 | ''' 191 | #!/usr/bin/env ruby 192 | repeats = File.open("repeats.gff", "r") 193 | 194 | while repeats.gets 195 | next if $_ =~ /^#/ 196 | split = $_.split("\t") 197 | split[2] = "nonexonpart" 198 | split[8] = "src=RM;pri=6" 199 | puts split.join("\t") 200 | end 201 | ''' 202 | } 203 | 204 | process softMaskReference { 205 | container 'robsyme/bedtools' 206 | 207 | input: 208 | file 'ref.fasta' from ref_trimmed_for_softmasking 209 | file 'repeats.gff' from repeats_gff_for_softmasking 210 | 211 | output: 212 | file 'ref_softmasked.fasta' into ref_softmasked_for_golden 213 | 214 | "maskFastaFromBed -soft -fi ref.fasta -fo ref_softmasked.fasta -bed repeats.gff" 215 | } 216 | 217 | process merge_bams { 218 | input: 219 | file '*.bam' from Channel.fromPath(params.bamfiles).toList() 220 | 221 | output: 222 | file 'merged.bam' into mapped_reads 223 | file 'merged.bam' into mapped_reads_for_bamtohints 224 | file 'merged.bam' into 
mapped_reads_for_cufflinks 225 | 226 | "samtools merge merged.bam *.bam" 227 | } 228 | 229 | process cufflinks { 230 | container 'robsyme/cufflinks' 231 | 232 | input: 233 | file 'merged.bam' from mapped_reads_for_cufflinks 234 | 235 | output: 236 | file 'transcripts.gtf' into transcriptwtranscripts_gtf_for_codingquarry 237 | 238 | "cufflinks --max-intron-length ${params.maxintronlength} --min-intron-length ${params.minintronlength} merged.bam" 239 | } 240 | 241 | process codingquarry { 242 | container 'robsyme/codingquarry' 243 | 244 | input: 245 | file 'ref.fasta' from ref_masked_for_codingquarry 246 | file 'transcripts.gtf' from transcriptwtranscripts_gtf_for_codingquarry 247 | 248 | output: 249 | file 'out/PredictedPass.gff3' into codingquarry_gff 250 | 251 | ''' 252 | CufflinksGTF_to_CodingQuarryGFF3.py transcripts.gtf > transcripts.gff 253 | CodingQuarry -f ref.fasta -t transcripts.gff 254 | ''' 255 | } 256 | 257 | process split_bams_by_scaffold { 258 | input: 259 | file 'merged.bam' from mapped_reads 260 | 261 | output: 262 | file '*.bam' into split_bams 263 | 264 | """ 265 | samtools index merged.bam && \ 266 | samtools idxstats merged.bam \ 267 | | awk '\$3 > 0 && \$2 > ${params.scaffoldmin} {print \$1}' \ 268 | | xargs -n1 -I{} samtools view -b -o {}.bam merged.bam {} 269 | """ 270 | } 271 | 272 | process genome_guided_trinity { 273 | container 'robsyme/trinity' 274 | 275 | input: 276 | set 'ref.fasta', 'single.bam' from ref_trimmed_for_trinity.spread(split_bams) 277 | 278 | output: 279 | file 'trinity_out_dir/Trinity-GG.fasta' into genome_guided_trinity_split 280 | 281 | "Trinity --genome_guided_bam single.bam --genome_guided_max_intron ${params.maxintronlength} --max_memory 2G --jaccard_clip --CPU 1 --full_cleanup" 282 | } 283 | 284 | process collate_genome_guided_transcripts { 285 | input: 286 | stdin genome_guided_trinity_split.collectFile().map{ it.text } 287 | 288 | output: 289 | stdout into genome_guided_trinity 290 | 291 | ''' 292 | #!/usr/bin/awk -f 293 | /^>/ { 294 | sub(/>GG[0-9]+/, ">GG" count++) 295 | print 296 | } 297 | 298 | /^[^>]/ { 299 | print $0 300 | } 301 | ''' 302 | } 303 | 304 | process denovo_trinity { 305 | container 'robsyme/trinity' 306 | 307 | input: 308 | file 'reads.fastq' from file(params.reads) 309 | 310 | output: 311 | file 'trinity_out_dir.Trinity.fasta' into denovo_trinity 312 | 313 | "Trinity --seqType fq --single reads.fastq --max_memory 2G --CPU 2 --jaccard_clip --full_cleanup" 314 | } 315 | 316 | process bam_to_hints { 317 | container 'robsyme/bedtools' 318 | 319 | input: 320 | file 'ref.fasta' from ref_trimmed_for_bamtohints 321 | file 'all.bam' from mapped_reads_for_bamtohints 322 | 323 | output: 324 | file 'all.bam.junctions.hints' into augustus_hints 325 | 326 | "augustus_RNAseq_hints.pl --genome ref.fasta --bam all.bam" 327 | } 328 | 329 | 330 | // Note that I had to start a separate mysql docker container: docker 331 | // run --name pasadb -e MYSQL_ROOT_PASSWORD=password -e MYSQL_DATABASE=pasa -e MYSQL_USER=pasauser -e MYSQL_PASSWORD=password mysql 332 | process pasa { 333 | container 'robsyme/pasa' 334 | 335 | input: 336 | file 'GG_raw.fasta' from genome_guided_trinity 337 | file 'DN_raw.fasta' from denovo_trinity 338 | file 'ref.fasta' from ref_trimmed_for_pasa 339 | file 'alignAssembly.config' from file(params.pasaconf) 340 | 341 | output: 342 | file '*.assemblies.fasta.transdecoder.pep' into pasa_cds_for_golden 343 | file '*.assemblies.fasta.transdecoder.genome.gff3' into pasa_gff_for_fl 344 | file 
'*.assemblies.fasta.transdecoder.pep' into pasa_cds_for_fl 345 | file 'ref.fasta' into reference_genome 346 | 347 | """ 348 | grep '^>' DN_raw.fasta \ 349 | | awk '{print(substr(\$1, 2))}' \ 350 | > DN_raw.list 351 | 352 | cat DN_raw.fasta GG_raw.fasta > transcripts.fasta 353 | 354 | /opt/pasa/scripts/Launch_PASA_pipeline.pl \ 355 | -c alignAssembly.config \ 356 | --MAX_INTRON_LENGTH ${params.maxintronlength} \ 357 | --stringent_alignment_overlap 30.0 \ 358 | -C \ 359 | -r \ 360 | -R \ 361 | -g ref.fasta \ 362 | -t transcripts.fasta \ 363 | --TDN DN_raw.list \ 364 | --ALIGNERS blat,gmap \ 365 | --TRANSDECODER \ 366 | --CPU 2 367 | 368 | /opt/pasa/scripts/build_comprehensive_transcriptome.dbi \ 369 | -c alignAssembly.config \ 370 | -t transcripts.fasta \ 371 | --min_per_ID 95 \ 372 | --min_per_aligned 30 373 | 374 | /opt/pasa/scripts/pasa_asmbls_to_training_set.dbi \ 375 | --pasa_transcripts_fasta *.assemblies.fasta \ 376 | --pasa_transcripts_gff3 *.pasa_assemblies.gff3 377 | """ 378 | } 379 | 380 | // Pull out the full-length transcripts identified by pasa (and Transdecoder) 381 | process find_full_length_proteins { 382 | container 'robsyme/bioruby' 383 | 384 | input: 385 | stdin pasa_cds_for_golden.map{ it.text } 386 | 387 | output: 388 | stdout into full_pasa_pep_fasta 389 | 390 | """ 391 | #!/usr/bin/env ruby 392 | require 'bio' 393 | 394 | Bio::FlatFile.auto(ARGF).each do |entry| 395 | puts entry if entry.definition =~ /type:complete/ 396 | end 397 | """ 398 | } 399 | 400 | process exclude_partial_genes_from_gff { 401 | container 'robsyme/bioruby' 402 | 403 | input: 404 | file 'hits.gff3' from pasa_gff_for_fl 405 | file 'peptide.fasta' from pasa_cds_for_fl 406 | 407 | output: 408 | stdout into full_length_gff 409 | 410 | ''' 411 | #!/usr/bin/env ruby 412 | require "bio" 413 | require "set" 414 | 415 | full_length_ids = Bio::FlatFile 416 | .open("peptide.fasta") 417 | .find_all{ |entry| entry.definition =~ /type:complete/ } 418 | .map{ |entry| entry.entry_id } 419 | .to_set 420 | 421 | File.open("hits.gff3").each do |line| 422 | next unless line =~ /ID=(cds.)?([^\\|]+)\\|/ 423 | next unless full_length_ids.include?($2) 424 | scaffold_name = line.split("\t").first 425 | puts line 426 | end 427 | ''' 428 | } 429 | 430 | // The input to Augustus training requires that we provide the 431 | // 'golden' annotations as a genbank format, but it's not just any 432 | // genbank format, there are some restrictions. 433 | // 434 | // For the best results, we should remove proteins that are too 435 | // similar. Augusutus will also assume that all nucleotides not 436 | // annotated as coding sequence are non-coding sequence, so we need to 437 | // trim the output to the coding sequence += a small margin either 438 | // side. Note that this is not simply a conversion of gff to genbank. 
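// In the gff2gbSmallDNA.pl call below, the third argument (1000) is the maximum amount of
// gene-flanking DNA to export with each gene, so each training locus should carry up to
// 1 kb of genomic context on either side.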
439 | process gff_to_genbank { 440 | container 'robsyme/augustus' 441 | 442 | input: 443 | file 'genome.fasta' from ref_trimmed_for_gff2gb 444 | file 'full_length_genes.gff' from full_length_gff 445 | 446 | output: 447 | file 'out.gb' into golden_genbank_for_training 448 | 449 | "gff2gbSmallDNA.pl full_length_genes.gff genome.fasta 1000 out.gb" 450 | } 451 | 452 | process train_augustus { 453 | container 'robsyme/augustus' 454 | 455 | input: 456 | file 'custom.gb' from golden_genbank_for_training 457 | 458 | output: 459 | file 'custom.tar.gz' into augustus_trained_parameters 460 | 461 | """ 462 | mkdir -p /opt/augustus/config/species/custom/ 463 | cp /opt/augustus/config/species/generic/generic_parameters.cfg /opt/augustus/config/species/custom/custom_parameters.cfg 464 | cp /opt/augustus/config/species/generic/generic_weightmatrix.txt /opt/augustus/config/species/custom/ 465 | /opt/augustus/bin/etraining --species=custom custom.gb 466 | /opt/augustus/scripts/optimize_augustus.pl --species=custom custom.gb 467 | tar -czvf custom.tar.gz /opt/augustus/config/species/custom 468 | """ 469 | } 470 | 471 | debug.subscribe{ println("DEBUG: $it") } 472 | 473 | 474 | // TODO: Evaluate whether it is at all helpful to supply cufflinks gtf as 'exonpart' hints to augustus. The problem with cufflinks is the concatentation of overlapping transcripts. When those transcripts are from opposite directions, supplying a stranded hint to augustus may prevent the annotation of one of genes that form the fused transcript. 475 | // TODO: Perhaps I can do ORF detection on the cufflinks transcripts and then run those ORFs through pfam and signalP detected domains can be converted into hints for augustus. 476 | 477 | -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | manifest { 2 | homePage = 'http://github.com/robsyme/nextflow-annotation' 3 | description = 'Fungal genome annotation workflow' 4 | mainScript = 'annotate.nf' 5 | } 6 | -------------------------------------------------------------------------------- /proteinortho.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | (f1, f2, f3) = Channel.fromPath(params.fasta).separate(3){ [it,it,it] } 4 | 5 | params.cpus = 1 6 | params.outdir = 'proteinortho_out' 7 | outdir = file(params.outdir) 8 | outdir.mkdirs() 9 | 10 | 11 | process indexGenomes { 12 | container 'robsyme/proteinortho' 13 | storeDir outdir 14 | 15 | input: 16 | file '*' from f1.toList() 17 | 18 | output: 19 | file '*' into db1 20 | file '*' into db2 21 | 22 | "proteinortho5.pl -step=1 *.fasta" 23 | } 24 | 25 | def list = [] 26 | f2.eachWithIndex{ unit, idx -> list.add(idx) } 27 | 28 | process runBlasts { 29 | container 'robsyme/proteinortho' 30 | storeDir outdir 31 | 32 | input: 33 | file '*' from db1 34 | file "*" from f2.toList() 35 | each index from list[0..-3] 36 | 37 | output: 38 | file 'myproject.*' into blastresults 39 | 40 | "proteinortho5.pl -verbose -step=2 -startat=$index -stopat=$index -cpus=${params.cpus} *.fasta" 41 | } 42 | 43 | process performClustering { 44 | container 'robsyme/proteinortho' 45 | storeDir outdir 46 | 47 | input: 48 | file '*' from blastresults 49 | file '*' from db2 50 | file '*' from f3.toList() 51 | 52 | output: 53 | file 'myproject.*' into proteinortho_out 54 | 55 | "proteinortho5.pl -step=3 -singles -verbose *.fasta" 56 | } 57 | 58 | 
proteinortho_out.flatten().subscribe{ println("Proteinortho output file: $it") } 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /simple-annotate.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | genome = file(params.genome) 4 | cegmaFile = file(params.cegma) 5 | strainName = genome.getParent().getBaseName() 6 | outFilename = params.out 7 | 8 | process cleanGenome { 9 | input: 10 | genome 11 | 12 | output: 13 | stdout into cleanGenomes 14 | 15 | script: 16 | ''' 17 | awk '/^>/ && !/[.*]/ {print(\$0, "[$strainName]")} /^>/ && /[.*]/ {print \$0} /^[^>]/ {print(toupper(\$0))}' '$genome' | sed "s/\015//" 18 | ''' 19 | } 20 | 21 | (fastaForGFF, fastaForAug) = cleanGenomes.separate(2){ [it, it] } 22 | 23 | process cegmaGFFtoFullerGFF { 24 | input: 25 | file 'cegmaFile' from cegmaFile 26 | 27 | output: 28 | stdout fullGFF 29 | 30 | ''' 31 | fullerCegmaGFF.rb $cegmaFile 32 | ''' 33 | } 34 | 35 | process cegmaGFFToGenbank { 36 | container 'robsyme/augustus' 37 | 38 | input: 39 | file gff from fullGFF 40 | file fasta from fastaForGFF 41 | 42 | output: 43 | file 'out.gb' into trainingGenbank 44 | 45 | ''' 46 | gff2gbSmallDNA.pl $gff $fasta 5000 out.gb 47 | ''' 48 | } 49 | 50 | process trainAndCallGenes { 51 | container 'robsyme/augustus' 52 | 53 | input: 54 | file trainingGenbank 55 | file genome from fastaForAug 56 | 57 | output: 58 | file 'out.txt' into trainedFile 59 | 60 | ''' 61 | optimize_augustus.pl --species=fusarium_graminearum $trainingGenbank 62 | etraining --species=fusarium_graminearum $trainingGenbank 63 | augustus --species=fusarium_graminearum --gff3=on $genome > out.txt 64 | ''' 65 | } 66 | 67 | trainedFile.subscribe { trained -> 68 | trained.copyTo(outFilename) 69 | } 70 | 71 | --------------------------------------------------------------------------------
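As a usage sketch for simple-annotate.nf (hypothetical paths), the genome, the CEGMA GFF and the output filename are supplied as parameters, with the strain name taken from the genome's parent directory, e.g.:

 nextflow run simple-annotate.nf --genome strains/strainA/scaffolds.fasta --cegma strains/strainA/cegma.gff --out strainA.gff3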