├── hosts
├── external_programs
    └── trf409.linux64
├── install_dependencies
    ├── repbase_password.yaml
    ├── tasks
    │   ├── ik_perl.yaml
    │   ├── hmmer.yaml
    │   ├── rmblast.yaml
    │   ├── exonerate.yaml
    │   ├── snap.yaml
    │   ├── genemark.yaml
    │   ├── augustus.yaml
    │   ├── maker.yaml
    │   └── repeatmasker.yaml
    └── install_gene_prediction_dependencies.yaml
├── maker_opts
    ├── maker_exe.ctl
    ├── maker_bopts.ctl
    └── maker_opts.ctl
├── README.md
└── run_gene_predictions.sh


/hosts:
--------------------------------------------------------------------------------
localhost ansible_connection=local


--------------------------------------------------------------------------------
/external_programs/trf409.linux64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guyleonard/gene_prediction_pipeline/HEAD/external_programs/trf409.linux64


--------------------------------------------------------------------------------
/install_dependencies/repbase_password.yaml:
--------------------------------------------------------------------------------
$ANSIBLE_VAULT;1.1;AES256
62666531393432656430623763656531613766656366626139636465336134333561383166363233
3336623364373964303662653763346239303466666165350a356662616333626431663231623566
66353031303734343438366665643861613461633933613061623163633363316361373434623935
3430393337383633650a616431316434313536393932313231383531336337663539316431353932
39663965353663666630306435373462613931656466353361626666336566303634


--------------------------------------------------------------------------------
/install_dependencies/tasks/ik_perl.yaml:
--------------------------------------------------------------------------------
# Installs the KorfLab Perl_utils modules (*.pm) and scripts (*.pl).
# Requires the `source_dir` variable.
- name: Cloning KorfLab Perl_utils
  git: repo=https://github.com/KorfLab/Perl_utils.git dest="{{source_dir}}/Perl_utils" clone=yes update=yes

- name: Installing Perl Modules
  copy:
    src: "{{item}}"
    dest: /usr/lib/perl5
    mode: 0644
  with_fileglob:
    - "{{source_dir}}/Perl_utils/*.pm"

- name: Installing Perl Scripts
  copy:
    src: "{{item}}"
    dest: /usr/local/bin
    mode: 0755
  with_fileglob:
    - "{{source_dir}}/Perl_utils/*.pl"

- name: Changing root user:group to cs02gl:cs02gl
  file: path="{{source_dir}}/Perl_utils" owner=cs02gl group=cs02gl recurse=yes


--------------------------------------------------------------------------------
/install_dependencies/tasks/hmmer.yaml:
--------------------------------------------------------------------------------
# Downloads the pre-built HMMER release and copies its binaries to
# /usr/local/bin. Requires `version`, `source_dir` and `software_dir`.
- name: Downloading HMMER v{{version}}
  get_url: url="http://eddylab.org/software/hmmer3/{{version}}/hmmer-{{version}}-linux-intel-x86_64.tar.gz" dest="{{source_dir}}/hmmer-{{version}}-linux-intel-x86_64.tar.gz"

# `creates` has to name a file that exists after extraction, otherwise the
# archive is unpacked again on every run. It previously pointed at
# binaries/hmmer.
# NOTE(review): assumes binaries/hmmsearch ships in this tarball - confirm.
- name: Uncompress HMMER
  unarchive:
    src="{{source_dir}}/hmmer-{{version}}-linux-intel-x86_64.tar.gz"
    dest="{{software_dir}}"
    copy=no
    creates="{{software_dir}}/hmmer-{{version}}-linux-intel-x86_64/binaries/hmmsearch"

- name: Installing HMMER
  copy:
    src: "{{item}}"
    dest: /usr/local/bin
    mode: 0755
  with_fileglob:
    - "{{software_dir}}/hmmer-{{version}}-linux-intel-x86_64/binaries/*"

- name: Changing root user:group to cs02gl:cs02gl
  file: path="{{software_dir}}/hmmer-{{version}}-linux-intel-x86_64" owner=cs02gl group=cs02gl recurse=yes



--------------------------------------------------------------------------------
/install_dependencies/tasks/rmblast.yaml:
--------------------------------------------------------------------------------
# Downloads RMBlast 2.2.28 and installs the rmblastn binary.
# The download is retried up to 5 times, 10 seconds apart.
- name: Download RMBlast
  get_url:
    url="ftp://ftp.ncbi.nlm.nih.gov/blast/executables/rmblast/2.2.28/ncbi-rmblastn-2.2.28-x64-linux.tar.gz"
    dest="{{source_dir}}/ncbi-rmblastn-2.2.28-x64-linux.tar.gz"
    force_basic_auth=yes
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 5
  delay: 10

- name: Uncompress RMBlast
  unarchive:
    src="{{source_dir}}/ncbi-rmblastn-2.2.28-x64-linux.tar.gz"
    dest="{{software_dir}}"
    copy=no
    creates="{{software_dir}}/ncbi-rmblastn-2.2.28/LICENSE"

# Task name corrected: the destination is /usr/local/bin, not /usr/bin.
- name: Copy rmblastn to /usr/local/bin
  copy: src="{{software_dir}}/ncbi-rmblastn-2.2.28/bin/rmblastn" dest=/usr/local/bin/rmblastn mode=0755

- name: Changing root user:group to cs02gl:cs02gl
  file: path="{{software_dir}}/ncbi-rmblastn-2.2.28" owner=cs02gl group=cs02gl recurse=yes



--------------------------------------------------------------------------------
/install_dependencies/tasks/exonerate.yaml:
--------------------------------------------------------------------------------
# Downloads the pre-built Exonerate 2.2.0 release and copies its binaries to
# /usr/local/bin. The download is retried up to 5 times, 10 seconds apart.
- name: Download Exonerate
  get_url:
    url="http://ftp.ebi.ac.uk/pub/software/vertebrategenomics/exonerate/exonerate-2.2.0-x86_64.tar.gz"
    dest="{{source_dir}}/exonerate-2.2.0-x86_64.tar.gz"
    force_basic_auth=yes
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 5
  delay: 10

- name: Uncompress Exonerate
  unarchive:
    src="{{source_dir}}/exonerate-2.2.0-x86_64.tar.gz"
    dest="{{software_dir}}"
    copy=no
    creates="{{software_dir}}/exonerate-2.2.0-x86_64/bin/exonerate"

- name: Installing Exonerate
  copy:
    src: "{{item}}"
    dest: /usr/local/bin
    mode: 0755
  with_fileglob:
    - "{{software_dir}}/exonerate-2.2.0-x86_64/bin/*"

- name: Changing root user:group to cs02gl:cs02gl
  file: path="{{software_dir}}/exonerate-2.2.0-x86_64/" owner=cs02gl group=cs02gl recurse=yes


--------------------------------------------------------------------------------
/install_dependencies/install_gene_prediction_dependencies.yaml:
--------------------------------------------------------------------------------
# Top-level playbook: creates the software/source directories, then includes
# one task file per tool. Tags allow a subset of tools to be installed.
- hosts: all
  vars:
    - source_dir: /home/cs02gl/programs/.source
    - software_dir: /home/cs02gl/programs

  pre_tasks:
    - name: Create Software Dir
      file: path={{software_dir}} state=directory
    - name: Create Software Source Dir
      file: path={{source_dir}} state=directory

  tasks:
    ## SNAP and Perl_libs
    - include: tasks/snap.yaml version=2013-11-29
      tags: snap
    - include: tasks/ik_perl.yaml
      tags: perl


    ## GENEMARK
    - include: tasks/genemark.yaml
      tags: genemark


    ## MAKER
    # RMBLAST
    - include: tasks/rmblast.yaml
      tags: repeatmasker, rmblast, maker

    # REPEATMASKER
    - include: tasks/repeatmasker.yaml version=4-0-6
      tags: repeatmasker, maker

    # HMMER
    - include: tasks/hmmer.yaml version=3.1b2
      tags: hmmer, repeatmasker, maker

    # EXONERATE
    - include: tasks/exonerate.yaml
      tags: exonerate, maker

    # MAKER
    - include: tasks/maker.yaml
      tags: maker


    ## AUGUSTUS
    - include: tasks/augustus.yaml
      tags: augustus


--------------------------------------------------------------------------------
/maker_opts/maker_exe.ctl:
--------------------------------------------------------------------------------
#-----Location of Executables Used by MAKER/EVALUATOR
makeblastdb=/usr/bin/makeblastdb #location of NCBI+ makeblastdb executable
blastn=/usr/bin/blastn #location of NCBI+ blastn executable
blastx=/usr/bin/blastx #location of NCBI+ blastx executable
tblastx=/usr/bin/tblastx #location of NCBI+ tblastx executable
formatdb=/usr/bin/formatdb #location of NCBI formatdb executable
blastall=/usr/bin/blastall #location of NCBI blastall executable
xdformat= #location of WUBLAST xdformat executable
blasta= #location of WUBLAST blasta executable
RepeatMasker=/usr/local/bin/RepeatMasker #location of RepeatMasker executable
exonerate=/usr/local/bin/exonerate #location of exonerate executable

#-----Ab-initio Gene Prediction Algorithms
snap=/usr/local/bin/snap #location of snap executable
gmhmme3=/home/cs02gl/programs/gm_et_linux_64/gmes_petap/gmhmme3 #location of eukaryotic genemark executable
gmhmmp= #location of prokaryotic genemark executable
augustus=/usr/local/bin/augustus #location of augustus executable
fgenesh= #location of fgenesh executable
tRNAscan-SE= #location of trnascan executable
snoscan= #location of snoscan executable

#-----Other Algorithms
probuild=/home/cs02gl/programs/gm_et_linux_64/gmes_petap/probuild #location of probuild executable (required for genemark)


--------------------------------------------------------------------------------
/maker_opts/maker_bopts.ctl:
--------------------------------------------------------------------------------
#-----BLAST and Exonerate Statistics Thresholds
blast_type=ncbi+ #set to 'ncbi+', 'ncbi' or 'wublast'

pcov_blastn=0.8 #Blastn Percent Coverage Threshold EST-Genome Alignments
pid_blastn=0.85 #Blastn Percent Identity Threshold EST-Genome Alignments
eval_blastn=1e-10 #Blastn eval cutoff
bit_blastn=40 #Blastn bit cutoff
depth_blastn=0 #Blastn depth cutoff (0 to disable cutoff)

pcov_blastx=0.5 #Blastx Percent Coverage Threshold Protein-Genome Alignments
pid_blastx=0.4 #Blastx Percent Identity Threshold Protein-Genome Alignments
eval_blastx=1e-06 #Blastx eval cutoff
bit_blastx=30 #Blastx bit cutoff
depth_blastx=0 #Blastx depth cutoff (0 to disable cutoff)

pcov_tblastx=0.8 #tBlastx Percent Coverage Threshold alt-EST-Genome Alignments
pid_tblastx=0.85 #tBlastx Percent Identity Threshold alt-EST-Genome Alignments
eval_tblastx=1e-10 #tBlastx eval cutoff
bit_tblastx=40 #tBlastx bit cutoff
depth_tblastx=0 #tBlastx depth cutoff (0 to disable cutoff)

pcov_rm_blastx=0.5 #Blastx Percent Coverage Threshold For Transposable Element Masking
pid_rm_blastx=0.4 #Blastx Percent Identity Threshold For Transposable Element Masking
eval_rm_blastx=1e-06 #Blastx eval cutoff for transposable element masking
bit_rm_blastx=30 #Blastx bit cutoff for transposable element masking

ep_score_limit=20 #Exonerate protein percent of maximal score threshold
en_score_limit=20 #Exonerate nucleotide percent of maximal score threshold


--------------------------------------------------------------------------------
/install_dependencies/tasks/snap.yaml:
--------------------------------------------------------------------------------
# Downloads SNAP, builds it with make, symlinks its executables into
# /usr/local/bin and copies its Perl scripts there.
# Requires `version`, `source_dir` and `software_dir`.
- name: Downloading SNAP v{{version}}
  get_url: url="http://korflab.ucdavis.edu/Software/snap-{{version}}.tar.gz" dest="{{source_dir}}/snap-{{version}}.tar.gz"
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 5
  delay: 10

- name: Uncompress SNAP v{{version}}
  unarchive:
    src="{{source_dir}}/snap-{{version}}.tar.gz"
    dest="{{software_dir}}"
    copy=no
    creates="{{software_dir}}/snap/LICENSE"

- name: SNAP - Running make
  make: chdir="{{software_dir}}/snap"

- name: Linking snap
  file: src="{{software_dir}}/snap/snap" dest=/usr/local/bin/snap state=link

- name: Linking fathom
  file: src="{{software_dir}}/snap/fathom" dest=/usr/local/bin/fathom state=link

- name: Linking forge
  file: src="{{software_dir}}/snap/forge" dest=/usr/local/bin/forge state=link

- name: Linking hmm-info
  file: src="{{software_dir}}/snap/hmm-info" dest=/usr/local/bin/hmm-info state=link

- name: Linking exonpairs
  file: src="{{software_dir}}/snap/exonpairs" dest=/usr/local/bin/exonpairs state=link

- name: Installing Scripts
  copy:
    src: "{{item}}"
    dest: /usr/local/bin
    mode: 0755
  with_fileglob:
    - "{{software_dir}}/snap/*.pl"

- name: Changing root user:group to cs02gl:cs02gl
  file: path="{{software_dir}}/snap" owner=cs02gl group=cs02gl recurse=yes


--------------------------------------------------------------------------------
/install_dependencies/tasks/genemark.yaml:
--------------------------------------------------------------------------------
# Downloads GeneMark-ES/ET and its licence key, unpacks both, installs the key
# as /home/cs02gl/.gm_key and installs the Perl modules listed at the end.
- name: Downloading GeneMark-ES / ET v.4.32
  get_url: url=http://topaz.gatech.edu/GeneMark/tmp/GMtool_QHwSL/gm_et_linux_64.tar.gz dest="{{source_dir}}/gm_et_linux_64.tar.gz"
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 5
  delay: 10

- name: Downloading GeneMark-ES / ET v.4.32 License Key
  get_url: url=http://topaz.gatech.edu/GeneMark/tmp/GMtool_QHwSL/gm_key_64.gz dest="{{source_dir}}/gm_key_64.gz"
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 5
  delay: 10

- name: Uncompress GeneMark
  unarchive:
    src="{{source_dir}}/gm_et_linux_64.tar.gz"
    dest="{{software_dir}}"
    copy=no
    creates="{{software_dir}}/gm_et_linux_64/gmes_petap/INSTALL"

# `creates` skips this task once the key has been decompressed, so re-runs no
# longer depend on ignore_errors (which also hid genuine gunzip failures).
- name: Uncompress GeneMark License
  shell: gunzip gm_key_64.gz
  args:
    chdir: "{{source_dir}}"
    creates: "{{source_dir}}/gm_key_64"

- name: Copy License to User
  copy: src="{{source_dir}}/gm_key_64" dest=/home/cs02gl/.gm_key owner=cs02gl group=cs02gl

- name: Changing root user:group to cs02gl:cs02gl
  file: path="{{software_dir}}/gm_et_linux_64/" owner=cs02gl group=cs02gl recurse=yes

- name: Making Sure cpanminus is Installed
  apt: name=cpanminus state=present

- name: Installing Perl Dependencies
  cpanm: name={{item}}
  with_items:
    - Logger::Simple
    - Hash::Merge
    - Parallel::ForkManager


--------------------------------------------------------------------------------
/install_dependencies/tasks/augustus.yaml:
--------------------------------------------------------------------------------
# Downloads Augustus, installs its apt build dependencies, builds and installs
# it, symlinks its scripts into /usr/local/bin and records
# AUGUSTUS_CONFIG_PATH in the user's .bashrc.
# The extracted directory name (augustus-3.2.2) is hard-coded in every path
# below although the download is "augustus.current".
- name: Downloading Augustus
  get_url: url="http://bioinf.uni-greifswald.de/augustus/binaries/augustus.current.tar.gz" dest="{{source_dir}}/augustus.current.tar.gz"
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 5
  delay: 10

- name: Install Apt Dependencies
  apt: name="{{item}}" state=present
  with_items:
    - libboost-iostreams-dev
    - zlib1g-dev
    - bamtools
    - libbamtools-dev

- name: Uncompress Augustus
  unarchive:
    src="{{source_dir}}/augustus.current.tar.gz"
    dest="{{software_dir}}"
    copy=no
    creates="{{software_dir}}/augustus-3.2.2/README.TXT"

- name: Augustus - Running make
  make: chdir="{{software_dir}}/augustus-3.2.2"

- name: Augustus - Running make install
  make: chdir="{{software_dir}}/augustus-3.2.2" target=install

- name: Installing Augustus Scripts
  file:
    src: "{{software_dir}}/augustus-3.2.2/scripts/{{item}}"
    dest: "/usr/local/bin/{{item}}"
    state: link
  with_lines: ls "{{software_dir}}/augustus-3.2.2/scripts"

# NOTE(review): this export only lives for the duration of this one shell
# task; the persistent setting is the .bashrc line added by the next task.
- name: Add export AUGUSTUS_CONFIG_PATH
  shell: "export AUGUSTUS_CONFIG_PATH={{software_dir}}/augustus-3.2.2/config"

- name: Adding AUGUSTUS_CONFIG_PATH to .bashrc
  lineinfile: dest=/home/cs02gl/.bashrc line="export AUGUSTUS_CONFIG_PATH={{software_dir}}/augustus-3.2.2/config" backup=yes

- name: Changing root user:group to cs02gl:cs02gl
  file: path="{{software_dir}}/augustus-3.2.2" owner=cs02gl group=cs02gl recurse=yes


--------------------------------------------------------------------------------
/install_dependencies/tasks/maker.yaml:
--------------------------------------------------------------------------------
# Installs MAKER 2.31.8: CPAN and apt prerequisites, download, unpack,
# `perl Build.PL` / `./Build install`, then symlinks of bin/ and lib/ entries
# into /usr/local.
- name: Install Dependencies from CPAN
  cpanm: name={{item}}
  with_items:
    - Bio::Perl
    - Bit::Vector
    - DBD::SQLite
    - DBI
    - Error
    - Error::Simple
    - File::NFSLock
    - File::Which
    - forks
    - forks::shared
    - Inline
    - Inline::C
    - IO::All
    - IO::Prompt
    - Perl::Unsafe::Signals
    - PerlIO::gzip
    - Proc::ProcessTable
    - Proc::Simple
    - threads
    - URI::Escape

- name: Install Dependencies from Apt
  apt: name={{item}}
  with_items:
    - ncbi-blast+
    - ncbi-blast+-legacy

- name: Download MAKER
  get_url:
    url="http://yandell.topaz.genetics.utah.edu/maker_downloads/8AB0/DB9A/F7F4/3DE5C1FC9204F76F8491CC138C45/maker-2.31.8.tgz"
    dest="{{source_dir}}/maker-2.31.8.tgz"
    force_basic_auth=yes
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 5
  delay: 10

- name: Uncompress MAKER
  unarchive:
    src="{{source_dir}}/maker-2.31.8.tgz"
    dest="{{software_dir}}"
    copy=no
    creates="{{software_dir}}/maker/INSTALL"

- name: configure
  shell: perl Build.PL
  args:
    chdir: "{{software_dir}}/maker/src"

- name: build install
  shell: ./Build install
  args:
    chdir: "{{software_dir}}/maker/src"

- name: Installing Maker binaries
  file:
    src: "{{software_dir}}/maker/bin/{{item}}"
    dest: "/usr/local/bin/{{item}}"
    state: link
  with_lines: ls "{{software_dir}}/maker/bin/"

- name: Installing Maker Libraries
  file:
    src: "{{software_dir}}/maker/lib/{{item}}"
    dest: "/usr/local/lib/{{item}}"
    state: link
  with_lines: ls "{{software_dir}}/maker/lib/"
  ignore_errors: yes

- name: Changing root user:group to cs02gl:cs02gl
  file: path="{{software_dir}}/maker" owner=cs02gl group=cs02gl recurse=yes


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Gene Prediction Workflow

A suggested workflow for predicting genes from the assembly of your favourite genome.

![SAGA Workflow](https://github.com/guyleonard/single_cell_workflow/blob/master/images/gene_prediction.png)

This really is just a suggestion, and this script is a little bit hard-coded and messy - it needs a lot of updating/reconfiguration if you want to use it.

## Install Dependencies

An [Ansible](https://www.ansible.com/) playbook is provided to install the software.

You can call the playbook to install like this:

    ansible-playbook install_gene_prediction_dependencies.yaml --sudo -K -c local -i "localhost," --ask-vault-pass

There are also tags so you can install one or many components in one go:

    ansible-playbook install_gene_prediction_dependencies.yaml --sudo -K -c local -i "localhost," --ask-vault-pass --tags repeatmasker,hmmer

### RepeatMasker Libraries
RepeatMasker libraries require the user to obtain a username and password for access to [Repbase](http://www.girinst.org/repbase/). You should do this now, and make sure you also update the download link in the repeatmasker.yaml - unfortunately RepBase do not seem to keep links to previous versions live - I despair.

For Ansible installation the password is stored in an Ansible 'vault' file. This file is also password protected, so the RepeatMasker install will not work for any external users of this repo. You will therefore need to make your own vault, containing your own password, with this command:

    ansible-vault create repbase_password.yaml

and add your password like so:
```yaml
---
repbase_password: PASSWORD
```

Your username is in the repeatmasker.yaml taskbook.



### rmblast
rmblast won't currently download with Ansible 2.1.1.0 as there's something up with ftp downloads, so you will have to manually download it yourself and place it in the .source dir.!?

### trf
You have to click a ridiculous web form button to get an in-browser download. What is this, the '90s? In the meantime, I will just distribute the file here. (╯°□°)╯︵ ┻━┻

### genemark
Yet another case of seriously outdated and pointless software download/license models. You have to fill in a web form, agree to a non-standard licence and then get given a temporary download location. $&\*! This bad, bad practice has got to stop. It is not 1996 anymore. Crikey.

### genemark licence
It has a 400 day licence. After that you need to get a new one. :|

## Other Dependencies
1. [pigz](http://zlib.net/pigz/) - Parallel GZIP
2. tee - GNU Core
3. time - *nix Core


--------------------------------------------------------------------------------
/install_dependencies/tasks/repeatmasker.yaml:
--------------------------------------------------------------------------------
# Installs RepeatMasker plus the password-protected Repbase libraries and
# symlinks the RepeatMasker tools into /usr/local/bin.
# Requires `version`, `source_dir`, `software_dir` and the vaulted
# `repbase_password` loaded by the first task.
- include_vars: "{{playbook_dir}}/repbase_password.yaml"

# The local file name now matches the edition in the URL. It used to be saved
# as repeatmaskerlibraries-20160829.tar.gz, so an archive left over from the
# older edition would have been reused instead of downloading this one.
- name: Download RepeatMasker Repbase Library
  get_url:
    url: "http://www.girinst.org/server/RepBase/protected/repeatmaskerlibraries/RepBaseRepeatMaskerEdition-20170127.tar.gz"
    url_username: guyleonard
    url_password: "{{repbase_password}}"
    force_basic_auth: yes
    dest: "{{source_dir}}/RepBaseRepeatMaskerEdition-20170127.tar.gz"

- name: Download RepeatMasker {{version}}
  get_url:
    url="http://www.repeatmasker.org/RepeatMasker-open-{{version}}.tar.gz"
    dest="{{source_dir}}/RepeatMasker-open-{{version}}.tar.gz"
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 5
  delay: 10

- name: Uncompress RepeatMasker {{version}}
  unarchive:
    src="{{source_dir}}/RepeatMasker-open-{{version}}.tar.gz"
    dest="{{software_dir}}"
    copy=no
    creates="{{software_dir}}/RepeatMasker/INSTALL"

- name: Uncompress Repbase Libraries
  unarchive:
    src="{{source_dir}}/RepBaseRepeatMaskerEdition-20170127.tar.gz"
    dest="{{software_dir}}/RepeatMasker"
    copy=no
    creates="{{software_dir}}/RepeatMasker/Libraries/README"

- name: Changing root user:group to cs02gl:cs02gl
  file: path="{{software_dir}}/RepeatMasker" owner=cs02gl group=cs02gl recurse=yes

# needs command input - got to do this one yourself for now
#- name: Configure RepeatMasker
#  shell: perl ./configure
#  args:
#    chdir: "{{software_dir}}/RepeatMasker/"

- name: Linking DupMasker
  file: src="{{software_dir}}/RepeatMasker/DupMasker" dest=/usr/local/bin/DupMasker state=link

# Destination corrected: this link used to be written to
# /usr/local/bin/DupMasker, replacing the DupMasker link created just above
# and leaving RepeatProteinMask uninstalled.
- name: Linking RepeatProteinMask
  file: src="{{software_dir}}/RepeatMasker/RepeatProteinMask" dest=/usr/local/bin/RepeatProteinMask state=link

- name: Linking ProcessRepeats
  file: src="{{software_dir}}/RepeatMasker/ProcessRepeats" dest=/usr/local/bin/ProcessRepeats state=link

- name: Linking DateRepeats
  file: src="{{software_dir}}/RepeatMasker/DateRepeats" dest=/usr/local/bin/DateRepeats state=link

# Task name corrected (was a second "Linking DupMasker").
- name: Linking RepeatMasker
  file: src="{{software_dir}}/RepeatMasker/RepeatMasker" dest=/usr/local/bin/RepeatMasker state=link


# This program is included for my own use, they hide it behind a web-download CGI button
# and some non-standard "licence" terms. Super annoying in 2016. Super annoying in 1999.
# external_programs/ sits at the repository root, next to (not inside) the
# playbook directory install_dependencies/, so the source path has to step up
# one level from playbook_dir.
- name: Copy TRF to /usr/local/bin
  copy: src="{{playbook_dir}}/../external_programs/trf409.linux64" dest=/usr/local/bin/trf mode=0755


--------------------------------------------------------------------------------
/maker_opts/maker_opts.ctl:
--------------------------------------------------------------------------------
#-----Genome (these are always required)
genome= #genome sequence (fasta file or fasta embedded in GFF3 file)
organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic

#-----Re-annotation Using MAKER Derived GFF3
maker_gff= #MAKER derived GFF3 file
est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no

#-----EST Evidence (for best results provide a file for at least one)
est= #set of ESTs or assembled mRNA-seq in fasta format
altest= #EST/cDNA sequence file in fasta format from an alternate organism
est_gff= #aligned ESTs or mRNA-seq from an external GFF3 file
altest_gff= #aligned ESTs from a closely related species in GFF3 format

#-----Protein Homology Evidence (for best results provide a file for at least one)
protein= #protein sequence file in fasta format (i.e. from multiple organisms)
protein_gff= #aligned protein homology evidence from an external GFF3 file

#-----Repeat Masking (leave values blank to skip repeat masking)
model_org=all #select a model organism for RepBase masking in RepeatMasker
rmlib= #provide an organism specific repeat library in fasta format for RepeatMasker
repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
rm_gff= #pre-identified repeat elements from an external GFF3 file
prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)

#-----Gene Prediction
snaphmm= #SNAP HMM file
gmhmm= #GeneMark HMM file
augustus_species= #Augustus gene prediction species model
fgenesh_par_file= #FGENESH parameter file
pred_gff= #ab-initio predictions from an external GFF3 file
model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
est2genome=0 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no
trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no

#-----Other Annotation Feature Types (features MAKER doesn't recognize)
other_gff= #extra features to pass-through to final MAKER generated GFF3 file

#-----External Application Behavior Options
alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)

#-----MAKER Behavior Options
max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
min_contig=1 #skip genome contigs below this length (under 10kb are often useless)

pred_flank=200 #flank for extending evidence clusters sent to gene predictors
pred_stats=0 #report AED and QI statistics for all predictions as well as models
AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
min_protein=0 #require at least this many amino acids in predicted proteins
alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)

split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
single_length=250 #min length required for single exon ESTs if 'single_exon is enabled'
correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes

tries=2 #number of times to try a contig if there is a failure for some reason
clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
TMP= #specify a directory other than the system default temporary directory for temporary files


--------------------------------------------------------------------------------
/run_gene_predictions.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Guy Leonard MMXVI
#
# Gene prediction driver. Run it from the directory that holds the sample
# directories. For every sub-directory it runs, in order:
#   SNAP 1 (trained from the CEGMA GFF) -> GeneMark-ES -> MAKER run 1
#   -> SNAP 2 (retrained from MAKER run 1) -> Augustus training (autoAug.pl)
#   -> MAKER run 2, followed by gff3_merge / fasta_merge.
# All output is written below <sample>/raw_illumina_reads/GENE_PREDS.

# Number of processor cores passed to GeneMark (--cores) and MAKER (cpus=).
# The script used to declare THREADS=8 without ever using it and hard-coded
# 24 in both places; the default keeps that effective value. Override with
# e.g. `THREADS=8 ./run_gene_predictions.sh`.
THREADS="${THREADS:-24}"

# Dependency Checks
#command -v pigz >/dev/null 2>&1 || { echo "I require pigz but it's not installed. Aborting." >&2; exit 1;}
#command -v blastn >/dev/null 2>&1 || { echo "I require BLASTn but it's not installed. Aborting." >&2; exit 1;}
#command -v multiqc >/dev/null 2>&1 || { echo "I require MultiQC but it's not installed. Aborting." >&2; exit 1;}



# Working Directory
WD="$(pwd)"
echo "Working Directory: $WD"
# Script Dir (where the maker_opts/ control-file templates live)
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Get dirnames for current Single Cell Library
# Locations of FASTQs = Sample_**_***/raw_illumina_reads/
for DIRS in "$WD"/*; do

  if [ -d "${DIRS}" ]; then
    echo "Working in ${DIRS}"

    GENE_DIR="$DIRS/raw_illumina_reads/GENE_PREDS"

    SAMPLE_NAME="$(basename "$DIRS")"
    echo "Sample: $SAMPLE_NAME"

    ## CEGMA
    # Already run, files are in CEGMA
    CEGMA_DIR="$DIRS/raw_illumina_reads/CEGMA"
    CEGMA_GFF="$CEGMA_DIR/cegma.cegma.gff"

    ## GENOME = scaffold
    GENOME="$DIRS/raw_illumina_reads/SPADES/overlapped_and_paired/scaffolds.fasta"

    # Every step below reads one or both of these files, so skip directories
    # that do not have them instead of creating empty output trees in them.
    if [ ! -f "${CEGMA_GFF}" ] || [ ! -f "${GENOME}" ]; then
      echo "Skipping ${DIRS}: missing ${CEGMA_GFF} or ${GENOME}" >&2
      continue
    fi

    mkdir -p "$GENE_DIR"


    ## SNAP 1
    SNAP1_DIR="$GENE_DIR/SNAP1"
    mkdir -p "$SNAP1_DIR"
    # Abort rather than write output into whatever directory we happen to be in.
    cd "$SNAP1_DIR" || exit 1
    cegma2zff "${CEGMA_GFF}" "${GENOME}" | tee snap.log
    fathom genome.ann genome.dna -categorize 1000 | tee -a snap.log
    fathom -export 1000 -plus uni.ann uni.dna | tee -a snap.log
    forge export.ann export.dna | tee -a snap.log
    # stdout is the HMM itself, so there is nothing left to pipe into tee.
    hmm-assembler.pl "${GENOME}" . > cegma_snap.hmm
    cd ../


    ## GeneMark
    GENEMARK_DIR="$GENE_DIR/GENEMARK"
    mkdir -p "$GENEMARK_DIR"
    cd "$GENEMARK_DIR" || exit 1
    # setting minimum gene prediction to lower than default 300 - just in case!
    # setting minimum contig to 1000bp as the 50Kbp is quite high for SAGs
    gmes_petap.pl --ES --cores "${THREADS}" --min_gene_prediction 100 --min_contig 1000 --sequence "${GENOME}" | tee genemark.log
    cd ../


    ## MAKER 1
    MAKER_DIR="$GENE_DIR/MAKER"
    mkdir -p "$MAKER_DIR"
    cd "$MAKER_DIR" || exit 1
    # Other Maker Option Files
    cp "$SCRIPT_DIR/maker_opts/maker_bopts.ctl" "$MAKER_DIR"
    cp "$SCRIPT_DIR/maker_opts/maker_exe.ctl" "$MAKER_DIR"

    # Maker Options
    {
      echo "genome=${GENOME}"
      echo "organism_type=eukaryotic"
      echo "model_org=all"
      echo "softmask=1"
      echo "snaphmm=$SNAP1_DIR/cegma_snap.hmm"
      echo "gmhmm=$GENEMARK_DIR/output/gmhmm.mod"
      echo "min_contig=100"
      echo "keep_preds=1"
      echo "cpus=${THREADS}"
    } > "$MAKER_DIR/maker_opts_1.ctl"
    # -f so that a link left over from a previous run does not make ln fail.
    ln -sf "$MAKER_DIR/maker_opts_1.ctl" "$MAKER_DIR/maker_opts.ctl"

    # The genome path is supplied through maker_opts.ctl. The old command line
    # also expanded ${genome}, which was never set and so was always empty.
    maker -base run_1 | tee maker_run_1.log

    gff3_merge -d "$MAKER_DIR/run_1.maker.output/run_1_master_datastore_index.log"
    mv run_1.all.gff maker_run_1.all.gff
    MAKER_GFF="$MAKER_DIR/maker_run_1.all.gff"
    cd ../

    ## SNAP 2
    SNAP2_DIR="$GENE_DIR/SNAP2"
    mkdir -p "$SNAP2_DIR"
    cd "$SNAP2_DIR" || exit 1

    maker2zff -n "${MAKER_GFF}" | tee snap.log
    fathom genome.ann genome.dna -categorize 1000 | tee -a snap.log
    fathom -export 1000 -plus uni.ann uni.dna | tee -a snap.log
    forge export.ann export.dna | tee -a snap.log
    hmm-assembler.pl "${GENOME}" . > maker_snap_2.hmm
    SNAP_ZFF="$SNAP2_DIR/genome.ann"
    cd ../


    ## AUGUSTUS
    AUGUSTUS_DIR="$GENE_DIR/AUGUSTUS"
    mkdir -p "$AUGUSTUS_DIR"
    cd "$AUGUSTUS_DIR" || exit 1
    zff2gff3.pl "$SNAP_ZFF" | perl -plne 's/\t(\S+)$/\t\.\t$1/' >snap2_genome.gff
    SNAP2_GENOME="$AUGUSTUS_DIR/snap2_genome.gff"
    autoAug.pl --genome="$GENOME" --species="$SAMPLE_NAME" --trainingset="$SNAP2_GENOME" --singleCPU --noutr -v --useexisting | tee autoAug.log
    cd ../

    ## MAKER 2
    cd "$MAKER_DIR" || exit 1

    # Maker Options
    {
      echo "genome=${GENOME}"
      echo "organism_type=eukaryotic"
      echo "model_org=all"
      echo "snaphmm=$SNAP2_DIR/maker_snap_2.hmm"
      echo "gmhmm=$GENEMARK_DIR/output/gmhmm.mod"
      echo "augustus_species=$SAMPLE_NAME"
      #echo "rm_gff=$MAKER_DIR/maker_run_1.all.gff" # previous maker run for repeat masks to save time
      echo "min_contig=50"
      echo "pred_stats=1"
      echo "min_protein=20"
      echo "alt_splice=1"
      echo "keep_preds=1"
      echo "evaluate=1"
      echo "cpus=${THREADS}"
    } > "$MAKER_DIR/maker_opts_2.ctl"

    # Repoint maker_opts.ctl at the second-pass options (replaces rm + ln -s).
    ln -sf "$MAKER_DIR/maker_opts_2.ctl" "$MAKER_DIR/maker_opts.ctl"

    maker -base run_2 | tee maker_run_2.log

    ## Collate GFF3 + FASTA
    gff3_merge -d "$MAKER_DIR/run_2.maker.output/run_2_master_datastore_index.log"
    mv run_2.all.gff maker_run_2.all.gff

    fasta_merge -d "$MAKER_DIR/run_2.maker.output/run_2_master_datastore_index.log"

    cd ../

  fi
done

--------------------------------------------------------------------------------