├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md └── workflows │ ├── main.yml │ ├── python-publish.yml │ ├── release-triggered.yml │ ├── release.yml │ ├── singularity-deploy.yml │ └── triggered-build.yml ├── .gitignore ├── .readthedocs.yaml ├── .zenodo.json ├── CITATION.cff ├── Dockerfile ├── Dockerfile2 ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── Singularity ├── docs ├── .gitignore ├── Makefile ├── annotate.rst ├── commands.rst ├── compare.rst ├── conda.rst ├── conf.py ├── databases.rst ├── dependencies.rst ├── docker.rst ├── evidence.rst ├── index.rst ├── install.rst ├── make.bat ├── manual.rst ├── predict.rst ├── prepare.rst ├── requirements.txt ├── tutorials.rst ├── update.rst └── utilities.rst ├── funannotate-docker ├── funannotate-logo.png ├── funannotate-podman ├── funannotate ├── __init__.py ├── __version__.py ├── annotate.py ├── aux_scripts │ ├── augustus_parallel.py │ ├── enrichment_parallel.py │ ├── fasta2agp.py │ ├── filterGenemark.pl │ ├── filterIntronsFindStrand.pl │ ├── funannotate-BUSCO2-py2.py │ ├── funannotate-BUSCO2.py │ ├── funannotate-p2g.py │ ├── funannotate-runEVM.py │ ├── genemark_gtf2gff3.pl │ ├── getEggNog.sh │ ├── hmmer_parallel.py │ ├── iprscan-local.py │ ├── iprscan2annotations.py │ ├── pal2nal.pl │ ├── phobius-multiproc.py │ ├── phobius-remote.pl │ ├── runIPRscan.py │ ├── sam2bam.sh │ ├── tbl2asn_parallel.py │ ├── trinity.py │ ├── trnascan2gff3.pl │ └── xmlcombine.py ├── check.py ├── clean.py ├── compare.py ├── config │ ├── EOG092C0B3U.prfl │ ├── TruSeq3-PE.fa │ ├── TruSeq3-SE.fa │ ├── busco_test.fa │ ├── codeml.config │ ├── extrinsic.E.XNT.RM.cfg │ ├── smcogs.txt │ ├── test.sbt │ └── tf_interpro.txt ├── database.py ├── downloads.json ├── fix.py ├── funannotate.py ├── html_template │ ├── css │ │ ├── bootstrap.min.css │ │ └── starter-template.css │ └── js │ │ ├── bootstrap.min.js │ │ ├── ie-emulation-modes-warning.js │ │ ├── ie10-viewport-bug-workaround.js │ │ └── jquery.min.js ├── interlap.py ├── iprscan.py ├── library.py ├── mask.py ├── outgroups.py ├── predict.py ├── remote.py ├── resources.py ├── setupDB.py ├── sort.py ├── species.py ├── stackedBarGraph.py ├── test.py ├── train.py ├── update.py └── utilities │ ├── __init__.py │ ├── bam2gff3.py │ ├── contrast.py │ ├── gbk2parts.py │ ├── get_longest_isoform.py │ ├── gff2prot.py │ ├── gff2tbl.py │ ├── gff_reformat.py │ ├── quarry2gff3.py │ ├── stats.py │ ├── stringtie2gff3.py │ └── tbl2gbk.py └── setup.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | **Are you using the latest release?** 8 | If you are not using the latest release of funannotate, please upgrade, if bug persists then report here. 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **What command did you issue?** 14 | Copy/paste the command used. 15 | 16 | **Logfiles** 17 | Please provide relavent log files of the error. 18 | 19 | **OS/Install Information** 20 | - output of `funannotate check --show-versions` 21 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI to Docker Hub funannotate-slim 4 | 5 | # Controls when the action will run. 
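# NOTE: a push to master builds nextgenusfs/funannotate-slim:latest from ./Dockerfile and, once pushed, fires a
# "docker-hub-complete" repository dispatch so that triggered-build.yml can build the full nextgenusfs/funannotate image.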
6 | on: 7 | # Triggers the workflow on push or pull request events but only for the master branch 8 | push: 9 | branches: [ master ] 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 15 | jobs: 16 | # This workflow contains a single job called "build" 17 | build: 18 | # The type of runner that the job will run on 19 | runs-on: ubuntu-latest 20 | 21 | steps: 22 | 23 | - name: Check Out Repo 24 | uses: actions/checkout@v2 25 | 26 | - name: Login to Docker Hub 27 | uses: docker/login-action@v1 28 | with: 29 | username: ${{ secrets.DOCKER_HUB_USERNAME }} 30 | password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} 31 | 32 | - name: Set up Docker Buildx 33 | id: buildx 34 | uses: docker/setup-buildx-action@v1 35 | 36 | - name: Build and push 37 | id: docker_build 38 | uses: docker/build-push-action@v2 39 | with: 40 | context: ./ 41 | file: ./Dockerfile 42 | push: true 43 | tags: nextgenusfs/funannotate-slim:latest 44 | 45 | - name: Image digest 46 | run: echo ${{ steps.docker_build.outputs.digest }} 47 | 48 | - name: Repository Dispatch 49 | uses: peter-evans/repository-dispatch@v1 50 | with: 51 | token: ${{ secrets.REPO_ACCESS_TOKEN }} 52 | repository: nextgenusfs/funannotate 53 | event-type: docker-hub-complete 54 | client-payload: '{"ref": "${{ github.ref }}", "sha": "${{ github.sha }}"}' 55 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [ published ] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/release-triggered.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: Release funannotate to Docker Hub 4 | 5 | # Controls when the action will run. 
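# NOTE: this workflow is started by the "docker-hub-release-complete" dispatch sent from release.yml after
# nextgenusfs/funannotate-slim has been tagged and pushed; it then builds the full nextgenusfs/funannotate:<tag>
# image from ./Dockerfile2 using the latest release tag.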
6 | on: 7 | repository_dispatch: 8 | types: [docker-hub-release-complete] 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 14 | jobs: 15 | # This workflow contains a single job called "build" 16 | build: 17 | # The type of runner that the job will run on 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - name: Check Out Repo 22 | uses: actions/checkout@v2 23 | 24 | - name: Get release 25 | id: get_release 26 | uses: kaliber5/action-get-release@v1 27 | with: 28 | token: ${{ github.token }} 29 | latest: true 30 | 31 | - name: Login to Docker Hub 32 | uses: docker/login-action@v1 33 | with: 34 | username: ${{ secrets.DOCKER_HUB_USERNAME }} 35 | password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} 36 | 37 | - name: Set up Docker Buildx 38 | id: buildx 39 | uses: docker/setup-buildx-action@v1 40 | 41 | - name: Build and push 42 | id: docker_build 43 | uses: docker/build-push-action@v2 44 | with: 45 | context: ./ 46 | file: ./Dockerfile2 47 | push: true 48 | tags: nextgenusfs/funannotate:${{ steps.get_release.outputs.tag_name }} 49 | 50 | - name: Image digest 51 | run: echo ${{ steps.docker_build.outputs.digest }} 52 | 53 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: Release funannotate-slim to Docker Hub 4 | 5 | # Controls when the action will run. 6 | on: 7 | release: 8 | types: [created] 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 14 | jobs: 15 | # This workflow contains a single job called "build" 16 | build: 17 | # The type of runner that the job will run on 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - name: Check Out Repo 22 | uses: actions/checkout@v2 23 | 24 | - name: Get release 25 | id: get_release 26 | uses: kaliber5/action-get-release@v1 27 | with: 28 | token: ${{ github.token }} 29 | latest: true 30 | 31 | - name: Test git release scrapper 32 | id: test_release 33 | run: echo ${{ steps.get_release.outputs.tag_name }} 34 | 35 | - name: Login to Docker Hub 36 | uses: docker/login-action@v1 37 | with: 38 | username: ${{ secrets.DOCKER_HUB_USERNAME }} 39 | password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} 40 | 41 | - name: Set up Docker Buildx 42 | id: buildx 43 | uses: docker/setup-buildx-action@v1 44 | 45 | - name: Build and push 46 | id: docker_build 47 | uses: docker/build-push-action@v2 48 | with: 49 | context: ./ 50 | file: ./Dockerfile 51 | push: true 52 | tags: nextgenusfs/funannotate-slim:${{ steps.get_release.outputs.tag_name }} 53 | 54 | - name: Image digest 55 | run: echo ${{ steps.docker_build.outputs.digest }} 56 | 57 | - name: Repository Dispatch 58 | uses: peter-evans/repository-dispatch@v1 59 | with: 60 | token: ${{ secrets.REPO_ACCESS_TOKEN }} 61 | repository: nextgenusfs/funannotate 62 | event-type: docker-hub-release-complete 63 | client-payload: '{"ref": "${{ github.ref }}", "sha": "${{ github.sha }}"}' 64 | -------------------------------------------------------------------------------- /.github/workflows/singularity-deploy.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to 
help you get started with Actions 2 | 3 | name: CI Singularity 4 | 5 | # Controls when the action will run. 6 | on: 7 | repository_dispatch: 8 | types: [singularity-ready] 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 14 | jobs: 15 | deploy: 16 | name: Deploy to Singularity 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v1 22 | 23 | - uses: chrnorm/deployment-action@releases/v1 24 | name: Create GitHub deployment 25 | id: deployment 26 | with: 27 | token: ${{ secrets.REPO_ACCESS_TOKEN }} 28 | environment: production 29 | 30 | 31 | - name: Update deployment status (success) 32 | if: success() 33 | uses: chrnorm/deployment-status@releases/v1 34 | with: 35 | token: ${{ secrets.REPO_ACCESS_TOKEN }} 36 | state: "success" 37 | deployment_id: ${{ steps.deployment.outputs.deployment_id }} 38 | 39 | - name: Update deployment status (failure) 40 | if: failure() 41 | uses: chrnorm/deployment-status@releases/v1 42 | with: 43 | token: ${{ secrets.REPO_ACCESS_TOKEN }} 44 | state: "failure" 45 | deployment_id: ${{ steps.deployment.outputs.deployment_id }} -------------------------------------------------------------------------------- /.github/workflows/triggered-build.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI to Docker Hub funannotate 4 | 5 | # Controls when the action will run. 6 | on: 7 | repository_dispatch: 8 | types: [docker-hub-complete] 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 14 | jobs: 15 | # This workflow contains a single job called "build" 16 | build: 17 | # The type of runner that the job will run on 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | 22 | - name: Check Out Repo 23 | uses: actions/checkout@v2 24 | 25 | - name: Login to Docker Hub 26 | uses: docker/login-action@v1 27 | with: 28 | username: ${{ secrets.DOCKER_HUB_USERNAME }} 29 | password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} 30 | 31 | - name: Set up Docker Buildx 32 | id: buildx 33 | uses: docker/setup-buildx-action@v1 34 | 35 | - name: Build and push 36 | id: docker_build 37 | uses: docker/build-push-action@v2 38 | with: 39 | context: ./ 40 | file: ./Dockerfile2 41 | push: true 42 | tags: nextgenusfs/funannotate:latest 43 | 44 | - name: Image digest 45 | run: echo ${{ steps.docker_build.outputs.digest }} 46 | 47 | - name: Repository Dispatch 48 | uses: peter-evans/repository-dispatch@v1 49 | with: 50 | token: ${{ secrets.REPO_ACCESS_TOKEN }} 51 | repository: nextgenusfs/funannotate 52 | event-type: singularity-ready 53 | client-payload: '{"ref": "${{ github.ref }}", "sha": "${{ github.sha }}"}' 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.pyc 3 | */.DS_Store 4 | */*.pyc 5 | dockerbuild/ 6 | sample_data/ 7 | .DS_Store 8 | funannotate.egg-info 9 | .idea 10 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See 
https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # We recommend specifying your dependencies to enable reproducible builds: 19 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 20 | python: 21 | install: 22 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "name": "Jonathan M. Palmer", 5 | "affiliation": "USDA Forest Service" 6 | }, 7 | { 8 | "name": "Jason E. Stajich", 9 | "affiliation": "UC Riverside" 10 | } 11 | ], 12 | "description": "funannotate is a pipeline for genome annotation (built specifically for fungi, but will also work with higher eukaryotes).", 13 | "keywords": [ 14 | "genome", 15 | "annotation", 16 | "software" 17 | ], 18 | "license": "BSD-2", 19 | "title": "Funannotate: a pipeline for eukaryotic genome annotation" 20 | } 21 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: Funannotate 6 | message: >- 7 | If you use this software, please cite it using the 8 | metadata from this file. 9 | type: software 10 | authors: 11 | - given-names: Jonathan M. 12 | family-names: Palmer 13 | email: nextgenusfs@gmail.com 14 | affiliation: USDA Forest Service 15 | orcid: 'https://orcid.org/0000-0003-0929-3658' 16 | - given-names: Jason E. 17 | family-names: Stajich 18 | email: jason.stajich@ucr.edu 19 | orcid: 'https://orcid.org/0000-0002-7591-0020' 20 | affiliation: University of California-Riverside 21 | identifiers: 22 | - type: url 23 | value: 'https://funannotate.readthedocs.io/' 24 | description: ReadTheDocs documentation 25 | - type: doi 26 | value: 10.5281/zenodo.1134477 27 | description: Zenodo archive of Funannotate software 28 | repository-code: 'https://github.com/nextgenusfs/funannotate' 29 | repository-artifact: 'https://doi.org/10.5281/zenodo.1134477' 30 | abstract: >- 31 | Funannotate is a genome prediction, annotation, and 32 | comparison software package. It was originally written to 33 | annotate fungal genomes (small eukaryotes ~ 30 Mb 34 | genomes), but has evolved over time to accomodate larger 35 | genomes. The impetus for this software package was to be 36 | able to accurately and easily annotate a genome for 37 | submission to NCBI GenBank. 
38 | keywords: 39 | - bioinformatics 40 | - genome annotation 41 | - genomics 42 | - gene prediction 43 | license: BSD-2-Clause 44 | version: 1.8.16 45 | date-released: '2023-08-22' 46 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # start with miniconda3 as build environment 2 | FROM condaforge/mambaforge AS build 3 | 4 | # Update, install mamba and conda-pack: 5 | RUN mamba install -n base --yes conda-pack 6 | 7 | # Install funannotate deps from bioconda 8 | # here specifying specific versions to be able to set ENV below 9 | RUN mamba create -c conda-forge -c bioconda -c defaults \ 10 | -n funannotate --yes "python>=3.6,<3.9" "biopython<1.80" xlrd==1.2.0 \ 11 | "trinity==2.8.5" "evidencemodeler==1.1.1" "pasa==2.4.1" "codingquarry==2.0" \ 12 | "proteinortho==6.0.16" goatools matplotlib-base natsort numpy pigz \ 13 | pandas psutil requests "scikit-learn<1.0.0" scipy seaborn "blast=2.2.31" \ 14 | tantan bedtools hmmer exonerate "diamond>=2.0.5" tbl2asn blat "trnascan-se>=2.0" \ 15 | ucsc-pslcdnafilter trimmomatic raxml iqtree trimal "mafft>=7" hisat2 \ 16 | "kallisto==0.46.1" minimap2 stringtie "salmon>=0.9" "samtools>=1.9" \ 17 | glimmerhmm bamtools perl perl-yaml perl-file-which perl-local-lib perl-dbd-mysql perl-clone perl-hash-merge \ 18 | perl-soap-lite perl-json perl-logger-simple perl-scalar-util-numeric perl-math-utils perl-mce \ 19 | perl-text-soundex perl-parallel-forkmanager perl-db-file perl-perl4-corelibs ete3 distro \ 20 | && conda clean -a -y 21 | 22 | # Since we want the most recent, install from repo, remove snap as broken 23 | SHELL ["conda", "run", "-n", "funannotate", "/bin/bash", "-c"] 24 | RUN python -m pip install git+https://github.com/nextgenusfs/funannotate.git 25 | 26 | # package with conda-pack 27 | RUN conda-pack --ignore-missing-files -n funannotate -o /tmp/env.tar && \ 28 | mkdir /venv && cd /venv && tar xf /tmp/env.tar && \ 29 | rm /tmp/env.tar 30 | 31 | # We've put venv in same path it'll be in final image 32 | RUN /venv/bin/conda-unpack 33 | 34 | # Now build environment 35 | FROM debian:buster AS runtime 36 | 37 | # Copy /venv from the previous stage: 38 | COPY --from=build /venv /venv 39 | 40 | # Install debian snap via apt-get 41 | RUN apt-get update && apt-get install -y snap augustus augustus-data locales locales-all libgl1 procps && \ 42 | rm -rf /var/lib/apt/lists/* && \ 43 | ln -s /usr/bin/snap-hmm /usr/bin/snap && \ 44 | rm "/venv/bin/fasta" && \ 45 | ln -s "/venv/bin/fasta36" "/venv/bin/fasta" 46 | 47 | # add it to the PATH and add env variables 48 | ENV PATH="/venv/bin:$PATH" \ 49 | AUGUSTUS_CONFIG_PATH="/usr/share/augustus/config" \ 50 | EVM_HOME="/venv/opt/evidencemodeler-1.1.1" \ 51 | PASAHOME="/venv/opt/pasa-2.4.1" \ 52 | TRINITYHOME="/venv/opt/trinity-2.8.5" \ 53 | QUARRY_PATH="/venv/opt/codingquarry-2.0/QuarryFiles" \ 54 | ZOE="/usr/share/snap" \ 55 | USER="me" \ 56 | FUNANNOTATE_DB="/opt/databases" 57 | 58 | # When image is run, run the code with the environment 59 | SHELL ["/bin/bash", "-c"] 60 | CMD funannotate 61 | -------------------------------------------------------------------------------- /Dockerfile2: -------------------------------------------------------------------------------- 1 | FROM nextgenusfs/funannotate-slim 2 | 3 | # install databases 4 | RUN funannotate setup -i all --wget -b dikarya microsporidia embryophyta metazoa arthropoda vertebrata protists tetrapoda 5 | 6 | # When image is run, 
run the code with the environment 7 | SHELL ["/bin/bash", "-c"] 8 | CMD funannotate 9 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2016, Jonathan M. Palmer 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.md 3 | include docs/* 4 | include funannotate/aux_scripts/* 5 | include funannotate/config/* 6 | include funannotate/utilities/* 7 | include funannotate/html_template/* 8 | include funannotate/html_template/css/* 9 | include funannotate/html_template/js/* 10 | include scripts/* 11 | include funannotate/downloads.json 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Latest Github release](https://img.shields.io/github/release/nextgenusfs/funannotate.svg)](https://github.com/nextgenusfs/funannotate/releases/latest) 2 | [![DOI](https://zenodo.org/badge/48254740.svg)](https://zenodo.org/badge/latestdoi/48254740) 3 | ![Conda](https://img.shields.io/conda/dn/bioconda/funannotate) 4 | ![Docker Image Size (tag)](https://img.shields.io/docker/image-size/nextgenusfs/funannotate/latest) 5 | ![Docker Pulls](https://img.shields.io/docker/pulls/nextgenusfs/funannotate) 6 | [![https://www.singularity-hub.org/static/img/hosted-singularity--hub-%23e32929.svg](https://www.singularity-hub.org/static/img/hosted-singularity--hub-%23e32929.svg)](https://singularity-hub.org/collections/5068) 7 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/funannotate/README.html) 8 | [![European Galaxy 
server](https://img.shields.io/badge/usegalaxy-.eu-brightgreen?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAASCAYAAABB7B6eAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAACXBIWXMAAAsTAAALEwEAmpwYAAACC2lUWHRYTUw6Y29tLmFkb2JlLnhtcAAAAAAAPHg6eG1wbWV0YSB4bWxuczp4PSJhZG9iZTpuczptZXRhLyIgeDp4bXB0az0iWE1QIENvcmUgNS40LjAiPgogICA8cmRmOlJERiB4bWxuczpyZGY9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyMiPgogICAgICA8cmRmOkRlc2NyaXB0aW9uIHJkZjphYm91dD0iIgogICAgICAgICAgICB4bWxuczp0aWZmPSJodHRwOi8vbnMuYWRvYmUuY29tL3RpZmYvMS4wLyI+CiAgICAgICAgIDx0aWZmOlJlc29sdXRpb25Vbml0PjI8L3RpZmY6UmVzb2x1dGlvblVuaXQ+CiAgICAgICAgIDx0aWZmOkNvbXByZXNzaW9uPjE8L3RpZmY6Q29tcHJlc3Npb24+CiAgICAgICAgIDx0aWZmOk9yaWVudGF0aW9uPjE8L3RpZmY6T3JpZW50YXRpb24+CiAgICAgICAgIDx0aWZmOlBob3RvbWV0cmljSW50ZXJwcmV0YXRpb24+MjwvdGlmZjpQaG90b21ldHJpY0ludGVycHJldGF0aW9uPgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KD0UqkwAAAn9JREFUOBGlVEuLE0EQruqZiftwDz4QYT1IYM8eFkHFw/4HYX+GB3/B4l/YP+CP8OBNTwpCwFMQXAQPKtnsg5nJZpKdni6/6kzHvAYDFtRUT71f3UwAEbkLch9ogQxcBwRKMfAnM1/CBwgrbxkgPAYqlBOy1jfovlaPsEiWPROZmqmZKKzOYCJb/AbdYLso9/9B6GppBRqCrjSYYaquZq20EUKAzVpjo1FzWRDVrNay6C/HDxT92wXrAVCH3ASqq5VqEtv1WZ13Mdwf8LFyyKECNbgHHAObWhScf4Wnj9CbQpPzWYU3UFoX3qkhlG8AY2BTQt5/EA7qaEPQsgGLWied0A8VKrHAsCC1eJ6EFoUd1v6GoPOaRAtDPViUr/wPzkIFV9AaAZGtYB568VyJfijV+ZBzlVZJ3W7XHB2RESGe4opXIGzRTdjcAupOK09RA6kzr1NTrTj7V1ugM4VgPGWEw+e39CxO6JUw5XhhKihmaDacU2GiR0Ohcc4cZ+Kq3AjlEnEeRSazLs6/9b/kh4eTC+hngE3QQD7Yyclxsrf3cpxsPXn+cFdenF9aqlBXMXaDiEyfyfawBz2RqC/O9WF1ysacOpytlUSoqNrtfbS642+4D4CS9V3xb4u8P/ACI4O810efRu6KsC0QnjHJGaq4IOGUjWTo/YDZDB3xSIxcGyNlWcTucb4T3in/3IaueNrZyX0lGOrWndstOr+w21UlVFokILjJLFhPukbVY8OmwNQ3nZgNJNmKDccusSb4UIe+gtkI+9/bSLJDjqn763f5CQ5TLApmICkqwR0QnUPKZFIUnoozWcQuRbC0Km02knj0tPYx63furGs3x/iPnz83zJDVNtdP3QAAAABJRU5ErkJggg==)](https://usegalaxy.eu/root?tool_id=funannotate_annotate) 9 | 10 | 11 | ![Alt text](funannotate-logo.png?raw=true "Funannotate") 12 | 13 | funannotate is a pipeline for genome annotation (built specifically for fungi, but will also work with higher eukaryotes). Installation, usage, and more information can be found at [http://funannotate.readthedocs.io](http://funannotate.readthedocs.io) 14 | 15 | #### Quickest start Docker: 16 | 17 | You can use docker to run `funannotate`. Caveats are that GeneMark is not included in the docker image (see licensing below and you can complain to the developers for making it difficult to distribute/use). I've also written a bash script that can run the docker image and auto-detect/include the proper user/volume bindings. This docker image is built off of the latest code in master, so it will be ahead of the tagged releases. The image includes the required databases as well, if you want just funannotate without the databases then that is located on docker hub as well `nextgenusfs/funannotate-slim`. 
So this route can be achieved with: 18 | 19 | ``` 20 | # download/pull the image from docker hub 21 | $ docker pull nextgenusfs/funannotate 22 | 23 | # download bash wrapper script (optional) 24 | $ wget -O funannotate-docker https://raw.githubusercontent.com/nextgenusfs/funannotate/master/funannotate-docker 25 | 26 | # might need to make this executable on your system 27 | $ chmod +x /path/to/funannotate-docker 28 | 29 | # assuming it is in your PATH, now you can run this script as if it were the funannotate executable script 30 | $ funannotate-docker test -t predict --cpus 12 31 | ``` 32 | 33 | #### Quickstart Bioconda install: 34 | 35 | The pipeline can be installed with conda (via [bioconda](https://bioconda.github.io/)): 36 | ``` 37 | #add appropriate channels 38 | conda config --add channels defaults 39 | conda config --add channels bioconda 40 | conda config --add channels conda-forge 41 | 42 | #then create environment 43 | conda create -n funannotate "python>=3.6,<3.9" funannotate 44 | ``` 45 | If `conda` is taking forever to solve the environment, I would recommend giving [mamba](https://github.com/mamba-org/mamba) a try: 46 | ``` 47 | #install mamba into base environment 48 | conda install -n base mamba 49 | 50 | #then use mamba as drop in replacmeent 51 | mamba create -n funannotate funannotate 52 | ``` 53 | 54 | If you want to use GeneMark-ES/ET you will need to install that manually following developers instructions: 55 | http://topaz.gatech.edu/GeneMark/license_download.cgi 56 | 57 | Note that you will need to change the shebang line for all perl scripts in GeneMark to use `/usr/bin/env perl`. 58 | You will then also need to add `gmes_petap.pl` to the $PATH or set the environmental variable $GENEMARK_PATH to the gmes_petap directory. 59 | 60 | To install just the python funannotate package, you can do this with pip: 61 | ``` 62 | python -m pip install funannotate 63 | ``` 64 | 65 | To install the most updated code in master you can run: 66 | ``` 67 | python -m pip install git+https://github.com/nextgenusfs/funannotate.git 68 | ``` 69 | # Citation 70 | Jonathan M. Palmer, & Jason Stajich. (2020). Funannotate v1.8.1: Eukaryotic genome annotation (v1.8). Zenodo. https://doi.org/10.5281/zenodo.1134477 71 | -------------------------------------------------------------------------------- /Singularity: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: nextgenusfs/funannotate 3 | 4 | %help 5 | Built from Docker Hub nextgenusfs/funannotate -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | _static 3 | _template 4 | old -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = Funannotate 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
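# For example, `make html` falls through to this catch-all target and writes the rendered docs to $(BUILDDIR)/html.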
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/compare.rst:
--------------------------------------------------------------------------------
1 | 
2 | .. _compare:
3 | 
4 | Comparative genomics
5 | ================================
6 | A typical workflow in a genomics project would be to compare your newly sequenced/assembled/annotated genome to other organisms. The impetus behind :code:`funannotate compare` was that there was previously no way to easily compare multiple genomes. Funannotate stores all annotation in GenBank flat file format; while some people don't like this format because it is difficult to parse with standard unix tools, the main advantage is that the annotation can be stored in a standardized format and retrieved in the same way for each genome. GFF3 is the common output of many annotation tools; however, it doesn't work well for functional annotation as all of the "information" is stored in a single column. At any rate, :code:`funannotate compare` can take either folders containing "funannotated" genomes or GBK files --> the output is stats, graphs, CSV files, phylogeny, etc., all summarized in HTML format.
7 | 
8 | .. code-block:: none
9 | 
10 | Usage: funannotate compare
11 | version: 1.8.14
12 | 
13 | Description: Script does light-weight comparative genomics between funannotated genomes. Output
14 | is graphs, phylogeny, CSV files, etc --> visualized in web-browser.
15 | 
16 | Required:
17 | -i, --input List of funannotate genome folders or GBK files
18 | 
19 | Optional:
20 | -o, --out Output folder name. Default: funannotate_compare
21 | -d, --database Path to funannotate database. Default: $FUNANNOTATE_DB
22 | --cpus Number of CPUs to use. Default: 2
23 | --run_dnds Calculate dN/dS ratio on all orthologs. [estimate,full]
24 | --go_fdr P-value for FDR GO-enrichment. Default: 0.05
25 | --heatmap_stdev Cut-off for heatmap. Default: 1.0
26 | --num_orthos Number of Single-copy orthologs to use for ML. Default: 500
27 | --bootstrap Number of bootstrap replicates to run with RAxML. Default: 100
28 | --outgroup Name of species to use for ML outgroup. Default: no outgroup
29 | --proteinortho ProteinOrtho5 POFF results.
30 | --ml_method Maximum Likelihood method: Default: raxml [raxml,iqtree]
31 | --ml_model Substitution model for IQtree. Default: modelfinder
32 | --no-progress Do not print progress to stdout for long sub jobs
33 | 
--------------------------------------------------------------------------------
/docs/conda.rst:
--------------------------------------------------------------------------------
1 | 
2 | .. _conda:
3 | 
4 | Conda-mediated Installation
5 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6 | 
7 | I'd really like to build a bioconda installation package, but would need some help. You can, however, install quite a few of the dependencies with conda.
8 | 
9 | 
10 | **If you are on LINUX -- start here:**
11 | 
12 | .. 
code-block:: none 13 | 14 | #If you do not have conda, install: download miniconda2 or miniconda3, miniconda3 shown 15 | wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh 16 | /bin/bash ~/miniconda.sh -b -p /conda/installation/path 17 | 18 | #setup bioconda repository 19 | conda config --add channels defaults 20 | conda config --add channels etetoolkit 21 | conda config --add channels bioconda 22 | conda config --add channels conda-forge 23 | 24 | #now create a conda environment and install dependencies 25 | conda create -y -n funannotate python=2.7 numpy pandas scipy matplotlib seaborn \ 26 | natsort scikit-learn psutil biopython requests blast rmblast goatools fisher \ 27 | bamtools augustus bedtools hmmer exonerate diamond>=0.9 tbl2asn ucsc-pslcdnafilter \ 28 | samtools raxml trimal mafft>=7 iqtree kallisto>=0.46.0 bowtie2 infernal mummer minimap2 blat \ 29 | trinity>=2.6.6 evidencemodeler pasa>=2.3 codingquarry stringtie gmap=2017.11.15 snap \ 30 | ete3 salmon>=0.9 jellyfish>=2.2 htslib trnascan-se hisat2 glimmerhmm \ 31 | trf perl-threaded perl-db-file perl-bioperl perl-dbd-mysql perl-dbd-sqlite \ 32 | perl-text-soundex perl-scalar-util-numeric perl-data-dumper perl-dbi perl-clone \ 33 | perl-json perl-logger-simple perl-hash-merge perl-yaml perl-pod-usage perl-getopt-long \ 34 | perl-parallel-forkmanager perl-carp perl-soap-lite perl-class-inspector perl-app-cpanminus 35 | 36 | #if you are going to use remote search also need LWP module (not on conda) 37 | cpanm LWP 38 | 39 | **If you are on MacOS X -- start here:** 40 | 41 | .. code-block:: none 42 | 43 | #If you do not have conda, install: download miniconda2 or miniconda3, miniconda3 shown 44 | wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh 45 | /bin/bash ~/miniconda.sh -b -p /conda/installation/path 46 | 47 | #setup bioconda repository 48 | conda config --add channels defaults 49 | conda config --add channels etetoolkit 50 | conda config --add channels bioconda 51 | conda config --add channels conda-forge 52 | 53 | #now create a conda environment and install dependencies 54 | conda create -y -n funannotate python=2.7 numpy pandas scipy matplotlib seaborn \ 55 | natsort scikit-learn psutil biopython requests blast rmblast goatools fisher \ 56 | bedtools hmmer exonerate diamond>=0.9 tbl2asn ucsc-pslcdnafilter \ 57 | samtools raxml trimal mafft>=7 iqtree kallisto>=0.46.0 bowtie2 infernal mummer \ 58 | evidencemodeler gmap=2017.11.15 hisat2 blat minimap2 snap glimmerhmm \ 59 | ete3 salmon>=0.9 jellyfish>=2.2 htslib trnascan-se codingquarry \ 60 | trf perl-threaded perl-db-file perl-bioperl perl-dbd-mysql perl-dbd-sqlite \ 61 | perl-text-soundex perl-scalar-util-numeric perl-data-dumper perl-dbi perl-clone \ 62 | perl-json perl-logger-simple perl-hash-merge perl-yaml perl-pod-usage perl-getopt-long \ 63 | perl-parallel-forkmanager perl-carp perl-soap-lite perl-class-inspector perl-app-cpanminus 64 | 65 | #if you are going to use remote search also need LWP module (not on conda) 66 | cpanm LWP 67 | 68 | 69 | MacOSX: Need to install bamtools/augustus/trinity/pasa manually: 70 | 71 | Install bamtools/Augustus from here: https://github.com/nextgenusfs/augustus 72 | 73 | Trinity: https://github.com/trinityrnaseq/trinityrnaseq 74 | 75 | PASA: https://github.com/PASApipeline/PASApipeline 76 | 77 | 78 | **The above will automatically install most of the dependencies, below there are a few manual steps.** 79 | 80 | 1. 
Download/install GeneMark-ES/ET: (gmes_petap.pl must be in PATH) 81 | http://exon.gatech.edu/GeneMark/license_download.cgi 82 | 83 | * make sure to activate the license and move into proper location. you can test proper installation by running `gmes_petap.pl` in the terminal -- you should see help menu. Be careful of the shebang line, default is `/usr/bin/perl` which most likely is not what you want, more appropriate is `/usr/bin/env perl` 84 | 85 | 2. Install RepeatMasker/RepeatModeler http://www.repeatmasker.org 86 | 87 | 88 | 2b. Download Repbase RepeatMasker Libraries if you have not done so already. 89 | 90 | .. code-block:: none 91 | 92 | wget --user name --password pass http://www.girinst.org/server/RepBase/protected/repeatmaskerlibraries/RepBaseRepeatMaskerEdition-20170127.tar.gz 93 | tar zxvf RepBaseRepeatMaskerEdition-20170127.tar.gz -C /path/to/repeatmasker/location 94 | cd /path/to/repeatmasker/location 95 | ./configure 96 | 97 | #Soft-link a repeatmasker utility script into the PATH (may not need to do this depending on install) 98 | ln -s /path/to/repeatmasker/location/repeatmasker/util/rmOutToGFF3.pl /usr/local/bin/rmOutToGFF3.pl 99 | 100 | 101 | 3. Setup Eggnog-mapper v4.5 or v5.0 [v5.0 is not being parsed properly yet in v1.5.3] 102 | 103 | .. code-block:: none 104 | 105 | #clone the eggnog mapper repo into a location you have read/write access 106 | git clone https://github.com/jhcepas/eggnog-mapper.git 107 | 108 | #move into folder and setup - this will put into eggnog-mapper/data location 109 | cd eggnog-mapper 110 | download_eggnog_data.py 111 | 112 | #finally add to your funannotate conda env so it is in path when env is activated 113 | ln -s /path/to/eggnog-mapper/emapper.py /path/to/conda/envs/funannotate/bin/emapper.py 114 | 115 | 116 | NOTE: MacOSX users -- the diamond version shipped with eggnog-mapper needs to be swapped 117 | out as the binary provided is compiled on linux. Run a small test with emapper.py to check 118 | functionality `emapper.py -m diamond -i test.fa -o test` 119 | 120 | 121 | 4. Clone the funannotate repo and add to PATH 122 | 123 | .. code-block:: none 124 | 125 | git clone https://github.com/nextgenusfs/funannotate.git 126 | 127 | #add to PATH 128 | ln -s /path/to/funannotate/funannotate /path/to/conda/envs/funannotate/bin/funannotate 129 | 130 | 5. Run funannotate check --show-versions, fix any issues. You will need to export some ENV variables. 131 | 132 | .. code-block:: none 133 | 134 | export EVM_HOME=/path/to/conda/envs/funannotate/opt/evidencemodeler-v1.1.1 135 | export TRINITYHOME=/path/to/conda/envs/funannotate/opt/trinity-2.6.6 136 | export PASAHOME=/path/to/conda/envs/funannotate/opt/pasa-2.3.3 137 | export AUGUSTUS_CONFIG_PATH=/path/to/augustus/config 138 | export GENEMARK_PATH=/path/to/gmes_petap_dir 139 | export FUNANNOTATE_DB=/path/to/funannotateDB 140 | 141 | 6. Setup funannotate databases, specify any location you have read/write access to to `-d` -- this is $FUNANNOTATE_DB 142 | 143 | .. code-block:: none 144 | 145 | funannotate setup -d /path/to/DB 146 | 147 | 7. If you want these ENV variables to be activated when you activate the conda environment, you can add them as a shell script to the the activate location of your environment, i.e. `/path/to/conda/envs/funannotate/etc/conda/activate.d/` and then you can put the corresponding `unset` commands in the deactivate directory, i.e. 
`/path/to/conda/envs/funannotate/etc/conda/deactivate.d/` 148 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Funannotate documentation build configuration file, created by 4 | # sphinx-quickstart on Sat Nov 18 22:41:39 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | # import os 20 | # import sys 21 | # sys.path.insert(0, os.path.abspath('.')) 22 | 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # 28 | # needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [] 34 | 35 | # Add any paths that contain templates here, relative to this directory. 36 | templates_path = ['_templates'] 37 | 38 | # The suffix(es) of source filenames. 39 | # You can specify multiple suffix as a list of string: 40 | # 41 | # source_suffix = ['.rst', '.md'] 42 | source_suffix = '.rst' 43 | 44 | # The master toctree document. 45 | master_doc = 'index' 46 | 47 | # General information about the project. 48 | project = u'Funannotate' 49 | copyright = u'2017, Jon Palmer' 50 | author = u'Jon Palmer' 51 | 52 | # The version info for the project you're documenting, acts as replacement for 53 | # |version| and |release|, also used in various other places throughout the 54 | # built documents. 55 | # 56 | # The short X.Y version. 57 | version = u'1.8.16' 58 | # The full version, including alpha/beta/rc tags. 59 | release = u'1.8.16' 60 | 61 | # The language for content autogenerated by Sphinx. Refer to documentation 62 | # for a list of supported languages. 63 | # 64 | # This is also used if you do content translation via gettext catalogs. 65 | # Usually you set "language" from the command line for these cases. 66 | language = None 67 | 68 | # List of patterns, relative to source directory, that match files and 69 | # directories to ignore when looking for source files. 70 | # This patterns also effect to html_static_path and html_extra_path 71 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 72 | 73 | # The name of the Pygments (syntax highlighting) style to use. 74 | pygments_style = 'sphinx' 75 | 76 | # If true, `todo` and `todoList` produce output, else they produce nothing. 77 | todo_include_todos = False 78 | 79 | 80 | # -- Options for HTML output ---------------------------------------------- 81 | 82 | # The theme to use for HTML and HTML Help pages. See the documentation for 83 | # a list of builtin themes. 
84 | # 85 | import sphinx_rtd_theme 86 | html_theme = 'sphinx_rtd_theme' 87 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 88 | 89 | # Theme options are theme-specific and customize the look and feel of a theme 90 | # further. For a list of options available for each theme, see the 91 | # documentation. 92 | # 93 | # html_theme_options = {} 94 | 95 | # Add any paths that contain custom static files (such as style sheets) here, 96 | # relative to this directory. They are copied after the builtin static files, 97 | # so a file named "default.css" will overwrite the builtin "default.css". 98 | html_static_path = ['_static'] 99 | 100 | # Custom sidebar templates, must be a dictionary that maps document names 101 | # to template names. 102 | # 103 | # This is required for the alabaster theme 104 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 105 | html_sidebars = { 106 | '**': [ 107 | 'relations.html', # needs 'show_related': True theme option to display 108 | 'searchbox.html', 109 | ] 110 | } 111 | 112 | 113 | # -- Options for HTMLHelp output ------------------------------------------ 114 | 115 | # Output file base name for HTML help builder. 116 | htmlhelp_basename = 'Funannotatedoc' 117 | 118 | 119 | # -- Options for LaTeX output --------------------------------------------- 120 | 121 | latex_elements = { 122 | # The paper size ('letterpaper' or 'a4paper'). 123 | # 124 | # 'papersize': 'letterpaper', 125 | 126 | # The font size ('10pt', '11pt' or '12pt'). 127 | # 128 | # 'pointsize': '10pt', 129 | 130 | # Additional stuff for the LaTeX preamble. 131 | # 132 | # 'preamble': '', 133 | 134 | # Latex figure (float) alignment 135 | # 136 | # 'figure_align': 'htbp', 137 | } 138 | 139 | # Grouping the document tree into LaTeX files. List of tuples 140 | # (source start file, target name, title, 141 | # author, documentclass [howto, manual, or own class]). 142 | latex_documents = [ 143 | (master_doc, 'Funannotate.tex', u'Funannotate Documentation', 144 | u'Jon Palmer', 'manual'), 145 | ] 146 | 147 | 148 | # -- Options for manual page output --------------------------------------- 149 | 150 | # One entry per manual page. List of tuples 151 | # (source start file, name, description, authors, manual section). 152 | man_pages = [ 153 | (master_doc, 'funannotate', u'Funannotate Documentation', 154 | [author], 1) 155 | ] 156 | 157 | 158 | # -- Options for Texinfo output ------------------------------------------- 159 | 160 | # Grouping the document tree into Texinfo files. List of tuples 161 | # (source start file, target name, title, author, 162 | # dir menu entry, description, category) 163 | texinfo_documents = [ 164 | (master_doc, 'Funannotate', u'Funannotate Documentation', 165 | author, 'Funannotate', 'One line description of project.', 166 | 'Miscellaneous'), 167 | ] 168 | -------------------------------------------------------------------------------- /docs/databases.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _databases: 3 | 4 | Annotation Databases 5 | ================================ 6 | 7 | Funannotate uses several publicly available databases, they can be installed with the :code:`funannotate setup` command. The currently installed databases and version numbers can be displayed with the :code:`funannotate database` command. 8 | 9 | Initial setup is simple and requires only a path to a database location, this can (should) be set using the $FUNANNOTATE_DB environmental variable. 
If $FUNANNOTATE_DB is set, then the script will use that location by default, otherwise you will need to specify a location to the script i.e.: 10 | 11 | .. code-block:: none 12 | 13 | funannotate setup -d $HOME/funannotate_db 14 | 15 | 16 | You could then update the databases if $FUNANNOTATE_DB is set like this: 17 | 18 | .. code-block:: none 19 | 20 | funannotate setup -i all --update 21 | 22 | #or force update of just one database 23 | funannotate setup -i uniprot --force 24 | 25 | 26 | This will download and format the databases, they can be displayed like so: 27 | 28 | .. code-block:: none 29 | 30 | $ funannotate database 31 | 32 | Funannotate Databases currently installed: 33 | 34 | Database Type Version Date Num_Records Md5checksum 35 | merops diamond 12.5 2023-01-19 5098 6cd3c3dd85650394ce4e3dacb591f2a5 36 | uniprot diamond 2024_01 2024-01-24 570830 c7507ea16b3c4807971c663994cad329 37 | dbCAN hmmer3 11.0 2022-08-09 699 fb112af319a5001fbf547eac29e7c3b5 38 | pfam hmmer3 36.0 2023-07 20795 0725495ccf049a4f198fcc0a92f7f38c 39 | repeats diamond 1.0 2022-03-13 11950 4e8cafc3eea47ec7ba505bb1e3465d21 40 | go text 2024-01-17 2024-01-17 47729 7e6b9974184dda306e6e07631f1783af 41 | mibig diamond 1.4 2022-03-13 31023 118f2c11edde36c81bdea030a0228492 42 | interpro xml 98.0 2024-01-25 40768 502ea05009761b893dedb56d5ea89c48 43 | busco_outgroups outgroups 1.0 2024-03-04 8 6795b1d4545850a4226829c7ae8ef058 44 | gene2product text 1.92 2023-10-02 34459 32a4a80987720e0872377de3207dc0f5 45 | 46 | To update a database type: 47 | funannotate setup -i DBNAME -d $HOME/funannotate_db --force 48 | 49 | To see install BUSCO outgroups type: 50 | funannotate database --show-outgroups 51 | 52 | To see BUSCO tree type: 53 | funannotate database --show-buscos 54 | 55 | 56 | 57 | Similarly, database sources can be updated with the :code:`funannotate setup` command, for example to update the gene2product database to its most recent version you would run: 58 | 59 | .. code-block:: none 60 | 61 | $ funannotate setup -d $HOME/funannotate_db -i gene2product --update 62 | 63 | 64 | -------------------------------------------------------------------------------- /docs/dependencies.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _dependencies: 3 | 4 | Dependencies 5 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 6 | Funannotate has a lot of dependencies. However, it also comes with a few tools to help you get everything installed. The first is that of :code:`funannotate check`. You'll see in the output below that the :code:`fasta` tool is missing, which is Bill Pearsons :code:`fasta36` a dependency of the PASA pipeline. Also the :code:`$PASAHOME`` and :code:`$TRINITYHOME`` variables are not set, that is because on this particular machine they are not installed, i.e. funannotate will alert you at runtime if it is missing a dependency. 7 | 8 | .. code-block:: none 9 | 10 | $ funannotate check --show-versions 11 | ------------------------------------------------------- 12 | Checking dependencies for funannotate v1.4.0 13 | ------------------------------------------------------- 14 | You are running Python v 2.7.11. Now checking python packages... 15 | biopython: 1.70 16 | goatools: 0.7.11 17 | matplotlib: 2.1.1 18 | natsort: 5.2.0 19 | numpy: 1.12.1 20 | pandas: 0.22.0 21 | psutil: 5.4.3 22 | requests: 2.18.4 23 | scikit-learn: 0.19.0 24 | scipy: 0.19.1 25 | seaborn: 0.8.1 26 | All 11 python packages installed 27 | 28 | 29 | You are running Perl v 5.026001. 
Now checking perl modules... 30 | Bio::Perl: 1.007002 31 | Carp: 1.42 32 | Clone: 0.39 33 | DBD::SQLite: 1.56 34 | DBD::mysql: 4.046 35 | DBI: 1.641 36 | DB_File: 1.84 37 | Data::Dumper: 2.167 38 | File::Basename: 2.85 39 | File::Which: 1.22 40 | Getopt::Long: 2.5 41 | Hash::Merge: 0.300 42 | JSON: 2.97001 43 | LWP::UserAgent: 6.33 44 | Logger::Simple: 2.0 45 | POSIX: 1.76 46 | Parallel::ForkManager: 1.19 47 | Pod::Usage: 1.69 48 | Scalar::Util::Numeric: 0.40 49 | Storable: 2.62 50 | Text::Soundex: 3.05 51 | Thread::Queue: 3.12 52 | Tie::File: 1.02 53 | URI::Escape: 3.31 54 | YAML: 1.24 55 | threads: 2.21 56 | threads::shared: 1.58 57 | All 27 Perl modules installed 58 | 59 | 60 | Checking external dependencies... 61 | RepeatMasker: RepeatMasker 4.0.7 62 | RepeatModeler: RepeatModeler 1.0.11 63 | Trinity: 2.5.1 64 | augustus: 3.2.1 65 | bamtools: bamtools 2.4.0 66 | bedtools: bedtools v2.27.1 67 | blat: BLAT v35 68 | diamond: diamond 0.9.19 69 | emapper.py: emapper-1.0.3 70 | ete3: 3.1.1 71 | exonerate: exonerate 2.4.0 72 | fasta: no way to determine 73 | gmap: 2017-06-20 74 | gmes_petap.pl: 4.30 75 | hisat2: 2.1.0 76 | hmmscan: HMMER 3.1b2 (February 2015) 77 | hmmsearch: HMMER 3.1b2 (February 2015) 78 | java: 1.8.0_92 79 | kallisto: 0.43.1 80 | mafft: v7.313 (2017/Nov/15) 81 | makeblastdb: makeblastdb 2.7.1+ 82 | minimap2: 2.10-r761 83 | nucmer: 3.1 84 | pslCDnaFilter: no way to determine 85 | rmblastn: rmblastn 2.2.27+ 86 | samtools: samtools 1.8 87 | tRNAscan-SE: 1.23 (April 2002) 88 | tbl2asn: unknown, likely 25.3 89 | tblastn: tblastn 2.7.1+ 90 | trimal: trimAl v1.4.rev15 build[2013-12-17] 91 | All 30 external dependencies are installed 92 | 93 | Checking Environmental Variables... 94 | $FUNANNOTATE_DB=/usr/local/share/funannotate 95 | $PASAHOME=/Users/jon/software/PASApipeline 96 | $TRINITYHOME=/usr/local/opt/trinity 97 | $EVM_HOME=/Users/jon/software/evidencemodeler 98 | $AUGUSTUS_CONFIG_PATH=/Users/jon/software/augustus/config 99 | $GENEMARK_PATH=/Users/jon/software/gmes_petap 100 | $BAMTOOLS_PATH=/Users/jon/software/bamtools-2.4.0/bin 101 | All 7 environmental variables are set 102 | ------------------------------------------------------- 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /docs/docker.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _docker: 3 | 4 | Docker Installation 5 | ================================ 6 | Docker is a solution where most of the dependencies are installed and you can start annotating 7 | right away. Because some software and data require individual licensing, the core components 8 | of funannotate are packaged into a docker container, but you must download a few things and 9 | run a docker build locally to get a working container. Note that Eggnog-mapper is not installed 10 | in Docker container as the databases were too large. 11 | 12 | 1) Download 64 bit Linux GeneMark-ET/ES Key (gm_key_64.gz) from http://exon.gatech.edu/Genemark/license_download.cgi 13 | 14 | 15 | 2) Download RepeatMasker libraries. Register for username at RepBase http://www.girinst.org/repbase/. You can then download the RepeatMasker Libraries most recent version, alternatively can download from command line like so: 16 | 17 | .. 
code-block:: none 18 | 19 | wget --user name --password pass \ 20 | https://www.girinst.org/server/archive/RepBase23.09/protected/repeatmaskerlibraries/RepBaseRepeatMaskerEdition-20170127.tar.gz 21 | 22 | 3) Get SignalP4.1 for linux 64 from CBS http://www.cbs.dtu.dk/cgi-bin/sw_request?signalp 23 | 24 | 25 | 4) Download Dockerfile: 26 | 27 | .. code-block:: none 28 | 29 | wget https://raw.githubusercontent.com/nextgenusfs/funannotate/1.5.1/dockerbuild/Dockerfile 30 | 31 | 5) You should now have the following files in the same directory: 32 | 33 | .. code-block:: none 34 | 35 | Dockerfile 36 | gm_key_64.gz 37 | RepBaseRepeatMaskerEdition-20170127.tar.gz 38 | signalp-4.1f.Linux.tar.gz 39 | 40 | Now you can Build the docker container, which will setup the remaining tools and then download and format funannotate databases.: 41 | 42 | .. code-block:: none 43 | 44 | docker build -t funannotate -f Dockerfile . 45 | 46 | 47 | **Running the Docker container with your data:** 48 | 49 | In order to run the docker container, you need to put all the files you will use for input to funannotate into the same folder, you can then launch the Docker container and mount your current folder with the following command: 50 | 51 | .. code-block:: none 52 | 53 | #container is deleted after you exit 54 | docker run -it --rm -v $PWD:/home/linuxbrew/data funannotate 55 | 56 | #keep container, i.e. mysql databases generated, however will take a lot of HD space 57 | docker run -it -v $PWD:/home/linuxbrew/data funannotate 58 | 59 | This will bring you to a bash prompt within the docker container where all dependencies are installed, so you can now issue the funannotate commands on your data. 60 | 61 | **Limitations with Docker:** 62 | 63 | The funannotate docker image does not contain Eggnog-mapper because the databases sizes are too large (> 20 GB). Eggnog-mapper is an important component of functional annotation, you can run this on the eggnog-mapper webserver and pass results to funannotate or perhaps set up an additional docker image running the eggnog-mapper software. 64 | 65 | **Mac OSX users:** 66 | 67 | The default storage-driver on docker for Mac is the overlay2 driver. This driver seems to be incompatible with running/launching MySQL, thus if you are getting errors running funannotate you will need to change your storage-driver to "aufs". This can be done in Docker preferences, Daemon tab, Advanced tab, and then change the storage-driver. **Note this will delete all Docker images/containers on your virtual disk.** 68 | 69 | .. code-block:: none 70 | 71 | { 72 | "storage-driver" : "aufs", 73 | "debug" : true, 74 | "experimental" : true 75 | } 76 | -------------------------------------------------------------------------------- /docs/evidence.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _evidence: 3 | 4 | Providing evidence to funannotate 5 | ================================== 6 | 7 | Funannotate uses Evidence Modeler to combine *ab initio* gene model predictions with evidence (transcripts or proteins) aligned to the genome. Therefore, the evidence that you supply at runtime for :code:`--transcript_evidence` and :code:`--protein_evidence` are important. By default, funannotate will use the UniProtKb/SwissProt curated protein database for protein evidence. However, you can specify other forms of protein evidence, perhaps from a well-annotated closely related species, using the :code:`--protein_evidence` option. 
Multiple files can be passed to both :code:`--transcript_evidence` or :code:`--protein_evidence` by separating the files by spaces, for example: 8 | 9 | .. code-block:: none 10 | 11 | funannotate predict -i genome.fa -s "Awesome species" --transcript_evidence trinity.fasta myESTs.fa \ 12 | -o output --protein_evidence closely_related.fasta $FUNANNOTATE_DB/uniprot_sprot.fasta 13 | 14 | You'll notice in this example, I also added the UniProt/SwissProt protein models located in the funannotate database. I should also note that adding protein evidence from ab initio predictors of closely related species should be avoided, this is because those models have not been validated. What you are trying to do here is to provide the software with high-quality protein models so that information can be used to direct the *ab initio* gene prediction algorithms, so providing them with incorrect/truncated proteins isn't going to help your accuracy and in many cases it may hurt. It is often okay to just stick with the default UniProtKb/SwissProt protein evidence. 15 | 16 | **Sources of Evidence that work well:** 17 | 18 | 1. De-novo RNA-seq assemblies (i.e. output of Trinity) 19 | 2. ESTs (for fungal genomes ESTs from related species can be downloaded from JGI Mycocosm) 20 | 3. Curated Protein models from closely related species 21 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Funannotate documentation master file, created by 2 | sphinx-quickstart on Sat Nov 18 22:41:39 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Funannotate documentation 7 | ======================================= 8 | 9 | .. toctree:: 10 | :hidden: 11 | 12 | install 13 | prepare 14 | predict 15 | evidence 16 | update 17 | annotate 18 | compare 19 | databases 20 | tutorials 21 | commands 22 | utilities 23 | 24 | 25 | Funannotate is a genome prediction, annotation, and comparison software package. It was originally written to annotate fungal genomes (small eukaryotes ~ 30 Mb genomes), but has evolved over time to accomodate larger genomes. The impetus for this software package was to be able to accurately and easily annotate a genome for submission to NCBI GenBank. Existing tools (such as Maker) require significant manually editing to comply with GenBank submission rules, thus funannotate is aimed at simplifying the genome submission process. 26 | 27 | Funannotate is also a lightweight comparative genomics platform. Genomes that have had functional annotation added via the :code:`funannotate annotate` command can be run through the :code:`funannotate compare` script that outputs html based whole genome comparisons. The software can run orthologous clustering, construct whole-genome phylogenies, run Gene Ontology enrichment analysis, as well as calculate dN/dS ratios for orthologous clusters under positive selection. 28 | 29 | 30 | 31 | * :ref:`install` 32 | * :ref:`prepare` 33 | * :ref:`predict` 34 | * :ref:`update` 35 | * :ref:`annotate` 36 | * :ref:`compare` 37 | * :ref:`tutorials` 38 | * :ref:`utilities` 39 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _install: 3 | 4 | Installation 5 | ================================ 6 | 7 | .. 
toctree:: 8 | :hidden: 9 | 10 | dependencies 11 | 12 | Funannotate has a lot of dependencies and therefore installation is the most difficult part 13 | of executing the pipeline. The funannotate pipeline is written in python and can be installed 14 | with pip, i.e. `pip install funannotate`. You can see a list of :ref:`dependencies`, 15 | 16 | ### Quickest start Docker: 17 | 18 | You can use docker to run `funannotate`. Caveats are that GeneMark is not included in the docker image (see licensing below and you can complain to the developers for making it difficult to distribute/use). I've also written a bash script that can run the docker image and auto-detect/include the proper user/volume bindings. This docker image is built off of the latest code in master, so it will be ahead of the tagged releases. The image includes the required databases as well, if you want just funannotate without the databases then that is located on docker hub as well `nextgenusfs/funannotate-slim`. So this route can be achieved with: 19 | 20 | .. code-block:: none 21 | 22 | # download/pull the image from docker hub 23 | $ docker pull nextgenusfs/funannotate 24 | 25 | # download bash wrapper script (optional) 26 | $ wget -O funannotate-docker https://raw.githubusercontent.com/nextgenusfs/funannotate/master/funannotate-docker 27 | 28 | # might need to make this executable on your system 29 | $ chmod +x /path/to/funannotate-docker 30 | 31 | # assuming it is in your PATH, now you can run this script as if it were the funannotate executable script 32 | $ funannotate-docker test -t predict --cpus 12 33 | 34 | 35 | #### Quickstart Bioconda install: 36 | 37 | The pipeline can be installed with conda (via [bioconda](https://bioconda.github.io/)): 38 | 39 | .. code-block:: none 40 | 41 | #add appropriate channels 42 | conda config --add channels defaults 43 | conda config --add channels bioconda 44 | conda config --add channels conda-forge 45 | 46 | #then create environment 47 | conda create -n funannotate "python>=3.6,<3.9" funannotate 48 | 49 | If `conda` is taking forever to solve the environment, I would recommend giving [mamba](https://github.com/mamba-org/mamba) a try: 50 | 51 | .. code-block:: none 52 | 53 | #install mamba into base environment 54 | conda install -n base mamba 55 | 56 | #then use mamba as drop in replacmeent 57 | mamba create -n funannotate funannotate 58 | 59 | 60 | If you want to use GeneMark-ES/ET you will need to install that manually following developers instructions: 61 | http://topaz.gatech.edu/GeneMark/license_download.cgi 62 | 63 | Note that you will need to change the shebang line for all perl scripts in GeneMark to use `/usr/bin/env perl`. 64 | You will then also need to add `gmes_petap.pl` to the $PATH or set the environmental variable $GENEMARK_PATH to the gmes_petap directory. 65 | 66 | To install just the python funannotate package, you can do this with pip: 67 | 68 | .. code-block:: none 69 | 70 | python -m pip install funannotate 71 | 72 | To install the most updated code in master you can run: 73 | 74 | .. code-block:: none 75 | 76 | python -m pip install git+https://github.com/nextgenusfs/funannotate.git 77 | 78 | 79 | 80 | Please setup database and test your installation locally using the following: 81 | 82 | .. 
code-block:: none 83 | 84 | #start up conda ENV 85 | conda activate funannotate 86 | 87 | #check that all modules are installed 88 | funannotate check --show-versions 89 | 90 | #download/setup databases to a writable/readable location 91 | funannotate setup -d $HOME/funannotate_db 92 | 93 | #set ENV variable for $FUNANNOTATE_DB 94 | echo "export FUNANNOTATE_DB=$HOME/funannotate_db" > /conda/installation/path/envs/funannotate/etc/conda/activate.d/funannotate.sh 95 | echo "unset FUNANNOTATE_DB" > /conda/installation/path/envs/funannotate/etc/conda/deactivate.d/funannotate.sh 96 | 97 | #run tests -- requires internet connection to download data 98 | funannotate test -t all --cpus X 99 | 100 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=Funannotate 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/manual.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _manual: 3 | 4 | Manual Installation: "The Professional" 5 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 6 | You can simply download a release and get going, dependency hell awaits you, but I'm not worried because you know what you are doing. See :ref:`dependencies` that are needed to run funannotate. 7 | 8 | .. code-block:: none 9 | 10 | wget https://github.com/nextgenusfs/funannotate/archive/1.0.0.tar.gz 11 | tar -zxvf 1.0.0.tar.gz 12 | export PATH=/path/to/funannotate:$PATH 13 | 14 | -------------------------------------------------------------------------------- /docs/prepare.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _prepare: 3 | 4 | Preparing your Assembly 5 | -------------------------------- 6 | There are a few things that you can do to your multi-FASTA assembly to get it "ready" to be annotated. These steps include methods for removing small repetitive contigs from an assembly, sorting/renaming contig headers so they do not cause problems during prediction step, and repeatmasking your assembely (required). 7 | 8 | 9 | Cleaning your Assembly 10 | ================================ 11 | When working with haploid assemblies, sometimes you want to remove some repetitive contigs that are contained in other scaffolds of the assembly. If the repeats are indeed unique, then we want to keep them in the assembly. Funannotate can help "clean" up repetitive contigs in your assembly. 
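As a quick illustration (file names here are placeholders, and the full usage for each command, including the sort and mask steps, is shown below), a typical preparation run might look like:

.. code-block:: none

    # remove duplicated short contigs, sort/rename the headers, then softmask repeats
    funannotate clean -i genome.fa -o genome.cleaned.fa
    funannotate sort -i genome.cleaned.fa -o genome.sorted.fa -b scaffold
    funannotate mask -i genome.sorted.fa -o genome.masked.fa --cpus 8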
Cleaning is done using a "leave one out" methodology with minimap2 or mummer (nucmer), where the shortest contigs/scaffolds are aligned to the rest of the assembly to determine whether they are repetitive. The script loops through the contigs, starting with the shortest and working its way up to the N50 of the assembly, and drops contigs/scaffolds whose alignments exceed both the percent coverage of overlap (:code:`--cov`) and the percent identity of overlap (:code:`--pident`) cutoffs. 12 | 13 | .. code-block:: none 14 | 15 | $ funannotate clean 16 | 17 | Usage: funannotate clean 18 | version: 1.8.16 19 | 20 | Description: The script sorts contigs by size, starting with shortest contigs it uses minimap2 21 | to find contigs duplicated elsewhere, and then removes duplicated contigs. 22 | 23 | Arguments: 24 | -i, --input Multi-fasta genome file (Required) 25 | -o, --out Cleaned multi-fasta output file (Required) 26 | -p, --pident Percent identity of overlap. Default = 95 27 | -c, --cov Percent coverage of overlap. Default = 95 28 | -m, --minlen Minimum length of contig to keep. Default = 500 29 | --exhaustive Test every contig. Default is to stop at N50 value. 30 | 31 | 32 | Sorting/Renaming FASTA Headers 33 | ================================ 34 | NCBI limits FASTA headers to 16 characters for submission, and Augustus also has problems with longer contig/scaffold names. You can use this simple script to sort your assembly by length and then rename the FASTA headers. 35 | 36 | .. code-block:: none 37 | 38 | $ funannotate sort 39 | 40 | Usage: funannotate sort 41 | version: 1.8.16 42 | 43 | Description: This script sorts the input contigs by size (longest->shortest) and then relabels 44 | the contigs with a simple name (e.g. scaffold_1). Augustus can have problems with 45 | some complicated contig names. 46 | 47 | Arguments: 48 | -i, --input Multi-fasta genome file. (Required) 49 | -o, --out Sorted by size and relabeled output file. (Required) 50 | -b, --base Base name to relabel contigs. Default: scaffold 51 | --minlen Shorter contigs are discarded. Default: 0 52 | 53 | 54 | .. _repeatmasking: 55 | 56 | RepeatMasking your Assembly 57 | ================================ 58 | This is an essential step in the annotation process. As of v1.4.0, repeatmasking has been decoupled from :code:`funannotate predict` in order to make it more flexible and accommodate those users who don't have access to the RepBase library (a requirement of RepeatMasker). By default, the :code:`funannotate mask` command runs simple masking using tantan. The script is a wrapper for RepeatModeler and RepeatMasker; however, you can use any external program to softmask your assembly. In a softmasked assembly, repeats are represented by lowercase letters and non-repetitive regions by uppercase letters. One alternative to RepeatMasker is RED (REpeat Detector); a wrapper for this program is available as `Redmask `_. 59 | 60 | .. code-block:: none 61 | 62 | $ funannotate mask 63 | 64 | Usage: funannotate mask 65 | version: 1.8.16 66 | 67 | Description: This script is a wrapper for repeat masking. Default is to run very simple 68 | repeat masking with tantan. The script can also run RepeatMasker and/or 69 | RepeatModeler. It will generate a softmasked genome. Tantan is probably not 70 | sufficient for soft-masking an assembly, but with RepBase no longer being 71 | available RepeatMasker/Modeler may not be functional for many users. 72 | 73 | Arguments: 74 | -i, --input Multi-FASTA genome file. 
(Required) 75 | -o, --out Output softmasked FASTA file. (Required) 76 | 77 | Optional: 78 | -m, --method Method to use. Default: tantan [repeatmasker, repeatmodeler] 79 | -s, --repeatmasker_species Species to use for RepeatMasker 80 | -l, --repeatmodeler_lib Custom repeat database (FASTA format) 81 | --cpus Number of cpus to use. Default: 2 82 | --debug Keep intermediate files 83 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx_rtd_theme -------------------------------------------------------------------------------- /docs/update.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _update: 3 | 4 | Adding UTRs and refining predictions 5 | ================================ 6 | If you have RNA-seq data and would like to use the PASA-mediated "annotation comparison" to add UTRs and refine gene model predictions, this can be accomplished using the :code:`funannotate update` command. This script can also be run as a stand-alone to re-align RNA-seq data and/or update an existing GenBank genome. 7 | 8 | If you have run :code:`funannotate train` and then :code:`funannotate predict`, this script will re-use those data and you can simply pass :code:`funannotate update -i folder --cpus 12`. This will add the gene predictions to the SQL database and then walk through each gene comparing to existing PASA alignments, PASA will make some adjustments to the gene models. As recommended by PASA developers, this is run twice in :code:`funannotate update`. 9 | 10 | 11 | Why is :code:`funannotate update` so slow?? 12 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 13 | 14 | The default SQL database for PASA is set to use SQLite -- this is for compatibility. However, the limitation is that SQLite database in PASA is single threaded due to SQLite database lock issue. Thus even if you pass multiple cpus to the script, it will run all of the PASA steps single threaded, which can take a long time depending on PASA alignments and genome size. If you `setup PASA to use MySQL `_, then the scripts can run PASA multi-threaded and :code:`funannotate update` will run much faster. 15 | 16 | 17 | .. code-block:: none 18 | 19 | Usage: funannotate update 20 | version: 1.8.14 21 | 22 | Description: Script will run PASA mediated update of gene models. It can directly update 23 | the annotation from an NCBI downloaded GenBank file using RNA-seq data or can be 24 | used after funannotate predict to refine UTRs and gene model predictions. Kallisto 25 | is used to evidence filter most likely PASA gene models. Dependencies are 26 | hisat2, Trinity, samtools, fasta, minimap2, PASA, kallisto, bedtools. 27 | 28 | Required: 29 | -i, --input Funannotate folder or Genome in GenBank format (.gbk,.gbff). 30 | or 31 | -f, --fasta Genome in FASTA format 32 | -g, --gff Annotation in GFF3 format 33 | --species Species name, use quotes for binomial, e.g. "Aspergillus fumigatus" 34 | 35 | Optional: 36 | -o, --out Output folder name 37 | -l, --left Left/Forward FASTQ Illumina reads (R1) 38 | -r, --right Right/Reverse FASTQ Illumina reads (R2) 39 | -s, --single Single ended FASTQ reads 40 | --stranded If RNA-seq library stranded. 
[RF,FR,F,R,no] 41 | --left_norm Normalized left FASTQ reads (R1) 42 | --right_norm Normalized right FASTQ reads (R2) 43 | --single_norm Normalized single-ended FASTQ reads 44 | --pacbio_isoseq PacBio long-reads 45 | --nanopore_cdna Nanopore cDNA long-reads 46 | --nanopore_mrna Nanopore mRNA direct long-reads 47 | --trinity Pre-computed Trinity transcripts (FASTA) 48 | --jaccard_clip Turn on jaccard clip for dense genomes [Recommended for fungi] 49 | --no_normalize_reads Skip read Normalization 50 | --no_trimmomatic Skip Quality Trimming of reads 51 | --memory RAM to use for Jellyfish. Default: 50G 52 | -c, --coverage Depth to normalize reads. Default: 50 53 | -m, --min_coverage Min depth for normalizing reads. Default: 5 54 | --pasa_config PASA assembly config file, i.e. from previous PASA run 55 | --pasa_db Database to use. Default: sqlite [mysql,sqlite] 56 | --pasa_alignment_overlap PASA --stringent_alignment_overlap. Default: 30.0 57 | --aligners Aligners to use with PASA: Default: minimap2 blat [gmap] 58 | --pasa_min_avg_per_id PASA --MIN_AVG_PER_ID. Default: 95 59 | --pasa_num_bp_splice PASA --NUM_BP_PERFECT_SPLICE_BOUNDARY. Default: 3 60 | --max_intronlen Maximum intron length. Default: 3000 61 | --min_protlen Minimum protein length. Default: 50 62 | --alt_transcripts Expression threshold (percent) to keep alt transcripts. Default: 0.1 [0-1] 63 | --p2g NCBI p2g file (if updating NCBI annotation) 64 | -t, --tbl2asn Assembly parameters for tbl2asn. Example: "-l paired-ends" 65 | --name Locus tag name (assigned by NCBI?). Default: use existing 66 | --sbt NCBI Submission file 67 | --species Species name, use quotes for binomial, e.g. "Aspergillus fumigatus" 68 | --strain Strain name 69 | --isolate Isolate name 70 | --SeqCenter Sequencing facilty for NCBI tbl file. Default: CFMR 71 | --SeqAccession Sequence accession number for NCBI tbl file. Default: 12345 72 | --cpus Number of CPUs to use. Default: 2 73 | 74 | ENV Vars: If not passed, will try to load from your $PATH. 75 | --PASAHOME 76 | --TRINITYHOME 77 | -------------------------------------------------------------------------------- /docs/utilities.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _utilities: 3 | 4 | Utilities 5 | ================================ 6 | There are several scripts that maybe useful to users to convert between different formats, these scripts are housed in the :code:`funannotate util` submenu. 7 | 8 | 9 | .. code-block:: none 10 | 11 | $ funannotate util 12 | 13 | Usage: funannotate util 14 | version: 1.8.16 15 | 16 | Commands: 17 | stats Generate assembly and annotation stats 18 | contrast Compare annotations to reference (GFF3 or GBK annotations) 19 | tbl2gbk Convert TBL format to GenBank format 20 | gbk2parts Convert GBK file to individual components 21 | gff2prot Convert GFF3 + FASTA files to protein FASTA 22 | gff2tbl Convert GFF3 format to NCBI annotation table (tbl) 23 | bam2gff3 Convert BAM coord-sorted transcript alignments to GFF3 24 | prot2genome Map proteins to genome generating GFF3 protein alignments 25 | stringtie2gff3 Convert GTF (stringTIE) to GFF3 format 26 | quarry2gff3 Convert CodingQuarry output to proper GFF3 format 27 | gff-rename Sort GFF3 file and rename gene models 28 | 29 | Generate genome assembly stats 30 | ------------------------------ 31 | To generate genome assembly stats in a JSON file. 32 | 33 | .. 
code-block:: none 34 | 35 | $ funannotate util stats 36 | 37 | Usage: funannotate util stats 38 | version: 1.8.16 39 | 40 | Description: Generate JSON file with genome assembly and annotation stats. 41 | 42 | Arguments: 43 | -f, --fasta Genome FASTA file (Required) 44 | -o, --out Output file (JSON format) 45 | -g, --gff3 Genome Annotation (GFF3 format) 46 | -t, --tbl Genome Annotation (NCBI TBL format) 47 | --transcript_alignments Transcript alignments (GFF3 format) 48 | --protein_alignments Protein alignments (GFF3 format) 49 | 50 | Comparing/contrasting annotations to a reference 51 | ------------------------------------------------ 52 | To compare/contrast genome annotations between different GFF3 or GBK files. 53 | 54 | .. code-block:: none 55 | 56 | $ funannotate util contrast 57 | 58 | Usage: funannotate util contrast 59 | version: 1.8.16 60 | 61 | Description: Compare/contrast annotations to a reference. Annotations in either GBK or GFF3 format. 62 | 63 | Arguments: -r, --reference Reference Annotation. GFF3 or GBK format 64 | -f, --fasta Genome FASTA. Required if GFF3 used 65 | -q, --query Annotation query. GFF3 or GBK format 66 | -o, --output Output basename 67 | -c, --calculate_pident Measure protein percent identity between query and reference 68 | 69 | Format Conversion 70 | --------------------------------------- 71 | 72 | .. code-block:: none 73 | 74 | $ funannotate util tbl2gbk 75 | 76 | Usage: funannotate util tbl2gbk 77 | version: 1.8.16 78 | 79 | Description: Convert NCBI TBL annotations + Genome FASTA to GenBank format. 80 | 81 | Required: -i, --tbl Annotation in NCBI tbl format 82 | -f, --fasta Genome FASTA file. 83 | -s, --species Species name, use quotes for binomial, e.g. "Aspergillus fumigatus" 84 | Optional: 85 | --isolate Isolate name 86 | --strain Strain name 87 | --sbt NCBI Submission Template file 88 | -t, --tbl2asn Assembly parameters for tbl2asn. Example: "-l paired-ends" 89 | -o, --output Output basename 90 | 91 | 92 | .. code-block:: none 93 | 94 | $ funannotate util gbk2parts 95 | 96 | Usage: funannotate util gbk2parts 97 | version: 1.8.16 98 | 99 | Description: Convert GenBank file to its individual components (parts): tbl, protein 100 | FASTA, transcript FASTA, and contig/scaffold FASTA. 101 | 102 | Arguments: -g, --gbk Input Genome in GenBank format 103 | -o, --output Output basename 104 | 105 | 106 | .. code-block:: none 107 | 108 | $ funannotate util gff2prot 109 | 110 | Usage: funannotate util gff2prot 111 | version: 1.8.16 112 | 113 | Description: Convert GFF3 file and genome FASTA to protein sequences. FASTA output to stdout. 114 | 115 | Arguments: -g, --gff3 Reference Annotation. GFF3 format 116 | -f, --fasta Genome FASTA file. 117 | --no_stop Don't print stop codons 118 | 119 | .. code-block:: none 120 | 121 | $ funannotate util gff2tbl 122 | 123 | Usage: funannotate util gff2tbl 124 | version: 1.8.16 125 | 126 | Description: Convert GFF3 file into NCBI tbl format. Tbl output to stdout. 127 | 128 | Arguments: 129 | -g, --gff3 Reference Annotation. GFF3 format 130 | -f, --fasta Genome FASTA file. 131 | 132 | 133 | .. code-block:: none 134 | 135 | $ funannotate util bam2gff3 136 | 137 | Usage: funannotate util bam2gff3 138 | version: 1.8.16 139 | 140 | Description: Convert BAM coord-sorted transcript alignments to GFF3 format. 141 | 142 | Arguments: -i, --bam BAM file (coord-sorted) 143 | -o, --output GFF3 output file 144 | 145 | 146 | .. 
code-block:: none 147 | 148 | $ funannotate util protein2genome 149 | 150 | Usage: funannotate util prot2genome 151 | version: 1.8.16 152 | 153 | Description: Map proteins to genome using exonerate. Output is EVM compatible GFF3 file. 154 | 155 | Arguments: -g, --genome Genome FASTA format (Required) 156 | -p, --proteins Proteins FASTA format (Required) 157 | -o, --out GFF3 output file (Required) 158 | -f, --filter Pre-filtering method. Default: diamond [diamond,tblastn] 159 | -t, --tblastn_out Output to save tblastn results. Default: off 160 | --tblastn Use existing tblastn results 161 | --ploidy Ploidy of assembly. Default: 1 162 | --maxintron Max intron length. Default: 3000 163 | --cpus Number of cpus to use. Default: 2 164 | --EVM_HOME Location of Evidence Modeler home directory. Default: $EVM_HOME 165 | --tmpdir Volume/location to write temporary files. Default: /tmp 166 | --logfile Logfile output file 167 | 168 | .. code-block:: none 169 | 170 | $ funannotate util stringtie2gff3 171 | 172 | Usage: funannotate util stringtie2gff3 173 | version: 1.8.16 174 | 175 | Description: Convert StringTIE GTF format to GFF3 funannotate compatible format. Output 176 | to stdout. 177 | 178 | Arguments: -i, --input GTF file from stringTIE 179 | 180 | .. code-block:: none 181 | 182 | $ funannotate util quarry2gff3 183 | 184 | Usage: funannotate util quarry2gff3 185 | version: 1.8.16 186 | 187 | Description: Convert CodingQuarry output GFF to proper GFF3 format. Output to stdout. 188 | 189 | Arguments: -i, --input CodingQuarry output GFF file. (PredictedPass.gff3) 190 | 191 | .. code-block:: none 192 | 193 | $ funannotate util gff-rename 194 | 195 | Usage: funannotate util gff-rename 196 | version: 1.8.16 197 | 198 | Description: Sort GFF3 file by contigs and rename gene models. 199 | 200 | Arguments: -g, --gff3 Reference Annotation. GFF3 format 201 | -f, --fasta Genome FASTA file. 202 | -o, --out Output GFF3 file 203 | -l, --locus_tag Locus tag to use. Default: FUN 204 | -n, --numbering Start number for genes. Default: 1 205 | -------------------------------------------------------------------------------- /funannotate-docker: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | realpath() { 4 | OURPWD=$PWD 5 | cd "$(dirname "$1")" 6 | LINK=$(readlink "$(basename "$1")") 7 | while [ "$LINK" ]; do 8 | cd "$(dirname "$LINK")" 9 | LINK=$(readlink "$(basename "$1")") 10 | done 11 | REALPATH="$PWD/$(basename "$1")" 12 | cd "$OURPWD" 13 | echo "$REALPATH" 14 | } 15 | 16 | timezone() { 17 | if [ "$(uname)" == "Darwin" ]; then 18 | TZ=$(readlink /etc/localtime | sed 's#/var/db/timezone/zoneinfo/##') 19 | else 20 | TZ=$(readlink /etc/timezone) 21 | fi 22 | echo $TZ 23 | } 24 | 25 | # Only allocate tty if one is detected. 
See - https://stackoverflow.com/questions/911168 26 | if [[ -t 0 ]]; then IT+=(-i); fi 27 | if [[ -t 1 ]]; then IT+=(-t); fi 28 | 29 | USER="$(id -u $(logname)):$(id -g $(logname))" 30 | WORKDIR="$(realpath .)" 31 | MOUNT="type=bind,source=${WORKDIR},target=${WORKDIR}" 32 | TZ="$(timezone)" 33 | 34 | exec docker run --rm "${IT[@]}" --user "${USER}" -e TZ="${TZ}" --workdir "${WORKDIR}" --mount "${MOUNT}" nextgenusfs/funannotate:latest funannotate "$@" -------------------------------------------------------------------------------- /funannotate-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextgenusfs/funannotate/033a883081a83a161798ecc17eaf77b16b5c552b/funannotate-logo.png -------------------------------------------------------------------------------- /funannotate-podman: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | realpath() { 4 | OURPWD=$PWD 5 | cd "$(dirname "$1")" 6 | LINK=$(readlink "$(basename "$1")") 7 | while [ "$LINK" ]; do 8 | cd "$(dirname "$LINK")" 9 | LINK=$(readlink "$(basename "$1")") 10 | done 11 | REALPATH="$PWD/$(basename "$1")" 12 | cd "$OURPWD" 13 | echo "$REALPATH" 14 | } 15 | 16 | timezone() { 17 | if [ "$(uname)" == "Darwin" ]; then 18 | TZ=$(readlink /etc/localtime | sed 's#/var/db/timezone/zoneinfo/##') 19 | else 20 | TZ=$(readlink /etc/timezone) 21 | fi 22 | echo $TZ 23 | } 24 | 25 | # Only allocate tty if one is detected. See - https://stackoverflow.com/questions/911168 26 | if [[ -t 0 ]]; then IT+=(-i); fi 27 | if [[ -t 1 ]]; then IT+=(-t); fi 28 | 29 | USER="$(id -u $(logname)):$(id -g $(logname))" 30 | WORKDIR="$(realpath .)" 31 | MOUNT="type=bind,source=${WORKDIR},target=${WORKDIR}" 32 | TZ="$(timezone)" 33 | 34 | 35 | exec podman run --rm "${IT[@]}" -e TZ="${TZ}" --workdir "${WORKDIR}" --mount "${MOUNT}" nextgenusfs/funannotate:latest funannotate "$@" 36 | # ` --user "${USER}" ` is not needed in rootless mode 37 | -------------------------------------------------------------------------------- /funannotate/__init__.py: -------------------------------------------------------------------------------- 1 | from .__version__ import __version__ 2 | -------------------------------------------------------------------------------- /funannotate/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (1, 8, 17) 2 | 3 | __version__ = ".".join(map(str, VERSION)) 4 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/augustus_parallel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import subprocess 5 | import os 6 | import uuid 7 | import shutil 8 | import argparse 9 | from Bio import SeqIO 10 | import funannotate.library as lib 11 | 12 | # setup menu with argparse 13 | 14 | 15 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 16 | def __init__(self, prog): 17 | super(MyFormatter, self).__init__(prog, max_help_position=48) 18 | 19 | 20 | parser = argparse.ArgumentParser(prog='augustus_parallel.py', 21 | usage="%(prog)s [options] -i genome.fasta -s botrytis_cinera -o prediction_output_base", 22 | description='''Script runs augustus in parallel to use multiple processors''', 23 | epilog="""Written by Jon Palmer (2016) nextgenusfs@gmail.com""", 24 | formatter_class=MyFormatter) 25 | parser.add_argument('-i', '--input', 
required=True, 26 | help='Genome in FASTA format') 27 | parser.add_argument('-o', '--out', required=True, 28 | help='Basename of output files') 29 | parser.add_argument('-s', '--species', required=True, 30 | help='Augustus species name') 31 | parser.add_argument('--hints', help='Hints file (PE)') 32 | parser.add_argument('--cpus', default=2, type=int, 33 | help='Number of CPUs to run') 34 | parser.add_argument('-v', '--debug', action='store_true', 35 | help='Keep intermediate files') 36 | parser.add_argument('--logfile', default='augustus-parallel.log', 37 | help='logfile') 38 | parser.add_argument('--local_augustus') 39 | parser.add_argument('--AUGUSTUS_CONFIG_PATH') 40 | parser.add_argument('-e', '--extrinsic', help='augustus extrinsic file') 41 | parser.add_argument('--no-progress', dest='progress', action='store_false', 42 | help='no progress on multiprocessing') 43 | args = parser.parse_args() 44 | 45 | # check for augustus installation 46 | if args.AUGUSTUS_CONFIG_PATH: 47 | AUGUSTUS = args.AUGUSTUS_CONFIG_PATH 48 | else: 49 | try: 50 | AUGUSTUS = os.environ["AUGUSTUS_CONFIG_PATH"] 51 | except KeyError: 52 | print("$AUGUSTUS_CONFIG_PATH environmental variable not found, Augustus is not properly configured") 53 | sys.exit(1) 54 | 55 | if AUGUSTUS.endswith('config'): 56 | AUGUSTUS_BASE = AUGUSTUS.replace('config', '') 57 | elif AUGUSTUS.endswith('config'+os.sep): 58 | AUGUSTUS_BASE = AUGUSTUS.replace('config'+os.sep, '') 59 | else: 60 | AUGUSTUS_BASE = AUGUSTUS 61 | 62 | # see if local species passed 63 | if args.local_augustus: 64 | LOCALAUGUSTUS = args.local_augustus 65 | else: 66 | LOCALAUGUSTUS = AUGUSTUS 67 | 68 | # setup hints and extrinic input, hard coded for protein and transcript alignments from funannotate 69 | extrinsic = '--extrinsicCfgFile={:}'.format(args.extrinsic) 70 | 71 | 72 | def countGFFgenes(input): 73 | count = 0 74 | with open(input, 'r') as f: 75 | for line in f: 76 | if "\tgene\t" in line: 77 | count += 1 78 | return count 79 | 80 | 81 | def runAugustus(Input): 82 | if '_part' in Input: 83 | chr = Input.split('_part')[0] 84 | else: 85 | chr = Input 86 | species = '--species='+args.species 87 | hints_input = '--hintsfile='+args.hints 88 | aug_out = os.path.join(tmpdir, Input+'.augustus.gff3') 89 | core_cmd = ['augustus', species, '--AUGUSTUS_CONFIG_PATH={:}'.format(LOCALAUGUSTUS), '--softmasking=1', 90 | '--gff3=on', '--UTR=off', '--stopCodonExcludedFromCDS=False', os.path.join(tmpdir, chr+'.fa')] 91 | if args.hints: 92 | core_cmd.insert(2, extrinsic) 93 | core_cmd.insert(3, hints_input) 94 | if Input in ranges: 95 | start = ranges.get(Input)[0] 96 | end = ranges.get(Input)[1] 97 | core_cmd.insert(2, '--predictionStart='+str(start)) 98 | core_cmd.insert(3, '--predictionEnd='+str(end)) 99 | # try using library module 100 | lib.runSubprocess(core_cmd, '.', lib.log, capture_output=aug_out) 101 | 102 | 103 | log_name = args.logfile 104 | if os.path.isfile(log_name): 105 | os.remove(log_name) 106 | 107 | # initialize script, log system info and cmd issue at runtime 108 | lib.setupLogging(log_name) 109 | cmd_args = " ".join(sys.argv)+'\n' 110 | lib.log.debug(cmd_args) 111 | 112 | lib.log.debug('AUGUSTUS_CONFIG_PATH={:}'.format(AUGUSTUS)) 113 | lib.log.debug('Augustus Base directory={:}'.format(AUGUSTUS_BASE)) 114 | lib.log.debug('Local Augustus path={:}'.format(LOCALAUGUSTUS)) 115 | 116 | # first step is to split input fasta file into individual files in tmp folder 117 | lib.log.debug("Splitting contigs and hints files") 118 | tmpdir = 
'augustus_tmp_'+str(uuid.uuid4()) 119 | os.makedirs(tmpdir) 120 | scaffolds = [] 121 | global ranges 122 | ranges = {} 123 | with open(args.input, 'r') as InputFasta: 124 | for record in SeqIO.parse(InputFasta, 'fasta'): 125 | contiglength = len(record.seq) 126 | if contiglength > 500000: # split large contigs 127 | num_parts = contiglength / 500000 + 1 128 | chunks = contiglength / num_parts 129 | for i in range(0, int(num_parts)): 130 | name = str(record.id)+'_part'+str(i+1) 131 | scaffolds.append(name) 132 | outputfile = os.path.join(tmpdir, str(record.id)+'.fa') 133 | if i == 0: # this is first record 134 | start = 1 135 | end = chunks + 10000 136 | else: 137 | start = end - 10000 138 | end = start + chunks + 10000 139 | if end > contiglength: 140 | end = contiglength 141 | if not name in ranges: 142 | ranges[name] = (start, end) 143 | with open(outputfile, 'w') as output: 144 | SeqIO.write(record, output, 'fasta') 145 | else: 146 | name = str(record.id) 147 | scaffolds.append(name) 148 | outputfile = os.path.join(tmpdir, name+'.fa') 149 | with open(outputfile, 'w') as output: 150 | SeqIO.write(record, output, 'fasta') 151 | 152 | # now loop through each scaffold running augustus 153 | if args.cpus > len(scaffolds): 154 | num = len(scaffolds) 155 | else: 156 | num = args.cpus 157 | lib.log.debug("Running Augustus on %i chunks, using %i CPUs" % 158 | (len(scaffolds), num)) 159 | lib.runMultiProgress(runAugustus, scaffolds, num, progress=args.progress) 160 | 161 | 162 | lib.log.debug("Augustus prediction is finished, now concatenating results") 163 | with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'w') as output: 164 | for file in scaffolds: 165 | file = os.path.join(tmpdir, file+'.augustus.gff3') 166 | with open(file) as input: 167 | output.write(input.read()) 168 | 169 | if lib.checkannotations(os.path.join(tmpdir, 'augustus_all.gff3')): 170 | lib.log.debug('Augustus finished, now joining results') 171 | if lib.which_path('join_aug_pred.pl'): 172 | join_script = 'join_aug_pred.pl' 173 | else: 174 | join_script = os.path.join(AUGUSTUS_BASE, 'scripts', 'join_aug_pred.pl') 175 | 176 | cmd = '{:} < {:} > {:}'.format(join_script, os.path.join( 177 | tmpdir, 'augustus_all.gff3'), args.out) 178 | lib.log.debug(cmd) 179 | 180 | with open(args.out, 'w') as finalout: 181 | with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'r') as infile: 182 | subprocess.call([join_script], stdin=infile, stdout=finalout) 183 | 184 | if not args.debug: 185 | shutil.rmtree(tmpdir) 186 | lib.log.info('{:,} predictions from Augustus'.format(countGFFgenes(args.out))) 187 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/enrichment_parallel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import argparse 6 | import subprocess 7 | import funannotate.library as lib 8 | 9 | 10 | def runGOenrichment(input): 11 | basename = os.path.basename(input).replace('.txt', '') 12 | goa_out = os.path.join(args.out, basename+'.go.enrichment.txt') 13 | go_log = os.path.join(args.out, basename+'.go.enrichment.log') 14 | if not lib.checkannotations(goa_out): 15 | cmd = ['find_enrichment.py', '--obo', os.path.join(FUNDB, 'go.obo'), 16 | '--pval', '0.001', '--alpha', '0.001', '--method', 'fdr', 17 | '--outfile', goa_out, input, os.path.join(args.input, 'population.txt'), 18 | os.path.join(args.input, 'associations.txt')] 19 | with open(go_log, 'w') as outfile: 20 | 
outfile.write('{}\n'.format(' '.join(cmd))) 21 | with open(go_log, 'a') as outfile: 22 | subprocess.call(cmd, stdout=outfile, stderr=outfile) 23 | 24 | 25 | def GO_safe_run(*args, **kwargs): 26 | """Call run(), catch exceptions.""" 27 | try: 28 | runGOenrichment(*args, **kwargs) 29 | except Exception as e: 30 | print(("error: %s run(*%r, **%r)" % (e, args, kwargs))) 31 | 32 | # setup menu with argparse 33 | 34 | 35 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 36 | def __init__(self, prog): 37 | super(MyFormatter, self).__init__(prog, max_help_position=48) 38 | 39 | 40 | parser = argparse.ArgumentParser(prog='enrichment_parallel.py', 41 | description='''Run goatools enrichment in parallel.''', 42 | epilog="""Written by Jon Palmer (2019) nextgenusfs@gmail.com""", 43 | formatter_class=MyFormatter) 44 | parser.add_argument('-i', '--input', required=True, 45 | help='folder of protein fasta files') 46 | parser.add_argument('-d', '--db', required=True, 47 | help='location of HMM database') 48 | parser.add_argument('-c', '--cpus', default=1, type=int, 49 | help='location of HMM database') 50 | parser.add_argument('-o', '--out', required=True, help='output file') 51 | args = parser.parse_args() 52 | 53 | global FUNDB, FNULL 54 | FUNDB = args.db 55 | FNULL = open(os.devnull, 'w') 56 | 57 | # now loop through each genome comparing to population 58 | file_list = [] 59 | for f in os.listdir(args.input): 60 | if f.startswith('associations'): 61 | continue 62 | if f.startswith('population'): 63 | continue 64 | file = os.path.join(args.input, f) 65 | if lib.checkannotations(file): 66 | file_list.append(file) 67 | else: 68 | print(' WARNING: skipping {} as no GO terms'.format(f)) 69 | 70 | # run over multiple CPUs 71 | if len(file_list) > args.cpus: 72 | procs = args.cpus 73 | else: 74 | procs = len(file_list) 75 | 76 | lib.runMultiProgress(GO_safe_run, file_list, procs, progress=False) 77 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/fasta2agp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # based on fasta2agp.pl from david.studholme@tsl.ac.uk 5 | # rewritten in python by Jason Stajich @hyphaltip 6 | 7 | import os 8 | import sys 9 | import re 10 | import csv 11 | import argparse 12 | import warnings 13 | from Bio import SeqIO 14 | from Bio.Seq import Seq 15 | from Bio.SeqRecord import SeqRecord 16 | 17 | def parse_scaffolds_makeagp(scaffolds,agpout,ctgsout): 18 | x = 0 19 | i = 0 20 | spadesnamepat = re.compile(r'^NODE_(\d+)_length_\d+_cov_\d+') 21 | numnamepat = re.compile(r'^(\d+)$') 22 | validSeq = re.compile(r'^[ACGTRYSWKMBDHVN]+$',flags=re.IGNORECASE) 23 | with open(agpout, 'w') as agpoutfh: 24 | csvout = csv.writer(agpoutfh,delimiter="\t",lineterminator="\n") 25 | with open(ctgsout,"w") as ctgoutfh: 26 | with open(scaffolds, 'r') as scaff_in: 27 | for seq in SeqIO.parse(scaff_in, "fasta"): 28 | supercontig_id = seq.id 29 | supercontig_seq = seq.seq 30 | supercontig_desc = seq.description 31 | supercontig_length = len(seq); 32 | x = 0 33 | m = spadesnamepat.match(supercontig_id) or spadesnamepat.match(supercontig_id) 34 | if m: 35 | supercontig_id = "scf_%s"%(m.match(1)) 36 | start_pos = 1 # keep track of whereabouts in this supercontig we are 37 | substring_sequences = {} 38 | for substring_sequence in re.split(r'(N{10,})',str(supercontig_seq),maxsplit=0,flags=re.IGNORECASE): 39 | if len(substring_sequence) == 0: 40 | 
continue 41 | object1 = supercontig_id 42 | object_beg2 = start_pos 43 | object_end3 = start_pos + len(substring_sequence) - 1 44 | part_number4 = x 45 | x += 1 46 | component_type5 = None 47 | component_id6a = None 48 | gap_length6b = None 49 | component_beg7a = None 50 | gap_type7b = None 51 | component_end8a = None 52 | linkage8b = None 53 | orientation9a = None 54 | filler9b = None 55 | if re.match(r'^N+$',substring_sequence): 56 | ### This is poly-N gap between contigs 57 | component_type5 = 'N' 58 | gap_length6b = len(substring_sequence) 59 | gap_type7b = 'scaffold' 60 | linkage8b = 'yes' 61 | filler9b = 'paired-ends' 62 | elif validSeq.match(substring_sequence): 63 | ### This is a contig 64 | i+=1 # a counter, used for generating unique contig names 65 | component_type5 = 'W' 66 | component_id6a = "contig_%d"%(i) 67 | component_beg7a = 1 68 | component_end8a = len(substring_sequence) 69 | orientation9a = '+' 70 | ### Print FastA formatted contig 71 | record = SeqRecord( Seq(substring_sequence), 72 | id=component_id6a, 73 | description="") 74 | SeqIO.write(record, ctgoutfh, "fasta") 75 | else: 76 | print("Illegal characters in sequence") 77 | print(substring_sequence) 78 | return 79 | 80 | start_pos += len (substring_sequence) 81 | part_number4 += 1 82 | if component_type5 == 'N': 83 | ### print AGP line for gap 84 | csvout.writerow([object1,object_beg2,object_end3, part_number4,component_type5,gap_length6b,gap_type7b,linkage8b,filler9b]) 85 | else: 86 | ### print AGP line for contig 87 | csvout.writerow([object1,object_beg2,object_end3, part_number4,component_type5,component_id6a,component_beg7a,component_end8a,orientation9a]) 88 | 89 | 90 | def main(args): 91 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 92 | def __init__(self, prog): 93 | super(MyFormatter, self).__init__(prog, max_help_position=48) 94 | 95 | 96 | parser = argparse.ArgumentParser( 97 | prog='fasta2agp.py', 98 | description='''Convert FastA format scaffolds file into contigs file and print the AGP based on parsing gaps (N runs).''', 99 | #usage='''fasta2agp.py scaffolds.fa > scaffolds.agp''', 100 | epilog="""Written by Jason Stajich @hyphaltip (2021) jasonstajich.phd@gmail.com""", 101 | formatter_class=MyFormatter) 102 | parser.add_argument('--ext', default='contigs.fsa', 103 | help='Default extensions for output contigs file') 104 | parser.add_argument('scaffoldfile', nargs='?',help='Scaffolds FastA file') 105 | parser.add_argument('agpfile', nargs='?',type=argparse.FileType('w'), default=sys.stdout, 106 | help='AGP output file (defaults to STDOUT)') 107 | args = parser.parse_args(args) 108 | ctgfile = args.scaffoldfile + "." 
+ args.ext 109 | m = re.match(r'^(\S+)\.(fa|fasta|fsa)$',args.scaffoldfile) 110 | if m: 111 | ctgfile = m.group(1) 112 | m = re.match(r'^(\S+)\.scaffolds?$',ctgfile) 113 | if m: 114 | ctgfile = "{}.{}".format(m.group(1),args.ext) 115 | # run cmd 116 | parse_scaffolds_makeagp(args.scaffoldfile,args.agpfile,ctgfile) 117 | 118 | if __name__ == "__main__": 119 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /funannotate/aux_scripts/filterIntronsFindStrand.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | #################################################################################################### 4 | # # 5 | # filterIntronsFindStrand.pl - finds corresponding strand for introns in fasta file # 6 | # optionally set the score column to the 'mult' entry with --score # 7 | # # 8 | # Author: Simone Lange # 9 | # # 10 | # Contact: katharina.hoff@uni-greifswald.de # 11 | # # 12 | # Release date: January 7th 2015 # 13 | # # 14 | # This script is under the Artistic Licence # 15 | # (http://www.opensource.org/licenses/artistic-license.php) # 16 | # # 17 | #################################################################################################### 18 | 19 | # ------------------------------------------------------------------ 20 | # | file creation and findStrand() | Simone Lange |06.10.2014 | 21 | # | add getScore() for score option | |07.10.2014 | 22 | # | add error message if sequence | |23.01.2015 | 23 | # | name of hints and fasta file do | | | 24 | # | not match -> program stops then | | | 25 | # ------------------------------------------------------------------ 26 | 27 | use strict; 28 | use warnings; 29 | use Getopt::Long; 30 | 31 | 32 | 33 | my $usage = <<'ENDUSAGE'; 34 | 35 | filterIntronsFindStrand.pl find corresponding strand for introns from two input files genome.fa and introns.gff 36 | 37 | SYNOPSIS 38 | 39 | filterIntronsFindStrand.pl genome.fa introns.gff [OPTIONS] > introns.s.f.gff 40 | 41 | genome.fa DNA file in fasta format 42 | introns.gff corresponding introns file in gff format 43 | 44 | 45 | OPTIONS 46 | 47 | --help Print this help message 48 | --allowed=gtag,gcaag,atac Allowed acceptor and donor splice site types 49 | --score Set score to 'mult' entry or '1', if the last column does not contain a 'mult' entry 50 | --genome=genome.fa see above 51 | --introns=introns.gff see above 52 | 53 | 54 | 55 | 56 | DESCRIPTION 57 | 58 | Example: 59 | 60 | filterIntronsFindStrand.pl genome.fa introns.gff [OPTIONS] > introns.s.f.gff 61 | 62 | ENDUSAGE 63 | 64 | 65 | my ($genome, $introns, @allowed, $mult_score, $help); 66 | my %annos; # keys: sequences, elements: annotations 67 | my $seqname; 68 | my $seq; 69 | 70 | if(@ARGV==0){ 71 | print "$usage\n"; 72 | exit(0); 73 | } 74 | 75 | GetOptions( 'introns=s' => \$introns, 76 | 'genome=s' => \$genome, 77 | 'score!' => \$mult_score, 78 | 'allowed=s' => \@allowed, 79 | 'help!' => \$help); 80 | 81 | if($help){ 82 | print $usage; 83 | exit(0); 84 | } 85 | 86 | # set $genome 87 | if(!defined($genome)){ 88 | $genome = $ARGV[0]; 89 | } 90 | 91 | # set $introns 92 | if(!defined($introns)){ 93 | $introns = $ARGV[1]; 94 | } 95 | 96 | # set allowed splice site types 97 | if(@allowed){ 98 | @allowed = split(/[\s,]/, join(',',@allowed)); 99 | }else{ 100 | @allowed = ("gtag", "gcag", "atac"); 101 | } 102 | 103 | # check whether the files exist 104 | if(! -f "$genome"){ 105 | print "Genome file $genome does not exist. 
Please check.\n"; 106 | exit(1); 107 | } 108 | 109 | if(! -f "$introns"){ 110 | print "Introns file $introns does not exist. Please check.\n"; 111 | exit(1); 112 | } 113 | 114 | # genome file in fasta format 115 | open (FASTA, "<".$genome) or die "Cannot open file: $genome\n"; 116 | $/="\n>"; 117 | while() { 118 | /[>]*(.*)\n/; 119 | $seqname = $1; 120 | $seq = $'; 121 | $seq =~ s/>//; 122 | $seq =~ s/\n//g; 123 | $annos{$seqname} = $seq; 124 | } 125 | close(FASTA) or die("Could not close fasta file $genome!\n"); 126 | 127 | # introns hintsfile in gff format 128 | open (INTRONS, "<".$introns) or die "Cannot open file: $introns\n"; 129 | $/="\n"; 130 | while(){ 131 | chomp; 132 | my @line = split(/\t/, $_); 133 | my $strand = findStrand($line[0], $line[3], $line[4]); 134 | my $score; 135 | if($mult_score){ 136 | $score = getScore($line[8]); 137 | }else{ 138 | $score = $line[5]; 139 | } 140 | if($strand){ 141 | print "$line[0]\t$line[1]\t$line[2]\t$line[3]\t$line[4]\t$score\t$strand\t$line[7]\t$line[8]\n"; 142 | } 143 | } 144 | close(INTRONS) or die("Could not close introns file $introns!\n"); 145 | 146 | 147 | ############### sub functions ############## 148 | 149 | 150 | # find strand for introns 151 | # look up start and end position and check if it matches allowed splice site patterns 152 | sub findStrand{ 153 | my $seqname = shift; 154 | my $start = shift; 155 | my $end = shift; 156 | my $type; 157 | my $reverse; 158 | if(defined($annos{$seqname})){ 159 | $type = lc(substr($annos{$seqname}, $start-1,2)).lc(substr($annos{$seqname}, $end-2,2)); 160 | $reverse = reverse($type); 161 | $reverse =~ tr/agct/tcga/; 162 | foreach (@allowed){ 163 | if($_ eq $type){ 164 | return "+"; 165 | }elsif($_ eq $reverse){ 166 | return "-"; 167 | } 168 | } 169 | return 0; 170 | }else{ 171 | print STDERR "WARNING: '$seqname' does not match any sequence in the fasta file. Maybe the two files do not belong together.\n"; 172 | # print STDERR "The program terminates here.\n"; 173 | # exit(1) 174 | } 175 | } 176 | 177 | # get score from mult entry 178 | sub getScore{ 179 | my $column = shift; 180 | my $score; 181 | if($column =~ m/mult=(\d+)/){ 182 | $score = $1; 183 | }else{ 184 | $score = 1; 185 | } 186 | return $score; 187 | } 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/genemark_gtf2gff3.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | #script from the Maker 2.31.8 distribution 4 | 5 | eval 'exec /usr/bin/perl -S $0 ${1+"$@"}' 6 | if 0; # not running under some shell 7 | 8 | use warnings; 9 | use strict; 10 | 11 | #usage statement 12 | my $usage = " 13 | USAGE: 14 | genemark_gtf2gff3 15 | 16 | This converts genemark's GTF output into GFF3 format. 17 | The script prints to STDOUT. Use the '>' character to 18 | redirect output into a file. 19 | 20 | "; 21 | 22 | my $file = shift; 23 | 24 | #error checking 25 | if (! $file ){ 26 | print $usage; 27 | exit; 28 | } 29 | 30 | if(! -e $file){ 31 | warn "ERROR: The file $file does not exist\n"; 32 | print $usage; 33 | } 34 | 35 | #parse file 36 | open(IN, "< $file"); 37 | my %genes; 38 | while(my $line = ){ 39 | chomp $line; 40 | my @F = split(/\t/, $line); 41 | next if(@F < 8); 42 | next if($F[2] ne 'CDS'); 43 | 44 | #genemark by default only fills in the ids and not the names 45 | my ($g) = $F[8] =~ /gene_id \"([^\"]+)\"/; 46 | ($g) = $F[8] =~ /gene_name \"([^\"]+)\"/ if(! 
defined $g); 47 | my ($t) = $F[8] =~ /transcript_id \"([^\"]+)\"/; 48 | ($t) = $F[8] =~ /transcript_name \"([^\"]+)\"/ if(! defined $t); 49 | 50 | die "ERROR: Cannot understand format\n". 51 | "expecting -> gene_id \"xxxx\"\; transcript_id \"xxxx\"\;\n" 52 | if(! defined $g || ! defined $t); 53 | 54 | #get cintig name 55 | my $s = $F[0]; 56 | 57 | #set needed column information 58 | $genes{$s}{$g}{seqid} = $F[0] if(! $genes{$s}{$g}{seqid}); 59 | $genes{$s}{$g}{source} = $F[1] if(! $genes{$s}{$g}{source}); 60 | $genes{$s}{$g}{strand} = $F[6] if(! $genes{$s}{$g}{strand}); 61 | 62 | $genes{$s}{$g}{mRNA}{$t}{seqid} = $F[0] if(! $genes{$s}{$g}{mRNA}{$t}{seqid}); 63 | $genes{$s}{$g}{mRNA}{$t}{source} = $F[1] if(! $genes{$s}{$g}{mRNA}{$t}{source}); 64 | $genes{$s}{$g}{mRNA}{$t}{strand} = $F[6] if(! $genes{$s}{$g}{mRNA}{$t}{strand}); 65 | $genes{$s}{$g}{mRNA}{$t}{parent} = $g if(! $genes{$s}{$g}{mRNA}{$t}{parent}); 66 | 67 | #set start/end of gene 68 | $genes{$s}{$g}{B} = $F[3] if(! defined $genes{$s}{$g}{B} || $F[3] < $genes{$s}{$g}{B}); 69 | $genes{$s}{$g}{E} = $F[4] if(! defined $genes{$s}{$g}{E} || $F[4] > $genes{$s}{$g}{E}); 70 | 71 | #set start/end of transcript 72 | $genes{$s}{$g}{mRNA}{$t}{B} = $F[3] if(! defined $genes{$s}{$g}{mRNA}{$t}{B} || 73 | $F[3] < $genes{$s}{$g}{mRNA}{$t}{B} 74 | ); 75 | $genes{$s}{$g}{mRNA}{$t}{E} = $F[4] if(! defined $genes{$s}{$g}{mRNA}{$t}{E} || 76 | $F[4] > $genes{$s}{$g}{mRNA}{$t}{E} 77 | ); 78 | 79 | #add CDS to transcript 80 | my %c = (seqid => $F[0], 81 | source => $F[1], 82 | B => $F[3], 83 | E => $F[4], 84 | score => $F[5], 85 | strand => $F[6], 86 | phase => $F[7], 87 | parent => $t 88 | ); 89 | 90 | push (@{$genes{$s}{$g}{mRNA}{$t}{CDS}}, \%c); 91 | } 92 | close(IN); 93 | 94 | 95 | #build GFF3 structure and dump to file 96 | print "\#\#gff-version 3\n"; 97 | gff3_contig(\%genes); 98 | 99 | #-------------------------------------------------------------------------- 100 | #-------------------------------- SUBS ------------------------------------ 101 | #-------------------------------------------------------------------------- 102 | sub gff3_contig { 103 | my $hash = shift; 104 | foreach my $f (keys %$hash){ 105 | gff3_gene($hash->{$f}); 106 | } 107 | } 108 | #-------------------------------------------------------------------------- 109 | sub gff3_gene { 110 | my $hash = shift; 111 | 112 | foreach my $g (sort {$hash->{$a}{B} <=> $hash->{$b}{B}} keys %$hash) { 113 | my $gene = $hash->{$g}; 114 | 115 | print join("\t",$gene->{seqid},$gene->{source},'gene',$gene->{B}, 116 | $gene->{E},'.',$gene->{strand},'.',sprintf('ID=%s;Name=%s',$g,$g)),"\n"; 117 | 118 | gff3_mRNA($gene->{mRNA}); 119 | } 120 | } 121 | #-------------------------------------------------------------------------- 122 | sub gff3_mRNA { 123 | my $hash = shift; 124 | 125 | foreach my $t (keys %$hash){ 126 | my $mRNA = $hash->{$t}; 127 | print join("\t",$mRNA->{seqid},$mRNA->{source},"mRNA", 128 | $mRNA->{B},$mRNA->{E},'.',$mRNA->{strand},'.', 129 | sprintf('ID=%s;Name=%s;Parent=%s',$t,$t,$mRNA->{parent})),"\n"; 130 | 131 | gff3_CDS($mRNA->{CDS}); 132 | } 133 | } 134 | #-------------------------------------------------------------------------- 135 | sub gff3_CDS { 136 | my $array = shift; 137 | 138 | #define the id 139 | my $i = 1; 140 | 141 | my @exons; 142 | my @CDSs; 143 | foreach my $c (@$array){ 144 | #make exon line 145 | my $id = $c->{parent} .":exon:". 
$i; 146 | my $exon = join("\t",$c->{seqid},$c->{source},"exon", 147 | $c->{B},$c->{E},'.',$c->{strand},'.', 148 | sprintf('ID=%s;Name=%s;Parent=%s',$id,$id,$c->{parent}))."\n"; 149 | push(@exons, $exon); 150 | 151 | #make CDS line 152 | $id = $c->{parent} .":CDS:". $i++; 153 | my $cds = join("\t",$c->{seqid},$c->{source},"CDS",$c->{B},$c->{E},$c->{score}, 154 | $c->{strand},$c->{phase},sprintf("ID=%s;Name=%s;Parent=%s",$id,$id,$c->{parent}))."\n"; 155 | push(@CDSs, $cds); 156 | } 157 | 158 | #print all exons together then all CDSs together 159 | print join('', @exons); 160 | print join('', @CDSs); 161 | } 162 | 163 | 164 | 165 | 166 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/getEggNog.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$1" ]; then 4 | echo "Usage: getEggNog.sh fuNOG directory" 5 | exit 6 | fi 7 | 8 | 9 | EGGNOG=$1 10 | wget -c --tries=0 --read-timeout=20 http://eggnogdb.embl.de/download/eggnog_4.5/data/$EGGNOG/$EGGNOG.hmm.tar.gz 11 | wget -c --tries=0 --read-timeout=20 http://eggnogdb.embl.de/download/eggnog_4.5/data/$EGGNOG/$EGGNOG.annotations.tsv.gz 12 | gunzip $EGGNOG.annotations.tsv.gz 13 | tar -zxf $EGGNOG.hmm.tar.gz 14 | find $EGGNOG\_hmm/ -maxdepth 1 -type f -name '*.hmm' -exec cat '{}' \; > $EGGNOG\_4.5.hmm 15 | hmmpress $EGGNOG\_4.5.hmm 16 | rm $EGGNOG.hmm.tar.gz 17 | rm -R $EGGNOG\_hmm/ 18 | for i in $EGGNOG\*; do 19 | mv $i $2/ 20 | done 21 | echo "Done, $EGGNOG DB is now ready to use" -------------------------------------------------------------------------------- /funannotate/aux_scripts/hmmer_parallel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import argparse 6 | import warnings 7 | import subprocess 8 | from natsort import natsorted 9 | import funannotate.library as lib 10 | 11 | with warnings.catch_warnings(): 12 | warnings.simplefilter("ignore") 13 | from Bio import SearchIO 14 | 15 | 16 | def PfamHmmer(input): 17 | HMM = os.path.join(FUNDB, "Pfam-A.hmm") 18 | base = os.path.basename(input).split(".fa")[0] 19 | pfam_out = os.path.join(os.path.dirname(input), base + ".pfam.txt") 20 | cmd = ["hmmsearch", "--domtblout", pfam_out, "--cpu", "1", "--cut_ga", HMM, input] 21 | subprocess.call(cmd, stdout=FNULL, stderr=FNULL) 22 | 23 | 24 | def safe_run(*args, **kwargs): 25 | """Call run(), catch exceptions.""" 26 | try: 27 | PfamHmmer(*args, **kwargs) 28 | except Exception as e: 29 | print(("error: %s run(*%r, **%r)" % (e, args, kwargs))) 30 | 31 | 32 | def combineHmmerOutputs(inputList, output): 33 | # function to combine multiple HMMER runs with proper header/footer so biopython can read 34 | allHeadFoot = [] 35 | with open(inputList[0], "r") as infile: 36 | for line in infile: 37 | if line.startswith("#"): 38 | allHeadFoot.append(line) 39 | with open(output, "w") as out: 40 | for x in allHeadFoot[:3]: 41 | out.write(x) 42 | for file in inputList: 43 | with open(file, "r") as resultin: 44 | for line in resultin: 45 | if line.startswith("#") or line.startswith("\n"): 46 | continue 47 | out.write(line) 48 | for y in allHeadFoot[3:]: 49 | out.write(y) 50 | 51 | 52 | def multiPFAMsearch(inputList, cpus, tmpdir, output): 53 | # run hmmerscan multithreaded by running at same time 54 | # input is a list of files, run multiprocessing on them 55 | pfam_results = os.path.join(os.path.dirname(tmpdir), "pfam.txt") 56 | 
pfam_filtered = os.path.join(os.path.dirname(tmpdir), "pfam.filtered.txt") 57 | lib.runMultiProgress(safe_run, inputList, cpus, progress=False) 58 | 59 | # now grab results and combine, kind of tricky as there are header and footers for each 60 | resultList = [ 61 | os.path.join(tmpdir, f) 62 | for f in os.listdir(tmpdir) 63 | if os.path.isfile(os.path.join(tmpdir, f)) and f.endswith(".pfam.txt") 64 | ] 65 | combineHmmerOutputs(resultList, pfam_results) 66 | 67 | # now parse results 68 | with open(output, "w") as out: 69 | with open(pfam_filtered, "w") as filtered: 70 | with open(pfam_results, "r") as results: 71 | for qresult in SearchIO.parse(results, "hmmsearch3-domtab"): 72 | hits = qresult.hits 73 | num_hits = len(hits) 74 | if num_hits > 0: 75 | for i in range(0, num_hits): 76 | hit_evalue = hits[i].evalue 77 | query = hits[i].id 78 | pfam = qresult.accession.split(".")[0] 79 | hmmLen = qresult.seq_len 80 | hmm_aln = int(hits[i].hsps[0].hit_end) - int( 81 | hits[i].hsps[0].hit_start 82 | ) 83 | coverage = hmm_aln / float(hmmLen) 84 | if coverage < 0.50: # coverage needs to be at least 50% 85 | continue 86 | filtered.write( 87 | "%s\t%s\t%s\t%f\n" % (query, pfam, hit_evalue, coverage) 88 | ) 89 | out.write("%s\tdb_xref\tPFAM:%s\n" % (query, pfam)) 90 | 91 | 92 | def dbCANHmmer(input): 93 | HMM = os.path.join(FUNDB, "dbCAN.hmm") 94 | base = os.path.basename(input).split(".fa")[0] 95 | outfiles = os.path.join(os.path.dirname(input), base + ".dbcan.txt") 96 | cmd = ["hmmscan", "--domtblout", outfiles, "--cpu", "1", "-E", "1e-15", HMM, input] 97 | subprocess.call(cmd, stdout=FNULL, stderr=FNULL) 98 | 99 | 100 | def safe_run2(*args, **kwargs): 101 | """Call run(), catch exceptions.""" 102 | try: 103 | dbCANHmmer(*args, **kwargs) 104 | except Exception as e: 105 | print(("error: %s run(*%r, **%r)" % (e, args, kwargs))) 106 | 107 | 108 | def dbCANsearch(inputList, cpus, evalue, tmpdir, output): 109 | # run hmmerscan 110 | dbCAN_out = os.path.join(tmpdir, "dbCAN.txt") 111 | dbCAN_filtered = os.path.join(tmpdir, "dbCAN.filtered.txt") 112 | lib.runMultiProgress(safe_run2, inputList, cpus, progress=False) 113 | # now grab results 114 | resultList = [ 115 | os.path.join(tmpdir, f) 116 | for f in os.listdir(tmpdir) 117 | if os.path.isfile(os.path.join(tmpdir, f)) and f.endswith(".dbcan.txt") 118 | ] 119 | combineHmmerOutputs(resultList, dbCAN_out) 120 | 121 | # now parse results 122 | Results = {} 123 | with open(dbCAN_filtered, "w") as filtered: 124 | filtered.write( 125 | "#HMM_family\tHMM_len\tQuery_ID\tQuery_len\tE-value\tHMM_start\tHMM_end\tQuery_start\tQuery_end\tCoverage\n" 126 | ) 127 | with open(dbCAN_out, "r") as results: 128 | for qresult in SearchIO.parse(results, "hmmscan3-domtab"): 129 | query_length = qresult.seq_len 130 | hits = qresult.hits 131 | num_hits = len(hits) 132 | if num_hits > 0: 133 | for i in range(0, num_hits): 134 | hit_evalue = hits[i].evalue 135 | if hit_evalue > evalue: 136 | continue 137 | hit = hits[i].id 138 | hmmLen = hits[i].seq_len 139 | hmm_aln = int(hits[i].hsps[0].hit_end) - int( 140 | hits[i].hsps[0].hit_start 141 | ) 142 | coverage = hmm_aln / float(hmmLen) 143 | if coverage < 0.35: 144 | continue 145 | query = hits[i].query_id 146 | filtered.write( 147 | "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\n" 148 | % ( 149 | hit, 150 | hmmLen, 151 | query, 152 | query_length, 153 | hit_evalue, 154 | hits[i].hsps[0].hit_start, 155 | hits[i].hsps[0].hit_end, 156 | hits[i].hsps[0].query_start, 157 | hits[i].hsps[0].query_end, 158 | coverage, 159 | ) 160 | ) 161 | if 
query not in Results: 162 | Results[query] = [hit] 163 | else: 164 | Results[query].append(hit) 165 | # run through results and simplify subdomain hits 166 | with open(output, "w") as out: 167 | for k, v in natsorted(Results.items()): 168 | simplified = [] 169 | for x in v: 170 | if "_" in x: 171 | cazy, subdomain = x.rsplit("_", 1) 172 | if cazy not in simplified: 173 | simplified.append(cazy) 174 | else: 175 | if not x in simplified: 176 | simplified.append(x) 177 | for hit in simplified: 178 | out.write("{}\tnote\tCAZy:{}\n".format(k, hit)) 179 | 180 | 181 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 182 | def __init__(self, prog): 183 | super(MyFormatter, self).__init__(prog, max_help_position=48) 184 | 185 | 186 | parser = argparse.ArgumentParser( 187 | prog="hmmer_parallel.py", 188 | description="""Run hmmer3 multipthreaded.""", 189 | epilog="""Written by Jon Palmer (2019) nextgenusfs@gmail.com""", 190 | formatter_class=MyFormatter, 191 | ) 192 | parser.add_argument( 193 | "-i", "--input", required=True, help="folder of protein fasta files" 194 | ) 195 | parser.add_argument( 196 | "-m", 197 | "--method", 198 | default="pfam", 199 | choices=["pfam", "cazy"], 200 | help="database to search", 201 | ) 202 | parser.add_argument("-d", "--db", required=True, help="location of HMM database") 203 | parser.add_argument( 204 | "-c", "--cpus", default=1, type=int, help="location of HMM database" 205 | ) 206 | parser.add_argument("-o", "--out", required=True, help="output file") 207 | args = parser.parse_args() 208 | 209 | global FUNDB, FNULL 210 | FUNDB = args.db 211 | FNULL = open(os.devnull, "w") 212 | splitProts = [ 213 | os.path.join(args.input, f) 214 | for f in os.listdir(args.input) 215 | if os.path.isfile(os.path.join(args.input, f)) 216 | ] 217 | if args.method == "pfam": 218 | multiPFAMsearch(splitProts, args.cpus, args.input, args.out) 219 | elif args.method == "cazy": 220 | dbCANsearch(splitProts, args.cpus, 1e-17, args.input, args.out) 221 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/iprscan2annotations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # script written for funannotate by Jon Palmer (2017) 4 | # it will parse an interproscan5 xml file and generate 5 | # genome annotation file for GO terms and IPR terms 6 | 7 | import sys 8 | import os 9 | import xml.etree.cElementTree as etree 10 | from goatools import obo_parser 11 | 12 | 13 | def convertGOattribute(namespacein): 14 | namespace = namespacein.upper() 15 | if namespace == "BIOLOGICAL_PROCESS": 16 | attribute = "go_process" 17 | elif namespace == "MOLECULAR_FUNCTION": 18 | attribute = "go_function" 19 | elif namespace == "CELLULAR_COMPONENT": 20 | attribute = "go_component" 21 | else: 22 | # print(f'Error parsing XML GO terms: {namespace} is not a valid term') 23 | attribute = "go_unknown" 24 | # sys.exit(1) 25 | return attribute 26 | 27 | 28 | def main(): 29 | """Main step of intepro annotations to tab delimited script.""" 30 | 31 | if len(sys.argv) < 2: 32 | print("Usage: iprscan2annotations.py IPRSCAN.xml OUTPUT.annotations.txt") 33 | sys.exit(1) 34 | 35 | goDict = {} 36 | for item in obo_parser.OBOReader( 37 | os.path.join(os.environ["FUNANNOTATE_DB"], "go.obo") 38 | ): 39 | namespace = convertGOattribute(item.namespace) 40 | goDict[item.id] = {"name": item.name, "namespace": namespace} 41 | for nm in item.alt_ids: # also index by alt_id since that may be reported 42 | 
goDict[nm] = {"name": item.name, "namespace": namespace} 43 | with open(sys.argv[2], "w") as output: 44 | with open(sys.argv[1]) as xml_file: 45 | tree = etree.iterparse(xml_file) 46 | for _, elem in tree: 47 | if "}" in elem.tag: 48 | elem.tag = elem.tag.split("}", 1)[1] 49 | for at in list(elem.attrib.keys()): 50 | if "}" in at: 51 | newat = at.split("}", 1)[1] 52 | elem.attrib[newat] = elem.attrib[at] 53 | del elem.attrib[at] 54 | root = tree.root 55 | # iterate through each of the protein hits 56 | for hits in root: 57 | IDs = [] 58 | iprs = [] 59 | gos = {} 60 | signalp = [] 61 | for lv1 in hits: 62 | if lv1.tag == "xref": 63 | name = lv1.get("id") 64 | IDs.append(name) 65 | if lv1.tag == "matches": 66 | for e in lv1.findall(".//entry"): 67 | if not e.get("ac") in iprs: 68 | iprs.append(e.get("ac")) 69 | for g in lv1.findall(".//go-xref"): 70 | cat = g.get("category", None) 71 | goID = g.get("id", None) 72 | desc = g.get("name", None) 73 | if not goID: 74 | continue 75 | if not cat or not desc: 76 | if goID in goDict: 77 | cat = goDict[goID]["namespace"] 78 | desc = goDict[goID]["name"] 79 | else: 80 | continue 81 | # cat = "" 82 | # desc = "" 83 | # print(f"No GO term {goID} in obo DB") 84 | else: 85 | cat = convertGOattribute(cat) 86 | goHit = (cat, desc, goID) 87 | if goID not in gos: 88 | gos[goID] = goHit 89 | # signalp is processed elsewhere 90 | # do we just skip this parsing even? 91 | for s in lv1.findall(".//signalp-match"): 92 | for lib in s.findall(".//signature-library-release"): 93 | if lib.get("library") == "SIGNALP_EUK": 94 | for loc in s.findall(".//signalp-location"): 95 | signalp.append( 96 | (loc.get("start"), loc.get("end")) 97 | ) 98 | # print out annotation file if IPR domains 99 | if len(iprs) > 0: 100 | for i in IDs: 101 | for x in iprs: 102 | output.write(f"{i}\tdb_xref\tInterPro:{x}\n") 103 | if len(gos) > 0: 104 | for i in IDs: 105 | for goid in gos: 106 | x = gos[goid] 107 | GOID = x[2].replace("GO:", "") 108 | output.write(f"{i}\t{x[0]}\t{x[1]}|{GOID}||IEA\n") 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/phobius-multiproc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import uuid 7 | import time 8 | import multiprocessing 9 | import argparse 10 | import shutil 11 | import funannotate.library as lib 12 | 13 | # setup menu with argparse 14 | 15 | 16 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 17 | def __init__(self, prog): 18 | super(MyFormatter, self).__init__(prog, max_help_position=48) 19 | 20 | 21 | parser = argparse.ArgumentParser( 22 | prog='phobius-multiproc.py', 23 | usage="%(prog)s [options] -i proteome.fasta", 24 | description='''Script that runs phobius remotely.''', 25 | epilog="""Written by Jon Palmer (2016) nextgenusfs@gmail.com""", 26 | formatter_class=MyFormatter) 27 | parser.add_argument('-i', '--input', required=True, help='whole proteome') 28 | parser.add_argument('-o', '--out', required=True, help='Phobius results') 29 | parser.add_argument('-e', '--email', help='Email address for IPRSCAN server') 30 | parser.add_argument('-l', '--logfile', 31 | default='phobius-multiproc.log', help='Logfile') 32 | parser.add_argument('--debug', action='store_true', 33 | help='Keep intermediate files') 34 | args = parser.parse_args() 35 | 36 | 37 | def runPhobiusRemote(Input): 38 | base = 
Input.split('/')[-1] 39 | base = base.split('.fa')[0] 40 | OUTPATH = os.path.join(TMPDIR, base) 41 | cmd = ['perl', os.path.join(parentdir, 'phobius-remote.pl'), 42 | '--email', args.email, '-f', 'short', '--outfile', base, Input] 43 | lib.runSubprocess(cmd, TMPDIR, lib.log) 44 | time.sleep(1) # make sure there is time for all files to show up 45 | os.rename(OUTPATH+'.out.txt', OUTPATH+'.phobius') 46 | os.remove(OUTPATH+'.sequence.txt') 47 | 48 | 49 | def runPhobiusLocal(Input): 50 | base = Input.split('/')[-1] 51 | base = base.split('.fa')[0] 52 | OUTPATH = os.path.join(TMPDIR, base+'.phobius') 53 | cmd = ['phobius.pl', '-short', Input] 54 | lib.runSubprocess(cmd, TMPDIR, lib.log, capture_output=OUTPATH, raise_not_exit=True) 55 | 56 | 57 | global parentdir 58 | parentdir = os.path.join(os.path.dirname(__file__)) 59 | 60 | # create log file 61 | log_name = args.logfile 62 | if os.path.isfile(log_name): 63 | os.remove(log_name) 64 | 65 | # initialize script, log system info and cmd issue at runtime 66 | lib.setupLogging(log_name) 67 | FNULL = open(os.devnull, 'w') 68 | cmd_args = " ".join(sys.argv)+'\n' 69 | lib.log.debug(cmd_args) 70 | 71 | # create tmpdir to store fasta files and output files 72 | TMPDIR = 'phobius_' + str(uuid.uuid4()) 73 | 74 | # split fasta 75 | lib.splitFASTA(args.input, TMPDIR) 76 | 77 | # now get list of files in tmpdir 78 | proteins = [] 79 | for file in os.listdir(TMPDIR): 80 | if file.endswith('.fa'): 81 | proteins.append(file) 82 | 83 | # now run the script 84 | if lib.which('phobius.pl'): 85 | lib.runMultiProgress(runPhobiusLocal, proteins, 86 | multiprocessing.cpu_count()) 87 | else: 88 | lib.runMultiProgress(runPhobiusRemote, proteins, 89 | 29) # max is 30 jobs at a time 90 | 91 | # collect all results 92 | phobius = [] 93 | for file in os.listdir(TMPDIR): 94 | if file.endswith('.phobius'): 95 | phobius.append(os.path.join(TMPDIR, file)) 96 | 97 | # write output 98 | TMdomain = 0 99 | SigPep = 0 100 | with open(args.out, 'w') as output: 101 | output.write("%s\t%s\t%s\t%s\n" % ('ID', 'TM', 'SP', 'Prediction')) 102 | for x in phobius: 103 | with open(x, 'r') as input: 104 | line = input.readlines() 105 | try: 106 | result = line[1].split(' ') 107 | result = [x for x in result if x] 108 | if result[1] == 'prediction': 109 | continue 110 | if int(result[1]) > 0: 111 | TMdomain += 1 112 | if result[2] == 'Y': 113 | SigPep += 1 114 | output.write("%s\t%s\t%s\t%s\n" % ( 115 | result[0], result[1], 116 | result[2], result[3].replace('\n', ''))) 117 | except IndexError: 118 | pass 119 | 120 | # clean 121 | if not args.debug: 122 | shutil.rmtree(TMPDIR) 123 | lib.log.debug("%i total proteins, %i TMdomain, %i Signal Peptide" % 124 | (len(phobius), TMdomain, SigPep)) 125 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/sam2bam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #simple wrapper for running aligner program and piping output to samtools view/sort 4 | 5 | if [ -z "$3" ]; then 6 | echo 'Usage: sam2bam.sh "aligner_command" bam_threads bam_output' 7 | echo '**The double quotes are required around aligner command**' 8 | exit 9 | fi 10 | 11 | #construct the command 12 | cmd="$1 | samtools view -@ $2 -bS - | samtools sort -@ $2 -o $3 -" 13 | 14 | #run the command 15 | eval $cmd 16 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/trnascan2gff3.pl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | #modified by Jon Palmer (2016) to print correct product ID in field 9 4 | 5 | =head1 NAME 6 | 7 | tRNAScan_SE_to_gff3.pl - convert raw output of tRNAScan-SE to gff3 8 | 9 | =head1 SYNOPSIS 10 | 11 | USAGE: convert_tRNAScanSE_to_gff3.pl 12 | --input=/path/to/some_file.out 13 | 14 | =head1 OPTIONS 15 | 16 | B<--input,-i> 17 | The raw output from tRNAScan-SE: 18 | 19 | Sequence tRNA Bounds tRNA Anti Intron Bounds Cove 20 | Name tRNA # Begin End Type Codon Begin End Score 21 | -------- ------ ---- ------ ---- ----- ----- ---- ------ 22 | tp.assembly.567468735.1 1 91820 91902 Tyr GTA 91857 91866 66.58 23 | tp.assembly.567468735.1 2 171777 171849 Phe GAA 0 0 70.28 24 | tp.assembly.567468735.1 3 172144 172215 His GTG 0 0 64.04 25 | tp.assembly.567468735.1 4 852847 852919 Thr AGT 0 0 75.69 26 | tp.assembly.567468735.1 5 877291 877362 Trp CCA 0 0 68.97 27 | tp.assembly.567468735.1 6 1468229 1468300 Cys GCA 0 0 72.10 28 | tp.assembly.567468735.1 7 2507459 2507530 Pro AGG 0 0 62.33 29 | tp.assembly.567468735.1 8 2507198 2507127 Pro CGG 0 0 65.73 30 | tp.assembly.567468735.1 9 2506317 2506246 Pro TGG 0 0 66.60 31 | tp.assembly.567468735.1 10 2463785 2463713 Lys TTT 0 0 79.47 32 | tp.assembly.567468735.1 11 2191149 2191069 Leu CAG 0 0 57.47 33 | tp.assembly.567468735.1 12 1633307 1633237 Gly CCC 0 0 65.52 34 | tp.assembly.567468735.1 13 1255051 1254968 Leu CAA 0 0 60.46 35 | tp.assembly.567468735.1 14 251108 251037 Asp GTC 0 0 59.48 36 | tp.assembly.567468735.1 15 250520 250449 Asp GTC 0 0 59.48 37 | 38 | B<--log,-l> 39 | Log file 40 | 41 | B<--help,-h> 42 | This help message 43 | 44 | =head1 DESCRIPTION 45 | 46 | File converter 47 | 48 | =head1 INPUT 49 | 50 | Input above. 
51 | 52 | =head1 OUTPUT 53 | 54 | GFF3 to STDOUT 55 | 56 | =head1 CONTACT 57 | 58 | Kyle Tretina 59 | kyletretina@gmail.com 60 | 61 | =cut 62 | 63 | use warnings; 64 | use strict; 65 | use Getopt::Long qw(:config no_ignore_case no_auto_abbrev pass_through); 66 | use Pod::Usage; 67 | 68 | my %options = (); 69 | my $results = GetOptions (\%options, 70 | 'input|i=s', 71 | 'log|l=s', 72 | 'help|h') || pod2usage(); 73 | 74 | ## display documentation 75 | if( $options{'help'} ){ 76 | pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} ); 77 | } 78 | 79 | ## make sure everything passed was peachy 80 | &check_parameters(\%options); 81 | 82 | ## open the log if requested 83 | my $logfh; 84 | if (defined $options{log}) { 85 | open($logfh, ">$options{log}") || die "can't create log file: $!"; 86 | } 87 | 88 | ## open the input file 89 | my $ifh; 90 | open($ifh, "<$options{input}") || die "can't open input file: $!"; 91 | 92 | # all output needs the gff header 93 | print "##gff-version 3\n"; 94 | 95 | ## globals 96 | my $i=1; 97 | 98 | ## parse the file 99 | foreach my $line (<$ifh>){ 100 | my @cols = split /[\t]/, $line; 101 | chomp @cols; 102 | my $contig = $cols[0]; 103 | 104 | if ($contig =~ /^(.+?)\s+$/) { 105 | $contig = $1; 106 | } 107 | 108 | ## skip the header lines 109 | next if $contig eq 'Sequence' || $contig eq 'Name' || $contig eq '--------'; 110 | 111 | my $start = trim($cols[2]); 112 | my $stop = trim($cols[3]); 113 | my $target = $cols[4]; 114 | my $anticodon = $cols[5]; 115 | my @prod = split '\_', $cols[4]; 116 | my $product; 117 | my $note; 118 | my $length = abs($stop - $start); 119 | if ( $length >= '150' ) { 120 | next; 121 | } 122 | if ( $prod[0] eq "Pseudo") { 123 | next; 124 | #$product = "tRNA-Xxx"; 125 | #$note = "Predicted $anticodon anticodon"; 126 | } 127 | elsif ( $prod[0] eq "Sup") { 128 | next; 129 | #$product = "tRNA-Xxx"; 130 | #$note = "Predicted $anticodon anticodon, putative tRNA Suppressor" 131 | } 132 | elsif ( $prod[0] eq "Undet") { 133 | next; } 134 | else { 135 | $product = "tRNA-$prod[0]"; 136 | $note = "Predicted $anticodon anticodon"; } 137 | my $score = $cols[8]; 138 | if ($start < $stop){ 139 | print "$contig\ttRNAScan-SE\tgene\t$start\t$stop\t$score\t+\t.\tID=$target\_$i\n"; 140 | print "$contig\ttRNAScan-SE\ttRNA\t$start\t$stop\t$score\t+\t.\tID=$target\_$i\_tRNA;Parent=$target\_$i;product=$product;note=$note\n"; 141 | print "$contig\ttRNAScan-SE\texon\t$start\t$stop\t$score\t+\t.\tID=$target\_$i\_exon;Parent=$target\_$i\_tRNA\n"; 142 | $i++; 143 | }else{ 144 | print "$contig\ttRNAScan-SE\tgene\t$stop\t$start\t$score\t-\t.\tID=$target\_$i\n"; 145 | print "$contig\ttRNAScan-SE\ttRNA\t$stop\t$start\t$score\t-\t.\tID=$target\_$i\_tRNA;Parent=$target\_$i;product=$product;note=$note\n"; 146 | print "$contig\ttRNAScan-SE\texon\t$stop\t$start\t$score\t-\t.\tID=$target\_$i\_exon;Parent=$target\_$i\_tRNA\n"; 147 | $i++; 148 | } 149 | } 150 | 151 | exit(0); 152 | 153 | sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s }; 154 | 155 | sub _log { 156 | my $msg = shift; 157 | print $logfh "$msg\n" if $logfh; 158 | } 159 | 160 | sub check_parameters { 161 | my $options = shift; 162 | ## make sure required arguments were passed 163 | my @required = qw( input ); 164 | for my $option ( @required ) { 165 | unless ( defined $$options{$option} ) { 166 | die "--$option is a required option"; 167 | } 168 | } 169 | ## handle some defaults 170 | $options{optional_argument2} = 'foo' unless ($options{optional_argument2}); 171 | } 
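Worked example of the conversion above (illustrative only, assuming the raw tRNAscan-SE output is tab-delimited exactly as this parser expects): for the first sample row shown in the POD — the Tyr tRNA at 91820-91902, anticodon GTA, cove score 66.58 — the main loop would print three forward-strand features per tRNA after the single ##gff-version 3 header (columns tab-separated in the real output):

tp.assembly.567468735.1  tRNAScan-SE  gene  91820  91902  66.58  +  .  ID=Tyr_1
tp.assembly.567468735.1  tRNAScan-SE  tRNA  91820  91902  66.58  +  .  ID=Tyr_1_tRNA;Parent=Tyr_1;product=tRNA-Tyr;note=Predicted GTA anticodon
tp.assembly.567468735.1  tRNAScan-SE  exon  91820  91902  66.58  +  .  ID=Tyr_1_exon;Parent=Tyr_1_tRNA

Note that Pseudo, Sup, and Undet hits are skipped, any hit whose start/end coordinates differ by 150 or more is skipped, and the intron coordinates (input columns 7-8) are ignored — the full tRNA span is reported as a single exon.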
-------------------------------------------------------------------------------- /funannotate/aux_scripts/xmlcombine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | import os.path 5 | import fnmatch 6 | from xml.etree import cElementTree 7 | 8 | cElementTree.register_namespace( 9 | '', "http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5") 10 | 11 | 12 | def run(xml_files): 13 | first = None 14 | for filename in xml_files: 15 | data = cElementTree.parse(filename).getroot() 16 | if first is None: 17 | first = data 18 | else: 19 | first.extend(data) 20 | if first is not None: 21 | print(cElementTree.tostring(first)) 22 | 23 | 24 | if __name__ == "__main__": 25 | xml_files = [os.path.join(dirpath, f) 26 | for dirpath, dirnames, files in os.walk(sys.argv[1]) 27 | for f in fnmatch.filter(files, '*.xml')] 28 | run(xml_files) 29 | -------------------------------------------------------------------------------- /funannotate/config/TruSeq3-PE.fa: -------------------------------------------------------------------------------- 1 | >PrefixPE/1 2 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT 3 | >PrefixPE/2 4 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT -------------------------------------------------------------------------------- /funannotate/config/TruSeq3-SE.fa: -------------------------------------------------------------------------------- 1 | >TruSeq3_IndexedAdapter 2 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC 3 | >TruSeq3_UniversalAdapter 4 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA -------------------------------------------------------------------------------- /funannotate/config/busco_test.fa: -------------------------------------------------------------------------------- 1 | >example 2 | 
ATACGACGTACCCGTGCGTCAATTGCTACGGCGCATGCCTTCTTGTCGAGGGTTTTTTCTGGAAGCGGTCAGAGATGTTGAATAATGATGGCATGAGTTACGAGAAATGCGATTGTATTTTAGCATGCATAGACTATCAACATTGATGTTGTCCACGACTGTCCCTCTCCCGCCGGTGCGGTCATCAAACACATTCCTGCAATAGCTAACAGTAGACGAAATACTCATCACCCACCTACTTATAATCGTAATAGAGGACCGCAACCAAGCATATGGGTTATCCTGGATTGGATAGCTGAGGTCAGAGACCTTTTATTAAGGGTCATAGCCTGTCTACTCCGTAGAACGAAGTAGTGTTGTACCTACAATTATGCTCATTACATACTGTCATAACATAATATATTCTCATGACATCTTGAAAAAAAGATACCTCTAAATATCAAAGTAAAGCCGATTACCAAATACTTCGTTTATGGCTTCTCTGGATATAGATTTCCGCTCTGCGTATTGCCTAAATCGTTAGGGTTCCAAAAGGCCACCTTATCATGAAACAAACCTTGCAAACGCAGAAGATATCAAAACCCATAAGAATGAGTCTTAGAATTATTAATAAATGTTTGTAGTAAAAGAAGGGAGAGCGCTTACATCAATGAGGTTCTGAGCATACTCCCCAAAGCGGACATTTTAGGGCTAGCCCTATTATTCAATTCAATCGGAGATTTCCCCCAAGCTCCGAGGATGAGCTGCGGCAACCACCGGCGACGACCATCGCATACATCTCGTCGCAATGGACATTTCCGATCTTATCGAGCCCCCGCAGAAGCGCCTCAAGACTGAGGATATCTCCAGCGCAGACGAGGTTGTTCTTCCCGCTGGCGGAATCACGCCGCAGACCGACAACGAAATCGACGAGCAGTTATCGAAGGAGATTGAAGTTGGCATCACTGAGTTTGTCAGCGCTGATAATGAGGGTTTCGCGGGGATTTTGAAGAAAAGGTATTCTTAACTGATACGGTTGGGGTTTGATCTGAGTGCTGACTATTGCCAGATACACAGATTTCCTTGTGAACGAGATCCTGCCCTCGGGGAAAGTTCTGCATCTGACGAATACCACTGCACCTAATACCAATGATGAGGCGACTCCAGTCCAGGCAGATAAGAAGCCGGCCGAAGATAAGCCAAAAGAGCCCGAAACTCCCGCAGAGAAGTTGCCTGCTCCAGTTGAGTTTCAATTAGCGGAGGAAGATGAGGCGCTTCTGGACACTTTATTCGGCACCCAAAACACCAAGAAAATTGTCGCCCTCCATAAGAAGGCACTGGCAAATCCAAAGACTAAGCCAAGCGATCTGGGACGATTGAACACAGTCGTTGTCAACGACCGCGATCAGCGCATCAAAATGCACCAGGCAATTCGTCGCATCTTCAATTCGCAGATTGAATCTTCAACAGACAGTGAAGGAATGATGGTTATCTCAGTCGCTGCCAACCGCAACAAGAAGAATCCACAGGGAGGTGGAGGCGGGCGTGAGAGGCCGCGCGTGAATTGGGACGAACTGGGCGGACAGTATCTGCACTTTACTATTTACAAGGAGAACAAGGACACCATGGAGGTCATCTCGTTCATCGCCCGCCAACTGAAGATGAATCCGAAGAGCTTCCAGTTCGCGGGGACCAAAGATCGCCGCGGAGTAACCGTGCAGAGGGCATGCGCTTATCGCTTGCAAGCCGATCGCCTCGCGAAGCTCAATCGAACGCTCCGCAATGCCGTCGTTGGCGACTTCGAATACCAACCTCACGGCCTCGAGCTCGGCGACCTCTATGGGAACGAGTTCGTCGTGACTCTCCGCGAGTGCGAGGTTCCTGGCATCAACATCCAAGACCCCGCATCAGCCGTAGCCAAGACAAAGGAGCTCGTCAACACTTCACTCAAGAACCTCTACCAAAGAGGTTACTTCAACTACTACGGCCTACAACGTTTCGGCTCTTTCGCAACCCGCACTGACACAGTGGGCGTGAAGATACTGCAGGACGACTTCAAGGGCGCCTGCGACGCTATCCTCGACTACAGCCCACACATCCTCGCCGCGGCACAAGCAGAATTAGGCCAGGGCGAAGGCGAAGGCGCCACACCTACCAACATCAGCTCTGAAGATAAAGCACGCGCCCTCGCAATCCACATCTTCCGAACAACTGACCGCGTCACGGACGCTCTCGAAAAAATGCCTCGCAAGTTCTCCGCAGAATCGAACATCATTCGCCACCTCGGCCGGTCAAAGAACGATTACCTTGGCGCCCTGCAGACCATTCCCCGTAATCTCCGCCTCATGTATGTCCACGCCTACCAGTCCCTCGTCTGGAACCTTGCTGTTGGCGAGCGCTGGCGCCTGTACGGCGACCGCGTTGTAGAAGGCGATCTTGTCCTCATCCACGAACACCGCGACAAAGACGGCAACTCTTCCTATACCACACCCGCCCCCGGTGCAGGAGCTAGCGGCGAAACCACTACAATTGACGCAGACGGCGAAATCATTATCGTCCCGCAAGAACACGACTCAGCCTTTGCCGTCGAAGACACATTCACCCGCGCCCGAGCCCTAACCGCCGCCGAAGCGAACAGCGGCCTCTACAGCATCTTCGATATCGTCCTACCTCTCCCTGGCTTCGACGTCCTATACCCGCCAAACAAAATGACGGACTTCTATAAAGAGTTCATGGGTAGCTCCCGCGGCGGCGGATTGGATCCCTTCAACATGCGGAGAAAGTGGAAGGACGCGAGTTTAAGCGGGAGTTATCGAAAAGTTCTTAGTCGGATGGGCAGGGACTACTCTGTTGATGTGGTGCTTTATAGCAGGGATGAGGAGCAGTTTGTCCGGACTGATCTAGAGAATTTGACCCTCAAGACGAGGGATGGAGGGGATGTGGATTTGGAGAAGAAAGAGGGGAAGAGTGAAGGGGATAAGCTTGCTGTTGTCCTCAAGTTCCAGCTTGGATCGAGCCAGTATGCAACCATGGCGTTGAGAGAGTTGATGAGGGGAAAGGTGAAGGCGTATAAGCCAGACTTTGGAGGTGGGCGGTAGACTGTCGTAGCCGCCGTCGTGAACCACTCAGCTCATACGGTGTGTATACAAAGCTACGACCTTATAGGTCTATACATTCTTCATCTAAAACAACCAAATCCGTACACTCAACTTTCTGACTGGATATAGAGCAAGATTATGAACATTAAATAGACACTATATGATCTCGAGAAACCCTTGAACAAATAATCAGGATTAGAACGCTGAGAGTTGACTAGAGGCCGGGTAGGTAACTAACTCCCCCGATGCGCGGCCTGTCGATGCTCATGGTCATCGGCACCGAAACTGGACATAGACGAGATatcatcatcatcatcatcatcatcatcatcCCGGAGCCCTCTATCTCTCATCATCGCTTCGCGATCGTTGATCTCCGACACTACAGACATATTGTCGTCATTGTCATTTGGACGGTCAAATGGTGAACGCGGCCGTGCCGCTGCTTCTCTAGCGCGCTCTTCGGATGTTCGCCAGT
TGAGATCTCGGTCTGTTActgcaactccagctccggctgcagcacctgctACCGGGGCCGCCGATGATGTGCTGTCTACTCCTGTCCCCGGCTCGGTCCCTGGTGGACGGAGATAAGGTGGAGGCGGAGGTAGATTGTCGTTGTTGCTGGGAAACGGGTTACGGTTAACTGGTCCTGGGGCCGGAACGGGGGCGATCTGGGGTTGACTAGGGAGGAAGGCAGAAGACGAAGTGTCTAGTTGGCGTG -------------------------------------------------------------------------------- /funannotate/config/codeml.config: -------------------------------------------------------------------------------- 1 | seqfile = INPUTFILEHERE 2 | treefile = INPUTTREEHERE 3 | outfile = OUTPUTFILEHERE 4 | noisy = 9 * 0,1,2,3,9: how much rubbish on the screen 5 | verbose = 1 * 1: detailed output, 0: concise output 6 | runmode = 0 * 0: user tree; 1: semi-automatic; 2: automatic 7 | * 3: StepwiseAddition; (4,5):PerturbationNNI; -2: pairwise 8 | 9 | seqtype = 1 * 1:codons; 2:AAs; 3:codons-->AAs 10 | CodonFreq = 2 * 0:1/61 each, 1:F1X4, 2:F3X4, 3:codon table 11 | clock = 0 * 0: no clock, unrooted tree, 1: clock, rooted tree 12 | aaDist = 0 * 0:equal, +:geometric; -:linear, {1-5:G1974,Miyata,c,p,v} 13 | model = 0 14 | 15 | NSsites = 0 16 | * 0:one w; 1:NearlyNeutral; 2:PositiveSelection; 3:discrete; 17 | * 4:freqs; 5:gamma;6:2gamma;7:beta;8:beta&w;9:betaγ10:3normal 18 | icode = 0 * 0:standard genetic code; 1:mammalian mt; 2-10:see below 19 | Mgene = 0 * 0:rates, 1:separate; 2:pi, 3:kappa, 4:all 20 | 21 | fix_kappa = 0 * 1: kappa fixed, 0: kappa to be estimated 22 | kappa = 1 * initial or fixed kappa 23 | fix_omega = 0 * 1: omega or omega_1 fixed, 0: estimate 24 | omega = 1 * initial or fixed omega, for codons or codon-based AAs 25 | ncatG = 10 * # of categories in the dG or AdG models of rates 26 | 27 | getSE = 0 * 0: don't want them, 1: want S.E.s of estimates 28 | RateAncestor = 0 * (0,1,2): rates (alpha>0) or ancestral states (1 or 2) 29 | Small_Diff = .45e-6 30 | cleandata = 1 * remove sites with ambiguity data (1:yes, 0:no)? 31 | fix_blength = 0 * 0: ignore, -1: random, 1: initial, 2: fixed 32 | -------------------------------------------------------------------------------- /funannotate/config/extrinsic.E.XNT.RM.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # extrinsic information 3 | # date: 07/28/2018 4 | # Jon Palmer 5 | 6 | # source of extrinsic information: 7 | # M manual anchor (required) 8 | # P protein database hit 9 | # XNT protein homology prediction by exonerate 10 | # E est database hit 11 | # C combined est/protein database hit 12 | # D Dialign 13 | # R retroposed genes 14 | # T transMapped refSeqs 15 | 16 | [SOURCES] 17 | M E XNT RM 18 | 19 | 20 | # 21 | # individual_liability: Only unsatisfiable hints are disregarded. By default this flag is not set 22 | # and the whole hint group is disregarded when one hint in it is unsatisfiable. 23 | # 24 | [SOURCE-PARAMETERS] 25 | XNT individual_liability 26 | 27 | # feature bonus malus gradelevelcolumns 28 | # r+/r- 29 | # 30 | # the gradelevel colums have the following format for each source 31 | # sourcecharacter numscoreclasses boundary ... boundary gradequot ... 
gradequot 32 | # 33 | 34 | [GENERAL] 35 | start 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1 36 | stop 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1 37 | tss 1 1 M 1 1e+100 E 1 10 XNT 1 1 RM 1 1 38 | tts 1 1 M 1 1e+100 E 1 100 XNT 1 1 RM 1 1 39 | ass 1 1 M 1 1e+100 E 1 20 XNT 1 1 RM 1 1 40 | dss 1 1 M 1 1e+100 E 1 20 XNT 1 1 RM 1 1 41 | exonpart 1 1 M 1 1e+100 E 1 1e3 XNT 1 1 RM 1 1 42 | exon 1 1 M 1 1e+100 E 1 5e3 XNT 1 1 RM 1 1 43 | intronpart 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1 44 | intron 1 1 M 1 1e+100 E 1 5e4 XNT 1 1e3 RM 1 1 45 | CDSpart 1 1 M 1 1e+100 E 1 1 XNT 1 1e4 RM 1 1 46 | CDS 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1 47 | UTRpart 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1 48 | UTR 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1 49 | irpart 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1 50 | nonexonpart 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1.15 51 | 52 | 53 | # chlamy EST score 54 | # 0: public EST 55 | # 1: Chun, no terminus 56 | # 2: Chun, with terminus 57 | # 58 | # Explanation: see original extrinsic.cfg file 59 | # 60 | -------------------------------------------------------------------------------- /funannotate/config/test.sbt: -------------------------------------------------------------------------------- 1 | Submit-block ::= { 2 | contact { 3 | contact { 4 | name name { 5 | last "Palmer", 6 | first "Jonathan" 7 | }, 8 | affil std { 9 | affil "USDA Forest Service", 10 | div "CFMR", 11 | city "Madison", 12 | sub "WI", 13 | country "USA", 14 | street "1 Gifford Pinchot Drive", 15 | email "nextgenusfs@gmail.com", 16 | fax "", 17 | phone "555-555-5555", 18 | postal-code "53726" 19 | } 20 | } 21 | }, 22 | cit { 23 | authors { 24 | names std { 25 | { 26 | name name { 27 | last "Palmer", 28 | first "Jonathan", 29 | initials "J.M.", 30 | suffix "" 31 | } 32 | } 33 | }, 34 | affil std { 35 | affil "USDA Forest Service", 36 | div "CFMR", 37 | city "Madison", 38 | sub "WI", 39 | country "USA", 40 | street "1 Gifford Pinchot Drive", 41 | postal-code "53726" 42 | } 43 | } 44 | }, 45 | subtype new 46 | } 47 | 48 | Seqdesc ::= pub { 49 | pub { 50 | gen { 51 | cit "unpublished", 52 | authors { 53 | names std { 54 | { 55 | name name { 56 | last "Palmer", 57 | first "Jonathan", 58 | initials "J.M.", 59 | suffix "" 60 | } 61 | } 62 | }, 63 | affil std { 64 | affil "USDA Forest Service", 65 | div "CFMR", 66 | city "Madison", 67 | sub "WI", 68 | country "USA", 69 | street "1 Gifford Pinchot Drive", 70 | postal-code "53726" 71 | } 72 | }, 73 | title “Annotate generated by FunAnnotate: fungal automated genome annotation” 74 | } 75 | } 76 | } 77 | 78 | -------------------------------------------------------------------------------- /funannotate/config/tf_interpro.txt: -------------------------------------------------------------------------------- 1 | IPR000967,NF-X1-type zinc finger 2 | IPR006856,Mating-type protein MAT alpha-1 HMG-Box 3 | IPR018501,DDT domain 4 | IPR007196,CCR4-Not complex component 5 | IPR007396,Putative FMN-binding domain 6 | IPR004595,TFIIH C1-like domain 7 | IPR004181,MIZ zinc finger 8 | IPR000818,TEA/ATTS domain family 9 | IPR001387,Helix-turn-helix 10 | IPR001289,CCAAT-binding TF (CBF-B/NF-YA) subunit B 11 | IPR003120,STE-like TF 12 | IPR003150,RFX DNA-binding domain 13 | IPR004198,Zinc finger 14 | IPR007604,CP2 TF 15 | IPR008895,YL1 nuclear protein 16 | IPR010770,SGT1 protein 17 | IPR018004,KilA-N domain 18 | IPR024061,NDT80/PhoG-like DNA-binding family 19 | IPR003656,BED zinc finger 20 | IPR018060,Bacterial regulatory HTH proteins 21 | IPR005011,SART-1 family 22 | IPR002100,SRF-type TF (DNA-binding and dimerization 
domain) 23 | IPR010666,GRF zinc finger 24 | IPR000232,HSF-type DNA-binding 25 | IPR001766,Fork head domain 26 | IPR001356,Homeobox domain 27 | IPR000679,GATA zinc finger 28 | IPR013767,PAS fold 29 | IPR001878,Zinc knuckle (CCHC) 30 | IPR004827,Basic region leucine zipper 2 31 | IPR007889,Helix-turn-helix 32 | IPR011598,Helix-loop-helix DNA-binding domain 33 | IPR001005,Myb-like DNA-binding domain 34 | IPR004827,bZIP TF 1 35 | IPR007087,Zinc finger (C2H2) 36 | IPR001138,Fungal Zn(2)-Cys(6) binuclear cluster domain 37 | IPR007219,Fungal-specific TF domain 38 | -------------------------------------------------------------------------------- /funannotate/database.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import funannotate.library as lib 7 | import funannotate.resources as resources 8 | 9 | 10 | def main(args): 11 | # setup funannotate DB path 12 | try: 13 | FUNDB = os.environ["FUNANNOTATE_DB"] 14 | except KeyError: 15 | print('$FUNANNOTATE_DB not found, run funannotate setup and export ENV variable') 16 | sys.exit(1) 17 | if '--show-outgroups' in args: 18 | try: 19 | files = [f for f in os.listdir(os.path.join(FUNDB, 'outgroups'))] 20 | except OSError: 21 | print(( 22 | 'ERROR: %s/outgroups folder is not found, run funannotate setup.' % FUNDB)) 23 | sys.exit(1) 24 | files = [x.replace('_buscos.fa', '') for x in files] 25 | files = [x for x in files if not x.startswith('.')] 26 | print("-----------------------------") 27 | print("BUSCO Outgroups:") 28 | print("-----------------------------") 29 | print((lib.list_columns(files, cols=3))) 30 | print('') 31 | 32 | elif '--show-buscos' in args: 33 | print("-----------------------------") 34 | print("BUSCO DB tree: (# of models)") 35 | print("-----------------------------") 36 | print((resources.buscoTree)) 37 | else: 38 | dbfile = os.path.join(FUNDB, 'funannotate-db-info.txt') 39 | db_list = [['Database', 'Type', 'Version', 40 | 'Date', 'Num_Records', 'Md5checksum']] 41 | if not os.path.isfile(dbfile): 42 | print('Database is not properly configured, re-run funannotate setup') 43 | sys.exit(1) 44 | with open(dbfile, 'r') as infile: 45 | for line in infile: 46 | line = line.rstrip() 47 | cols = line.split('\t') 48 | del cols[2] 49 | db_list.append(cols) 50 | msg = lib.bold_underline('Funannotate Databases currently installed:') 51 | print(('\n'+msg+'\n')) 52 | lib.print_table(db_list, alignments='LLLLRL', max_col_width=60) 53 | 54 | print(( 55 | '\nTo update a database type:\n\tfunannotate setup -i DBNAME -d {:} --force\n'.format(FUNDB))) 56 | print('To see install BUSCO outgroups type:\n\tfunannotate database --show-outgroups\n') 57 | print('To see BUSCO tree type:\n\tfunannotate database --show-buscos\n') 58 | 59 | 60 | if __name__ == "__main__": 61 | main(sys.argv[1:]) 62 | -------------------------------------------------------------------------------- /funannotate/downloads.json: -------------------------------------------------------------------------------- 1 | { 2 | "downloads": { 3 | "uniprot": "https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz", 4 | "uniprot-release": "https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt", 5 | "merops": "https://ftp.ebi.ac.uk/pub/databases/merops/current_release/meropsscan.lib", 6 | "dbCAN": "https://bcb.unl.edu/dbCAN2/download/Databases/V11/dbCAN-HMMdb-V11.txt", 7 | 
"dbCAN-tsv": "https://bcb.unl.edu/dbCAN2/download/Databases/V11/CAZyDB.08062022.fam-activities.txt", 8 | "dbCAN-log": "https://bcb.unl.edu/dbCAN2/download/Databases/V11/readme.txt", 9 | "pfam": "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz", 10 | "pfam-tsv": "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz", 11 | "pfam-log": "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam.version.gz", 12 | "outgroups": "https://osf.io/r9sne/download?version=1", 13 | "repeats": "https://osf.io/vp87c/download?version=1", 14 | "go-obo": "https://purl.obolibrary.org/obo/go.obo", 15 | "mibig": "https://dl.secondarymetabolites.org/mibig/mibig_prot_seqs_1.4.fasta", 16 | "interpro": "https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/interpro.xml.gz", 17 | "interpro-tsv": "https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/entry.list", 18 | "gene2product": "https://raw.githubusercontent.com/nextgenusfs/gene2product/master/ncbi_cleaned_gene_products.txt" 19 | }, 20 | "busco": { 21 | "fungi": [ 22 | "https://osf.io/xvzmu/download?version=1", 23 | "fungi_odb9" 24 | ], 25 | "microsporidia": [ 26 | "https://osf.io/r47nx/download?version=1", 27 | "microsporidia_odb9" 28 | ], 29 | "dikarya": [ 30 | "https://osf.io/av6f8/download?version=1", 31 | "dikarya_odb9" 32 | ], 33 | "ascomycota": [ 34 | "https://osf.io/z2736/download?version=1", 35 | "ascomycota_odb9" 36 | ], 37 | "pezizomycotina": [ 38 | "https://osf.io/bj3sm/download?version=1", 39 | "pezizomycotina_odb9" 40 | ], 41 | "eurotiomycetes": [ 42 | "https://osf.io/nvt3z/download?version=1", 43 | "eurotiomycetes_odb9" 44 | ], 45 | "sordariomycetes": [ 46 | "https://osf.io/r24kn/download?version=1", 47 | "sordariomyceta_odb9" 48 | ], 49 | "saccharomycetes": [ 50 | "https://osf.io/mpu2k/download?version=1", 51 | "saccharomyceta_odb9" 52 | ], 53 | "saccharomycetales": [ 54 | "https://osf.io/dhk47/download?version=1", 55 | "saccharomycetales_odb9" 56 | ], 57 | "basidiomycota": [ 58 | "https://osf.io/2xnsj/download?version=1", 59 | "basidiomycota_odb9" 60 | ], 61 | "eukaryota": [ 62 | "https://osf.io/psj2k/download?version=1", 63 | "eukaryota_odb9" 64 | ], 65 | "protists": [ 66 | "https://osf.io/a4tsk/download?version=1", 67 | "protists_ensembl" 68 | ], 69 | "alveolata_stramenophiles": [ 70 | "https://osf.io/waqpe/download?version=1", 71 | "alveolata_stramenophiles_ensembl" 72 | ], 73 | "metazoa": [ 74 | "https://osf.io/5bvam/download?version=1", 75 | "metazoa_odb9" 76 | ], 77 | "nematoda": [ 78 | "https://osf.io/u87d3/download?version=1", 79 | "nematoda_odb9" 80 | ], 81 | "arthropoda": [ 82 | "https://osf.io/w26ez/download?version=1", 83 | "arthropoda_odb9" 84 | ], 85 | "insecta": [ 86 | "https://osf.io/8qsa5/download?version=1", 87 | "insecta_odb9" 88 | ], 89 | "endopterygota": [ 90 | "https://osf.io/pxdqg/download?version=1", 91 | "endopterygota_odb9" 92 | ], 93 | "hymenoptera": [ 94 | "https://osf.io/q4ce6/download?version=1", 95 | "hymenoptera_odb9" 96 | ], 97 | "diptera": [ 98 | "https://osf.io/e2n49/download?version=1", 99 | "diptera_odb9" 100 | ], 101 | "vertebrata": [ 102 | "https://osf.io/w6kf8/download?version=1", 103 | "vertebrata_odb9" 104 | ], 105 | "actinopterygii": [ 106 | "https://osf.io/dj2cw/download?version=1", 107 | "actinopterygii_odb9" 108 | ], 109 | "tetrapoda": [ 110 | "https://osf.io/bp4cf/download?version=1", 111 | "tetrapoda_odb9" 112 | ], 113 | "aves": [ 114 | "https://osf.io/e7qym/download?version=1", 115 | "aves_odb9" 116 | ], 117 | "mammalia": [ 118 | 
"https://osf.io/dvy5m/download?version=1", 119 | "mammalia_odb9" 120 | ], 121 | "euarchontoglires": [ 122 | "https://osf.io/p3nc7/download?version=1", 123 | "euarchontoglires_odb9" 124 | ], 125 | "laurasiatheria": [ 126 | "https://osf.io/2v9hj/download?version=1", 127 | "laurasiatheria_odb9" 128 | ], 129 | "embryophyta": [ 130 | "https://osf.io/m67p4/download?version=1", 131 | "embryophyta_odb9" 132 | ] 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /funannotate/fix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import uuid 7 | import shutil 8 | import argparse 9 | import subprocess 10 | import funannotate.library as lib 11 | 12 | 13 | def main(args): 14 | # setup menu with argparse 15 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 16 | def __init__(self, prog): 17 | super(MyFormatter, self).__init__(prog, max_help_position=48) 18 | parser = argparse.ArgumentParser(prog='fix', usage="%(prog)s [options] -i genome.GBK -t genome.tbl", 19 | description='''Script will update annotation of a Genbank file with new tbl.''', 20 | epilog="""Written by Jon Palmer (2017) nextgenusfs@gmail.com""", 21 | formatter_class=MyFormatter) 22 | parser.add_argument('-i', '--input', required=True, 23 | help='Genome in GBK format') 24 | parser.add_argument('-t', '--tbl', required=True, 25 | help='Genome annotation in NCBI tbl format') 26 | parser.add_argument( 27 | '-d', '--drop', help='List of locus_tag to remove/drop from annotation') 28 | parser.add_argument('-o', '--out', help='Basename of output files') 29 | parser.add_argument('--tbl2asn', default='-l paired-ends', 30 | help='Parameters for tbl2asn, linkage and gap info') 31 | args = parser.parse_args(args) 32 | 33 | parentdir = os.path.join(os.path.dirname(__file__)) 34 | 35 | # create log file 36 | log_name = 'funannotate-fix.log' 37 | if os.path.isfile(log_name): 38 | os.remove(log_name) 39 | 40 | # initialize script, log system info and cmd issue at runtime 41 | lib.setupLogging(log_name) 42 | cmd_args = " ".join(sys.argv)+'\n' 43 | lib.log.debug(cmd_args) 44 | print("-------------------------------------------------------") 45 | lib.SystemInfo() 46 | 47 | # get version of funannotate 48 | version = lib.get_version() 49 | lib.log.info("Running %s" % version) 50 | 51 | # create output and temporary directory 52 | if args.out: 53 | basedir = args.out 54 | else: 55 | # get location from tbl file 56 | basedir = os.path.dirname(args.tbl) 57 | if basedir == '': 58 | basedir = '.' 
59 | 60 | if not os.path.isdir(basedir): 61 | os.makedirs(basedir) 62 | if not os.path.isdir(os.path.join(basedir, 'tbl2asn')): 63 | os.makedirs(os.path.join(basedir, 'tbl2asn')) 64 | 65 | # copy over the annotation file to tbl2asn folder, or process if args.drop passed 66 | if args.drop: 67 | lib.tblfilter(args.tbl, args.drop, os.path.join( 68 | basedir, 'tbl2asn', 'genome.tbl')) 69 | else: 70 | shutil.copyfile(args.tbl, os.path.join( 71 | basedir, 'tbl2asn', 'genome.tbl')) 72 | 73 | # get information info from GBK file 74 | organism, strain, isolate, accession, WGS_accession, gb_gi, version = lib.getGBKinfo( 75 | args.input) 76 | locustag, genenum, justify = lib.getGBKLocusTag(args.input) 77 | if strain: 78 | organism_name = organism+'_'+strain 79 | elif isolate: 80 | organism_name = organism+'_'+isolate 81 | else: 82 | organism_name = organism 83 | organism_name = organism_name.replace(' ', '_') 84 | 85 | # extract fasta file from genbank file, 86 | lib.log.info('Extracting genome sequence and parsing meta information') 87 | contigs, genes, trnas = lib.countGenBank(args.input) 88 | lib.log.info('{:,} contigs containing {:,} protein coding genes and {:,} tRNA genes'.format( 89 | contigs, genes, trnas)) 90 | lib.gb2dna(args.input, os.path.join(basedir, 'tbl2asn', 'genome.fsa')) 91 | 92 | # assuming that this is the predict_results dir or update_results dir, but check first and then archive 93 | if '_results' in basedir: 94 | archivedir = os.path.join(basedir, 'archive_'+str(uuid.uuid4())) 95 | lib.log.info( 96 | 'Found pre-existing funannotate files, archiving to %s' % archivedir) 97 | os.makedirs(archivedir) 98 | # move files in results to archive dir 99 | for file in os.listdir(basedir): 100 | if 'pasa-reannotation' in file or 'WGS_accession' in file or 'ncbi.p2g' in file or '.parameters.json' in file: 101 | continue 102 | if os.path.isfile(os.path.join(basedir, file)): 103 | os.rename(os.path.join(basedir, file), 104 | os.path.join(archivedir, file)) 105 | 106 | # now we can run tbl2asn 107 | SBT = os.path.join(parentdir, 'config', 'test.sbt') 108 | discrep = os.path.join(basedir, organism_name+'.discrepency.txt') 109 | if not version: 110 | version = 1 111 | lib.log.info('Converting to GenBank format') 112 | # have to run as subprocess because of multiprocessing issues 113 | cmd = [sys.executable, os.path.join(parentdir, 'aux_scripts', 'tbl2asn_parallel.py'), 114 | '-i', os.path.join(basedir, 'tbl2asn', 'genome.tbl'), '-f', os.path.join( 115 | basedir, 'tbl2asn', 'genome.fsa'), 116 | '-o', os.path.join(basedir, 'tbl2asn'), '--sbt', SBT, '-d', discrep, 117 | '-s', organism, '-t', args.tbl2asn, '-v', str(version), '-c', '4'] 118 | if isolate: 119 | cmd += ['--isolate', isolate] 120 | if strain: 121 | cmd += ['--strain', strain] 122 | lib.log.debug(' '.join(cmd)) 123 | subprocess.call(cmd) 124 | 125 | # now get GBK files from folder 126 | lib.log.info('Generating output files.') 127 | # setup final output files 128 | final_fasta = os.path.join(basedir, organism_name + '.scaffolds.fa') 129 | final_gff = os.path.join(basedir, organism_name + '.gff3') 130 | final_gbk = os.path.join(basedir, organism_name + '.gbk') 131 | final_tbl = os.path.join(basedir, organism_name + '.tbl') 132 | final_proteins = os.path.join(basedir, organism_name + '.proteins.fa') 133 | final_transcripts = os.path.join( 134 | basedir, organism_name + '.mrna-transcripts.fa') 135 | final_cds_transcripts = os.path.join( 136 | basedir, organism_name + '.cds-transcripts.fa') 137 | final_validation = os.path.join(basedir, 
organism_name+'.validation.txt') 138 | final_error = os.path.join(basedir, organism_name+'.error.summary.txt') 139 | final_fixes = os.path.join( 140 | basedir, organism_name+'.models-need-fixing.txt') 141 | 142 | # retrieve files/reorganize 143 | shutil.copyfile(os.path.join(basedir, 'tbl2asn', 'genome.gbf'), final_gbk) 144 | shutil.copyfile(os.path.join(basedir, 'tbl2asn', 'genome.tbl'), final_tbl) 145 | shutil.copyfile(os.path.join(basedir, 'tbl2asn', 146 | 'genome.val'), final_validation) 147 | shutil.copyfile(os.path.join(basedir, 'tbl2asn', 148 | 'errorsummary.val'), final_error) 149 | lib.tbl2allout(final_tbl, os.path.join(basedir, 'tbl2asn', 'genome.fsa'), final_gff, 150 | final_proteins, final_transcripts, final_cds_transcripts, final_fasta) 151 | errors = lib.ncbiCheckErrors( 152 | final_error, final_validation, locustag, final_fixes) 153 | if errors > 0: 154 | lib.log.info("Manually edit the tbl file %s, then run:\n\nfunannotate fix -i %s -t %s\n" % 155 | (final_tbl, final_gbk, final_tbl)) 156 | else: 157 | contigs, genes, trnas = lib.countGenBank(final_gbk) 158 | lib.log.info('Output genome consists of: {:,} contigs containing {:,} protein coding genes and {:,} tRNA genes'.format( 159 | contigs, genes, trnas)) 160 | 161 | # clean up 162 | shutil.rmtree(os.path.join(basedir, 'tbl2asn')) 163 | 164 | 165 | if __name__ == "__main__": 166 | main(sys.argv[1:]) 167 | -------------------------------------------------------------------------------- /funannotate/html_template/css/starter-template.css: -------------------------------------------------------------------------------- 1 | body { 2 | padding-top: 50px; 3 | } 4 | .starter-template { 5 | padding: 40px 15px; 6 | text-align: center; 7 | } 8 | .table { 9 | padding: 40px 15px; 10 | text-align: left; 11 | } 12 | .center-table { 13 | padding: 40px 15px; 14 | text-align: center !important;; 15 | } 16 | .table td { 17 | text-align: center; 18 | vertical-align: middle; 19 | } 20 | -------------------------------------------------------------------------------- /funannotate/html_template/js/ie-emulation-modes-warning.js: -------------------------------------------------------------------------------- 1 | // NOTICE!! DO NOT USE ANY OF THIS JAVASCRIPT 2 | // IT'S JUST JUNK FOR OUR DOCS! 3 | // ++++++++++++++++++++++++++++++++++++++++++ 4 | /*! 5 | * Copyright 2014-2015 Twitter, Inc. 6 | * 7 | * Licensed under the Creative Commons Attribution 3.0 Unported License. For 8 | * details, see https://creativecommons.org/licenses/by/3.0/. 9 | */ 10 | // Intended to prevent false-positive bug reports about Bootstrap not working properly in old versions of IE due to folks testing using IE's unreliable emulation modes. 11 | (function () { 12 | 'use strict'; 13 | 14 | function emulatedIEMajorVersion() { 15 | var groups = /MSIE ([0-9.]+)/.exec(window.navigator.userAgent) 16 | if (groups === null) { 17 | return null 18 | } 19 | var ieVersionNum = parseInt(groups[1], 10) 20 | var ieMajorVersion = Math.floor(ieVersionNum) 21 | return ieMajorVersion 22 | } 23 | 24 | function actualNonEmulatedIEMajorVersion() { 25 | // Detects the actual version of IE in use, even if it's in an older-IE emulation mode. 
26 | // IE JavaScript conditional compilation docs: https://msdn.microsoft.com/library/121hztk3%28v=vs.94%29.aspx 27 | // @cc_on docs: https://msdn.microsoft.com/library/8ka90k2e%28v=vs.94%29.aspx 28 | var jscriptVersion = new Function('/*@cc_on return @_jscript_version; @*/')() // jshint ignore:line 29 | if (jscriptVersion === undefined) { 30 | return 11 // IE11+ not in emulation mode 31 | } 32 | if (jscriptVersion < 9) { 33 | return 8 // IE8 (or lower; haven't tested on IE<8) 34 | } 35 | return jscriptVersion // IE9 or IE10 in any mode, or IE11 in non-IE11 mode 36 | } 37 | 38 | var ua = window.navigator.userAgent 39 | if (ua.indexOf('Opera') > -1 || ua.indexOf('Presto') > -1) { 40 | return // Opera, which might pretend to be IE 41 | } 42 | var emulated = emulatedIEMajorVersion() 43 | if (emulated === null) { 44 | return // Not IE 45 | } 46 | var nonEmulated = actualNonEmulatedIEMajorVersion() 47 | 48 | if (emulated !== nonEmulated) { 49 | window.alert('WARNING: You appear to be using IE' + nonEmulated + ' in IE' + emulated + ' emulation mode.\nIE emulation modes can behave significantly differently from ACTUAL older versions of IE.\nPLEASE DON\'T FILE BOOTSTRAP BUGS based on testing in IE emulation modes!') 50 | } 51 | })(); 52 | -------------------------------------------------------------------------------- /funannotate/html_template/js/ie10-viewport-bug-workaround.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * IE10 viewport hack for Surface/desktop Windows 8 bug 3 | * Copyright 2014-2015 Twitter, Inc. 4 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 5 | */ 6 | 7 | // See the Getting Started docs for more information: 8 | // http://getbootstrap.com/getting-started/#support-ie10-width 9 | 10 | (function () { 11 | 'use strict'; 12 | 13 | if (navigator.userAgent.match(/IEMobile\/10\.0/)) { 14 | var msViewportStyle = document.createElement('style') 15 | msViewportStyle.appendChild( 16 | document.createTextNode( 17 | '@-ms-viewport{width:auto!important}' 18 | ) 19 | ) 20 | document.querySelector('head').appendChild(msViewportStyle) 21 | } 22 | 23 | })(); 24 | -------------------------------------------------------------------------------- /funannotate/iprscan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import subprocess 7 | 8 | 9 | def main(args): 10 | # little wrapper to run interproscan multiprocessing 11 | # just pass this onto iprscan-local.py 12 | parentdir = os.path.join(os.path.dirname(__file__)) 13 | cmd = [sys.executable, os.path.join( 14 | parentdir, 'aux_scripts', 'iprscan-local.py')] 15 | cmd += args 16 | subprocess.call(cmd) 17 | 18 | 19 | if __name__ == "__main__": 20 | main(sys.argv[1:]) 21 | -------------------------------------------------------------------------------- /funannotate/outgroups.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import shutil 7 | import argparse 8 | from Bio import SeqIO 9 | import funannotate.library as lib 10 | 11 | 12 | def main(args): 13 | # setup menu with argparse 14 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 15 | def __init__(self, prog): 16 | super(MyFormatter, self).__init__(prog, max_help_position=48) 17 | parser = argparse.ArgumentParser(prog='funannotate-predict.py', 
usage="%(prog)s [options] -i genome.fasta", 18 | description='''Script that adds a proteome to the outgroups.''', 19 | epilog="""Written by Jon Palmer (2016) nextgenusfs@gmail.com""", 20 | formatter_class=MyFormatter) 21 | parser.add_argument('-i', '--input', required=True, 22 | help='Proteome in FASTA format') 23 | parser.add_argument('-s', '--species', required=True, 24 | help='Species name "binomial in quotes"') 25 | parser.add_argument('-b', '--busco_db', default='dikarya', choices=['fungi', 'microsporidia', 'dikarya', 'ascomycota', 'pezizomycotina', 'eurotiomycetes', 'sordariomycetes', 'saccharomycetes', 'saccharomycetales', 'basidiomycota', 'eukaryota', 'protists', 26 | 'alveolata_stramenophiles', 'metazoa', 'nematoda', 'arthropoda', 'insecta', 'endopterygota', 'hymenoptera', 'diptera', 'vertebrata', 'actinopterygii', 'tetrapoda', 'aves', 'mammalia', 'euarchontoglires', 'laurasiatheria', 'embryophyta'], help='BUSCO database to use') 27 | parser.add_argument('-c', '--cpus', default=2, type=int, 28 | help='Number of CPUs to use') 29 | parser.add_argument('-d', '--database', 30 | help='Path to funannotate database, $FUNANNOTATE_DB') 31 | args = parser.parse_args(args) 32 | 33 | if args.database: 34 | FUNDB = args.database 35 | else: 36 | try: 37 | FUNDB = os.environ["FUNANNOTATE_DB"] 38 | except KeyError: 39 | lib.log.error( 40 | 'Funannotate database not properly configured, run funannotate setup.') 41 | sys.exit(1) 42 | 43 | parentdir = os.path.join(os.path.dirname(__file__)) 44 | 45 | # get base name 46 | species = args.species.replace(' ', '_').lower()+'.'+args.busco_db 47 | OUTGROUPS = os.path.join(FUNDB, 'outgroups') 48 | 49 | # create log file 50 | log_name = species+'-add2outgroups.log' 51 | if os.path.isfile(log_name): 52 | os.remove(log_name) 53 | 54 | # initialize script, log system info and cmd issue at runtime 55 | lib.setupLogging(log_name) 56 | cmd_args = " ".join(sys.argv)+'\n' 57 | lib.log.debug(cmd_args) 58 | print("-------------------------------------------------------") 59 | lib.SystemInfo() 60 | 61 | # get version of funannotate 62 | version = lib.get_version() 63 | lib.log.info("Running %s" % version) 64 | 65 | # check buscos, download if necessary 66 | if not os.path.isdir(os.path.join(FUNDB, args.busco_db)): 67 | lib.log.error("%s busco database is missing, install with funannotate setup -b %s" % 68 | (args.busco_db, args.busco_db)) 69 | sys.exit(1) 70 | 71 | ProtCount = lib.countfasta(args.input) 72 | lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded') 73 | 74 | # convert to proteins and screen with busco 75 | lib.log.info("Looking for BUSCO models with %s DB" % args.busco_db) 76 | BUSCODB = os.path.join(FUNDB, args.busco_db) 77 | BUSCO = os.path.join(parentdir, 'aux_scripts', 'funannotate-BUSCO2.py') 78 | cmd = [sys.executable, BUSCO, '-i', os.path.abspath( 79 | args.input), '-m', 'proteins', '--lineage', BUSCODB, '-o', species, '--cpu', str(args.cpus), '-f'] 80 | lib.runSubprocess(cmd, '.', lib.log) 81 | 82 | # check that it ran correctly 83 | busco_results = os.path.join('run_'+species, 'full_table_'+species+'.tsv') 84 | if not lib.checkannotations(busco_results): 85 | lib.log.error("BUSCO failed, check logfile") 86 | sys.exit(1) 87 | nameChange = {} 88 | with open(busco_results, 'rU') as input: 89 | for line in input: 90 | if line.startswith('#'): 91 | continue 92 | cols = line.split('\t') 93 | if cols[1] == 'Complete': 94 | if not cols[2] in nameChange: 95 | nameChange[cols[2]] = cols[0] 96 | else: 97 | lib.log.error( 98 | "Duplicate ID 
found: %s %s. Removing from results" % (cols[2], cols[0])) 99 | del nameChange[cols[2]] 100 | 101 | # output counts 102 | lib.log.info('{0:,}'.format(len(nameChange)) + ' BUSCO models found') 103 | 104 | # index the proteome for parsing 105 | SeqRecords = SeqIO.to_dict(SeqIO.parse(args.input, 'fasta')) 106 | 107 | # setup output proteome 108 | busco_out = os.path.join(OUTGROUPS, species+'_buscos.fa') 109 | with open(busco_out, 'w') as output: 110 | for k, v in list(nameChange.items()): 111 | rec = SeqRecords[k] 112 | output.write('>%s\n%s\n' % (v, rec.seq)) 113 | lib.log.info("Results written to: %s" % busco_out) 114 | 115 | # clean up your mess 116 | shutil.rmtree('run_'+species) 117 | shutil.rmtree('tmp') 118 | 119 | 120 | if __name__ == "__main__": 121 | main(sys.argv[1:]) 122 | -------------------------------------------------------------------------------- /funannotate/sort.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import absolute_import, division, print_function, unicode_literals 5 | 6 | import sys 7 | import argparse 8 | from Bio.SeqIO.FastaIO import SimpleFastaParser 9 | from funannotate.library import countfasta, softwrap 10 | 11 | 12 | def SortRenameHeaders(input, basename, output, minlen=0, simplify=False): 13 | Seqs = [] 14 | with open(input, "r") as infile: 15 | for header, sequence in SimpleFastaParser(infile): 16 | Seqs.append((header, len(sequence), sequence)) 17 | # sort by length 18 | sortedSeqs = sorted(Seqs, key=lambda x: x[1], reverse=True) 19 | # loop through and return contigs and keepers 20 | counter = 1 21 | with open(output, "w") as outfile: 22 | for name, length, seq in sortedSeqs: 23 | if simplify: # try to just split at first space 24 | if " " in name: 25 | newName = name.split(" ")[0] 26 | else: 27 | newName = name 28 | else: 29 | newName = f"{basename}_{counter}" 30 | if len(newName) > 16: 31 | print( 32 | f"Error. 
{newName} fasta header too long.", 33 | "Choose a different --base name.", 34 | "NCBI/GenBank max is 16 characters.", 35 | ) 36 | raise SystemExit(1) 37 | if minlen > 0: 38 | if length >= minlen: 39 | # ony write if length 40 | outfile.write(">{:}\n{:}\n".format(newName, softwrap(seq))) 41 | else: 42 | # always write if we aren't filtering by length 43 | outfile.write(">{:}\n{:}\n".format(newName, softwrap(seq))) 44 | counter += 1 45 | 46 | 47 | def main(args): 48 | # setup menu with argparse 49 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 50 | def __init__(self, prog): 51 | super(MyFormatter, self).__init__(prog, max_help_position=48) 52 | 53 | parser = argparse.ArgumentParser( 54 | prog="sort_rename.py", 55 | usage="%(prog)s [options] -i genome.fa -o sorted.fa", 56 | description="Script that sorts input by length and then renames contig headers.", 57 | epilog="""Written by Jon Palmer (2016) nextgenusfs@gmail.com""", 58 | formatter_class=MyFormatter, 59 | ) 60 | parser.add_argument("-i", "--input", required=True, help="Multi-fasta genome file") 61 | parser.add_argument("-o", "--out", required=True, help="Cleaned output (FASTA)") 62 | parser.add_argument( 63 | "-b", "--base", default="scaffold", help="Basename of contig header" 64 | ) 65 | parser.add_argument( 66 | "-s", 67 | "--simplify", 68 | action="store_true", 69 | help="Try to simplify headers, split at first space", 70 | ) 71 | parser.add_argument( 72 | "-m", "--minlen", type=int, help="Contigs shorter than threshold are discarded" 73 | ) 74 | args = parser.parse_args(args) 75 | 76 | print(("{:,} contigs records loaded".format(countfasta(args.input)))) 77 | print("Sorting and renaming contig headers") 78 | if args.minlen: 79 | print(("Removing contigs less than {:} bp".format(args.minlen))) 80 | SortRenameHeaders( 81 | args.input, args.base, args.out, minlen=args.minlen, simplify=args.simplify 82 | ) 83 | print(("{:,} contigs saved to file".format(countfasta(args.out)))) 84 | 85 | 86 | if __name__ == "__main__": 87 | main(sys.argv[1:]) 88 | -------------------------------------------------------------------------------- /funannotate/species.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import argparse 7 | from natsort import natsorted 8 | import json 9 | import shutil 10 | import funannotate.library as lib 11 | 12 | 13 | def speciesAvailable(dir): 14 | # return dictionary of species name and path to info.json file 15 | Results = {} 16 | for f in os.listdir(dir): 17 | ff = os.path.join(dir, f) 18 | if os.path.isdir(ff) and lib.checkannotations(os.path.join(ff, 'info.json')): 19 | with open(os.path.join(ff, 'info.json')) as infile: 20 | data = json.load(infile) 21 | Results[f] = data 22 | return Results 23 | 24 | 25 | def showAll(dir): 26 | Table = [] 27 | TableHeader = ['Species', 'Augustus', 'GeneMark', 28 | 'Snap', 'GlimmerHMM', 'CodingQuarry', 'Date'] 29 | for f in os.listdir(dir): 30 | ff = os.path.join(dir, f) 31 | if os.path.isdir(ff) and lib.checkannotations(os.path.join(ff, 'info.json')): 32 | with open(os.path.join(ff, 'info.json')) as infile: 33 | data = json.load(infile) 34 | sources = [f] 35 | for x in ['augustus', 'genemark', 'snap', 'glimmerhmm', 'codingquarry']: 36 | if x in data: 37 | if len(data[x][0]) < 1: 38 | sources.append('None') 39 | else: 40 | sourceFile = data[x][0]['source'] 41 | if ': ' in sourceFile: 42 | sourceFile = sourceFile.split(':')[0] 43 | 
sources.append(sourceFile) 44 | sources.append(data['augustus'][0]['date']) 45 | Table.append(sources) 46 | Table = natsorted(Table, key=lambda x: x[0]) 47 | Table.insert(0, TableHeader) 48 | lib.print_table(Table, max_col_width=40) 49 | 50 | 51 | def copyDir(src, dest): 52 | try: 53 | shutil.copytree(src, dest) 54 | # Directories are the same 55 | except shutil.Error as e: 56 | print(('Directory not copied. Error: %s' % e)) 57 | # Any error saying that the directory doesn't exist 58 | except OSError as e: 59 | print(('Directory not copied. Error: %s' % e)) 60 | 61 | 62 | def main(args): 63 | # setup menu with argparse 64 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 65 | def __init__(self, prog): 66 | super(MyFormatter, self).__init__(prog, max_help_position=48) 67 | parser = argparse.ArgumentParser(prog='species.py', 68 | description='''Script to show/update species training parameters.''', 69 | epilog="""Written by Jon Palmer (2018) nextgenusfs@gmail.com""", 70 | formatter_class=MyFormatter) 71 | parser.add_argument('-s', '--species', help='Species name to show/update') 72 | parser.add_argument('-a', '--add', '--add-parameters', 73 | dest='add', help='Parameter JSON file to add to database') 74 | parser.add_argument('-p', '--parameters', dest='parameters', 75 | help='Parameter JSON file to add to database') 76 | parser.add_argument('-d', '--database', 77 | help='Path to funannotate database, $FUNANNOTATE_DB') 78 | args = parser.parse_args(args) 79 | 80 | # setup funannotate DB path 81 | if args.database: 82 | FUNDB = args.database 83 | else: 84 | try: 85 | FUNDB = os.environ["FUNANNOTATE_DB"] 86 | except KeyError: 87 | print('Funannotate database not properly configured, run funannotate setup.') 88 | sys.exit(1) 89 | 90 | # process input here 91 | if args.parameters: # just pretty-print JSON file 92 | with open(args.parameters) as input: 93 | table = json.load(input) 94 | print((json.dumps(table, indent=3))) 95 | elif args.species and args.add: # have one to add to database 96 | SpFound = speciesAvailable(os.path.join(FUNDB, 'trained_species')) 97 | if not os.access(os.path.join(FUNDB, 'trained_species'), os.W_OK | os.X_OK): 98 | print(('ERROR: you do not have permissions to write to {:}'.format( 99 | os.path.join(FUNDB, 'trained_species')))) 100 | sys.exit(1) 101 | if args.species in SpFound: 102 | print(('ERROR: {:} is already in database, choose a different name or delete existing to continue'.format( 103 | args.species))) 104 | sys.exit(1) 105 | print(('Adding {:} to Database'.format(args.species))) 106 | newLoc = os.path.abspath(os.path.join( 107 | FUNDB, 'trained_species', args.species)) 108 | if not os.path.isdir(newLoc): 109 | os.makedirs(newLoc) 110 | with open(args.add) as infile: 111 | data = json.load(infile) 112 | for x in data: 113 | if 'path' not in data[x][0]: 114 | continue 115 | newPath = os.path.join( 116 | newLoc, os.path.basename(data[x][0]['path'])) 117 | if os.path.isdir(data[x][0]['path']): 118 | copyDir(data[x][0]['path'], newPath) 119 | elif os.path.isfile(data[x][0]['path']): 120 | shutil.copyfile(data[x][0]['path'], newPath) 121 | data[x][0]['path'] = os.path.abspath(newPath) 122 | # print new data to terminal 123 | print(('Following training data added for {:}'.format(args.species))) 124 | print((json.dumps(data, indent=3))) 125 | with open(os.path.join(newLoc, 'info.json'), 'w') as outfile: 126 | json.dump(data, outfile) 127 | 128 | elif args.species: # look for in database and pretty-print JSON file 129 | SpFound = 
speciesAvailable(os.path.join(FUNDB, 'trained_species')) 130 | if args.species in SpFound: 131 | print((json.dumps(SpFound[args.species], indent=3))) 132 | else: 133 | print(('{:} not found in Funannotate trained species folder'.format( 134 | args.species))) 135 | print('Valid species are:') 136 | showAll(os.path.join(FUNDB, 'trained_species')) 137 | else: 138 | # just show all available species in the database and their training data 139 | showAll(os.path.join(FUNDB, 'trained_species')) 140 | # row_str = colour(row_str, header_format) 141 | print('\n') 142 | print((lib.colour('Options for this script:', 'bold'))) 143 | print((lib.colour(' To print a parameter file to terminal:', 'none'))) 144 | print((lib.colour(' funannotate species -p myparameters.json', 'dim'))) 145 | print((lib.colour( 146 | ' To print the parameters details from a species in the database:', 'none'))) 147 | print((lib.colour(' funannotate species -s aspergillus_fumigatus', 'dim'))) 148 | print((lib.colour(' To add a new species to database:', 'none'))) 149 | print((lib.colour( 150 | ' funannotate species -s new_species_name -a new_species_name.parameters.json\n', 'dim'))) 151 | 152 | 153 | if __name__ == "__main__": 154 | main(sys.argv[1:]) 155 | -------------------------------------------------------------------------------- /funannotate/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextgenusfs/funannotate/033a883081a83a161798ecc17eaf77b16b5c552b/funannotate/utilities/__init__.py -------------------------------------------------------------------------------- /funannotate/utilities/bam2gff3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import funannotate.library as lib 7 | 8 | 9 | def main(args): 10 | # setup menu with argparse 11 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 12 | def __init__(self, prog): 13 | super(MyFormatter, self).__init__(prog, max_help_position=48) 14 | parser = argparse.ArgumentParser(prog='bam2gff3.py', 15 | description='''Script to convert BAM to GFF3.''', 16 | epilog="""Written by Jon Palmer (2018) nextgenusfs@gmail.com""", 17 | formatter_class=MyFormatter) 18 | parser.add_argument('-i', '--bam', required=True, help='input BAM') 19 | parser.add_argument('-o', '--output', required=True, help='Output GFF3') 20 | args = parser.parse_args(args) 21 | 22 | # convert BAM to gff3 23 | lib.bam2gff3(args.bam, args.output) 24 | 25 | 26 | if __name__ == "__main__": 27 | main(sys.argv[1:]) 28 | -------------------------------------------------------------------------------- /funannotate/utilities/gbk2parts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import funannotate.library as lib 7 | 8 | 9 | def main(args): 10 | # setup menu with argparse 11 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 12 | def __init__(self, prog): 13 | super(MyFormatter, self).__init__(prog, max_help_position=48) 14 | parser = argparse.ArgumentParser(prog='gbk2parts.py', 15 | description='''Script to convert GBK file to its components.''', 16 | epilog="""Written by Jon Palmer (2018) nextgenusfs@gmail.com""", 17 | formatter_class=MyFormatter) 18 | parser.add_argument('-g', '--gbk', required=True, 19 | help='Genome in GenBank format') 20 | 
parser.add_argument('-o', '--output', required=True, 21 | help='Output basename') 22 | args = parser.parse_args(args) 23 | 24 | # setup output files 25 | tblout = f'{args.output}.tbl' 26 | gffout = f'{args.output}.gff3' 27 | protout = f'{args.output}.proteins.fa' 28 | transout = f'{args.output}.mrna-transcripts.fa' 29 | cdsout = f'{args.output}.cds-transcripts.fa' 30 | dnaout = f'{args.output}.scaffolds.fa' 31 | lib.gb2parts(args.gbk, tblout, gffout, protout, transout, cdsout, dnaout) 32 | 33 | if __name__ == "__main__": 34 | main(sys.argv[1:]) 35 | -------------------------------------------------------------------------------- /funannotate/utilities/get_longest_isoform.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys, re, os, gzip, argparse 5 | from Bio import SeqIO 6 | 7 | def main(inargs): 8 | # setup menu with argparse 9 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 10 | def __init__(self, prog): 11 | super(MyFormatter, self).__init__(prog, max_help_position=48) 12 | parser = argparse.ArgumentParser(prog='get_longest_isoform', 13 | description='''Script to extract longest isoform of protein or transcript file from funannotate or where gene is tagged in header.''', 14 | epilog="""Written by Jason Stajich (2022) @hyphaltip""", 15 | formatter_class=MyFormatter) 16 | parser.add_argument('-i', '--input', required=True, 17 | help='fasta formatted transcript or protein file') 18 | parser.add_argument('-o', '--output', help='Output basename') 19 | 20 | parser.add_argument('-v', '--verbose', help='Extra verbose output',dest='verbose', default=False, action='store_true') 21 | 22 | args = parser.parse_args(inargs) 23 | genes = {} 24 | if not args.output: 25 | args.output = args.input + ".longest" 26 | transmatch = re.compile(r'\-T\d+$') 27 | genematch = re.compile(r'gene[:=](\S+)') 28 | recCount = 0 29 | handle = args.input 30 | if args.input.endswith('.gz'): 31 | handle = gzip.open(args.input,"rt") 32 | for rec in SeqIO.parse(handle, "fasta"): 33 | id = rec.id 34 | description = rec.description 35 | geneid = id 36 | m = transmatch.search(id) 37 | if m: 38 | geneid = description.split()[1] 39 | else: 40 | m = genematch.search(description) 41 | if m: 42 | geneid = m.group(1) 43 | if geneid == id: 44 | if args.verbose: 45 | print("Warning: could not parse gene name from header '{}' '{}'".format(id,description)) 46 | if geneid not in genes or len(rec) > len(genes[geneid]): 47 | genes[geneid] = rec 48 | recCount += 1 49 | 50 | print("{} genes and {} total sequences (isoforms) seen".format(len(genes),recCount)) 51 | SeqIO.write(genes.values(),args.output,'fasta') 52 | 53 | if __name__ == "__main__": 54 | main(sys.argv[1:]) 55 | -------------------------------------------------------------------------------- /funannotate/utilities/gff2prot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | from natsort import natsorted 7 | import funannotate.library as lib 8 | 9 | 10 | def main(args): 11 | # setup menu with argparse 12 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 13 | def __init__(self, prog): 14 | super(MyFormatter, self).__init__(prog, max_help_position=48) 15 | parser = argparse.ArgumentParser(prog='gff2prot.py', 16 | description='''Script to convert GFF3 and FASTA proteins.''', 17 | epilog="""Written by Jon Palmer (2018) 
nextgenusfs@gmail.com""", 18 | formatter_class=MyFormatter) 19 | parser.add_argument('-g', '--gff3', required=True, 20 | help='Genome annotation GFF3 format') 21 | parser.add_argument('-f', '--fasta', required=True, 22 | help='Genome in FASTA format') 23 | parser.add_argument('--no_stop', action='store_true', 24 | help='Dont print stop codon') 25 | args = parser.parse_args(args) 26 | 27 | # translate GFF3 to proteins 28 | # load into dictionary 29 | Genes = {} 30 | Genes = lib.gff2dict(args.gff3, args.fasta, Genes) 31 | 32 | for k, v in natsorted(list(Genes.items())): 33 | if v['type'] == 'mRNA': 34 | for i, x in enumerate(v['ids']): 35 | if args.no_stop: 36 | Prot = v['protein'][i].rstrip('*') 37 | else: 38 | Prot = v['protein'][i] 39 | sys.stdout.write('>%s %s\n%s\n' % (x, k, lib.softwrap(Prot))) 40 | 41 | 42 | if __name__ == "__main__": 43 | main(sys.argv[1:]) 44 | -------------------------------------------------------------------------------- /funannotate/utilities/gff_reformat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | from natsort import natsorted 7 | from collections import OrderedDict 8 | import funannotate.library as lib 9 | 10 | 11 | def main(args): 12 | # setup menu with argparse 13 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 14 | def __init__(self, prog): 15 | super(MyFormatter, self).__init__(prog, max_help_position=48) 16 | parser = argparse.ArgumentParser(prog='gff_reformat.py', 17 | description='''Script to rename gene models GFF3 file.''', 18 | epilog="""Written by Jon Palmer (2020) nextgenusfs@gmail.com""", 19 | formatter_class=MyFormatter) 20 | parser.add_argument('-g', '--gff3', required=True, 21 | help='Genome annotation GFF3 format') 22 | parser.add_argument('-f', '--fasta', required=True, 23 | help='Genome in FASTA format') 24 | parser.add_argument('-l', '--locus_tag', default='FUN', 25 | help='Basename of gene names') 26 | parser.add_argument('-n', '--numbering', default=1, type=int, 27 | help='Start numbering at') 28 | parser.add_argument('-o', '--out', required=True, help='Output GFF3') 29 | args = parser.parse_args(args) 30 | 31 | # load into dictionary 32 | Genes = {} 33 | Genes = lib.gff2dict(args.gff3, args.fasta, Genes) 34 | print('Parsed {:,} gene models from {}'.format(len(Genes), args.gff3)) 35 | 36 | # now create ordered dictionary and sort by contig and position 37 | def _sortDict(d): 38 | return (d[1]['contig'], d[1]['location'][0]) 39 | 40 | sGenes = natsorted(iter(Genes.items()), key=_sortDict) 41 | sortedGenes = OrderedDict(sGenes) 42 | renamedGenes = {} 43 | counter = args.numbering 44 | args.locus_tag = args.locus_tag.rstrip('_') 45 | transcripts = 0 46 | for k, v in list(sortedGenes.items()): 47 | locusTag = args.locus_tag+'_'+str(counter).zfill(6) 48 | renamedGenes[locusTag] = v 49 | renamedGenes[locusTag]['gene_synonym'].append(k) 50 | newIds = [] 51 | for i in range(0, len(v['ids'])): 52 | newIds.append('{}-T{}'.format(locusTag, i+1)) 53 | transcripts += 1 54 | renamedGenes[locusTag]['ids'] = newIds 55 | counter += 1 56 | 57 | # write to gff3 58 | lib.dict2gff3(renamedGenes, args.out) 59 | print('Sorted and renamed {:,} gene models {:,} transcripts: {}'.format( 60 | len(renamedGenes), transcripts, args.out)) 61 | 62 | if __name__ == "__main__": 63 | main(sys.argv[1:]) 64 | -------------------------------------------------------------------------------- /funannotate/utilities/quarry2gff3.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | 7 | 8 | def main(args): 9 | # setup menu with argparse 10 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 11 | def __init__(self, prog): 12 | super(MyFormatter, self).__init__(prog, max_help_position=48) 13 | parser = argparse.ArgumentParser(prog='codingquarry2gff3.py', 14 | description='''Script to convert CodingQuarry GFF3 to proper GFF3 format.''', 15 | epilog="""Written by Jon Palmer (2018) nextgenusfs@gmail.com""", 16 | formatter_class=MyFormatter) 17 | parser.add_argument('-i', '--input', required=True, 18 | help='CodingQuarry annotation file') 19 | parser.add_argument('-n', '--numbering', default=1, 20 | type=int, help='Gene numbering starts at') 21 | args = parser.parse_args(args) 22 | 23 | sys.stdout.write(("##gff-version 3\n")) 24 | exonCounts = {} 25 | GeneCount = args.numbering 26 | with open(args.input, 'r') as infile: 27 | for line in infile: 28 | line = line.strip() 29 | contig, source, feature, start, end, score, strand, phase, attributes = line.split( 30 | '\t') 31 | source = 'CodingQuarry' 32 | ID, Parent, Name = (None,)*3 33 | info = attributes.split(';') 34 | for x in info: 35 | if x.startswith('ID='): 36 | ID = x.replace('ID=', '') 37 | elif x.startswith('Parent='): 38 | Parent = x.replace('Parent=', '') 39 | if ID and ' ' in ID: 40 | ID = ID.split(' ')[0] 41 | if Parent and ' ' in Parent: 42 | Parent = Parent.split(' ')[0] 43 | if feature == 'gene': 44 | geneID = 'gene_'+str(GeneCount) 45 | transID = 'transcript_'+str(GeneCount)+'-T1' 46 | # if not ID in geneRef: 47 | # geneRef[ID] = (geneID, transID) 48 | sys.stdout.write('{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\tID={:};Name={:};Alias={:};\n'.format( 49 | contig, source, feature, start, end, score, strand, phase, geneID, geneID, ID)) 50 | sys.stdout.write('{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\tID={:};Parent={:};Alias={:};\n'.format( 51 | contig, source, 'mRNA', start, end, '.', strand, '.', transID, geneID, ID)) 52 | GeneCount += 1 53 | elif feature == 'CDS': 54 | # if trimID in geneRef: 55 | # geneID,transID = geneRef.get(trimID) 56 | if not transID in exonCounts: 57 | exonCounts[transID] = 1 58 | else: 59 | exonCounts[transID] += 1 60 | num = exonCounts.get(transID) 61 | sys.stdout.write('{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\tID={:}.exon{:};Parent={:};\n'.format( 62 | contig, source, 'exon', start, end, '.', strand, '.', transID, num, transID)) 63 | sys.stdout.write('{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\tID={:}.cds;Parent={:};\n'.format( 64 | contig, source, feature, start, end, score, strand, phase, transID, transID)) 65 | 66 | 67 | if __name__ == "__main__": 68 | main(sys.argv[1:]) 69 | -------------------------------------------------------------------------------- /funannotate/utilities/stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import funannotate.library as lib 7 | 8 | 9 | def main(args): 10 | # setup menu with argparse 11 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 12 | def __init__(self, prog): 13 | super(MyFormatter, self).__init__(prog, max_help_position=48) 14 | parser = argparse.ArgumentParser(prog='stats.py', 15 | description='''Script to run some simple genome annotation stats''', 16 | epilog="""Written by Jon Palmer (2020) nextgenusfs@gmail.com""", 
17 | formatter_class=MyFormatter) 18 | parser.add_argument('-f', '--fasta', required=True, 19 | help='Genome in FASTA format') 20 | parser.add_argument('-o', '--out', required=True, 21 | help='JSON output stats file') 22 | parser.add_argument('-g', '--gff', 23 | help='Genome annotation in GFF3 format') 24 | parser.add_argument('-t', '--tbl', 25 | help='Genome annotation in TBL format') 26 | parser.add_argument('--transcript_alignments', 27 | help='transcript alignments in GFF3 format') 28 | parser.add_argument('--protein_alignments', 29 | help='protein alignments in GFF3 format') 30 | args = parser.parse_args(args) 31 | 32 | 33 | if not args.gff and not args.tbl: 34 | print('Warning: no genome annotation passed (-t or -g), will only output genome assembly stats') 35 | elif args.tbl: 36 | print('Generating stats from Genome FASTA file and TBL annotation') 37 | lib.annotation_summary(args.fasta, args.out, tbl=args.tbl, 38 | transcripts=args.transcript_alignments, 39 | proteins=args.protein_alignments) 40 | elif args.gff: 41 | print('Generating stats from Genome FASTA file and GFF3 annotation') 42 | lib.annotation_summary(args.fasta, args.out, gff=args.gff, 43 | transcripts=args.transcript_alignments, 44 | proteins=args.protein_alignments) 45 | print('Finished writing JSON stats file: {}'.format(args.out)) 46 | 47 | 48 | if __name__ == "__main__": 49 | main(sys.argv[1:]) 50 | -------------------------------------------------------------------------------- /funannotate/utilities/stringtie2gff3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import funannotate.library as lib 7 | 8 | 9 | def dict2gff3(input): 10 | from collections import OrderedDict 11 | ''' 12 | function to convert funannotate gene dictionary to gff3 output 13 | ''' 14 | def _sortDict(d): 15 | return (d[1]['contig'], d[1]['location'][0]) 16 | # sort the annotations by contig and start location 17 | sGenes = sorted(iter(input.items()), key=_sortDict) 18 | sortedGenes = OrderedDict(sGenes) 19 | # then loop through and write GFF3 format 20 | sys.stdout.write("##gff-version 3\n") 21 | for k, v in list(sortedGenes.items()): 22 | sys.stdout.write("{:}\t{:}\tgene\t{:}\t{:}\t.\t{:}\t.\tID={:};\n".format( 23 | v['contig'], v['source'], v['location'][0], v['location'][1], v['strand'], k)) 24 | for i in range(0, len(v['ids'])): 25 | # build extra annotations for each transcript if applicable 26 | # now write mRNA feature 27 | sys.stdout.write("{:}\t{:}\t{:}\t{:}\t{:}\t.\t{:}\t.\tID={:};Parent={:};TPM={:}\n".format( 28 | v['contig'], v['source'], v['type'], v['location'][0], v['location'][1], v['strand'], v['ids'][i], k, v['tpm'][i])) 29 | if v['type'] == 'mRNA': 30 | if '5UTR' in v: 31 | # if 5'UTR then write those first 32 | num_5utrs = len(v['5UTR'][i]) 33 | if num_5utrs > 0: 34 | for z in range(0, num_5utrs): 35 | u_num = z + 1 36 | sys.stdout.write("{:}\t{:}\tfive_prime_UTR\t{:}\t{:}\t.\t{:}\t.\tID={:}.utr5p{:};Parent={:};\n".format( 37 | v['contig'], v['source'], v['5UTR'][i][z][0], v['5UTR'][i][z][1], v['strand'], v['ids'][i], u_num, v['ids'][i])) 38 | # write the exons 39 | num_exons = len(v['mRNA'][i]) 40 | for x in range(0, num_exons): 41 | ex_num = x + 1 42 | sys.stdout.write("{:}\t{:}\texon\t{:}\t{:}\t.\t{:}\t.\tID={:}.exon{:};Parent={:};\n".format( 43 | v['contig'], v['source'], v['mRNA'][i][x][0], v['mRNA'][i][x][1], v['strand'], v['ids'][i], ex_num, v['ids'][i])) 44 | # if 3'UTR then write 45 
| if '3UTR' in v: 46 | num_3utrs = len(v['3UTR'][i]) 47 | if num_3utrs > 0: 48 | for z in range(0, num_3utrs): 49 | u_num = z + 1 50 | sys.stdout.write("{:}\t{:}\tthree_prime_UTR\t{:}\t{:}\t.\t{:}\t.\tID={:}.utr3p{:};Parent={:};\n".format( 51 | v['contig'], v['source'], v['3UTR'][i][z][0], v['3UTR'][i][z][1], v['strand'], v['ids'][i], u_num, v['ids'][i])) 52 | if v['type'] == 'mRNA': 53 | num_cds = len(v['CDS'][i]) 54 | # GFF3 phase is 1 less than flat file 55 | current_phase = v['codon_start'][i] - 1 56 | for y in range(0, num_cds): 57 | sys.stdout.write("{:}\t{:}\tCDS\t{:}\t{:}\t.\t{:}\t{:}\tID={:}.cds;Parent={:};\n".format( 58 | v['contig'], v['source'], v['CDS'][i][y][0], v['CDS'][i][y][1], v['strand'], current_phase, v['ids'][i], v['ids'][i])) 59 | current_phase = ( 60 | current_phase - (int(v['CDS'][i][y][1]) - int(v['CDS'][i][y][0]) + 1)) % 3 61 | if current_phase == 3: 62 | current_phase = 0 63 | 64 | 65 | def main(args): 66 | # setup menu with argparse 67 | parser = argparse.ArgumentParser(prog='stringtie2gff.py', 68 | description='''Script to convert StringTie GTF file to GFF3.''', 69 | epilog="""Written by Jon Palmer (2018) nextgenusfs@gmail.com""") 70 | parser.add_argument('-i', '--input', required=True, 71 | help='StringTie GTF file') 72 | args = parser.parse_args(args) 73 | 74 | Genes = lib.gtf2dict(args.input) 75 | dict2gff3(Genes) 76 | 77 | 78 | if __name__ == "__main__": 79 | main(sys.argv[1:]) 80 | -------------------------------------------------------------------------------- /funannotate/utilities/tbl2gbk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import argparse 7 | import shutil 8 | import subprocess 9 | from natsort import natsorted 10 | import funannotate.library as lib 11 | from Bio import SeqIO 12 | 13 | 14 | def runSubprocess(cmd, dir): 15 | proc = subprocess.Popen( 16 | cmd, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 17 | stdout, stderr = proc.communicate() 18 | if stdout: 19 | print(stdout) 20 | 21 | 22 | def runtbl2asn(folder, template, discrepency, organism, isolate, strain, parameters, version): 23 | ''' 24 | function to run NCBI tbl2asn 25 | ''' 26 | # get funannotate version 27 | fun_version = lib.get_version() 28 | # input should be a folder 29 | if not os.path.isdir(folder): 30 | print(("tbl2asn error: %s is not a directory, exiting" % folder)) 31 | sys.exit(1) 32 | # based on organism, isolate, strain, construct meta info for -j flag 33 | if not organism: 34 | print("tbl2asn error: organism not specified") 35 | sys.exit(1) 36 | meta = "[organism=" + organism + "]" 37 | if isolate: 38 | isolate_meta = "[isolate=" + isolate + "]" 39 | meta = meta + " " + isolate_meta 40 | if strain: 41 | strain_meta = "[strain=" + strain + "]" 42 | meta = meta + " " + strain_meta 43 | cmd = ['tbl2asn', '-y', '"Annotated using '+fun_version+'"', '-N', 44 | str(version), '-p', folder, '-t', template, '-M', 'n', '-Z', discrepency, '-j', '"'+meta+'"', '-V', 'b', '-c', 'fx', '-T', '-a', 'r10u', '-l', 'paired-ends'] 45 | # check for custom parameters 46 | if parameters: 47 | params = parameters.split(' ') 48 | cmd = cmd + params 49 | runSubprocess(cmd, '.') 50 | return ' '.join(cmd) 51 | 52 | 53 | def locustagGB(input): 54 | tag = [] 55 | with open(input, 'r') as infile: 56 | for record in SeqIO.parse(infile, 'genbank'): 57 | for f in record.features: 58 | if f.type == 'gene': 59 | locusTag, ID, Parent = lib.getID(f, f.type) 60 | 
tag.append(locusTag) 61 | break 62 | return tag[0].split('_', -1)[0] 63 | 64 | 65 | def ncbiCheckErrors(error, validation, genename, fixOut): 66 | ncbi_error = 0 67 | actual_error = 0 68 | with open(error, 'r') as errors: 69 | for line in errors: 70 | line = line.strip() 71 | if 'ERROR' in line: 72 | num = line.split(' ')[0] 73 | ncbi_error += int(num) 74 | # if errors in summary, then parse validation report, only get errors with gene names 75 | if ncbi_error > 0: 76 | # see if we can get the gene models that need to be fixed 77 | needFixing = {} 78 | with open(validation, 'r') as validationFile: 79 | for line in validationFile: 80 | line = line.strip() 81 | if line.startswith('ERROR') and genename in line: 82 | actual_error += 1 83 | parts = line.split(' ') 84 | for x in parts: 85 | if genename in x: 86 | ID = x.split('|')[-1] 87 | if '-' in ID: 88 | ID = ID.split('-')[0] 89 | reason = line.split(' FEATURE:')[0] 90 | reason = reason.split('] ')[-1] 91 | if not ID in needFixing: 92 | needFixing[ID] = reason 93 | if actual_error > 0: 94 | print(("There are %i gene models that need to be fixed." % 95 | actual_error)) 96 | print('-------------------------------------------------------') 97 | with open(fixOut, 'w') as fix: 98 | fix.write('#GeneID\tError Message\n') 99 | for k, v in natsorted(list(needFixing.items())): 100 | fix.write('%s\t%s\n' % (k, v)) 101 | print(('%s\t%s' % (k, v))) 102 | return actual_error 103 | 104 | 105 | def main(args): 106 | # setup menu with argparse 107 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 108 | def __init__(self, prog): 109 | super(MyFormatter, self).__init__(prog, max_help_position=48) 110 | parser = argparse.ArgumentParser(prog='gbk2parts.py', 111 | description='''Script to convert GBK file to its components.''', 112 | epilog="""Written by Jon Palmer (2018) nextgenusfs@gmail.com""", 113 | formatter_class=MyFormatter) 114 | parser.add_argument('-i', '--tbl', required=True, 115 | help='Genome annotation in tbl format') 116 | parser.add_argument('-f', '--fasta', required=True, 117 | help='Genome in FASTA format') 118 | parser.add_argument('-s', '--species', required=True, 119 | help='Species name (e.g. "Aspergillus fumigatus") use quotes if there is a space') 120 | parser.add_argument('--isolate', help='Isolate name (e.g. Af293)') 121 | parser.add_argument('--strain', help='Strain name (e.g. 
CEA10)') 122 | parser.add_argument( 123 | '-t', '--tbl2asn', help='Custom parameters for tbl2asn, example: linkage and gap info') 124 | parser.add_argument('--sbt', help='tbl2asn template file') 125 | parser.add_argument('-o', '--output', help='Output basename') 126 | args = parser.parse_args(args) 127 | 128 | parentdir = os.path.dirname(lib.__file__) 129 | 130 | # see if organism/species/isolate was passed at command line 131 | organism = None 132 | if args.species: 133 | organism = args.species 134 | else: 135 | organism = os.path.basename(args.tbl).split('.t')[0] 136 | if args.strain: 137 | organism_name = organism+'_'+args.strain 138 | elif args.isolate: 139 | organism_name = organism+'_'+args.isolate 140 | else: 141 | organism_name = organism 142 | organism_name = organism_name.replace(' ', '_') 143 | if args.output: 144 | outputname = args.output 145 | else: 146 | outputname = organism_name 147 | 148 | # create tmp folder to run tbl2asn from 149 | # make tmp folder 150 | tmp = outputname + '_tmp' 151 | if not os.path.exists(tmp): 152 | os.makedirs(tmp) 153 | 154 | # now move files into proper location 155 | if not lib.checkannotations(args.fasta): 156 | print(('FASTA genome file not found: {:}'.format(args.fasta))) 157 | sys.exit(1) 158 | if not lib.checkannotations(args.tbl): 159 | print(('TBL annotations file not found: {:}'.format(args.tbl))) 160 | sys.exit(1) 161 | shutil.copyfile(args.fasta, os.path.join(tmp, 'genome.fsa')) 162 | shutil.copyfile(args.tbl, os.path.join(tmp, 'genome.tbl')) 163 | 164 | # now we can run tbl2asn 165 | if args.sbt: 166 | SBT = args.sbt 167 | else: 168 | SBT = os.path.join(parentdir, 'config', 'test.sbt') 169 | discrep = outputname+'.discrepency.txt' 170 | version = 1 171 | runtbl2asn(tmp, SBT, discrep, organism, 172 | args.isolate, args.strain, args.tbl2asn, version) 173 | 174 | # check the output for errors for NCBI 175 | final_fixes = os.path.join(tmp, 'models-need-fixing.txt') 176 | prefix = locustagGB(os.path.join(tmp, 'genome.gbf')) 177 | errors = ncbiCheckErrors(os.path.join(tmp, 'errorsummary.val'), os.path.join( 178 | tmp, 'genome.val'), prefix, final_fixes) 179 | 180 | # get output files 181 | gbkout = outputname+'.gbk' 182 | shutil.copyfile(os.path.join(tmp, 'genome.gbf'), gbkout) 183 | sqnout = outputname + '.sqn' 184 | shutil.copyfile(os.path.join(tmp, 'genome.sqn'), sqnout) 185 | if errors < 1: 186 | lib.SafeRemove(tmp) 187 | 188 | 189 | if __name__ == "__main__": 190 | main(sys.argv[1:]) 191 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Note: To use the 'upload' functionality of this file, you must: 5 | # $ pip install twine 6 | 7 | import io 8 | import os 9 | import sys 10 | from shutil import rmtree 11 | 12 | from setuptools import find_packages, setup, Command 13 | 14 | # Package meta-data. 15 | NAME = "funannotate" 16 | DESCRIPTION = "funannotate: eukaryotic genome annotation pipeline" 17 | URL = "https://github.com/nextgenusfs/funannotate" 18 | EMAIL = "nextgenusfs@gmail.com" 19 | AUTHOR = "Jon Palmer" 20 | REQUIRES_PYTHON = ">=3.6.0, <3.12" 21 | VERSION = None 22 | 23 | # What packages are required for this module to be executed? 
24 | REQUIRED = [ 25 | "biopython<1.80", 26 | "goatools", 27 | "seaborn", 28 | "psutil", 29 | "pandas", 30 | "matplotlib", 31 | "natsort", 32 | "numpy", 33 | "requests", 34 | "scikit-learn", 35 | "scipy", 36 | "distro", 37 | ] 38 | 39 | # What packages are optional? 40 | EXTRAS = { 41 | # 'fancy feature': ['django'], 42 | } 43 | 44 | # The rest you shouldn't have to touch too much :) 45 | # ------------------------------------------------ 46 | # Except, perhaps the License and Trove Classifiers! 47 | # If you do change the License, remember to change the Trove Classifier for that! 48 | 49 | here = os.path.abspath(os.path.dirname(__file__)) 50 | 51 | # Import the README and use it as the long-description. 52 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 53 | try: 54 | with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f: 55 | long_description = "\n" + f.read() 56 | except FileNotFoundError: 57 | long_description = DESCRIPTION 58 | 59 | # Load the package's __version__.py module as a dictionary. 60 | about = {} 61 | if not VERSION: 62 | with open(os.path.join(here, NAME, "__version__.py")) as f: 63 | exec(f.read(), about) 64 | else: 65 | about["__version__"] = VERSION 66 | 67 | 68 | class UploadCommand(Command): 69 | """Support setup.py upload.""" 70 | 71 | description = "Build and publish the package." 72 | user_options = [] 73 | 74 | @staticmethod 75 | def status(s): 76 | """Prints things in bold.""" 77 | print(("\033[1m{0}\033[0m".format(s))) 78 | 79 | def initialize_options(self): 80 | pass 81 | 82 | def finalize_options(self): 83 | pass 84 | 85 | def run(self): 86 | try: 87 | self.status("Removing previous builds…") 88 | rmtree(os.path.join(here, "dist")) 89 | except OSError: 90 | pass 91 | 92 | self.status("Building Source and Wheel (universal) distribution…") 93 | os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable)) 94 | 95 | self.status("Uploading the package to PyPI via Twine…") 96 | os.system("twine upload dist/*") 97 | 98 | self.status("Pushing git tags…") 99 | os.system("git tag v{0}".format(about["__version__"])) 100 | os.system("git push --tags") 101 | 102 | sys.exit() 103 | 104 | 105 | # Where the magic happens: 106 | setup( 107 | name=NAME, 108 | version=about["__version__"], 109 | description=DESCRIPTION, 110 | long_description=long_description, 111 | long_description_content_type="text/markdown", 112 | author=AUTHOR, 113 | author_email=EMAIL, 114 | python_requires=REQUIRES_PYTHON, 115 | url=URL, 116 | packages=find_packages(exclude=("tests",)), 117 | entry_points={ 118 | "console_scripts": ["funannotate=funannotate.funannotate:main"], 119 | }, 120 | install_requires=REQUIRED, 121 | extras_require=EXTRAS, 122 | include_package_data=True, 123 | license="BSD-2", 124 | # scripts=['scripts/funannotate'], 125 | classifiers=[ 126 | # Trove classifiers 127 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 128 | "Development Status :: 4 - Beta", 129 | "License :: OSI Approved :: BSD License", 130 | "Programming Language :: Python", 131 | "Operating System :: Unix", 132 | "Intended Audience :: Science/Research", 133 | "Topic :: Scientific/Engineering :: Bio-Informatics", 134 | ], 135 | cmdclass={ 136 | "upload": UploadCommand, 137 | }, 138 | ) 139 | --------------------------------------------------------------------------------
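Editor's note (not part of the repository dump): setup.py above registers a single console entry point, funannotate=funannotate.funannotate:main, and every subcommand and utility module shown earlier (sort.py, species.py, utilities/stats.py, and so on) exposes a main(args) function that accepts an argv-style list, guarded by the usual main(sys.argv[1:]) idiom. The minimal sketch below illustrates, under that assumption, how one of those utilities could be driven programmatically rather than from the command line; it assumes funannotate is installed, and genome.fa, annotation.gff3, and stats.json are hypothetical file names used only for illustration.

# Minimal sketch: calling a funannotate utility programmatically.
# Assumes funannotate is installed and the input files exist (hypothetical names).
# stats.main() parses an argv-style list and calls lib.annotation_summary()
# to write a JSON summary of the assembly and, optionally, its annotation.
from funannotate.utilities import stats

stats.main([
    "-f", "genome.fa",        # genome assembly in FASTA format (required)
    "-g", "annotation.gff3",  # gene models in GFF3 (optional; a TBL file via -t is also accepted)
    "-o", "stats.json",       # output JSON stats file (required)
])

Passing an explicit argument list this way mirrors the command-line behavior, since each module's main() hands the list straight to argparse; the same pattern applies to the other utilities dumped above.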