├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md └── workflows │ ├── main.yml │ ├── python-publish.yml │ ├── release-triggered.yml │ ├── release.yml │ ├── singularity-deploy.yml │ └── triggered-build.yml ├── .gitignore ├── .readthedocs.yaml ├── .zenodo.json ├── CITATION.cff ├── Dockerfile ├── Dockerfile2 ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── Singularity ├── docs ├── .gitignore ├── Makefile ├── annotate.rst ├── commands.rst ├── compare.rst ├── conda.rst ├── conf.py ├── databases.rst ├── dependencies.rst ├── docker.rst ├── evidence.rst ├── index.rst ├── install.rst ├── make.bat ├── manual.rst ├── predict.rst ├── prepare.rst ├── requirements.txt ├── tutorials.rst ├── update.rst └── utilities.rst ├── funannotate-docker ├── funannotate-logo.png ├── funannotate-podman ├── funannotate ├── __init__.py ├── __version__.py ├── annotate.py ├── aux_scripts │ ├── augustus_parallel.py │ ├── enrichment_parallel.py │ ├── fasta2agp.py │ ├── filterGenemark.pl │ ├── filterIntronsFindStrand.pl │ ├── funannotate-BUSCO2-py2.py │ ├── funannotate-BUSCO2.py │ ├── funannotate-p2g.py │ ├── funannotate-runEVM.py │ ├── genemark_gtf2gff3.pl │ ├── getEggNog.sh │ ├── hmmer_parallel.py │ ├── iprscan-local.py │ ├── iprscan2annotations.py │ ├── pal2nal.pl │ ├── phobius-multiproc.py │ ├── phobius-remote.pl │ ├── runIPRscan.py │ ├── sam2bam.sh │ ├── tbl2asn_parallel.py │ ├── trinity.py │ ├── trnascan2gff3.pl │ └── xmlcombine.py ├── check.py ├── clean.py ├── compare.py ├── config │ ├── EOG092C0B3U.prfl │ ├── TruSeq3-PE.fa │ ├── TruSeq3-SE.fa │ ├── busco_test.fa │ ├── codeml.config │ ├── extrinsic.E.XNT.RM.cfg │ ├── smcogs.txt │ ├── test.sbt │ └── tf_interpro.txt ├── database.py ├── downloads.json ├── fix.py ├── funannotate.py ├── html_template │ ├── css │ │ ├── bootstrap.min.css │ │ └── starter-template.css │ └── js │ │ ├── bootstrap.min.js │ │ ├── ie-emulation-modes-warning.js │ │ ├── ie10-viewport-bug-workaround.js │ │ └── jquery.min.js ├── interlap.py ├── iprscan.py ├── library.py ├── mask.py ├── outgroups.py ├── predict.py ├── remote.py ├── resources.py ├── setupDB.py ├── sort.py ├── species.py ├── stackedBarGraph.py ├── test.py ├── train.py ├── update.py └── utilities │ ├── __init__.py │ ├── bam2gff3.py │ ├── contrast.py │ ├── gbk2parts.py │ ├── get_longest_isoform.py │ ├── gff2prot.py │ ├── gff2tbl.py │ ├── gff_reformat.py │ ├── quarry2gff3.py │ ├── stats.py │ ├── stringtie2gff3.py │ └── tbl2gbk.py └── setup.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | **Are you using the latest release?** 8 | If you are not using the latest release of funannotate, please upgrade, if bug persists then report here. 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **What command did you issue?** 14 | Copy/paste the command used. 15 | 16 | **Logfiles** 17 | Please provide relavent log files of the error. 18 | 19 | **OS/Install Information** 20 | - output of `funannotate check --show-versions` 21 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI to Docker Hub funannotate-slim 4 | 5 | # Controls when the action will run. 
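# NOTE: a push to master builds nextgenusfs/funannotate-slim:latest from ./Dockerfile and, once pushed, fires a
# "docker-hub-complete" repository dispatch so that triggered-build.yml can build the full nextgenusfs/funannotate image.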
6 | on: 7 | # Triggers the workflow on push or pull request events but only for the master branch 8 | push: 9 | branches: [ master ] 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 15 | jobs: 16 | # This workflow contains a single job called "build" 17 | build: 18 | # The type of runner that the job will run on 19 | runs-on: ubuntu-latest 20 | 21 | steps: 22 | 23 | - name: Check Out Repo 24 | uses: actions/checkout@v2 25 | 26 | - name: Login to Docker Hub 27 | uses: docker/login-action@v1 28 | with: 29 | username: ${{ secrets.DOCKER_HUB_USERNAME }} 30 | password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} 31 | 32 | - name: Set up Docker Buildx 33 | id: buildx 34 | uses: docker/setup-buildx-action@v1 35 | 36 | - name: Build and push 37 | id: docker_build 38 | uses: docker/build-push-action@v2 39 | with: 40 | context: ./ 41 | file: ./Dockerfile 42 | push: true 43 | tags: nextgenusfs/funannotate-slim:latest 44 | 45 | - name: Image digest 46 | run: echo ${{ steps.docker_build.outputs.digest }} 47 | 48 | - name: Repository Dispatch 49 | uses: peter-evans/repository-dispatch@v1 50 | with: 51 | token: ${{ secrets.REPO_ACCESS_TOKEN }} 52 | repository: nextgenusfs/funannotate 53 | event-type: docker-hub-complete 54 | client-payload: '{"ref": "${{ github.ref }}", "sha": "${{ github.sha }}"}' 55 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [ published ] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/release-triggered.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: Release funannotate to Docker Hub 4 | 5 | # Controls when the action will run. 
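# NOTE: this workflow is started by the "docker-hub-release-complete" dispatch sent from release.yml after
# nextgenusfs/funannotate-slim has been tagged and pushed; it then builds the full nextgenusfs/funannotate:<tag>
# image from ./Dockerfile2 using the latest release tag.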
6 | on: 7 | repository_dispatch: 8 | types: [docker-hub-release-complete] 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 14 | jobs: 15 | # This workflow contains a single job called "build" 16 | build: 17 | # The type of runner that the job will run on 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - name: Check Out Repo 22 | uses: actions/checkout@v2 23 | 24 | - name: Get release 25 | id: get_release 26 | uses: kaliber5/action-get-release@v1 27 | with: 28 | token: ${{ github.token }} 29 | latest: true 30 | 31 | - name: Login to Docker Hub 32 | uses: docker/login-action@v1 33 | with: 34 | username: ${{ secrets.DOCKER_HUB_USERNAME }} 35 | password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} 36 | 37 | - name: Set up Docker Buildx 38 | id: buildx 39 | uses: docker/setup-buildx-action@v1 40 | 41 | - name: Build and push 42 | id: docker_build 43 | uses: docker/build-push-action@v2 44 | with: 45 | context: ./ 46 | file: ./Dockerfile2 47 | push: true 48 | tags: nextgenusfs/funannotate:${{ steps.get_release.outputs.tag_name }} 49 | 50 | - name: Image digest 51 | run: echo ${{ steps.docker_build.outputs.digest }} 52 | 53 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: Release funannotate-slim to Docker Hub 4 | 5 | # Controls when the action will run. 6 | on: 7 | release: 8 | types: [created] 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 14 | jobs: 15 | # This workflow contains a single job called "build" 16 | build: 17 | # The type of runner that the job will run on 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - name: Check Out Repo 22 | uses: actions/checkout@v2 23 | 24 | - name: Get release 25 | id: get_release 26 | uses: kaliber5/action-get-release@v1 27 | with: 28 | token: ${{ github.token }} 29 | latest: true 30 | 31 | - name: Test git release scrapper 32 | id: test_release 33 | run: echo ${{ steps.get_release.outputs.tag_name }} 34 | 35 | - name: Login to Docker Hub 36 | uses: docker/login-action@v1 37 | with: 38 | username: ${{ secrets.DOCKER_HUB_USERNAME }} 39 | password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} 40 | 41 | - name: Set up Docker Buildx 42 | id: buildx 43 | uses: docker/setup-buildx-action@v1 44 | 45 | - name: Build and push 46 | id: docker_build 47 | uses: docker/build-push-action@v2 48 | with: 49 | context: ./ 50 | file: ./Dockerfile 51 | push: true 52 | tags: nextgenusfs/funannotate-slim:${{ steps.get_release.outputs.tag_name }} 53 | 54 | - name: Image digest 55 | run: echo ${{ steps.docker_build.outputs.digest }} 56 | 57 | - name: Repository Dispatch 58 | uses: peter-evans/repository-dispatch@v1 59 | with: 60 | token: ${{ secrets.REPO_ACCESS_TOKEN }} 61 | repository: nextgenusfs/funannotate 62 | event-type: docker-hub-release-complete 63 | client-payload: '{"ref": "${{ github.ref }}", "sha": "${{ github.sha }}"}' 64 | -------------------------------------------------------------------------------- /.github/workflows/singularity-deploy.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to 
help you get started with Actions 2 | 3 | name: CI Singularity 4 | 5 | # Controls when the action will run. 6 | on: 7 | repository_dispatch: 8 | types: [singularity-ready] 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 14 | jobs: 15 | deploy: 16 | name: Deploy to Singularity 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v1 22 | 23 | - uses: chrnorm/deployment-action@releases/v1 24 | name: Create GitHub deployment 25 | id: deployment 26 | with: 27 | token: ${{ secrets.REPO_ACCESS_TOKEN }} 28 | environment: production 29 | 30 | 31 | - name: Update deployment status (success) 32 | if: success() 33 | uses: chrnorm/deployment-status@releases/v1 34 | with: 35 | token: ${{ secrets.REPO_ACCESS_TOKEN }} 36 | state: "success" 37 | deployment_id: ${{ steps.deployment.outputs.deployment_id }} 38 | 39 | - name: Update deployment status (failure) 40 | if: failure() 41 | uses: chrnorm/deployment-status@releases/v1 42 | with: 43 | token: ${{ secrets.REPO_ACCESS_TOKEN }} 44 | state: "failure" 45 | deployment_id: ${{ steps.deployment.outputs.deployment_id }} -------------------------------------------------------------------------------- /.github/workflows/triggered-build.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI to Docker Hub funannotate 4 | 5 | # Controls when the action will run. 6 | on: 7 | repository_dispatch: 8 | types: [docker-hub-complete] 9 | 10 | # Allows you to run this workflow manually from the Actions tab 11 | workflow_dispatch: 12 | 13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 14 | jobs: 15 | # This workflow contains a single job called "build" 16 | build: 17 | # The type of runner that the job will run on 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | 22 | - name: Check Out Repo 23 | uses: actions/checkout@v2 24 | 25 | - name: Login to Docker Hub 26 | uses: docker/login-action@v1 27 | with: 28 | username: ${{ secrets.DOCKER_HUB_USERNAME }} 29 | password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} 30 | 31 | - name: Set up Docker Buildx 32 | id: buildx 33 | uses: docker/setup-buildx-action@v1 34 | 35 | - name: Build and push 36 | id: docker_build 37 | uses: docker/build-push-action@v2 38 | with: 39 | context: ./ 40 | file: ./Dockerfile2 41 | push: true 42 | tags: nextgenusfs/funannotate:latest 43 | 44 | - name: Image digest 45 | run: echo ${{ steps.docker_build.outputs.digest }} 46 | 47 | - name: Repository Dispatch 48 | uses: peter-evans/repository-dispatch@v1 49 | with: 50 | token: ${{ secrets.REPO_ACCESS_TOKEN }} 51 | repository: nextgenusfs/funannotate 52 | event-type: singularity-ready 53 | client-payload: '{"ref": "${{ github.ref }}", "sha": "${{ github.sha }}"}' 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.pyc 3 | */.DS_Store 4 | */*.pyc 5 | dockerbuild/ 6 | sample_data/ 7 | .DS_Store 8 | funannotate.egg-info 9 | .idea 10 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See 
https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # We recommend specifying your dependencies to enable reproducible builds: 19 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 20 | python: 21 | install: 22 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "name": "Jonathan M. Palmer", 5 | "affiliation": "USDA Forest Service" 6 | }, 7 | { 8 | "name": "Jason E. Stajich", 9 | "affiliation": "UC Riverside" 10 | } 11 | ], 12 | "description": "funannotate is a pipeline for genome annotation (built specifically for fungi, but will also work with higher eukaryotes).", 13 | "keywords": [ 14 | "genome", 15 | "annotation", 16 | "software" 17 | ], 18 | "license": "BSD-2", 19 | "title": "Funannotate: a pipeline for eukaryotic genome annotation" 20 | } 21 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: Funannotate 6 | message: >- 7 | If you use this software, please cite it using the 8 | metadata from this file. 9 | type: software 10 | authors: 11 | - given-names: Jonathan M. 12 | family-names: Palmer 13 | email: nextgenusfs@gmail.com 14 | affiliation: USDA Forest Service 15 | orcid: 'https://orcid.org/0000-0003-0929-3658' 16 | - given-names: Jason E. 17 | family-names: Stajich 18 | email: jason.stajich@ucr.edu 19 | orcid: 'https://orcid.org/0000-0002-7591-0020' 20 | affiliation: University of California-Riverside 21 | identifiers: 22 | - type: url 23 | value: 'https://funannotate.readthedocs.io/' 24 | description: ReadTheDocs documentation 25 | - type: doi 26 | value: 10.5281/zenodo.1134477 27 | description: Zenodo archive of Funannotate software 28 | repository-code: 'https://github.com/nextgenusfs/funannotate' 29 | repository-artifact: 'https://doi.org/10.5281/zenodo.1134477' 30 | abstract: >- 31 | Funannotate is a genome prediction, annotation, and 32 | comparison software package. It was originally written to 33 | annotate fungal genomes (small eukaryotes ~ 30 Mb 34 | genomes), but has evolved over time to accomodate larger 35 | genomes. The impetus for this software package was to be 36 | able to accurately and easily annotate a genome for 37 | submission to NCBI GenBank. 
38 | keywords: 39 | - bioinformatics 40 | - genome annotation 41 | - genomics 42 | - gene prediction 43 | license: BSD-2-Clause 44 | version: 1.8.16 45 | date-released: '2023-08-22' 46 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # start with miniconda3 as build environment 2 | FROM condaforge/mambaforge AS build 3 | 4 | # Update, install mamba and conda-pack: 5 | RUN mamba install -n base --yes conda-pack 6 | 7 | # Install funannotate deps from bioconda 8 | # here specifying specific versions to be able to set ENV below 9 | RUN mamba create -c conda-forge -c bioconda -c defaults \ 10 | -n funannotate --yes "python>=3.6,<3.9" "biopython<1.80" xlrd==1.2.0 \ 11 | "trinity==2.8.5" "evidencemodeler==1.1.1" "pasa==2.4.1" "codingquarry==2.0" \ 12 | "proteinortho==6.0.16" goatools matplotlib-base natsort numpy pigz \ 13 | pandas psutil requests "scikit-learn<1.0.0" scipy seaborn "blast=2.2.31" \ 14 | tantan bedtools hmmer exonerate "diamond>=2.0.5" tbl2asn blat "trnascan-se>=2.0" \ 15 | ucsc-pslcdnafilter trimmomatic raxml iqtree trimal "mafft>=7" hisat2 \ 16 | "kallisto==0.46.1" minimap2 stringtie "salmon>=0.9" "samtools>=1.9" \ 17 | glimmerhmm bamtools perl perl-yaml perl-file-which perl-local-lib perl-dbd-mysql perl-clone perl-hash-merge \ 18 | perl-soap-lite perl-json perl-logger-simple perl-scalar-util-numeric perl-math-utils perl-mce \ 19 | perl-text-soundex perl-parallel-forkmanager perl-db-file perl-perl4-corelibs ete3 distro \ 20 | && conda clean -a -y 21 | 22 | # Since we want the most recent, install from repo, remove snap as broken 23 | SHELL ["conda", "run", "-n", "funannotate", "/bin/bash", "-c"] 24 | RUN python -m pip install git+https://github.com/nextgenusfs/funannotate.git 25 | 26 | # package with conda-pack 27 | RUN conda-pack --ignore-missing-files -n funannotate -o /tmp/env.tar && \ 28 | mkdir /venv && cd /venv && tar xf /tmp/env.tar && \ 29 | rm /tmp/env.tar 30 | 31 | # We've put venv in same path it'll be in final image 32 | RUN /venv/bin/conda-unpack 33 | 34 | # Now build environment 35 | FROM debian:buster AS runtime 36 | 37 | # Copy /venv from the previous stage: 38 | COPY --from=build /venv /venv 39 | 40 | # Install debian snap via apt-get 41 | RUN apt-get update && apt-get install -y snap augustus augustus-data locales locales-all libgl1 procps && \ 42 | rm -rf /var/lib/apt/lists/* && \ 43 | ln -s /usr/bin/snap-hmm /usr/bin/snap && \ 44 | rm "/venv/bin/fasta" && \ 45 | ln -s "/venv/bin/fasta36" "/venv/bin/fasta" 46 | 47 | # add it to the PATH and add env variables 48 | ENV PATH="/venv/bin:$PATH" \ 49 | AUGUSTUS_CONFIG_PATH="/usr/share/augustus/config" \ 50 | EVM_HOME="/venv/opt/evidencemodeler-1.1.1" \ 51 | PASAHOME="/venv/opt/pasa-2.4.1" \ 52 | TRINITYHOME="/venv/opt/trinity-2.8.5" \ 53 | QUARRY_PATH="/venv/opt/codingquarry-2.0/QuarryFiles" \ 54 | ZOE="/usr/share/snap" \ 55 | USER="me" \ 56 | FUNANNOTATE_DB="/opt/databases" 57 | 58 | # When image is run, run the code with the environment 59 | SHELL ["/bin/bash", "-c"] 60 | CMD funannotate 61 | -------------------------------------------------------------------------------- /Dockerfile2: -------------------------------------------------------------------------------- 1 | FROM nextgenusfs/funannotate-slim 2 | 3 | # install databases 4 | RUN funannotate setup -i all --wget -b dikarya microsporidia embryophyta metazoa arthropoda vertebrata protists tetrapoda 5 | 6 | # When image is run, 
run the code with the environment 7 | SHELL ["/bin/bash", "-c"] 8 | CMD funannotate 9 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2016, Jonathan M. Palmer 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.md 3 | include docs/* 4 | include funannotate/aux_scripts/* 5 | include funannotate/config/* 6 | include funannotate/utilities/* 7 | include funannotate/html_template/* 8 | include funannotate/html_template/css/* 9 | include funannotate/html_template/js/* 10 | include scripts/* 11 | include funannotate/downloads.json 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Latest Github release](https://img.shields.io/github/release/nextgenusfs/funannotate.svg)](https://github.com/nextgenusfs/funannotate/releases/latest) 2 | [![DOI](https://zenodo.org/badge/48254740.svg)](https://zenodo.org/badge/latestdoi/48254740) 3 | ![Conda](https://img.shields.io/conda/dn/bioconda/funannotate) 4 | ![Docker Image Size (tag)](https://img.shields.io/docker/image-size/nextgenusfs/funannotate/latest) 5 | ![Docker Pulls](https://img.shields.io/docker/pulls/nextgenusfs/funannotate) 6 | [![https://www.singularity-hub.org/static/img/hosted-singularity--hub-%23e32929.svg](https://www.singularity-hub.org/static/img/hosted-singularity--hub-%23e32929.svg)](https://singularity-hub.org/collections/5068) 7 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/funannotate/README.html) 8 | [![European Galaxy 
server](https://img.shields.io/badge/usegalaxy-.eu-brightgreen?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAASCAYAAABB7B6eAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAACXBIWXMAAAsTAAALEwEAmpwYAAACC2lUWHRYTUw6Y29tLmFkb2JlLnhtcAAAAAAAPHg6eG1wbWV0YSB4bWxuczp4PSJhZG9iZTpuczptZXRhLyIgeDp4bXB0az0iWE1QIENvcmUgNS40LjAiPgogICA8cmRmOlJERiB4bWxuczpyZGY9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyMiPgogICAgICA8cmRmOkRlc2NyaXB0aW9uIHJkZjphYm91dD0iIgogICAgICAgICAgICB4bWxuczp0aWZmPSJodHRwOi8vbnMuYWRvYmUuY29tL3RpZmYvMS4wLyI+CiAgICAgICAgIDx0aWZmOlJlc29sdXRpb25Vbml0PjI8L3RpZmY6UmVzb2x1dGlvblVuaXQ+CiAgICAgICAgIDx0aWZmOkNvbXByZXNzaW9uPjE8L3RpZmY6Q29tcHJlc3Npb24+CiAgICAgICAgIDx0aWZmOk9yaWVudGF0aW9uPjE8L3RpZmY6T3JpZW50YXRpb24+CiAgICAgICAgIDx0aWZmOlBob3RvbWV0cmljSW50ZXJwcmV0YXRpb24+MjwvdGlmZjpQaG90b21ldHJpY0ludGVycHJldGF0aW9uPgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KD0UqkwAAAn9JREFUOBGlVEuLE0EQruqZiftwDz4QYT1IYM8eFkHFw/4HYX+GB3/B4l/YP+CP8OBNTwpCwFMQXAQPKtnsg5nJZpKdni6/6kzHvAYDFtRUT71f3UwAEbkLch9ogQxcBwRKMfAnM1/CBwgrbxkgPAYqlBOy1jfovlaPsEiWPROZmqmZKKzOYCJb/AbdYLso9/9B6GppBRqCrjSYYaquZq20EUKAzVpjo1FzWRDVrNay6C/HDxT92wXrAVCH3ASqq5VqEtv1WZ13Mdwf8LFyyKECNbgHHAObWhScf4Wnj9CbQpPzWYU3UFoX3qkhlG8AY2BTQt5/EA7qaEPQsgGLWied0A8VKrHAsCC1eJ6EFoUd1v6GoPOaRAtDPViUr/wPzkIFV9AaAZGtYB568VyJfijV+ZBzlVZJ3W7XHB2RESGe4opXIGzRTdjcAupOK09RA6kzr1NTrTj7V1ugM4VgPGWEw+e39CxO6JUw5XhhKihmaDacU2GiR0Ohcc4cZ+Kq3AjlEnEeRSazLs6/9b/kh4eTC+hngE3QQD7Yyclxsrf3cpxsPXn+cFdenF9aqlBXMXaDiEyfyfawBz2RqC/O9WF1ysacOpytlUSoqNrtfbS642+4D4CS9V3xb4u8P/ACI4O810efRu6KsC0QnjHJGaq4IOGUjWTo/YDZDB3xSIxcGyNlWcTucb4T3in/3IaueNrZyX0lGOrWndstOr+w21UlVFokILjJLFhPukbVY8OmwNQ3nZgNJNmKDccusSb4UIe+gtkI+9/bSLJDjqn763f5CQ5TLApmICkqwR0QnUPKZFIUnoozWcQuRbC0Km02knj0tPYx63furGs3x/iPnz83zJDVNtdP3QAAAABJRU5ErkJggg==)](https://usegalaxy.eu/root?tool_id=funannotate_annotate) 9 | 10 | 11 | ![Alt text](funannotate-logo.png?raw=true "Funannotate") 12 | 13 | funannotate is a pipeline for genome annotation (built specifically for fungi, but will also work with higher eukaryotes). Installation, usage, and more information can be found at [http://funannotate.readthedocs.io](http://funannotate.readthedocs.io) 14 | 15 | #### Quickest start Docker: 16 | 17 | You can use docker to run `funannotate`. Caveats are that GeneMark is not included in the docker image (see licensing below and you can complain to the developers for making it difficult to distribute/use). I've also written a bash script that can run the docker image and auto-detect/include the proper user/volume bindings. This docker image is built off of the latest code in master, so it will be ahead of the tagged releases. The image includes the required databases as well, if you want just funannotate without the databases then that is located on docker hub as well `nextgenusfs/funannotate-slim`. 
So this route can be achieved with: 18 | 19 | ``` 20 | # download/pull the image from docker hub 21 | $ docker pull nextgenusfs/funannotate 22 | 23 | # download bash wrapper script (optional) 24 | $ wget -O funannotate-docker https://raw.githubusercontent.com/nextgenusfs/funannotate/master/funannotate-docker 25 | 26 | # might need to make this executable on your system 27 | $ chmod +x /path/to/funannotate-docker 28 | 29 | # assuming it is in your PATH, now you can run this script as if it were the funannotate executable script 30 | $ funannotate-docker test -t predict --cpus 12 31 | ``` 32 | 33 | #### Quickstart Bioconda install: 34 | 35 | The pipeline can be installed with conda (via [bioconda](https://bioconda.github.io/)): 36 | ``` 37 | #add appropriate channels 38 | conda config --add channels defaults 39 | conda config --add channels bioconda 40 | conda config --add channels conda-forge 41 | 42 | #then create environment 43 | conda create -n funannotate "python>=3.6,<3.9" funannotate 44 | ``` 45 | If `conda` is taking forever to solve the environment, I would recommend giving [mamba](https://github.com/mamba-org/mamba) a try: 46 | ``` 47 | #install mamba into base environment 48 | conda install -n base mamba 49 | 50 | #then use mamba as drop in replacmeent 51 | mamba create -n funannotate funannotate 52 | ``` 53 | 54 | If you want to use GeneMark-ES/ET you will need to install that manually following developers instructions: 55 | http://topaz.gatech.edu/GeneMark/license_download.cgi 56 | 57 | Note that you will need to change the shebang line for all perl scripts in GeneMark to use `/usr/bin/env perl`. 58 | You will then also need to add `gmes_petap.pl` to the $PATH or set the environmental variable $GENEMARK_PATH to the gmes_petap directory. 59 | 60 | To install just the python funannotate package, you can do this with pip: 61 | ``` 62 | python -m pip install funannotate 63 | ``` 64 | 65 | To install the most updated code in master you can run: 66 | ``` 67 | python -m pip install git+https://github.com/nextgenusfs/funannotate.git 68 | ``` 69 | # Citation 70 | Jonathan M. Palmer, & Jason Stajich. (2020). Funannotate v1.8.1: Eukaryotic genome annotation (v1.8). Zenodo. https://doi.org/10.5281/zenodo.1134477 71 | -------------------------------------------------------------------------------- /Singularity: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: nextgenusfs/funannotate 3 | 4 | %help 5 | Built from Docker Hub nextgenusfs/funannotate -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | _static 3 | _template 4 | old -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = Funannotate 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
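# For example, `make html` falls through to this catch-all target and writes the rendered docs to $(BUILDDIR)/html.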
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/compare.rst:
--------------------------------------------------------------------------------
1 | 
2 | .. _compare:
3 | 
4 | Comparative genomics
5 | ================================
6 | A typical workflow in a genomics project would be to compare your newly sequenced/assembled/annotated genome to other organisms. The impetus behind :code:`funannotate compare` was that there was previously no way to easily compare multiple genomes. Funannotate stores all annotation in GenBank flat file format; while some people don't like this format because it is difficult to parse with standard unix tools, the main advantage is that the annotation can be stored in a standardized format and retrieved in the same way for each genome. GFF3 is the common output of many annotation tools; however, it doesn't work well for functional annotation as all of the "information" is stored in a single column. At any rate, :code:`funannotate compare` can take either folders containing "funannotated" genomes or GBK files --> the output is stats, graphs, CSV files, phylogeny, etc., all summarized in HTML format.
7 | 
8 | .. code-block:: none
9 | 
10 | Usage: funannotate compare
11 | version: 1.8.14
12 | 
13 | Description: Script does light-weight comparative genomics between funannotated genomes. Output
14 | is graphs, phylogeny, CSV files, etc --> visualized in web-browser.
15 | 
16 | Required:
17 | -i, --input List of funannotate genome folders or GBK files
18 | 
19 | Optional:
20 | -o, --out Output folder name. Default: funannotate_compare
21 | -d, --database Path to funannotate database. Default: $FUNANNOTATE_DB
22 | --cpus Number of CPUs to use. Default: 2
23 | --run_dnds Calculate dN/dS ratio on all orthologs. [estimate,full]
24 | --go_fdr P-value for FDR GO-enrichment. Default: 0.05
25 | --heatmap_stdev Cut-off for heatmap. Default: 1.0
26 | --num_orthos Number of Single-copy orthologs to use for ML. Default: 500
27 | --bootstrap Number of bootstrap replicates to run with RAxML. Default: 100
28 | --outgroup Name of species to use for ML outgroup. Default: no outgroup
29 | --proteinortho ProteinOrtho5 POFF results.
30 | --ml_method Maximum Likelihood method: Default: raxml [raxml,iqtree]
31 | --ml_model Substitution model for IQtree. Default: modelfinder
32 | --no-progress Do not print progress to stdout for long sub jobs
33 | 
--------------------------------------------------------------------------------
/docs/conda.rst:
--------------------------------------------------------------------------------
1 | 
2 | .. _conda:
3 | 
4 | Conda-mediated Installation
5 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6 | 
7 | I'd really like to build a bioconda installation package, but would need some help. You can, however, install quite a few of the dependencies with conda.
8 | 
9 | 
10 | **If you are on LINUX -- start here:**
11 | 
12 | .. 
code-block:: none 13 | 14 | #If you do not have conda, install: download miniconda2 or miniconda3, miniconda3 shown 15 | wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh 16 | /bin/bash ~/miniconda.sh -b -p /conda/installation/path 17 | 18 | #setup bioconda repository 19 | conda config --add channels defaults 20 | conda config --add channels etetoolkit 21 | conda config --add channels bioconda 22 | conda config --add channels conda-forge 23 | 24 | #now create a conda environment and install dependencies 25 | conda create -y -n funannotate python=2.7 numpy pandas scipy matplotlib seaborn \ 26 | natsort scikit-learn psutil biopython requests blast rmblast goatools fisher \ 27 | bamtools augustus bedtools hmmer exonerate diamond>=0.9 tbl2asn ucsc-pslcdnafilter \ 28 | samtools raxml trimal mafft>=7 iqtree kallisto>=0.46.0 bowtie2 infernal mummer minimap2 blat \ 29 | trinity>=2.6.6 evidencemodeler pasa>=2.3 codingquarry stringtie gmap=2017.11.15 snap \ 30 | ete3 salmon>=0.9 jellyfish>=2.2 htslib trnascan-se hisat2 glimmerhmm \ 31 | trf perl-threaded perl-db-file perl-bioperl perl-dbd-mysql perl-dbd-sqlite \ 32 | perl-text-soundex perl-scalar-util-numeric perl-data-dumper perl-dbi perl-clone \ 33 | perl-json perl-logger-simple perl-hash-merge perl-yaml perl-pod-usage perl-getopt-long \ 34 | perl-parallel-forkmanager perl-carp perl-soap-lite perl-class-inspector perl-app-cpanminus 35 | 36 | #if you are going to use remote search also need LWP module (not on conda) 37 | cpanm LWP 38 | 39 | **If you are on MacOS X -- start here:** 40 | 41 | .. code-block:: none 42 | 43 | #If you do not have conda, install: download miniconda2 or miniconda3, miniconda3 shown 44 | wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh 45 | /bin/bash ~/miniconda.sh -b -p /conda/installation/path 46 | 47 | #setup bioconda repository 48 | conda config --add channels defaults 49 | conda config --add channels etetoolkit 50 | conda config --add channels bioconda 51 | conda config --add channels conda-forge 52 | 53 | #now create a conda environment and install dependencies 54 | conda create -y -n funannotate python=2.7 numpy pandas scipy matplotlib seaborn \ 55 | natsort scikit-learn psutil biopython requests blast rmblast goatools fisher \ 56 | bedtools hmmer exonerate diamond>=0.9 tbl2asn ucsc-pslcdnafilter \ 57 | samtools raxml trimal mafft>=7 iqtree kallisto>=0.46.0 bowtie2 infernal mummer \ 58 | evidencemodeler gmap=2017.11.15 hisat2 blat minimap2 snap glimmerhmm \ 59 | ete3 salmon>=0.9 jellyfish>=2.2 htslib trnascan-se codingquarry \ 60 | trf perl-threaded perl-db-file perl-bioperl perl-dbd-mysql perl-dbd-sqlite \ 61 | perl-text-soundex perl-scalar-util-numeric perl-data-dumper perl-dbi perl-clone \ 62 | perl-json perl-logger-simple perl-hash-merge perl-yaml perl-pod-usage perl-getopt-long \ 63 | perl-parallel-forkmanager perl-carp perl-soap-lite perl-class-inspector perl-app-cpanminus 64 | 65 | #if you are going to use remote search also need LWP module (not on conda) 66 | cpanm LWP 67 | 68 | 69 | MacOSX: Need to install bamtools/augustus/trinity/pasa manually: 70 | 71 | Install bamtools/Augustus from here: https://github.com/nextgenusfs/augustus 72 | 73 | Trinity: https://github.com/trinityrnaseq/trinityrnaseq 74 | 75 | PASA: https://github.com/PASApipeline/PASApipeline 76 | 77 | 78 | **The above will automatically install most of the dependencies, below there are a few manual steps.** 79 | 80 | 1. 
Download/install GeneMark-ES/ET: (gmes_petap.pl must be in PATH) 81 | http://exon.gatech.edu/GeneMark/license_download.cgi 82 | 83 | * make sure to activate the license and move into proper location. you can test proper installation by running `gmes_petap.pl` in the terminal -- you should see help menu. Be careful of the shebang line, default is `/usr/bin/perl` which most likely is not what you want, more appropriate is `/usr/bin/env perl` 84 | 85 | 2. Install RepeatMasker/RepeatModeler http://www.repeatmasker.org 86 | 87 | 88 | 2b. Download Repbase RepeatMasker Libraries if you have not done so already. 89 | 90 | .. code-block:: none 91 | 92 | wget --user name --password pass http://www.girinst.org/server/RepBase/protected/repeatmaskerlibraries/RepBaseRepeatMaskerEdition-20170127.tar.gz 93 | tar zxvf RepBaseRepeatMaskerEdition-20170127.tar.gz -C /path/to/repeatmasker/location 94 | cd /path/to/repeatmasker/location 95 | ./configure 96 | 97 | #Soft-link a repeatmasker utility script into the PATH (may not need to do this depending on install) 98 | ln -s /path/to/repeatmasker/location/repeatmasker/util/rmOutToGFF3.pl /usr/local/bin/rmOutToGFF3.pl 99 | 100 | 101 | 3. Setup Eggnog-mapper v4.5 or v5.0 [v5.0 is not being parsed properly yet in v1.5.3] 102 | 103 | .. code-block:: none 104 | 105 | #clone the eggnog mapper repo into a location you have read/write access 106 | git clone https://github.com/jhcepas/eggnog-mapper.git 107 | 108 | #move into folder and setup - this will put into eggnog-mapper/data location 109 | cd eggnog-mapper 110 | download_eggnog_data.py 111 | 112 | #finally add to your funannotate conda env so it is in path when env is activated 113 | ln -s /path/to/eggnog-mapper/emapper.py /path/to/conda/envs/funannotate/bin/emapper.py 114 | 115 | 116 | NOTE: MacOSX users -- the diamond version shipped with eggnog-mapper needs to be swapped 117 | out as the binary provided is compiled on linux. Run a small test with emapper.py to check 118 | functionality `emapper.py -m diamond -i test.fa -o test` 119 | 120 | 121 | 4. Clone the funannotate repo and add to PATH 122 | 123 | .. code-block:: none 124 | 125 | git clone https://github.com/nextgenusfs/funannotate.git 126 | 127 | #add to PATH 128 | ln -s /path/to/funannotate/funannotate /path/to/conda/envs/funannotate/bin/funannotate 129 | 130 | 5. Run funannotate check --show-versions, fix any issues. You will need to export some ENV variables. 131 | 132 | .. code-block:: none 133 | 134 | export EVM_HOME=/path/to/conda/envs/funannotate/opt/evidencemodeler-v1.1.1 135 | export TRINITYHOME=/path/to/conda/envs/funannotate/opt/trinity-2.6.6 136 | export PASAHOME=/path/to/conda/envs/funannotate/opt/pasa-2.3.3 137 | export AUGUSTUS_CONFIG_PATH=/path/to/augustus/config 138 | export GENEMARK_PATH=/path/to/gmes_petap_dir 139 | export FUNANNOTATE_DB=/path/to/funannotateDB 140 | 141 | 6. Setup funannotate databases, specify any location you have read/write access to to `-d` -- this is $FUNANNOTATE_DB 142 | 143 | .. code-block:: none 144 | 145 | funannotate setup -d /path/to/DB 146 | 147 | 7. If you want these ENV variables to be activated when you activate the conda environment, you can add them as a shell script to the the activate location of your environment, i.e. `/path/to/conda/envs/funannotate/etc/conda/activate.d/` and then you can put the corresponding `unset` commands in the deactivate directory, i.e. 
`/path/to/conda/envs/funannotate/etc/conda/deactivate.d/` 148 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Funannotate documentation build configuration file, created by 4 | # sphinx-quickstart on Sat Nov 18 22:41:39 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | # import os 20 | # import sys 21 | # sys.path.insert(0, os.path.abspath('.')) 22 | 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # 28 | # needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [] 34 | 35 | # Add any paths that contain templates here, relative to this directory. 36 | templates_path = ['_templates'] 37 | 38 | # The suffix(es) of source filenames. 39 | # You can specify multiple suffix as a list of string: 40 | # 41 | # source_suffix = ['.rst', '.md'] 42 | source_suffix = '.rst' 43 | 44 | # The master toctree document. 45 | master_doc = 'index' 46 | 47 | # General information about the project. 48 | project = u'Funannotate' 49 | copyright = u'2017, Jon Palmer' 50 | author = u'Jon Palmer' 51 | 52 | # The version info for the project you're documenting, acts as replacement for 53 | # |version| and |release|, also used in various other places throughout the 54 | # built documents. 55 | # 56 | # The short X.Y version. 57 | version = u'1.8.16' 58 | # The full version, including alpha/beta/rc tags. 59 | release = u'1.8.16' 60 | 61 | # The language for content autogenerated by Sphinx. Refer to documentation 62 | # for a list of supported languages. 63 | # 64 | # This is also used if you do content translation via gettext catalogs. 65 | # Usually you set "language" from the command line for these cases. 66 | language = None 67 | 68 | # List of patterns, relative to source directory, that match files and 69 | # directories to ignore when looking for source files. 70 | # This patterns also effect to html_static_path and html_extra_path 71 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 72 | 73 | # The name of the Pygments (syntax highlighting) style to use. 74 | pygments_style = 'sphinx' 75 | 76 | # If true, `todo` and `todoList` produce output, else they produce nothing. 77 | todo_include_todos = False 78 | 79 | 80 | # -- Options for HTML output ---------------------------------------------- 81 | 82 | # The theme to use for HTML and HTML Help pages. See the documentation for 83 | # a list of builtin themes. 
84 | # 85 | import sphinx_rtd_theme 86 | html_theme = 'sphinx_rtd_theme' 87 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 88 | 89 | # Theme options are theme-specific and customize the look and feel of a theme 90 | # further. For a list of options available for each theme, see the 91 | # documentation. 92 | # 93 | # html_theme_options = {} 94 | 95 | # Add any paths that contain custom static files (such as style sheets) here, 96 | # relative to this directory. They are copied after the builtin static files, 97 | # so a file named "default.css" will overwrite the builtin "default.css". 98 | html_static_path = ['_static'] 99 | 100 | # Custom sidebar templates, must be a dictionary that maps document names 101 | # to template names. 102 | # 103 | # This is required for the alabaster theme 104 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 105 | html_sidebars = { 106 | '**': [ 107 | 'relations.html', # needs 'show_related': True theme option to display 108 | 'searchbox.html', 109 | ] 110 | } 111 | 112 | 113 | # -- Options for HTMLHelp output ------------------------------------------ 114 | 115 | # Output file base name for HTML help builder. 116 | htmlhelp_basename = 'Funannotatedoc' 117 | 118 | 119 | # -- Options for LaTeX output --------------------------------------------- 120 | 121 | latex_elements = { 122 | # The paper size ('letterpaper' or 'a4paper'). 123 | # 124 | # 'papersize': 'letterpaper', 125 | 126 | # The font size ('10pt', '11pt' or '12pt'). 127 | # 128 | # 'pointsize': '10pt', 129 | 130 | # Additional stuff for the LaTeX preamble. 131 | # 132 | # 'preamble': '', 133 | 134 | # Latex figure (float) alignment 135 | # 136 | # 'figure_align': 'htbp', 137 | } 138 | 139 | # Grouping the document tree into LaTeX files. List of tuples 140 | # (source start file, target name, title, 141 | # author, documentclass [howto, manual, or own class]). 142 | latex_documents = [ 143 | (master_doc, 'Funannotate.tex', u'Funannotate Documentation', 144 | u'Jon Palmer', 'manual'), 145 | ] 146 | 147 | 148 | # -- Options for manual page output --------------------------------------- 149 | 150 | # One entry per manual page. List of tuples 151 | # (source start file, name, description, authors, manual section). 152 | man_pages = [ 153 | (master_doc, 'funannotate', u'Funannotate Documentation', 154 | [author], 1) 155 | ] 156 | 157 | 158 | # -- Options for Texinfo output ------------------------------------------- 159 | 160 | # Grouping the document tree into Texinfo files. List of tuples 161 | # (source start file, target name, title, author, 162 | # dir menu entry, description, category) 163 | texinfo_documents = [ 164 | (master_doc, 'Funannotate', u'Funannotate Documentation', 165 | author, 'Funannotate', 'One line description of project.', 166 | 'Miscellaneous'), 167 | ] 168 | -------------------------------------------------------------------------------- /docs/databases.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _databases: 3 | 4 | Annotation Databases 5 | ================================ 6 | 7 | Funannotate uses several publicly available databases, they can be installed with the :code:`funannotate setup` command. The currently installed databases and version numbers can be displayed with the :code:`funannotate database` command. 8 | 9 | Initial setup is simple and requires only a path to a database location, this can (should) be set using the $FUNANNOTATE_DB environmental variable. 
If $FUNANNOTATE_DB is set, then the script will use that location by default, otherwise you will need to specify a location to the script i.e.: 10 | 11 | .. code-block:: none 12 | 13 | funannotate setup -d $HOME/funannotate_db 14 | 15 | 16 | You could then update the databases if $FUNANNOTATE_DB is set like this: 17 | 18 | .. code-block:: none 19 | 20 | funannotate setup -i all --update 21 | 22 | #or force update of just one database 23 | funannotate setup -i uniprot --force 24 | 25 | 26 | This will download and format the databases, they can be displayed like so: 27 | 28 | .. code-block:: none 29 | 30 | $ funannotate database 31 | 32 | Funannotate Databases currently installed: 33 | 34 | Database Type Version Date Num_Records Md5checksum 35 | merops diamond 12.5 2023-01-19 5098 6cd3c3dd85650394ce4e3dacb591f2a5 36 | uniprot diamond 2024_01 2024-01-24 570830 c7507ea16b3c4807971c663994cad329 37 | dbCAN hmmer3 11.0 2022-08-09 699 fb112af319a5001fbf547eac29e7c3b5 38 | pfam hmmer3 36.0 2023-07 20795 0725495ccf049a4f198fcc0a92f7f38c 39 | repeats diamond 1.0 2022-03-13 11950 4e8cafc3eea47ec7ba505bb1e3465d21 40 | go text 2024-01-17 2024-01-17 47729 7e6b9974184dda306e6e07631f1783af 41 | mibig diamond 1.4 2022-03-13 31023 118f2c11edde36c81bdea030a0228492 42 | interpro xml 98.0 2024-01-25 40768 502ea05009761b893dedb56d5ea89c48 43 | busco_outgroups outgroups 1.0 2024-03-04 8 6795b1d4545850a4226829c7ae8ef058 44 | gene2product text 1.92 2023-10-02 34459 32a4a80987720e0872377de3207dc0f5 45 | 46 | To update a database type: 47 | funannotate setup -i DBNAME -d $HOME/funannotate_db --force 48 | 49 | To see install BUSCO outgroups type: 50 | funannotate database --show-outgroups 51 | 52 | To see BUSCO tree type: 53 | funannotate database --show-buscos 54 | 55 | 56 | 57 | Similarly, database sources can be updated with the :code:`funannotate setup` command, for example to update the gene2product database to its most recent version you would run: 58 | 59 | .. code-block:: none 60 | 61 | $ funannotate setup -d $HOME/funannotate_db -i gene2product --update 62 | 63 | 64 | -------------------------------------------------------------------------------- /docs/dependencies.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _dependencies: 3 | 4 | Dependencies 5 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 6 | Funannotate has a lot of dependencies. However, it also comes with a few tools to help you get everything installed. The first is that of :code:`funannotate check`. You'll see in the output below that the :code:`fasta` tool is missing, which is Bill Pearsons :code:`fasta36` a dependency of the PASA pipeline. Also the :code:`$PASAHOME`` and :code:`$TRINITYHOME`` variables are not set, that is because on this particular machine they are not installed, i.e. funannotate will alert you at runtime if it is missing a dependency. 7 | 8 | .. code-block:: none 9 | 10 | $ funannotate check --show-versions 11 | ------------------------------------------------------- 12 | Checking dependencies for funannotate v1.4.0 13 | ------------------------------------------------------- 14 | You are running Python v 2.7.11. Now checking python packages... 15 | biopython: 1.70 16 | goatools: 0.7.11 17 | matplotlib: 2.1.1 18 | natsort: 5.2.0 19 | numpy: 1.12.1 20 | pandas: 0.22.0 21 | psutil: 5.4.3 22 | requests: 2.18.4 23 | scikit-learn: 0.19.0 24 | scipy: 0.19.1 25 | seaborn: 0.8.1 26 | All 11 python packages installed 27 | 28 | 29 | You are running Perl v 5.026001. 
Now checking perl modules... 30 | Bio::Perl: 1.007002 31 | Carp: 1.42 32 | Clone: 0.39 33 | DBD::SQLite: 1.56 34 | DBD::mysql: 4.046 35 | DBI: 1.641 36 | DB_File: 1.84 37 | Data::Dumper: 2.167 38 | File::Basename: 2.85 39 | File::Which: 1.22 40 | Getopt::Long: 2.5 41 | Hash::Merge: 0.300 42 | JSON: 2.97001 43 | LWP::UserAgent: 6.33 44 | Logger::Simple: 2.0 45 | POSIX: 1.76 46 | Parallel::ForkManager: 1.19 47 | Pod::Usage: 1.69 48 | Scalar::Util::Numeric: 0.40 49 | Storable: 2.62 50 | Text::Soundex: 3.05 51 | Thread::Queue: 3.12 52 | Tie::File: 1.02 53 | URI::Escape: 3.31 54 | YAML: 1.24 55 | threads: 2.21 56 | threads::shared: 1.58 57 | All 27 Perl modules installed 58 | 59 | 60 | Checking external dependencies... 61 | RepeatMasker: RepeatMasker 4.0.7 62 | RepeatModeler: RepeatModeler 1.0.11 63 | Trinity: 2.5.1 64 | augustus: 3.2.1 65 | bamtools: bamtools 2.4.0 66 | bedtools: bedtools v2.27.1 67 | blat: BLAT v35 68 | diamond: diamond 0.9.19 69 | emapper.py: emapper-1.0.3 70 | ete3: 3.1.1 71 | exonerate: exonerate 2.4.0 72 | fasta: no way to determine 73 | gmap: 2017-06-20 74 | gmes_petap.pl: 4.30 75 | hisat2: 2.1.0 76 | hmmscan: HMMER 3.1b2 (February 2015) 77 | hmmsearch: HMMER 3.1b2 (February 2015) 78 | java: 1.8.0_92 79 | kallisto: 0.43.1 80 | mafft: v7.313 (2017/Nov/15) 81 | makeblastdb: makeblastdb 2.7.1+ 82 | minimap2: 2.10-r761 83 | nucmer: 3.1 84 | pslCDnaFilter: no way to determine 85 | rmblastn: rmblastn 2.2.27+ 86 | samtools: samtools 1.8 87 | tRNAscan-SE: 1.23 (April 2002) 88 | tbl2asn: unknown, likely 25.3 89 | tblastn: tblastn 2.7.1+ 90 | trimal: trimAl v1.4.rev15 build[2013-12-17] 91 | All 30 external dependencies are installed 92 | 93 | Checking Environmental Variables... 94 | $FUNANNOTATE_DB=/usr/local/share/funannotate 95 | $PASAHOME=/Users/jon/software/PASApipeline 96 | $TRINITYHOME=/usr/local/opt/trinity 97 | $EVM_HOME=/Users/jon/software/evidencemodeler 98 | $AUGUSTUS_CONFIG_PATH=/Users/jon/software/augustus/config 99 | $GENEMARK_PATH=/Users/jon/software/gmes_petap 100 | $BAMTOOLS_PATH=/Users/jon/software/bamtools-2.4.0/bin 101 | All 7 environmental variables are set 102 | ------------------------------------------------------- 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /docs/docker.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _docker: 3 | 4 | Docker Installation 5 | ================================ 6 | Docker is a solution where most of the dependencies are installed and you can start annotating 7 | right away. Because some software and data require individual licensing, the core components 8 | of funannotate are packaged into a docker container, but you must download a few things and 9 | run a docker build locally to get a working container. Note that Eggnog-mapper is not installed 10 | in Docker container as the databases were too large. 11 | 12 | 1) Download 64 bit Linux GeneMark-ET/ES Key (gm_key_64.gz) from http://exon.gatech.edu/Genemark/license_download.cgi 13 | 14 | 15 | 2) Download RepeatMasker libraries. Register for username at RepBase http://www.girinst.org/repbase/. You can then download the RepeatMasker Libraries most recent version, alternatively can download from command line like so: 16 | 17 | .. 
code-block:: none 18 | 19 | wget --user name --password pass \ 20 | https://www.girinst.org/server/archive/RepBase23.09/protected/repeatmaskerlibraries/RepBaseRepeatMaskerEdition-20170127.tar.gz 21 | 22 | 3) Get SignalP4.1 for linux 64 from CBS http://www.cbs.dtu.dk/cgi-bin/sw_request?signalp 23 | 24 | 25 | 4) Download Dockerfile: 26 | 27 | .. code-block:: none 28 | 29 | wget https://raw.githubusercontent.com/nextgenusfs/funannotate/1.5.1/dockerbuild/Dockerfile 30 | 31 | 5) You should now have the following files in the same directory: 32 | 33 | .. code-block:: none 34 | 35 | Dockerfile 36 | gm_key_64.gz 37 | RepBaseRepeatMaskerEdition-20170127.tar.gz 38 | signalp-4.1f.Linux.tar.gz 39 | 40 | Now you can Build the docker container, which will setup the remaining tools and then download and format funannotate databases.: 41 | 42 | .. code-block:: none 43 | 44 | docker build -t funannotate -f Dockerfile . 45 | 46 | 47 | **Running the Docker container with your data:** 48 | 49 | In order to run the docker container, you need to put all the files you will use for input to funannotate into the same folder, you can then launch the Docker container and mount your current folder with the following command: 50 | 51 | .. code-block:: none 52 | 53 | #container is deleted after you exit 54 | docker run -it --rm -v $PWD:/home/linuxbrew/data funannotate 55 | 56 | #keep container, i.e. mysql databases generated, however will take a lot of HD space 57 | docker run -it -v $PWD:/home/linuxbrew/data funannotate 58 | 59 | This will bring you to a bash prompt within the docker container where all dependencies are installed, so you can now issue the funannotate commands on your data. 60 | 61 | **Limitations with Docker:** 62 | 63 | The funannotate docker image does not contain Eggnog-mapper because the databases sizes are too large (> 20 GB). Eggnog-mapper is an important component of functional annotation, you can run this on the eggnog-mapper webserver and pass results to funannotate or perhaps set up an additional docker image running the eggnog-mapper software. 64 | 65 | **Mac OSX users:** 66 | 67 | The default storage-driver on docker for Mac is the overlay2 driver. This driver seems to be incompatible with running/launching MySQL, thus if you are getting errors running funannotate you will need to change your storage-driver to "aufs". This can be done in Docker preferences, Daemon tab, Advanced tab, and then change the storage-driver. **Note this will delete all Docker images/containers on your virtual disk.** 68 | 69 | .. code-block:: none 70 | 71 | { 72 | "storage-driver" : "aufs", 73 | "debug" : true, 74 | "experimental" : true 75 | } 76 | -------------------------------------------------------------------------------- /docs/evidence.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _evidence: 3 | 4 | Providing evidence to funannotate 5 | ================================== 6 | 7 | Funannotate uses Evidence Modeler to combine *ab initio* gene model predictions with evidence (transcripts or proteins) aligned to the genome. Therefore, the evidence that you supply at runtime for :code:`--transcript_evidence` and :code:`--protein_evidence` are important. By default, funannotate will use the UniProtKb/SwissProt curated protein database for protein evidence. However, you can specify other forms of protein evidence, perhaps from a well-annotated closely related species, using the :code:`--protein_evidence` option. 
Multiple files can be passed to both :code:`--transcript_evidence` or :code:`--protein_evidence` by separating the files by spaces, for example: 8 | 9 | .. code-block:: none 10 | 11 | funannotate predict -i genome.fa -s "Awesome species" --transcript_evidence trinity.fasta myESTs.fa \ 12 | -o output --protein_evidence closely_related.fasta $FUNANNOTATE_DB/uniprot_sprot.fasta 13 | 14 | You'll notice in this example, I also added the UniProt/SwissProt protein models located in the funannotate database. I should also note that adding protein evidence from ab initio predictors of closely related species should be avoided, this is because those models have not been validated. What you are trying to do here is to provide the software with high-quality protein models so that information can be used to direct the *ab initio* gene prediction algorithms, so providing them with incorrect/truncated proteins isn't going to help your accuracy and in many cases it may hurt. It is often okay to just stick with the default UniProtKb/SwissProt protein evidence. 15 | 16 | **Sources of Evidence that work well:** 17 | 18 | 1. De-novo RNA-seq assemblies (i.e. output of Trinity) 19 | 2. ESTs (for fungal genomes ESTs from related species can be downloaded from JGI Mycocosm) 20 | 3. Curated Protein models from closely related species 21 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Funannotate documentation master file, created by 2 | sphinx-quickstart on Sat Nov 18 22:41:39 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Funannotate documentation 7 | ======================================= 8 | 9 | .. toctree:: 10 | :hidden: 11 | 12 | install 13 | prepare 14 | predict 15 | evidence 16 | update 17 | annotate 18 | compare 19 | databases 20 | tutorials 21 | commands 22 | utilities 23 | 24 | 25 | Funannotate is a genome prediction, annotation, and comparison software package. It was originally written to annotate fungal genomes (small eukaryotes ~ 30 Mb genomes), but has evolved over time to accomodate larger genomes. The impetus for this software package was to be able to accurately and easily annotate a genome for submission to NCBI GenBank. Existing tools (such as Maker) require significant manually editing to comply with GenBank submission rules, thus funannotate is aimed at simplifying the genome submission process. 26 | 27 | Funannotate is also a lightweight comparative genomics platform. Genomes that have had functional annotation added via the :code:`funannotate annotate` command can be run through the :code:`funannotate compare` script that outputs html based whole genome comparisons. The software can run orthologous clustering, construct whole-genome phylogenies, run Gene Ontology enrichment analysis, as well as calculate dN/dS ratios for orthologous clusters under positive selection. 28 | 29 | 30 | 31 | * :ref:`install` 32 | * :ref:`prepare` 33 | * :ref:`predict` 34 | * :ref:`update` 35 | * :ref:`annotate` 36 | * :ref:`compare` 37 | * :ref:`tutorials` 38 | * :ref:`utilities` 39 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _install: 3 | 4 | Installation 5 | ================================ 6 | 7 | .. 
toctree:: 8 | :hidden: 9 | 10 | dependencies 11 | 12 | Funannotate has a lot of dependencies and therefore installation is the most difficult part 13 | of executing the pipeline. The funannotate pipeline is written in python and can be installed 14 | with pip, i.e. `pip install funannotate`. You can see a list of :ref:`dependencies`, 15 | 16 | ### Quickest start Docker: 17 | 18 | You can use docker to run `funannotate`. Caveats are that GeneMark is not included in the docker image (see licensing below and you can complain to the developers for making it difficult to distribute/use). I've also written a bash script that can run the docker image and auto-detect/include the proper user/volume bindings. This docker image is built off of the latest code in master, so it will be ahead of the tagged releases. The image includes the required databases as well, if you want just funannotate without the databases then that is located on docker hub as well `nextgenusfs/funannotate-slim`. So this route can be achieved with: 19 | 20 | .. code-block:: none 21 | 22 | # download/pull the image from docker hub 23 | $ docker pull nextgenusfs/funannotate 24 | 25 | # download bash wrapper script (optional) 26 | $ wget -O funannotate-docker https://raw.githubusercontent.com/nextgenusfs/funannotate/master/funannotate-docker 27 | 28 | # might need to make this executable on your system 29 | $ chmod +x /path/to/funannotate-docker 30 | 31 | # assuming it is in your PATH, now you can run this script as if it were the funannotate executable script 32 | $ funannotate-docker test -t predict --cpus 12 33 | 34 | 35 | #### Quickstart Bioconda install: 36 | 37 | The pipeline can be installed with conda (via [bioconda](https://bioconda.github.io/)): 38 | 39 | .. code-block:: none 40 | 41 | #add appropriate channels 42 | conda config --add channels defaults 43 | conda config --add channels bioconda 44 | conda config --add channels conda-forge 45 | 46 | #then create environment 47 | conda create -n funannotate "python>=3.6,<3.9" funannotate 48 | 49 | If `conda` is taking forever to solve the environment, I would recommend giving [mamba](https://github.com/mamba-org/mamba) a try: 50 | 51 | .. code-block:: none 52 | 53 | #install mamba into base environment 54 | conda install -n base mamba 55 | 56 | #then use mamba as drop in replacmeent 57 | mamba create -n funannotate funannotate 58 | 59 | 60 | If you want to use GeneMark-ES/ET you will need to install that manually following developers instructions: 61 | http://topaz.gatech.edu/GeneMark/license_download.cgi 62 | 63 | Note that you will need to change the shebang line for all perl scripts in GeneMark to use `/usr/bin/env perl`. 64 | You will then also need to add `gmes_petap.pl` to the $PATH or set the environmental variable $GENEMARK_PATH to the gmes_petap directory. 65 | 66 | To install just the python funannotate package, you can do this with pip: 67 | 68 | .. code-block:: none 69 | 70 | python -m pip install funannotate 71 | 72 | To install the most updated code in master you can run: 73 | 74 | .. code-block:: none 75 | 76 | python -m pip install git+https://github.com/nextgenusfs/funannotate.git 77 | 78 | 79 | 80 | Please setup database and test your installation locally using the following: 81 | 82 | .. 
code-block:: none 83 | 84 | #start up conda ENV 85 | conda activate funannotate 86 | 87 | #check that all modules are installed 88 | funannotate check --show-versions 89 | 90 | #download/setup databases to a writable/readable location 91 | funannotate setup -d $HOME/funannotate_db 92 | 93 | #set ENV variable for $FUNANNOTATE_DB 94 | echo "export FUNANNOTATE_DB=$HOME/funannotate_db" > /conda/installation/path/envs/funannotate/etc/conda/activate.d/funannotate.sh 95 | echo "unset FUNANNOTATE_DB" > /conda/installation/path/envs/funannotate/etc/conda/deactivate.d/funannotate.sh 96 | 97 | #run tests -- requires internet connection to download data 98 | funannotate test -t all --cpus X 99 | 100 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=Funannotate 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/manual.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _manual: 3 | 4 | Manual Installation: "The Professional" 5 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 6 | You can simply download a release and get going, dependency hell awaits you, but I'm not worried because you know what you are doing. See :ref:`dependencies` that are needed to run funannotate. 7 | 8 | .. code-block:: none 9 | 10 | wget https://github.com/nextgenusfs/funannotate/archive/1.0.0.tar.gz 11 | tar -zxvf 1.0.0.tar.gz 12 | export PATH=/path/to/funannotate:$PATH 13 | 14 | -------------------------------------------------------------------------------- /docs/prepare.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _prepare: 3 | 4 | Preparing your Assembly 5 | -------------------------------- 6 | There are a few things that you can do to your multi-FASTA assembly to get it "ready" to be annotated. These steps include methods for removing small repetitive contigs from an assembly, sorting/renaming contig headers so they do not cause problems during prediction step, and repeatmasking your assembely (required). 7 | 8 | 9 | Cleaning your Assembly 10 | ================================ 11 | When working with haploid assemblies, sometimes you want to remove some repetitive contigs that are contained in other scaffolds of the assembly. If the repeats are indeed unique, then we want to keep them in the assembly. Funannotate can help "clean" up repetitive contigs in your assembly. 
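As a quick illustration (file names here are placeholders, and the full usage for each command, including the sort and mask steps, is shown below), a typical preparation run might look like:

.. code-block:: none

    # remove duplicated short contigs, sort/rename the headers, then softmask repeats
    funannotate clean -i genome.fa -o genome.cleaned.fa
    funannotate sort -i genome.cleaned.fa -o genome.sorted.fa -b scaffold
    funannotate mask -i genome.sorted.fa -o genome.masked.fa --cpus 8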
Cleaning is done using a "leave one out" methodology with minimap2 or mummer (nucmer), where the shortest contigs/scaffolds are aligned to the rest of the assembly to determine whether they are repetitive. The script loops through the contigs, starting with the shortest and working its way up to the N50 of the assembly, and drops contigs/scaffolds whose alignments exceed both the percent coverage of overlap (:code:`--cov`) and the percent identity of overlap (:code:`--pident`) cutoffs. 12 | 13 | .. code-block:: none 14 | 15 | $ funannotate clean 16 | 17 | Usage: funannotate clean 18 | version: 1.8.16 19 | 20 | Description: The script sorts contigs by size, starting with shortest contigs it uses minimap2 21 | to find contigs duplicated elsewhere, and then removes duplicated contigs. 22 | 23 | Arguments: 24 | -i, --input Multi-fasta genome file (Required) 25 | -o, --out Cleaned multi-fasta output file (Required) 26 | -p, --pident Percent identity of overlap. Default = 95 27 | -c, --cov Percent coverage of overlap. Default = 95 28 | -m, --minlen Minimum length of contig to keep. Default = 500 29 | --exhaustive Test every contig. Default is to stop at N50 value. 30 | 31 | 32 | Sorting/Renaming FASTA Headers 33 | ================================ 34 | NCBI limits FASTA headers to 16 characters for submission, and Augustus also has problems with longer contig/scaffold names. You can use this simple script to sort your assembly by length and then rename the FASTA headers. 35 | 36 | .. code-block:: none 37 | 38 | $ funannotate sort 39 | 40 | Usage: funannotate sort 41 | version: 1.8.16 42 | 43 | Description: This script sorts the input contigs by size (longest->shortest) and then relabels 44 | the contigs with a simple name (e.g. scaffold_1). Augustus can have problems with 45 | some complicated contig names. 46 | 47 | Arguments: 48 | -i, --input Multi-fasta genome file. (Required) 49 | -o, --out Sorted by size and relabeled output file. (Required) 50 | -b, --base Base name to relabel contigs. Default: scaffold 51 | --minlen Shorter contigs are discarded. Default: 0 52 | 53 | 54 | .. _repeatmasking: 55 | 56 | RepeatMasking your Assembly 57 | ================================ 58 | This is an essential step in the annotation process. As of v1.4.0, repeatmasking has been decoupled from :code:`funannotate predict` in order to make it more flexible and accommodate those users who don't have access to the RepBase library (a requirement of RepeatMasker). By default, the :code:`funannotate mask` command runs simple masking using tantan. The script is a wrapper for RepeatModeler and RepeatMasker; however, you can use any external program to softmask your assembly. In a softmasked assembly, repeats are represented by lowercase letters and non-repetitive regions by uppercase letters. One alternative to RepeatMasker is RED (REpeat Detector); a wrapper for this program is available as `Redmask `_. 59 | 60 | .. code-block:: none 61 | 62 | $ funannotate mask 63 | 64 | Usage: funannotate mask 65 | version: 1.8.16 66 | 67 | Description: This script is a wrapper for repeat masking. Default is to run very simple 68 | repeat masking with tantan. The script can also run RepeatMasker and/or 69 | RepeatModeler. It will generate a softmasked genome. Tantan is probably not 70 | sufficient for soft-masking an assembly, but with RepBase no longer being 71 | available RepeatMasker/Modeler may not be functional for many users. 72 | 73 | Arguments: 74 | -i, --input Multi-FASTA genome file. 
(Required) 75 | -o, --out Output softmasked FASTA file. (Required) 76 | 77 | Optional: 78 | -m, --method Method to use. Default: tantan [repeatmasker, repeatmodeler] 79 | -s, --repeatmasker_species Species to use for RepeatMasker 80 | -l, --repeatmodeler_lib Custom repeat database (FASTA format) 81 | --cpus Number of cpus to use. Default: 2 82 | --debug Keep intermediate files 83 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx_rtd_theme -------------------------------------------------------------------------------- /docs/update.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _update: 3 | 4 | Adding UTRs and refining predictions 5 | ================================ 6 | If you have RNA-seq data and would like to use the PASA-mediated "annotation comparison" to add UTRs and refine gene model predictions, this can be accomplished using the :code:`funannotate update` command. This script can also be run as a stand-alone to re-align RNA-seq data and/or update an existing GenBank genome. 7 | 8 | If you have run :code:`funannotate train` and then :code:`funannotate predict`, this script will re-use those data and you can simply pass :code:`funannotate update -i folder --cpus 12`. This will add the gene predictions to the SQL database and then walk through each gene comparing to existing PASA alignments, PASA will make some adjustments to the gene models. As recommended by PASA developers, this is run twice in :code:`funannotate update`. 9 | 10 | 11 | Why is :code:`funannotate update` so slow?? 12 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 13 | 14 | The default SQL database for PASA is set to use SQLite -- this is for compatibility. However, the limitation is that SQLite database in PASA is single threaded due to SQLite database lock issue. Thus even if you pass multiple cpus to the script, it will run all of the PASA steps single threaded, which can take a long time depending on PASA alignments and genome size. If you `setup PASA to use MySQL `_, then the scripts can run PASA multi-threaded and :code:`funannotate update` will run much faster. 15 | 16 | 17 | .. code-block:: none 18 | 19 | Usage: funannotate update 20 | version: 1.8.14 21 | 22 | Description: Script will run PASA mediated update of gene models. It can directly update 23 | the annotation from an NCBI downloaded GenBank file using RNA-seq data or can be 24 | used after funannotate predict to refine UTRs and gene model predictions. Kallisto 25 | is used to evidence filter most likely PASA gene models. Dependencies are 26 | hisat2, Trinity, samtools, fasta, minimap2, PASA, kallisto, bedtools. 27 | 28 | Required: 29 | -i, --input Funannotate folder or Genome in GenBank format (.gbk,.gbff). 30 | or 31 | -f, --fasta Genome in FASTA format 32 | -g, --gff Annotation in GFF3 format 33 | --species Species name, use quotes for binomial, e.g. "Aspergillus fumigatus" 34 | 35 | Optional: 36 | -o, --out Output folder name 37 | -l, --left Left/Forward FASTQ Illumina reads (R1) 38 | -r, --right Right/Reverse FASTQ Illumina reads (R2) 39 | -s, --single Single ended FASTQ reads 40 | --stranded If RNA-seq library stranded. 
[RF,FR,F,R,no] 41 | --left_norm Normalized left FASTQ reads (R1) 42 | --right_norm Normalized right FASTQ reads (R2) 43 | --single_norm Normalized single-ended FASTQ reads 44 | --pacbio_isoseq PacBio long-reads 45 | --nanopore_cdna Nanopore cDNA long-reads 46 | --nanopore_mrna Nanopore mRNA direct long-reads 47 | --trinity Pre-computed Trinity transcripts (FASTA) 48 | --jaccard_clip Turn on jaccard clip for dense genomes [Recommended for fungi] 49 | --no_normalize_reads Skip read Normalization 50 | --no_trimmomatic Skip Quality Trimming of reads 51 | --memory RAM to use for Jellyfish. Default: 50G 52 | -c, --coverage Depth to normalize reads. Default: 50 53 | -m, --min_coverage Min depth for normalizing reads. Default: 5 54 | --pasa_config PASA assembly config file, i.e. from previous PASA run 55 | --pasa_db Database to use. Default: sqlite [mysql,sqlite] 56 | --pasa_alignment_overlap PASA --stringent_alignment_overlap. Default: 30.0 57 | --aligners Aligners to use with PASA: Default: minimap2 blat [gmap] 58 | --pasa_min_avg_per_id PASA --MIN_AVG_PER_ID. Default: 95 59 | --pasa_num_bp_splice PASA --NUM_BP_PERFECT_SPLICE_BOUNDARY. Default: 3 60 | --max_intronlen Maximum intron length. Default: 3000 61 | --min_protlen Minimum protein length. Default: 50 62 | --alt_transcripts Expression threshold (percent) to keep alt transcripts. Default: 0.1 [0-1] 63 | --p2g NCBI p2g file (if updating NCBI annotation) 64 | -t, --tbl2asn Assembly parameters for tbl2asn. Example: "-l paired-ends" 65 | --name Locus tag name (assigned by NCBI?). Default: use existing 66 | --sbt NCBI Submission file 67 | --species Species name, use quotes for binomial, e.g. "Aspergillus fumigatus" 68 | --strain Strain name 69 | --isolate Isolate name 70 | --SeqCenter Sequencing facilty for NCBI tbl file. Default: CFMR 71 | --SeqAccession Sequence accession number for NCBI tbl file. Default: 12345 72 | --cpus Number of CPUs to use. Default: 2 73 | 74 | ENV Vars: If not passed, will try to load from your $PATH. 75 | --PASAHOME 76 | --TRINITYHOME 77 | -------------------------------------------------------------------------------- /docs/utilities.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _utilities: 3 | 4 | Utilities 5 | ================================ 6 | There are several scripts that maybe useful to users to convert between different formats, these scripts are housed in the :code:`funannotate util` submenu. 7 | 8 | 9 | .. code-block:: none 10 | 11 | $ funannotate util 12 | 13 | Usage: funannotate util 14 | version: 1.8.16 15 | 16 | Commands: 17 | stats Generate assembly and annotation stats 18 | contrast Compare annotations to reference (GFF3 or GBK annotations) 19 | tbl2gbk Convert TBL format to GenBank format 20 | gbk2parts Convert GBK file to individual components 21 | gff2prot Convert GFF3 + FASTA files to protein FASTA 22 | gff2tbl Convert GFF3 format to NCBI annotation table (tbl) 23 | bam2gff3 Convert BAM coord-sorted transcript alignments to GFF3 24 | prot2genome Map proteins to genome generating GFF3 protein alignments 25 | stringtie2gff3 Convert GTF (stringTIE) to GFF3 format 26 | quarry2gff3 Convert CodingQuarry output to proper GFF3 format 27 | gff-rename Sort GFF3 file and rename gene models 28 | 29 | Generate genome assembly stats 30 | ------------------------------ 31 | To generate genome assembly stats in a JSON file. 32 | 33 | .. 
code-block:: none 34 | 35 | $ funannotate util stats 36 | 37 | Usage: funannotate util stats 38 | version: 1.8.16 39 | 40 | Description: Generate JSON file with genome assembly and annotation stats. 41 | 42 | Arguments: 43 | -f, --fasta Genome FASTA file (Required) 44 | -o, --out Output file (JSON format) 45 | -g, --gff3 Genome Annotation (GFF3 format) 46 | -t, --tbl Genome Annotation (NCBI TBL format) 47 | --transcript_alignments Transcript alignments (GFF3 format) 48 | --protein_alignments Protein alignments (GFF3 format) 49 | 50 | Comparing/contrasting annotations to a reference 51 | ------------------------------------------------ 52 | To compare/contrast genome annotations between different GFF3 or GBK files. 53 | 54 | .. code-block:: none 55 | 56 | $ funannotate util contrast 57 | 58 | Usage: funannotate util contrast 59 | version: 1.8.16 60 | 61 | Description: Compare/contrast annotations to a reference. Annotations in either GBK or GFF3 format. 62 | 63 | Arguments: -r, --reference Reference Annotation. GFF3 or GBK format 64 | -f, --fasta Genome FASTA. Required if GFF3 used 65 | -q, --query Annotation query. GFF3 or GBK format 66 | -o, --output Output basename 67 | -c, --calculate_pident Measure protein percent identity between query and reference 68 | 69 | Format Conversion 70 | --------------------------------------- 71 | 72 | .. code-block:: none 73 | 74 | $ funannotate util tbl2gbk 75 | 76 | Usage: funannotate util tbl2gbk 77 | version: 1.8.16 78 | 79 | Description: Convert NCBI TBL annotations + Genome FASTA to GenBank format. 80 | 81 | Required: -i, --tbl Annotation in NCBI tbl format 82 | -f, --fasta Genome FASTA file. 83 | -s, --species Species name, use quotes for binomial, e.g. "Aspergillus fumigatus" 84 | Optional: 85 | --isolate Isolate name 86 | --strain Strain name 87 | --sbt NCBI Submission Template file 88 | -t, --tbl2asn Assembly parameters for tbl2asn. Example: "-l paired-ends" 89 | -o, --output Output basename 90 | 91 | 92 | .. code-block:: none 93 | 94 | $ funannotate util gbk2parts 95 | 96 | Usage: funannotate util gbk2parts 97 | version: 1.8.16 98 | 99 | Description: Convert GenBank file to its individual components (parts): tbl, protein 100 | FASTA, transcript FASTA, and contig/scaffold FASTA. 101 | 102 | Arguments: -g, --gbk Input Genome in GenBank format 103 | -o, --output Output basename 104 | 105 | 106 | .. code-block:: none 107 | 108 | $ funannotate util gff2prot 109 | 110 | Usage: funannotate util gff2prot 111 | version: 1.8.16 112 | 113 | Description: Convert GFF3 file and genome FASTA to protein sequences. FASTA output to stdout. 114 | 115 | Arguments: -g, --gff3 Reference Annotation. GFF3 format 116 | -f, --fasta Genome FASTA file. 117 | --no_stop Don't print stop codons 118 | 119 | .. code-block:: none 120 | 121 | $ funannotate util gff2tbl 122 | 123 | Usage: funannotate util gff2tbl 124 | version: 1.8.16 125 | 126 | Description: Convert GFF3 file into NCBI tbl format. Tbl output to stdout. 127 | 128 | Arguments: 129 | -g, --gff3 Reference Annotation. GFF3 format 130 | -f, --fasta Genome FASTA file. 131 | 132 | 133 | .. code-block:: none 134 | 135 | $ funannotate util bam2gff3 136 | 137 | Usage: funannotate util bam2gff3 138 | version: 1.8.16 139 | 140 | Description: Convert BAM coord-sorted transcript alignments to GFF3 format. 141 | 142 | Arguments: -i, --bam BAM file (coord-sorted) 143 | -o, --output GFF3 output file 144 | 145 | 146 | .. 
code-block:: none 147 | 148 | $ funannotate util protein2genome 149 | 150 | Usage: funannotate util prot2genome 151 | version: 1.8.16 152 | 153 | Description: Map proteins to genome using exonerate. Output is EVM compatible GFF3 file. 154 | 155 | Arguments: -g, --genome Genome FASTA format (Required) 156 | -p, --proteins Proteins FASTA format (Required) 157 | -o, --out GFF3 output file (Required) 158 | -f, --filter Pre-filtering method. Default: diamond [diamond,tblastn] 159 | -t, --tblastn_out Output to save tblastn results. Default: off 160 | --tblastn Use existing tblastn results 161 | --ploidy Ploidy of assembly. Default: 1 162 | --maxintron Max intron length. Default: 3000 163 | --cpus Number of cpus to use. Default: 2 164 | --EVM_HOME Location of Evidence Modeler home directory. Default: $EVM_HOME 165 | --tmpdir Volume/location to write temporary files. Default: /tmp 166 | --logfile Logfile output file 167 | 168 | .. code-block:: none 169 | 170 | $ funannotate util stringtie2gff3 171 | 172 | Usage: funannotate util stringtie2gff3 173 | version: 1.8.16 174 | 175 | Description: Convert StringTIE GTF format to GFF3 funannotate compatible format. Output 176 | to stdout. 177 | 178 | Arguments: -i, --input GTF file from stringTIE 179 | 180 | .. code-block:: none 181 | 182 | $ funannotate util quarry2gff3 183 | 184 | Usage: funannotate util quarry2gff3 185 | version: 1.8.16 186 | 187 | Description: Convert CodingQuarry output GFF to proper GFF3 format. Output to stdout. 188 | 189 | Arguments: -i, --input CodingQuarry output GFF file. (PredictedPass.gff3) 190 | 191 | .. code-block:: none 192 | 193 | $ funannotate util gff-rename 194 | 195 | Usage: funannotate util gff-rename 196 | version: 1.8.16 197 | 198 | Description: Sort GFF3 file by contigs and rename gene models. 199 | 200 | Arguments: -g, --gff3 Reference Annotation. GFF3 format 201 | -f, --fasta Genome FASTA file. 202 | -o, --out Output GFF3 file 203 | -l, --locus_tag Locus tag to use. Default: FUN 204 | -n, --numbering Start number for genes. Default: 1 205 | -------------------------------------------------------------------------------- /funannotate-docker: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | realpath() { 4 | OURPWD=$PWD 5 | cd "$(dirname "$1")" 6 | LINK=$(readlink "$(basename "$1")") 7 | while [ "$LINK" ]; do 8 | cd "$(dirname "$LINK")" 9 | LINK=$(readlink "$(basename "$1")") 10 | done 11 | REALPATH="$PWD/$(basename "$1")" 12 | cd "$OURPWD" 13 | echo "$REALPATH" 14 | } 15 | 16 | timezone() { 17 | if [ "$(uname)" == "Darwin" ]; then 18 | TZ=$(readlink /etc/localtime | sed 's#/var/db/timezone/zoneinfo/##') 19 | else 20 | TZ=$(readlink /etc/timezone) 21 | fi 22 | echo $TZ 23 | } 24 | 25 | # Only allocate tty if one is detected. 
See - https://stackoverflow.com/questions/911168 26 | if [[ -t 0 ]]; then IT+=(-i); fi 27 | if [[ -t 1 ]]; then IT+=(-t); fi 28 | 29 | USER="$(id -u $(logname)):$(id -g $(logname))" 30 | WORKDIR="$(realpath .)" 31 | MOUNT="type=bind,source=${WORKDIR},target=${WORKDIR}" 32 | TZ="$(timezone)" 33 | 34 | exec docker run --rm "${IT[@]}" --user "${USER}" -e TZ="${TZ}" --workdir "${WORKDIR}" --mount "${MOUNT}" nextgenusfs/funannotate:latest funannotate "$@" -------------------------------------------------------------------------------- /funannotate-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextgenusfs/funannotate/033a883081a83a161798ecc17eaf77b16b5c552b/funannotate-logo.png -------------------------------------------------------------------------------- /funannotate-podman: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | realpath() { 4 | OURPWD=$PWD 5 | cd "$(dirname "$1")" 6 | LINK=$(readlink "$(basename "$1")") 7 | while [ "$LINK" ]; do 8 | cd "$(dirname "$LINK")" 9 | LINK=$(readlink "$(basename "$1")") 10 | done 11 | REALPATH="$PWD/$(basename "$1")" 12 | cd "$OURPWD" 13 | echo "$REALPATH" 14 | } 15 | 16 | timezone() { 17 | if [ "$(uname)" == "Darwin" ]; then 18 | TZ=$(readlink /etc/localtime | sed 's#/var/db/timezone/zoneinfo/##') 19 | else 20 | TZ=$(readlink /etc/timezone) 21 | fi 22 | echo $TZ 23 | } 24 | 25 | # Only allocate tty if one is detected. See - https://stackoverflow.com/questions/911168 26 | if [[ -t 0 ]]; then IT+=(-i); fi 27 | if [[ -t 1 ]]; then IT+=(-t); fi 28 | 29 | USER="$(id -u $(logname)):$(id -g $(logname))" 30 | WORKDIR="$(realpath .)" 31 | MOUNT="type=bind,source=${WORKDIR},target=${WORKDIR}" 32 | TZ="$(timezone)" 33 | 34 | 35 | exec podman run --rm "${IT[@]}" -e TZ="${TZ}" --workdir "${WORKDIR}" --mount "${MOUNT}" nextgenusfs/funannotate:latest funannotate "$@" 36 | # ` --user "${USER}" ` is not needed in rootless mode 37 | -------------------------------------------------------------------------------- /funannotate/__init__.py: -------------------------------------------------------------------------------- 1 | from .__version__ import __version__ 2 | -------------------------------------------------------------------------------- /funannotate/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (1, 8, 17) 2 | 3 | __version__ = ".".join(map(str, VERSION)) 4 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/augustus_parallel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import subprocess 5 | import os 6 | import uuid 7 | import shutil 8 | import argparse 9 | from Bio import SeqIO 10 | import funannotate.library as lib 11 | 12 | # setup menu with argparse 13 | 14 | 15 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 16 | def __init__(self, prog): 17 | super(MyFormatter, self).__init__(prog, max_help_position=48) 18 | 19 | 20 | parser = argparse.ArgumentParser(prog='augustus_parallel.py', 21 | usage="%(prog)s [options] -i genome.fasta -s botrytis_cinera -o prediction_output_base", 22 | description='''Script runs augustus in parallel to use multiple processors''', 23 | epilog="""Written by Jon Palmer (2016) nextgenusfs@gmail.com""", 24 | formatter_class=MyFormatter) 25 | parser.add_argument('-i', '--input', 
required=True, 26 | help='Genome in FASTA format') 27 | parser.add_argument('-o', '--out', required=True, 28 | help='Basename of output files') 29 | parser.add_argument('-s', '--species', required=True, 30 | help='Augustus species name') 31 | parser.add_argument('--hints', help='Hints file (PE)') 32 | parser.add_argument('--cpus', default=2, type=int, 33 | help='Number of CPUs to run') 34 | parser.add_argument('-v', '--debug', action='store_true', 35 | help='Keep intermediate files') 36 | parser.add_argument('--logfile', default='augustus-parallel.log', 37 | help='logfile') 38 | parser.add_argument('--local_augustus') 39 | parser.add_argument('--AUGUSTUS_CONFIG_PATH') 40 | parser.add_argument('-e', '--extrinsic', help='augustus extrinsic file') 41 | parser.add_argument('--no-progress', dest='progress', action='store_false', 42 | help='no progress on multiprocessing') 43 | args = parser.parse_args() 44 | 45 | # check for augustus installation 46 | if args.AUGUSTUS_CONFIG_PATH: 47 | AUGUSTUS = args.AUGUSTUS_CONFIG_PATH 48 | else: 49 | try: 50 | AUGUSTUS = os.environ["AUGUSTUS_CONFIG_PATH"] 51 | except KeyError: 52 | print("$AUGUSTUS_CONFIG_PATH environmental variable not found, Augustus is not properly configured") 53 | sys.exit(1) 54 | 55 | if AUGUSTUS.endswith('config'): 56 | AUGUSTUS_BASE = AUGUSTUS.replace('config', '') 57 | elif AUGUSTUS.endswith('config'+os.sep): 58 | AUGUSTUS_BASE = AUGUSTUS.replace('config'+os.sep, '') 59 | else: 60 | AUGUSTUS_BASE = AUGUSTUS 61 | 62 | # see if local species passed 63 | if args.local_augustus: 64 | LOCALAUGUSTUS = args.local_augustus 65 | else: 66 | LOCALAUGUSTUS = AUGUSTUS 67 | 68 | # setup hints and extrinic input, hard coded for protein and transcript alignments from funannotate 69 | extrinsic = '--extrinsicCfgFile={:}'.format(args.extrinsic) 70 | 71 | 72 | def countGFFgenes(input): 73 | count = 0 74 | with open(input, 'r') as f: 75 | for line in f: 76 | if "\tgene\t" in line: 77 | count += 1 78 | return count 79 | 80 | 81 | def runAugustus(Input): 82 | if '_part' in Input: 83 | chr = Input.split('_part')[0] 84 | else: 85 | chr = Input 86 | species = '--species='+args.species 87 | hints_input = '--hintsfile='+args.hints 88 | aug_out = os.path.join(tmpdir, Input+'.augustus.gff3') 89 | core_cmd = ['augustus', species, '--AUGUSTUS_CONFIG_PATH={:}'.format(LOCALAUGUSTUS), '--softmasking=1', 90 | '--gff3=on', '--UTR=off', '--stopCodonExcludedFromCDS=False', os.path.join(tmpdir, chr+'.fa')] 91 | if args.hints: 92 | core_cmd.insert(2, extrinsic) 93 | core_cmd.insert(3, hints_input) 94 | if Input in ranges: 95 | start = ranges.get(Input)[0] 96 | end = ranges.get(Input)[1] 97 | core_cmd.insert(2, '--predictionStart='+str(start)) 98 | core_cmd.insert(3, '--predictionEnd='+str(end)) 99 | # try using library module 100 | lib.runSubprocess(core_cmd, '.', lib.log, capture_output=aug_out) 101 | 102 | 103 | log_name = args.logfile 104 | if os.path.isfile(log_name): 105 | os.remove(log_name) 106 | 107 | # initialize script, log system info and cmd issue at runtime 108 | lib.setupLogging(log_name) 109 | cmd_args = " ".join(sys.argv)+'\n' 110 | lib.log.debug(cmd_args) 111 | 112 | lib.log.debug('AUGUSTUS_CONFIG_PATH={:}'.format(AUGUSTUS)) 113 | lib.log.debug('Augustus Base directory={:}'.format(AUGUSTUS_BASE)) 114 | lib.log.debug('Local Augustus path={:}'.format(LOCALAUGUSTUS)) 115 | 116 | # first step is to split input fasta file into individual files in tmp folder 117 | lib.log.debug("Splitting contigs and hints files") 118 | tmpdir = 
'augustus_tmp_'+str(uuid.uuid4()) 119 | os.makedirs(tmpdir) 120 | scaffolds = [] 121 | global ranges 122 | ranges = {} 123 | with open(args.input, 'r') as InputFasta: 124 | for record in SeqIO.parse(InputFasta, 'fasta'): 125 | contiglength = len(record.seq) 126 | if contiglength > 500000: # split large contigs 127 | num_parts = contiglength / 500000 + 1 128 | chunks = contiglength / num_parts 129 | for i in range(0, int(num_parts)): 130 | name = str(record.id)+'_part'+str(i+1) 131 | scaffolds.append(name) 132 | outputfile = os.path.join(tmpdir, str(record.id)+'.fa') 133 | if i == 0: # this is first record 134 | start = 1 135 | end = chunks + 10000 136 | else: 137 | start = end - 10000 138 | end = start + chunks + 10000 139 | if end > contiglength: 140 | end = contiglength 141 | if not name in ranges: 142 | ranges[name] = (start, end) 143 | with open(outputfile, 'w') as output: 144 | SeqIO.write(record, output, 'fasta') 145 | else: 146 | name = str(record.id) 147 | scaffolds.append(name) 148 | outputfile = os.path.join(tmpdir, name+'.fa') 149 | with open(outputfile, 'w') as output: 150 | SeqIO.write(record, output, 'fasta') 151 | 152 | # now loop through each scaffold running augustus 153 | if args.cpus > len(scaffolds): 154 | num = len(scaffolds) 155 | else: 156 | num = args.cpus 157 | lib.log.debug("Running Augustus on %i chunks, using %i CPUs" % 158 | (len(scaffolds), num)) 159 | lib.runMultiProgress(runAugustus, scaffolds, num, progress=args.progress) 160 | 161 | 162 | lib.log.debug("Augustus prediction is finished, now concatenating results") 163 | with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'w') as output: 164 | for file in scaffolds: 165 | file = os.path.join(tmpdir, file+'.augustus.gff3') 166 | with open(file) as input: 167 | output.write(input.read()) 168 | 169 | if lib.checkannotations(os.path.join(tmpdir, 'augustus_all.gff3')): 170 | lib.log.debug('Augustus finished, now joining results') 171 | if lib.which_path('join_aug_pred.pl'): 172 | join_script = 'join_aug_pred.pl' 173 | else: 174 | join_script = os.path.join(AUGUSTUS_BASE, 'scripts', 'join_aug_pred.pl') 175 | 176 | cmd = '{:} < {:} > {:}'.format(join_script, os.path.join( 177 | tmpdir, 'augustus_all.gff3'), args.out) 178 | lib.log.debug(cmd) 179 | 180 | with open(args.out, 'w') as finalout: 181 | with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'r') as infile: 182 | subprocess.call([join_script], stdin=infile, stdout=finalout) 183 | 184 | if not args.debug: 185 | shutil.rmtree(tmpdir) 186 | lib.log.info('{:,} predictions from Augustus'.format(countGFFgenes(args.out))) 187 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/enrichment_parallel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import argparse 6 | import subprocess 7 | import funannotate.library as lib 8 | 9 | 10 | def runGOenrichment(input): 11 | basename = os.path.basename(input).replace('.txt', '') 12 | goa_out = os.path.join(args.out, basename+'.go.enrichment.txt') 13 | go_log = os.path.join(args.out, basename+'.go.enrichment.log') 14 | if not lib.checkannotations(goa_out): 15 | cmd = ['find_enrichment.py', '--obo', os.path.join(FUNDB, 'go.obo'), 16 | '--pval', '0.001', '--alpha', '0.001', '--method', 'fdr', 17 | '--outfile', goa_out, input, os.path.join(args.input, 'population.txt'), 18 | os.path.join(args.input, 'associations.txt')] 19 | with open(go_log, 'w') as outfile: 20 | 
outfile.write('{}\n'.format(' '.join(cmd))) 21 | with open(go_log, 'a') as outfile: 22 | subprocess.call(cmd, stdout=outfile, stderr=outfile) 23 | 24 | 25 | def GO_safe_run(*args, **kwargs): 26 | """Call run(), catch exceptions.""" 27 | try: 28 | runGOenrichment(*args, **kwargs) 29 | except Exception as e: 30 | print(("error: %s run(*%r, **%r)" % (e, args, kwargs))) 31 | 32 | # setup menu with argparse 33 | 34 | 35 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 36 | def __init__(self, prog): 37 | super(MyFormatter, self).__init__(prog, max_help_position=48) 38 | 39 | 40 | parser = argparse.ArgumentParser(prog='enrichment_parallel.py', 41 | description='''Run goatools enrichment in parallel.''', 42 | epilog="""Written by Jon Palmer (2019) nextgenusfs@gmail.com""", 43 | formatter_class=MyFormatter) 44 | parser.add_argument('-i', '--input', required=True, 45 | help='folder of protein fasta files') 46 | parser.add_argument('-d', '--db', required=True, 47 | help='location of HMM database') 48 | parser.add_argument('-c', '--cpus', default=1, type=int, 49 | help='location of HMM database') 50 | parser.add_argument('-o', '--out', required=True, help='output file') 51 | args = parser.parse_args() 52 | 53 | global FUNDB, FNULL 54 | FUNDB = args.db 55 | FNULL = open(os.devnull, 'w') 56 | 57 | # now loop through each genome comparing to population 58 | file_list = [] 59 | for f in os.listdir(args.input): 60 | if f.startswith('associations'): 61 | continue 62 | if f.startswith('population'): 63 | continue 64 | file = os.path.join(args.input, f) 65 | if lib.checkannotations(file): 66 | file_list.append(file) 67 | else: 68 | print(' WARNING: skipping {} as no GO terms'.format(f)) 69 | 70 | # run over multiple CPUs 71 | if len(file_list) > args.cpus: 72 | procs = args.cpus 73 | else: 74 | procs = len(file_list) 75 | 76 | lib.runMultiProgress(GO_safe_run, file_list, procs, progress=False) 77 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/fasta2agp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # based on fasta2agp.pl from david.studholme@tsl.ac.uk 5 | # rewritten in python by Jason Stajich @hyphaltip 6 | 7 | import os 8 | import sys 9 | import re 10 | import csv 11 | import argparse 12 | import warnings 13 | from Bio import SeqIO 14 | from Bio.Seq import Seq 15 | from Bio.SeqRecord import SeqRecord 16 | 17 | def parse_scaffolds_makeagp(scaffolds,agpout,ctgsout): 18 | x = 0 19 | i = 0 20 | spadesnamepat = re.compile(r'^NODE_(\d+)_length_\d+_cov_\d+') 21 | numnamepat = re.compile(r'^(\d+)$') 22 | validSeq = re.compile(r'^[ACGTRYSWKMBDHVN]+$',flags=re.IGNORECASE) 23 | with open(agpout, 'w') as agpoutfh: 24 | csvout = csv.writer(agpoutfh,delimiter="\t",lineterminator="\n") 25 | with open(ctgsout,"w") as ctgoutfh: 26 | with open(scaffolds, 'r') as scaff_in: 27 | for seq in SeqIO.parse(scaff_in, "fasta"): 28 | supercontig_id = seq.id 29 | supercontig_seq = seq.seq 30 | supercontig_desc = seq.description 31 | supercontig_length = len(seq); 32 | x = 0 33 | m = spadesnamepat.match(supercontig_id) or spadesnamepat.match(supercontig_id) 34 | if m: 35 | supercontig_id = "scf_%s"%(m.match(1)) 36 | start_pos = 1 # keep track of whereabouts in this supercontig we are 37 | substring_sequences = {} 38 | for substring_sequence in re.split(r'(N{10,})',str(supercontig_seq),maxsplit=0,flags=re.IGNORECASE): 39 | if len(substring_sequence) == 0: 40 | 
continue 41 | object1 = supercontig_id 42 | object_beg2 = start_pos 43 | object_end3 = start_pos + len(substring_sequence) - 1 44 | part_number4 = x 45 | x += 1 46 | component_type5 = None 47 | component_id6a = None 48 | gap_length6b = None 49 | component_beg7a = None 50 | gap_type7b = None 51 | component_end8a = None 52 | linkage8b = None 53 | orientation9a = None 54 | filler9b = None 55 | if re.match(r'^N+$',substring_sequence): 56 | ### This is poly-N gap between contigs 57 | component_type5 = 'N' 58 | gap_length6b = len(substring_sequence) 59 | gap_type7b = 'scaffold' 60 | linkage8b = 'yes' 61 | filler9b = 'paired-ends' 62 | elif validSeq.match(substring_sequence): 63 | ### This is a contig 64 | i+=1 # a counter, used for generating unique contig names 65 | component_type5 = 'W' 66 | component_id6a = "contig_%d"%(i) 67 | component_beg7a = 1 68 | component_end8a = len(substring_sequence) 69 | orientation9a = '+' 70 | ### Print FastA formatted contig 71 | record = SeqRecord( Seq(substring_sequence), 72 | id=component_id6a, 73 | description="") 74 | SeqIO.write(record, ctgoutfh, "fasta") 75 | else: 76 | print("Illegal characters in sequence") 77 | print(substring_sequence) 78 | return 79 | 80 | start_pos += len (substring_sequence) 81 | part_number4 += 1 82 | if component_type5 == 'N': 83 | ### print AGP line for gap 84 | csvout.writerow([object1,object_beg2,object_end3, part_number4,component_type5,gap_length6b,gap_type7b,linkage8b,filler9b]) 85 | else: 86 | ### print AGP line for contig 87 | csvout.writerow([object1,object_beg2,object_end3, part_number4,component_type5,component_id6a,component_beg7a,component_end8a,orientation9a]) 88 | 89 | 90 | def main(args): 91 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 92 | def __init__(self, prog): 93 | super(MyFormatter, self).__init__(prog, max_help_position=48) 94 | 95 | 96 | parser = argparse.ArgumentParser( 97 | prog='fasta2agp.py', 98 | description='''Convert FastA format scaffolds file into contigs file and print the AGP based on parsing gaps (N runs).''', 99 | #usage='''fasta2agp.py scaffolds.fa > scaffolds.agp''', 100 | epilog="""Written by Jason Stajich @hyphaltip (2021) jasonstajich.phd@gmail.com""", 101 | formatter_class=MyFormatter) 102 | parser.add_argument('--ext', default='contigs.fsa', 103 | help='Default extensions for output contigs file') 104 | parser.add_argument('scaffoldfile', nargs='?',help='Scaffolds FastA file') 105 | parser.add_argument('agpfile', nargs='?',type=argparse.FileType('w'), default=sys.stdout, 106 | help='AGP output file (defaults to STDOUT)') 107 | args = parser.parse_args(args) 108 | ctgfile = args.scaffoldfile + "." 
+ args.ext 109 | m = re.match(r'^(\S+)\.(fa|fasta|fsa)$',args.scaffoldfile) 110 | if m: 111 | ctgfile = m.group(1) 112 | m = re.match(r'^(\S+)\.scaffolds?$',ctgfile) 113 | if m: 114 | ctgfile = "{}.{}".format(m.group(1),args.ext) 115 | # run cmd 116 | parse_scaffolds_makeagp(args.scaffoldfile,args.agpfile,ctgfile) 117 | 118 | if __name__ == "__main__": 119 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /funannotate/aux_scripts/filterIntronsFindStrand.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | #################################################################################################### 4 | # # 5 | # filterIntronsFindStrand.pl - finds corresponding strand for introns in fasta file # 6 | # optionally set the score column to the 'mult' entry with --score # 7 | # # 8 | # Author: Simone Lange # 9 | # # 10 | # Contact: katharina.hoff@uni-greifswald.de # 11 | # # 12 | # Release date: January 7th 2015 # 13 | # # 14 | # This script is under the Artistic Licence # 15 | # (http://www.opensource.org/licenses/artistic-license.php) # 16 | # # 17 | #################################################################################################### 18 | 19 | # ------------------------------------------------------------------ 20 | # | file creation and findStrand() | Simone Lange |06.10.2014 | 21 | # | add getScore() for score option | |07.10.2014 | 22 | # | add error message if sequence | |23.01.2015 | 23 | # | name of hints and fasta file do | | | 24 | # | not match -> program stops then | | | 25 | # ------------------------------------------------------------------ 26 | 27 | use strict; 28 | use warnings; 29 | use Getopt::Long; 30 | 31 | 32 | 33 | my $usage = <<'ENDUSAGE'; 34 | 35 | filterIntronsFindStrand.pl find corresponding strand for introns from two input files genome.fa and introns.gff 36 | 37 | SYNOPSIS 38 | 39 | filterIntronsFindStrand.pl genome.fa introns.gff [OPTIONS] > introns.s.f.gff 40 | 41 | genome.fa DNA file in fasta format 42 | introns.gff corresponding introns file in gff format 43 | 44 | 45 | OPTIONS 46 | 47 | --help Print this help message 48 | --allowed=gtag,gcaag,atac Allowed acceptor and donor splice site types 49 | --score Set score to 'mult' entry or '1', if the last column does not contain a 'mult' entry 50 | --genome=genome.fa see above 51 | --introns=introns.gff see above 52 | 53 | 54 | 55 | 56 | DESCRIPTION 57 | 58 | Example: 59 | 60 | filterIntronsFindStrand.pl genome.fa introns.gff [OPTIONS] > introns.s.f.gff 61 | 62 | ENDUSAGE 63 | 64 | 65 | my ($genome, $introns, @allowed, $mult_score, $help); 66 | my %annos; # keys: sequences, elements: annotations 67 | my $seqname; 68 | my $seq; 69 | 70 | if(@ARGV==0){ 71 | print "$usage\n"; 72 | exit(0); 73 | } 74 | 75 | GetOptions( 'introns=s' => \$introns, 76 | 'genome=s' => \$genome, 77 | 'score!' => \$mult_score, 78 | 'allowed=s' => \@allowed, 79 | 'help!' => \$help); 80 | 81 | if($help){ 82 | print $usage; 83 | exit(0); 84 | } 85 | 86 | # set $genome 87 | if(!defined($genome)){ 88 | $genome = $ARGV[0]; 89 | } 90 | 91 | # set $introns 92 | if(!defined($introns)){ 93 | $introns = $ARGV[1]; 94 | } 95 | 96 | # set allowed splice site types 97 | if(@allowed){ 98 | @allowed = split(/[\s,]/, join(',',@allowed)); 99 | }else{ 100 | @allowed = ("gtag", "gcag", "atac"); 101 | } 102 | 103 | # check whether the files exist 104 | if(! -f "$genome"){ 105 | print "Genome file $genome does not exist. 
Please check.\n"; 106 | exit(1); 107 | } 108 | 109 | if(! -f "$introns"){ 110 | print "Introns file $introns does not exist. Please check.\n"; 111 | exit(1); 112 | } 113 | 114 | # genome file in fasta format 115 | open (FASTA, "<".$genome) or die "Cannot open file: $genome\n"; 116 | $/="\n>"; 117 | while() { 118 | /[>]*(.*)\n/; 119 | $seqname = $1; 120 | $seq = $'; 121 | $seq =~ s/>//; 122 | $seq =~ s/\n//g; 123 | $annos{$seqname} = $seq; 124 | } 125 | close(FASTA) or die("Could not close fasta file $genome!\n"); 126 | 127 | # introns hintsfile in gff format 128 | open (INTRONS, "<".$introns) or die "Cannot open file: $introns\n"; 129 | $/="\n"; 130 | while(){ 131 | chomp; 132 | my @line = split(/\t/, $_); 133 | my $strand = findStrand($line[0], $line[3], $line[4]); 134 | my $score; 135 | if($mult_score){ 136 | $score = getScore($line[8]); 137 | }else{ 138 | $score = $line[5]; 139 | } 140 | if($strand){ 141 | print "$line[0]\t$line[1]\t$line[2]\t$line[3]\t$line[4]\t$score\t$strand\t$line[7]\t$line[8]\n"; 142 | } 143 | } 144 | close(INTRONS) or die("Could not close introns file $introns!\n"); 145 | 146 | 147 | ############### sub functions ############## 148 | 149 | 150 | # find strand for introns 151 | # look up start and end position and check if it matches allowed splice site patterns 152 | sub findStrand{ 153 | my $seqname = shift; 154 | my $start = shift; 155 | my $end = shift; 156 | my $type; 157 | my $reverse; 158 | if(defined($annos{$seqname})){ 159 | $type = lc(substr($annos{$seqname}, $start-1,2)).lc(substr($annos{$seqname}, $end-2,2)); 160 | $reverse = reverse($type); 161 | $reverse =~ tr/agct/tcga/; 162 | foreach (@allowed){ 163 | if($_ eq $type){ 164 | return "+"; 165 | }elsif($_ eq $reverse){ 166 | return "-"; 167 | } 168 | } 169 | return 0; 170 | }else{ 171 | print STDERR "WARNING: '$seqname' does not match any sequence in the fasta file. Maybe the two files do not belong together.\n"; 172 | # print STDERR "The program terminates here.\n"; 173 | # exit(1) 174 | } 175 | } 176 | 177 | # get score from mult entry 178 | sub getScore{ 179 | my $column = shift; 180 | my $score; 181 | if($column =~ m/mult=(\d+)/){ 182 | $score = $1; 183 | }else{ 184 | $score = 1; 185 | } 186 | return $score; 187 | } 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/genemark_gtf2gff3.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | #script from the Maker 2.31.8 distribution 4 | 5 | eval 'exec /usr/bin/perl -S $0 ${1+"$@"}' 6 | if 0; # not running under some shell 7 | 8 | use warnings; 9 | use strict; 10 | 11 | #usage statement 12 | my $usage = " 13 | USAGE: 14 | genemark_gtf2gff3 15 | 16 | This converts genemark's GTF output into GFF3 format. 17 | The script prints to STDOUT. Use the '>' character to 18 | redirect output into a file. 19 | 20 | "; 21 | 22 | my $file = shift; 23 | 24 | #error checking 25 | if (! $file ){ 26 | print $usage; 27 | exit; 28 | } 29 | 30 | if(! -e $file){ 31 | warn "ERROR: The file $file does not exist\n"; 32 | print $usage; 33 | } 34 | 35 | #parse file 36 | open(IN, "< $file"); 37 | my %genes; 38 | while(my $line = ){ 39 | chomp $line; 40 | my @F = split(/\t/, $line); 41 | next if(@F < 8); 42 | next if($F[2] ne 'CDS'); 43 | 44 | #genemark by default only fills in the ids and not the names 45 | my ($g) = $F[8] =~ /gene_id \"([^\"]+)\"/; 46 | ($g) = $F[8] =~ /gene_name \"([^\"]+)\"/ if(! 
defined $g); 47 | my ($t) = $F[8] =~ /transcript_id \"([^\"]+)\"/; 48 | ($t) = $F[8] =~ /transcript_name \"([^\"]+)\"/ if(! defined $t); 49 | 50 | die "ERROR: Cannot understand format\n". 51 | "expecting -> gene_id \"xxxx\"\; transcript_id \"xxxx\"\;\n" 52 | if(! defined $g || ! defined $t); 53 | 54 | #get cintig name 55 | my $s = $F[0]; 56 | 57 | #set needed column information 58 | $genes{$s}{$g}{seqid} = $F[0] if(! $genes{$s}{$g}{seqid}); 59 | $genes{$s}{$g}{source} = $F[1] if(! $genes{$s}{$g}{source}); 60 | $genes{$s}{$g}{strand} = $F[6] if(! $genes{$s}{$g}{strand}); 61 | 62 | $genes{$s}{$g}{mRNA}{$t}{seqid} = $F[0] if(! $genes{$s}{$g}{mRNA}{$t}{seqid}); 63 | $genes{$s}{$g}{mRNA}{$t}{source} = $F[1] if(! $genes{$s}{$g}{mRNA}{$t}{source}); 64 | $genes{$s}{$g}{mRNA}{$t}{strand} = $F[6] if(! $genes{$s}{$g}{mRNA}{$t}{strand}); 65 | $genes{$s}{$g}{mRNA}{$t}{parent} = $g if(! $genes{$s}{$g}{mRNA}{$t}{parent}); 66 | 67 | #set start/end of gene 68 | $genes{$s}{$g}{B} = $F[3] if(! defined $genes{$s}{$g}{B} || $F[3] < $genes{$s}{$g}{B}); 69 | $genes{$s}{$g}{E} = $F[4] if(! defined $genes{$s}{$g}{E} || $F[4] > $genes{$s}{$g}{E}); 70 | 71 | #set start/end of transcript 72 | $genes{$s}{$g}{mRNA}{$t}{B} = $F[3] if(! defined $genes{$s}{$g}{mRNA}{$t}{B} || 73 | $F[3] < $genes{$s}{$g}{mRNA}{$t}{B} 74 | ); 75 | $genes{$s}{$g}{mRNA}{$t}{E} = $F[4] if(! defined $genes{$s}{$g}{mRNA}{$t}{E} || 76 | $F[4] > $genes{$s}{$g}{mRNA}{$t}{E} 77 | ); 78 | 79 | #add CDS to transcript 80 | my %c = (seqid => $F[0], 81 | source => $F[1], 82 | B => $F[3], 83 | E => $F[4], 84 | score => $F[5], 85 | strand => $F[6], 86 | phase => $F[7], 87 | parent => $t 88 | ); 89 | 90 | push (@{$genes{$s}{$g}{mRNA}{$t}{CDS}}, \%c); 91 | } 92 | close(IN); 93 | 94 | 95 | #build GFF3 structure and dump to file 96 | print "\#\#gff-version 3\n"; 97 | gff3_contig(\%genes); 98 | 99 | #-------------------------------------------------------------------------- 100 | #-------------------------------- SUBS ------------------------------------ 101 | #-------------------------------------------------------------------------- 102 | sub gff3_contig { 103 | my $hash = shift; 104 | foreach my $f (keys %$hash){ 105 | gff3_gene($hash->{$f}); 106 | } 107 | } 108 | #-------------------------------------------------------------------------- 109 | sub gff3_gene { 110 | my $hash = shift; 111 | 112 | foreach my $g (sort {$hash->{$a}{B} <=> $hash->{$b}{B}} keys %$hash) { 113 | my $gene = $hash->{$g}; 114 | 115 | print join("\t",$gene->{seqid},$gene->{source},'gene',$gene->{B}, 116 | $gene->{E},'.',$gene->{strand},'.',sprintf('ID=%s;Name=%s',$g,$g)),"\n"; 117 | 118 | gff3_mRNA($gene->{mRNA}); 119 | } 120 | } 121 | #-------------------------------------------------------------------------- 122 | sub gff3_mRNA { 123 | my $hash = shift; 124 | 125 | foreach my $t (keys %$hash){ 126 | my $mRNA = $hash->{$t}; 127 | print join("\t",$mRNA->{seqid},$mRNA->{source},"mRNA", 128 | $mRNA->{B},$mRNA->{E},'.',$mRNA->{strand},'.', 129 | sprintf('ID=%s;Name=%s;Parent=%s',$t,$t,$mRNA->{parent})),"\n"; 130 | 131 | gff3_CDS($mRNA->{CDS}); 132 | } 133 | } 134 | #-------------------------------------------------------------------------- 135 | sub gff3_CDS { 136 | my $array = shift; 137 | 138 | #define the id 139 | my $i = 1; 140 | 141 | my @exons; 142 | my @CDSs; 143 | foreach my $c (@$array){ 144 | #make exon line 145 | my $id = $c->{parent} .":exon:". 
$i; 146 | my $exon = join("\t",$c->{seqid},$c->{source},"exon", 147 | $c->{B},$c->{E},'.',$c->{strand},'.', 148 | sprintf('ID=%s;Name=%s;Parent=%s',$id,$id,$c->{parent}))."\n"; 149 | push(@exons, $exon); 150 | 151 | #make CDS line 152 | $id = $c->{parent} .":CDS:". $i++; 153 | my $cds = join("\t",$c->{seqid},$c->{source},"CDS",$c->{B},$c->{E},$c->{score}, 154 | $c->{strand},$c->{phase},sprintf("ID=%s;Name=%s;Parent=%s",$id,$id,$c->{parent}))."\n"; 155 | push(@CDSs, $cds); 156 | } 157 | 158 | #print all exons together then all CDSs together 159 | print join('', @exons); 160 | print join('', @CDSs); 161 | } 162 | 163 | 164 | 165 | 166 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/getEggNog.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$1" ]; then 4 | echo "Usage: getEggNog.sh fuNOG directory" 5 | exit 6 | fi 7 | 8 | 9 | EGGNOG=$1 10 | wget -c --tries=0 --read-timeout=20 http://eggnogdb.embl.de/download/eggnog_4.5/data/$EGGNOG/$EGGNOG.hmm.tar.gz 11 | wget -c --tries=0 --read-timeout=20 http://eggnogdb.embl.de/download/eggnog_4.5/data/$EGGNOG/$EGGNOG.annotations.tsv.gz 12 | gunzip $EGGNOG.annotations.tsv.gz 13 | tar -zxf $EGGNOG.hmm.tar.gz 14 | find $EGGNOG\_hmm/ -maxdepth 1 -type f -name '*.hmm' -exec cat '{}' \; > $EGGNOG\_4.5.hmm 15 | hmmpress $EGGNOG\_4.5.hmm 16 | rm $EGGNOG.hmm.tar.gz 17 | rm -R $EGGNOG\_hmm/ 18 | for i in $EGGNOG\*; do 19 | mv $i $2/ 20 | done 21 | echo "Done, $EGGNOG DB is now ready to use" -------------------------------------------------------------------------------- /funannotate/aux_scripts/hmmer_parallel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import argparse 6 | import warnings 7 | import subprocess 8 | from natsort import natsorted 9 | import funannotate.library as lib 10 | 11 | with warnings.catch_warnings(): 12 | warnings.simplefilter("ignore") 13 | from Bio import SearchIO 14 | 15 | 16 | def PfamHmmer(input): 17 | HMM = os.path.join(FUNDB, "Pfam-A.hmm") 18 | base = os.path.basename(input).split(".fa")[0] 19 | pfam_out = os.path.join(os.path.dirname(input), base + ".pfam.txt") 20 | cmd = ["hmmsearch", "--domtblout", pfam_out, "--cpu", "1", "--cut_ga", HMM, input] 21 | subprocess.call(cmd, stdout=FNULL, stderr=FNULL) 22 | 23 | 24 | def safe_run(*args, **kwargs): 25 | """Call run(), catch exceptions.""" 26 | try: 27 | PfamHmmer(*args, **kwargs) 28 | except Exception as e: 29 | print(("error: %s run(*%r, **%r)" % (e, args, kwargs))) 30 | 31 | 32 | def combineHmmerOutputs(inputList, output): 33 | # function to combine multiple HMMER runs with proper header/footer so biopython can read 34 | allHeadFoot = [] 35 | with open(inputList[0], "r") as infile: 36 | for line in infile: 37 | if line.startswith("#"): 38 | allHeadFoot.append(line) 39 | with open(output, "w") as out: 40 | for x in allHeadFoot[:3]: 41 | out.write(x) 42 | for file in inputList: 43 | with open(file, "r") as resultin: 44 | for line in resultin: 45 | if line.startswith("#") or line.startswith("\n"): 46 | continue 47 | out.write(line) 48 | for y in allHeadFoot[3:]: 49 | out.write(y) 50 | 51 | 52 | def multiPFAMsearch(inputList, cpus, tmpdir, output): 53 | # run hmmerscan multithreaded by running at same time 54 | # input is a list of files, run multiprocessing on them 55 | pfam_results = os.path.join(os.path.dirname(tmpdir), "pfam.txt") 56 | 
pfam_filtered = os.path.join(os.path.dirname(tmpdir), "pfam.filtered.txt") 57 | lib.runMultiProgress(safe_run, inputList, cpus, progress=False) 58 | 59 | # now grab results and combine, kind of tricky as there are header and footers for each 60 | resultList = [ 61 | os.path.join(tmpdir, f) 62 | for f in os.listdir(tmpdir) 63 | if os.path.isfile(os.path.join(tmpdir, f)) and f.endswith(".pfam.txt") 64 | ] 65 | combineHmmerOutputs(resultList, pfam_results) 66 | 67 | # now parse results 68 | with open(output, "w") as out: 69 | with open(pfam_filtered, "w") as filtered: 70 | with open(pfam_results, "r") as results: 71 | for qresult in SearchIO.parse(results, "hmmsearch3-domtab"): 72 | hits = qresult.hits 73 | num_hits = len(hits) 74 | if num_hits > 0: 75 | for i in range(0, num_hits): 76 | hit_evalue = hits[i].evalue 77 | query = hits[i].id 78 | pfam = qresult.accession.split(".")[0] 79 | hmmLen = qresult.seq_len 80 | hmm_aln = int(hits[i].hsps[0].hit_end) - int( 81 | hits[i].hsps[0].hit_start 82 | ) 83 | coverage = hmm_aln / float(hmmLen) 84 | if coverage < 0.50: # coverage needs to be at least 50% 85 | continue 86 | filtered.write( 87 | "%s\t%s\t%s\t%f\n" % (query, pfam, hit_evalue, coverage) 88 | ) 89 | out.write("%s\tdb_xref\tPFAM:%s\n" % (query, pfam)) 90 | 91 | 92 | def dbCANHmmer(input): 93 | HMM = os.path.join(FUNDB, "dbCAN.hmm") 94 | base = os.path.basename(input).split(".fa")[0] 95 | outfiles = os.path.join(os.path.dirname(input), base + ".dbcan.txt") 96 | cmd = ["hmmscan", "--domtblout", outfiles, "--cpu", "1", "-E", "1e-15", HMM, input] 97 | subprocess.call(cmd, stdout=FNULL, stderr=FNULL) 98 | 99 | 100 | def safe_run2(*args, **kwargs): 101 | """Call run(), catch exceptions.""" 102 | try: 103 | dbCANHmmer(*args, **kwargs) 104 | except Exception as e: 105 | print(("error: %s run(*%r, **%r)" % (e, args, kwargs))) 106 | 107 | 108 | def dbCANsearch(inputList, cpus, evalue, tmpdir, output): 109 | # run hmmerscan 110 | dbCAN_out = os.path.join(tmpdir, "dbCAN.txt") 111 | dbCAN_filtered = os.path.join(tmpdir, "dbCAN.filtered.txt") 112 | lib.runMultiProgress(safe_run2, inputList, cpus, progress=False) 113 | # now grab results 114 | resultList = [ 115 | os.path.join(tmpdir, f) 116 | for f in os.listdir(tmpdir) 117 | if os.path.isfile(os.path.join(tmpdir, f)) and f.endswith(".dbcan.txt") 118 | ] 119 | combineHmmerOutputs(resultList, dbCAN_out) 120 | 121 | # now parse results 122 | Results = {} 123 | with open(dbCAN_filtered, "w") as filtered: 124 | filtered.write( 125 | "#HMM_family\tHMM_len\tQuery_ID\tQuery_len\tE-value\tHMM_start\tHMM_end\tQuery_start\tQuery_end\tCoverage\n" 126 | ) 127 | with open(dbCAN_out, "r") as results: 128 | for qresult in SearchIO.parse(results, "hmmscan3-domtab"): 129 | query_length = qresult.seq_len 130 | hits = qresult.hits 131 | num_hits = len(hits) 132 | if num_hits > 0: 133 | for i in range(0, num_hits): 134 | hit_evalue = hits[i].evalue 135 | if hit_evalue > evalue: 136 | continue 137 | hit = hits[i].id 138 | hmmLen = hits[i].seq_len 139 | hmm_aln = int(hits[i].hsps[0].hit_end) - int( 140 | hits[i].hsps[0].hit_start 141 | ) 142 | coverage = hmm_aln / float(hmmLen) 143 | if coverage < 0.35: 144 | continue 145 | query = hits[i].query_id 146 | filtered.write( 147 | "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\n" 148 | % ( 149 | hit, 150 | hmmLen, 151 | query, 152 | query_length, 153 | hit_evalue, 154 | hits[i].hsps[0].hit_start, 155 | hits[i].hsps[0].hit_end, 156 | hits[i].hsps[0].query_start, 157 | hits[i].hsps[0].query_end, 158 | coverage, 159 | ) 160 | ) 161 | if 
query not in Results: 162 | Results[query] = [hit] 163 | else: 164 | Results[query].append(hit) 165 | # run through results and simplify subdomain hits 166 | with open(output, "w") as out: 167 | for k, v in natsorted(Results.items()): 168 | simplified = [] 169 | for x in v: 170 | if "_" in x: 171 | cazy, subdomain = x.rsplit("_", 1) 172 | if cazy not in simplified: 173 | simplified.append(cazy) 174 | else: 175 | if not x in simplified: 176 | simplified.append(x) 177 | for hit in simplified: 178 | out.write("{}\tnote\tCAZy:{}\n".format(k, hit)) 179 | 180 | 181 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 182 | def __init__(self, prog): 183 | super(MyFormatter, self).__init__(prog, max_help_position=48) 184 | 185 | 186 | parser = argparse.ArgumentParser( 187 | prog="hmmer_parallel.py", 188 | description="""Run hmmer3 multipthreaded.""", 189 | epilog="""Written by Jon Palmer (2019) nextgenusfs@gmail.com""", 190 | formatter_class=MyFormatter, 191 | ) 192 | parser.add_argument( 193 | "-i", "--input", required=True, help="folder of protein fasta files" 194 | ) 195 | parser.add_argument( 196 | "-m", 197 | "--method", 198 | default="pfam", 199 | choices=["pfam", "cazy"], 200 | help="database to search", 201 | ) 202 | parser.add_argument("-d", "--db", required=True, help="location of HMM database") 203 | parser.add_argument( 204 | "-c", "--cpus", default=1, type=int, help="location of HMM database" 205 | ) 206 | parser.add_argument("-o", "--out", required=True, help="output file") 207 | args = parser.parse_args() 208 | 209 | global FUNDB, FNULL 210 | FUNDB = args.db 211 | FNULL = open(os.devnull, "w") 212 | splitProts = [ 213 | os.path.join(args.input, f) 214 | for f in os.listdir(args.input) 215 | if os.path.isfile(os.path.join(args.input, f)) 216 | ] 217 | if args.method == "pfam": 218 | multiPFAMsearch(splitProts, args.cpus, args.input, args.out) 219 | elif args.method == "cazy": 220 | dbCANsearch(splitProts, args.cpus, 1e-17, args.input, args.out) 221 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/iprscan2annotations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # script written for funannotate by Jon Palmer (2017) 4 | # it will parse an interproscan5 xml file and generate 5 | # genome annotation file for GO terms and IPR terms 6 | 7 | import sys 8 | import os 9 | import xml.etree.cElementTree as etree 10 | from goatools import obo_parser 11 | 12 | 13 | def convertGOattribute(namespacein): 14 | namespace = namespacein.upper() 15 | if namespace == "BIOLOGICAL_PROCESS": 16 | attribute = "go_process" 17 | elif namespace == "MOLECULAR_FUNCTION": 18 | attribute = "go_function" 19 | elif namespace == "CELLULAR_COMPONENT": 20 | attribute = "go_component" 21 | else: 22 | # print(f'Error parsing XML GO terms: {namespace} is not a valid term') 23 | attribute = "go_unknown" 24 | # sys.exit(1) 25 | return attribute 26 | 27 | 28 | def main(): 29 | """Main step of intepro annotations to tab delimited script.""" 30 | 31 | if len(sys.argv) < 2: 32 | print("Usage: iprscan2annotations.py IPRSCAN.xml OUTPUT.annotations.txt") 33 | sys.exit(1) 34 | 35 | goDict = {} 36 | for item in obo_parser.OBOReader( 37 | os.path.join(os.environ["FUNANNOTATE_DB"], "go.obo") 38 | ): 39 | namespace = convertGOattribute(item.namespace) 40 | goDict[item.id] = {"name": item.name, "namespace": namespace} 41 | for nm in item.alt_ids: # also index by alt_id since that may be reported 42 | 
goDict[nm] = {"name": item.name, "namespace": namespace} 43 | with open(sys.argv[2], "w") as output: 44 | with open(sys.argv[1]) as xml_file: 45 | tree = etree.iterparse(xml_file) 46 | for _, elem in tree: 47 | if "}" in elem.tag: 48 | elem.tag = elem.tag.split("}", 1)[1] 49 | for at in list(elem.attrib.keys()): 50 | if "}" in at: 51 | newat = at.split("}", 1)[1] 52 | elem.attrib[newat] = elem.attrib[at] 53 | del elem.attrib[at] 54 | root = tree.root 55 | # iterate through each of the protein hits 56 | for hits in root: 57 | IDs = [] 58 | iprs = [] 59 | gos = {} 60 | signalp = [] 61 | for lv1 in hits: 62 | if lv1.tag == "xref": 63 | name = lv1.get("id") 64 | IDs.append(name) 65 | if lv1.tag == "matches": 66 | for e in lv1.findall(".//entry"): 67 | if not e.get("ac") in iprs: 68 | iprs.append(e.get("ac")) 69 | for g in lv1.findall(".//go-xref"): 70 | cat = g.get("category", None) 71 | goID = g.get("id", None) 72 | desc = g.get("name", None) 73 | if not goID: 74 | continue 75 | if not cat or not desc: 76 | if goID in goDict: 77 | cat = goDict[goID]["namespace"] 78 | desc = goDict[goID]["name"] 79 | else: 80 | continue 81 | # cat = "" 82 | # desc = "" 83 | # print(f"No GO term {goID} in obo DB") 84 | else: 85 | cat = convertGOattribute(cat) 86 | goHit = (cat, desc, goID) 87 | if goID not in gos: 88 | gos[goID] = goHit 89 | # signalp is processed elsewhere 90 | # do we just skip this parsing even? 91 | for s in lv1.findall(".//signalp-match"): 92 | for lib in s.findall(".//signature-library-release"): 93 | if lib.get("library") == "SIGNALP_EUK": 94 | for loc in s.findall(".//signalp-location"): 95 | signalp.append( 96 | (loc.get("start"), loc.get("end")) 97 | ) 98 | # print out annotation file if IPR domains 99 | if len(iprs) > 0: 100 | for i in IDs: 101 | for x in iprs: 102 | output.write(f"{i}\tdb_xref\tInterPro:{x}\n") 103 | if len(gos) > 0: 104 | for i in IDs: 105 | for goid in gos: 106 | x = gos[goid] 107 | GOID = x[2].replace("GO:", "") 108 | output.write(f"{i}\t{x[0]}\t{x[1]}|{GOID}||IEA\n") 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/phobius-multiproc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import uuid 7 | import time 8 | import multiprocessing 9 | import argparse 10 | import shutil 11 | import funannotate.library as lib 12 | 13 | # setup menu with argparse 14 | 15 | 16 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 17 | def __init__(self, prog): 18 | super(MyFormatter, self).__init__(prog, max_help_position=48) 19 | 20 | 21 | parser = argparse.ArgumentParser( 22 | prog='phobius-multiproc.py', 23 | usage="%(prog)s [options] -i proteome.fasta", 24 | description='''Script that runs phobius remotely.''', 25 | epilog="""Written by Jon Palmer (2016) nextgenusfs@gmail.com""", 26 | formatter_class=MyFormatter) 27 | parser.add_argument('-i', '--input', required=True, help='whole proteome') 28 | parser.add_argument('-o', '--out', required=True, help='Phobius results') 29 | parser.add_argument('-e', '--email', help='Email address for IPRSCAN server') 30 | parser.add_argument('-l', '--logfile', 31 | default='phobius-multiproc.log', help='Logfile') 32 | parser.add_argument('--debug', action='store_true', 33 | help='Keep intermediate files') 34 | args = parser.parse_args() 35 | 36 | 37 | def runPhobiusRemote(Input): 38 | base = 
Input.split('/')[-1] 39 | base = base.split('.fa')[0] 40 | OUTPATH = os.path.join(TMPDIR, base) 41 | cmd = ['perl', os.path.join(parentdir, 'phobius-remote.pl'), 42 | '--email', args.email, '-f', 'short', '--outfile', base, Input] 43 | lib.runSubprocess(cmd, TMPDIR, lib.log) 44 | time.sleep(1) # make sure there is time for all files to show up 45 | os.rename(OUTPATH+'.out.txt', OUTPATH+'.phobius') 46 | os.remove(OUTPATH+'.sequence.txt') 47 | 48 | 49 | def runPhobiusLocal(Input): 50 | base = Input.split('/')[-1] 51 | base = base.split('.fa')[0] 52 | OUTPATH = os.path.join(TMPDIR, base+'.phobius') 53 | cmd = ['phobius.pl', '-short', Input] 54 | lib.runSubprocess(cmd, TMPDIR, lib.log, capture_output=OUTPATH, raise_not_exit=True) 55 | 56 | 57 | global parentdir 58 | parentdir = os.path.join(os.path.dirname(__file__)) 59 | 60 | # create log file 61 | log_name = args.logfile 62 | if os.path.isfile(log_name): 63 | os.remove(log_name) 64 | 65 | # initialize script, log system info and cmd issue at runtime 66 | lib.setupLogging(log_name) 67 | FNULL = open(os.devnull, 'w') 68 | cmd_args = " ".join(sys.argv)+'\n' 69 | lib.log.debug(cmd_args) 70 | 71 | # create tmpdir to store fasta files and output files 72 | TMPDIR = 'phobius_' + str(uuid.uuid4()) 73 | 74 | # split fasta 75 | lib.splitFASTA(args.input, TMPDIR) 76 | 77 | # now get list of files in tmpdir 78 | proteins = [] 79 | for file in os.listdir(TMPDIR): 80 | if file.endswith('.fa'): 81 | proteins.append(file) 82 | 83 | # now run the script 84 | if lib.which('phobius.pl'): 85 | lib.runMultiProgress(runPhobiusLocal, proteins, 86 | multiprocessing.cpu_count()) 87 | else: 88 | lib.runMultiProgress(runPhobiusRemote, proteins, 89 | 29) # max is 30 jobs at a time 90 | 91 | # collect all results 92 | phobius = [] 93 | for file in os.listdir(TMPDIR): 94 | if file.endswith('.phobius'): 95 | phobius.append(os.path.join(TMPDIR, file)) 96 | 97 | # write output 98 | TMdomain = 0 99 | SigPep = 0 100 | with open(args.out, 'w') as output: 101 | output.write("%s\t%s\t%s\t%s\n" % ('ID', 'TM', 'SP', 'Prediction')) 102 | for x in phobius: 103 | with open(x, 'r') as input: 104 | line = input.readlines() 105 | try: 106 | result = line[1].split(' ') 107 | result = [x for x in result if x] 108 | if result[1] == 'prediction': 109 | continue 110 | if int(result[1]) > 0: 111 | TMdomain += 1 112 | if result[2] == 'Y': 113 | SigPep += 1 114 | output.write("%s\t%s\t%s\t%s\n" % ( 115 | result[0], result[1], 116 | result[2], result[3].replace('\n', ''))) 117 | except IndexError: 118 | pass 119 | 120 | # clean 121 | if not args.debug: 122 | shutil.rmtree(TMPDIR) 123 | lib.log.debug("%i total proteins, %i TMdomain, %i Signal Peptide" % 124 | (len(phobius), TMdomain, SigPep)) 125 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/sam2bam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #simple wrapper for running aligner program and piping output to samtools view/sort 4 | 5 | if [ -z "$3" ]; then 6 | echo 'Usage: sam2bam.sh "aligner_command" bam_threads bam_output' 7 | echo '**The double quotes are required around aligner command**' 8 | exit 9 | fi 10 | 11 | #construct the command 12 | cmd="$1 | samtools view -@ $2 -bS - | samtools sort -@ $2 -o $3 -" 13 | 14 | #run the command 15 | eval $cmd 16 | -------------------------------------------------------------------------------- /funannotate/aux_scripts/trnascan2gff3.pl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | #modified by Jon Palmer (2016) to print correct product ID in field 9 4 | 5 | =head1 NAME 6 | 7 | tRNAScan_SE_to_gff3.pl - convert raw output of tRNAScan-SE to gff3 8 | 9 | =head1 SYNOPSIS 10 | 11 | USAGE: convert_tRNAScanSE_to_gff3.pl 12 | --input=/path/to/some_file.out 13 | 14 | =head1 OPTIONS 15 | 16 | B<--input,-i> 17 | The raw output from tRNAScan-SE: 18 | 19 | Sequence tRNA Bounds tRNA Anti Intron Bounds Cove 20 | Name tRNA # Begin End Type Codon Begin End Score 21 | -------- ------ ---- ------ ---- ----- ----- ---- ------ 22 | tp.assembly.567468735.1 1 91820 91902 Tyr GTA 91857 91866 66.58 23 | tp.assembly.567468735.1 2 171777 171849 Phe GAA 0 0 70.28 24 | tp.assembly.567468735.1 3 172144 172215 His GTG 0 0 64.04 25 | tp.assembly.567468735.1 4 852847 852919 Thr AGT 0 0 75.69 26 | tp.assembly.567468735.1 5 877291 877362 Trp CCA 0 0 68.97 27 | tp.assembly.567468735.1 6 1468229 1468300 Cys GCA 0 0 72.10 28 | tp.assembly.567468735.1 7 2507459 2507530 Pro AGG 0 0 62.33 29 | tp.assembly.567468735.1 8 2507198 2507127 Pro CGG 0 0 65.73 30 | tp.assembly.567468735.1 9 2506317 2506246 Pro TGG 0 0 66.60 31 | tp.assembly.567468735.1 10 2463785 2463713 Lys TTT 0 0 79.47 32 | tp.assembly.567468735.1 11 2191149 2191069 Leu CAG 0 0 57.47 33 | tp.assembly.567468735.1 12 1633307 1633237 Gly CCC 0 0 65.52 34 | tp.assembly.567468735.1 13 1255051 1254968 Leu CAA 0 0 60.46 35 | tp.assembly.567468735.1 14 251108 251037 Asp GTC 0 0 59.48 36 | tp.assembly.567468735.1 15 250520 250449 Asp GTC 0 0 59.48 37 | 38 | B<--log,-l> 39 | Log file 40 | 41 | B<--help,-h> 42 | This help message 43 | 44 | =head1 DESCRIPTION 45 | 46 | File converter 47 | 48 | =head1 INPUT 49 | 50 | Input above. 
51 | 52 | =head1 OUTPUT 53 | 54 | GFF3 to STDOUT 55 | 56 | =head1 CONTACT 57 | 58 | Kyle Tretina 59 | kyletretina@gmail.com 60 | 61 | =cut 62 | 63 | use warnings; 64 | use strict; 65 | use Getopt::Long qw(:config no_ignore_case no_auto_abbrev pass_through); 66 | use Pod::Usage; 67 | 68 | my %options = (); 69 | my $results = GetOptions (\%options, 70 | 'input|i=s', 71 | 'log|l=s', 72 | 'help|h') || pod2usage(); 73 | 74 | ## display documentation 75 | if( $options{'help'} ){ 76 | pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} ); 77 | } 78 | 79 | ## make sure everything passed was peachy 80 | &check_parameters(\%options); 81 | 82 | ## open the log if requested 83 | my $logfh; 84 | if (defined $options{log}) { 85 | open($logfh, ">$options{log}") || die "can't create log file: $!"; 86 | } 87 | 88 | ## open the input file 89 | my $ifh; 90 | open($ifh, "<$options{input}") || die "can't open input file: $!"; 91 | 92 | # all output needs the gff header 93 | print "##gff-version 3\n"; 94 | 95 | ## globals 96 | my $i=1; 97 | 98 | ## parse the file 99 | foreach my $line (<$ifh>){ 100 | my @cols = split /[\t]/, $line; 101 | chomp @cols; 102 | my $contig = $cols[0]; 103 | 104 | if ($contig =~ /^(.+?)\s+$/) { 105 | $contig = $1; 106 | } 107 | 108 | ## skip the header lines 109 | next if $contig eq 'Sequence' || $contig eq 'Name' || $contig eq '--------'; 110 | 111 | my $start = trim($cols[2]); 112 | my $stop = trim($cols[3]); 113 | my $target = $cols[4]; 114 | my $anticodon = $cols[5]; 115 | my @prod = split '\_', $cols[4]; 116 | my $product; 117 | my $note; 118 | my $length = abs($stop - $start); 119 | if ( $length >= '150' ) { 120 | next; 121 | } 122 | if ( $prod[0] eq "Pseudo") { 123 | next; 124 | #$product = "tRNA-Xxx"; 125 | #$note = "Predicted $anticodon anticodon"; 126 | } 127 | elsif ( $prod[0] eq "Sup") { 128 | next; 129 | #$product = "tRNA-Xxx"; 130 | #$note = "Predicted $anticodon anticodon, putative tRNA Suppressor" 131 | } 132 | elsif ( $prod[0] eq "Undet") { 133 | next; } 134 | else { 135 | $product = "tRNA-$prod[0]"; 136 | $note = "Predicted $anticodon anticodon"; } 137 | my $score = $cols[8]; 138 | if ($start < $stop){ 139 | print "$contig\ttRNAScan-SE\tgene\t$start\t$stop\t$score\t+\t.\tID=$target\_$i\n"; 140 | print "$contig\ttRNAScan-SE\ttRNA\t$start\t$stop\t$score\t+\t.\tID=$target\_$i\_tRNA;Parent=$target\_$i;product=$product;note=$note\n"; 141 | print "$contig\ttRNAScan-SE\texon\t$start\t$stop\t$score\t+\t.\tID=$target\_$i\_exon;Parent=$target\_$i\_tRNA\n"; 142 | $i++; 143 | }else{ 144 | print "$contig\ttRNAScan-SE\tgene\t$stop\t$start\t$score\t-\t.\tID=$target\_$i\n"; 145 | print "$contig\ttRNAScan-SE\ttRNA\t$stop\t$start\t$score\t-\t.\tID=$target\_$i\_tRNA;Parent=$target\_$i;product=$product;note=$note\n"; 146 | print "$contig\ttRNAScan-SE\texon\t$stop\t$start\t$score\t-\t.\tID=$target\_$i\_exon;Parent=$target\_$i\_tRNA\n"; 147 | $i++; 148 | } 149 | } 150 | 151 | exit(0); 152 | 153 | sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s }; 154 | 155 | sub _log { 156 | my $msg = shift; 157 | print $logfh "$msg\n" if $logfh; 158 | } 159 | 160 | sub check_parameters { 161 | my $options = shift; 162 | ## make sure required arguments were passed 163 | my @required = qw( input ); 164 | for my $option ( @required ) { 165 | unless ( defined $$options{$option} ) { 166 | die "--$option is a required option"; 167 | } 168 | } 169 | ## handle some defaults 170 | $options{optional_argument2} = 'foo' unless ($options{optional_argument2}); 171 | } 
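Worked example of the conversion above (illustrative only, assuming the raw tRNAscan-SE output is tab-delimited exactly as this parser expects): for the first sample row shown in the POD — the Tyr tRNA at 91820-91902, anticodon GTA, cove score 66.58 — the main loop would print three forward-strand features per tRNA after the single ##gff-version 3 header (columns tab-separated in the real output):

tp.assembly.567468735.1  tRNAScan-SE  gene  91820  91902  66.58  +  .  ID=Tyr_1
tp.assembly.567468735.1  tRNAScan-SE  tRNA  91820  91902  66.58  +  .  ID=Tyr_1_tRNA;Parent=Tyr_1;product=tRNA-Tyr;note=Predicted GTA anticodon
tp.assembly.567468735.1  tRNAScan-SE  exon  91820  91902  66.58  +  .  ID=Tyr_1_exon;Parent=Tyr_1_tRNA

Note that Pseudo, Sup, and Undet hits are skipped, any hit whose start/end coordinates differ by 150 or more is skipped, and the intron coordinates (input columns 7-8) are ignored — the full tRNA span is reported as a single exon.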
-------------------------------------------------------------------------------- /funannotate/aux_scripts/xmlcombine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | import os.path 5 | import fnmatch 6 | from xml.etree import cElementTree 7 | 8 | cElementTree.register_namespace( 9 | '', "http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5") 10 | 11 | 12 | def run(xml_files): 13 | first = None 14 | for filename in xml_files: 15 | data = cElementTree.parse(filename).getroot() 16 | if first is None: 17 | first = data 18 | else: 19 | first.extend(data) 20 | if first is not None: 21 | print(cElementTree.tostring(first)) 22 | 23 | 24 | if __name__ == "__main__": 25 | xml_files = [os.path.join(dirpath, f) 26 | for dirpath, dirnames, files in os.walk(sys.argv[1]) 27 | for f in fnmatch.filter(files, '*.xml')] 28 | run(xml_files) 29 | -------------------------------------------------------------------------------- /funannotate/config/TruSeq3-PE.fa: -------------------------------------------------------------------------------- 1 | >PrefixPE/1 2 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT 3 | >PrefixPE/2 4 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT -------------------------------------------------------------------------------- /funannotate/config/TruSeq3-SE.fa: -------------------------------------------------------------------------------- 1 | >TruSeq3_IndexedAdapter 2 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC 3 | >TruSeq3_UniversalAdapter 4 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA -------------------------------------------------------------------------------- /funannotate/config/busco_test.fa: -------------------------------------------------------------------------------- 1 | >example 2 | 
ATACGACGTACCCGTGCGTCAATTGCTACGGCGCATGCCTTCTTGTCGAGGGTTTTTTCTGGAAGCGGTCAGAGATGTTGAATAATGATGGCATGAGTTACGAGAAATGCGATTGTATTTTAGCATGCATAGACTATCAACATTGATGTTGTCCACGACTGTCCCTCTCCCGCCGGTGCGGTCATCAAACACATTCCTGCAATAGCTAACAGTAGACGAAATACTCATCACCCACCTACTTATAATCGTAATAGAGGACCGCAACCAAGCATATGGGTTATCCTGGATTGGATAGCTGAGGTCAGAGACCTTTTATTAAGGGTCATAGCCTGTCTACTCCGTAGAACGAAGTAGTGTTGTACCTACAATTATGCTCATTACATACTGTCATAACATAATATATTCTCATGACATCTTGAAAAAAAGATACCTCTAAATATCAAAGTAAAGCCGATTACCAAATACTTCGTTTATGGCTTCTCTGGATATAGATTTCCGCTCTGCGTATTGCCTAAATCGTTAGGGTTCCAAAAGGCCACCTTATCATGAAACAAACCTTGCAAACGCAGAAGATATCAAAACCCATAAGAATGAGTCTTAGAATTATTAATAAATGTTTGTAGTAAAAGAAGGGAGAGCGCTTACATCAATGAGGTTCTGAGCATACTCCCCAAAGCGGACATTTTAGGGCTAGCCCTATTATTCAATTCAATCGGAGATTTCCCCCAAGCTCCGAGGATGAGCTGCGGCAACCACCGGCGACGACCATCGCATACATCTCGTCGCAATGGACATTTCCGATCTTATCGAGCCCCCGCAGAAGCGCCTCAAGACTGAGGATATCTCCAGCGCAGACGAGGTTGTTCTTCCCGCTGGCGGAATCACGCCGCAGACCGACAACGAAATCGACGAGCAGTTATCGAAGGAGATTGAAGTTGGCATCACTGAGTTTGTCAGCGCTGATAATGAGGGTTTCGCGGGGATTTTGAAGAAAAGGTATTCTTAACTGATACGGTTGGGGTTTGATCTGAGTGCTGACTATTGCCAGATACACAGATTTCCTTGTGAACGAGATCCTGCCCTCGGGGAAAGTTCTGCATCTGACGAATACCACTGCACCTAATACCAATGATGAGGCGACTCCAGTCCAGGCAGATAAGAAGCCGGCCGAAGATAAGCCAAAAGAGCCCGAAACTCCCGCAGAGAAGTTGCCTGCTCCAGTTGAGTTTCAATTAGCGGAGGAAGATGAGGCGCTTCTGGACACTTTATTCGGCACCCAAAACACCAAGAAAATTGTCGCCCTCCATAAGAAGGCACTGGCAAATCCAAAGACTAAGCCAAGCGATCTGGGACGATTGAACACAGTCGTTGTCAACGACCGCGATCAGCGCATCAAAATGCACCAGGCAATTCGTCGCATCTTCAATTCGCAGATTGAATCTTCAACAGACAGTGAAGGAATGATGGTTATCTCAGTCGCTGCCAACCGCAACAAGAAGAATCCACAGGGAGGTGGAGGCGGGCGTGAGAGGCCGCGCGTGAATTGGGACGAACTGGGCGGACAGTATCTGCACTTTACTATTTACAAGGAGAACAAGGACACCATGGAGGTCATCTCGTTCATCGCCCGCCAACTGAAGATGAATCCGAAGAGCTTCCAGTTCGCGGGGACCAAAGATCGCCGCGGAGTAACCGTGCAGAGGGCATGCGCTTATCGCTTGCAAGCCGATCGCCTCGCGAAGCTCAATCGAACGCTCCGCAATGCCGTCGTTGGCGACTTCGAATACCAACCTCACGGCCTCGAGCTCGGCGACCTCTATGGGAACGAGTTCGTCGTGACTCTCCGCGAGTGCGAGGTTCCTGGCATCAACATCCAAGACCCCGCATCAGCCGTAGCCAAGACAAAGGAGCTCGTCAACACTTCACTCAAGAACCTCTACCAAAGAGGTTACTTCAACTACTACGGCCTACAACGTTTCGGCTCTTTCGCAACCCGCACTGACACAGTGGGCGTGAAGATACTGCAGGACGACTTCAAGGGCGCCTGCGACGCTATCCTCGACTACAGCCCACACATCCTCGCCGCGGCACAAGCAGAATTAGGCCAGGGCGAAGGCGAAGGCGCCACACCTACCAACATCAGCTCTGAAGATAAAGCACGCGCCCTCGCAATCCACATCTTCCGAACAACTGACCGCGTCACGGACGCTCTCGAAAAAATGCCTCGCAAGTTCTCCGCAGAATCGAACATCATTCGCCACCTCGGCCGGTCAAAGAACGATTACCTTGGCGCCCTGCAGACCATTCCCCGTAATCTCCGCCTCATGTATGTCCACGCCTACCAGTCCCTCGTCTGGAACCTTGCTGTTGGCGAGCGCTGGCGCCTGTACGGCGACCGCGTTGTAGAAGGCGATCTTGTCCTCATCCACGAACACCGCGACAAAGACGGCAACTCTTCCTATACCACACCCGCCCCCGGTGCAGGAGCTAGCGGCGAAACCACTACAATTGACGCAGACGGCGAAATCATTATCGTCCCGCAAGAACACGACTCAGCCTTTGCCGTCGAAGACACATTCACCCGCGCCCGAGCCCTAACCGCCGCCGAAGCGAACAGCGGCCTCTACAGCATCTTCGATATCGTCCTACCTCTCCCTGGCTTCGACGTCCTATACCCGCCAAACAAAATGACGGACTTCTATAAAGAGTTCATGGGTAGCTCCCGCGGCGGCGGATTGGATCCCTTCAACATGCGGAGAAAGTGGAAGGACGCGAGTTTAAGCGGGAGTTATCGAAAAGTTCTTAGTCGGATGGGCAGGGACTACTCTGTTGATGTGGTGCTTTATAGCAGGGATGAGGAGCAGTTTGTCCGGACTGATCTAGAGAATTTGACCCTCAAGACGAGGGATGGAGGGGATGTGGATTTGGAGAAGAAAGAGGGGAAGAGTGAAGGGGATAAGCTTGCTGTTGTCCTCAAGTTCCAGCTTGGATCGAGCCAGTATGCAACCATGGCGTTGAGAGAGTTGATGAGGGGAAAGGTGAAGGCGTATAAGCCAGACTTTGGAGGTGGGCGGTAGACTGTCGTAGCCGCCGTCGTGAACCACTCAGCTCATACGGTGTGTATACAAAGCTACGACCTTATAGGTCTATACATTCTTCATCTAAAACAACCAAATCCGTACACTCAACTTTCTGACTGGATATAGAGCAAGATTATGAACATTAAATAGACACTATATGATCTCGAGAAACCCTTGAACAAATAATCAGGATTAGAACGCTGAGAGTTGACTAGAGGCCGGGTAGGTAACTAACTCCCCCGATGCGCGGCCTGTCGATGCTCATGGTCATCGGCACCGAAACTGGACATAGACGAGATatcatcatcatcatcatcatcatcatcatcCCGGAGCCCTCTATCTCTCATCATCGCTTCGCGATCGTTGATCTCCGACACTACAGACATATTGTCGTCATTGTCATTTGGACGGTCAAATGGTGAACGCGGCCGTGCCGCTGCTTCTCTAGCGCGCTCTTCGGATGTTCGCCAGT
TGAGATCTCGGTCTGTTActgcaactccagctccggctgcagcacctgctACCGGGGCCGCCGATGATGTGCTGTCTACTCCTGTCCCCGGCTCGGTCCCTGGTGGACGGAGATAAGGTGGAGGCGGAGGTAGATTGTCGTTGTTGCTGGGAAACGGGTTACGGTTAACTGGTCCTGGGGCCGGAACGGGGGCGATCTGGGGTTGACTAGGGAGGAAGGCAGAAGACGAAGTGTCTAGTTGGCGTG -------------------------------------------------------------------------------- /funannotate/config/codeml.config: -------------------------------------------------------------------------------- 1 | seqfile = INPUTFILEHERE 2 | treefile = INPUTTREEHERE 3 | outfile = OUTPUTFILEHERE 4 | noisy = 9 * 0,1,2,3,9: how much rubbish on the screen 5 | verbose = 1 * 1: detailed output, 0: concise output 6 | runmode = 0 * 0: user tree; 1: semi-automatic; 2: automatic 7 | * 3: StepwiseAddition; (4,5):PerturbationNNI; -2: pairwise 8 | 9 | seqtype = 1 * 1:codons; 2:AAs; 3:codons-->AAs 10 | CodonFreq = 2 * 0:1/61 each, 1:F1X4, 2:F3X4, 3:codon table 11 | clock = 0 * 0: no clock, unrooted tree, 1: clock, rooted tree 12 | aaDist = 0 * 0:equal, +:geometric; -:linear, {1-5:G1974,Miyata,c,p,v} 13 | model = 0 14 | 15 | NSsites = 0 16 | * 0:one w; 1:NearlyNeutral; 2:PositiveSelection; 3:discrete; 17 | * 4:freqs; 5:gamma;6:2gamma;7:beta;8:beta&w;9:betaγ10:3normal 18 | icode = 0 * 0:standard genetic code; 1:mammalian mt; 2-10:see below 19 | Mgene = 0 * 0:rates, 1:separate; 2:pi, 3:kappa, 4:all 20 | 21 | fix_kappa = 0 * 1: kappa fixed, 0: kappa to be estimated 22 | kappa = 1 * initial or fixed kappa 23 | fix_omega = 0 * 1: omega or omega_1 fixed, 0: estimate 24 | omega = 1 * initial or fixed omega, for codons or codon-based AAs 25 | ncatG = 10 * # of categories in the dG or AdG models of rates 26 | 27 | getSE = 0 * 0: don't want them, 1: want S.E.s of estimates 28 | RateAncestor = 0 * (0,1,2): rates (alpha>0) or ancestral states (1 or 2) 29 | Small_Diff = .45e-6 30 | cleandata = 1 * remove sites with ambiguity data (1:yes, 0:no)? 31 | fix_blength = 0 * 0: ignore, -1: random, 1: initial, 2: fixed 32 | -------------------------------------------------------------------------------- /funannotate/config/extrinsic.E.XNT.RM.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # extrinsic information 3 | # date: 07/28/2018 4 | # Jon Palmer 5 | 6 | # source of extrinsic information: 7 | # M manual anchor (required) 8 | # P protein database hit 9 | # XNT protein homology prediction by exonerate 10 | # E est database hit 11 | # C combined est/protein database hit 12 | # D Dialign 13 | # R retroposed genes 14 | # T transMapped refSeqs 15 | 16 | [SOURCES] 17 | M E XNT RM 18 | 19 | 20 | # 21 | # individual_liability: Only unsatisfiable hints are disregarded. By default this flag is not set 22 | # and the whole hint group is disregarded when one hint in it is unsatisfiable. 23 | # 24 | [SOURCE-PARAMETERS] 25 | XNT individual_liability 26 | 27 | # feature bonus malus gradelevelcolumns 28 | # r+/r- 29 | # 30 | # the gradelevel colums have the following format for each source 31 | # sourcecharacter numscoreclasses boundary ... boundary gradequot ... 
gradequot 32 | # 33 | 34 | [GENERAL] 35 | start 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1 36 | stop 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1 37 | tss 1 1 M 1 1e+100 E 1 10 XNT 1 1 RM 1 1 38 | tts 1 1 M 1 1e+100 E 1 100 XNT 1 1 RM 1 1 39 | ass 1 1 M 1 1e+100 E 1 20 XNT 1 1 RM 1 1 40 | dss 1 1 M 1 1e+100 E 1 20 XNT 1 1 RM 1 1 41 | exonpart 1 1 M 1 1e+100 E 1 1e3 XNT 1 1 RM 1 1 42 | exon 1 1 M 1 1e+100 E 1 5e3 XNT 1 1 RM 1 1 43 | intronpart 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1 44 | intron 1 1 M 1 1e+100 E 1 5e4 XNT 1 1e3 RM 1 1 45 | CDSpart 1 1 M 1 1e+100 E 1 1 XNT 1 1e4 RM 1 1 46 | CDS 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1 47 | UTRpart 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1 48 | UTR 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1 49 | irpart 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1 50 | nonexonpart 1 1 M 1 1e+100 E 1 1 XNT 1 1 RM 1 1.15 51 | 52 | 53 | # chlamy EST score 54 | # 0: public EST 55 | # 1: Chun, no terminus 56 | # 2: Chun, with terminus 57 | # 58 | # Explanation: see original extrinsic.cfg file 59 | # 60 | -------------------------------------------------------------------------------- /funannotate/config/test.sbt: -------------------------------------------------------------------------------- 1 | Submit-block ::= { 2 | contact { 3 | contact { 4 | name name { 5 | last "Palmer", 6 | first "Jonathan" 7 | }, 8 | affil std { 9 | affil "USDA Forest Service", 10 | div "CFMR", 11 | city "Madison", 12 | sub "WI", 13 | country "USA", 14 | street "1 Gifford Pinchot Drive", 15 | email "nextgenusfs@gmail.com", 16 | fax "", 17 | phone "555-555-5555", 18 | postal-code "53726" 19 | } 20 | } 21 | }, 22 | cit { 23 | authors { 24 | names std { 25 | { 26 | name name { 27 | last "Palmer", 28 | first "Jonathan", 29 | initials "J.M.", 30 | suffix "" 31 | } 32 | } 33 | }, 34 | affil std { 35 | affil "USDA Forest Service", 36 | div "CFMR", 37 | city "Madison", 38 | sub "WI", 39 | country "USA", 40 | street "1 Gifford Pinchot Drive", 41 | postal-code "53726" 42 | } 43 | } 44 | }, 45 | subtype new 46 | } 47 | 48 | Seqdesc ::= pub { 49 | pub { 50 | gen { 51 | cit "unpublished", 52 | authors { 53 | names std { 54 | { 55 | name name { 56 | last "Palmer", 57 | first "Jonathan", 58 | initials "J.M.", 59 | suffix "" 60 | } 61 | } 62 | }, 63 | affil std { 64 | affil "USDA Forest Service", 65 | div "CFMR", 66 | city "Madison", 67 | sub "WI", 68 | country "USA", 69 | street "1 Gifford Pinchot Drive", 70 | postal-code "53726" 71 | } 72 | }, 73 | title “Annotate generated by FunAnnotate: fungal automated genome annotation” 74 | } 75 | } 76 | } 77 | 78 | -------------------------------------------------------------------------------- /funannotate/config/tf_interpro.txt: -------------------------------------------------------------------------------- 1 | IPR000967,NF-X1-type zinc finger 2 | IPR006856,Mating-type protein MAT alpha-1 HMG-Box 3 | IPR018501,DDT domain 4 | IPR007196,CCR4-Not complex component 5 | IPR007396,Putative FMN-binding domain 6 | IPR004595,TFIIH C1-like domain 7 | IPR004181,MIZ zinc finger 8 | IPR000818,TEA/ATTS domain family 9 | IPR001387,Helix-turn-helix 10 | IPR001289,CCAAT-binding TF (CBF-B/NF-YA) subunit B 11 | IPR003120,STE-like TF 12 | IPR003150,RFX DNA-binding domain 13 | IPR004198,Zinc finger 14 | IPR007604,CP2 TF 15 | IPR008895,YL1 nuclear protein 16 | IPR010770,SGT1 protein 17 | IPR018004,KilA-N domain 18 | IPR024061,NDT80/PhoG-like DNA-binding family 19 | IPR003656,BED zinc finger 20 | IPR018060,Bacterial regulatory HTH proteins 21 | IPR005011,SART-1 family 22 | IPR002100,SRF-type TF (DNA-binding and dimerization 
domain) 23 | IPR010666,GRF zinc finger 24 | IPR000232,HSF-type DNA-binding 25 | IPR001766,Fork head domain 26 | IPR001356,Homeobox domain 27 | IPR000679,GATA zinc finger 28 | IPR013767,PAS fold 29 | IPR001878,Zinc knuckle (CCHC) 30 | IPR004827,Basic region leucine zipper 2 31 | IPR007889,Helix-turn-helix 32 | IPR011598,Helix-loop-helix DNA-binding domain 33 | IPR001005,Myb-like DNA-binding domain 34 | IPR004827,bZIP TF 1 35 | IPR007087,Zinc finger (C2H2) 36 | IPR001138,Fungal Zn(2)-Cys(6) binuclear cluster domain 37 | IPR007219,Fungal-specific TF domain 38 | -------------------------------------------------------------------------------- /funannotate/database.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import funannotate.library as lib 7 | import funannotate.resources as resources 8 | 9 | 10 | def main(args): 11 | # setup funannotate DB path 12 | try: 13 | FUNDB = os.environ["FUNANNOTATE_DB"] 14 | except KeyError: 15 | print('$FUNANNOTATE_DB not found, run funannotate setup and export ENV variable') 16 | sys.exit(1) 17 | if '--show-outgroups' in args: 18 | try: 19 | files = [f for f in os.listdir(os.path.join(FUNDB, 'outgroups'))] 20 | except OSError: 21 | print(( 22 | 'ERROR: %s/outgroups folder is not found, run funannotate setup.' % FUNDB)) 23 | sys.exit(1) 24 | files = [x.replace('_buscos.fa', '') for x in files] 25 | files = [x for x in files if not x.startswith('.')] 26 | print("-----------------------------") 27 | print("BUSCO Outgroups:") 28 | print("-----------------------------") 29 | print((lib.list_columns(files, cols=3))) 30 | print('') 31 | 32 | elif '--show-buscos' in args: 33 | print("-----------------------------") 34 | print("BUSCO DB tree: (# of models)") 35 | print("-----------------------------") 36 | print((resources.buscoTree)) 37 | else: 38 | dbfile = os.path.join(FUNDB, 'funannotate-db-info.txt') 39 | db_list = [['Database', 'Type', 'Version', 40 | 'Date', 'Num_Records', 'Md5checksum']] 41 | if not os.path.isfile(dbfile): 42 | print('Database is not properly configured, re-run funannotate setup') 43 | sys.exit(1) 44 | with open(dbfile, 'r') as infile: 45 | for line in infile: 46 | line = line.rstrip() 47 | cols = line.split('\t') 48 | del cols[2] 49 | db_list.append(cols) 50 | msg = lib.bold_underline('Funannotate Databases currently installed:') 51 | print(('\n'+msg+'\n')) 52 | lib.print_table(db_list, alignments='LLLLRL', max_col_width=60) 53 | 54 | print(( 55 | '\nTo update a database type:\n\tfunannotate setup -i DBNAME -d {:} --force\n'.format(FUNDB))) 56 | print('To see install BUSCO outgroups type:\n\tfunannotate database --show-outgroups\n') 57 | print('To see BUSCO tree type:\n\tfunannotate database --show-buscos\n') 58 | 59 | 60 | if __name__ == "__main__": 61 | main(sys.argv[1:]) 62 | -------------------------------------------------------------------------------- /funannotate/downloads.json: -------------------------------------------------------------------------------- 1 | { 2 | "downloads": { 3 | "uniprot": "https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz", 4 | "uniprot-release": "https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt", 5 | "merops": "https://ftp.ebi.ac.uk/pub/databases/merops/current_release/meropsscan.lib", 6 | "dbCAN": "https://bcb.unl.edu/dbCAN2/download/Databases/V11/dbCAN-HMMdb-V11.txt", 7 | 
"dbCAN-tsv": "https://bcb.unl.edu/dbCAN2/download/Databases/V11/CAZyDB.08062022.fam-activities.txt", 8 | "dbCAN-log": "https://bcb.unl.edu/dbCAN2/download/Databases/V11/readme.txt", 9 | "pfam": "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz", 10 | "pfam-tsv": "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz", 11 | "pfam-log": "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam.version.gz", 12 | "outgroups": "https://osf.io/r9sne/download?version=1", 13 | "repeats": "https://osf.io/vp87c/download?version=1", 14 | "go-obo": "https://purl.obolibrary.org/obo/go.obo", 15 | "mibig": "https://dl.secondarymetabolites.org/mibig/mibig_prot_seqs_1.4.fasta", 16 | "interpro": "https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/interpro.xml.gz", 17 | "interpro-tsv": "https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/entry.list", 18 | "gene2product": "https://raw.githubusercontent.com/nextgenusfs/gene2product/master/ncbi_cleaned_gene_products.txt" 19 | }, 20 | "busco": { 21 | "fungi": [ 22 | "https://osf.io/xvzmu/download?version=1", 23 | "fungi_odb9" 24 | ], 25 | "microsporidia": [ 26 | "https://osf.io/r47nx/download?version=1", 27 | "microsporidia_odb9" 28 | ], 29 | "dikarya": [ 30 | "https://osf.io/av6f8/download?version=1", 31 | "dikarya_odb9" 32 | ], 33 | "ascomycota": [ 34 | "https://osf.io/z2736/download?version=1", 35 | "ascomycota_odb9" 36 | ], 37 | "pezizomycotina": [ 38 | "https://osf.io/bj3sm/download?version=1", 39 | "pezizomycotina_odb9" 40 | ], 41 | "eurotiomycetes": [ 42 | "https://osf.io/nvt3z/download?version=1", 43 | "eurotiomycetes_odb9" 44 | ], 45 | "sordariomycetes": [ 46 | "https://osf.io/r24kn/download?version=1", 47 | "sordariomyceta_odb9" 48 | ], 49 | "saccharomycetes": [ 50 | "https://osf.io/mpu2k/download?version=1", 51 | "saccharomyceta_odb9" 52 | ], 53 | "saccharomycetales": [ 54 | "https://osf.io/dhk47/download?version=1", 55 | "saccharomycetales_odb9" 56 | ], 57 | "basidiomycota": [ 58 | "https://osf.io/2xnsj/download?version=1", 59 | "basidiomycota_odb9" 60 | ], 61 | "eukaryota": [ 62 | "https://osf.io/psj2k/download?version=1", 63 | "eukaryota_odb9" 64 | ], 65 | "protists": [ 66 | "https://osf.io/a4tsk/download?version=1", 67 | "protists_ensembl" 68 | ], 69 | "alveolata_stramenophiles": [ 70 | "https://osf.io/waqpe/download?version=1", 71 | "alveolata_stramenophiles_ensembl" 72 | ], 73 | "metazoa": [ 74 | "https://osf.io/5bvam/download?version=1", 75 | "metazoa_odb9" 76 | ], 77 | "nematoda": [ 78 | "https://osf.io/u87d3/download?version=1", 79 | "nematoda_odb9" 80 | ], 81 | "arthropoda": [ 82 | "https://osf.io/w26ez/download?version=1", 83 | "arthropoda_odb9" 84 | ], 85 | "insecta": [ 86 | "https://osf.io/8qsa5/download?version=1", 87 | "insecta_odb9" 88 | ], 89 | "endopterygota": [ 90 | "https://osf.io/pxdqg/download?version=1", 91 | "endopterygota_odb9" 92 | ], 93 | "hymenoptera": [ 94 | "https://osf.io/q4ce6/download?version=1", 95 | "hymenoptera_odb9" 96 | ], 97 | "diptera": [ 98 | "https://osf.io/e2n49/download?version=1", 99 | "diptera_odb9" 100 | ], 101 | "vertebrata": [ 102 | "https://osf.io/w6kf8/download?version=1", 103 | "vertebrata_odb9" 104 | ], 105 | "actinopterygii": [ 106 | "https://osf.io/dj2cw/download?version=1", 107 | "actinopterygii_odb9" 108 | ], 109 | "tetrapoda": [ 110 | "https://osf.io/bp4cf/download?version=1", 111 | "tetrapoda_odb9" 112 | ], 113 | "aves": [ 114 | "https://osf.io/e7qym/download?version=1", 115 | "aves_odb9" 116 | ], 117 | "mammalia": [ 118 | 
"https://osf.io/dvy5m/download?version=1", 119 | "mammalia_odb9" 120 | ], 121 | "euarchontoglires": [ 122 | "https://osf.io/p3nc7/download?version=1", 123 | "euarchontoglires_odb9" 124 | ], 125 | "laurasiatheria": [ 126 | "https://osf.io/2v9hj/download?version=1", 127 | "laurasiatheria_odb9" 128 | ], 129 | "embryophyta": [ 130 | "https://osf.io/m67p4/download?version=1", 131 | "embryophyta_odb9" 132 | ] 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /funannotate/fix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import uuid 7 | import shutil 8 | import argparse 9 | import subprocess 10 | import funannotate.library as lib 11 | 12 | 13 | def main(args): 14 | # setup menu with argparse 15 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 16 | def __init__(self, prog): 17 | super(MyFormatter, self).__init__(prog, max_help_position=48) 18 | parser = argparse.ArgumentParser(prog='fix', usage="%(prog)s [options] -i genome.GBK -t genome.tbl", 19 | description='''Script will update annotation of a Genbank file with new tbl.''', 20 | epilog="""Written by Jon Palmer (2017) nextgenusfs@gmail.com""", 21 | formatter_class=MyFormatter) 22 | parser.add_argument('-i', '--input', required=True, 23 | help='Genome in GBK format') 24 | parser.add_argument('-t', '--tbl', required=True, 25 | help='Genome annotation in NCBI tbl format') 26 | parser.add_argument( 27 | '-d', '--drop', help='List of locus_tag to remove/drop from annotation') 28 | parser.add_argument('-o', '--out', help='Basename of output files') 29 | parser.add_argument('--tbl2asn', default='-l paired-ends', 30 | help='Parameters for tbl2asn, linkage and gap info') 31 | args = parser.parse_args(args) 32 | 33 | parentdir = os.path.join(os.path.dirname(__file__)) 34 | 35 | # create log file 36 | log_name = 'funannotate-fix.log' 37 | if os.path.isfile(log_name): 38 | os.remove(log_name) 39 | 40 | # initialize script, log system info and cmd issue at runtime 41 | lib.setupLogging(log_name) 42 | cmd_args = " ".join(sys.argv)+'\n' 43 | lib.log.debug(cmd_args) 44 | print("-------------------------------------------------------") 45 | lib.SystemInfo() 46 | 47 | # get version of funannotate 48 | version = lib.get_version() 49 | lib.log.info("Running %s" % version) 50 | 51 | # create output and temporary directory 52 | if args.out: 53 | basedir = args.out 54 | else: 55 | # get location from tbl file 56 | basedir = os.path.dirname(args.tbl) 57 | if basedir == '': 58 | basedir = '.' 
59 | 60 | if not os.path.isdir(basedir): 61 | os.makedirs(basedir) 62 | if not os.path.isdir(os.path.join(basedir, 'tbl2asn')): 63 | os.makedirs(os.path.join(basedir, 'tbl2asn')) 64 | 65 | # copy over the annotation file to tbl2asn folder, or process if args.drop passed 66 | if args.drop: 67 | lib.tblfilter(args.tbl, args.drop, os.path.join( 68 | basedir, 'tbl2asn', 'genome.tbl')) 69 | else: 70 | shutil.copyfile(args.tbl, os.path.join( 71 | basedir, 'tbl2asn', 'genome.tbl')) 72 | 73 | # get information info from GBK file 74 | organism, strain, isolate, accession, WGS_accession, gb_gi, version = lib.getGBKinfo( 75 | args.input) 76 | locustag, genenum, justify = lib.getGBKLocusTag(args.input) 77 | if strain: 78 | organism_name = organism+'_'+strain 79 | elif isolate: 80 | organism_name = organism+'_'+isolate 81 | else: 82 | organism_name = organism 83 | organism_name = organism_name.replace(' ', '_') 84 | 85 | # extract fasta file from genbank file, 86 | lib.log.info('Extracting genome sequence and parsing meta information') 87 | contigs, genes, trnas = lib.countGenBank(args.input) 88 | lib.log.info('{:,} contigs containing {:,} protein coding genes and {:,} tRNA genes'.format( 89 | contigs, genes, trnas)) 90 | lib.gb2dna(args.input, os.path.join(basedir, 'tbl2asn', 'genome.fsa')) 91 | 92 | # assuming that this is the predict_results dir or update_results dir, but check first and then archive 93 | if '_results' in basedir: 94 | archivedir = os.path.join(basedir, 'archive_'+str(uuid.uuid4())) 95 | lib.log.info( 96 | 'Found pre-existing funannotate files, archiving to %s' % archivedir) 97 | os.makedirs(archivedir) 98 | # move files in results to archive dir 99 | for file in os.listdir(basedir): 100 | if 'pasa-reannotation' in file or 'WGS_accession' in file or 'ncbi.p2g' in file or '.parameters.json' in file: 101 | continue 102 | if os.path.isfile(os.path.join(basedir, file)): 103 | os.rename(os.path.join(basedir, file), 104 | os.path.join(archivedir, file)) 105 | 106 | # now we can run tbl2asn 107 | SBT = os.path.join(parentdir, 'config', 'test.sbt') 108 | discrep = os.path.join(basedir, organism_name+'.discrepency.txt') 109 | if not version: 110 | version = 1 111 | lib.log.info('Converting to GenBank format') 112 | # have to run as subprocess because of multiprocessing issues 113 | cmd = [sys.executable, os.path.join(parentdir, 'aux_scripts', 'tbl2asn_parallel.py'), 114 | '-i', os.path.join(basedir, 'tbl2asn', 'genome.tbl'), '-f', os.path.join( 115 | basedir, 'tbl2asn', 'genome.fsa'), 116 | '-o', os.path.join(basedir, 'tbl2asn'), '--sbt', SBT, '-d', discrep, 117 | '-s', organism, '-t', args.tbl2asn, '-v', str(version), '-c', '4'] 118 | if isolate: 119 | cmd += ['--isolate', isolate] 120 | if strain: 121 | cmd += ['--strain', strain] 122 | lib.log.debug(' '.join(cmd)) 123 | subprocess.call(cmd) 124 | 125 | # now get GBK files from folder 126 | lib.log.info('Generating output files.') 127 | # setup final output files 128 | final_fasta = os.path.join(basedir, organism_name + '.scaffolds.fa') 129 | final_gff = os.path.join(basedir, organism_name + '.gff3') 130 | final_gbk = os.path.join(basedir, organism_name + '.gbk') 131 | final_tbl = os.path.join(basedir, organism_name + '.tbl') 132 | final_proteins = os.path.join(basedir, organism_name + '.proteins.fa') 133 | final_transcripts = os.path.join( 134 | basedir, organism_name + '.mrna-transcripts.fa') 135 | final_cds_transcripts = os.path.join( 136 | basedir, organism_name + '.cds-transcripts.fa') 137 | final_validation = os.path.join(basedir, 
organism_name+'.validation.txt') 138 | final_error = os.path.join(basedir, organism_name+'.error.summary.txt') 139 | final_fixes = os.path.join( 140 | basedir, organism_name+'.models-need-fixing.txt') 141 | 142 | # retrieve files/reorganize 143 | shutil.copyfile(os.path.join(basedir, 'tbl2asn', 'genome.gbf'), final_gbk) 144 | shutil.copyfile(os.path.join(basedir, 'tbl2asn', 'genome.tbl'), final_tbl) 145 | shutil.copyfile(os.path.join(basedir, 'tbl2asn', 146 | 'genome.val'), final_validation) 147 | shutil.copyfile(os.path.join(basedir, 'tbl2asn', 148 | 'errorsummary.val'), final_error) 149 | lib.tbl2allout(final_tbl, os.path.join(basedir, 'tbl2asn', 'genome.fsa'), final_gff, 150 | final_proteins, final_transcripts, final_cds_transcripts, final_fasta) 151 | errors = lib.ncbiCheckErrors( 152 | final_error, final_validation, locustag, final_fixes) 153 | if errors > 0: 154 | lib.log.info("Manually edit the tbl file %s, then run:\n\nfunannotate fix -i %s -t %s\n" % 155 | (final_tbl, final_gbk, final_tbl)) 156 | else: 157 | contigs, genes, trnas = lib.countGenBank(final_gbk) 158 | lib.log.info('Output genome consists of: {:,} contigs containing {:,} protein coding genes and {:,} tRNA genes'.format( 159 | contigs, genes, trnas)) 160 | 161 | # clean up 162 | shutil.rmtree(os.path.join(basedir, 'tbl2asn')) 163 | 164 | 165 | if __name__ == "__main__": 166 | main(sys.argv[1:]) 167 | -------------------------------------------------------------------------------- /funannotate/html_template/css/starter-template.css: -------------------------------------------------------------------------------- 1 | body { 2 | padding-top: 50px; 3 | } 4 | .starter-template { 5 | padding: 40px 15px; 6 | text-align: center; 7 | } 8 | .table { 9 | padding: 40px 15px; 10 | text-align: left; 11 | } 12 | .center-table { 13 | padding: 40px 15px; 14 | text-align: center !important;; 15 | } 16 | .table td { 17 | text-align: center; 18 | vertical-align: middle; 19 | } 20 | -------------------------------------------------------------------------------- /funannotate/html_template/js/ie-emulation-modes-warning.js: -------------------------------------------------------------------------------- 1 | // NOTICE!! DO NOT USE ANY OF THIS JAVASCRIPT 2 | // IT'S JUST JUNK FOR OUR DOCS! 3 | // ++++++++++++++++++++++++++++++++++++++++++ 4 | /*! 5 | * Copyright 2014-2015 Twitter, Inc. 6 | * 7 | * Licensed under the Creative Commons Attribution 3.0 Unported License. For 8 | * details, see https://creativecommons.org/licenses/by/3.0/. 9 | */ 10 | // Intended to prevent false-positive bug reports about Bootstrap not working properly in old versions of IE due to folks testing using IE's unreliable emulation modes. 11 | (function () { 12 | 'use strict'; 13 | 14 | function emulatedIEMajorVersion() { 15 | var groups = /MSIE ([0-9.]+)/.exec(window.navigator.userAgent) 16 | if (groups === null) { 17 | return null 18 | } 19 | var ieVersionNum = parseInt(groups[1], 10) 20 | var ieMajorVersion = Math.floor(ieVersionNum) 21 | return ieMajorVersion 22 | } 23 | 24 | function actualNonEmulatedIEMajorVersion() { 25 | // Detects the actual version of IE in use, even if it's in an older-IE emulation mode. 
26 | // IE JavaScript conditional compilation docs: https://msdn.microsoft.com/library/121hztk3%28v=vs.94%29.aspx 27 | // @cc_on docs: https://msdn.microsoft.com/library/8ka90k2e%28v=vs.94%29.aspx 28 | var jscriptVersion = new Function('/*@cc_on return @_jscript_version; @*/')() // jshint ignore:line 29 | if (jscriptVersion === undefined) { 30 | return 11 // IE11+ not in emulation mode 31 | } 32 | if (jscriptVersion < 9) { 33 | return 8 // IE8 (or lower; haven't tested on IE<8) 34 | } 35 | return jscriptVersion // IE9 or IE10 in any mode, or IE11 in non-IE11 mode 36 | } 37 | 38 | var ua = window.navigator.userAgent 39 | if (ua.indexOf('Opera') > -1 || ua.indexOf('Presto') > -1) { 40 | return // Opera, which might pretend to be IE 41 | } 42 | var emulated = emulatedIEMajorVersion() 43 | if (emulated === null) { 44 | return // Not IE 45 | } 46 | var nonEmulated = actualNonEmulatedIEMajorVersion() 47 | 48 | if (emulated !== nonEmulated) { 49 | window.alert('WARNING: You appear to be using IE' + nonEmulated + ' in IE' + emulated + ' emulation mode.\nIE emulation modes can behave significantly differently from ACTUAL older versions of IE.\nPLEASE DON\'T FILE BOOTSTRAP BUGS based on testing in IE emulation modes!') 50 | } 51 | })(); 52 | -------------------------------------------------------------------------------- /funannotate/html_template/js/ie10-viewport-bug-workaround.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * IE10 viewport hack for Surface/desktop Windows 8 bug 3 | * Copyright 2014-2015 Twitter, Inc. 4 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 5 | */ 6 | 7 | // See the Getting Started docs for more information: 8 | // http://getbootstrap.com/getting-started/#support-ie10-width 9 | 10 | (function () { 11 | 'use strict'; 12 | 13 | if (navigator.userAgent.match(/IEMobile\/10\.0/)) { 14 | var msViewportStyle = document.createElement('style') 15 | msViewportStyle.appendChild( 16 | document.createTextNode( 17 | '@-ms-viewport{width:auto!important}' 18 | ) 19 | ) 20 | document.querySelector('head').appendChild(msViewportStyle) 21 | } 22 | 23 | })(); 24 | -------------------------------------------------------------------------------- /funannotate/iprscan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import subprocess 7 | 8 | 9 | def main(args): 10 | # little wrapper to run interproscan multiprocessing 11 | # just pass this onto iprscan-local.py 12 | parentdir = os.path.join(os.path.dirname(__file__)) 13 | cmd = [sys.executable, os.path.join( 14 | parentdir, 'aux_scripts', 'iprscan-local.py')] 15 | cmd += args 16 | subprocess.call(cmd) 17 | 18 | 19 | if __name__ == "__main__": 20 | main(sys.argv[1:]) 21 | -------------------------------------------------------------------------------- /funannotate/outgroups.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import shutil 7 | import argparse 8 | from Bio import SeqIO 9 | import funannotate.library as lib 10 | 11 | 12 | def main(args): 13 | # setup menu with argparse 14 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 15 | def __init__(self, prog): 16 | super(MyFormatter, self).__init__(prog, max_help_position=48) 17 | parser = argparse.ArgumentParser(prog='funannotate-predict.py', 
usage="%(prog)s [options] -i genome.fasta", 18 | description='''Script that adds a proteome to the outgroups.''', 19 | epilog="""Written by Jon Palmer (2016) nextgenusfs@gmail.com""", 20 | formatter_class=MyFormatter) 21 | parser.add_argument('-i', '--input', required=True, 22 | help='Proteome in FASTA format') 23 | parser.add_argument('-s', '--species', required=True, 24 | help='Species name "binomial in quotes"') 25 | parser.add_argument('-b', '--busco_db', default='dikarya', choices=['fungi', 'microsporidia', 'dikarya', 'ascomycota', 'pezizomycotina', 'eurotiomycetes', 'sordariomycetes', 'saccharomycetes', 'saccharomycetales', 'basidiomycota', 'eukaryota', 'protists', 26 | 'alveolata_stramenophiles', 'metazoa', 'nematoda', 'arthropoda', 'insecta', 'endopterygota', 'hymenoptera', 'diptera', 'vertebrata', 'actinopterygii', 'tetrapoda', 'aves', 'mammalia', 'euarchontoglires', 'laurasiatheria', 'embryophyta'], help='BUSCO database to use') 27 | parser.add_argument('-c', '--cpus', default=2, type=int, 28 | help='Number of CPUs to use') 29 | parser.add_argument('-d', '--database', 30 | help='Path to funannotate database, $FUNANNOTATE_DB') 31 | args = parser.parse_args(args) 32 | 33 | if args.database: 34 | FUNDB = args.database 35 | else: 36 | try: 37 | FUNDB = os.environ["FUNANNOTATE_DB"] 38 | except KeyError: 39 | lib.log.error( 40 | 'Funannotate database not properly configured, run funannotate setup.') 41 | sys.exit(1) 42 | 43 | parentdir = os.path.join(os.path.dirname(__file__)) 44 | 45 | # get base name 46 | species = args.species.replace(' ', '_').lower()+'.'+args.busco_db 47 | OUTGROUPS = os.path.join(FUNDB, 'outgroups') 48 | 49 | # create log file 50 | log_name = species+'-add2outgroups.log' 51 | if os.path.isfile(log_name): 52 | os.remove(log_name) 53 | 54 | # initialize script, log system info and cmd issue at runtime 55 | lib.setupLogging(log_name) 56 | cmd_args = " ".join(sys.argv)+'\n' 57 | lib.log.debug(cmd_args) 58 | print("-------------------------------------------------------") 59 | lib.SystemInfo() 60 | 61 | # get version of funannotate 62 | version = lib.get_version() 63 | lib.log.info("Running %s" % version) 64 | 65 | # check buscos, download if necessary 66 | if not os.path.isdir(os.path.join(FUNDB, args.busco_db)): 67 | lib.log.error("%s busco database is missing, install with funannotate setup -b %s" % 68 | (args.busco_db, args.busco_db)) 69 | sys.exit(1) 70 | 71 | ProtCount = lib.countfasta(args.input) 72 | lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded') 73 | 74 | # convert to proteins and screen with busco 75 | lib.log.info("Looking for BUSCO models with %s DB" % args.busco_db) 76 | BUSCODB = os.path.join(FUNDB, args.busco_db) 77 | BUSCO = os.path.join(parentdir, 'aux_scripts', 'funannotate-BUSCO2.py') 78 | cmd = [sys.executable, BUSCO, '-i', os.path.abspath( 79 | args.input), '-m', 'proteins', '--lineage', BUSCODB, '-o', species, '--cpu', str(args.cpus), '-f'] 80 | lib.runSubprocess(cmd, '.', lib.log) 81 | 82 | # check that it ran correctly 83 | busco_results = os.path.join('run_'+species, 'full_table_'+species+'.tsv') 84 | if not lib.checkannotations(busco_results): 85 | lib.log.error("BUSCO failed, check logfile") 86 | sys.exit(1) 87 | nameChange = {} 88 | with open(busco_results, 'rU') as input: 89 | for line in input: 90 | if line.startswith('#'): 91 | continue 92 | cols = line.split('\t') 93 | if cols[1] == 'Complete': 94 | if not cols[2] in nameChange: 95 | nameChange[cols[2]] = cols[0] 96 | else: 97 | lib.log.error( 98 | "Duplicate ID 
found: %s %s. Removing from results" % (cols[2], cols[0])) 99 | del nameChange[cols[2]] 100 | 101 | # output counts 102 | lib.log.info('{0:,}'.format(len(nameChange)) + ' BUSCO models found') 103 | 104 | # index the proteome for parsing 105 | SeqRecords = SeqIO.to_dict(SeqIO.parse(args.input, 'fasta')) 106 | 107 | # setup output proteome 108 | busco_out = os.path.join(OUTGROUPS, species+'_buscos.fa') 109 | with open(busco_out, 'w') as output: 110 | for k, v in list(nameChange.items()): 111 | rec = SeqRecords[k] 112 | output.write('>%s\n%s\n' % (v, rec.seq)) 113 | lib.log.info("Results written to: %s" % busco_out) 114 | 115 | # clean up your mess 116 | shutil.rmtree('run_'+species) 117 | shutil.rmtree('tmp') 118 | 119 | 120 | if __name__ == "__main__": 121 | main(sys.argv[1:]) 122 | -------------------------------------------------------------------------------- /funannotate/sort.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import absolute_import, division, print_function, unicode_literals 5 | 6 | import sys 7 | import argparse 8 | from Bio.SeqIO.FastaIO import SimpleFastaParser 9 | from funannotate.library import countfasta, softwrap 10 | 11 | 12 | def SortRenameHeaders(input, basename, output, minlen=0, simplify=False): 13 | Seqs = [] 14 | with open(input, "r") as infile: 15 | for header, sequence in SimpleFastaParser(infile): 16 | Seqs.append((header, len(sequence), sequence)) 17 | # sort by length 18 | sortedSeqs = sorted(Seqs, key=lambda x: x[1], reverse=True) 19 | # loop through and return contigs and keepers 20 | counter = 1 21 | with open(output, "w") as outfile: 22 | for name, length, seq in sortedSeqs: 23 | if simplify: # try to just split at first space 24 | if " " in name: 25 | newName = name.split(" ")[0] 26 | else: 27 | newName = name 28 | else: 29 | newName = f"{basename}_{counter}" 30 | if len(newName) > 16: 31 | print( 32 | f"Error. 
{newName} fasta header too long.", 33 | "Choose a different --base name.", 34 | "NCBI/GenBank max is 16 characters.", 35 | ) 36 | raise SystemExit(1) 37 | if minlen > 0: 38 | if length >= minlen: 39 | # ony write if length 40 | outfile.write(">{:}\n{:}\n".format(newName, softwrap(seq))) 41 | else: 42 | # always write if we aren't filtering by length 43 | outfile.write(">{:}\n{:}\n".format(newName, softwrap(seq))) 44 | counter += 1 45 | 46 | 47 | def main(args): 48 | # setup menu with argparse 49 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 50 | def __init__(self, prog): 51 | super(MyFormatter, self).__init__(prog, max_help_position=48) 52 | 53 | parser = argparse.ArgumentParser( 54 | prog="sort_rename.py", 55 | usage="%(prog)s [options] -i genome.fa -o sorted.fa", 56 | description="Script that sorts input by length and then renames contig headers.", 57 | epilog="""Written by Jon Palmer (2016) nextgenusfs@gmail.com""", 58 | formatter_class=MyFormatter, 59 | ) 60 | parser.add_argument("-i", "--input", required=True, help="Multi-fasta genome file") 61 | parser.add_argument("-o", "--out", required=True, help="Cleaned output (FASTA)") 62 | parser.add_argument( 63 | "-b", "--base", default="scaffold", help="Basename of contig header" 64 | ) 65 | parser.add_argument( 66 | "-s", 67 | "--simplify", 68 | action="store_true", 69 | help="Try to simplify headers, split at first space", 70 | ) 71 | parser.add_argument( 72 | "-m", "--minlen", type=int, help="Contigs shorter than threshold are discarded" 73 | ) 74 | args = parser.parse_args(args) 75 | 76 | print(("{:,} contigs records loaded".format(countfasta(args.input)))) 77 | print("Sorting and renaming contig headers") 78 | if args.minlen: 79 | print(("Removing contigs less than {:} bp".format(args.minlen))) 80 | SortRenameHeaders( 81 | args.input, args.base, args.out, minlen=args.minlen, simplify=args.simplify 82 | ) 83 | print(("{:,} contigs saved to file".format(countfasta(args.out)))) 84 | 85 | 86 | if __name__ == "__main__": 87 | main(sys.argv[1:]) 88 | -------------------------------------------------------------------------------- /funannotate/species.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import argparse 7 | from natsort import natsorted 8 | import json 9 | import shutil 10 | import funannotate.library as lib 11 | 12 | 13 | def speciesAvailable(dir): 14 | # return dictionary of species name and path to info.json file 15 | Results = {} 16 | for f in os.listdir(dir): 17 | ff = os.path.join(dir, f) 18 | if os.path.isdir(ff) and lib.checkannotations(os.path.join(ff, 'info.json')): 19 | with open(os.path.join(ff, 'info.json')) as infile: 20 | data = json.load(infile) 21 | Results[f] = data 22 | return Results 23 | 24 | 25 | def showAll(dir): 26 | Table = [] 27 | TableHeader = ['Species', 'Augustus', 'GeneMark', 28 | 'Snap', 'GlimmerHMM', 'CodingQuarry', 'Date'] 29 | for f in os.listdir(dir): 30 | ff = os.path.join(dir, f) 31 | if os.path.isdir(ff) and lib.checkannotations(os.path.join(ff, 'info.json')): 32 | with open(os.path.join(ff, 'info.json')) as infile: 33 | data = json.load(infile) 34 | sources = [f] 35 | for x in ['augustus', 'genemark', 'snap', 'glimmerhmm', 'codingquarry']: 36 | if x in data: 37 | if len(data[x][0]) < 1: 38 | sources.append('None') 39 | else: 40 | sourceFile = data[x][0]['source'] 41 | if ': ' in sourceFile: 42 | sourceFile = sourceFile.split(':')[0] 43 | 
sources.append(sourceFile) 44 | sources.append(data['augustus'][0]['date']) 45 | Table.append(sources) 46 | Table = natsorted(Table, key=lambda x: x[0]) 47 | Table.insert(0, TableHeader) 48 | lib.print_table(Table, max_col_width=40) 49 | 50 | 51 | def copyDir(src, dest): 52 | try: 53 | shutil.copytree(src, dest) 54 | # Directories are the same 55 | except shutil.Error as e: 56 | print(('Directory not copied. Error: %s' % e)) 57 | # Any error saying that the directory doesn't exist 58 | except OSError as e: 59 | print(('Directory not copied. Error: %s' % e)) 60 | 61 | 62 | def main(args): 63 | # setup menu with argparse 64 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 65 | def __init__(self, prog): 66 | super(MyFormatter, self).__init__(prog, max_help_position=48) 67 | parser = argparse.ArgumentParser(prog='species.py', 68 | description='''Script to show/update species training parameters.''', 69 | epilog="""Written by Jon Palmer (2018) nextgenusfs@gmail.com""", 70 | formatter_class=MyFormatter) 71 | parser.add_argument('-s', '--species', help='Species name to show/update') 72 | parser.add_argument('-a', '--add', '--add-parameters', 73 | dest='add', help='Parameter JSON file to add to database') 74 | parser.add_argument('-p', '--parameters', dest='parameters', 75 | help='Parameter JSON file to add to database') 76 | parser.add_argument('-d', '--database', 77 | help='Path to funannotate database, $FUNANNOTATE_DB') 78 | args = parser.parse_args(args) 79 | 80 | # setup funannotate DB path 81 | if args.database: 82 | FUNDB = args.database 83 | else: 84 | try: 85 | FUNDB = os.environ["FUNANNOTATE_DB"] 86 | except KeyError: 87 | print('Funannotate database not properly configured, run funannotate setup.') 88 | sys.exit(1) 89 | 90 | # process input here 91 | if args.parameters: # just pretty-print JSON file 92 | with open(args.parameters) as input: 93 | table = json.load(input) 94 | print((json.dumps(table, indent=3))) 95 | elif args.species and args.add: # have one to add to database 96 | SpFound = speciesAvailable(os.path.join(FUNDB, 'trained_species')) 97 | if not os.access(os.path.join(FUNDB, 'trained_species'), os.W_OK | os.X_OK): 98 | print(('ERROR: you do not have permissions to write to {:}'.format( 99 | os.path.join(FUNDB, 'trained_species')))) 100 | sys.exit(1) 101 | if args.species in SpFound: 102 | print(('ERROR: {:} is already in database, choose a different name or delete existing to continue'.format( 103 | args.species))) 104 | sys.exit(1) 105 | print(('Adding {:} to Database'.format(args.species))) 106 | newLoc = os.path.abspath(os.path.join( 107 | FUNDB, 'trained_species', args.species)) 108 | if not os.path.isdir(newLoc): 109 | os.makedirs(newLoc) 110 | with open(args.add) as infile: 111 | data = json.load(infile) 112 | for x in data: 113 | if 'path' not in data[x][0]: 114 | continue 115 | newPath = os.path.join( 116 | newLoc, os.path.basename(data[x][0]['path'])) 117 | if os.path.isdir(data[x][0]['path']): 118 | copyDir(data[x][0]['path'], newPath) 119 | elif os.path.isfile(data[x][0]['path']): 120 | shutil.copyfile(data[x][0]['path'], newPath) 121 | data[x][0]['path'] = os.path.abspath(newPath) 122 | # print new data to terminal 123 | print(('Following training data added for {:}'.format(args.species))) 124 | print((json.dumps(data, indent=3))) 125 | with open(os.path.join(newLoc, 'info.json'), 'w') as outfile: 126 | json.dump(data, outfile) 127 | 128 | elif args.species: # look for in database and pretty-print JSON file 129 | SpFound = 
speciesAvailable(os.path.join(FUNDB, 'trained_species')) 130 | if args.species in SpFound: 131 | print((json.dumps(SpFound[args.species], indent=3))) 132 | else: 133 | print(('{:} not found in Funannotate trained species folder'.format( 134 | args.species))) 135 | print('Valid species are:') 136 | showAll(os.path.join(FUNDB, 'trained_species')) 137 | else: 138 | # just show all available species in the database and their training data 139 | showAll(os.path.join(FUNDB, 'trained_species')) 140 | # row_str = colour(row_str, header_format) 141 | print('\n') 142 | print((lib.colour('Options for this script:', 'bold'))) 143 | print((lib.colour(' To print a parameter file to terminal:', 'none'))) 144 | print((lib.colour(' funannotate species -p myparameters.json', 'dim'))) 145 | print((lib.colour( 146 | ' To print the parameters details from a species in the database:', 'none'))) 147 | print((lib.colour(' funannotate species -s aspergillus_fumigatus', 'dim'))) 148 | print((lib.colour(' To add a new species to database:', 'none'))) 149 | print((lib.colour( 150 | ' funannotate species -s new_species_name -a new_species_name.parameters.json\n', 'dim'))) 151 | 152 | 153 | if __name__ == "__main__": 154 | main(sys.argv[1:]) 155 | -------------------------------------------------------------------------------- /funannotate/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextgenusfs/funannotate/033a883081a83a161798ecc17eaf77b16b5c552b/funannotate/utilities/__init__.py -------------------------------------------------------------------------------- /funannotate/utilities/bam2gff3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import funannotate.library as lib 7 | 8 | 9 | def main(args): 10 | # setup menu with argparse 11 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 12 | def __init__(self, prog): 13 | super(MyFormatter, self).__init__(prog, max_help_position=48) 14 | parser = argparse.ArgumentParser(prog='bam2gff3.py', 15 | description='''Script to convert BAM to GFF3.''', 16 | epilog="""Written by Jon Palmer (2018) nextgenusfs@gmail.com""", 17 | formatter_class=MyFormatter) 18 | parser.add_argument('-i', '--bam', required=True, help='input BAM') 19 | parser.add_argument('-o', '--output', required=True, help='Output GFF3') 20 | args = parser.parse_args(args) 21 | 22 | # convert BAM to gff3 23 | lib.bam2gff3(args.bam, args.output) 24 | 25 | 26 | if __name__ == "__main__": 27 | main(sys.argv[1:]) 28 | -------------------------------------------------------------------------------- /funannotate/utilities/gbk2parts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import funannotate.library as lib 7 | 8 | 9 | def main(args): 10 | # setup menu with argparse 11 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 12 | def __init__(self, prog): 13 | super(MyFormatter, self).__init__(prog, max_help_position=48) 14 | parser = argparse.ArgumentParser(prog='gbk2parts.py', 15 | description='''Script to convert GBK file to its components.''', 16 | epilog="""Written by Jon Palmer (2018) nextgenusfs@gmail.com""", 17 | formatter_class=MyFormatter) 18 | parser.add_argument('-g', '--gbk', required=True, 19 | help='Genome in GenBank format') 20 | 
parser.add_argument('-o', '--output', required=True, 21 | help='Output basename') 22 | args = parser.parse_args(args) 23 | 24 | # setup output files 25 | tblout = f'{args.output}.tbl' 26 | gffout = f'{args.output}.gff3' 27 | protout = f'{args.output}.proteins.fa' 28 | transout = f'{args.output}.mrna-transcripts.fa' 29 | cdsout = f'{args.output}.cds-transcripts.fa' 30 | dnaout = f'{args.output}.scaffolds.fa' 31 | lib.gb2parts(args.gbk, tblout, gffout, protout, transout, cdsout, dnaout) 32 | 33 | if __name__ == "__main__": 34 | main(sys.argv[1:]) 35 | -------------------------------------------------------------------------------- /funannotate/utilities/get_longest_isoform.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys, re, os, gzip, argparse 5 | from Bio import SeqIO 6 | 7 | def main(inargs): 8 | # setup menu with argparse 9 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 10 | def __init__(self, prog): 11 | super(MyFormatter, self).__init__(prog, max_help_position=48) 12 | parser = argparse.ArgumentParser(prog='get_longest_isoform', 13 | description='''Script to extract longest isoform of protein or transcript file from funannotate or where gene is tagged in header.''', 14 | epilog="""Written by Jason Stajich (2022) @hyphaltip""", 15 | formatter_class=MyFormatter) 16 | parser.add_argument('-i', '--input', required=True, 17 | help='fasta formatted transcript or protein file') 18 | parser.add_argument('-o', '--output', help='Output basename') 19 | 20 | parser.add_argument('-v', '--verbose', help='Extra verbose output',dest='verbose', default=False, action='store_true') 21 | 22 | args = parser.parse_args(inargs) 23 | genes = {} 24 | if not args.output: 25 | args.output = args.input + ".longest" 26 | transmatch = re.compile(r'\-T\d+$') 27 | genematch = re.compile(r'gene[:=](\S+)') 28 | recCount = 0 29 | handle = args.input 30 | if args.input.endswith('.gz'): 31 | handle = gzip.open(args.input,"rt") 32 | for rec in SeqIO.parse(handle, "fasta"): 33 | id = rec.id 34 | description = rec.description 35 | geneid = id 36 | m = transmatch.search(id) 37 | if m: 38 | geneid = description.split()[1] 39 | else: 40 | m = genematch.search(description) 41 | if m: 42 | geneid = m.group(1) 43 | if geneid == id: 44 | if args.verbose: 45 | print("Warning: could not parse gene name from header '{}' '{}'".format(id,description)) 46 | if geneid not in genes or len(rec) > len(genes[geneid]): 47 | genes[geneid] = rec 48 | recCount += 1 49 | 50 | print("{} genes and {} total sequences (isoforms) seen".format(len(genes),recCount)) 51 | SeqIO.write(genes.values(),args.output,'fasta') 52 | 53 | if __name__ == "__main__": 54 | main(sys.argv[1:]) 55 | -------------------------------------------------------------------------------- /funannotate/utilities/gff2prot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | from natsort import natsorted 7 | import funannotate.library as lib 8 | 9 | 10 | def main(args): 11 | # setup menu with argparse 12 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 13 | def __init__(self, prog): 14 | super(MyFormatter, self).__init__(prog, max_help_position=48) 15 | parser = argparse.ArgumentParser(prog='gff2prot.py', 16 | description='''Script to convert GFF3 and FASTA proteins.''', 17 | epilog="""Written by Jon Palmer (2018) 
nextgenusfs@gmail.com""", 18 | formatter_class=MyFormatter) 19 | parser.add_argument('-g', '--gff3', required=True, 20 | help='Genome annotation GFF3 format') 21 | parser.add_argument('-f', '--fasta', required=True, 22 | help='Genome in FASTA format') 23 | parser.add_argument('--no_stop', action='store_true', 24 | help='Dont print stop codon') 25 | args = parser.parse_args(args) 26 | 27 | # translate GFF3 to proteins 28 | # load into dictionary 29 | Genes = {} 30 | Genes = lib.gff2dict(args.gff3, args.fasta, Genes) 31 | 32 | for k, v in natsorted(list(Genes.items())): 33 | if v['type'] == 'mRNA': 34 | for i, x in enumerate(v['ids']): 35 | if args.no_stop: 36 | Prot = v['protein'][i].rstrip('*') 37 | else: 38 | Prot = v['protein'][i] 39 | sys.stdout.write('>%s %s\n%s\n' % (x, k, lib.softwrap(Prot))) 40 | 41 | 42 | if __name__ == "__main__": 43 | main(sys.argv[1:]) 44 | -------------------------------------------------------------------------------- /funannotate/utilities/gff_reformat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | from natsort import natsorted 7 | from collections import OrderedDict 8 | import funannotate.library as lib 9 | 10 | 11 | def main(args): 12 | # setup menu with argparse 13 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 14 | def __init__(self, prog): 15 | super(MyFormatter, self).__init__(prog, max_help_position=48) 16 | parser = argparse.ArgumentParser(prog='gff_reformat.py', 17 | description='''Script to rename gene models GFF3 file.''', 18 | epilog="""Written by Jon Palmer (2020) nextgenusfs@gmail.com""", 19 | formatter_class=MyFormatter) 20 | parser.add_argument('-g', '--gff3', required=True, 21 | help='Genome annotation GFF3 format') 22 | parser.add_argument('-f', '--fasta', required=True, 23 | help='Genome in FASTA format') 24 | parser.add_argument('-l', '--locus_tag', default='FUN', 25 | help='Basename of gene names') 26 | parser.add_argument('-n', '--numbering', default=1, type=int, 27 | help='Start numbering at') 28 | parser.add_argument('-o', '--out', required=True, help='Output GFF3') 29 | args = parser.parse_args(args) 30 | 31 | # load into dictionary 32 | Genes = {} 33 | Genes = lib.gff2dict(args.gff3, args.fasta, Genes) 34 | print('Parsed {:,} gene models from {}'.format(len(Genes), args.gff3)) 35 | 36 | # now create ordered dictionary and sort by contig and position 37 | def _sortDict(d): 38 | return (d[1]['contig'], d[1]['location'][0]) 39 | 40 | sGenes = natsorted(iter(Genes.items()), key=_sortDict) 41 | sortedGenes = OrderedDict(sGenes) 42 | renamedGenes = {} 43 | counter = args.numbering 44 | args.locus_tag = args.locus_tag.rstrip('_') 45 | transcripts = 0 46 | for k, v in list(sortedGenes.items()): 47 | locusTag = args.locus_tag+'_'+str(counter).zfill(6) 48 | renamedGenes[locusTag] = v 49 | renamedGenes[locusTag]['gene_synonym'].append(k) 50 | newIds = [] 51 | for i in range(0, len(v['ids'])): 52 | newIds.append('{}-T{}'.format(locusTag, i+1)) 53 | transcripts += 1 54 | renamedGenes[locusTag]['ids'] = newIds 55 | counter += 1 56 | 57 | # write to gff3 58 | lib.dict2gff3(renamedGenes, args.out) 59 | print('Sorted and renamed {:,} gene models {:,} transcripts: {}'.format( 60 | len(renamedGenes), transcripts, args.out)) 61 | 62 | if __name__ == "__main__": 63 | main(sys.argv[1:]) 64 | -------------------------------------------------------------------------------- /funannotate/utilities/quarry2gff3.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | 7 | 8 | def main(args): 9 | # setup menu with argparse 10 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 11 | def __init__(self, prog): 12 | super(MyFormatter, self).__init__(prog, max_help_position=48) 13 | parser = argparse.ArgumentParser(prog='codingquarry2gff3.py', 14 | description='''Script to convert CodingQuarry GFF3 to proper GFF3 format.''', 15 | epilog="""Written by Jon Palmer (2018) nextgenusfs@gmail.com""", 16 | formatter_class=MyFormatter) 17 | parser.add_argument('-i', '--input', required=True, 18 | help='CodingQuarry annotation file') 19 | parser.add_argument('-n', '--numbering', default=1, 20 | type=int, help='Gene numbering starts at') 21 | args = parser.parse_args(args) 22 | 23 | sys.stdout.write(("##gff-version 3\n")) 24 | exonCounts = {} 25 | GeneCount = args.numbering 26 | with open(args.input, 'r') as infile: 27 | for line in infile: 28 | line = line.strip() 29 | contig, source, feature, start, end, score, strand, phase, attributes = line.split( 30 | '\t') 31 | source = 'CodingQuarry' 32 | ID, Parent, Name = (None,)*3 33 | info = attributes.split(';') 34 | for x in info: 35 | if x.startswith('ID='): 36 | ID = x.replace('ID=', '') 37 | elif x.startswith('Parent='): 38 | Parent = x.replace('Parent=', '') 39 | if ID and ' ' in ID: 40 | ID = ID.split(' ')[0] 41 | if Parent and ' ' in Parent: 42 | Parent = Parent.split(' ')[0] 43 | if feature == 'gene': 44 | geneID = 'gene_'+str(GeneCount) 45 | transID = 'transcript_'+str(GeneCount)+'-T1' 46 | # if not ID in geneRef: 47 | # geneRef[ID] = (geneID, transID) 48 | sys.stdout.write('{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\tID={:};Name={:};Alias={:};\n'.format( 49 | contig, source, feature, start, end, score, strand, phase, geneID, geneID, ID)) 50 | sys.stdout.write('{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\tID={:};Parent={:};Alias={:};\n'.format( 51 | contig, source, 'mRNA', start, end, '.', strand, '.', transID, geneID, ID)) 52 | GeneCount += 1 53 | elif feature == 'CDS': 54 | # if trimID in geneRef: 55 | # geneID,transID = geneRef.get(trimID) 56 | if not transID in exonCounts: 57 | exonCounts[transID] = 1 58 | else: 59 | exonCounts[transID] += 1 60 | num = exonCounts.get(transID) 61 | sys.stdout.write('{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\tID={:}.exon{:};Parent={:};\n'.format( 62 | contig, source, 'exon', start, end, '.', strand, '.', transID, num, transID)) 63 | sys.stdout.write('{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\tID={:}.cds;Parent={:};\n'.format( 64 | contig, source, feature, start, end, score, strand, phase, transID, transID)) 65 | 66 | 67 | if __name__ == "__main__": 68 | main(sys.argv[1:]) 69 | -------------------------------------------------------------------------------- /funannotate/utilities/stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import funannotate.library as lib 7 | 8 | 9 | def main(args): 10 | # setup menu with argparse 11 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 12 | def __init__(self, prog): 13 | super(MyFormatter, self).__init__(prog, max_help_position=48) 14 | parser = argparse.ArgumentParser(prog='stats.py', 15 | description='''Script to run some simple genome annotation stats''', 16 | epilog="""Written by Jon Palmer (2020) nextgenusfs@gmail.com""", 
17 | formatter_class=MyFormatter) 18 | parser.add_argument('-f', '--fasta', required=True, 19 | help='Genome in FASTA format') 20 | parser.add_argument('-o', '--out', required=True, 21 | help='JSON output stats file') 22 | parser.add_argument('-g', '--gff', 23 | help='Genome annotation in GFF3 format') 24 | parser.add_argument('-t', '--tbl', 25 | help='Genome annotation in TBL format') 26 | parser.add_argument('--transcript_alignments', 27 | help='transcript alignments in GFF3 format') 28 | parser.add_argument('--protein_alignments', 29 | help='protein alignments in GFF3 format') 30 | args = parser.parse_args(args) 31 | 32 | 33 | if not args.gff and not args.tbl: 34 | print('Warning: no genome annotation passed (-t or -g), will only output genome assembly stats') 35 | elif args.tbl: 36 | print('Generating stats from Genome FASTA file and TBL annotation') 37 | lib.annotation_summary(args.fasta, args.out, tbl=args.tbl, 38 | transcripts=args.transcript_alignments, 39 | proteins=args.protein_alignments) 40 | elif args.gff: 41 | print('Generating stats from Genome FASTA file and GFF3 annotation') 42 | lib.annotation_summary(args.fasta, args.out, gff=args.gff, 43 | transcripts=args.transcript_alignments, 44 | proteins=args.protein_alignments) 45 | print('Finished writing JSON stats file: {}'.format(args.out)) 46 | 47 | 48 | if __name__ == "__main__": 49 | main(sys.argv[1:]) 50 | -------------------------------------------------------------------------------- /funannotate/utilities/stringtie2gff3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import funannotate.library as lib 7 | 8 | 9 | def dict2gff3(input): 10 | from collections import OrderedDict 11 | ''' 12 | function to convert funannotate gene dictionary to gff3 output 13 | ''' 14 | def _sortDict(d): 15 | return (d[1]['contig'], d[1]['location'][0]) 16 | # sort the annotations by contig and start location 17 | sGenes = sorted(iter(input.items()), key=_sortDict) 18 | sortedGenes = OrderedDict(sGenes) 19 | # then loop through and write GFF3 format 20 | sys.stdout.write("##gff-version 3\n") 21 | for k, v in list(sortedGenes.items()): 22 | sys.stdout.write("{:}\t{:}\tgene\t{:}\t{:}\t.\t{:}\t.\tID={:};\n".format( 23 | v['contig'], v['source'], v['location'][0], v['location'][1], v['strand'], k)) 24 | for i in range(0, len(v['ids'])): 25 | # build extra annotations for each transcript if applicable 26 | # now write mRNA feature 27 | sys.stdout.write("{:}\t{:}\t{:}\t{:}\t{:}\t.\t{:}\t.\tID={:};Parent={:};TPM={:}\n".format( 28 | v['contig'], v['source'], v['type'], v['location'][0], v['location'][1], v['strand'], v['ids'][i], k, v['tpm'][i])) 29 | if v['type'] == 'mRNA': 30 | if '5UTR' in v: 31 | # if 5'UTR then write those first 32 | num_5utrs = len(v['5UTR'][i]) 33 | if num_5utrs > 0: 34 | for z in range(0, num_5utrs): 35 | u_num = z + 1 36 | sys.stdout.write("{:}\t{:}\tfive_prime_UTR\t{:}\t{:}\t.\t{:}\t.\tID={:}.utr5p{:};Parent={:};\n".format( 37 | v['contig'], v['source'], v['5UTR'][i][z][0], v['5UTR'][i][z][1], v['strand'], v['ids'][i], u_num, v['ids'][i])) 38 | # write the exons 39 | num_exons = len(v['mRNA'][i]) 40 | for x in range(0, num_exons): 41 | ex_num = x + 1 42 | sys.stdout.write("{:}\t{:}\texon\t{:}\t{:}\t.\t{:}\t.\tID={:}.exon{:};Parent={:};\n".format( 43 | v['contig'], v['source'], v['mRNA'][i][x][0], v['mRNA'][i][x][1], v['strand'], v['ids'][i], ex_num, v['ids'][i])) 44 | # if 3'UTR then write 45 
| if '3UTR' in v: 46 | num_3utrs = len(v['3UTR'][i]) 47 | if num_3utrs > 0: 48 | for z in range(0, num_3utrs): 49 | u_num = z + 1 50 | sys.stdout.write("{:}\t{:}\tthree_prime_UTR\t{:}\t{:}\t.\t{:}\t.\tID={:}.utr3p{:};Parent={:};\n".format( 51 | v['contig'], v['source'], v['3UTR'][i][z][0], v['3UTR'][i][z][1], v['strand'], v['ids'][i], u_num, v['ids'][i])) 52 | if v['type'] == 'mRNA': 53 | num_cds = len(v['CDS'][i]) 54 | # GFF3 phase is 1 less than flat file 55 | current_phase = v['codon_start'][i] - 1 56 | for y in range(0, num_cds): 57 | sys.stdout.write("{:}\t{:}\tCDS\t{:}\t{:}\t.\t{:}\t{:}\tID={:}.cds;Parent={:};\n".format( 58 | v['contig'], v['source'], v['CDS'][i][y][0], v['CDS'][i][y][1], v['strand'], current_phase, v['ids'][i], v['ids'][i])) 59 | current_phase = ( 60 | current_phase - (int(v['CDS'][i][y][1]) - int(v['CDS'][i][y][0]) + 1)) % 3 61 | if current_phase == 3: 62 | current_phase = 0 63 | 64 | 65 | def main(args): 66 | # setup menu with argparse 67 | parser = argparse.ArgumentParser(prog='stringtie2gff.py', 68 | description='''Script to convert StringTie GTF file to GFF3.''', 69 | epilog="""Written by Jon Palmer (2018) nextgenusfs@gmail.com""") 70 | parser.add_argument('-i', '--input', required=True, 71 | help='StringTie GTF file') 72 | args = parser.parse_args(args) 73 | 74 | Genes = lib.gtf2dict(args.input) 75 | dict2gff3(Genes) 76 | 77 | 78 | if __name__ == "__main__": 79 | main(sys.argv[1:]) 80 | -------------------------------------------------------------------------------- /funannotate/utilities/tbl2gbk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import argparse 7 | import shutil 8 | import subprocess 9 | from natsort import natsorted 10 | import funannotate.library as lib 11 | from Bio import SeqIO 12 | 13 | 14 | def runSubprocess(cmd, dir): 15 | proc = subprocess.Popen( 16 | cmd, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 17 | stdout, stderr = proc.communicate() 18 | if stdout: 19 | print(stdout) 20 | 21 | 22 | def runtbl2asn(folder, template, discrepency, organism, isolate, strain, parameters, version): 23 | ''' 24 | function to run NCBI tbl2asn 25 | ''' 26 | # get funannotate version 27 | fun_version = lib.get_version() 28 | # input should be a folder 29 | if not os.path.isdir(folder): 30 | print(("tbl2asn error: %s is not a directory, exiting" % folder)) 31 | sys.exit(1) 32 | # based on organism, isolate, strain, construct meta info for -j flag 33 | if not organism: 34 | print("tbl2asn error: organism not specified") 35 | sys.exit(1) 36 | meta = "[organism=" + organism + "]" 37 | if isolate: 38 | isolate_meta = "[isolate=" + isolate + "]" 39 | meta = meta + " " + isolate_meta 40 | if strain: 41 | strain_meta = "[strain=" + strain + "]" 42 | meta = meta + " " + strain_meta 43 | cmd = ['tbl2asn', '-y', '"Annotated using '+fun_version+'"', '-N', 44 | str(version), '-p', folder, '-t', template, '-M', 'n', '-Z', discrepency, '-j', '"'+meta+'"', '-V', 'b', '-c', 'fx', '-T', '-a', 'r10u', '-l', 'paired-ends'] 45 | # check for custom parameters 46 | if parameters: 47 | params = parameters.split(' ') 48 | cmd = cmd + params 49 | runSubprocess(cmd, '.') 50 | return ' '.join(cmd) 51 | 52 | 53 | def locustagGB(input): 54 | tag = [] 55 | with open(input, 'r') as infile: 56 | for record in SeqIO.parse(infile, 'genbank'): 57 | for f in record.features: 58 | if f.type == 'gene': 59 | locusTag, ID, Parent = lib.getID(f, f.type) 60 | 
tag.append(locusTag) 61 | break 62 | return tag[0].split('_', -1)[0] 63 | 64 | 65 | def ncbiCheckErrors(error, validation, genename, fixOut): 66 | ncbi_error = 0 67 | actual_error = 0 68 | with open(error, 'r') as errors: 69 | for line in errors: 70 | line = line.strip() 71 | if 'ERROR' in line: 72 | num = line.split(' ')[0] 73 | ncbi_error += int(num) 74 | # if errors in summary, then parse validation report, only get errors with gene names 75 | if ncbi_error > 0: 76 | # see if we can get the gene models that need to be fixed 77 | needFixing = {} 78 | with open(validation, 'r') as validationFile: 79 | for line in validationFile: 80 | line = line.strip() 81 | if line.startswith('ERROR') and genename in line: 82 | actual_error += 1 83 | parts = line.split(' ') 84 | for x in parts: 85 | if genename in x: 86 | ID = x.split('|')[-1] 87 | if '-' in ID: 88 | ID = ID.split('-')[0] 89 | reason = line.split(' FEATURE:')[0] 90 | reason = reason.split('] ')[-1] 91 | if not ID in needFixing: 92 | needFixing[ID] = reason 93 | if actual_error > 0: 94 | print(("There are %i gene models that need to be fixed." % 95 | actual_error)) 96 | print('-------------------------------------------------------') 97 | with open(fixOut, 'w') as fix: 98 | fix.write('#GeneID\tError Message\n') 99 | for k, v in natsorted(list(needFixing.items())): 100 | fix.write('%s\t%s\n' % (k, v)) 101 | print(('%s\t%s' % (k, v))) 102 | return actual_error 103 | 104 | 105 | def main(args): 106 | # setup menu with argparse 107 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): 108 | def __init__(self, prog): 109 | super(MyFormatter, self).__init__(prog, max_help_position=48) 110 | parser = argparse.ArgumentParser(prog='gbk2parts.py', 111 | description='''Script to convert GBK file to its components.''', 112 | epilog="""Written by Jon Palmer (2018) nextgenusfs@gmail.com""", 113 | formatter_class=MyFormatter) 114 | parser.add_argument('-i', '--tbl', required=True, 115 | help='Genome annotation in tbl format') 116 | parser.add_argument('-f', '--fasta', required=True, 117 | help='Genome in FASTA format') 118 | parser.add_argument('-s', '--species', required=True, 119 | help='Species name (e.g. "Aspergillus fumigatus") use quotes if there is a space') 120 | parser.add_argument('--isolate', help='Isolate name (e.g. Af293)') 121 | parser.add_argument('--strain', help='Strain name (e.g. 
CEA10)') 122 | parser.add_argument( 123 | '-t', '--tbl2asn', help='Custom parameters for tbl2asn, example: linkage and gap info') 124 | parser.add_argument('--sbt', help='tbl2asn template file') 125 | parser.add_argument('-o', '--output', help='Output basename') 126 | args = parser.parse_args(args) 127 | 128 | parentdir = os.path.dirname(lib.__file__) 129 | 130 | # see if organism/species/isolate was passed at command line 131 | organism = None 132 | if args.species: 133 | organism = args.species 134 | else: 135 | organism = os.path.basename(args.tbl).split('.t')[0] 136 | if args.strain: 137 | organism_name = organism+'_'+args.strain 138 | elif args.isolate: 139 | organism_name = organism+'_'+args.isolate 140 | else: 141 | organism_name = organism 142 | organism_name = organism_name.replace(' ', '_') 143 | if args.output: 144 | outputname = args.output 145 | else: 146 | outputname = organism_name 147 | 148 | # create tmp folder to run tbl2asn from 149 | # make tmp folder 150 | tmp = outputname + '_tmp' 151 | if not os.path.exists(tmp): 152 | os.makedirs(tmp) 153 | 154 | # now move files into proper location 155 | if not lib.checkannotations(args.fasta): 156 | print(('FASTA genome file not found: {:}'.format(args.fasta))) 157 | sys.exit(1) 158 | if not lib.checkannotations(args.tbl): 159 | print(('TBL annotations file not found: {:}'.format(args.tbl))) 160 | sys.exit(1) 161 | shutil.copyfile(args.fasta, os.path.join(tmp, 'genome.fsa')) 162 | shutil.copyfile(args.tbl, os.path.join(tmp, 'genome.tbl')) 163 | 164 | # now we can run tbl2asn 165 | if args.sbt: 166 | SBT = args.sbt 167 | else: 168 | SBT = os.path.join(parentdir, 'config', 'test.sbt') 169 | discrep = outputname+'.discrepency.txt' 170 | version = 1 171 | runtbl2asn(tmp, SBT, discrep, organism, 172 | args.isolate, args.strain, args.tbl2asn, version) 173 | 174 | # check the output for errors for NCBI 175 | final_fixes = os.path.join(tmp, 'models-need-fixing.txt') 176 | prefix = locustagGB(os.path.join(tmp, 'genome.gbf')) 177 | errors = ncbiCheckErrors(os.path.join(tmp, 'errorsummary.val'), os.path.join( 178 | tmp, 'genome.val'), prefix, final_fixes) 179 | 180 | # get output files 181 | gbkout = outputname+'.gbk' 182 | shutil.copyfile(os.path.join(tmp, 'genome.gbf'), gbkout) 183 | sqnout = outputname + '.sqn' 184 | shutil.copyfile(os.path.join(tmp, 'genome.sqn'), sqnout) 185 | if errors < 1: 186 | lib.SafeRemove(tmp) 187 | 188 | 189 | if __name__ == "__main__": 190 | main(sys.argv[1:]) 191 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Note: To use the 'upload' functionality of this file, you must: 5 | # $ pip install twine 6 | 7 | import io 8 | import os 9 | import sys 10 | from shutil import rmtree 11 | 12 | from setuptools import find_packages, setup, Command 13 | 14 | # Package meta-data. 15 | NAME = "funannotate" 16 | DESCRIPTION = "funannotate: eukaryotic genome annotation pipeline" 17 | URL = "https://github.com/nextgenusfs/funannotate" 18 | EMAIL = "nextgenusfs@gmail.com" 19 | AUTHOR = "Jon Palmer" 20 | REQUIRES_PYTHON = ">=3.6.0, <3.12" 21 | VERSION = None 22 | 23 | # What packages are required for this module to be executed? 
24 | REQUIRED = [ 25 | "biopython<1.80", 26 | "goatools", 27 | "seaborn", 28 | "psutil", 29 | "pandas", 30 | "matplotlib", 31 | "natsort", 32 | "numpy", 33 | "requests", 34 | "scikit-learn", 35 | "scipy", 36 | "distro", 37 | ] 38 | 39 | # What packages are optional? 40 | EXTRAS = { 41 | # 'fancy feature': ['django'], 42 | } 43 | 44 | # The rest you shouldn't have to touch too much :) 45 | # ------------------------------------------------ 46 | # Except, perhaps the License and Trove Classifiers! 47 | # If you do change the License, remember to change the Trove Classifier for that! 48 | 49 | here = os.path.abspath(os.path.dirname(__file__)) 50 | 51 | # Import the README and use it as the long-description. 52 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 53 | try: 54 | with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f: 55 | long_description = "\n" + f.read() 56 | except FileNotFoundError: 57 | long_description = DESCRIPTION 58 | 59 | # Load the package's __version__.py module as a dictionary. 60 | about = {} 61 | if not VERSION: 62 | with open(os.path.join(here, NAME, "__version__.py")) as f: 63 | exec(f.read(), about) 64 | else: 65 | about["__version__"] = VERSION 66 | 67 | 68 | class UploadCommand(Command): 69 | """Support setup.py upload.""" 70 | 71 | description = "Build and publish the package." 72 | user_options = [] 73 | 74 | @staticmethod 75 | def status(s): 76 | """Prints things in bold.""" 77 | print(("\033[1m{0}\033[0m".format(s))) 78 | 79 | def initialize_options(self): 80 | pass 81 | 82 | def finalize_options(self): 83 | pass 84 | 85 | def run(self): 86 | try: 87 | self.status("Removing previous builds…") 88 | rmtree(os.path.join(here, "dist")) 89 | except OSError: 90 | pass 91 | 92 | self.status("Building Source and Wheel (universal) distribution…") 93 | os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable)) 94 | 95 | self.status("Uploading the package to PyPI via Twine…") 96 | os.system("twine upload dist/*") 97 | 98 | self.status("Pushing git tags…") 99 | os.system("git tag v{0}".format(about["__version__"])) 100 | os.system("git push --tags") 101 | 102 | sys.exit() 103 | 104 | 105 | # Where the magic happens: 106 | setup( 107 | name=NAME, 108 | version=about["__version__"], 109 | description=DESCRIPTION, 110 | long_description=long_description, 111 | long_description_content_type="text/markdown", 112 | author=AUTHOR, 113 | author_email=EMAIL, 114 | python_requires=REQUIRES_PYTHON, 115 | url=URL, 116 | packages=find_packages(exclude=("tests",)), 117 | entry_points={ 118 | "console_scripts": ["funannotate=funannotate.funannotate:main"], 119 | }, 120 | install_requires=REQUIRED, 121 | extras_require=EXTRAS, 122 | include_package_data=True, 123 | license="BSD-2", 124 | # scripts=['scripts/funannotate'], 125 | classifiers=[ 126 | # Trove classifiers 127 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 128 | "Development Status :: 4 - Beta", 129 | "License :: OSI Approved :: BSD License", 130 | "Programming Language :: Python", 131 | "Operating System :: Unix", 132 | "Intended Audience :: Science/Research", 133 | "Topic :: Scientific/Engineering :: Bio-Informatics", 134 | ], 135 | cmdclass={ 136 | "upload": UploadCommand, 137 | }, 138 | ) 139 | --------------------------------------------------------------------------------
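Editor's note (not part of the repository dump): setup.py above registers a single console entry point, funannotate=funannotate.funannotate:main, and every subcommand and utility module shown earlier (sort.py, species.py, utilities/stats.py, and so on) exposes a main(args) function that accepts an argv-style list, guarded by the usual main(sys.argv[1:]) idiom. The minimal sketch below illustrates, under that assumption, how one of those utilities could be driven programmatically rather than from the command line; it assumes funannotate is installed, and genome.fa, annotation.gff3, and stats.json are hypothetical file names used only for illustration.

# Minimal sketch: calling a funannotate utility programmatically.
# Assumes funannotate is installed and the input files exist (hypothetical names).
# stats.main() parses an argv-style list and calls lib.annotation_summary()
# to write a JSON summary of the assembly and, optionally, its annotation.
from funannotate.utilities import stats

stats.main([
    "-f", "genome.fa",        # genome assembly in FASTA format (required)
    "-g", "annotation.gff3",  # gene models in GFF3 (optional; a TBL file via -t is also accepted)
    "-o", "stats.json",       # output JSON stats file (required)
])

Passing an explicit argument list this way mirrors the command-line behavior, since each module's main() hands the list straight to argparse; the same pattern applies to the other utilities dumped above.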