├── .editorconfig ├── .gitignore ├── .travis.yml ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── docs ├── Makefile ├── authors.rst ├── conf.py ├── contributing.rst ├── history.rst ├── index.rst ├── installation.rst ├── make.bat ├── readme.rst └── usage.rst ├── examples ├── annotations.gff ├── fix_pseudogene.py ├── gff_fix.py ├── gff_valid.py ├── phase_test.gff3 ├── phase_test.gff3.md └── validate.py ├── gff3 ├── __init__.py └── gff3.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py └── test_gff3.py └── tox.ini /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | htmlcov 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | # Complexity 39 | output/*.html 40 | output/*/index.html 41 | 42 | # Sphinx 43 | docs/_build 44 | 45 | # PyCharm 46 | .idea -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | language: python 4 | 5 | python: 6 | - "3.4" 7 | - "3.3" 8 | - "2.7" 9 | - "2.6" 10 | - "pypy" 11 | 12 | # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors 13 | install: pip install -r requirements.txt 14 | 15 | # command to run tests, e.g. python setup.py test 16 | script: python setup.py test 17 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * Han Lin 9 | 10 | Contributors 11 | ------------ 12 | 13 | None yet. Why not be the first? 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributing 3 | ============ 4 | 5 | Contributions are welcome, and they are greatly appreciated! Every 6 | little bit helps, and credit will always be given. 7 | 8 | You can contribute in many ways: 9 | 10 | Types of Contributions 11 | ---------------------- 12 | 13 | Report Bugs 14 | ~~~~~~~~~~~ 15 | 16 | Report bugs at https://github.com/hotdogee/gff3-py/issues. 17 | 18 | If you are reporting a bug, please include: 19 | 20 | * Your operating system name and version. 
21 | * Any details about your local setup that might be helpful in troubleshooting.
22 | * Detailed steps to reproduce the bug.
23 | 
24 | Fix Bugs
25 | ~~~~~~~~
26 | 
27 | Look through the GitHub issues for bugs. Anything tagged with "bug"
28 | is open to whoever wants to implement it.
29 | 
30 | Implement Features
31 | ~~~~~~~~~~~~~~~~~~
32 | 
33 | Look through the GitHub issues for features. Anything tagged with "feature"
34 | is open to whoever wants to implement it.
35 | 
36 | Write Documentation
37 | ~~~~~~~~~~~~~~~~~~~
38 | 
39 | gff3 could always use more documentation, whether as part of the
40 | official gff3 docs, in docstrings, or even on the web in blog posts,
41 | articles, and such.
42 | 
43 | Submit Feedback
44 | ~~~~~~~~~~~~~~~
45 | 
46 | The best way to send feedback is to file an issue at https://github.com/hotdogee/gff3-py/issues.
47 | 
48 | If you are proposing a feature:
49 | 
50 | * Explain in detail how it would work.
51 | * Keep the scope as narrow as possible, to make it easier to implement.
52 | * Remember that this is a volunteer-driven project, and that contributions
53 |   are welcome :)
54 | 
--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
 1 | .. :changelog:
 2 | 
 3 | History
 4 | -------
 5 | 
 6 | 1.0.1 (2021-11-12)
 7 | ---------------------
 8 | 
 9 | * New source distribution uploaded
10 | 
11 | 1.0.0 (2018-12-01)
12 | ---------------------
13 | 
14 | * Fix Python 3 issues
15 | * Added sequence functions: complement(seq) and translate(seq)
16 | * Added fasta write function: fasta_dict_to_file(fasta_dict, fasta_file, line_char_limit=None)
17 | * Added Gff method to return the sequence of line_data: sequence(self, line_data, child_type=None, reference=None)
18 | * Gff.write no longer prints a redundant '###' when the whole gene is marked as removed
19 | 
20 | 
21 | 0.3.0 (2015-03-10)
22 | ---------------------
23 | 
24 | * Fixed phase checking.
25 | 
26 | 0.2.0 (2015-01-28)
27 | ---------------------
28 | 
29 | * Supports python 2.6, 2.7, 3.3, 3.4, pypy.
30 | * Don't report empty attributes as errors.
31 | * Improved documentation.
32 | 
33 | 0.1.0 (2014-12-11)
34 | ---------------------
35 | 
36 | * First release on PyPI.
37 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014, Han Lin
 2 | 
 3 | Permission to use, copy, modify, and/or distribute this software for any
 4 | purpose with or without fee is hereby granted, provided that the above
 5 | copyright notice and this permission notice appear in all copies.
 6 | 
 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
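HISTORY.rst above lists the 1.0.0 sequence helpers by signature only. Below is a minimal usage sketch, assuming ``complement(seq)`` and ``translate(seq)`` are importable from the ``gff3`` package (their exact module location is not shown in this dump) and reusing the ``annotations.gff``/``annotations.fa`` inputs from the README examples:

.. code:: python

    # sequence_demo.py -- hypothetical example, not part of the repository
    from gff3 import Gff3, complement, translate  # import locations assumed

    gff = Gff3('annotations.gff')               # parse and syntax-check the GFF3
    gff.parse_fasta_external('annotations.fa')  # load the reference FASTA
    # pick the first mRNA feature line
    mrna = next(line for line in gff.lines
                if line['line_type'] == 'feature' and line['type'] == 'mRNA')
    # sequence() returns the sequence of a feature; with child_type='CDS' it
    # joins the child CDS segments (signature from HISTORY.rst:
    # sequence(self, line_data, child_type=None, reference=None))
    cds_seq = gff.sequence(mrna, child_type='CDS')
    protein = translate(cds_seq)  # translate the coding sequence
    comp = complement(cds_seq)    # complement of the coding strand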
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.rst 6 | 7 | recursive-include tests * 8 | recursive-exclude * __pycache__ 9 | recursive-exclude * *.py[co] 10 | 11 | recursive-include docs *.rst conf.py Makefile make.bat 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-pyc clean-build docs clean 2 | 3 | help: 4 | @echo "clean - remove all build, test, coverage and Python artifacts" 5 | @echo "clean-build - remove build artifacts" 6 | @echo "clean-pyc - remove Python file artifacts" 7 | @echo "clean-test - remove test and coverage artifacts" 8 | @echo "lint - check style with flake8" 9 | @echo "test - run tests quickly with the default Python" 10 | @echo "test-all - run tests on every Python version with tox" 11 | @echo "coverage - check code coverage quickly with the default Python" 12 | @echo "docs - generate Sphinx HTML documentation, including API docs" 13 | @echo "release - package and upload a release" 14 | @echo "dist - package" 15 | 16 | clean: clean-build clean-pyc clean-test 17 | 18 | clean-build: 19 | rm -fr build/ 20 | rm -fr dist/ 21 | rm -fr *.egg-info 22 | 23 | clean-pyc: 24 | find . -name '*.pyc' -exec rm -f {} + 25 | find . -name '*.pyo' -exec rm -f {} + 26 | find . -name '*~' -exec rm -f {} + 27 | find . -name '__pycache__' -exec rm -fr {} + 28 | 29 | clean-test: 30 | rm -fr .tox/ 31 | rm -f .coverage 32 | rm -fr htmlcov/ 33 | 34 | lint: 35 | flake8 gff3 tests 36 | 37 | test: 38 | python setup.py test 39 | 40 | test-all: 41 | tox 42 | 43 | coverage: 44 | coverage run --source gff3 setup.py test 45 | coverage report -m 46 | coverage html 47 | open htmlcov/index.html 48 | 49 | docs: 50 | rm -f docs/gff3.rst 51 | rm -f docs/modules.rst 52 | sphinx-apidoc -o docs/ gff3 53 | $(MAKE) -C docs clean 54 | $(MAKE) -C docs html 55 | open docs/_build/html/index.html 56 | 57 | release: clean 58 | python setup.py sdist upload 59 | python setup.py bdist_wheel upload 60 | 61 | dist: clean 62 | python setup.py sdist 63 | python setup.py bdist_wheel 64 | ls -l dist 65 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | gff3-py 3 | =============================== 4 | 5 | .. image:: https://badge.fury.io/py/gff3.png 6 | :target: http://badge.fury.io/py/gff3 7 | 8 | .. image:: https://travis-ci.org/hotdogee/gff3-py.png?branch=master 9 | :target: https://travis-ci.org/hotdogee/gff3-py 10 | 11 | .. image:: https://pypip.in/d/gff3/badge.png 12 | :target: https://pypi.python.org/pypi/gff3 13 | 14 | 15 | Manipulate genomic features and validate the syntax and reference sequence of your |GFF3|_ files. 16 | 17 | * Free software: BSD license 18 | * Documentation: https://gff3-py.readthedocs.org. 19 | 20 | Features 21 | -------- 22 | 23 | * **Simple data structures**: Parses a |GFF3|_ file into a structure composed of simple python |dict|_ and |list|_. 24 | * **Validation**: Validates the |GFF3|_ syntax on parse, and saves the error messages in the parsed structure. 
25 | * **Best effort parsing**: Despite any detected errors, continue to parse the whole file and make as much sense of it as possible.
26 | * Uses the python |logging|_ library to log error messages with support for custom loggers.
27 | * Parses embedded or external |FASTA|_ sequences to check bounds and the number of ``N`` s.
28 | * Check and correct the phase for ``CDS`` features.
29 | * Tree traversal methods ``ancestors`` and ``descendants`` return a simple ``list`` in breadth-first search order.
30 | * Transfer children and parents using the ``adopt`` and ``adopted`` methods.
31 | * Test for overlapping features using the ``overlap`` method.
32 | * Remove a feature and its associated features using the ``remove`` method.
33 | * Write the modified structure to a GFF3 file using the ``write`` method.
34 | 
35 | Quick Start
36 | -----------
37 | 
38 | An example that just parses a GFF3 file named ``annotations.gff`` and validates it
39 | using an external FASTA file named ``annotations.fa`` looks like:
40 | 
41 | .. code:: python
42 | 
43 |     # validate.py
44 |     # ============
45 |     from gff3 import Gff3
46 | 
47 |     # initialize a Gff3 object
48 |     gff = Gff3()
49 |     # parse the GFF3 file and do syntax checking; this populates gff.lines and gff.features
50 |     # if an embedded ##FASTA directive is found, parse the sequences into gff.fasta_embedded
51 |     gff.parse('annotations.gff')
52 |     # parse the external FASTA file into gff.fasta_external
53 |     gff.parse_fasta_external('annotations.fa')
54 |     # Check seqid, bounds and the number of Ns in each feature using one or more reference sources
55 |     gff.check_reference(allowed_num_of_n=0, feature_types=['CDS'])
56 |     # Checks whether child features are within the coordinate boundaries of parent features
57 |     gff.check_parent_boundary()
58 |     # Calculates the correct phase and checks if it matches the given phase for CDS features
59 |     gff.check_phase()
60 | 
61 | A more feature-complete GFF3 validator with a command-line interface, which also generates a
62 | validation report in Markdown, is available at ``examples/gff_valid.py``.
63 | 
64 | The following example demonstrates how to filter, traverse, and modify the parsed gff3 ``lines`` list:
65 | 
66 | 1. Change features with type ``exon`` to ``pseudogenic_exon`` and type ``transcript`` to ``pseudogenic_transcript`` if the feature has an ancestor of type ``pseudogene``.
67 | 
68 | 2. If a ``pseudogene`` feature overlaps with a ``gene`` feature, move all of the children from the ``pseudogene`` feature to the ``gene`` feature, and remove the ``pseudogene`` feature.
69 | 
70 | .. code:: python
71 | 
72 |     # fix_pseudogene.py
73 |     # =================
74 |     from gff3 import Gff3
75 |     gff = Gff3('annotations.gff')
76 |     type_map = {'exon': 'pseudogenic_exon', 'transcript': 'pseudogenic_transcript'}
77 |     pseudogenes = [line for line in gff.lines if line['line_type'] == 'feature' and line['type'] == 'pseudogene']
78 |     for pseudogene in pseudogenes:
79 |         # convert types
80 |         for line in gff.descendants(pseudogene):
81 |             if line['type'] in type_map:
82 |                 line['type'] = type_map[line['type']]
83 |         # find overlapping gene
84 |         overlapping_genes = [line for line in gff.lines if line['line_type'] == 'feature' and line['type'] == 'gene' and gff.overlap(line, pseudogene)]
85 |         if overlapping_genes:
86 |             # move pseudogene children to overlapping gene
87 |             gff.adopt(pseudogene, overlapping_genes[0])
88 |             # remove pseudogene
89 |             gff.remove(pseudogene)
90 |     gff.write('annotations_fixed.gff')
91 | 
92 | .. |GFF3| replace:: ``GFF3``
93 | .. 
|dict| replace:: ``dict`` 94 | .. |list| replace:: ``list`` 95 | .. |logging| replace:: ``logging`` 96 | .. |FASTA| replace:: ``FASTA`` 97 | 98 | .. _GFF3: http://www.sequenceontology.org/gff3.shtml 99 | .. _dict: https://docs.python.org/2/tutorial/datastructures.html#dictionaries 100 | .. _list: https://docs.python.org/2/tutorial/datastructures.html#more-on-lists 101 | .. _logging: https://docs.python.org/2/library/logging.html 102 | .. _FASTA: http://en.wikipedia.org/wiki/FASTA_format 103 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. 
The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/gff3.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/gff3.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/gff3" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/gff3" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. 
The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # gff3 documentation build configuration file, created by 5 | # sphinx-quickstart on Tue Jul 9 22:26:36 2013. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import sys 17 | import os 18 | 19 | # If extensions (or modules to document with autodoc) are in another 20 | # directory, add these directories to sys.path here. If the directory is 21 | # relative to the documentation root, use os.path.abspath to make it 22 | # absolute, like shown here. 23 | #sys.path.insert(0, os.path.abspath('.')) 24 | 25 | # Get the project root dir, which is the parent dir of this 26 | cwd = os.getcwd() 27 | project_root = os.path.dirname(cwd) 28 | 29 | # Insert the project root dir as the first element in the PYTHONPATH. 30 | # This lets us ensure that the source package is imported, and that its 31 | # version is used. 32 | sys.path.insert(0, project_root) 33 | 34 | import gff3 35 | 36 | # -- General configuration --------------------------------------------- 37 | 38 | # If your documentation needs a minimal Sphinx version, state it here. 39 | #needs_sphinx = '1.0' 40 | 41 | # Add any Sphinx extension module names here, as strings. They can be 42 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 43 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] 44 | 45 | # Add any paths that contain templates here, relative to this directory. 46 | templates_path = ['_templates'] 47 | 48 | # The suffix of source filenames. 49 | source_suffix = '.rst' 50 | 51 | # The encoding of source files. 52 | #source_encoding = 'utf-8-sig' 53 | 54 | # The master toctree document. 55 | master_doc = 'index' 56 | 57 | # General information about the project. 
58 | project = u'gff3-py' 59 | copyright = u'2014, Han Lin' 60 | 61 | # The version info for the project you're documenting, acts as replacement 62 | # for |version| and |release|, also used in various other places throughout 63 | # the built documents. 64 | # 65 | # The short X.Y version. 66 | version = gff3.__version__ 67 | # The full version, including alpha/beta/rc tags. 68 | release = gff3.__version__ 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | #language = None 73 | 74 | # There are two options for replacing |today|: either, you set today to 75 | # some non-false value, then it is used: 76 | #today = '' 77 | # Else, today_fmt is used as the format for a strftime call. 78 | #today_fmt = '%B %d, %Y' 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | exclude_patterns = ['_build'] 83 | 84 | # The reST default role (used for this markup: `text`) to use for all 85 | # documents. 86 | #default_role = None 87 | 88 | # If true, '()' will be appended to :func: etc. cross-reference text. 89 | #add_function_parentheses = True 90 | 91 | # If true, the current module name will be prepended to all description 92 | # unit titles (such as .. function::). 93 | #add_module_names = True 94 | 95 | # If true, sectionauthor and moduleauthor directives will be shown in the 96 | # output. They are ignored by default. 97 | #show_authors = False 98 | 99 | # The name of the Pygments (syntax highlighting) style to use. 100 | pygments_style = 'sphinx' 101 | 102 | # A list of ignored prefixes for module index sorting. 103 | #modindex_common_prefix = [] 104 | 105 | # If true, keep warnings as "system message" paragraphs in the built 106 | # documents. 107 | #keep_warnings = False 108 | 109 | 110 | # -- Options for HTML output ------------------------------------------- 111 | 112 | # The theme to use for HTML and HTML Help pages. See the documentation for 113 | # a list of builtin themes. 114 | html_theme = 'default' 115 | 116 | # Theme options are theme-specific and customize the look and feel of a 117 | # theme further. For a list of options available for each theme, see the 118 | # documentation. 119 | #html_theme_options = {} 120 | 121 | # Add any paths that contain custom themes here, relative to this directory. 122 | #html_theme_path = [] 123 | 124 | # The name for this set of Sphinx documents. If None, it defaults to 125 | # " v documentation". 126 | #html_title = None 127 | 128 | # A shorter title for the navigation bar. Default is the same as 129 | # html_title. 130 | #html_short_title = None 131 | 132 | # The name of an image file (relative to this directory) to place at the 133 | # top of the sidebar. 134 | #html_logo = None 135 | 136 | # The name of an image file (within the static path) to use as favicon 137 | # of the docs. This file should be a Windows icon file (.ico) being 138 | # 16x16 or 32x32 pixels large. 139 | #html_favicon = None 140 | 141 | # Add any paths that contain custom static files (such as style sheets) 142 | # here, relative to this directory. They are copied after the builtin 143 | # static files, so a file named "default.css" will overwrite the builtin 144 | # "default.css". 145 | html_static_path = ['_static'] 146 | 147 | # If not '', a 'Last updated on:' timestamp is inserted at every page 148 | # bottom, using the given strftime format. 
149 | #html_last_updated_fmt = '%b %d, %Y' 150 | 151 | # If true, SmartyPants will be used to convert quotes and dashes to 152 | # typographically correct entities. 153 | #html_use_smartypants = True 154 | 155 | # Custom sidebar templates, maps document names to template names. 156 | #html_sidebars = {} 157 | 158 | # Additional templates that should be rendered to pages, maps page names 159 | # to template names. 160 | #html_additional_pages = {} 161 | 162 | # If false, no module index is generated. 163 | #html_domain_indices = True 164 | 165 | # If false, no index is generated. 166 | #html_use_index = True 167 | 168 | # If true, the index is split into individual pages for each letter. 169 | #html_split_index = False 170 | 171 | # If true, links to the reST sources are added to the pages. 172 | #html_show_sourcelink = True 173 | 174 | # If true, "Created using Sphinx" is shown in the HTML footer. 175 | # Default is True. 176 | #html_show_sphinx = True 177 | 178 | # If true, "(C) Copyright ..." is shown in the HTML footer. 179 | # Default is True. 180 | #html_show_copyright = True 181 | 182 | # If true, an OpenSearch description file will be output, and all pages 183 | # will contain a tag referring to it. The value of this option 184 | # must be the base URL from which the finished HTML is served. 185 | #html_use_opensearch = '' 186 | 187 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 188 | #html_file_suffix = None 189 | 190 | # Output file base name for HTML help builder. 191 | htmlhelp_basename = 'gff3doc' 192 | 193 | 194 | # -- Options for LaTeX output ------------------------------------------ 195 | 196 | latex_elements = { 197 | # The paper size ('letterpaper' or 'a4paper'). 198 | #'papersize': 'letterpaper', 199 | 200 | # The font size ('10pt', '11pt' or '12pt'). 201 | #'pointsize': '10pt', 202 | 203 | # Additional stuff for the LaTeX preamble. 204 | #'preamble': '', 205 | } 206 | 207 | # Grouping the document tree into LaTeX files. List of tuples 208 | # (source start file, target name, title, author, documentclass 209 | # [howto/manual]). 210 | latex_documents = [ 211 | ('index', 'gff3.tex', 212 | u'gff3-py Documentation', 213 | u'Han Lin', 'manual'), 214 | ] 215 | 216 | # The name of an image file (relative to this directory) to place at 217 | # the top of the title page. 218 | #latex_logo = None 219 | 220 | # For "manual" documents, if this is true, then toplevel headings 221 | # are parts, not chapters. 222 | #latex_use_parts = False 223 | 224 | # If true, show page references after internal links. 225 | #latex_show_pagerefs = False 226 | 227 | # If true, show URL addresses after external links. 228 | #latex_show_urls = False 229 | 230 | # Documents to append as an appendix to all manuals. 231 | #latex_appendices = [] 232 | 233 | # If false, no module index is generated. 234 | #latex_domain_indices = True 235 | 236 | 237 | # -- Options for manual page output ------------------------------------ 238 | 239 | # One entry per manual page. List of tuples 240 | # (source start file, name, description, authors, manual section). 241 | man_pages = [ 242 | ('index', 'gff3', 243 | u'gff3-py Documentation', 244 | [u'Han Lin'], 1) 245 | ] 246 | 247 | # If true, show URL addresses after external links. 248 | #man_show_urls = False 249 | 250 | 251 | # -- Options for Texinfo output ---------------------------------------- 252 | 253 | # Grouping the document tree into Texinfo files. 
List of tuples 254 | # (source start file, target name, title, author, 255 | # dir menu entry, description, category) 256 | texinfo_documents = [ 257 | ('index', 'gff3', 258 | u'gff3-py Documentation', 259 | u'Han Lin', 260 | 'gff3', 261 | 'One line description of project.', 262 | 'Miscellaneous'), 263 | ] 264 | 265 | # Documents to append as an appendix to all manuals. 266 | #texinfo_appendices = [] 267 | 268 | # If false, no module index is generated. 269 | #texinfo_domain_indices = True 270 | 271 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 272 | #texinfo_show_urls = 'footnote' 273 | 274 | # If true, do not generate a @detailmenu in the "Top" node's menu. 275 | #texinfo_no_detailmenu = False 276 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../HISTORY.rst 2 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. gff3 documentation master file, created by 2 | sphinx-quickstart on Tue Jul 9 22:26:36 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to gff3-py's documentation! 7 | ====================================== 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | readme 15 | installation 16 | usage 17 | contributing 18 | authors 19 | history 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | 28 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | At the command line:: 6 | 7 | $ easy_install gff3 8 | 9 | Or, if you have virtualenvwrapper installed:: 10 | 11 | $ mkvirtualenv gff3 12 | $ pip install gff3 13 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. 
epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\gff3.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\gff3.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 
137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Usage 3 | ======== 4 | 5 | To use gff3-py in a project:: 6 | 7 | from gff3 import Gff3 8 | 9 | 10 | .. 
autoclass:: gff3.Gff3
11 |     :members:
--------------------------------------------------------------------------------
/examples/fix_pseudogene.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | # try to import from project first
 3 | from os.path import dirname
 4 | sys.path.insert(1, dirname(dirname(__file__)))
 5 | from gff3 import Gff3
 6 | 
 7 | gff = Gff3('annotations.gff')
 8 | type_map = {'exon': 'pseudogenic_exon', 'transcript': 'pseudogenic_transcript'}
 9 | pseudogenes = [line for line in gff.lines if line['line_type'] == 'feature' and line['type'] == 'pseudogene']
10 | for pseudogene in pseudogenes:
11 |     # convert types
12 |     for line in gff.descendants(pseudogene):
13 |         if line['type'] in type_map:
14 |             line['type'] = type_map[line['type']]
15 |     # find overlapping gene
16 |     overlapping_genes = [line for line in gff.lines if line['line_type'] == 'feature' and line['type'] == 'gene' and gff.overlap(line, pseudogene)]
17 |     if overlapping_genes:
18 |         # move pseudogene children to overlapping gene
19 |         gff.adopt(pseudogene, overlapping_genes[0])
20 |         # remove pseudogene
21 |         gff.remove(pseudogene)
22 | gff.write('annotations_fixed.gff')
--------------------------------------------------------------------------------
/examples/gff_fix.py:
--------------------------------------------------------------------------------
 1 | #! /usr/local/bin/python2.7
 2 | # Copyright (C) 2014 Han Lin
 3 | #
 4 | # This program is free software; you can redistribute it and/or modify
 5 | # it under the terms of the GNU General Public License as published by
 6 | # the Free Software Foundation; either version 3 of the License, or
 7 | # (at your option) any later version.
 8 | 
 9 | """
10 | Check a GFF3 file for errors and unwanted features, with an option to correct the errors and output a valid GFF3 file.
11 | 
12 | Count the number of Ns in each feature and remove features with an N count greater than the specified threshold. (Requires FASTA)
13 | Check and remove features with end coordinates larger than the landmark sequence length. (Requires FASTA or ##sequence-region)
14 | Check if the ##sequence-region matches the FASTA file. (Requires FASTA and ##sequence-region)
15 | Add the ##sequence-region directives if missing. (Requires FASTA)
16 | Check and correct the phase for CDS features.
17 | 
18 | Changelog:
19 | """
20 | 
21 | import sys
22 | from collections import OrderedDict
23 | from collections import defaultdict
24 | from itertools import groupby
25 | from urllib import quote, unquote
26 | from textwrap import wrap
27 | import re
28 | import logging
29 | # try to import from project first
30 | from os.path import dirname
31 | sys.path.insert(1, dirname(dirname(__file__)))
32 | from gff3 import Gff3
33 | 
34 | __version__ = '1.0'
35 | 
36 | 
37 | def query_yes_no(question, default='yes'):
38 |     """Ask a yes/no question via raw_input() and return their answer.
39 | 
40 |     'question' is a string that is presented to the user.
41 |     'default' is the presumed answer if the user just hits <Enter>.
42 |     It must be 'yes' (the default), 'no' or None (meaning
43 |     an answer is required of the user).
44 | 
45 |     The 'answer' return value is one of 'yes' or 'no'.
46 | """ 47 | valid = {'yes': True, 'y': True, 'ye': True, 48 | 'no': False, 'n': False} 49 | if default is None: 50 | prompt = ' [y/n] ' 51 | elif default == 'yes': 52 | prompt = ' [Y/n] ' 53 | elif default == 'no': 54 | prompt = ' [y/N] ' 55 | else: 56 | raise ValueError('invalid default answer: "%s"' % default) 57 | 58 | while True: 59 | sys.stderr.write(question + prompt) 60 | choice = raw_input().strip().lower() 61 | if default is not None and choice == '': 62 | return valid[default] 63 | elif choice in valid: 64 | return valid[choice] 65 | else: 66 | sys.stderr.write('Please respond with "y" or "n".\n') 67 | # gff_valid.py < annotations.gff > annotations.gff.validation_report 68 | # gff_valid.py -g agla_v1_1_NALmod.gff3 > agla_v1_1_NALmod.gff3.validation_report.md 69 | # gff_valid.py -g clec_v1_1_NALmod.gff3 > clec_v1_1_NALmod.gff3.validation_report.md 70 | # gff_valid.py -g ofas_v1_1_NALmod.gff3 > ofas_v1_1_NALmod.gff3.validation_report.md 71 | if __name__ == '__main__': 72 | logger_stderr = logging.getLogger(__name__+'stderr') 73 | logger_stderr.setLevel(logging.INFO) 74 | stderr_handler = logging.StreamHandler() 75 | stderr_handler.setFormatter(logging.Formatter('%(levelname)-8s %(message)s')) 76 | logger_stderr.addHandler(stderr_handler) 77 | logger_null = logging.getLogger(__name__+'null') 78 | null_handler = logging.NullHandler() 79 | logger_null.addHandler(null_handler) 80 | import argparse 81 | from textwrap import dedent 82 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=dedent("""\ 83 | Validate a GFF3 file for syntax and formating errors, parent relationship and reference sequence sanity. 84 | 85 | Features: 86 | 1. Check syntax and formatting according to gff3 version 1.21. 87 | 2. Count the number of Ns greater than the specified threshold (default: 0) in specified feature types (default: CDS). (Requires FASTA) 88 | 3. Check for features with an end coordinates larger than the landmark sequence length. (Requires FASTA or ##sequence-region) 89 | 4. Check if the ##sequence-region matches the FASTA file. (Requires FASTA and ##sequence-region) 90 | 5. Check whether child features are within the coordinate boundaries of parent features. 91 | 6. Check for the correct phase of CDS features. 92 | 93 | Inputs: 94 | 1. GFF3: reads from STDIN by default, may specify the file name with the -g argument 95 | 2. (optional) FASTA: specify the file name with the -f argument, will use the embedded ##FASTA in the GFF3 file if the external FASTA file is not specified 96 | 97 | Outputs: 98 | 1. MarkDown: contains validation summary and detail sections, writes to STDOUT by default, may specify the file name with the -r argument 99 | 100 | Examples: 101 | 1. Use default arguments, inout and output redirection: 102 | %(prog)s < a.gff > a_validation_report.txt 103 | 2. Specify the input, output file names and options using short arguments: 104 | %(prog)s -g a.gff -f a.fa -n 5 -t CDS exon -r a_validation_report.txt 105 | 3. 
Specify the input, output file names and options using long arguments: 106 | %(prog)s --gff_file a.gff --fasta_file a.fa --allowed_num_of_n 0 --check_n_feature_types CDS --report_file a_validation_report.txt 107 | """)) 108 | parser.add_argument('-g', '--gff_file', type=str, help='GFF3 file to validate (default: STDIN)') 109 | parser.add_argument('-f', '--fasta_file', type=str, help='The external reference FASTA file for the GFF3 files, has precedence over the ##FASTA section if both exist (default: None)') 110 | parser.add_argument('-n', '--allowed_num_of_n', type=int, default=0, 111 | help='Max number of Ns allowed in a feature, anything more will be reported as an error (default: 0)') 112 | parser.add_argument('-t', '--check_n_feature_types', nargs='*', default=['CDS'], 113 | help='Count the number of Ns in each feature with the type specified, multiple types may be specified, ex: -t CDS exon (default: "CDS")') 114 | parser.add_argument('-r', '--report_file', type=str, help='Validation report file (default: STDOUT)') 115 | parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__) 116 | 117 | test_lv = 1 # debug 118 | if test_lv == 0: 119 | args = parser.parse_args(['-g', 'annotations.gff']) 120 | else: 121 | args = parser.parse_args() 122 | 123 | if args.gff_file: 124 | logger_stderr.info('Checking GFF3 file (%s)...', args.gff_file) 125 | elif not sys.stdin.isatty(): # if STDIN connected to pipe or file 126 | args.gff_file = sys.stdin 127 | logger_stderr.info('Reading from STDIN...') 128 | else: # no input 129 | parser.print_help() 130 | sys.exit(1) 131 | 132 | logger_stderr.info('Checking syntax and formatting...') 133 | gff3 = Gff3(gff_file=args.gff_file, fasta_external=args.fasta_file, logger=logger_null) 134 | logger_stderr.info('Checking reference seqid, bounds and N count...') 135 | gff3.check_reference(allowed_num_of_n=args.allowed_num_of_n, feature_types=args.check_n_feature_types) 136 | logger_stderr.info('Checking parent boundaries...') 137 | gff3.check_parent_boundary() 138 | 139 | gff3.check_phase() 140 | 141 | if args.report_file: 142 | logger_stderr.info('Writing validation report (%s)...', args.report_file) 143 | report_fh = open(args.report_file, 'wb') 144 | else: 145 | report_fh = sys.stdout 146 | 147 | # Validation Summary 148 | report_fh.write('# GFF3 Validation Report') 149 | if args.gff_file and sys.stdin.isatty(): 150 | report_fh.write(': {0:s}'.format(args.gff_file)) 151 | report_fh.write('\n\n') 152 | 153 | report_fh.write('# Validation Summary\n') 154 | error_lines = [line for line in gff3.lines if line['line_errors']] 155 | if len(error_lines) == 0: 156 | report_fh.write('* Found 0 errors\n') 157 | else: 158 | error_list = [error for line in error_lines for error in line['line_errors']] 159 | error_types = sorted(list(set([error['error_type'] for error in error_list]))) 160 | for error_type in error_types: 161 | report_fh.write('* Found {0:d} {1:s} errors in {2:d} lines\n'.format( 162 | len([error for error in error_list if error['error_type'] == error_type]), error_type, 163 | len([line for line in error_lines if [error for error in line['line_errors'] if error['error_type'] == error_type]]))) 164 | 165 | report_fh.write('\n') 166 | report_fh.write('# Detected Errors\n') 167 | for line in error_lines: 168 | report_fh.write('* Line {0:d}: {1:s}\n'.format(line['line_index'] + 1, line['line_raw'].strip())) 169 | for error in line['line_errors']: 170 | report_fh.write('\t- {error_type}: 
{message}\n'.format(error_type=error['error_type'], message=error['message']))
171 | 
172 |     if args.report_file:
173 |         report_fh.close()
--------------------------------------------------------------------------------
/examples/gff_valid.py:
--------------------------------------------------------------------------------
 1 | #! /usr/local/bin/python2.7
 2 | # Copyright (C) 2014 Han Lin
 3 | #
 4 | # This program is free software; you can redistribute it and/or modify
 5 | # it under the terms of the GNU General Public License as published by
 6 | # the Free Software Foundation; either version 3 of the License, or
 7 | # (at your option) any later version.
 8 | 
 9 | """
10 | Check a GFF3 file for errors and output a validation report in Markdown.
11 | 
12 | Count the number of Ns in each feature and report features with an N count greater than the specified threshold. (Requires FASTA)
13 | Check for features with end coordinates larger than the landmark sequence length. (Requires FASTA or ##sequence-region)
14 | Check if the ##sequence-region matches the FASTA file. (Requires FASTA and ##sequence-region)
15 | Check that the ##sequence-region directives are present. (Requires FASTA)
16 | Check the phase of CDS features.
17 | 
18 | Changelog:
19 | """
20 | 
21 | import sys
22 | import re
23 | import logging
24 | from collections import OrderedDict
25 | from collections import defaultdict
26 | from itertools import groupby
27 | from urllib import quote, unquote
28 | from textwrap import wrap
29 | # try to import from project first
30 | from os.path import dirname
31 | sys.path.insert(1, dirname(dirname(__file__)))
32 | from gff3 import Gff3
33 | 
34 | __version__ = '1.1'
35 | 
36 | 
37 | def query_yes_no(question, default='yes'):
38 |     """Ask a yes/no question via raw_input() and return their answer.
39 | 
40 |     'question' is a string that is presented to the user.
41 |     'default' is the presumed answer if the user just hits <Enter>.
42 |     It must be 'yes' (the default), 'no' or None (meaning
43 |     an answer is required of the user).
44 | 
45 |     The 'answer' return value is one of 'yes' or 'no'.
46 | """ 47 | valid = {'yes': True, 'y': True, 'ye': True, 48 | 'no': False, 'n': False} 49 | if default is None: 50 | prompt = ' [y/n] ' 51 | elif default == 'yes': 52 | prompt = ' [Y/n] ' 53 | elif default == 'no': 54 | prompt = ' [y/N] ' 55 | else: 56 | raise ValueError('invalid default answer: "%s"' % default) 57 | 58 | while True: 59 | sys.stderr.write(question + prompt) 60 | choice = raw_input().strip().lower() 61 | if default is not None and choice == '': 62 | return valid[default] 63 | elif choice in valid: 64 | return valid[choice] 65 | else: 66 | sys.stderr.write('Please respond with "y" or "n".\n') 67 | # gff_valid.py < annotations.gff > annotations.gff.validation_report 68 | # gff_valid.py -g agla_v1_1_NALmod.gff3 > agla_v1_1_NALmod.gff3.validation_report.md 69 | # gff_valid.py -g clec_v1_1_NALmod.gff3 > clec_v1_1_NALmod.gff3.validation_report.md 70 | # gff_valid.py -g ofas_v1_1_NALmod.gff3 > ofas_v1_1_NALmod.gff3.validation_report.md 71 | if __name__ == '__main__': 72 | logger_stderr = logging.getLogger(__name__+'stderr') 73 | logger_stderr.setLevel(logging.INFO) 74 | stderr_handler = logging.StreamHandler() 75 | stderr_handler.setFormatter(logging.Formatter('%(levelname)-8s %(message)s')) 76 | logger_stderr.addHandler(stderr_handler) 77 | logger_null = logging.getLogger(__name__+'null') 78 | null_handler = logging.NullHandler() 79 | logger_null.addHandler(null_handler) 80 | import argparse 81 | from textwrap import dedent 82 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=dedent("""\ 83 | Validate a GFF3 file for syntax and formating errors, parent relationship and reference sequence sanity. 84 | 85 | Features: 86 | 1. Check syntax and formatting according to gff3 version 1.21. 87 | 2. Count the number of Ns greater than the specified threshold (default: 0) in specified feature types (default: CDS). (Requires FASTA) 88 | 3. Check for features with an end coordinates larger than the landmark sequence length. (Requires FASTA or ##sequence-region) 89 | 4. Check if the ##sequence-region matches the FASTA file. (Requires FASTA and ##sequence-region) 90 | 5. Check whether child features are within the coordinate boundaries of parent features. 91 | 6. Check for the correct phase of CDS features. 92 | 93 | Inputs: 94 | 1. GFF3: reads from STDIN by default, may specify the file name with the -g argument 95 | 2. (optional) FASTA: specify the file name with the -f argument, will use the embedded ##FASTA in the GFF3 file if the external FASTA file is not specified 96 | 97 | Outputs: 98 | 1. MarkDown: contains validation summary and detail sections, writes to STDOUT by default, may specify the file name with the -r argument 99 | 100 | Examples: 101 | 1. Use default arguments, inout and output redirection: 102 | %(prog)s < a.gff > a_validation_report.txt 103 | 2. Specify the input, output file names and options using short arguments: 104 | %(prog)s -g a.gff -f a.fa -n 5 -t CDS exon -r a_validation_report.txt 105 | 3. 
Specify the input, output file names and options using long arguments: 106 | %(prog)s --gff_file a.gff --fasta_file a.fa --allowed_num_of_n 0 --check_n_feature_types CDS --report_file a_validation_report.txt 107 | """)) 108 | parser.add_argument('-g', '--gff_file', type=str, help='GFF3 file to validate (default: STDIN)') 109 | parser.add_argument('-f', '--fasta_file', type=str, help='The external reference FASTA file for the GFF3 files, has precedence over the ##FASTA section if both exist (default: None)') 110 | parser.add_argument('-n', '--allowed_num_of_n', type=int, default=0, 111 | help='Max number of Ns allowed in a feature, anything more will be reported as an error (default: 0)') 112 | parser.add_argument('-t', '--check_n_feature_types', nargs='*', default=['CDS'], 113 | help='Count the number of Ns in each feature with the type specified, multiple types may be specified, ex: -t CDS exon (default: "CDS")') 114 | parser.add_argument('-r', '--report_file', type=str, help='Validation report file (default: STDOUT)') 115 | parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__) 116 | 117 | test_lv = 1 # debug 118 | if test_lv == 0: 119 | args = parser.parse_args(['-g', 'annotations.gff']) 120 | else: 121 | args = parser.parse_args() 122 | 123 | if args.gff_file: 124 | logger_stderr.info('Checking GFF3 file (%s)...', args.gff_file) 125 | elif not sys.stdin.isatty(): # if STDIN connected to pipe or file 126 | args.gff_file = sys.stdin 127 | logger_stderr.info('Reading from STDIN...') 128 | else: # no input 129 | parser.print_help() 130 | sys.exit(1) 131 | 132 | logger_stderr.info('Checking syntax and formatting...') 133 | gff3 = Gff3(gff_file=args.gff_file, fasta_external=args.fasta_file, logger=logger_null) 134 | logger_stderr.info('Checking reference seqid, bounds and N count...') 135 | gff3.check_reference(allowed_num_of_n=args.allowed_num_of_n, feature_types=args.check_n_feature_types) 136 | logger_stderr.info('Checking parent boundaries...') 137 | gff3.check_parent_boundary() 138 | 139 | gff3.check_phase() 140 | 141 | if args.report_file: 142 | logger_stderr.info('Writing validation report (%s)...', args.report_file) 143 | report_fh = open(args.report_file, 'wb') 144 | else: 145 | report_fh = sys.stdout 146 | 147 | # Validation Summary 148 | report_fh.write('# GFF3 Validation Report') 149 | if args.gff_file and sys.stdin.isatty(): 150 | report_fh.write(': {0:s}'.format(args.gff_file)) 151 | report_fh.write('\n\n') 152 | 153 | report_fh.write('# Validation Summary\n') 154 | error_lines = [line for line in gff3.lines if line['line_errors']] 155 | if len(error_lines) == 0: 156 | report_fh.write('* Found 0 errors\n') 157 | else: 158 | error_list = [error for line in error_lines for error in line['line_errors']] 159 | error_types = sorted(list(set([error['error_type'] for error in error_list]))) 160 | for error_type in error_types: 161 | report_fh.write('* Found {0:d} {1:s} errors in {2:d} lines\n'.format( 162 | len([error for error in error_list if error['error_type'] == error_type]), error_type, 163 | len([line for line in error_lines if [error for error in line['line_errors'] if error['error_type'] == error_type]]))) 164 | 165 | report_fh.write('\n') 166 | report_fh.write('# Detected Errors\n') 167 | for line in error_lines: 168 | report_fh.write('* Line {0:d}: {1:s}\n'.format(line['line_index'] + 1, line['line_raw'].strip())) 169 | for error in line['line_errors']: 170 | report_fh.write('\t- {error_type}: 
{message}\n'.format(error_type=error['error_type'], message=error['message'])) 171 | 172 | if args.report_file: 173 | report_fh.close() -------------------------------------------------------------------------------- /examples/phase_test.gff3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hotdogee/gff3-py/e09df4beb8c67efeb10d1197e3cc98dc8ce59139/examples/phase_test.gff3 -------------------------------------------------------------------------------- /examples/phase_test.gff3.md: -------------------------------------------------------------------------------- 1 | # GFF3 Validation Report 2 | 3 | # Validation Summary 4 | * Found 28 PHASE errors in 28 lines 5 | 6 | # Detected Errors 7 | * Line 50: Scaffold1 WebApollo CDS 1177308 1177459 . + 1 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 8 | - PHASE: Wrong phase 1, should be 2 9 | * Line 51: Scaffold1 WebApollo CDS 1177543 1177716 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 10 | - PHASE: Wrong phase 0, should be 2 11 | * Line 52: Scaffold1 WebApollo CDS 1178935 1179223 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 12 | - PHASE: Wrong phase 0, should be 1 13 | * Line 54: Scaffold1 WebApollo CDS 1183361 1183513 . + 2 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 14 | - PHASE: Wrong phase 2, should be 1 15 | * Line 57: Scaffold1 WebApollo CDS 1188397 1188560 . + 2 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 16 | - PHASE: Wrong phase 2, should be 1 17 | * Line 58: Scaffold1 WebApollo CDS 1189584 1189771 . + 1 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 18 | - PHASE: Wrong phase 1, should be 0 19 | * Line 59: Scaffold1 WebApollo CDS 1190237 1190428 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 20 | - PHASE: Wrong phase 0, should be 1 21 | * Line 60: Scaffold1 WebApollo CDS 1190549 1190749 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 22 | - PHASE: Wrong phase 0, should be 2 23 | * Line 61: Scaffold1 WebApollo CDS 1192019 1192263 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 24 | - PHASE: Wrong phase 0, should be 1 25 | * Line 62: Scaffold1 WebApollo CDS 1193380 1193494 . 
+ 2 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 26 | - PHASE: Wrong phase 2, should be 0 27 | * Line 63: Scaffold1 WebApollo CDS 1193579 1193770 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 28 | - PHASE: Wrong phase 0, should be 2 29 | * Line 64: Scaffold1 WebApollo CDS 1195224 1195412 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 30 | - PHASE: Wrong phase 0, should be 1 31 | * Line 65: Scaffold1 WebApollo CDS 1199147 1199311 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 32 | - PHASE: Wrong phase 0, should be 2 33 | * Line 66: Scaffold1 WebApollo CDS 1199927 1200166 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 34 | - PHASE: Wrong phase 0, should be 1 35 | * Line 67: Scaffold1 WebApollo CDS 1200664 1200876 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 36 | - PHASE: Wrong phase 0, should be 2 37 | * Line 68: Scaffold1 WebApollo CDS 1202550 1202690 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 38 | - PHASE: Wrong phase 0, should be 1 39 | * Line 69: Scaffold1 WebApollo CDS 1218961 1219161 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 40 | - PHASE: Wrong phase 0, should be 2 41 | * Line 70: Scaffold1 WebApollo CDS 1221344 1221502 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 42 | - PHASE: Wrong phase 0, should be 1 43 | * Line 71: Scaffold1 WebApollo CDS 1222068 1222331 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 44 | - PHASE: Wrong phase 0, should be 2 45 | * Line 72: Scaffold1 WebApollo CDS 1226580 1226846 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 46 | - PHASE: Wrong phase 0, should be 1 47 | * Line 73: Scaffold1 WebApollo CDS 1230949 1231157 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 48 | - PHASE: Wrong phase 0, should be 2 49 | * Line 79: Scaffold1 WebApollo CDS 1250986 1251124 . 
+ 2 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 50 | - PHASE: Wrong phase 2, should be 1 51 | * Line 80: Scaffold1 WebApollo CDS 1251226 1251384 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 52 | - PHASE: Wrong phase 0, should be 1 53 | * Line 81: Scaffold1 WebApollo CDS 1254745 1254849 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 54 | - PHASE: Wrong phase 0, should be 2 55 | * Line 82: Scaffold1 WebApollo CDS 1256286 1256392 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 56 | - PHASE: Wrong phase 0, should be 1 57 | * Line 83: Scaffold1 WebApollo CDS 1257900 1257970 . + 2 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 58 | - PHASE: Wrong phase 2, should be 0 59 | * Line 85: Scaffold1 WebApollo CDS 1270962 1271143 . + 1 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 60 | - PHASE: Wrong phase 1, should be 2 61 | * Line 86: Scaffold1 WebApollo CDS 1278339 1278592 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 62 | - PHASE: Wrong phase 0, should be 2 63 | -------------------------------------------------------------------------------- /examples/validate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # try to import from project first 3 | from os.path import dirname 4 | sys.path.insert(1, dirname(dirname(__file__))) 5 | from gff3 import Gff3 6 | 7 | # initialize a Gff3 object 8 | gff = Gff3() 9 | # parse GFF3 file and do syntax checking, this populates gff.lines and gff.features 10 | # if an embedded ##FASTA directive is found, parse the sequences into gff.fasta_embedded 11 | gff.parse('annotations.gff') 12 | # parse the external FASTA file into gff.fasta_external 13 | #gff.parse_fasta_external('annotations.fa') 14 | # Check seqid, bounds and the number of Ns in each feature using one or more reference sources 15 | gff.check_reference(allowed_num_of_n=0, feature_types=['CDS']) 16 | # Checks whether child features are within the coordinate boundaries of parent features 17 | gff.check_parent_boundary() 18 | # Calculates the correct phase and checks if it matches the given phase for CDS features 19 | gff.check_phase() -------------------------------------------------------------------------------- /gff3/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Manipulate genomic features and validate the syntax and reference sequence of your GFF3 files""" 3 | from __future__ import absolute_import 4 | from .gff3 import Gff3 5 | __all__ = ['Gff3'] 6 | 7 | VERSION = (1, 0, 1) 8 | __version__ = 
'.'.join(map(str, VERSION[0:3])) + ''.join(VERSION[3:]) 9 | __author__ = 'Han Lin' 10 | __email__ = 'hotdogee [at] gmail [dot] com' 11 | __homepage__ = 'https://github.com/hotdogee/gff3-py' 12 | __docformat__ = 'restructuredtext' 13 | -------------------------------------------------------------------------------- /gff3/gff3.py: -------------------------------------------------------------------------------- 1 | #! /usr/local/bin/python2.7 2 | # -*- coding: utf-8 -*- 3 | # Copyright (C) 2014 Han Lin 4 | 5 | """ 6 | Check a GFF3 file for errors and unwanted features, with an option to correct the errors and output a valid GFF3 file. 7 | 8 | Count the number of Ns in each feature, remove features with N count greater than the specified threshold. (Requires FASTA) 9 | Check and remove features with an end coordinate larger than the landmark sequence length. (Requires FASTA or ##sequence-region) 10 | Check if the ##sequence-region matches the FASTA file. (Requires FASTA and ##sequence-region) 11 | Add the ##sequence-region directives if missing. (Requires FASTA) 12 | Check and correct the phase for CDS features. 13 | """ 14 | from __future__ import print_function 15 | 16 | import collections; OrderedDict = getattr(collections, 'OrderedDict', dict) # OrderedDict is used by fasta_file_to_dict below; not available in 2.6, fall back to a plain dict there 17 | from collections import defaultdict 18 | from itertools import groupby 19 | try: 20 | from urllib import quote, unquote 21 | except ImportError: 22 | from urllib.parse import quote, unquote 23 | from textwrap import wrap 24 | import sys 25 | import re 26 | import string 27 | import logging 28 | logger = logging.getLogger(__name__) 29 | #log.basicConfig(level=logging.DEBUG, format='%(levelname)-8s %(message)s') 30 | logger.setLevel(logging.INFO) 31 | if not logger.handlers: 32 | lh = logging.StreamHandler() 33 | lh.setFormatter(logging.Formatter('%(levelname)-8s %(message)s')) 34 | logger.addHandler(lh) 35 | 36 | try: 37 | COMPLEMENT_TRANS = string.maketrans('TAGCtagc', 'ATCGATCG') 38 | except AttributeError: 39 | COMPLEMENT_TRANS = str.maketrans('TAGCtagc', 'ATCGATCG') 40 | def complement(seq): 41 | return seq.translate(COMPLEMENT_TRANS) 42 | 43 | BASES = ['t', 'c', 'a', 'g'] 44 | CODONS = [a+b+c for a in BASES for b in BASES for c in BASES] 45 | AMINO_ACIDS = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG' 46 | CODON_TABLE = dict(zip(CODONS, AMINO_ACIDS)) 47 | def translate(seq): 48 | seq = seq.lower().replace('\n', '').replace(' ', '') 49 | peptide = '' 50 | for i in range(0, len(seq), 3): 51 | codon = seq[i: i+3] 52 | amino_acid = CODON_TABLE.get(codon, '!') 53 | if amino_acid != '!': # '!' marks an unknown or incomplete codon, e.g. a partial codon at the end of seq; skip it 54 | peptide += amino_acid 55 | return peptide 56 | 57 | def fasta_file_to_dict(fasta_file, id=True, header=False, seq=False): 58 | """Returns a dict from a fasta file and the number of sequences as the second return value. 59 | fasta_file can be a string path or a file object. 60 | The key of fasta_dict can be set using the keyword arguments and 61 | results in a combination of id, header, sequence, in that order, joined with '||'. (default: id) 62 | Duplicate keys are checked and a warning is logged if found. 
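For example, a minimal usage sketch (the path 'genome.fa' and the id 'chr1' here are hypothetical): >>> fasta_dict, count = fasta_file_to_dict('genome.fa') >>> fasta_dict['chr1']['header'] # each value is the per-sequence dict described below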
63 | The value of fasta_dict is a python dict with 3 keys: header, id and seq 64 | 65 | Changelog: 66 | 2014/11/17: 67 | * Added support for url escaped id 68 | """ 69 | fasta_file_f = fasta_file 70 | if isinstance(fasta_file, str): 71 | fasta_file_f = open(fasta_file, 'r') # text mode, so lines are str on Python 3 72 | 73 | fasta_dict = OrderedDict() 74 | keys = ['id', 'header', 'seq'] 75 | flags = dict([('id', id), ('header', header), ('seq', seq)]) 76 | entry = dict([('id', ''), ('header', ''), ('seq', '')]) 77 | count = 0 78 | line_num = 0 79 | 80 | for line in fasta_file_f: 81 | line = line.strip() 82 | if line and line[0] == '>': 83 | count += 1 84 | key = '||'.join([entry[i] for i in keys if flags[i]]) 85 | if key: # key != '' 86 | if key in fasta_dict: # check for duplicate key 87 | logger.warning('%s : Line %d : Duplicate %s [%s] : ID = [%s].', fasta_file_f.name, line_num, '||'.join([i for i in keys if flags[i]]), key[:25] + (key[25:] and '..'), entry['id']) 88 | entry['seq'] = ''.join(entry['seq']) 89 | fasta_dict[key] = entry 90 | # check for url escaped id 91 | if id: 92 | unescaped_id = unquote(entry['id']) 93 | if entry['id'] != unescaped_id: # was "id != unescaped_id", which compared the bool flag to the string and always added a second key 94 | key = '||'.join([unescaped_id] + [entry[i] for i in keys if i != 'id' and flags[i]]) 95 | entry['unescaped_id'] = unescaped_id 96 | fasta_dict[key] = entry 97 | entry = dict() 98 | entry['header'] = line 99 | entry['id'] = line.split()[0][1:] 100 | entry['seq'] = [] 101 | else: 102 | entry['seq'].append(line.upper()) 103 | line_num += 1 104 | 105 | if isinstance(fasta_file, str): 106 | fasta_file_f.close() 107 | 108 | key = '||'.join([entry[i] for i in keys if flags[i]]) 109 | if key: # key != '' 110 | if key in fasta_dict: 111 | logger.warning('%s : Line %d : Duplicate %s [%s] : ID = [%s].', fasta_file_f.name, line_num, '||'.join([i for i in keys if flags[i]]), key[:25] + (key[25:] and '..'), entry['id']) 112 | entry['seq'] = ''.join(entry['seq']) 113 | fasta_dict[key] = entry 114 | # check for url escaped id 115 | if id: 116 | unescaped_id = unquote(entry['id']) 117 | if entry['id'] != unescaped_id: 118 | key = '||'.join([unescaped_id] + [entry[i] for i in keys if i != 'id' and flags[i]]) 119 | entry['unescaped_id'] = unescaped_id 120 | fasta_dict[key] = entry 121 | 122 | return fasta_dict, count 123 | 124 | def fasta_dict_to_file(fasta_dict, fasta_file, line_char_limit=None): 125 | """Write fasta_dict to fasta_file 126 | 127 | :param fasta_dict: returned by fasta_file_to_dict 128 | :param fasta_file: output file can be a string path or a file object 129 | :param line_char_limit: None = no limit (default) 130 | :return: None 131 | """ 132 | fasta_fp = fasta_file 133 | if isinstance(fasta_file, str): 134 | fasta_fp = open(fasta_file, 'w') # text mode, the sequences are written as str 135 | 136 | for key in fasta_dict: 137 | seq = fasta_dict[key]['seq'] 138 | if line_char_limit: 139 | seq = '\n'.join([seq[i:i+line_char_limit] for i in range(0, len(seq), line_char_limit)]) 140 | fasta_fp.write(u'{0:s}\n{1:s}\n'.format(fasta_dict[key]['header'], seq)) 141 | 142 | 143 | class Gff3(object): 144 | def __init__(self, gff_file=None, fasta_external=None, logger=logger): 145 | self.logger = logger 146 | self.lines = [] 147 | self.features = {} 148 | self.unresolved_parents = {} 149 | self.fasta_embedded = {} 150 | self.fasta_external = {} 151 | if gff_file: 152 | self.parse(gff_file) 153 | if fasta_external: 154 | self.parse_fasta_external(fasta_external) 155 | 156 | error_format = 'Line {current_line_num}: {error_type}: {message}\n-> {line}' 157 | 158 | def add_line_error(self, line_data, error_info, log_level=logging.ERROR): 159 | """Helper 
function to record and log an error message 160 | 161 | :param line_data: dict 162 | :param error_info: dict 163 | :param logger: 164 | :param log_level: int 165 | :return: 166 | """ 167 | if not error_info: return 168 | try: 169 | line_data['line_errors'].append(error_info) 170 | except KeyError: 171 | line_data['line_errors'] = [error_info] 172 | except TypeError: # no line_data 173 | pass 174 | try: 175 | self.logger.log(log_level, Gff3.error_format.format(current_line_num=line_data['line_index'] + 1, error_type=error_info['error_type'], message=error_info['message'], line=line_data['line_raw'].rstrip())) 176 | except AttributeError: # no logger 177 | pass 178 | 179 | def check_unresolved_parents(self): 180 | # check if any unresolved parents are now resolvable 181 | if len(self.unresolved_parents) > 0: 182 | self.logger.info('%d unresolved forward referencing parent ids, trying global lookup...' % len(self.unresolved_parents)) 183 | globally_resolved_parents = set() 184 | for feature_id in self.unresolved_parents: 185 | if feature_id in self.features: 186 | self.logger.info(' Resolved parent id: {0:s}, defined in lines: {1:s}, referenced in lines: {2:s}'.format( 187 | feature_id, 188 | ','.join([str(line_data['line_index'] + 1) for line_data in self.features[feature_id]]), 189 | ','.join([str(line_data['line_index'] + 1) for line_data in self.unresolved_parents[feature_id]]))) 190 | globally_resolved_parents.add(feature_id) 191 | for line_data in self.unresolved_parents[feature_id]: 192 | line_data['parents'].append(self.features[feature_id]) 193 | for ld in self.features[feature_id]: 194 | # no need to check if line_data in ld['children'], because it is impossible, each ld maps to only one feature_id, so the ld we get are all different 195 | ld['children'].append(line_data) 196 | still_unresolved_parents = sorted(list(set(self.unresolved_parents) - globally_resolved_parents)) 197 | if len(still_unresolved_parents) > 0: 198 | self.logger.info('{0:d} unresolved parent ids:'.format(len(still_unresolved_parents))) 199 | for feature_id in still_unresolved_parents: 200 | self.logger.info(' Unresolved parent id: {0:s}, referenced in lines: {1:s}'.format(feature_id, ','.join( 201 | [str(line_data['line_index'] + 1) for line_data in self.unresolved_parents[feature_id]]))) 202 | 203 | def check_parent_boundary(self): 204 | """ 205 | checks whether child features are within the coordinate boundaries of parent features 206 | 207 | :return: 208 | """ 209 | for line in self.lines: 210 | for parent_feature in line['parents']: 211 | ok = False 212 | for parent_line in parent_feature: 213 | if parent_line['start'] <= line['start'] and line['end'] <= parent_line['end']: 214 | ok = True 215 | break 216 | if not ok: 217 | self.add_line_error(line, {'message': 'This feature is not contained within the feature boundaries of parent: {0:s}: {1:s}'.format( 218 | parent_feature[0]['attributes']['ID'], 219 | ','.join(['({0:s}, {1:d}, {2:d})'.format(line['seqid'], line['start'], line['end']) for line in parent_feature]) 220 | ), 'error_type': 'BOUNDS', 'location': 'parent_boundary'}) 221 | 222 | def check_phase(self): 223 | """ 224 | 1. get a list of CDS with the same parent 225 | 2. sort according to strand 226 | 3. 
calculate and validate phase 227 | """ 228 | plus_minus = set(['+', '-']) 229 | for k, g in groupby(sorted([line for line in self.lines if line['line_type'] == 'feature' and line['type'] == 'CDS' and 'Parent' in line['attributes']], key=lambda x: x['attributes']['Parent']), key=lambda x: x['attributes']['Parent']): 230 | cds_list = list(g) 231 | strand_set = list(set([line['strand'] for line in cds_list])) 232 | if len(strand_set) != 1: 233 | for line in cds_list: 234 | self.add_line_error(line, {'message': 'Inconsistent CDS strand with parent: {0:s}'.format(k), 'error_type': 'STRAND'}) 235 | continue 236 | if len(cds_list) == 1: 237 | if cds_list[0]['phase'] != 0: 238 | self.add_line_error(cds_list[0], {'message': 'Wrong phase {0}, should be {1}'.format(cds_list[0]['phase'], 0), 'error_type': 'PHASE'}) # use {0}, not {0:d}: phase may be the string '.' when missing 239 | continue 240 | strand = strand_set[0] 241 | if strand not in plus_minus: 242 | # don't process unknown strands 243 | continue 244 | if strand == '-': 245 | # sort end descending 246 | sorted_cds_list = sorted(cds_list, key=lambda x: x['end'], reverse=True) 247 | else: 248 | sorted_cds_list = sorted(cds_list, key=lambda x: x['start']) 249 | phase = 0 250 | for line in sorted_cds_list: 251 | if line['phase'] != phase: 252 | self.add_line_error(line, {'message': 'Wrong phase {0}, should be {1}'.format(line['phase'], phase), 'error_type': 'PHASE'}) 253 | phase = (3 - ((line['end'] - line['start'] + 1 - phase) % 3)) % 3 254 | 255 | def parse_fasta_external(self, fasta_file): 256 | self.fasta_external, count = fasta_file_to_dict(fasta_file) 257 | 258 | def check_reference(self, sequence_region=False, fasta_embedded=False, fasta_external=False, check_bounds=True, check_n=True, allowed_num_of_n=0, feature_types=('CDS',)): 259 | """ 260 | Check seqid, bounds and the number of Ns in each feature using one or more reference sources. 261 | 262 | Seqid check: check if the seqid can be found in the reference sources. 263 | 264 | Bounds check: check the start and end fields of each feature and log an error if the values aren't within the seqid sequence length, requires at least one of these sources: ##sequence-region, embedded ##FASTA, or external FASTA file. 265 | 266 | Ns check: count the number of Ns in each feature with a type in feature_types (default: 'CDS') and log an error if the number is greater than allowed_num_of_n (default: 0), requires at least one of these sources: embedded ##FASTA, or external FASTA file. 267 | 268 | When called with all source parameters set as False (default), check all available sources, and log a debug message if unable to perform a check due to none of the reference sources being available. 269 | 270 | If any source parameter is set to True, check only those sources marked as True, log an error if those sources don't exist. 
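For example, a minimal sketch that forces a single source (assumes a Gff3 instance named gff on which parse() and parse_fasta_external() have already been called): >>> error_lines = gff.check_reference(fasta_external=True, allowed_num_of_n=5, feature_types=['CDS', 'exon']) >>> sorted(error_lines) # the 0-based line_index of every line flagged by this check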
271 | 272 | :param sequence_region: check bounds using the ##sequence-region directive (default: False) 273 | :param fasta_embedded: check bounds using the embedded fasta specified by the ##FASTA directive (default: False) 274 | :param fasta_external: check bounds using the external fasta given by the self.parse_fasta_external (default: False) 275 | :param check_bounds: If False, don't run the bounds check (default: True) 276 | :param check_n: If False, don't run the Ns check (default: True) 277 | :param allowed_num_of_n: only report features with a number of Ns greater than the specified value (default: 0) 278 | :param feature_types: only check features of these feature_types, multiple types may be specified, if none are specified, check only 'CDS' 279 | :return: error_lines: a set of line_index(int) with errors detected by check_reference 280 | """ 281 | # collect lines with errors in this set 282 | error_lines = set() 283 | # check if we have a parsed gff3 284 | if not self.lines: 285 | self.logger.debug('.parse(gff_file) before calling .check_bounds()') 286 | return error_lines 287 | # setup default line_types 288 | check_n_feature_types = set(feature_types) 289 | if len(check_n_feature_types) == 0: 290 | check_n_feature_types.add('CDS') 291 | # compile regex 292 | n_segments_finditer = re.compile(r'[Nn]+').finditer 293 | # check_all_sources mode 294 | check_all_sources = True 295 | if sequence_region or fasta_embedded or fasta_external: 296 | check_all_sources = False 297 | # get a list of line_data with valid start and end coordinates and unescape the seqid 298 | start_end_error_locations = set(('start', 'end', 'start,end')) 299 | valid_line_data_seqid = [(line_data, unquote(line_data['seqid'])) for line_data in self.lines if line_data['line_type'] == 'feature' and line_data['seqid'] != '.' 
and (not line_data['line_errors'] or not [error_info for error_info in line_data['line_errors'] if 'location' in error_info and error_info['location'] in start_end_error_locations])] 300 | checked_at_least_one_source = False 301 | # check directive 302 | # don't use any directives with errors 303 | valid_sequence_regions = dict([(unquote(line_data['seqid']), line_data) for line_data in self.lines if line_data['directive'] == '##sequence-region' and not line_data['line_errors']]) 304 | unresolved_seqid = set() 305 | if (check_all_sources or sequence_region) and valid_sequence_regions: 306 | checked_at_least_one_source = True 307 | for line_data, seqid in valid_line_data_seqid: 308 | if seqid not in valid_sequence_regions and seqid not in unresolved_seqid: 309 | unresolved_seqid.add(seqid) 310 | error_lines.add(line_data['line_index']) 311 | self.add_line_error(line_data, {'message': u'Seqid not found in any ##sequence-region: {0:s}'.format( 312 | seqid), 'error_type': 'BOUNDS', 'location': 'sequence_region'}) 313 | continue 314 | if line_data['start'] < valid_sequence_regions[seqid]['start']: 315 | error_lines.add(line_data['line_index']) 316 | self.add_line_error(line_data, {'message': 'Start is less than the ##sequence-region start: %d' % valid_sequence_regions[seqid]['start'], 'error_type': 'BOUNDS', 'location': 'sequence_region'}) 317 | if line_data['end'] > valid_sequence_regions[seqid]['end']: 318 | error_lines.add(line_data['line_index']) 319 | self.add_line_error(line_data, {'message': 'End is greater than the ##sequence-region end: %d' % valid_sequence_regions[seqid]['end'], 'error_type': 'BOUNDS', 'location': 'sequence_region'}) 320 | elif sequence_region: 321 | self.logger.debug('##sequence-region not found in GFF3') 322 | # check fasta_embedded 323 | unresolved_seqid = set() 324 | if (check_all_sources or fasta_embedded) and self.fasta_embedded: 325 | checked_at_least_one_source = True 326 | for line_data, seqid in valid_line_data_seqid: 327 | if seqid not in self.fasta_embedded and seqid not in unresolved_seqid: 328 | unresolved_seqid.add(seqid) 329 | error_lines.add(line_data['line_index']) 330 | self.add_line_error(line_data, {'message': 'Seqid not found in the embedded ##FASTA: %s' % seqid, 'error_type': 'BOUNDS', 'location': 'fasta_embedded'}) 331 | continue 332 | # check bounds 333 | if line_data['end'] > len(self.fasta_embedded[seqid]['seq']): 334 | error_lines.add(line_data['line_index']) 335 | self.add_line_error(line_data, {'message': 'End is greater than the embedded ##FASTA sequence length: %d' % len(self.fasta_embedded[seqid]['seq']), 'error_type': 'BOUNDS', 'location': 'fasta_embedded'}) 336 | # check n 337 | if check_n and line_data['type'] in check_n_feature_types: 338 | """ 339 | >>> timeit("a.lower().count('n')", "import re; a = ('ASDKADSJHFIUDNNNNNNNnnnnSHFD'*50)") 340 | 5.540903252684302 341 | >>> timeit("a.count('n'); a.count('N')", "import re; a = ('ASDKADSJHFIUDNNNNNNNnnnnSHFD'*50)") 342 | 2.3504867946058425 343 | >>> timeit("re.findall('[Nn]+', a)", "import re; a = ('ASDKADSJHFIUDNNNNNNNnnnnSHFD'*50)") 344 | 30.60731204915959 345 | """ 346 | n_count = self.fasta_embedded[seqid]['seq'].count('N', line_data['start'] - 1, line_data['end']) + self.fasta_embedded[seqid]['seq'].count('n', line_data['start'] - 1, line_data['end']) 347 | if n_count > allowed_num_of_n: 348 | # get detailed segments info 349 | n_segments = [(m.start(), m.end() - m.start()) for m in n_segments_finditer(self.fasta_embedded[seqid]['seq'], line_data['start'] - 1, line_data['end'])] 350 
| n_segments_str = ['(%d, %d)' % (m[0], m[1]) for m in n_segments] 351 | error_lines.add(line_data['line_index']) 352 | self.add_line_error(line_data, {'message': 'Found %d Ns in %s feature of length %d using the embedded ##FASTA, consists of %d segment (start, length): %s' % (n_count, line_data['type'], line_data['end'] - line_data['start'], len(n_segments), ', '.join(n_segments_str)), 'error_type': 'N_COUNT', 'n_segments': n_segments, 'location': 'fasta_embedded'}) 353 | elif fasta_embedded: 354 | self.logger.debug('Embedded ##FASTA not found in GFF3') 355 | # check fasta_external 356 | unresolved_seqid = set() 357 | if (check_all_sources or fasta_external) and self.fasta_external: 358 | checked_at_least_one_source = True 359 | for line_data, seqid in valid_line_data_seqid: 360 | if seqid not in self.fasta_external and seqid not in unresolved_seqid: 361 | unresolved_seqid.add(seqid) 362 | error_lines.add(line_data['line_index']) 363 | self.add_line_error(line_data, {'message': 'Seqid not found in the external FASTA file: %s' % seqid, 'error_type': 'BOUNDS', 'location': 'fasta_external'}) 364 | continue 365 | # check bounds 366 | if line_data['end'] > len(self.fasta_external[seqid]['seq']): 367 | error_lines.add(line_data['line_index']) 368 | self.add_line_error(line_data, {'message': 'End is greater than the external FASTA sequence length: %d' % len(self.fasta_external[seqid]['seq']), 'error_type': 'BOUNDS', 'location': 'fasta_external'}) 369 | # check n 370 | if check_n and line_data['type'] in check_n_feature_types: 371 | n_count = self.fasta_external[seqid]['seq'].count('N', line_data['start'] - 1, line_data['end']) + self.fasta_external[seqid]['seq'].count('n', line_data['start'] - 1, line_data['end']) 372 | if n_count > allowed_num_of_n: 373 | # get detailed segments info 374 | n_segments = [(m.start(), m.end() - m.start()) for m in n_segments_finditer(self.fasta_external[seqid]['seq'], line_data['start'] - 1, line_data['end'])] 375 | n_segments_str = ['(%d, %d)' % (m[0], m[1]) for m in n_segments] 376 | error_lines.add(line_data['line_index']) 377 | self.add_line_error(line_data, {'message': 'Found %d Ns in %s feature of length %d using the external FASTA, consists of %d segment (start, length): %s' % (n_count, line_data['type'], line_data['end'] - line_data['start'], len(n_segments), ', '.join(n_segments_str)), 'error_type': 'N_COUNT', 'n_segments': n_segments, 'location': 'fasta_external'}) 378 | elif fasta_external: 379 | self.logger.debug('External FASTA file not given') 380 | if check_all_sources and not checked_at_least_one_source: 381 | self.logger.debug('Unable to perform bounds check, requires at least one of the following sources: ##sequence-region, embedded ##FASTA, or external FASTA file') 382 | return error_lines 383 | 384 | def parse(self, gff_file, strict=False): 385 | """Parse the gff file into the following data structures: 386 | 387 | * lines(list of line_data(dict)) 388 | - line_index(int): the index in lines 389 | - line_raw(str) 390 | - line_type(str in ['feature', 'directive', 'comment', 'blank', 'unknown']) 391 | - line_errors(list of str): a list of error messages 392 | - line_status(str in ['normal', 'modified', 'removed']) 393 | - parents(list of feature(list of line_data(dict))): may have multiple parents 394 | - children(list of line_data(dict)) 395 | - extra fields depending on line_type 396 | * directive 397 | - directive(str in ['##gff-version', '##sequence-region', '##feature-ontology', '##attribute-ontology', '##source-ontology', '##species', 
'##genome-build', '###', '##FASTA']) 398 | - extra fields depending on directive 399 | * feature 400 | - seqid(str): must escape any characters not in the set [a-zA-Z0-9.:^*$@!+_?-|] using RFC 3986 Percent-Encoding 401 | - source(str) 402 | - type(str in so_types) 403 | - start(int) 404 | - end(int) 405 | - score(float) 406 | - strand(str in ['+', '-', '.', '?']) 407 | - phase(int in [0, 1, 2]) 408 | - attributes(dict of tag(str) to value) 409 | - ID(str) 410 | - Name(str) 411 | - Alias(list of str): multi value 412 | - Parent(list of str): multi value 413 | - Target(dict) 414 | - target_id(str) 415 | - start(int) 416 | - end(int) 417 | - strand(str in ['+', '-', '']) 418 | - Gap(str): CIGAR format 419 | - Derives_from(str) 420 | - Note(list of str): multi value 421 | - Dbxref(list of str): multi value 422 | - Ontology_term(list of str): multi value 423 | - Is_circular(str in ['true']) 424 | * fasta_dict(dict of id(str) to sequence_item(dict)) 425 | - id(str) 426 | - header(str) 427 | - seq(str) 428 | - line_length(int) 429 | 430 | * features(dict of feature_id(str in line_data['attributes']['ID']) to feature(list of line_data(dict))) 431 | 432 | A feature is a list of line_data(dict), since all lines that share an ID collectively represent a single feature. 433 | 434 | During serialization, line_data(dict) references should be converted into line_index(int) 435 | 436 | :param gff_file: a string path or file object 437 | :param strict: when true, throw exception on syntax and format errors. when false, use best effort to finish parsing while logging errors 438 | """ 439 | valid_strand = set(('+', '-', '.', '?')) 440 | valid_phase = set((0, 1, 2)) 441 | multi_value_attributes = set(('Parent', 'Alias', 'Note', 'Dbxref', 'Ontology_term')) 442 | valid_attribute_target_strand = set(('+', '-', '')) 443 | reserved_attributes = set(('ID', 'Name', 'Alias', 'Parent', 'Target', 'Gap', 'Derives_from', 'Note', 'Dbxref', 'Ontology_term', 'Is_circular')) 444 | 445 | # illegal character check 446 | # Literal use of tab, newline, carriage return, the percent (%) sign, and control characters must be encoded using RFC 3986 Percent-Encoding; no other characters may be encoded. 447 | # control characters: \x00-\x1f\x7f this includes tab(\x09), newline(\x0a), carriage return(\x0d) 448 | # seqid may contain any characters, but must escape any characters not in the set [a-zA-Z0-9.:^*$@!+_?-|] 449 | # URL escaping rules are used for tags or values containing the following characters: ",=;". 
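# For example (an illustrative sketch of why this matters to the attribute parser below): in 'ID=gene1;Note=binds%20A%3B%20weakly' the encoded ';' (%3B) survives the split on ';', while the raw form 'ID=gene1;Note=binds A; weakly' is split into three tokens and ' weakly' gets reported as a malformed attribute.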
450 | #>>> timeit("unescaped_seqid('Un.7589')", "import re; unescaped_seqid = re.compile(r'[^a-zA-Z0-9.:^*$@!+_?|%-]|%(?![0-9a-fA-F]{2})').search") 451 | #0.4128372745785036 452 | #>>> timeit("unescaped_seqid2('Un.7589')", "import re; unescaped_seqid2 = re.compile(r'^([a-zA-Z0-9.:^*$@!+_?|-]|%[0-9a-fA-F]{2})+$').search") 453 | #0.9012313532265175 454 | unescaped_seqid = re.compile(r'[^a-zA-Z0-9.:^*$@!+_?|%-]|%(?![0-9a-fA-F]{2})').search 455 | unescaped_field = re.compile(r'[\x00-\x1f\x7f]|%(?![0-9a-fA-F]{2})').search 456 | 457 | gff_fp = gff_file 458 | if isinstance(gff_file, str): 459 | gff_fp = open(gff_file, 'r') 460 | 461 | lines = [] 462 | current_line_num = 1 # line numbers start at 1 463 | features = defaultdict(list) 464 | # key = the unresolved id, value = a list of line_data(dict) 465 | unresolved_parents = defaultdict(list) 466 | 467 | for line_raw in gff_fp: 468 | line_data = { 469 | 'line_index': current_line_num - 1, 470 | 'line_raw': line_raw, 471 | 'line_status': 'normal', 472 | 'parents': [], 473 | 'children': [], 474 | 'line_type': '', 475 | 'directive': '', 476 | 'line_errors': [], 477 | 'type': '', 478 | } 479 | line_strip = line_raw.strip() 480 | if line_strip != line_raw[:len(line_strip)]: 481 | self.add_line_error(line_data, {'message': 'White chars not allowed at the start of a line', 'error_type': 'FORMAT', 'location': ''}) 482 | if current_line_num == 1 and not line_strip.startswith('##gff-version'): 483 | self.add_line_error(line_data, {'message': '"##gff-version" missing from the first line', 'error_type': 'FORMAT', 'location': ''}) 484 | if len(line_strip) == 0: 485 | line_data['line_type'] = 'blank' 486 | lines.append(line_data); current_line_num += 1; continue # blank lines must still be recorded and counted, otherwise the line_index of every following line drifts out of sync with the file 487 | if line_strip.startswith('##'): 488 | line_data['line_type'] = 'directive' 489 | if line_strip.startswith('##sequence-region'): 490 | # ##sequence-region seqid start end 491 | # This element is optional, but strongly encouraged because it allows parsers to perform bounds checking on features. 492 | # only one ##sequence-region directive may be given for any given seqid 493 | # all features on that landmark feature (having that seqid) must be contained within the range defined by that ##sequence-region directive. An exception to this rule is allowed when a landmark feature is marked with the Is_circular attribute. 
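# For example, a well-formed directive (the values are from the GFF3 spec example): "##sequence-region ctg123 1 1497228" parses below into seqid='ctg123', start=1, end=1497228.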
494 | line_data['directive'] = '##sequence-region' 495 | tokens = list(line_strip.split()[1:]) 496 | if len(tokens) != 3: 497 | self.add_line_error(line_data, {'message': 'Expecting 3 fields, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 498 | if len(tokens) > 0: 499 | line_data['seqid'] = tokens[0] 500 | # check for duplicate ##sequence-region seqid 501 | if [True for d in lines if ('directive' in d and d['directive'] == '##sequence-region' and 'seqid' in d and d['seqid'] == line_data['seqid'])]: 502 | self.add_line_error(line_data, {'message': '##sequence-region seqid: "%s" may only appear once' % line_data['seqid'], 'error_type': 'FORMAT', 'location': ''}) 503 | try: 504 | all_good = True 505 | try: 506 | line_data['start'] = int(tokens[1]) 507 | if line_data['start'] < 1: 508 | self.add_line_error(line_data, {'message': 'Start is not a valid 1-based integer coordinate: "%s"' % tokens[1], 'error_type': 'FORMAT', 'location': ''}) 509 | except ValueError: 510 | all_good = False 511 | self.add_line_error(line_data, {'message': 'Start is not a valid integer: "%s"' % tokens[1], 'error_type': 'FORMAT', 'location': ''}) 512 | line_data['start'] = tokens[1] 513 | try: 514 | line_data['end'] = int(tokens[2]) 515 | if line_data['end'] < 1: 516 | self.add_line_error(line_data, {'message': 'End is not a valid 1-based integer coordinate: "%s"' % tokens[2], 'error_type': 'FORMAT', 'location': ''}) 517 | except ValueError: 518 | all_good = False 519 | self.add_line_error(line_data, {'message': 'End is not a valid integer: "%s"' % tokens[2], 'error_type': 'FORMAT', 'location': ''}) 520 | line_data['end'] = tokens[2] # was line_data['start'], a copy-paste bug that clobbered the start value 521 | # if all_good then both start and end are int, so we can check if start is not less than or equal to end 522 | if all_good and line_data['start'] > line_data['end']: 523 | self.add_line_error(line_data, {'message': 'Start is not less than or equal to end', 'error_type': 'FORMAT', 'location': ''}) 524 | except IndexError: 525 | pass 526 | elif line_strip.startswith('##gff-version'): 527 | # The GFF version, always 3 in this specification, must be present, must be the topmost line of the file and may only appear once in the file. 528 | line_data['directive'] = '##gff-version' 529 | # check if it appeared before 530 | if [True for d in lines if ('directive' in d and d['directive'] == '##gff-version')]: 531 | self.add_line_error(line_data, {'message': '##gff-version may only appear once in the file', 'error_type': 'FORMAT', 'location': ''}) 532 | tokens = list(line_strip.split()[1:]) 533 | if len(tokens) != 1: 534 | self.add_line_error(line_data, {'message': 'Expecting 1 field, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 535 | if len(tokens) > 0: 536 | try: 537 | line_data['version'] = int(tokens[0]) 538 | if line_data['version'] != 3: 539 | self.add_line_error(line_data, {'message': 'Version is not "3": "%s"' % tokens[0], 'error_type': 'FORMAT', 'location': ''}) 540 | except ValueError: 541 | self.add_line_error(line_data, {'message': 'Version is not a valid integer: "%s"' % tokens[0], 'error_type': 'FORMAT', 'location': ''}) 542 | line_data['version'] = tokens[0] 543 | elif line_strip.startswith('###'): 544 | # This directive (three # signs in a row) indicates that all forward references to feature IDs that have been seen to this point have been resolved. 
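# For example (illustrative): emitting '###' after the last exon of a gene promises that no later line will reference that gene or its mRNAs in a Parent attribute, which lets a streaming parser flush the finished feature from memory.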
545 | line_data['directive'] = '###' 546 | elif line_strip.startswith('##FASTA'): 547 | # This notation indicates that the annotation portion of the file is at an end and that the 548 | # remainder of the file contains one or more sequences (nucleotide or protein) in FASTA format. 549 | line_data['directive'] = '##FASTA' 550 | self.logger.info('Reading embedded ##FASTA sequence') 551 | self.fasta_embedded, count = fasta_file_to_dict(gff_fp) 552 | self.logger.info('%d sequences read' % len(self.fasta_embedded)) 553 | elif line_strip.startswith('##feature-ontology'): 554 | # ##feature-ontology URI 555 | # This directive indicates that the GFF3 file uses the ontology of feature types located at the indicated URI or URL. 556 | line_data['directive'] = '##feature-ontology' 557 | tokens = list(line_strip.split()[1:]) 558 | if len(tokens) != 1: 559 | self.add_line_error(line_data, {'message': 'Expecting 1 field, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 560 | if len(tokens) > 0: 561 | line_data['URI'] = tokens[0] 562 | elif line_strip.startswith('##attribute-ontology'): 563 | # ##attribute-ontology URI 564 | # This directive indicates that the GFF3 uses the ontology of attribute names located at the indicated URI or URL. 565 | line_data['directive'] = '##attribute-ontology' 566 | tokens = list(line_strip.split()[1:]) 567 | if len(tokens) != 1: 568 | self.add_line_error(line_data, {'message': 'Expecting 1 field, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 569 | if len(tokens) > 0: 570 | line_data['URI'] = tokens[0] 571 | elif line_strip.startswith('##source-ontology'): 572 | # ##source-ontology URI 573 | # This directive indicates that the GFF3 uses the ontology of source names located at the indicated URI or URL. 574 | line_data['directive'] = '##source-ontology' 575 | tokens = list(line_strip.split()[1:]) 576 | if len(tokens) != 1: 577 | self.add_line_error(line_data, {'message': 'Expecting 1 field, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 578 | if len(tokens) > 0: 579 | line_data['URI'] = tokens[0] 580 | elif line_strip.startswith('##species'): 581 | # ##species NCBI_Taxonomy_URI 582 | # This directive indicates the species that the annotations apply to. 583 | line_data['directive'] = '##species' 584 | tokens = list(line_strip.split()[1:]) 585 | if len(tokens) != 1: 586 | self.add_line_error(line_data, {'message': 'Expecting 1 field, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 587 | if len(tokens) > 0: 588 | line_data['NCBI_Taxonomy_URI'] = tokens[0] 589 | elif line_strip.startswith('##genome-build'): 590 | # ##genome-build source buildName 591 | # The genome assembly build name used for the coordinates given in the file. 
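# For example, a well-formed directive (the values are from the GFF3 spec example): "##genome-build NCBI B36" parses below into source='NCBI' and buildName='B36'.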
592 | line_data['directive'] = '##genome-build' 593 | tokens = list(line_strip.split()[1:]) 594 | if len(tokens) != 2: 595 | self.add_line_error(line_data, {'message': 'Expecting 2 fields, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 596 | if len(tokens) > 0: 597 | line_data['source'] = tokens[0] 598 | try: 599 | line_data['buildName'] = tokens[1] 600 | except IndexError: 601 | pass 602 | else: 603 | self.add_line_error(line_data, {'message': 'Unknown directive', 'error_type': 'FORMAT', 'location': ''}) 604 | tokens = list(line_strip.split()) 605 | line_data['directive'] = tokens[0] 606 | elif line_strip.startswith('#'): 607 | line_data['line_type'] = 'comment' 608 | else: 609 | # line_type may be a feature or unknown 610 | line_data['line_type'] = 'feature' 611 | tokens = list(map(str.strip, line_raw.split('\t'))) 612 | if len(tokens) != 9: 613 | self.add_line_error(line_data, {'message': 'Features should contain 9 fields, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 614 | for i, t in enumerate(tokens): 615 | if not t: 616 | self.add_line_error(line_data, {'message': 'Empty field: %d, must have a "."' % (i + 1), 'error_type': 'FORMAT', 'location': ''}) 617 | try: 618 | line_data['seqid'] = tokens[0] 619 | if unescaped_seqid(tokens[0]): 620 | self.add_line_error(line_data, {'message': 'Seqid must escape any characters not in the set [a-zA-Z0-9.:^*$@!+_?-|]: "%s"' % tokens[0], 'error_type': 'FORMAT', 'location': ''}) 621 | line_data['source'] = tokens[1] 622 | if unescaped_field(tokens[1]): 623 | self.add_line_error(line_data, {'message': 'Source must escape the percent (%%) sign and any control characters: "%s"' % tokens[1], 'error_type': 'FORMAT', 'location': ''}) 624 | line_data['type'] = tokens[2] 625 | if unescaped_field(tokens[2]): 626 | self.add_line_error(line_data, {'message': 'Type must escape the percent (%%) sign and any control characters: "%s"' % tokens[2], 'error_type': 'FORMAT', 'location': ''}) 627 | all_good = True 628 | try: 629 | line_data['start'] = int(tokens[3]) 630 | if line_data['start'] < 1: 631 | self.add_line_error(line_data, {'message': 'Start is not a valid 1-based integer coordinate: "%s"' % tokens[3], 'error_type': 'FORMAT', 'location': 'start'}) 632 | except ValueError: 633 | all_good = False 634 | line_data['start'] = tokens[3] 635 | if line_data['start'] != '.': 636 | self.add_line_error(line_data, {'message': 'Start is not a valid integer: "%s"' % line_data['start'], 'error_type': 'FORMAT', 'location': 'start'}) 637 | try: 638 | line_data['end'] = int(tokens[4]) 639 | if line_data['end'] < 1: 640 | self.add_line_error(line_data, {'message': 'End is not a valid 1-based integer coordinate: "%s"' % tokens[4], 'error_type': 'FORMAT', 'location': 'end'}) 641 | except ValueError: 642 | all_good = False 643 | line_data['end'] = tokens[4] 644 | if line_data['end'] != '.': 645 | self.add_line_error(line_data, {'message': 'End is not a valid integer: "%s"' % line_data['end'], 'error_type': 'FORMAT', 'location': 'end'}) 646 | # if all_good then both start and end are int, so we can check if start is not less than or equal to end 647 | if all_good and line_data['start'] > line_data['end']: 648 | self.add_line_error(line_data, {'message': 'Start is not less than or equal to end', 'error_type': 'FORMAT', 'location': 'start,end'}) 649 | try: 650 | line_data['score'] = float(tokens[5]) 651 | except ValueError: 652 | line_data['score'] = tokens[5] 653 | if line_data['score'] != 
'.': 654 | self.add_line_error(line_data, {'message': 'Score is not a valid floating point number: "%s"' % line_data['score'], 'error_type': 'FORMAT', 'location': ''}) 655 | line_data['strand'] = tokens[6] 656 | if line_data['strand'] not in valid_strand: # set(['+', '-', '.', '?']) 657 | self.add_line_error(line_data, {'message': 'Strand has illegal characters: "%s"' % tokens[6], 'error_type': 'FORMAT', 'location': ''}) 658 | try: 659 | line_data['phase'] = int(tokens[7]) 660 | if line_data['phase'] not in valid_phase: # set([0, 1, 2]) 661 | self.add_line_error(line_data, {'message': 'Phase is not 0, 1, or 2: "%s"' % tokens[7], 'error_type': 'FORMAT', 'location': ''}) 662 | except ValueError: 663 | line_data['phase'] = tokens[7] 664 | if line_data['phase'] != '.': 665 | self.add_line_error(line_data, {'message': 'Phase is not a valid integer: "%s"' % line_data['phase'], 'error_type': 'FORMAT', 'location': ''}) 666 | elif line_data['type'] == 'CDS': 667 | self.add_line_error(line_data, {'message': 'Phase is required for all CDS features', 'error_type': 'FORMAT', 'location': ''}) 668 | # parse attributes, ex: ID=exon00003;Parent=mRNA00001,mRNA00003;Name=EXON.1 669 | # URL escaping rules are used for tags or values containing the following characters: ",=;". Spaces are allowed in this field, but tabs must be replaced with the %09 URL escape. 670 | # Note that attribute names are case sensitive. "Parent" is not the same as "parent". 671 | # All attributes that begin with an uppercase letter are reserved for later use. Attributes that begin with a lowercase letter can be used freely by applications. 672 | if unescaped_field(tokens[8]): 673 | self.add_line_error(line_data, {'message': 'Attributes must escape the percent (%) sign and any control characters', 'error_type': 'FORMAT', 'location': ''}) 674 | attribute_tokens = tuple(tuple(t for t in a.split('=')) for a in tokens[8].split(';') if a) 675 | line_data['attributes'] = {} 676 | if len(attribute_tokens) == 1 and len(attribute_tokens[0]) == 1 and attribute_tokens[0][0] == '.': 677 | pass # no attributes 678 | else: 679 | for a in attribute_tokens: 680 | if len(a) != 2: 681 | self.add_line_error(line_data, {'message': 'Attributes must contain one and only one equal (=) sign: "%s"' % ('='.join(a)), 'error_type': 'FORMAT', 'location': ''}) 682 | try: 683 | tag, value = a 684 | except ValueError: 685 | tag, value = a[0], '' 686 | if not tag: 687 | self.add_line_error(line_data, {'message': 'Empty attribute tag: "%s"' % '='.join(a), 'error_type': 'FORMAT', 'location': ''}) 688 | if not value.strip(): 689 | self.add_line_error(line_data, {'message': 'Empty attribute value: "%s"' % '='.join(a), 'error_type': 'FORMAT', 'location': ''}, log_level=logging.WARNING) 690 | if tag in line_data['attributes']: 691 | self.add_line_error(line_data, {'message': 'Found multiple attribute tags: "%s"' % tag, 'error_type': 'FORMAT', 'location': ''}) 692 | if tag in multi_value_attributes: # set(['Parent', 'Alias', 'Note', 'Dbxref', 'Ontology_term']) 693 | if value.find(', ') >= 0: 694 | self.add_line_error(line_data, {'message': 'Found ", " in %s attribute, possible unescaped ",": "%s"' % (tag, value), 'error_type': 'FORMAT', 'location': ''}, log_level=logging.WARNING) 695 | # In addition to Parent, the Alias, Note, Dbxref and Ontology_term attributes can have multiple values. 
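# For example (illustrative): 'Parent=mRNA00001,mRNA00003' is stored as line_data['attributes']['Parent'] == ['mRNA00001', 'mRNA00003'], and a repeated tag such as 'Parent=a;Parent=b' extends the same list below (after being flagged as a duplicate tag above).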
696 | if tag in line_data['attributes']: # if this tag has been seen before 697 | if tag == 'Note': # don't check for duplicate notes 698 | line_data['attributes'][tag].extend(value.split(',')) 699 | else: # only add non duplicate values 700 | line_data['attributes'][tag].extend([s for s in value.split(',') if s not in line_data['attributes'][tag]]) 701 | else: 702 | line_data['attributes'][tag] = value.split(',') 703 | # check for duplicate values 704 | if tag != 'Note' and len(line_data['attributes'][tag]) != len(set(line_data['attributes'][tag])): 705 | count_values = [(len(list(group)), key) for key, group in groupby(sorted(line_data['attributes'][tag]))] 706 | self.add_line_error(line_data, {'message': '%s attribute has identical values (count, value): %s' % (tag, ', '.join(['(%d, %s)' % (c, v) for c, v in count_values if c > 1])), 'error_type': 'FORMAT', 'location': ''}) 707 | # remove duplicate 708 | line_data['attributes'][tag] = list(set(line_data['attributes'][tag])) 709 | 710 | if tag == 'Parent': 711 | for feature_id in line_data['attributes']['Parent']: 712 | try: 713 | line_data['parents'].append(features[feature_id]) 714 | for ld in features[feature_id]: 715 | # no need to check if line_data in ld['children'], because it is impossible, each ld maps to only one feature_id, so the ld we get are all different 716 | ld['children'].append(line_data) 717 | except KeyError: # features[id] 718 | self.add_line_error(line_data, {'message': '%s attribute has unresolved forward reference: %s' % (tag, feature_id), 'error_type': 'FORMAT', 'location': ''}) 719 | unresolved_parents[feature_id].append(line_data) 720 | elif tag == 'Target': 721 | if value.find(',') >= 0: 722 | self.add_line_error(line_data, {'message': 'Value of %s attribute contains unescaped ",": "%s"' % (tag, value), 'error_type': 'FORMAT', 'location': ''}) 723 | target_tokens = value.split(' ') 724 | if len(target_tokens) < 3 or len(target_tokens) > 4: 725 | self.add_line_error(line_data, {'message': 'Target attribute should have 3 or 4 values, got %d: %s' % (len(target_tokens), repr(tokens)), 'error_type': 'FORMAT', 'location': ''}) 726 | line_data['attributes'][tag] = {} 727 | try: 728 | line_data['attributes'][tag]['target_id'] = target_tokens[0] 729 | all_good = True 730 | try: 731 | line_data['attributes'][tag]['start'] = int(target_tokens[1]) 732 | if line_data['attributes'][tag]['start'] < 1: 733 | self.add_line_error(line_data, {'message': 'Start value of Target attribute is not a valid 1-based integer coordinate: "%s"' % target_tokens[1], 'error_type': 'FORMAT', 'location': ''}) 734 | except ValueError: 735 | all_good = False 736 | line_data['attributes'][tag]['start'] = target_tokens[1] 737 | self.add_line_error(line_data, {'message': 'Start value of Target attribute is not a valid integer: "%s"' % line_data['attributes'][tag]['start'], 'error_type': 'FORMAT', 'location': ''}) 738 | try: 739 | line_data['attributes'][tag]['end'] = int(target_tokens[2]) 740 | if line_data['attributes'][tag]['end'] < 1: 741 | self.add_line_error(line_data, {'message': 'End value of Target attribute is not a valid 1-based integer coordinate: "%s"' % target_tokens[2], 'error_type': 'FORMAT', 'location': ''}) 742 | except ValueError: 743 | all_good = False 744 | line_data['attributes'][tag]['end'] = target_tokens[2] 745 | self.add_line_error(line_data, {'message': 'End value of Target attribute is not a valid integer: "%s"' % line_data['attributes'][tag]['end'], 'error_type': 'FORMAT', 'location': ''}) 746 | # if all_good then both 
start and end are int, so we can check if start is not less than or equal to end 747 | if all_good and line_data['attributes'][tag]['start'] > line_data['attributes'][tag]['end']: 748 | self.add_line_error(line_data, {'message': 'Start is not less than or equal to end', 'error_type': 'FORMAT', 'location': ''}) 749 | line_data['attributes'][tag]['strand'] = target_tokens[3] 750 | if line_data['attributes'][tag]['strand'] not in valid_attribute_target_strand: # set(['+', '-', '']) 751 | self.add_line_error(line_data, {'message': 'Strand value of Target attribute has illegal characters: "%s"' % line_data['attributes'][tag]['strand'], 'error_type': 'FORMAT', 'location': ''}) 752 | except IndexError: 753 | pass 754 | else: 755 | if value.find(',') >= 0: 756 | self.add_line_error(line_data, {'message': 'Value of %s attribute contains unescaped ",": "%s"' % (tag, value), 'error_type': 'FORMAT', 'location': ''}) 757 | line_data['attributes'][tag] = value 758 | if tag == 'Is_circular' and value != 'true': 759 | self.add_line_error(line_data, {'message': 'Value of Is_circular attribute is not "true": "%s"' % value, 'error_type': 'FORMAT', 'location': ''}) 760 | elif tag[:1].isupper() and tag not in reserved_attributes: # {'ID', 'Name', 'Alias', 'Parent', 'Target', 'Gap', 'Derives_from', 'Note', 'Dbxref', 'Ontology_term', 'Is_circular'} 761 | self.add_line_error(line_data, {'message': 'Unknown reserved (uppercase) attribute: "%s"' % tag, 'error_type': 'FORMAT', 'location': ''}) 762 | elif tag == 'ID': 763 | # check for duplicate ID in non-adjacent lines 764 | if value in features and lines[-1]['attributes'][tag] != value: 765 | self.add_line_error(line_data, {'message': 'Duplicate ID: "%s" in non-adjacent lines: %s' % (value, ','.join([str(f['line_index'] + 1) for f in features[value]])), 'error_type': 'FORMAT', 'location': ''}, log_level=logging.WARNING) 766 | features[value].append(line_data) 767 | except IndexError: 768 | pass 769 | current_line_num += 1 770 | lines.append(line_data) 771 | 772 | if isinstance(gff_file, str): 773 | gff_fp.close() 774 | 775 | # global look up of unresolved parents 776 | for feature_id in unresolved_parents: 777 | if feature_id in features: 778 | for line in unresolved_parents[feature_id]: 779 | self.add_line_error(line, {'message': 'Unresolved forward reference: "%s", found defined in lines: %s' % (feature_id, ','.join([str(ld['line_index'] + 1) for ld in features[feature_id]])), 'error_type': 'FORMAT', 'location': ''}) 780 | 781 | self.lines = lines 782 | self.features = features 783 | return 1 784 | 785 | def descendants(self, line_data): 786 | """ 787 | BFS graph algorithm 788 | :param line_data: line_data(dict) with line_data['line_index'] or line_index(int) 789 | :return: list of line_data(dict) 790 | """ 791 | # get start node 792 | try: 793 | start = line_data['line_index'] 794 | except TypeError: 795 | start = self.lines[line_data]['line_index'] 796 | visited_set, visited_list, queue = set(), [], [start] 797 | while queue: 798 | node = queue.pop(0) 799 | if node not in visited_set: 800 | visited_set.add(node) 801 | visited_list.append(self.lines[node]) 802 | queue.extend([ld['line_index'] for ld in self.lines[node]['children'] if ld['line_index'] not in visited_set]) 803 | return visited_list[1:] 804 | 805 | def ancestors(self, line_data): 806 | """ 807 | BFS graph algorithm 808 | 809 | :param line_data: line_data(dict) with line_data['line_index'] or line_index(int) 810 | :return: list of line_data(dict) 811 | """ 812 | # get start node 813 | try: 814 | 
805 | def ancestors(self, line_data): 806 | """ 807 | Collect all ancestor lines of line_data using breadth-first search (BFS). 808 | 809 | :param line_data: line_data(dict) with line_data['line_index'] or line_index(int) 810 | :return: list of line_data(dict) 811 | """ 812 | # get start node 813 | try: 814 | start = line_data['line_index'] 815 | except TypeError: 816 | start = self.lines[line_data]['line_index'] 817 | visited_set, visited_list, queue = set(), [], [start] 818 | while queue: 819 | node = queue.pop(0) 820 | if node not in visited_set: 821 | visited_set.add(node) 822 | visited_list.append(self.lines[node]) 823 | queue.extend([ld['line_index'] for f in self.lines[node]['parents'] for ld in f if ld['line_index'] not in visited_set]) 824 | return visited_list[1:] 825 | 826 | def adopt(self, old_parent, new_parent): 827 | """ 828 | Transfer children from old_parent to new_parent 829 | 830 | :param old_parent: feature_id(str) or line_index(int) or line_data(dict) or feature 831 | :param new_parent: feature_id(str) or line_index(int) or line_data(dict) 832 | :return: List of children transferred 833 | """ 834 | try: # assume line_data(dict) 835 | old_id = old_parent['attributes']['ID'] 836 | except TypeError: 837 | try: # assume line_index(int) 838 | old_id = self.lines[old_parent]['attributes']['ID'] 839 | except TypeError: # assume feature_id(str) 840 | old_id = old_parent 841 | old_feature = self.features[old_id] 842 | old_indexes = [ld['line_index'] for ld in old_feature] 843 | try: # assume line_data(dict) 844 | new_id = new_parent['attributes']['ID'] 845 | except TypeError: 846 | try: # assume line_index(int) 847 | new_id = self.lines[new_parent]['attributes']['ID'] 848 | except TypeError: # assume feature_id(str) 849 | new_id = new_parent 850 | new_feature = self.features[new_id] 851 | new_indexes = [ld['line_index'] for ld in new_feature] 852 | # build a list of children to be moved 853 | # add the child to the new parent's children list if it's not already there 854 | # update the child's parent list and Parent attribute 855 | # finally clear the old parent's children list 856 | children = old_feature[0]['children'] 857 | new_parent_children_set = set([ld['line_index'] for ld in new_feature[0]['children']]) 858 | for child in children: 859 | if child['line_index'] not in new_parent_children_set: 860 | new_parent_children_set.add(child['line_index']) 861 | for new_ld in new_feature: 862 | new_ld['children'].append(child) 863 | child['parents'].append(new_feature) 864 | child['attributes']['Parent'].append(new_id) 865 | # remove all matches; list.remove() only removes the first occurrence 866 | child['parents'] = [f for f in child['parents'] if f[0]['attributes']['ID'] != old_id] 867 | child['attributes']['Parent'] = [d for d in child['attributes']['Parent'] if d != old_id] 868 | for old_ld in old_feature: 869 | old_ld['children'] = [] 870 | return children 871 | 872 | def adopted(self, old_child, new_child): 873 | """ 874 | Transfer parents from old_child to new_child 875 | 876 | :param old_child: line_data(dict) with line_data['line_index'] or line_index(int) 877 | :param new_child: line_data(dict) with line_data['line_index'] or line_index(int) 878 | :return: List of parents transferred 879 | """ 880 | pass 881 | 882 | def overlap(self, line_data_a, line_data_b): 883 | return line_data_a['seqid'] == line_data_b['seqid'] and (line_data_a['start'] <= line_data_b['start'] and line_data_b['start'] <= line_data_a['end'] or 884 | line_data_a['start'] <= line_data_b['end'] and line_data_b['end'] <= line_data_a['end'] or 885 | line_data_b['start'] <= line_data_a['start'] and line_data_a['end'] <= line_data_b['end']) 886 |
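# Editor's illustration (hypothetical values, not part of the original source): overlap() is True
# only for two features on the same seqid whose closed intervals intersect, e.g.
# a = {'seqid': 'ctg123', 'start': 100, 'end': 200} and b = {'seqid': 'ctg123', 'start': 150, 'end': 300}
# overlap, while a and c = {'seqid': 'ctg123', 'start': 201, 'end': 300} do not.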
887 | def remove(self, line_data, root_type=None): 888 | """ 889 | Marks the 'line_status' of line_data and every line of its associated feature as 'removed'; it does not actually remove the line_data from the data structure. 890 | The write function checks 'line_status' when writing the gff file. 891 | Finds the root parent of line_data of type root_type and removes all of its descendants. 892 | If an ancestor of the root parent is left with no children after the removal, that ancestor is removed as well, recursively. 893 | 894 | :param line_data: line_data(dict) with line_data['line_index'] or line_index(int) 895 | :param root_type: feature type(str) of the root parent to remove from, or None to use parentless lines as roots 896 | :return: None 897 | """ 898 | roots = [ld for ld in self.ancestors(line_data) if (root_type and ld['type'] == root_type) or (not root_type and not ld['parents'])] or [line_data] 899 | for root in roots: 900 | root['line_status'] = 'removed' 901 | root_descendants = self.descendants(root) 902 | for root_descendant in root_descendants: 903 | root_descendant['line_status'] = 'removed' 904 | root_ancestors = self.ancestors(root) # BFS, so we will process closer ancestors first 905 | for root_ancestor in root_ancestors: 906 | if len([ld for ld in root_ancestor['children'] if ld['line_status'] != 'removed']) == 0: # if all children of a root_ancestor are removed 907 | # remove this root_ancestor 908 | root_ancestor['line_status'] = 'removed' 909 | 910 | 911 | def fix(self): 912 | pass 913 | 914 | def write(self, gff_file, embed_fasta=None, fasta_char_limit=None): 915 | gff_fp = gff_file 916 | if isinstance(gff_file, str): 917 | gff_fp = open(gff_file, 'w') 918 | 919 | wrote_sequence_region = set() 920 | # build sequence region data 921 | sequence_regions = {} 922 | if self.fasta_external: 923 | for seqid in self.fasta_external: 924 | sequence_regions[seqid] = (1, len(self.fasta_external[seqid]['seq'])) 925 | elif self.fasta_embedded: 926 | for seqid in self.fasta_embedded: 927 | sequence_regions[seqid] = (1, len(self.fasta_embedded[seqid]['seq'])) 928 | else: 929 | pass 930 | 931 | wrote_lines = set() 932 | field_keys = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase'] 933 | reserved_attributes = ['ID', 'Name', 'Alias', 'Parent', 'Target', 'Gap', 'Derives_from', 'Note', 'Dbxref', 'Ontology_term', 'Is_circular'] 934 | attributes_sort_map = defaultdict(int, zip(reserved_attributes, range(len(reserved_attributes), 0, -1))) 935 | def write_feature(line_data): 936 | if line_data['line_status'] == 'removed': 937 | return 938 | field_list = [str(line_data[k]) for k in field_keys] 939 | attribute_list = [] 940 | for k, v in sorted(line_data['attributes'].items(), key=lambda x: attributes_sort_map[x[0]], reverse=True): 941 | if isinstance(v, list): 942 | v = ','.join(v) 943 | attribute_list.append('%s=%s' % (str(k), str(v))) 944 | field_list.append(';'.join(attribute_list)) 945 | gff_fp.write('\t'.join(field_list) + '\n') 946 | wrote_lines.add(line_data['line_index']) 947 | # write directives 948 | ignore_directives = ['##sequence-region', '###', '##FASTA'] 949 | directives_lines = [line_data for line_data in self.lines if line_data['line_type'] == 'directive' and line_data['directive'] not in ignore_directives] 950 | for directives_line in directives_lines: 951 | gff_fp.write(directives_line['line_raw']) 952 | 953 | # write features 954 | # get a list of root nodes 955 | root_lines = [line_data for line_data in self.lines if line_data['line_type'] == 'feature' and not line_data['parents']] 956 | 957 | for root_line in root_lines: 958 | lines_wrote = len(wrote_lines) 959 | if root_line['line_index'] in wrote_lines: 960 | continue 961 | # write ##sequence-region if new seqid
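# Editor's illustration (hypothetical seqid and length, not part of the original source): the block
# below emits a directive such as "##sequence-region ctg123 1 1497228" ahead of the first feature
# on each new seqid, using the (start, end) pairs collected from the fasta reference above.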
962 | if root_line['seqid'] not in wrote_sequence_region: 963 | if root_line['seqid'] in sequence_regions: 964 | gff_fp.write('##sequence-region %s %d %d\n' % (root_line['seqid'], sequence_regions[root_line['seqid']][0], sequence_regions[root_line['seqid']][1])) 965 | wrote_sequence_region.add(root_line['seqid']) 966 | try: 967 | root_feature = self.features[root_line['attributes']['ID']] 968 | except KeyError: 969 | root_feature = [root_line] 970 | for line_data in root_feature: 971 | write_feature(line_data) 972 | descendants = self.descendants(root_line) 973 | for descendant in descendants: 974 | if descendant['line_index'] in wrote_lines: 975 | continue 976 | write_feature(descendant) 977 | # check if we actually wrote something 978 | if lines_wrote != len(wrote_lines): 979 | gff_fp.write('###\n') 980 | # write fasta 981 | fasta = embed_fasta or self.fasta_external or self.fasta_embedded 982 | if fasta and embed_fasta is not False: 983 | gff_fp.write('##FASTA\n') 984 | fasta_dict_to_file(fasta, gff_fp, line_char_limit=fasta_char_limit) 985 | 986 | if isinstance(gff_file, str): 987 | gff_fp.close() 988 | 989 | def sequence(self, line_data, child_type=None, reference=None): 990 | """ 991 | Get the sequence of line_data, according to the columns 'seqid', 'start', 'end', 'strand'. 992 | Requires a fasta reference. 993 | When used on 'mRNA' type line_data, child_type can be used to specify which kind of sequence to return: 994 | * child_type=None: pre-mRNA, returns the sequence of line_data from start to end, reverse complemented according to strand. (default) 995 | * child_type='exon': mature mRNA, concatenates the sequences of children of type 'exon'. 996 | * child_type='CDS': coding sequence, concatenates the sequences of children of type 'CDS'. Use the helper 997 | function translate(seq) on the returned value to obtain the protein sequence. 998 | 999 | :param line_data: line_data(dict) with line_data['line_index'] or line_index(int) 1000 | :param child_type: None or feature type(string) 1001 | :param reference: If None, will use self.fasta_external or self.fasta_embedded(dict) 1002 | :return: sequence(string) 1003 | """ 1004 | # get start node 1005 | reference = reference or self.fasta_external or self.fasta_embedded 1006 | if not reference: 1007 | raise Exception('External or embedded fasta reference needed') 1008 | try: 1009 | line_index = line_data['line_index'] 1010 | except TypeError: 1011 | line_index = self.lines[line_data]['line_index'] 1012 | ld = self.lines[line_index] 1013 | if ld['line_type'] != 'feature': 1014 | return None 1015 | seq = reference[ld['seqid']]['seq'][ld['start']-1:ld['end']] 1016 | if ld['strand'] == '-': 1017 | seq = complement(seq[::-1]) 1018 | return seq 1019 |
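# Editor's usage sketch (hypothetical identifiers; assumes a parsed Gff instance with a fasta
# reference attached, details not shown in this excerpt):
# mrna = gff.features['mRNA0001'][0]
# pre_mrna = gff.sequence(mrna) # reverse complemented automatically for '-' strand features
# protein = translate(gff.sequence(cds_line)) # translate(seq) is the helper named in the docstring above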
1020 | def type_tree(self): 1021 | class node(object): 1022 | def __init__(self, value, children=None): 1023 | self.value = value or '' 1024 | self.children = children or set() 1025 | 1026 | def __repr__(self, level=0): 1027 | ret = '\t' * level + repr(self.value) + '\n' 1028 | for child in sorted(list(self.children), key=lambda x: x.value): 1029 | ret += child.__repr__(level+1) 1030 | return ret 1031 | root_set = set() 1032 | node_dict = {} 1033 | feature_line_list = [line_data for line_data in self.lines if line_data['line_type'] == 'feature'] 1034 | for line_data in feature_line_list: 1035 | if len(line_data['children']) > 0: 1036 | parent_type = line_data['type'] 1037 | if parent_type not in node_dict: 1038 | node_dict[parent_type] = node(parent_type) 1039 | if len(line_data['parents']) == 0: 1040 | root_set.add(node_dict[parent_type]) 1041 | for child_ld in line_data['children']: 1042 | child_type = child_ld['type'] 1043 | if child_type not in node_dict: 1044 | node_dict[child_type] = node(child_type) 1045 | if parent_type == child_type and child_type == 'mRNA': 1046 | print(line_data['line_index'], child_ld['line_index']) 1047 | else: 1048 | node_dict[parent_type].children.add(node_dict[child_type]) 1049 | return sorted(list(root_set), key=lambda x: x.value) 1050 | 1051 | try: 1052 | from collections import OrderedDict 1053 | except ImportError: 1054 | # Backport of OrderedDict() class that runs on Python 2.4, 2.5, 2.6, 2.7 and pypy. 1055 | # Passes Python 2.7's test suite and incorporates all the latest updates. 1056 | 1057 | try: 1058 | from thread import get_ident as _get_ident 1059 | except ImportError: 1060 | from dummy_thread import get_ident as _get_ident 1061 | 1062 | try: 1063 | from _abcoll import KeysView, ValuesView, ItemsView 1064 | except ImportError: 1065 | pass 1066 | 1067 | 1068 | class OrderedDict(dict): 1069 | 'Dictionary that remembers insertion order' 1070 | # An inherited dict maps keys to values. 1071 | # The inherited dict provides __getitem__, __len__, __contains__, and get. 1072 | # The remaining methods are order-aware. 1073 | # Big-O running times for all methods are the same as for regular dictionaries. 1074 | 1075 | # The internal self.__map dictionary maps keys to links in a doubly linked list. 1076 | # The circular doubly linked list starts and ends with a sentinel element. 1077 | # The sentinel element never gets deleted (this simplifies the algorithm). 1078 | # Each link is stored as a list of length three: [PREV, NEXT, KEY]. 1079 | 1080 | def __init__(self, *args, **kwds): 1081 | '''Initialize an ordered dictionary. Signature is the same as for 1082 | regular dictionaries, but keyword arguments are not recommended 1083 | because their insertion order is arbitrary. 1084 | 1085 | ''' 1086 | if len(args) > 1: 1087 | raise TypeError('expected at most 1 arguments, got %d' % len(args)) 1088 | try: 1089 | self.__root 1090 | except AttributeError: 1091 | self.__root = root = [] # sentinel node 1092 | root[:] = [root, root, None] 1093 | self.__map = {} 1094 | self.__update(*args, **kwds) 1095 | 1096 | def __setitem__(self, key, value, dict_setitem=dict.__setitem__): 1097 | 'od.__setitem__(i, y) <==> od[i]=y' 1098 | # Setting a new item creates a new link which goes at the end of the linked 1099 | # list, and the inherited dictionary is updated with the new key/value pair. 1100 | if key not in self: 1101 | root = self.__root 1102 | last = root[0] 1103 | last[1] = root[0] = self.__map[key] = [last, root, key] 1104 | dict_setitem(self, key, value) 1105 | 1106 | def __delitem__(self, key, dict_delitem=dict.__delitem__): 1107 | 'od.__delitem__(y) <==> del od[y]' 1108 | # Deleting an existing item uses self.__map to find the link which is 1109 | # then removed by updating the links in the predecessor and successor nodes. 1110 | dict_delitem(self, key) 1111 | link_prev, link_next, key = self.__map.pop(key) 1112 | link_prev[1] = link_next 1113 | link_next[0] = link_prev 1114 | 1115 | def __iter__(self): 1116 | 'od.__iter__() <==> iter(od)' 1117 | root = self.__root 1118 | curr = root[1] 1119 | while curr is not root: 1120 | yield curr[2] 1121 | curr = curr[1] 1122 | 1123 | def __reversed__(self): 1124 | 'od.__reversed__() <==> reversed(od)' 1125 | root = self.__root 1126 | curr = root[0] 1127 | while curr is not root: 1128 | yield curr[2] 1129 | curr = curr[0] 1130 | 1131 | def clear(self): 1132 | 'od.clear() -> None. Remove all items from od.'
1133 | try: 1134 | for node in self.__map.itervalues(): 1135 | del node[:] 1136 | root = self.__root 1137 | root[:] = [root, root, None] 1138 | self.__map.clear() 1139 | except AttributeError: 1140 | pass 1141 | dict.clear(self) 1142 | 1143 | def popitem(self, last=True): 1144 | '''od.popitem() -> (k, v), return and remove a (key, value) pair. 1145 | Pairs are returned in LIFO order if last is true or FIFO order if false. 1146 | 1147 | ''' 1148 | if not self: 1149 | raise KeyError('dictionary is empty') 1150 | root = self.__root 1151 | if last: 1152 | link = root[0] 1153 | link_prev = link[0] 1154 | link_prev[1] = root 1155 | root[0] = link_prev 1156 | else: 1157 | link = root[1] 1158 | link_next = link[1] 1159 | root[1] = link_next 1160 | link_next[0] = root 1161 | key = link[2] 1162 | del self.__map[key] 1163 | value = dict.pop(self, key) 1164 | return key, value 1165 | 1166 | # -- the following methods do not depend on the internal structure -- 1167 | 1168 | def keys(self): 1169 | 'od.keys() -> list of keys in od' 1170 | return list(self) 1171 | 1172 | def values(self): 1173 | 'od.values() -> list of values in od' 1174 | return [self[key] for key in self] 1175 | 1176 | def items(self): 1177 | 'od.items() -> list of (key, value) pairs in od' 1178 | return [(key, self[key]) for key in self] 1179 | 1180 | def iterkeys(self): 1181 | 'od.iterkeys() -> an iterator over the keys in od' 1182 | return iter(self) 1183 | 1184 | def itervalues(self): 1185 | 'od.itervalues -> an iterator over the values in od' 1186 | for k in self: 1187 | yield self[k] 1188 | 1189 | def iteritems(self): 1190 | 'od.iteritems -> an iterator over the (key, value) items in od' 1191 | for k in self: 1192 | yield (k, self[k]) 1193 | 1194 | def update(*args, **kwds): 1195 | '''od.update(E, **F) -> None. Update od from dict/iterable E and F. 1196 | 1197 | If E is a dict instance, does: for k in E: od[k] = E[k] 1198 | If E has a .keys() method, does: for k in E.keys(): od[k] = E[k] 1199 | Or if E is an iterable of items, does: for k, v in E: od[k] = v 1200 | In either case, this is followed by: for k, v in F.items(): od[k] = v 1201 | 1202 | ''' 1203 | if len(args) > 2: 1204 | raise TypeError('update() takes at most 2 positional ' 1205 | 'arguments (%d given)' % (len(args),)) 1206 | elif not args: 1207 | raise TypeError('update() takes at least 1 argument (0 given)') 1208 | self = args[0] 1209 | # Make progressively weaker assumptions about "other" 1210 | other = () 1211 | if len(args) == 2: 1212 | other = args[1] 1213 | if isinstance(other, dict): 1214 | for key in other: 1215 | self[key] = other[key] 1216 | elif hasattr(other, 'keys'): 1217 | for key in other.keys(): 1218 | self[key] = other[key] 1219 | else: 1220 | for key, value in other: 1221 | self[key] = value 1222 | for key, value in kwds.items(): 1223 | self[key] = value 1224 | 1225 | __update = update # let subclasses override update without breaking __init__ 1226 | 1227 | __marker = object() 1228 | 1229 | def pop(self, key, default=__marker): 1230 | '''od.pop(k[,d]) -> v, remove specified key and return the corresponding value. 1231 | If key is not found, d is returned if given, otherwise KeyError is raised. 
1232 | 1233 | ''' 1234 | if key in self: 1235 | result = self[key] 1236 | del self[key] 1237 | return result 1238 | if default is self.__marker: 1239 | raise KeyError(key) 1240 | return default 1241 | 1242 | def setdefault(self, key, default=None): 1243 | 'od.setdefault(k[,d]) -> od.get(k,d), also set od[k]=d if k not in od' 1244 | if key in self: 1245 | return self[key] 1246 | self[key] = default 1247 | return default 1248 | 1249 | def __repr__(self, _repr_running={}): 1250 | 'od.__repr__() <==> repr(od)' 1251 | call_key = id(self), _get_ident() 1252 | if call_key in _repr_running: 1253 | return '...' 1254 | _repr_running[call_key] = 1 1255 | try: 1256 | if not self: 1257 | return '%s()' % (self.__class__.__name__,) 1258 | return '%s(%r)' % (self.__class__.__name__, self.items()) 1259 | finally: 1260 | del _repr_running[call_key] 1261 | 1262 | def __reduce__(self): 1263 | 'Return state information for pickling' 1264 | items = [[k, self[k]] for k in self] 1265 | inst_dict = vars(self).copy() 1266 | for k in vars(OrderedDict()): 1267 | inst_dict.pop(k, None) 1268 | if inst_dict: 1269 | return (self.__class__, (items,), inst_dict) 1270 | return self.__class__, (items,) 1271 | 1272 | def copy(self): 1273 | 'od.copy() -> a shallow copy of od' 1274 | return self.__class__(self) 1275 | 1276 | @classmethod 1277 | def fromkeys(cls, iterable, value=None): 1278 | '''OD.fromkeys(S[, v]) -> New ordered dictionary with keys from S 1279 | and values equal to v (which defaults to None). 1280 | 1281 | ''' 1282 | d = cls() 1283 | for key in iterable: 1284 | d[key] = value 1285 | return d 1286 | 1287 | def __eq__(self, other): 1288 | '''od.__eq__(y) <==> od==y. Comparison to another OD is order-sensitive 1289 | while comparison to a regular mapping is order-insensitive. 1290 | 1291 | ''' 1292 | if isinstance(other, OrderedDict): 1293 | return len(self)==len(other) and self.items() == other.items() 1294 | return dict.__eq__(self, other) 1295 | 1296 | def __ne__(self, other): 1297 | return not self == other 1298 | 1299 | # -- the following methods are only used in Python 2.7 -- 1300 | 1301 | def viewkeys(self): 1302 | "od.viewkeys() -> a set-like object providing a view on od's keys" 1303 | return KeysView(self) 1304 | 1305 | def viewvalues(self): 1306 | "od.viewvalues() -> an object providing a view on od's values" 1307 | return ValuesView(self) 1308 | 1309 | def viewitems(self): 1310 | "od.viewitems() -> a set-like object providing a view on od's items" 1311 | return ItemsView(self) 1312 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wheel==0.23.0 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | try: 6 | from setuptools import setup 7 | except ImportError: 8 | from distutils.core import setup 9 | 10 | 11 | readme = open('README.rst').read() 12 | history = open('HISTORY.rst').read().replace('.. 
:changelog:', '') 13 | 14 | requirements = [ 15 | # TODO: put package requirements here 16 | ] 17 | 18 | test_requirements = [ 19 | # TODO: put package test requirements here 20 | ] 21 | 22 | setup( 23 | name='gff3', 24 | version='1.0.1', 25 | description='Manipulate genomic features and validate the syntax and reference sequence of your GFF3 files.', 26 | long_description=readme + '\n\n' + history, 27 | author='Han Lin', 28 | author_email='hotdogee@gmail.com', 29 | url='https://github.com/hotdogee/gff3-py', 30 | packages=[ 31 | 'gff3', 32 | ], 33 | package_dir={'gff3': 34 | 'gff3'}, 35 | include_package_data=True, 36 | install_requires=requirements, 37 | license="BSD", 38 | zip_safe=False, 39 | keywords='gff3', 40 | classifiers=[ 41 | 'Development Status :: 2 - Pre-Alpha', 42 | 'Intended Audience :: Developers', 43 | 'License :: OSI Approved :: BSD License', 44 | 'Natural Language :: English', 45 | "Programming Language :: Python :: 2", 46 | 'Programming Language :: Python :: 2.6', 47 | 'Programming Language :: Python :: 2.7', 48 | 'Programming Language :: Python :: 3', 49 | 'Programming Language :: Python :: 3.3', 50 | 'Programming Language :: Python :: 3.4', 51 | ], 52 | test_suite='tests', 53 | tests_require=test_requirements 54 | ) 55 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /tests/test_gff3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | test_gff3 6 | ---------------------------------- 7 | 8 | Tests for `gff3` module. 9 | """ 10 | 11 | import unittest 12 | 13 | from gff3 import gff3 14 | 15 | 16 | class TestGff3(unittest.TestCase): 17 | 18 | def setUp(self): 19 | pass 20 | 21 | def test_something(self): 22 | pass 23 | 24 | def tearDown(self): 25 | pass 26 | 27 | if __name__ == '__main__': 28 | unittest.main() 29 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py26, py27, py33, py34, py35, py36 3 | 4 | [testenv] 5 | setenv = 6 | PYTHONPATH = {toxinidir}:{toxinidir}/gff3 7 | commands = python setup.py test 8 | deps = 9 | -r {toxinidir}/requirements.txt 10 | --------------------------------------------------------------------------------