├── .editorconfig ├── .gitignore ├── .travis.yml ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── docs ├── Makefile ├── authors.rst ├── conf.py ├── contributing.rst ├── history.rst ├── index.rst ├── installation.rst ├── make.bat ├── readme.rst └── usage.rst ├── examples ├── annotations.gff ├── fix_pseudogene.py ├── gff_fix.py ├── gff_valid.py ├── phase_test.gff3 ├── phase_test.gff3.md └── validate.py ├── gff3 ├── __init__.py └── gff3.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py └── test_gff3.py └── tox.ini /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | htmlcov 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | # Complexity 39 | output/*.html 40 | output/*/index.html 41 | 42 | # Sphinx 43 | docs/_build 44 | 45 | # PyCharm 46 | .idea -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | language: python 4 | 5 | python: 6 | - "3.4" 7 | - "3.3" 8 | - "2.7" 9 | - "2.6" 10 | - "pypy" 11 | 12 | # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors 13 | install: pip install -r requirements.txt 14 | 15 | # command to run tests, e.g. python setup.py test 16 | script: python setup.py test 17 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * Han Lin 9 | 10 | Contributors 11 | ------------ 12 | 13 | None yet. Why not be the first? 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributing 3 | ============ 4 | 5 | Contributions are welcome, and they are greatly appreciated! Every 6 | little bit helps, and credit will always be given. 7 | 8 | You can contribute in many ways: 9 | 10 | Types of Contributions 11 | ---------------------- 12 | 13 | Report Bugs 14 | ~~~~~~~~~~~ 15 | 16 | Report bugs at https://github.com/hotdogee/gff3-py/issues. 17 | 18 | If you are reporting a bug, please include: 19 | 20 | * Your operating system name and version. 
21 | * Any details about your local setup that might be helpful in troubleshooting.
22 | * Detailed steps to reproduce the bug.
23 | 
24 | Fix Bugs
25 | ~~~~~~~~
26 | 
27 | Look through the GitHub issues for bugs. Anything tagged with "bug"
28 | is open to whoever wants to implement it.
29 | 
30 | Implement Features
31 | ~~~~~~~~~~~~~~~~~~
32 | 
33 | Look through the GitHub issues for features. Anything tagged with "feature"
34 | is open to whoever wants to implement it.
35 | 
36 | Write Documentation
37 | ~~~~~~~~~~~~~~~~~~~
38 | 
39 | gff3 could always use more documentation, whether as part of the
40 | official gff3 docs, in docstrings, or even on the web in blog posts,
41 | articles, and such.
42 | 
43 | Submit Feedback
44 | ~~~~~~~~~~~~~~~
45 | 
46 | The best way to send feedback is to file an issue at https://github.com/hotdogee/gff3-py/issues.
47 | 
48 | If you are proposing a feature:
49 | 
50 | * Explain in detail how it would work.
51 | * Keep the scope as narrow as possible, to make it easier to implement.
52 | * Remember that this is a volunteer-driven project, and that contributions
53 |   are welcome :)
54 | 
--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
 1 | .. :changelog:
 2 | 
 3 | History
 4 | -------
 5 | 
 6 | 1.0.1 (2021-11-12)
 7 | ---------------------
 8 | 
 9 | * New source distribution uploaded
10 | 
11 | 1.0.0 (2018-12-01)
12 | ---------------------
13 | 
14 | * Fix Python 3 issues
15 | * Added sequence functions: complement(seq) and translate(seq)
16 | * Added fasta write function: fasta_dict_to_file(fasta_dict, fasta_file, line_char_limit=None)
17 | * Added Gff method to return the sequence of line_data: sequence(self, line_data, child_type=None, reference=None)
18 | * Gff.write no longer prints a redundant '###' when the whole gene is marked as removed
19 | 
20 | 
21 | 0.3.0 (2015-03-10)
22 | ---------------------
23 | 
24 | * Fixed phase checking.
25 | 
26 | 0.2.0 (2015-01-28)
27 | ---------------------
28 | 
29 | * Supports python 2.6, 2.7, 3.3, 3.4, pypy.
30 | * Don't report empty attributes as errors.
31 | * Improved documentation.
32 | 
33 | 0.1.0 (2014-12-11)
34 | ---------------------
35 | 
36 | * First release on PyPI.
37 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014, Han Lin
 2 | 
 3 | Permission to use, copy, modify, and/or distribute this software for any
 4 | purpose with or without fee is hereby granted, provided that the above
 5 | copyright notice and this permission notice appear in all copies.
 6 | 
 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
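HISTORY.rst above lists the 1.0.0 sequence helpers by signature only. Below is a minimal usage sketch, assuming ``complement(seq)`` and ``translate(seq)`` are importable from the ``gff3`` package (their exact module location is not shown in this dump) and reusing the ``annotations.gff``/``annotations.fa`` inputs from the README examples:

.. code:: python

    # sequence_demo.py -- hypothetical example, not part of the repository
    from gff3 import Gff3, complement, translate  # import locations assumed

    gff = Gff3('annotations.gff')               # parse and syntax-check the GFF3
    gff.parse_fasta_external('annotations.fa')  # load the reference FASTA
    # pick the first mRNA feature line
    mrna = next(line for line in gff.lines
                if line['line_type'] == 'feature' and line['type'] == 'mRNA')
    # sequence() returns the sequence of a feature; with child_type='CDS' it
    # joins the child CDS segments (signature from HISTORY.rst:
    # sequence(self, line_data, child_type=None, reference=None))
    cds_seq = gff.sequence(mrna, child_type='CDS')
    protein = translate(cds_seq)  # translate the coding sequence
    comp = complement(cds_seq)    # complement of the coding strand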
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.rst 6 | 7 | recursive-include tests * 8 | recursive-exclude * __pycache__ 9 | recursive-exclude * *.py[co] 10 | 11 | recursive-include docs *.rst conf.py Makefile make.bat 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-pyc clean-build docs clean 2 | 3 | help: 4 | @echo "clean - remove all build, test, coverage and Python artifacts" 5 | @echo "clean-build - remove build artifacts" 6 | @echo "clean-pyc - remove Python file artifacts" 7 | @echo "clean-test - remove test and coverage artifacts" 8 | @echo "lint - check style with flake8" 9 | @echo "test - run tests quickly with the default Python" 10 | @echo "test-all - run tests on every Python version with tox" 11 | @echo "coverage - check code coverage quickly with the default Python" 12 | @echo "docs - generate Sphinx HTML documentation, including API docs" 13 | @echo "release - package and upload a release" 14 | @echo "dist - package" 15 | 16 | clean: clean-build clean-pyc clean-test 17 | 18 | clean-build: 19 | rm -fr build/ 20 | rm -fr dist/ 21 | rm -fr *.egg-info 22 | 23 | clean-pyc: 24 | find . -name '*.pyc' -exec rm -f {} + 25 | find . -name '*.pyo' -exec rm -f {} + 26 | find . -name '*~' -exec rm -f {} + 27 | find . -name '__pycache__' -exec rm -fr {} + 28 | 29 | clean-test: 30 | rm -fr .tox/ 31 | rm -f .coverage 32 | rm -fr htmlcov/ 33 | 34 | lint: 35 | flake8 gff3 tests 36 | 37 | test: 38 | python setup.py test 39 | 40 | test-all: 41 | tox 42 | 43 | coverage: 44 | coverage run --source gff3 setup.py test 45 | coverage report -m 46 | coverage html 47 | open htmlcov/index.html 48 | 49 | docs: 50 | rm -f docs/gff3.rst 51 | rm -f docs/modules.rst 52 | sphinx-apidoc -o docs/ gff3 53 | $(MAKE) -C docs clean 54 | $(MAKE) -C docs html 55 | open docs/_build/html/index.html 56 | 57 | release: clean 58 | python setup.py sdist upload 59 | python setup.py bdist_wheel upload 60 | 61 | dist: clean 62 | python setup.py sdist 63 | python setup.py bdist_wheel 64 | ls -l dist 65 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | gff3-py 3 | =============================== 4 | 5 | .. image:: https://badge.fury.io/py/gff3.png 6 | :target: http://badge.fury.io/py/gff3 7 | 8 | .. image:: https://travis-ci.org/hotdogee/gff3-py.png?branch=master 9 | :target: https://travis-ci.org/hotdogee/gff3-py 10 | 11 | .. image:: https://pypip.in/d/gff3/badge.png 12 | :target: https://pypi.python.org/pypi/gff3 13 | 14 | 15 | Manipulate genomic features and validate the syntax and reference sequence of your |GFF3|_ files. 16 | 17 | * Free software: BSD license 18 | * Documentation: https://gff3-py.readthedocs.org. 19 | 20 | Features 21 | -------- 22 | 23 | * **Simple data structures**: Parses a |GFF3|_ file into a structure composed of simple python |dict|_ and |list|_. 24 | * **Validation**: Validates the |GFF3|_ syntax on parse, and saves the error messages in the parsed structure. 
25 | * **Best effort parsing**: Despite any detected errors, continue to parse the whole file and make as much sense of it as possible.
26 | * Uses the python |logging|_ library to log error messages with support for custom loggers.
27 | * Parses embedded or external |FASTA|_ sequences to check bounds and the number of ``N`` s.
28 | * Check and correct the phase for ``CDS`` features.
29 | * Tree traversal methods ``ancestors`` and ``descendants`` return a simple ``list`` in breadth-first search order.
30 | * Transfer children and parents using the ``adopt`` and ``adopted`` methods.
31 | * Test for overlapping features using the ``overlap`` method.
32 | * Remove a feature and its associated features using the ``remove`` method.
33 | * Write the modified structure to a GFF3 file using the ``write`` method.
34 | 
35 | Quick Start
36 | -----------
37 | 
38 | An example that just parses a GFF3 file named ``annotations.gff`` and validates it
39 | using an external FASTA file named ``annotations.fa`` looks like:
40 | 
41 | .. code:: python
42 | 
43 |     # validate.py
44 |     # ============
45 |     from gff3 import Gff3
46 | 
47 |     # initialize a Gff3 object
48 |     gff = Gff3()
49 |     # parse the GFF3 file and do syntax checking; this populates gff.lines and gff.features
50 |     # if an embedded ##FASTA directive is found, parse the sequences into gff.fasta_embedded
51 |     gff.parse('annotations.gff')
52 |     # parse the external FASTA file into gff.fasta_external
53 |     gff.parse_fasta_external('annotations.fa')
54 |     # Check seqid, bounds and the number of Ns in each feature using one or more reference sources
55 |     gff.check_reference(allowed_num_of_n=0, feature_types=['CDS'])
56 |     # Checks whether child features are within the coordinate boundaries of parent features
57 |     gff.check_parent_boundary()
58 |     # Calculates the correct phase and checks if it matches the given phase for CDS features
59 |     gff.check_phase()
60 | 
61 | A more feature-complete GFF3 validator with a command-line interface, which also generates a
62 | validation report in Markdown, is available at ``examples/gff_valid.py``.
63 | 
64 | The following example demonstrates how to filter, traverse, and modify the parsed gff3 ``lines`` list:
65 | 
66 | 1. Change features with type ``exon`` to ``pseudogenic_exon`` and type ``transcript`` to ``pseudogenic_transcript`` if the feature has an ancestor of type ``pseudogene``.
67 | 
68 | 2. If a ``pseudogene`` feature overlaps with a ``gene`` feature, move all of the children from the ``pseudogene`` feature to the ``gene`` feature, and remove the ``pseudogene`` feature.
69 | 
70 | .. code:: python
71 | 
72 |     # fix_pseudogene.py
73 |     # =================
74 |     from gff3 import Gff3
75 |     gff = Gff3('annotations.gff')
76 |     type_map = {'exon': 'pseudogenic_exon', 'transcript': 'pseudogenic_transcript'}
77 |     pseudogenes = [line for line in gff.lines if line['line_type'] == 'feature' and line['type'] == 'pseudogene']
78 |     for pseudogene in pseudogenes:
79 |         # convert types
80 |         for line in gff.descendants(pseudogene):
81 |             if line['type'] in type_map:
82 |                 line['type'] = type_map[line['type']]
83 |         # find overlapping gene
84 |         overlapping_genes = [line for line in gff.lines if line['line_type'] == 'feature' and line['type'] == 'gene' and gff.overlap(line, pseudogene)]
85 |         if overlapping_genes:
86 |             # move pseudogene children to overlapping gene
87 |             gff.adopt(pseudogene, overlapping_genes[0])
88 |             # remove pseudogene
89 |             gff.remove(pseudogene)
90 |     gff.write('annotations_fixed.gff')
91 | 
92 | .. |GFF3| replace:: ``GFF3``
93 | .. 
|dict| replace:: ``dict`` 94 | .. |list| replace:: ``list`` 95 | .. |logging| replace:: ``logging`` 96 | .. |FASTA| replace:: ``FASTA`` 97 | 98 | .. _GFF3: http://www.sequenceontology.org/gff3.shtml 99 | .. _dict: https://docs.python.org/2/tutorial/datastructures.html#dictionaries 100 | .. _list: https://docs.python.org/2/tutorial/datastructures.html#more-on-lists 101 | .. _logging: https://docs.python.org/2/library/logging.html 102 | .. _FASTA: http://en.wikipedia.org/wiki/FASTA_format 103 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. 
The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/gff3.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/gff3.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/gff3" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/gff3" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. 
The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # gff3 documentation build configuration file, created by 5 | # sphinx-quickstart on Tue Jul 9 22:26:36 2013. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import sys 17 | import os 18 | 19 | # If extensions (or modules to document with autodoc) are in another 20 | # directory, add these directories to sys.path here. If the directory is 21 | # relative to the documentation root, use os.path.abspath to make it 22 | # absolute, like shown here. 23 | #sys.path.insert(0, os.path.abspath('.')) 24 | 25 | # Get the project root dir, which is the parent dir of this 26 | cwd = os.getcwd() 27 | project_root = os.path.dirname(cwd) 28 | 29 | # Insert the project root dir as the first element in the PYTHONPATH. 30 | # This lets us ensure that the source package is imported, and that its 31 | # version is used. 32 | sys.path.insert(0, project_root) 33 | 34 | import gff3 35 | 36 | # -- General configuration --------------------------------------------- 37 | 38 | # If your documentation needs a minimal Sphinx version, state it here. 39 | #needs_sphinx = '1.0' 40 | 41 | # Add any Sphinx extension module names here, as strings. They can be 42 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 43 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] 44 | 45 | # Add any paths that contain templates here, relative to this directory. 46 | templates_path = ['_templates'] 47 | 48 | # The suffix of source filenames. 49 | source_suffix = '.rst' 50 | 51 | # The encoding of source files. 52 | #source_encoding = 'utf-8-sig' 53 | 54 | # The master toctree document. 55 | master_doc = 'index' 56 | 57 | # General information about the project. 
58 | project = u'gff3-py' 59 | copyright = u'2014, Han Lin' 60 | 61 | # The version info for the project you're documenting, acts as replacement 62 | # for |version| and |release|, also used in various other places throughout 63 | # the built documents. 64 | # 65 | # The short X.Y version. 66 | version = gff3.__version__ 67 | # The full version, including alpha/beta/rc tags. 68 | release = gff3.__version__ 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | #language = None 73 | 74 | # There are two options for replacing |today|: either, you set today to 75 | # some non-false value, then it is used: 76 | #today = '' 77 | # Else, today_fmt is used as the format for a strftime call. 78 | #today_fmt = '%B %d, %Y' 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | exclude_patterns = ['_build'] 83 | 84 | # The reST default role (used for this markup: `text`) to use for all 85 | # documents. 86 | #default_role = None 87 | 88 | # If true, '()' will be appended to :func: etc. cross-reference text. 89 | #add_function_parentheses = True 90 | 91 | # If true, the current module name will be prepended to all description 92 | # unit titles (such as .. function::). 93 | #add_module_names = True 94 | 95 | # If true, sectionauthor and moduleauthor directives will be shown in the 96 | # output. They are ignored by default. 97 | #show_authors = False 98 | 99 | # The name of the Pygments (syntax highlighting) style to use. 100 | pygments_style = 'sphinx' 101 | 102 | # A list of ignored prefixes for module index sorting. 103 | #modindex_common_prefix = [] 104 | 105 | # If true, keep warnings as "system message" paragraphs in the built 106 | # documents. 107 | #keep_warnings = False 108 | 109 | 110 | # -- Options for HTML output ------------------------------------------- 111 | 112 | # The theme to use for HTML and HTML Help pages. See the documentation for 113 | # a list of builtin themes. 114 | html_theme = 'default' 115 | 116 | # Theme options are theme-specific and customize the look and feel of a 117 | # theme further. For a list of options available for each theme, see the 118 | # documentation. 119 | #html_theme_options = {} 120 | 121 | # Add any paths that contain custom themes here, relative to this directory. 122 | #html_theme_path = [] 123 | 124 | # The name for this set of Sphinx documents. If None, it defaults to 125 | # " v documentation". 126 | #html_title = None 127 | 128 | # A shorter title for the navigation bar. Default is the same as 129 | # html_title. 130 | #html_short_title = None 131 | 132 | # The name of an image file (relative to this directory) to place at the 133 | # top of the sidebar. 134 | #html_logo = None 135 | 136 | # The name of an image file (within the static path) to use as favicon 137 | # of the docs. This file should be a Windows icon file (.ico) being 138 | # 16x16 or 32x32 pixels large. 139 | #html_favicon = None 140 | 141 | # Add any paths that contain custom static files (such as style sheets) 142 | # here, relative to this directory. They are copied after the builtin 143 | # static files, so a file named "default.css" will overwrite the builtin 144 | # "default.css". 145 | html_static_path = ['_static'] 146 | 147 | # If not '', a 'Last updated on:' timestamp is inserted at every page 148 | # bottom, using the given strftime format. 
149 | #html_last_updated_fmt = '%b %d, %Y' 150 | 151 | # If true, SmartyPants will be used to convert quotes and dashes to 152 | # typographically correct entities. 153 | #html_use_smartypants = True 154 | 155 | # Custom sidebar templates, maps document names to template names. 156 | #html_sidebars = {} 157 | 158 | # Additional templates that should be rendered to pages, maps page names 159 | # to template names. 160 | #html_additional_pages = {} 161 | 162 | # If false, no module index is generated. 163 | #html_domain_indices = True 164 | 165 | # If false, no index is generated. 166 | #html_use_index = True 167 | 168 | # If true, the index is split into individual pages for each letter. 169 | #html_split_index = False 170 | 171 | # If true, links to the reST sources are added to the pages. 172 | #html_show_sourcelink = True 173 | 174 | # If true, "Created using Sphinx" is shown in the HTML footer. 175 | # Default is True. 176 | #html_show_sphinx = True 177 | 178 | # If true, "(C) Copyright ..." is shown in the HTML footer. 179 | # Default is True. 180 | #html_show_copyright = True 181 | 182 | # If true, an OpenSearch description file will be output, and all pages 183 | # will contain a tag referring to it. The value of this option 184 | # must be the base URL from which the finished HTML is served. 185 | #html_use_opensearch = '' 186 | 187 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 188 | #html_file_suffix = None 189 | 190 | # Output file base name for HTML help builder. 191 | htmlhelp_basename = 'gff3doc' 192 | 193 | 194 | # -- Options for LaTeX output ------------------------------------------ 195 | 196 | latex_elements = { 197 | # The paper size ('letterpaper' or 'a4paper'). 198 | #'papersize': 'letterpaper', 199 | 200 | # The font size ('10pt', '11pt' or '12pt'). 201 | #'pointsize': '10pt', 202 | 203 | # Additional stuff for the LaTeX preamble. 204 | #'preamble': '', 205 | } 206 | 207 | # Grouping the document tree into LaTeX files. List of tuples 208 | # (source start file, target name, title, author, documentclass 209 | # [howto/manual]). 210 | latex_documents = [ 211 | ('index', 'gff3.tex', 212 | u'gff3-py Documentation', 213 | u'Han Lin', 'manual'), 214 | ] 215 | 216 | # The name of an image file (relative to this directory) to place at 217 | # the top of the title page. 218 | #latex_logo = None 219 | 220 | # For "manual" documents, if this is true, then toplevel headings 221 | # are parts, not chapters. 222 | #latex_use_parts = False 223 | 224 | # If true, show page references after internal links. 225 | #latex_show_pagerefs = False 226 | 227 | # If true, show URL addresses after external links. 228 | #latex_show_urls = False 229 | 230 | # Documents to append as an appendix to all manuals. 231 | #latex_appendices = [] 232 | 233 | # If false, no module index is generated. 234 | #latex_domain_indices = True 235 | 236 | 237 | # -- Options for manual page output ------------------------------------ 238 | 239 | # One entry per manual page. List of tuples 240 | # (source start file, name, description, authors, manual section). 241 | man_pages = [ 242 | ('index', 'gff3', 243 | u'gff3-py Documentation', 244 | [u'Han Lin'], 1) 245 | ] 246 | 247 | # If true, show URL addresses after external links. 248 | #man_show_urls = False 249 | 250 | 251 | # -- Options for Texinfo output ---------------------------------------- 252 | 253 | # Grouping the document tree into Texinfo files. 
List of tuples 254 | # (source start file, target name, title, author, 255 | # dir menu entry, description, category) 256 | texinfo_documents = [ 257 | ('index', 'gff3', 258 | u'gff3-py Documentation', 259 | u'Han Lin', 260 | 'gff3', 261 | 'One line description of project.', 262 | 'Miscellaneous'), 263 | ] 264 | 265 | # Documents to append as an appendix to all manuals. 266 | #texinfo_appendices = [] 267 | 268 | # If false, no module index is generated. 269 | #texinfo_domain_indices = True 270 | 271 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 272 | #texinfo_show_urls = 'footnote' 273 | 274 | # If true, do not generate a @detailmenu in the "Top" node's menu. 275 | #texinfo_no_detailmenu = False 276 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../HISTORY.rst 2 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. gff3 documentation master file, created by 2 | sphinx-quickstart on Tue Jul 9 22:26:36 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to gff3-py's documentation! 7 | ====================================== 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | readme 15 | installation 16 | usage 17 | contributing 18 | authors 19 | history 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | 28 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | At the command line:: 6 | 7 | $ easy_install gff3 8 | 9 | Or, if you have virtualenvwrapper installed:: 10 | 11 | $ mkvirtualenv gff3 12 | $ pip install gff3 13 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. 
epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\gff3.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\gff3.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 
137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Usage 3 | ======== 4 | 5 | To use gff3-py in a project:: 6 | 7 | from gff3 import Gff3 8 | 9 | 10 | .. 
autoclass:: gff3.Gff3
11 |     :members:
--------------------------------------------------------------------------------
/examples/fix_pseudogene.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | # try to import from project first
 3 | from os.path import dirname
 4 | sys.path.insert(1, dirname(dirname(__file__)))
 5 | from gff3 import Gff3
 6 | 
 7 | gff = Gff3('annotations.gff')
 8 | type_map = {'exon': 'pseudogenic_exon', 'transcript': 'pseudogenic_transcript'}
 9 | pseudogenes = [line for line in gff.lines if line['line_type'] == 'feature' and line['type'] == 'pseudogene']
10 | for pseudogene in pseudogenes:
11 |     # convert types
12 |     for line in gff.descendants(pseudogene):
13 |         if line['type'] in type_map:
14 |             line['type'] = type_map[line['type']]
15 |     # find overlapping gene
16 |     overlapping_genes = [line for line in gff.lines if line['line_type'] == 'feature' and line['type'] == 'gene' and gff.overlap(line, pseudogene)]
17 |     if overlapping_genes:
18 |         # move pseudogene children to overlapping gene
19 |         gff.adopt(pseudogene, overlapping_genes[0])
20 |         # remove pseudogene
21 |         gff.remove(pseudogene)
22 | gff.write('annotations_fixed.gff')
--------------------------------------------------------------------------------
/examples/gff_fix.py:
--------------------------------------------------------------------------------
 1 | #! /usr/local/bin/python2.7
 2 | # Copyright (C) 2014 Han Lin
 3 | #
 4 | # This program is free software; you can redistribute it and/or modify
 5 | # it under the terms of the GNU General Public License as published by
 6 | # the Free Software Foundation; either version 3 of the License, or
 7 | # (at your option) any later version.
 8 | 
 9 | """
10 | Check a GFF3 file for errors and unwanted features, with an option to correct the errors and output a valid GFF3 file.
11 | 
12 | Count the number of Ns in each feature and remove features with an N count greater than the specified threshold. (Requires FASTA)
13 | Check and remove features with end coordinates larger than the landmark sequence length. (Requires FASTA or ##sequence-region)
14 | Check if the ##sequence-region matches the FASTA file. (Requires FASTA and ##sequence-region)
15 | Add the ##sequence-region directives if missing. (Requires FASTA)
16 | Check and correct the phase for CDS features.
17 | 
18 | Changelog:
19 | """
20 | 
21 | import sys
22 | from collections import OrderedDict
23 | from collections import defaultdict
24 | from itertools import groupby
25 | from urllib import quote, unquote
26 | from textwrap import wrap
27 | import re
28 | import logging
29 | # try to import from project first
30 | from os.path import dirname
31 | sys.path.insert(1, dirname(dirname(__file__)))
32 | from gff3 import Gff3
33 | 
34 | __version__ = '1.0'
35 | 
36 | 
37 | def query_yes_no(question, default='yes'):
38 |     """Ask a yes/no question via raw_input() and return their answer.
39 | 
40 |     'question' is a string that is presented to the user.
41 |     'default' is the presumed answer if the user just hits <Enter>.
42 |     It must be 'yes' (the default), 'no' or None (meaning
43 |     an answer is required of the user).
44 | 
45 |     The 'answer' return value is one of 'yes' or 'no'.
46 | """ 47 | valid = {'yes': True, 'y': True, 'ye': True, 48 | 'no': False, 'n': False} 49 | if default is None: 50 | prompt = ' [y/n] ' 51 | elif default == 'yes': 52 | prompt = ' [Y/n] ' 53 | elif default == 'no': 54 | prompt = ' [y/N] ' 55 | else: 56 | raise ValueError('invalid default answer: "%s"' % default) 57 | 58 | while True: 59 | sys.stderr.write(question + prompt) 60 | choice = raw_input().strip().lower() 61 | if default is not None and choice == '': 62 | return valid[default] 63 | elif choice in valid: 64 | return valid[choice] 65 | else: 66 | sys.stderr.write('Please respond with "y" or "n".\n') 67 | # gff_valid.py < annotations.gff > annotations.gff.validation_report 68 | # gff_valid.py -g agla_v1_1_NALmod.gff3 > agla_v1_1_NALmod.gff3.validation_report.md 69 | # gff_valid.py -g clec_v1_1_NALmod.gff3 > clec_v1_1_NALmod.gff3.validation_report.md 70 | # gff_valid.py -g ofas_v1_1_NALmod.gff3 > ofas_v1_1_NALmod.gff3.validation_report.md 71 | if __name__ == '__main__': 72 | logger_stderr = logging.getLogger(__name__+'stderr') 73 | logger_stderr.setLevel(logging.INFO) 74 | stderr_handler = logging.StreamHandler() 75 | stderr_handler.setFormatter(logging.Formatter('%(levelname)-8s %(message)s')) 76 | logger_stderr.addHandler(stderr_handler) 77 | logger_null = logging.getLogger(__name__+'null') 78 | null_handler = logging.NullHandler() 79 | logger_null.addHandler(null_handler) 80 | import argparse 81 | from textwrap import dedent 82 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=dedent("""\ 83 | Validate a GFF3 file for syntax and formating errors, parent relationship and reference sequence sanity. 84 | 85 | Features: 86 | 1. Check syntax and formatting according to gff3 version 1.21. 87 | 2. Count the number of Ns greater than the specified threshold (default: 0) in specified feature types (default: CDS). (Requires FASTA) 88 | 3. Check for features with an end coordinates larger than the landmark sequence length. (Requires FASTA or ##sequence-region) 89 | 4. Check if the ##sequence-region matches the FASTA file. (Requires FASTA and ##sequence-region) 90 | 5. Check whether child features are within the coordinate boundaries of parent features. 91 | 6. Check for the correct phase of CDS features. 92 | 93 | Inputs: 94 | 1. GFF3: reads from STDIN by default, may specify the file name with the -g argument 95 | 2. (optional) FASTA: specify the file name with the -f argument, will use the embedded ##FASTA in the GFF3 file if the external FASTA file is not specified 96 | 97 | Outputs: 98 | 1. MarkDown: contains validation summary and detail sections, writes to STDOUT by default, may specify the file name with the -r argument 99 | 100 | Examples: 101 | 1. Use default arguments, inout and output redirection: 102 | %(prog)s < a.gff > a_validation_report.txt 103 | 2. Specify the input, output file names and options using short arguments: 104 | %(prog)s -g a.gff -f a.fa -n 5 -t CDS exon -r a_validation_report.txt 105 | 3. 
Specify the input, output file names and options using long arguments: 106 | %(prog)s --gff_file a.gff --fasta_file a.fa --allowed_num_of_n 0 --check_n_feature_types CDS --report_file a_validation_report.txt 107 | """)) 108 | parser.add_argument('-g', '--gff_file', type=str, help='GFF3 file to validate (default: STDIN)') 109 | parser.add_argument('-f', '--fasta_file', type=str, help='The external reference FASTA file for the GFF3 files, has precedence over the ##FASTA section if both exist (default: None)') 110 | parser.add_argument('-n', '--allowed_num_of_n', type=int, default=0, 111 | help='Max number of Ns allowed in a feature, anything more will be reported as an error (default: 0)') 112 | parser.add_argument('-t', '--check_n_feature_types', nargs='*', default=['CDS'], 113 | help='Count the number of Ns in each feature with the type specified, multiple types may be specified, ex: -t CDS exon (default: "CDS")') 114 | parser.add_argument('-r', '--report_file', type=str, help='Validation report file (default: STDOUT)') 115 | parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__) 116 | 117 | test_lv = 1 # debug 118 | if test_lv == 0: 119 | args = parser.parse_args(['-g', 'annotations.gff']) 120 | else: 121 | args = parser.parse_args() 122 | 123 | if args.gff_file: 124 | logger_stderr.info('Checking GFF3 file (%s)...', args.gff_file) 125 | elif not sys.stdin.isatty(): # if STDIN connected to pipe or file 126 | args.gff_file = sys.stdin 127 | logger_stderr.info('Reading from STDIN...') 128 | else: # no input 129 | parser.print_help() 130 | sys.exit(1) 131 | 132 | logger_stderr.info('Checking syntax and formatting...') 133 | gff3 = Gff3(gff_file=args.gff_file, fasta_external=args.fasta_file, logger=logger_null) 134 | logger_stderr.info('Checking reference seqid, bounds and N count...') 135 | gff3.check_reference(allowed_num_of_n=args.allowed_num_of_n, feature_types=args.check_n_feature_types) 136 | logger_stderr.info('Checking parent boundaries...') 137 | gff3.check_parent_boundary() 138 | 139 | gff3.check_phase() 140 | 141 | if args.report_file: 142 | logger_stderr.info('Writing validation report (%s)...', args.report_file) 143 | report_fh = open(args.report_file, 'wb') 144 | else: 145 | report_fh = sys.stdout 146 | 147 | # Validation Summary 148 | report_fh.write('# GFF3 Validation Report') 149 | if args.gff_file and sys.stdin.isatty(): 150 | report_fh.write(': {0:s}'.format(args.gff_file)) 151 | report_fh.write('\n\n') 152 | 153 | report_fh.write('# Validation Summary\n') 154 | error_lines = [line for line in gff3.lines if line['line_errors']] 155 | if len(error_lines) == 0: 156 | report_fh.write('* Found 0 errors\n') 157 | else: 158 | error_list = [error for line in error_lines for error in line['line_errors']] 159 | error_types = sorted(list(set([error['error_type'] for error in error_list]))) 160 | for error_type in error_types: 161 | report_fh.write('* Found {0:d} {1:s} errors in {2:d} lines\n'.format( 162 | len([error for error in error_list if error['error_type'] == error_type]), error_type, 163 | len([line for line in error_lines if [error for error in line['line_errors'] if error['error_type'] == error_type]]))) 164 | 165 | report_fh.write('\n') 166 | report_fh.write('# Detected Errors\n') 167 | for line in error_lines: 168 | report_fh.write('* Line {0:d}: {1:s}\n'.format(line['line_index'] + 1, line['line_raw'].strip())) 169 | for error in line['line_errors']: 170 | report_fh.write('\t- {error_type}: 
{message}\n'.format(error_type=error['error_type'], message=error['message']))
171 | 
172 |     if args.report_file:
173 |         report_fh.close()
--------------------------------------------------------------------------------
/examples/gff_valid.py:
--------------------------------------------------------------------------------
 1 | #! /usr/local/bin/python2.7
 2 | # Copyright (C) 2014 Han Lin
 3 | #
 4 | # This program is free software; you can redistribute it and/or modify
 5 | # it under the terms of the GNU General Public License as published by
 6 | # the Free Software Foundation; either version 3 of the License, or
 7 | # (at your option) any later version.
 8 | 
 9 | """
10 | Check a GFF3 file for errors and output a validation report in Markdown.
11 | 
12 | Count the number of Ns in each feature and report features with an N count greater than the specified threshold. (Requires FASTA)
13 | Check for features with end coordinates larger than the landmark sequence length. (Requires FASTA or ##sequence-region)
14 | Check if the ##sequence-region matches the FASTA file. (Requires FASTA and ##sequence-region)
15 | Check that the ##sequence-region directives are present. (Requires FASTA)
16 | Check the phase of CDS features.
17 | 
18 | Changelog:
19 | """
20 | 
21 | import sys
22 | import re
23 | import logging
24 | from collections import OrderedDict
25 | from collections import defaultdict
26 | from itertools import groupby
27 | from urllib import quote, unquote
28 | from textwrap import wrap
29 | # try to import from project first
30 | from os.path import dirname
31 | sys.path.insert(1, dirname(dirname(__file__)))
32 | from gff3 import Gff3
33 | 
34 | __version__ = '1.1'
35 | 
36 | 
37 | def query_yes_no(question, default='yes'):
38 |     """Ask a yes/no question via raw_input() and return their answer.
39 | 
40 |     'question' is a string that is presented to the user.
41 |     'default' is the presumed answer if the user just hits <Enter>.
42 |     It must be 'yes' (the default), 'no' or None (meaning
43 |     an answer is required of the user).
44 | 
45 |     The 'answer' return value is one of 'yes' or 'no'.
46 | """ 47 | valid = {'yes': True, 'y': True, 'ye': True, 48 | 'no': False, 'n': False} 49 | if default is None: 50 | prompt = ' [y/n] ' 51 | elif default == 'yes': 52 | prompt = ' [Y/n] ' 53 | elif default == 'no': 54 | prompt = ' [y/N] ' 55 | else: 56 | raise ValueError('invalid default answer: "%s"' % default) 57 | 58 | while True: 59 | sys.stderr.write(question + prompt) 60 | choice = raw_input().strip().lower() 61 | if default is not None and choice == '': 62 | return valid[default] 63 | elif choice in valid: 64 | return valid[choice] 65 | else: 66 | sys.stderr.write('Please respond with "y" or "n".\n') 67 | # gff_valid.py < annotations.gff > annotations.gff.validation_report 68 | # gff_valid.py -g agla_v1_1_NALmod.gff3 > agla_v1_1_NALmod.gff3.validation_report.md 69 | # gff_valid.py -g clec_v1_1_NALmod.gff3 > clec_v1_1_NALmod.gff3.validation_report.md 70 | # gff_valid.py -g ofas_v1_1_NALmod.gff3 > ofas_v1_1_NALmod.gff3.validation_report.md 71 | if __name__ == '__main__': 72 | logger_stderr = logging.getLogger(__name__+'stderr') 73 | logger_stderr.setLevel(logging.INFO) 74 | stderr_handler = logging.StreamHandler() 75 | stderr_handler.setFormatter(logging.Formatter('%(levelname)-8s %(message)s')) 76 | logger_stderr.addHandler(stderr_handler) 77 | logger_null = logging.getLogger(__name__+'null') 78 | null_handler = logging.NullHandler() 79 | logger_null.addHandler(null_handler) 80 | import argparse 81 | from textwrap import dedent 82 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=dedent("""\ 83 | Validate a GFF3 file for syntax and formating errors, parent relationship and reference sequence sanity. 84 | 85 | Features: 86 | 1. Check syntax and formatting according to gff3 version 1.21. 87 | 2. Count the number of Ns greater than the specified threshold (default: 0) in specified feature types (default: CDS). (Requires FASTA) 88 | 3. Check for features with an end coordinates larger than the landmark sequence length. (Requires FASTA or ##sequence-region) 89 | 4. Check if the ##sequence-region matches the FASTA file. (Requires FASTA and ##sequence-region) 90 | 5. Check whether child features are within the coordinate boundaries of parent features. 91 | 6. Check for the correct phase of CDS features. 92 | 93 | Inputs: 94 | 1. GFF3: reads from STDIN by default, may specify the file name with the -g argument 95 | 2. (optional) FASTA: specify the file name with the -f argument, will use the embedded ##FASTA in the GFF3 file if the external FASTA file is not specified 96 | 97 | Outputs: 98 | 1. MarkDown: contains validation summary and detail sections, writes to STDOUT by default, may specify the file name with the -r argument 99 | 100 | Examples: 101 | 1. Use default arguments, inout and output redirection: 102 | %(prog)s < a.gff > a_validation_report.txt 103 | 2. Specify the input, output file names and options using short arguments: 104 | %(prog)s -g a.gff -f a.fa -n 5 -t CDS exon -r a_validation_report.txt 105 | 3. 
Specify the input, output file names and options using long arguments: 106 | %(prog)s --gff_file a.gff --fasta_file a.fa --allowed_num_of_n 0 --check_n_feature_types CDS --report_file a_validation_report.txt 107 | """)) 108 | parser.add_argument('-g', '--gff_file', type=str, help='GFF3 file to validate (default: STDIN)') 109 | parser.add_argument('-f', '--fasta_file', type=str, help='The external reference FASTA file for the GFF3 files, has precedence over the ##FASTA section if both exist (default: None)') 110 | parser.add_argument('-n', '--allowed_num_of_n', type=int, default=0, 111 | help='Max number of Ns allowed in a feature, anything more will be reported as an error (default: 0)') 112 | parser.add_argument('-t', '--check_n_feature_types', nargs='*', default=['CDS'], 113 | help='Count the number of Ns in each feature with the type specified, multiple types may be specified, ex: -t CDS exon (default: "CDS")') 114 | parser.add_argument('-r', '--report_file', type=str, help='Validation report file (default: STDOUT)') 115 | parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__) 116 | 117 | test_lv = 1 # debug 118 | if test_lv == 0: 119 | args = parser.parse_args(['-g', 'annotations.gff']) 120 | else: 121 | args = parser.parse_args() 122 | 123 | if args.gff_file: 124 | logger_stderr.info('Checking GFF3 file (%s)...', args.gff_file) 125 | elif not sys.stdin.isatty(): # if STDIN connected to pipe or file 126 | args.gff_file = sys.stdin 127 | logger_stderr.info('Reading from STDIN...') 128 | else: # no input 129 | parser.print_help() 130 | sys.exit(1) 131 | 132 | logger_stderr.info('Checking syntax and formatting...') 133 | gff3 = Gff3(gff_file=args.gff_file, fasta_external=args.fasta_file, logger=logger_null) 134 | logger_stderr.info('Checking reference seqid, bounds and N count...') 135 | gff3.check_reference(allowed_num_of_n=args.allowed_num_of_n, feature_types=args.check_n_feature_types) 136 | logger_stderr.info('Checking parent boundaries...') 137 | gff3.check_parent_boundary() 138 | 139 | gff3.check_phase() 140 | 141 | if args.report_file: 142 | logger_stderr.info('Writing validation report (%s)...', args.report_file) 143 | report_fh = open(args.report_file, 'wb') 144 | else: 145 | report_fh = sys.stdout 146 | 147 | # Validation Summary 148 | report_fh.write('# GFF3 Validation Report') 149 | if args.gff_file and sys.stdin.isatty(): 150 | report_fh.write(': {0:s}'.format(args.gff_file)) 151 | report_fh.write('\n\n') 152 | 153 | report_fh.write('# Validation Summary\n') 154 | error_lines = [line for line in gff3.lines if line['line_errors']] 155 | if len(error_lines) == 0: 156 | report_fh.write('* Found 0 errors\n') 157 | else: 158 | error_list = [error for line in error_lines for error in line['line_errors']] 159 | error_types = sorted(list(set([error['error_type'] for error in error_list]))) 160 | for error_type in error_types: 161 | report_fh.write('* Found {0:d} {1:s} errors in {2:d} lines\n'.format( 162 | len([error for error in error_list if error['error_type'] == error_type]), error_type, 163 | len([line for line in error_lines if [error for error in line['line_errors'] if error['error_type'] == error_type]]))) 164 | 165 | report_fh.write('\n') 166 | report_fh.write('# Detected Errors\n') 167 | for line in error_lines: 168 | report_fh.write('* Line {0:d}: {1:s}\n'.format(line['line_index'] + 1, line['line_raw'].strip())) 169 | for error in line['line_errors']: 170 | report_fh.write('\t- {error_type}: 
{message}\n'.format(error_type=error['error_type'], message=error['message'])) 171 | 172 | if args.report_file: 173 | report_fh.close() -------------------------------------------------------------------------------- /examples/phase_test.gff3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hotdogee/gff3-py/e09df4beb8c67efeb10d1197e3cc98dc8ce59139/examples/phase_test.gff3 -------------------------------------------------------------------------------- /examples/phase_test.gff3.md: -------------------------------------------------------------------------------- 1 | # GFF3 Validation Report 2 | 3 | # Validation Summary 4 | * Found 28 PHASE errors in 28 lines 5 | 6 | # Detected Errors 7 | * Line 50: Scaffold1 WebApollo CDS 1177308 1177459 . + 1 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 8 | - PHASE: Wrong phase 1, should be 2 9 | * Line 51: Scaffold1 WebApollo CDS 1177543 1177716 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 10 | - PHASE: Wrong phase 0, should be 2 11 | * Line 52: Scaffold1 WebApollo CDS 1178935 1179223 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 12 | - PHASE: Wrong phase 0, should be 1 13 | * Line 54: Scaffold1 WebApollo CDS 1183361 1183513 . + 2 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 14 | - PHASE: Wrong phase 2, should be 1 15 | * Line 57: Scaffold1 WebApollo CDS 1188397 1188560 . + 2 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 16 | - PHASE: Wrong phase 2, should be 1 17 | * Line 58: Scaffold1 WebApollo CDS 1189584 1189771 . + 1 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 18 | - PHASE: Wrong phase 1, should be 0 19 | * Line 59: Scaffold1 WebApollo CDS 1190237 1190428 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 20 | - PHASE: Wrong phase 0, should be 1 21 | * Line 60: Scaffold1 WebApollo CDS 1190549 1190749 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 22 | - PHASE: Wrong phase 0, should be 2 23 | * Line 61: Scaffold1 WebApollo CDS 1192019 1192263 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 24 | - PHASE: Wrong phase 0, should be 1 25 | * Line 62: Scaffold1 WebApollo CDS 1193380 1193494 . 
+ 2 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 26 | - PHASE: Wrong phase 2, should be 0 27 | * Line 63: Scaffold1 WebApollo CDS 1193579 1193770 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 28 | - PHASE: Wrong phase 0, should be 2 29 | * Line 64: Scaffold1 WebApollo CDS 1195224 1195412 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 30 | - PHASE: Wrong phase 0, should be 1 31 | * Line 65: Scaffold1 WebApollo CDS 1199147 1199311 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 32 | - PHASE: Wrong phase 0, should be 2 33 | * Line 66: Scaffold1 WebApollo CDS 1199927 1200166 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 34 | - PHASE: Wrong phase 0, should be 1 35 | * Line 67: Scaffold1 WebApollo CDS 1200664 1200876 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 36 | - PHASE: Wrong phase 0, should be 2 37 | * Line 68: Scaffold1 WebApollo CDS 1202550 1202690 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 38 | - PHASE: Wrong phase 0, should be 1 39 | * Line 69: Scaffold1 WebApollo CDS 1218961 1219161 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 40 | - PHASE: Wrong phase 0, should be 2 41 | * Line 70: Scaffold1 WebApollo CDS 1221344 1221502 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 42 | - PHASE: Wrong phase 0, should be 1 43 | * Line 71: Scaffold1 WebApollo CDS 1222068 1222331 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 44 | - PHASE: Wrong phase 0, should be 2 45 | * Line 72: Scaffold1 WebApollo CDS 1226580 1226846 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 46 | - PHASE: Wrong phase 0, should be 1 47 | * Line 73: Scaffold1 WebApollo CDS 1230949 1231157 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 48 | - PHASE: Wrong phase 0, should be 2 49 | * Line 79: Scaffold1 WebApollo CDS 1250986 1251124 . 
+ 2 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 50 | - PHASE: Wrong phase 2, should be 1 51 | * Line 80: Scaffold1 WebApollo CDS 1251226 1251384 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 52 | - PHASE: Wrong phase 0, should be 1 53 | * Line 81: Scaffold1 WebApollo CDS 1254745 1254849 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 54 | - PHASE: Wrong phase 0, should be 2 55 | * Line 82: Scaffold1 WebApollo CDS 1256286 1256392 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 56 | - PHASE: Wrong phase 0, should be 1 57 | * Line 83: Scaffold1 WebApollo CDS 1257900 1257970 . + 2 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 58 | - PHASE: Wrong phase 2, should be 0 59 | * Line 85: Scaffold1 WebApollo CDS 1270962 1271143 . + 1 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 60 | - PHASE: Wrong phase 1, should be 2 61 | * Line 86: Scaffold1 WebApollo CDS 1278339 1278592 . + 0 ID=43F646FAA11B5075698A73959D6FB383;Name=43F646FAA11B5075698A73959D6FB383;Parent=41832FDEEC226086FDE25CFA1AADAE2D;date_last_modified=2014-08-24;date_creation=2014-08-24;owner=kpanfilio 62 | - PHASE: Wrong phase 0, should be 2 63 | -------------------------------------------------------------------------------- /examples/validate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # try to import from project first 3 | from os.path import dirname 4 | sys.path.insert(1, dirname(dirname(__file__))) 5 | from gff3 import Gff3 6 | 7 | # initialize a Gff3 object 8 | gff = Gff3() 9 | # parse GFF3 file and do syntax checking, this populates gff.lines and gff.features 10 | # if an embedded ##FASTA directive is found, parse the sequences into gff.fasta_embedded 11 | gff.parse('annotations.gff') 12 | # parse the external FASTA file into gff.fasta_external 13 | #gff.parse_fasta_external('annotations.fa') 14 | # Check seqid, bounds and the number of Ns in each feature using one or more reference sources 15 | gff.check_reference(allowed_num_of_n=0, feature_types=['CDS']) 16 | # Checks whether child features are within the coordinate boundaries of parent features 17 | gff.check_parent_boundary() 18 | # Calculates the correct phase and checks if it matches the given phase for CDS features 19 | gff.check_phase() -------------------------------------------------------------------------------- /gff3/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Manipulate genomic features and validate the syntax and reference sequence of your GFF3 files""" 3 | from __future__ import absolute_import 4 | from .gff3 import Gff3 5 | __all__ = ['Gff3'] 6 | 7 | VERSION = (1, 0, 1) 8 | __version__ = 
'.'.join(map(str, VERSION[0:3])) + ''.join(VERSION[3:]) 9 | __author__ = 'Han Lin' 10 | __email__ = 'hotdogee [at] gmail [dot] com' 11 | __homepage__ = 'https://github.com/hotdogee/gff3-py' 12 | __docformat__ = 'restructuredtext' 13 | -------------------------------------------------------------------------------- /gff3/gff3.py: -------------------------------------------------------------------------------- 1 | #! /usr/local/bin/python2.7 2 | # -*- coding: utf-8 -*- 3 | # Copyright (C) 2014 Han Lin 4 | 5 | """ 6 | Check a GFF3 file for errors and unwanted features, with an option to correct the errors and output a valid GFF3 file. 7 | 8 | Count the number of Ns in each feature, remove features with N count greater than the specified threshold. (Requires FASTA) 9 | Check and remove features with an end coordinate larger than the landmark sequence length. (Requires FASTA or ##sequence-region) 10 | Check if the ##sequence-region matches the FASTA file. (Requires FASTA and ##sequence-region) 11 | Add the ##sequence-region directives if missing. (Requires FASTA) 12 | Check and correct the phase for CDS features. 13 | """ 14 | from __future__ import print_function 15 | 16 | import collections; OrderedDict = getattr(collections, 'OrderedDict', dict) # OrderedDict is used by fasta_file_to_dict below; not available in 2.6, fall back to a plain dict there 17 | from collections import defaultdict 18 | from itertools import groupby 19 | try: 20 | from urllib import quote, unquote 21 | except ImportError: 22 | from urllib.parse import quote, unquote 23 | from textwrap import wrap 24 | import sys 25 | import re 26 | import string 27 | import logging 28 | logger = logging.getLogger(__name__) 29 | #log.basicConfig(level=logging.DEBUG, format='%(levelname)-8s %(message)s') 30 | logger.setLevel(logging.INFO) 31 | if not logger.handlers: 32 | lh = logging.StreamHandler() 33 | lh.setFormatter(logging.Formatter('%(levelname)-8s %(message)s')) 34 | logger.addHandler(lh) 35 | 36 | try: 37 | COMPLEMENT_TRANS = string.maketrans('TAGCtagc', 'ATCGATCG') 38 | except AttributeError: 39 | COMPLEMENT_TRANS = str.maketrans('TAGCtagc', 'ATCGATCG') 40 | def complement(seq): 41 | return seq.translate(COMPLEMENT_TRANS) 42 | 43 | BASES = ['t', 'c', 'a', 'g'] 44 | CODONS = [a+b+c for a in BASES for b in BASES for c in BASES] 45 | AMINO_ACIDS = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG' 46 | CODON_TABLE = dict(zip(CODONS, AMINO_ACIDS)) 47 | def translate(seq): 48 | seq = seq.lower().replace('\n', '').replace(' ', '') 49 | peptide = '' 50 | for i in range(0, len(seq), 3): 51 | codon = seq[i: i+3] 52 | amino_acid = CODON_TABLE.get(codon, '!') 53 | if amino_acid != '!': # '!' marks an unknown or incomplete codon, e.g. a partial codon at the end of seq; skip it 54 | peptide += amino_acid 55 | return peptide 56 | 57 | def fasta_file_to_dict(fasta_file, id=True, header=False, seq=False): 58 | """Returns a dict from a fasta file and the number of sequences as the second return value. 59 | fasta_file can be a string path or a file object. 60 | The key of fasta_dict can be set using the keyword arguments and 61 | results in a combination of id, header, sequence, in that order, joined with '||'. (default: id) 62 | Duplicate keys are checked and a warning is logged if found. 
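For example, a minimal usage sketch (the path 'genome.fa' and the id 'chr1' here are hypothetical): >>> fasta_dict, count = fasta_file_to_dict('genome.fa') >>> fasta_dict['chr1']['header'] # each value is the per-sequence dict described below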
63 | The value of fasta_dict is a python dict with 3 keys: header, id and seq 64 | 65 | Changelog: 66 | 2014/11/17: 67 | * Added support for url escaped id 68 | """ 69 | fasta_file_f = fasta_file 70 | if isinstance(fasta_file, str): 71 | fasta_file_f = open(fasta_file, 'r') # text mode, so lines are str on Python 3 72 | 73 | fasta_dict = OrderedDict() 74 | keys = ['id', 'header', 'seq'] 75 | flags = dict([('id', id), ('header', header), ('seq', seq)]) 76 | entry = dict([('id', ''), ('header', ''), ('seq', '')]) 77 | count = 0 78 | line_num = 0 79 | 80 | for line in fasta_file_f: 81 | line = line.strip() 82 | if line and line[0] == '>': 83 | count += 1 84 | key = '||'.join([entry[i] for i in keys if flags[i]]) 85 | if key: # key != '' 86 | if key in fasta_dict: # check for duplicate key 87 | logger.warning('%s : Line %d : Duplicate %s [%s] : ID = [%s].', fasta_file_f.name, line_num, '||'.join([i for i in keys if flags[i]]), key[:25] + (key[25:] and '..'), entry['id']) 88 | entry['seq'] = ''.join(entry['seq']) 89 | fasta_dict[key] = entry 90 | # check for url escaped id 91 | if id: 92 | unescaped_id = unquote(entry['id']) 93 | if entry['id'] != unescaped_id: # was "id != unescaped_id", which compared the bool flag to the string and always added a second key 94 | key = '||'.join([unescaped_id] + [entry[i] for i in keys if i != 'id' and flags[i]]) 95 | entry['unescaped_id'] = unescaped_id 96 | fasta_dict[key] = entry 97 | entry = dict() 98 | entry['header'] = line 99 | entry['id'] = line.split()[0][1:] 100 | entry['seq'] = [] 101 | else: 102 | entry['seq'].append(line.upper()) 103 | line_num += 1 104 | 105 | if isinstance(fasta_file, str): 106 | fasta_file_f.close() 107 | 108 | key = '||'.join([entry[i] for i in keys if flags[i]]) 109 | if key: # key != '' 110 | if key in fasta_dict: 111 | logger.warning('%s : Line %d : Duplicate %s [%s] : ID = [%s].', fasta_file_f.name, line_num, '||'.join([i for i in keys if flags[i]]), key[:25] + (key[25:] and '..'), entry['id']) 112 | entry['seq'] = ''.join(entry['seq']) 113 | fasta_dict[key] = entry 114 | # check for url escaped id 115 | if id: 116 | unescaped_id = unquote(entry['id']) 117 | if entry['id'] != unescaped_id: 118 | key = '||'.join([unescaped_id] + [entry[i] for i in keys if i != 'id' and flags[i]]) 119 | entry['unescaped_id'] = unescaped_id 120 | fasta_dict[key] = entry 121 | 122 | return fasta_dict, count 123 | 124 | def fasta_dict_to_file(fasta_dict, fasta_file, line_char_limit=None): 125 | """Write fasta_dict to fasta_file 126 | 127 | :param fasta_dict: returned by fasta_file_to_dict 128 | :param fasta_file: output file can be a string path or a file object 129 | :param line_char_limit: None = no limit (default) 130 | :return: None 131 | """ 132 | fasta_fp = fasta_file 133 | if isinstance(fasta_file, str): 134 | fasta_fp = open(fasta_file, 'w') # text mode, the sequences are written as str 135 | 136 | for key in fasta_dict: 137 | seq = fasta_dict[key]['seq'] 138 | if line_char_limit: 139 | seq = '\n'.join([seq[i:i+line_char_limit] for i in range(0, len(seq), line_char_limit)]) 140 | fasta_fp.write(u'{0:s}\n{1:s}\n'.format(fasta_dict[key]['header'], seq)) 141 | 142 | 143 | class Gff3(object): 144 | def __init__(self, gff_file=None, fasta_external=None, logger=logger): 145 | self.logger = logger 146 | self.lines = [] 147 | self.features = {} 148 | self.unresolved_parents = {} 149 | self.fasta_embedded = {} 150 | self.fasta_external = {} 151 | if gff_file: 152 | self.parse(gff_file) 153 | if fasta_external: 154 | self.parse_fasta_external(fasta_external) 155 | 156 | error_format = 'Line {current_line_num}: {error_type}: {message}\n-> {line}' 157 | 158 | def add_line_error(self, line_data, error_info, log_level=logging.ERROR): 159 | """Helper 
function to record and log an error message 160 | 161 | :param line_data: dict 162 | :param error_info: dict 163 | :param logger: 164 | :param log_level: int 165 | :return: 166 | """ 167 | if not error_info: return 168 | try: 169 | line_data['line_errors'].append(error_info) 170 | except KeyError: 171 | line_data['line_errors'] = [error_info] 172 | except TypeError: # no line_data 173 | pass 174 | try: 175 | self.logger.log(log_level, Gff3.error_format.format(current_line_num=line_data['line_index'] + 1, error_type=error_info['error_type'], message=error_info['message'], line=line_data['line_raw'].rstrip())) 176 | except AttributeError: # no logger 177 | pass 178 | 179 | def check_unresolved_parents(self): 180 | # check if any unresolved parents are now resolvable 181 | if len(self.unresolved_parents) > 0: 182 | self.logger.info('%d unresolved forward referencing parent ids, trying global lookup...' % len(self.unresolved_parents)) 183 | globally_resolved_parents = set() 184 | for feature_id in self.unresolved_parents: 185 | if feature_id in self.features: 186 | self.logger.info(' Resolved parent id: {0:s}, defined in lines: {1:s}, referenced in lines: {2:s}'.format( 187 | feature_id, 188 | ','.join([str(line_data['line_index'] + 1) for line_data in self.features[feature_id]]), 189 | ','.join([str(line_data['line_index'] + 1) for line_data in self.unresolved_parents[feature_id]]))) 190 | globally_resolved_parents.add(feature_id) 191 | for line_data in self.unresolved_parents[feature_id]: 192 | line_data['parents'].append(self.features[feature_id]) 193 | for ld in self.features[feature_id]: 194 | # no need to check if line_data in ld['children'], because it is impossible, each ld maps to only one feature_id, so the ld we get are all different 195 | ld['children'].append(line_data) 196 | still_unresolved_parents = sorted(list(set(self.unresolved_parents) - globally_resolved_parents)) 197 | if len(still_unresolved_parents) > 0: 198 | self.logger.info('{0:d} unresolved parent ids:'.format(len(still_unresolved_parents))) 199 | for feature_id in still_unresolved_parents: 200 | self.logger.info(' Unresolved parent id: {0:s}, referenced in lines: {1:s}'.format(feature_id, ','.join( 201 | [str(line_data['line_index'] + 1) for line_data in self.unresolved_parents[feature_id]]))) 202 | 203 | def check_parent_boundary(self): 204 | """ 205 | checks whether child features are within the coordinate boundaries of parent features 206 | 207 | :return: 208 | """ 209 | for line in self.lines: 210 | for parent_feature in line['parents']: 211 | ok = False 212 | for parent_line in parent_feature: 213 | if parent_line['start'] <= line['start'] and line['end'] <= parent_line['end']: 214 | ok = True 215 | break 216 | if not ok: 217 | self.add_line_error(line, {'message': 'This feature is not contained within the feature boundaries of parent: {0:s}: {1:s}'.format( 218 | parent_feature[0]['attributes']['ID'], 219 | ','.join(['({0:s}, {1:d}, {2:d})'.format(line['seqid'], line['start'], line['end']) for line in parent_feature]) 220 | ), 'error_type': 'BOUNDS', 'location': 'parent_boundary'}) 221 | 222 | def check_phase(self): 223 | """ 224 | 1. get a list of CDS with the same parent 225 | 2. sort according to strand 226 | 3. 
calculate and validate phase 227 | """ 228 | plus_minus = set(['+', '-']) 229 | for k, g in groupby(sorted([line for line in self.lines if line['line_type'] == 'feature' and line['type'] == 'CDS' and 'Parent' in line['attributes']], key=lambda x: x['attributes']['Parent']), key=lambda x: x['attributes']['Parent']): 230 | cds_list = list(g) 231 | strand_set = list(set([line['strand'] for line in cds_list])) 232 | if len(strand_set) != 1: 233 | for line in cds_list: 234 | self.add_line_error(line, {'message': 'Inconsistent CDS strand with parent: {0:s}'.format(k), 'error_type': 'STRAND'}) 235 | continue 236 | if len(cds_list) == 1: 237 | if cds_list[0]['phase'] != 0: 238 | self.add_line_error(cds_list[0], {'message': 'Wrong phase {0}, should be {1}'.format(cds_list[0]['phase'], 0), 'error_type': 'PHASE'}) # use {0}, not {0:d}: phase may be the string '.' when missing 239 | continue 240 | strand = strand_set[0] 241 | if strand not in plus_minus: 242 | # don't process unknown strands 243 | continue 244 | if strand == '-': 245 | # sort end descending 246 | sorted_cds_list = sorted(cds_list, key=lambda x: x['end'], reverse=True) 247 | else: 248 | sorted_cds_list = sorted(cds_list, key=lambda x: x['start']) 249 | phase = 0 250 | for line in sorted_cds_list: 251 | if line['phase'] != phase: 252 | self.add_line_error(line, {'message': 'Wrong phase {0}, should be {1}'.format(line['phase'], phase), 'error_type': 'PHASE'}) 253 | phase = (3 - ((line['end'] - line['start'] + 1 - phase) % 3)) % 3 254 | 255 | def parse_fasta_external(self, fasta_file): 256 | self.fasta_external, count = fasta_file_to_dict(fasta_file) 257 | 258 | def check_reference(self, sequence_region=False, fasta_embedded=False, fasta_external=False, check_bounds=True, check_n=True, allowed_num_of_n=0, feature_types=('CDS',)): 259 | """ 260 | Check seqid, bounds and the number of Ns in each feature using one or more reference sources. 261 | 262 | Seqid check: check if the seqid can be found in the reference sources. 263 | 264 | Bounds check: check the start and end fields of each feature and log an error if the values aren't within the seqid sequence length, requires at least one of these sources: ##sequence-region, embedded ##FASTA, or external FASTA file. 265 | 266 | Ns check: count the number of Ns in each feature with a type in feature_types (default: 'CDS') and log an error if the number is greater than allowed_num_of_n (default: 0), requires at least one of these sources: embedded ##FASTA, or external FASTA file. 267 | 268 | When called with all source parameters set as False (default), check all available sources, and log a debug message if unable to perform a check due to none of the reference sources being available. 269 | 270 | If any source parameter is set to True, check only those sources marked as True, log an error if those sources don't exist. 
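For example, a minimal sketch that forces a single source (assumes a Gff3 instance named gff on which parse() and parse_fasta_external() have already been called): >>> error_lines = gff.check_reference(fasta_external=True, allowed_num_of_n=5, feature_types=['CDS', 'exon']) >>> sorted(error_lines) # the 0-based line_index of every line flagged by this check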
271 | 272 | :param sequence_region: check bounds using the ##sequence-region directive (default: False) 273 | :param fasta_embedded: check bounds using the embedded fasta specified by the ##FASTA directive (default: False) 274 | :param fasta_external: check bounds using the external fasta given by the self.parse_fasta_external (default: False) 275 | :param check_bounds: If False, don't run the bounds check (default: True) 276 | :param check_n: If False, don't run the Ns check (default: True) 277 | :param allowed_num_of_n: only report features with a number of Ns greater than the specified value (default: 0) 278 | :param feature_types: only check features of these feature_types, multiple types may be specified, if none are specified, check only 'CDS' 279 | :return: error_lines: a set of line_index(int) with errors detected by check_reference 280 | """ 281 | # collect lines with errors in this set 282 | error_lines = set() 283 | # check if we have a parsed gff3 284 | if not self.lines: 285 | self.logger.debug('.parse(gff_file) before calling .check_bounds()') 286 | return error_lines 287 | # setup default line_types 288 | check_n_feature_types = set(feature_types) 289 | if len(check_n_feature_types) == 0: 290 | check_n_feature_types.add('CDS') 291 | # compile regex 292 | n_segments_finditer = re.compile(r'[Nn]+').finditer 293 | # check_all_sources mode 294 | check_all_sources = True 295 | if sequence_region or fasta_embedded or fasta_external: 296 | check_all_sources = False 297 | # get a list of line_data with valid start and end coordinates and unescape the seqid 298 | start_end_error_locations = set(('start', 'end', 'start,end')) 299 | valid_line_data_seqid = [(line_data, unquote(line_data['seqid'])) for line_data in self.lines if line_data['line_type'] == 'feature' and line_data['seqid'] != '.' 
and (not line_data['line_errors'] or not [error_info for error_info in line_data['line_errors'] if 'location' in error_info and error_info['location'] in start_end_error_locations])] 300 | checked_at_least_one_source = False 301 | # check directive 302 | # don't use any directives with errors 303 | valid_sequence_regions = dict([(unquote(line_data['seqid']), line_data) for line_data in self.lines if line_data['directive'] == '##sequence-region' and not line_data['line_errors']]) 304 | unresolved_seqid = set() 305 | if (check_all_sources or sequence_region) and valid_sequence_regions: 306 | checked_at_least_one_source = True 307 | for line_data, seqid in valid_line_data_seqid: 308 | if seqid not in valid_sequence_regions and seqid not in unresolved_seqid: 309 | unresolved_seqid.add(seqid) 310 | error_lines.add(line_data['line_index']) 311 | self.add_line_error(line_data, {'message': u'Seqid not found in any ##sequence-region: {0:s}'.format( 312 | seqid), 'error_type': 'BOUNDS', 'location': 'sequence_region'}) 313 | continue 314 | if line_data['start'] < valid_sequence_regions[seqid]['start']: 315 | error_lines.add(line_data['line_index']) 316 | self.add_line_error(line_data, {'message': 'Start is less than the ##sequence-region start: %d' % valid_sequence_regions[seqid]['start'], 'error_type': 'BOUNDS', 'location': 'sequence_region'}) 317 | if line_data['end'] > valid_sequence_regions[seqid]['end']: 318 | error_lines.add(line_data['line_index']) 319 | self.add_line_error(line_data, {'message': 'End is greater than the ##sequence-region end: %d' % valid_sequence_regions[seqid]['end'], 'error_type': 'BOUNDS', 'location': 'sequence_region'}) 320 | elif sequence_region: 321 | self.logger.debug('##sequence-region not found in GFF3') 322 | # check fasta_embedded 323 | unresolved_seqid = set() 324 | if (check_all_sources or fasta_embedded) and self.fasta_embedded: 325 | checked_at_least_one_source = True 326 | for line_data, seqid in valid_line_data_seqid: 327 | if seqid not in self.fasta_embedded and seqid not in unresolved_seqid: 328 | unresolved_seqid.add(seqid) 329 | error_lines.add(line_data['line_index']) 330 | self.add_line_error(line_data, {'message': 'Seqid not found in the embedded ##FASTA: %s' % seqid, 'error_type': 'BOUNDS', 'location': 'fasta_embedded'}) 331 | continue 332 | # check bounds 333 | if line_data['end'] > len(self.fasta_embedded[seqid]['seq']): 334 | error_lines.add(line_data['line_index']) 335 | self.add_line_error(line_data, {'message': 'End is greater than the embedded ##FASTA sequence length: %d' % len(self.fasta_embedded[seqid]['seq']), 'error_type': 'BOUNDS', 'location': 'fasta_embedded'}) 336 | # check n 337 | if check_n and line_data['type'] in check_n_feature_types: 338 | """ 339 | >>> timeit("a.lower().count('n')", "import re; a = ('ASDKADSJHFIUDNNNNNNNnnnnSHFD'*50)") 340 | 5.540903252684302 341 | >>> timeit("a.count('n'); a.count('N')", "import re; a = ('ASDKADSJHFIUDNNNNNNNnnnnSHFD'*50)") 342 | 2.3504867946058425 343 | >>> timeit("re.findall('[Nn]+', a)", "import re; a = ('ASDKADSJHFIUDNNNNNNNnnnnSHFD'*50)") 344 | 30.60731204915959 345 | """ 346 | n_count = self.fasta_embedded[seqid]['seq'].count('N', line_data['start'] - 1, line_data['end']) + self.fasta_embedded[seqid]['seq'].count('n', line_data['start'] - 1, line_data['end']) 347 | if n_count > allowed_num_of_n: 348 | # get detailed segments info 349 | n_segments = [(m.start(), m.end() - m.start()) for m in n_segments_finditer(self.fasta_embedded[seqid]['seq'], line_data['start'] - 1, line_data['end'])] 350 
| n_segments_str = ['(%d, %d)' % (m[0], m[1]) for m in n_segments] 351 | error_lines.add(line_data['line_index']) 352 | self.add_line_error(line_data, {'message': 'Found %d Ns in %s feature of length %d using the embedded ##FASTA, consists of %d segment (start, length): %s' % (n_count, line_data['type'], line_data['end'] - line_data['start'], len(n_segments), ', '.join(n_segments_str)), 'error_type': 'N_COUNT', 'n_segments': n_segments, 'location': 'fasta_embedded'}) 353 | elif fasta_embedded: 354 | self.logger.debug('Embedded ##FASTA not found in GFF3') 355 | # check fasta_external 356 | unresolved_seqid = set() 357 | if (check_all_sources or fasta_external) and self.fasta_external: 358 | checked_at_least_one_source = True 359 | for line_data, seqid in valid_line_data_seqid: 360 | if seqid not in self.fasta_external and seqid not in unresolved_seqid: 361 | unresolved_seqid.add(seqid) 362 | error_lines.add(line_data['line_index']) 363 | self.add_line_error(line_data, {'message': 'Seqid not found in the external FASTA file: %s' % seqid, 'error_type': 'BOUNDS', 'location': 'fasta_external'}) 364 | continue 365 | # check bounds 366 | if line_data['end'] > len(self.fasta_external[seqid]['seq']): 367 | error_lines.add(line_data['line_index']) 368 | self.add_line_error(line_data, {'message': 'End is greater than the external FASTA sequence length: %d' % len(self.fasta_external[seqid]['seq']), 'error_type': 'BOUNDS', 'location': 'fasta_external'}) 369 | # check n 370 | if check_n and line_data['type'] in check_n_feature_types: 371 | n_count = self.fasta_external[seqid]['seq'].count('N', line_data['start'] - 1, line_data['end']) + self.fasta_external[seqid]['seq'].count('n', line_data['start'] - 1, line_data['end']) 372 | if n_count > allowed_num_of_n: 373 | # get detailed segments info 374 | n_segments = [(m.start(), m.end() - m.start()) for m in n_segments_finditer(self.fasta_external[seqid]['seq'], line_data['start'] - 1, line_data['end'])] 375 | n_segments_str = ['(%d, %d)' % (m[0], m[1]) for m in n_segments] 376 | error_lines.add(line_data['line_index']) 377 | self.add_line_error(line_data, {'message': 'Found %d Ns in %s feature of length %d using the external FASTA, consists of %d segment (start, length): %s' % (n_count, line_data['type'], line_data['end'] - line_data['start'], len(n_segments), ', '.join(n_segments_str)), 'error_type': 'N_COUNT', 'n_segments': n_segments, 'location': 'fasta_external'}) 378 | elif fasta_external: 379 | self.logger.debug('External FASTA file not given') 380 | if check_all_sources and not checked_at_least_one_source: 381 | self.logger.debug('Unable to perform bounds check, requires at least one of the following sources: ##sequence-region, embedded ##FASTA, or external FASTA file') 382 | return error_lines 383 | 384 | def parse(self, gff_file, strict=False): 385 | """Parse the gff file into the following data structures: 386 | 387 | * lines(list of line_data(dict)) 388 | - line_index(int): the index in lines 389 | - line_raw(str) 390 | - line_type(str in ['feature', 'directive', 'comment', 'blank', 'unknown']) 391 | - line_errors(list of str): a list of error messages 392 | - line_status(str in ['normal', 'modified', 'removed']) 393 | - parents(list of feature(list of line_data(dict))): may have multiple parents 394 | - children(list of line_data(dict)) 395 | - extra fields depending on line_type 396 | * directive 397 | - directive(str in ['##gff-version', '##sequence-region', '##feature-ontology', '##attribute-ontology', '##source-ontology', '##species', 
'##genome-build', '###', '##FASTA']) 398 | - extra fields depending on directive 399 | * feature 400 | - seqid(str): must escape any characters not in the set [a-zA-Z0-9.:^*$@!+_?-|] using RFC 3986 Percent-Encoding 401 | - source(str) 402 | - type(str in so_types) 403 | - start(int) 404 | - end(int) 405 | - score(float) 406 | - strand(str in ['+', '-', '.', '?']) 407 | - phase(int in [0, 1, 2]) 408 | - attributes(dict of tag(str) to value) 409 | - ID(str) 410 | - Name(str) 411 | - Alias(list of str): multi value 412 | - Parent(list of str): multi value 413 | - Target(dict) 414 | - target_id(str) 415 | - start(int) 416 | - end(int) 417 | - strand(str in ['+', '-', '']) 418 | - Gap(str): CIGAR format 419 | - Derives_from(str) 420 | - Note(list of str): multi value 421 | - Dbxref(list of str): multi value 422 | - Ontology_term(list of str): multi value 423 | - Is_circular(str in ['true']) 424 | * fasta_dict(dict of id(str) to sequence_item(dict)) 425 | - id(str) 426 | - header(str) 427 | - seq(str) 428 | - line_length(int) 429 | 430 | * features(dict of feature_id(str in line_data['attributes']['ID']) to feature(list of line_data(dict))) 431 | 432 | A feature is a list of line_data(dict), since all lines that share an ID collectively represent a single feature. 433 | 434 | During serialization, line_data(dict) references should be converted into line_index(int) 435 | 436 | :param gff_file: a string path or file object 437 | :param strict: when true, throw exception on syntax and format errors. when false, use best effort to finish parsing while logging errors 438 | """ 439 | valid_strand = set(('+', '-', '.', '?')) 440 | valid_phase = set((0, 1, 2)) 441 | multi_value_attributes = set(('Parent', 'Alias', 'Note', 'Dbxref', 'Ontology_term')) 442 | valid_attribute_target_strand = set(('+', '-', '')) 443 | reserved_attributes = set(('ID', 'Name', 'Alias', 'Parent', 'Target', 'Gap', 'Derives_from', 'Note', 'Dbxref', 'Ontology_term', 'Is_circular')) 444 | 445 | # illegal character check 446 | # Literal use of tab, newline, carriage return, the percent (%) sign, and control characters must be encoded using RFC 3986 Percent-Encoding; no other characters may be encoded. 447 | # control characters: \x00-\x1f\x7f this includes tab(\x09), newline(\x0a), carriage return(\x0d) 448 | # seqid may contain any characters, but must escape any characters not in the set [a-zA-Z0-9.:^*$@!+_?-|] 449 | # URL escaping rules are used for tags or values containing the following characters: ",=;". 
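# For example (an illustrative sketch of why this matters to the attribute parser below): in 'ID=gene1;Note=binds%20A%3B%20weakly' the encoded ';' (%3B) survives the split on ';', while the raw form 'ID=gene1;Note=binds A; weakly' is split into three tokens and ' weakly' gets reported as a malformed attribute.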
450 | #>>> timeit("unescaped_seqid('Un.7589')", "import re; unescaped_seqid = re.compile(r'[^a-zA-Z0-9.:^*$@!+_?|%-]|%(?![0-9a-fA-F]{2})').search") 451 | #0.4128372745785036 452 | #>>> timeit("unescaped_seqid2('Un.7589')", "import re; unescaped_seqid2 = re.compile(r'^([a-zA-Z0-9.:^*$@!+_?|-]|%[0-9a-fA-F]{2})+$').search") 453 | #0.9012313532265175 454 | unescaped_seqid = re.compile(r'[^a-zA-Z0-9.:^*$@!+_?|%-]|%(?![0-9a-fA-F]{2})').search 455 | unescaped_field = re.compile(r'[\x00-\x1f\x7f]|%(?![0-9a-fA-F]{2})').search 456 | 457 | gff_fp = gff_file 458 | if isinstance(gff_file, str): 459 | gff_fp = open(gff_file, 'r') 460 | 461 | lines = [] 462 | current_line_num = 1 # line numbers start at 1 463 | features = defaultdict(list) 464 | # key = the unresolved id, value = a list of line_data(dict) 465 | unresolved_parents = defaultdict(list) 466 | 467 | for line_raw in gff_fp: 468 | line_data = { 469 | 'line_index': current_line_num - 1, 470 | 'line_raw': line_raw, 471 | 'line_status': 'normal', 472 | 'parents': [], 473 | 'children': [], 474 | 'line_type': '', 475 | 'directive': '', 476 | 'line_errors': [], 477 | 'type': '', 478 | } 479 | line_strip = line_raw.strip() 480 | if line_strip != line_raw[:len(line_strip)]: 481 | self.add_line_error(line_data, {'message': 'White chars not allowed at the start of a line', 'error_type': 'FORMAT', 'location': ''}) 482 | if current_line_num == 1 and not line_strip.startswith('##gff-version'): 483 | self.add_line_error(line_data, {'message': '"##gff-version" missing from the first line', 'error_type': 'FORMAT', 'location': ''}) 484 | if len(line_strip) == 0: 485 | line_data['line_type'] = 'blank' 486 | lines.append(line_data); current_line_num += 1; continue # blank lines must still be recorded and counted, otherwise the line_index of every following line drifts out of sync with the file 487 | if line_strip.startswith('##'): 488 | line_data['line_type'] = 'directive' 489 | if line_strip.startswith('##sequence-region'): 490 | # ##sequence-region seqid start end 491 | # This element is optional, but strongly encouraged because it allows parsers to perform bounds checking on features. 492 | # only one ##sequence-region directive may be given for any given seqid 493 | # all features on that landmark feature (having that seqid) must be contained within the range defined by that ##sequence-region directive. An exception to this rule is allowed when a landmark feature is marked with the Is_circular attribute. 
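# For example, a well-formed directive (the values are from the GFF3 spec example): "##sequence-region ctg123 1 1497228" parses below into seqid='ctg123', start=1, end=1497228.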
494 | line_data['directive'] = '##sequence-region' 495 | tokens = list(line_strip.split()[1:]) 496 | if len(tokens) != 3: 497 | self.add_line_error(line_data, {'message': 'Expecting 3 fields, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 498 | if len(tokens) > 0: 499 | line_data['seqid'] = tokens[0] 500 | # check for duplicate ##sequence-region seqid 501 | if [True for d in lines if ('directive' in d and d['directive'] == '##sequence-region' and 'seqid' in d and d['seqid'] == line_data['seqid'])]: 502 | self.add_line_error(line_data, {'message': '##sequence-region seqid: "%s" may only appear once' % line_data['seqid'], 'error_type': 'FORMAT', 'location': ''}) 503 | try: 504 | all_good = True 505 | try: 506 | line_data['start'] = int(tokens[1]) 507 | if line_data['start'] < 1: 508 | self.add_line_error(line_data, {'message': 'Start is not a valid 1-based integer coordinate: "%s"' % tokens[1], 'error_type': 'FORMAT', 'location': ''}) 509 | except ValueError: 510 | all_good = False 511 | self.add_line_error(line_data, {'message': 'Start is not a valid integer: "%s"' % tokens[1], 'error_type': 'FORMAT', 'location': ''}) 512 | line_data['start'] = tokens[1] 513 | try: 514 | line_data['end'] = int(tokens[2]) 515 | if line_data['end'] < 1: 516 | self.add_line_error(line_data, {'message': 'End is not a valid 1-based integer coordinate: "%s"' % tokens[2], 'error_type': 'FORMAT', 'location': ''}) 517 | except ValueError: 518 | all_good = False 519 | self.add_line_error(line_data, {'message': 'End is not a valid integer: "%s"' % tokens[2], 'error_type': 'FORMAT', 'location': ''}) 520 | line_data['end'] = tokens[2] # was line_data['start'], a copy-paste bug that clobbered the start value 521 | # if all_good then both start and end are int, so we can check if start is not less than or equal to end 522 | if all_good and line_data['start'] > line_data['end']: 523 | self.add_line_error(line_data, {'message': 'Start is not less than or equal to end', 'error_type': 'FORMAT', 'location': ''}) 524 | except IndexError: 525 | pass 526 | elif line_strip.startswith('##gff-version'): 527 | # The GFF version, always 3 in this specification, must be present, must be the topmost line of the file and may only appear once in the file. 528 | line_data['directive'] = '##gff-version' 529 | # check if it appeared before 530 | if [True for d in lines if ('directive' in d and d['directive'] == '##gff-version')]: 531 | self.add_line_error(line_data, {'message': '##gff-version may only appear once in the file', 'error_type': 'FORMAT', 'location': ''}) 532 | tokens = list(line_strip.split()[1:]) 533 | if len(tokens) != 1: 534 | self.add_line_error(line_data, {'message': 'Expecting 1 field, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 535 | if len(tokens) > 0: 536 | try: 537 | line_data['version'] = int(tokens[0]) 538 | if line_data['version'] != 3: 539 | self.add_line_error(line_data, {'message': 'Version is not "3": "%s"' % tokens[0], 'error_type': 'FORMAT', 'location': ''}) 540 | except ValueError: 541 | self.add_line_error(line_data, {'message': 'Version is not a valid integer: "%s"' % tokens[0], 'error_type': 'FORMAT', 'location': ''}) 542 | line_data['version'] = tokens[0] 543 | elif line_strip.startswith('###'): 544 | # This directive (three # signs in a row) indicates that all forward references to feature IDs that have been seen to this point have been resolved. 
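# For example (illustrative): emitting '###' after the last exon of a gene promises that no later line will reference that gene or its mRNAs in a Parent attribute, which lets a streaming parser flush the finished feature from memory.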
545 | line_data['directive'] = '###' 546 | elif line_strip.startswith('##FASTA'): 547 | # This notation indicates that the annotation portion of the file is at an end and that the 548 | # remainder of the file contains one or more sequences (nucleotide or protein) in FASTA format. 549 | line_data['directive'] = '##FASTA' 550 | self.logger.info('Reading embedded ##FASTA sequence') 551 | self.fasta_embedded, count = fasta_file_to_dict(gff_fp) 552 | self.logger.info('%d sequences read' % len(self.fasta_embedded)) 553 | elif line_strip.startswith('##feature-ontology'): 554 | # ##feature-ontology URI 555 | # This directive indicates that the GFF3 file uses the ontology of feature types located at the indicated URI or URL. 556 | line_data['directive'] = '##feature-ontology' 557 | tokens = list(line_strip.split()[1:]) 558 | if len(tokens) != 1: 559 | self.add_line_error(line_data, {'message': 'Expecting 1 field, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 560 | if len(tokens) > 0: 561 | line_data['URI'] = tokens[0] 562 | elif line_strip.startswith('##attribute-ontology'): 563 | # ##attribute-ontology URI 564 | # This directive indicates that the GFF3 uses the ontology of attribute names located at the indicated URI or URL. 565 | line_data['directive'] = '##attribute-ontology' 566 | tokens = list(line_strip.split()[1:]) 567 | if len(tokens) != 1: 568 | self.add_line_error(line_data, {'message': 'Expecting 1 field, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 569 | if len(tokens) > 0: 570 | line_data['URI'] = tokens[0] 571 | elif line_strip.startswith('##source-ontology'): 572 | # ##source-ontology URI 573 | # This directive indicates that the GFF3 uses the ontology of source names located at the indicated URI or URL. 574 | line_data['directive'] = '##source-ontology' 575 | tokens = list(line_strip.split()[1:]) 576 | if len(tokens) != 1: 577 | self.add_line_error(line_data, {'message': 'Expecting 1 field, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 578 | if len(tokens) > 0: 579 | line_data['URI'] = tokens[0] 580 | elif line_strip.startswith('##species'): 581 | # ##species NCBI_Taxonomy_URI 582 | # This directive indicates the species that the annotations apply to. 583 | line_data['directive'] = '##species' 584 | tokens = list(line_strip.split()[1:]) 585 | if len(tokens) != 1: 586 | self.add_line_error(line_data, {'message': 'Expecting 1 field, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 587 | if len(tokens) > 0: 588 | line_data['NCBI_Taxonomy_URI'] = tokens[0] 589 | elif line_strip.startswith('##genome-build'): 590 | # ##genome-build source buildName 591 | # The genome assembly build name used for the coordinates given in the file. 
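# For example, a well-formed directive (the values are from the GFF3 spec example): "##genome-build NCBI B36" parses below into source='NCBI' and buildName='B36'.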
592 | line_data['directive'] = '##genome-build' 593 | tokens = list(line_strip.split()[1:]) 594 | if len(tokens) != 2: 595 | self.add_line_error(line_data, {'message': 'Expecting 2 fields, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 596 | if len(tokens) > 0: 597 | line_data['source'] = tokens[0] 598 | try: 599 | line_data['buildName'] = tokens[1] 600 | except IndexError: 601 | pass 602 | else: 603 | self.add_line_error(line_data, {'message': 'Unknown directive', 'error_type': 'FORMAT', 'location': ''}) 604 | tokens = list(line_strip.split()) 605 | line_data['directive'] = tokens[0] 606 | elif line_strip.startswith('#'): 607 | line_data['line_type'] = 'comment' 608 | else: 609 | # line_type may be a feature or unknown 610 | line_data['line_type'] = 'feature' 611 | tokens = list(map(str.strip, line_raw.split('\t'))) 612 | if len(tokens) != 9: 613 | self.add_line_error(line_data, {'message': 'Features should contain 9 fields, got %d: %s' % (len(tokens) - 1, repr(tokens[1:])), 'error_type': 'FORMAT', 'location': ''}) 614 | for i, t in enumerate(tokens): 615 | if not t: 616 | self.add_line_error(line_data, {'message': 'Empty field: %d, must have a "."' % (i + 1), 'error_type': 'FORMAT', 'location': ''}) 617 | try: 618 | line_data['seqid'] = tokens[0] 619 | if unescaped_seqid(tokens[0]): 620 | self.add_line_error(line_data, {'message': 'Seqid must escape any characters not in the set [a-zA-Z0-9.:^*$@!+_?-|]: "%s"' % tokens[0], 'error_type': 'FORMAT', 'location': ''}) 621 | line_data['source'] = tokens[1] 622 | if unescaped_field(tokens[1]): 623 | self.add_line_error(line_data, {'message': 'Source must escape the percent (%%) sign and any control characters: "%s"' % tokens[1], 'error_type': 'FORMAT', 'location': ''}) 624 | line_data['type'] = tokens[2] 625 | if unescaped_field(tokens[2]): 626 | self.add_line_error(line_data, {'message': 'Type must escape the percent (%%) sign and any control characters: "%s"' % tokens[2], 'error_type': 'FORMAT', 'location': ''}) 627 | all_good = True 628 | try: 629 | line_data['start'] = int(tokens[3]) 630 | if line_data['start'] < 1: 631 | self.add_line_error(line_data, {'message': 'Start is not a valid 1-based integer coordinate: "%s"' % tokens[3], 'error_type': 'FORMAT', 'location': 'start'}) 632 | except ValueError: 633 | all_good = False 634 | line_data['start'] = tokens[3] 635 | if line_data['start'] != '.': 636 | self.add_line_error(line_data, {'message': 'Start is not a valid integer: "%s"' % line_data['start'], 'error_type': 'FORMAT', 'location': 'start'}) 637 | try: 638 | line_data['end'] = int(tokens[4]) 639 | if line_data['end'] < 1: 640 | self.add_line_error(line_data, {'message': 'End is not a valid 1-based integer coordinate: "%s"' % tokens[4], 'error_type': 'FORMAT', 'location': 'end'}) 641 | except ValueError: 642 | all_good = False 643 | line_data['end'] = tokens[4] 644 | if line_data['end'] != '.': 645 | self.add_line_error(line_data, {'message': 'End is not a valid integer: "%s"' % line_data['end'], 'error_type': 'FORMAT', 'location': 'end'}) 646 | # if all_good then both start and end are int, so we can check if start is not less than or equal to end 647 | if all_good and line_data['start'] > line_data['end']: 648 | self.add_line_error(line_data, {'message': 'Start is not less than or equal to end', 'error_type': 'FORMAT', 'location': 'start,end'}) 649 | try: 650 | line_data['score'] = float(tokens[5]) 651 | except ValueError: 652 | line_data['score'] = tokens[5] 653 | if line_data['score'] != 
'.': 654 | self.add_line_error(line_data, {'message': 'Score is not a valid floating point number: "%s"' % line_data['score'], 'error_type': 'FORMAT', 'location': ''}) 655 | line_data['strand'] = tokens[6] 656 | if line_data['strand'] not in valid_strand: # set(['+', '-', '.', '?']) 657 | self.add_line_error(line_data, {'message': 'Strand has illegal characters: "%s"' % tokens[6], 'error_type': 'FORMAT', 'location': ''}) 658 | try: 659 | line_data['phase'] = int(tokens[7]) 660 | if line_data['phase'] not in valid_phase: # set([0, 1, 2]) 661 | self.add_line_error(line_data, {'message': 'Phase is not 0, 1, or 2: "%s"' % tokens[7], 'error_type': 'FORMAT', 'location': ''}) 662 | except ValueError: 663 | line_data['phase'] = tokens[7] 664 | if line_data['phase'] != '.': 665 | self.add_line_error(line_data, {'message': 'Phase is not a valid integer: "%s"' % line_data['phase'], 'error_type': 'FORMAT', 'location': ''}) 666 | elif line_data['type'] == 'CDS': 667 | self.add_line_error(line_data, {'message': 'Phase is required for all CDS features', 'error_type': 'FORMAT', 'location': ''}) 668 | # parse attributes, ex: ID=exon00003;Parent=mRNA00001,mRNA00003;Name=EXON.1 669 | # URL escaping rules are used for tags or values containing the following characters: ",=;". Spaces are allowed in this field, but tabs must be replaced with the %09 URL escape. 670 | # Note that attribute names are case sensitive. "Parent" is not the same as "parent". 671 | # All attributes that begin with an uppercase letter are reserved for later use. Attributes that begin with a lowercase letter can be used freely by applications. 672 | if unescaped_field(tokens[8]): 673 | self.add_line_error(line_data, {'message': 'Attributes must escape the percent (%) sign and any control characters', 'error_type': 'FORMAT', 'location': ''}) 674 | attribute_tokens = tuple(tuple(t for t in a.split('=')) for a in tokens[8].split(';') if a) 675 | line_data['attributes'] = {} 676 | if len(attribute_tokens) == 1 and len(attribute_tokens[0]) == 1 and attribute_tokens[0][0] == '.': 677 | pass # no attributes 678 | else: 679 | for a in attribute_tokens: 680 | if len(a) != 2: 681 | self.add_line_error(line_data, {'message': 'Attributes must contain one and only one equal (=) sign: "%s"' % ('='.join(a)), 'error_type': 'FORMAT', 'location': ''}) 682 | try: 683 | tag, value = a 684 | except ValueError: 685 | tag, value = a[0], '' 686 | if not tag: 687 | self.add_line_error(line_data, {'message': 'Empty attribute tag: "%s"' % '='.join(a), 'error_type': 'FORMAT', 'location': ''}) 688 | if not value.strip(): 689 | self.add_line_error(line_data, {'message': 'Empty attribute value: "%s"' % '='.join(a), 'error_type': 'FORMAT', 'location': ''}, log_level=logging.WARNING) 690 | if tag in line_data['attributes']: 691 | self.add_line_error(line_data, {'message': 'Found multiple attribute tags: "%s"' % tag, 'error_type': 'FORMAT', 'location': ''}) 692 | if tag in multi_value_attributes: # set(['Parent', 'Alias', 'Note', 'Dbxref', 'Ontology_term']) 693 | if value.find(', ') >= 0: 694 | self.add_line_error(line_data, {'message': 'Found ", " in %s attribute, possible unescaped ",": "%s"' % (tag, value), 'error_type': 'FORMAT', 'location': ''}, log_level=logging.WARNING) 695 | # In addition to Parent, the Alias, Note, Dbxref and Ontology_term attributes can have multiple values. 
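# For example (illustrative): 'Parent=mRNA00001,mRNA00003' is stored as line_data['attributes']['Parent'] == ['mRNA00001', 'mRNA00003'], and a repeated tag such as 'Parent=a;Parent=b' extends the same list below (after being flagged as a duplicate tag above).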
696 | if tag in line_data['attributes']: # if this tag has been seen before 697 | if tag == 'Note': # don't check for duplicate notes 698 | line_data['attributes'][tag].extend(value.split(',')) 699 | else: # only add non duplicate values 700 | line_data['attributes'][tag].extend([s for s in value.split(',') if s not in line_data['attributes'][tag]]) 701 | else: 702 | line_data['attributes'][tag] = value.split(',') 703 | # check for duplicate values 704 | if tag != 'Note' and len(line_data['attributes'][tag]) != len(set(line_data['attributes'][tag])): 705 | count_values = [(len(list(group)), key) for key, group in groupby(sorted(line_data['attributes'][tag]))] 706 | self.add_line_error(line_data, {'message': '%s attribute has identical values (count, value): %s' % (tag, ', '.join(['(%d, %s)' % (c, v) for c, v in count_values if c > 1])), 'error_type': 'FORMAT', 'location': ''}) 707 | # remove duplicate 708 | line_data['attributes'][tag] = list(set(line_data['attributes'][tag])) 709 | 710 | if tag == 'Parent': 711 | for feature_id in line_data['attributes']['Parent']: 712 | try: 713 | line_data['parents'].append(features[feature_id]) 714 | for ld in features[feature_id]: 715 | # no need to check if line_data in ld['children'], because it is impossible, each ld maps to only one feature_id, so the ld we get are all different 716 | ld['children'].append(line_data) 717 | except KeyError: # features[id] 718 | self.add_line_error(line_data, {'message': '%s attribute has unresolved forward reference: %s' % (tag, feature_id), 'error_type': 'FORMAT', 'location': ''}) 719 | unresolved_parents[feature_id].append(line_data) 720 | elif tag == 'Target': 721 | if value.find(',') >= 0: 722 | self.add_line_error(line_data, {'message': 'Value of %s attribute contains unescaped ",": "%s"' % (tag, value), 'error_type': 'FORMAT', 'location': ''}) 723 | target_tokens = value.split(' ') 724 | if len(target_tokens) < 3 or len(target_tokens) > 4: 725 | self.add_line_error(line_data, {'message': 'Target attribute should have 3 or 4 values, got %d: %s' % (len(target_tokens), repr(tokens)), 'error_type': 'FORMAT', 'location': ''}) 726 | line_data['attributes'][tag] = {} 727 | try: 728 | line_data['attributes'][tag]['target_id'] = target_tokens[0] 729 | all_good = True 730 | try: 731 | line_data['attributes'][tag]['start'] = int(target_tokens[1]) 732 | if line_data['attributes'][tag]['start'] < 1: 733 | self.add_line_error(line_data, {'message': 'Start value of Target attribute is not a valid 1-based integer coordinate: "%s"' % target_tokens[1], 'error_type': 'FORMAT', 'location': ''}) 734 | except ValueError: 735 | all_good = False 736 | line_data['attributes'][tag]['start'] = target_tokens[1] 737 | self.add_line_error(line_data, {'message': 'Start value of Target attribute is not a valid integer: "%s"' % line_data['attributes'][tag]['start'], 'error_type': 'FORMAT', 'location': ''}) 738 | try: 739 | line_data['attributes'][tag]['end'] = int(target_tokens[2]) 740 | if line_data['attributes'][tag]['end'] < 1: 741 | self.add_line_error(line_data, {'message': 'End value of Target attribute is not a valid 1-based integer coordinate: "%s"' % target_tokens[2], 'error_type': 'FORMAT', 'location': ''}) 742 | except ValueError: 743 | all_good = False 744 | line_data['attributes'][tag]['end'] = target_tokens[2] 745 | self.add_line_error(line_data, {'message': 'End value of Target attribute is not a valid integer: "%s"' % line_data['attributes'][tag]['end'], 'error_type': 'FORMAT', 'location': ''}) 746 | # if all_good then both 
start and end are int, so we can check if start is not less than or equal to end 747 | if all_good and line_data['attributes'][tag]['start'] > line_data['attributes'][tag]['end']: 748 | self.add_line_error(line_data, {'message': 'Start is not less than or equal to end', 'error_type': 'FORMAT', 'location': ''}) 749 | line_data['attributes'][tag]['strand'] = target_tokens[3] 750 | if line_data['attributes'][tag]['strand'] not in valid_attribute_target_strand: # set(['+', '-', '']) 751 | self.add_line_error(line_data, {'message': 'Strand value of Target attribute has illegal characters: "%s"' % line_data['attributes'][tag]['strand'], 'error_type': 'FORMAT', 'location': ''}) 752 | except IndexError: 753 | pass 754 | else: 755 | if value.find(',') >= 0: 756 | self.add_line_error(line_data, {'message': 'Value of %s attribute contains unescaped ",": "%s"' % (tag, value), 'error_type': 'FORMAT', 'location': ''}) 757 | line_data['attributes'][tag] = value 758 | if tag == 'Is_circular' and value != 'true': 759 | self.add_line_error(line_data, {'message': 'Value of Is_circular attribute is not "true": "%s"' % value, 'error_type': 'FORMAT', 'location': ''}) 760 | elif tag[:1].isupper() and tag not in reserved_attributes: # {'ID', 'Name', 'Alias', 'Parent', 'Target', 'Gap', 'Derives_from', 'Note', 'Dbxref', 'Ontology_term', 'Is_circular'} 761 | self.add_line_error(line_data, {'message': 'Unknown reserved (uppercase) attribute: "%s"' % tag, 'error_type': 'FORMAT', 'location': ''}) 762 | elif tag == 'ID': 763 | # check for duplicate ID in non-adjacent lines 764 | if value in features and lines[-1]['attributes'][tag] != value: 765 | self.add_line_error(line_data, {'message': 'Duplicate ID: "%s" in non-adjacent lines: %s' % (value, ','.join([str(f['line_index'] + 1) for f in features[value]])), 'error_type': 'FORMAT', 'location': ''}, log_level=logging.WARNING) 766 | features[value].append(line_data) 767 | except IndexError: 768 | pass 769 | current_line_num += 1 770 | lines.append(line_data) 771 | 772 | if isinstance(gff_file, str): 773 | gff_fp.close() 774 | 775 | # global look up of unresolved parents 776 | for feature_id in unresolved_parents: 777 | if feature_id in features: 778 | for line in unresolved_parents[feature_id]: 779 | self.add_line_error(line, {'message': 'Unresolved forward reference: "%s", found defined in lines: %s' % (feature_id, ','.join([str(ld['line_index'] + 1) for ld in features[feature_id]])), 'error_type': 'FORMAT', 'location': ''}) 780 | 781 | self.lines = lines 782 | self.features = features 783 | return 1 784 | 785 | def descendants(self, line_data): 786 | """ 787 | BFS graph algorithm 788 | :param line_data: line_data(dict) with line_data['line_index'] or line_index(int) 789 | :return: list of line_data(dict) 790 | """ 791 | # get start node 792 | try: 793 | start = line_data['line_index'] 794 | except TypeError: 795 | start = self.lines[line_data]['line_index'] 796 | visited_set, visited_list, queue = set(), [], [start] 797 | while queue: 798 | node = queue.pop(0) 799 | if node not in visited_set: 800 | visited_set.add(node) 801 | visited_list.append(self.lines[node]) 802 | queue.extend([ld['line_index'] for ld in self.lines[node]['children'] if ld['line_index'] not in visited_set]) 803 | return visited_list[1:] 804 | 805 | def ancestors(self, line_data): 806 | """ 807 | BFS graph algorithm 808 | 809 | :param line_data: line_data(dict) with line_data['line_index'] or line_index(int) 810 | :return: list of line_data(dict) 811 | """ 812 | # get start node 813 | try: 814 | 
805 | def ancestors(self, line_data): 806 | """ 807 | Collect all ancestor lines of line_data using breadth-first search (BFS). 808 | 809 | :param line_data: line_data(dict) with line_data['line_index'] or line_index(int) 810 | :return: list of line_data(dict) 811 | """ 812 | # get start node 813 | try: 814 | start = line_data['line_index'] 815 | except TypeError: 816 | start = self.lines[line_data]['line_index'] 817 | visited_set, visited_list, queue = set(), [], [start] 818 | while queue: 819 | node = queue.pop(0) 820 | if node not in visited_set: 821 | visited_set.add(node) 822 | visited_list.append(self.lines[node]) 823 | queue.extend([ld['line_index'] for f in self.lines[node]['parents'] for ld in f if ld['line_index'] not in visited_set]) 824 | return visited_list[1:] 825 | 826 | def adopt(self, old_parent, new_parent): 827 | """ 828 | Transfer children from old_parent to new_parent 829 | 830 | :param old_parent: feature_id(str) or line_index(int) or line_data(dict) or feature 831 | :param new_parent: feature_id(str) or line_index(int) or line_data(dict) 832 | :return: List of children transferred 833 | """ 834 | try: # assume line_data(dict) 835 | old_id = old_parent['attributes']['ID'] 836 | except TypeError: 837 | try: # assume line_index(int) 838 | old_id = self.lines[old_parent]['attributes']['ID'] 839 | except TypeError: # assume feature_id(str) 840 | old_id = old_parent 841 | old_feature = self.features[old_id] 842 | old_indexes = [ld['line_index'] for ld in old_feature] 843 | try: # assume line_data(dict) 844 | new_id = new_parent['attributes']['ID'] 845 | except TypeError: 846 | try: # assume line_index(int) 847 | new_id = self.lines[new_parent]['attributes']['ID'] 848 | except TypeError: # assume feature_id(str) 849 | new_id = new_parent 850 | new_feature = self.features[new_id] 851 | new_indexes = [ld['line_index'] for ld in new_feature] 852 | # build a list of children to be moved 853 | # add the child to the new parent's children list if it's not already there 854 | # update the child's parent list and Parent attribute 855 | # finally clear the old parent's children list 856 | children = old_feature[0]['children'] 857 | new_parent_children_set = set([ld['line_index'] for ld in new_feature[0]['children']]) 858 | for child in children: 859 | if child['line_index'] not in new_parent_children_set: 860 | new_parent_children_set.add(child['line_index']) 861 | for new_ld in new_feature: 862 | new_ld['children'].append(child) 863 | child['parents'].append(new_feature) 864 | child['attributes']['Parent'].append(new_id) 865 | # remove all matches; list.remove() only removes the first occurrence 866 | child['parents'] = [f for f in child['parents'] if f[0]['attributes']['ID'] != old_id] 867 | child['attributes']['Parent'] = [d for d in child['attributes']['Parent'] if d != old_id] 868 | for old_ld in old_feature: 869 | old_ld['children'] = [] 870 | return children 871 | 872 | def adopted(self, old_child, new_child): 873 | """ 874 | Transfer parents from old_child to new_child 875 | 876 | :param old_child: line_data(dict) with line_data['line_index'] or line_index(int) 877 | :param new_child: line_data(dict) with line_data['line_index'] or line_index(int) 878 | :return: List of parents transferred 879 | """ 880 | pass 881 | 882 | def overlap(self, line_data_a, line_data_b): 883 | return line_data_a['seqid'] == line_data_b['seqid'] and (line_data_a['start'] <= line_data_b['start'] and line_data_b['start'] <= line_data_a['end'] or 884 | line_data_a['start'] <= line_data_b['end'] and line_data_b['end'] <= line_data_a['end'] or 885 | line_data_b['start'] <= line_data_a['start'] and line_data_a['end'] <= line_data_b['end']) 886 |
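# Editor's illustration (hypothetical values, not part of the original source): overlap() is True
# only for two features on the same seqid whose closed intervals intersect, e.g.
# a = {'seqid': 'ctg123', 'start': 100, 'end': 200} and b = {'seqid': 'ctg123', 'start': 150, 'end': 300}
# overlap, while a and c = {'seqid': 'ctg123', 'start': 201, 'end': 300} do not.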
887 | def remove(self, line_data, root_type=None): 888 | """ 889 | Marks the 'line_status' of line_data and every line of its associated feature as 'removed'; it does not actually remove the line_data from the data structure. 890 | The write function checks 'line_status' when writing the gff file. 891 | Finds the root parent of line_data of type root_type and removes all of its descendants. 892 | If an ancestor of the root parent is left with no children after the removal, that ancestor is removed as well, recursively. 893 | 894 | :param line_data: line_data(dict) with line_data['line_index'] or line_index(int) 895 | :param root_type: feature type(str) of the root parent to remove from, or None to use parentless lines as roots 896 | :return: None 897 | """ 898 | roots = [ld for ld in self.ancestors(line_data) if (root_type and ld['type'] == root_type) or (not root_type and not ld['parents'])] or [line_data] 899 | for root in roots: 900 | root['line_status'] = 'removed' 901 | root_descendants = self.descendants(root) 902 | for root_descendant in root_descendants: 903 | root_descendant['line_status'] = 'removed' 904 | root_ancestors = self.ancestors(root) # BFS, so we will process closer ancestors first 905 | for root_ancestor in root_ancestors: 906 | if len([ld for ld in root_ancestor['children'] if ld['line_status'] != 'removed']) == 0: # if all children of a root_ancestor are removed 907 | # remove this root_ancestor 908 | root_ancestor['line_status'] = 'removed' 909 | 910 | 911 | def fix(self): 912 | pass 913 | 914 | def write(self, gff_file, embed_fasta=None, fasta_char_limit=None): 915 | gff_fp = gff_file 916 | if isinstance(gff_file, str): 917 | gff_fp = open(gff_file, 'w') 918 | 919 | wrote_sequence_region = set() 920 | # build sequence region data 921 | sequence_regions = {} 922 | if self.fasta_external: 923 | for seqid in self.fasta_external: 924 | sequence_regions[seqid] = (1, len(self.fasta_external[seqid]['seq'])) 925 | elif self.fasta_embedded: 926 | for seqid in self.fasta_embedded: 927 | sequence_regions[seqid] = (1, len(self.fasta_embedded[seqid]['seq'])) 928 | else: 929 | pass 930 | 931 | wrote_lines = set() 932 | field_keys = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase'] 933 | reserved_attributes = ['ID', 'Name', 'Alias', 'Parent', 'Target', 'Gap', 'Derives_from', 'Note', 'Dbxref', 'Ontology_term', 'Is_circular'] 934 | attributes_sort_map = defaultdict(int, zip(reserved_attributes, range(len(reserved_attributes), 0, -1))) 935 | def write_feature(line_data): 936 | if line_data['line_status'] == 'removed': 937 | return 938 | field_list = [str(line_data[k]) for k in field_keys] 939 | attribute_list = [] 940 | for k, v in sorted(line_data['attributes'].items(), key=lambda x: attributes_sort_map[x[0]], reverse=True): 941 | if isinstance(v, list): 942 | v = ','.join(v) 943 | attribute_list.append('%s=%s' % (str(k), str(v))) 944 | field_list.append(';'.join(attribute_list)) 945 | gff_fp.write('\t'.join(field_list) + '\n') 946 | wrote_lines.add(line_data['line_index']) 947 | # write directives 948 | ignore_directives = ['##sequence-region', '###', '##FASTA'] 949 | directives_lines = [line_data for line_data in self.lines if line_data['line_type'] == 'directive' and line_data['directive'] not in ignore_directives] 950 | for directives_line in directives_lines: 951 | gff_fp.write(directives_line['line_raw']) 952 | 953 | # write features 954 | # get a list of root nodes 955 | root_lines = [line_data for line_data in self.lines if line_data['line_type'] == 'feature' and not line_data['parents']] 956 | 957 | for root_line in root_lines: 958 | lines_wrote = len(wrote_lines) 959 | if root_line['line_index'] in wrote_lines: 960 | continue 961 | # write ##sequence-region if new seqid
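# Editor's illustration (hypothetical seqid and length, not part of the original source): the block
# below emits a directive such as "##sequence-region ctg123 1 1497228" ahead of the first feature
# on each new seqid, using the (start, end) pairs collected from the fasta reference above.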
962 | if root_line['seqid'] not in wrote_sequence_region: 963 | if root_line['seqid'] in sequence_regions: 964 | gff_fp.write('##sequence-region %s %d %d\n' % (root_line['seqid'], sequence_regions[root_line['seqid']][0], sequence_regions[root_line['seqid']][1])) 965 | wrote_sequence_region.add(root_line['seqid']) 966 | try: 967 | root_feature = self.features[root_line['attributes']['ID']] 968 | except KeyError: 969 | root_feature = [root_line] 970 | for line_data in root_feature: 971 | write_feature(line_data) 972 | descendants = self.descendants(root_line) 973 | for descendant in descendants: 974 | if descendant['line_index'] in wrote_lines: 975 | continue 976 | write_feature(descendant) 977 | # check if we actually wrote something 978 | if lines_wrote != len(wrote_lines): 979 | gff_fp.write('###\n') 980 | # write fasta 981 | fasta = embed_fasta or self.fasta_external or self.fasta_embedded 982 | if fasta and embed_fasta is not False: 983 | gff_fp.write('##FASTA\n') 984 | fasta_dict_to_file(fasta, gff_fp, line_char_limit=fasta_char_limit) 985 | 986 | if isinstance(gff_file, str): 987 | gff_fp.close() 988 | 989 | def sequence(self, line_data, child_type=None, reference=None): 990 | """ 991 | Get the sequence of line_data, according to the columns 'seqid', 'start', 'end', 'strand'. 992 | Requires a fasta reference. 993 | When used on 'mRNA' type line_data, child_type can be used to specify which kind of sequence to return: 994 | * child_type=None: pre-mRNA, returns the sequence of line_data from start to end, reverse complemented according to strand. (default) 995 | * child_type='exon': mature mRNA, concatenates the sequences of children of type 'exon'. 996 | * child_type='CDS': coding sequence, concatenates the sequences of children of type 'CDS'. Use the helper 997 | function translate(seq) on the returned value to obtain the protein sequence. 998 | 999 | :param line_data: line_data(dict) with line_data['line_index'] or line_index(int) 1000 | :param child_type: None or feature type(string) 1001 | :param reference: If None, will use self.fasta_external or self.fasta_embedded(dict) 1002 | :return: sequence(string) 1003 | """ 1004 | # get start node 1005 | reference = reference or self.fasta_external or self.fasta_embedded 1006 | if not reference: 1007 | raise Exception('External or embedded fasta reference needed') 1008 | try: 1009 | line_index = line_data['line_index'] 1010 | except TypeError: 1011 | line_index = self.lines[line_data]['line_index'] 1012 | ld = self.lines[line_index] 1013 | if ld['line_type'] != 'feature': 1014 | return None 1015 | seq = reference[ld['seqid']]['seq'][ld['start']-1:ld['end']] 1016 | if ld['strand'] == '-': 1017 | seq = complement(seq[::-1]) 1018 | return seq 1019 |
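# Editor's usage sketch (hypothetical identifiers; assumes a parsed Gff instance with a fasta
# reference attached, details not shown in this excerpt):
# mrna = gff.features['mRNA0001'][0]
# pre_mrna = gff.sequence(mrna) # reverse complemented automatically for '-' strand features
# protein = translate(gff.sequence(cds_line)) # translate(seq) is the helper named in the docstring above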
1020 | def type_tree(self): 1021 | class node(object): 1022 | def __init__(self, value, children=None): 1023 | self.value = value or '' 1024 | self.children = children or set() 1025 | 1026 | def __repr__(self, level=0): 1027 | ret = '\t' * level + repr(self.value) + '\n' 1028 | for child in sorted(list(self.children), key=lambda x: x.value): 1029 | ret += child.__repr__(level+1) 1030 | return ret 1031 | root_set = set() 1032 | node_dict = {} 1033 | feature_line_list = [line_data for line_data in self.lines if line_data['line_type'] == 'feature'] 1034 | for line_data in feature_line_list: 1035 | if len(line_data['children']) > 0: 1036 | parent_type = line_data['type'] 1037 | if parent_type not in node_dict: 1038 | node_dict[parent_type] = node(parent_type) 1039 | if len(line_data['parents']) == 0: 1040 | root_set.add(node_dict[parent_type]) 1041 | for child_ld in line_data['children']: 1042 | child_type = child_ld['type'] 1043 | if child_type not in node_dict: 1044 | node_dict[child_type] = node(child_type) 1045 | if parent_type == child_type and child_type == 'mRNA': 1046 | print(line_data['line_index'], child_ld['line_index']) 1047 | else: 1048 | node_dict[parent_type].children.add(node_dict[child_type]) 1049 | return sorted(list(root_set), key=lambda x: x.value) 1050 | 1051 | try: 1052 | from collections import OrderedDict 1053 | except ImportError: 1054 | # Backport of OrderedDict() class that runs on Python 2.4, 2.5, 2.6, 2.7 and pypy. 1055 | # Passes Python 2.7's test suite and incorporates all the latest updates. 1056 | 1057 | try: 1058 | from thread import get_ident as _get_ident 1059 | except ImportError: 1060 | from dummy_thread import get_ident as _get_ident 1061 | 1062 | try: 1063 | from _abcoll import KeysView, ValuesView, ItemsView 1064 | except ImportError: 1065 | pass 1066 | 1067 | 1068 | class OrderedDict(dict): 1069 | 'Dictionary that remembers insertion order' 1070 | # An inherited dict maps keys to values. 1071 | # The inherited dict provides __getitem__, __len__, __contains__, and get. 1072 | # The remaining methods are order-aware. 1073 | # Big-O running times for all methods are the same as for regular dictionaries. 1074 | 1075 | # The internal self.__map dictionary maps keys to links in a doubly linked list. 1076 | # The circular doubly linked list starts and ends with a sentinel element. 1077 | # The sentinel element never gets deleted (this simplifies the algorithm). 1078 | # Each link is stored as a list of length three: [PREV, NEXT, KEY]. 1079 | 1080 | def __init__(self, *args, **kwds): 1081 | '''Initialize an ordered dictionary. Signature is the same as for 1082 | regular dictionaries, but keyword arguments are not recommended 1083 | because their insertion order is arbitrary. 1084 | 1085 | ''' 1086 | if len(args) > 1: 1087 | raise TypeError('expected at most 1 arguments, got %d' % len(args)) 1088 | try: 1089 | self.__root 1090 | except AttributeError: 1091 | self.__root = root = [] # sentinel node 1092 | root[:] = [root, root, None] 1093 | self.__map = {} 1094 | self.__update(*args, **kwds) 1095 | 1096 | def __setitem__(self, key, value, dict_setitem=dict.__setitem__): 1097 | 'od.__setitem__(i, y) <==> od[i]=y' 1098 | # Setting a new item creates a new link which goes at the end of the linked 1099 | # list, and the inherited dictionary is updated with the new key/value pair. 1100 | if key not in self: 1101 | root = self.__root 1102 | last = root[0] 1103 | last[1] = root[0] = self.__map[key] = [last, root, key] 1104 | dict_setitem(self, key, value) 1105 | 1106 | def __delitem__(self, key, dict_delitem=dict.__delitem__): 1107 | 'od.__delitem__(y) <==> del od[y]' 1108 | # Deleting an existing item uses self.__map to find the link which is 1109 | # then removed by updating the links in the predecessor and successor nodes. 1110 | dict_delitem(self, key) 1111 | link_prev, link_next, key = self.__map.pop(key) 1112 | link_prev[1] = link_next 1113 | link_next[0] = link_prev 1114 | 1115 | def __iter__(self): 1116 | 'od.__iter__() <==> iter(od)' 1117 | root = self.__root 1118 | curr = root[1] 1119 | while curr is not root: 1120 | yield curr[2] 1121 | curr = curr[1] 1122 | 1123 | def __reversed__(self): 1124 | 'od.__reversed__() <==> reversed(od)' 1125 | root = self.__root 1126 | curr = root[0] 1127 | while curr is not root: 1128 | yield curr[2] 1129 | curr = curr[0] 1130 | 1131 | def clear(self): 1132 | 'od.clear() -> None. Remove all items from od.'
1133 | try: 1134 | for node in self.__map.itervalues(): 1135 | del node[:] 1136 | root = self.__root 1137 | root[:] = [root, root, None] 1138 | self.__map.clear() 1139 | except AttributeError: 1140 | pass 1141 | dict.clear(self) 1142 | 1143 | def popitem(self, last=True): 1144 | '''od.popitem() -> (k, v), return and remove a (key, value) pair. 1145 | Pairs are returned in LIFO order if last is true or FIFO order if false. 1146 | 1147 | ''' 1148 | if not self: 1149 | raise KeyError('dictionary is empty') 1150 | root = self.__root 1151 | if last: 1152 | link = root[0] 1153 | link_prev = link[0] 1154 | link_prev[1] = root 1155 | root[0] = link_prev 1156 | else: 1157 | link = root[1] 1158 | link_next = link[1] 1159 | root[1] = link_next 1160 | link_next[0] = root 1161 | key = link[2] 1162 | del self.__map[key] 1163 | value = dict.pop(self, key) 1164 | return key, value 1165 | 1166 | # -- the following methods do not depend on the internal structure -- 1167 | 1168 | def keys(self): 1169 | 'od.keys() -> list of keys in od' 1170 | return list(self) 1171 | 1172 | def values(self): 1173 | 'od.values() -> list of values in od' 1174 | return [self[key] for key in self] 1175 | 1176 | def items(self): 1177 | 'od.items() -> list of (key, value) pairs in od' 1178 | return [(key, self[key]) for key in self] 1179 | 1180 | def iterkeys(self): 1181 | 'od.iterkeys() -> an iterator over the keys in od' 1182 | return iter(self) 1183 | 1184 | def itervalues(self): 1185 | 'od.itervalues -> an iterator over the values in od' 1186 | for k in self: 1187 | yield self[k] 1188 | 1189 | def iteritems(self): 1190 | 'od.iteritems -> an iterator over the (key, value) items in od' 1191 | for k in self: 1192 | yield (k, self[k]) 1193 | 1194 | def update(*args, **kwds): 1195 | '''od.update(E, **F) -> None. Update od from dict/iterable E and F. 1196 | 1197 | If E is a dict instance, does: for k in E: od[k] = E[k] 1198 | If E has a .keys() method, does: for k in E.keys(): od[k] = E[k] 1199 | Or if E is an iterable of items, does: for k, v in E: od[k] = v 1200 | In either case, this is followed by: for k, v in F.items(): od[k] = v 1201 | 1202 | ''' 1203 | if len(args) > 2: 1204 | raise TypeError('update() takes at most 2 positional ' 1205 | 'arguments (%d given)' % (len(args),)) 1206 | elif not args: 1207 | raise TypeError('update() takes at least 1 argument (0 given)') 1208 | self = args[0] 1209 | # Make progressively weaker assumptions about "other" 1210 | other = () 1211 | if len(args) == 2: 1212 | other = args[1] 1213 | if isinstance(other, dict): 1214 | for key in other: 1215 | self[key] = other[key] 1216 | elif hasattr(other, 'keys'): 1217 | for key in other.keys(): 1218 | self[key] = other[key] 1219 | else: 1220 | for key, value in other: 1221 | self[key] = value 1222 | for key, value in kwds.items(): 1223 | self[key] = value 1224 | 1225 | __update = update # let subclasses override update without breaking __init__ 1226 | 1227 | __marker = object() 1228 | 1229 | def pop(self, key, default=__marker): 1230 | '''od.pop(k[,d]) -> v, remove specified key and return the corresponding value. 1231 | If key is not found, d is returned if given, otherwise KeyError is raised. 
1232 | 1233 | ''' 1234 | if key in self: 1235 | result = self[key] 1236 | del self[key] 1237 | return result 1238 | if default is self.__marker: 1239 | raise KeyError(key) 1240 | return default 1241 | 1242 | def setdefault(self, key, default=None): 1243 | 'od.setdefault(k[,d]) -> od.get(k,d), also set od[k]=d if k not in od' 1244 | if key in self: 1245 | return self[key] 1246 | self[key] = default 1247 | return default 1248 | 1249 | def __repr__(self, _repr_running={}): 1250 | 'od.__repr__() <==> repr(od)' 1251 | call_key = id(self), _get_ident() 1252 | if call_key in _repr_running: 1253 | return '...' 1254 | _repr_running[call_key] = 1 1255 | try: 1256 | if not self: 1257 | return '%s()' % (self.__class__.__name__,) 1258 | return '%s(%r)' % (self.__class__.__name__, self.items()) 1259 | finally: 1260 | del _repr_running[call_key] 1261 | 1262 | def __reduce__(self): 1263 | 'Return state information for pickling' 1264 | items = [[k, self[k]] for k in self] 1265 | inst_dict = vars(self).copy() 1266 | for k in vars(OrderedDict()): 1267 | inst_dict.pop(k, None) 1268 | if inst_dict: 1269 | return (self.__class__, (items,), inst_dict) 1270 | return self.__class__, (items,) 1271 | 1272 | def copy(self): 1273 | 'od.copy() -> a shallow copy of od' 1274 | return self.__class__(self) 1275 | 1276 | @classmethod 1277 | def fromkeys(cls, iterable, value=None): 1278 | '''OD.fromkeys(S[, v]) -> New ordered dictionary with keys from S 1279 | and values equal to v (which defaults to None). 1280 | 1281 | ''' 1282 | d = cls() 1283 | for key in iterable: 1284 | d[key] = value 1285 | return d 1286 | 1287 | def __eq__(self, other): 1288 | '''od.__eq__(y) <==> od==y. Comparison to another OD is order-sensitive 1289 | while comparison to a regular mapping is order-insensitive. 1290 | 1291 | ''' 1292 | if isinstance(other, OrderedDict): 1293 | return len(self)==len(other) and self.items() == other.items() 1294 | return dict.__eq__(self, other) 1295 | 1296 | def __ne__(self, other): 1297 | return not self == other 1298 | 1299 | # -- the following methods are only used in Python 2.7 -- 1300 | 1301 | def viewkeys(self): 1302 | "od.viewkeys() -> a set-like object providing a view on od's keys" 1303 | return KeysView(self) 1304 | 1305 | def viewvalues(self): 1306 | "od.viewvalues() -> an object providing a view on od's values" 1307 | return ValuesView(self) 1308 | 1309 | def viewitems(self): 1310 | "od.viewitems() -> a set-like object providing a view on od's items" 1311 | return ItemsView(self) 1312 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wheel==0.23.0 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | try: 6 | from setuptools import setup 7 | except ImportError: 8 | from distutils.core import setup 9 | 10 | 11 | readme = open('README.rst').read() 12 | history = open('HISTORY.rst').read().replace('.. 
:changelog:', '') 13 | 14 | requirements = [ 15 | # TODO: put package requirements here 16 | ] 17 | 18 | test_requirements = [ 19 | # TODO: put package test requirements here 20 | ] 21 | 22 | setup( 23 | name='gff3', 24 | version='1.0.1', 25 | description='Manipulate genomic features and validate the syntax and reference sequence of your GFF3 files.', 26 | long_description=readme + '\n\n' + history, 27 | author='Han Lin', 28 | author_email='hotdogee@gmail.com', 29 | url='https://github.com/hotdogee/gff3-py', 30 | packages=[ 31 | 'gff3', 32 | ], 33 | package_dir={'gff3': 34 | 'gff3'}, 35 | include_package_data=True, 36 | install_requires=requirements, 37 | license="BSD", 38 | zip_safe=False, 39 | keywords='gff3', 40 | classifiers=[ 41 | 'Development Status :: 2 - Pre-Alpha', 42 | 'Intended Audience :: Developers', 43 | 'License :: OSI Approved :: BSD License', 44 | 'Natural Language :: English', 45 | "Programming Language :: Python :: 2", 46 | 'Programming Language :: Python :: 2.6', 47 | 'Programming Language :: Python :: 2.7', 48 | 'Programming Language :: Python :: 3', 49 | 'Programming Language :: Python :: 3.3', 50 | 'Programming Language :: Python :: 3.4', 51 | ], 52 | test_suite='tests', 53 | tests_require=test_requirements 54 | ) 55 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /tests/test_gff3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | test_gff3 6 | ---------------------------------- 7 | 8 | Tests for `gff3` module. 9 | """ 10 | 11 | import unittest 12 | 13 | from gff3 import gff3 14 | 15 | 16 | class TestGff3(unittest.TestCase): 17 | 18 | def setUp(self): 19 | pass 20 | 21 | def test_something(self): 22 | pass 23 | 24 | def tearDown(self): 25 | pass 26 | 27 | if __name__ == '__main__': 28 | unittest.main() 29 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py26, py27, py33, py34, py35, py36 3 | 4 | [testenv] 5 | setenv = 6 | PYTHONPATH = {toxinidir}:{toxinidir}/gff3 7 | commands = python setup.py test 8 | deps = 9 | -r {toxinidir}/requirements.txt 10 | --------------------------------------------------------------------------------