├── SeqFindr ├── __init__.py ├── SeqFindr ├── vfdb_to_seqfindr ├── imaging.py ├── config.py ├── vfdb_to_seqfindr.py ├── blast.py ├── util.py └── seqfindr.py ├── .env ├── tests ├── doctests │ ├── blast.tests │ ├── imaging.tests │ ├── SeqFindr.tests │ ├── config │ │ ├── SeqFindr.cfg.1 │ │ ├── SeqFindr.cfg.6 │ │ ├── SeqFindr.cfg.2 │ │ ├── SeqFindr.cfg.5 │ │ ├── SeqFindr.cfg │ │ ├── SeqFindr.cfg.3 │ │ └── SeqFindr.cfg.4 │ ├── vfdb_to_seqfindr.tests │ ├── run_tests.sh │ ├── util │ │ ├── sample_DB.fa.1 │ │ ├── sample_DB.fa.2 │ │ └── sample_DB.fa │ ├── run_tests.py │ ├── util.tests │ └── config.tests └── unittests │ ├── __init__.py │ ├── run_tests.sh │ ├── test_config.py │ ├── test_imaging.py │ ├── test_vfdb_to_seqfindr.py │ ├── context.py │ ├── test_blast.py │ ├── test_util.py │ └── test_seqfindr.py ├── CONTRIBUTING.rst ├── example ├── consensus │ ├── NOTES.rst │ ├── strain1.fa │ ├── strain2.fa │ └── strain3.fa ├── dummy.order ├── run1.png ├── run2.png ├── run3.png ├── run4.png ├── CU_fimbriae.png ├── run1_small.png ├── run2_small.png ├── run3_small.png ├── run4_small.png ├── dendrogram_run3.png ├── dendrogram_run3_small.png ├── assemblies │ ├── strain2.fa │ ├── strain3.fa │ ├── strain1.fa │ └── NOTES.rst ├── run_examples.sh └── Antibiotic_markers.fa ├── docs ├── modules.rst ├── index.rst ├── SeqFindr.rst ├── developer.rst ├── Makefile └── conf.py ├── requirements.txt ├── MANIFEST.in ├── .landscape.yaml ├── requirements-dev.txt ├── .gitignore ├── .bumpversion.cfg ├── .travis.yml ├── TODO.rst ├── CHANGES.rst ├── do_release.sh ├── setup.py ├── HELP.rst ├── LICENSE └── README.rst /SeqFindr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | workon SeqFindr 2 | -------------------------------------------------------------------------------- /SeqFindr/SeqFindr: 
-------------------------------------------------------------------------------- 1 | seqfindr.py -------------------------------------------------------------------------------- /tests/doctests/blast.tests: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/doctests/imaging.tests: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unittests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/doctests/SeqFindr.tests: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | docs/developer.rst -------------------------------------------------------------------------------- /tests/doctests/config/SeqFindr.cfg.1: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/doctests/vfdb_to_seqfindr.tests: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /SeqFindr/vfdb_to_seqfindr: -------------------------------------------------------------------------------- 1 | vfdb_to_seqfindr.py -------------------------------------------------------------------------------- /example/consensus/NOTES.rst: -------------------------------------------------------------------------------- 1 | ../assemblies/NOTES.rst 
-------------------------------------------------------------------------------- /example/dummy.order: -------------------------------------------------------------------------------- 1 | strain3 2 | strain2 3 | strain1 4 | -------------------------------------------------------------------------------- /tests/doctests/config/SeqFindr.cfg.6: -------------------------------------------------------------------------------- 1 | fake = a, b, c 2 | -------------------------------------------------------------------------------- /tests/doctests/config/SeqFindr.cfg.2: -------------------------------------------------------------------------------- 1 | category_colors = 2 | -------------------------------------------------------------------------------- /tests/unittests/run_tests.sh: -------------------------------------------------------------------------------- 1 | py.test --cov ../../SeqFindr/ 2 | -------------------------------------------------------------------------------- /tests/unittests/test_config.py: -------------------------------------------------------------------------------- 1 | from context import config 2 | -------------------------------------------------------------------------------- /tests/unittests/test_imaging.py: -------------------------------------------------------------------------------- 1 | from context import imaging 2 | -------------------------------------------------------------------------------- /tests/doctests/config/SeqFindr.cfg.5: -------------------------------------------------------------------------------- 1 | category_colors = [(0,0,0),(r,g,b)] 2 | -------------------------------------------------------------------------------- /example/run1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mscook/SeqFindR/HEAD/example/run1.png -------------------------------------------------------------------------------- /example/run2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mscook/SeqFindR/HEAD/example/run2.png -------------------------------------------------------------------------------- /example/run3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mscook/SeqFindR/HEAD/example/run3.png -------------------------------------------------------------------------------- /example/run4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mscook/SeqFindR/HEAD/example/run4.png -------------------------------------------------------------------------------- /tests/doctests/config/SeqFindr.cfg: -------------------------------------------------------------------------------- 1 | category_colors = [(0,0,0),(255,255,255)] 2 | -------------------------------------------------------------------------------- /tests/doctests/config/SeqFindr.cfg.3: -------------------------------------------------------------------------------- 1 | category_colors = [(0,0,0),(255,255,255) 2 | -------------------------------------------------------------------------------- /tests/doctests/config/SeqFindr.cfg.4: -------------------------------------------------------------------------------- 1 | category_colors = [(0,0,0),(255,255)] 2 | -------------------------------------------------------------------------------- /tests/unittests/test_vfdb_to_seqfindr.py: -------------------------------------------------------------------------------- 1 | from context import vfdb_to_seqfindr 2 | -------------------------------------------------------------------------------- /example/CU_fimbriae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mscook/SeqFindR/HEAD/example/CU_fimbriae.png 
-------------------------------------------------------------------------------- /example/run1_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mscook/SeqFindR/HEAD/example/run1_small.png -------------------------------------------------------------------------------- /example/run2_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mscook/SeqFindR/HEAD/example/run2_small.png -------------------------------------------------------------------------------- /example/run3_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mscook/SeqFindR/HEAD/example/run3_small.png -------------------------------------------------------------------------------- /example/run4_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mscook/SeqFindR/HEAD/example/run4_small.png -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | SeqFindr 2 | ======== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | SeqFindr 8 | -------------------------------------------------------------------------------- /example/dendrogram_run3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mscook/SeqFindR/HEAD/example/dendrogram_run3.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.8.1 2 | scipy==0.14.0 3 | matplotlib==1.3.1 4 | biopython==1.64 5 | ghalton==0.6 6 | -------------------------------------------------------------------------------- /example/dendrogram_run3_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mscook/SeqFindR/HEAD/example/dendrogram_run3_small.png -------------------------------------------------------------------------------- /tests/doctests/run_tests.sh: -------------------------------------------------------------------------------- 1 | pip install coverage 2 | coverage run run_tests.py 3 | coverage report --omit "*/Bio/*" 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements.txt 3 | include README.rst 4 | recursive-include docs/_build/html * 5 | -------------------------------------------------------------------------------- /.landscape.yaml: -------------------------------------------------------------------------------- 1 | doc-warnings: yes 2 | test-warnings: no 3 | strictness: medium 4 | autodetect: yes 5 | ignore-paths: 6 | - docs 7 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==2.6.4 2 | pytest-cov==1.8.1 3 | 
pytest-mock==0.4.0 4 | Sphinx==1.2.3 5 | bumpversion==0.5.0 6 | wheel==0.24.0 7 | twine==1.3.1 8 | -------------------------------------------------------------------------------- /tests/doctests/util/sample_DB.fa.1: -------------------------------------------------------------------------------- 1 | >70-tem8674, bla-TEM, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. [Beta-lactams] 2 | AAAGTTCTGCTATGTGGCGCGGTATTATCCCGTGTTGACGCCGGGCAAGAGCAACTCGGTCGCCGCATAC 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | holding/* 2 | build/* 3 | dist/* 4 | SeqFindR.egg-info/* 5 | *.pyc 6 | .coverage 7 | example/Antibiotic_markers_trimmed.fa 8 | example/consensus/stripped/ 9 | _build 10 | .DS_Store 11 | *.spec 12 | .idea/ 13 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.35.0 3 | commit = True 4 | tag = False 5 | 6 | [bumpversion:file:setup.py] 7 | 8 | [bumpversion:file:HELP.rst] 9 | 10 | [bumpversion:file:SeqFindr/seqfindr.py] 11 | 12 | [bumpversion:file:do_release.sh] 13 | 14 | [bumpversion:file:docs/conf.py] 15 | 16 | -------------------------------------------------------------------------------- /tests/unittests/context.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pytest 4 | 5 | sys.path.insert(0, os.path.abspath('../../')) 6 | 7 | from SeqFindr import blast 8 | from SeqFindr import config 9 | from SeqFindr import imaging 10 | from SeqFindr import seqfindr 11 | from SeqFindr import util 12 | from SeqFindr import vfdb_to_seqfindr 13 | -------------------------------------------------------------------------------- /tests/doctests/run_tests.py: 
#!/usr/bin/env python
"""Run the SeqFindr doctest files.

Meant to be started from the ``tests/doctests`` directory, e.g. through
``run_tests.sh`` (``coverage run run_tests.py``). Every file named in
``TEST_FILES`` is handed to ``doctest.testfile`` and the process exits with
a non-zero status when at least one doctest example failed.
"""

import doctest
import os
import sys

# Prepend the parent directory to the module search path before any doctest
# file is executed.
# NOTE(review): from tests/doctests this resolves to tests/, not the
# repository root; tests/unittests/context.py uses '../../' -- confirm which
# one is intended.
sys.path.insert(0, "../")

# Doctest files to run, in order. The names must match the files on disk
# exactly: the suite for the main module is "SeqFindr.tests" (lower-case
# "r"), so the former "SeqFindR.tests" spelling could not be opened on a
# case-sensitive filesystem.
TEST_FILES = (
    "blast.tests",
    "imaging.tests",
    "SeqFindr.tests",
    "vfdb_to_seqfindr.tests",
    "config.tests",
    "util.tests",
)


def main():
    """Run every doctest file and return the total number of failures."""
    failed = 0
    for test_file in TEST_FILES:
        # doctest.testfile returns TestResults(failed, attempted)
        failed += doctest.testfile(test_file).failed
    return failed


if __name__ == "__main__":
    sys.exit(1 if main() else 0)
11 | notifications: 12 | email: true 13 | script: 14 | # cd tests && sh TEST.sh 15 | - echo 'Not testing ATM' 16 | -------------------------------------------------------------------------------- /tests/doctests/util.tests: -------------------------------------------------------------------------------- 1 | >>> from SeqFindr import util 2 | >>> 3 | >>> # A good file 4 | >>> util.check_database('util/sample_DB.fa') 5 | SeqFindr database checks [PASSED] 6 | >>> 7 | >>> # A good file (single entry) 8 | >>> util.check_database('util/sample_DB.fa.1') 9 | SeqFindr database checks [PASSED] 10 | >>> 11 | >>> # A BAD file (missing a comma) 12 | >>> util.check_database('util/sample_DB.fa.2') 13 | Traceback (most recent call last): 14 | Exception: Database is not formatted correctly 15 | >>> 16 | 17 | -------------------------------------------------------------------------------- /example/assemblies/strain3.fa: -------------------------------------------------------------------------------- 1 | >Contig 1, strain 3 2 | AAAGTTCTGCTATGCCGTGTTGACGCCGGGCAAGAGCAACTCGGTCGCCGCATACAAACAACCTTCAGTT 3 | CCTTCAAATAATGGAGATGCGACAGTAGAGATATCTGTTGATGCACTGGCGCTGCGTAGCGCAGGCTAAT 4 | TTACTGCTACTTTTACAAAGCACGAAAACACCATTGACGGCTTCGGCAGAGAACTCGCTGATTGCCATTG 5 | TAATCCCAATATTCTCCATTTTGAGTATCAAGAACGGAAACACCTATACGAGCAGATGATGTCGTCGTGC 6 | ACAATGACTTCTACAGCGCGGAGAATCTCGCTCTCTCCAGGGGAAGCCGCCCGAGTGAGGTGCATGCGAG 7 | CCTGTAGGACTCTATGTGCTTTGTAGGCCAGTCCACTGGTGGTACTTCACACCGGTTTGGACTCCGAGTT 8 | TTCGAATTGCCTCCGTTATTGCCTTCCGCGTATGCATCGCGATATCTCCGGCGCATCGATAGATTGTCGC 9 | ACCTGATTGCCCGACATTATCGCGAGCCCATTAGTCATAGCCGAATAGCCTCTCCACCCAAGCGGCCGGA 10 | GAACCTGCGTGCAATCCATCTTGTTCAATCAT 11 | -------------------------------------------------------------------------------- /example/assemblies/strain1.fa: -------------------------------------------------------------------------------- 1 | >Contig 1, strain 1 2 | AAAGTTCTGCTATGCCGTGTTGACGCCGGGCAAGAGCAACTCGGTCGCCGCATACAAACAACCTTCAGTT 3 | 
CCTTCAAATAATGGAGATGCGACAGTAGAGATATCTGTTGATGCACTGGCGCTGCGTAGCGCAGGCTAAT 4 | TTACTGCTACTTTTACAAAGCACGAAAACACCATTGACGGCTTCGGCAGAGAACTCGCTGATTGCCATTG 5 | TAATCCCAATATTCTCCATTTTGAGTATCAAGAACGGAAACACCTATACGAGCAGATGATGTCGTCGTGC 6 | ACAACAATGGTGACTTCTACAGCGCGGAGAATCTCGCTCTCTCCAGGGGAAGCCGCCCGAGTGAGGTGCA 7 | TGCGAGCCTGTAGGACTCTATGTGCTTTGTAGGCCAGTCCACTGGTGGTACTTCACACCGGTTTGGACTC 8 | CGAGTTTTCGAATTGCCTCCGTTATTGCCTTCCGCGTATGCATCGCGATATCTCCGGCGCATCGATAGAT 9 | TGTCGCACCTGATTGCCCGACATTATCGCGAGCCCATTAGTCATAGCCGAATAGCCTCTCCACCCAAGCG 10 | GCCGGAGAACCTGCGTGCAATCCATCTTGTTCAATCAT 11 | -------------------------------------------------------------------------------- /example/assemblies/NOTES.rst: -------------------------------------------------------------------------------- 1 | Artificial data configuration 2 | ============================= 3 | 4 | 5 | Strain1 6 | ------- 7 | 8 | Missing: 9 | * 70-shv86 10 | * 70-ctx143 11 | * 70-aac3(IV)380 12 | 13 | Mis-assembly of: 14 | * 70-aphA(1)1310 15 | * 70-tem8674 16 | 17 | 18 | Strain2 19 | ------- 20 | 21 | Missing: 22 | * 70-oxa(7)295 23 | * 70-pse(4)348 24 | * 70-ctx143 25 | * 70-aadA1588 26 | * 70-aadB1778 27 | * 70-aacC(2)200 28 | 29 | 30 | 31 | Strain 3 32 | -------- 33 | Missing: 34 | * 70-shv86 35 | * 70-ctx143 36 | * 70-aac3(IV)380 37 | 38 | Mis-assembly of: 39 | * 70-aphA(1)1310 40 | * 70-tem8674 41 | * 70-aadA1588 42 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | SeqFindr 2 | ======== 3 | 4 | SeqFindr - easily create informative genomic feature plots. 5 | 6 | 7 | User/Quickstart/Tutorial Documentation 8 | -------------------------------------- 9 | 10 | See on `SeqFindr GitHub`_ site. 11 | 12 | 13 | API Documentation 14 | ----------------- 15 | 16 | Explore the available methods of SeqFindr 17 | 18 | .. 
toctree:: 19 | :maxdepth: 1 20 | 21 | modules.rst 22 | 23 | 24 | Developer Documentation 25 | ----------------------- 26 | 27 | Learn how to contribute to the development of SeqFindr. 28 | 29 | .. toctree:: 30 | :maxdepth: 1 31 | 32 | developer.rst 33 | 34 | 35 | 36 | Indices and tables 37 | ------------------ 38 | 39 | * :ref:`genindex` 40 | * :ref:`modindex` 41 | * :ref:`search` 42 | 43 | 44 | 45 | .. _`SeqFindr GitHub`: https://github.com/mscook/SeqFindr 46 | -------------------------------------------------------------------------------- /TODO.rst: -------------------------------------------------------------------------------- 1 | SeqFindr release map 2 | ==================== 3 | 4 | See open `issues`_. Feel free to fork, clone, fix, test, push and send a pull 5 | request. Please, before you submit a pull request, could you sync with the 6 | upstream (this) master. Please see this `tutorial`_ on how to do this. 7 | 8 | 9 | Minor 10 | ----- 11 | 12 | Current small fixes: 13 | * Provide a figure rotation option 14 | 15 | Image rotation can be accomplished by doing something like this:: 16 | 17 | import Image 18 | img = Image.open("plot.jpg") 19 | img2 = img.rotate(45) 20 | img2.show() 21 | img2.save("rotate.jpg") 22 | 23 | 24 | Major 25 | ----- 26 | 27 | Current major fixes/improvements: 28 | * make into a web app 29 | * more extensive tests 30 | * SeqFindr-d3js 31 | 32 | .. _issues: https://github.com/mscook/SeqFindr/issues?direction=desc&sort=created&state=open 33 | .. 
_tutorial: https://help.github.com/articles/syncing-a-fork 34 | -------------------------------------------------------------------------------- /example/consensus/strain1.fa: -------------------------------------------------------------------------------- 1 | >70-tem8674 2 | AAAGTTCTGCTATGTGGCGCGGTATTATCCCGTGTTGACGCCGGGCAAGAGCAACTCGGTCGCCGCATAC 3 | >70-shv86 4 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 5 | >70-oxa(1)256 6 | AAACAACCTTCAGTTCCTTCAAATAATGGAGATGCGACAGTAGAGATATCTGTTGATGCACTGGCGCTGC 7 | >70-oxa(7)295 8 | GTAGCGCAGGCTAATTTACTGCTACTTTTACAAAGCACGAAAACACCATTGACGGCTTCGGCAGAGAACT 9 | >70-pse(4)348 10 | CGCTGATTGCCATTGTAATCCCAATATTCTCCATTTTGAGTATCAAGAACGGAAACACCTATACGAGCAG 11 | >70-ctx143 12 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 13 | >70-aadA1588 14 | ATGATGTCGTCGTGCACAACAATGGTGACTTCTACAGCGCGGAGAATCTCGCTCTCTCCAGGGGAAGCCG 15 | >70-aadB1778 16 | CCCGAGTGAGGTGCATGCGAGCCTGTAGGACTCTATGTGCTTTGTAGGCCAGTCCACTGGTGGTACTTCA 17 | >70-aacC(2)200 18 | CACCGGTTTGGACTCCGAGTTTTCGAATTGCCTCCGTTATTGCCTTCCGCGTATGCATCGCGATATCTCC 19 | >70-aac3(IV)380 20 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 21 | >70-aphA(1)1310 22 | GGCGCATCGGGCTTCCCATACAATCGATAGATTGTCGCACCTGATTGCCCGACATTATCGCGAGCCCATT 23 | >70-aphA(2)220 24 | AGTCATAGCCGAATAGCCTCTCCACCCAAGCGGCCGGAGAACCTGCGTGCAATCCATCTTGTTCAATCAT 25 | -------------------------------------------------------------------------------- /example/consensus/strain2.fa: -------------------------------------------------------------------------------- 1 | >70-tem8674 2 | AAAGTTCTGCTATGTGGCGCGGTATTATCCCGTGTTGACGCCGGGCAAGAGCAACTCGGTCGCCGCATAC 3 | >70-shv86 4 | CTCAAGCGGCTGCGGGCTGGCGTGTACCGCCAGCGGCAGGGTGGCTAACAGGGAGATAATACACAGGCGA 5 | >70-oxa(1)256 6 | AAACAACCTTCAGTTCCTTCAAATAATGGAGATGCGACAGTAGAGATATCTGTTGATGCACTGGCGCTGC 7 | >70-oxa(7)295 8 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 9 | >70-pse(4)348 10 | 
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 11 | >70-ctx143 12 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 13 | >70-aadA1588 14 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 15 | >70-aadB1778 16 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 17 | >70-aacC(2)200 18 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 19 | >70-aac3(IV)380 20 | TCGATCAGTCCAAGTGGCCCATCTTCGAGGGGCCGGACGCTACGGAAGGAGCTGTGGACCAGCAGCACAC 21 | >70-aphA(1)1310 22 | GGCGCATCGGGCTTCCCATACAATCGATAGATTGTCGCACCTGATTGCCCGACATTATCGCGAGCCCATT 23 | >70-aphA(2)220 24 | AGTCATAGCCGAATAGCCTCTCCACCCAAGCGGCCGGAGAACCTGCGTGCAATCCATCTTGTTCAATCAT 25 | -------------------------------------------------------------------------------- /example/consensus/strain3.fa: -------------------------------------------------------------------------------- 1 | >70-tem8674 2 | AAAGTTCTGCTATGTGGCGCGGTATTATCCCGTGTTGACGCCGGGCAAGAGCAACTCGGTCGCCGCATAC 3 | >70-shv86 4 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 5 | >70-oxa(1)256 6 | AAACAACCTTCAGTTCCTTCAAATAATGGAGATGCGACAGTAGAGATATCTGTTGATGCACTGGCGCTGC 7 | >70-oxa(7)295 8 | GTAGCGCAGGCTAATTTACTGCTACTTTTACAAAGCACGAAAACACCATTGACGGCTTCGGCAGAGAACT 9 | >70-pse(4)348 10 | CGCTGATTGCCATTGTAATCCCAATATTCTCCATTTTGAGTATCAAGAACGGAAACACCTATACGAGCAG 11 | >70-ctx143 12 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 13 | >70-aadA1588 14 | ATGATGTCGTCGTGCACAACAATGGTGACTTCTACAGCGCGGAGAATCTCGCTCTCTCCAGGGGAAGCCG 15 | >70-aadB1778 16 | CCCGAGTGAGGTGCATGCGAGCCTGTAGGACTCTATGTGCTTTGTAGGCCAGTCCACTGGTGGTACTTCA 17 | >70-aacC(2)200 18 | CACCGGTTTGGACTCCGAGTTTTCGAATTGCCTCCGTTATTGCCTTCCGCGTATGCATCGCGATATCTCC 19 | >70-aac3(IV)380 20 | NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 21 | >70-aphA(1)1310 22 | GGCGCATCGGGCTTCCCATACAATCGATAGATTGTCGCACCTGATTGCCCGACATTATCGCGAGCCCATT 23 | >70-aphA(2)220 24 | 
AGTCATAGCCGAATAGCCTCTCCACCCAAGCGGCCGGAGAACCTGCGTGCAATCCATCTTGTTCAATCAT 25 | -------------------------------------------------------------------------------- /docs/SeqFindr.rst: -------------------------------------------------------------------------------- 1 | SeqFindr package 2 | ================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | SeqFindr.blast module 8 | --------------------- 9 | 10 | .. automodule:: SeqFindr.blast 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | SeqFindr.config module 16 | ---------------------- 17 | 18 | .. automodule:: SeqFindr.config 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | SeqFindr.imaging module 24 | ----------------------- 25 | 26 | .. automodule:: SeqFindr.imaging 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | SeqFindr.seqfindr module 32 | ------------------------ 33 | 34 | .. automodule:: SeqFindr.seqfindr 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | SeqFindr.util module 40 | -------------------- 41 | 42 | .. automodule:: SeqFindr.util 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | SeqFindr.vfdb_to_seqfindr module 48 | -------------------------------- 49 | 50 | .. automodule:: SeqFindr.vfdb_to_seqfindr 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | 56 | Module contents 57 | --------------- 58 | 59 | .. 
def test_make_BLAST_database(tmpdir):
    """
    Test the make_BLAST_database() function

    Function signature::

        make_BLAST_database(fasta_file)

    The call is made from inside a scratch directory that contains an empty
    ``DBs`` sub-directory. Afterwards ``DBs`` is expected to hold exactly
    four entries: the FASTA file itself plus its ``.nhr``, ``.nin`` and
    ``.nsq`` companions.

    :param tmpdir: pytest's per-test temporary directory fixture
    """
    # Resolve the example database while still in the start directory; the
    # relative path would no longer be valid after the chdir below.
    fasta = os.path.abspath("../../example/Antibiotic_markers.fa")
    workdir = str(tmpdir)
    start_dir = os.getcwd()
    os.chdir(workdir)
    try:
        os.mkdir("DBs")
        blast.make_BLAST_database(fasta)
    finally:
        # Always go back, so that later tests relying on relative paths are
        # not run from inside the (soon to be removed) scratch directory.
        os.chdir(start_dir)
    infs = glob.glob(os.path.join(workdir, "DBs", "*"))
    tidied = [os.path.basename(f) for f in infs]
    assert len(infs) == 4
    assert "Antibiotic_markers.fa.nhr" in tidied
    assert "Antibiotic_markers.fa.nin" in tidied
    assert "Antibiotic_markers.fa.nsq" in tidied
    assert "Antibiotic_markers.fa" in tidied
del_me) 16 | assert ret == [1, 2, 4, 5, 6, 7, 8, 9] 17 | 18 | # Empty initial list 19 | test2 = [] 20 | del_me = [1] 21 | with pytest.raises(ValueError): 22 | ret = util.del_from_list(test2, del_me) 23 | 24 | # Not possible 25 | test3 = [1, 2, 3] 26 | del_me = [1, 2, 3, 4] 27 | with pytest.raises(ValueError): 28 | ret = util.del_from_list(test3, del_me) 29 | 30 | # Negative indexes 31 | test4 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 32 | del_me = [-3, -10] 33 | with pytest.raises(ValueError): 34 | ret = util.del_from_list(test4, del_me) 35 | 36 | # Index that does not exist 37 | test5 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 38 | del_me = [11] 39 | with pytest.raises(ValueError): 40 | ret = util.del_from_list(test5, del_me) 41 | 42 | # Index that does not exist x 2 43 | test5 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 44 | del_me = [10] 45 | ret = util.del_from_list(test5, del_me) 46 | assert ret == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 47 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | SeqFindr changelog 2 | ================== 3 | 4 | Please use *git log* in preference to information as it is sporadically 5 | updated and may be incomplete. 6 | 7 | 2014-07-10 Mitchell Stanton-Cook : 8 | * Rewrote setup.py 9 | * Added a release script 10 | * Moved to semantic versioning 11 | * SeqFindR --> SeqFindr (as 'R' looks to associate with R) 12 | * Improved the doumentation 13 | * Added bumpversion and do_release to make releases significantly easier 14 | 15 | 2013-08-16 Mitchell Stanton-Cook : 16 | * General refactors & code cleaning/optimisations 17 | * Split into: blast.py, config.py, imaging.py, SeqFindr.py & util.py 18 | * fixed meta information 19 | * Installation should (python setup.py install) should now be more robust) 20 | * fixed argparse. 
Required options are now required not optional (database 21 | & ass_dir) 22 | * output & output_prefix now do something... 23 | * New option -s/--strip (will strip off the 1st and last N bases from the 24 | mapping consensuses and database to avoid uncalled bases at the start and 25 | end of runs 26 | * Improved output image control (dpi, size, color & seed control) 27 | 28 | 2013-07-16 Nabil-Fareed Alikhan : 29 | * Added Ability to use amino acid sequences as Virluence factors 30 | * Added helper method to automatically detect type of sequence file 31 | (nucl or pro) 32 | * Added commandline override option for above auto-detection ( -R ) 33 | * Replaced a number of system calls and pathnames with cross platform 34 | friendly alternatives 35 | * Added support for fasta file extensions .fna, .fas, .fa; rather than just 36 | .fas (For query sequences) 37 | * Added tBLASTx functionality and option to trigger it ( -X ) 38 | -------------------------------------------------------------------------------- /example/run_examples.sh: -------------------------------------------------------------------------------- 1 | SeqFindr Antibiotic_markers.fa assemblies/ -o run1 -l 2 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run2 -l 3 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run3 -l -r 4 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run4 -l -r --index_file dummy.order 5 | SeqFindr Antibiotic_markers.fa assemblies/ -o run5 -l --careful 0.1 6 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run6 -l -r --index_file dummy.order --invert 7 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run7 -l --index_file dummy.order --invert 8 | SeqFindr Antibiotic_markers.fa assemblies/ -o run8 -l --index_file dummy.order --invert 9 | # New option --remove_empty_cols 10 | SeqFindr Antibiotic_markers.fa assemblies/ -o run9 -l --remove_empty_cols 11 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run10 -l 
--remove_empty_cols 12 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run11 -l -r --remove_empty_cols 13 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run12 -l -r --index_file dummy.order --remove_empty_cols 14 | SeqFindr Antibiotic_markers.fa assemblies/ -o run13 -l --careful 0.1 --remove_empty_cols 15 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run14 -l -r --index_file dummy.order --invert --remove_empty_cols 16 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run15 -l --index_file dummy.order --invert --remove_empty_cols 17 | SeqFindr Antibiotic_markers.fa assemblies/ -o run16 -l --index_file dummy.order --invert --remove_empty_cols 18 | # Existing data 19 | SeqFindr Antibiotic_markers.fa assemblies/ -o run17 -l --existing_data run1 20 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run18 -l -r --existing_data run4 21 | # UPGMA clustering 22 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run19 -l --UPGMA_clustering 23 | -------------------------------------------------------------------------------- /tests/unittests/test_seqfindr.py: -------------------------------------------------------------------------------- 1 | from context import seqfindr 2 | from context import pytest 3 | import numpy as np 4 | 5 | 6 | def test_strip_uninteresting(): 7 | """ 8 | Test the strip_uninteresting function 9 | 10 | Function signature:: 11 | 12 | strip_uninteresting(matrix, query_classes, query_list, cons, invert) 13 | """ 14 | # No cons 15 | matrix = np.array([(0.5, 2, 3), (0.5, 5, 6)]) 16 | nm, newqc, newql = seqfindr.strip_uninteresting(matrix, ['a', 'b', 'c'], 17 | ['a1', 'b1', 'c1'], None, 18 | False) 19 | assert newqc == ['b', 'c'] 20 | assert newql == ['b1', 'c1'] 21 | assert nm.all() == np.array([(2, 3), (5, 6)]).all() 22 | 23 | # Cons 24 | matrix = np.array([(1.0, 1.0, 3), (0.5, 1.0, 6)]) 25 | nm, newqc, newql = seqfindr.strip_uninteresting(matrix, ['a', 'b', 'c'], 26 | ['a1', 
#VERSION=0.35.0

# Release helper: runs the examples against a fresh install, bumps the
# version, builds and test-installs the distribution, rebuilds the docs,
# tags the release, uploads to PyPI and refreshes the mirrors.
#
# Every "will not release" path exits with a non-zero status so that callers
# (and humans reading $?) can tell an aborted release from a finished one.

# Lets run the examples first...
pip uninstall SeqFindr
python setup.py install
# Stop here if the directory is missing: the cleanup below uses relative paths
cd example || exit 1
sh run_examples.sh
cd ..
# run_examples.sh writes its results to run1 .. run19
i=1
while [ "$i" -le 19 ]; do
    rm -rf "example/run$i"
    i=$((i + 1))
done
pip uninstall SeqFindr
python setup.py clean


# Do all the versioning stuff here..
bumpversion minor

# The assignment on the first line is commented out, so VERSION is only set
# when exported by the caller. Otherwise ask setup.py, *after* the bump, so
# the value matches the archive built below.
VERSION=${VERSION:-$(python setup.py --version)}
if [ -z "$VERSION" ]; then
    echo "Could not determine the version. Will not release"
    exit 1
fi


# Clean, test, build the source distribution & pip install it
python setup.py clean
#if ! python setup.py test; then
#    echo "Tests failed. Will not release"
#    exit 1
#fi

python setup.py sdist bdist_wheel
if pip install "dist/SeqFindr-$VERSION.tar.gz"; then
    echo ""
else
    echo "Package is not pip installable. Will not release"
    exit 1
fi


# Docs
# Need to get exit statuses here...
cd docs || exit 1
make clean
sphinx-apidoc -o API ../SeqFindr
mv API/* .
rmdir API
make html
cd ..

git push
# tag & push the tag to github, but only from a clean working tree.
# --porcelain prints nothing when there is nothing to commit, which is
# stable across git versions (unlike the human readable status text).
if [ -z "$(git status --porcelain)" ]; then
    git tag "v$VERSION"
    git push --tags
else
    echo "Git not clean. Will not release"
    exit 1
fi


# Upload to PyPI & clean
twine upload -u mscook -p "$PYPIPASS" dist/* && python setup.py clean

# Update mirrors
cd ../BeatsonLab-MicrobialGenomics/MIRRORS/ || exit 1
sh update.sh
cd ../../SeqFindR/
GTAGCGCAGGCTAATTTACTGCTACTTTTACAAAGCACGAAAACACCATTGACGGCTTCGGCAGAGAACT 9 | >70-pse(4)348, bla-PSE-4, Beta-lactams Antibiotic resistance (ampicillin) xxxx [Beta-lactams] 10 | CGCTGATTGCCATTGTAATCCCAATATTCTCCATTTTGAGTATCAAGAACGGAAACACCTATACGAGCAG 11 | >70-ctx143, bla-CTX-M-1, Beta-lactams Antibiotic resistance (ampicillin) xxxx [Beta-lactams] 12 | ATACAGCGGCACACTTCCTAACAACAGCGTGACGGTTGCCGTCGCCATCAGCGTGAACTGACGCAGTGA 13 | >70-aadA1588, ant(3'')-Ia, Aminoglycosides Antibiotic resistance (streptomycin;spectinomycin) xxxx [Aminoglycosides] 14 | ATGATGTCGTCGTGCACAACAATGGTGACTTCTACAGCGCGGAGAATCTCGCTCTCTCCAGGGGAAGCCG 15 | >70-aadB1778, ant(2'')-Ia, Aminoglycosides Antibiotic resistance (kanamycin;neomycin;gentamicin) xxxx [Aminoglycosides] 16 | CCCGAGTGAGGTGCATGCGAGCCTGTAGGACTCTATGTGCTTTGTAGGCCAGTCCACTGGTGGTACTTCA 17 | >70-aacC(2)200, aac(3)-Iia, Aminoglycosides Antibiotic resistance (gentamicin) xxxx [Aminoglycosides] 18 | CACCGGTTTGGACTCCGAGTTTTCGAATTGCCTCCGTTATTGCCTTCCGCGTATGCATCGCGATATCTCC 19 | >70-aac3(IV)380, aac(3)-IV, Aminoglycosides Antibiotic resistance (gentamicin) xxxx [Aminoglycosides] 20 | TCGATCAGTCCAAGTGGCCCATCTTCGAGGGGCCGGACGCTACGGAAGGAGCTGTGGACCAGCAGCACAC 21 | >70-aphA(1)1310, aph(3')-Ia, Aminoglycosides Antibiotic resistance (kanamycin;neomycin) xxxx [Aminoglycosides] 22 | GGCGCATCGGGCTTCCCATACAATCGATAGATTGTCGCACCTGATTGCCCGACATTATCGCGAGCCCATT 23 | >70-aphA(2)220, aph(3')-Iia, Aminoglycosides Antibiotic resistance (kanamycin;neomycin) xxxx [Aminoglycosides] 24 | AGTCATAGCCGAATAGCCTCTCCACCCAAGCGGCCGGAGAACCTGCGTGCAATCCATCTTGTTCAATCAT 25 | -------------------------------------------------------------------------------- /example/Antibiotic_markers.fa: -------------------------------------------------------------------------------- 1 | >70-tem8674, bla-TEM, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. 
[Beta-lactams] 2 | AAAGTTCTGCTATGTGGCGCGGTATTATCCCGTGTTGACGCCGGGCAAGAGCAACTCGGTCGCCGCATAC 3 | >70-shv86, bla-SHV, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. [Beta-lactams] 4 | CTCAAGCGGCTGCGGGCTGGCGTGTACCGCCAGCGGCAGGGTGGCTAACAGGGAGATAATACACAGGCGA 5 | >70-oxa(1)256, bla-OXA-1, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. [Beta-lactams] 6 | AAACAACCTTCAGTTCCTTCAAATAATGGAGATGCGACAGTAGAGATATCTGTTGATGCACTGGCGCTGC 7 | >70-oxa(7)295, bla-OXA-7, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. [Beta-lactams] 8 | GTAGCGCAGGCTAATTTACTGCTACTTTTACAAAGCACGAAAACACCATTGACGGCTTCGGCAGAGAACT 9 | >70-pse(4)348, bla-PSE-4, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. [Beta-lactams] 10 | CGCTGATTGCCATTGTAATCCCAATATTCTCCATTTTGAGTATCAAGAACGGAAACACCTATACGAGCAG 11 | >70-ctx143, bla-CTX-M-1, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. [Beta-lactams] 12 | ATACAGCGGCACACTTCCTAACAACAGCGTGACGGTTGCCGTCGCCATCAGCGTGAACTGACGCAGTGA 13 | >70-aadA1588, ant(3'')-Ia, Aminoglycosides Antibiotic resistance (streptomycin;spectinomycin), Unknown sp. [Aminoglycosides] 14 | ATGATGTCGTCGTGCACAACAATGGTGACTTCTACAGCGCGGAGAATCTCGCTCTCTCCAGGGGAAGCCG 15 | >70-aadB1778, ant(2'')-Ia, Aminoglycosides Antibiotic resistance (kanamycin;neomycin;gentamicin), Unknown sp. [Aminoglycosides] 16 | CCCGAGTGAGGTGCATGCGAGCCTGTAGGACTCTATGTGCTTTGTAGGCCAGTCCACTGGTGGTACTTCA 17 | >70-aacC(2)200, aac(3)-Iia, Aminoglycosides Antibiotic resistance (gentamicin), Unknown sp. [Aminoglycosides] 18 | CACCGGTTTGGACTCCGAGTTTTCGAATTGCCTCCGTTATTGCCTTCCGCGTATGCATCGCGATATCTCC 19 | >70-aac3(IV)380, aac(3)-IV, Aminoglycosides Antibiotic resistance (gentamicin), Unknown sp. [Aminoglycosides] 20 | TCGATCAGTCCAAGTGGCCCATCTTCGAGGGGCCGGACGCTACGGAAGGAGCTGTGGACCAGCAGCACAC 21 | >70-aphA(1)1310, aph(3')-Ia, Aminoglycosides Antibiotic resistance (kanamycin;neomycin), Unknown sp. 
[Aminoglycosides] 22 | GGCGCATCGGGCTTCCCATACAATCGATAGATTGTCGCACCTGATTGCCCGACATTATCGCGAGCCCATT 23 | >70-aphA(2)220, aph(3')-Iia, Aminoglycosides Antibiotic resistance (kanamycin;neomycin), Unknown sp. [Aminoglycosides] 24 | AGTCATAGCCGAATAGCCTCTCCACCCAAGCGGCCGGAGAACCTGCGTGCAATCCATCTTGTTCAATCAT 25 | -------------------------------------------------------------------------------- /tests/doctests/config.tests: -------------------------------------------------------------------------------- 1 | >>> from SeqFindr import config 2 | >>> 3 | >>> # Test with no (assuming ~/.SeqFindr.cfg does not exist 4 | >>> SeqFindr_cfg = config.SeqFindrConfig() 5 | >>> SeqFindr_cfg.dump_items() 6 | >>> 7 | >>> # What happens if we don't exist 8 | >>> SeqFindr_cfg = config.SeqFindrConfig('/i/do/not/exist/SeqFindr.cfg') 9 | >>> SeqFindr_cfg.dump_items() 10 | >>> 11 | >>> # What about an empty config 12 | >>> SeqFindr_cfg = config.SeqFindrConfig('config/SeqFindr.cfg.1') 13 | >>> SeqFindr_cfg.dump_items() 14 | >>> 15 | >>> # What about a config with category_colors = line empty 16 | >>> SeqFindr_cfg = config.SeqFindrConfig('config/SeqFindr.cfg.2') 17 | >>> SeqFindr_cfg.dump_items() 18 | >>> 19 | >>> # Malformed list 20 | >>> SeqFindr_cfg = config.SeqFindrConfig('config/SeqFindr.cfg.3') 21 | >>> SeqFindr_cfg.dump_items() 22 | >>> 23 | >>> # Not a triplet.. 
24 | >>> SeqFindr_cfg = config.SeqFindrConfig('config/SeqFindr.cfg.4') 25 | >>> SeqFindr_cfg.dump_items() 26 | category_colors = [(0.0, 0.0, 0.0)] 27 | >>> 28 | >>> # Strings not ints 29 | >>> SeqFindr_cfg = config.SeqFindrConfig('config/SeqFindr.cfg.5') 30 | >>> SeqFindr_cfg.dump_items() 31 | >>> 32 | >>> # Non-existant option 33 | >>> SeqFindr_cfg = config.SeqFindrConfig('config/SeqFindr.cfg.6') 34 | >>> SeqFindr_cfg.dump_items() 35 | >>> 36 | >>> # Expectant output 37 | >>> SeqFindr_cfg = config.SeqFindrConfig('config/SeqFindr.cfg') 38 | >>> SeqFindr_cfg.dump_items() 39 | category_colors = [(0.0, 0.0, 0.0), (1.0, 1.0, 1.0)] 40 | >>> 41 | >>> # For completeness test the getters and setters 42 | >>> SeqFindr_cfg = config.SeqFindrConfig('config/SeqFindr.cfg') 43 | >>> print SeqFindr_cfg['category_colors'] 44 | [(0.0, 0.0, 0.0), (1.0, 1.0, 1.0)] 45 | >>> print SeqFindr_cfg['Non existant'] 46 | None 47 | >>> SeqFindr_cfg['category_colors'] = [(0.1, 0.9, 0.7), (0.0, 1.0, 1.0)] 48 | >>> print SeqFindr_cfg['category_colors'] 49 | [(0.1, 0.9, 0.7), (0.0, 1.0, 1.0)] 50 | >>> SeqFindr_cfg['non_existant'] = [(0.1, 0.9, 0.7), (0.0, 1.0, 1.0)] 51 | >>> print SeqFindr_cfg['non_existant'] 52 | None 53 | -------------------------------------------------------------------------------- /tests/doctests/util/sample_DB.fa: -------------------------------------------------------------------------------- 1 | >70-tem8674, bla-TEM, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. [Beta-lactams] 2 | AAAGTTCTGCTATGTGGCGCGGTATTATCCCGTGTTGACGCCGGGCAAGAGCAACTCGGTCGCCGCATAC 3 | >70-shv86, bla-SHV, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. [Beta-lactams] 4 | CTCAAGCGGCTGCGGGCTGGCGTGTACCGCCAGCGGCAGGGTGGCTAACAGGGAGATAATACACAGGCGA 5 | >70-oxa(1)256, bla-OXA-1, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. 
[Beta-lactams] 6 | AAACAACCTTCAGTTCCTTCAAATAATGGAGATGCGACAGTAGAGATATCTGTTGATGCACTGGCGCTGC 7 | >70-oxa(7)295, bla-OXA-7, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. [Beta-lactams] 8 | GTAGCGCAGGCTAATTTACTGCTACTTTTACAAAGCACGAAAACACCATTGACGGCTTCGGCAGAGAACT 9 | >70-pse(4)348, bla-PSE-4, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. [Beta-lactams] 10 | CGCTGATTGCCATTGTAATCCCAATATTCTCCATTTTGAGTATCAAGAACGGAAACACCTATACGAGCAG 11 | >70-ctx143, bla-CTX-M-1, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. [Beta-lactams] 12 | ATACAGCGGCACACTTCCTAACAACAGCGTGACGGTTGCCGTCGCCATCAGCGTGAACTGACGCAGTGA 13 | >70-aadA1588, ant(3'')-Ia, Aminoglycosides Antibiotic resistance (streptomycin;spectinomycin), Unknown sp. [Aminoglycosides] 14 | ATGATGTCGTCGTGCACAACAATGGTGACTTCTACAGCGCGGAGAATCTCGCTCTCTCCAGGGGAAGCCG 15 | >70-aadB1778, ant(2'')-Ia, Aminoglycosides Antibiotic resistance (kanamycin;neomycin;gentamicin), Unknown sp. [Aminoglycosides] 16 | CCCGAGTGAGGTGCATGCGAGCCTGTAGGACTCTATGTGCTTTGTAGGCCAGTCCACTGGTGGTACTTCA 17 | >70-aacC(2)200, aac(3)-Iia, Aminoglycosides Antibiotic resistance (gentamicin), Unknown sp. [Aminoglycosides] 18 | CACCGGTTTGGACTCCGAGTTTTCGAATTGCCTCCGTTATTGCCTTCCGCGTATGCATCGCGATATCTCC 19 | >70-aac3(IV)380, aac(3)-IV, Aminoglycosides Antibiotic resistance (gentamicin), Unknown sp. [Aminoglycosides] 20 | TCGATCAGTCCAAGTGGCCCATCTTCGAGGGGCCGGACGCTACGGAAGGAGCTGTGGACCAGCAGCACAC 21 | >70-aphA(1)1310, aph(3')-Ia, Aminoglycosides Antibiotic resistance (kanamycin;neomycin), Unknown sp. [Aminoglycosides] 22 | GGCGCATCGGGCTTCCCATACAATCGATAGATTGTCGCACCTGATTGCCCGACATTATCGCGAGCCCATT 23 | >70-aphA(2)220, aph(3')-Iia, Aminoglycosides Antibiotic resistance (kanamycin;neomycin), Unknown sp. 
# Copyright 2013-2014 Mitchell Stanton-Cook Licensed under the
# Educational Community License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.osedu.org/licenses/ECL-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.


HALTON = False

import random
try:
    import ghalton
    HALTON = True
except ImportError:
    # Single-argument call form prints the same text on Python 2 and 3
    print("Using Standard")


def hsv_to_rgb(h, s, v):
    """
    Convert HSV to RGB

    The hue is treated as cyclic and wrapped into [0, 1), so h = 1.0 gives
    the same colour as h = 0.0 instead of falling outside the six sectors.

    :param h: hue (fraction of a full turn)
    :param s: saturation (nominally 0-1)
    :param v: value (nominally 0-1)

    :rtype: a list of three ints [r, g, b]; each channel is capped at 255
    """
    h = h % 1.0
    scaled = h * 6
    sector = int(scaled)
    f = scaled - sector
    # Guard against scaled rounding up to exactly 6.0
    sector %= 6
    p = v * (1 - s)
    q = v * (1 - f * s)
    t = v * (1 - (1 - f) * s)
    # One (r, g, b) arrangement per 60 degree sector of the colour wheel
    r, g, b = (
        (v, t, p),
        (q, v, p),
        (p, v, t),
        (p, q, v),
        (t, p, v),
        (v, p, q),
    )[sector]
    # The * 256 scaling maps a full channel (1.0) to 256, hence the cap
    return [min(255, int(channel * 256)) for channel in (r, g, b)]


def generate_colors(number_required, seed):
    """
    Generate a list of length number of distinct "good" random colors

    See: https://github.com/fmder/ghalton

    Based on http://martin.ankerl.com/2009/12/09/
    how-to-create-random-colors-programmatically/

    When ghalton could be imported the colors are the points of a 3
    dimensional generalized Halton sequence, returned exactly as ghalton
    provides them (presumably each coordinate is a float in 0-1 -- confirm
    against the ghalton documentation). Otherwise the hue is advanced by the
    golden ratio conjugate from a seeded random start, with fixed
    saturation (0.5) and value (0.6).

    Note: the fallback seeds the module level random generator.

    :param number_required: the number of colors wanted (passed to int())
    :param seed: the random seed

    :type: int
    :type: int

    :rtype: a list of RGB triplets. In the fallback each triplet is a tuple
            of floats obtained by dividing the 8-bit channels by 255.0
    """
    count = int(number_required)
    if HALTON:
        sequencer = ghalton.GeneralizedHalton(3, seed)
        return list(sequencer.get(count))
    golden_ratio_conjugate = 0.618033988749895
    random.seed(seed)
    h = random.random()
    rgb_list = []
    for _ in range(count):
        h = (h + golden_ratio_conjugate) % 1
        red, green, blue = hsv_to_rgb(h, 0.5, 0.6)
        rgb_list.append((red / 255.0, green / 255.0, blue / 255.0))
    return rgb_list
#!/usr/bin/env python
"""
SeqFindr packaging script

Besides the usual setuptools commands it understands three shortcuts given as
the last command line argument: ``publish`` (refuses, points at twine),
``clean`` (removes build artefacts) and ``docs`` (builds the HTML docs).
"""

import os
import sys
import glob

# Try and import pip. We'll stop if it is not present
try:
    import pip  # noqa: F401 -- imported only to check that it is installed
except ImportError:
    print("Installation of SeqFindr requires pip. Please install it! See -")
    print("http://pip.readthedocs.org/en/latest/installing.html")
    sys.exit(1)

from setuptools import setup

__title__ = 'SeqFindr'
__version__ = '0.35.0'
__description__ = "A tool to easily create informative genomic feature plots"
__author__ = 'Mitchell Stanton-Cook'
__license__ = 'ECL 2.0'
__author_email__ = "m.stantoncook@gmail.com"
__url__ = 'http://github.com/mscook/SeqFindr'


# Helper functions
if sys.argv[-1] == 'publish':
    print("Please use twine or do_release.sh")
    sys.exit()

if sys.argv[-1] == 'clean':
    os.system('rm -rf SeqFindr.egg-info build dist')
    sys.exit()

if sys.argv[-1] == 'docs':
    os.system('cd docs && make html')
    sys.exit()


packages = [__title__, ]

# One requirement per line. Blank lines and comment lines carry no
# requirement and would otherwise end up as empty entries.
with open('requirements.txt') as fin:
    requires = [line.strip() for line in fin
                if line.strip() and not line.strip().startswith('#')]

# Build lists to package the docs
html, sources, static = [], [], []
for f in glob.glob('docs/_build/html/*'):
    if os.path.isfile(f):
        html.append(f)
for f in glob.glob('docs/_build/html/*/*'):
    if not os.path.isfile(f):
        continue
    if "_static" in f:
        static.append(f)
    # str.find returns -1 (truthy) when the text is absent, so test
    # membership instead of the raw find() result
    elif "_sources" in f:
        sources.append(f)

with open('README.rst') as fin:
    long_description = fin.read()

setup(
    name=__title__,
    version=__version__,
    description=__description__,
    long_description=long_description,
    author=__author__,
    author_email=__author_email__,
    url=__url__,
    packages=packages,
    test_suite="tests",
    package_dir={__title__: __title__},
    scripts=[__title__+'/'+__title__, __title__+'/vfdb_to_seqfindr'],
    package_data={},
    data_files=[('', ['LICENSE', 'requirements.txt', 'README.rst']),
                ('docs', html), ('docs/_static', static),
                ('docs/_sources', sources)],
    include_package_data=True,
    install_requires=requires,
    license=__license__,
    zip_safe=False,
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Environment :: Console',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved',
        'Natural Language :: English',
        'Operating System :: POSIX :: Linux',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 2 :: Only',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
        'Topic :: Scientific/Engineering :: Visualization',
    ],
)
14 | 15 | Execute the installation using:: 16 | 17 | $ curl -sL https://raw.githubusercontent.com/brainsik/virtualenv-burrito/master/virtualenv-burrito.sh | $SHELL 18 | 19 | 20 | **2)** Make a virtualenv called SeqFindr:: 21 | 22 | $ mkvirtualenv SeqFindr 23 | 24 | 25 | **3)** Install autoenv_:: 26 | 27 | $ git clone git://github.com/kennethreitz/autoenv.git ~/.autoenv 28 | $ echo 'source ~/.autoenv/activate.sh' >> ~/.bashrc 29 | 30 | 31 | Get the current code from GitHub 32 | -------------------------------- 33 | 34 | Something like this:: 35 | 36 | $ cd $PATH_WHERE_I_KEEP_MY_REPOS 37 | $ git clone https://github.com/mscook/SeqFindr.git 38 | 39 | 40 | Install dependencies 41 | -------------------- 42 | 43 | Something like this:: 44 | 45 | $ cd SeqFindr 46 | $ # Assuming you installed autoenv - 47 | $ # You'll want to say 'y' as this will activate the virtualenv each time you enter the code directory 48 | $ # Otherwise - 49 | $ # workon SeqFindr 50 | $ pip install -r requirements.txt 51 | $ pip install -r requirements-dev.txt 52 | 53 | 54 | Familiarise yourself with the code 55 | ---------------------------------- 56 | 57 | TODO. 58 | 59 | 60 | Development workflow 61 | -------------------- 62 | 63 | Use GitHub. You will have already cloned the SeqFindr repo (if you followed 64 | instructions above). To make things easier, please fork 65 | (https://github.com/mscook/SeqFindr/fork) and update your local copy to point to 66 | your fork. 67 | 68 | Something like this:: 69 | 70 | $ # Assuming your fork is like this 71 | $ # https://github.com/$YOUR_USERNAME/SeqFindr/ 72 | $ vi .git/config 73 | $ # Replace: 74 | $ # url = git@github.com:mscook/SeqFindr.git 75 | $ # with: 76 | $ # url = git@github.com:$YOUR_USERNAME/SeqFindr.git 77 | 78 | With this setup you will be able to push development changes to your fork and 79 | submit Pull Requests to the core SeqFindr repo when you're happy. 
80 | 81 | **Important Note:** Upstream changes will not be synced to your fork by 82 | default. Before submitting a pull request, please sync your fork with 83 | any upstream changes (specifically handle any merge conflicts). Info on 84 | syncing a fork can be found here_. 85 | 86 | 87 | Code style/testing/Continuous Integration 88 | ------------------------------------------ 89 | 90 | We try to make joining and/or modifying the SeqFindr project simple. 91 | 92 | General: 93 | * As close to PEP8 as possible but I ain't no Saint. Just as long as it's 94 | clean and readable, 95 | * Using standard lib UnitTest. There are convenience functions 96 | check_coverage.sh & tests/run_tests.sh respectively. We would prefer 97 | SMART test vs 100 % coverage. 98 | 99 | In the master GitHub repository we use hooks that call: 100 | * landscape.io (code QC) 101 | * Travis CI (continuous integration) 102 | * ReadTheDocs (documentation building) 103 | 104 | .. _virtualenv-burrito: https://github.com/brainsik/virtualenv-burrito 105 | .. _autoenv: https://github.com/kennethreitz/autoenv 106 | .. _here: https://help.github.com/articles/syncing-a-fork 107 | .. _doctest: http://pythontesting.net/framework/doctest/doctest-introduction/ 108 | 109 | .. _`this document by Jeff Forcier`: http://www.contribution-guide.org 110 | .. _`this talk from Carl Meyer`: http://pyvideo.org/video/2637/set-your-code-free-releasing-and-maintaining-an 111 | 112 | -------------------------------------------------------------------------------- /SeqFindr/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2014 Mitchell Stanton-Cook Licensed under the 2 | # Educational Community License, Version 2.0 (the "License"); you may 3 | # not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.osedu.org/licenses/ECL-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.

"""
SeqFindr configuration class: 100% test coverage, > 9 PyLint score
"""

import os
import sys
import ast


class SeqFindrConfig(object):
    """
    A SeqFindr configuration class - subtle manipulation to plots

    Wraps the dictionary produced by read_config() behind item access.

    :param alt_location: path of the configuration file, or None to use
                         ~/.SeqFindr.cfg
    """

    def __init__(self, alt_location=None):
        self.config = read_config(alt_location)

    def __getitem__(self, key):
        """
        Return the value stored for key, or None when key is not set
        """
        return self.config.get(key)

    def __setitem__(self, key, item):
        """
        Store item under key. Keys other than the supported options are
        silently ignored
        """
        # Should probably validate this ie. [(r1,g1,b1),...,(rn,gn,bn)]
        accepted = ('category_colors',)
        if key in accepted:
            self.config[key] = item

    def dump_items(self):
        """
        Prints all set configuration options to STDOUT
        """
        for key, value in self.config.items():
            # Format first, then print: valid on both Python 2 and 3
            print("%s = %s" % (key, value))


def _parse_category_colors(raw):
    """
    Convert the text right of ``category_colors =`` to 0-1 scaled RGB tuples

    :param raw: the stripped option value, e.g. "[(0,0,0),(255,255,255)]"

    :rtype: a list of (r, g, b) float tuples, or None when the value is
            empty or is not a literal list/tuple (a message is written to
            STDERR). Conversion stops at the first malformed element; the
            elements before it are kept
    """
    if raw == '':
        sys.stderr.write("\tNo options could be parsed. "
                         "Using defaults\n")
        return None
    try:
        rgb_list = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        rgb_list = None
    if not isinstance(rgb_list, (list, tuple)):
        sys.stderr.write("\tMalformed settings line: "
                         "%s\n" % (str(raw)))
        return None
    colors = []
    for element in rgb_list:
        try:
            fixed = (element[0]/255.0,
                     element[1]/255.0,
                     element[2]/255.0)
        # IndexError: fewer than three channels. TypeError: the element is
        # not subscriptable or a channel is not a number. KeyError: the
        # element is a mapping
        except (IndexError, TypeError, KeyError):
            sys.stderr.write("\tMalformed RGB: %s. "
                             "Skipping\n"
                             % (str(element)))
            break
        colors.append(fixed)
    return colors


def read_config(alt_location):
    """
    Read a SeqFindr configuration file

    Currently only supports category colors in RGB format

    category_colors = [(0,0,0),(255,255,255),....,(r,g,b)]

    Only the first category_colors line is used. Problems (missing file,
    empty file, unsupported or malformed lines) are reported on STDERR and
    never raised.

    :param alt_location: path of the configuration file, or None to use
                         ~/.SeqFindr.cfg

    :rtype: a dictionary, empty when nothing usable was found, otherwise
            {'category_colors': [(r, g, b), ...]} with channels scaled to 0-1
    """
    if alt_location is None:
        cfg_location = os.path.join(os.path.expanduser('~'), '.SeqFindr.cfg')
    else:
        cfg_location = os.path.expanduser(alt_location)
    cfg = {}
    try:
        with open(cfg_location) as fin:
            sys.stderr.write("Using a SeqFindr config file: %s\n" %
                             (cfg_location))
            line_count = 0
            for line in fin:
                line_count = line_count+1
                # partition never raises, whatever the number of '=' signs
                option, _, value = line.partition('=')
                option = option.strip()
                if option != 'category_colors':
                    sys.stderr.write("\tNot supported option: %s" % (line))
                    continue
                colors = _parse_category_colors(value.strip())
                if colors is not None:
                    cfg[option] = colors
                break
            if line_count == 0:
                sys.stderr.write("\tEmpty configuration file\n")
    except IOError:
        sys.stderr.write("No SeqFindr config file found at: %s. "
                         "Using defaults\n" % (cfg_location))
    return cfg
With default tol (0.95) & 47 | careful = 0.2, we will manually inspect all hits in 48 | 0.95-0.75 range 49 | 50 | Optional input/output options: 51 | Options relating to input and output 52 | 53 | -o OUTPUT, --output OUTPUT 54 | Output the results to this location 55 | -p OUTPUT_PREFIX, --output_prefix OUTPUT_PREFIX 56 | Give all result files this prefix 57 | --EXISTING_MATRIX Use existing SeqFindr matrix (reformat the plot) 58 | [default = False] 59 | 60 | Figure options: 61 | Options relating to the output figure 62 | 63 | -l, --label_genes Label the x axis with the query identifier [default = 64 | False] 65 | -g, --grid Figure has grid lines [default = True] 66 | --color COLOR The color index [default = None]. See manual for more 67 | info 68 | --DPI DPI DPI of figure [default = 300] 69 | --seed SEED Color generation seed 70 | --svg Draws figure in svg 71 | --size SIZE Size of figure [default = 10x12 (inches)] 72 | 73 | BLAST options: 74 | Options relating to BLAST 75 | 76 | -R {nucl,prot}, --reftype {nucl,prot} 77 | Reference Sequence type. If not given will try to 78 | detect it 79 | -X, --tblastx Run tBLASTx rather than BLASTn 80 | --evalue EVALUE BLAST evalue (Expect) 81 | --short Have short queries i.e. 
# Makefile for Sphinx documentation
#
# Every target below is a thin wrapper around `sphinx-build -b <builder>`.
# Output always goes to $(BUILDDIR)/<builder>.

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
PAPER         =
BUILDDIR      = _build

# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif

# Internal variables.
PAPEROPT_a4     = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

# Every target defined in this file is listed here so that a file or
# directory with the same name (e.g. "info", "xml") never shadows it.
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp \
	devhelp epub latex latexpdf latexpdfja text man texinfo info gettext \
	changes linkcheck doctest xml pseudoxml

help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  texinfo    to make Texinfo files"
	@echo "  info       to make Texinfo files and run them through makeinfo"
	@echo "  gettext    to make PO message catalogs"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  xml        to make Docutils-native XML files"
	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"

clean:
	rm -rf $(BUILDDIR)/*

html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/SeqFindr.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/SeqFindr.qhc"

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/SeqFindr"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/SeqFindr"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	# $(MAKE) (not a bare "make") so -j/-n and command line variables reach
	# the sub-make, exactly as the latexpdf targets already do.
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."

xml:
	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."

pseudoxml:
	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
15 | 16 | 17 | """ 18 | vfdb_to_seqfindr 19 | ================ 20 | 21 | Convert VFDB formatted files (or like) to SeqFindr formatted database files 22 | 23 | 24 | VFDB: Virulence Factors Database 25 | www.mgc.ac.cn/VFs/ 26 | a reference database for bacterial virulence factors. 27 | 28 | 29 | This is based on a sample file (TOTAL_Strep_VFs.fas) provided by 30 | Nouri Ben Zakour. 31 | 32 | Examples:: 33 | 34 | # Default (will set VFDB classification identifiers as the classification) 35 | $ vfdb_to_seqfindr -i TOTAL_Strep_VFs.fas -o TOTAL_Strep_VFs.sqf 36 | 37 | # Sets any classification to blank ([ ]) 38 | $ vfdb_to_seqfindr -i TOTAL_Strep_VFs.fas -o TOTAL_Strep_VFs.sqf -b 39 | 40 | # Reads a user defined classification: 1 per line, in the same order 41 | # as the input sequences 42 | $ vfdb_to_seqfindr -i TOTAL_Strep_VFs.fas 43 | -o TOTAL_Strep_VFs.sqf -c blah.dat 44 | 45 | 46 | About option --class_file 47 | ------------------------- 48 | 49 | Suppose you want to annotate a VF class with user defined values. Simply 50 | develop a file containing the scheme (1-1 matching). 
def main(args):
    """
    Convert a VFDB style FASTA file into a SeqFindr formatted database

    Each header line of ``args.infile`` is rewritten to::

        identifier, common name, annotation, species [ classification ]

    and each sequence line is written upper-cased. The classification is
    taken (in order of precedence) from ``args.class_file`` (one line per
    record, same order as the input), left blank when ``args.blank_class``
    is set, or otherwise read from a trailing ``(class)`` token of the
    header. Unless a blank classification was requested the output is
    finally regrouped with :func:`order_by_class`.

    :param args: the arguments given from argparse. Reads ``infile``,
                 ``outfile``, ``class_file`` & ``blank_class``.
                 ``blank_class`` is reset to False when a class file is given

    :raises IndexError: if the class file has fewer lines than the input has
                        records (or a header lacks a name or ``[species]``)
    """
    class_lines = None
    if args.class_file is not None:
        with open(os.path.expanduser(args.class_file)) as class_in:
            class_lines = class_in.readlines()
        # A user supplied classification always wins over --blank_class
        args.blank_class = False
    count = 0
    with open(os.path.expanduser(args.outfile), "w") as fout:
        with open(os.path.expanduser(args.infile)) as fin:
            for line in fin:
                if not line.startswith('>'):
                    fout.write(line.strip().upper()+'\n')
                    continue
                elements = line.split(' ')
                identifier = elements[0].strip()
                common_name = elements[1].strip()
                # The annotation is everything between the first '-' and
                # the '[species]' field. Commas would break the comma
                # delimited SeqFindr header so they become ';'
                ann = '-'.join(line.split('-')[1:]).split('[')[0]
                ann = ann.replace(',', ';').strip()
                spec = line.split('[')[1].split(']')[0].strip()
                # Start from a blank classification for EVERY record so a
                # record without one never inherits its predecessor's
                classi = '[ ]'
                if class_lines is not None:
                    if count >= len(class_lines):
                        raise IndexError(
                            "Class file %s has fewer lines than %s has "
                            "sequences" % (args.class_file, args.infile))
                    classi = '[ %s ]' % (class_lines[count].strip())
                elif not args.blank_class:
                    last = elements[-1]
                    if last.find('(') != -1:
                        classi = last.strip().replace('(', '[')
                        classi = classi.replace(')', ']')
                count += 1
                fout.write('%s, %s, %s, %s %s\n' % (identifier, common_name,
                                                    ann, spec, classi))
    print('Wrote %s records' % count)
    if not args.blank_class:
        order_by_class(args)


def order_by_class(args):
    """
    Ensure that all records of a particular class are in the same block

    Reads ``args.outfile``, groups the records by the classification found
    between the last ``[`` and the following ``]`` of each description and
    rewrites ``args.outfile`` class by class. Classes keep the order in
    which they first appear; records keep their order within a class.

    As a side effect one ``<outfile base>_<class><ext>`` file is written
    (and left on disk) per class, with spaces in the class name replaced by
    underscores.

    :param args: the arguments given from argparse (only ``outfile`` is
                 read)
    """
    # main() expands '~' before writing, so the same path must be read here
    outfile = os.path.expanduser(args.outfile)
    by_class = {}
    class_order = []
    with open(outfile) as fin:
        for record in SeqIO.parse(fin, "fasta"):
            cur_class = record.description.split('[')[-1]
            cur_class = cur_class.split(']')[0].strip()
            if cur_class not in by_class:
                by_class[cur_class] = []
                class_order.append(cur_class)
            by_class[cur_class].append(record)
    base, ext = os.path.splitext(outfile)
    tmp_path = base+".tmp"
    # The per class files and the concatenated file are written in a single
    # pass. (Re-reading the sub files with fileinput would silently fall
    # back to sys.argv/stdin when there is not a single record.)
    with open(tmp_path, 'w') as merged:
        for cur_class in class_order:
            sub_path = base+"_"+cur_class.replace(' ', '_')+ext
            with open(sub_path, 'w') as sub_out:
                for record in by_class[cur_class]:
                    entry = '>'+record.description+'\n'
                    entry += str(record.seq)+'\n'
                    sub_out.write(entry)
                    merged.write(entry)
    shutil.move(tmp_path, outfile)


if __name__ == '__main__':
    try:
        start_time = time.time()

        # The 2nd paragraph of the module docstring is the one line summary
        desc = __doc__.split('\n\n')[1].strip()
        parser = argparse.ArgumentParser(description=desc, epilog=epi)
        parser.add_argument('-i', '--infile', action='store',
                            help='[Required] fullpath to the in fasta file')
        parser.add_argument('-o', '--outfile', action='store',
                            help='[Required] fullpath to the out fasta file')
        parser.add_argument('-c', '--class_file', action='store', default=None,
                            help='[Optional] full path to a file containing '
                                 'factor classifications')
        parser.add_argument('-b', '--blank_class', action='store_true',
                            default=False, help='[Optional] set '
                            'classification blank even if such exist')
        # args.verbose is read below. Without this option every run died
        # with an AttributeError reported as an "unexpected exception"
        parser.add_argument('-v', '--verbose', action='store_true',
                            default=False, help='[Optional] report start, '
                            'end and execution time')
        parser.set_defaults(func=main)
        args = parser.parse_args()
        msg = "Missing required arguments.\nPlease run: vfdb_to_seqfindr -h"
        if args.infile is None or args.outfile is None:
            print(msg)
            sys.exit(1)
        if args.verbose:
            print("Executing @ " + time.asctime())
        args.func(args)
        if args.verbose:
            print("Ended @ " + time.asctime())
            print('Exec time minutes %f:' % ((time.time() - start_time) / 60.0))
        sys.exit(0)
    except Exception as e:
        # KeyboardInterrupt (Ctrl-C) & SystemExit (sys.exit()) do not derive
        # from Exception, so they propagate without explicit re-raising
        print('ERROR, UNEXPECTED EXCEPTION')
        print(str(e))
        traceback.print_exc()
        sys.exit(1)
def make_BLAST_database(fasta_file):
    """
    Given a fasta_file, generate a nucleotide BLAST database

    Runs ``makeblastdb`` on the file, then moves the resulting ``.nhr``,
    ``.nin`` & ``.nsq`` files and copies the fasta file itself into the
    ``DBs`` directory of the current working directory.

    :param fasta_file: full path to a fasta file
    :type fasta_file: string

    :raises IOError: if makeblastdb exits with a non-zero status

    :rtype: the strain id **(must be delimited by '_')**
    """
    proc = subprocess.Popen(["makeblastdb", "-in", fasta_file, "-dbtype",
                             'nucl'], stdout=subprocess.PIPE,
                            universal_newlines=True)
    # communicate() drains stdout AND waits, so returncode is reliable
    output, _ = proc.communicate()
    sys.stderr.write(output)
    if proc.returncode != 0:
        raise IOError("makeblastdb failed (exit status %s) for %s"
                      % (proc.returncode, fasta_file))
    for file_ext in ['.nhr', '.nin', '.nsq']:
        path = fasta_file + file_ext
        shutil.move(path, os.path.join('DBs', os.path.basename(path)))
    sys.stderr.write(("Getting %s and assocaiated database files to the DBs "
                      "location\n") % (fasta_file))
    shutil.copy2(fasta_file, os.path.join('DBs', os.path.basename(fasta_file)))
    return os.path.basename(fasta_file).split('_')[0]


def run_BLAST(query, database, args, cons_run):
    """
    Given a mfa of query sequences of interest & a database, search for them.

    Important to note:
        * Turns dust (blastn) or seg (tblastn/tblastx) filtering off,
        * Only a single target sequence (top hit),
        * Output in XML format under BLAST_results/.

    tblastn is used for protein queries (``args.reftype == 'prot'``, or
    detected with util.is_protein when ``args.reftype`` is None), tblastx
    when ``args.tblastx`` is set and blastn otherwise. With ``args.short``
    blastn runs with word_size=7 and a fixed evalue of 1000.

    # TODO: add task='blastn' to use blastn scoring ?

    .. warning:: default is megablast

    .. warning:: tblastx funcationality has not been checked

    :param query: the fullpath to the vf.mfa
    :param database: the full path of the databse to search for the vf in
    :param args: the arguments parsed to argparse (reads reftype, tblastx,
                 short, evalue & BLAST_THREADS)
    :param cons_run: part of a mapping consensus run

    :type query: string
    :type database: string
    :type args: argparse args
    :type cons_run: boolean

    :returns: the path of the blast.xml file
    """
    query_id = os.path.splitext(os.path.basename(query))[0]
    db_id = os.path.splitext(os.path.basename(database))[0]
    prefix = "cons_DB=" if cons_run else "DB="
    outfile = os.path.join("BLAST_results/",
                           prefix+query_id+"ID="+db_id+"_blast.xml")
    protein = False
    # File type not specified, determine using util.is_protein()
    if args.reftype is None:
        if SeqFindr.util.is_protein(query) != -1:
            protein = True
            sys.stderr.write('%s is protein\n' % (query))
    elif args.reftype == 'prot':
        protein = True
        sys.stderr.write('%s is protein\n' % (query))
    # Options shared by every BLAST flavour
    common = dict(query=query, db=database, outfmt=5,
                  num_threads=args.BLAST_THREADS, max_target_seqs=1,
                  out=outfile)
    if protein:
        sys.stderr.write('Using tblastn\n')
        run_command = NcbitblastnCommandline(seg='no', evalue=args.evalue,
                                             **common)
    elif args.tblastx:
        sys.stderr.write('Using tblastx\n')
        run_command = NcbitblastxCommandline(seg='no', evalue=args.evalue,
                                             **common)
    else:
        sys.stderr.write('Using blastn\n')
        if not args.short:
            run_command = NcbiblastnCommandline(dust='no', evalue=args.evalue,
                                                **common)
        else:
            sys.stderr.write('Optimising for short query sequences\n')
            run_command = NcbiblastnCommandline(dust='no', word_size=7,
                                                evalue=1000, **common)

    sys.stderr.write(str(run_command)+"\n")
    run_command()
    return os.path.join(os.getcwd(), outfile)


def parse_BLAST(blast_results, tol, cov, careful):
    """
    Using NCBIXML parse the BLAST results, storing & returning good hits

    For every HSP the score is identities/alignment length. A HSP is a hit
    when the score is >= tol and the alignment covers at least
    query length * cov. A HSP that only satisfies score >= tol-careful (with
    the same coverage) is shown to the user, who decides interactively.

    Exits with status 1 if blast_results is not an existing file.

    :param blast_results: full path to a blast run output file (in XML format)
    :param tol: the identity cutoff threshold
    :param cov: alignement coverage cut-off (fraction of the query length)
    :param careful: how far below tol a hit may score & still be offered
                    for manual confirmation

    :type blast_results: string
    :type tol: float
    :type cov: float
    :type careful: float

    :rtype: list of satifying hit names (one entry per accepted HSP)
    """
    # Expand once so that the existence check & the open() agree
    blast_results = os.path.expanduser(blast_results)
    if not os.path.isfile(blast_results):
        sys.stderr.write("BLAST results do not exist. Exiting.\n")
        sys.exit(1)
    try:
        ask = raw_input
    except NameError:
        # Python 3 renamed raw_input to input
        ask = input
    hits = []
    with open(blast_results) as handle:
        for record in NCBIXML.parse(handle):
            for align in record.alignments:
                for hsp in align.hsps:
                    hit_name = record.query.split(',')[1].strip()
                    # cutoff is calculated with reference to the alignment
                    # length
                    cutoff = hsp.identities/float(hsp.align_length)
                    # The alignment must span at least cov * the query length
                    if (record.query_length * cov) > hsp.align_length:
                        continue
                    if cutoff >= tol:
                        hits.append(hit_name)
                    elif cutoff >= tol-careful:
                        # The --careful option: borderline, ask the user
                        print("Please confirm this hit:")
                        print("Name,SeqFindr score,Len(align),Len(query),"
                              "Identities,Gaps")
                        print("%s,%f,%i,%i,%i,%i" % (
                            hit_name, cutoff, hsp.align_length,
                            record.query_length, hsp.identities, hsp.gaps))
                        accept = ask("Should this be considered a hit? (y/N)")
                        if accept.lower() == 'y':
                            hits.append(hit_name)
                        elif accept.lower() not in ('', 'n'):
                            print("Input must be y, n or enter.")
                            print("Assuming n")
    return hits
def _abs_path(path):
    """Return path with '~' expanded & made absolute"""
    return os.path.abspath(os.path.expanduser(path))


def _mkdir_unless_exists(name, exists_msg):
    """Create directory name; write exists_msg to stderr if it exists"""
    try:
        os.mkdir(name)
    except OSError:
        # Only "already there" is harmless. Anything else (permissions,
        # a plain file in the way, ...) must not be reported as "exists"
        if not os.path.isdir(name):
            raise
        sys.stderr.write(exists_msg)


def ensure_paths_for_args(args):
    """
    Ensure all arguments with paths are absolute & have simplification removed

    Just apply os.path.abspath & os.path.expanduser

    seqs_of_interest & assembly_dir are always converted; output, cons,
    index_file & existing_data only when they are not None.

    :param args: the arguments given from argparse

    :returns: an updated args (the same object, modified in place)
    """
    args.seqs_of_interest = _abs_path(args.seqs_of_interest)
    args.assembly_dir = _abs_path(args.assembly_dir)
    for name in ('output', 'cons', 'index_file', 'existing_data'):
        value = getattr(args, name)
        if value is not None:
            setattr(args, name, _abs_path(value))
    return args


def init_output_dirs(output_dir):
    """
    Create the output base (if needed) and change dir to it

    The DBs & BLAST_results directories are then created in the (new)
    current working directory, unless they already exist.

    :param output_dir: the output base directory, or None to stay in the
                       current working directory

    :raises OSError: if a directory is missing & can not be created

    :returns: the working directory at the time of the call
    """
    current_dir = os.getcwd()
    if output_dir is not None:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        else:
            sys.stderr.write("Output directory exists\n")
        os.chdir(output_dir)
    _mkdir_unless_exists("DBs", "A DBs directory exists. Overwriting\n")
    _mkdir_unless_exists("BLAST_results",
                         "A BLAST_results directory exists.\n")
    return current_dir


def get_fasta_files(data_path):
    """
    Returns all files ending with .fas/.fna/.fa/.fasta in a directory

    :param data_path: the full path to the directory of interest

    :returns: a list of fasta files (valid extensions: .fas, .fna, .fa,
              .fasta) in os.listdir order, each joined to data_path
    """
    extensions = (".fas", ".fna", ".fa", ".fasta")
    return [os.path.join(data_path, name) for name in os.listdir(data_path)
            if name.endswith(extensions)]


def order_inputs(order_index_file, dir_listing):
    """
    Given an order index file, maintain this order in the matrix plot

    **This implies no clustering.** Typically used when you already have
    a phlogenetic tree.

    An entry matches a file when it equals the file's basename up to the
    first '_' (or up to the first '.' if the basename has no '_'). Blank
    lines in the order file are ignored. Exits with status 1 if the number
    of entries differs from the number of files or an entry has no match.

    :param order_index_file: full path to a ordered file (1 entry per line)
    :param dir_listing: a listing from util.get_fasta_files

    :type order_index_file: string
    :type dir_listing: list

    :rtype: list of updated glob.glob dir listing to match order specified
    """
    with open(order_index_file) as fin:
        # A trailing blank line is not an entry
        wanted = [line.strip() for line in fin if line.strip()]
    if len(wanted) != len(dir_listing):
        print("%s %s" % (len(wanted), len(dir_listing)))
        sys.stderr.write("In order_inputs(). Length mismatch\n")
        sys.exit(1)
    ordered = []
    for cord in wanted:
        for d in dir_listing:
            tmp = os.path.basename(d.strip())
            if tmp.find('_') == -1:
                cur = tmp.split('.')[0]
            else:
                cur = tmp.split("_")[0]
            if cur == cord:
                ordered.append(d)
                break
    if len(ordered) != len(dir_listing):
        print(len(ordered))
        print(len(dir_listing))
        sys.stderr.write("In order_inputs(). Not 1-1 matching. Typo?\n")
        sys.stderr.write("In ordered: "+str(ordered)+"\n")
        sys.stderr.write("In dir listing:" + str(dir_listing)+"\n")
        sys.exit(1)
    return ordered


def is_protein(fasta_file):
    """
    Checks if a FASTA file is protein or nucleotide.

    A sequence counts as protein when it contains at least one character
    other than A, T, C, G or N (either case), anywhere in the sequence.

    Will return -1 if no protein detected

    TODO: Abiguity characters?
    TODO: exception if mix of protein/nucleotide?

    :param fasta_file: path to input FASTA file

    :type fasta_file: string

    :returns: number of protein sequences in fasta_file (int), -1 if none
    """
    protein_hits = 0
    with open(fasta_file) as fin:
        for record in SeqIO.parse(fin, 'fasta'):
            # search, not match: A, C, G, T & N are amino acid codes too, so
            # a protein may well START with "nucleotide" letters
            if re.search('[^ATCGNatcgn]', str(record.seq)) is not None:
                protein_hits += 1
    if protein_hits == 0:
        return -1
    return protein_hits


def check_database(database_file):
    """
    Check the database conforms to the SeqFindr format

    Every header must consist of 4 comma separated fields of which the last
    holds exactly one '[' & one ']', and all headers sharing a
    classification must be contiguous. Exits with status 1 if the
    classifications are not grouped.

    .. note:: this is not particulalry extensive

    :args database_file: full path to a database file as a string

    :type database_file: string

    :raises Exception: on a badly formatted header or if the file has no
                       headers at all
    """
    stored_categories = []
    with open(database_file) as db_in:
        for line in db_in:
            if not line.startswith('>'):
                continue
            fields = line.split(',')
            last = fields[-1]
            if (len(fields) != 4 or last.count(']') != 1 or
                    last.count('[') != 1):
                raise Exception("Database is not formatted correctly at "
                                "this line: " + line)
            stored_categories.append(last.split('[')[-1].split(']')[0].strip())
    if not stored_categories:
        raise Exception("Database contains no fasta headers")
    # Check that the categories maintain the correct order: when grouped,
    # the number of runs of equal neighbours equals the number of distinct
    # categories. There will always be 1 run
    detected_cats = 1
    for prev, cur in zip(stored_categories, stored_categories[1:]):
        if cur != prev:
            detected_cats += 1
    if len(set(stored_categories)) != detected_cats:
        print("Please ensure that your classifications ([ element ]) are "
              "grouped")
        sys.exit(1)
    print("SeqFindr database checks [PASSED]")


def del_from_list(target, index_positions):
    """
    Deletes the elements in a list given by a index_positions list

    The positions refer to the list as passed in. They may be given in any
    order & a repeated position is removed only once.

    :param target: a target list to have items removed
    :param index_positions: a list of index positions to be removed from
                            the target list

    :type target: list
    :type index_positions: list

    :raises ValueError: if target is empty, there are more positions than
                        elements, or a position is negative or out of range

    :returns: a list with the elements removed defined by the index_positions
              list (target itself, modified in place)
    """
    if target == []:
        raise ValueError("target list must not be empty")
    if len(index_positions) > len(target):
        raise ValueError("target list contains less elements then "
                         "to be removed")
    if not all(x >= 0 for x in index_positions):
        raise ValueError("index_positions need to be positive")
    for e in index_positions:
        if e >= len(target):
            raise ValueError("index_positions > len target list")
    # Deleting from the back keeps the remaining positions valid
    for index in sorted(set(index_positions), reverse=True):
        del target[index]
    return target
8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | #sys.path.insert(0, os.path.abspath('.')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.autodoc', 33 | 'sphinx.ext.doctest', 34 | 'sphinx.ext.intersphinx', 35 | 'sphinx.ext.todo', 36 | 'sphinx.ext.coverage', 37 | 'sphinx.ext.mathjax', 38 | ] 39 | 40 | # Add any paths that contain templates here, relative to this directory. 41 | templates_path = ['_templates'] 42 | 43 | # The suffix of source filenames. 44 | source_suffix = '.rst' 45 | 46 | # The encoding of source files. 47 | #source_encoding = 'utf-8-sig' 48 | 49 | # The master toctree document. 50 | master_doc = 'index' 51 | 52 | # General information about the project. 53 | project = u'SeqFindr' 54 | copyright = u'2013-2014, Mitchell Stanton-Cook, Nabil Alikhan & Hamza Khan' 55 | 56 | # The version info for the project you're documenting, acts as replacement for 57 | # |version| and |release|, also used in various other places throughout the 58 | # built documents. 59 | # 60 | # The short X.Y version. 61 | version = '0.31' 62 | # The full version, including alpha/beta/rc tags. 
63 | release = '0.35.0' 64 | 65 | # The language for content autogenerated by Sphinx. Refer to documentation 66 | # for a list of supported languages. 67 | #language = None 68 | 69 | # There are two options for replacing |today|: either, you set today to some 70 | # non-false value, then it is used: 71 | #today = '' 72 | # Else, today_fmt is used as the format for a strftime call. 73 | #today_fmt = '%B %d, %Y' 74 | 75 | # List of patterns, relative to source directory, that match files and 76 | # directories to ignore when looking for source files. 77 | exclude_patterns = ['_build'] 78 | 79 | # The reST default role (used for this markup: `text`) to use for all 80 | # documents. 81 | #default_role = None 82 | 83 | # If true, '()' will be appended to :func: etc. cross-reference text. 84 | #add_function_parentheses = True 85 | 86 | # If true, the current module name will be prepended to all description 87 | # unit titles (such as .. function::). 88 | #add_module_names = True 89 | 90 | # If true, sectionauthor and moduleauthor directives will be shown in the 91 | # output. They are ignored by default. 92 | #show_authors = False 93 | 94 | # The name of the Pygments (syntax highlighting) style to use. 95 | pygments_style = 'sphinx' 96 | 97 | # A list of ignored prefixes for module index sorting. 98 | #modindex_common_prefix = [] 99 | 100 | # If true, keep warnings as "system message" paragraphs in the built documents. 101 | #keep_warnings = False 102 | 103 | 104 | # -- Options for HTML output ---------------------------------------------- 105 | 106 | # The theme to use for HTML and HTML Help pages. See the documentation for 107 | # a list of builtin themes. 108 | html_theme = 'default' 109 | 110 | # Theme options are theme-specific and customize the look and feel of a theme 111 | # further. For a list of options available for each theme, see the 112 | # documentation. 
113 | #html_theme_options = {} 114 | 115 | # Add any paths that contain custom themes here, relative to this directory. 116 | #html_theme_path = [] 117 | 118 | # The name for this set of Sphinx documents. If None, it defaults to 119 | # " v documentation". 120 | #html_title = None 121 | 122 | # A shorter title for the navigation bar. Default is the same as html_title. 123 | #html_short_title = None 124 | 125 | # The name of an image file (relative to this directory) to place at the top 126 | # of the sidebar. 127 | #html_logo = None 128 | 129 | # The name of an image file (within the static path) to use as favicon of the 130 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 131 | # pixels large. 132 | #html_favicon = None 133 | 134 | # Add any paths that contain custom static files (such as style sheets) here, 135 | # relative to this directory. They are copied after the builtin static files, 136 | # so a file named "default.css" will overwrite the builtin "default.css". 137 | html_static_path = ['_static'] 138 | 139 | # Add any extra paths that contain custom files (such as robots.txt or 140 | # .htaccess) here, relative to this directory. These files are copied 141 | # directly to the root of the documentation. 142 | #html_extra_path = [] 143 | 144 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 145 | # using the given strftime format. 146 | #html_last_updated_fmt = '%b %d, %Y' 147 | 148 | # If true, SmartyPants will be used to convert quotes and dashes to 149 | # typographically correct entities. 150 | #html_use_smartypants = True 151 | 152 | # Custom sidebar templates, maps document names to template names. 153 | #html_sidebars = {} 154 | 155 | # Additional templates that should be rendered to pages, maps page names to 156 | # template names. 157 | #html_additional_pages = {} 158 | 159 | # If false, no module index is generated. 
160 | #html_domain_indices = True 161 | 162 | # If false, no index is generated. 163 | #html_use_index = True 164 | 165 | # If true, the index is split into individual pages for each letter. 166 | #html_split_index = False 167 | 168 | # If true, links to the reST sources are added to the pages. 169 | #html_show_sourcelink = True 170 | 171 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 172 | #html_show_sphinx = True 173 | 174 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 175 | #html_show_copyright = True 176 | 177 | # If true, an OpenSearch description file will be output, and all pages will 178 | # contain a tag referring to it. The value of this option must be the 179 | # base URL from which the finished HTML is served. 180 | #html_use_opensearch = '' 181 | 182 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 183 | #html_file_suffix = None 184 | 185 | # Output file base name for HTML help builder. 186 | htmlhelp_basename = 'SeqFindrdoc' 187 | 188 | 189 | # -- Options for LaTeX output --------------------------------------------- 190 | 191 | latex_elements = { 192 | # The paper size ('letterpaper' or 'a4paper'). 193 | #'papersize': 'letterpaper', 194 | 195 | # The font size ('10pt', '11pt' or '12pt'). 196 | #'pointsize': '10pt', 197 | 198 | # Additional stuff for the LaTeX preamble. 199 | #'preamble': '', 200 | } 201 | 202 | # Grouping the document tree into LaTeX files. List of tuples 203 | # (source start file, target name, title, 204 | # author, documentclass [howto, manual, or own class]). 205 | latex_documents = [ 206 | ('index', 'SeqFindr.tex', u'SeqFindr Documentation', 207 | u'Mitchell Stanton-Cook, Nabil Alikhan \\& Hamza Khan', 'manual'), 208 | ] 209 | 210 | # The name of an image file (relative to this directory) to place at the top of 211 | # the title page. 
212 | #latex_logo = None 213 | 214 | # For "manual" documents, if this is true, then toplevel headings are parts, 215 | # not chapters. 216 | #latex_use_parts = False 217 | 218 | # If true, show page references after internal links. 219 | #latex_show_pagerefs = False 220 | 221 | # If true, show URL addresses after external links. 222 | #latex_show_urls = False 223 | 224 | # Documents to append as an appendix to all manuals. 225 | #latex_appendices = [] 226 | 227 | # If false, no module index is generated. 228 | #latex_domain_indices = True 229 | 230 | 231 | # -- Options for manual page output --------------------------------------- 232 | 233 | # One entry per manual page. List of tuples 234 | # (source start file, name, description, authors, manual section). 235 | man_pages = [ 236 | ('index', 'seqfindr', u'SeqFindr Documentation', 237 | [u'Mitchell Stanton-Cook, Nabil Alikhan & Hamza Khan'], 1) 238 | ] 239 | 240 | # If true, show URL addresses after external links. 241 | #man_show_urls = False 242 | 243 | 244 | # -- Options for Texinfo output ------------------------------------------- 245 | 246 | # Grouping the document tree into Texinfo files. List of tuples 247 | # (source start file, target name, title, author, 248 | # dir menu entry, description, category) 249 | texinfo_documents = [ 250 | ('index', 'SeqFindr', u'SeqFindr Documentation', 251 | u'Mitchell Stanton-Cook, Nabil Alikhan & Hamza Khan', 'SeqFindr', 'One line description of project.', 252 | 'Miscellaneous'), 253 | ] 254 | 255 | # Documents to append as an appendix to all manuals. 256 | #texinfo_appendices = [] 257 | 258 | # If false, no module index is generated. 259 | #texinfo_domain_indices = True 260 | 261 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 262 | #texinfo_show_urls = 'footnote' 263 | 264 | # If true, do not generate a @detailmenu in the "Top" node's menu. 
265 | #texinfo_no_detailmenu = False 266 | 267 | 268 | # Example configuration for intersphinx: refer to the Python standard library. 269 | intersphinx_mapping = {'http://docs.python.org/': None} 270 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Educational Community License Version 2.0, April 2007 The Educational Community License version 2.0 ("ECL") consists of the Apache 2.0 license, modified to change the scope of the patent grant in section 3 to be specific to the needs of the education communities using this license. The original Apache 2.0 license can be found at: http://www.apache.org/licenses/LICENSE-2.0 TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. 
Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. Any patent license granted hereby with respect to contributions by an individual employed by an institution or organization is limited to patent claims where the individual that is the author of the Work is also the inventor of the patent claims licensed, and where the organization or institution has the right to grant such license under applicable grant and research funding agreements. No other express or implied licenses are granted. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: You must give any other recipients of the Work or Derivative Works a copy of this License; and You must cause any modified files to carry prominent notices stating that You changed the files; and You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. 
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. 
Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Educational Community License to your work To apply the Educational Community License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Educational Community License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.osedu.org/licenses/ECL-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | SeqFindr 2 | ======== 3 | 4 | SeqFindr - easily create informative genomic feature plots. It's a 5 | bioinfomagicians tool to detect the presence or absence of genomic features 6 | given a database describing these features & a set of draft and/or complete 7 | genomes. We work with bacterial genomes & as such SeqFindr has only been 8 | tested with bacterial genomes. 9 | 10 | 11 | **I am on vacation from 08/12/14 -> 06/01/15**. User support will not happen 12 | in this period. **Stupidly I've done some releases the day/night before I 13 | leave.** 14 | 15 | If you have problems:: 16 | 17 | $ # (sudo if needed) 18 | $ pip uninstall SeqFindr 19 | $ # Run the above command numerous times to ensure it's gone 20 | $ pip install SeqFindr==0.33.1 21 | 22 | 23 | .. image:: https://pypip.in/version/SeqFindr/badge.svg 24 | :target: https://pypi.python.org/pypi/SeqFindr/ 25 | :alt: Latest Version 26 | 27 | .. image:: https://pypip.in/download/SeqFindr/badge.svg 28 | :target: https://pypi.python.org/pypi/SeqFindr/ 29 | :alt: Downloads 30 | 31 | .. image:: https://travis-ci.org/mscook/SeqFindR.svg?branch=master 32 | :target: https://travis-ci.org/mscook/SeqFindR 33 | :alt: Build status 34 | 35 | .. image:: https://landscape.io/github/mscook/SeqFindR/master/landscape.png 36 | :target: https://landscape.io/github/mscook/SeqFindR/master 37 | :alt: Code Health 38 | 39 | 40 | Documentation 41 | ------------- 42 | 43 | Please use this README.rst as the core SeqFindr user documentation. 44 | 45 | These are works in progress: 46 | * `SeqFindr documentation`_ 47 | * `SeqFindr official site`_ 48 | 49 | 50 | News 51 | ---- 52 | 53 | **07/12/14: I am on vacation from 08/12/14 -> 06/01/15**. User support will 54 | not happen in this period.
**Stupidly I've done some releases the day/night 55 | before I leave.** 56 | 57 | If you have problems:: 58 | 59 | $ # (sudo if needed) 60 | $ pip uninstall SeqFindr 61 | $ # Run the above command numerous times to ensure it's gone 62 | $ pip install SeqFindr==0.33.1 63 | 64 | 65 | **18/11/14:** Version 0.4.0 now has new option --remove_empty_cols. It will 66 | strip out entire columns where no hits were detected. 67 | 68 | 69 | **28/07/14:** Fixed a bug where axes were shifted when using newer versions 70 | of matplotlib. 71 | 72 | 73 | **Important:** Were you using a specific SeqFindr version as a dependency 74 | for your project and it has disappeared from PyPI? 75 | 76 | We recently activated a name change of SeqFind*R* to SeqFind*r*. This was to 77 | avoid potential users believing this was an R package. Unfortunately, PyPI 78 | while aware that SeqFindR and SeqFindr were different packages did not like 79 | the potential confusion. As a consequence the only resolution was to delete 80 | SeqFind*R* completely (and losing all PyPI published releases) and registering 81 | SeqFind*r* and starting fresh. All previous 10 releases, while not available 82 | on PyPI are still available on GitHub. If you require a previous release you 83 | can actually do something like this (SeqFindr v0.26):: 84 | 85 | pip install -e git://github.com/mscook/SeqFindr.git@v0.26 86 | 87 | 88 | **Version 0.31.1 released on 10 July 2014.** 89 | 90 | **We are now testing SeqFindr builds on both Linux & MacOSX systems.** 91 | 92 | Best use "git log" for a changelog as the changelog_ for most recent 93 | changes/fixes/enhancements may not be up to date. 94 | 95 | 96 | Citation 97 | -------- 98 | 99 | Cite this Github repository if you use SeqFindr to generate figures 100 | for publications:: 101 | 102 | STANTON-COOK M, NF ALIKHAN, FORDE BM, BEN ZAKOUR NL & BEATSON SA^. 103 | SeqFindr - easily create informative genomic feature plots. 104 | https://github.com/mscook/SeqFindr.
105 | 106 | TODO: Couple SeqFindr with ZENODO! 107 | 108 | 109 | Installation 110 | ------------ 111 | 112 | SeqFindr is a commandline application. If you're not familiar with the 113 | commandline we recommend you ask local IT support to help you install it. 114 | 115 | We now test SeqFindr builds on both Linux (Ubuntu >= 12.04) and MacOSX 116 | (Mavericks) systems. 117 | 118 | You will need to install/have installed: 119 | * ncbiblast >= 2.2.27 120 | * python >= 2.7 (**Python 3 is not supported**) 121 | 122 | You can check these are installed by:: 123 | 124 | $ python --version 125 | $ blastn -version 126 | 127 | Installation of python or blastn (without a package manager) is beyond the 128 | scope of this document. 129 | 130 | If you have both python and blastn you need to (if not already present) 131 | install pip_. 132 | 133 | You can check if pip_ exists with:: 134 | 135 | $ which pip 136 | 137 | If you get a "not found", please read the `pip installation instructions`_. 138 | 139 | **If you already have pip we do suggest you upgrade it.** We are using version 140 | 1.5.6 at the time of writing this document. 141 | 142 | You can upgrade pip_ like this:: 143 | 144 | $ pip install --upgrade pip 145 | 146 | 147 | The following python libraries_ should be installed (automatically) if you follow 148 | the installation instructions detailed below. 149 | 150 | We use the following python libraries_: 151 | * numpy >= 1.6.1 152 | * scipy >= 0.10.1 153 | * matplotlib >= 1.1.0 154 | * biopython >= 1.59 155 | * ghalton>=0.6 156 | 157 | These libraries will also have dependencies (i.e. atlas, lapack, fortran 158 | compilers, freetype and png). **These most likely won't be installed on 159 | your computer. Please install these before attempting the installation.** 160 | 161 | Linux (Ubuntu) 162 | ~~~~~~~~~~~~~~ 163 | 164 | SeqFindr uses 3rd party packages that are extremely important for scientific 165 | computing but are notoriously difficult to install. 
While *pip install* 166 | *--user SeqFindr* may work we recommend you install these 3rd party packages 167 | using apt-get. 168 | 169 | Run:: 170 | 171 | $ sudo apt-get install python-numpy python-scipy python-matplotlib python-biopython python-dev libatlas-dev liblapack-dev gfortran libfreetype6-dev libfreetype6 libpng-dev 172 | 173 | Now pip_ install SeqFindr:: 174 | 175 | $ pip install --user SeqFindr 176 | 177 | We use the --user option of pip_ to put SeqFindr in: /home/$USER/.local/bin/ 178 | You need to add this location to your ~/.bash_profile. 179 | 180 | Add SeqFindr to your path:: 181 | 182 | $ echo 'export PATH=$PATH:/home/$USER/.local/bin/' >> ~/.bash_profile 183 | 184 | Finally install BLAST+:: 185 | 186 | $ sudo apt-get install ncbi-blast+ 187 | 188 | **Test it:** 189 | 190 | Run:: 191 | 192 | $ SeqFindr -h 193 | $ python -c 'import SeqFindr; print SeqFindr' 194 | 195 | 196 | MacOSX (Mavericks) 197 | ~~~~~~~~~~~~~~~~~~ 198 | 199 | **You'll need to have the equivalents of python-dev libatlas-dev liblapack-dev 200 | gfortran libfreetype6-dev libfreetype6 & libpng-dev installed.** We had no 201 | problems installing SeqFindr on a recently acquired OSX Mavericks machine 202 | using the homebrew package manager. 203 | 204 | The installed packages on this machine via:: 205 | 206 | $ brew list 207 | 208 | Are available at this gist_. 209 | 210 | pip install SeqFindr:: 211 | 212 | $ pip install --user SeqFindr 213 | 214 | We use the --user option of pip_ to put SeqFindr in: /home/$USER/.local/bin/ 215 | You need to add this location to your ~/.bash_profile.
216 | 217 | Add SeqFindr to your path:: 218 | 219 | $ echo 'export PATH=$PATH:/home/$USER/.local/bin/' >> ~/.bash_profile 220 | 221 | Finally install BLAST+:: 222 | 223 | $ sudo brew install blast 224 | 225 | **Test it:** 226 | 227 | Run:: 228 | 229 | $ SeqFindr -h 230 | $ python -c 'import SeqFindr; print SeqFindr' 231 | 232 | 233 | Upgrading SeqFindr 234 | ~~~~~~~~~~~~~~~~~~ 235 | 236 | You can upgrade like this:: 237 | 238 | pip install --upgrade SeqFindr 239 | 240 | 241 | **Please regularly check back to make sure you're running the most recent 242 | SeqFindr version.** 243 | 244 | 245 | 246 | Example figure produced by SeqFindr 247 | ----------------------------------- 248 | 249 | SeqFindr CU fimbriae genes image. 110 E. *coli* strains were investigated. 250 | Order is according to phylogenetic analysis. Black blocks represent gene 251 | presence. 252 | 253 | .. image:: https://raw.github.com/mscook/SeqFindr/master/example/CU_fimbriae.png 254 | :alt: SeqFindr CU fimbriae genes image 255 | :align: center 256 | 257 | 258 | SeqFindr database files 259 | ----------------------- 260 | 261 | The SeqFindr database is in multi-fasta format. The header needs to be 262 | formatted with *4 comma separated* elements. We concede that inventing 263 | another file format is annoying, but, future versions of SeqFindr will 264 | exploit this information. 265 | 266 | The elements headers are: 267 | * identifier, 268 | * common name **(this is taken as the gene label in the plot)**, 269 | * description and 270 | * species 271 | 272 | The final element, separated by **[]** contains a classification. This 273 | information is used by SeqFindr to draw different coloured blocks. 274 | 275 | An example:: 276 | 277 | >70-tem8674, bla-TEM, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. [Beta-lactams] 278 | AAAGTTCTGCTATGTGGCGCGGTATTATCCCGTGTTGACGCCGGGCAAGAGCAACTCGGTCGCCGCATAC 279 | >70-shv86, bla-SHV, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. 
[Beta-lactams] 280 | CTCAAGCGGCTGCGGGCTGGCGTGTACCGCCAGCGGCAGGGTGGCTAACAGGGAGATAATACACAGGCGA 281 | >70-oxa(1)256, bla-OXA-1, Beta-lactams Antibiotic resistance (ampicillin), Unknown sp. [Beta-lactams] 282 | >70-tetB190, tet(B), Tetracycline Antibiotic resistance (tetracycline), Unknown sp. [Tetracycline] 283 | CAAAGTGGTTAGCGATATCTTCCGAAGCAATAAATTCACGTAATAACGTTGGCAAGACTGGCATGATAAG 284 | 285 | **Note:** if you do not have all information you can simplify the expected 286 | database header to:: 287 | 288 | >, bla-TEM, , [classification] 289 | 290 | 291 | The script **vfdb_to_seqfindr** is now included in SeqFindr to convert VFDB 292 | formatted files (or like) to SeqFindr formatted database files. 293 | 294 | VFDB: Virulence Factors Database (www.mgc.ac.cn/VFs/) is a reference database 295 | for bacterial virulence factors. 296 | 297 | At this stage we have tested this script on limited internal datasets. 298 | Success/mileage will depend on the consistency of the VFDB formatting. 299 | 300 | 301 | Example usage of **vfdb_to_seqfindr**:: 302 | 303 | # Default (will set VFDB classification identifiers as the classification) 304 | $ vfdb_to_seqfindr -i TOTAL_Strep_VFs.fas -o TOTAL_Strep_VFs.sqf 305 | 306 | # Sets any classification to blank ([ ]) 307 | $ vfdb_to_seqfindr -i TOTAL_Strep_VFs.fas -o TOTAL_Strep_VFs.sqf -b 308 | 309 | # Reads a user defined classification. 1 per line, in same order as input 310 | # sequences 311 | $ vfdb_to_seqfindr -i TOTAL_Strep_VFs.fas -o TOTAL_Strep_VFs.sqf -c user.class 312 | 313 | 314 | The -c (--class_file) option is very useful. Suppose you want to annotate your 315 | sequences of interest with user defined classification values. Simply develop a 316 | file containing the scheme and pass it using the -c option (3rd example above).
317 | A sample file for the situation where you had 7 input sequences with the first 318 | 3 Fe transporters, the next two Toxins, the next a Misc and the final 319 | sequence is a Toxin would look like this:: 320 | 321 | Fe transporter 322 | Fe transporter 323 | Fe transporter 324 | Toxin 325 | Toxin 326 | Misc 327 | Toxin 328 | 329 | 330 | How does SeqFindr determine positive hits 331 | ----------------------------------------- 332 | 333 | We use the following calculation:: 334 | 335 | hsp.identities/float(record.query_length) >= tol 336 | 337 | Where: 338 | * hsp.identities is number of identities in the high-scoring pairs between 339 | the query (database entry) and subject (contig/scaffold/mapping 340 | consensus), 341 | * record.query_length is the length of the database entry and, 342 | * tol is the cutoff threshold to accept a hit (0.95 default) 343 | 344 | For a database entry of 200 bp you can have up to 10 mismatches/gaps without 345 | being penalised. 346 | 347 | **Why not just use max identity?** 348 | * Eliminate effects of scaffolding characters/gaps, 349 | * Handle poor coverage etc. in mapping consensuses where N characters/gaps 350 | may be introduced 351 | 352 | **What problems may this approach cause?** I'm still looking into it... 353 | 354 | 355 | Fine grain configuration 356 | ------------------------ 357 | 358 | SeqFindr can read a configuration file. At the moment you can only redefine 359 | the category colors (suppose you want to use a set of fixed colors instead of 360 | the default randomly generated). The configuration file is expected to expand 361 | in the future. 362 | 363 | To define category colors:: 364 | 365 | touch ~/.SeqFindr.cfg 366 | vi ~/.SeqFindr.cfg 367 | # Add something like 368 | category_colors = [(100,60,201), (255,0,99)] 369 | 370 | Category colors can be any RGB triplet. 
You could use a tool similar to this 371 | one: http://www.colorschemer.com/online.html 372 | 373 | For example the first row of colors in RGB is: 374 | (51,102,255), (102,51,255), (204,51,255), (255,51,204) 375 | 376 | 377 | Short PCR primers 378 | ----------------- 379 | 380 | In some cases you may want to screen using PCR primers. Please use the --short 381 | option. Here we adjust BLASTn parameters wordsize = 7 & Expect Value = 1000 382 | 383 | 384 | Tutorial 385 | -------- 386 | 387 | We provide a script_ to run all the examples. **Note:** We have changed the 388 | color generation code. As a consequence the background colors will be 389 | different when running this yourself. The results will not change. 390 | 391 | Navigate to the SeqFindr/example directory (from git clone). The following files should be present: 392 | * A database file called *Antibiotic_markers.fa* 393 | * An ordering file called *dummy.order* (-i option) 394 | * An assemblies directory containing *strain1.fa, strain2.fa and strain3.fa* 395 | * A consensus directory containing *strain1.fa, strain2.fa and strain3.fa* 396 | (-m option) 397 | 398 | **Note:** the assembly and consensus directories contain: 399 | * the same number of files (3 each) 400 | * there is a 1-1 filename mapping (strain1.fa, strain2.fa, strain3.fa == 401 | strain1.fa, strain2.fa, strain3.fa) 402 | * there are only fasta files. If you wish to include complete genomes 403 | either download the genomes in fasta format OR convert the Genbank or 404 | EMBL files to fasta format. 
405 | 406 | The toy assemblies and consensuses were generated such that: 407 | * **strain1** was missing: 70-shv86, 70-ctx143 and 70-aac3(IV)380 with 408 | mis-assembly of 70-aphA(1)1310 & 70-tem8674 409 | * **strain2** was missing: 70-oxa(7)295, 70-pse(4)348, 70-ctx143, 410 | 70-aadA1588, 70-aadB1778 and 70-aacC(2)200 411 | * **strain3** was missing: 70-shv86, 70-ctx143 and 70-aac3(IV)380 with 412 | mis-assembly of 70-aphA(1)1310, 70-tem8674 and 70-aadA1588 413 | 414 | 415 | Running all the examples at once 416 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 417 | 418 | Something like this:: 419 | 420 | $ # Assuming you git cloned, python setup.py install 421 | $ cd SeqFindr/example 422 | $ ./run_examples.sh 423 | $ # See directories run1/ run2/ run3/ run4/ 424 | 425 | 426 | Run 1 - Looking at only assemblies 427 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 428 | 429 | Command:: 430 | 431 | SeqFindr Antibiotic_markers.fa assemblies/ -o run1 -l 432 | 433 | .. image:: https://raw.github.com/mscook/SeqFindr/master/example/run1_small.png 434 | :alt: run1 435 | :align: center 436 | 437 | 438 | Link to full size run1_. 439 | 440 | 441 | Run 2 - Combining assembly and mapping consensus data 442 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 443 | 444 | Command:: 445 | 446 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run2 -l 447 | 448 | .. image:: https://raw.github.com/mscook/SeqFindr/master/example/run2_small.png 449 | :alt: run2 450 | :align: center 451 | 452 | 453 | Link to full size run2_. 454 | 455 | 456 | Run 3 - Combining assembly and mapping consensus data with differentiation between hits 457 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 458 | 459 | Command:: 460 | 461 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run3 -l -r 462 | 463 | .. image:: https://raw.github.com/mscook/SeqFindr/master/example/run3_small.png 464 | :alt: run3 465 | :align: center 466 | 467 | 468 | Link to full size run3_.
469 | 470 | 471 | The clustering dendrogram looks like this: 472 | 473 | .. image:: https://raw.github.com/mscook/SeqFindr/master/example/dendrogram_run3_small.png 474 | :alt: run3 dendrogram 475 | :align: center 476 | 477 | 478 | Link to full size dendrogram_. 479 | 480 | 481 | Run 4 - Combining assembly and mapping consensus data with defined ordering 482 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 483 | 484 | **Note:** the ordering file is defined using the option *--index_file*. The 485 | ordering file **must** contain the same number of strains as the assemblies 486 | directory and the strain names must agree (TODO - add a script to flag issues). 487 | 488 | Command:: 489 | 490 | SeqFindr Antibiotic_markers.fa assemblies/ -m consensus/ -o run4 -l -r --index_file dummy.order 491 | 492 | .. image:: https://raw.github.com/mscook/SeqFindr/master/example/run4_small.png 493 | :alt: run4 494 | :align: center 495 | 496 | 497 | Link to full size run4_. 498 | 499 | 500 | How to generate mapping consensus data 501 | -------------------------------------- 502 | 503 | **We strongly recommend that you use mapping consensus data.** It minimises 504 | the effects of misassembly and collapsed repeats. 505 | 506 | We use Nesoni_. We use the database file (in multi-fasta format) as the 507 | reference for mapping. Nesoni_ has no issues with multi-fasta files as 508 | references (BWA will treat them as separate chromosomes). 509 | The workflow is something like this:: 510 | 511 | $ nesoni make-reference myref ref-sequences.fa 512 | $ # for each strain 513 | $ # nesoni analyse-sample: mysample myref pairs: reads1.fastq reads2.fastq 514 | $ # extract the consensus.fa file 515 | 516 | 517 | For those of you using a cluster running PBSPro see: 518 | https://github.com/mscook/SeqFindr_nesoni 519 | This is a script that generates a job array, submits and cleans up the 520 | mapping results ready for input to SeqFindr.
521 | 522 | The output from the described workflow and SeqFindr_nesoni is a consensus.fa 523 | file which we term the mapping consensus. This file is a multi-fasta file of 524 | the consensus base calls relative to the database sequences. 525 | 526 | Caveats: 527 | * You will probably want to allow multi-mapping reads (giving *--monogamous 528 | no --random yes* to nesoni consensus) (this is default for 529 | SeqFindr_nesoni), 530 | * The (poor) alignment of reads at the start and the end of the database 531 | genes can result in N base calls. This can result in downstream false 532 | negatives. 533 | 534 | **SeqFindr now provides a solution to minimise the effects of poor mapping at 535 | the start and end of the given sequences.** 536 | 537 | The SeqFindr option is -s or --strip:: 538 | 539 | -s STRIP, --strip STRIP Strip the 1st and last N bases of mapping consensuses & database [default = 10] 540 | 541 | By default this strips the 1st and last 10 bases from the mapping consensuses. 542 | We have had good results with this value. Feel free to experiment with 543 | different values (say, -s 0, -s 5, -s 10, -s 15). Please see image-compare_, 544 | a script we developed to compare the effects of different values of -s on the 545 | resultant figures. 546 | 547 | 548 | SeqFindr usage options 549 | ---------------------- 550 | 551 | See the help listing_. You can get this yourself with:: 552 | 553 | $ SeqFindr -h 554 | 555 | 556 | Future 557 | ------ 558 | 559 | Please see the TODO_ for future SeqFindr project directions. 560 | 561 | 562 | 563 | 564 | 565 | .. _pip: http://www.pip-installer.org/en/latest/ 566 | .. _libraries: https://github.com/mscook/SeqFindr/blob/master/requirements.txt 567 | .. _image-compare: https://github.com/mscook/image-compare 568 | .. _listing: https://github.com/mscook/SeqFindr/blob/master/HELP.rst 569 | .. _changelog: https://github.com/mscook/SeqFindr/blob/master/CHANGES.rst 570 | .. 
_TODO: https://github.com/mscook/SeqFindr/blob/master/TODO.rst 571 | .. _script: https://raw.github.com/mscook/SeqFindr/master/example/run_examples.sh 572 | .. _run1: https://raw.github.com/mscook/SeqFindr/master/example/run1.png 573 | .. _run2: https://raw.github.com/mscook/SeqFindr/master/example/run2.png 574 | .. _run3: https://raw.github.com/mscook/SeqFindr/master/example/run3.png 575 | .. _dendrogram: https://raw.github.com/mscook/SeqFindr/master/example/dendrogram_run3.png 576 | .. _run4: https://raw.github.com/mscook/SeqFindr/master/example/run4.png 577 | .. _Nesoni: http://www.vicbioinformatics.com/software.nesoni.shtml 578 | .. _SeqFindr documentation: http://seqfindr.rtfd.org 579 | .. _SeqFindr official site: http://mscook.github.io/SeqFindR/ 580 | .. _gist: https://gist.github.com/mscook/ef7499fc9d2138f17c7f 581 | .. _pip installation instructions: http://pip.readthedocs.org/en/latest/installing.html 582 | -------------------------------------------------------------------------------- /SeqFindr/seqfindr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2013-2014 Mitchell Stanton-Cook Licensed under the 4 | # Educational Community License, Version 2.0 (the "License"); you may 5 | # not use this file except in compliance with the License. You may 6 | # obtain a copy of the License at 7 | # 8 | # http://www.osedu.org/licenses/ECL-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, 11 | # software distributed under the License is distributed on an "AS IS" 12 | # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | # or implied. See the License for the specific language governing 14 | # permissions and limitations under the License. 
"""
A tool to easily create informative genomic feature plots
"""

import sys
import os
import traceback
import argparse
import time
import copy
import glob

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.ticker import MultipleLocator, FormatStrFormatter

import numpy as np

from scipy.cluster.hierarchy import linkage, average, dendrogram
from scipy.spatial.distance import pdist

from Bio import SeqIO

from SeqFindr import imaging
from SeqFindr import config
from SeqFindr import util
from SeqFindr import blast

# Stop clustering going nuts...
sys.setrecursionlimit(1000000)

__title__ = 'SeqFindr'
__version__ = '0.35.0'
__description__ = "A tool to easily create informative genomic feature plots"
__author__ = 'Mitchell Stanton-Cook, Nabil Alikhan & Hamza Khan'
__license__ = 'ECL 2.0'
__author_email__ = "m.stantoncook@gmail.com"
__url__ = 'http://github.com/mscook/SeqFindr'

epi = "Licence: %s by %s <%s>" % (__license__,
                                  __author__,
                                  __author_email__)
__doc__ = " %s v%s - %s (%s)" % (__title__,
                                 __version__,
                                 __description__,
                                 __url__)


def prepare_queries(args):
    """
    Given a set of sequences of interest, extract all query & query classes

    A sequence of interest file is a mfa file in the format:

    >ident, gene id, annotation, organism [class]

    query = gene id
    query_class = class

    Location of sequence of interest file is defined by args.seqs_of_interest

    Exits with status 1 (after reporting to stderr) if a gene id occurs
    more than once in the file.

    :param args: the argparse args containing args.seqs_of_interest
                 (fullpath) to a sequence of interest DB (mfa file)

    :type args: argparse args

    :rtype: 2 lists, 1) of all queries and, 2) corresponding query classes
    """
    query_list, query_classes = [], []
    # Plain "r": the "U" flag is rejected by Python >= 3.11
    with open(args.seqs_of_interest, "r") as fin:
        for rec in SeqIO.parse(fin, "fasta"):
            cur = rec.description
            # 2nd comma separated field is the gene id
            query_list.append(cur.split(',')[1].strip())
            # Text between the last '[' and the following ']' is the class
            query_classes.append(cur.split('[')[-1].split(']')[0].strip())
    sys.stderr.write("Investigating %i features\n" % (len(set(query_list))))
    # Single pass duplicate check, reporting the first duplicate in file order
    seen = set()
    for query in query_list:
        if query in seen:
            sys.stderr.write("Duplicates found for: %s\n" % (query))
            sys.stderr.write("Fix duplicates\n")
            sys.exit(1)
        seen.add(query)
    return query_list, query_classes


def _strip_fasta(src, dest, n_bases):
    """
    Copy the FASTA records in src to dest minus the 1st & last n_bases
    """
    with open(src, "r") as fin, open(dest, 'w') as fout:
        for rec in SeqIO.parse(fin, "fasta"):
            # Use an explicit end index: seq[n:-n] is EMPTY when n == 0
            end = max(len(rec.seq) - n_bases, 0)
            rec.seq = rec.seq[n_bases:end]
            SeqIO.write(rec, fout, "fasta")


def strip_bases(args):
    """
    Strip the 1st and last 'N' bases from mapping consensuses

    Uses:
        * args.cons
        * args.seqs_of_interest
        * arg.strip

    To avoid the effects of lead in and lead out coverage resulting in
    uncalled bases

    The stripped consensuses are written to a 'stripped' directory inside
    args.cons and the stripped database is written next to the original
    with a '_trimmed' suffix before the file extension.

    :param args: the argparse args containing args.strip value

    :type args: argparse args

    :rtype: the updated args to reflect the args.cons &
            args.seqs_of_interest location
    """
    # Get in the fasta files in the consensus directory
    fasta_in = util.get_fasta_files(args.cons)
    # Build a stripped directory
    new_cons_dir = os.path.join(args.cons, 'stripped')
    try:
        os.mkdir(new_cons_dir)
    except OSError:
        # Only an already existing directory is acceptable here
        if not os.path.isdir(new_cons_dir):
            raise
        sys.stderr.write("A stripped directory exists. Overwriting\n")
    # Update the args.cons to the stripped directory
    args.cons = new_cons_dir
    args.strip = int(args.strip)
    # Strip the start and end
    for fa in fasta_in:
        _strip_fasta(fa, os.path.join(args.cons, os.path.basename(fa)),
                     args.strip)
    # Trim the db as well
    root, ext = os.path.splitext(args.seqs_of_interest)
    stripdb = root + '_trimmed' + ext
    _strip_fasta(args.seqs_of_interest, stripdb, args.strip)
    # Update the args.seqs_of_interest
    args.seqs_of_interest = stripdb
    return args


def build_matrix_row(all_vfs, accepted_hits, score=None):
    """
    Populate row given all possible hits, accepted hits and an optional score

    Positions whose factor is an accepted hit get score, all other
    positions get 0.5.

    :param all_vfs: a list of all virulence factor ids
    :param accepted_hits: a list of a hits that passed the cutoff
    :param score: the value to fill the matrix with for a hit (default =
                  None which implies 0.0)

    :type all_vfs: list
    :type accepted_hits: list
    :type score: float

    :rtype: a list of floats
    """
    if score is None:
        score = 0.0
    return [score if factor in accepted_hits else 0.5 for factor in all_vfs]


def match_matrix_rows(ass_mat, cons_mat):
    """
    Reorder a second matrix based on the first row element of the 1st matrix

    Rows are paired on their ID (1st element). The ID is removed from the
    returned rows. Rows of ass_mat without a partner in cons_mat are dropped.

    :param ass_mat: a 2D list of scores
    :param cons_mat: a 2D list scores

    :type ass_mat: list
    :type cons_mat: list

    :rtype: 2 matricies (2D lists)
    """
    # Index by ID keeping the first row seen for any repeated ID
    cons_by_id = {}
    for row in cons_mat:
        cons_by_id.setdefault(row[0], row)
    reordered_ass, reordered_cons = [], []
    for row in ass_mat:
        partner = cons_by_id.get(row[0])
        if partner is not None:
            reordered_ass.append(row[1:])
            reordered_cons.append(partner[1:])
    return reordered_ass, reordered_cons


def strip_id_from_matrix(mat):
    """
    Remove the ID (1st row element) from a matrix

    :param mat: a 2D list

    :rtype: a 2D list with the 1st row element (ID) removed
    """
    return [row[1:] for row in mat]


def cluster_matrix(matrix, labels, dpi, by_cols, algorithm):
    """
    From a matrix, generate a distance matrix & perform hierarchical clustering

    The dendrogram is saved as dendrogram.png in the current working
    directory.

    :param matrix: a numpy matrix of scores
    :param labels: the ids for all row elements or column elements
    :param dpi: the resolution to save the diagram at
    :param by_cols: whether to perform the clustering by column similarity
                    (True) or by row similarity (False)
    :param algorithm: the clustering algorithm (linkage (default, False)
                      or UPGMA)

    :type matrix: numpy matrix
    :type labels: list
    :type dpi: int
    :type by_cols: boolean (default == False)
    :type algorithm: boolean

    :returns: a tuple of the updated (clustered) matrix & the updated labels
    """
    if by_cols:
        # Cluster the columns by treating them as rows
        matrix = matrix.transpose()
    print("\nClustering the matrix")
    # Clear any matplotlib formatting
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Hide y labels/ticks
    ax.set_yticklabels([])
    ax.set_yticks([])
    plt.xticks(fontsize=6)
    Y = pdist(matrix)
    if not algorithm:
        Z = linkage(Y)
        print("Linkage algorithm\n")
    else:
        Z = average(Y)
        print("UPGMA algorithm\n")
    dend = dendrogram(Z, labels=labels, link_color_func=None)
    plt.savefig("dendrogram.png", dpi=dpi)
    # Reshape: reorder the rows to follow the dendrogram leaves
    ordered_index = dend['leaves']
    updated_labels = dend['ivl']
    matrix = np.array([list(matrix[idx, :]) for idx in ordered_index])
    if by_cols:
        matrix = matrix.transpose()
    return matrix, updated_labels


def plot_matrix(matrix, strain_labels, vfs_classes, gene_labels,
                show_gene_labels, color_index, config_object, grid, seed,
                dpi, size, svg, cluster_column, aspect='auto'):
    """
    Plot the VF hit matrix

    The figure is saved as results.svg or results.png in the current
    working directory.

    :param matrix: the numpy matrix of scores
    :param strain_labels: the strain (y labels)
    :param vfs_classes: the VFS class (in mfa header [class])
    :param gene_labels: the gene labels
    :param show_gene_labels: whether to plot the gene labels
    :param color_index: for a single class, choose a specific color
    :param config_object: mapping whose 'category_colors' entry, when not
                          None, supplies the region colors
    :param grid: whether to draw grid lines
    :param seed: the seed passed to the color generator
    :param dpi: the resolution to save the figure at
    :param size: the figure size in inches as a 'WIDTHxHEIGHT' string
    :param svg: whether to save as svg rather than png
    :param cluster_column: whether the columns were clustered (a single
                           shaded region is then drawn)
    :param aspect: the aspect passed to matshow (default = 'auto')
    """
    if config_object['category_colors'] is not None:
        colors = config_object['category_colors']
    else:
        colors = imaging.generate_colors(len(set(vfs_classes)), seed)
    if color_index is not None:
        colors = [colors[(color_index)]]
    # Build the regions to be shaded differently
    if not cluster_column:
        regions, prev = [], 0
        for i in range(0, len(vfs_classes)-1):
            if vfs_classes[i] != vfs_classes[i+1]:
                regions.append([prev+0.5, i+0.5])
                prev = i
        regions.append([prev+0.5, len(vfs_classes)-0.5])
        regions[0][0] = regions[0][0]-1.0
    else:
        regions = [[-1, len(gene_labels)]]
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # aspect auto to widen
    ax.matshow(matrix, cmap=cm.gray, aspect=aspect)
    # Make sure every strain gets a tick
    ax.yaxis.set_major_locator(MultipleLocator(1))
    ax.yaxis.set_major_formatter(FormatStrFormatter('%s'))
    ax.set_yticklabels(strain_labels)
    if len(gene_labels) < 999:
        ax.xaxis.set_major_locator(MultipleLocator(1))
        ax.xaxis.set_major_formatter(FormatStrFormatter('%s'))
        ax.xaxis.grid(False)
        if show_gene_labels:
            # NOTE(review): the two leading blanks presumably absorb the
            # locator's ticks left of column 0 -- confirm
            ax.set_xticklabels(['', ''] + gene_labels)
    for i in range(0, len(regions)):
        plt.axvspan(regions[i][0], regions[i][1], facecolor=colors[i],
                    alpha=0.1)
    # Booleans, not 'on'/'off' strings: current matplotlib treats any
    # non-empty string (including 'off') as True
    if show_gene_labels:
        ax.tick_params(axis='both', which='both', labelsize=6,
                       direction='out', labelleft=True, labelright=False,
                       labelbottom=False, labeltop=True, left=True,
                       right=False, bottom=False, top=True)
    else:
        ax.tick_params(axis='both', which='both', labelsize=6,
                       direction='out', labelleft=True, labelright=False,
                       labelbottom=False, labeltop=False, left=True,
                       right=False, bottom=False, top=False)
    plt.xticks(rotation=90)
    if grid:
        ax.grid(True)
    x, y = size.split('x')
    x, y = float(x), float(y)
    # The dpi is applied by savefig; set_size_inches takes no dpi keyword
    fig.set_size_inches(x, y)
    if svg:
        plt.savefig("results.svg", bbox_inches='tight', dpi=dpi)
    else:
        plt.savefig("results.png", bbox_inches='tight', dpi=dpi)


def determine_nohit_score(cons, invert):
    """
    Determine the value in the matrix assigned to nohit given SeqFindr options

    :param cons: whether the Seqfindr run is using mapping consensus data
                 or not
    :param invert: whether the Seqfindr run is inverting (missing hits to
                   be shown as black bars).

    :type cons: None or the mapping consensus location
    :type invert: boolean

    :returns: the value defined as no hit in the results matrix
    """
    if cons is None:
        nohit = 0.5
    else:
        nohit = 1.0
    if invert:
        nohit = nohit*-1.0
    return nohit


def strip_uninteresting(matrix, query_classes, query_list, cons, invert):
    """
    Remove any columns where all elements in every position are absent

    Also handles the query classes and x_lables.

    .. attention:: new feature added in version 0.4.0

    Toogle using: **args.remove_empty_cols**

    :param matrix: the SeqFindr hit matrix
    :param query_classes: a list of query classes
    :param query_list: a query list (x labels)
    :param cons: whether the Seqfindr run is using mapping consensus data
                 or not
    :param invert: whether the Seqfindr run is inverting (missing hits to
                   be shown as black bars).

    :returns: a tuple with three elements which are the: updated SeqFindr
              matrix, the updated query_classes list and the updated
              query_list respectively.
    """
    nohit = determine_nohit_score(cons, invert)
    # A column is uninteresting when every element equals the nohit value
    to_remove = [idx for idx, column in enumerate(matrix.T)
                 if all(elem == nohit for elem in column)]
    new = np.delete(matrix, to_remove, 1)
    query_classes = util.del_from_list(query_classes, to_remove)
    query_list = util.del_from_list(query_list, to_remove)
    return new, query_classes, query_list


def check_singularity(matrix, cons, invert):
    """
    Check if there are any informative sites in the matrix

    :param matrix: the SeqFindr hit matrix
    :param cons: whether the Seqfindr run is using mapping consensus data
                 or not
    :param invert: whether the Seqfindr run is inverting

    :raises ValueError: if every element of the matrix is the nohit value
    """
    nohit = determine_nohit_score(cons, invert)
    if np.all(matrix == nohit):
        msg = ("There are no informative sites (no hits) in the SeqFindr "
               "matrix. Consider lowering hit tolerance (-t/--tol)")
        raise ValueError(msg)


def do_run(args, data_path, match_score, vfs_list, cons_run):
    """
    Perform a SeqFindr run

    Either BLASTs every FASTA file found in data_path against the
    sequences of interest or, when args.existing_data is given, re-parses
    the BLAST results of an existing run.

    :param args: the arguments given from argparse
    :param data_path: the directory holding the FASTA files to search
    :param match_score: the matrix value assigned to an accepted hit
    :param vfs_list: a list of all query ids
    :param cons_run: whether this run is on mapping consensus data

    :returns: a tuple of the matrix (2D list, 1st element of each row is
              the strain id) & the list of strain ids (y labels)
    """
    matrix, y_label = [], []
    in_files = util.get_fasta_files(data_path)
    # Reorder if requested
    if args.index_file is not None:
        in_files = util.order_inputs(args.index_file, in_files)
    # Handle an existing run
    if args.existing_data is not None:
        blast_xml = glob.glob(os.path.abspath(args.existing_data) +
                              "/BLAST_results/*")
        # Consensus results carry "cons_DB=" in their file name
        blast_xml = [e for e in blast_xml
                     if ("cons_DB=" in e) == bool(cons_run)]
        cleaned = [e.split("ID=")[-1].split("_blast.xml")[0]
                   for e in blast_xml]
        for i in in_files:
            for j in cleaned:
                if i.find(j) != -1:
                    y_label.append(j)
                    break
        # Make sure XML are right order: follow y_label so that in_files[n]
        # is the result for y_label[n] (glob order is arbitrary)
        exist_ord = []
        for j in y_label:
            for i in blast_xml:
                if i.find(j) != -1:
                    exist_ord.append(i)
                    break
        in_files = exist_ord
    for idx, subject in enumerate(in_files):
        if args.existing_data is None:
            strain_id = blast.make_BLAST_database(subject)
            y_label.append(strain_id)
            database = os.path.basename(subject)
            blast_xml = blast.run_BLAST(args.seqs_of_interest,
                                        os.path.join(os.getcwd(),
                                                     "DBs/"+database),
                                        args, cons_run)
            accepted_hits = blast.parse_BLAST(blast_xml, float(args.tol),
                                              float(args.cov), args.careful)
        else:
            strain_id = y_label[idx]
            accepted_hits = blast.parse_BLAST(subject, float(args.tol),
                                              float(args.cov), args.careful)
        row = build_matrix_row(vfs_list, accepted_hits, match_score)
        row.insert(0, strain_id)
        matrix.append(row)
    return matrix, y_label


def core(args):
    """
    The 'core' SeqFindr method

    Runs the assembly (and optionally mapping consensus) searches, builds
    the score matrix, clusters it unless an index file was given, writes
    matrix.csv and plots the figure.

    TODO: Exception handling if do_run fails or produces no results

    :param args: the arguments given from argparse
    """
    DEFAULT_NO_HIT, ASS_WT, CONS_WT = 0.5, -0.15, -0.85
    cons_run = False
    args = util.ensure_paths_for_args(args)
    configObject = config.SeqFindrConfig()
    util.check_database(args.seqs_of_interest)
    util.init_output_dirs(args.output)
    query_list, query_classes = prepare_queries(args)
    results_a, ylab = do_run(args, args.assembly_dir, ASS_WT, query_list,
                             cons_run)
    if args.cons is not None:
        cons_run = True
        args = strip_bases(args)
        # TODO: Exception handling if do_run fails or produces no results.
        # Should be caught here before throwing ugly exceptions downstream.
        results_m, _ = do_run(args, args.cons, CONS_WT, query_list, cons_run)
        if len(results_m) == len(results_a):
            results_a, results_m = match_matrix_rows(results_a, results_m)
            DEFAULT_NO_HIT = 1.0
            matrix = np.array(results_a) + np.array(results_m)
        else:
            print("\nAssemblies and mapping consensuses don't match\n")
            sys.exit(1)
    else:
        args.reshape = False
        results_a = strip_id_from_matrix(results_a)
        matrix = np.array(results_a)
    # cluster if not ordered
    if args.index_file is None:
        if not args.cluster_column:
            matrix, ylab = cluster_matrix(matrix, ylab, args.DPI,
                                          args.cluster_column,
                                          args.UPGMA_clustering)
        else:
            tmp = copy.deepcopy(ylab)
            matrix, ylab = cluster_matrix(matrix, query_list, args.DPI,
                                          args.cluster_column,
                                          args.UPGMA_clustering)
            query_list = ylab
            ylab = tmp
    np.savetxt("matrix.csv", matrix, delimiter=",")
    # Add the buffer
    newrow = [DEFAULT_NO_HIT] * matrix.shape[1]
    matrix = np.vstack([newrow, matrix])
    # Handle new option to only show presence
    cutoff = 0.49
    if args.reshape is True:
        cutoff = 0.99
        for x in np.nditer(matrix, op_flags=['readwrite']):
            if x < cutoff:
                x[...] = -1.0
    ylab = ['', ''] + ylab
    if args.invert:
        for elem in np.nditer(matrix, op_flags=['readwrite']):
            if elem < cutoff:
                elem[...] = -cutoff-0.01
        matrix[0, :] *= -1
        if args.reshape is False:
            matrix[0, :] *= 0.0
            matrix[0, :] += -0.5
        matrix = matrix*-1
    # Remove empty columns
    if args.remove_empty_cols:
        matrix, query_classes, query_list = strip_uninteresting(matrix,
                                                                query_classes,
                                                                query_list,
                                                                args.cons,
                                                                args.invert)
    # Check for singular matrix
    check_singularity(matrix, args.cons, args.invert)
    plot_matrix(matrix, ylab, query_classes, query_list, args.label_genes,
                args.color, configObject, args.grid, args.seed, args.DPI,
                args.size, args.svg, args.cluster_column)


if __name__ == '__main__':
    try:
        start_time = time.time()

        parser = argparse.ArgumentParser(description=__doc__, epilog=epi)
        alg = parser.add_argument_group('Optional algorithm options',
                                        ('Options relating to the SeqFindr '
                                         'algorithm'))
        io = parser.add_argument_group('Optional input/output options',
                                       ('Options relating to input and '
                                        'output'))
        fig = parser.add_argument_group('Figure options',
                                        ('Options relating to the output '
                                         'figure'))
        blast_opt = parser.add_argument_group('BLAST options',
                                              ('Options relating to BLAST'))
        blast_opt.add_argument('-R', '--reftype', action='store',
                               help=('Reference Sequence type. If not given '
                                     'will try to detect it'), dest='reftype',
                               choices=('nucl', 'prot'), default=None)
        blast_opt.add_argument('-X', '--tblastx', action='store_true',
                               default=False,
                               help=('Run tBLASTx rather than BLASTn'))
        blast_opt.add_argument('--evalue', action='store', type=float,
                               default=0.0001,
                               help=('BLAST evalue (Expect)'))
        blast_opt.add_argument('--short', action='store_true',
                               default=False, help=('Have short queries i.e. '
                                                    'PCR Primers'))
        parser.add_argument('-v', '--verbose', action='store_true',
                            default=False, help='verbose output')
        io.add_argument('-o', '--output', action='store', default=None,
                        help=('Output the results to this location'))
        io.add_argument('-p', '--output_prefix', action='store', default=None,
                        help=('Give all result files this prefix'))
        # Required options now positional arguments
        parser.add_argument('seqs_of_interest', action='store',
                            help=('Full path to FASTA file containing a '
                                  'set of sequences of interest'))
        parser.add_argument('assembly_dir', action='store',
                            help=('Full path to directory containing a '
                                  'set of assemblies in FASTA format'))
        alg.add_argument('-t', '--tol', action='store', type=float,
                         default=0.95,
                         help=('Similarity cutoff [default = 0.95]'))
        alg.add_argument('--cov', action='store', type=float,
                         default=1.0,
                         help=('Proportion of query covered cutoff '
                               '[default = 1.0]'))
        alg.add_argument('-m', '--cons', action='store', default=None,
                         help=('Full path to directory containing mapping '
                               'consensuses [default = None]. See manual for '
                               'more info'))
        fig.add_argument('-l', '--label_genes', action='store_true',
                         default=False,
                         help=('Label the x axis with the query identifier '
                               '[default = False]'))
        alg.add_argument('-r', '--reshape', action='store_false', default=True,
                         help=('Differentiate between mapping and assembly '
                               'hits in the figure [default = no '
                               'differentiation]'))
        fig.add_argument('-g', '--grid', action='store_false', default=True,
                         help='Figure has grid lines [default = True]')
        alg.add_argument('--index_file', action='store', default=None,
                         help=('Maintain the y axis strain order according to '
                               'order given in this file. Otherwise '
                               'clustering by row similarity. [default = do '
                               'clustering]. See manual for more info'))
        alg.add_argument('--cluster_column', action='store_true',
                         default=False,
                         help=('Cluster by column similarity rather than row'))
        fig.add_argument('--color', action='store', default=None, type=int,
                         help=('The color index [default = None]. See manual '
                               'for more info'))
        fig.add_argument('--invert', action='store_true', default=False,
                         help=('Invert the shading so that missing hits are '
                               'black [default = False].'))
        fig.add_argument('--remove_empty_cols', action='store_true',
                         default=False, help=('Remove columns that have no '
                                              'hits [default = False].'))
        fig.add_argument('--DPI', action='store', type=int, default=300,
                         help='DPI of figure [default = 300]')
        fig.add_argument('--seed', action='store', type=int, default=99,
                         help='Color generation seed')
        fig.add_argument('--svg', action='store_true', default=False,
                         help=('Draws figure in svg'))
        fig.add_argument('--size', action='store', type=str, default='10x12',
                         help='Size of figure [default = 10x12 (inches)]')
        # type=int so that a non integer value is rejected by argparse
        alg.add_argument('-s', '--strip', action='store', type=int,
                         default=10,
                         help=('Strip the 1st and last N bases of mapping '
                               'consensuses & database [default = 10]'))
        alg.add_argument('-c', '--careful', action='store', type=float,
                         default=0,
                         help=('Manually consider hits that fall '
                               '(tol-careful) below the cutoff. [default = 0].'
                               ' With default tol (0.95) & careful = 0.2, we '
                               'will manually inspect all hits in 0.95-0.75 '
                               'range'))
        io.add_argument('-e', '--existing_data', action='store',
                        default=None,
                        help=('Full path to an existing SeqFindr run '
                              'directory. Must contain a BLAST_results '
                              'directory'))
        blast_opt.add_argument('--BLAST_THREADS', action='store', type=int,
                               default=1, help=('Use this number of threads '
                                                'in BLAST run [default = 1]'))
        alg.add_argument('--UPGMA_clustering', action='store_true',
                         default=False,
                         help=('Use UPGMA the clustering algorithm. '
                               'Default is the linkage algorithm'))
        parser.set_defaults(func=core)
        args = parser.parse_args()
        if args.verbose:
            print("Executing @ " + time.asctime())
        args.func(args)
        if args.verbose:
            print("Ended @ " + time.asctime())
            print('Exec time minutes %f:' % ((time.time() - start_time) / 60.0))
        sys.exit(0)
    except KeyboardInterrupt:
        # Ctrl-C
        raise
    except SystemExit:
        # sys.exit()
        raise
    except Exception as e:
        print('ERROR, UNEXPECTED EXCEPTION')
        print(str(e))
        traceback.print_exc()
        sys.exit(1)