├── .gitignore ├── .gitlab-ci.yml ├── .gitmodules ├── LICENSE ├── README.md ├── VERSIONS.rst ├── bin ├── alnChangeDelim ├── alnConvertGI ├── alnFilterSeqSize ├── alnParseID ├── alnReplaceHeaders ├── annotateMSA ├── scaCore ├── scaProcessMSA └── scaSectorID ├── docs ├── Makefile ├── make.bat └── source │ ├── SCA_DHFR.rst │ ├── SCA_G.rst │ ├── SCA_S1A.rst │ ├── SCA_betalactamase.rst │ ├── _static │ ├── BLactamase_sec_hier.png │ ├── DHFR_decompv2.png │ ├── DHFR_sec_hier.png │ ├── Gprot_sec_hier.png │ ├── Gprot_secstruct.png │ ├── SCA_DHFR_13_0.png │ ├── SCA_DHFR_16_1.png │ ├── SCA_DHFR_20_0.png │ ├── SCA_DHFR_22_0.png │ ├── SCA_DHFR_26_0.png │ ├── SCA_DHFR_29_0.png │ ├── SCA_DHFR_30_0.png │ ├── SCA_DHFR_7_0.png │ ├── SCA_G_17_0.png │ ├── SCA_G_21_0.png │ ├── SCA_G_24_1.png │ ├── SCA_G_26_1.png │ ├── SCA_G_28_0.png │ ├── SCA_G_31_1.png │ ├── SCA_G_33_0.png │ ├── SCA_G_35_0.png │ ├── SCA_G_37_0.png │ ├── SCA_G_42_0.png │ ├── SCA_G_44_0.png │ ├── SCA_G_9_0.png │ ├── SCA_S1A_17_0.png │ ├── SCA_S1A_20_0.png │ ├── SCA_S1A_23_1.png │ ├── SCA_S1A_25_1.png │ ├── SCA_S1A_27_0.png │ ├── SCA_S1A_30_1.png │ ├── SCA_S1A_32_0.png │ ├── SCA_S1A_38_0.png │ ├── SCA_S1A_41_0.png │ ├── SCA_S1A_9_0.png │ ├── SCA_betalactamase_10_0.png │ ├── SCA_betalactamase_16_0.png │ ├── SCA_betalactamase_19_1.png │ ├── SCA_betalactamase_21_1.png │ ├── SCA_betalactamase_23_0.png │ ├── SCA_betalactamase_28_0.png │ ├── SCA_betalactamase_30_0.png │ ├── SCA_betalactamase_8_1.png │ ├── favicon.ico │ ├── github-download-screenshot.png │ └── logo.png │ ├── annotateMSA.rst │ ├── conf.py │ ├── examples.rst │ ├── get_started.rst │ ├── index.rst │ ├── install.rst │ ├── modules.rst │ ├── scaCore.rst │ ├── scaProcessMSA.rst │ ├── scaSectorID.rst │ ├── scaTools.rst │ ├── usage.rst │ └── versions.rst ├── figs ├── BLactamase_sec_hier.png ├── DHFR_decompv2.png ├── DHFR_sec_hier.png ├── Gprot_sec_hier.png └── Gprot_secstruct.png ├── notebooks ├── SCA_DHFR.ipynb ├── SCA_G.ipynb ├── SCA_S1A.ipynb └── 
SCA_betalactamase.ipynb ├── pysca ├── __init__.py ├── scaTools.py └── settings.py ├── scripts ├── getPfamDB.sh ├── rstZipFixUrl.sh └── runAllNBCalcs.sh └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.zip 3 | *.xz 4 | *.gz 5 | .listing 6 | *.sw[ap] 7 | *.DS_Store 8 | *~ 9 | 10 | docs/build 11 | output/ 12 | *.bak 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | pip-wheel-metadata/ 37 | share/python-wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | MANIFEST 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .nox/ 57 | .coverage 58 | .coverage.* 59 | .cache 60 | nosetests.xml 61 | coverage.xml 62 | *.cover 63 | .hypothesis/ 64 | .pytest_cache/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # celery beat schedule file 107 | celerybeat-schedule 108 | 109 | # SageMath parsed files 110 | *.sage.py 111 | 112 | # Environments 113 | .env 114 | .venv 115 | env/ 116 | venv/ 117 | ENV/ 118 | env.bak/ 119 | venv.bak/ 120 | 121 | # Spyder project settings 122 | .spyderproject 123 | .spyproject 124 | 125 | # Rope project settings 126 | .ropeproject 127 | 128 | # mkdocs documentation 129 | /site 130 | 131 | # mypy 132 | .mypy_cache/ 133 | .dmypy.json 134 | dmypy.json 135 | 136 | # Pyre type checker 137 | .pyre/ 138 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: debian:bullseye-slim 2 | 3 | pages: 4 | script: 5 | - apt-get update 6 | - apt-get upgrade -y 7 | - apt-get install -y python3 python3-numpy python3-scipy python3-biopython python3-pip python3-matplotlib python3-sphinx python3-sphinx-rtd-theme 8 | - apt-get install -y make 9 | - pip3 install . 
10 | - mkdir -p docs/modules 11 | - for file in bin/*; do cp "${file}" "docs/modules/`basename $file`.py"; done 12 | - cp pysca/scaTools.py docs/modules 13 | - make -C docs dirhtml 14 | - mv docs/build/dirhtml public/ 15 | artifacts: 16 | paths: 17 | - public 18 | only: 19 | - master 20 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "data"] 2 | path = data 3 | url = https://gitlab.com/ranganathanlab/pySCA-data.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds, Ansel 2 | George 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software without 17 | specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pySCA 2 | 3 | ![Website Build Status](https://gitlab.com/ranganathanlab/pySCA/badges/master/pipeline.svg) 4 | 5 | > 09.2020 6 | > 7 | > Copyright (C) 2019 Olivier Rivoire, Rama Ranganathan, and Kimberly Reynolds 8 | > 9 | > This program is free software distributed under the BSD 3-clause license, 10 | > please see the file LICENSE for details. 11 | 12 | The current version of the Statistical Coupling Analysis (SCA) analysis is 13 | implemented in Python. This directory contains the necessary code for running 14 | the SCA calculations, as well as examples/tutorials for the dihydrofolate 15 | reductase (DHFR) enzyme family, the S1A serine proteases, the small G-protein 16 | family and the Beta-lactamase enzyme family. The tutorials are distributed as 17 | Jupyter notebooks; for details please see: 18 | [https://jupyter.org/](https://jupyter.org/). 19 | 20 | For installation instructions, and an introduction to using the toolbox, please 21 | refer to the website: 22 | 23 | [https://ranganathanlab.gitlab.io/pySCA](https://ranganathanlab.gitlab.io/pySCA) 24 | 25 | or look through the [RST files](docs/source) included with the pySCA 26 | distribution. 
27 | 28 | ## Contents of `/` 29 | 30 | | | | 31 | | :--- | :--- | 32 | | bin/ | Executables for running SCA analysis functions | 33 | | data/ | Input data (including those needed for the tutorials) | 34 | | docs/ | HTML documentation (generated by Sphinx) | 35 | | figs/ | Figures used for the notebooks and documentation | 36 | | notebooks/ | Example SCA notebooks | 37 | | output/ | Output files (empty at install, use `runAllNBCalcs.sh`) | 38 | | pysca/ | Python code for SCA | 39 | | scripts/ | Utility scripts used to generate example data | 40 | 41 | ## Contents of `bin/` 42 | 43 | | | | 44 | | :--- | :--- | 45 | | annotateMSA | Annotates alignments with phylogenetic/taxonomic information | 46 | | scaProcessMSA | Conducts some initial processing of the sequence alignment | 47 | | scaCore | Runs the core SCA calculations | 48 | | scaSectorID | Defines sectors given the results of the calculations in scaCore | 49 | 50 | ## Contents of `pysca/` 51 | 52 | | | | 53 | | :--- | :--- | 54 | | scaTools.py | The SCA toolbox - functions for the SCA calculations | 55 | | settings.py | Global configuration settings for the analysis | 56 | 57 | ## Contents of `notebooks/` 58 | 59 | | | | 60 | | :--- | :--- | 61 | | SCA_DHFR.ipynb | Example for DHFR | 62 | | SCA_G.ipynb | Example for the small G proteins | 63 | | SCA_betalactamase.ipynb | Example for the beta-lactamases | 64 | | SCA_S1A.ipynb | Example for the S1A serine protease | 65 | -------------------------------------------------------------------------------- /VERSIONS.rst: -------------------------------------------------------------------------------- 1 | docs/source/versions.rst -------------------------------------------------------------------------------- /bin/alnChangeDelim: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | A script to change the field delimiter of a multiple sequence alignment (FASTA 4 | format). 
5 | 6 | **Arguments** 7 | Input_MSA.fasta (the alignment to be processed) 8 | 9 | **Keyword Arguments** 10 | --output output file name, default: FilteredAln.fa 11 | --old-delim delimiter separating header fields, default: "_" 12 | --new-delim delimiter separating header fields 13 | 14 | :By: Kim Reynolds 15 | :On: 6.5.2015 16 | 17 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds 18 | 19 | This program is free software distributed under the BSD 3-clause 20 | license, please see the file LICENSE for details. 21 | """ 22 | 23 | import argparse 24 | import sys 25 | import statistics as stat 26 | from pysca import scaTools as sca 27 | 28 | if __name__ == "__main__": 29 | # Parse inputs 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument("alignment", help="Input Sequence Alignment") 32 | parser.add_argument( 33 | "-o", 34 | "--output", 35 | dest="outputfile", 36 | default="output.acc", 37 | help="specify an outputfile name", 38 | ) 39 | parser.add_argument( 40 | "-d", 41 | "--old-delim", 42 | dest="old_delim", 43 | help="specify the field delimiter in the header", 44 | ) 45 | parser.add_argument( 46 | "-n", 47 | "--new-delim", 48 | dest="new_delim", 49 | help="specify the field delimiter in the header", 50 | ) 51 | 52 | options = parser.parse_args() 53 | 54 | if (options.new_delim is None) or (options.old_delim is None): 55 | sys.exit("ERROR: Input and output delimiters must be specified.") 56 | 57 | headers, seqs = sca.readAlg(options.alignment) 58 | 59 | # Check that the old delimiter and new delimiters return a consistent 60 | # number of fields across all sequences. 61 | counts = [] 62 | checks = [] 63 | for i, header in enumerate(headers): 64 | # Check that the old delimiter works. 65 | fields = header.split(options.old_delim) 66 | counts.append(len(fields)) 67 | # Check that the new delimiter is not found inside the fields. 
68 | checks.append(sum([options.new_delim in field for field in fields])) 69 | 70 | # Assume the correct number of fields is the mode of the entire set. 71 | count = stat.mode(counts) 72 | 73 | # Print error messages for each sequences where either the number of fields 74 | # is inconsistent or if the new delimiter is a bad choice given the content 75 | # of the fields. 76 | arewegood = True 77 | for i, header in enumerate(headers): 78 | if counts[i] != count: 79 | print("WARNING: sequence %s has %s fields" % (header, counts[i])) 80 | # arewegood = False 81 | if checks[i] > 0: 82 | print( 83 | "ERROR: delimiter '%s' incompatible with %s" 84 | % (options.new_delim, header) 85 | ) 86 | arewegood = False 87 | 88 | if not arewegood: 89 | sys.exit("Errors found. Output not written.") 90 | 91 | # Write the file if no serious errors are found. 92 | f = open(options.outputfile, "w") 93 | for i, header in enumerate(headers): 94 | fields = header.split(options.old_delim) 95 | f.write(">%s\n" % (options.new_delim).join(fields)) 96 | f.write("%s\n" % seqs[i]) 97 | print("Done. Output written to %s." % options.outputfile) 98 | -------------------------------------------------------------------------------- /bin/alnConvertGI: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | A script to convert GI numbers in the header of a FASTA file to accession 4 | numbers. 5 | 6 | **Arguments** 7 | Input_MSA.fasta (the alignment to be processed) 8 | 9 | **Keyword Arguments** 10 | --output output file name, default: FilteredAln.fa 11 | --delim delimiter separating header fields, default: "_" 12 | --email email to associate with Entrez web API queries 13 | 14 | :By: Kim Reynolds 15 | :On: 6.5.2015 16 | 17 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds 18 | 19 | This program is free software distributed under the BSD 3-clause 20 | license, please see the file LICENSE for details. 
21 | """ 22 | 23 | import argparse 24 | import sys 25 | from Bio import Entrez 26 | from pysca import scaTools as sca 27 | from pysca import settings 28 | 29 | if __name__ == "__main__": 30 | # Parse inputs 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument("alignment", help="Input Sequence Alignment") 33 | parser.add_argument( 34 | "-o", 35 | "--output", 36 | dest="outputfile", 37 | default="output.acc", 38 | help="specify an outputfile name", 39 | ) 40 | parser.add_argument( 41 | "-d", 42 | "--delim", 43 | dest="delim", 44 | default="_", 45 | help="specify the field delimiter in the header", 46 | ) 47 | parser.add_argument( 48 | "-e", 49 | "--entrez_email", 50 | dest="email", 51 | default=None, 52 | help="email address for querying Entrez web API", 53 | ) 54 | 55 | options = parser.parse_args() 56 | 57 | if options.email is None: 58 | Entrez.email = settings.entrezemail 59 | else: 60 | Entrez.email = options.email 61 | 62 | headers, seqs = sca.readAlg(options.alignment) 63 | gis = [h.split(options.delim)[1] for h in headers] 64 | 65 | # Check that the GI numbers are valid. 66 | for i, gi in enumerate(gis): 67 | if not gi.isdigit(): 68 | print("Invalid GI '%s' at line %s. Omitting." % (gi, i)) 69 | gis[i] = "0" # Needs to be a character, not an int. 70 | 71 | gi_blocksize = 50 # more GIs need to be submitted as a POST request 72 | gi_blocks = [ 73 | gis[x : x + gi_blocksize] for x in range(0, len(gis), gi_blocksize) 74 | ] 75 | 76 | # Query the Entrez web API with GI numbers and store the retured accession 77 | # numbers in an array. 
78 | acc_ids = [] 79 | for gi_block in gi_blocks: 80 | handle = Entrez.efetch(db="protein", rettype="acc", id=gi_block) 81 | res = handle.read().splitlines() 82 | handle.close() 83 | if len(res) == len(gi_block): 84 | acc_ids.extend([acc_id if acc_id else "0" for acc_id in res]) 85 | else: 86 | sys.exit("ERROR: Different number of accession IDs returned.") 87 | 88 | # Using '_' as a delimiter is a problem for accession numbers because they 89 | # are often in the form XX_XXXXX.1, meaning the number will be split. If 90 | # the supplied (or defaulted) delimited is '_', convert the delimiter to 91 | # something else. 92 | if options.delim == "_": 93 | print( 94 | "WARNING: '_' is not a good delimiter for accession " 95 | "numbers (e.g. YP_969813.1)." 96 | ) 97 | print("The output will use '___' as a delimiter instead.") 98 | newdelim = "___" 99 | else: 100 | newdelim = options.delim 101 | 102 | # Replace GI field with accession numbers in the headers and write the 103 | # updated alignment to disk. 104 | f = open(options.outputfile, "w") 105 | for i, header in enumerate(headers): 106 | fields = header.split(options.delim) 107 | fields[0] = "ref" 108 | fields[1] = acc_ids[i] 109 | f.write(">%s\n" % (newdelim).join(fields)) 110 | f.write("%s\n" % seqs[i]) 111 | print("Done. Output written to %s." % options.outputfile) 112 | -------------------------------------------------------------------------------- /bin/alnFilterSeqSize: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | A basic script to filter a fasta file of sequences by size - a useful step to 4 | remove partial sequences or sequences that would potentially introduce a large 5 | number of gaps in the alignment. 
This script reads in the alignment, computes 6 | the average sequence length, and outputs a new alignment that keeps sequences 7 | with length strictly within mean +/- tolerance (tolerance default = 50) 8 | 9 | **Arguments** 10 | 11 | Input_MSA.fasta (the alignment to be processed) 12 | 13 | **Keyword Arguments** 14 | 15 | --tolerance, -t allowable sequence length variation (in number of 16 | amino acids), default: 50 17 | --output output file name, default: FilteredAln.fa 18 | 19 | :By: Kim Reynolds 20 | :On: 6.5.2015 21 | 22 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds 23 | 24 | This program is free software distributed under the BSD 3-clause license, 25 | please see the file LICENSE for details. 26 | """ 27 | 28 | import argparse 29 | import numpy as np 30 | from pysca import scaTools as sca 31 | 32 | if __name__ == "__main__": 33 | # Parse inputs 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("alignment", help="Input Sequence Alignment") 36 | parser.add_argument( 37 | "-t", 38 | "--tolerance", 39 | dest="tol", 40 | type=int, 41 | default=50, 42 | help="allowable sequence length variation in number of" 43 | " amino acids (alignment will be trimmed to mean" 44 | " +/-tolerance, default = 50)", 45 | ) 46 | parser.add_argument( 47 | "--output", 48 | dest="outputfile", 49 | default="FilteredAln.fa", 50 | help="specify an outputfile name", 51 | ) 52 | 53 | options = parser.parse_args() 54 | 55 | headers, seqs = sca.readAlg(options.alignment) 56 | seqLen = np.zeros((len(seqs), 1)).astype(int) 57 | for i, k in enumerate(seqs): 58 | seqLen[i] = len(k) 59 | avgLen = seqLen.mean() 60 | print("Average sequence length: %i" % avgLen) 61 | print("Min: %i, Max %i" % (seqLen.min(), seqLen.max())) 62 | minsz = avgLen - options.tol 63 | maxsz = avgLen + options.tol 64 | print("Keeping sequences in the range: %i - %i" % (minsz, maxsz)) 65 | 66 | keepSeqs = list() 67 | keepHeaders = list() 68 | for i, k in enumerate(seqLen): 69 | if (k > minsz) & (k < 
maxsz): 70 | keepSeqs.append(seqs[i]) 71 | keepHeaders.append(headers[i]) 72 | 73 | print("Keeping %i of %i total sequences" % (len(keepSeqs), len(seqLen))) 74 | 75 | f = open(options.outputfile, "w") 76 | for i, k in enumerate(keepSeqs): 77 | f.write(">%s\n" % keepHeaders[i]) 78 | f.write("%s\n" % keepSeqs[i]) 79 | f.close() 80 | -------------------------------------------------------------------------------- /bin/alnParseID: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | A script to parse GI or accession numbers from the headers of an alignment 4 | with typical Blast formatting. 5 | 6 | **Arguments** 7 | -i, --input Input_MSA.fasta (the alignment to be processed) 8 | 9 | **Keyword Arguments** 10 | --output output file name, default: <input basename>.<ID type> 11 | --delim delimiter for fields in the header for each sequence, 12 | default: '_' 13 | 14 | :By: Kim Reynolds 15 | :On: 6.5.2015 16 | 17 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds 18 | 19 | This program is free software distributed under the BSD 3-clause 20 | license, please see the file LICENSE for details. 
21 | """ 22 | 23 | import argparse 24 | import os 25 | import sys 26 | from pysca import scaTools as sca 27 | 28 | if __name__ == "__main__": 29 | # Parse inputs 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument( 32 | "-i", 33 | "--input", 34 | dest="alignment", 35 | required=True, 36 | help="input sequence alignment", 37 | ) 38 | parser.add_argument( 39 | "-o", 40 | "--output", 41 | dest="outputfile", 42 | default=None, 43 | help="specify an outputfile name", 44 | ) 45 | parser.add_argument( 46 | "-d", 47 | "--delim", 48 | dest="delim", 49 | default="_", 50 | help="specify the field delimiter in the header", 51 | ) 52 | parser.add_argument( 53 | "-t", 54 | "--type", 55 | dest="type", 56 | default="gi", 57 | required=True, 58 | help="type of identifier to parse out of the header " 59 | "('gi' or 'acc')", 60 | ) 61 | options = parser.parse_args() 62 | 63 | # Read in the MSA. 64 | headers, seqs = sca.readAlg(options.alignment) 65 | 66 | # Get index of accession number in the header fields. 67 | if options.type == "gi": 68 | separator = "gi" 69 | elif options.type == "acc" or options.type == "ref": 70 | separator = "ref" 71 | else: 72 | sys.exit("ID type %s not known" % options.type) 73 | 74 | try: 75 | # acc_idx = (headers[0].split(options.delim)).index('res') + 1 76 | acc_idx = (headers[0].split(options.delim)).index(separator) + 1 77 | except BaseException as e: 78 | print("ERROR: %s" % e) 79 | sys.exit("Accession field not found in %s." % options.alignment) 80 | 81 | acc_ids = [h.split(options.delim)[acc_idx] for h in headers] 82 | 83 | if options.outputfile: 84 | outputfile = options.outputfile 85 | else: 86 | outputfile = ( 87 | os.path.splitext(options.alignment)[0] + "." 
+ options.type 88 | ) 89 | 90 | f = open(outputfile, "w") 91 | for acc_id in acc_ids: 92 | f.write("%s\n" % acc_id) 93 | f.close() 94 | -------------------------------------------------------------------------------- /bin/alnReplaceHeaders: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | A script that replaces the headers of one FASTA file with headers from another. 4 | It assumes the sequences in the two FASTA files are in identical order. This is 5 | useful, for example, when working with Promals3D alignments (which often have 6 | the header information truncated). 7 | 8 | **Arguments** 9 | Headers.fasta (Alignment that is providing the headers) 10 | Sequences.fasta (Alignment that is providing the sequences) 11 | 12 | **Keyword Arguments** 13 | --headers header alignment file name 14 | --seqs sequences alignment file name 15 | --output output file name, default: FixedHeaders.fa 16 | 17 | :By: Kim Reynolds 18 | :On: 6.5.2015 19 | 20 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds 21 | 22 | This program is free software distributed under the BSD 3-clause 23 | license, please see the file LICENSE for details. 24 | """ 25 | 26 | import argparse 27 | import sys 28 | from pysca import scaTools as sca 29 | 30 | if __name__ == "__main__": 31 | # Parse inputs 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument( 34 | "-r", 35 | "--headers", 36 | dest="alg_headers", 37 | help="alignment providing the headers", 38 | ) 39 | parser.add_argument( 40 | "-s", 41 | "--sequences", 42 | dest="alg_seqs", 43 | help="alignment providing the sequences", 44 | ) 45 | parser.add_argument( 46 | "-o", 47 | "--output", 48 | dest="outputfile", 49 | default="FixedHeaders.fa", 50 | help="specify an outputfile name", 51 | ) 52 | options = parser.parse_args() 53 | 54 | print("WARNING: This script assumes that the headers of the two input") 55 | print("FASTA files are in IDENTICAL order. 
If this is NOT true, the") 56 | print("script will give incorrect results.") 57 | 58 | if (options.alg_headers is None) or (options.alg_seqs is None): 59 | sys.exit("Incorrect usage. (See `alnReplaceHeaders.py --help`)") 60 | 61 | headers1, seqs1 = sca.readAlg(options.alg_headers) 62 | headers2, seqs2 = sca.readAlg(options.alg_seqs) 63 | 64 | if len(seqs2) != len(headers1): 65 | sys.exit("ERROR: The length of the two alignments does not match.") 66 | 67 | f = open(options.outputfile, "w") 68 | for i, k in enumerate(headers1): 69 | f.write(">%s\n" % k) 70 | f.write("%s\n" % seqs2[i]) 71 | f.close() 72 | -------------------------------------------------------------------------------- /bin/annotateMSA: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | The annotateMSA script provides utilities to automatically annotate sequence 4 | headers (for a FASTA file) with taxonomic information. Currently this can be 5 | done in one of two ways: 6 | 7 | 1) For Pfam alignments, annotations can be extracted from the file 8 | pfamseq.txt (please download from: 9 | ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pfamseq.txt.gz) 10 | 11 | 2) For Blast alignments, annotations can be added using the NCBI Entrez 12 | utilities provided by BioPython. They can be based on GI or accession 13 | numbers that are used to query NCBI for taxonomy information (note that 14 | this approach requires a network connection). 15 | 16 | To extract GI or accession numbers, use the scripts alnParseGI.py or 17 | alnParseAcc.py, respectively. 18 | 19 | For both the Pfam and NCBI utilities, the process of sequence annotation *can 20 | be slow* (on the order of hours, particularly for NCBI entrez with larger 21 | alignments). However, the annotation process only needs to be run once per 22 | alignment. 
23 | 24 | **Keyword Arguments** 25 | -i, --input Some input sequence alignment, Default: Input_MSA.fasta 26 | -o, --output Specify an output file, Default: Output_MSA.an 27 | -a, --annot Annotation method. Options are 'pfam' or 'ncbi'. 28 | Default: 'pfam' 29 | -l, --idList This argument is necessary for the 'ncbi' method. 30 | Specifies a file containing a list of GI numbers 31 | corresponding to the sequence order in the alignment; a 32 | number of "0" indicates that a GI number wasn't 33 | assigned for a particular sequence. 34 | -g, --giList Deprecated. Identical to '--idList' and kept to keep 35 | the CLI consistent with older versions of pySCA. 36 | -p, --pfam_seq Location of the pfamseq.txt file. Defaults to 37 | path2pfamseq (specified at the top of scaTools.py) 38 | -m, --delimiter Character(s) used for separating fields in the sequence 39 | headers of the annotated output. Default: '|' 40 | 41 | **Examples**:: 42 | 43 | annotateMSA -i PF00186_full.txt -o PF00186_full.an -a 'pfam' 44 | annotateMSA -i DHFR_PEPM3.fasta -o DHFR_PEPM3.an -a 'ncbi' -l DHFR_PEPM3.gi 45 | 46 | :By: Rama Ranganathan, Kim Reynolds 47 | :On: 9.22.2014 48 | 49 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds 50 | 51 | This program is free software distributed under the BSD 3-clause license, 52 | please see the file LICENSE for details. 53 | """ 54 | 55 | import sys 56 | import argparse 57 | import os 58 | from pysca import scaTools as sca 59 | from pysca import settings 60 | 61 | if __name__ == "__main__": 62 | 63 | # parse inputs 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument( 66 | "-i", 67 | "--input", 68 | required=True, 69 | dest="Input_MSA", 70 | help="input sequence alignment", 71 | ) 72 | parser.add_argument( 73 | "-o", 74 | "--output", 75 | dest="output", 76 | default="Output.an", 77 | help="Outputfile name. 
Default: Output.an", 78 | ) 79 | parser.add_argument( 80 | "-a", 81 | "--annot", 82 | dest="annot", 83 | default="pfam", 84 | help="Annotation method. Options are 'pfam' or 'ncbi'." 85 | " Default: 'pfam'", 86 | ) 87 | parser.add_argument( 88 | "-l", 89 | "--idList", 90 | dest="idList", 91 | default=None, 92 | help="This argument is necessary for the 'ncbi' " 93 | "method. Specifies a file containing a list of " 94 | "GI or accession numbers corresponding to the " 95 | "sequence order in the alignment; a number of 0 " 96 | "indicates that one wasn't assigned for a " 97 | "particular sequence.", 98 | ) 99 | parser.add_argument( 100 | "-g", 101 | "--giList", 102 | dest="idList", 103 | default=None, 104 | help="Command kept for compatibility with previous " 105 | "versions. Use '-l' or '--idList' instead.", 106 | ) 107 | parser.add_argument( 108 | "-p", 109 | "--pfam_seq", 110 | dest="pfamseq", 111 | default=None, 112 | help="Location of the pfamseq.txt file. Defaults to " 113 | "path2pfamseq (specified in settings.py)", 114 | ) 115 | parser.add_argument( 116 | "-d", 117 | "--pfam_db", 118 | dest="pfamdb", 119 | default=None, 120 | help="Location of the pfamseq.db file. Priority over " 121 | "pfamseq.txt file. Defaults to path2pfamseqdb " 122 | "(specified in settings.py)", 123 | ) 124 | parser.add_argument( 125 | "-e", 126 | "--entrez_email", 127 | dest="email", 128 | default=None, 129 | help="email address for querying Entrez web API", 130 | ) 131 | parser.add_argument( 132 | "-m", 133 | "--delimiter", 134 | dest="delimiter", 135 | default="|", 136 | help="delimiter for fields for generated FASTA files.", 137 | ) 138 | options = parser.parse_args() 139 | 140 | if (options.annot != "pfam") & (options.annot != "ncbi"): 141 | sys.exit( 142 | "The option -a must be set to 'pfam' or 'ncbi' - other" 143 | " keywords are not allowed." 
144 | ) 145 | 146 | if options.annot == "ncbi": 147 | if (options.idList is None) and (options.giList is None): 148 | sys.exit( 149 | "To use NCBI Entrez annotation, you must specify a file " 150 | "containing a list of GI numbers (see the --idList " 151 | "argument)." 152 | ) 153 | 154 | if options.annot == "pfam": 155 | # Annotate a Pfam alignment 156 | if options.pfamdb is not None: # default to db query over txt search 157 | sca.AnnotPfamDB( 158 | options.Input_MSA, 159 | options.output, 160 | options.pfamdb, 161 | options.delimiter, 162 | ) 163 | elif options.pfamseq is not None: 164 | sca.AnnotPfam( 165 | options.Input_MSA, 166 | options.output, 167 | options.pfamseq, 168 | options.delimiter, 169 | ) 170 | else: 171 | # If no database or text file supplied to annotateMSA, then default 172 | # to the files defined in settings.py. 173 | if os.path.exists(settings.path2pfamseqdb): 174 | sca.AnnotPfamDB( 175 | options.Input_MSA, options.output, options.delimiter 176 | ) 177 | elif os.path.exists(settings.path2pfamseq): 178 | sca.AnnotPfam( 179 | options.Input_MSA, options.output, options.delimiter 180 | ) 181 | else: 182 | sys.exit("No Pfam file found. Exiting.") 183 | elif options.annot == "ncbi": 184 | # Annotate using GI numbers/NCBI Entrez 185 | if options.email is None: 186 | sca.AnnotNCBI(options.Input_MSA, options.output, options.idList) 187 | else: 188 | sca.AnnotNCBI( 189 | options.Input_MSA, 190 | options.output, 191 | options.idList, 192 | options.email, 193 | options.delimiter, 194 | ) 195 | -------------------------------------------------------------------------------- /bin/scaCore: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | The scaCore script runs the core calculations for SCA, and stores the output 4 | using the Python tool pickle. 
These calculations can be divided into two parts: 5 | 6 | 1) Sequence correlations: 7 | 8 | a) Compute simMat = the global sequence similarity matrix for the 9 | alignment 10 | b) Compute Useq and Uica = the eigenvectors (and independent 11 | components) for the following sequence correlation matrices: 12 | 13 | * unweighted (:math:`U^0`) 14 | * sequence weights applied (:math:`U^1`) 15 | * both sequence and position weights applied (:math:`U^2`) 16 | 17 | 2) Positional correlations: 18 | 19 | a) Compute the single-site position weights and positional conservation 20 | values (:math:`D_i` and :math:`D_i^a`) 21 | b) Compute the dimension-reduced SCA correlation matrix 22 | :math:`\\tilde{C_{ij}}`, the projected alignment :math:`tX`, and the 23 | projector 24 | c) Compute Ntrials of the randomized SCA matrix, and the eigenvectors 25 | and eigenvalues associated with each 26 | 27 | **Arguments** 28 | 29 | **Keyword Arguments** 30 | -i \*.db (the database produced by running scaProcessMSA) 31 | -n norm type for dimension-reducing the sca matrix. Options 32 | are: 'spec' (the spectral norm) or 'frob' (frobenius 33 | norm). Default: frob 34 | -l lambda parameter for pseudo-counting the alignment. 35 | Default: 0.03 36 | --Ntrials, -t number of randomization trials 37 | --matlab, -m write out the results of these calculations to a MATLAB 38 | workspace for further analysis 39 | 40 | **Example**:: 41 | 42 | scaCore -i PF00071_full.db 43 | 44 | :By: Rama Ranganathan, Kim Reynolds 45 | :On: 8.5.2014 46 | 47 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds 48 | 49 | This program is free software distributed under the BSD 3-clause license, 50 | please see the file LICENSE for details. 
51 | """ 52 | 53 | import sys 54 | import time 55 | import os 56 | import pickle 57 | import argparse 58 | from scipy.io import savemat 59 | from pysca import scaTools as sca 60 | 61 | if __name__ == "__main__": 62 | 63 | # Parse inputs 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument( 66 | "-i" "--input", 67 | dest="inputdb", 68 | required=True, 69 | help="database from running scaProcessMSA", 70 | ) 71 | parser.add_argument( 72 | "-o" "--output", 73 | dest="outputdb", 74 | default=None, 75 | help="output file for core calculations", 76 | ) 77 | parser.add_argument( 78 | "-n", 79 | dest="norm", 80 | default="frob", 81 | help="norm type for dimension-reducing the sca matrix." 82 | "Options are: 'spec' (the spectral norm) or " 83 | "'frob' (frobenius norm). Default: frob", 84 | ) 85 | parser.add_argument( 86 | "-t", 87 | "--Ntrials", 88 | dest="Ntrials", 89 | default=10, 90 | type=int, 91 | help="number of randomization trials", 92 | ) 93 | parser.add_argument( 94 | "-l", 95 | dest="lbda", 96 | default=0.03, 97 | type=float, 98 | help="lambda parameter for pseudo-counting the " 99 | "alignment. Default: 0.03", 100 | ) 101 | parser.add_argument( 102 | "-q", 103 | dest="kseq", 104 | default=30, 105 | type=int, 106 | help="number of eigenvectors to computes in sequence matrix. " 107 | "Default: 30", 108 | ) 109 | parser.add_argument( 110 | "-c", 111 | dest="kica", 112 | default=15, 113 | type=int, 114 | help="number of independent components to compute from sequence " 115 | "alignment matrix. 
Default: 15", 116 | ) 117 | parser.add_argument( 118 | "-m", 119 | "--matlab", 120 | dest="matfile", 121 | action="store_true", 122 | default=False, 123 | help="write out the results of these calculations to " 124 | "a MATLAB workspace for further analysis.", 125 | ) 126 | options = parser.parse_args() 127 | 128 | if (options.norm != "frob") & (options.norm != "spec"): 129 | sys.exit( 130 | "The option -n must be set to 'frob' or 'spec' - other " 131 | "keywords are not allowed." 132 | ) 133 | 134 | # extract the necessary stuff from the database... 135 | db_in = pickle.load(open(options.inputdb, "rb")) 136 | D_in = db_in["sequence"] 137 | 138 | msa_num = D_in["msa_num"] 139 | seqw = D_in["seqw"] 140 | Nseq = D_in["Nseq"] 141 | Npos = D_in["Npos"] 142 | ats = D_in["ats"] 143 | hd = D_in["hd"] 144 | 145 | # sequence analysis 146 | print("Computing the sequence projections.") 147 | Useq, Uica = sca.seqProj( 148 | msa_num, seqw, kseq=options.kseq, kica=options.kica 149 | ) 150 | 151 | print("Computing sequence similarity matrix.") 152 | simMat = sca.seqSim(msa_num) 153 | 154 | # SCA calculations 155 | print("Computing the SCA conservation and correlation values.") 156 | Wia, Dia, Di = sca.posWeights(msa_num, seqw, options.lbda) 157 | Csca, tX, Proj = sca.scaMat(msa_num, seqw, options.norm, options.lbda) 158 | 159 | # Matrix randomizations 160 | print("Computing matrix randomizations...") 161 | start = time.time() 162 | Vrand, Lrand, Crand = sca.randomize( 163 | msa_num, options.Ntrials, seqw, options.lbda 164 | ) 165 | end = time.time() 166 | print( 167 | "Randomizations complete, %i trials, time: %.1f minutes" 168 | % (options.Ntrials, (end - start) / 60) 169 | ) 170 | 171 | # saving... 
172 | if options.outputdb is None: 173 | fn = os.path.basename(options.inputdb) 174 | output_path = os.path.abspath(os.path.dirname(options.inputdb)) 175 | else: 176 | fn = os.path.basename(options.outputdb) 177 | output_path = os.path.abspath(os.path.dirname(options.outputdb)) 178 | fn_noext = os.path.splitext(fn)[0] 179 | 180 | D = {} 181 | D["Useq"] = Useq 182 | D["Uica"] = Uica 183 | D["simMat"] = simMat 184 | D["lbda"] = options.lbda 185 | D["Dia"] = Dia 186 | D["Di"] = Di 187 | D["Csca"] = Csca 188 | D["tX"] = tX 189 | D["Proj"] = Proj 190 | D["Ntrials"] = options.Ntrials 191 | D["Vrand"] = Vrand 192 | D["Lrand"] = Lrand 193 | D["Crand"] = Crand 194 | 195 | db = {} 196 | db["sequence"] = D_in 197 | db["sca"] = D 198 | 199 | print( 200 | "Calculations complete, writing to database file " 201 | + os.path.join(output_path, fn_noext) 202 | ) 203 | pickle.dump(db, open(os.path.join(output_path, fn_noext) + ".db", "wb")) 204 | 205 | if options.matfile: 206 | savemat( 207 | os.path.join(output_path, fn_noext) + ".mat", 208 | db, 209 | appendmat=True, 210 | oned_as="column", 211 | ) 212 | -------------------------------------------------------------------------------- /bin/scaProcessMSA: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | The scaProcessMSA script conducts the basic steps in multiple sequence 4 | alignment (MSA) pre-processing for SCA, and stores the results using the python 5 | tool pickle: 6 | 7 | 1) Trim the alignment, either by truncating to a reference sequence 8 | (specified with the -t flag) or by removing excessively gapped 9 | positions (set to positions with more than 40% gaps) 10 | 11 | 2) Identify/designate a reference sequence in the alignment, and create a 12 | mapping of the alignment numberings to position numberings for the 13 | reference sequence. 
The reference sequence can be specified in one of 14 | four ways: 15 | 16 | a) By supplying a PDB file - in this case, the reference sequence 17 | is taken from the PDB (see the pdb kwarg) 18 | 19 | b) By supplying a reference sequence directly (as a fasta file - 20 | see the refseq kwarg) 21 | 22 | c) By supplying the index of the reference sequence in the 23 | alignment (see the refseq kwarg) 24 | 25 | d) If no reference sequence is supplied by the user, one is 26 | automatically selected using the scaTools function chooseRef. 27 | 28 | The position numbers (for mapping the alignment) can be specified in 29 | one of three ways: 30 | 31 | a) By supplying a PDB file - in this case the alignment positions 32 | are mapped to structure positions 33 | 34 | b) By supplying a list of reference positions (see the refpos 35 | kwarg) 36 | 37 | c) If no reference positions are supplied by the user, sequential 38 | numbering (starting at 1) is assumed. 39 | 40 | 3) Filter sequences to remove highly gapped sequences, and sequences with 41 | an identity below or above some minimum or maximum value to the 42 | reference sequence (see the parameters kwarg) 43 | 4) Filter positions to remove highly gapped positions (default 20% gaps, 44 | can also be set using --parameters) 45 | 5) Calculate sequence weights and write out the final alignment and other 46 | variables 47 | 48 | **Key Arguments** 49 | --alignment, -a Input_MSA.fasta (the alignment to be processed, 50 | typically the headers contain taxonomic information for 51 | the sequences). 
52 | --pdb, -s PDB identifier (ex: 1RX2) 53 | --pdbdir, -b directory where PDB files are stored 54 | --chainID, -c chain ID in the PDB for the reference sequence 55 | --species, -f species of the reference sequence 56 | --refseq, -r reference sequence, supplied as a fasta file 57 | --refpos, -o reference positions, supplied as a text file with one 58 | position specified per line 59 | --refindex, -i reference sequence number in the alignment, COUNTING 60 | FROM 0 61 | --parameters, -p list of parameters for filtering the alignment: 62 | [max_frac_gaps for positions, max_frac_gaps for 63 | sequences, min SID to reference seq, max SID to 64 | reference seq] 65 | default values: [0.2, 0.2, 0.2, 0.8] (see filterPos and 66 | filterSeq functions for details) 67 | --selectSeqs, -n subsample the alignment to (1.5 * the number of 68 | effective sequences) to reduce computational time, 69 | default: False 70 | --truncate, -t truncate the alignment to the positions in the reference 71 | PDB, default: False 72 | --matlab, -m write out the results of this script to a matlab 73 | workspace for further analysis 74 | --dest, -d destination for output files 75 | 76 | **Example**:: 77 | 78 | scaProcessMSA -a PF00071_full.an -s 5P21 -c A -f 'Homo sapiens' 79 | 80 | :By: Rama Ranganathan 81 | :On: 8.5.2014 82 | 83 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds 84 | 85 | This program is free software distributed under the BSD 3-clause license, 86 | please see the file LICENSE for details. 
87 | """ 88 | 89 | import sys 90 | import os 91 | import pickle 92 | import argparse 93 | import numpy as np 94 | from scipy.io import savemat 95 | from pysca import scaTools as sca 96 | from pysca import settings 97 | 98 | if __name__ == "__main__": 99 | # Parse inputs 100 | parser = argparse.ArgumentParser() 101 | parser.add_argument( 102 | "-a", 103 | "--alignment", 104 | dest="alignment", 105 | required=True, 106 | help="Input Sequence Alignment", 107 | ) 108 | parser.add_argument( 109 | "-d", 110 | "--dest", 111 | dest="destination", 112 | default=None, 113 | help="specify an output directory", 114 | ) 115 | parser.add_argument( 116 | "-s", "--pdb", dest="pdbid", help="PDB identifier (ex: 1RX2)" 117 | ) 118 | parser.add_argument( 119 | "-b", 120 | "--pdbdir", 121 | dest="pdbdir", 122 | default=None, 123 | help="directory where PDBs are stored", 124 | ) 125 | parser.add_argument( 126 | "-c", 127 | "--chainID", 128 | dest="chainID", 129 | default="A", 130 | help="chain ID in the PDB for the reference sequence", 131 | ) 132 | parser.add_argument( 133 | "-f", 134 | "--species", 135 | dest="species", 136 | help="species of the reference sequence", 137 | ) 138 | parser.add_argument( 139 | "-r", 140 | "--refseq", 141 | dest="refseq", 142 | help="reference sequence, supplied as a fasta file", 143 | ) 144 | parser.add_argument( 145 | "-o", 146 | "--refpos", 147 | dest="refpos", 148 | help="reference positions, supplied as a text file " 149 | "with one position specified per line", 150 | ) 151 | parser.add_argument( 152 | "-i", 153 | "--refindex", 154 | dest="i_ref", 155 | type=int, 156 | help="reference sequence number in the alignment, " "COUNTING FROM 0", 157 | ) 158 | parser.add_argument( 159 | "-p", 160 | "--parameters", 161 | dest="parameters", 162 | default=[0.2, 0.2, 0.2, 0.8], 163 | type=float, 164 | nargs=4, 165 | help="list of parameters for filtering the alignment: " 166 | "[max_frac_gaps for positions, max_frac_gaps for " 167 | "sequences, min SID to 
reference seq, max SID to " 168 | "reference seq] default values: [0.2, 0.2, 0.2, " 169 | "0.8] (see filterPos and filterSeq functions for " 170 | "details).", 171 | ) 172 | parser.add_argument( 173 | "-n", 174 | "--selectSeqs", 175 | action="store_true", 176 | dest="Nselect", 177 | default=False, 178 | help="subsample the alignment to (1.5 * the number of " 179 | "effective sequences) to reduce computational " 180 | "time, default: False", 181 | ) 182 | parser.add_argument( 183 | "-t", 184 | "--truncate", 185 | action="store_true", 186 | dest="truncate", 187 | default=False, 188 | help="truncate the alignment to the positions in the " 189 | "reference PDB, default: False", 190 | ) 191 | parser.add_argument( 192 | "-m", 193 | "--matlab", 194 | action="store_true", 195 | dest="matfile", 196 | default=False, 197 | help="write out the results of this script to a matlab" 198 | " workspace for further analysis", 199 | ) 200 | options = parser.parse_args() 201 | 202 | # A little bit of error checking/feedback for the user. 203 | if options.i_ref is None: 204 | if options.species is not None and options.pdbid is None: 205 | print("No PDBid, ignoring species...") 206 | options.species = None 207 | if options.refseq is not None and options.refpos is None: 208 | print( 209 | "Using reference sequence but no position list provided! " 210 | "Just numbering positions 1 to length(sequence)" 211 | ) 212 | if options.pdbid is not None: 213 | print("And...ignoring the PDB file...") 214 | options.pdbid = None 215 | options.refpos = [i + 1 for i in range(len(options.refseq))] 216 | if options.refseq is not None and options.refpos is not None: 217 | print("Using the reference sequence and position list...") 218 | if options.pdbid is not None: 219 | print("And...ignoring the PDB file...") 220 | options.pdbid = None 221 | else: 222 | i_ref = options.i_ref 223 | 224 | # Pick an output directory. 
225 | if options.destination is None: 226 | if settings.path2output is None: 227 | destination = os.getcwd() 228 | else: 229 | destination = os.path.abspath(settings.path2output) 230 | else: 231 | destination = os.path.abspath(options.destination) 232 | 233 | if not os.path.exists(destination): 234 | os.makedirs(destination) 235 | 236 | # Set the directory where PDB files are stored. 237 | if options.pdbdir is None: 238 | pdbdir = settings.path2structures 239 | elif os.path.exists(options.pdbdir): 240 | pdbdir = options.pdbdir 241 | else: 242 | sys.exit("PDB directory '%s/' not found." % options.pdbdir) 243 | 244 | # Read in initial alignment 245 | headers_full, sequences_full = sca.readAlg(options.alignment) 246 | print( 247 | "Loaded alignment of %i sequences, %i positions." 248 | % (len(headers_full), len(sequences_full[0])) 249 | ) 250 | 251 | if options.i_ref is not None: 252 | ref_header = headers_full[options.i_ref] 253 | ref_sequence = (sequences_full[options.i_ref]).replace(".", "-") 254 | 255 | # Check the alignment and remove sequences containing non-standard amino 256 | # acids 257 | print("Checking alignment for non-standard amino acids") 258 | alg_out = list() 259 | hd_out = list() 260 | for i, k in enumerate(sequences_full): 261 | flag = 0 262 | l = k.replace(".", "-") 263 | for j, aa in enumerate(l): 264 | if aa not in "ACDEFGHIKLMNPQRSTVWY-": 265 | flag = 1 266 | if flag == 0: 267 | alg_out.append(l) 268 | hd_out.append(headers_full[i]) 269 | headers_full = hd_out 270 | sequences_full = alg_out 271 | print( 272 | "Aligment size after removing sequences with non-standard amino " 273 | "acids: %i" % (len(sequences_full)) 274 | ) 275 | 276 | # Do an initial trimming to remove excessively gapped positions - this is 277 | # critical for building a correct ATS 278 | print("Trimming alignment for highly gapped positions (80% or more).") 279 | alg_out, poskeep = sca.filterPos(sequences_full, [1], 0.8) 280 | sequences_ori = sequences_full 281 | 
sequences_full = alg_out 282 | print( 283 | "Alignment size post-trimming: %i positions" % len(sequences_full[0]) 284 | ) 285 | 286 | if options.i_ref is not None: 287 | ref_sequence = "".join([ref_sequence[i] for i in poskeep]) 288 | 289 | # If i_ref is directly provided, we use it, ignoring all else. 290 | # Otherwise, we explore the other ways of specifying a reference 291 | # sequences: (1) providing a PDBid (chainID defaults to 'A'), (2) 292 | # providing the protein sequence with position numbers (defaults to 293 | # just sequence numbering). If none of these is provided, we just make 294 | # an alignment based numbering for ats. If a PDBid is provided, there 295 | # is an option to also provide species information to permit 296 | # identifying the reference sequence in the MSA without use of external 297 | # packages for fast pairwise alignments. 298 | 299 | print("Looking for PDBs in %s" % pdbdir) 300 | 301 | if options.i_ref is None: 302 | if options.pdbid is not None: 303 | try: 304 | seq_pdb, ats_pdb, dist_pdb = sca.pdbSeq( 305 | options.pdbid, options.chainID, pdbdir 306 | ) 307 | if options.species is not None: 308 | try: 309 | print( 310 | "Finding reference sequence using species-based" 311 | " best match.." 312 | ) 313 | i_ref = sca.MSAsearch( 314 | headers_full, 315 | sequences_full, 316 | seq_pdb, 317 | options.species, 318 | ) 319 | options.i_ref = i_ref 320 | print("reference sequence index is: %i" % (i_ref)) 321 | print(headers_full[i_ref]) 322 | print(sequences_full[i_ref]) 323 | except BaseException as e: 324 | print("Error: " + str(e)) 325 | print( 326 | "Cant find the reference sequence using" 327 | " species-based best_match! Using global" 328 | " MSAsearch..." 
329 | ) 330 | try: 331 | i_ref = sca.MSAsearch( 332 | headers_full, sequences_full, seq_pdb 333 | ) 334 | options.i_ref = i_ref 335 | print("reference sequence index is: %i" % (i_ref)) 336 | print(headers_full[i_ref]) 337 | print(sequences_full[i_ref]) 338 | except BaseException as e: 339 | print("Error: " + str(e)) 340 | sys.exit("Error! Can't find reference sequence...") 341 | else: 342 | try: 343 | print( 344 | "Finding reference sequence using global" 345 | " MSAsearch..." 346 | ) 347 | i_ref = sca.MSAsearch( 348 | headers_full, sequences_full, seq_pdb 349 | ) 350 | options.i_ref = i_ref 351 | print("reference sequence index is: %i" % (i_ref)) 352 | print(headers_full[i_ref]) 353 | print(sequences_full[i_ref]) 354 | except BaseException as e: 355 | print("Error: " + str(e)) 356 | sys.exit("Error!! Can't find reference sequence...") 357 | sequences, ats = sca.makeATS( 358 | sequences_full, ats_pdb, seq_pdb, i_ref, options.truncate 359 | ) 360 | dist_new = np.zeros((len(ats), len(ats))) 361 | for (j, pos1) in enumerate(ats): 362 | for (k, pos2) in enumerate(ats): 363 | if k != j: 364 | if (pos1 == "-") or (pos2 == "-"): 365 | dist_new[j, k] == 1000 366 | else: 367 | ix_j = ats_pdb.index(pos1) 368 | ix_k = ats_pdb.index(pos2) 369 | dist_new[j, k] = dist_pdb[ix_j, ix_k] 370 | dist_pdb = dist_new 371 | except BaseException as e: 372 | print("Error: " + str(e)) 373 | sys.exit("Error!!! Something wrong with PDBid or path...") 374 | elif options.refseq is not None: 375 | print( 376 | "Finding reference sequence using provided sequence" " file..." 
377 | ) 378 | try: 379 | h_tmp, s_tmp = sca.readAlg(options.refseq) 380 | i_ref = sca.MSAsearch(headers_full, sequences_full, s_tmp[0]) 381 | options.i_ref = i_ref 382 | print("reference sequence index is: %i" % (i_ref)) 383 | print(headers_full[i_ref]) 384 | if options.refpos is not None: 385 | try: 386 | f = open(options.refpos, "r") 387 | ats_tmp = [line.rstrip("\n") for line in f] 388 | f.close() 389 | except BaseException as e: 390 | print("Error: " + str(e)) 391 | print( 392 | "Error reading reference position file! Using" 393 | " default numbering 1 to number of positions" 394 | ) 395 | ats_tmp = [i + 1 for i in range(len(sequences[0]))] 396 | else: 397 | print( 398 | "No reference position list provided. Using" 399 | " default numbering 1 to number of positions" 400 | ) 401 | ats_tmp = [i + 1 for i in range(len(sequences[0]))] 402 | sequences, ats = sca.makeATS( 403 | sequences_full, ats_tmp, s_tmp[0], i_ref, options.truncate 404 | ) 405 | except BaseException as e: 406 | print("Error: " + str(e)) 407 | sys.exit("Error!! Can't find reference sequence...") 408 | else: 409 | msa_num = sca.lett2num(sequences_full) 410 | i_ref = sca.chooseRefSeq(sequences_full) 411 | print( 412 | "No reference sequence given, chose as default (%i): %s" 413 | % (i_ref, headers_full[i_ref]) 414 | ) 415 | sequences = sequences_full 416 | ats = [i + 1 for i in range(len(sequences[0]))] 417 | else: 418 | print("using provided reference index %i" % (i_ref)) 419 | print(ref_header) 420 | s_tmp = ref_sequence 421 | try: 422 | if options.refpos is not None: 423 | f = open(options.refpos, "r") 424 | ats_tmp = [line.rstrip("\n") for line in f] 425 | f.close() 426 | else: 427 | print("here!") 428 | ats_tmp = [i + 1 for i in range(len(s_tmp))] 429 | sequences, ats = sca.makeATS( 430 | sequences_full, ats_tmp, s_tmp, i_ref, options.truncate 431 | ) 432 | except BaseException as e: 433 | print("Error: " + str(e)) 434 | sys.exit("Error!! 
Can't find reference sequence...") 435 | 436 | # Filtering sequences and positions, calculations of effective number of 437 | # seqs 438 | print( 439 | "Conducting sequence and position filtering: alignment size is %i" 440 | " seqs, %i pos" % (len(sequences), len(sequences[0])) 441 | ) 442 | if options.pdbid is not None: 443 | print( 444 | "ATS and distmat size - ATS: %i, distmat: %i x %i" 445 | % (len(ats), len(dist_pdb), len(dist_pdb[0])) 446 | ) 447 | else: 448 | print( 449 | "ATS should also have %i positions - ATS: %i" 450 | % (len(sequences[0]), len(ats)) 451 | ) 452 | 453 | if i_ref is not None: 454 | alg0, seqw0, seqkeep = sca.filterSeq( 455 | sequences, 456 | i_ref, 457 | max_fracgaps=options.parameters[1], 458 | min_seqid=options.parameters[2], 459 | max_seqid=options.parameters[3], 460 | ) 461 | else: 462 | alg0, seqw0, seqkeep = sca.filterSeq( 463 | sequences, 464 | max_fracgaps=options.parameters[1], 465 | min_seqid=options.parameters[2], 466 | max_seqid=options.parameters[3], 467 | ) 468 | 469 | headers = [headers_full[s] for s in seqkeep] 470 | alg1, iposkeep = sca.filterPos(alg0, seqw0, options.parameters[0]) 471 | ats = [ats[i] for i in iposkeep] 472 | if options.pdbid is not None: 473 | distmat = dist_pdb[np.ix_(iposkeep, iposkeep)] 474 | effseqsprelimit = int(seqw0.sum()) 475 | Nseqprelimit = len(alg1) 476 | print( 477 | "After filtering: alignment size is %i seqs, %i effective seqs, %i" 478 | " pos" % (len(alg1), effseqsprelimit, len(alg1[0])) 479 | ) 480 | 481 | # Limitation of total sequences to [1.5 * # of effective sequences] if 482 | # Nselect is set to True 483 | if options.Nselect: 484 | seqsel = sca.randSel( 485 | seqw0, int(1.5 * effseqsprelimit), [seqkeep.index(i_ref)] 486 | ) 487 | alg = [alg1[s] for s in seqsel] 488 | hd = [headers[s] for s in seqsel] 489 | else: 490 | alg = alg1 491 | hd = headers 492 | 493 | # Calculation of final MSA, sequence weights 494 | seqw = sca.seqWeights(alg) 495 | effseqs = seqw.sum() 496 | msa_num = 
sca.lett2num(alg) 497 | Nseq, Npos = msa_num.shape 498 | print("Final alignment parameters:") 499 | print("Number of sequences: M = %i" % (Nseq)) 500 | print("Number of effective sequences: M' = %i" % (effseqs)) 501 | print("Number of alignment positions: L = %i" % (Npos)) 502 | 503 | if options.pdbid is not None: 504 | print("Number of positions in the ats: %i" % (len(ats))) 505 | structPos = [i for (i, k) in enumerate(ats) if k != "-"] 506 | print("Number of structure positions mapped: %i" % (len(structPos))) 507 | print( 508 | "Size of the distance matrix: %i x %i" 509 | % (len(distmat), len(distmat[0])) 510 | ) 511 | 512 | # Saving the important stuff. Everything is stored in a file called 513 | # [MSAname]_sequence.db. But we will also write out the final processed 514 | # alignment to a fasta file. 515 | 516 | filename = os.path.basename(options.alignment) 517 | filename_noext = os.path.splitext(filename)[0] 518 | f = open( 519 | os.path.join(destination, filename_noext) + "_processed" + ".fasta", 520 | "w", 521 | ) 522 | for i in range(len(alg)): 523 | f.write(">%s\n" % (hd[i])) 524 | f.write(alg[i] + "\n") 525 | f.close() 526 | 527 | D = {} 528 | D["alg"] = alg 529 | D["hd"] = hd 530 | D["msa_num"] = msa_num 531 | D["seqw"] = seqw 532 | D["Nseq"] = Nseq 533 | D["Npos"] = Npos 534 | D["ats"] = ats 535 | D["effseqs"] = effseqs 536 | D["limitseqs"] = options.Nselect 537 | D["NseqPrelimit"] = Nseqprelimit 538 | D["effseqsPrelimit"] = effseqsprelimit 539 | if options.pdbid is not None: 540 | D["pdbid"] = options.pdbid 541 | D["pdb_chainID"] = options.chainID 542 | D["distmat"] = distmat 543 | if options.refseq is not None: 544 | D["refseq"] = options.refseq 545 | if options.refpos is not None: 546 | D["refpos"] = options.refpos 547 | D["i_ref"] = i_ref 548 | D["trim_parameters"] = options.parameters 549 | D["truncate_flag"] = options.truncate 550 | 551 | db_filename = os.path.join(destination, filename_noext) 552 | print("Opening database file " + db_filename) 
553 | db = {} 554 | db["sequence"] = D 555 | 556 | pickle.dump(db, open(db_filename + ".db", "wb")) 557 | 558 | if options.matfile: 559 | db["sequence"]["i_ref"] = i_ref + 1 # index from 1 for MATLAB 560 | savemat(db_filename, db, appendmat=True, oned_as="column") 561 | -------------------------------------------------------------------------------- /bin/scaSectorID: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | The scaSectorID script does the preliminaries of sector identification and 4 | stores the outputs using the python tool pickle: 5 | 6 | 1) Chooses :math:`k_{max}` (the number of significant eigenmodes) by 7 | comparison of the :math:`\\tilde{C_{ij}}` eigenspectrum to that for the 8 | randomized matrices 9 | 2) Rotates the top :math:`k_{max}` eigenvectors using independent 10 | components analysis 11 | 3) Defines the amino acid positions that significantly contribute to each 12 | of the independent components (ICs) by empirically fitting each IC to 13 | the t-distribution and selecting positions with greater than a specified 14 | cutoff (default: p=0.95) on the CDF. 15 | 4) Assign positions into groups based on the independent component with 16 | which it has the greatest degree of co-evolution. 
17 | 18 | **Key Arguments** 19 | --input, -i \*.db (the database produced by running scaCore) 20 | --kpos, -k number of significant eigenmodes for analysis (the default 21 | is to automatically choose using the eigenspectrum) 22 | --cutoff, -p empirically chosen cutoff for selecting AA positions with 23 | a significant contribution to each IC, Default = 0.95 24 | --matlab, -m write out the results of this script to a matlab workspace 25 | for further analysis 26 | 27 | **Example**:: 28 | 29 | scaSectorID -i PF00071_full.db 30 | 31 | :By: Kim Reynolds 32 | :On: 8.19.2014 33 | 34 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds 35 | 36 | This program is free software distributed under the BSD 3-clause license, 37 | please see the file LICENSE for details. 38 | """ 39 | 40 | import os 41 | import pickle 42 | import argparse 43 | import numpy as np 44 | from scipy.io import savemat 45 | from pysca import scaTools as sca 46 | 47 | if __name__ == "__main__": 48 | # parse inputs 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument( 51 | "-i", 52 | "--input", 53 | dest="inputdb", 54 | required=True, 55 | help="database from running scaCore", 56 | ) 57 | parser.add_argument( 58 | "-o" "--output", 59 | dest="outputdb", 60 | default=None, 61 | help="output file for sector calculations", 62 | ) 63 | parser.add_argument( 64 | "-k", 65 | "--kpos", 66 | dest="kpos", 67 | type=int, 68 | default=0, 69 | help="number of significant eigenmodes for analysis " 70 | "(the default is to automatically choose using " 71 | "the eigenspectrum)", 72 | ) 73 | parser.add_argument( 74 | "-p", 75 | "--cutoff", 76 | dest="cutoff", 77 | type=float, 78 | default=0.95, 79 | help="number of significant eigenmodes for analysis " 80 | "(the default is to automatically choose using " 81 | "the eigenspectrum)", 82 | ) 83 | parser.add_argument( 84 | "-m", 85 | "--matlab", 86 | action="store_true", 87 | dest="matfile", 88 | default=False, 89 | help="write out the results 
of this script to a " 90 | "matlab workspace for further analysis", 91 | ) 92 | options = parser.parse_args() 93 | 94 | # extract the necessary stuff from the database... 95 | db_in = pickle.load(open(options.inputdb, "rb")) 96 | D_seq = db_in["sequence"] 97 | D_sca = db_in["sca"] 98 | 99 | msa_num = D_seq["msa_num"] 100 | seqw = D_seq["seqw"] 101 | lbda = D_sca["lbda"] 102 | Csca = D_sca["Csca"] 103 | tX = D_sca["tX"] 104 | Lrand = D_sca["Lrand"] 105 | 106 | # run the calculations 107 | Vsca, Lsca = sca.eigenVect(Csca) 108 | 109 | if options.kpos == 0: 110 | kpos = sca.chooseKpos(Lsca, Lrand) 111 | else: 112 | kpos = options.kpos 113 | print("Selected kpos=%i significant eigenmodes." % kpos) 114 | Vpica, Wpica = sca.rotICA(Vsca, kmax=kpos) 115 | ics, icsize, sortedpos, cutoff, scaled_pd, pd = sca.icList( 116 | Vpica, kpos, Csca, p_cut=options.cutoff 117 | ) 118 | 119 | Usca = tX.dot(Vsca[:, :kpos]).dot(np.diag(1 / np.sqrt(Lsca[:kpos]))) 120 | Upica = Wpica.dot(Usca.T).T 121 | for k in range(Upica.shape[1]): 122 | Upica[:, k] /= np.sqrt(Upica[:, k].T.dot(Upica[:, k])) 123 | Usica, Wsica = sca.rotICA(Usca, kmax=kpos) 124 | 125 | # saving... 
126 | if options.outputdb is None: 127 | fn = os.path.basename(options.inputdb) 128 | output_path = os.path.abspath(os.path.dirname(options.inputdb)) 129 | else: 130 | fn = os.path.basename(options.outputdb) 131 | output_path = os.path.abspath(os.path.dirname(options.outputdb)) 132 | fn_noext = os.path.splitext(fn)[0] 133 | 134 | D = {} 135 | D["Vsca"] = Vsca 136 | D["Lsca"] = Lsca 137 | D["kpos"] = kpos 138 | D["Vpica"] = Vpica 139 | D["Wpica"] = Wpica 140 | D["Usca"] = Usca 141 | D["Upica"] = Upica 142 | D["Usica"] = Usica 143 | D["Wsica"] = Wsica 144 | D["ics"] = ics 145 | D["icsize"] = icsize 146 | D["sortedpos"] = sortedpos 147 | D["cutoff"] = cutoff 148 | D["scaled_pd"] = scaled_pd 149 | D["pd"] = pd 150 | 151 | db = {} 152 | db["sequence"] = D_seq 153 | db["sca"] = D_sca 154 | db["sector"] = D 155 | 156 | print( 157 | "Calculations complete, writing to database file " 158 | + os.path.join(output_path, fn_noext) 159 | ) 160 | pickle.dump(db, open(os.path.join(output_path, fn_noext) + ".db", "wb")) 161 | 162 | if options.matfile: 163 | # increment indices by 1 for MATLAB 164 | db["sector"]["sortedpos"] = [pos + 1 for pos in D["sortedpos"]] 165 | for ic in ics: 166 | ic.items = [item + 1 for item in ic.items] 167 | db["sector"]["ics"] = ics 168 | savemat( 169 | os.path.join(output_path, fn_noext) + ".mat", 170 | db, 171 | appendmat=True, 172 | oned_as="column", 173 | ) 174 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/SCA_DHFR.rst: -------------------------------------------------------------------------------- 1 | SCA 6.1 - The DHFR (dihydrofolate reductase) family 2 | =================================================== 3 | 4 | **Summary** This script describes the basic flow of the analytical steps 5 | in SCA6.0, using the DHFR family as an example. 
Here we compare results 6 | of the analysis for two different alignments: a PFAM alignment (PFAM 7 | PF00186) and an independent manually curated alignment constructed using 8 | a custom database of orthologous sequences (DHFR_PEPM3.an). Despite 9 | differences in the construction, sequence distribution and size of the 10 | two alignments, the sector definition is remarkably consistent: in both 11 | cases we arrive at a single sector assembled from six independent 12 | components. 13 | 14 | For this tutorial, the core calculation scripts should be run from the 15 | command line as follows: 16 | 17 | :: 18 | 19 | >> annotateMSA -i ../data/PF00186_full.txt -o ../outputs/PF00186_full.an -a 'pfam' -p ../data/pfamseq.txt 20 | >> scaProcessMSA -a ../data/PF00186_full.an -b ../data/ -s 1RX2 -c A -f 'Escherichia coli' -t -n 21 | >> scaCore -i ../output/PF00186_full.db 22 | >> scaSectorID -i ../output/PF00186_full.db 23 | 24 | >> annotateMSA -i ../data/DHFR_PEPM3.fasta -o ../output DHFR_PEPM3.an -a 'ncbi' -g ../data/DHFR_PEPM3.gis 25 | >> scaProcessMSA -a ../data/DHFR_PEPM3.an -b ../data/ -s 1RX2 -c A -t -n 26 | >> scaCore -i ../output/DHFR_PEPM3.db 27 | >> scaSectorID -i ../output/DHFR_PEPM3.db 28 | 29 | Note that we supply annotated alignments for all tutorial scripts *(the 30 | annotate_pfMSA step is slow, and should only be run once)*. 31 | 32 | **O.Rivoire, K.Reynolds and R.Ranganathan** 9/2014 33 | 34 | .. 
code:: python3 35 | 36 | import os 37 | import time 38 | import matplotlib.pyplot as plt 39 | import numpy as np 40 | import copy 41 | import colorsys 42 | import matplotlib.image as mpimg 43 | from IPython.display import display 44 | from IPython.display import Image 45 | import scipy.cluster.hierarchy as sch 46 | from scipy.stats import scoreatpercentile 47 | from pysca import scaTools as sca 48 | # import mpld3 49 | import pickle as pickle 50 | from optparse import OptionParser 51 | 52 | %matplotlib inline 53 | 54 | if not os.path.exists('../output/'): 55 | os.makedirs('../output/') 56 | 57 | Read in the results of the above three scripts (scaProcessMSA, scaCore 58 | and scaSectorID), stored as dictionaries in the databases 59 | PF00186_full.db and DHFR_PEPM3.db. To see what variables are stored in 60 | each dictionary, use: 61 | 62 | :: 63 | 64 | >> print dict.keys() 65 | 66 | .. code:: python3 67 | 68 | Dseq = list(); Dsca = list(); Dsect = list() 69 | db = pickle.load(open('../output/PF00186_full.db','rb')) 70 | Dseq.append(db['sequence']) 71 | Dsca.append(db['sca']) 72 | Dsect.append(db['sector']) 73 | db2 = pickle.load(open('../output/DHFR_PEPM3.db', 'rb')) 74 | Dseq.append(db2['sequence']) 75 | Dsca.append(db2['sca']) 76 | Dsect.append(db2['sector']) 77 | N_alg = 2 78 | AlgName = ['PFAM', 'Manual'] 79 | 80 | I. Statistical Structure of the Multiple Sequence Alignment (MSA) 81 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 82 | 83 | We start with a rough comparison of the sequence composition of the two 84 | alignments. Plot a histogram of all pairwise sequence identities *(left 85 | panel)* and a global view of the sequence similarity matrix (defined by 86 | :math:`S\equiv \frac{1}{L}XX^\top`) *(right panel)*. The PFAM alignment 87 | is shown in the *top row* and the manual alignment is shown in the 88 | *bottom row*.
The manual alignment is smaller (644 seqs vs 2000 for 89 | PFAM), but both alignments are well-described by a nearly homogeneous 90 | distribution of sequence identities with a mean value of about 35%. 91 | 92 | .. code:: python3 93 | 94 | ix = 1 95 | plt.rcParams['figure.figsize'] = 9, 15 96 | for k in range(N_alg): 97 | # List all elements above the diagonal (i level] 151 | descr_dict = {k:descr_list.count(k) for k in descr_list \ 152 | if descr_list.count(k)>=atleast} 153 | print('\n Level %i:' % level) 154 | print(descr_dict) 155 | 156 | 157 | .. parsed-literal:: 158 | 159 | Alignment: PFAM 160 | 161 | Level 0: 162 | {'Bacteria': 1486, 'Eukaryota': 210, 'Viruses': 37, 'Archaea': 24} 163 | 164 | Level 1: 165 | {'Proteobacteria': 581, 'Metazoa': 81, 'Chlamydiae': 14, 'Fungi': 60, 'Actinobacteria': 173, 'Firmicutes': 467, 'dsDNA viruses': 36, 'Tenericutes': 27, 'Bacteroidetes': 155, 'environmental samples': 24, 'Viridiplantae': 32, 'Fusobacteria': 10, 'Euryarchaeota': 23, 'stramenopiles': 11, 'Alveolata': 12} 166 | 167 | Level 2: 168 | {'Gammaproteobacteria': 317, 'Chordata': 36, 'Chlamydiales': 14, 'Dikarya': 59, 'Betaproteobacteria': 108, 'Actinobacteridae': 161, 'Lactobacillales': 176, 'Clostridia': 147, ' no RNA stage': 36, 'Mollicutes': 27, 'Bacteroidia': 70, 'Negativicutes': 26, 'Alphaproteobacteria': 137, 'Flavobacteriia': 52, 'Sphingobacteriia': 14, 'Arthropoda': 32, 'Deltaproteobacteria': 17, 'Bacillales': 103, 'Cytophagia': 12, 'Fusobacteriales': 10, 'Halobacteria': 21, 'Streptophyta': 24, 'Erysipelotrichi': 15, 'Coriobacteridae': 11} 169 | 170 | Level 3: 171 | {'Enterobacteriales': 78, 'Pseudomonadales': 38, 'Craniata': 32, 'Chlamydiaceae': 14, 'Ascomycota': 49, 'Burkholderiales': 65, 'Actinomycetales': 135, 'Chromatiales': 19, 'Lactobacillaceae': 70, 'Clostridiales': 145, 'Caudovirales': 14, 'Pasteurellales': 16, 'Mycoplasmataceae': 18, 'Bacteroidales': 70, 'Selenomonadales': 26, 'Streptococcaceae': 62, 'Vibrionales': 38, 'Enterococcaceae': 22, 
'Rhizobiales': 65, 'Bifidobacteriales': 26, 'Flavobacteriales': 49, 'Rhodobacterales': 26, 'Oceanospirillales': 16, 'Sphingobacteriales': 14, 'Hexapoda': 27, 'Paenibacillaceae': 20, 'Neisseriales': 21, 'Bacillaceae': 52, 'Cytophagales': 12, 'Basidiomycota': 10, 'Halobacteriales': 21, 'Xanthomonadales': 17, 'Alteromonadales': 41, 'Sphingomonadales': 16, 'Legionellales': 10, 'Staphylococcus': 11, 'Embryophyta': 24, 'Thiotrichales': 10, 'Erysipelotrichales': 15, 'Coriobacteriales': 11, 'Caulobacterales': 10} 172 | Alignment: Manual 173 | 174 | Level 0: 175 | {'cellular organisms': 612} 176 | 177 | Level 1: 178 | {' Eukaryota': 151, ' Bacteria': 461} 179 | 180 | Level 2: 181 | {' Opisthokonta': 137, ' Proteobacteria': 259, ' Bacteroidetes/Chlorobi group': 42, ' Firmicutes': 100, ' Actinobacteria': 42, ' Alveolata': 11} 182 | 183 | Level 3: 184 | {' Fungi': 74, ' Alphaproteobacteria': 69, ' Bacteroidetes': 42, ' Betaproteobacteria': 58, ' Bacilli': 78, ' Metazoa': 62, ' Gammaproteobacteria': 126, ' Actinobacteria': 42, ' Clostridia': 21, ' Apicomplexa': 11} 185 | 186 | 187 | Based on this, we select taxonomic groups and colors for representation. 188 | Here, we just start by choosing the broadly well-represented groups. To 189 | see a complete color-coding legend, use: 190 | 191 | :: 192 | 193 | >>> sca.figColors() 194 | 195 | .. 
code:: python3 196 | 197 | phylo = list(); 198 | fam_names = ['Eukaryota', 'Bacteroidetes', 'Firmicutes', \ 199 | 'Actinobacteria', 'Proteobacteria'] 200 | col = (0, 0.18, 0.38, 0.6, 0.8) 201 | 202 | # Legend: Eukaryota = red, Bacteroidetes = yellow, Firmicutes = green, 203 | # Actinobacteria = blue, Proteobacteria = purple 204 | for a in range(N_alg): 205 | phylo_alg = list() 206 | for i,k in enumerate(fam_names): 207 | sf = sca.Unit() 208 | sf.name = fam_names[i].lower() 209 | sf.col = col[i] 210 | sf.items = [j for j,q in enumerate(Dseq[a]['hd']) \ 211 | if sf.name in q.lower()] 212 | phylo_alg.append(sf) 213 | phylo.append(phylo_alg) 214 | 215 | Plot the top six independent components of the sequence correlation 216 | matrix (with sequence weights); color-coded by phylogenetic annotation. 217 | We compare the phylogenetic sampling for the PFAM alignment *(top row)* 218 | and manual alignment *(bottom row)*. The data show some very clear 219 | sequence distinctions based on phylogeny, and the two alignments seem to 220 | differ somewhat in the sequence divergence captured. In particular, the 221 | eukaryotic sequences *(in red)* seem to form a more distinct group in 222 | the manual alignment than in the PFAM alignment. For the PFAM alignment, 223 | the bacteroidetes *(yellow)* diverge along :math:`U_1`, the 224 | actinobacteria *(blue)* along :math:`U_3`, the firmicutes *(green)* 225 | along :math:`U_4` and :math:`U_5`, and a subset of proteobacteria 226 | *(purple)* along :math:`U_6`. For the manual alignment, the eukaryotes 227 | *(red)* diverge along :math:`U_2` and :math:`U_6`, the actinobacteria 228 | *(blue)* along :math:`U_4`, the firmicutes *(green)* along :math:`U_3`, 229 | and a subset of proteobacteria *(purple)* along :math:`U_5`. 230 | 231 | ..
code:: python3 232 | 233 | plt.rcParams['figure.figsize'] = 9, 8 234 | ix = 1; 235 | for a in range(N_alg): 236 | U = Dsca[a]['Uica'][1] 237 | pairs = [[2*i,2*i+1] for i in range(3)] 238 | for k,[k1,k2] in enumerate(pairs): 239 | plt.subplot(2,3,ix) 240 | ix += 1 241 | sca.figUnits(U[:,k1], U[:,k2], phylo[a]) 242 | #sca.figUnits(U[:,k1], U[:,k2], subfam) 243 | plt.xlabel(r"${U'}^{(2)}_{%i}$"%(k1+1), fontsize=16) 244 | plt.ylabel(r"${U'}^{(2)}_{%i}$"%(k2+1), fontsize=16) 245 | plt.tight_layout() 246 | 247 | 248 | 249 | .. image:: _static/SCA_DHFR_13_0.png 250 | 251 | 252 | II. SCA…conservation and coevolution 253 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 254 | 255 | Plot the eigenspectrum of (1) the SCA positional coevolution matrix 256 | (:math:`\tilde{C_{ij}}`) *(black bars)* and (2) 10 trials of matrix 257 | randomization for comparison. This graph is used to choose the number of 258 | significant eigenmodes. Again, we plot the PFAM alignment in the *top 259 | row* and manual alignment in the *bottom row* for comparison. Overall 260 | the two eigenspectra are remarkably similar: due to small differences in 261 | the significance cutoff, we define 6 significant eigenmodes for the PFAM 262 | alignment, and 7 for the manual alignment. 263 | 264 | ..
code:: python3 265 | 266 | for a in range(N_alg): 267 | plt.rcParams['figure.figsize'] = 9, 4 268 | hist0, bins = np.histogram(Dsca[a]['Lrand'].flatten(), bins=Dseq[a]['Npos'], \ 269 | range=(0,Dsect[a]['Lsca'].max())) 270 | hist1, bins = np.histogram(Dsect[a]['Lsca'], bins=Dseq[a]['Npos'],\ 271 | range=(0,Dsect[a]['Lsca'].max())) 272 | plt.subplot(2,1,a+1) 273 | plt.bar(bins[:-1], hist1, np.diff(bins),color='k') 274 | plt.plot(bins[:-1], hist0/Dsca[a]['Ntrials'], 'r', linewidth=3) 275 | plt.tick_params(labelsize=11) 276 | plt.xlabel('Eigenvalues', fontsize=18); plt.ylabel('Numbers', fontsize=18); 277 | print('Number of eigenmodes to keep is %i' %(Dsect[a]['kpos'])) 278 | plt.tight_layout() 279 | #mpld3.display() 280 | 281 | 282 | .. parsed-literal:: 283 | 284 | Number of eigenmodes to keep is 4 285 | Number of eigenmodes to keep is 6 286 | 287 | 288 | 289 | .. image:: _static/SCA_DHFR_16_1.png 290 | 291 | 292 | To define the positions with significant contributions to each of the 293 | independent components (ICs), we make an empirical fit for each IC to the 294 | t-distribution and select positions with greater than a specified cutoff 295 | on the CDF. We choose :math:`p=0.95` as our cutoff. Note that since some 296 | positions might contribute significantly to more than one IC (an 297 | indication of non-independence of ICs), we apply a simple algorithm to 298 | assign such positions to one IC. Specifically, we assign each such position to 299 | the IC with which it has the greatest degree of co-evolution. 300 | 301 | For brevity, we don’t plot the IC fits below (though we do in the other 302 | tutorial notebooks), but do print the list of positions associated with 303 | each IC for both alignments.
Comparing between alignments, we can 304 | already see some distinctions in the residue positions associated to 305 | each component: IC1 is expanded for the manual alignment, ICs2,4+5 are 306 | similar for both alignments, and ICs 3+6 are swapped between the two 307 | alignments. 308 | 309 | .. code:: python3 310 | 311 | plt.rcParams['figure.figsize'] = 20, 5 312 | for a in range(N_alg): 313 | print("alignment: "+AlgName[a]) 314 | for n,ipos in enumerate(Dsect[a]['ics']): 315 | sort_ipos = sorted(ipos.items) 316 | ats_ipos = ([Dseq[a]['ats'][s] for s in sort_ipos]) 317 | ic_pymol = ('+'.join(ats_ipos)) 318 | print('IC %i is composed of %i positions:' % (n+1,len(ats_ipos))) 319 | print(ic_pymol + "\n") 320 | 321 | 322 | .. parsed-literal:: 323 | 324 | alignment: PFAM 325 | IC 1 is composed of 14 positions: 326 | 13+18+23+25+27+32+38+39+55+63+90+107+133+153 327 | 328 | IC 2 is composed of 19 positions: 329 | 7+14+15+31+35+42+43+44+46+49+54+57+59+61+94+95+96+113+122 330 | 331 | IC 3 is composed of 5 positions: 332 | 21+22+24+52+121 333 | 334 | IC 4 is composed of 11 positions: 335 | 6+11+40+47+50+51+53+92+100+111+125 336 | 337 | alignment: Manual 338 | IC 1 is composed of 18 positions: 339 | 13+18+23+25+27+28+32+38+39+51+55+63+71+105+107+121+133+158 340 | 341 | IC 2 is composed of 18 positions: 342 | 7+14+15+22+31+35+42+43+44+46+49+54+57+61+94+95+96+113 343 | 344 | IC 3 is composed of 9 positions: 345 | 40+47+50+52+53+59+81+100+103 346 | 347 | IC 4 is composed of 10 positions: 348 | 6+11+41+45+60+90+92+111+125+126 349 | 350 | IC 5 is composed of 6 positions: 351 | 5+21+115+122+123+147 352 | 353 | IC 6 is composed of 2 positions: 354 | 144+149 355 | 356 | 357 | 358 | To define protein sectors, we examine the structure of the SCA 359 | positional correlation matrix with positions contributing to the top 360 | independent components (ICs) ordered by weight *(left panel)*. 
Again we 361 | compare the results between the PFAM alignment *(top)* and manual 362 | alignment *(bottom)*. This provides a basis to determine/interpret which 363 | ICs are truly statistically independent (defining an independent sector) 364 | and which represent hierarchical breakdowns of one sector. 365 | 366 | For both alignments, it seems that the ICs reflect a hierarchical 367 | break-down of a single sector, as determined by the high degree of 368 | co-evolution in the off-diagonal components (see the dendrogram that 369 | follows). In the *right panels* the ICs are combined and re-ordered by 370 | their contribution to :math:`V_1^p` to better see this. 371 | 372 | .. code:: python3 373 | 374 | sectors = list() 375 | ix = 1 376 | for a in range(N_alg): 377 | # plot the SCA positional correlation matrix, ordered by contribution 378 | #to the top ICs 379 | plt.rcParams['figure.figsize'] = 9, 9 380 | plt.subplot(2,2,ix); ix +=1; 381 | plt.imshow(Dsca[a]['Csca'][np.ix_(Dsect[a]['sortedpos'],\ 382 | Dsect[a]['sortedpos'])],vmin=0, vmax=2,\ 383 | interpolation='none',aspect='equal',\ 384 | extent=[0,sum(Dsect[a]['icsize']),0,\ 385 | sum(Dsect[a]['icsize'])]) 386 | line_index=0 387 | for i in range(Dsect[a]['kpos']): 388 | plt.plot([line_index+Dsect[a]['icsize'][i],\ 389 | line_index+Dsect[a]['icsize'][i]],\ 390 | [0,sum(Dsect[a]['icsize'])],'w', linewidth = 2) 391 | plt.plot([0,sum(Dsect[a]['icsize'])],[sum(Dsect[a]['icsize'])\ 392 | -line_index,sum(Dsect[a]['icsize'])-line_index],\ 393 | 'w', linewidth = 2) 394 | line_index += Dsect[a]['icsize'][i] 395 | 396 | # combine all the ICs into a single sector and re-sort 397 | sec_groups = ([k for k in range(Dsect[a]['kpos'])]) 398 | sectors_alg = list() 399 | s = sca.Unit() 400 | all_items = list() 401 | all_Vp = list() 402 | for i in range(Dsect[a]['kpos']): 403 | all_items = all_items+Dsect[a]['ics'][i].items 404 | tmp1 = Dsect[a]['Vpica'][Dsect[a]['ics'][i].items,:] 405 | all_Vp = all_Vp + list(tmp1[:,0].T) 406 | 
svals = list(np.argsort(all_Vp)); svals.reverse() 407 | s.items = [all_items[i] for i in svals] 408 | s.col = (1/len(sec_groups))*n 409 | sectors_alg.append(s) 410 | sectors.append(sectors_alg) 411 | 412 | # plot the re-ordered matrix 413 | sortpos = list() 414 | for s in sectors[a]: 415 | sortpos.extend(s.items) 416 | plt.subplot(2,2,ix); ix += 1; 417 | line_index=0 418 | plt.imshow(Dsca[a]['Csca'][np.ix_(sortpos, sortpos)], \ 419 | vmin=0, vmax=2,interpolation='none',aspect='equal',\ 420 | extent=[0,len(sortpos),0,len(sortpos)]) 421 | for s in sectors[a]: 422 | plt.plot([line_index+len(s.items),line_index+len(s.items)],\ 423 | [0,len(sortpos)],'w', linewidth = 2) 424 | plt.plot([0,sum(Dsect[a]['icsize'])],[len(sortpos)-line_index, \ 425 | len(sortpos)-line_index],'w', linewidth = 2) 426 | line_index += len(s.items) 427 | plt.tight_layout() 428 | 429 | 430 | 431 | .. image:: _static/SCA_DHFR_20_0.png 432 | 433 | 434 | The dendrogram below diagrams the relationship between independent 435 | components for the PFAM alignment (the tree for the manual alignment is 436 | similar). In this plot, solid lines represent physically contiguous 437 | structural units, and dashed lines indicate spatially fragmented groups 438 | of residues. 439 | 440 | .. code:: python3 441 | 442 | i = Image(filename='../figs/DHFR_sec_hier.png'); i 443 | 444 | 445 | 446 | 447 | .. image:: _static/SCA_DHFR_22_0.png 448 | 449 | 450 | 451 | Print the sector positions (as defined for each alignment), in a format 452 | suitable for pyMol, and create two pyMol sessions with the sectors (and 453 | decomposition into independent components) as separate objects. 454 | 455 | ..
code:: python3 456 | 457 | for a in range(N_alg): 458 | print("Alignment: " + AlgName[a]) 459 | for i,k in enumerate(sectors[a]): 460 | sort_ipos = sorted(k.items) 461 | ats_ipos = ([Dseq[a]['ats'][s] for s in sort_ipos]) 462 | ic_pymol = ('+'.join(ats_ipos)) 463 | print('Sector %i is composed of %i positions:' % (i+1,len(ats_ipos))) 464 | print(ic_pymol + "\n") 465 | 466 | sca.writePymol('1RX2', sectors[0], Dsect[0]['ics'], Dseq[0]['ats'],\ 467 | '../output/PF00186.pml','A', '../data/', 0) 468 | sca.writePymol('1RX2', sectors[1], Dsect[1]['ics'], Dseq[1]['ats'],\ 469 | '../output/DHFR_PEPM3.pml','A', '../data/', 0) 470 | 471 | 472 | .. parsed-literal:: 473 | 474 | Alignment: PFAM 475 | Sector 1 is composed of 49 positions: 476 | 6+7+11+13+14+15+18+21+22+23+24+25+27+31+32+35+38+39+40+42+43+44+46+47+49+50+51+52+53+54+55+57+59+61+63+90+92+94+95+96+100+107+111+113+121+122+125+133+153 477 | 478 | Alignment: Manual 479 | Sector 1 is composed of 63 positions: 480 | 5+6+7+11+13+14+15+18+21+22+23+25+27+28+31+32+35+38+39+40+41+42+43+44+45+46+47+49+50+51+52+53+54+55+57+59+60+61+63+71+81+90+92+94+95+96+100+103+105+107+111+113+115+121+122+123+125+126+133+144+147+149+158 481 | 482 | 483 | 484 | As is evident from the position lists above, and as shown below, the 485 | structural pattern of the two sectors and their associated decomposition 486 | into independent components is highly similar when compared between the 487 | two alignments. The main difference is that the sector (and independent 488 | components) for the manual alignment systematically includes a few more 489 | residue positions. 490 | 491 | .. code:: python3 492 | 493 | i = Image(filename='../figs/DHFR_decompv2.png'); i 494 | 495 | 496 | 497 | 498 | .. image:: _static/SCA_DHFR_26_0.png 499 | 500 | 501 | 502 | III.
The phylogenetic basis of the sector hierarchy 503 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 504 | 505 | How does the clear phylogenetic heterogeneity in the MSA influence the 506 | sector definitions? Since the sector definitions for the two alignments 507 | above are roughly equivalent, we only consider the larger (PFAM) 508 | alignment here. To address this, we take advantage of mathematical 509 | methods for mapping between the space of positional and sequence 510 | correlations, as described in *Rivoire et al*. Using this mapping, we 511 | plot the top :math:`k_{pos}` ICs as 2-D scatter plots with the 512 | corresponding sequence space divergence. 513 | 514 | .. code:: python3 515 | 516 | plt.rcParams['figure.figsize'] = 14, 10 517 | pairs = [ [x, x+1] for x in range(0, len(Dsect[0]['ics'])-1, 2) ] 518 | for n,[k1,k2] in enumerate(pairs): 519 | plt.subplot(2,len(pairs),n+1) 520 | sca.figUnits(Dsect[0]['Vpica'][:,k1], Dsect[0]['Vpica'][:,k2],\ 521 | Dsect[0]['ics'], dotsize = 6) 522 | plt.xlabel(r'$V^p_{%i}$' % (k1+1), fontsize=16) 523 | plt.ylabel(r'$V^p_{%i}$' % (k2+1), fontsize=16) 524 | plt.subplot(2,len(pairs),n+1+len(pairs)) 525 | sca.figUnits(Dsect[0]['Upica'][:,k1], Dsect[0]['Upica'][:,k2],\ 526 | phylo[0], dotsize = 6) 527 | plt.xlabel(r'$U^p_{%i}$' % (k1+1), fontsize=16) 528 | plt.ylabel(r'$U^p_{%i}$' % (k2+1), fontsize=16) 529 | plt.tight_layout() 530 | 531 | 532 | 533 | .. image:: _static/SCA_DHFR_29_0.png 534 | 535 | 536 | .. 
code:: python3 537 | 538 | plt.rcParams['figure.figsize'] = 20,8 539 | col = list() 540 | axis_lims = ([-0.06, 0.08],[-0.03, -0.01],[-0.05,0.03],[-0.01 ,0.05],\ 541 | [-0.02 ,0.05],[-0.05 ,0.03]) 542 | for k in phylo[0]: 543 | col = col + [colorsys.hsv_to_rgb(k.col,1,1)] 544 | for k in range(Dsect[0]['kpos']): 545 | forhist = list() 546 | for group in phylo[0]: 547 | forhist.append([Dsect[0]['Upica'][i,k] for i in group.items]) 548 | plt.subplot(2,Dsect[0]['kpos'],k+1) 549 | plt.hist(forhist, histtype='barstacked',color=col) 550 | plt.axis([axis_lims[k][0],axis_lims[k][1],0,600]) 551 | plt.xlabel(r'$U^p_{%i}$' % (k+1), fontsize=16) 552 | plt.tight_layout() 553 | 554 | 555 | 556 | .. image:: _static/SCA_DHFR_30_0.png 557 | 558 | 559 | We see some association of phylogeny with sector positions at the phylum 560 | level: for example the positions along :math:`V_3^p` are associated with 561 | the divergence of some bacteroidetes *(yellow)* and 562 | firmicutes *(green)* along :math:`U_3^p`. Further, the positions along 563 | ICs :math:`V_1^p`, :math:`V_5^p` and :math:`V_6^p` seem to separate the 564 | eukaryotes *(red)* from the prokaryotes. 565 | 566 | So in conclusion, the DHFR family appears to have a single sector that 567 | can be decomposed into roughly six groups due to sequence divergence. 568 | Notably, the sector definition (and decomposition into ICs) is very 569 | similar for the two different sequence alignments. 570 | -------------------------------------------------------------------------------- /docs/source/SCA_G.rst: -------------------------------------------------------------------------------- 1 | SCA 6.1 - The G Protein Family 2 | ============================== 3 | 4 | **Summary:** This script describes the basic flow of analytical steps in 5 | SCA6.0, using the G-protein (small GTPase, Ras-like) family as an 6 | example (PFAM PF00071).
The analysis consists of five steps, which 7 | directly follow from the accompanying publication (Rivoire et al, “An 8 | Evolution-Based Functional Decomposition of Natural Proteins”): 9 | 10 | **1) Alignment processing and composition.** We begin by analyzing the 11 | composition of the multiple sequence alignment: what is the number of 12 | effective sequences, and how are the sequences structured into families? 13 | For the G-protein family, the PFAM alignment sequence space is 14 | well-sampled and fairly homogeneous (unstructured), as evidenced by the 15 | fact that overall alignment sequence similarity shows a unimodal 16 | distribution near 25%. 17 | 18 | **2) First-order statistics: position-specific conservation.** Next, we 19 | examine overall positional conservation for the sequence alignment. This 20 | shows the expected result, that a handful of positions are strongly 21 | conserved. 22 | 23 | **3) Second-order statistics: conserved correlations.** Plots of the SCA 24 | matrix (:math:`\tilde{C_{ij}}`), the associated eigenvectors and 25 | eigenspectrum, and the corresponding independent components (IC). We 26 | choose the number of significant eigenmodes, :math:`k^* = 4`, by 27 | comparison of the eigenspectrum of :math:`\tilde{C_{ij}}` to that for a 28 | collection of 10 randomized alignments. 29 | 30 | **4) Sector decomposition.** Sector definition based on the top 31 | :math:`k^*` ICs. We begin by fitting the top :math:`k^*` statistically 32 | significant ICs to a t-distribution, and then identify the amino acid 33 | positions contributing to the top five percent of the corresponding 34 | cumulative distribution function. We then construct a sub-matrix of 35 | :math:`\tilde{C_{ij}}` that contains only the selected top-scoring 36 | positions for the :math:`k^*` ICs, ordered by their degree of 37 | contribution to each IC. This plot is used to choose sector assignments.
38 | For the g-protein family, we define two sectors, sector 1 composed of 39 | ICs 1,2, and 3, and sector 2 composed of IC 4. Related to Figs. 4 and 5 40 | of the main text. 41 | 42 | **5) Sequence subfamilies and the basis of sector hierarchy.** We relate 43 | the two sectors (and underlying ICs) to the pattern of divergence 44 | between amino acid sequences. To do this, we make a mapping between 45 | sequence space and positional correlations using singular value 46 | decomposition. We see that the amino acid positions associated with IC1 47 | and IC2 differentiate between different g-protein subclasses, suggesting 48 | that these regions might tune allosteric regulation in a subclass 49 | specific way. 50 | 51 | Prior to running this tutorial, the core calculation scripts must be 52 | executed from the command line as follows: 53 | 54 | :: 55 | 56 | >> scaProcessMSA -a ../data/PF00071_rd2.an -b ../data/ -s 5P21 -c A -f 'Homo sapiens' -t -n 57 | >> scaCore -i ../output/PF00071_rd2.db 58 | >> scaSectorID -i ../output/PF00071_rd2.db 59 | 60 | Note that we supply pre-annotated alignments for all tutorial scripts 61 | *(the annotate_pfMSA step is slow, and should only be run once)*. 62 | 63 | **O.Rivoire, K.Reynolds and R.Ranganathan** 1/2015 64 | 65 | .. 
code:: python3 66 | 67 | import os 68 | import time 69 | import matplotlib.pyplot as plt 70 | import numpy as np 71 | import copy 72 | import scipy.cluster.hierarchy as sch 73 | from scipy.stats import scoreatpercentile 74 | import matplotlib.image as mpimg 75 | from IPython.display import display 76 | from IPython.display import Image 77 | from Bio.Seq import Seq 78 | from Bio import motifs 79 | import colorsys 80 | from pysca import scaTools as sca 81 | # import mpld3 82 | import pickle as pickle 83 | from optparse import OptionParser 84 | 85 | %matplotlib inline 86 | 87 | if not os.path.exists('../output/'): 88 | os.makedirs('../output/') 89 | 90 | To begin, we read in the results of the above three scripts 91 | (scaProcessMSA, scaCore and scaSectorID), stored as three dictionaries 92 | in the database PF00071_rd2.db. To see what variables are stored in 93 | each dictionary, use the command dictionary.keys(), e.g.: 94 | 95 | :: 96 | 97 | >>> print(Dseq.keys()) 98 | 99 | .. code:: python3 100 | 101 | db = pickle.load(open('../output/PF00071_rd2.db','rb')) 102 | Dseq = db['sequence'] #the results of scaProcessMSA 103 | Dsca = db['sca'] #the results of scaCore 104 | Dsect = db['sector'] #the results of scaSectorID 105 | 106 | I. Alignment processing and composition 107 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 108 | 109 | First, we print out a few statistics describing the alignment: 110 | 111 | .. code:: python3 112 | 113 | print("After processing, the alignment size is %i sequences and %i positions" % \ 114 | (Dseq['Nseq'], Dseq['Npos'])) 115 | print("With sequence weights, there are %i effective sequences" % (Dseq['effseqs'])) 116 | 117 | 118 | ..
parsed-literal:: 119 | 120 | After processing, the alignment size is 4974 sequences and 158 positions 121 | With sequence weights, there are 3366 effective sequences 122 | 123 | 124 | To examine alignment composition, we plot a histogram of all pairwise 125 | sequence identities *(left panel)* and a global view of the sequence 126 | similarity matrix (defined by :math:`S\equiv \frac{1}{L}XX^\top`) 127 | *(right panel)*. The data show that the alignment is described by a 128 | nearly homogeneous distribution of sequence identities with a mean value 129 | of about 25%. 130 | 131 | .. code:: python3 132 | 133 | # List all elements above the diagonal (i level] 175 | descr_dict = {k:descr_list.count(k) for k in descr_list \ 176 | if descr_list.count(k)>=atleast} 177 | print('\n Level %i:' % level) 178 | print(descr_dict) 179 | 180 | 181 | .. parsed-literal:: 182 | 183 | 184 | Level 0: 185 | {'Eukaryota': 4879, 'Bacteria': 74} 186 | 187 | Level 1: 188 | {'Metazoa': 2048, 'Alveolata': 530, 'Heterolobosea': 143, 'Parabasalia': 321, 'Cyanobacteria': 20, 'Bacteroidetes': 18, 'Fungi': 702, 'Euglenozoa': 148, 'Amoebozoa': 510, 'stramenopiles': 159, 'Proteobacteria': 27, 'Viridiplantae': 192, 'Choanoflagellida': 47, 'Ichthyosporea': 22, 'Diplomonadida': 32, 'Oxymonadida': 14} 189 | 190 | Level 2: 191 | {'Chordata': 985, 'Ciliophora': 408, 'Schizopyrenida': 143, 'Arthropoda': 520, 'Trichomonadida': 320, 'Dikarya': 622, 'Placozoa': 44, 'Porifera': 105, 'Kinetoplastida': 148, 'Archamoebae': 255, 'Cnidaria': 78, 'Nematoda': 175, 'Platyhelminthes': 89, 'Mycetozoa': 255, 'Chytridiomycota': 15, 'Bacillariophyta': 23, 'Blastocystis': 20, 'Pelagophyceae': 25, 'PX clade': 24, 'Gammaproteobacteria': 10, 'Streptophyta': 137, 'Oomycetes': 67, 'Echinodermata': 46, 'Salpingoecidae': 25, 'Apicomplexa': 89, 'Microsporidia': 46, 'Chlorophyta': 55, 'Capsaspora': 22, 'Perkinsea': 32, 'Codonosigidae': 22, 'Fungi incertae sedis': 17, 'Hexamitidae': 32} 192 | 193 | Level 3: 194 | {'Craniata': 809, 
'Tunicata': 122, 'Intramacronucleata': 408, 'Vahlkampfiidae': 143, 'Hexapoda': 402, 'Trichomonadidae': 320, 'Ascomycota': 471, 'Trichoplax': 44, 'Demospongiae': 105, 'Basidiomycota': 151, 'Trypanosomatidae': 148, 'Entamoebidae': 255, 'Anthozoa': 76, 'Chromadorea': 154, 'Trematoda': 85, 'Crustacea': 73, 'Dictyosteliida': 253, 'Chytridiomycetes': 15, 'Coscinodiscophyceae': 16, 'Chelicerata': 45, 'Aureococcus': 25, 'Enoplea': 21, 'Phaeophyceae': 23, 'Embryophyta': 136, 'Cephalochordata': 54, 'Albuginales': 22, 'Eleutherozoa': 46, 'Salpingoeca': 25, 'Coccidia': 37, 'Unikaryonidae': 17, 'Mamiellophyceae': 24, 'Trebouxiophyceae': 12, 'Aconoidasida': 52, 'Perkinsida': 32, 'Peronosporales': 45, 'Enterocytozoonidae': 11, 'Monosiga': 22, 'Early diverging fungal lineages': 17, 'Giardiinae': 32, 'Chlorophyceae': 19} 195 | 196 | 197 | Based on this, we select taxonomic groups and associate them to colors 198 | for representation. We choose broad taxonomic groups that are 199 | well-represented in the alignment (corresponding to Level 1). To see a 200 | complete legend that maps numeric codes to color, use: 201 | 202 | :: 203 | 204 | >>> sca.figColors() 205 | 206 | .. code:: python3 207 | 208 | phylo = list(); 209 | fam_names = ['Metazoa', 'Amoebozoa', 'Viridiplantae', 'Fungi',\ 210 | 'Alveolata', 'Parabasalia'] 211 | col = (0, 0.6, 0.38, 0.18, 0.8, 0.5) 212 | #Metazoa = red, Amoebozoa = yellow, Viridiplantae = green, 213 | #Fungi = cyan, Alveolata = blue, Parabasalia = purple 214 | for i,k in enumerate(fam_names): 215 | sf = sca.Unit() 216 | sf.name = fam_names[i].lower() 217 | sf.col = col[i] 218 | sf.items = [j for j,q in enumerate(Dseq['hd']) if sf.name in q.lower()] 219 | phylo.append(sf) 220 | 221 | We also attempt to annotate the sequences by their declared sub-class of 222 | G protein - Ras, Rab, Rac, and Rho. These annotations are simply parsed 223 | from the header, and could contain mis-assignments. 224 | 225 | .. 
code:: python3 226 | 227 | gprot_names = ['Ras', 'Rab', 'Rac','Rho'] 228 | gprot_classes = list() 229 | col = (0, 0.65, 0.15, 0.38) 230 | #Ras=light blue, Rab = orange, Rac=yellow, Rho=dark blue 231 | for c,k in enumerate(gprot_names): 232 | gp = sca.Unit() 233 | gp.col = col[c] 234 | gp.name = k 235 | gp.items = [i for i,h in enumerate(Dseq['hd']) if k in h] 236 | gprot_classes.append(gp) 237 | 238 | To examine the relationship between global sequence similarity, 239 | phylogeny, and functional sub-class, we plot the top six independent 240 | components of the sequence correlation matrix (including sequence 241 | weights). In these plots, each point represents a particular sequence, 242 | and the distance between points reflects global sequence identity. In 243 | the top row each point (sequence) is color coded by phylogenetic 244 | annotation, in the bottom row, they are color-coded by g-protein class. 245 | 246 | .. code:: python3 247 | 248 | plt.rcParams['figure.figsize'] = 9, 8 249 | U = Dsca['Uica'][1] 250 | pairs = [[i,i+1] for i in range(0,6,2)] 251 | for k,[k1,k2] in enumerate(pairs): 252 | plt.subplot(2,3,k+1) 253 | sca.figUnits(U[:,k1], U[:,k2], phylo) 254 | plt.xlabel(r"$IC_{seq}^{%i}$"%(k1+1), fontsize=16) 255 | plt.ylabel(r"$IC_{seq}^{%i}$"%(k2+1), fontsize=16) 256 | plt.subplot(2,3,k+4) 257 | sca.figUnits(U[:,k1], U[:,k2], gprot_classes) 258 | plt.xlabel(r"$IC_{seq}^{%i}$"%(k1+1), fontsize=16) 259 | plt.ylabel(r"$IC_{seq}^{%i}$"%(k2+1), fontsize=16) 260 | plt.tight_layout() 261 | 262 | 263 | 264 | .. image:: _static/SCA_G_17_0.png 265 | 266 | 267 | The data show a mixed distribution of phylogenetic groups along modes 268 | 1-5. A subset of metazoan sequences emerges along the mode six, 269 | :math:`IC^{6}_{seq}`. In contrast, the top modes of the sequence 270 | similarity matrix do seem to correspond to functional G protein 271 | subclasses. 
For example, the Rho proteins *(green)* emerge along 272 | :math:`IC^{2}_{seq}` , the Ras proteins *(red)* along 273 | :math:`IC^{3}_{seq}`, and a subset of Rabs *(blue)* along 274 | :math:`IC^{4}_{seq}` and :math:`IC^{5}_{seq}` and a subset of Ras 275 | proteins along :math:`IC^{6}_{seq}`. Many G-protein paralogs (reflecting 276 | different subclasses) can be found in each type of organism, and thus 277 | the global pattern of sequence divergence is distinct from phylogeny. 278 | 279 | II. First-order statistics: position-specific conservation. 280 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 281 | 282 | Plot the position-specific conservation values for each g-protein 283 | position. :math:`D_i` is calculated according to equation S4 284 | (supplemental information). 285 | 286 | .. code:: python3 287 | 288 | fig, axs = plt.subplots(1,1, figsize=(9,4)) 289 | xvals = [i+1 for i in range(len(Dsca['Di']))] 290 | xticks = [0,45,95,144] 291 | plt.bar(xvals,Dsca['Di'], color='k') 292 | plt.tick_params(labelsize=11); plt.grid() 293 | axs.set_xticks(xticks); 294 | labels = [Dseq['ats'][k] for k in xticks] 295 | axs.set_xticklabels(labels); 296 | plt.xlabel('Amino acid position', fontsize=18); plt.ylabel('Di', fontsize=18); 297 | 298 | 299 | 300 | .. image:: _static/SCA_G_21_0.png 301 | 302 | 303 | III. Second-order statistics: conserved correlations. 304 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 305 | 306 | Plot the SCA correlation matrix ( :math:`\tilde{C_{ij}}` ) computed 307 | according to Equations 4+5 308 | 309 | .. code:: python3 310 | 311 | plt.rcParams['figure.figsize'] = 13, 8 312 | plt.imshow(Dsca['Csca'], vmin=0, vmax=1.4,interpolation='none',\ 313 | aspect='equal') 314 | 315 | 316 | 317 | 318 | .. parsed-literal:: 319 | 320 | 321 | 322 | 323 | 324 | 325 | .. 
image:: _static/SCA_G_24_1.png 326 | 327 | 328 | Plot the eigenspectrum of (1) the SCA positional coevolution matrix 329 | (:math:`\tilde{C_{ij}}`) *(black bars)* and (2) 10 trials of matrix 330 | randomization for comparison. This graph is used to choose the number of 331 | significant eigenmodes (:math:`k^* = 4`). 332 | 333 | .. code:: python3 334 | 335 | plt.rcParams['figure.figsize'] = 9, 4 336 | hist0, bins = np.histogram(Dsca['Lrand'].flatten(), bins=Dseq['Npos'], \ 337 | range=(0,Dsect['Lsca'].max())) 338 | hist1, bins = np.histogram(Dsect['Lsca'], bins=Dseq['Npos'], \ 339 | range=(0,Dsect['Lsca'].max())) 340 | plt.bar(bins[:-1], hist1, np.diff(bins),color='k') 341 | plt.plot(bins[:-1], hist0/Dsca['Ntrials'], 'r', linewidth=3) 342 | plt.tick_params(labelsize=11) 343 | plt.xlabel('Eigenvalues', fontsize=18); plt.ylabel('Numbers', fontsize=18); 344 | print('Number of eigenmodes to keep is %i' %(Dsect['kpos'])) 345 | #mpld3.display() 346 | 347 | 348 | .. parsed-literal:: 349 | 350 | Number of eigenmodes to keep is 4 351 | 352 | 353 | 354 | .. image:: _static/SCA_G_26_1.png 355 | 356 | 357 | Plot the top significant eigenmodes *(top row)* and associated 358 | independent components *(bottom row)*. The ICs are an optimally 359 | independent representation of the four different residue groups. 360 | 361 | .. code:: python3 362 | 363 | plt.rcParams['figure.figsize'] = 9, 6 364 | EVs = Dsect['Vsca'] 365 | ICs = Dsect['Vpica'] 366 | pairs = [ [x,x+1] for x in range(Dsect['kpos']-1)] 367 | ncols = len(pairs) 368 | for k,[k1,k2] in enumerate(pairs): 369 | plt.subplot(2,ncols,k+1) 370 | plt.plot(EVs[:,k1], EVs[:,k2], 'ok') 371 | plt.xlabel("EV%i"%(k1+1), fontsize=16) 372 | plt.ylabel("EV%i"%(k2+1), fontsize=16) 373 | plt.subplot(2,ncols,k+1+ncols) 374 | plt.plot(ICs[:,k1], ICs[:,k2], 'ok') 375 | plt.xlabel("IC%i"%(k1+1), fontsize=16) 376 | plt.ylabel("IC%i"%(k2+1), fontsize=16) 377 | plt.tight_layout() 378 | 379 | 380 | 381 | .. 
image:: _static/SCA_G_28_0.png 382 | 383 | 384 | IV. Sector decomposition. 385 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 386 | 387 | To define the positions with significant contributions to each of the 388 | independent components (ICs), we make an empirical fit for each IC to the 389 | t-distribution and select positions above a specified cutoff 390 | on the CDF. We choose :math:`p=0.95` as our cutoff. Note that since some 391 | positions might contribute significantly to more than one IC (an 392 | indication of non-independence of ICs), we apply a simple algorithm to 393 | assign such positions to one IC. Specifically, we assign each such position to 394 | the IC with which it has the greatest degree of co-evolution. 395 | 396 | The data indicate generally good fits for the top four ICs (also shown 397 | in supplemental figure S2), and we return the positions contributing to 398 | each IC in a format suitable for cut and paste into PyMol. 399 | 400 | .. code:: python3 401 | 402 | plt.rcParams['figure.figsize'] = 8, 8 403 | 404 | Vpica = Dsect['Vpica'] 405 | for k in range(Dsect['kpos']): 406 | iqr = scoreatpercentile(Vpica[:,k],75) - scoreatpercentile(Vpica[:,k],25) 407 | binwidth=2*iqr*(len(Vpica)**(-0.33)) 408 | nbins=int(round((max(Vpica[:,k])-min(Vpica[:,k]))/binwidth)) 409 | plt.subplot(Dsect['kpos'],1,k+1) 410 | h_params = plt.hist(Vpica[:,k], nbins) 411 | x_dist = np.linspace(min(h_params[1]), max(h_params[1]), num=100) 412 | plt.plot(x_dist,Dsect['scaled_pd'][k],'r',linewidth = 2) 413 | plt.plot([Dsect['cutoff'][k],Dsect['cutoff'][k]], [0,60], 'k--',linewidth = 1) 414 | plt.xlabel(r'$V^p_{%i}$'%(k+1), fontsize=14) 415 | plt.ylabel('Number', fontsize=14) 416 | plt.tight_layout() 417 | 418 | for n,ipos in enumerate(Dsect['ics']): 419 | sort_ipos = sorted(ipos.items) 420 | ats_ipos = ([Dseq['ats'][s] for s in sort_ipos]) 421 | ic_pymol = ('+'.join(ats_ipos)) 422 | print('IC %i is composed of %i positions:' % (n+1,len(ats_ipos))) 423 | print(ic_pymol + "\n") 424 | 425 
| 426 | 427 | .. parsed-literal:: 428 | 429 | IC 1 is composed of 19 positions: 430 | 22+32+34+36+39+42+54+63+64+68+71+73+75+81+83+85+110+116+144 431 | 432 | IC 2 is composed of 8 positions: 433 | 5+11+56+61+62+72+96+99 434 | 435 | IC 3 is composed of 16 positions: 436 | 10+14+15+16+28+35+57+58+59+60+117+119+145+146+147+156 437 | 438 | IC 4 is composed of 13 positions: 439 | 17+23+82+84+90+115+123+125+129+130+134+141+143 440 | 441 | 442 | 443 | 444 | .. image:: _static/SCA_G_31_1.png 445 | 446 | 447 | To define protein sectors, we examine the structure of the SCA 448 | positional correlation matrix with positions contributing to the top 449 | independent components (ICs) ordered by weight (*left panel*). This 450 | provides a basis to determine/interpret which ICs are truly 451 | statistically independent (defining an independent sector) and which 452 | represent hierarchical breakdowns of one sector. In this case, the data 453 | suggest that ICs 1, 2, and 3 have strong inter-IC correlations and 454 | should be considered a single sector, and IC4 shows little correlation 455 | with other ICs, implying a distinct sector (see the dendrogram that 456 | follows). In the *right panel* the ICs are re-ordered to reflect this 457 | decomposition. 458 | 459 | .. 
code:: python3 460 | 461 | #plot the SCA positional correlation matrix, ordered by contribution to the top ICs 462 | plt.rcParams['figure.figsize'] = 9, 9 463 | plt.subplot(121) 464 | plt.imshow(Dsca['Csca'][np.ix_(Dsect['sortedpos'], Dsect['sortedpos'])], \ 465 | vmin=0, vmax=2.2,interpolation='none',\ 466 | aspect='equal',extent=[0,sum(Dsect['icsize']),\ 467 | 0,sum(Dsect['icsize'])]) 468 | line_index=0 469 | for i in range(Dsect['kpos']): 470 | plt.plot([line_index+Dsect['icsize'][i],line_index+Dsect['icsize'][i]],\ 471 | [0,sum(Dsect['icsize'])],'w', linewidth = 2) 472 | plt.plot([0,sum(Dsect['icsize'])],[sum(Dsect['icsize'])-\ 473 | line_index,sum(Dsect['icsize'])-line_index],'w', linewidth = 2) 474 | line_index += Dsect['icsize'][i] 475 | 476 | #define the new sector groupings - 3 total 477 | sec_groups = ([0,1,2],[3]) 478 | sectors = list() 479 | c = [0.66, 0] 480 | for n,k in enumerate(sec_groups): 481 | s = sca.Unit() 482 | all_items = list() 483 | all_Vp = list() 484 | for i in k: 485 | all_items = all_items+Dsect['ics'][i].items 486 | all_Vp = all_Vp+list(Dsect['ics'][i].vect) 487 | svals = np.argsort(all_Vp) 488 | s.items = [all_items[i] for i in svals] 489 | s.col = c[n] 490 | sectors.append(s) 491 | 492 | #plot the re-ordered matrix 493 | plt.subplot(122) 494 | line_index=0 495 | sortpos = list() 496 | for s in sectors: 497 | sortpos.extend(s.items) 498 | plt.imshow(Dsca['Csca'][np.ix_(sortpos, sortpos)], vmin=0, vmax=2.2,\ 499 | interpolation='none',aspect='equal',\ 500 | extent=[0,len(sortpos),0,len(sortpos)]) 501 | for s in sectors: 502 | plt.plot([line_index+len(s.items),line_index+len(s.items)],\ 503 | [0,len(sortpos)],'w', linewidth = 2) 504 | plt.plot([0,sum(Dsect['icsize'])],[len(sortpos)-line_index,\ 505 | len(sortpos)-line_index],'w', linewidth = 2) 506 | line_index += len(s.items) 507 | plt.tight_layout() 508 | 509 | 510 | 511 | .. 
image:: _static/SCA_G_33_0.png 512 | 513 | 514 | The dendrogram below diagrams the relationship between independent 515 | components. In this plot, solid lines represent physically contiguous 516 | structural units, and dashed lines indicate spatially fragmented groups 517 | of residues. We see that ICs 1, 2, and 3 combine to form a single sector 518 | (sector 1), and that sector 2 (IC4) is more independent. 519 | 520 | .. code:: python3 521 | 522 | i = Image(filename='../figs/Gprot_sec_hier.png'); i 523 | 524 | 525 | 526 | 527 | .. image:: _static/SCA_G_35_0.png 528 | 529 | 530 | 531 | The assignments have clear physical consistency with the concept of 532 | sectors as functional, physically contiguous units in the protein 533 | structure (see also Figs. 4-5). In the *left panels*, sector one is 534 | formed from the combination of positions in IC1 *(bright blue)*, IC2 535 | *(light blue)* and IC3 *(cyan)*. Sector 2 (IC4) is shown in red spheres, 536 | and forms a physically contiguous unit structurally distinct from sector 537 | one. 538 | 539 | .. code:: python3 540 | 541 | i = Image(filename = '../figs/Gprot_secstruct.png'); i 542 | 543 | 544 | 545 | 546 | .. image:: _static/SCA_G_37_0.png 547 | 548 | 549 | 550 | Print the sector positions, in a format suitable for pyMol, and create a 551 | pyMol session (in the output directory) with the sectors (and 552 | decomposition into independent components) as separate objects. 553 | 554 | .. code:: python3 555 | 556 | for i,k in enumerate(sectors): 557 | sort_ipos = sorted(k.items) 558 | ats_ipos = ([Dseq['ats'][s] for s in sort_ipos]) 559 | ic_pymol = ('+'.join(ats_ipos)) 560 | print('Sector %i is composed of %i positions:' % (i+1,len(ats_ipos))) 561 | print(ic_pymol + "\n") 562 | sca.writePymol('5P21', sectors, Dsect['ics'], Dseq['ats'], \ 563 | '../output/PF00071.pml','A', '../Inputs/', 0) 564 | 565 | 566 | .. 
parsed-literal:: 567 | 568 | Sector 1 is composed of 43 positions: 569 | 5+10+11+14+15+16+22+28+32+34+35+36+39+42+54+56+57+58+59+60+61+62+63+64+68+71+72+73+75+81+83+85+96+99+110+116+117+119+144+145+146+147+156 570 | 571 | Sector 2 is composed of 13 positions: 572 | 17+23+82+84+90+115+123+125+129+130+134+141+143 573 | 574 | 575 | 576 | V. Sequence subfamilies and the basis of sector hierarchy. 577 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 578 | 579 | How does the phylogenetic and functional heterogeneity in the MSA 580 | influence the sector definitions? To address this, we take advantage of 581 | mathematical methods for mapping between the space of positional and 582 | sequence correlations, as described in *Rivoire et al* (see equations 583 | 8-11). Using this mapping, we plot the top :math:`k^*` ICs of the matrix 584 | :math:`\tilde{C_{ij}}` as 2-D scatter plots *(top row)*, and compare 585 | them to the corresponding sequence space divergence *(middle and bottom 586 | rows)*. The amino acid positions contributing to each IC are colored by 587 | sector *(sector 1 = blue, sector 2 = red, top row)*. The sequences are 588 | color-coded according to phylogenetic classifications *(middle row)* or 589 | G-protein class *(bottom row)* as we defined above. 590 | 591 | .. 
code:: python3 592 | 593 | plt.rcParams['figure.figsize'] = 14, 10 594 | pairs = [ [x,x+1] for x in range(Dsect['kpos']-1)] 595 | ncols = len(pairs) 596 | for n,[k1,k2] in enumerate(pairs): 597 | plt.subplot(3,ncols,n+1) 598 | sca.figUnits(Dsect['Vpica'][:,k1], Dsect['Vpica'][:,k2], \ 599 | sectors, dotsize = 6) 600 | plt.xlabel('IC%i' % (k1+1), fontsize=16) 601 | plt.ylabel('IC%i' % (k2+1), fontsize=16) 602 | plt.subplot(3,ncols,n+1+ncols) 603 | sca.figUnits(Dsect['Upica'][:,k1], Dsect['Upica'][:,k2], \ 604 | phylo, dotsize = 6) 605 | plt.xlabel(r'$U^p_{%i}$' % (k1+1), fontsize=16) 606 | plt.ylabel(r'$U^p_{%i}$' % (k2+1), fontsize=16) 607 | plt.subplot(3,ncols,n+1+ncols*2) 608 | sca.figUnits(Dsect['Upica'][:,k1], Dsect['Upica'][:,k2], \ 609 | gprot_classes, dotsize = 6) 610 | plt.xlabel(r'$U^p_{%i}$' % (k1+1), fontsize=16) 611 | plt.ylabel(r'$U^p_{%i}$' % (k2+1), fontsize=16) 612 | plt.tight_layout() 613 | 614 | 615 | 616 | .. image:: _static/SCA_G_42_0.png 617 | 618 | 619 | There is some clear divergence in G-protein subtype along :math:`U_1^p` 620 | and :math:`U_2^p`, indicating that the amino acid positions associated 621 | with IC1 and IC2 vary in a subtype-specific pattern. To more clearly see 622 | separations in sequence classification, we also plot the above 623 | distributions of sequences (along :math:`U_1^p`, 624 | :math:`U_2^p`, :math:`U_3^p`, and :math:`U_4^p`) as stacked bar plots. 625 | This representation lets us directly see the contribution of sequences 626 | that might be hidden (due to overlapping points) on the above scatter 627 | plots. The *top row* reflects phylogenetic classifications and the 628 | *bottom row* shows G-protein functional classes. 629 | 630 | .. 
code:: python3 631 | 632 | plt.rcParams['figure.figsize'] = 15, 4 633 | 634 | col = list() 635 | for k in gprot_classes: 636 | col = col + [colorsys.hsv_to_rgb(k.col,1,1)] 637 | for k in range(Dsect['kpos']): 638 | forhist = list() 639 | for group in gprot_classes: 640 | forhist.append([Dsect['Upica'][i,k] for i in group.items]) 641 | plt.subplot(2,Dsect['kpos'],k+5) 642 | plt.hist(forhist, histtype='barstacked',color=col) 643 | plt.xlabel(r'$U^p_{%i}$' % (k+1), fontsize=16) 644 | 645 | col = list() 646 | for k in phylo: 647 | col = col + [colorsys.hsv_to_rgb(k.col,1,1)] 648 | for k in range(Dsect['kpos']): 649 | forhist = list() 650 | for group in phylo: 651 | forhist.append([Dsect['Upica'][i,k] for i in group.items]) 652 | plt.subplot(2,Dsect['kpos'],k+1) 653 | plt.hist(forhist, histtype='barstacked',color=col) 654 | 655 | plt.tight_layout() 656 | 657 | 658 | 659 | .. image:: _static/SCA_G_44_0.png 660 | 661 | 662 | The interpretation for the two sectors is clear: 663 | 664 | **Sector 1** is composed of ICs 1, 2 and 3 - we see above that the 665 | positions contributing to IC1 and IC2 separate out the Ras-like *(red)* 666 | and Rho *(green)* g-protein functional classes (see the plots of 667 | :math:`U_1^p` and :math:`U_2^p` above). In contrast, the positions along 668 | IC3 and IC4 are associated with a homogeneous pattern of sequences; that 669 | is, they have no obvious relationship to g-protein class or phylogeny. 670 | This suggests that sector 1 consists of a core element (IC3) that is 671 | conserved among G-proteins and two related/co-evolving parts which 672 | diverge in particular G-protein functional classes. 
The structural 673 | mapping of these positions is consistent with this interpretation - we 674 | observe that the positions associated with IC3 form the base of the 675 | nucleotide binding pocket (a general feature of the g-protein family) 676 | and that the IC1 and IC2 positions form a peripheral shell, which may 677 | reflect functional divergence in G-protein regulatory mechanisms in 678 | different family members. 679 | 680 | **Sector 2** is defined along (:math:`V_4^p`). The sequences along the 681 | corresponding component (:math:`U_4^p`) are homogeneously distributed 682 | with respect to both phylogeny and g-protein functional class, 683 | consistent with the notion that this sector is likely a global property 684 | of the entire alignment. 685 | -------------------------------------------------------------------------------- /docs/source/SCA_betalactamase.rst: -------------------------------------------------------------------------------- 1 | SCA 6.1 - The Beta-lactamase Enzyme Family 2 | ========================================== 3 | 4 | This script describes the basic flow of the analytical steps in SCA6.0, 5 | using the :math:`\beta`-lactamase enzyme family as an example (PFAM 6 | PF13354). The alignment contains some subfamily structure (clades of 7 | related sequences) as evidenced in Section 1. We identify two sectors: a 8 | core sector surrounding the active site that is shared across all 9 | sequences, and a more peripheral sector containing groups of residues 10 | that diverge in particular subfamilies. 
For this tutorial, the core 11 | scripts should be run as follows: 12 | 13 | :: 14 | 15 | >> annotateMSA -i ../data/PF13354_full.txt -o ../data/PF13354_full.an -a 'pfam' -p ../data/pfamseq.txt 16 | >> scaProcessMSA -a ../data/PF13354_full.an -b ../data/ -s 1FQG -c A -f 'Escherichia coli' -t -n 17 | >> scaCore -i ../output/PF13354_full.db 18 | >> scaSectorID -i ../output/PF13354_full.db 19 | 20 | Note that we supply annotated alignments for all tutorial scripts *(the 21 | annotate_pfMSA step is slow, and should only be run once)*. 22 | 23 | **O.Rivoire, K.Reynolds and R.Ranganathan** 9/2014 24 | 25 | .. code:: python3 26 | 27 | import os 28 | import time 29 | import matplotlib.pyplot as plt 30 | import math 31 | import numpy as np 32 | import copy 33 | import scipy.cluster.hierarchy as sch 34 | from scipy.stats import scoreatpercentile 35 | from pysca import scaTools as sca 36 | import colorsys 37 | # import mpld3 38 | import pickle as pickle 39 | from optparse import OptionParser 40 | 41 | if not os.path.exists('../output/'): 42 | os.makedirs('../output/') 43 | 44 | %matplotlib inline 45 | 46 | Read in the results of the above three scripts (scaProcessMSA, scaCore 47 | and scaSectorID), stored as three dictionaries in the database 48 | PF13354_full.db. To see what variables are stored in each dictionary, 49 | use: 50 | 51 | :: 52 | 53 | >>> list(db) 54 | 55 | .. code:: python3 56 | 57 | db = pickle.load(open('../output/PF13354_full.db','rb')) 58 | Dseq = db['sequence'] 59 | Dsca = db['sca'] 60 | Dsect = db['sector'] 61 | 62 | I. Statistical Structure of the Multiple Sequence Alignment (MSA) 63 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 64 | 65 | Plot a histogram of all pairwise sequence identities *(left panel)* and 66 | a global view of the sequence similarity matrix (defined by 67 | :math:`S\equiv \frac{1}{L}XX^\top`) *(right panel)*. 
The data show that 68 | the alignment is described by a nearly bimodal distribution of sequence 69 | identities with peaks near 25% and 45%. From the matrix at right, it is 70 | clear that the alignment is composed of several distinct sequence 71 | families. 72 | 73 | .. code:: python3 74 | 75 | # List all elements above the diagonal (i 104 | 105 | 106 | 107 | 108 | .. image:: _static/SCA_betalactamase_8_1.png 109 | 110 | 111 | To examine the role of sequence and position weighting on the structure 112 | of the sequence space, we compute correlation matrices between all pairs 113 | of sequences, either with or without sequence and position weights and 114 | project the corresponding sequence space (by eigenvalue decomposition) 115 | down to a small set of top modes that contain the statistically dominant 116 | relationships between sequences. Since eigenvalue decomposition does not 117 | necessarily provide the best representation of sequence groups (for 118 | reasons described in “xx”), we also apply independent components 119 | analysis (or ICA) to the top few eigenmodes; this manipulation provides 120 | a representation in which the top groupings of sequences in the 121 | alignment (if such exists) should separate along the so-called 122 | independent components (or ICs). Below we plot the following eigenmodes 123 | *(top row)* and independent components *(bottom row)*: 124 | 125 | :math:`\bullet` :math:`U^{(0)}` and :math:`U'^{(0)}`, the top eigenmodes 126 | and ICs without any weights; 127 | 128 | :math:`\bullet` :math:`U^{(1)}` and :math:`U'^{(1)}` the top eigenmodes 129 | and ICs with sequences weights; 130 | 131 | :math:`\bullet` :math:`U^{(2)}` and :math:`U'^{(2)}` the top eigenmodes 132 | and ICs with both sequences and positional weights. 133 | 134 | The sequences are colored by weight, with red indicating the most 135 | strongly downweighted sequences. 
In contrast to the g-protein example, 136 | we see that application of the sequence and position weights makes the 137 | sequence space apparently more uniform (removes some of the family or 138 | clade-like structure). 139 | 140 | .. code:: python3 141 | 142 | Useq = Dsca['Useq'] 143 | Uica = Dsca['Uica'] 144 | plt.rcParams['figure.figsize'] = 9, 8 145 | ica = ["","","","'","'","'"] 146 | for k,U in enumerate(Useq+Uica): 147 | plt.subplot(2,3,k+1) 148 | sca.figWeights(U[:,0], U[:,1], Dseq['seqw'][0]) 149 | plt.xlabel(r'${U%s}^{(%i)}_1$'%(ica[k],k%3), fontsize=16) 150 | plt.ylabel(r'${U%s}^{(%i)}_2$'%(ica[k],k%3), fontsize=16) 151 | plt.tight_layout() 152 | 153 | 154 | 155 | .. image:: _static/SCA_betalactamase_10_0.png 156 | 157 | 158 | To examine the relationship between divergence in *sequence similarity* 159 | and *phylogeny* in the sequence-weighted alignment, we plot the top 160 | independent components of the sequence correlation matrix (after 161 | sequence weighting), colored by phylogenetic group. We start by 162 | constructing a dictionary of phylogenetic annotations and checking the 163 | representation of sequences in the top taxonomic levels. The annotations 164 | are parsed from the sequence headers. 165 | 166 | .. code:: python3 167 | 168 | #construct a dictionary of phylogenetic groups 169 | annot = dict() 170 | for i, h in enumerate(Dseq['hd']): 171 | hs = sca.parseAlgHeader(h) 172 | annot[hs[0]] = sca.Annot(hs[1], hs[2], hs[3].replace('.','')) 173 | 174 | # Most frequent taxonomic groups: 175 | atleast = 10 176 | for level in range(4): 177 | descr_list = [a.taxo.split(',')[level] for a in annot.values() \ 178 | if len(a.taxo.split(',')) > level] 179 | descr_dict = {k:descr_list.count(k) for k in descr_list \ 180 | if descr_list.count(k)>=atleast} 181 | print('\n Level %i:' % level) 182 | print(descr_dict) 183 | 184 | 185 | .. 
parsed-literal:: 186 | 187 | 188 | Level 0: 189 | {'Bacteria': 803} 190 | 191 | Level 1: 192 | {'Proteobacteria': 380, 'Actinobacteria': 145, 'Firmicutes': 119, 'Deinococcus-Thermus': 11, 'Bacteroidetes': 46, 'Cyanobacteria': 59, 'Acidobacteria': 10, 'environmental samples': 18} 193 | 194 | Level 2: 195 | {'Gammaproteobacteria': 200, 'Actinobacteridae': 139, 'Bacillales': 55, 'Deinococci': 11, 'Clostridia': 41, 'Betaproteobacteria': 57, 'Chroococcales': 31, 'Alphaproteobacteria': 115, 'Lactobacillales': 12, 'Negativicutes': 11, 'Bacteroidia': 21, 'Nostocales': 10, 'Oscillatoriales': 11} 196 | 197 | Level 3: 198 | {'Enterobacteriales': 89, 'Actinomycetales': 139, 'Paenibacillaceae': 10, 'Clostridiales': 35, 'Burkholderiales': 55, 'Vibrionales': 28, 'Synechococcus': 14, 'Bacillaceae': 31, 'Rhizobiales': 48, 'Pseudomonadales': 28, 'Rhodospirillales': 16, 'Selenomonadales': 11, 'Sphingomonadales': 31, 'Caulobacterales': 10, 'Bacteroidales': 21, 'Thiotrichales': 12, 'Xanthomonadales': 16, 'Rhodobacterales': 10, 'Nostocaceae': 10} 199 | 200 | 201 | Based on this, we select taxonomic groups and colors for representation. 202 | Here, we just start by choosing the broadly well-represented groups. To 203 | see a complete color-coding legend, use: 204 | 205 | :: 206 | 207 | >>> sca.figColors() 208 | 209 | .. 
code:: python3 210 | 211 | phylo = list(); 212 | fam_names = ['Firmicutes', 'Actinobacteria', 'Bacteroidetes', \ 213 | 'Cyanobacteria', 'Proteobacteria'] 214 | col = (0, 0.18, 0.38, 0.5, 0.6) 215 | # Firmicutes = red, Actinobacteria = yellow, Bacteroidetes = cyan, 216 | # Cyanobacteria = green, Proteobacteria = blue 217 | 218 | for i,k in enumerate(fam_names): 219 | sf = sca.Unit() 220 | sf.name = fam_names[i].lower() 221 | sf.col = col[i] 222 | sf.items = [j for j,q in enumerate(Dseq['hd']) if sf.name in q.lower()] 223 | phylo.append(sf) 224 | 225 | Plot the top six independent components of the sequence correlation 226 | matrix (with sequence weights), color-coded by phylogenetic annotation. 227 | The sequences clearly separate into groups related by phylogeny; the 228 | Proteobacteria *(blue)* separate out on :math:`U_1`, the Firmicutes 229 | *(red)* separate out on :math:`U_2`, the Cyanobacteria *(green)* 230 | separate out on :math:`U_3`, and the Bacteroidetes *(cyan)* separate out 231 | on :math:`U_5`. 232 | 233 | .. code:: python3 234 | 235 | plt.rcParams['figure.figsize'] = 9, 3.5 236 | U = Dsca['Uica'][1] 237 | pairs = [[2*i,2*i+1] for i in range(3)] 238 | for k,[k1,k2] in enumerate(pairs): 239 | plt.subplot(1,3,k+1) 240 | sca.figUnits(U[:,k1], U[:,k2], phylo) 241 | #sca.figUnits(U[:,k1], U[:,k2], subfam) 242 | plt.xlabel(r"${U'}^{(2)}_{%i}$"%(k1+1), fontsize=16) 243 | plt.ylabel(r"${U'}^{(2)}_{%i}$"%(k2+1), fontsize=16) 244 | plt.tight_layout() 245 | 246 | 247 | 248 | .. image:: _static/SCA_betalactamase_16_0.png 249 | 250 | 251 | II. SCA conservation and coevolution 252 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 253 | 254 | Plot the eigenspectrum of the SCA positional coevolution matrix 255 | (:math:`\tilde{C_{ij}}`) *(black bars)* and 10 trials of matrix 256 | randomization for comparison *(red line)*. This graph is used to choose 257 | the number of significant eigenmodes. 258 | 259 | .. 
code:: python3 260 | 261 | plt.rcParams['figure.figsize'] = 9, 3.5 262 | hist0, bins = np.histogram(Dsca['Lrand'].flatten(), bins=Dseq['Npos'], \ 263 | range=(0,Dsect['Lsca'].max())) 264 | hist1, bins = np.histogram(Dsect['Lsca'], bins=Dseq['Npos'], \ 265 | range=(0,Dsect['Lsca'].max())) 266 | plt.bar(bins[:-1], hist1, np.diff(bins),color='k') 267 | plt.plot(bins[:-1], hist0/Dsca['Ntrials'], 'r', linewidth=3) 268 | plt.tick_params(labelsize=11) 269 | plt.xlabel('Eigenvalues', fontsize=18); plt.ylabel('Numbers', fontsize=18); 270 | print('Number of eigenmodes to keep is %i' %(Dsect['kpos'])) 271 | 272 | 273 | .. parsed-literal:: 274 | 275 | Number of eigenmodes to keep is 7 276 | 277 | 278 | 279 | .. image:: _static/SCA_betalactamase_19_1.png 280 | 281 | 282 | To define the positions with significant contributions to each of the 283 | independent components (ICs), we make an empirical fit for each IC to the 284 | t-distribution and select positions above a specified cutoff 285 | on the CDF. We choose :math:`p=0.95` as our cutoff. Note that since some 286 | positions might contribute significantly to more than one IC (an 287 | indication of non-independence of ICs), we apply a simple algorithm to 288 | assign such positions to one IC. Specifically, we assign each such position to 289 | the IC with which it has the greatest degree of co-evolution. 290 | 291 | The data indicate generally good fits for the top six ICs, and we return 292 | the positions contributing to each IC in a format suitable for cut and 293 | paste into PyMol. 294 | 295 | .. 
code:: python3 296 | 297 | plt.rcParams['figure.figsize'] = 10,5 298 | 299 | Vpica = Dsect['Vpica'] 300 | for k in range(Dsect['kpos']): 301 | iqr = scoreatpercentile(Vpica[:,k],75) - scoreatpercentile(Vpica[:,k],25) 302 | binwidth=2*iqr*(len(Vpica)**(-0.33)) 303 | nbins=int(round((max(Vpica[:,k])-min(Vpica[:,k]))/binwidth)) 304 | plt.subplot(1,Dsect['kpos'],k+1) 305 | h_params = plt.hist(Vpica[:,k], nbins) 306 | x_dist = np.linspace(min(h_params[1]), max(h_params[1]), num=100) 307 | plt.plot(x_dist,Dsect['scaled_pd'][k],'r',linewidth = 2) 308 | plt.xlabel(r'$V^p_{%i}$'%(k+1), fontsize=14) 309 | plt.ylabel('Number', fontsize=14) 310 | 311 | for n,ipos in enumerate(Dsect['ics']): 312 | sort_ipos = sorted(ipos.items) 313 | ats_ipos = ([Dseq['ats'][s] for s in sort_ipos]) 314 | ic_pymol = ('+'.join(ats_ipos)) 315 | print('IC %i is composed of %i positions:' % (n+1,len(ats_ipos))) 316 | print(ic_pymol + "\n") 317 | 318 | 319 | .. parsed-literal:: 320 | 321 | IC 1 is composed of 20 positions: 322 | 61+65+109+117+125+136+157+164+170+179+180+210+213+229+233+241+247+250+251+255 323 | 324 | IC 2 is composed of 16 positions: 325 | 63+70+71+73+91+130+131+132+134+143+156+182+196+226+234+236 326 | 327 | IC 3 is composed of 17 positions: 328 | 66+68+102+105+106+107+126+144+145+183+185+199+207+215+216+238+244 329 | 330 | IC 4 is composed of 12 positions: 331 | 69+72+123+139+149+151+153+161+162+163+186+193 332 | 333 | IC 5 is composed of 0 positions: 334 | 335 | 336 | IC 6 is composed of 13 positions: 337 | 67+85+87+148+160+181+190+200+203+211+221+225+231 338 | 339 | IC 7 is composed of 11 positions: 340 | 77+84+101+122+138+220+223+224+232+235+245 341 | 342 | 343 | 344 | 345 | .. image:: _static/SCA_betalactamase_21_1.png 346 | 347 | 348 | To define protein sectors, we examine the structure of the SCA 349 | positional correlation matrix with positions contributing to the top 350 | independent components (ICs) ordered by weight *(left panel)*. 
This 351 | provides a basis to determine/interpret which ICs are truly 352 | statistically independent (defining an independent sector) and which 353 | represent hierarchical breakdowns of one sector. 354 | 355 | IC 2 appears more distinct and is considered an independent sector 356 | *(sector 1)*. ICs 1,3,5,and 6 are strongly co-evolving, and should be 357 | combined into one sector. IC 4 also appears to be related to [1,3,5,6] 358 | and the combination of 1,3,4,5,6 makes up sector two. The sectors (2 in 359 | total) are defined accordingly, and in the *right panel*, these 360 | independent components have been re-ordered accordingly to visualize 361 | this decomposition. 362 | 363 | .. code:: python3 364 | 365 | # plot the SCA positional correlation matrix, ordered by contribution to the top ICs 366 | plt.rcParams['figure.figsize'] = 10, 10 367 | plt.subplot(121) 368 | plt.imshow(Dsca['Csca'][np.ix_(Dsect['sortedpos'], Dsect['sortedpos'])], \ 369 | vmin=0, vmax=2,interpolation='none',aspect='equal',\ 370 | extent=[0,sum(Dsect['icsize']),0,sum(Dsect['icsize'])]) 371 | line_index=0 372 | for i in range(Dsect['kpos']): 373 | plt.plot([line_index+Dsect['icsize'][i],line_index+Dsect['icsize'][i]],\ 374 | [0,sum(Dsect['icsize'])],'w', linewidth = 2) 375 | plt.plot([0,sum(Dsect['icsize'])],[sum(Dsect['icsize'])-line_index,\ 376 | sum(Dsect['icsize'])-line_index],'w', linewidth = 2) 377 | line_index += Dsect['icsize'][i] 378 | 379 | #define the new sector groupings - 2 total 380 | sec_groups = ([1],[0,2,4,5,3,6]) 381 | sectors = list() 382 | for n,k in enumerate(sec_groups): 383 | s = sca.Unit() 384 | all_items = list() 385 | for i in k: all_items = all_items+Dsect['ics'][i].items 386 | s.items = all_items 387 | s.col = (1/len(sec_groups))*n 388 | sectors.append(s) 389 | 390 | # plot the re-ordered matrix 391 | plt.subplot(122) 392 | line_index=0 393 | sortpos = list() 394 | for s in sectors: 395 | sortpos.extend(s.items) 396 | plt.imshow(Dsca['Csca'][np.ix_(sortpos, 
sortpos)], vmin=0, vmax=2,\ 397 | interpolation='none',aspect='equal',\ 398 | extent=[0,len(sortpos),0,len(sortpos)]) 399 | for s in sectors: 400 | plt.plot([line_index+len(s.items),line_index+len(s.items)],\ 401 | [0,len(sortpos)],'w', linewidth = 2) 402 | plt.plot([0,sum(Dsect['icsize'])],[len(sortpos)-line_index, \ 403 | len(sortpos)-line_index],'w', linewidth = 2) 404 | line_index += len(s.items) 405 | plt.tight_layout() 406 | 407 | 408 | 409 | .. image:: _static/SCA_betalactamase_23_0.png 410 | 411 | 412 | Print the sector positions, in a format suitable for pyMol, and create a 413 | pyMol session with the sectors (and decomposition into independent 414 | components) as seperate objects. Structurally, sectors 1+3 form 415 | physically contiguous units, and 2 is less so… this is consistent with 416 | the idea that sector 2/IC4 might be associated with sector 1/ICs1+3+5+6 417 | 418 | .. code:: python3 419 | 420 | for i,k in enumerate(sectors): 421 | sort_ipos = sorted(k.items) 422 | ats_ipos = ([Dseq['ats'][s] for s in sort_ipos]) 423 | ic_pymol = ('+'.join(ats_ipos)) 424 | print('Sector %i is composed of %i positions:' % (i+1,len(ats_ipos))) 425 | print(ic_pymol + "\n") 426 | sca.writePymol('../data/1FQG.pdb', sectors, Dsect['ics'], Dseq['ats'], \ 427 | '../output/PF13354.pml', 'A', '../data/', 0) 428 | 429 | 430 | .. parsed-literal:: 431 | 432 | Sector 1 is composed of 16 positions: 433 | 63+70+71+73+91+130+131+132+134+143+156+182+196+226+234+236 434 | 435 | Sector 2 is composed of 73 positions: 436 | 61+65+66+67+68+69+72+77+84+85+87+101+102+105+106+107+109+117+122+123+125+126+136+138+139+144+145+148+149+151+153+157+160+161+162+163+164+170+179+180+181+183+185+186+190+193+199+200+203+207+210+211+213+215+216+220+221+223+224+225+229+231+232+233+235+238+241+244+245+247+250+251+255 437 | 438 | 439 | 440 | III. 
The phylogenetic basis of the sector hierarchy 441 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 442 | 443 | How does the clear phylogenetic heterogeneity in the MSA influence the 444 | sector definitions? To address this, we take advantage of mathematical 445 | methods for mapping between the space of positional and sequence 446 | correlations, as described in *Rivoire et al*. Using this mapping, we 447 | plot the top :math:`k_{pos}` ICs as 2-D scatter plots with the 448 | corresponding sequence space divergence. The colors for the sequence 449 | space are according to the phylogenetic classifications we chose above. 450 | 451 | .. code:: python3 452 | 453 | plt.rcParams['figure.figsize'] = 15,8 454 | pairs = [ [x, x+1] for x in range(0, len(Dsect['ics'])-1, 2) ] 455 | for n,[k1,k2] in enumerate(pairs): 456 | plt.subplot(2,len(pairs),n+1) 457 | sca.figUnits(Dsect['Vpica'][:,k1], Dsect['Vpica'][:,k2], sectors, dotsize = 6) 458 | plt.xlabel(r'$V^p_{%i}$' % (k1+1), fontsize=16) 459 | plt.ylabel(r'$V^p_{%i}$' % (k2+1), fontsize=16) 460 | plt.subplot(2,len(pairs),n+len(pairs)+1) 461 | sca.figUnits(Dsect['Upica'][:,k1], Dsect['Upica'][:,k2], phylo, dotsize = 6) 462 | plt.xlabel(r'$U^p_{%i}$' % (k1+1), fontsize=16) 463 | plt.ylabel(r'$U^p_{%i}$' % (k2+1), fontsize=16) 464 | plt.tight_layout() 465 | 466 | 467 | 468 | .. image:: _static/SCA_betalactamase_28_0.png 469 | 470 | 471 | The interpretation for the two sectors: 472 | 473 | **Sector 1** is defined along (:math:`V_2^p`). The sequences along the 474 | corresponding component (:math:`U_2^p`) are homogeneously distributed 475 | with respect to phylogeny, consistent with the notion that this sector 476 | is a property of the entire alignment. Notably, this sector forms the 477 | catalytic core of the Beta-lactamase. 478 | 479 | **Sector 2** is composed of ICs 1,3,4,5 and 6 - and each of these is 480 | associated with some phylogenetic divergence.
:math:`V_1^p` splits the 481 | cyanobacteria *(green)* from the proteobacteria *(blue)*, :math:`V_3^p` 482 | separates the proteobacteria *(blue)* from other sequence families, 483 | :math:`V_5^p` separates out a subset of the firmicutes *(red)*, and 484 | :math:`V_6^p` is associated with a divergence in the bacteroidetes 485 | *(cyan)*. Sector 2 forms a physically contiguous unit that resembles a 486 | shell around the active site. The decomposition described above suggests 487 | that some functional divergence in beta-lactamase dynamics or regulatory 488 | mechanism across phylogenetic lines may underlie the breakdown of this 489 | sector. 490 | 491 | For clarity, we also plot the same data as a stacked bar chart below. 492 | 493 | .. code:: python3 494 | 495 | plt.rcParams['figure.figsize'] = 20, 5 496 | 497 | col = list() 498 | for k in phylo: 499 | col = col + [colorsys.hsv_to_rgb(k.col,1,1)] 500 | for k in range(Dsect['kpos']): 501 | forhist = list() 502 | for group in phylo: 503 | forhist.append([Dsect['Upica'][i,k] for i in group.items]) 504 | plt.subplot(2,Dsect['kpos'],k+1) 505 | plt.hist(forhist, histtype='barstacked',color=col) 506 | 507 | 508 | 509 | .. image:: _static/SCA_betalactamase_30_0.png 510 | 511 | 512 | This concludes the script.
513 | -------------------------------------------------------------------------------- /docs/source/_static/BLactamase_sec_hier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/BLactamase_sec_hier.png -------------------------------------------------------------------------------- /docs/source/_static/DHFR_decompv2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/DHFR_decompv2.png -------------------------------------------------------------------------------- /docs/source/_static/DHFR_sec_hier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/DHFR_sec_hier.png -------------------------------------------------------------------------------- /docs/source/_static/Gprot_sec_hier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/Gprot_sec_hier.png -------------------------------------------------------------------------------- /docs/source/_static/Gprot_secstruct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/Gprot_secstruct.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_DHFR_13_0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_13_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_DHFR_16_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_16_1.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_DHFR_20_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_20_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_DHFR_22_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_22_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_DHFR_26_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_26_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_DHFR_29_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_29_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_DHFR_30_0.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_30_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_DHFR_7_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_7_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_G_17_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_17_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_G_21_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_21_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_G_24_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_24_1.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_G_26_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_26_1.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_G_28_0.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_28_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_G_31_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_31_1.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_G_33_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_33_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_G_35_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_35_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_G_37_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_37_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_G_42_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_42_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_G_44_0.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_44_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_G_9_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_9_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_S1A_17_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_17_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_S1A_20_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_20_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_S1A_23_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_23_1.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_S1A_25_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_25_1.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_S1A_27_0.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_27_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_S1A_30_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_30_1.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_S1A_32_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_32_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_S1A_38_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_38_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_S1A_41_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_41_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_S1A_9_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_9_0.png -------------------------------------------------------------------------------- 
/docs/source/_static/SCA_betalactamase_10_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_10_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_betalactamase_16_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_16_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_betalactamase_19_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_19_1.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_betalactamase_21_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_21_1.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_betalactamase_23_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_23_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_betalactamase_28_0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_28_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_betalactamase_30_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_30_0.png -------------------------------------------------------------------------------- /docs/source/_static/SCA_betalactamase_8_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_8_1.png -------------------------------------------------------------------------------- /docs/source/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/favicon.ico -------------------------------------------------------------------------------- /docs/source/_static/github-download-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/github-download-screenshot.png -------------------------------------------------------------------------------- /docs/source/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/logo.png -------------------------------------------------------------------------------- /docs/source/annotateMSA.rst: 
-------------------------------------------------------------------------------- 1 | =========== 2 | annotateMSA 3 | =========== 4 | 5 | .. automodule:: annotateMSA 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../modules')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'pySCA' 21 | copyright = '2019, Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds' 22 | author = 'Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds' 23 | 24 | # The short X.Y version. 25 | version = '6.1' 26 | 27 | # The full version, including alpha/beta/rc tags 28 | release = '6.1' 29 | 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | 'sphinx.ext.todo', 38 | 'sphinx.ext.viewcode', 39 | 'sphinx.ext.autodoc' 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 
43 | templates_path = ['_templates'] 44 | 45 | # The master toctree document. 46 | master_doc = 'index' 47 | 48 | # List of patterns, relative to source directory, that match files and 49 | # directories to ignore when looking for source files. 50 | # This pattern also affects html_static_path and html_extra_path. 51 | exclude_patterns = [] 52 | 53 | # The name of the Pygments (syntax highlighting) style to use. 54 | pygments_style = 'friendly' 55 | 56 | 57 | # -- Options for HTML output ------------------------------------------------- 58 | 59 | # The theme to use for HTML and HTML Help pages. See the documentation for 60 | # a list of builtin themes. 61 | # 62 | html_theme = 'sphinx_rtd_theme' 63 | 64 | # Add any paths that contain custom static files (such as style sheets) here, 65 | # relative to this directory. They are copied after the builtin static files, 66 | # so a file named "default.css" will overwrite the builtin "default.css". 67 | html_static_path = ['_static'] 68 | 69 | 70 | # logo 71 | html_logo = '_static/logo.png' 72 | html_favicon = '_static/favicon.ico' 73 | 74 | html_theme_options = { 75 | 'logo_only': True 76 | } 77 | -------------------------------------------------------------------------------- /docs/source/examples.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Examples 3 | ======== 4 | 5 | .. 
toctree:: 6 | :maxdepth: 1 7 | 8 | SCA_G 9 | SCA_S1A 10 | SCA_DHFR 11 | SCA_betalactamase 12 | -------------------------------------------------------------------------------- /docs/source/get_started.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | Getting Started 3 | =============== 4 | 5 | Running a complete SCA analysis consists of five steps: 6 | 7 | 1) Constructing an alignment 8 | 2) Alignment pre-processing and conditioning 9 | 3) Calculation of the conservation and co-evolution statistics 10 | 4) Identifying statistically significant correlations 11 | 5) Interpretation of the results 12 | 13 | The core SCA calculations (steps 2,3, and 4) are each associated with a 14 | particular Python analysis script (:code:`scaProcessMSA`, :code:`scaCore`, and 15 | :code:`scaSectorID`, respectively). Sequential execution of each Python 16 | analysis script stores the results in a pickle database. This means the core 17 | SCA calculations can be run from the command line, or multiple proteins can be 18 | analyzed using a shell script (for an example, see `runAllNBCalcs.sh`). 19 | Following execution of the scripts, the pickle database can be loaded in a 20 | Jupyter notebook for visualizing the results and interpreting the data. 21 | Alternatively, the output of the analysis scripts can be saved as a MATLAB 22 | workspace, and results plotted/analyzed in MATLAB. Below we describe the five 23 | main analysis steps in more detail. 24 | 25 | 26 | File and Directory Structure 27 | ============================ 28 | 29 | The pySCA repository contains the following files and directories: 30 | 31 | Base Directory 32 | -------------- 33 | 34 | bin/ 35 | Contains the analysis scripts that use functions defined in the `scaTools.py` 36 | module. 37 | data/ 38 | Git submodule that contains the input sequence alignments (\*.fasta, \*.an) 39 | and structures (\*.pdb) for the analysis.
The \*.an files correspond to 40 | fasta-formatted sequence files with taxonomic annotations. The inputs 41 | needed for all tutorials are here. 42 | output/ 43 | Contains the output of the analysis. Accordingly, it is an empty directory 44 | in a newly installed pySCA distribution. Running the scripts below will 45 | output a processed alignment (\*.fasta or \*.an file) and pickle database 46 | (\*.db file) to output/. Similarly, if you choose to output results to a 47 | MATLAB workspace, the resulting \*.mat file will write to this directory. 48 | figs/ 49 | Contains a few figures that are loaded into the tutorials for illustration 50 | purposes. 51 | docs/ 52 | Contains this documentation. 53 | LICENSE 54 | This work is distributed under the standard BSD 3-clause open source 55 | software license. 56 | README.md 57 | Very basic introduction to the toolbox. 58 | scripts/ 59 | Contains scripts used to generate the input data from the example analyses. 60 | notebooks/ 61 | Contains a set of pySCA examples as Jupyter (formerly IPython) notebooks. 62 | pysca/ 63 | Contains the Python source code for the SCA implementation. 64 | 65 | `bin` Directory 66 | ----------------- 67 | 68 | alnFilterSeqSize, alnParseID, alnReplaceHeaders, alnChangeDelim, alnConvertGI 69 | These aren't essential to the main SCA utilities/package, but are little 70 | scripts that we often find useful in alignment construction. 71 | annotateMSA 72 | A script for adding taxonomic annotations to fasta-formatted sequence 73 | alignments 74 | scaProcessMSA 75 | The script that conducts alignment pre-processing and conditioning. This 76 | constitutes trimming the alignment for gaps, and removing low identity 77 | sequences. 78 | scaCore 79 | The script that computes SCA conservation and co-evolution values. 80 | scaSectorID 81 | The script that defines positions that show a statistically significant 82 | correlation.
83 | 84 | `scripts` Directory 85 | ------------------- 86 | 87 | runAllNBCalcs.sh 88 | A shell script that runs all of the calculations needed for the tutorials. 89 | This script also serves as an example for how to call the pySCA scripts 90 | from the command line. 91 | 92 | `notebooks` Directory 93 | --------------------- 94 | 95 | SCA_DHFR.ipynb 96 | Jupyter (formerly IPython) notebook tutorial for the Dihydrofolate reductase 97 | enzyme family. 98 | SCA_G.ipynb 99 | Jupyter notebook tutorial for the small G proteins. 100 | SCA_betalactamase.ipynb 101 | Jupyter notebook tutorial for the Beta-lactamase enzyme family. 102 | SCA_S1A.ipynb 103 | Jupyter notebook tutorial for the S1A serine protease enzyme family. 104 | 105 | `pysca` Directory 106 | ----------------- 107 | 108 | scaTools.py 109 | Contains the pySCA library - the functions that implement all of the SCA 110 | calculations. 111 | settings.py 112 | Optional configuration file useful for specifying paths instead of having to 113 | do so on the command line. 114 | 115 | 116 | 1. Constructing and annotating a multiple sequence alignment 117 | ============================================================ 118 | 119 | The SCA method operates on a multiple sequence alignment of homologous protein 120 | sequences. You can begin the analysis by obtaining an alignment for your 121 | protein of interest from a curated database (for example PFAM: 122 | http://pfam.xfam.org/ ) or by constructing your own alignment. The details of 123 | alignment construction aren't covered here, but we may add a tutorial in future 124 | versions of this documentation. The critical thing is that the alignment 125 | contain on the order of 100 or more effective sequences. 126 | 127 | Once you have an alignment, it is helpful to add taxonomic annotations to the 128 | headers. These annotations are used in SCA to examine the relationship between 129 | sector positions and phylogenetic divergence (i.e.
in the mapping between 130 | independent components and sequence space). The annotateMSA script contains 131 | two utilities to automate sequence annotation: one which uses the NCBI Entrez 132 | tools in BioPython, and one which uses PFAM database annotations (PFAM 133 | alignment specific). Please note that the annotation step can be slow (on the 134 | order of hours), but only needs to be done once per alignment. For further 135 | details please see the :doc:`/annotateMSA` documentation. 136 | 137 | 2. Alignment pre-processing and conditioning 138 | ============================================ 139 | 140 | Following alignment construction and annotation, the alignment is processed to: 141 | (1) remove highly gapped or low homology sequences, (2) remove highly gapped 142 | positions, (3) calculate sequence weights and (4) to create a mapping of 143 | alignment positions to a reference structure or sequence numbering system. This 144 | process is handled by the script :doc:`/scaProcessMSA`. Please see the script 145 | documentation for a complete list of optional arguments and notes on usage, and 146 | for a full description of computations 1-4, see the Rivoire et al 2016 methods 147 | paper (Box 1). [#Rivoire2016]_ The resulting output can be stored as either a 148 | Python pickle database or MATLAB workspace for further analysis. 149 | 150 | 3. Calculation of the conservation and co-evolution statistics 151 | ============================================================== 152 | 153 | The processed alignment and sequence weights computed in step 2 are then used 154 | in the calculation of evolutionary statistics by the script :doc:`scaCore`. 155 | This script handles the core calculations for: 156 | 157 | 1. Pairwise sequence correlations/sequence similarity 158 | 2. 
Single-site positional conservation from the Kullback-Leibler relative 159 | entropy, :math:`D_i^a`, and position weights from the gradient of the KL 160 | entropy, :math:`\frac{\partial{D_i^a}}{\partial{f_i^a}}`. See eqs. 1-2 161 | in Rivoire, 2016. [#Rivoire2016]_ 162 | 3. The SCA matrix :math:`\tilde{C_{ij}}`. See eq. 3 in Rivoire, 2016. 163 | [#Rivoire2016]_ 164 | 4. The projected alignment (eq. 10-11), and the projector (supplemental 165 | section 1H) [#Rivoire2016]_. 166 | 5. N trials (default N=10) of the randomized SCA matrix and associated 167 | eigenvectors and eigenvalues; used to choose the number of significant 168 | eigenmodes. 169 | 170 | The calculations and optional execution flags are further described in the 171 | script documentation. As for :doc:`scaProcessMSA`, the output can be stored as 172 | either a Python pickle database or MATLAB workspace for further analysis. 173 | 174 | 4. Identifying significant evolutionary correlations 175 | ==================================================== 176 | 177 | After the core calculations are complete, the next step is to define the 178 | significant number of eigenmodes/independent components for analysis 179 | (:math:`k_{max}`) and to select sector positions by their contributions to the 180 | top :math:`k_{max}` independent components. This is handled by the script 181 | :doc:`scaSectorID`. This script also computes the sequence-to-position space 182 | mapping as in eq.10-11 and fig. 7. As for :doc:`scaProcessMSA` and 183 | :doc:`scaCore`, the output can be stored as either a Python pickle database or 184 | MATLAB workspace for further analysis. 185 | 186 | 5.
Interpretation of the results and sector definition 187 | ====================================================== 188 | 189 | Execution of annotateMSA, scaProcessMSA, scaCore, and scaSectorID completes 190 | the calculation of SCA terms and results in a single pickle database (\*.db 191 | file, and optionally, a MATLAB workspace) containing the collected results. The 192 | final step is to interpret these calculations and evaluate the 193 | (non-)independence of the amino acid positions associated with each independent 194 | component (as in Fig. 4). 195 | 196 | The :doc:`tutorials <examples>` are designed to provide examples of this process, 197 | and to illustrate different aspects of SCA usage (please see the individual 198 | tutorial headers for more information). 199 | 200 | 201 | **Further Reading/References:** 202 | 203 | .. [#Halabi2009] Halabi N, Rivoire O, Leibler S, and Ranganathan R. "Protein 204 | sectors: evolutionary units of three-dimensional structure." *Cell.* 2009 205 | v.138 p.774 206 | 207 | .. [#Smock2010] Smock RG, Rivoire O, Russ WP, Swain JF, Leibler S, Ranganathan 208 | R, Gierasch LM. "An interdomain sector mediating allostery in Hsp70 209 | molecular chaperones." *MSB.* 2010 v.6 p.414 210 | 211 | .. [#Reynolds2013] Reynolds KA, Russ WP, Socolich M, Ranganathan R. 212 | "Evolution-based design of proteins." *Methods Enzymol.* 2013 v.523 p.213 213 | 214 | .. [#Rivoire2016] Rivoire, O., Reynolds, K. A., and Ranganathan, R. 215 | Evolution-Based Functional Decomposition of Proteins. *PLOS Computational 216 | Biology* 12, e1004817 (2016). 217 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. pySCA documentation master file, created by 2 | sphinx-quickstart on Mon Jul 15 09:03:26 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive.
5 | 6 | ============================================================================= 7 | Statistical Coupling Analysis in Python. 8 | ============================================================================= 9 | 10 | The Statistical Coupling Analysis (SCA) is an approach for characterizing the 11 | pattern of evolutionary constraints on and between amino acid positions in a 12 | protein family. Given a representative multiple sequence alignment of the 13 | family, the analysis provides methods for quantitatively measuring the overall 14 | functional constraint at each sequence position (the position-specific, or 15 | "first-order" analysis of conservation), and for measuring and analyzing the 16 | coupled functional constraint on all pairs of sequence positions (the 17 | pairwise-correlated, or "second-order" analysis of conservation). The premise 18 | is that extending the traditional definition of conservation to include 19 | correlations between positions will contribute to defining the architecture of 20 | functional interactions between amino acids, and more importantly, help define 21 | the basic physical principles underlying protein structure, function, and 22 | evolution. 23 | 24 | **Please Cite:** 25 | 26 | Rivoire, O., Reynolds, K. A., and Ranganathan, R. Evolution-Based Functional 27 | Decomposition of Proteins. *PLOS Computational Biology* 12, e1004817 (2016). 28 | 29 | I. Installing and Using pySCA 30 | ============================== 31 | 32 | .. toctree:: 33 | :maxdepth: 2 34 | 35 | install 36 | get_started 37 | usage 38 | examples 39 | modules 40 | versions 41 | 42 | II. 
Indices and Tables 43 | ======================= 44 | 45 | * :ref:`genindex` 46 | * :ref:`modindex` 47 | * :ref:`search` 48 | -------------------------------------------------------------------------------- /docs/source/install.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | The following are a set of command-line-centric instructions for installing 6 | pySCA on Linux, Windows, and macOS operating systems. 7 | 8 | 1. Install Dependencies 9 | ======================= 10 | 11 | Choose the set of instructions in this section based on your operating system. 12 | 13 | Linux (Ubuntu 18.04) 14 | -------------------- 15 | 16 | Before installing pySCA, install the following packages from your package 17 | repository: 18 | 19 | 1) Python 3 20 | 2) Pip 21 | 3) GCC 22 | 23 | .. code-block:: bash 24 | 25 | sudo apt-get install python3 python3-pip git gcc 26 | 27 | 28 | Next, download and install `FASTA 29 | `_ from GitHub. 30 | FASTA is needed for the :code:`ggsearch36` function. 31 | 32 | .. code-block:: bash 33 | 34 | git clone https://github.com/wrpearson/fasta36.git 35 | cd fasta36/src 36 | make -j2 -f ../make/Makefile.linux all 37 | sudo cp -r ../bin /usr/local 38 | sudo rm /usr/local/bin/README 39 | cd ../.. 40 | 41 | This will compile and install FASTA in the `/usr/local/bin` directory. This 42 | is already on your system PATH, so programs will be able to find it without 43 | additional configuration. 44 | 45 | Alternatively, instead of :code:`ggsearch36`, one can instead use the 46 | :code:`needle` function from the `EMBOSS software package 47 | `_. 48 | 49 | macOS (10.15) 50 | ~~~~~~~~~~~~~ 51 | 52 | To install the dependencies on macOS, you will need to enable Xcode 53 | (:code:`xcode-select --install`) and install `Homebrew `_, and 54 | unofficial package manager. 55 | 56 | The installation process will involve entering commands in the terminal and 57 | editing text files. 
58 | 59 | For those unfamiliar, to run commands in the terminal: 60 | 61 | 1) Search for "terminal" in the launcher. 62 | 2) Open the "Terminal" application. 63 | 3) Type a command into the terminal. 64 | 4) Hit ENTER. 65 | 66 | Repeat steps 3 and 4 for all the commands you need to run. 67 | 68 | Open your terminal. Search for "terminal" in your launcher's list of 69 | applications. The following are a set of command-line instructions. 70 | 71 | **Xcode** 72 | 73 | Now, installation of dependencies. The first step is to enable macOS developer 74 | tools (i.e. Xcode). If this has not already been installed, run the following 75 | in the terminal:: 76 | 77 | xcode-select --install 78 | 79 | You will be prompted for your password at this step. 80 | 81 | **Homebrew** 82 | 83 | To install Homebrew, run (copied from the Homebrew website) the following in 84 | the terminal:: 85 | 86 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)" 87 | 88 | You will be prompted for your password at this step. 89 | 90 | If you run into permissions errors when installing Homebrew, complaining that 91 | root owns the `/usr/local/` directory, you can change the ownership by 92 | running in the terminal:: 93 | 94 | sudo chown -R <username> /usr/local 95 | 96 | where `<username>` should be substituted with your username. You can view your 97 | username in the right prompt of your open terminal windows (e.g. 98 | `<username>@<hostname>`). If still unsure, type :code:`whoami` into the terminal, which 99 | will print out your username. (Don't use :code:`sudo whoami`.) 100 | 101 | **Python and GCC** 102 | 103 | Once Homebrew is installed, install Python 3 and GCC:: 104 | 105 | brew install gcc python3 106 | 107 | This will install the most recent version of Python and the GCC compiler in 108 | your `/usr/local` directory. To ensure that these programs are used (rather than the 109 | ones packaged with the base system), add the following to your shell rc file 110 | (e.g.
`~/.bashrc` or `~/.zshrc`, depending on your default shell):: 111 | 112 | export CC=/usr/local/bin/gcc-9 113 | export CXX=/usr/local/bin/g++-9 114 | alias gcc=/usr/local/bin/gcc-9 115 | alias g++=/usr/local/bin/g++-9 116 | 117 | alias python=/usr/local/bin/python3 118 | alias python3=/usr/local/bin/python3 119 | alias pip=/usr/local/bin/pip3 120 | alias pip3=/usr/local/bin/pip3 121 | alias python-config=/usr/local/bin/python3-config 122 | alias python3-config=/usr/local/bin/python3-config 123 | 124 | If unsure of your default shell, run :code:`echo $SHELL` in your terminal. If 125 | the output contains :code:`zsh`, you need to edit `~/.zshrc` (note the '.' at 126 | the beginning of the file name), and if the output contains :code:`bash`, edit 127 | `~/.bashrc`. 128 | 129 | To edit these files, you can either invoke a text editor from the terminal, or 130 | you can locate the file in your file explorer and open it in a text editor 131 | there. If you opt to use the terminal for edits, one option is to run:: 132 | 133 | nano ~/.bashrc 134 | 135 | (or :code:`nano ~/.zshrc` as appropriate) 136 | 137 | Go to the bottom of the file and append the block of lines above. Save the 138 | changes and exit the editor. 139 | 140 | Next, you will need to update your shell environment to reflect the changes 141 | made by appending those lines: 142 | 143 | 1) Option 1: Close and re-open the terminal. 144 | 2) Option 2: Run in the terminal:: 145 | 146 | source ~/.bashrc 147 | 148 | Now, you can check the new aliases and environment variables by running in 149 | the terminal:: 150 | 151 | alias 152 | echo $CC 153 | echo $CXX 154 | 155 | In the output, look for lines corresponding to the aliased commands you copied 156 | earlier, namely :code:`python` and :code:`pip`. Make sure they match. 157 | 158 | **FASTA** 159 | 160 | Once the aliases and environment variables are properly set, install FASTA: 161 | 162 | ..
code-block:: bash 163 | 164 | git clone https://github.com/wrpearson/fasta36.git 165 | cd fasta36/src 166 | make -j2 -f ../make/Makefile.os_x86_64 all 167 | cp -r ../bin /usr/local 168 | rm /usr/local/bin/README 169 | cd ../.. 170 | 171 | FASTA will now be installed in the `/usr/local/bin` directory, and the 172 | :code:`ggsearch36` program included in FASTA will now be on the system PATH. 173 | 174 | If the installation fails due to permissions issues caused by root owning 175 | `/usr/local/bin`, you may need to use :code:`sudo` to run the following: 176 | 177 | .. code-block:: bash 178 | 179 | cd fasta36/src 180 | sudo cp -r ../bin /usr/local 181 | sudo rm /usr/local/bin/README 182 | cd ../.. 183 | 184 | You will be prompted for your password at this step. 185 | 186 | Once installed, you can check that :code:`ggsearch36` is up and running by 187 | entering in the terminal:: 188 | 189 | ggsearch36 190 | 191 | This should print out the usage information for :code:`ggsearch36`. 192 | 193 | Windows 10 194 | ---------- 195 | 196 | First, download and install: 197 | 198 | 1) `MSYS2 `_, 199 | 2) `Python 3 `_, and 200 | 3) `Git for Windows `_. 201 | 202 | **IMPORTANT:** When opening Python 3 installer, click the checkbox to make sure 203 | Python is added to your system PATH. For the rest of the prompts, use the 204 | defaults. 205 | 206 | Git for Windows creates both a Bash prompt with Git for version control that 207 | also can access Python 3 installation on the system PATH. Again, it's essential 208 | that Python 3 added there. If not (or if you don't remember), run the Python 3 209 | installer again and make sure. 210 | 211 | MSYS2 is a program for compiling Windows programs using a POSIX compatibility 212 | layer and ported versions of GCC, binutils, and other utilities. Essentially, 213 | one can use it to compile Windows exe's as one would on a Unix operating 214 | system, like macOS or Linux. 215 | 216 | The reason to install it here is to compile FASTA. 
Once MSYS2 is installed, 217 | open it and run:: 218 | 219 | pacman -Syu 220 | 221 | The program will upgrade some core packages and request to be closed. Do so, 222 | re-open it, and once again run:: 223 | 224 | pacman -Syu 225 | 226 | Once the upgrades are complete, install the following packages with the package 227 | manager (:code:`pacman`) by running:: 228 | 229 | pacman -S git vim make gcc 230 | 231 | Now, you can download and install FASTA:: 232 | 233 | git clone https://github.com/wrpearson/fasta36.git 234 | cd fasta36/src 235 | make CC=/usr/bin/gcc LD=/usr/bin/ld -j2 -f ../make/Makefile.linux all 236 | cp -r ../bin /usr/local/ 237 | rm /usr/local/bin/README 238 | cd ../.. 239 | 240 | Now, the :code:`ggsearch36` program, along with the others, is installed in 241 | `/usr/local/bin`. The next step is to add this directory into your Windows 242 | system PATH variable: 243 | 244 | 1. Type 'env' in the start search bar. 245 | 2. Click 'Edit the system environment variables'. 246 | 3. Click on 'Environment Variables...' toward the bottom of the window that 247 | opens. 248 | 4. Select 'Path' in one of the two selection windows (either 'User variables' 249 | or 'System variables' is fine) 250 | 5. Once 'Path' is highlighted, click 'Edit...' 251 | 6. Enter `/usr/local/bin` as a new PATH entry. You can either: 252 | 253 | - Click 'New' in the new window and enter the path to `/usr/local/bin` in 254 | the MSYS2 installation folder (default: `C:\msys64\usr\local\bin`). 255 | - Click the 'Browse...' button and navigate to the `C:\msys64\usr\local\bin` 256 | directory. 257 | 258 | 7. When the new entry is added, click 'OK' on all the opened windows to set all 259 | the changes. You will need to close and re-open terminals for the changes to 260 | be reflected. 261 | 262 | Now, :code:`ggsearch36.exe` will be available to all running programs. 263 | 264 | Finally, launch Git Bash (installed with Git for Windows).
Open the 265 | `~/.bash_profile` file in the text editor, default `vi`, by running :code:`vi 266 | ~/.bash_profile`. In this file add the lines:: 267 | 268 | alias python="winpty python.exe" 269 | alias pip="winpty pip.exe" 270 | 271 | Close the terminal and open it again. Now, you will be able to invoke the 272 | Python REPL and pip from the Git Bash prompt. Also, if the PATH variable was 273 | properly updated to contain the `/usr/local/bin` folder from the FASTA step, 274 | :code:`ggsearch36.exe` will also be available in Git Bash. 275 | 276 | You will use the Git Bash prompt to download and install pySCA. 277 | 278 | 2. Other Dependencies 279 | ================================ 280 | 281 | The following steps are optional but highly recommended: 282 | 283 | 1) `PFAM annotations (click to download) 284 | `_ - 285 | the file `pfamseq.txt` contains phylogenetic annotations for PFAM sequences. 286 | This is necessary if you would like to annotate PFAM alignments with 287 | taxonomic/phylogenetic information using the :code:`annotateMSA` script 288 | provided by pySCA. The file is available from the PFAM FTP site in 289 | compressed (\*.gz) format. Just be aware that the file is quite large (~10 290 | Gb download, ~30 Gb decompressed), so check beforehand that you have 291 | available space on your hard drive. 292 | 293 | 2) `PyMol `_ - necessary if you would like to use pySCA's 294 | automated structure mapping scripts, and useful for mapping the sectors to 295 | structure in general. 296 | 297 | The version of the code provided in the linked website requires a paid 298 | license. 
For an open-source (free) build of the code, albeit with some 299 | functionality removed, you can follow the following instructions: 300 | 301 | **Linux** - install from your distribution package repository 302 | 303 | **macOS** - run in the terminal:: 304 | 305 | brew cask install xquartz 306 | brew install brewsci/bio/pymol 307 | 308 | **Windows** - look for `online builds 309 | `_ 310 | 311 | 312 | 3) `mpld3 `_ - a package that allows more 313 | interactive plot visualization in Jupyter notebooks. If you choose not to 314 | install this (optional) package, you will need to comment out the 315 | `import mpld3` lines at the beginning of the tutorials. To install, run in the terminal:: 316 | 317 | pip install mpld3 318 | 319 | 320 | 3. Download Code 321 | ================ 322 | 323 | The pySCA package, tutorials, and associated scripts are available for download 324 | from the `GitHub repository `_. There 325 | are several options for doing so. 326 | 327 | A. Use Git 328 | ---------- 329 | 330 | If you have :code:`git` installed on your system, you can use it to clone the 331 | repository from GitHub. From the command line, run: 332 | 333 | .. code-block:: bash 334 | 335 | git clone https://github.com/ranganathanlab/pySCA.git 336 | 337 | For development and troubleshooting purposes, using Git is preferred. 338 | 339 | The code will now be downloaded in a directory called `pySCA`. 340 | 341 | B. (OR) Download from the Website 342 | --------------------------------- 343 | 344 | Though not recommended, you can also download the source code from the GitHub 345 | website. Click the green "Clone or download" tab pictured below to obtain the 346 | latest code. 347 | 348 | .. image:: _static/github-download-screenshot.png 349 | 350 | In the event that you need older versions of the code, you can use the 351 | `releases `_ tab on the 352 | GitHub page to download older tagged version. 353 | 354 | 4. 
(OPTIONAL) Modify Settings 355 | ============================= 356 | 357 | Before installing pySCA, for your convenience, you may specify default paths in 358 | the `settings.py` file found in the `pysca` directory of the pySCA codebase. 359 | Setting these is optional; not doing so simply means having to set a few 360 | command line options when running the code later. 361 | 362 | :path2pfamseq: location of the `pfamseq.txt` text file (default: 363 | `pfamseq.txt`). Use an absolute path to specify location. 364 | 365 | :path2pfamseqdb: location of the `pfamseq.db` SQLite database (default: 366 | `pfamseq.db`) --- the database is generated by the `getPfamDB.sh` 367 | script and is much faster to process than the text file. 368 | 369 | :path2structures: location of your PDB structures for analysis (default: `.`). 370 | This variable should be set to the absolute path of the 371 | directory where you store all your PDB structures. 372 | 373 | :path2output: name of the directory in which to write the SCA results (default: 374 | `output/`) 375 | 376 | :path2pymol: path to PyMOL executable. If unset -- the default -- pySCA will 377 | look for PyMOL in the system PATH. This variable will only need to 378 | be set if PyMOL is installed in an exotic location and cannot be 379 | started by simply running :code:`pymol` in the terminal. 380 | 381 | If you ever want to change these variables at a later time, edit the 382 | `settings.py` file and then **re-install** pySCA. Follow the installation 383 | procedure in the next step. 384 | 385 | 5. Install pySCA 386 | ================ 387 | 388 | The processing scripts found in the `bin/` directory and the SCA toolbox in 389 | `pysca/` can now be installed. To install them system-wide, go to the base of 390 | the repository (i.e. the `pySCA/` directory downloaded by Git) and run in the 391 | terminal: 392 | 393 | .. code-block:: bash 394 | 395 | pip install . 396 | 397 | Note the '.' at the end.
Don't omit it --- it tells :code:`pip` to look in the 398 | current directory for configuration instructions. 399 | 400 | Pip will first install python package dependencies: 401 | 402 | 1) Numpy 403 | 2) Scipy 404 | 3) Argparse 405 | 4) Wheel 406 | 5) Matplotlib 407 | 408 | Then, it installs the pySCA code itself. 409 | 410 | *If and only if you run into permissions errors*, two options are to either: 411 | 412 | **A. Install pySCA locally** 413 | 414 | To install pySCA in your user directory (and without root privileges), run in 415 | the terminal:: 416 | 417 | pip install --user . 418 | 419 | This option is useful if you are working on a system where you do not have 420 | administrator access. 421 | 422 | Note that to use locally installed scripts, the installation directory needs to 423 | be in the system PATH. To check whether that is the case, run:: 424 | 425 | echo $PATH | grep --color=auto "$(python -m site --user-base)/bin" 426 | 427 | If the installation directory is highlighted in the output, then the PATH is 428 | configured correctly. If it is not found, then it needs to be added manually. 429 | Open your shell configuration file (e.g. `~/.bashrc`) and add the directory to the 430 | PATH variable by appending the following line:: 431 | 432 | export PATH="$HOME/.local/bin:$PATH" 433 | 434 | The exact path (the text preceding the colon) may differ on your system, 435 | but it can easily be found by running `echo $(python -m site --user-base)/bin`. 436 | 437 | **OR B. Install pySCA globally as root** 438 | 439 | To install pySCA system-wide, run (as root/administrator):: 440 | 441 | sudo pip install . 442 | 443 | This will obviate any need to mess around with local PATH variables, and pySCA 444 | will be accessible to all users on the system. 445 | 446 | Now, with the pySCA code installed, each of the commands found in bin/ can now 447 | be run from the command line. 448 | 449 | 450 | 6.
Getting Started and Running the Tutorials 451 | ============================================ 452 | 453 | The :doc:`"getting started" <get_started>` section of this documentation 454 | provides instructions on how to run some initial calculations and the 455 | tutorials. The basic idea behind the pySCA code is that the core calculations 456 | are performed using a series of executable Python scripts, and then the results 457 | can be loaded and analyzed/visualized using a Jupyter notebook (or 458 | alternatively, MATLAB). 459 | 460 | All of the tutorials are provided as Jupyter notebooks. For more on 461 | how Jupyter notebooks work, see: `<https://jupyter.org/>`_. Prior to running the 462 | notebook tutorials, you'll need to run the core calculation scripts that 463 | generate the input for the notebooks. One way to do this is with the shell 464 | script "runAllNBCalcs.sh", and there is more information on this in the 465 | :doc:`"getting started" <get_started>` section. Once the calculations are 466 | completed, you can begin the tutorial in interactive Python from the command 467 | line as described below. 468 | 469 | To install Jupyter, run: 470 | 471 | .. code-block:: bash 472 | 473 | pip install jupyterlab 474 | 475 | 476 | You can then open the notebooks from the command line by running: 477 | 478 | .. code-block:: bash 479 | 480 | jupyter notebook 481 | 482 | 483 | .. **Important:** The :code:`ggsearch36`, :code:`needle`, and :code:`pymol` 484 | .. programs need to be on the system PATH. 485 | .. 486 | .. To view your system PATH, run in the terminal:: 487 | .. 488 | .. echo $PATH 489 | .. 490 | .. To add directories containing the required programs to your system path, you 491 | .. will need to edit your shell configuration file (e.g. `.bashrc` or 492 | .. `.bash_profile`) found at the base of your user directory. To add a directory 493 | .. to the system PATH, open up the file and append the line:: 494 | .. 495 | .. export PATH="$PATH:" 496 | .. 497 | ..
where `` is replaced with the path to the directory 498 | .. containing a program you wish to add (e.g. `~/.local/bin`). After saving the 499 | .. changes, new terminals will use the updated PATH. 500 | .. 501 | .. **Important:** To add an already-installed program to the PATH, run:: 502 | .. 503 | .. $ whereis <program> 504 | .. 505 | .. to find where `<program>` (e.g. :code:`pymol`) is located, and add its 506 | .. directory to the system PATH in the manner described above. 507 | .. 508 | .. **Important:** Your requirements will vary depending on the size of your 509 | .. sequence alignments, but as a rule of thumb, the toolbox is best used on a 510 | .. system with at least 8 GB of RAM. pySCA may run with less, but there will be a 511 | .. greater risk when using modestly-sized multiple sequence alignments of 512 | .. processes using more memory than available and subsequently getting killed by 513 | .. the operating system's scheduler. 514 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | The pySCA Code 3 | ============== 4 | 5 | .. toctree:: 6 | :maxdepth: 4 7 | 8 | annotateMSA 9 | scaProcessMSA 10 | scaCore 11 | scaSectorID 12 | scaTools 13 | -------------------------------------------------------------------------------- /docs/source/scaCore.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | scaCore 3 | ======= 4 | 5 | .. automodule:: scaCore 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | -------------------------------------------------------------------------------- /docs/source/scaProcessMSA.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | scaProcessMSA 3 | ============= 4 | 5 | ..
automodule:: scaProcessMSA 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | -------------------------------------------------------------------------------- /docs/source/scaSectorID.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | scaSectorID 3 | =========== 4 | 5 | .. automodule:: scaSectorID 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | -------------------------------------------------------------------------------- /docs/source/scaTools.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | scaTools 3 | ======== 4 | 5 | .. automodule:: scaTools 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | -------------------------------------------------------------------------------- /docs/source/usage.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | Usage 3 | ===== 4 | 5 | We provide tutorials that walk through the process of sector identification for 6 | three protein families: the Ras-like small G-proteins, the metabolic enzyme 7 | Dihydrofolate Reductase (DHFR), and the antibiotic resistance enzyme 8 | Beta-lactamase. 9 | 10 | To run the SCA calculations for all three examples, you can execute the 11 | following shell script from the scripts/ directory:: 12 | 13 | ./runAllNBCalcs.sh 14 | 15 | For each example, this will generate the following outputs in the output/ 16 | directory: 17 | 18 | 1. A pickle database (\*.db file) that contains the results of the 19 | calculations (these are then read in and analyzed in the IPython 20 | notebooks - \*.ipynb) 21 | 2. A \*.log file that provides some information about the analysis 22 | 3. A processed alignment (\*.fasta file) resulting from the 23 | scaProcessMSA script. 
24 | 25 | Following this step, you can begin the tutorial as an interactive Jupyter 26 | notebook from the command line as follows:: 27 | 28 | jupyter notebook SCA_G.ipynb 29 | 30 | This should open the notebook in a browser window, where you can run the code, 31 | and examine the SCA results. 32 | -------------------------------------------------------------------------------- /docs/source/versions.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Distributions 3 | ============= 4 | 5 | Previous versions of SCA were implemented as MATLAB toolboxes and contain 6 | various accessory codes for data formatting, display, and analysis. 7 | 8 | :SCA Toolbox 1.5: 9 | The original SCA method as specified in Lockless and Ranganathan (4) with one 10 | modification that was used in all subsequent papers: the division of binomial 11 | probabilities by the mean probability of amino acids in the alignment is 12 | removed. This version is no longer in active use. 13 | 14 | :SCA Toolbox 2.5: 15 | The bootstrap-based approach for SCA. Position-specific conservation 16 | calculated as in Eq. (4) and correlations calculated as in Eq. (9). Matrix 17 | reduction per Eq. (32). 18 | 19 | :SCA Toolbox 3.0: 20 | The analytical calculation of correlations weighted by gradients of relative 21 | entropy. Position-specific conservation calculated as in Eq. (4) and 22 | correlations calculated as in Eq. (9)-(33). For non-binarized alignments, 23 | matrix reduction is per Eq. (32). 24 | 25 | :SCA Toolbox 4.0: 26 | Analytical calculations as in v3.0, but now including sector identification 27 | methods as described in Ref. (2). 28 | 29 | :SCA Toolbox 5.0: 30 | Calculation of positional and sequence correlation matrices by the alignment 31 | projection method as per Eq. (19) and Eq. (20), and calculation of the 32 | mapping between them as per Eq. (21).
Includes methods for sector identification and 33 | exploring relationships between positional and sequence correlations. 34 | 35 | :SCA Toolbox 6.0: 36 | Calculation of first-order and second-order statistics for positional amino 37 | acid frequencies using sequences weighted by similarity in the multiple 38 | sequence alignment. 39 | 40 | :SCA Toolbox 6.1: 41 | Port from Python 2 to Python 3. Also includes updated annotation scripts, 42 | changes to the command-line interface, and the option to install analysis 43 | scripts as system-wide executables. 44 | 45 | To obtain previous distributions, please contact Dr. Rama Ranganathan. 46 | -------------------------------------------------------------------------------- /figs/BLactamase_sec_hier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/figs/BLactamase_sec_hier.png -------------------------------------------------------------------------------- /figs/DHFR_decompv2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/figs/DHFR_decompv2.png -------------------------------------------------------------------------------- /figs/DHFR_sec_hier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/figs/DHFR_sec_hier.png -------------------------------------------------------------------------------- /figs/Gprot_sec_hier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/figs/Gprot_sec_hier.png -------------------------------------------------------------------------------- /figs/Gprot_secstruct.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/figs/Gprot_secstruct.png -------------------------------------------------------------------------------- /pysca/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/pysca/__init__.py -------------------------------------------------------------------------------- /pysca/settings.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | """ Global settings for pySCA. """ 3 | 4 | # 5 | # PATHS 6 | # 7 | # These have to be changed to be consistent with user-defined paths. This 8 | # script is tested against the `runAllNBCalcs.sh` scripts, and because the 9 | # script includes a `cd ../` command before running any of the python scripts, 10 | # the base directory is the root of the repository. 11 | # 12 | 13 | # Enter absolute path (e.g. /home//pfamseq.txt) to the file 'pfamseq.txt' 14 | # from 15 | # ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/database_files/ 16 | # and/or the SQLite database `pfamseq.db` if it exists. 17 | path2pfamseq = "pfamseq.txt" # replace with absolute path to pfamseq.txt 18 | path2pfamseqdb = ( 19 | "pfamseq.db" # replace with absolute path to pfamseq.db (if present) 20 | ) 21 | 22 | # the location of your PDB structures 23 | path2structures = ( 24 | "." # replace with absolute path to directory of PDB structures 25 | ) 26 | 27 | # Also assumes that a folder named 'output/' is in the path. Change to '.' if 28 | # you want results printed in the current working directory by default. 29 | path2output = "output/" 30 | 31 | # Used for pulling species, taxonomy annotations from ncbi database. PLEASE 32 | # change to your own your email!!! 
33 | entrezemail = "your.email@youruniversity.edu" 34 | 35 | # If you are using a version of PyMOL not intalled in your system PATH, you can 36 | # add the path here. Use an absolute path to the PyMOL executable, or leave 37 | # empty to use PyMOL on the system PATH. 38 | path2pymol = "" 39 | -------------------------------------------------------------------------------- /scripts/getPfamDB.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -eu 3 | 4 | # 5 | # The Pfam annotation script is much, much faster when using a database instead 6 | # of iterating over a 20 GB text file line by line. This script is intended to 7 | # download the text file and convert it into a SQLite3 database. 8 | # 9 | # I recommend running this overnight when you aren't using your computer. 10 | # SQLite3 has to create key-value pairs for over 40 million sequences, and it 11 | # is VERY, VERY slow. 12 | # 13 | # Dependencies: wget, sqlite3, awk, and gzip or pigz 14 | # 15 | 16 | # 17 | # Globals 18 | # 19 | 20 | pfamurl="ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files" 21 | pfamheaders="pfamseq.sql" 22 | pfamdata="pfamseq.txt" 23 | pfamdb="pfamseq.db" 24 | 25 | gzip=gzip # replace this value with whatever gzip compression tool you use 26 | 27 | 28 | # 29 | # Download and extract the data 30 | # 31 | 32 | echo "Downloading the Pfam database files and generate a SQLite3 database." 33 | echo "Requires ~90 GB of free storage and could take several hours." 34 | 35 | echo "Downloading the Pfam annotated sequence data." 36 | 37 | wget -Nc "${pfamurl}/${pfamheaders}.gz" 38 | wget -Nc "${pfamurl}/${pfamdata}.gz" 39 | echo "Got 'em." 40 | 41 | echo "Decompress the gzipped files." 42 | echo "This will take a while." 43 | if test "$(command -v ${gzip})"; then 44 | ${gzip} -vd "${pfamheaders}.gz" 45 | ${gzip} -vd "${pfamdata}.gz" 46 | else 47 | echo "${gzip} not found. Exiting." 48 | exit 3 49 | fi 50 | echo "Done!" 
51 | 52 | 53 | # 54 | # Create the database 55 | # 56 | 57 | # The SQL dump on the server is for MySQL (MariaDB), so it needs to be 58 | # converted to a format compatible with SQLite3. 59 | 60 | echo "Converting the MySQL dump to SQLite3." 61 | git clone --depth 1 https://github.com/dumblob/mysql2sqlite.git 62 | ./mysql2sqlite/mysql2sqlite "${pfamheaders}" | sqlite3 "${pfamdb}" 63 | rm -rf mysql2sqlite 64 | 65 | echo "Importing data." 66 | sqlite3 -batch "${pfamdb}" << "EOF" 67 | .separator "\t" 68 | .import pfamseq.txt pfamseq 69 | EOF 70 | echo "Done!" 71 | -------------------------------------------------------------------------------- /scripts/rstZipFixUrl.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -eu 3 | 4 | # 5 | # This script is intended to change the URLs and file names of the zipped 6 | # output from a Jupyter notebook (see 'Download as rst' option). By default, 7 | # the image names are 'output_\*.png', with corresponding URLs in the RST 8 | # file, which will cause naming clashes when including multiple notebooks, each 9 | # with different images, to the _static folder. 10 | # 11 | # This script will take the filename of the zip file, extract its contents, 12 | # rename the images from 'output' to '<filename>', and update the URLs in the 13 | # RST files. 14 | # 15 | # Input: 16 | # - zip file containing RST and image from Jupyter notebook 17 | # Output: 18 | # - directory containing RST file and images with updated URLs 19 | # 20 | # Usage: 21 | # ./rstZipFixUrl.sh <zipfile> 22 | # 23 | 24 | docsdir="../docs/source" 25 | docsstaticdir="_static" 26 | 27 | filename=$(basename ${1%.*}) 28 | extension=${1##*.} 29 | 30 | if [[ "${extension}" != "zip" ]]; then 31 | echo "ERROR: Input is not a zipped archive."
32 | exit 3 33 | fi 34 | 35 | tmpdir=tmp_${filename} 36 | 37 | mkdir -p ${tmpdir} 38 | cd ${tmpdir} 39 | 40 | unzip ../${1} 41 | 42 | sed -i "s,output_\([0-9_]\+\).png,${docsstaticdir}/${filename}_\1.png,g" ${filename}.rst 43 | sed -i "s,^\.\. code:: ipython3,\.\. code:: python3,g" ${filename}.rst 44 | for png in *.png; do 45 | newpng=$(echo ${png} | sed -e "s/output_\([0-9_]\+\).png/${filename}_\1.png/g") 46 | mv ${png} ${newpng} 47 | done 48 | 49 | cd ../ 50 | 51 | mv ${tmpdir}/${filename}.rst ${docsdir}/ 52 | mv ${tmpdir}/${filename}_*.png ${docsdir}/${docsstaticdir}/ 53 | 54 | rmdir ${tmpdir} 55 | -------------------------------------------------------------------------------- /scripts/runAllNBCalcs.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -eu 3 | 4 | # Globals 5 | 6 | datadir=data 7 | outputdir=output 8 | 9 | datarepo="https://github.com/ranganathanlab/pySCA-data" 10 | version=6.1 11 | 12 | # Download the data 13 | 14 | cd ../ 15 | 16 | # In the event git is not installed, just directly download the data from 17 | # GitHub using wget or curl (in order of preference). Also, check to see if tar 18 | # is installed. If not, download the zipped archive. 19 | if [ -x "$(command -v git)" ] && [ -d ".git/" ]; then 20 | git submodule init 21 | git submodule update --force 22 | elif [ -x "$(command -v wget)" ]; then 23 | echo "git not installed --- trying wget" 24 | if [ -x "$(command -v tar)" ]; then 25 | wget -nc ${datarepo}/archive/v${version}.tar.gz 26 | tar xf v${version}.tar.gz 27 | elif [ -x "$(command -v unzip)" ]; then 28 | wget -nc ${datarepo}/archive/v${version}.zip 29 | unzip v${version}.zip 30 | else 31 | echo "'unzip' or 'tar' (with gzip) is required for decompressing data." 
32 | exit 3 33 | fi 34 | mkdir -p ${datadir} 35 | mv -v pySCA-data-${version}/* ${datadir}/ 36 | rm -rvf pySCA-data-${version} 37 | elif [ -x "$(command -v curl)" ]; then 38 | echo "git not installed --- trying curl" 39 | if [ -x "$(command -v tar)" ]; then 40 | curl -L -O -C - ${datarepo}/archive/v${version}.tar.gz 41 | tar xf v${version}.tar.gz 42 | elif [ -x "$(command -v unzip)" ]; then 43 | curl -L -O -C - ${datarepo}/archive/v${version}.zip 44 | unzip v${version}.zip 45 | else 46 | echo "'unzip' or 'tar' (with gzip) is required for decompressing data." 47 | exit 3 48 | fi 49 | mkdir -p ${datadir} 50 | mv -v pySCA-data-${version}/* ${datadir}/ 51 | rm -rvf pySCA-data-${version} 52 | fi 53 | 54 | # Generate the output files 55 | 56 | mkdir -vp ${outputdir} 57 | 58 | # The S1A serine proteases 59 | echo "S1A serine protease Calculations:" | tee ${outputdir}/s1A_halabi.log 60 | scaProcessMSA \ 61 | -a ${datadir}/s1Ahalabi_1470_nosnakes.an \ 62 | -b ${datadir} \ 63 | -s 3TGI \ 64 | -c E \ 65 | -d ${outputdir} \ 66 | -t -n 2>&1 | tee -a ${outputdir}/s1A_halabi.log 67 | scaCore -i ${outputdir}/s1Ahalabi_1470_nosnakes.db 2>&1 | \ 68 | tee -a ${outputdir}/s1A_halabi.log 69 | scaSectorID -i ${outputdir}/s1Ahalabi_1470_nosnakes.db 2>&1 | \ 70 | tee -a ${outputdir}/s1A_halabi.log 71 | echo 72 | 73 | # Beta-lactamase 74 | echo "Beta-lactamase Calculations:" | tee ${outputdir}/PF13354.log 75 | scaProcessMSA \ 76 | -a ${datadir}/PF13354_full.an \ 77 | -b ${datadir} \ 78 | -s 1FQG \ 79 | -c A \ 80 | -d ${outputdir} \ 81 | -f 'Escherichia coli' \ 82 | -t -n 2>&1 | tee -a ${outputdir}/PF13354.log 83 | scaCore -i ${outputdir}/PF13354_full.db 2>&1 | \ 84 | tee -a ${outputdir}/PF13354.log 85 | scaSectorID -i ${outputdir}/PF13354_full.db 2>&1 | \ 86 | tee -a ${outputdir}/PF13354.log 87 | echo 88 | 89 | # G-protein - this analysis is run with two alignments - the full Pfam 90 | # alignment (PF00071_full) and the Pfam alignment filtered to remove several 91 | # N-terminal truncation 
mutants. PF00071_rd2 is the alignment discussed in the 92 | # manuscript. 93 | echo "G-protein calculations:" | tee ${outputdir}/PF00071.log 94 | scaProcessMSA \ 95 | -a ${datadir}/PF00071_full.an \ 96 | -b ${datadir} \ 97 | -s 5P21 \ 98 | -c A \ 99 | -d ${outputdir} \ 100 | -f 'Homo sapiens' \ 101 | -t -n 2>&1 | tee -a ${outputdir}/PF00071.log 102 | scaCore -i ${outputdir}/PF00071_full.db 2>&1 | \ 103 | tee -a ${outputdir}/PF00071.log 104 | scaSectorID -i ${outputdir}/PF00071_full.db 2>&1 | \ 105 | tee -a ${outputdir}/PF00071.log 106 | echo 107 | 108 | echo "G-protein calculations:" | tee ${outputdir}/PF00071_rd2.log 109 | scaProcessMSA \ 110 | -a ${datadir}/PF00071_rd2.an \ 111 | -b ${datadir} \ 112 | -s 5P21 \ 113 | -c A \ 114 | -d ${outputdir} \ 115 | -f 'Homo sapiens' \ 116 | -t -n 2>&1 | tee -a ${outputdir}/PF00071_rd2.log 117 | scaCore -i ${outputdir}/PF00071_rd2.db 2>&1 | \ 118 | tee -a ${outputdir}/PF00071_rd2.log 119 | scaSectorID -i ${outputdir}/PF00071_rd2.db 2>&1 | \ 120 | tee -a ${outputdir}/PF00071_rd2.log 121 | echo 122 | 123 | # DHFR - this analysis is also run with two alignments for comparison - 124 | # the full PFAM alignment (PF00186_full.an) and a manually curated alignment 125 | # (DHFR_PEPM3.an) 126 | echo "DHFR Calculations:" | tee ${outputdir}/PF00186.log 127 | scaProcessMSA \ 128 | -a ${datadir}/PF00186_full.an \ 129 | -b ${datadir} \ 130 | -s 1RX2 \ 131 | -c A \ 132 | -d ${outputdir} \ 133 | -f 'Escherichia coli' \ 134 | -t -n 2>&1 | tee -a ${outputdir}/PF00186.log 135 | scaCore -i ${outputdir}/PF00186_full.db 2>&1 | \ 136 | tee -a ${outputdir}/PF00186.log 137 | scaSectorID -i ${outputdir}/PF00186_full.db 2>&1 | \ 138 | tee -a ${outputdir}/PF00186.log 139 | echo 140 | 141 | echo "DHFR Calculations:" | tee ${outputdir}/DHFR_PEPM3.log 142 | scaProcessMSA \ 143 | -a ${datadir}/DHFR_PEPM3.an \ 144 | -b ${datadir} \ 145 | -s 1RX2 \ 146 | -c A \ 147 | -d ${outputdir} \ 148 | -t -n 2>&1 | tee -a ${outputdir}/DHFR_PEPM3.log 149 | scaCore -i
${outputdir}/DHFR_PEPM3.db 2>&1 | \ 150 | tee -a ${outputdir}/DHFR_PEPM3.log 151 | scaSectorID -i ${outputdir}/DHFR_PEPM3.db 2>&1 | \ 152 | tee -a ${outputdir}/DHFR_PEPM3.log 153 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | """ Installation and setup for pySCA""" 3 | 4 | from setuptools import setup 5 | 6 | setup( 7 | name="pySCA", 8 | version="6.1", 9 | author="Olivier Rivoire, Rama Ranganathan, and Kimberly Reynolds", 10 | maintainer="Ansel George", 11 | packages=["pysca"], 12 | package_data={"pysca": ["settings.py"]}, 13 | description="Python 3 implementation of Statistical Coupling Analysis (SCA)", 14 | url="https://ranganathanlab.gitlab.io/pySCA", 15 | download_url="https://github.com/ranganathanlab/pySCA", 16 | long_description=open("README.md", "r").read(), 17 | install_requires=[ 18 | "biopython", 19 | "numpy", 20 | "scipy", 21 | "argparse", 22 | "wheel", 23 | "matplotlib", 24 | ], 25 | scripts=[ 26 | "bin/alnChangeDelim", 27 | "bin/alnFilterSeqSize", 28 | "bin/alnParseID", 29 | "bin/annotateMSA", 30 | "bin/scaProcessMSA", 31 | "bin/alnConvertGI", 32 | "bin/alnReplaceHeaders", 33 | "bin/scaCore", 34 | "bin/scaSectorID", 35 | ], 36 | ) 37 | --------------------------------------------------------------------------------