├── .gitignore
├── .gitlab-ci.yml
├── .gitmodules
├── LICENSE
├── README.md
├── VERSIONS.rst
├── bin
    ├── alnChangeDelim
    ├── alnConvertGI
    ├── alnFilterSeqSize
    ├── alnParseID
    ├── alnReplaceHeaders
    ├── annotateMSA
    ├── scaCore
    ├── scaProcessMSA
    └── scaSectorID
├── docs
    ├── Makefile
    ├── make.bat
    └── source
    │   ├── SCA_DHFR.rst
    │   ├── SCA_G.rst
    │   ├── SCA_S1A.rst
    │   ├── SCA_betalactamase.rst
    │   ├── _static
    │       ├── BLactamase_sec_hier.png
    │       ├── DHFR_decompv2.png
    │       ├── DHFR_sec_hier.png
    │       ├── Gprot_sec_hier.png
    │       ├── Gprot_secstruct.png
    │       ├── SCA_DHFR_13_0.png
    │       ├── SCA_DHFR_16_1.png
    │       ├── SCA_DHFR_20_0.png
    │       ├── SCA_DHFR_22_0.png
    │       ├── SCA_DHFR_26_0.png
    │       ├── SCA_DHFR_29_0.png
    │       ├── SCA_DHFR_30_0.png
    │       ├── SCA_DHFR_7_0.png
    │       ├── SCA_G_17_0.png
    │       ├── SCA_G_21_0.png
    │       ├── SCA_G_24_1.png
    │       ├── SCA_G_26_1.png
    │       ├── SCA_G_28_0.png
    │       ├── SCA_G_31_1.png
    │       ├── SCA_G_33_0.png
    │       ├── SCA_G_35_0.png
    │       ├── SCA_G_37_0.png
    │       ├── SCA_G_42_0.png
    │       ├── SCA_G_44_0.png
    │       ├── SCA_G_9_0.png
    │       ├── SCA_S1A_17_0.png
    │       ├── SCA_S1A_20_0.png
    │       ├── SCA_S1A_23_1.png
    │       ├── SCA_S1A_25_1.png
    │       ├── SCA_S1A_27_0.png
    │       ├── SCA_S1A_30_1.png
    │       ├── SCA_S1A_32_0.png
    │       ├── SCA_S1A_38_0.png
    │       ├── SCA_S1A_41_0.png
    │       ├── SCA_S1A_9_0.png
    │       ├── SCA_betalactamase_10_0.png
    │       ├── SCA_betalactamase_16_0.png
    │       ├── SCA_betalactamase_19_1.png
    │       ├── SCA_betalactamase_21_1.png
    │       ├── SCA_betalactamase_23_0.png
    │       ├── SCA_betalactamase_28_0.png
    │       ├── SCA_betalactamase_30_0.png
    │       ├── SCA_betalactamase_8_1.png
    │       ├── favicon.ico
    │       ├── github-download-screenshot.png
    │       └── logo.png
    │   ├── annotateMSA.rst
    │   ├── conf.py
    │   ├── examples.rst
    │   ├── get_started.rst
    │   ├── index.rst
    │   ├── install.rst
    │   ├── modules.rst
    │   ├── scaCore.rst
    │   ├── scaProcessMSA.rst
    │   ├── scaSectorID.rst
    │   ├── scaTools.rst
    │   ├── usage.rst
    │   └── versions.rst
├── figs
    ├── BLactamase_sec_hier.png
    ├── DHFR_decompv2.png
    ├── DHFR_sec_hier.png
    ├── Gprot_sec_hier.png
    └── Gprot_secstruct.png
├── notebooks
    ├── SCA_DHFR.ipynb
    ├── SCA_G.ipynb
    ├── SCA_S1A.ipynb
    └── SCA_betalactamase.ipynb
├── pysca
    ├── __init__.py
    ├── scaTools.py
    └── settings.py
├── scripts
    ├── getPfamDB.sh
    ├── rstZipFixUrl.sh
    └── runAllNBCalcs.sh
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | *.egg-info
  2 | *.zip
  3 | *.xz
  4 | *.gz
  5 | .listing
  6 | *.sw[ap]
  7 | *.DS_Store
  8 | *~
  9 | 
 10 | docs/build
 11 | output/
 12 | *.bak
 13 | 
 14 | # Byte-compiled / optimized / DLL files
 15 | __pycache__/
 16 | *.py[cod]
 17 | *$py.class
 18 | 
 19 | # C extensions
 20 | *.so
 21 | 
 22 | # Distribution / packaging
 23 | .Python
 24 | build/
 25 | develop-eggs/
 26 | dist/
 27 | downloads/
 28 | eggs/
 29 | .eggs/
 30 | lib/
 31 | lib64/
 32 | parts/
 33 | sdist/
 34 | var/
 35 | wheels/
 36 | pip-wheel-metadata/
 37 | share/python-wheels/
 38 | *.egg-info/
 39 | .installed.cfg
 40 | *.egg
 41 | MANIFEST
 42 | 
 43 | # PyInstaller
 44 | #  Usually these files are written by a python script from a template
 45 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 46 | *.manifest
 47 | *.spec
 48 | 
 49 | # Installer logs
 50 | pip-log.txt
 51 | pip-delete-this-directory.txt
 52 | 
 53 | # Unit test / coverage reports
 54 | htmlcov/
 55 | .tox/
 56 | .nox/
 57 | .coverage
 58 | .coverage.*
 59 | .cache
 60 | nosetests.xml
 61 | coverage.xml
 62 | *.cover
 63 | .hypothesis/
 64 | .pytest_cache/
 65 | 
 66 | # Translations
 67 | *.mo
 68 | *.pot
 69 | 
 70 | # Django stuff:
 71 | *.log
 72 | local_settings.py
 73 | db.sqlite3
 74 | db.sqlite3-journal
 75 | 
 76 | # Flask stuff:
 77 | instance/
 78 | .webassets-cache
 79 | 
 80 | # Scrapy stuff:
 81 | .scrapy
 82 | 
 83 | # Sphinx documentation
 84 | docs/_build/
 85 | 
 86 | # PyBuilder
 87 | target/
 88 | 
 89 | # Jupyter Notebook
 90 | .ipynb_checkpoints
 91 | 
 92 | # IPython
 93 | profile_default/
 94 | ipython_config.py
 95 | 
 96 | # pyenv
 97 | .python-version
 98 | 
 99 | # pipenv
100 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
102 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
103 | #   install all needed dependencies.
104 | #Pipfile.lock
105 | 
106 | # celery beat schedule file
107 | celerybeat-schedule
108 | 
109 | # SageMath parsed files
110 | *.sage.py
111 | 
112 | # Environments
113 | .env
114 | .venv
115 | env/
116 | venv/
117 | ENV/
118 | env.bak/
119 | venv.bak/
120 | 
121 | # Spyder project settings
122 | .spyderproject
123 | .spyproject
124 | 
125 | # Rope project settings
126 | .ropeproject
127 | 
128 | # mkdocs documentation
129 | /site
130 | 
131 | # mypy
132 | .mypy_cache/
133 | .dmypy.json
134 | dmypy.json
135 | 
136 | # Pyre type checker
137 | .pyre/
138 | 


--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
 1 | image: debian:bullseye-slim
 2 | 
 3 | pages:
 4 |   script:
 5 |     - apt-get update
 6 |     - apt-get upgrade -y
 7 |     - apt-get install -y python3 python3-numpy python3-scipy python3-biopython python3-pip python3-matplotlib python3-sphinx python3-sphinx-rtd-theme
 8 |     - apt-get install -y make
 9 |     - pip3 install .
10 |     - mkdir -p docs/modules
11 |     - for file in bin/*; do cp "${file}" "docs/modules/`basename $file`.py"; done
12 |     - cp pysca/scaTools.py docs/modules
13 |     - make -C docs dirhtml
14 |     - mv docs/build/dirhtml public/ 
15 |   artifacts:
16 |     paths:
17 |       - public
18 |   only:
19 |     - master
20 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "data"]
2 | 	path = data
3 | 	url = https://gitlab.com/ranganathanlab/pySCA-data.git
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015, Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds, Ansel
 2 | George
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 | 1. Redistributions of source code must retain the above copyright notice, this
 9 |    list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 |    this list of conditions and the following disclaimer in the documentation
13 |    and/or other materials provided with the distribution.
14 | 
15 | 3. Neither the name of the copyright holder nor the names of its contributors
16 |    may be used to endorse or promote products derived from this software without
17 |    specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # pySCA
 2 | 
 3 | ![Website Build Status](https://gitlab.com/ranganathanlab/pySCA/badges/master/pipeline.svg)
 4 | 
 5 | > 09.2020
 6 | >
 7 | > Copyright (C) 2019 Olivier Rivoire, Rama Ranganathan, and Kimberly Reynolds
 8 | >
 9 | > This program is free software distributed under the BSD 3-clause license,
10 | > please see the file LICENSE for details.
11 | 
12 | The current version of the Statistical Coupling Analysis (SCA) analysis is
13 | implemented in Python. This directory contains the necessary code for running
 14 | the SCA calculations, as well as examples/tutorials for the dihydrofolate
15 | reductase (DHFR) enzyme family, the S1A serine proteases, the small G-protein
16 | family and the Beta-lactamase enzyme family. The tutorials are distributed as
17 | Jupyter notebooks; for details please see:
18 | [https://jupyter.org/](https://jupyter.org/).
19 | 
20 | For installation instructions, and an introduction to using the toolbox, please
21 | refer to the website:
22 | 
23 | [https://ranganathanlab.gitlab.io/pySCA](https://ranganathanlab.gitlab.io/pySCA)
24 | 
25 | or look through the [RST files](docs/source) included with the pySCA
26 | distribution.
27 | 
28 | ## Contents of `/`
29 | 
30 | |            |                                                         |
31 | | :---       | :---                                                    |
32 | | bin/       | Executables for running SCA analysis functions          |
33 | | data/      | Input data (including those needed for the tutorials)   |
34 | | docs/      | HTML documentation (generated by Sphinx)                |
35 | | figs/      | Figures used for the notebooks and documentation        |
36 | | notebooks/ | Example SCA notebooks                                   |
37 | | output/    | Output files (empty at install, use `runAllNBCalcs.sh`) |
38 | | pysca/     | Python code for SCA                                     |
39 | | scripts/   | Utility scripts used to generate example data           |
40 | 
41 | ## Contents of `bin/`
42 | 
43 | |               |                                                                  |
44 | | :---          | :---                                                             |
45 | | annotateMSA   | Annotates alignments with phylogenetic/taxonomic information     |
46 | | scaProcessMSA | Conducts some initial processing of the sequence alignment       |
47 | | scaCore       | Runs the core SCA calculations                                   |
48 | | scaSectorID   | Defines sectors given the results of the calculations in scaCore |
49 | 
50 | ## Contents of `pysca/`
51 | 
52 | |             |                                                      |
53 | | :---        | :---                                                 |
54 | | scaTools.py | The SCA toolbox - functions for the SCA calculations |
55 | | settings.py | Global configuration settings for the analysis       |
56 | 
57 | ## Contents of `notebooks/`
58 | 
59 | |                         |                                     |
60 | | :---                    | :---                                |
61 | | SCA_DHFR.ipynb          | Example for DHFR                    |
62 | | SCA_G.ipynb             | Example for the small G proteins    |
63 | | SCA_betalactamase.ipynb | Example for the beta-lactamases     |
64 | | SCA_S1A.ipynb           | Example for the S1A serine protease |
65 | 


--------------------------------------------------------------------------------
/VERSIONS.rst:
--------------------------------------------------------------------------------
1 | docs/source/versions.rst


--------------------------------------------------------------------------------
/bin/alnChangeDelim:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | """
 3 | A script to change the field delimiter of a multiple sequence alignment (FASTA
 4 | format).
 5 | 
 6 | **Arguments**
 7 |     Input_MSA.fasta (the alignment to be processed)
 8 | 
 9 | **Keyword Arguments**
10 |     --output             output file name, default: FilteredAln.fa
11 |     --old-delim          delimiter separating header fields, default: "_"
12 |     --new-delim          delimiter separating header fields
13 | 
14 | :By: Kim Reynolds
15 | :On: 6.5.2015
16 | 
17 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds
18 | 
19 | This program is free software distributed under the BSD 3-clause
20 | license, please see the file LICENSE for details.
21 | """
22 | 
23 | import argparse
24 | import sys
25 | import statistics as stat
26 | from pysca import scaTools as sca
27 | 
28 | if __name__ == "__main__":
29 |     # Parse inputs
30 |     parser = argparse.ArgumentParser()
31 |     parser.add_argument("alignment", help="Input Sequence Alignment")
32 |     parser.add_argument(
33 |         "-o",
34 |         "--output",
35 |         dest="outputfile",
36 |         default="output.acc",
37 |         help="specify an outputfile name",
38 |     )
39 |     parser.add_argument(
40 |         "-d",
41 |         "--old-delim",
42 |         dest="old_delim",
43 |         help="specify the field delimiter in the header",
44 |     )
45 |     parser.add_argument(
46 |         "-n",
47 |         "--new-delim",
48 |         dest="new_delim",
49 |         help="specify the field delimiter in the header",
50 |     )
51 | 
52 |     options = parser.parse_args()
53 | 
54 |     if (options.new_delim is None) or (options.old_delim is None):
55 |         sys.exit("ERROR: Input and output delimiters must be specified.")
56 | 
57 |     headers, seqs = sca.readAlg(options.alignment)
58 | 
59 |     # Check that the old delimiter and new delimiters return a consistent
60 |     # number of fields across all sequences.
61 |     counts = []
62 |     checks = []
63 |     for i, header in enumerate(headers):
64 |         # Check that the old delimiter works.
65 |         fields = header.split(options.old_delim)
66 |         counts.append(len(fields))
67 |         # Check that the new delimiter is not found inside the fields.
68 |         checks.append(sum([options.new_delim in field for field in fields]))
69 | 
70 |     # Assume the correct number of fields is the mode of the entire set.
71 |     count = stat.mode(counts)
72 | 
73 |     # Print error messages for each sequences where either the number of fields
74 |     # is inconsistent or if the new delimiter is a bad choice given the content
75 |     # of the fields.
76 |     arewegood = True
77 |     for i, header in enumerate(headers):
78 |         if counts[i] != count:
79 |             print("WARNING: sequence %s has %s fields" % (header, counts[i]))
80 |             #  arewegood = False
81 |         if checks[i] > 0:
82 |             print(
83 |                 "ERROR: delimiter '%s' incompatible with %s"
84 |                 % (options.new_delim, header)
85 |             )
86 |             arewegood = False
87 | 
88 |     if not arewegood:
89 |         sys.exit("Errors found. Output not written.")
90 | 
91 |     # Write the file if no serious errors are found.
92 |     f = open(options.outputfile, "w")
93 |     for i, header in enumerate(headers):
94 |         fields = header.split(options.old_delim)
95 |         f.write(">%s\n" % (options.new_delim).join(fields))
96 |         f.write("%s\n" % seqs[i])
97 |     print("Done. Output written to %s." % options.outputfile)
98 | 


--------------------------------------------------------------------------------
/bin/alnConvertGI:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | """
  3 | A script to convert GI numbers in the header of a FASTA file to accession
  4 | numbers.
  5 | 
  6 | **Arguments**
  7 |     Input_MSA.fasta (the alignment to be processed)
  8 | 
  9 | **Keyword Arguments**
 10 |     --output             output file name, default: FilteredAln.fa
 11 |     --delim              delimiter separating header fields, default: "_"
 12 |     --email              email to associate with Entrez web API queries
 13 | 
 14 | :By: Kim Reynolds
 15 | :On: 6.5.2015
 16 | 
 17 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds
 18 | 
 19 | This program is free software distributed under the BSD 3-clause
 20 | license, please see the file LICENSE for details.
 21 | """
 22 | 
 23 | import argparse
 24 | import sys
 25 | from Bio import Entrez
 26 | from pysca import scaTools as sca
 27 | from pysca import settings
 28 | 
 29 | if __name__ == "__main__":
 30 |     # Parse inputs
 31 |     parser = argparse.ArgumentParser()
 32 |     parser.add_argument("alignment", help="Input Sequence Alignment")
 33 |     parser.add_argument(
 34 |         "-o",
 35 |         "--output",
 36 |         dest="outputfile",
 37 |         default="output.acc",
 38 |         help="specify an outputfile name",
 39 |     )
 40 |     parser.add_argument(
 41 |         "-d",
 42 |         "--delim",
 43 |         dest="delim",
 44 |         default="_",
 45 |         help="specify the field delimiter in the header",
 46 |     )
 47 |     parser.add_argument(
 48 |         "-e",
 49 |         "--entrez_email",
 50 |         dest="email",
 51 |         default=None,
 52 |         help="email address for querying Entrez web API",
 53 |     )
 54 | 
 55 |     options = parser.parse_args()
 56 | 
 57 |     if options.email is None:
 58 |         Entrez.email = settings.entrezemail
 59 |     else:
 60 |         Entrez.email = options.email
 61 | 
 62 |     headers, seqs = sca.readAlg(options.alignment)
 63 |     gis = [h.split(options.delim)[1] for h in headers]
 64 | 
 65 |     # Check that the GI numbers are valid.
 66 |     for i, gi in enumerate(gis):
 67 |         if not gi.isdigit():
 68 |             print("Invalid GI '%s' at line %s. Omitting." % (gi, i))
 69 |             gis[i] = "0"  # Needs to be a character, not an int.
 70 | 
 71 |     gi_blocksize = 50  # more GIs need to be submitted as a POST request
 72 |     gi_blocks = [
 73 |         gis[x : x + gi_blocksize] for x in range(0, len(gis), gi_blocksize)
 74 |     ]
 75 | 
 76 |     # Query the Entrez web API with GI numbers and store the retured accession
 77 |     # numbers in an array.
 78 |     acc_ids = []
 79 |     for gi_block in gi_blocks:
 80 |         handle = Entrez.efetch(db="protein", rettype="acc", id=gi_block)
 81 |         res = handle.read().splitlines()
 82 |         handle.close()
 83 |         if len(res) == len(gi_block):
 84 |             acc_ids.extend([acc_id if acc_id else "0" for acc_id in res])
 85 |         else:
 86 |             sys.exit("ERROR: Different number of accession IDs returned.")
 87 | 
 88 |     # Using '_' as a delimiter is a problem for accession numbers because they
 89 |     # are often in the form XX_XXXXX.1, meaning the number will be split. If
 90 |     # the supplied (or defaulted) delimited is '_', convert the delimiter to
 91 |     # something else.
 92 |     if options.delim == "_":
 93 |         print(
 94 |             "WARNING: '_' is not a good delimiter for accession "
 95 |             "numbers (e.g. YP_969813.1)."
 96 |         )
 97 |         print("The output will use '___' as a delimiter instead.")
 98 |         newdelim = "___"
 99 |     else:
100 |         newdelim = options.delim
101 | 
102 |     # Replace GI field with accession numbers in the headers and write the
103 |     # updated alignment to disk.
104 |     f = open(options.outputfile, "w")
105 |     for i, header in enumerate(headers):
106 |         fields = header.split(options.delim)
107 |         fields[0] = "ref"
108 |         fields[1] = acc_ids[i]
109 |         f.write(">%s\n" % (newdelim).join(fields))
110 |         f.write("%s\n" % seqs[i])
111 |     print("Done. Output written to %s." % options.outputfile)
112 | 


--------------------------------------------------------------------------------
/bin/alnFilterSeqSize:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | """
 3 | A basic script to filter a fasta file of sequences by size - a useful step to
 4 | remove partial sequences or sequences that would potentially introduce a large
 5 | number of gaps in the alignment. This script reads in the alignment, computes
 6 | the average sequence length, and outputs a new alignment that keeps sequences
 7 | of length mean +/- tolerance (tolerance default = 50)
 8 | 
 9 | **Arguments**
10 | 
11 |     Input_MSA.fasta (the alignment to be processed)
12 | 
13 | **Keyword Arguments**
14 | 
15 |     --tolerance, -t      allowable sequence length variation (in number of
16 |                          amino acids), default: 50
17 |     --output             output file name, default: FilteredAln.fa
18 | 
19 | :By: Kim Reynolds
20 | :On: 6.5.2015
21 | 
22 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds
23 | 
24 | This program is free software distributed under the BSD 3-clause license,
25 | please see the file LICENSE for details.
26 | """
27 | 
28 | import argparse
29 | import numpy as np
30 | from pysca import scaTools as sca
31 | 
32 | if __name__ == "__main__":
33 |     # Parse inputs
34 |     parser = argparse.ArgumentParser()
35 |     parser.add_argument("alignment", help="Input Sequence Alignment")
36 |     parser.add_argument(
37 |         "-t",
38 |         "--tolerance",
39 |         dest="tol",
40 |         type=int,
41 |         default=50,
42 |         help="allowable sequence length variation in number of"
43 |         " amino acids (alignment will be trimmed to mean"
44 |         " +/-tolerance, default = 50)",
45 |     )
46 |     parser.add_argument(
47 |         "--output",
48 |         dest="outputfile",
49 |         default="FilteredAln.fa",
50 |         help="specify an outputfile name",
51 |     )
52 | 
53 |     options = parser.parse_args()
54 | 
55 |     headers, seqs = sca.readAlg(options.alignment)
56 |     seqLen = np.zeros((len(seqs), 1)).astype(int)
57 |     for i, k in enumerate(seqs):
58 |         seqLen[i] = len(k)
59 |     avgLen = seqLen.mean()
60 |     print("Average sequence length: %i" % avgLen)
61 |     print("Min: %i, Max %i" % (seqLen.min(), seqLen.max()))
62 |     minsz = avgLen - options.tol
63 |     maxsz = avgLen + options.tol
64 |     print("Keeping sequences in the range: %i - %i" % (minsz, maxsz))
65 | 
66 |     keepSeqs = list()
67 |     keepHeaders = list()
68 |     for i, k in enumerate(seqLen):
69 |         if (k > minsz) & (k < maxsz):
70 |             keepSeqs.append(seqs[i])
71 |             keepHeaders.append(headers[i])
72 | 
73 |     print("Keeping %i of %i total sequences" % (len(keepSeqs), len(seqLen)))
74 | 
75 |     f = open(options.outputfile, "w")
76 |     for i, k in enumerate(keepSeqs):
77 |         f.write(">%s\n" % keepHeaders[i])
78 |         f.write("%s\n" % keepSeqs[i])
79 |     f.close()
80 | 


--------------------------------------------------------------------------------
/bin/alnParseID:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | """
 3 | A script to parse accession numbers from the headers of an alignment with
 4 | typical Blast formatting.
 5 | 
 6 | **Arguments**
 7 |     Input_MSA.fasta (the alignment to be processed)
 8 | 
 9 | **Keyword Arguments**
10 |     --output             output file name, default: FilteredAln.fa
11 |     --delim              delimiter for fields in the header for each sequence,
12 |                          default: '_'
13 | 
14 | :By: Kim Reynolds
15 | :On: 6.5.2015
16 | 
17 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds
18 | 
19 | This program is free software distributed under the BSD 3-clause
20 | license, please see the file LICENSE for details.
21 | """
22 | 
23 | import argparse
24 | import os
25 | import sys
26 | from pysca import scaTools as sca
27 | 
28 | if __name__ == "__main__":
29 |     # Parse inputs
30 |     parser = argparse.ArgumentParser()
31 |     parser.add_argument(
32 |         "-i",
33 |         "--input",
34 |         dest="alignment",
35 |         required=True,
36 |         help="input sequence alignment",
37 |     )
38 |     parser.add_argument(
39 |         "-o",
40 |         "--output",
41 |         dest="outputfile",
42 |         default=None,
43 |         help="specify an outputfile name",
44 |     )
45 |     parser.add_argument(
46 |         "-d",
47 |         "--delim",
48 |         dest="delim",
49 |         default="_",
50 |         help="specify the field delimiter in the header",
51 |     )
52 |     parser.add_argument(
53 |         "-t",
54 |         "--type",
55 |         dest="type",
56 |         default="gi",
57 |         required=True,
58 |         help="type of identifier to parse out of the header "
59 |         "('gi' or 'acc')",
60 |     )
61 |     options = parser.parse_args()
62 | 
63 |     # Read in the MSA.
64 |     headers, seqs = sca.readAlg(options.alignment)
65 | 
66 |     # Get index of accession number in the header fields.
67 |     if options.type == "gi":
68 |         separator = "gi"
69 |     elif options.type == "acc" or options.type == "ref":
70 |         separator = "ref"
71 |     else:
72 |         sys.exit("ID type %s not known" % options.type)
73 | 
74 |     try:
75 |         #  acc_idx = (headers[0].split(options.delim)).index('res') + 1
76 |         acc_idx = (headers[0].split(options.delim)).index(separator) + 1
77 |     except BaseException as e:
78 |         print("ERROR: %s" % e)
79 |         sys.exit("Accession field not found in %s." % options.alignment)
80 | 
81 |     acc_ids = [h.split(options.delim)[acc_idx] for h in headers]
82 | 
83 |     if options.outputfile:
84 |         outputfile = options.outputfile
85 |     else:
86 |         outputfile = (
87 |             os.path.splitext(options.alignment)[0] + "." + options.type
88 |         )
89 | 
90 |     f = open(outputfile, "w")
91 |     for acc_id in acc_ids:
92 |         f.write("%s\n" % acc_id)
93 |     f.close()
94 | 


--------------------------------------------------------------------------------
/bin/alnReplaceHeaders:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | A script that replaces the headers of one FASTA file with headers from another.
 4 | It assumes the sequences in the two FASTA files are in identical order. This is
 5 | useful, for example, when working with Promals3D alignments (which often have
 6 | the header information truncated).
 7 | 
 8 | **Arguments**
 9 |     Headers.fasta      (Alignment that is providing the headers)
10 |     Sequences.fasta    (Alignment that is providing the sequences)
11 | 
12 | **Keyword Arguments**
13 |     --headers          header alignment file name
14 |     --seqs             sequences alignment file name
15 |     --output           output file name, default: FixedHeaders.fa
16 | 
17 | :By: Kim Reynolds
18 | :On: 6.5.2015
19 | 
20 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds
21 | 
22 | This program is free software distributed under the BSD 3-clause
23 | license, please see the file LICENSE for details.
24 | """
25 | 
26 | import argparse
27 | import sys
28 | from pysca import scaTools as sca
29 | 
30 | if __name__ == "__main__":
31 |     # Parse inputs
32 |     parser = argparse.ArgumentParser()
33 |     parser.add_argument(
34 |         "-r",
35 |         "--headers",
36 |         dest="alg_headers",
37 |         help="alignment providing the headers",
38 |     )
39 |     parser.add_argument(
40 |         "-s",
41 |         "--sequences",
42 |         dest="alg_seqs",
43 |         help="alignment providing the sequences",
44 |     )
45 |     parser.add_argument(
46 |         "-o",
47 |         "--output",
48 |         dest="outputfile",
49 |         default="FixedHeaders.fa",
50 |         help="specify an outputfile name",
51 |     )
52 |     options = parser.parse_args()
53 | 
54 |     print("WARNING: This script assumes that the headers of the two input")
55 |     print("FASTA files are in IDENTICAL order. If this is NOT true, the")
56 |     print("script will give incorrect results.")
57 | 
58 |     if (options.alg_headers is None) or (options.alg_seqs is None):
59 |         sys.exit("Incorrect usage. (See `alnReplaceHeaders.py --help`)")
60 | 
61 |     headers1, seqs1 = sca.readAlg(options.alg_headers)
62 |     headers2, seqs2 = sca.readAlg(options.alg_seqs)
63 | 
64 |     if len(seqs2) != len(headers1):
65 |         sys.exit("ERROR: The length of the two alignments does not match.")
66 | 
67 |     f = open(options.outputfile, "w")
68 |     for i, k in enumerate(headers1):
69 |         f.write(">%s\n" % k)
70 |         f.write("%s\n" % seqs2[i])
71 |     f.close()
72 | 


--------------------------------------------------------------------------------
/bin/annotateMSA:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | """
  3 | The annotateMSA script provides utilities to automatically annotate sequence
  4 | headers (for a FASTA file) with taxonomic information. Currently this can be
  5 | done in one of two ways:
  6 | 
  7 |     1) For Pfam alignments, annotations can be extracted from the file
  8 |        pfamseq.txt (please download from:
  9 |        ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pfamseq.txt.gz)
 10 | 
 11 |     2) For Blast alignments, annotations can be added using the NCBI Entrez
 12 |        utilities provided by BioPython. They can be based on GI or accession
 13 |        numbers that are used to query NCBI for taxonomy information (note that
 14 |        this approach requires a network connection).
 15 | 
 16 | To extract GI or accession numbers, use the script alnParseID (with
 17 | `--type gi` or `--type acc`, respectively).
 18 | 
 19 | For both the Pfam and NCBI utilities, the process of sequence annotation *can
 20 | be slow* (on the order of hours, particularly for NCBI entrez with larger
 21 | alignments). However, the annotation process only needs to be run once per
 22 | alignment.
 23 | 
 24 | **Keyword Arguments**
 25 |     -i, --input         Input sequence alignment (required)
 26 |     -o, --output        Specify an output file, Default: Output.an
 27 |     -a, --annot         Annotation method. Options are 'pfam' or 'ncbi'.
 28 |                         Default: 'pfam'
 29 |     -l, --idList        This argument is necessary for the 'ncbi' method.
 30 |                         Specifies a file containing a list of GI numbers
 31 |                         corresponding to the sequence order in the alignment; a
 32 |                         number of "0" indicates that a GI number wasn't
 33 |                         assigned for a particular sequence.
 34 |     -g, --giList        Deprecated. Identical to '--idList' and kept to keep
 35 |                         the CLI consistent with older versions of pySCA.
 36 |     -p, --pfam_seq      Location of the pfamseq.txt file. Defaults to
 37 |                         path2pfamseq (specified at the top of scaTools.py)
 38 |     -m, --delimiter     Character(s) used for separating fields in the sequence
 39 |                         headers of the annotated output. Default: '|'
 40 | 
 41 | **Examples**::
 42 | 
 43 |   annotateMSA -i PF00186_full.txt -o PF00186_full.an -a 'pfam'
 44 |   annotateMSA -i DHFR_PEPM3.fasta -o DHFR_PEPM3.an -a 'ncbi' -l DHFR_PEPM3.gi
 45 | 
 46 | :By: Rama Ranganathan, Kim Reynolds
 47 | :On: 9.22.2014
 48 | 
 49 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds
 50 | 
 51 | This program is free software distributed under the BSD 3-clause license,
 52 | please see the file LICENSE for details.
 53 | """
 54 | 
 55 | import sys
 56 | import argparse
 57 | import os
 58 | from pysca import scaTools as sca
 59 | from pysca import settings
 60 | 
  61 | if __name__ == "__main__":
  62 | 
  63 |     # Parse the command line; only the input alignment (-i) is mandatory.
  64 |     parser = argparse.ArgumentParser()
  65 |     parser.add_argument(
  66 |         "-i",
  67 |         "--input",
  68 |         required=True,
  69 |         dest="Input_MSA",
  70 |         help="input sequence alignment",
  71 |     )
  72 |     parser.add_argument(
  73 |         "-o",
  74 |         "--output",
  75 |         dest="output",
  76 |         default="Output.an",
  77 |         help="Outputfile name. Default: Output.an",
  78 |     )
  79 |     parser.add_argument(
  80 |         "-a",
  81 |         "--annot",
  82 |         dest="annot",  # checked against 'pfam'/'ncbi' right after parsing
  83 |         default="pfam",
  84 |         help="Annotation method. Options are 'pfam' or 'ncbi'."
  85 |         " Default: 'pfam'",
  86 |     )
  87 |     parser.add_argument(
  88 |         "-l",
  89 |         "--idList",
  90 |         dest="idList",
  91 |         default=None,
  92 |         help="This argument is necessary for the 'ncbi' "
  93 |         "method. Specifies a file containing a list of "
  94 |         "GI or accession numbers corresponding to the "
  95 |         "sequence order in the alignment; a number of 0 "
  96 |         "indicates that one wasn't assigned for a "
  97 |         "particular sequence.",
  98 |     )
  99 |     parser.add_argument(
 100 |         "-g",
 101 |         "--giList",
 102 |         dest="idList",  # deprecated alias: stores into the same attribute as -l
 103 |         default=None,
 104 |         help="Command kept for compatibility with previous "
 105 |         "versions. Use '-l' or '--idList' instead.",
 106 |     )
 107 |     parser.add_argument(
 108 |         "-p",
 109 |         "--pfam_seq",
 110 |         dest="pfamseq",
 111 |         default=None,  # None means "not given on the command line"
 112 |         help="Location of the pfamseq.txt file. Defaults to "
 113 |         "path2pfamseq (specified in settings.py)",
 114 |     )
 115 |     parser.add_argument(
 116 |         "-d",
 117 |         "--pfam_db",
 118 |         dest="pfamdb",
 119 |         default=None,  # None means "not given on the command line"
 120 |         help="Location of the pfamseq.db file. Priority over "
 121 |         "pfamseq.txt file. Defaults to path2pfamseqdb "
 122 |         "(specified in settings.py)",
 123 |     )
 124 |     parser.add_argument(
 125 |         "-e",
 126 |         "--entrez_email",
 127 |         dest="email",
 128 |         default=None,  # optional; the NCBI call is made without it when unset
 129 |         help="email address for querying Entrez web API",
 130 |     )
 131 |     parser.add_argument(
 132 |         "-m",
 133 |         "--delimiter",
 134 |         dest="delimiter",
 135 |         default="|",
 136 |         help="delimiter for fields for generated FASTA files.",
 137 |     )
 138 |     options = parser.parse_args()
139 | 
140 |     if (options.annot != "pfam") & (options.annot != "ncbi"):
141 |         sys.exit(
142 |             "The option -a must be set to 'pfam' or 'ncbi' - other"
143 |             " keywords are not allowed."
144 |         )
145 | 
146 |     if options.annot == "ncbi":
147 |         if (options.idList is None) and (options.giList is None):
148 |             sys.exit(
149 |                 "To use NCBI Entrez annotation, you must specify a file "
150 |                 "containing a list of GI numbers (see the --idList "
151 |                 "argument)."
152 |             )
153 | 
154 |     if options.annot == "pfam":
155 |         # Annotate a Pfam alignment
156 |         if options.pfamdb is not None:  # default to db query over txt search
157 |             sca.AnnotPfamDB(
158 |                 options.Input_MSA,
159 |                 options.output,
160 |                 options.pfamdb,
161 |                 options.delimiter,
162 |             )
163 |         elif options.pfamseq is not None:
164 |             sca.AnnotPfam(
165 |                 options.Input_MSA,
166 |                 options.output,
167 |                 options.pfamseq,
168 |                 options.delimiter,
169 |             )
170 |         else:
171 |             # If no database or text file supplied to annotateMSA, then default
172 |             # to the files defined in settings.py.
173 |             if os.path.exists(settings.path2pfamseqdb):
174 |                 sca.AnnotPfamDB(
175 |                     options.Input_MSA, options.output, options.delimiter
176 |                 )
177 |             elif os.path.exists(settings.path2pfamseq):
178 |                 sca.AnnotPfam(
179 |                     options.Input_MSA, options.output, options.delimiter
180 |                 )
181 |             else:
182 |                 sys.exit("No Pfam file found. Exiting.")
183 |     elif options.annot == "ncbi":
184 |         # Annotate using GI numbers/NCBI Entrez
185 |         if options.email is None:
186 |             sca.AnnotNCBI(options.Input_MSA, options.output, options.idList)
187 |         else:
188 |             sca.AnnotNCBI(
189 |                 options.Input_MSA,
190 |                 options.output,
191 |                 options.idList,
192 |                 options.email,
193 |                 options.delimiter,
194 |             )
195 | 


--------------------------------------------------------------------------------
/bin/scaCore:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | """
  3 | The scaCore script runs the core calculations for SCA, and stores the output
  4 | using the Python tool pickle. These calculations can be divided into two parts:
  5 | 
  6 |     1) Sequence correlations:
  7 | 
  8 |         a) Compute simMat = the global sequence similarity matrix for the
  9 |            alignment
 10 |         b) Compute Useq and Uica = the eigenvectors (and independent
 11 |            components) for the following sequence correlation matrices:
 12 | 
 13 |             * unweighted (:math:`U^0`)
 14 |             * sequence weights applied (:math:`U^1`)
 15 |             * both sequence and position weights applied (:math:`U^2`)
 16 | 
 17 |     2) Positional correlations:
 18 | 
 19 |         a) Compute the single-site position weights and positional conservation
 20 |            values (:math:`D_i` and :math:`D_i^a`)
 21 |         b) Compute the dimension-reduced SCA correlation matrix
 22 |            :math:`\\tilde{C_{ij}}`, the projected alignment :math:`tX`, and the
 23 |            projector
 24 |         c) Compute Ntrials of the randomized SCA matrix, and the eigenvectors
 25 |            and eigenvalues associated with each
 26 | 
 27 | **Arguments**
 28 | 
 29 | **Keyword Arguments**
  30 |     -i               \\*.db (the database produced by running scaProcessMSA)
 31 |     -n               norm type for dimension-reducing the sca matrix. Options
 32 |                      are: 'spec' (the spectral norm) or 'frob' (frobenius
 33 |                      norm). Default: frob
 34 |     -l               lambda parameter for pseudo-counting the alignment.
 35 |                      Default: 0.03
 36 |     --Ntrials, -t    number of randomization trials
 37 |     --matlab, -m     write out the results of these calculations to a MATLAB
 38 |                      workspace for further analysis
 39 | 
 40 | **Example**::
 41 | 
 42 |   scaCore -i PF00071_full.db
 43 | 
 44 | :By: Rama Ranganathan, Kim Reynolds
 45 | :On: 8.5.2014
 46 | 
 47 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds
 48 | 
 49 | This program is free software distributed under the BSD 3-clause license,
 50 | please see the file LICENSE for details.
 51 | """
 52 | 
 53 | import sys
 54 | import time
 55 | import os
 56 | import pickle
 57 | import argparse
 58 | from scipy.io import savemat
 59 | from pysca import scaTools as sca
 60 | 
 61 | if __name__ == "__main__":
 62 | 
 63 |     # Parse inputs
 64 |     parser = argparse.ArgumentParser()
 65 |     parser.add_argument(
 66 |         "-i" "--input",
 67 |         dest="inputdb",
 68 |         required=True,
 69 |         help="database from running scaProcessMSA",
 70 |     )
 71 |     parser.add_argument(
 72 |         "-o" "--output",
 73 |         dest="outputdb",
 74 |         default=None,
 75 |         help="output file for core calculations",
 76 |     )
 77 |     parser.add_argument(
 78 |         "-n",
 79 |         dest="norm",
 80 |         default="frob",
 81 |         help="norm type for dimension-reducing the sca matrix."
 82 |         "Options are: 'spec' (the spectral norm) or "
 83 |         "'frob' (frobenius norm). Default: frob",
 84 |     )
 85 |     parser.add_argument(
 86 |         "-t",
 87 |         "--Ntrials",
 88 |         dest="Ntrials",
 89 |         default=10,
 90 |         type=int,
 91 |         help="number of randomization trials",
 92 |     )
 93 |     parser.add_argument(
 94 |         "-l",
 95 |         dest="lbda",
 96 |         default=0.03,
 97 |         type=float,
 98 |         help="lambda parameter for pseudo-counting the "
 99 |         "alignment. Default: 0.03",
100 |     )
101 |     parser.add_argument(
102 |         "-q",
103 |         dest="kseq",
104 |         default=30,
105 |         type=int,
106 |         help="number of eigenvectors to computes in sequence matrix. "
107 |         "Default: 30",
108 |     )
109 |     parser.add_argument(
110 |         "-c",
111 |         dest="kica",
112 |         default=15,
113 |         type=int,
114 |         help="number of independent components to compute from sequence "
115 |         "alignment matrix. Default: 15",
116 |     )
117 |     parser.add_argument(
118 |         "-m",
119 |         "--matlab",
120 |         dest="matfile",
121 |         action="store_true",
122 |         default=False,
123 |         help="write out the results of these calculations to "
124 |         "a MATLAB workspace for further analysis.",
125 |     )
126 |     options = parser.parse_args()
127 | 
128 |     if (options.norm != "frob") & (options.norm != "spec"):
129 |         sys.exit(
130 |             "The option -n must be set to 'frob' or 'spec' - other "
131 |             "keywords are not allowed."
132 |         )
133 | 
134 |     # extract the necessary stuff from the database...
135 |     db_in = pickle.load(open(options.inputdb, "rb"))
136 |     D_in = db_in["sequence"]
137 | 
138 |     msa_num = D_in["msa_num"]
139 |     seqw = D_in["seqw"]
140 |     Nseq = D_in["Nseq"]
141 |     Npos = D_in["Npos"]
142 |     ats = D_in["ats"]
143 |     hd = D_in["hd"]
144 | 
145 |     # sequence analysis
146 |     print("Computing the sequence projections.")
147 |     Useq, Uica = sca.seqProj(
148 |         msa_num, seqw, kseq=options.kseq, kica=options.kica
149 |     )
150 | 
151 |     print("Computing sequence similarity matrix.")
152 |     simMat = sca.seqSim(msa_num)
153 | 
154 |     # SCA calculations
155 |     print("Computing the SCA conservation and correlation values.")
156 |     Wia, Dia, Di = sca.posWeights(msa_num, seqw, options.lbda)
157 |     Csca, tX, Proj = sca.scaMat(msa_num, seqw, options.norm, options.lbda)
158 | 
159 |     # Matrix randomizations
160 |     print("Computing matrix randomizations...")
161 |     start = time.time()
162 |     Vrand, Lrand, Crand = sca.randomize(
163 |         msa_num, options.Ntrials, seqw, options.lbda
164 |     )
165 |     end = time.time()
166 |     print(
167 |         "Randomizations complete, %i trials, time: %.1f minutes"
168 |         % (options.Ntrials, (end - start) / 60)
169 |     )
170 | 
171 |     # saving...
172 |     if options.outputdb is None:
173 |         fn = os.path.basename(options.inputdb)
174 |         output_path = os.path.abspath(os.path.dirname(options.inputdb))
175 |     else:
176 |         fn = os.path.basename(options.outputdb)
177 |         output_path = os.path.abspath(os.path.dirname(options.outputdb))
178 |     fn_noext = os.path.splitext(fn)[0]
179 | 
180 |     D = {}
181 |     D["Useq"] = Useq
182 |     D["Uica"] = Uica
183 |     D["simMat"] = simMat
184 |     D["lbda"] = options.lbda
185 |     D["Dia"] = Dia
186 |     D["Di"] = Di
187 |     D["Csca"] = Csca
188 |     D["tX"] = tX
189 |     D["Proj"] = Proj
190 |     D["Ntrials"] = options.Ntrials
191 |     D["Vrand"] = Vrand
192 |     D["Lrand"] = Lrand
193 |     D["Crand"] = Crand
194 | 
195 |     db = {}
196 |     db["sequence"] = D_in
197 |     db["sca"] = D
198 | 
199 |     print(
200 |         "Calculations complete, writing to database file "
201 |         + os.path.join(output_path, fn_noext)
202 |     )
203 |     pickle.dump(db, open(os.path.join(output_path, fn_noext) + ".db", "wb"))
204 | 
205 |     if options.matfile:
206 |         savemat(
207 |             os.path.join(output_path, fn_noext) + ".mat",
208 |             db,
209 |             appendmat=True,
210 |             oned_as="column",
211 |         )
212 | 


--------------------------------------------------------------------------------
/bin/scaProcessMSA:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | """
  3 | The scaProcessMSA script conducts the basic steps in multiple sequence
  4 | alignment (MSA) pre-processing for SCA, and stores the results using the python
  5 | tool pickle:
  6 | 
  7 |     1)  Trim the alignment, either by truncating to a reference sequence
  8 |         (specified with the -t flag) or by removing excessively gapped
   9 |        positions (set to positions with 80% or more gaps)
 10 | 
 11 |     2)  Identify/designate a reference sequence in the alignment, and create a
 12 |         mapping of the alignment numberings to position numberings for the
 13 |         reference sequence. The reference sequence can be specified in one of
 14 |         four ways:
 15 | 
 16 |             a) By supplying a PDB file - in this case, the reference sequence
 17 |                is taken from the PDB (see the pdb kwarg)
 18 | 
 19 |             b) By supplying a reference sequence directly (as a fasta file -
 20 |                see the refseq kwarg)
 21 | 
  22 |             c) By supplying the index of the reference sequence in the
  23 |                alignment (see the refindex kwarg)
 24 | 
 25 |             d) If no reference sequence is supplied by the user, one is
 26 |                automatically selected using the scaTools function chooseRef.
 27 | 
 28 |         The position numbers (for mapping the alignment) can be specified in
 29 |         one of three ways:
 30 | 
 31 |             a) By supplying a PDB file - in this case the alignment positions
 32 |                are mapped to structure positions
 33 | 
 34 |             b) By supplying a list of reference positions (see the refpos
 35 |                kwarg)
 36 | 
 37 |             c) If no reference positions are supplied by the user, sequential
 38 |                numbering (starting at 1) is assumed.
 39 | 
 40 |     3)  Filter sequences to remove highly gapped sequences, and sequences with
 41 |         an identity below or above some minimum or maximum value to the
 42 |         reference sequence (see the parameters kwarg)
 43 |     4)  Filter positions to remove highly gapped positions (default 20% gaps,
 44 |         can also be set using --parameters)
 45 |     5)  Calculate sequence weights and write out the final alignment and other
 46 |         variables
 47 | 
 48 | **Key Arguments**
 49 |      --alignment, -a   Input_MSA.fasta (the alignment to be processed,
 50 |                        typically the headers contain taxonomic information for
 51 |                        the sequences).
 52 |      --pdb, -s         PDB identifier (ex: 1RX2)
 53 |      --pdbdir, -b      directory where PDB files are stored
 54 |      --chainID, -c     chain ID in the PDB for the reference sequence
 55 |      --species, -f     species of the reference sequence
 56 |      --refseq, -r      reference sequence, supplied as a fasta file
 57 |      --refpos, -o      reference positions, supplied as a text file with one
 58 |                        position specified per line
 59 |      --refindex, -i    reference sequence number in the alignment, COUNTING
 60 |                        FROM 0
 61 |      --parameters, -p  list of parameters for filtering the alignment:
 62 |                        [max_frac_gaps for positions, max_frac_gaps for
 63 |                        sequences, min SID to reference seq, max SID to
 64 |                        reference seq]
 65 |                        default values: [0.2, 0.2, 0.2, 0.8] (see filterPos and
 66 |                        filterSeq functions for details)
 67 |      --selectSeqs, -n  subsample the alignment to (1.5 * the number of
 68 |                        effective sequences) to reduce computational time,
 69 |                        default: False
 70 |      --truncate, -t    truncate the alignment to the positions in the reference
 71 |                        PDB, default: False
 72 |      --matlab, -m      write out the results of this script to a matlab
 73 |                        workspace for further analysis
 74 |      --dest, -d        destination for output files
 75 | 
 76 | **Example**::
 77 | 
 78 |   scaProcessMSA -a PF00071_full.an -s 5P21 -c A -f 'Homo sapiens'
 79 | 
 80 | :By: Rama Ranganathan
 81 | :On: 8.5.2014
 82 | 
 83 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds
 84 | 
 85 | This program is free software distributed under the BSD 3-clause license,
 86 | please see the file LICENSE for details.
 87 | """
 88 | 
 89 | import sys
 90 | import os
 91 | import pickle
 92 | import argparse
 93 | import numpy as np
 94 | from scipy.io import savemat
 95 | from pysca import scaTools as sca
 96 | from pysca import settings
 97 | 
  98 | if __name__ == "__main__":
  99 |     # Parse inputs; only the alignment (-a) is mandatory.
 100 |     parser = argparse.ArgumentParser()
 101 |     parser.add_argument(
 102 |         "-a",
 103 |         "--alignment",
 104 |         dest="alignment",
 105 |         required=True,
 106 |         help="Input Sequence Alignment",
 107 |     )
 108 |     parser.add_argument(
 109 |         "-d",
 110 |         "--dest",
 111 |         dest="destination",
 112 |         default=None,  # None: settings.path2output, else the working directory
 113 |         help="specify an output directory",
 114 |     )
 115 |     parser.add_argument(
 116 |         "-s", "--pdb", dest="pdbid", help="PDB identifier (ex: 1RX2)"
 117 |     )
 118 |     parser.add_argument(
 119 |         "-b",
 120 |         "--pdbdir",
 121 |         dest="pdbdir",
 122 |         default=None,  # None: use settings.path2structures
 123 |         help="directory where PDBs are stored",
 124 |     )
 125 |     parser.add_argument(
 126 |         "-c",
 127 |         "--chainID",
 128 |         dest="chainID",
 129 |         default="A",
 130 |         help="chain ID in the PDB for the reference sequence",
 131 |     )
 132 |     parser.add_argument(
 133 |         "-f",
 134 |         "--species",
 135 |         dest="species",
 136 |         help="species of the reference sequence",
 137 |     )
 138 |     parser.add_argument(
 139 |         "-r",
 140 |         "--refseq",
 141 |         dest="refseq",
 142 |         help="reference sequence, supplied as a fasta file",
 143 |     )
 144 |     parser.add_argument(
 145 |         "-o",
 146 |         "--refpos",
 147 |         dest="refpos",
 148 |         help="reference positions, supplied as a text file "
 149 |         "with one position specified per line",
 150 |     )
 151 |     parser.add_argument(
 152 |         "-i",
 153 |         "--refindex",
 154 |         dest="i_ref",
 155 |         type=int,  # index into the alignment as read from file (0-based)
 156 |         help="reference sequence number in the alignment, " "COUNTING FROM 0",
 157 |     )
 158 |     parser.add_argument(
 159 |         "-p",
 160 |         "--parameters",
 161 |         dest="parameters",
 162 |         default=[0.2, 0.2, 0.2, 0.8],
 163 |         type=float,
 164 |         nargs=4,  # the four thresholds are given together, in the order below
 165 |         help="list of parameters for filtering the alignment: "
 166 |         "[max_frac_gaps for positions, max_frac_gaps for "
 167 |         "sequences, min SID to reference seq, max SID to "
 168 |         "reference seq] default values: [0.2, 0.2, 0.2, "
 169 |         "0.8] (see filterPos and filterSeq functions for "
 170 |         "details).",
 171 |     )
 172 |     parser.add_argument(
 173 |         "-n",
 174 |         "--selectSeqs",
 175 |         action="store_true",
 176 |         dest="Nselect",
 177 |         default=False,
 178 |         help="subsample the alignment to (1.5 * the number of "
 179 |         "effective sequences) to reduce computational "
 180 |         "time, default: False",
 181 |     )
 182 |     parser.add_argument(
 183 |         "-t",
 184 |         "--truncate",
 185 |         action="store_true",
 186 |         dest="truncate",
 187 |         default=False,
 188 |         help="truncate the alignment to the positions in the "
 189 |         "reference PDB, default: False",
 190 |     )
 191 |     parser.add_argument(
 192 |         "-m",
 193 |         "--matlab",
 194 |         action="store_true",
 195 |         dest="matfile",
 196 |         default=False,
 197 |         help="write out the results of this script to a matlab"
 198 |         " workspace for further analysis",
 199 |     )
 200 |     options = parser.parse_args()
201 | 
202 |     # A little bit of error checking/feedback for the user.
203 |     if options.i_ref is None:
204 |         if options.species is not None and options.pdbid is None:
205 |             print("No PDBid, ignoring species...")
206 |             options.species = None
207 |         if options.refseq is not None and options.refpos is None:
208 |             print(
209 |                 "Using reference sequence but no position list provided! "
210 |                 "Just numbering positions 1 to length(sequence)"
211 |             )
212 |             if options.pdbid is not None:
213 |                 print("And...ignoring the PDB file...")
214 |                 options.pdbid = None
215 |             options.refpos = [i + 1 for i in range(len(options.refseq))]
216 |         if options.refseq is not None and options.refpos is not None:
217 |             print("Using the reference sequence and position list...")
218 |             if options.pdbid is not None:
219 |                 print("And...ignoring the PDB file...")
220 |                 options.pdbid = None
221 |     else:
222 |         i_ref = options.i_ref
223 | 
224 |     # Pick an output directory.
225 |     if options.destination is None:
226 |         if settings.path2output is None:
227 |             destination = os.getcwd()
228 |         else:
229 |             destination = os.path.abspath(settings.path2output)
230 |     else:
231 |         destination = os.path.abspath(options.destination)
232 | 
233 |     if not os.path.exists(destination):
234 |         os.makedirs(destination)
235 | 
236 |     # Set the directory where PDB files are stored.
237 |     if options.pdbdir is None:
238 |         pdbdir = settings.path2structures
239 |     elif os.path.exists(options.pdbdir):
240 |         pdbdir = options.pdbdir
241 |     else:
242 |         sys.exit("PDB directory '%s/' not found." % options.pdbdir)
243 | 
244 |     # Read in initial alignment
245 |     headers_full, sequences_full = sca.readAlg(options.alignment)
246 |     print(
247 |         "Loaded alignment of %i sequences, %i positions."
248 |         % (len(headers_full), len(sequences_full[0]))
249 |     )
250 | 
251 |     if options.i_ref is not None:
252 |         ref_header = headers_full[options.i_ref]
253 |         ref_sequence = (sequences_full[options.i_ref]).replace(".", "-")
254 | 
255 |     # Check the alignment and remove sequences containing non-standard amino
256 |     # acids
257 |     print("Checking alignment for non-standard amino acids")
258 |     alg_out = list()
259 |     hd_out = list()
260 |     for i, k in enumerate(sequences_full):
261 |         flag = 0
262 |         l = k.replace(".", "-")
263 |         for j, aa in enumerate(l):
264 |             if aa not in "ACDEFGHIKLMNPQRSTVWY-":
265 |                 flag = 1
266 |         if flag == 0:
267 |             alg_out.append(l)
268 |             hd_out.append(headers_full[i])
269 |     headers_full = hd_out
270 |     sequences_full = alg_out
271 |     print(
272 |         "Aligment size after removing sequences with non-standard amino "
273 |         "acids: %i" % (len(sequences_full))
274 |     )
275 | 
276 |     # Do an initial trimming to remove excessively gapped positions - this is
277 |     # critical for building a correct ATS
278 |     print("Trimming alignment for highly gapped positions (80% or more).")
279 |     alg_out, poskeep = sca.filterPos(sequences_full, [1], 0.8)
280 |     sequences_ori = sequences_full
281 |     sequences_full = alg_out
282 |     print(
283 |         "Alignment size post-trimming: %i positions" % len(sequences_full[0])
284 |     )
285 | 
286 |     if options.i_ref is not None:
287 |         ref_sequence = "".join([ref_sequence[i] for i in poskeep])
288 | 
289 |     # If i_ref is directly provided, we use it, ignoring all else.
290 |     # Otherwise, we explore the other ways of specifying a reference
291 |     # sequences: (1) providing a PDBid (chainID defaults to 'A'), (2)
292 |     # providing the protein sequence with position numbers (defaults to
293 |     # just sequence numbering). If none of these is provided, we just make
294 |     # an alignment based numbering for ats. If a PDBid is provided, there
295 |     # is an option to also provide species information to permit
296 |     # identifying the reference sequence in the MSA without use of external
297 |     # packages for fast pairwise alignments.
298 | 
299 |     print("Looking for PDBs in %s" % pdbdir)
300 | 
301 |     if options.i_ref is None:
302 |         if options.pdbid is not None:
303 |             try:
304 |                 seq_pdb, ats_pdb, dist_pdb = sca.pdbSeq(
305 |                     options.pdbid, options.chainID, pdbdir
306 |                 )
307 |                 if options.species is not None:
308 |                     try:
309 |                         print(
310 |                             "Finding reference sequence using species-based"
311 |                             " best match.."
312 |                         )
313 |                         i_ref = sca.MSAsearch(
314 |                             headers_full,
315 |                             sequences_full,
316 |                             seq_pdb,
317 |                             options.species,
318 |                         )
319 |                         options.i_ref = i_ref
320 |                         print("reference sequence index is: %i" % (i_ref))
321 |                         print(headers_full[i_ref])
322 |                         print(sequences_full[i_ref])
323 |                     except BaseException as e:
324 |                         print("Error: " + str(e))
325 |                         print(
326 |                             "Cant find the reference sequence using"
327 |                             " species-based best_match! Using global"
328 |                             " MSAsearch..."
329 |                         )
330 |                         try:
331 |                             i_ref = sca.MSAsearch(
332 |                                 headers_full, sequences_full, seq_pdb
333 |                             )
334 |                             options.i_ref = i_ref
335 |                             print("reference sequence index is: %i" % (i_ref))
336 |                             print(headers_full[i_ref])
337 |                             print(sequences_full[i_ref])
338 |                         except BaseException as e:
339 |                             print("Error: " + str(e))
340 |                             sys.exit("Error! Can't find reference sequence...")
341 |                 else:
342 |                     try:
343 |                         print(
344 |                             "Finding reference sequence using global"
345 |                             " MSAsearch..."
346 |                         )
347 |                         i_ref = sca.MSAsearch(
348 |                             headers_full, sequences_full, seq_pdb
349 |                         )
350 |                         options.i_ref = i_ref
351 |                         print("reference sequence index is: %i" % (i_ref))
352 |                         print(headers_full[i_ref])
353 |                         print(sequences_full[i_ref])
354 |                     except BaseException as e:
355 |                         print("Error: " + str(e))
356 |                         sys.exit("Error!!  Can't find reference sequence...")
357 |                 sequences, ats = sca.makeATS(
358 |                     sequences_full, ats_pdb, seq_pdb, i_ref, options.truncate
359 |                 )
360 |                 dist_new = np.zeros((len(ats), len(ats)))
361 |                 for (j, pos1) in enumerate(ats):
362 |                     for (k, pos2) in enumerate(ats):
363 |                         if k != j:
364 |                             if (pos1 == "-") or (pos2 == "-"):
365 |                                 dist_new[j, k] == 1000
366 |                             else:
367 |                                 ix_j = ats_pdb.index(pos1)
368 |                                 ix_k = ats_pdb.index(pos2)
369 |                                 dist_new[j, k] = dist_pdb[ix_j, ix_k]
370 |                 dist_pdb = dist_new
371 |             except BaseException as e:
372 |                 print("Error: " + str(e))
373 |                 sys.exit("Error!!! Something wrong with PDBid or path...")
374 |         elif options.refseq is not None:
375 |             print(
376 |                 "Finding reference sequence using provided sequence" " file..."
377 |             )
378 |             try:
379 |                 h_tmp, s_tmp = sca.readAlg(options.refseq)
380 |                 i_ref = sca.MSAsearch(headers_full, sequences_full, s_tmp[0])
381 |                 options.i_ref = i_ref
382 |                 print("reference sequence index is: %i" % (i_ref))
383 |                 print(headers_full[i_ref])
384 |                 if options.refpos is not None:
385 |                     try:
386 |                         f = open(options.refpos, "r")
387 |                         ats_tmp = [line.rstrip("\n") for line in f]
388 |                         f.close()
389 |                     except BaseException as e:
390 |                         print("Error: " + str(e))
391 |                         print(
392 |                             "Error reading reference position file! Using"
393 |                             " default numbering 1 to number of positions"
394 |                         )
395 |                         ats_tmp = [i + 1 for i in range(len(sequences[0]))]
396 |                 else:
397 |                     print(
398 |                         "No reference position list provided. Using"
399 |                         " default numbering 1 to number of positions"
400 |                     )
401 |                     ats_tmp = [i + 1 for i in range(len(sequences[0]))]
402 |                 sequences, ats = sca.makeATS(
403 |                     sequences_full, ats_tmp, s_tmp[0], i_ref, options.truncate
404 |                 )
405 |             except BaseException as e:
406 |                 print("Error: " + str(e))
407 |                 sys.exit("Error!! Can't find reference sequence...")
408 |         else:
409 |             msa_num = sca.lett2num(sequences_full)
410 |             i_ref = sca.chooseRefSeq(sequences_full)
411 |             print(
412 |                 "No reference sequence given, chose as default (%i): %s"
413 |                 % (i_ref, headers_full[i_ref])
414 |             )
415 |             sequences = sequences_full
416 |             ats = [i + 1 for i in range(len(sequences[0]))]
417 |     else:
418 |         print("using provided reference index %i" % (i_ref))
419 |         print(ref_header)
420 |         s_tmp = ref_sequence
421 |         try:
422 |             if options.refpos is not None:
423 |                 f = open(options.refpos, "r")
424 |                 ats_tmp = [line.rstrip("\n") for line in f]
425 |                 f.close()
426 |             else:
427 |                 print("here!")
428 |                 ats_tmp = [i + 1 for i in range(len(s_tmp))]
429 |             sequences, ats = sca.makeATS(
430 |                 sequences_full, ats_tmp, s_tmp, i_ref, options.truncate
431 |             )
432 |         except BaseException as e:
433 |             print("Error: " + str(e))
434 |             sys.exit("Error!! Can't find reference sequence...")
435 | 
436 |     # Filtering sequences and positions, calculations of effective number of
437 |     # seqs
438 |     print(
439 |         "Conducting sequence and position filtering: alignment size is %i"
440 |         " seqs, %i pos" % (len(sequences), len(sequences[0]))
441 |     )
442 |     if options.pdbid is not None:
443 |         print(
444 |             "ATS and distmat size - ATS: %i, distmat: %i x %i"
445 |             % (len(ats), len(dist_pdb), len(dist_pdb[0]))
446 |         )
447 |     else:
448 |         print(
449 |             "ATS should also have %i positions - ATS: %i"
450 |             % (len(sequences[0]), len(ats))
451 |         )
452 | 
453 |     if i_ref is not None:
454 |         alg0, seqw0, seqkeep = sca.filterSeq(
455 |             sequences,
456 |             i_ref,
457 |             max_fracgaps=options.parameters[1],
458 |             min_seqid=options.parameters[2],
459 |             max_seqid=options.parameters[3],
460 |         )
461 |     else:
462 |         alg0, seqw0, seqkeep = sca.filterSeq(
463 |             sequences,
464 |             max_fracgaps=options.parameters[1],
465 |             min_seqid=options.parameters[2],
466 |             max_seqid=options.parameters[3],
467 |         )
468 | 
469 |     headers = [headers_full[s] for s in seqkeep]
470 |     alg1, iposkeep = sca.filterPos(alg0, seqw0, options.parameters[0])
471 |     ats = [ats[i] for i in iposkeep]
472 |     if options.pdbid is not None:
473 |         distmat = dist_pdb[np.ix_(iposkeep, iposkeep)]
474 |     effseqsprelimit = int(seqw0.sum())
475 |     Nseqprelimit = len(alg1)
476 |     print(
477 |         "After filtering: alignment size is %i seqs, %i effective seqs, %i"
478 |         " pos" % (len(alg1), effseqsprelimit, len(alg1[0]))
479 |     )
480 | 
481 |     # Limitation of total sequences to [1.5 * # of effective sequences] if
482 |     # Nselect is set to True
483 |     if options.Nselect:
484 |         seqsel = sca.randSel(
485 |             seqw0, int(1.5 * effseqsprelimit), [seqkeep.index(i_ref)]
486 |         )
487 |         alg = [alg1[s] for s in seqsel]
488 |         hd = [headers[s] for s in seqsel]
489 |     else:
490 |         alg = alg1
491 |         hd = headers
492 | 
493 |     # Calculation of final MSA, sequence weights
494 |     seqw = sca.seqWeights(alg)
495 |     effseqs = seqw.sum()
496 |     msa_num = sca.lett2num(alg)
497 |     Nseq, Npos = msa_num.shape
498 |     print("Final alignment parameters:")
499 |     print("Number of sequences: M = %i" % (Nseq))
500 |     print("Number of effective sequences: M' = %i" % (effseqs))
501 |     print("Number of alignment positions: L = %i" % (Npos))
502 | 
503 |     if options.pdbid is not None:
504 |         print("Number of positions in the ats: %i" % (len(ats)))
505 |         structPos = [i for (i, k) in enumerate(ats) if k != "-"]
506 |         print("Number of structure positions mapped: %i" % (len(structPos)))
507 |         print(
508 |             "Size of the distance matrix: %i x %i"
509 |             % (len(distmat), len(distmat[0]))
510 |         )
511 | 
512 |     # Saving the important stuff. Everything is stored in a file called
513 |     # [MSAname]_sequence.db.  But we will also write out the final processed
514 |     # alignment to a fasta file.
515 | 
516 |     filename = os.path.basename(options.alignment)
517 |     filename_noext = os.path.splitext(filename)[0]
518 |     f = open(
519 |         os.path.join(destination, filename_noext) + "_processed" + ".fasta",
520 |         "w",
521 |     )
522 |     for i in range(len(alg)):
523 |         f.write(">%s\n" % (hd[i]))
524 |         f.write(alg[i] + "\n")
525 |     f.close()
526 | 
527 |     D = {}
528 |     D["alg"] = alg
529 |     D["hd"] = hd
530 |     D["msa_num"] = msa_num
531 |     D["seqw"] = seqw
532 |     D["Nseq"] = Nseq
533 |     D["Npos"] = Npos
534 |     D["ats"] = ats
535 |     D["effseqs"] = effseqs
536 |     D["limitseqs"] = options.Nselect
537 |     D["NseqPrelimit"] = Nseqprelimit
538 |     D["effseqsPrelimit"] = effseqsprelimit
539 |     if options.pdbid is not None:
540 |         D["pdbid"] = options.pdbid
541 |         D["pdb_chainID"] = options.chainID
542 |         D["distmat"] = distmat
543 |     if options.refseq is not None:
544 |         D["refseq"] = options.refseq
545 |     if options.refpos is not None:
546 |         D["refpos"] = options.refpos
547 |     D["i_ref"] = i_ref
548 |     D["trim_parameters"] = options.parameters
549 |     D["truncate_flag"] = options.truncate
550 | 
551 |     db_filename = os.path.join(destination, filename_noext)
552 |     print("Opening database file " + db_filename)
553 |     db = {}
554 |     db["sequence"] = D
555 | 
556 |     pickle.dump(db, open(db_filename + ".db", "wb"))
557 | 
558 |     if options.matfile:
559 |         db["sequence"]["i_ref"] = i_ref + 1  # index from 1 for MATLAB
560 |         savemat(db_filename, db, appendmat=True, oned_as="column")
561 | 


--------------------------------------------------------------------------------
/bin/scaSectorID:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | """
  3 | The scaSectorID script does the preliminaries of sector identification and
  4 | stores the outputs using the python tool pickle:
  5 | 
  6 |     1) Chooses :math:`k_{max}` (the number of significant eigenmodes) by
  7 |        comparison of the :math:`\\tilde{C_{ij}}` eigenspectrum to that for the
  8 |        randomized matrices
  9 |     2) Rotates the top :math:`k_{max}` eigenvectors using independent
 10 |        components analysis
 11 |     3) Defines the amino acid positions that significantly contribute to each
 12 |        of the independent components (ICs) by empirically fitting each IC to
 13 |        the t-distribution and selecting positions with greater than a specified
 14 |        cutoff (default: p=0.95) on the CDF.
 15 |     4) Assign positions into groups based on the independent component with
 16 |        which it has the greatest degree of co-evolution.
 17 | 
 18 | **Key Arguments**
 19 |     --input, -i      \*.db (the database produced by running scaCore)
 20 |     --kpos, -k       number of significant eigenmodes for analysis (the default
 21 |                      is to automatically choose using the eigenspectrum)
 22 |     --cutoff, -p     empirically chosen cutoff for selecting AA positions with
 23 |                      a significant contribution to each IC, Default = 0.95
 24 |     --matlab, -m     write out the results of this script to a matlab workspace
 25 |                      for further analysis
 26 | 
 27 | **Example**::
 28 | 
 29 |   scaSectorID -i PF00071_full.db
 30 | 
 31 | :By: Kim Reynolds
 32 | :On: 8.19.2014
 33 | 
 34 | Copyright (C) 2015 Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds
 35 | 
 36 | This program is free software distributed under the BSD 3-clause license,
 37 | please see the file LICENSE for details.
 38 | """
 39 | 
 40 | import os
 41 | import pickle
 42 | import argparse
 43 | import numpy as np
 44 | from scipy.io import savemat
 45 | from pysca import scaTools as sca
 46 | 
 47 | if __name__ == "__main__":
 48 |     # parse inputs
 49 |     parser = argparse.ArgumentParser()
 50 |     parser.add_argument(
 51 |         "-i",
 52 |         "--input",
 53 |         dest="inputdb",
 54 |         required=True,
 55 |         help="database from running scaCore",
 56 |     )
 57 |     parser.add_argument(
 58 |         "-o" "--output",
 59 |         dest="outputdb",
 60 |         default=None,
 61 |         help="output file for sector calculations",
 62 |     )
 63 |     parser.add_argument(
 64 |         "-k",
 65 |         "--kpos",
 66 |         dest="kpos",
 67 |         type=int,
 68 |         default=0,
 69 |         help="number of significant eigenmodes for analysis "
 70 |         "(the default is to automatically choose using "
 71 |         "the eigenspectrum)",
 72 |     )
 73 |     parser.add_argument(
 74 |         "-p",
 75 |         "--cutoff",
 76 |         dest="cutoff",
 77 |         type=float,
 78 |         default=0.95,
 79 |         help="number of significant eigenmodes for analysis "
 80 |         "(the default is to automatically choose using "
 81 |         "the eigenspectrum)",
 82 |     )
 83 |     parser.add_argument(
 84 |         "-m",
 85 |         "--matlab",
 86 |         action="store_true",
 87 |         dest="matfile",
 88 |         default=False,
 89 |         help="write out the results of this script to a "
 90 |         "matlab workspace for further analysis",
 91 |     )
 92 |     options = parser.parse_args()
 93 | 
 94 |     # extract the necessary stuff from the database...
 95 |     db_in = pickle.load(open(options.inputdb, "rb"))
 96 |     D_seq = db_in["sequence"]
 97 |     D_sca = db_in["sca"]
 98 | 
 99 |     msa_num = D_seq["msa_num"]
100 |     seqw = D_seq["seqw"]
101 |     lbda = D_sca["lbda"]
102 |     Csca = D_sca["Csca"]
103 |     tX = D_sca["tX"]
104 |     Lrand = D_sca["Lrand"]
105 | 
106 |     # run the calculations
107 |     Vsca, Lsca = sca.eigenVect(Csca)
108 | 
109 |     if options.kpos == 0:
110 |         kpos = sca.chooseKpos(Lsca, Lrand)
111 |     else:
112 |         kpos = options.kpos
113 |     print("Selected kpos=%i significant eigenmodes." % kpos)
114 |     Vpica, Wpica = sca.rotICA(Vsca, kmax=kpos)
115 |     ics, icsize, sortedpos, cutoff, scaled_pd, pd = sca.icList(
116 |         Vpica, kpos, Csca, p_cut=options.cutoff
117 |     )
118 | 
119 |     Usca = tX.dot(Vsca[:, :kpos]).dot(np.diag(1 / np.sqrt(Lsca[:kpos])))
120 |     Upica = Wpica.dot(Usca.T).T
121 |     for k in range(Upica.shape[1]):
122 |         Upica[:, k] /= np.sqrt(Upica[:, k].T.dot(Upica[:, k]))
123 |     Usica, Wsica = sca.rotICA(Usca, kmax=kpos)
124 | 
125 |     # saving...
126 |     if options.outputdb is None:
127 |         fn = os.path.basename(options.inputdb)
128 |         output_path = os.path.abspath(os.path.dirname(options.inputdb))
129 |     else:
130 |         fn = os.path.basename(options.outputdb)
131 |         output_path = os.path.abspath(os.path.dirname(options.outputdb))
132 |     fn_noext = os.path.splitext(fn)[0]
133 | 
134 |     D = {}
135 |     D["Vsca"] = Vsca
136 |     D["Lsca"] = Lsca
137 |     D["kpos"] = kpos
138 |     D["Vpica"] = Vpica
139 |     D["Wpica"] = Wpica
140 |     D["Usca"] = Usca
141 |     D["Upica"] = Upica
142 |     D["Usica"] = Usica
143 |     D["Wsica"] = Wsica
144 |     D["ics"] = ics
145 |     D["icsize"] = icsize
146 |     D["sortedpos"] = sortedpos
147 |     D["cutoff"] = cutoff
148 |     D["scaled_pd"] = scaled_pd
149 |     D["pd"] = pd
150 | 
151 |     db = {}
152 |     db["sequence"] = D_seq
153 |     db["sca"] = D_sca
154 |     db["sector"] = D
155 | 
156 |     print(
157 |         "Calculations complete, writing to database file "
158 |         + os.path.join(output_path, fn_noext)
159 |     )
160 |     pickle.dump(db, open(os.path.join(output_path, fn_noext) + ".db", "wb"))
161 | 
162 |     if options.matfile:
163 |         # increment indices by 1 for MATLAB
164 |         db["sector"]["sortedpos"] = [pos + 1 for pos in D["sortedpos"]]
165 |         for ic in ics:
166 |             ic.items = [item + 1 for item in ic.items]
167 |         db["sector"]["ics"] = ics
168 |         savemat(
169 |             os.path.join(output_path, fn_noext) + ".mat",
170 |             db,
171 |             appendmat=True,
172 |             oned_as="column",
173 |         )
174 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/source/SCA_DHFR.rst:
--------------------------------------------------------------------------------
  1 | SCA 6.1 - The DHFR (dihydrofolate reductase) family
  2 | ===================================================
  3 | 
  4 | **Summary** This script describes the basic flow of the analytical steps
  5 | in SCA6.0, using the DHFR family as an example. Here we compare results
  6 | of the analysis for two different alignments: a PFAM alignment (PFAM
  7 | PF00186) and an independent manually curated alignment constructed using
  8 | a custom database of orthologous sequences (DHFR_PEPM3.an). Despite
  9 | differences in the construction, sequence distribution and size of the
 10 | two alignments, the sector definition is remarkably consistent: in both
 11 | cases we arrive at a single sector assembled from six independent
 12 | components.
 13 | 
 14 | For this tutorial, the core calculation scripts should be run from the
 15 | command line as follows:
 16 | 
 17 | ::
 18 | 
 19 |    >> annotateMSA -i ../data/PF00186_full.txt -o ../outputs/PF00186_full.an -a 'pfam' -p ../data/pfamseq.txt
 20 |    >> scaProcessMSA -a ../data/PF00186_full.an -b ../data/ -s 1RX2 -c A -f 'Escherichia coli' -t -n
 21 |    >> scaCore -i ../output/PF00186_full.db
 22 |    >> scaSectorID -i ../output/PF00186_full.db
 23 | 
 24 |    >> annotateMSA -i ../data/DHFR_PEPM3.fasta -o ../output/DHFR_PEPM3.an -a 'ncbi' -g ../data/DHFR_PEPM3.gis
 25 |    >> scaProcessMSA -a ../data/DHFR_PEPM3.an -b ../data/ -s 1RX2 -c A -t -n
 26 |    >> scaCore -i ../output/DHFR_PEPM3.db
 27 |    >> scaSectorID -i ../output/DHFR_PEPM3.db
 28 | 
 29 | Note that we supply annotated alignments for all tutorial scripts *(the
 30 | annotateMSA step is slow, and should only be run once)*.
 31 | 
 32 | **O.Rivoire, K.Reynolds and R.Ranganathan** 9/2014
 33 | 
 34 | .. code:: python3
 35 | 
 36 |     import os
 37 |     import time
 38 |     import matplotlib.pyplot as plt
 39 |     import numpy as np
 40 |     import copy
 41 |     import colorsys
 42 |     import matplotlib.image as mpimg
 43 |     from IPython.display import display
 44 |     from IPython.display import Image
 45 |     import scipy.cluster.hierarchy as sch
 46 |     from scipy.stats import scoreatpercentile 
 47 |     from pysca import scaTools as sca
 48 |     # import mpld3
 49 |     import pickle as pickle
 50 |     from optparse import OptionParser
 51 |     
 52 |     %matplotlib inline
 53 |     
 54 |     if not os.path.exists('../output/'):
 55 |         os.makedirs('../output/')  
 56 | 
 57 | Read in the results of the above three scripts (scaProcessMSA, scaCore
 58 | and scaSectorID), stored as dictionaries in the databases
 59 | PF00186_full.db and DHFR_PEPM3.db. To see what variables are stored in
 60 | each dictionary, use:
 61 | 
 62 | ::
 63 | 
 64 |    >> print(dict.keys())
 65 | 
 66 | .. code:: python3
 67 | 
 68 |     Dseq = list(); Dsca = list(); Dsect = list()
 69 |     db = pickle.load(open('../output/PF00186_full.db','rb'))
 70 |     Dseq.append(db['sequence'])
 71 |     Dsca.append(db['sca'])
 72 |     Dsect.append(db['sector'])
 73 |     db2 = pickle.load(open('../output/DHFR_PEPM3.db', 'rb'))
 74 |     Dseq.append(db2['sequence'])
 75 |     Dsca.append(db2['sca'])
 76 |     Dsect.append(db2['sector'])
 77 |     N_alg = 2
 78 |     AlgName = ['PFAM', 'Manual']
 79 | 
 80 | I. Statistical Structure of the Multiple Sequence Alignment (MSA)
 81 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 82 | 
 83 | We start with a rough comparison of the sequence composition of the two
 84 | alignments. Plot a histogram of all pairwise sequence identities *(left
 85 | panel)* and a global view of the sequence similarity matrix (defined by
 86 | :math:`S\equiv \frac{1}{L}XX^\top`) *(right panel)*. The PFAM alignment
 87 | is shown in the *top row* and the manual alignment is shown in the
 88 | *bottom row*. The manual alignment is smaller (644 seqs vs 2000 for
 89 | PFAM), but both alignments are well-described by a nearly homogeneous
 90 | distribution of sequence identities with a mean value of about 35%.
 91 | 
 92 | .. code:: python3
 93 | 
 94 |     ix = 1
 95 |     plt.rcParams['figure.figsize'] = 9, 15
 96 |     for k in range(N_alg):
 97 |         # List all elements above the diagonal (i<j):
 98 |         listS = [Dsca[k]['simMat'][i,j] for i in range(Dsca[k]['simMat'].shape[0]) \
 99 |                  for j in range(i+1, Dsca[k]['simMat'].shape[1])]
100 |         
101 |         #Cluster the sequence similarity matrix
102 |         Z = sch.linkage(Dsca[k]['simMat'],method = 'complete', metric = 'cityblock')
103 |         R = sch.dendrogram(Z,no_plot = True)
104 |         ind = R['leaves']
105 |         
106 |         #Plotting
107 |         plt.rcParams['figure.figsize'] = 14, 4 
108 |         plt.subplot(2,2,ix)
109 |         ix += 1
110 |         plt.hist(listS, int(round(Dseq[k]['Npos']/2)))
111 |         plt.xlabel('Pairwise sequence identities', fontsize=14)
112 |         plt.ylabel('Number', fontsize=14)
113 |         plt.subplot(2,2,ix)
114 |         ix += 1
115 |         plt.imshow(Dsca[k]['simMat'][np.ix_(ind,ind)], vmin=0, vmax=1); plt.colorbar();   
116 |         plt.tight_layout()
117 | 
118 | 
119 | 
120 | .. image:: _static/SCA_DHFR_7_0.png
121 | 
122 | 
123 | To examine: (1) sampling differences between the alignments and (2) the
124 | relationship between divergence in *sequence similarity* and *phylogeny*
125 | for both alignments, we plot the top independent components of the
126 | sequence correlation matrix (after sequence weighting), colored by
127 | phylogenetic group. We start by constructing a dictionary of
128 | phylogenetic annotations and checking the representation of sequences in
129 | the top taxonomic levels. The annotations are parsed from the sequence
130 | headers:
131 | 
132 | .. code:: python3
133 | 
134 |     for k in range(N_alg):
135 |         print("Alignment: " + AlgName[k])
136 |         
137 |         # Construct a dictionary of phylogenetic groups
138 |         annot = dict()
139 |         for i, h in enumerate(Dseq[k]['hd']):
140 |             hs = sca.parseAlgHeader(h)
141 |             if (len(hs) == 4):
142 |                 annot[hs[0]] = sca.Annot(hs[1], hs[2], hs[3].replace('.',''))
143 |             elif (len(hs) == 3):
144 |                 annot[hs[0]] = sca.Annot(hs[0],hs[1], hs[2].replace('.',''))
145 |                 
146 |         # Most frequent taxonomic groups:
147 |         atleast = 10
148 |         for level in range(4):
149 |             descr_list = [a.taxo.split(',')[level] for a in annot.values() \
150 |                           if len(a.taxo.split(',')) > level]
151 |             descr_dict = {k:descr_list.count(k) for k in descr_list \
152 |                           if descr_list.count(k)>=atleast}
153 |             print('\n Level %i:' % level)
154 |             print(descr_dict)
155 | 
156 | 
157 | .. parsed-literal::
158 | 
159 |     Alignment: PFAM
160 |     
161 |      Level 0:
162 |     {'Bacteria': 1486, 'Eukaryota': 210, 'Viruses': 37, 'Archaea': 24}
163 |     
164 |      Level 1:
165 |     {'Proteobacteria': 581, 'Metazoa': 81, 'Chlamydiae': 14, 'Fungi': 60, 'Actinobacteria': 173, 'Firmicutes': 467, 'dsDNA viruses': 36, 'Tenericutes': 27, 'Bacteroidetes': 155, 'environmental samples': 24, 'Viridiplantae': 32, 'Fusobacteria': 10, 'Euryarchaeota': 23, 'stramenopiles': 11, 'Alveolata': 12}
166 |     
167 |      Level 2:
168 |     {'Gammaproteobacteria': 317, 'Chordata': 36, 'Chlamydiales': 14, 'Dikarya': 59, 'Betaproteobacteria': 108, 'Actinobacteridae': 161, 'Lactobacillales': 176, 'Clostridia': 147, ' no RNA stage': 36, 'Mollicutes': 27, 'Bacteroidia': 70, 'Negativicutes': 26, 'Alphaproteobacteria': 137, 'Flavobacteriia': 52, 'Sphingobacteriia': 14, 'Arthropoda': 32, 'Deltaproteobacteria': 17, 'Bacillales': 103, 'Cytophagia': 12, 'Fusobacteriales': 10, 'Halobacteria': 21, 'Streptophyta': 24, 'Erysipelotrichi': 15, 'Coriobacteridae': 11}
169 |     
170 |      Level 3:
171 |     {'Enterobacteriales': 78, 'Pseudomonadales': 38, 'Craniata': 32, 'Chlamydiaceae': 14, 'Ascomycota': 49, 'Burkholderiales': 65, 'Actinomycetales': 135, 'Chromatiales': 19, 'Lactobacillaceae': 70, 'Clostridiales': 145, 'Caudovirales': 14, 'Pasteurellales': 16, 'Mycoplasmataceae': 18, 'Bacteroidales': 70, 'Selenomonadales': 26, 'Streptococcaceae': 62, 'Vibrionales': 38, 'Enterococcaceae': 22, 'Rhizobiales': 65, 'Bifidobacteriales': 26, 'Flavobacteriales': 49, 'Rhodobacterales': 26, 'Oceanospirillales': 16, 'Sphingobacteriales': 14, 'Hexapoda': 27, 'Paenibacillaceae': 20, 'Neisseriales': 21, 'Bacillaceae': 52, 'Cytophagales': 12, 'Basidiomycota': 10, 'Halobacteriales': 21, 'Xanthomonadales': 17, 'Alteromonadales': 41, 'Sphingomonadales': 16, 'Legionellales': 10, 'Staphylococcus': 11, 'Embryophyta': 24, 'Thiotrichales': 10, 'Erysipelotrichales': 15, 'Coriobacteriales': 11, 'Caulobacterales': 10}
172 |     Alignment: Manual
173 |     
174 |      Level 0:
175 |     {'cellular organisms': 612}
176 |     
177 |      Level 1:
178 |     {' Eukaryota': 151, ' Bacteria': 461}
179 |     
180 |      Level 2:
181 |     {' Opisthokonta': 137, ' Proteobacteria': 259, ' Bacteroidetes/Chlorobi group': 42, ' Firmicutes': 100, ' Actinobacteria': 42, ' Alveolata': 11}
182 |     
183 |      Level 3:
184 |     {' Fungi': 74, ' Alphaproteobacteria': 69, ' Bacteroidetes': 42, ' Betaproteobacteria': 58, ' Bacilli': 78, ' Metazoa': 62, ' Gammaproteobacteria': 126, ' Actinobacteria': 42, ' Clostridia': 21, ' Apicomplexa': 11}
185 | 
186 | 
187 | Based on this, we select taxonomic groups and colors for representation.
188 | Here, we just start by choosing the broadly well-represented groups. To
189 | see a complete color-coding legend, use:
190 | 
191 | ::
192 | 
193 |    >>> sca.figColors()
194 | 
195 | .. code:: python3
196 | 
197 |     phylo = list();
198 |     fam_names = ['Eukaryota', 'Bacteroidetes', 'Firmicutes', \
199 |                  'Actinobacteria', 'Proteobacteria']
200 |     col = (0, 0.18, 0.38, 0.6, 0.8) 
201 |     
202 |     # Legend: Eukaryota = red, Bacteroidetes = yellow, Firmicutes = green,
203 |     # Actinobacteria = blue, Proteobacteria = purple
204 |     for a in range(N_alg):
205 |         phylo_alg = list()
206 |         for i,k in enumerate(fam_names):
207 |             sf = sca.Unit()
208 |             sf.name = fam_names[i].lower()
209 |             sf.col = col[i]
210 |             sf.items = [j for j,q in enumerate(Dseq[a]['hd'])  \
211 |                         if sf.name in q.lower()]
212 |             phylo_alg.append(sf)
213 |         phylo.append(phylo_alg)    
214 | 
215 | Plot the top six independent components of the sequence correlation
216 | matrix (with sequence weights); color-coded by phylogenetic annotation.
217 | We compare the phylogenetic sampling for the PFAM alignment *(top row)*
218 | and manual alignment\ *(bottom row)*. The data show some very clear
219 | sequence distinctions based on phylogeny, and the two alignments seem to
220 | differ somewhat in the sequence divergence captured. In particular, the
221 | eukaryotic sequences *(in red)* seem to form a more distinct group in
222 | the manual alignment than in the PFAM alignment. For the PFAM alignment,
223 | the bacteroidetes *(yellow)* diverge along :math:`U_1`, the
224 | actinobacteria *(blue)* along :math:`U_3`, the firmicutes *(green)*
225 | along :math:`U_4` and :math:`U_5`, and a subset of proteobacteria
226 | *(purple)* along :math:`U_6`. For the manual alignment, the eukaryotes
227 | *(red)* diverge along :math:`U_2` and :math:`U_6`, the actinobacteria
228 | *(blue)* along :math:`U_4`, the firmicutes *(green)* along :math:`U_3`,
229 | and a subset of proteobacteria *(purple)* along :math:`U_5`.
230 | 
231 | .. code:: python3
232 | 
233 |     plt.rcParams['figure.figsize'] = 9, 8
234 |     ix = 1;
235 |     for a in range(N_alg):
236 |         U = Dsca[a]['Uica'][1]
237 |         pairs = [[2*i,2*i+1] for i in range(3)]
238 |         for k,[k1,k2] in enumerate(pairs):
239 |             plt.subplot(2,3,ix)
240 |             ix += 1
241 |             sca.figUnits(U[:,k1], U[:,k2], phylo[a])
242 |             #sca.figUnits(U[:,k1], U[:,k2], subfam)
243 |             plt.xlabel(r"${U'}^{(2)}_{%i}$"%(k1+1), fontsize=16)
244 |             plt.ylabel(r"${U'}^{(2)}_{%i}$"%(k2+1), fontsize=16)
245 |         plt.tight_layout()
246 | 
247 | 
248 | 
249 | .. image:: _static/SCA_DHFR_13_0.png
250 | 
251 | 
252 | II. SCA…conservation and coevolution
253 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
254 | 
255 | Plot the eigenspectrum of (1) the SCA positional coevolution matrix
256 | (:math:`\tilde{C_{ij}}`) *(black bars)* and (2) 10 trials of matrix
257 | randomization for comparison. This graph is used to choose the number of
258 | significant eigenmodes. Again, we plot the PFAM alignment in the *top
259 | row* and manual alignment in the *bottom row* for comparison. Overall
260 | the two eigenspectra are remarkably similar: due to small differences in
261 | the significance cutoff, we define 6 significant eigenmodes for the PFAM
262 | alignment, and 7 for the manual alignment.
263 | 
264 | .. code:: python3
265 | 
266 |     for a in range(N_alg):
267 |         plt.rcParams['figure.figsize'] = 9, 4 
268 |         hist0, bins = np.histogram(Dsca[a]['Lrand'].flatten(), bins=Dseq[a]['Npos'], \
269 |                                    range=(0,Dsect[a]['Lsca'].max()))
270 |         hist1, bins = np.histogram(Dsect[a]['Lsca'], bins=Dseq[a]['Npos'],\
271 |                                    range=(0,Dsect[a]['Lsca'].max()))
272 |         plt.subplot(2,1,a+1)
273 |         plt.bar(bins[:-1], hist1, np.diff(bins),color='k')
274 |         plt.plot(bins[:-1], hist0/Dsca[a]['Ntrials'], 'r', linewidth=3)
275 |         plt.tick_params(labelsize=11)
276 |         plt.xlabel('Eigenvalues', fontsize=18); plt.ylabel('Numbers', fontsize=18);
277 |         print('Number of eigenmodes to keep is %i' %(Dsect[a]['kpos']))
278 |     plt.tight_layout()
279 |     #mpld3.display()
280 | 
281 | 
282 | .. parsed-literal::
283 | 
284 |     Number of eigenmodes to keep is 4
285 |     Number of eigenmodes to keep is 6
286 | 
287 | 
288 | 
289 | .. image:: _static/SCA_DHFR_16_1.png
290 | 
291 | 
292 | To define the positions with significant contributions to each of the
293 | independent components (ICs), we make an empirical fit for each IC to the
294 | t-distribution and select positions with greater than a specified cutoff
295 | on the CDF. We choose :math:`p=0.95` as our cutoff. Note that since some
296 | positions might contribute significantly to more than one IC (an
297 | indication of non-independence of ICs), we apply a simple algorithm to
298 | assign such positions to one IC. Specifically, we assign positions to
299 | the IC with which it has the greatest degree of co-evolution.
300 | 
301 | For brevity, we don’t plot the IC fits below (though we do in the other
302 | tutorial notebooks), but do print the list of positions associated with
303 | each IC for both alignments. Comparing between alignments, we can
304 | already see some distinctions in the residue positions associated to
305 | each component: IC1 is expanded for the manual alignment, ICs2,4+5 are
306 | similar for both alignments, and ICs 3+6 are swapped between the two
307 | alignments.
308 | 
309 | .. code:: python3
310 | 
311 |     plt.rcParams['figure.figsize'] = 20, 5 
312 |     for a in range(N_alg):
313 |         print("alignment: "+AlgName[a])
314 |         for n,ipos in enumerate(Dsect[a]['ics']):
315 |             sort_ipos = sorted(ipos.items)
316 |             ats_ipos = ([Dseq[a]['ats'][s] for s in sort_ipos])
317 |             ic_pymol = ('+'.join(ats_ipos))
318 |             print('IC %i is composed of %i positions:' % (n+1,len(ats_ipos)))
319 |             print(ic_pymol + "\n")
320 | 
321 | 
322 | .. parsed-literal::
323 | 
324 |     alignment: PFAM
325 |     IC 1 is composed of 14 positions:
326 |     13+18+23+25+27+32+38+39+55+63+90+107+133+153
327 |     
328 |     IC 2 is composed of 19 positions:
329 |     7+14+15+31+35+42+43+44+46+49+54+57+59+61+94+95+96+113+122
330 |     
331 |     IC 3 is composed of 5 positions:
332 |     21+22+24+52+121
333 |     
334 |     IC 4 is composed of 11 positions:
335 |     6+11+40+47+50+51+53+92+100+111+125
336 |     
337 |     alignment: Manual
338 |     IC 1 is composed of 18 positions:
339 |     13+18+23+25+27+28+32+38+39+51+55+63+71+105+107+121+133+158
340 |     
341 |     IC 2 is composed of 18 positions:
342 |     7+14+15+22+31+35+42+43+44+46+49+54+57+61+94+95+96+113
343 |     
344 |     IC 3 is composed of 9 positions:
345 |     40+47+50+52+53+59+81+100+103
346 |     
347 |     IC 4 is composed of 10 positions:
348 |     6+11+41+45+60+90+92+111+125+126
349 |     
350 |     IC 5 is composed of 6 positions:
351 |     5+21+115+122+123+147
352 |     
353 |     IC 6 is composed of 2 positions:
354 |     144+149
355 |     
356 | 
357 | 
358 | To define protein sectors, we examine the structure of the SCA
359 | positional correlation matrix with positions contributing to the top
360 | independent components (ICs) ordered by weight *(left panel)*. Again we
361 | compare the results between the PFAM alignment *(top)* and manual
362 | alignment *(bottom)*. This provides a basis to determine/interpret which
363 | ICs are truly statistically independent (defining an independent sector)
364 | and which represent hierarchical breakdowns of one sector.
365 | 
366 | For both alignments, it seems that the ICs reflect a hierarchical
367 | break-down of a single sector, as determined by the high degree of
368 | co-evolution in the off-diagonal components (see the dendrogram that
369 | follows). In the *right panels* the ICs are combined and re-ordered by
370 | their contribution to :math:`V_1^p` to better see this.
371 | 
372 | .. code:: python3
373 | 
374 |     sectors = list()
375 |     ix = 1
376 |     for a in range(N_alg):
377 |         # plot the SCA positional correlation matrix, ordered by contribution 
378 |         #to the top ICs
379 |         plt.rcParams['figure.figsize'] = 9, 9 
380 |         plt.subplot(2,2,ix); ix +=1;
381 |         plt.imshow(Dsca[a]['Csca'][np.ix_(Dsect[a]['sortedpos'],\
382 |                     Dsect[a]['sortedpos'])],vmin=0, vmax=2,\
383 |                    interpolation='none',aspect='equal',\
384 |                    extent=[0,sum(Dsect[a]['icsize']),0,\
385 |                            sum(Dsect[a]['icsize'])])
386 |         line_index=0
387 |         for i in range(Dsect[a]['kpos']):
388 |             plt.plot([line_index+Dsect[a]['icsize'][i],\
389 |                       line_index+Dsect[a]['icsize'][i]],\
390 |                      [0,sum(Dsect[a]['icsize'])],'w', linewidth = 2)
391 |             plt.plot([0,sum(Dsect[a]['icsize'])],[sum(Dsect[a]['icsize'])\
392 |                             -line_index,sum(Dsect[a]['icsize'])-line_index],\
393 |                      'w', linewidth = 2)
394 |             line_index += Dsect[a]['icsize'][i] 
395 |     
396 |         # combine all the ICs into a single sector and re-sort
397 |         sec_groups = ([k for k in range(Dsect[a]['kpos'])])
398 |         sectors_alg = list()
399 |         s = sca.Unit()
400 |         all_items = list()
401 |         all_Vp = list()
402 |         for i in range(Dsect[a]['kpos']): 
403 |             all_items = all_items+Dsect[a]['ics'][i].items
404 |             tmp1 = Dsect[a]['Vpica'][Dsect[a]['ics'][i].items,:]
405 |             all_Vp = all_Vp + list(tmp1[:,0].T) 
406 |         svals = list(np.argsort(all_Vp)); svals.reverse()  
407 |         s.items = [all_items[i] for i in svals]
408 |         s.col = (1/len(sec_groups))*n
409 |         sectors_alg.append(s)
410 |         sectors.append(sectors_alg)
411 |         
412 |         # plot the re-ordered matrix
413 |         sortpos = list()
414 |         for s in sectors[a]:
415 |             sortpos.extend(s.items)
416 |         plt.subplot(2,2,ix); ix += 1;
417 |         line_index=0
418 |         plt.imshow(Dsca[a]['Csca'][np.ix_(sortpos, sortpos)], \
419 |                    vmin=0, vmax=2,interpolation='none',aspect='equal',\
420 |                extent=[0,len(sortpos),0,len(sortpos)])
421 |         for s in sectors[a]:
422 |             plt.plot([line_index+len(s.items),line_index+len(s.items)],\
423 |                      [0,len(sortpos)],'w', linewidth = 2)
424 |             plt.plot([0,sum(Dsect[a]['icsize'])],[len(sortpos)-line_index, \
425 |                             len(sortpos)-line_index],'w', linewidth = 2)
426 |             line_index += len(s.items)
427 |         plt.tight_layout()
428 | 
429 | 
430 | 
431 | .. image:: _static/SCA_DHFR_20_0.png
432 | 
433 | 
434 | The below dendrogram diagrams the relationship between independent
435 | components for the PFAM alignment (the tree for the manual alignment is
436 | similar). In this plot, solid lines represent physically contiguous
437 | structural units, and dashed lines indicate spatially fragmented groups
438 | of residues.
439 | 
440 | .. code:: python3
441 | 
442 |     i = Image(filename='../figs/DHFR_sec_hier.png'); i
443 | 
444 | 
445 | 
446 | 
447 | .. image:: _static/SCA_DHFR_22_0.png
448 | 
449 | 
450 | 
451 | Print the sector positions (as defined for each alignment), in a format
452 | suitable for pyMol, and create two pyMol sessions with the sectors (and
453 | decomposition into independent components) as separate objects.
454 | 
455 | .. code:: python3
456 | 
457 |     for a in range(N_alg):
458 |         print("Alignment: " + AlgName[a])
459 |         for i,k in enumerate(sectors[a]):
460 |             sort_ipos = sorted(k.items)
461 |             ats_ipos = ([Dseq[a]['ats'][s] for s in sort_ipos])
462 |             ic_pymol = ('+'.join(ats_ipos))
463 |             print('Sector %i is composed of %i positions:' % (i+1,len(ats_ipos)))
464 |             print(ic_pymol + "\n")
465 |             
466 |     sca.writePymol('1RX2', sectors[0], Dsect[0]['ics'], Dseq[0]['ats'],\
467 |                    '../output/PF00186.pml','A', '../data/', 0)  
468 |     sca.writePymol('1RX2', sectors[1], Dsect[1]['ics'], Dseq[1]['ats'],\
469 |                    '../output/DHFR_PEPM3.pml','A', '../data/', 0)  
470 | 
471 | 
472 | .. parsed-literal::
473 | 
474 |     Alignment: PFAM
475 |     Sector 1 is composed of 49 positions:
476 |     6+7+11+13+14+15+18+21+22+23+24+25+27+31+32+35+38+39+40+42+43+44+46+47+49+50+51+52+53+54+55+57+59+61+63+90+92+94+95+96+100+107+111+113+121+122+125+133+153
477 |     
478 |     Alignment: Manual
479 |     Sector 1 is composed of 63 positions:
480 |     5+6+7+11+13+14+15+18+21+22+23+25+27+28+31+32+35+38+39+40+41+42+43+44+45+46+47+49+50+51+52+53+54+55+57+59+60+61+63+71+81+90+92+94+95+96+100+103+105+107+111+113+115+121+122+123+125+126+133+144+147+149+158
481 |     
482 | 
483 | 
484 | As is evident from the position lists above, and as shown below, the
485 | structural pattern of the two sectors and their associated decomposition
486 | into independent components is highly similar when compared between the
487 | two alignments. The main difference is that the sector (and independent
488 | components) for the manual alignment systematically includes a few more
489 | residue positions.
490 | 
491 | .. code:: python3
492 | 
493 |     i = Image(filename='../figs/DHFR_decompv2.png'); i
494 | 
495 | 
496 | 
497 | 
498 | .. image:: _static/SCA_DHFR_26_0.png
499 | 
500 | 
501 | 
502 | III. The phylogenetic basis of the sector hierarchy
503 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
504 | 
505 | How does the clear phylogenetic heterogeneity in the MSA influence the
506 | sector definitions? Since the sector definitions for the two alignments
507 | above are roughly equivalent, we only consider the larger (PFAM)
508 | alignment here. To address this, we take advantage of mathematical
509 | methods for mapping between the space of positional and sequence
510 | correlations, as described in *Rivoire et al*. Using this mapping, we
511 | plot the top :math:`k_{pos}` ICs as 2-D scatter plots with the
512 | corresponding sequence space divergence.
513 | 
514 | .. code:: python3
515 | 
516 |     plt.rcParams['figure.figsize'] = 14, 10 
517 |     pairs = [ [x, x+1] for x in range(0, len(Dsect[0]['ics'])-1, 2) ]
518 |     for n,[k1,k2] in enumerate(pairs):
519 |         plt.subplot(2,len(pairs),n+1)
520 |         sca.figUnits(Dsect[0]['Vpica'][:,k1], Dsect[0]['Vpica'][:,k2],\
521 |                      Dsect[0]['ics'], dotsize = 6)
522 |         plt.xlabel(r'$V^p_{%i}$' % (k1+1), fontsize=16)
523 |         plt.ylabel(r'$V^p_{%i}$' % (k2+1), fontsize=16)
524 |         plt.subplot(2,len(pairs),n+1+len(pairs))
525 |         sca.figUnits(Dsect[0]['Upica'][:,k1], Dsect[0]['Upica'][:,k2],\
526 |                      phylo[0], dotsize = 6)
527 |         plt.xlabel(r'$U^p_{%i}$' % (k1+1), fontsize=16)
528 |         plt.ylabel(r'$U^p_{%i}$' % (k2+1), fontsize=16)
529 |     plt.tight_layout()
530 | 
531 | 
532 | 
533 | .. image:: _static/SCA_DHFR_29_0.png
534 | 
535 | 
536 | .. code:: python3
537 | 
538 |     plt.rcParams['figure.figsize'] = 20,8 
539 |     col = list()
540 |     axis_lims = ([-0.06, 0.08],[-0.03, -0.01],[-0.05,0.03],[-0.01 ,0.05],\
541 |                  [-0.02 ,0.05],[-0.05 ,0.03])
542 |     for k in phylo[0]:
543 |         col = col + [colorsys.hsv_to_rgb(k.col,1,1)]
544 |     for k in range(Dsect[0]['kpos']):
545 |         forhist = list()
546 |         for group in phylo[0]:
547 |             forhist.append([Dsect[0]['Upica'][i,k] for i in group.items])
548 |         plt.subplot(2,Dsect[0]['kpos'],k+1)
549 |         plt.hist(forhist, histtype='barstacked',color=col)
550 |         plt.axis([axis_lims[k][0],axis_lims[k][1],0,600])
551 |         plt.xlabel(r'$U^p_{%i}$' % (k+1), fontsize=16)
552 |     plt.tight_layout()
553 | 
554 | 
555 | 
556 | .. image:: _static/SCA_DHFR_30_0.png
557 | 
558 | 
559 | We see some association of phylogeny with sector positions at the phylum
560 | level: for example the positions along :math:`V_3^p` are associated with
561 | the divergence of some bacteroidetes *(yellow)* and
562 | firmicutes\ *(green)* along :math:`U_3^p`. Further, the positions along
563 | ICs :math:`V_1^p`,\ :math:`V_5^p` and :math:`V_6^p` seem to separate the
564 | eukaryotes *(red)* from the prokaryotes.
565 | 
566 | So in conclusion, the DHFR family appears to have a single sector that
567 | can be decomposed into roughly six groups due to sequence divergence.
568 | Notably, the sector definition (and decomposition into ICs) is very
569 | similar for the two different sequence alignments.
570 | 


--------------------------------------------------------------------------------
/docs/source/SCA_G.rst:
--------------------------------------------------------------------------------
  1 | SCA 6.1 - The G Protein Family
  2 | ==============================
  3 | 
  4 | **Summary:** This script describes the basic flow of analytical steps in
  5 | SCA6.0, using the G-protein (small GTPase, Ras-like) family as an
  6 | example (PFAM PF00071). The analysis consists of five steps, which
  7 | directly follow from the accompanying publication (Rivoire et al, “An
  8 | Evolution-Based Functional Decomposition of Natural Proteins”):
  9 | 
 10 | **1) Alignment processing and composition.** We begin by analyzing the
 11 | composition of the multiple sequence alignment: what is the number of
 12 | effective sequences, and how are the sequences structured into families?
 13 | For the G-protein family, the PFAM alignment sequence space is
 14 | well-sampled and fairly homogeneous (unstructured), as evidenced by the
 15 | fact that overall alignment sequence similarity shows a unimodal
 16 | distribution near 25%.
 17 | 
 18 | **2) First-order statistics: position-specific conservation.** Next, we
 19 | examine overall positional conservation for the sequence alignment. This
 20 | shows the expected result, that a handful of positions are strongly
 21 | conserved.
 22 | 
 23 | **3) Second-order statistics: conserved correlations.** Plots of the SCA
 24 | matrix (:math:`\tilde{C_{ij}}`), the associated eigenvectors and
 25 | eigenspectrum, and the corresponding independent components (IC). We
 26 | choose the number of significant eigenmodes, :math:`k^* = 4`, by
 27 | comparison of the eigenspectrum of :math:`\tilde{C_{ij}}` to that for a
 28 | collection of 10 randomized alignments.
 29 | 
 30 | **4) Sector decomposition.** Sector definition based on the top
 31 | :math:`k^*` ICs. We begin by fitting the top :math:`k^*` statistically
 32 | significant ICs to a t-distribution, and then identify the amino acid
 33 | positions contributing to the top five percent of the corresponding
 34 | cumulative density function. We then construct a sub-matrix of
 35 | :math:`\tilde{C_{ij}}` that contains only the selected top-scoring
 36 | positions for the :math:`k^*` ICs, ordered by their degree of
 37 | contribution to each IC. This plot is used to choose sector assignments.
 38 | For the g-protein family, we define two sectors, sector 1 composed of
 39 | ICs 1,2, and 3, and sector 2 composed of IC 4. Related to Figs. 4 and 5
 40 | of the main text.
 41 | 
 42 | **5) Sequence subfamilies and the basis of sector hierarchy.** We relate
 43 | the two sectors (and underlying ICs) to the pattern of divergence
 44 | between amino acid sequences. To do this, we make a mapping between
 45 | sequence space and positional correlations using singular value
 46 | decomposition. We see that the amino acid positions associated with IC1
 47 | and IC2 differentiate between different g-protein subclasses, suggesting
 48 | that these regions might tune allosteric regulation in a subclass
 49 | specific way.
 50 | 
 51 | Prior to running this tutorial, the core calculation scripts must be
 52 | executed from the command line as follows:
 53 | 
 54 | ::
 55 | 
 56 |    >> scaProcessMSA -a ../data/PF00071_rd2.an -b ../data/ -s 5P21 -c A -f 'Homo sapiens' -t -n
 57 |    >> scaCore -i ../output/PF00071_rd2.db
 58 |    >> scaSectorID -i ../output/PF00071_rd2.db
 59 | 
 60 | Note that we supply pre-annotated alignments for all tutorial scripts
 61 | *(the annotateMSA step is slow, and should only be run once)*.
 62 | 
 63 | **O.Rivoire, K.Reynolds and R.Ranganathan** 1/2015
 64 | 
 65 | .. code:: python3
 66 | 
 67 |     import os
 68 |     import time
 69 |     import matplotlib.pyplot as plt
 70 |     import numpy as np
 71 |     import copy
 72 |     import scipy.cluster.hierarchy as sch
 73 |     from scipy.stats import scoreatpercentile 
 74 |     import matplotlib.image as mpimg
 75 |     from IPython.display import display
 76 |     from IPython.display import Image
 77 |     from Bio.Seq import Seq
 78 |     from Bio import motifs
 79 |     import colorsys
 80 |     from pysca import scaTools as sca
 81 |     # import mpld3
 82 |     import pickle as pickle
 83 |     from optparse import OptionParser
 84 |     
 85 |     %matplotlib inline
 86 |     
 87 |     if not os.path.exists('../output/'):
 88 |         os.makedirs('../output/')  
 89 | 
 90 | To begin, we read in the results of the above three scripts
 91 | (scaProcessMSA, scaCore and scaSectorID), stored as three dictionaries
 92 | in the database PF00071_rd2.db. To see what variables are stored in
 93 | each dictionary, use the command dictionary.keys( ), e.g.:
 94 | 
 95 | ::
 96 | 
 97 |    >>> print(Dseq.keys())
 98 | 
 99 | .. code:: python3
100 | 
101 |     db = pickle.load(open('../output/PF00071_rd2.db','rb'))
102 |     Dseq = db['sequence']  #the results of scaProcessMSA
103 |     Dsca = db['sca']       #the results of scaCore
104 |     Dsect = db['sector']   #the results of scaSectorID
105 | 
106 | I. Alignment processing and composition
107 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
108 | 
109 | First, we print out a few statistics describing the alignment:
110 | 
111 | .. code:: python3
112 | 
113 |     print("After processing, the alignment size is %i sequences and %i positions" % \
114 |           (Dseq['Nseq'], Dseq['Npos']))
115 |     print("With sequence weights, there are %i effective sequences" % (Dseq['effseqs']))
116 | 
117 | 
118 | .. parsed-literal::
119 | 
120 |     After processing, the alignment size is 4974 sequences and 158 positions
121 |     With sequence weights, there are 3366 effective sequences
122 | 
123 | 
124 | To examine alignment composition, we plot a histogram of all pairwise
125 | sequence identities *(left panel)* and a global view of the sequence
126 | similarity matrix (defined by :math:`S\equiv \frac{1}{L}XX^\top`)
127 | *(right panel)*. The data show that the alignment is described by a
128 | nearly homogeneous distribution of sequence identities with a mean value
129 | of about 25%.
130 | 
131 | .. code:: python3
132 | 
133 |     # List all elements above the diagonal (i<j):
134 |     listS = [Dsca['simMat'][i,j] for i in range(Dsca['simMat'].shape[0]) \
135 |              for j in range(i+1, Dsca['simMat'].shape[1])]
136 |     
137 |     # Cluster the sequence similarity matrix
138 |     Z = sch.linkage(Dsca['simMat'],method = 'complete', metric = 'cityblock')
139 |     R = sch.dendrogram(Z,no_plot = True)
140 |     ind = R['leaves']
141 |     
142 |     # Plotting
143 |     plt.rcParams['figure.figsize'] = 9, 4 
144 |     plt.subplot(121)
145 |     plt.hist(listS, int(round(Dseq['Npos']/2)))
146 |     plt.xlabel('Pairwise sequence identities', fontsize=14)
147 |     plt.ylabel('Number', fontsize=14)
148 |     plt.subplot(122)
149 |     plt.imshow(Dsca['simMat'][np.ix_(ind,ind)], vmin=0, vmax=1); plt.colorbar();
150 | 
151 | 
152 | 
153 | .. image:: _static/SCA_G_9_0.png
154 | 
155 | 
156 | **Sequence annotations:** We construct a dictionary of phylogenetic
157 | annotations (parsed from the sequence headers) and check the
158 | representation of sequences in the top taxonomic levels. For each level,
159 | we print a list of taxonomic classes, and the number of sequences found
160 | for each.
161 | 
162 | .. code:: python3
163 | 
164 |     #construct a dictionary of phylogenetic groups
165 |     annot = dict()
166 |     for i, h in enumerate(Dseq['hd']):
167 |         hs = h.split('|')
168 |         annot[hs[0]] = sca.Annot(hs[1], hs[2], hs[3].replace('.',''))
169 |         
170 |     # Most frequent taxonomic groups:
171 |     atleast = 10
172 |     for level in range(4):
173 |         descr_list = [a.taxo.split(',')[level] for a in annot.values() \
174 |                       if len(a.taxo.split(',')) > level]
175 |         descr_dict = {k:descr_list.count(k) for k in descr_list \
176 |                       if descr_list.count(k)>=atleast}
177 |         print('\n Level %i:' % level)
178 |         print(descr_dict)
179 | 
180 | 
181 | .. parsed-literal::
182 | 
183 |     
184 |      Level 0:
185 |     {'Eukaryota': 4879, 'Bacteria': 74}
186 |     
187 |      Level 1:
188 |     {'Metazoa': 2048, 'Alveolata': 530, 'Heterolobosea': 143, 'Parabasalia': 321, 'Cyanobacteria': 20, 'Bacteroidetes': 18, 'Fungi': 702, 'Euglenozoa': 148, 'Amoebozoa': 510, 'stramenopiles': 159, 'Proteobacteria': 27, 'Viridiplantae': 192, 'Choanoflagellida': 47, 'Ichthyosporea': 22, 'Diplomonadida': 32, 'Oxymonadida': 14}
189 |     
190 |      Level 2:
191 |     {'Chordata': 985, 'Ciliophora': 408, 'Schizopyrenida': 143, 'Arthropoda': 520, 'Trichomonadida': 320, 'Dikarya': 622, 'Placozoa': 44, 'Porifera': 105, 'Kinetoplastida': 148, 'Archamoebae': 255, 'Cnidaria': 78, 'Nematoda': 175, 'Platyhelminthes': 89, 'Mycetozoa': 255, 'Chytridiomycota': 15, 'Bacillariophyta': 23, 'Blastocystis': 20, 'Pelagophyceae': 25, 'PX clade': 24, 'Gammaproteobacteria': 10, 'Streptophyta': 137, 'Oomycetes': 67, 'Echinodermata': 46, 'Salpingoecidae': 25, 'Apicomplexa': 89, 'Microsporidia': 46, 'Chlorophyta': 55, 'Capsaspora': 22, 'Perkinsea': 32, 'Codonosigidae': 22, 'Fungi incertae sedis': 17, 'Hexamitidae': 32}
192 |     
193 |      Level 3:
194 |     {'Craniata': 809, 'Tunicata': 122, 'Intramacronucleata': 408, 'Vahlkampfiidae': 143, 'Hexapoda': 402, 'Trichomonadidae': 320, 'Ascomycota': 471, 'Trichoplax': 44, 'Demospongiae': 105, 'Basidiomycota': 151, 'Trypanosomatidae': 148, 'Entamoebidae': 255, 'Anthozoa': 76, 'Chromadorea': 154, 'Trematoda': 85, 'Crustacea': 73, 'Dictyosteliida': 253, 'Chytridiomycetes': 15, 'Coscinodiscophyceae': 16, 'Chelicerata': 45, 'Aureococcus': 25, 'Enoplea': 21, 'Phaeophyceae': 23, 'Embryophyta': 136, 'Cephalochordata': 54, 'Albuginales': 22, 'Eleutherozoa': 46, 'Salpingoeca': 25, 'Coccidia': 37, 'Unikaryonidae': 17, 'Mamiellophyceae': 24, 'Trebouxiophyceae': 12, 'Aconoidasida': 52, 'Perkinsida': 32, 'Peronosporales': 45, 'Enterocytozoonidae': 11, 'Monosiga': 22, 'Early diverging fungal lineages': 17, 'Giardiinae': 32, 'Chlorophyceae': 19}
195 | 
196 | 
197 | Based on this, we select taxonomic groups and associate them to colors
198 | for representation. We choose broad taxonomic groups that are
199 | well-represented in the alignment (corresponding to Level 1). To see a
200 | complete legend that maps numeric codes to color, use:
201 | 
202 | ::
203 | 
204 |    >>> sca.figColors()
205 | 
206 | .. code:: python3
207 | 
208 |     phylo = list();
209 |     fam_names = ['Metazoa', 'Amoebozoa', 'Viridiplantae', 'Fungi',\
210 |                  'Alveolata', 'Parabasalia']
211 |     col = (0, 0.6, 0.38, 0.18, 0.8, 0.5)
212 |     #Metazoa = red, Amoebozoa = blue, Viridiplantae = green, 
213 |     #Fungi = yellow, Alveolata = purple, Parabasalia = cyan
214 |     for i,k in enumerate(fam_names):
215 |         sf = sca.Unit()
216 |         sf.name = fam_names[i].lower()
217 |         sf.col = col[i]
218 |         sf.items = [j for j,q in enumerate(Dseq['hd'])  if sf.name in q.lower()]
219 |         phylo.append(sf)
220 | 
221 | We also attempt to annotate the sequences by their declared sub-class of
222 | G protein - Ras, Rab, Rac, and Rho. These annotations are simply parsed
223 | from the header, and could contain mis-assignments.
224 | 
225 | .. code:: python3
226 | 
227 |     gprot_names = ['Ras', 'Rab', 'Rac','Rho']
228 |     gprot_classes = list()
229 |     col = (0, 0.65, 0.15, 0.38)
230 |     #Ras = red, Rab = blue, Rac = yellow, Rho = green
231 |     for c,k in enumerate(gprot_names):
232 |         gp = sca.Unit()
233 |         gp.col = col[c]
234 |         gp.name = k
235 |         gp.items = [i for i,h in enumerate(Dseq['hd']) if k in h]
236 |         gprot_classes.append(gp)
237 | 
238 | To examine the relationship between global sequence similarity,
239 | phylogeny, and functional sub-class, we plot the top six independent
240 | components of the sequence correlation matrix (including sequence
241 | weights). In these plots, each point represents a particular sequence,
242 | and the distance between points reflects global sequence identity. In
243 | the top row each point (sequence) is color coded by phylogenetic
244 | annotation, in the bottom row, they are color-coded by g-protein class.
245 | 
246 | .. code:: python3
247 | 
248 |     plt.rcParams['figure.figsize'] = 9, 8
249 |     U = Dsca['Uica'][1]
250 |     pairs = [[i,i+1] for i in range(0,6,2)]
251 |     for k,[k1,k2] in enumerate(pairs):
252 |         plt.subplot(2,3,k+1)
253 |         sca.figUnits(U[:,k1], U[:,k2], phylo)
254 |         plt.xlabel(r"$IC_{seq}^{%i}$"%(k1+1), fontsize=16)
255 |         plt.ylabel(r"$IC_{seq}^{%i}$"%(k2+1), fontsize=16)
256 |         plt.subplot(2,3,k+4)
257 |         sca.figUnits(U[:,k1], U[:,k2], gprot_classes)
258 |         plt.xlabel(r"$IC_{seq}^{%i}$"%(k1+1), fontsize=16)
259 |         plt.ylabel(r"$IC_{seq}^{%i}$"%(k2+1), fontsize=16)
260 |     plt.tight_layout()
261 | 
262 | 
263 | 
264 | .. image:: _static/SCA_G_17_0.png
265 | 
266 | 
267 | The data show a mixed distribution of phylogenetic groups along modes
268 | 1-5. A subset of metazoan sequences emerges along mode six,
269 | :math:`IC^{6}_{seq}`. In contrast, the top modes of the sequence
270 | similarity matrix do seem to correspond to functional G protein
271 | subclasses. For example, the Rho proteins *(green)* emerge along
272 | :math:`IC^{2}_{seq}` , the Ras proteins *(red)* along
273 | :math:`IC^{3}_{seq}`, and a subset of Rabs *(blue)* along
274 | :math:`IC^{4}_{seq}` and :math:`IC^{5}_{seq}` and a subset of Ras
275 | proteins along :math:`IC^{6}_{seq}`. Many G-protein paralogs (reflecting
276 | different subclasses) can be found in each type of organism, and thus
277 | the global pattern of sequence divergence is distinct from phylogeny.
278 | 
279 | II.  First-order statistics: position-specific conservation. 
280 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
281 | 
282 | Plot the position-specific conservation values for each g-protein
283 | position. :math:`D_i` is calculated according to equation S4
284 | (supplemental information).
285 | 
286 | .. code:: python3
287 | 
288 |     fig, axs = plt.subplots(1,1, figsize=(9,4))
289 |     xvals = [i+1 for i in range(len(Dsca['Di']))]
290 |     xticks = [0,45,95,144]
291 |     plt.bar(xvals,Dsca['Di'], color='k')
292 |     plt.tick_params(labelsize=11); plt.grid()
293 |     axs.set_xticks(xticks);
294 |     labels = [Dseq['ats'][k] for k in xticks]
295 |     axs.set_xticklabels(labels);
296 |     plt.xlabel('Amino acid position', fontsize=18); plt.ylabel('Di', fontsize=18);
297 | 
298 | 
299 | 
300 | .. image:: _static/SCA_G_21_0.png
301 | 
302 | 
303 | III. Second-order statistics: conserved correlations.
304 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
305 | 
306 | Plot the SCA correlation matrix ( :math:`\tilde{C_{ij}}` ) computed
307 | according to Equations 4+5
308 | 
309 | .. code:: python3
310 | 
311 |     plt.rcParams['figure.figsize'] = 13, 8
312 |     plt.imshow(Dsca['Csca'], vmin=0, vmax=1.4,interpolation='none',\
313 |                aspect='equal')
314 | 
315 | 
316 | 
317 | 
318 | .. parsed-literal::
319 | 
320 |     <matplotlib.image.AxesImage at 0x7f7db4628670>
321 | 
322 | 
323 | 
324 | 
325 | .. image:: _static/SCA_G_24_1.png
326 | 
327 | 
328 | Plot the eigenspectrum of (1) the SCA positional coevolution matrix
329 | (:math:`\tilde{C_{ij}}`) *(black bars)* and (2) 10 trials of matrix
330 | randomization for comparison. This graph is used to choose the number of
331 | significant eigenmodes (:math:`k^* = 4`).
332 | 
333 | .. code:: python3
334 | 
335 |     plt.rcParams['figure.figsize'] = 9, 4 
336 |     hist0, bins = np.histogram(Dsca['Lrand'].flatten(), bins=Dseq['Npos'], \
337 |                                range=(0,Dsect['Lsca'].max()))
338 |     hist1, bins = np.histogram(Dsect['Lsca'], bins=Dseq['Npos'], \
339 |                                range=(0,Dsect['Lsca'].max()))
340 |     plt.bar(bins[:-1], hist1, np.diff(bins),color='k')
341 |     plt.plot(bins[:-1], hist0/Dsca['Ntrials'], 'r', linewidth=3)
342 |     plt.tick_params(labelsize=11)
343 |     plt.xlabel('Eigenvalues', fontsize=18); plt.ylabel('Numbers', fontsize=18);
344 |     print('Number of eigenmodes to keep is %i' %(Dsect['kpos']))
345 |     #mpld3.display()
346 | 
347 | 
348 | .. parsed-literal::
349 | 
350 |     Number of eigenmodes to keep is 4
351 | 
352 | 
353 | 
354 | .. image:: _static/SCA_G_26_1.png
355 | 
356 | 
357 | Plot the top significant eigenmodes *(top row)* and associated
358 | independent components *(bottom row)*. The ICs are an optimally
359 | independent representation of the four different residue groups.
360 | 
361 | .. code:: python3
362 | 
363 |     plt.rcParams['figure.figsize'] = 9, 6
364 |     EVs = Dsect['Vsca']
365 |     ICs = Dsect['Vpica']
366 |     pairs = [ [x,x+1] for x in range(Dsect['kpos']-1)]
367 |     ncols = len(pairs)
368 |     for k,[k1,k2] in enumerate(pairs):
369 |         plt.subplot(2,ncols,k+1)
370 |         plt.plot(EVs[:,k1], EVs[:,k2], 'ok')
371 |         plt.xlabel("EV%i"%(k1+1), fontsize=16)
372 |         plt.ylabel("EV%i"%(k2+1), fontsize=16)
373 |         plt.subplot(2,ncols,k+1+ncols)
374 |         plt.plot(ICs[:,k1], ICs[:,k2], 'ok')
375 |         plt.xlabel("IC%i"%(k1+1), fontsize=16)
376 |         plt.ylabel("IC%i"%(k2+1), fontsize=16)
377 |     plt.tight_layout()
378 | 
379 | 
380 | 
381 | .. image:: _static/SCA_G_28_0.png
382 | 
383 | 
384 | IV.  Sector decomposition. 
385 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
386 | 
387 | To define the positions with significant contributions to each of the
388 | independent components (ICs), we make an empirical fit for each IC to the
389 | t-distribution and select positions with greater than a specified cutoff
390 | on the CDF. We choose :math:`p=0.95` as our cutoff. Note that since some
391 | positions might contribute significantly to more than one IC (an
392 | indication of non-independence of ICs), we apply a simple algorithm to
393 | assign such positions to one IC. Specifically, we assign positions to
394 | the IC with which it has the greatest degree of co-evolution.
395 | 
396 | The data indicate generally good fits for the top four ICs (also shown
397 | in supplemental figure S2), and we return the positions contributing to
398 | each IC in a format suitable for cut and paste into PyMol.
399 | 
400 | .. code:: python3
401 | 
402 |     plt.rcParams['figure.figsize'] = 8, 8 
403 |     
404 |     Vpica = Dsect['Vpica']
405 |     for k in range(Dsect['kpos']):
406 |         iqr = scoreatpercentile(Vpica[:,k],75) - scoreatpercentile(Vpica[:,k],25)
407 |         binwidth=2*iqr*(len(Vpica)**(-0.33))
408 |         nbins=int(round((max(Vpica[:,k])-min(Vpica[:,k]))/binwidth))
409 |         plt.subplot(Dsect['kpos'],1,k+1)
410 |         h_params = plt.hist(Vpica[:,k], nbins)
411 |         x_dist = np.linspace(min(h_params[1]), max(h_params[1]), num=100)
412 |         plt.plot(x_dist,Dsect['scaled_pd'][k],'r',linewidth = 2)  
413 |         plt.plot([Dsect['cutoff'][k],Dsect['cutoff'][k]], [0,60], 'k--',linewidth = 1)
414 |         plt.xlabel(r'$V^p_{%i}$'%(k+1), fontsize=14)
415 |         plt.ylabel('Number', fontsize=14)
416 |     plt.tight_layout()    
417 |     
418 |     for n,ipos in enumerate(Dsect['ics']):
419 |         sort_ipos = sorted(ipos.items)
420 |         ats_ipos = ([Dseq['ats'][s] for s in sort_ipos])
421 |         ic_pymol = ('+'.join(ats_ipos))
422 |         print('IC %i is composed of %i positions:' % (n+1,len(ats_ipos)))
423 |         print(ic_pymol + "\n")
424 |             
425 | 
426 | 
427 | .. parsed-literal::
428 | 
429 |     IC 1 is composed of 19 positions:
430 |     22+32+34+36+39+42+54+63+64+68+71+73+75+81+83+85+110+116+144
431 |     
432 |     IC 2 is composed of 8 positions:
433 |     5+11+56+61+62+72+96+99
434 |     
435 |     IC 3 is composed of 16 positions:
436 |     10+14+15+16+28+35+57+58+59+60+117+119+145+146+147+156
437 |     
438 |     IC 4 is composed of 13 positions:
439 |     17+23+82+84+90+115+123+125+129+130+134+141+143
440 |     
441 | 
442 | 
443 | 
444 | .. image:: _static/SCA_G_31_1.png
445 | 
446 | 
447 | To define protein sectors, we examine the structure of the SCA
448 | positional correlation matrix with positions contributing to the top
449 | independent components (ICs) ordered by weight (*left panel*). This
450 | provides a basis to determine/interpret which ICs are truly
451 | statistically independent (defining an independent sector) and which
452 | represent hierarchical breakdowns of one sector. In this case, the data
453 | suggest that ICs 1, 2, and 3 have strong inter-IC correlations and
454 | should be considered a single sector, and IC4 shows little correlation
455 | with other ICs, implying a distinct sector (see the dendrogram that
456 | follows). In the *right panel* the ICs are re-ordered to reflect this
457 | decomposition.
458 | 
459 | .. code:: python3
460 | 
461 |     #plot the SCA positional correlation matrix, ordered by contribution to the top ICs
462 |     plt.rcParams['figure.figsize'] = 9, 9 
463 |     plt.subplot(121)
464 |     plt.imshow(Dsca['Csca'][np.ix_(Dsect['sortedpos'], Dsect['sortedpos'])], \
465 |                vmin=0, vmax=2.2,interpolation='none',\
466 |                aspect='equal',extent=[0,sum(Dsect['icsize']),\
467 |                                       0,sum(Dsect['icsize'])])
468 |     line_index=0
469 |     for i in range(Dsect['kpos']):
470 |         plt.plot([line_index+Dsect['icsize'][i],line_index+Dsect['icsize'][i]],\
471 |                  [0,sum(Dsect['icsize'])],'w', linewidth = 2)
472 |         plt.plot([0,sum(Dsect['icsize'])],[sum(Dsect['icsize'])-\
473 |                 line_index,sum(Dsect['icsize'])-line_index],'w', linewidth = 2)
474 |         line_index += Dsect['icsize'][i] 
475 |     
476 |     #define the new sector groupings - 2 total
477 |     sec_groups = ([0,1,2],[3])
478 |     sectors = list()
479 |     c = [0.66, 0]
480 |     for n,k in enumerate(sec_groups):
481 |         s = sca.Unit()
482 |         all_items = list()
483 |         all_Vp = list()
484 |         for i in k: 
485 |             all_items = all_items+Dsect['ics'][i].items
486 |             all_Vp = all_Vp+list(Dsect['ics'][i].vect)
487 |         svals = np.argsort(all_Vp)    
488 |         s.items = [all_items[i] for i in svals]
489 |         s.col = c[n]
490 |         sectors.append(s)
491 |     
492 |     #plot the re-ordered matrix
493 |     plt.subplot(122)
494 |     line_index=0
495 |     sortpos = list()
496 |     for s in sectors:
497 |         sortpos.extend(s.items)
498 |     plt.imshow(Dsca['Csca'][np.ix_(sortpos, sortpos)], vmin=0, vmax=2.2,\
499 |                interpolation='none',aspect='equal',\
500 |                extent=[0,len(sortpos),0,len(sortpos)])
501 |     for s in sectors:
502 |         plt.plot([line_index+len(s.items),line_index+len(s.items)],\
503 |                  [0,len(sortpos)],'w', linewidth = 2)
504 |         plt.plot([0,sum(Dsect['icsize'])],[len(sortpos)-line_index,\
505 |                         len(sortpos)-line_index],'w', linewidth = 2)
506 |         line_index += len(s.items)
507 |     plt.tight_layout()
508 | 
509 | 
510 | 
511 | .. image:: _static/SCA_G_33_0.png
512 | 
513 | 
514 | The below dendrogram diagrams the relationship between independent
515 | components. In this plot, solid lines represent physically contiguous
516 | structural units, and dashed lines indicate spatially fragmented groups
517 | of residues. We see that ICs 1, 2, and 3 combine to form a single sector
518 | (sector 1), and that sector 2 (IC4) is more independent.
519 | 
520 | .. code:: python3
521 | 
522 |     i = Image(filename='../figs/Gprot_sec_hier.png'); i
523 | 
524 | 
525 | 
526 | 
527 | .. image:: _static/SCA_G_35_0.png
528 | 
529 | 
530 | 
531 | The assignments have clear physical consistency with the concept of
532 | sectors as functional, physically contiguous units in the protein
533 | structure (see also Figs.4-5). In the *left panels*, sector one is
534 | formed from the combination of positions in IC1 *(bright blue)*, IC2
535 | *(light blue)* and IC3 *(cyan)*. Sector 2 (IC4) is shown in red spheres,
536 | and forms a physically contiguous unit structurally distinct from sector
537 | one.
538 | 
539 | .. code:: python3
540 | 
541 |     i = Image(filename = '../figs/Gprot_secstruct.png'); i
542 | 
543 | 
544 | 
545 | 
546 | .. image:: _static/SCA_G_37_0.png
547 | 
548 | 
549 | 
550 | Print the sector positions, in a format suitable for pyMol, and create a
551 | pyMol session (in the output directory) with the sectors (and
552 | decomposition into independent components) as separate objects.
553 | 
554 | .. code:: python3
555 | 
556 |     for i,k in enumerate(sectors):
557 |         sort_ipos = sorted(k.items)
558 |         ats_ipos = ([Dseq['ats'][s] for s in sort_ipos])
559 |         ic_pymol = ('+'.join(ats_ipos))
560 |         print('Sector %i is composed of %i positions:' % (i+1,len(ats_ipos)))
561 |         print(ic_pymol + "\n")
562 |     sca.writePymol('5P21', sectors, Dsect['ics'], Dseq['ats'], \
563 |                    '../output/PF00071.pml','A', '../Inputs/', 0)  
564 | 
565 | 
566 | .. parsed-literal::
567 | 
568 |     Sector 1 is composed of 43 positions:
569 |     5+10+11+14+15+16+22+28+32+34+35+36+39+42+54+56+57+58+59+60+61+62+63+64+68+71+72+73+75+81+83+85+96+99+110+116+117+119+144+145+146+147+156
570 |     
571 |     Sector 2 is composed of 13 positions:
572 |     17+23+82+84+90+115+123+125+129+130+134+141+143
573 |     
574 | 
575 | 
576 | V. Sequence subfamilies and the basis of sector hierarchy.
577 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
578 | 
579 | How does the phylogenetic and functional heterogeneity in the MSA
580 | influence the sector definitions? To address this, we take advantage of
581 | mathematical methods for mapping between the space of positional and
582 | sequence correlations, as described in *Rivoire et al* (see equations
583 | 8-11). Using this mapping, we plot the top :math:`k^*` ICs of the matrix
584 | :math:`\tilde{C_{ij}}` as 2-D scatter plots *(top row)*, and compare
585 | them to the corresponding sequence space divergence *(middle and bottom
586 | rows)*. The amino acid positions contributing to each IC are colored by
587 | sector *(sector 1 = blue, sector 2 = red, top row)*. The sequences are
588 | color-coded according to phylogenetic classifications *(middle row)* or
589 | G-protein class *(bottom row)* as we defined above.
590 | 
591 | .. code:: python3
592 | 
593 |     plt.rcParams['figure.figsize'] = 14, 10 
594 |     pairs = [ [x,x+1] for x in range(Dsect['kpos']-1)]
595 |     ncols = len(pairs)
596 |     for n,[k1,k2] in enumerate(pairs):
597 |         plt.subplot(3,ncols,n+1)
598 |         sca.figUnits(Dsect['Vpica'][:,k1], Dsect['Vpica'][:,k2], \
599 |                      sectors, dotsize = 6)
600 |         plt.xlabel('IC%i' % (k1+1), fontsize=16)
601 |         plt.ylabel('IC%i' % (k2+1), fontsize=16)
602 |         plt.subplot(3,ncols,n+1+ncols)
603 |         sca.figUnits(Dsect['Upica'][:,k1], Dsect['Upica'][:,k2], \
604 |                      phylo, dotsize = 6)
605 |         plt.xlabel(r'$U^p_{%i}$' % (k1+1), fontsize=16)
606 |         plt.ylabel(r'$U^p_{%i}$' % (k2+1), fontsize=16)
607 |         plt.subplot(3,ncols,n+1+ncols*2)
608 |         sca.figUnits(Dsect['Upica'][:,k1], Dsect['Upica'][:,k2], \
609 |                      gprot_classes, dotsize = 6)
610 |         plt.xlabel(r'$U^p_{%i}$' % (k1+1), fontsize=16)
611 |         plt.ylabel(r'$U^p_{%i}$' % (k2+1), fontsize=16)
612 |     plt.tight_layout()
613 | 
614 | 
615 | 
616 | .. image:: _static/SCA_G_42_0.png
617 | 
618 | 
619 | There is some clear divergence in G-protein subtype along :math:`U_1^p`
620 | and :math:`U_2^p`, indicating that the amino acid positions associated
621 | with IC1 and IC2 vary in a subtype-specific pattern. To more clearly see
622 | separations in sequence classification, we also plot the above
623 | distributions of sequences (along :math:`U_1^p`,
624 | :math:`U_2^p`, :math:`U_3^p`, and :math:`U_4^p`) as stacked bar plots.
625 | This representation lets us directly see the contribution of sequences
626 | that might be hidden (due to overlapping points) on the above scatter
627 | plots. The *top row* reflects phylogenetic classifications and the
628 | *bottom row* shows G-protein functional classes.
629 | 
630 | .. code:: python3
631 | 
632 |     plt.rcParams['figure.figsize'] = 15, 4 
633 |     
634 |     col = list()
635 |     for k in gprot_classes:
636 |         col = col + [colorsys.hsv_to_rgb(k.col,1,1)]
637 |     for k in range(Dsect['kpos']):
638 |         forhist = list()
639 |         for group in gprot_classes:
640 |             forhist.append([Dsect['Upica'][i,k] for i in group.items])
641 |         plt.subplot(2,Dsect['kpos'],k+5)
642 |         plt.hist(forhist, histtype='barstacked',color=col)
643 |         plt.xlabel(r'$U^p_{%i}$' % (k+1), fontsize=16)
644 |         
645 |     col = list()
646 |     for k in phylo:
647 |         col = col + [colorsys.hsv_to_rgb(k.col,1,1)]
648 |     for k in range(Dsect['kpos']):
649 |         forhist = list()
650 |         for group in phylo:
651 |             forhist.append([Dsect['Upica'][i,k] for i in group.items])
652 |         plt.subplot(2,Dsect['kpos'],k+1)
653 |         plt.hist(forhist, histtype='barstacked',color=col)
654 |     
655 |     plt.tight_layout()
656 | 
657 | 
658 | 
659 | .. image:: _static/SCA_G_44_0.png
660 | 
661 | 
662 | The interpretation for the two sectors is clear:
663 | 
664 | **Sector 1** is composed of ICs 1,2 and 3 - we see above that the
665 | positions contributing to IC1 and IC2 separate out the Ras-like *(red)*
666 | and Rho *(green)* g-protein functional classes (see the plots of
667 | :math:`U_1^p` and :math:`U_2^p` above). In contrast, the positions along
668 | IC3 and IC4 are associated with a homogeneous pattern of sequences; that
669 | is they have no obvious relationship to g-protein class or phylogeny.
670 | This suggests that sector 1 consists of a core element (IC3) that is
671 | conserved among G-proteins and two related/co-evolving parts which
672 | diverge in particular G-protein functional classes. The structural
673 | mapping of these positions is consistent with this interpretation - we
674 | observe that the positions associated with IC3 form the base of the
675 | nucleotide binding pocket (a general feature of the g-protein family)
676 | and that the IC1 and IC2 positions form a peripheral shell, which may
677 | reflect functional divergence in G-protein regulatory mechanisms in
678 | different family members.
679 | 
680 | **Sector 2** is defined along (:math:`V_4^p`). The sequences along the
681 | corresponding component (:math:`U_4^p`) are homogeneously distributed
682 | with respect to both phylogeny and g-protein functional class,
683 | consistent with the notion that this sector is likely a global property
684 | of the entire alignment.
685 | 


--------------------------------------------------------------------------------
/docs/source/SCA_betalactamase.rst:
--------------------------------------------------------------------------------
  1 | SCA 6.1 - The Beta-lactamase Enzyme Family
  2 | ==========================================
  3 | 
  4 | This script describes the basic flow of the analytical steps in SCA6.0,
  5 | using the :math:`\beta`-lactamase enzyme family as an example (PFAM
  6 | PF13354). The alignment contains some subfamily structure (clades of
  7 | related sequences) as evidenced in Section 1. We identify two sectors: a
  8 | core sector surrounding the active site that is shared across all
  9 | sequences, and a more peripheral sector containing groups of residues
 10 | that diverge in particular subfamilies. For this tutorial, the core
 11 | scripts should be run as follows:
 12 | 
 13 | ::
 14 | 
 15 |    >> annotateMSA -i ../data/PF13354_full.txt -o ../data/PF13354_full.an -a 'pfam' -p ../data/pfamseq.txt
 16 |    >> scaProcessMSA -a ../data/PF13354_full.an -b ../data/ -s 1FQG -c A -f 'Escherichia coli' -t -n
 17 |    >> scaCore -i ../output/PF13354_full.db
 18 |    >> scaSectorID -i ../output/PF13354_full.db
 19 | 
 20 | Note that we supply annotated alignments for all tutorial scripts *(the
 21 | annotateMSA step is slow, and should only be run once)*.
 22 | 
 23 | **O.Rivoire, K.Reynolds and R.Ranganathan** 9/2014
 24 | 
 25 | .. code:: python3
 26 | 
 27 |     import os
 28 |     import time
 29 |     import matplotlib.pyplot as plt
 30 |     import math
 31 |     import numpy as np
 32 |     import copy
 33 |     import scipy.cluster.hierarchy as sch
 34 |     from scipy.stats import scoreatpercentile 
 35 |     from pysca import scaTools as sca
 36 |     import colorsys
 37 |     # import mpld3
 38 |     import pickle as pickle
 39 |     from optparse import OptionParser
 40 |     
 41 |     if not os.path.exists('../output/'):
 42 |         os.makedirs('../output/')
 43 |         
 44 |     %matplotlib inline
 45 | 
 46 | Read in the results of the above three scripts (scaProcessMSA, scaCore
 47 | and scaSectorID), stored as three dictionaries in the database
 48 | PF13354_full.db. To see what variables are stored in each dictionary,
 49 | use:
 50 | 
 51 | ::
 52 | 
 53 |    >>> list(db)
 54 | 
 55 | .. code:: python3
 56 | 
 57 |     db = pickle.load(open('../output/PF13354_full.db','rb'))
 58 |     Dseq = db['sequence']
 59 |     Dsca = db['sca']
 60 |     Dsect = db['sector']
 61 | 
 62 | I. Statistical Structure of the Multiple Sequence Alignment (MSA)
 63 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 64 | 
 65 | Plot a histogram of all pairwise sequence identities *(left panel)* and
 66 | a global view of the sequence similarity matrix (defined by
 67 | :math:`S\equiv \frac{1}{L}XX^\top`) *(right panel)*. The data show that
 68 | the alignment is described by a nearly bimodal distribution of sequence
 69 | identities with peaks near 25% and 45%. From the matrix at right, it is
 70 | clear that the alignment is composed of several distinct sequence
 71 | families.
 72 | 
 73 | .. code:: python3
 74 | 
 75 |     # List all elements above the diagonal (i<j):
 76 |     listS = [Dsca['simMat'][i,j] for i in range(Dsca['simMat'].shape[0]) \
 77 |              for j in range(i+1, Dsca['simMat'].shape[1])]
 78 |     
 79 |     # Cluster the sequence similarity matrix
 80 |     Z = sch.linkage(Dsca['simMat'], method = 'complete', metric = 'cityblock')
 81 |     R = sch.dendrogram(Z,no_plot = True)
 82 |     ind = R['leaves']
 83 | 
 84 | .. code:: python3
 85 | 
 86 |     # Plotting
 87 |     plt.rcParams['figure.figsize'] = 9, 4 
 88 |     
 89 |     plt.subplot(121)
 90 |     plt.hist(listS, math.floor(Dseq['Npos']/2))
 91 |     plt.xlabel('Pairwise sequence identities', fontsize=14)
 92 |     plt.ylabel('Number', fontsize=14)
 93 |     
 94 |     plt.subplot(122)
 95 |     plt.imshow(Dsca['simMat'][np.ix_(ind,ind)], vmin=0, vmax=1)
 96 |     plt.colorbar()
 97 | 
 98 | 
 99 | 
100 | 
101 | .. parsed-literal::
102 | 
103 |     <matplotlib.colorbar.Colorbar at 0x7f4b716d5610>
104 | 
105 | 
106 | 
107 | 
108 | .. image:: _static/SCA_betalactamase_8_1.png
109 | 
110 | 
111 | To examine the role of sequence and position weighting on the structure
112 | of the sequence space, we compute correlation matrices between all pairs
113 | of sequences, either with or without sequence and position weights and
114 | project the corresponding sequence space (by eigenvalue decomposition)
115 | down to a small set of top modes that contain the statistically dominant
116 | relationships between sequences. Since eigenvalue decomposition does not
117 | necessarily provide the best representation of sequence groups (for
118 | reasons described in “xx”), we also apply independent components
119 | analysis (or ICA) to the top few eigenmodes; this manipulation provides
120 | a representation in which the top groupings of sequences in the
121 | alignment (if such exists) should separate along the so-called
122 | independent components (or ICs). Below we plot the following eigenmodes
123 | *(top row)* and independent components *(bottom row)*:
124 | 
125 | :math:`\bullet` :math:`U^{(0)}` and :math:`U'^{(0)}`, the top eigenmodes
126 | and ICs without any weights;
127 | 
128 | :math:`\bullet` :math:`U^{(1)}` and :math:`U'^{(1)}` the top eigenmodes
129 | and ICs with sequence weights;
130 | 
131 | :math:`\bullet` :math:`U^{(2)}` and :math:`U'^{(2)}` the top eigenmodes
132 | and ICs with both sequence and positional weights.
133 | 
134 | The sequences are colored by weight, with red indicating the most
135 | strongly downweighted sequences. In contrast to the g-protein example,
136 | we see that application of the sequence and position weights makes the
137 | sequence space apparently more uniform (removes some of the family or
138 | clade-like structure).
139 | 
140 | .. code:: python3
141 | 
142 |     Useq = Dsca['Useq']
143 |     Uica = Dsca['Uica']
144 |     plt.rcParams['figure.figsize'] = 9, 8 
145 |     ica = ["","","","'","'","'"]
146 |     for k,U in enumerate(Useq+Uica):
147 |         plt.subplot(2,3,k+1)
148 |         sca.figWeights(U[:,0], U[:,1], Dseq['seqw'][0])
149 |         plt.xlabel(r'${U%s}^{(%i)}_1$'%(ica[k],k%3), fontsize=16)
150 |         plt.ylabel(r'${U%s}^{(%i)}_2$'%(ica[k],k%3), fontsize=16)
151 |     plt.tight_layout()
152 | 
153 | 
154 | 
155 | .. image:: _static/SCA_betalactamase_10_0.png
156 | 
157 | 
158 | To examine the relationship between divergence in *sequence similarity*
159 | and *phylogeny* in the sequence-weighted alignment, we plot the top
160 | independent components of the sequence correlation matrix (after
161 | sequence weighting), colored by phylogenetic group. We start by
162 | constructing a dictionary of phylogenetic annotations and checking the
163 | representation of sequences in the top taxonomic levels. The annotations
164 | are parsed from the sequence headers.
165 | 
166 | .. code:: python3
167 | 
168 |     #construct a dictionary of phylogenetic groups
169 |     annot = dict()
170 |     for i, h in enumerate(Dseq['hd']):
171 |         hs = sca.parseAlgHeader(h)
172 |         annot[hs[0]] = sca.Annot(hs[1], hs[2], hs[3].replace('.',''))
173 |         
174 |     # Most frequent taxonomic groups:
175 |     atleast = 10
176 |     for level in range(4):
177 |         descr_list = [a.taxo.split(',')[level] for a in annot.values() \
178 |                       if len(a.taxo.split(',')) > level]
179 |         descr_dict = {k:descr_list.count(k) for k in descr_list \
180 |                       if descr_list.count(k)>=atleast}
181 |         print('\n Level %i:' % level)
182 |         print(descr_dict)
183 | 
184 | 
185 | .. parsed-literal::
186 | 
187 |     
188 |      Level 0:
189 |     {'Bacteria': 803}
190 |     
191 |      Level 1:
192 |     {'Proteobacteria': 380, 'Actinobacteria': 145, 'Firmicutes': 119, 'Deinococcus-Thermus': 11, 'Bacteroidetes': 46, 'Cyanobacteria': 59, 'Acidobacteria': 10, 'environmental samples': 18}
193 |     
194 |      Level 2:
195 |     {'Gammaproteobacteria': 200, 'Actinobacteridae': 139, 'Bacillales': 55, 'Deinococci': 11, 'Clostridia': 41, 'Betaproteobacteria': 57, 'Chroococcales': 31, 'Alphaproteobacteria': 115, 'Lactobacillales': 12, 'Negativicutes': 11, 'Bacteroidia': 21, 'Nostocales': 10, 'Oscillatoriales': 11}
196 |     
197 |      Level 3:
198 |     {'Enterobacteriales': 89, 'Actinomycetales': 139, 'Paenibacillaceae': 10, 'Clostridiales': 35, 'Burkholderiales': 55, 'Vibrionales': 28, 'Synechococcus': 14, 'Bacillaceae': 31, 'Rhizobiales': 48, 'Pseudomonadales': 28, 'Rhodospirillales': 16, 'Selenomonadales': 11, 'Sphingomonadales': 31, 'Caulobacterales': 10, 'Bacteroidales': 21, 'Thiotrichales': 12, 'Xanthomonadales': 16, 'Rhodobacterales': 10, 'Nostocaceae': 10}
199 | 
200 | 
201 | Based on this, we select taxonomic groups and colors for representation.
202 | Here, we just start by choosing the broadly well-represented groups. To
203 | see a complete color-coding legend, use:
204 | 
205 | ::
206 | 
207 |    >>> sca.figColors()
208 | 
209 | .. code:: python3
210 | 
211 |     phylo = list();
212 |     fam_names = ['Firmicutes', 'Actinobacteria', 'Bacteroidetes', \
213 |                  'Cyanobacteria', 'Proteobacteria']
214 |     col = (0, 0.18, 0.38, 0.5, 0.6)
215 |     # Firmicutes = red, Actinobacteria = yellow, Bacteroidetes = cyan, 
216 |     # Cyanobacteria = green, Proteobacteria = blue
217 |     
218 |     for i,k in enumerate(fam_names):
219 |         sf = sca.Unit()
220 |         sf.name = fam_names[i].lower()
221 |         sf.col = col[i]
222 |         sf.items = [j for j,q in enumerate(Dseq['hd'])  if sf.name in q.lower()]
223 |         phylo.append(sf)
224 | 
225 | Plot the top six independent components of the sequence correlation
226 | matrix (with sequence weights); color-coded by phylogenetic annotation.
227 | The sequences clearly separate into groups related by phylogeny; the
228 | Proteobacteria *(blue)* separate out on :math:`U_1`, the Firmicutes
229 | *(red)* separate out on :math:`U_2`, the Cyanobacteria *(green)*
230 | separate out on :math:`U_3`, and the Bacteroidetes *(cyan)* separate out
231 | on :math:`U_5`.
232 | 
233 | .. code:: python3
234 | 
235 |     plt.rcParams['figure.figsize'] = 9, 3.5
236 |     U = Dsca['Uica'][1]
237 |     pairs = [[2*i,2*i+1] for i in range(3)]
238 |     for k,[k1,k2] in enumerate(pairs):
239 |         plt.subplot(1,3,k+1)
240 |         sca.figUnits(U[:,k1], U[:,k2], phylo)
241 |         #sca.figUnits(U[:,k1], U[:,k2], subfam)
242 |         plt.xlabel(r"${U'}^{(2)}_{%i}$"%(k1+1), fontsize=16)
243 |         plt.ylabel(r"${U'}^{(2)}_{%i}$"%(k2+1), fontsize=16)
244 |     plt.tight_layout()
245 | 
246 | 
247 | 
248 | .. image:: _static/SCA_betalactamase_16_0.png
249 | 
250 | 
251 | II. SCA conservation and coevolution
252 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
253 | 
254 | Plot the eigenspectrum of the SCA positional coevolution matrix
255 | (:math:`\tilde{C_{ij}}`) *(black bars)* and 10 trials of matrix
256 | randomization for comparison *(red line)*. This graph is used to choose
257 | the number of significant eigenmodes.
258 | 
259 | .. code:: python3
260 | 
261 |     plt.rcParams['figure.figsize'] = 9, 3.5 
262 |     hist0, bins = np.histogram(Dsca['Lrand'].flatten(), bins=Dseq['Npos'], \
263 |                                range=(0,Dsect['Lsca'].max()))
264 |     hist1, bins = np.histogram(Dsect['Lsca'], bins=Dseq['Npos'], \
265 |                                range=(0,Dsect['Lsca'].max()))
266 |     plt.bar(bins[:-1], hist1, np.diff(bins),color='k')
267 |     plt.plot(bins[:-1], hist0/Dsca['Ntrials'], 'r', linewidth=3)
268 |     plt.tick_params(labelsize=11)
269 |     plt.xlabel('Eigenvalues', fontsize=18); plt.ylabel('Numbers', fontsize=18);
270 |     print('Number of eigenmodes to keep is %i' %(Dsect['kpos']))
271 | 
272 | 
273 | .. parsed-literal::
274 | 
275 |     Number of eigenmodes to keep is 7
276 | 
277 | 
278 | 
279 | .. image:: _static/SCA_betalactamase_19_1.png
280 | 
281 | 
282 | To define the positions with significant contributions to each of the
283 | independent components (ICs), we make an empirical fit for each IC to the
284 | t-distribution and select positions with greater than a specified cutoff
285 | on the CDF. We choose :math:`p=0.95` as our cutoff. Note that since some
286 | positions might contribute significantly to more than one IC (an
287 | indication of non-independence of ICs), we apply a simple algorithm to
288 | assign such positions to one IC. Specifically, we assign each position to
289 | the IC with which it has the greatest degree of co-evolution.
290 | 
291 | The data indicate generally good fits for the top six ICs, and we return
292 | the positions contributing to each IC in a format suitable for cut and
293 | paste into PyMol.
294 | 
295 | .. code:: python3
296 | 
297 |     plt.rcParams['figure.figsize'] = 10,5 
298 |     
299 |     Vpica = Dsect['Vpica']
300 |     for k in range(Dsect['kpos']):
301 |         iqr = scoreatpercentile(Vpica[:,k],75) - scoreatpercentile(Vpica[:,k],25)
302 |         binwidth=2*iqr*(len(Vpica)**(-0.33))
303 |         nbins=int(round((max(Vpica[:,k])-min(Vpica[:,k]))/binwidth))
304 |         plt.subplot(1,Dsect['kpos'],k+1)
305 |         h_params = plt.hist(Vpica[:,k], nbins)
306 |         x_dist = np.linspace(min(h_params[1]), max(h_params[1]), num=100)
307 |         plt.plot(x_dist,Dsect['scaled_pd'][k],'r',linewidth = 2)    
308 |         plt.xlabel(r'$V^p_{%i}$'%(k+1), fontsize=14)
309 |         plt.ylabel('Number', fontsize=14)
310 |         
311 |     for n,ipos in enumerate(Dsect['ics']):
312 |         sort_ipos = sorted(ipos.items)
313 |         ats_ipos = ([Dseq['ats'][s] for s in sort_ipos])
314 |         ic_pymol = ('+'.join(ats_ipos))
315 |         print('IC %i is composed of %i positions:' % (n+1,len(ats_ipos)))
316 |         print(ic_pymol + "\n")
317 | 
318 | 
319 | .. parsed-literal::
320 | 
321 |     IC 1 is composed of 20 positions:
322 |     61+65+109+117+125+136+157+164+170+179+180+210+213+229+233+241+247+250+251+255
323 |     
324 |     IC 2 is composed of 16 positions:
325 |     63+70+71+73+91+130+131+132+134+143+156+182+196+226+234+236
326 |     
327 |     IC 3 is composed of 17 positions:
328 |     66+68+102+105+106+107+126+144+145+183+185+199+207+215+216+238+244
329 |     
330 |     IC 4 is composed of 12 positions:
331 |     69+72+123+139+149+151+153+161+162+163+186+193
332 |     
333 |     IC 5 is composed of 0 positions:
334 |     
335 |     
336 |     IC 6 is composed of 13 positions:
337 |     67+85+87+148+160+181+190+200+203+211+221+225+231
338 |     
339 |     IC 7 is composed of 11 positions:
340 |     77+84+101+122+138+220+223+224+232+235+245
341 |     
342 | 
343 | 
344 | 
345 | .. image:: _static/SCA_betalactamase_21_1.png
346 | 
347 | 
348 | To define protein sectors, we examine the structure of the SCA
349 | positional correlation matrix with positions contributing to the top
350 | independent components (ICs) ordered by weight *(left panel)*. This
351 | provides a basis to determine/interpret which ICs are truly
352 | statistically independent (defining an independent sector) and which
353 | represent hierarchical breakdowns of one sector.
354 | 
355 | IC 2 appears more distinct and is considered an independent sector
356 | *(sector 1)*. ICs 1, 3, 5, and 6 are strongly co-evolving, and should be
357 | combined into one sector. IC 4 also appears to be related to [1,3,5,6],
358 | and ICs 1, 3, 4, 5, and 6 (plus IC 7, as grouped in the code below) make
359 | up sector two. The sectors (2 in total) are defined accordingly, and in
360 | the *right panel*, these independent components have been re-ordered
361 | to visualize this decomposition.
362 | 
363 | .. code:: python3
364 | 
365 |     # plot the SCA positional correlation matrix, ordered by contribution to the top ICs
366 |     plt.rcParams['figure.figsize'] = 10, 10 
367 |     plt.subplot(121)
368 |     plt.imshow(Dsca['Csca'][np.ix_(Dsect['sortedpos'], Dsect['sortedpos'])], \
369 |                vmin=0, vmax=2,interpolation='none',aspect='equal',\
370 |                extent=[0,sum(Dsect['icsize']),0,sum(Dsect['icsize'])])
371 |     line_index=0
372 |     for i in range(Dsect['kpos']):
373 |         plt.plot([line_index+Dsect['icsize'][i],line_index+Dsect['icsize'][i]],\
374 |                  [0,sum(Dsect['icsize'])],'w', linewidth = 2)
375 |         plt.plot([0,sum(Dsect['icsize'])],[sum(Dsect['icsize'])-line_index,\
376 |                             sum(Dsect['icsize'])-line_index],'w', linewidth = 2)
377 |         line_index += Dsect['icsize'][i] 
378 |     
379 |     #define the new sector groupings - 2 total
380 |     sec_groups = ([1],[0,2,4,5,3,6])
381 |     sectors = list()
382 |     for n,k in enumerate(sec_groups):
383 |         s = sca.Unit()
384 |         all_items = list()
385 |         for i in k: all_items = all_items+Dsect['ics'][i].items
386 |         s.items = all_items
387 |         s.col = (1/len(sec_groups))*n
388 |         sectors.append(s)
389 |     
390 |     # plot the re-ordered matrix
391 |     plt.subplot(122)
392 |     line_index=0
393 |     sortpos = list()
394 |     for s in sectors:
395 |         sortpos.extend(s.items)
396 |     plt.imshow(Dsca['Csca'][np.ix_(sortpos, sortpos)], vmin=0, vmax=2,\
397 |                interpolation='none',aspect='equal',\
398 |                extent=[0,len(sortpos),0,len(sortpos)])
399 |     for s in sectors:
400 |         plt.plot([line_index+len(s.items),line_index+len(s.items)],\
401 |                  [0,len(sortpos)],'w', linewidth = 2)
402 |         plt.plot([0,sum(Dsect['icsize'])],[len(sortpos)-line_index, \
403 |                                            len(sortpos)-line_index],'w', linewidth = 2)
404 |         line_index += len(s.items)
405 |     plt.tight_layout()
406 | 
407 | 
408 | 
409 | .. image:: _static/SCA_betalactamase_23_0.png
410 | 
411 | 
412 | Print the sector positions, in a format suitable for pyMol, and create a
413 | pyMol session with the sectors (and decomposition into independent
414 | components) as separate objects. Structurally, sectors 1+3 form
415 | physically contiguous units, and 2 is less so… this is consistent with
416 | the idea that sector 2/IC4 might be associated with sector 1/ICs1+3+5+6
417 | 
418 | .. code:: python3
419 | 
420 |     for i,k in enumerate(sectors):
421 |         sort_ipos = sorted(k.items)
422 |         ats_ipos = ([Dseq['ats'][s] for s in sort_ipos])
423 |         ic_pymol = ('+'.join(ats_ipos))
424 |         print('Sector %i is composed of %i positions:' % (i+1,len(ats_ipos)))
425 |         print(ic_pymol + "\n")
426 |     sca.writePymol('../data/1FQG.pdb', sectors, Dsect['ics'], Dseq['ats'], \
427 |                    '../output/PF13354.pml', 'A', '../data/', 0)  
428 | 
429 | 
430 | .. parsed-literal::
431 | 
432 |     Sector 1 is composed of 16 positions:
433 |     63+70+71+73+91+130+131+132+134+143+156+182+196+226+234+236
434 |     
435 |     Sector 2 is composed of 73 positions:
436 |     61+65+66+67+68+69+72+77+84+85+87+101+102+105+106+107+109+117+122+123+125+126+136+138+139+144+145+148+149+151+153+157+160+161+162+163+164+170+179+180+181+183+185+186+190+193+199+200+203+207+210+211+213+215+216+220+221+223+224+225+229+231+232+233+235+238+241+244+245+247+250+251+255
437 |     
438 | 
439 | 
440 | III. The phylogenetic basis of the sector hierarchy
441 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
442 | 
443 | How does the clear phylogenetic heterogeneity in the MSA influence the
444 | sector definitions? To address this, we take advantage of mathematical
445 | methods for mapping between the space of positional and sequence
446 | correlations, as described in *Rivoire et al*. Using this mapping, we
447 | plot the top :math:`k_{pos}` ICs as 2-D scatter plots with the
448 | corresponding sequence space divergence. The colors for the sequence
449 | space are according to the phylogenetic classifications we chose above.
450 | 
451 | .. code:: python3
452 | 
453 |     plt.rcParams['figure.figsize'] = 15,8
454 |     pairs = [ [x, x+1] for x in range(0, len(Dsect['ics'])-1, 2) ]
455 |     for n,[k1,k2] in enumerate(pairs):
456 |         plt.subplot(2,len(pairs),n+1)
457 |         sca.figUnits(Dsect['Vpica'][:,k1], Dsect['Vpica'][:,k2], sectors, dotsize = 6)
458 |         plt.xlabel(r'$V^p_{%i}$' % (k1+1), fontsize=16)
459 |         plt.ylabel(r'$V^p_{%i}$' % (k2+1), fontsize=16)
460 |         plt.subplot(2,len(pairs),n+len(pairs)+1)
461 |         sca.figUnits(Dsect['Upica'][:,k1], Dsect['Upica'][:,k2], phylo, dotsize = 6)
462 |         plt.xlabel(r'$U^p_{%i}$' % (k1+1), fontsize=16)
463 |         plt.ylabel(r'$U^p_{%i}$' % (k2+1), fontsize=16)
464 |     plt.tight_layout()
465 | 
466 | 
467 | 
468 | .. image:: _static/SCA_betalactamase_28_0.png
469 | 
470 | 
471 | The interpretation for the two sectors:
472 | 
473 | **Sector 1** is defined along (:math:`V_2^p`). The sequences along the
474 | corresponding component (:math:`U_2^p`) are homogeneously distributed
475 | with respect to phylogeny, consistent with the notion that this sector
476 | is a property of the entire alignment. Notably, this sector forms the
477 | catalytic core of the Beta-lactamase.
478 | 
479 | **Sector 2** is composed of ICs 1,3,4 and 5 - and each of these is
480 | associated with some phylogenetic divergence. :math:`V_1^p` splits the
481 | cyanobacteria *(green)* from the proteobacteria *(blue)*, :math:`V_3^p`
482 | separates the proteobacteria *(blue)* from other sequence families,
483 | :math:`V_5^p` separates out a subset of the firmicutes *(red)*, and
484 | :math:`V_6^p` is associated with a divergence in the bacteroidetes
485 | *(cyan)*. Sector 2 forms a physically contiguous unit that resembles a
486 | shell around the active site. The decomposition described above suggests
487 | that some functional divergence in beta-lactamase dynamics or regulatory
488 | mechanism across phylogenetic lines may underlie the breakdown of this
489 | sector.
490 | 
491 | For clarity, we also plot the same data as a stacked bar chart below.
492 | 
493 | .. code:: python3
494 | 
495 |     plt.rcParams['figure.figsize'] = 20, 5 
496 |         
497 |     col = list()
498 |     for k in phylo:
499 |         col = col + [colorsys.hsv_to_rgb(k.col,1,1)]
500 |     for k in range(Dsect['kpos']):
501 |         forhist = list()
502 |         for group in phylo:
503 |             forhist.append([Dsect['Upica'][i,k] for i in group.items])
504 |         plt.subplot(2,Dsect['kpos'],k+1)
505 |         plt.hist(forhist, histtype='barstacked',color=col)
506 | 
507 | 
508 | 
509 | .. image:: _static/SCA_betalactamase_30_0.png
510 | 
511 | 
512 | This concludes the script.
513 | 


--------------------------------------------------------------------------------
/docs/source/_static/BLactamase_sec_hier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/BLactamase_sec_hier.png


--------------------------------------------------------------------------------
/docs/source/_static/DHFR_decompv2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/DHFR_decompv2.png


--------------------------------------------------------------------------------
/docs/source/_static/DHFR_sec_hier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/DHFR_sec_hier.png


--------------------------------------------------------------------------------
/docs/source/_static/Gprot_sec_hier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/Gprot_sec_hier.png


--------------------------------------------------------------------------------
/docs/source/_static/Gprot_secstruct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/Gprot_secstruct.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_DHFR_13_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_13_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_DHFR_16_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_16_1.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_DHFR_20_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_20_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_DHFR_22_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_22_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_DHFR_26_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_26_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_DHFR_29_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_29_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_DHFR_30_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_30_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_DHFR_7_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_DHFR_7_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_G_17_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_17_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_G_21_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_21_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_G_24_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_24_1.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_G_26_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_26_1.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_G_28_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_28_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_G_31_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_31_1.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_G_33_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_33_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_G_35_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_35_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_G_37_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_37_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_G_42_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_42_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_G_44_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_44_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_G_9_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_G_9_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_S1A_17_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_17_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_S1A_20_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_20_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_S1A_23_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_23_1.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_S1A_25_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_25_1.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_S1A_27_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_27_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_S1A_30_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_30_1.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_S1A_32_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_32_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_S1A_38_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_38_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_S1A_41_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_41_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_S1A_9_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_S1A_9_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_betalactamase_10_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_10_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_betalactamase_16_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_16_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_betalactamase_19_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_19_1.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_betalactamase_21_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_21_1.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_betalactamase_23_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_23_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_betalactamase_28_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_28_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_betalactamase_30_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_30_0.png


--------------------------------------------------------------------------------
/docs/source/_static/SCA_betalactamase_8_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/SCA_betalactamase_8_1.png


--------------------------------------------------------------------------------
/docs/source/_static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/favicon.ico


--------------------------------------------------------------------------------
/docs/source/_static/github-download-screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/github-download-screenshot.png


--------------------------------------------------------------------------------
/docs/source/_static/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/docs/source/_static/logo.png


--------------------------------------------------------------------------------
/docs/source/annotateMSA.rst:
--------------------------------------------------------------------------------
1 | ===========
2 | annotateMSA
3 | ===========
4 | 
5 | .. automodule:: annotateMSA
6 |     :members:
7 |     :undoc-members:
8 |     :show-inheritance:
9 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # http://www.sphinx-doc.org/en/master/config
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | sys.path.insert(0, os.path.abspath('../modules'))
16 | 
17 | 
18 | # -- Project information -----------------------------------------------------
19 | 
20 | project = 'pySCA'
21 | copyright = '2019, Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds'
22 | author = 'Olivier Rivoire, Rama Ranganathan, Kimberly Reynolds'
23 | 
24 | # The short X.Y version.
25 | version = '6.1'
26 | 
27 | # The full version, including alpha/beta/rc tags
28 | release = '6.1'
29 | 
30 | 
31 | # -- General configuration ---------------------------------------------------
32 | 
33 | # Add any Sphinx extension module names here, as strings. They can be
34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
35 | # ones.
36 | extensions = [
37 |     'sphinx.ext.todo',
38 |     'sphinx.ext.viewcode',
39 |     'sphinx.ext.autodoc'
40 | ]
41 | 
42 | # Add any paths that contain templates here, relative to this directory.
43 | templates_path = ['_templates']
44 | 
45 | # The master toctree document.
46 | master_doc = 'index'
47 | 
48 | # List of patterns, relative to source directory, that match files and
49 | # directories to ignore when looking for source files.
50 | # This pattern also affects html_static_path and html_extra_path.
51 | exclude_patterns = []
52 | 
53 | # The name of the Pygments (syntax highlighting) style to use.
54 | pygments_style = 'friendly'
55 | 
56 | 
57 | # -- Options for HTML output -------------------------------------------------
58 | 
59 | # The theme to use for HTML and HTML Help pages.  See the documentation for
60 | # a list of builtin themes.
61 | #
62 | html_theme = 'sphinx_rtd_theme'
63 | 
64 | # Add any paths that contain custom static files (such as style sheets) here,
65 | # relative to this directory. They are copied after the builtin static files,
66 | # so a file named "default.css" will overwrite the builtin "default.css".
67 | html_static_path = ['_static']
68 | 
69 | 
70 | # logo
71 | html_logo = '_static/logo.png'
72 | html_favicon = '_static/favicon.ico'
73 | 
74 | html_theme_options = {
75 |     'logo_only': True
76 | }
77 | 


--------------------------------------------------------------------------------
/docs/source/examples.rst:
--------------------------------------------------------------------------------
 1 | ========
 2 | Examples
 3 | ========
 4 | 
 5 | .. toctree::
 6 |    :maxdepth: 1
 7 | 
 8 |    SCA_G
 9 |    SCA_S1A
10 |    SCA_DHFR
11 |    SCA_betalactamase
12 | 


--------------------------------------------------------------------------------
/docs/source/get_started.rst:
--------------------------------------------------------------------------------
  1 | ===============
  2 | Getting Started
  3 | ===============
  4 | 
  5 | Running a complete SCA analysis consists of five steps:
  6 | 
  7 |   1) Constructing an alignment
  8 |   2) Alignment pre-processing and conditioning
  9 |   3) Calculation of the conservation and co-evolution statistics
 10 |   4) Identifying statistically significant correlations
 11 |   5) Interpretation of the results
 12 | 
 13 | The core SCA calculations (steps 2,3, and 4) are each associated with a
 14 | particular Python analysis script (:code:`scaProcessMSA`, :code:`scaCore`, and
 15 | :code:`scaSectorID`, respectively). Sequential execution of each Python
 16 | analysis script stores the results in a pickle database. This means the core
 17 | SCA calculations can be run from the command line, or multiple proteins can be
 18 | analyzed using a shell script (for an example, see `runAllNBCalcs.sh`).
 19 | Following execution of the scripts, the pickle database can be loaded in a
 20 | Jupyter notebook for visualizing the results and interpreting the data.
 21 | Alternatively, the output of the analysis scripts can be saved as a MATLAB
 22 | workspace, and results plotted/analyzed in MATLAB. Below we describe the five
 23 | main analysis steps in more detail.
 24 | 
 25 | 
 26 | File and Directory Structure
 27 | ============================
 28 | 
 29 | The pySCA repository contains the following files and directories:
 30 | 
 31 | Base Directory
 32 | --------------
 33 | 
 34 | bin/
 35 |   Contains the analysis scripts that use functions defined in the `scaTools.py`
 36 |   module.
 37 | data/
 38 |   Git submodule that contains the input sequence alignments (\*.fasta, \*.an)
 39 |   and structures (\*.pdb) for the analysis. The \*.an files correspond to
 40 |   fasta-formatted sequence files with taxonomic annotations. The inputs
 41 |   needed for all tutorials are here.
 42 | output/
 43 |   Contains the output of the analysis. Accordingly, it is an empty directory
 44 |   in a newly installed pySCA distribution. Running the scripts below will
 45 |   output a processed alignment (\*.fasta or \*.an file) and pickle database
 46 |   (\*.db file) to output/. Similarly, if you choose to output results to a
 47 |   MATLAB workspace, the resulting \*.mat file will write to this directory.
 48 | figs/
 49 |   Contains a few figures that are loaded into the tutorials for illustration
 50 |   purposes.
 51 | docs/
 52 |   Contains this documentation.
 53 | LICENSE
 54 |   This work is distributed under the standard BSD 3-clause open source
 55 |   software license.
 56 | README.md
 57 |   Very basic introduction to the toolbox.
 58 | scripts/
 59 |   Contains scripts used to generate the input data from the example analyses.
 60 | notebooks/
 61 |   Contains a set of pySCA examples as Jupyter (formerly IPython) notebooks.
 62 | pysca/
 63 |   Contains the Python source code for the SCA implementation.
 64 | 
 65 | `bin` Directory
 66 | -----------------
 67 | 
 68 | alnFilterSeqSize, alnParseID, alnReplaceHeaders, alnChangeDelim, alnConvertGI
 69 |   These aren't essential to the main SCA utilities/package, but are little
 70 |   scripts that we often find useful in alignment construction.
 71 | annotateMSA
 72 |   A script for adding taxonomic annotations to fasta-formatted sequence
 73 |   alignments
 74 | scaProcessMSA
 75 |   The script that conducts alignment pre-processing and conditioning. This
 76 |   constitutes trimming the alignment for gaps, and removing low identity
 77 |   sequences.
 78 | scaCore
 79 |   The script that computes SCA conservation and co-evolution values.
 80 | scaSectorID
 81 |   The script that defines positions that show a statistically significant
 82 |   correlation.
 83 | 
 84 | `scripts` Directory
 85 | -------------------
 86 | 
 87 | runAllNBCalcs.sh
 88 |   A shell script that runs all of the calculations needed for the tutorials.
 89 |   This script also serves as an example for how to call the pySCA scripts
 90 |   from the command line.
 91 | 
 92 | `notebooks` Directory
 93 | ---------------------
 94 | 
 95 | SCA_DHFR.ipynb
 96 |   Jupyter (formerly IPython) notebook tutorial for the Dihydrofolate reductase
 97 |   enzyme family.
 98 | SCA_G.ipynb
 99 |   Jupyter notebook tutorial for the small G proteins.
100 | SCA_betalactamase.ipynb
101 |   Jupyter notebook tutorial for the Beta-lactamase enzyme family.
102 | SCA_S1A.ipynb
103 |   Jupyter notebook tutorial for the S1A serine protease enzyme family.
104 | 
105 | `pysca` Directory
106 | -----------------
107 | 
108 | scaTools.py
109 |   Contains the pySCA library - the functions that implement all of the SCA
110 |   calculations.
111 | settings.py
112 |   Optional configuration file useful for specifying paths instead of having to
113 |   do so on the command line.
114 | 
115 | 
116 | 1. Constructing and annotating a multiple sequence alignment
117 | ============================================================
118 | 
119 | The SCA method operates on a multiple sequence alignment of homologous protein
120 | sequences. You can begin the analysis by obtaining an alignment for your
121 | protein of interest from a curated database (for example PFAM:
122 | http://pfam.xfam.org/ ) or by constructing your own alignment. The details of
123 | alignment construction aren't covered here, but we may add a tutorial in future
124 | versions of this documentation. The critical thing is that the alignment
125 | contain on the order of 100 or more effective sequences.
126 | 
127 | Once you have an alignment, it is helpful to add taxonomic annotations to the
128 | headers. These annotations are used in SCA to examine the relationship between
129 | sector positions and phylogenetic divergence (i.e. in the mapping between
130 | independent components and sequence space). The annotateMSA script contains
131 | two utilities to automate sequence annotation: one which uses the NCBI Entrez
132 | tools in BioPython, and one which uses PFAM database annotations (PFAM
133 | alignment specific). Please note that the annotation step can be slow (on the
134 | order of hours), but only needs to be done once per alignment. For further
135 | details please see the :doc:`/annotateMSA` documentation.
136 | 
137 | 2. Alignment pre-processing and conditioning
138 | ============================================
139 | 
140 | Following alignment construction and annotation, the alignment is processed to:
141 | (1) remove highly gapped or low homology sequences, (2) remove highly gapped
142 | positions, (3) calculate sequence weights and (4) to create a mapping of
143 | alignment positions to a reference structure or sequence numbering system. This
144 | process is handled by the script :doc:`/scaProcessMSA`. Please see the script
145 | documentation for a complete list of optional arguments and notes on usage, and
146 | for a full description of computations 1-4, see the Rivoire et al 2016 methods
147 | paper (Box 1). [#Rivoire2016]_ The resulting output can be stored as either a
148 | Python pickle database or MATLAB workspace for further analysis.
149 | 
150 | 3. Calculation of the conservation and co-evolution statistics
151 | ==============================================================
152 | 
153 | The processed alignment and sequence weights computed in step 2 are then used
154 | in the calculation of evolutionary statistics by the script :doc:`scaCore`.
155 | This script handles the core calculations for:
156 | 
157 |     1. Pairwise sequence correlations/sequence similarity
158 |     2. Single-site positional conservation from the Kullback-Leibler relative
159 |        entropy, :math:`D_i^a`, and position weights from the gradient of the KL
160 |        entropy, :math:`\frac{\partial{D_i^a}}{\partial{f_i^a}}`. See eqs. 1-2
161 |        in Rivoire, 2016. [#Rivoire2016]_
162 |     3. The SCA matrix :math:`\tilde{C_{ij}}`. See eq. 3 in Rivoire, 2016.
163 |        [#Rivoire2016]_
164 |     4. The projected alignment (eq. 10-11), and the projector (supplemental
165 |        section 1H) [#Rivoire2016]_.
166 |     5. N trials (default N=10) of the randomized SCA matrix and associated
167 |        eigenvectors and eigenvalues; used to choose the number of significant
168 |        eigenmodes.
169 | 
170 | The calculations and optional execution flags are further described in the
171 | script documentation. As for :doc:`scaProcessMSA`, the output can be stored as
172 | either a Python pickle database or MATLAB workspace for further analysis.
173 | 
174 | 4. Identifying significant evolutionary correlations
175 | ====================================================
176 | 
177 | After the core calculations are complete, the next step is to define the
178 | significant number of eigenmodes/independent components for analysis
179 | (:math:`k_{max}`) and to select sector positions by their contributions to the
180 | top :math:`k_{max}` independent components. This is handled by the script
181 | :doc:`scaSectorID`. This script also computes the sequence-to-position space
182 | mapping as in eq.10-11 and fig. 7. As for :doc:`scaProcessMSA` and
183 | :doc:`scaCore`, the output can be stored as either a Python pickle database or
184 | MATLAB workspace for further analysis.
185 | 
186 | 5. Interpretation of the results and sector definition
187 | ======================================================
188 | 
189 | Execution of annotateMSA, scaProcessMSA, scaCore, and scaSectorID completes
190 | the calculation of SCA terms and results in a single pickle database (\*.db
191 | file, and optionally, a MATLAB workspace) containing the collected results. The
192 | final step is to interpret these calculations and evaluate the
193 | (non-)independence of the amino acid positions associated with each independent
194 | component (as in Fig. 4).
195 | 
196 | The :doc:`tutorials <usage>` are designed to provide examples of this process,
197 | and to illustrate different aspects of SCA usage (please see the individual
198 | tutorial headers for more information).
199 | 
200 | 
201 | **Further Reading/References:**
202 | 
203 | .. [#Halabi2009] Halabi N, Rivoire O, Leibler S, and Ranganathan R. "Protein
204 |    sectors: evolutionary units of three-dimensional structure." *Cell.* 2009
205 |    v.138 p.774
206 | 
207 | .. [#Smock2010] Smock RG, Rivoire O, Russ WP, Swain JF, Leibler S, Ranganathan
208 |    R, Gierasch LM. "An interdomain sector mediating allostery in Hsp70
209 |    molecular chaperones." *MSB.* 2010 v.6 p.414
210 | 
211 | .. [#Reynolds2013] Reynolds KA, Russ WP, Socolich M, Ranganathan R.
212 |    "Evolution-based design of proteins." *Methods Enzymol.* 2013 v.523 p.213
213 | 
214 | .. [#Rivoire2016] Rivoire, O., Reynolds, K. A., and Ranganathan, R.
215 |    Evolution-Based Functional Decomposition of Proteins. *PLOS Computational
216 |    Biology* 12, e1004817 (2016).
217 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. pySCA documentation master file, created by
 2 |    sphinx-quickstart on Mon Jul 15 09:03:26 2019.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | =============================================================================
 7 | Statistical Coupling Analysis in Python.
 8 | =============================================================================
 9 | 
10 | The Statistical Coupling Analysis (SCA) is an approach for characterizing the
11 | pattern of evolutionary constraints on and between amino acid positions in a
12 | protein family. Given a representative multiple sequence alignment of the
13 | family, the analysis provides methods for quantitatively measuring the overall
14 | functional constraint at each sequence position (the position-specific, or
15 | "first-order" analysis of conservation), and for measuring and analyzing the
16 | coupled functional constraint on all pairs of sequence positions (the
17 | pairwise-correlated, or "second-order" analysis of conservation). The premise
18 | is that extending the traditional definition of conservation to include
19 | correlations between positions will contribute to defining the architecture of
20 | functional interactions between amino acids, and more importantly, help define
21 | the basic physical principles underlying protein structure, function, and
22 | evolution. 
23 | 
24 | **Please Cite:**
25 | 
26 |   Rivoire, O., Reynolds, K. A., and Ranganathan, R.  Evolution-Based Functional
27 |   Decomposition of Proteins. *PLOS Computational Biology* 12, e1004817 (2016).
28 | 
29 | I. Installing and Using pySCA
30 | ==============================
31 | 
32 | .. toctree::
33 |    :maxdepth: 2
34 | 	      
35 |    install
36 |    get_started
37 |    usage
38 |    examples
39 |    modules
40 |    versions
41 | 
42 | II. Indices and Tables
43 | =======================
44 | 
45 | * :ref:`genindex`
46 | * :ref:`modindex`
47 | * :ref:`search`
48 | 


--------------------------------------------------------------------------------
/docs/source/install.rst:
--------------------------------------------------------------------------------
  1 | ============
  2 | Installation
  3 | ============
  4 | 
  5 | The following are a set of command-line-centric instructions for installing
  6 | pySCA on Linux, Windows, and macOS operating systems.
  7 | 
  8 | 1. Install Dependencies
  9 | =======================
 10 | 
 11 | Choose the set of instructions in this section based on your operating system.
 12 | 
 13 | Linux (Ubuntu 18.04)
 14 | --------------------
 15 | 
 16 | Before installing pySCA, install the following packages from your package
 17 | repository:
 18 | 
 19 | 1) Python 3
 20 | 2) Pip
 21 | 3) Git and GCC
 22 | 
 23 | .. code-block:: bash
 24 | 
 25 |    sudo apt-get install python3 python3-pip git gcc
 26 | 
 27 | 
 28 | Next, download and install `FASTA
 29 | <http://fasta.bioch.virginia.edu/fasta_www2/fasta_down.shtml>`_ from GitHub.
 30 | FASTA is needed for the :code:`ggsearch36` function.
 31 | 
 32 | .. code-block:: bash
 33 | 
 34 |    git clone https://github.com/wrpearson/fasta36.git
 35 |    cd fasta36/src
 36 |    make -j2 -f ../make/Makefile.linux all
 37 |    sudo cp -r ../bin /usr/local
 38 |    sudo rm /usr/local/bin/README
 39 |    cd ../..
 40 | 
 41 | This will compile and install FASTA in the `/usr/local/bin` directory. This
 42 | is already on your system PATH, so programs will be able to find it without
 43 | additional configuration.
 44 | 
 45 | Alternatively, instead of :code:`ggsearch36`, one can use the
 46 | :code:`needle` function from the `EMBOSS software package
 47 | <ftp://emboss.open-bio.org/pub/EMBOSS/>`_.
 48 | 
 49 | macOS (10.15)
 50 | -------------
 51 | 
 52 | To install the dependencies on macOS, you will need to enable Xcode
 53 | (:code:`xcode-select --install`) and install `Homebrew <https://brew.sh>`_, an
 54 | unofficial package manager.
 55 | 
 56 | The installation process will involve entering commands in the terminal and
 57 | editing text files.
 58 | 
 59 | For those unfamiliar, to run commands in the terminal:
 60 | 
 61 | 1) Search for "terminal" in the launcher.
 62 | 2) Open the "Terminal" application.
 63 | 3) Type in a command into the terminal.
 64 | 4) Hit ENTER.
 65 | 
 66 | Repeat steps 3 and 4 for all the commands you need to run.
 67 | 
 68 | Open your terminal. Search for "terminal" in your launcher's list of
 69 | applications. The following are a set of command-line instructions.
 70 | 
 71 | **Xcode**
 72 | 
 73 | Now, installation of dependencies. The first step is to enable macOS developer
 74 | tools (i.e. Xcode). If this has not already been installed, run the following
 75 | in the terminal::
 76 | 
 77 |   xcode-select --install
 78 | 
 79 | You will be prompted for your password at this step.
 80 | 
 81 | **Homebrew**
 82 | 
 83 | To install Homebrew, run (copied from the Homebrew website) the following in
 84 | the terminal::
 85 | 
 86 |   /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)"
 87 | 
 88 | You will be prompted for your password at this step.
 89 | 
 90 | If you run into permissions errors when installing Homebrew, complaining that
 91 | root owns the `/usr/local/` directory, you can change the ownership by
 92 | running in the terminal::
 93 | 
 94 |   sudo chown -R <user> /usr/local
 95 | 
 96 | where `<user>` should be substituted with your username. You can view your
 97 | username in the right prompt of your open terminal windows (e.g.
 98 | <user>@<host>). If still unsure, type :code:`whoami` into the terminal, which
 99 | will print out your username. (Don't use :code:`sudo whoami`.)
100 | 
101 | **Python and GCC**
102 | 
103 | Once Homebrew is installed, install Python 3 and GCC::
104 | 
105 |   brew install gcc python3
106 | 
107 | This will install the most recent version of Python and the GCC compiler in
108 | your `/usr/local` directory. To ensure that these programs are used (rather
109 | than the ones packaged with the base system), add the following to your shell
110 | rc file (e.g. `~/.bashrc` or `~/.zshrc`, depending on your default shell)::
111 | 
112 |   export CC=/usr/local/bin/gcc-9
113 |   export CXX=/usr/local/bin/g++-9
114 |   alias gcc=/usr/local/bin/gcc-9
115 |   alias g++=/usr/local/bin/g++-9
116 | 
117 |   alias python=/usr/local/bin/python3
118 |   alias python3=/usr/local/bin/python3
119 |   alias pip=/usr/local/bin/pip3
120 |   alias pip3=/usr/local/bin/pip3
121 |   alias python-config=/usr/local/bin/python3-config
122 |   alias python3-config=/usr/local/bin/python3-config
123 | 
124 | If unsure of your default shell, run :code:`echo $SHELL` in your terminal. If
125 | the output contains :code:`zsh`, you need to edit `~/.zshrc` (note the '.' at
126 | the beginning of the file name), and if the output contains :code:`bash`, edit
127 | `~/.bashrc`.
128 | 
129 | To edit these files, you can either invoke a text editor from the terminal, or
130 | you can locate the file in your file explorer and open it in a text editor
131 | there. If you opt to use the terminal for edits, one option is to run::
132 | 
133 |   nano ~/.bashrc
134 | 
135 | (or :code:`nano ~/.zshrc` as appropriate)
136 | 
137 | Go to the bottom of the file and append the block of lines above. Save the
138 | changes and exit the editor.
139 | 
140 | Next, you will need to update your shell environment to reflect the changes
141 | made by appending those lines:
142 | 
143 | 1) Option 1: Close and re-open the terminal.
144 | 2) Option 2: Run in the terminal::
145 | 
146 |      source ~/.bashrc
147 | 
148 | Now, you can check the new aliases and environmental variables by running in
149 | the terminal::
150 | 
151 |   alias
152 |   echo $CC
153 |   echo $CXX
154 | 
155 | In the output, look for lines corresponding to the aliased commands you copied
156 | earlier, namely :code:`python` and :code:`pip`. Make sure they match.
157 | 
158 | **FASTA**
159 | 
160 | Once the aliases and environmental variables are properly set, install FASTA:
161 | 
162 | .. code-block:: bash
163 | 
164 |    git clone https://github.com/wrpearson/fasta36.git
165 |    cd fasta36/src
166 |    make -j2 -f ../make/Makefile.os_x86_64 all
167 |    cp -r ../bin /usr/local
168 |    rm /usr/local/bin/README
169 |    cd ../..
170 | 
171 | FASTA will now be installed in the `/usr/local/bin` directory, and the
172 | :code:`ggsearch36` program included in FASTA will now be on the system PATH.
173 | 
174 | If the installation fails due to permissions issues caused by root owning
175 | `/usr/local/bin`, you may need to use :code:`sudo` to run the following:
176 | 
177 | .. code-block:: bash
178 | 
179 |    cd fasta36/src
180 |    sudo cp -r ../bin /usr/local
181 |    sudo rm /usr/local/bin/README
182 |    cd ../..
183 | 
184 | You will be prompted for your password at this step.
185 | 
186 | Once installed, you can check that :code:`ggsearch36` is up and running by
187 | entering in the terminal::
188 | 
189 |   ggsearch36
190 | 
191 | This should print out the usage information for :code:`ggsearch36`.
192 | 
193 | Windows 10
194 | ----------
195 | 
196 | First, download and install:
197 | 
198 | 1) `MSYS2 <https://www.msys2.org>`_,
199 | 2) `Python 3 <https://www.python.org/downloads/>`_, and
200 | 3) `Git for Windows <https://gitforwindows.org/>`_.
201 | 
202 | **IMPORTANT:** When opening Python 3 installer, click the checkbox to make sure
203 | Python is added to your system PATH. For the rest of the prompts, use the
204 | defaults.
205 | 
206 | Git for Windows provides a Bash prompt with Git for version control that can
207 | also access the Python 3 installation on the system PATH. Again, it's essential
208 | that Python 3 is added there. If not (or if you don't remember), run the
209 | Python 3 installer again and make sure.
210 | 
211 | MSYS2 is a program for compiling Windows programs using a POSIX compatibility
212 | layer and ported versions of GCC, binutils, and other utilities. Essentially,
213 | one can use it to compile Windows exe's as one would on a Unix operating
214 | system, like macOS or Linux.
215 | 
216 | The reason to install it here is to compile FASTA. Once MSYS2 is installed,
217 | open it and run::
218 | 
219 |   pacman -Syu
220 | 
221 | The program will upgrade some core packages and request to be closed. Do so,
222 | re-open it, and once again run::
223 | 
224 |   pacman -Syu
225 | 
226 | Once the upgrades are complete, install the following packages with the package
227 | manager (:code:`pacman`) by running::
228 | 
229 |   pacman -S git vim make gcc
230 | 
231 | Now, you can download and install FASTA::
232 | 
233 |   git clone https://github.com/wrpearson/fasta36.git
234 |   cd fasta36/src
235 |   make CC=/usr/bin/gcc LD=/usr/bin/ld -j2 -f ../make/Makefile.linux all
236 |   cp -r ../bin /usr/local/
237 |   rm /usr/local/bin/README
238 |   cd ../..
239 | 
240 | Now, the :code:`ggsearch36` program, along with the others, is installed in
241 | `/usr/local/bin`. The next step is to add this directory into your Windows
242 | system PATH variable:
243 | 
244 | 1. Type 'env' in the start search bar.
245 | 2. Click 'Edit the system environment variables'.
246 | 3. Click on 'Environment Variables...' toward the bottom of the window that
247 |    opens.
248 | 4. Select 'Path' in one of the two selection windows (either 'User variables'
249 |    or 'System variables' is fine)
250 | 5. Once 'Path' is highlighted, click 'Edit...'
251 | 6. Enter the `/usr/local/bin` as a new PATH entry. You can either:
252 | 
253 |    - Click 'New' in the new window and enter the path to `/usr/local/bin` in
254 |      the MSYS2 installation folder (default: `C:\msys64\usr\local\bin`).
255 |    - Click the 'Browse...' button and navigate to the `C:\msys64\usr\local\bin`
256 |      directory.
257 | 
258 | 7. When the new entry is added, click 'OK' on all the opened windows to set all
259 |    the changes. You will need to close and re-open terminals for the changes to
260 |    be reflected.
261 | 
262 | Now, :code:`ggsearch36.exe` will be available to all running programs.
263 | 
264 | Finally, launch Git Bash (from Git for Windows) once it is installed. Open the
265 | `~/.bash_profile` file in the text editor, default `vi`, by running :code:`vi
266 | ~/.bash_profile`. In this file add the lines::
267 | 
268 |   alias python="winpty python.exe"
269 |   alias pip="winpty pip.exe"
270 | 
271 | Close the terminal and open it again. Now, you will be able to invoke the
272 | Python REPL and pip from the Git Bash prompt. Also, if the PATH variable was
273 | properly updated to contain the `/usr/local/bin` folder from the FASTA step,
274 | :code:`ggsearch36.exe` will also be available in Git Bash.
275 | 
276 | You will use the Git Bash prompt to download and install pySCA.
277 | 
278 | 2. Other Dependencies
279 | ================================
280 | 
281 | The following steps are optional but highly recommended:
282 | 
283 | 1) `PFAM annotations (click to download)
284 |    <ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pfamseq.txt.gz>`_ -
285 |    the file `pfamseq.txt` contains phylogenetic annotations for PFAM sequences.
286 |    This is necessary if you would like to annotate PFAM alignments with
287 |    taxonomic/phylogenetic information using the :code:`annotateMSA` script
288 |    provided by pySCA. The file is available from the PFAM FTP site in
289 |    compressed (\*.gz) format. Just be aware that the file is quite large (~10
290 |    Gb download, ~30 Gb decompressed), so check beforehand that you have
291 |    available space on your hard drive.
292 | 
293 | 2) `PyMol <https://pymol.org/2/>`_ - necessary if you would like to use pySCA's
294 |    automated structure mapping scripts, and useful for mapping the sectors to
295 |    structure in general.
296 | 
297 |    The version of the code provided in the linked website requires a paid
298 |    license. For an open-source (free) build of the code, albeit with some
299 |    functionality removed, you can follow the following instructions:
300 | 
301 |    **Linux** - install from your distribution package repository
302 | 
303 |    **macOS** - run in the terminal::
304 | 
305 |        brew cask install xquartz
306 |        brew install brewsci/bio/pymol
307 | 
308 |    **Windows** - look for `online builds
309 |    <https://www.lfd.uci.edu/~gohlke/pythonlibs/#pymol-open-source>`_
310 | 
311 | 
312 | 3) `mpld3 <http://mpld3.github.io/>`_ - a package that allows more
313 |    interactive plot visualization in Jupyter notebooks. If you choose not to
314 |    install this (optional) package, you will need to comment out the
315 |    `import mpld3` lines at the beginning of the tutorials. To install, run in the terminal::
316 | 
317 |      pip install mpld3
318 | 
319 | 
320 | 3. Download Code
321 | ================
322 | 
323 | The pySCA package, tutorials, and associated scripts are available for download
324 | from the `GitHub repository <https://github.com/ranganathanlab/pySCA>`_. There
325 | are several options for doing so.
326 | 
327 | A. Use Git
328 | ----------
329 | 
330 | If you have :code:`git` installed on your system, you can use it to clone the
331 | repository from GitHub. From the command line, run:
332 | 
333 | .. code-block:: bash
334 | 
335 |    git clone https://github.com/ranganathanlab/pySCA.git
336 | 
337 | For development and troubleshooting purposes, using Git is preferred.
338 | 
339 | The code will now be downloaded in a directory called `pySCA`.
340 | 
341 | B. (OR) Download from the Website
342 | ---------------------------------
343 | 
344 | Though not recommended, you can also download the source code from the GitHub
345 | website. Click the green "Clone or download" tab pictured below to obtain the
346 | latest code.
347 | 
348 | .. image:: _static/github-download-screenshot.png
349 | 
350 | In the event that you need older versions of the code, you can use the
351 | `releases <https://github.com/ranganathanlab/pySCA/releases>`_ tab on the
352 | GitHub page to download older tagged versions.
353 | 
354 | 4. (OPTIONAL) Modify Settings
355 | =============================
356 | 
357 | Before installing pySCA, for your convenience, you may specify default paths in
358 | the `settings.py` file found in the `pysca` directory of the pySCA codebase.
359 | Setting these is optional; not doing so simply means having to set a few
360 | command line options when running the code later.
361 | 
362 | :path2pfamseq: location of the `pfamseq.txt` text file (default:
363 |                `pfamseq.txt`). Use an absolute path to specify location.
364 | 
365 | :path2pfamseqdb: location of the `pfamseq.db` SQLite database (default:
366 |               `pfamseq.db`) --- the database is generated by the `getPfamDB.sh`
367 |               script and much faster to process than the text file.
368 | 
369 | :path2structures: location of your PDB structures for analysis (default: `.`).
370 |                   This variable should be set to the absolute path of the
371 |                   directory where you store all your PDB structures.
372 | 
373 | :path2output: name of the directory where to output the SCA results (default:
374 |               `output/`)
375 | 
376 | :path2pymol: path to PyMOL executable. If unset -- the default -- pySCA will
377 |              look for PyMOL in the system PATH. This variable will only need to
378 |              be set if PyMOL is installed in an exotic location and cannot be
379 |              started by simply running :code:`pymol` in the terminal.
380 | 
381 | If you ever want to change these variables at a later time, edit the
382 | `settings.py` file and then **re-install** pySCA. Follow the installation
383 | procedure in the next step.
384 | 
385 | 5. Install pySCA
386 | ================
387 | 
388 | The processing scripts found in the `bin/` directory and the SCA toolbox in
389 | `pysca/` can now be installed. To install them system-wide, go to the base of
390 | the repository (i.e. the `pySCA/` directory downloaded by Git) and run in the
391 | terminal:
392 | 
393 | .. code-block:: bash
394 | 
395 |    pip install .
396 | 
397 | Note the '.' at the end. Don't omit it --- it tells :code:`pip` to look in the
398 | current directory for configuration instructions.
399 | 
400 | Pip will first install python package dependencies:
401 | 
402 | 1) Numpy
403 | 2) Scipy
404 | 3) Argparse
405 | 4) Wheel
406 | 5) Matplotlib
407 | 
408 | Then, it installs the pySCA code itself.
409 | 
410 | *If and only if you run into permissions errors*, two options are to either:
411 | 
412 | **A. Install pySCA locally**
413 | 
414 | To install pySCA in your user directory (and without root privileges), run in
415 | the terminal::
416 | 
417 |   pip install --user .
418 | 
419 | This option is useful if you are working on a system where you do not have
420 | administrator access.
421 | 
422 | Note that to use locally installed scripts, the installation directory needs to
423 | be in the system PATH. To check whether that is the case, run::
424 | 
425 |   echo $PATH | grep --color=auto "$(python -m site --user-base)/bin"
426 | 
427 | If the installation directory is highlighted in the output, then the PATH is
428 | configured correctly. If it is not found, then it needs to be added manually.
429 | Open your shell configuration file (e.g. `~/.bashrc`) and add the directory to the
430 | PATH variable by appending the following line::
431 | 
432 |   export PATH="$HOME/.local/bin:$PATH"
433 | 
434 | The exact path (the text preceding the colon) may differ on your system,
435 | but it can easily be found by running `echo $(python -m site --user-base)/bin`.
436 | 
437 | **OR B. Install pySCA globally as root**
438 | 
439 | To install pySCA system-wide, run (as root/administrator)::
440 | 
441 |   sudo pip install .
442 | 
443 | This will obviate any need to mess around with local PATH variables, and pySCA
444 | will be accessible to all users on the system.
445 | 
446 | Now, with the pySCA code installed, each of the commands found in bin/ can now
447 | be run from the command line.
448 | 
449 | 
450 | 6. Getting Started and Running the Tutorials
451 | ============================================
452 | 
453 | The :doc:`"getting started" <get_started>` section of this documentation
454 | provides instructions on how to run some initial calculations and the
455 | tutorials. The basic idea behind the pySCA code is that the core calculations
456 | are performed using a series of executable Python scripts, and then the results
457 | can be loaded and analyzed/visualized using a Jupyter notebook (or
458 | alternatively, MATLAB).
459 | 
460 | All of the tutorials are provided as Jupyter notebooks. For more on
461 | how Jupyter notebooks work, see: `<https://jupyter.org>`_. Prior to running the
462 | notebook tutorials, you'll need to run the core calculation scripts that
463 | generate the input for the notebooks. One way to do this is with the shell
464 | script "runAllNBCalcs.sh", and there is more information on this in the
465 | :doc:`"getting started" <get_started>` section. Once the calculations are
466 | completed, you can begin the tutorial as an interactive Jupyter notebook from
467 | the command line, as described below.
468 | 
469 | To install Jupyter, run:
470 | 
471 | .. code-block:: bash
472 | 
473 |    pip install jupyterlab
474 | 
475 | 
476 | You can then open the notebooks from the command line by running:
477 | 
478 | .. code-block:: bash
479 | 
480 |    jupyter notebook <notebook.ipynb>
481 | 
482 | 
483 | .. **Important:** The :code:`ggsearch36`, :code:`needle`, and :code:`pymol`
484 | .. programs need to be on the system PATH.
485 | ..
486 | .. To view your system PATH, run in the terminal::
487 | ..
488 | ..   echo $PATH
489 | ..
490 | .. To add directories containing the required programs to your system path, you
491 | .. will need to edit your shell configuration file (e.g. `.bashrc` or
492 | .. `.bash_profile`) found at the base of your user directory. To add a directory
493 | .. to the system PATH, open up the file and append the line::
494 | ..
495 | ..   export PATH="$PATH:<path to directory>"
496 | ..
497 | .. where `<path to directory>` is replaced with the path to the directory
498 | .. containing a program you wish to add (e.g. `~/.local/bin`). After saving the
499 | .. changes, new terminals will use the updated PATH.
500 | ..
501 | .. **Important:** To add an already-installed program to the PATH, run::
502 | ..
503 | ..   $ whereis <program>
504 | ..
505 | .. to find where `<program>` (e.g. :code:`pymol`) is located, and add its
506 | .. directory to the system PATH in the manner described above.
507 | ..
508 | .. **Important:** Your requirements will vary depending on the size of your
509 | .. sequence alignments, but as a rule of thumb, the toolbox is best used on a
510 | .. system with at least 8 GB of RAM. pySCA may run with less, but there will be a
511 | .. greater risk when using modestly-sized multiple sequence alignments of
512 | .. processes using more memory than available and subsequently getting killed by
513 | .. the operating system's scheduler.
514 | 


--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
 1 | ==============
 2 | The pySCA Code
 3 | ==============
 4 | 
 5 | .. toctree::
 6 |    :maxdepth: 4
 7 | 
 8 |    annotateMSA
 9 |    scaProcessMSA
10 |    scaCore
11 |    scaSectorID
12 |    scaTools
13 | 


--------------------------------------------------------------------------------
/docs/source/scaCore.rst:
--------------------------------------------------------------------------------
1 | =======
2 | scaCore
3 | =======
4 | 
5 | .. automodule:: scaCore
6 |     :members:
7 |     :undoc-members:
8 |     :show-inheritance:
9 | 


--------------------------------------------------------------------------------
/docs/source/scaProcessMSA.rst:
--------------------------------------------------------------------------------
1 | =============
2 | scaProcessMSA
3 | =============
4 | 
5 | .. automodule:: scaProcessMSA
6 |     :members:
7 |     :undoc-members:
8 |     :show-inheritance:
9 | 


--------------------------------------------------------------------------------
/docs/source/scaSectorID.rst:
--------------------------------------------------------------------------------
1 | ===========
2 | scaSectorID
3 | ===========
4 | 
5 | .. automodule:: scaSectorID
6 |     :members:
7 |     :undoc-members:
8 |     :show-inheritance:
9 | 


--------------------------------------------------------------------------------
/docs/source/scaTools.rst:
--------------------------------------------------------------------------------
1 | ========
2 | scaTools
3 | ========
4 | 
5 | .. automodule:: scaTools
6 |     :members:
7 |     :undoc-members:
8 |     :show-inheritance:
9 | 


--------------------------------------------------------------------------------
/docs/source/usage.rst:
--------------------------------------------------------------------------------
 1 | =====
 2 | Usage
 3 | =====
 4 | 
 5 | We provide tutorials that walk through the process of sector identification for
 6 | three protein families: the Ras-like small G-proteins, the metabolic enzyme
 7 | Dihydrofolate Reductase (DHFR), and the antibiotic resistance enzyme
 8 | Beta-lactamase.
 9 | 
10 | To run the SCA calculations for all three examples, you can execute the
11 | following shell script from the scripts/ directory::
12 | 
13 |   ./runAllNBCalcs.sh
14 | 
15 | For each example, this will generate the following outputs in the output/
16 | directory:
17 | 
18 |   1.  A pickle database (\*.db file) that contains the results of the
19 |       calculations (these are then read in and analyzed in the IPython
20 |       notebooks - \*.ipynb)
21 |   2.  A \*.log file that provides some information about the analysis
22 |   3.  A processed alignment (\*.fasta file) resulting from the
23 |       scaProcessMSA script.
24 | 
25 | Following this step, you can begin the tutorial as an interactive Jupyter
26 | notebook from the command line as follows::
27 | 
28 |   jupyter notebook SCA_G.ipynb
29 | 
30 | This should open the notebook in a browser window, where you can run the code,
31 | and examine the SCA results.
32 | 


--------------------------------------------------------------------------------
/docs/source/versions.rst:
--------------------------------------------------------------------------------
 1 | =============
 2 | Distributions
 3 | =============
 4 | 
 5 | Previous versions of SCA were implemented as MATLAB toolboxes and contain
 6 | various accessory codes for data formatting, display, and analysis.
 7 | 
 8 | :SCA Toolbox 1.5: 
 9 |   The original SCA method as specified in Lockless and Ranganathan (4) with one
10 |   modification that was used in all subsequent papers: the division of binomial
11 |   probabilities by the mean probability of amino acids in the alignment is
12 |   removed. This version is no longer in active use.
13 | 
14 | :SCA Toolbox 2.5:
15 |   The bootstrap-based approach for SCA. Position-specific conservation
16 |   calculated as in Eq. (4) and correlations calculated as in Eq.  (9). Matrix
17 |   reduction per Eq. (32).
18 | 
19 | :SCA Toolbox 3.0:
20 |   The analytical calculation of correlations weighted by gradients of relative
21 |   entropy. Position-specific conservation calculated as in Eq. (4) and
22 |   correlations calculated as in Eq. (9)-(33). For non-binarized alignments,
23 |   matrix reduction is per Eq. (32).
24 | 
25 | :SCA Toolbox 4.0:
26 |   Analytical calculations as in v3.0, but now including sector identification
27 |   methods as described in Ref. (2).
28 | 
29 | :SCA Toolbox 5.0:
30 |   Calculation of positional and sequence correlations matrices by the alignment
31 |   projection method as per Eq. (19) and Eq. (20), and calculation of the
32 |   mapping between them Eq. (21). Includes methods for sector identification and
33 |   exploring relationships between positional and sequence correlations. 
34 | 
35 | :SCA Toolbox 6.0:
36 |   Calculation of first-order and second-order statistics for positional amino
37 |   acid frequencies using sequences weighted by similarity in the multiple
38 |   sequence alignment. 
39 | 
40 | :SCA Toolbox 6.1:
41 |   Port from Python 2 to Python 3. Also includes updated annotation scripts,
42 |   changes to the command-line interface, and the option to install analysis
43 |   scripts as system-wide executables.
44 | 
45 | To obtain previous distributions, please contact Dr. Rama Ranganathan.
46 | 


--------------------------------------------------------------------------------
/figs/BLactamase_sec_hier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/figs/BLactamase_sec_hier.png


--------------------------------------------------------------------------------
/figs/DHFR_decompv2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/figs/DHFR_decompv2.png


--------------------------------------------------------------------------------
/figs/DHFR_sec_hier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/figs/DHFR_sec_hier.png


--------------------------------------------------------------------------------
/figs/Gprot_sec_hier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/figs/Gprot_sec_hier.png


--------------------------------------------------------------------------------
/figs/Gprot_secstruct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/figs/Gprot_secstruct.png


--------------------------------------------------------------------------------
/pysca/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ranganathanlab/pySCA/778afb1a70fc6acc0c76ade74f35eb5cb6e8257a/pysca/__init__.py


--------------------------------------------------------------------------------
/pysca/settings.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python3
 2 | """ Global settings for pySCA. """
 3 | 
 4 | #
 5 | # PATHS
 6 | #
 7 | # These have to be changed to be consistent with user-defined paths. This
 8 | # script is tested against the `runAllNBCalcs.sh` scripts, and because the
 9 | # script includes a `cd ../` command before running any of the python scripts,
10 | # the base directory is the root of the repository.
11 | #
12 | 
13 | # Enter absolute path (e.g. /home/<user>/pfamseq.txt) to the file 'pfamseq.txt'
14 | # from
15 | # ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/database_files/
16 | # and/or the SQLite database `pfamseq.db` if it exists.
17 | path2pfamseq = "pfamseq.txt"  # replace with absolute path to pfamseq.txt
18 | path2pfamseqdb = (
19 |     "pfamseq.db"  # replace with absolute path to pfamseq.db (if present)
20 | )
21 | 
22 | # the location of your PDB structures
23 | path2structures = (
24 |     "."  # replace with absolute path to directory of PDB structures
25 | )
26 | 
27 | # Also assumes that a folder named 'output/' is in the path. Change to '.' if
28 | # you want results printed in the current working directory by default.
29 | path2output = "output/"
30 | 
31 | # Used for pulling species, taxonomy annotations from ncbi database. PLEASE
32 | # change to your own your email!!!
33 | entrezemail = "your.email@youruniversity.edu"
34 | 
35 | # If you are using a version of PyMOL not intalled in your system PATH, you can
36 | # add the path here. Use an absolute path to the PyMOL executable, or leave
37 | # empty to use PyMOL on the system PATH.
38 | path2pymol = ""
39 | 


--------------------------------------------------------------------------------
/scripts/getPfamDB.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | set -eu
 3 | 
 4 | #
 5 | # The Pfam annotation script is much, much faster when using a database instead
 6 | # of iterating over a 20 GB text file line by line. This script is intended to
 7 | # download the text file and convert it into a SQLite3 database.
 8 | #
 9 | # I recommend running this overnight when you aren't using your computer.
10 | # SQLite3 has to create key-value pairs for over 40 million sequences, and it
11 | # is VERY, VERY slow.
12 | #
13 | # Dependencies: wget, sqlite3, awk, and gzip or pigz
14 | #
15 | 
16 | #
17 | # Globals
18 | #
19 | 
20 | pfamurl="ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files"
21 | pfamheaders="pfamseq.sql"
22 | pfamdata="pfamseq.txt"
23 | pfamdb="pfamseq.db"
24 | 
25 | gzip=gzip  # replace this value with whatever gzip compression tool you use
26 | 
27 | 
28 | #
29 | # Download and extract the data
30 | #
31 | 
32 | echo "Downloading the Pfam database files and generate a SQLite3 database."
33 | echo "Requires ~90 GB of free storage and could take several hours."
34 | 
35 | echo "Downloading the Pfam annotated sequence data."
36 | 
37 | wget -Nc "${pfamurl}/${pfamheaders}.gz"
38 | wget -Nc "${pfamurl}/${pfamdata}.gz"
39 | echo "Got 'em."
40 | 
41 | echo "Decompress the gzipped files."
42 | echo "This will take a while."
43 | if test "$(command -v ${gzip})"; then
44 |   ${gzip} -vd "${pfamheaders}.gz"
45 |   ${gzip} -vd "${pfamdata}.gz"
46 | else
47 |   echo "${gzip} not found. Exiting."
48 |   exit 3
49 | fi
50 | echo "Done!"
51 | 
52 | 
53 | #
54 | # Create the database
55 | #
56 | 
57 | # The SQL dump on the server is for MySQL (MariaDB), so it needs to be
58 | # converted to a format compatible with SQLite3.
59 | 
60 | echo "Converting the MySQL dump to SQLite3."
61 | git clone --depth 1 https://github.com/dumblob/mysql2sqlite.git
62 | ./mysql2sqlite/mysql2sqlite "${pfamheaders}" | sqlite3 "${pfamdb}"
63 | rm -rf mysql2sqlite
64 | 
65 | echo "Importing data."
66 | sqlite3 -batch "${pfamdb}" << "EOF"
67 | .separator "\t"
68 | .import pfamseq.txt pfamseq
69 | EOF
70 | echo "Done!"
71 | 


--------------------------------------------------------------------------------
/scripts/rstZipFixUrl.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | set -eu
 3 | 
 4 | #
 5 | # This script is intended to change the URLs and file names of the zipped
 6 | # output from a Jupyter notebook (see 'Download as rst' option). By default,
 7 | # the image names are 'output_\*.png', with corresponding URLs in the RST
 8 | # file, which will cause naming clashes when including multiple notebooks, each
 9 | # with different images, in the _static folder.
10 | #
11 | # This script will take the filename of the zip file, extract its contents,
12 | # rename the images from 'output' to '<filename>', and update the URLs in the
13 | # RST file.
14 | #
15 | # Input:
16 | #   - zip file containing RST and image from Jupyter notebook
17 | # Output:
18 | #   - the RST file (with updated URLs) moved into ${docsdir} and the renamed
19 | #     images moved into ${docsdir}/${docsstaticdir}
20 | #
21 | # Usage:
22 | #   ./rstZipFixUrl.sh <path to zip>
23 | #
24 | # ${docsdir} is relative to the directory the script is invoked from. The zip
25 | # file itself may be given as a relative or an absolute path.
26 | #
27 | 
28 | docsdir="../docs/source"
29 | docsstaticdir="_static"
30 | 
31 | if [[ $# -ne 1 ]]; then
32 |   echo "Usage: ${0} <path to zip>" >&2
33 |   exit 3
34 | fi
35 | 
36 | filename=$(basename "${1%.*}")
37 | extension=${1##*.}
38 | 
39 | if [[ "${extension}" != "zip" ]]; then
40 |   echo "ERROR: Input is not a zipped archive."
41 |   exit 3
42 | fi
43 | 
44 | # Resolve the archive location before changing directory, so that absolute
45 | # paths and paths outside the current directory still point at the file.
46 | zipfile="$(cd "$(dirname "${1}")" && pwd)/$(basename "${1}")"
47 | 
48 | tmpdir="tmp_${filename}"
49 | 
50 | mkdir -p "${tmpdir}"
51 | cd "${tmpdir}"
52 | 
53 | unzip "${zipfile}"
54 | 
55 | # Point the image URLs at the static folder and use the generic python3 lexer.
56 | sed -i "s,output_\([0-9_]\+\)\.png,${docsstaticdir}/${filename}_\1.png,g" "${filename}.rst"
57 | sed -i "s,^\.\. code:: ipython3,\.\. code:: python3,g" "${filename}.rst"
58 | for png in *.png; do
59 |   # An unmatched glob is left as the literal '*.png'; skip it.
60 |   [[ -e "${png}" ]] || continue
61 |   newpng=$(echo "${png}" | sed -e "s/output_\([0-9_]\+\)\.png/${filename}_\1.png/g")
62 |   # Images that do not follow the 'output_<n>.png' pattern keep their name;
63 |   # moving a file onto itself would abort the script under 'set -e'.
64 |   if [[ "${png}" != "${newpng}" ]]; then
65 |     mv "${png}" "${newpng}"
66 |   fi
67 | done
68 | 
69 | cd ../
70 | 
71 | mv "${tmpdir}/${filename}.rst" "${docsdir}/"
72 | for png in "${tmpdir}/${filename}"_*.png; do
73 |   # Notebooks without figures produce no images to move.
74 |   [[ -e "${png}" ]] || continue
75 |   mv "${png}" "${docsdir}/${docsstaticdir}/"
76 | done
77 | 
78 | # rmdir (not rm -r) on purpose: it fails if unexpected files were left behind.
79 | rmdir "${tmpdir}"
80 | 


--------------------------------------------------------------------------------
/scripts/runAllNBCalcs.sh:
--------------------------------------------------------------------------------
  1 | #! /bin/bash
  2 | set -euo pipefail
  3 | 
  4 | #
  5 | # Download the pySCA example data and run scaProcessMSA, scaCore and
  6 | # scaSectorID on each example alignment. The output of every step is shown on
  7 | # the terminal and written to a per-alignment log file in ${outputdir}.
  8 | #
  9 | # 'pipefail' is set so that a failing SCA step stops the script; without it
 10 | # the exit status of each pipeline would be that of 'tee' and 'set -e' would
 11 | # never see the failure.
 12 | #
 13 | # All paths are relative to the parent of the directory the script is invoked
 14 | # from (see the 'cd ../' below).
 15 | #
 16 | 
 17 | # Globals
 18 | 
 19 | datadir=data
 20 | outputdir=output
 21 | 
 22 | datarepo="https://github.com/ranganathanlab/pySCA-data"
 23 | version=6.1
 24 | 
 25 | # Functions
 26 | 
 27 | # fetch_data <downloader> [downloader options...]
 28 | #
 29 | # Download the release archive of the data repository with the given command,
 30 | # unpack it, and move its contents into ${datadir}. The tarball is preferred;
 31 | # the zip archive is used when tar is not installed.
 32 | fetch_data() {
 33 |   local archive
 34 |   if [ -x "$(command -v tar)" ]; then
 35 |     archive="v${version}.tar.gz"
 36 |     "$@" "${datarepo}/archive/${archive}"
 37 |     tar xf "${archive}"
 38 |   elif [ -x "$(command -v unzip)" ]; then
 39 |     archive="v${version}.zip"
 40 |     "$@" "${datarepo}/archive/${archive}"
 41 |     unzip "${archive}"
 42 |   else
 43 |     echo "'unzip' or 'tar' (with gzip) is required for decompressing data."
 44 |     exit 3
 45 |   fi
 46 |   mkdir -p "${datadir}"
 47 |   mv -v "pySCA-data-${version}"/* "${datadir}/"
 48 |   rm -rvf "pySCA-data-${version}"
 49 | }
 50 | 
 51 | # run_sca <title> <log name> <alignment name> <PDB ID> <chain> [options...]
 52 | #
 53 | # Run the three SCA steps on ${datadir}/<alignment name>.an and log to
 54 | # ${outputdir}/<log name>.log (the log is started afresh with <title>). Any
 55 | # further arguments are passed to scaProcessMSA after '-d'.
 56 | run_sca() {
 57 |   local title="$1"
 58 |   local log="${outputdir}/$2.log"
 59 |   local aln="$3"
 60 |   local pdb="$4"
 61 |   local chain="$5"
 62 |   shift 5
 63 | 
 64 |   echo "${title}" | tee "${log}"
 65 |   scaProcessMSA \
 66 |     -a "${datadir}/${aln}.an" \
 67 |     -b "${datadir}" \
 68 |     -s "${pdb}" \
 69 |     -c "${chain}" \
 70 |     -d "${outputdir}" \
 71 |     "$@" \
 72 |     -t -n 2>&1 | tee -a "${log}"
 73 |   scaCore -i "${outputdir}/${aln}.db" 2>&1 | tee -a "${log}"
 74 |   scaSectorID -i "${outputdir}/${aln}.db" 2>&1 | tee -a "${log}"
 75 | }
 76 | 
 77 | # Download the data
 78 | 
 79 | cd ../
 80 | 
 81 | # In the event git is not installed (or this is not a git checkout), directly
 82 | # download the data from GitHub using wget or curl (in order of preference).
 83 | if [ -x "$(command -v git)" ] && [ -d ".git/" ]; then
 84 |   git submodule init
 85 |   git submodule update --force
 86 | elif [ -x "$(command -v wget)" ]; then
 87 |   echo "git checkout not available --- trying wget"
 88 |   fetch_data wget -nc
 89 | elif [ -x "$(command -v curl)" ]; then
 90 |   echo "git checkout not available --- trying curl"
 91 |   fetch_data curl -L -O -C -
 92 | else
 93 |   # Without this branch the calculations below would run with no data.
 94 |   echo "'git' (in a git checkout), 'wget', or 'curl' is required for downloading data."
 95 |   exit 3
 96 | fi
 97 | 
 98 | # Generate the output files
 99 | 
100 | mkdir -vp "${outputdir}"
101 | 
102 | # The S1A serine proteases
103 | run_sca "S1A serine protease Calculations:" \
104 |   s1A_halabi s1Ahalabi_1470_nosnakes 3TGI E
105 | echo
106 | 
107 | # Beta-lactamase
108 | run_sca "Beta-lactamase Calculations:" \
109 |   PF13354 PF13354_full 1FQG A -f 'Escherichia coli'
110 | echo
111 | 
112 | # G-protein - this analysis is run with two alignments - the full Pfam
113 | # alignment (PF00071_full) and the Pfam alignment filtered to remove several
114 | # N-terminal truncation mutants. PF00071_rd2 is the alignment discussed in the
115 | # manuscript.
116 | run_sca "G-protein calculations:" \
117 |   PF00071 PF00071_full 5P21 A -f 'Homo sapiens'
118 | echo
119 | 
120 | run_sca "G-protein calculations:" \
121 |   PF00071_rd2 PF00071_rd2 5P21 A -f 'Homo sapiens'
122 | echo
123 | 
124 | # DHFR - this analysis is also run with two alignments for comparison -
125 | # the full PFAM alignment (PF00186_full.an) and a manually curated alignment
126 | # (DHFR_PEPM3.an)
127 | run_sca "DHFR Calculations:" \
128 |   PF00186 PF00186_full 1RX2 A -f 'Escherichia coli'
129 | echo
130 | 
131 | run_sca "DHFR Calculations:" \
132 |   DHFR_PEPM3 DHFR_PEPM3 1RX2 A
133 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python3
 2 | """ Installation and setup for pySCA"""
 3 | 
 4 | from setuptools import setup
 5 | 
 6 | setup(
 7 |     name="pySCA",
 8 |     version="6.1",
 9 |     author="Olivier Rivoire, Rama Ranganathan, and Kimberly Reynolds",
10 |     maintainer="Ansel George",
11 |     packages=["pysca"],
12 |     package_data={"pysca": ["settings.py"]},
13 |     description="Python 3 implementation of Statistical Coupling Analysis (SCA)",
14 |     url="https://ranganathanlab.gitlab.io/pySCA",
15 |     download_url="https://github.com/ranganathanlab/pySCA",
16 |     long_description=open("README.md", "r").read(),
17 |     install_requires=[
18 |         "biopython",
19 |         "numpy",
20 |         "scipy",
21 |         "argparse",
22 |         "wheel",
23 |         "matplotlib",
24 |     ],
25 |     scripts=[
26 |         "bin/alnChangeDelim",
27 |         "bin/alnFilterSeqSize",
28 |         "bin/alnParseID",
29 |         "bin/annotateMSA",
30 |         "bin/scaProcessMSA",
31 |         "bin/alnConvertGI",
32 |         "bin/alnReplaceHeaders",
33 |         "bin/scaCore",
34 |         "bin/scaSectorID",
35 |     ],
36 | )
37 | 


--------------------------------------------------------------------------------