├── .gitignore
├── LICENSE
├── README.md
├── scripts
    ├── calc_fcc_matrix.py
    ├── cluster_fcc.py
    ├── make_contacts.py
    ├── pdb_chainxseg.py
    └── ppretty_clusters.py
└── src
    ├── Makefile
    ├── contact_fcc.cpp
    ├── contact_fcc_intra.cpp
    └── contact_fcc_lig.cpp


/.gitignore:
--------------------------------------------------------------------------------
1 | src/contact_fcc
2 | src/contact_fcc_lig
3 | src/contact_fcc_intra
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Apache license 2.0
 2 | 
 3 | Copyright 2013 João Rodrigues
 4 | 
 5 | Licensed under the Apache License, Version 2.0 (the "License");
 6 | you may not use this file except in compliance with the License.
 7 | You may obtain a copy of the License at
 8 | 
 9 |     http://www.apache.org/licenses/LICENSE-2.0
10 | 
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | FCC Clustering Algorithm
 2 | ========================
 3 | 
 4 | *Fraction of Common Contacts Clustering Algorithm for Protein Models from Structure Prediction Methods*
 5 | 
 6 | About FCC
 7 | ---------
 8 | 
 9 | Structure prediction methods generate a large number of models of which only a fraction matches the biologically relevant structure. To identify this (near-)native model, we often employ clustering 
10 | algorithms, based on the assumption that, in the energy landscape of every biomolecule, its native state lies in a wide basin neighboring other structurally similar states. RMSD-based clustering, the 
11 | current method of choice, is inadequate for large multi-molecular complexes, particularly when their components are symmetric. We developed a novel clustering strategy that is based on a very 
12 | efficient similarity measure - the fraction of common contacts. The outcome of this calculation is a number between 0 and 1, which corresponds to the fraction of residue pairs that are present in 
13 | both the reference and the mobile complex.
14 | 
15 | Advantages of FCC clustering vs. RMSD-based clustering:
16 | * 100-times faster on average.
17 | * Handles symmetry by considering complexes as entities instead of collections of chains.
18 | * Does not require atom equivalence (clusters mutants, missing loops, etc).
19 | * Handles any molecule type (protein, DNA, RNA, carbohydrates, lipids, ligands, etc).
20 | * Allows multiple levels of "resolution": chain-chain contacts, residue-residue contacts, residue-atom contacts, etc.
21 | 
22 | How to Cite
23 | -----------
24 | Rodrigues JPGLM, Trellet M, Schmitz C, Kastritis P, Karaca E, Melquiond ASJ, Bonvin AMJJ. 
25 | [Clustering biomolecular complexes by residue contacts similarity.] [1]
26 | Proteins: Structure, Function, and Bioinformatics 2012;80(7):1810–1817.
27 | 
28 | Requirements
29 | ------------
30 | 
31 | * Python 2.6+
32 | * C/C++ Compiler
33 | 
34 | Installation
35 | ------------
36 | 
37 | Navigate to the src/ folder and issue 'make' to compile the contact programs.
38 | Edit the Makefile if necessary (e.g. different compiler, optimization level).
39 | 
40 | Usage
41 | ------------
42 | 
43 | All scripts produce usage documentation if called without any arguments. Further,
44 | the '-h' option produces (for Python scripts) a more detailed help with descriptions
45 | of all available options.
46 | 
47 | For most cases, the following setup is enough:
48 | 
49 |     # Make a file list with all your PDB files
50 |     ls *pdb > pdb.list
51 |     
52 |     # Ensure all PDB models have segID identifiers
53 |     # Convert chainIDs to segIDs if necessary using scripts/pdb_chainxseg.py
54 |     for pdb in $( cat pdb.list ); do pdb_chainxseg.py $pdb > temp; mv temp $pdb; done
55 | 
56 |     # Generate contact files for all PDB files in pdb.list
57 |     # using 4 cores on this machine.
58 |     python2.6 make_contacts.py -f pdb.list -n 4
59 | 
60 |     # Create a file listing the names of the contact files
61 |     # Derive it from pdb.list so the model order (and thus the indexes in the cluster output) is preserved
62 |     sed -e 's/pdb/contacts/' pdb.list | sed -e '/^$/d' > pdb.contacts
63 | 
64 |     # Calculate the similarity matrix
65 |     python2.6 calc_fcc_matrix.py -f pdb.contacts -o fcc_matrix.out
66 | 
67 |     # Cluster the similarity matrix using a threshold of 0.75 (75% contacts in common)
68 |     python2.6 cluster_fcc.py fcc_matrix.out 0.75 -o clusters_0.75.out
69 | 
70 |     # Use ppretty_clusters.py to output meaningful names instead of model indexes
71 |     python2.6 ppretty_clusters.py clusters_0.75.out pdb.list
72 | 
73 | Authors
74 | ------
75 | 
76 | João Rodrigues
77 | 
78 | Mikael Trellet
79 | 
80 | Adrien Melquiond
81 | 
82 | Christophe Schmitz
83 | 
84 | Ezgi Karaca
85 | 
86 | Panagiotis Kastritis
87 | 
88 | [Alexandre Bonvin] [2]
89 | 
90 | [1]: http://www.ncbi.nlm.nih.gov/pubmed/22489062 "FCC @ Pubmed"
91 | [2]: http://nmr.chem.uu.nl/~abonvin "Alexandre Bonvin's Homepage"
92 | 


--------------------------------------------------------------------------------
/scripts/calc_fcc_matrix.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: UTF-8  -*-
  3 | 
  4 | """
  5 | Calculates a matrix of fraction of common contacts between two or more structures.
  6 | 
  7 | Authors:
  8 |         RODRIGUES Joao
  9 |         TRELLET Mikael
 10 |         MELQUIOND Adrien
 11 | """
 12 | 
 13 | # Contact Parsing routines
 14 | def parse_contact_file(f_list, ignore_chain):
 15 |     """Parses a list of contact files."""
 16 |     
 17 |     if ignore_chain:
 18 |         contacts = [ [ int(l[0:5]+l[6:-1]) for l in open(f)] for f in f_list if f.strip()]
 19 |     else:
 20 |         contacts = [ set([ int(l) for l in open(f)]) for f in f_list if f.strip()]
 21 | 
 22 |     return contacts
 23 | 
 24 | # FCC Calculation Routine
 25 | def calculate_fcc(listA, listB):
 26 |     """
 27 |     Calculates the fraction of common elements between two lists
 28 |     taking into account chain IDs
 29 |     """
 30 |     
 31 |     cc = len(listA.intersection(listB))
 32 |     cc_v = len(listB.intersection(listA))
 33 |     
 34 |     return (cc, cc_v)
 35 |     
 36 | def calculate_fcc_nc(listA, listB):
 37 |     """
 38 |     Calculates the fraction of common elements between two lists
 39 |     not taking into account chain IDs. Much Slower.
 40 |     """
 41 |     
 42 |     largest,smallest = sorted([listA, listB], key=len)
 43 |     ncommon = len([ele for ele in largest if ele in smallest])
 44 |     return (ncommon, ncommon)
 45 | 
 46 | # Matrix Calculation
 47 | 
 48 | def calculate_pairwise_matrix(contacts, ignore_chain):
 49 |     """ Calculates a matrix of pairwise fraction of common contacts (FCC).
 50 |         Outputs numeric indexes.
 51 | 
 52 |         contacts: list_of_unique_pairs_of_residues [set/list]
 53 |         
 54 |         Returns pairwise matrix as an iterator, each entry in the form:
 55 |         FCC(cplx_1/cplx_2) FCC(cplx_2/cplx_1)
 56 |     """
 57 |     
 58 |     contact_lengths = []
 59 |     for c in contacts:
 60 |         try:
 61 |             ic = 1.0/len(c)
 62 |         except ZeroDivisionError:
 63 |             ic = 0
 64 |         contact_lengths.append(ic)
 65 |       
 66 |     if ignore_chain:
 67 |         calc_fcc = calculate_fcc_nc
 68 |     else:
 69 |         calc_fcc = calculate_fcc
 70 |     
 71 |     for i in xrange(len(contacts)):
 72 | 
 73 |         for k in xrange(i+1, len(contacts)):
 74 |             cc, cc_v = calc_fcc(contacts[i], contacts[k])
 75 |             fcc, fcc_v = cc*contact_lengths[i], cc*contact_lengths[k]
 76 |             yield (i+1, k+1, fcc, fcc_v)
 77 | 
 78 | def _output_fcc(output, values, f_buffer):
 79 | 
 80 |     buf = []
 81 |     for i in values:
 82 |         buf.append(i)
 83 |         if len(buf) == f_buffer:
 84 |             output( ''.join(["%s %s %1.3f %1.3f\n" %(i[0],i[1],i[2],i[3]) for i in buf]) )
 85 |             buf = []
 86 |     output( ''.join(["%s %s %1.3f %1.3f\n" %(i[0],i[1],i[2],i[3]) for i in buf]) )
 87 |     
 88 | if __name__ == '__main__':
 89 |     
 90 |     import optparse
 91 |     import sys
 92 |     from time import time, ctime
 93 |     import os
 94 |     
 95 |     USAGE = "%s <contacts file 1> <contacts file 2> ... [options]\n" %os.path.basename(sys.argv[0])
 96 |     
 97 |     parser = optparse.OptionParser(usage=USAGE)
 98 |     parser.add_option('-o', '--output', dest="output_file", action='store', type='string',
 99 |                         default=sys.stdout,
100 |                         help='Output File [default: STDOUT]')
101 |     parser.add_option('-f', '--file', dest="input_file", action='store', type='string',
102 |                         help='Input file (one contact file name per line)')
103 |     parser.add_option('-b', '--buffer_size', dest="buffer_size", action='store', type='string',
104 |                         default=50000, 
105 |                         help='Buffer size for writing output. Number of lines to cache before writing to file [default: 50000]')
106 |     parser.add_option('-i', '--ignore_chain', dest="ignore_chain_char", action='store_true',
107 |                         help='Ignore chain character in residue code. Use for homomeric complexes.')
108 |  
109 |     (options, args) = parser.parse_args()
110 |     
111 |     if options.input_file:
112 |         args = [name.strip() for name in open(options.input_file)]
113 |     
114 |     if len(args) < 2:
115 |         sys.stderr.write("- Provide (at least) two structures to calculate a matrix. You provided %s.\n" %len(args))
116 |         sys.stderr.write(USAGE)
117 |         sys.exit(1)
118 | 
119 |     sys.stderr.write("+ BEGIN: %s\n" %ctime())
120 |     if options.ignore_chain_char:
121 |         sys.stderr.write("+ Ignoring chains. Expect a considerable slowdown!!\n")
122 |         exclude_chains = True
123 |     else:
124 |         exclude_chains = False
125 |         
126 |     t_init = time()
127 |     sys.stderr.write("+ Parsing %i contact files\n" %len(args))
128 | 
129 |     c = parse_contact_file(args, exclude_chains)
130 |     
131 |     m = calculate_pairwise_matrix(c, exclude_chains)
132 |     
133 |     if isinstance(options.output_file, str):
134 |         f = open(options.output_file, 'w')
135 |     else:
136 |         f = options.output_file
137 | 
138 |     sys.stderr.write("+ Calculating Matrix\n") # Matrix is calculated when writing. Generator property.
139 |     sys.stderr.write("+ Writing matrix to %s\n" %f.name)
140 |     _output_fcc(f.write, m, options.buffer_size)
141 |     
142 |     if isinstance(options.output_file, str):
143 |         f.close()
144 |     t_elapsed = time()-t_init
145 |     sys.stderr.write("+ END: %s [%6.2f seconds elapsed]\n" %(ctime(), t_elapsed))
146 | 


--------------------------------------------------------------------------------
/scripts/cluster_fcc.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: UTF-8  -*-
  3 | 
  4 | """
  5 | Asymmetric Taylor-Butina Disjoint Clustering Algorithm.
  6 | 
  7 | Authors:
  8 |            RODRIGUES Joao
  9 |            TRELLET Mikael
 10 |            MELQUIOND Adrien
 11 | """
 12 | 
 13 | class Element(object):
 14 |     """Defines a 'clusterable' Element"""
 15 | 
 16 |     __slots__ = ['name', 'cluster', 'neighbors']
 17 | 
 18 |     def __init__(self, name):
 19 |         self.name = name
 20 |         self.cluster = 0
 21 |         self.neighbors = set()
 22 | 
 23 | 
 24 |     def add_neighbor(self, neighbor):
 25 |         """Adds another element to the neighbor list"""
 26 |         self.neighbors.add(neighbor)
 27 | 
 28 |     def assign_cluster(self, clust_id):
 29 |         """Assigns the Element to Cluster. 0 if unclustered"""
 30 |         self.cluster = clust_id
 31 | 
 32 | class Cluster(object):
 33 |     """Defines a Cluster. A Cluster is created with a name and a center (Element class)"""
 34 | 
 35 |     __slots__ = ['name', 'center', 'members']
 36 | 
 37 |     def __init__(self, name, center):
 38 |         
 39 |         self.name = name
 40 |         self.center = center
 41 | 
 42 |         self.members = []
 43 |         
 44 |         self.populate()
 45 |         
 46 |     def __len__(self):
 47 |         return len(self.members)+1 # +1 Center
 48 |     
 49 |     def populate(self):
 50 |         """
 51 |         Populates the Cluster member list through the 
 52 |         neighbor list of its center.
 53 |         """
 54 | 
 55 |         name = self.name
 56 |         # Assign center
 57 |         ctr = self.center
 58 |         ctr.assign_cluster(name)
 59 |         
 60 |         mlist = self.members
 61 |         # Assign members
 62 |         ctr_nlist = (n for n in ctr.neighbors if not n.cluster)
 63 |         for e in ctr_nlist:
 64 |             mlist.append(e)
 65 |             e.assign_cluster(name)
 66 |     
 67 |     def add_member(self, element):
 68 |         """
 69 |         Adds one single element to the cluster.
 70 |         """
 71 |         l = self.members
 72 |         l.append(element)
 73 |         element.assign_cluster(self.name)
 74 | 
 75 | def read_matrix(path, cutoff, strictness):
 76 |     """ 
 77 |     Reads in a four column matrix (1 2 0.123 0.456\n) 
 78 |     and creates an dictionary of Elements.
 79 |     
 80 |     The strictness factor is a <float> that multiplies by the cutoff 
 81 |     to produce a new cutoff for the second half of the matrix. Used to
 82 |     allow some variability while keeping very small interfaces from clustering
 83 |     with anything remotely similar.
 84 |     """
 85 | 
 86 |     cutoff = float(cutoff)
 87 |     partner_cutoff = float(cutoff) * float(strictness)
 88 |     
 89 |     elements = {}
 90 | 
 91 |     f = open(path, 'r')
 92 |     for line in f:
 93 |         ref, mobi, dRM, dMR = line.split()
 94 |         ref = int(ref)
 95 |         mobi = int(mobi)
 96 |         dRM = float(dRM)
 97 |         dMR = float(dMR)
 98 | 
 99 |         # Create or Retrieve Elements
100 |         if ref not in elements:
101 |             r = Element(ref)
102 |             elements[ref] = r
103 |         else:
104 |             r = elements[ref]
105 |         
106 |         if mobi not in elements:
107 |             m = Element(mobi)
108 |             elements[mobi] = m
109 |         else:
110 |             m = elements[mobi]    
111 | 
112 |         # Assign neighbors
113 |         if dRM >= cutoff and dMR >= partner_cutoff:
114 |             r.add_neighbor(m)
115 |         if dMR >= cutoff and dRM >= partner_cutoff:
116 |             m.add_neighbor(r)
117 | 
118 |     f.close()
119 | 
120 |     return elements
121 | 
def remove_true_singletons(element_pool):
    """
    Drops from the pool every element that has no neighbors.

    The removed elements are also taken out of the neighbor sets of the
    remaining elements. The pool is modified in place.

    Returns a tuple (keys_of_removed_elements [set], pool).
    """

    # Collect the keys and the objects of the neighbor-less elements
    lonely_keys = set()
    lonely_elements = set()
    for key, element in element_pool.items():
        if not element.neighbors:
            lonely_keys.add(key)
            lonely_elements.add(element)

    # Nobody keeps a removed element as neighbor
    for element in element_pool.values():
        element.neighbors = element.neighbors - lonely_elements

    # Finally take them out of the pool itself
    for key in lonely_keys:
        del element_pool[key]

    return (lonely_keys, element_pool)
139 | 
def cluster_elements(element_pool, threshold):
    """
    Builds clusters out of the elements in the pool.

    At every round the unclustered element with the most unclustered
    neighbors (ties go to the largest key) becomes the center of a new
    cluster. Clustering stops when no unclustered element is left or when
    the best candidate would yield a cluster smaller than 'threshold'.

    Returns a tuple (pool, list_of_clusters); clusters are numbered from 1.
    """

    clusters = []
    min_free_neighbors = threshold - 1 # The center counts towards the size

    while True:
        # Rank every unclustered element by its number of free neighbors
        ranking = []
        for key, element in element_pool.items():
            if element.cluster:
                continue
            n_free = sum(1 for nb in element.neighbors if not nb.cluster)
            ranking.append((n_free, key))

        if not ranking: # Everything is clustered
            break

        best_count, best_key = max(ranking)

        # No candidate can reach the minimum cluster size anymore
        if best_count < min_free_neighbors:
            break

        clusters.append(Cluster(len(clusters) + 1, element_pool[best_key]))

    return (element_pool, clusters)
170 | 
def output_clusters(handle, clusters):
    """
    Writes one line per cluster to 'handle': the cluster name, the name of
    its center and the names of its members sorted by name.
    """

    for cluster in clusters:
        ordered = sorted(cluster.members, key=lambda member: member.name)
        member_names = ''.join(["%s " %member.name for member in ordered])
        header = "Cluster %s -> %s " %(cluster.name, cluster.center.name)
        handle.write(header + member_names + "\n")
181 | 
if __name__ == "__main__":
    # Command-line entry point: read the matrix, cluster it and report
    # the clusters together with the clustering coverage.

    import optparse
    import sys
    from time import time, ctime
    import os

    USAGE="%s <matrix file> <threshold [float]> [options]" %os.path.basename(sys.argv[0])

    parser = optparse.OptionParser(usage=USAGE)
    parser.add_option('-o', '--output', dest="output_handle", action='store', type='string',
                    default=sys.stdout,
                    help='Output File [STDOUT]')
    parser.add_option('-c', '--cluster-size', dest="clus_size", action="store", type="int",
                    default=4,
                    help="Minimum number of elements in a cluster [4]")
    parser.add_option('-s', '--strictness', dest="strictness", action="store", type='float',
                    default=0.75,
                    help="Multiplier for cutoff for M->R inclusion threshold. [0.75 or effective cutoff of 0.5625]")


    (options, args) = parser.parse_args()

    if sys.version_info[0:2] < (2,6):
        cur_version = "%s.%s" %sys.version_info[0:2]
        # The message now names the version that is actually checked for
        sys.stderr.write("- Python version not supported (%s). Please use 2.6 or newer.\n" %cur_version )
        sys.exit(1)
    if len(args) != 2:
        sys.stderr.write("- Invalid number of arguments: %i\n" %len(args))
        sys.stderr.write("USAGE: %s\n" %USAGE)
        sys.exit(1)

    fmatrix, cutoff = args
    try:
        cutoff = float(cutoff)
    except ValueError:
        sys.stderr.write("- Invalid threshold (not a number): %s\n" %cutoff)
        sys.stderr.write("USAGE: %s\n" %USAGE)
        sys.exit(1)

    # Read Matrix
    sys.stderr.write("+ BEGIN: %s\n" %ctime())
    t_init = time()

    try:
        pool = read_matrix(fmatrix, cutoff, options.strictness)
    except IOError:
        sys.stderr.write("File not found: %s\n" %fmatrix)
        sys.exit(1)

    sys.stderr.write("+ Read %ix%i distance matrix in %i seconds\n" %(len(pool), len(pool), int(time()-t_init)))

    # ts, pool = remove_true_singletons(pool)
    # sys.stderr.write("+ Detected %i True Singletons\n" %len(ts))

    # Cluster
    element_pool, clusters = cluster_elements(pool, options.clus_size)

    # Output Clusters
    # The default is the sys.stdout object; a user-supplied value is a path
    o = options.output_handle
    write_to_file = isinstance(o, str)
    if write_to_file:
        o_handle = open(o, 'w')
    else:
        o_handle = o

    sys.stderr.write("+ Writing %i Clusters\n" %len(clusters))
    try:
        output_clusters(o_handle, clusters)
    finally:
        if write_to_file:
            o_handle.close()

    total_elements = len(element_pool)
    clustered = sum([len(c) for c in clusters])
    # Calculate coverage (an empty matrix has no elements to cover)
    if total_elements:
        clust_coverage = clustered*100/float(total_elements)
    else:
        clust_coverage = 0.0
    sys.stderr.write("+ Coverage %3.2f%% (%i/%i)\n" %(clust_coverage, clustered, total_elements))
    t_elapsed = time()-t_init
    sys.stderr.write( "+ END: %s [%3.2f seconds]\n" %(ctime(), t_elapsed))
254 | 


--------------------------------------------------------------------------------
/scripts/make_contacts.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: UTF-8  -*-
  3 | 
  4 | """
  5 | Script to calculate contact lists on PDB files.
  6 | Requires external executable to calculate the contacts!
  7 | 
  8 | Authors:
  9 |            RODRIGUES Joao
 10 |            TRELLET Mikael
 11 |            MELQUIOND Adrien
 12 | """
 13 | 
 14 | from multiprocessing import Process
 15 | from subprocess import Popen, PIPE
 16 | 
 17 | def _calculate_contacts(executable, pdbfile, d_cutoff, filter_selection=None, extension='.contacts'):
 18 |     """
 19 |     Outputs a list of contacts based on vector analysis
 20 |     of the PDB file.
 21 |     
 22 |     Arguments:
 23 |     executable  - path to contact calculation program
 24 |     pdbfile     - path to PDB-formatted file (.pdb extension)
 25 |     d_cutoff    - minimal distance in A to consider a contact (float)
 26 |     filter_selection - list of identifiers to filter contacts (list of strings)
 27 |     """
 28 |     
 29 |     pdbname = os.path.basename(pdbfile)[:-4]
 30 | 
 31 |     p = Popen([executable, pdbfile, d_cutoff], stdout=PIPE)
 32 |     p_output = p.communicate()[0]
 33 |     contacts = sorted(list(set([l for l in p_output.split('\n')][:-1])))
 34 |     
 35 |     # Filter contacts
 36 |     if filter_selection:
 37 |         contacts = filter(lambda x: x[5] in filter_selection and x[-1] in filter_selection, contacts)
 38 |     # 
 39 | 
 40 |     outfile = os.path.join(os.path.dirname(pdbfile), "%s%s" %(pdbname, extension))
 41 |     with open(outfile, 'w') as o:
 42 |         o.write('\n'.join(contacts))
 43 | 
 44 |     return 0
 45 |     
 46 | if __name__ == '__main__':
 47 |     
 48 |     import optparse
 49 |     import os, sys
 50 | 
 51 |     USAGE = "%s [-f structures.txt] [-n 4] [-c 5.0] file1.pdb file2.pdb"
 52 | 
 53 |     parser = optparse.OptionParser(usage=USAGE)
 54 |     parser.add_option('-c', '--cutoff', dest="d_cutoff", action='store', type='string',
 55 |                         default="5.0", 
 56 |                         help='Distance cutoff to evaluate contacts. [default: 5.0A]')
 57 |     parser.add_option('-f', '--file', dest="input_file", action='store', type='string',
 58 |                         help='Input file (one file path per line)')
 59 |     parser.add_option('-n', '--nproc', dest="nproc", action='store', type='string',
 60 |                         default=1, 
 61 |                         help='Number of simultaneous processes to launch in each round. [default: 1]')
 62 |     parser.add_option('-e', '--exec', dest="executable", action='store', type='string',
 63 |                         default='%s/../src/contact_fcc' %os.path.dirname(sys.argv[0]), 
 64 |                         help='Path to the executable C++ program to calculate contacts [default: ../fcc/src/contact_fcc]')
 65 |     parser.add_option('-s', '--selection', dest="selection", action='store', type='string',
 66 |                         default=None, 
 67 |                         help='Filter contacts based on their segids. [Default: No filtering. All chains] [Example: A,C]')
 68 | 
 69 |     (options, args) = parser.parse_args()
 70 | 
 71 |     if options.input_file:
 72 |         args = [name.strip() for name in open(options.input_file) if name.strip()]
 73 |     
 74 |     if not args:
 75 |         print "No files provided. Exiting"
 76 |         print USAGE
 77 |         sys.exit(1)
 78 |     
 79 |     # Convert to full paths
 80 |     args = map(os.path.abspath, args)
 81 |     
 82 |     nproc = int(options.nproc)
 83 |     cutoff = options.d_cutoff
 84 | 
 85 |     executable = options.executable
 86 |     if not os.path.exists(executable):
 87 |         print "Path not found: %s" %os.path.abspath(executable)
 88 |         sys.exit(1)
 89 |     executable = os.path.abspath(executable)
 90 | 
 91 |     if options.selection:
 92 |         filter_selection = set(options.selection.split(','))
 93 |         representative = open(args[0])
 94 |         repr_chains = dict([(j,str(i)) for i,j in enumerate(sorted(set([l[72] for l in representative if l.startswith('ATOM')])), start=1)])
 95 |         filter_selection = map(repr_chains.get, filter_selection)
 96 |         representative.close()
 97 |         oextension = '.contacts-'+''.join(options.selection.split(','))
 98 |     else:
 99 |         filter_selection = None
100 |         oextension = '.contacts'
101 | 
102 |     queue = []
103 |     
104 |     while 1:
105 |         
106 |         arg = args.pop()        
107 |         # Create Process for arg
108 |         p = Process(target=_calculate_contacts, args=(executable, arg, cutoff, filter_selection, oextension))
109 |         queue.append(p)
110 |         
111 |         if (len(queue) == nproc) or (not args and len(queue)):
112 | 
113 |             for job in queue:
114 |                 job.start()
115 |             for job in queue: # Waiting for job to finish
116 |                 job.join()
117 |             queue = []
118 |         
119 |         if not args and not queue:
120 |             break
121 |     
122 |     print "Finished"
123 | 


--------------------------------------------------------------------------------
/scripts/pdb_chainxseg.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | Exchanges segment ID for chain ID in a PDB file.
 5 | 
 6 | usage: python pdb_chainxseg.py <pdb file>
 7 | example: python pdb_chainxseg.py 1CTF.pdb
 8 | 
 9 | This program is part of the PDB tools distributed with HADDOCK
10 | or with the HADDOCK tutorial. The utilities in this package
11 | can be used to quickly manipulate PDB files, with the benefit
12 | of 'piping' several different commands. This is a rewrite of old
13 | FORTRAN77 code that was taking too much effort to compile. RIP.
14 | """
15 | 
16 | import os
17 | import re
18 | import sys
19 | 
20 | __author__ = "Joao Rodrigues"
21 | 
22 | USAGE = "usage: " + sys.argv[0] + " <pdb file>\n"
23 | 
def check_input(args):
    """
    Picks the input stream from the command-line arguments.

    - one argument: it must be an existing file, which is opened for reading
    - no arguments: standard input is used, provided something is piped in
    - anything else (or a failed check): the usage message is written to
      stderr and the program exits with status 1
    """

    n_args = len(args)

    if n_args == 1:
        # Read from file
        path = args[0]
        if os.path.exists(path):
            return open(path, 'r')
        sys.stderr.write('File not found: ' + path + '\n')
    elif n_args == 0 and not sys.stdin.isatty():
        # Read from pipe
        return sys.stdin

    sys.stderr.write(USAGE)
    sys.exit(1)
46 | 
47 | def _swap_chainxseg(fhandle):
48 |     """Enclosing logic in a function to speed up a bit"""
49 | 
50 |     coord_re = re.compile('^(ATOM|HETATM)')
51 |     fhandle = fhandle
52 | 
53 |     for line in fhandle:
54 |         if coord_re.match(line):
55 |             yield line[:72] + line[21].ljust(4) + line[76:]
56 |         else:
57 |             yield line
58 | 
if __name__ == '__main__':
    # Command-line entry point: write the converted PDB to standard output.

    # Check Input
    # (exits with the usage message if the arguments are not valid)
    pdbfh = check_input(sys.argv[1:])
    
    # Do the job
    # _swap_chainxseg is a generator: nothing is read from the input
    # until the lines are joined below.
    new_pdb = _swap_chainxseg(pdbfh)

    try:
        sys.stdout.write(''.join(new_pdb))
        sys.stdout.flush()
    except IOError:
        # This is here to catch Broken Pipes
        # for example to use 'head' or 'tail' without
        # the error message showing up.
        # NOTE(review): the input is read inside this try block as well, so
        # an IOError raised while reading is silenced too.
        pass   

    # last line of the script
    # We can close it even if it is sys.stdin
    pdbfh.close()
    sys.exit(0)


--------------------------------------------------------------------------------
/scripts/ppretty_clusters.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | Outputs the names of the cluster members based on a clustering output file
 5 | and the original file listing the PDB files
 6 | 
 7 | Authors:
 8 |            RODRIGUES Joao
 9 | """
10 | 
11 | import os
12 | import sys
13 | 
14 | USAGE = "python %s <cluster_x.out> <file.nam>" %os.path.basename(sys.argv[0])
15 | 
def read_clusters(path):
    """
    Reads clusters from a FCC output file.

    Each line is expected in the form
        Cluster 8 -> 193 141 142 144 151 168 171 172 178
    i.e. everything from the fourth whitespace-separated field onwards is
    a model index (the cluster center comes first).

    Returns a list with one list of integers per line of the file.
    Raises IOError if the file cannot be opened.
    """

    # The with-block closes the file; the inner lists are built eagerly so
    # the result does not depend on map() returning a list.
    with open(path, 'r') as cl_file:
        return [[int(model) for model in line.split()[3:]] for line in cl_file]
29 | 
def read_list(path):
    """
    Reads a list containing one file per line.

    Returns a dictionary that maps the 1-based position of each non-blank
    line to its content without the file extension. Blank lines are not
    counted: calc_fcc_matrix.py skips blank entries when it numbers the
    structures, so the indexes in the cluster file refer to the non-blank
    entries only.

    Raises IOError if the file cannot be opened.
    """

    with open(path, 'r') as fhandle:
        names = [line.strip() for line in fhandle if line.strip()]

    fdata = {}
    for index, name in enumerate(names, start=1):
        # splitext leaves names without an extension untouched
        fdata[index] = os.path.splitext(name)[0]

    return fdata
45 | 
def cross_data(clusters, flist):
    """
    Translates the model indexes of every cluster into names.

    clusters - list of lists of model indexes
    flist    - mapping of model index to name

    Returns a list of lists of names, in the same order as the input.
    """

    return [[flist[index] for index in members] for members in clusters]
57 | 
if __name__ == '__main__':
    # Command-line entry point: prints every cluster with the model
    # indexes replaced by the names listed in the PDB list file.

    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        sys.stdout.write(USAGE + "\n")
        sys.exit(1)

    cluster_file, pdblist_file = [os.path.abspath(p) for p in cli_args]

    # Clusters (model indexes)
    try:
        cl_list = read_clusters(cluster_file)
    except IOError:
        sys.stderr.write('Error: file not found (%s)\nAborting..\n' %cluster_file)
        sys.exit(1)

    # Index -> name table
    try:
        pdb_list = read_list(pdblist_file)
    except IOError:
        sys.stderr.write('Error: file not found (%s)\nAborting..\n' %pdblist_file)
        sys.exit(1)

    named_clusters = cross_data(cl_list, pdb_list)

    # Output
    for number, names in enumerate(named_clusters, start=1):
        sys.stdout.write("Cluster %i -> %s\n" %(number, ' '.join(names)))
85 | 


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
 1 | #####################################
 2 | #                                   #
 3 | #         FCC Clustering            #
 4 | #             Makefile              #
 5 | #                                   #
 6 | #####################################
 7 | 
 8 | CPP=g++
 9 | CPFLAGS=-O2
10 | 
11 | EXEC=contact_fcc contact_fcc_lig
12 | 
13 | all: 
14 | 	make $(EXEC)
15 | 
16 | contact_fcc: contact_fcc.cpp
17 | 	$(CPP) $(CPFLAGS) -o contact_fcc contact_fcc.cpp
18 | 
19 | contact_fcc_lig: contact_fcc_lig.cpp
20 | 	$(CPP) $(CPFLAGS) -o contact_fcc_lig contact_fcc_lig.cpp
21 | 
22 | contact_fcc_intra: contact_fcc_intra.cpp
23 | 	$(CPP) $(CPFLAGS) -o contact_fcc_intra contact_fcc_intra.cpp
24 | 
25 | clean :
26 | 	/bin/rm $(EXEC)
27 | 
28 | 


--------------------------------------------------------------------------------
/src/contact_fcc.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <cstdio>
  3 | #include <cstring>
  4 | #include <string>
  5 | #include <map>
  6 | #include <set>
  7 | #include <vector>
  8 | #include <cmath>
  9 | #include <stdio.h>
 10 | #include <stdlib.h>
 11 | 
 12 | using namespace std;
 13 | 
 14 | struct Coor3f {
 15 |   float x;
 16 |   float y;
 17 |   float z;
 18 | };
 19 | 
 20 | struct Residue {
 21 |   int nr;
 22 |   vector<Coor3f> coor;
 23 |   vector<string> atom;
 24 |   int seg;
 25 | };
 26 | 
 27 | vector<Residue> res;
 28 | 
 29 | bool seg_sorter (Residue res_a, Residue res_b) {
 30 |   int segA = res_a.seg;
 31 |   int segB = res_b.seg;
 32 |   return (segA < segB);
 33 | };
 34 | 
 35 | int main(int argc, char *argv[]) {
 36 |   char buf[2000];
 37 | 
 38 |   if (argc < 3) {
 39 |     fprintf(stderr,"ERROR: Too few arguments\n");
 40 |     fprintf(stderr, "Usage: contact <pdb file> <cutoff>\n");
 41 |     return 1;
 42 |   }
 43 | 
 44 |   char *filename = argv[1];
 45 |   float cutoff = atof(argv[2]);
 46 | 
 47 |   if (cutoff < 0 || cutoff > 100) {
 48 |     fprintf(stderr,"ERROR: Cutoff out of range\n");
 49 |     fprintf(stderr, "Usage: contact <pdb file> <cutoff>\n");
 50 |     return 1;
 51 |   }
 52 | 
 53 |   FILE *fil = fopen(filename, "r");
 54 |   if (fil == NULL) {
 55 |     fprintf(stderr, "ERROR: PDB file %s does not exist\n", filename);
 56 |     return 1;
 57 |   }
 58 |   
 59 |   int currnr = -99999;
 60 |   char currseg;
 61 |   int segid = 0;
 62 | 
 63 |   set<int> nonconv;
 64 |   while (!feof(fil)) {
 65 |     char code[10];
 66 |     char atom[5];
 67 |     if (!fgets(buf, 2000, fil)) break;
 68 |     sscanf(buf, "%s %*d %s", code, atom);
 69 | 
 70 |     // Ignore HETATM and hydrogens
 71 |     if (!strncmp(code,"ATOM", 4) && ( atom[0] != 'H' && !(  isdigit(atom[0]) && atom[1] == 'H' )  )  ) {
 72 |       int nr = atoi(buf + 22);
 73 |       char seg = buf[72];
 74 |       if (seg != currseg) {
 75 |         currseg = seg;
 76 |         segid++;
 77 |       }
 78 |       if (nr != currnr) {
 79 |           Residue r;
 80 |           r.nr = nr+10000;
 81 |     	    r.seg = segid;
 82 |   	      res.push_back(r);
 83 |   	      currnr = r.nr;
 84 |       }
 85 |       Residue &rcurr = res[res.size() -1];
 86 |       Coor3f ccoor;
 87 |       ccoor.x = atof(buf+27);
 88 |       ccoor.y = atof(buf+38);
 89 |       ccoor.z = atof(buf+46);
 90 |       rcurr.coor.push_back(ccoor);
 91 |       string atom2(atom);
 92 |       rcurr.atom.push_back(atom2);
 93 |     }
 94 |   }
 95 | 
 96 |   if (!res.size()) {fprintf(stderr, "ERROR: PDB file %s contains no residues\n", filename); return 1;}
 97 | 
 98 |   // Sort the residues by segment to avoid random chain ordering problems
 99 |   sort (res.begin(), res.end(), seg_sorter);
100 | 
101 |   double cutoffsq = cutoff * cutoff;
102 | 
103 |   for (int n = 0; n < res.size(); n++) {
104 |     vector<Coor3f> &c1 = res[n].coor;
105 |     int seg1 = res[n].seg;
106 |     for (int nn = n + 1; nn < res.size(); nn++) {
107 |       int seg2 = res[nn].seg;
108 |       if (seg1 == seg2) continue;
109 |       vector<Coor3f> &c2 = res[nn].coor;
110 |       for (int i = 0; i < res[n].coor.size(); i++) {
111 |         for (int ii = 0; ii < res[nn].coor.size(); ii++) {
112 | 	  double currdissq =
113 | 	    (c1[i].x - c2[ii].x) * (c1[i].x - c2[ii].x) +
114 | 	    (c1[i].y - c2[ii].y) * (c1[i].y - c2[ii].y) +
115 | 	    (c1[i].z - c2[ii].z) * (c1[i].z - c2[ii].z);
116 | 	   if (currdissq < cutoffsq) {
117 | 	     printf ("%d%d%d%d\n", res[n].nr, res[n].seg, res[nn].nr, res[nn].seg);
118 | 	   }
119 |         }
120 |       }
121 |     }
122 |   }
123 |   fclose(fil);
124 | }
125 | 


--------------------------------------------------------------------------------
/src/contact_fcc_intra.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <cstdio>
  3 | #include <cstring>
  4 | #include <string>
  5 | #include <map>
  6 | #include <set>
  7 | #include <vector>
  8 | #include <cmath>
  9 | #include <stdio.h>
 10 | #include <stdlib.h>
 11 | 
 12 | using namespace std;
 13 | 
 14 | struct Coor3f {
 15 |   float x;
 16 |   float y;
 17 |   float z;
 18 | };
 19 | 
 20 | struct Residue {
 21 |   int nr;
 22 |   vector<Coor3f> coor;
 23 |   vector<string> atom;
 24 |   int seg;
 25 | };
 26 | 
 27 | vector<Residue> res;
 28 | 
 29 | int main(int argc, char *argv[]) {
 30 |   char buf[2000];
 31 | 
 32 |   if (argc < 3) {
 33 |     fprintf(stderr,"ERROR: Too few arguments\n");
 34 |     fprintf(stderr, "Usage: contact <pdb file> <cutoff>\n");
 35 |     return 1;
 36 |   }
 37 | 
 38 |   char *filename = argv[1];
 39 |   float cutoff = atof(argv[2]);
 40 | 
 41 |   if (cutoff < 0 || cutoff > 100) {
 42 |     fprintf(stderr,"ERROR: Cutoff out of range\n");
 43 |     fprintf(stderr, "Usage: contact <pdb file> <cutoff>\n");
 44 |     return 1;
 45 |   }
 46 | 
 47 |   FILE *fil = fopen(filename, "r");
 48 |   if (fil == NULL) {
 49 |     fprintf(stderr, "ERROR: PDB file %s does not exist\n", filename);
 50 |     return 1;
 51 |   }
 52 |   int currnr = -99999;
 53 |   char currseg;
 54 |   int segid = 0;
 55 | 
 56 |   set<int> nonconv;
 57 |   while (!feof(fil)) {
 58 |     char code[10];
 59 |     char atom[5];
 60 |     if (!fgets(buf, 2000, fil)) break;
 61 |     sscanf(buf, "%s %*d %s", code, atom);
 62 | 
 63 |     // Ignore HETATM and hydrogens
 64 |     if (!strncmp(code,"ATOM", 4) && ( atom[0] != 'H' && !(  isdigit(atom[0]) && atom[1] == 'H' )  )  ) {
 65 |       int nr = atoi(buf + 22);
 66 |       char seg = buf[72];
 67 |       if (seg != currseg) {
 68 |         currseg = seg;
 69 |         segid++;
 70 |       }
 71 |       if (nr != currnr) {
 72 |           Residue r;
 73 |           r.nr = nr+10000;
 74 |     	    r.seg = segid;
 75 |   	      res.push_back(r);
 76 |   	      currnr = r.nr;
 77 |       }
 78 |       Residue &rcurr = res[res.size() -1];
 79 |       Coor3f ccoor;
 80 |       ccoor.x = atof(buf+27);
 81 |       ccoor.y = atof(buf+38);
 82 |       ccoor.z = atof(buf+46);
 83 |       rcurr.coor.push_back(ccoor);
 84 |       string atom2(atom);
 85 |       rcurr.atom.push_back(atom2);
 86 |     }
 87 |   }
 88 | 
 89 |   if (!res.size()) {fprintf(stderr, "ERROR: PDB file %s contains no residues\n", filename); return 1;}
 90 | 
 91 |   double cutoffsq = cutoff * cutoff;
 92 | 
 93 |   for (int n = 0; n < res.size(); n++) {
 94 |     vector<Coor3f> &c1 = res[n].coor;
 95 |     int seg1 = res[n].seg;
 96 |     for (int nn = n + 1; nn < res.size(); nn++) {
 97 |       int seg2 = res[nn].seg;
 98 |       if (seg1 != seg2) continue;
 99 |       vector<Coor3f> &c2 = res[nn].coor;
100 |       for (int i = 0; i < res[n].coor.size(); i++) {
101 |         for (int ii = 0; ii < res[nn].coor.size(); ii++) {
102 | 	  double currdissq =
103 | 	    (c1[i].x - c2[ii].x) * (c1[i].x - c2[ii].x) +
104 | 	    (c1[i].y - c2[ii].y) * (c1[i].y - c2[ii].y) +
105 | 	    (c1[i].z - c2[ii].z) * (c1[i].z - c2[ii].z);
106 | 	   if (currdissq < cutoffsq) {
107 | 	     printf ("%d%d%d%d\n", res[n].nr, res[n].seg, res[nn].nr, res[nn].seg);
108 | 	   }
109 |         }
110 |       }
111 |     }
112 |   }
113 |   fclose(fil);
114 | }
115 | 


--------------------------------------------------------------------------------
/src/contact_fcc_lig.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <cstdio>
  3 | #include <cstring>
  4 | #include <string>
  5 | #include <map>
  6 | #include <set>
  7 | #include <vector>
  8 | #include <cmath>
  9 | #include <stdio.h>
 10 | #include <stdlib.h>
 11 | 
 12 | using namespace std;
 13 | 
 14 | struct Coor3f {
 15 |   float x;
 16 |   float y;
 17 |   float z;
 18 | };
 19 | 
 20 | struct Residue {
 21 |   int nr;
 22 |   vector<Coor3f> coor;
 23 |   vector<string> atom;
 24 |   int seg;
 25 | };
 26 | 
 27 | vector<Residue> res;
 28 | 
 29 | bool seg_sorter (Residue res_a, Residue res_b) {
 30 |   int segA = res_a.seg;
 31 |   int segB = res_b.seg;
 32 |   return (segA < segB);
 33 | };
 34 | 
 35 | int main(int argc, char *argv[]) {
 36 |   char buf[2000];
 37 | 
 38 |   if (argc < 3) {
 39 |     fprintf(stderr,"ERROR: Too few arguments\n");
 40 |     fprintf(stderr, "Usage: contact <pdb file> <cutoff>\n");
 41 |     return 1;
 42 |   }
 43 | 
 44 |   char *filename = argv[1];
 45 |   float cutoff = atof(argv[2]);
 46 | 
 47 |   if (cutoff < 0 || cutoff > 100) {
 48 |     fprintf(stderr,"ERROR: Cutoff out of range\n");
 49 |     fprintf(stderr, "Usage: contact <pdb file> <cutoff>\n");
 50 |     return 1;
 51 |   }
 52 | 
 53 |   FILE *fil = fopen(filename, "r");
 54 |   if (fil == NULL) {
 55 |     fprintf(stderr, "ERROR: PDB file %s does not exist\n", filename);
 56 |     return 1;
 57 |   }
 58 |   int currnr = -99999;
 59 |   char currseg;
 60 |   int segid = 0;
 61 | 
 62 |   set<int> nonconv;
 63 |   while (!feof(fil)) {
 64 |     char code[10];
 65 |     char atom[5];
 66 |     if (!fgets(buf, 2000, fil)) break;
 67 |     sscanf(buf, "%s %*d %s", code, atom);
 68 | 
 69 |     // Ignore HETATM and hydrogens
 70 |     if (!strncmp(code,"ATOM", 4) && ( atom[0] != 'H' && !(  isdigit(atom[0]) && atom[1] == 'H' )  )  ) {
 71 |       int nr = atoi(buf + 22);
 72 |       char seg = buf[72];
 73 |       if (seg != currseg) {
 74 |         currseg = seg;
 75 |         segid++;
 76 |       }
 77 |       if (nr != currnr) {
 78 |           Residue r;
 79 |           r.nr = nr+10000;
 80 |     	    r.seg = segid;
 81 |   	      res.push_back(r);
 82 |   	      currnr = r.nr;
 83 |       }
 84 |       Residue &rcurr = res[res.size() -1];
 85 |       Coor3f ccoor;
 86 |       ccoor.x = atof(buf+27);
 87 |       ccoor.y = atof(buf+38);
 88 |       ccoor.z = atof(buf+46);
 89 |       rcurr.coor.push_back(ccoor);
 90 |       string atom2(atom);
 91 |       rcurr.atom.push_back(atom2);
 92 |     }
 93 |   }
 94 | 
 95 |   if (!res.size()) {fprintf(stderr, "ERROR: PDB file %s contains no residues\n", filename); return 1;}
 96 | 
 97 |   // Sort the residues by segment to avoid random chain ordering problems
 98 |   sort (res.begin(), res.end(), seg_sorter);
 99 | 
100 |   double cutoffsq = cutoff * cutoff;
101 | 
102 |   for (int n = 0; n < res.size(); n++) {
103 |     vector<Coor3f> &c1 = res[n].coor;
104 |     int seg1 = res[n].seg;
105 |     for (int nn = n + 1; nn < res.size(); nn++) {
106 |       int seg2 = res[nn].seg;
107 |       if (seg1 == seg2) continue;
108 |       vector<Coor3f> &c2 = res[nn].coor;
109 |       for (int i = 0; i < res[n].coor.size(); i++) {
110 |         for (int ii = 0; ii < res[nn].coor.size(); ii++) {
111 | 	  double currdissq =
112 | 	    (c1[i].x - c2[ii].x) * (c1[i].x - c2[ii].x) +
113 | 	    (c1[i].y - c2[ii].y) * (c1[i].y - c2[ii].y) +
114 | 	    (c1[i].z - c2[ii].z) * (c1[i].z - c2[ii].z);
115 | 	   if (currdissq < cutoffsq) {
116 | 	     printf ("%d%d%d%d\n", res[n].nr, res[n].seg, nn, res[nn].seg);
117 | //	     printf ("%d %c %s %d %c %s %f\n", res[n].nr, res[n].seg, res[n].atom[i].c_str(), res[nn].nr, res[nn].seg, res[nn].atom[ii].c_str(), sqrt(currdissq));
118 | 	   }
119 |         }
120 |       }
121 |     }
122 |   }
123 |   fclose(fil);
124 | }
125 | 


--------------------------------------------------------------------------------