├── .gitignore ├── LICENSE ├── README.md ├── scripts ├── calc_fcc_matrix.py ├── cluster_fcc.py ├── make_contacts.py ├── pdb_chainxseg.py └── ppretty_clusters.py └── src ├── Makefile ├── contact_fcc.cpp ├── contact_fcc_intra.cpp └── contact_fcc_lig.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | src/contact_fcc 2 | src/contact_fcc_lig 3 | src/contact_fcc_intra 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache license 2.0 2 | 3 | Copyright 2013 João Rodrigues 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | FCC Clustering Algorithm 2 | ======================== 3 | 4 | *Fraction of Common Contacts Clustering Algorithm for Protein Models from Structure Prediction Methods* 5 | 6 | About FCC 7 | --------- 8 | 9 | Structure prediction methods generate a large number of models of which only a fraction matches the biologically relevant structure. To identify this (near-)native model, we often employ clustering 10 | algorithms, based on the assumption that, in the energy landscape of every biomolecule, its native state lies in a wide basin neighboring other structurally similar states. 
RMSD-based clustering, the 11 | current method of choice, is inadequate for large multi-molecular complexes, particularly when their components are symmetric. We developed a novel clustering strategy that is based on a very 12 | efficient similarity measure - the fraction of common contacts. The outcome of this calculation is a number between 0 and 1, which corresponds to the fraction of residue pairs that are present in 13 | both the reference and the mobile complex. 14 | 15 | Advantages of FCC clustering vs. RMSD-based clustering: 16 | * 100-times faster on average. 17 | * Handles symmetry by considering complexes as entities instead of collections of chains. 18 | * Does not require atom equivalence (clusters mutants, missing loops, etc). 19 | * Handles any molecule type (protein, DNA, RNA, carbohydrates, lipids, ligands, etc). 20 | * Allows multiple levels of "resolution": chain-chain contacts, residue-residue contacts, residue-atom contacts, etc. 21 | 22 | How to Cite 23 | ----------- 24 | Rodrigues JPGLM, Trellet M, Schmitz C, Kastritis P, Karaca E, Melquiond ASJ, Bonvin AMJJ. 25 | [Clustering biomolecular complexes by residue contacts similarity.] [1] 26 | Proteins: Structure, Function, and Bioinformatics 2012;80(7):1810–1817. 27 | 28 | Requirements 29 | ------------ 30 | 31 | * Python 2.6+ 32 | * C/C++ Compiler 33 | 34 | Installation 35 | ------------ 36 | 37 | Navigate to the src/ folder and issue 'make' to compile the contact programs. 38 | Edit the Makefile if necessary (e.g. different compiler, optimization level). 39 | 40 | Usage 41 | ------------ 42 | 43 | All scripts produce usage documentation if called without any arguments. Further, 44 | the '-h' option produces (for Python scripts) a more detailed help with descriptions 45 | of all available options. 
46 | 47 | For most cases, the following setup is enough: 48 | 49 | # Make a file list with all your PDB files 50 | ls *pdb > pdb.list 51 | 52 | # Ensure all PDB models have segID identifiers 53 | # Convert chainIDs to segIDs if necessary using scripts/pdb_chainxseg.py 54 | for pdb in $( cat pdb.list ); do pdb_chainxseg.py $pdb > temp; mv temp $pdb; done 55 | 56 | # Generate contact files for all PDB files in pdb.list 57 | # using 4 cores on this machine. 58 | python2.6 make_contacts.py -f pdb.list -n 4 59 | 60 | # Create a file listing the names of the contact files 61 | # Use file.list to maintain order in the cluster output 62 | sed -e 's/pdb/contacts/' pdb.list | sed -e '/^$/d' > pdb.contacts 63 | 64 | # Calculate the similarity matrix 65 | python2.6 calc_fcc_matrix.py -f pdb.contacts -o fcc_matrix.out 66 | 67 | # Cluster the similarity matrix using a threshold of 0.75 (75% contacts in common) 68 | python2.6 cluster_fcc.py fcc_matrix.out 0.75 -o clusters_0.75.out 69 | 70 | # Use ppretty_clusters.py to output meaningful names instead of model indexes 71 | python2.6 ppretty_clusters.py clusters_0.75.out pdb.list 72 | 73 | Authors 74 | ------ 75 | 76 | João Rodrigues 77 | 78 | Mikael Trellet 79 | 80 | Adrien Melquiond 81 | 82 | Christophe Schmitz 83 | 84 | Ezgi Karaca 85 | 86 | Panagiotis Kastritis 87 | 88 | [Alexandre Bonvin] [2] 89 | 90 | [1]: http://www.ncbi.nlm.nih.gov/pubmed/22489062 "FCC @ Pubmed" 91 | [2]: http://nmr.chem.uu.nl/~abonvin "Alexandre Bonvin's Homepage" 92 | -------------------------------------------------------------------------------- /scripts/calc_fcc_matrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | """ 5 | Calculates a matrix of fraction of common contacts between two or more structures. 
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

"""
Calculates a matrix of fraction of common contacts between two or more structures.

Authors:
           RODRIGUES Joao
           TRELLET Mikael
           MELQUIOND Adrien
"""


# Contact Parsing routines
def _read_contact_lines(path):
    """Return the non-blank lines of a contact file, stripped of whitespace."""
    with open(path) as handle:
        return [line.strip() for line in handle if line.strip()]


def parse_contact_file(f_list, ignore_chain):
    """Parses a list of contact files.

    f_list:       iterable of contact file paths; blank entries are skipped.
    ignore_chain: if true, both chain digits are removed from every contact
                  code and a list is returned per file; otherwise a set of
                  the full integer codes is returned per file.

    Returns a list with one collection of integer contact codes per file.
    """

    paths = [f.strip() for f in f_list if f.strip()]

    if ignore_chain:
        # A code is <5-digit residue><chain><5-digit residue><chain>.
        # Lines are stripped first so that the slice drops both chain digits
        # on every line, whether or not the line ends with a newline.
        return [[int(l[0:5] + l[6:-1]) for l in _read_contact_lines(p)]
                for p in paths]

    return [set(int(l) for l in _read_contact_lines(p)) for p in paths]


# FCC Calculation Routine
def calculate_fcc(listA, listB):
    """
    Calculates the number of common elements between two sets
    taking into account chain IDs.

    Returns the count twice, once per direction of the comparison.
    """

    cc = len(listA.intersection(listB))
    return (cc, cc)


def calculate_fcc_nc(listA, listB):
    """
    Calculates the number of common elements between two lists
    not taking into account chain IDs.

    Every element of the shorter list that also occurs in the longer one is
    counted (duplicates in the shorter list count each time).
    """

    shorter, longer = sorted([listA, listB], key=len)
    # Set membership keeps the count identical while avoiding a
    # list-in-list scan for every element.
    lookup = set(longer)
    ncommon = sum(1 for ele in shorter if ele in lookup)
    return (ncommon, ncommon)


# Matrix Calculation
def calculate_pairwise_matrix(contacts, ignore_chain):
    """ Calculates a matrix of pairwise fraction of common contacts (FCC).
        Outputs numeric indexes.

        contacts: list_of_unique_pairs_of_residues [set/list]
        ignore_chain: selects calculate_fcc_nc instead of calculate_fcc

        Returns pairwise matrix as an iterator, each entry in the form:
        (index_1, index_2, FCC(cplx_1/cplx_2), FCC(cplx_2/cplx_1))
        with 1-based indexes. A structure without contacts gets an FCC of 0.
    """

    # Inverse sizes are computed once; empty contact lists map to 0.
    inv_sizes = [1.0 / len(c) if c else 0 for c in contacts]
    calc_fcc = calculate_fcc_nc if ignore_chain else calculate_fcc

    n_models = len(contacts)
    for i in range(n_models):
        for k in range(i + 1, n_models):
            cc, cc_v = calc_fcc(contacts[i], contacts[k])
            yield (i + 1, k + 1, cc * inv_sizes[i], cc_v * inv_sizes[k])


def _output_fcc(output, values, f_buffer):
    """Write matrix rows through `output` in chunks of `f_buffer` rows.

    output:   callable taking one string (e.g. a file's write method)
    values:   iterable of (i, k, fcc, fcc_v) tuples
    f_buffer: number of rows to cache before each write; accepts an int or
              a numeric string and is clamped to at least 1
    """

    chunk_size = max(1, int(f_buffer))

    buf = []
    for row in values:
        buf.append("%s %s %1.3f %1.3f\n" % (row[0], row[1], row[2], row[3]))
        if len(buf) >= chunk_size:
            output(''.join(buf))
            buf = []
    if buf:
        output(''.join(buf))


if __name__ == '__main__':

    import optparse
    import sys
    from time import time, ctime
    import os

    USAGE = "%s <contacts file 1> <contacts file 2> ... [options]\n" % os.path.basename(sys.argv[0])

    parser = optparse.OptionParser(usage=USAGE)
    parser.add_option('-o', '--output', dest="output_file", action='store', type='string',
                      default=sys.stdout,
                      help='Output File [default: STDOUT]')
    parser.add_option('-f', '--file', dest="input_file", action='store', type='string',
                      help='Input file (one contact file name per line)')
    parser.add_option('-b', '--buffer_size', dest="buffer_size", action='store', type='int',
                      default=50000,
                      help='Buffer size for writing output. Number of lines to cache before writing to file [default: 50000]')
    parser.add_option('-i', '--ignore_chain', dest="ignore_chain_char", action='store_true',
                      help='Ignore chain character in residue code. Use for homomeric complexes.')

    (options, args) = parser.parse_args()

    if options.input_file:
        with open(options.input_file) as listing:
            # Blank lines are dropped so the reported count matches the
            # number of structures that are actually parsed.
            args = [name.strip() for name in listing if name.strip()]

    if len(args) < 2:
        sys.stderr.write("- Provide (at least) two structures to calculate a matrix. You provided %s.\n" % len(args))
        sys.stderr.write(USAGE)
        sys.exit(1)

    sys.stderr.write("+ BEGIN: %s\n" % ctime())
    exclude_chains = bool(options.ignore_chain_char)
    if exclude_chains:
        sys.stderr.write("+ Ignoring chains. Expect a considerable slowdown!!\n")

    t_init = time()
    sys.stderr.write("+ Parsing %i contact files\n" % len(args))

    c = parse_contact_file(args, exclude_chains)

    m = calculate_pairwise_matrix(c, exclude_chains)

    to_file = isinstance(options.output_file, str)
    f = open(options.output_file, 'w') if to_file else options.output_file

    try:
        sys.stderr.write("+ Calculating Matrix\n")  # Matrix is calculated when writing. Generator property.
        sys.stderr.write("+ Writing matrix to %s\n" % f.name)
        _output_fcc(f.write, m, options.buffer_size)
    finally:
        if to_file:
            f.close()

    t_elapsed = time() - t_init
    sys.stderr.write("+ END: %s [%6.2f seconds elapsed]\n" % (ctime(), t_elapsed))
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

"""
Asymmetric Taylor-Butina Disjoint Clustering Algorithm.

Authors:
           RODRIGUES Joao
           TRELLET Mikael
           MELQUIOND Adrien
"""


class Element(object):
    """Defines a 'clusterable' Element"""

    __slots__ = ['name', 'cluster', 'neighbors']

    def __init__(self, name):
        self.name = name
        self.cluster = 0          # 0 means "not assigned to any cluster"
        self.neighbors = set()

    def add_neighbor(self, neighbor):
        """Adds another element to the neighbor list"""
        self.neighbors.add(neighbor)

    def assign_cluster(self, clust_id):
        """Assigns the Element to Cluster. 0 if unclustered"""
        self.cluster = clust_id


class Cluster(object):
    """Defines a Cluster. A Cluster is created with a name and a center (Element class)"""

    __slots__ = ['name', 'center', 'members']

    def __init__(self, name, center):

        self.name = name
        self.center = center
        self.members = []

        self.populate()

    def __len__(self):
        return len(self.members) + 1  # +1 Center

    def populate(self):
        """
        Populates the Cluster member list through the
        neighbor list of its center.

        Only neighbors that are still unclustered are claimed.
        """

        name = self.name
        ctr = self.center
        ctr.assign_cluster(name)

        # The generator is evaluated lazily, so the cluster check sees the
        # assignments made earlier in this same loop.
        for e in (n for n in ctr.neighbors if not n.cluster):
            self.members.append(e)
            e.assign_cluster(name)

    def add_member(self, element):
        """
        Adds one single element to the cluster.
        """
        self.members.append(element)
        element.assign_cluster(self.name)


def read_matrix(path, cutoff, strictness):
    """
    Reads in a four column matrix (1 2 0.123 0.456\n)
    and creates an dictionary of Elements keyed by model index.

    The strictness factor is a value that multiplies the cutoff
    to produce a new cutoff for the second half of the matrix. Used to
    allow some variability while keeping very small interfaces from clustering
    with anything remotely similar.

    Blank lines are ignored. A line that does not have exactly four
    columns raises ValueError; an unreadable file raises IOError.
    """

    cutoff = float(cutoff)
    partner_cutoff = cutoff * float(strictness)

    elements = {}

    with open(path, 'r') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                continue

            ref, mobi, dRM, dMR = fields
            ref = int(ref)
            mobi = int(mobi)
            dRM = float(dRM)
            dMR = float(dMR)

            # Create or Retrieve Elements
            r = elements.get(ref)
            if r is None:
                r = elements[ref] = Element(ref)

            m = elements.get(mobi)
            if m is None:
                m = elements[mobi] = Element(mobi)

            # Assign neighbors (asymmetric: each direction is tested alone)
            if dRM >= cutoff and dMR >= partner_cutoff:
                r.add_neighbor(m)
            if dMR >= cutoff and dRM >= partner_cutoff:
                m.add_neighbor(r)

    return elements


def remove_true_singletons(element_pool):
    """ Removes from the pool elements without any neighbor

    Returns (set of removed names, the same pool object).
    """

    ep = element_pool

    ts = set(e for e in ep if not ep[e].neighbors)

    # Remove ts from everybody's neighbor list
    ts_e = set(ep[e] for e in ts)
    for e in ep:
        ep[e].neighbors = ep[e].neighbors.difference(ts_e)

    # Remove ts from pool
    for e in ts:
        del ep[e]

    return (ts, ep)


def cluster_elements(element_pool, threshold):
    """
    Groups Elements within a given threshold
    together in the same cluster.

    threshold is the minimum cluster size, center included. Clustering
    stops once the best remaining center has fewer than threshold-1
    unclustered neighbors.

    Returns (element_pool, list of Cluster objects).
    """

    clusters = []
    threshold -= 1  # Account for center
    ep = element_pool
    cn = 1  # Cluster Number
    while True:
        # Clusterable elements
        candidates = [e for e in ep if not ep[e].cluster]
        if not candidates:  # No more elements to cluster
            break

        # Select Cluster Center: the element with the most unclustered
        # neighbors; ties go to the largest name, as a (count, name) sort would.
        free_count, ctr = max(
            (sum(1 for nb in ep[e].neighbors if not nb.cluster), e)
            for e in candidates)

        # Cluster until length of remaining elements lists are above threshold
        if free_count < threshold:
            break

        # Create Cluster
        clusters.append(Cluster(cn, ep[ctr]))
        cn += 1

    return (ep, clusters)


def output_clusters(handle, clusters):
    """Outputs the cluster name, center, and members."""

    write = handle.write

    for c in clusters:
        write("Cluster %s -> %s " % (c.name, c.center.name))
        for m in sorted(c.members, key=lambda k: k.name):
            write("%s " % m.name)
        write("\n")


if __name__ == "__main__":

    import optparse
    import sys
    from time import time, ctime
    import os

    USAGE = "%s <matrix file> <threshold [float]> [options]" % os.path.basename(sys.argv[0])

    parser = optparse.OptionParser(usage=USAGE)
    parser.add_option('-o', '--output', dest="output_handle", action='store', type='string',
                      default=sys.stdout,
                      help='Output File [STDOUT]')
    parser.add_option('-c', '--cluster-size', dest="clus_size", action="store", type="int",
                      default=4,
                      help="Minimum number of elements in a cluster [4]")
    parser.add_option('-s', '--strictness', dest="strictness", action="store", type='float',
                      default=0.75,
                      help="Multiplier for cutoff for M->R inclusion threshold. [0.75 or effective cutoff of 0.5625]")

    (options, args) = parser.parse_args()

    if sys.version_info[0:2] < (2, 6):
        cur_version = "%s.%s" % sys.version_info[0:2]
        sys.stderr.write("- Python version not supported (%s). Please use 2.6 or newer.\n" % cur_version)
        sys.exit(1)
    if len(args) != 2:
        sys.stderr.write("- Invalid number of arguments: %i\n" % len(args))
        sys.stderr.write("USAGE: %s\n" % USAGE)
        sys.exit(1)

    fmatrix, cutoff = args
    cutoff = float(cutoff)

    # Read Matrix
    sys.stderr.write("+ BEGIN: %s\n" % ctime())
    t_init = time()

    try:
        pool = read_matrix(fmatrix, cutoff, options.strictness)
    except IOError:
        sys.stderr.write("File not found: %s\n" % fmatrix)
        sys.exit(1)

    sys.stderr.write("+ Read %ix%i distance matrix in %i seconds\n" % (len(pool), len(pool), int(time() - t_init)))

    # Cluster
    element_pool, clusters = cluster_elements(pool, options.clus_size)

    # Output Clusters
    o = options.output_handle
    to_file = isinstance(o, str)
    o_handle = open(o, 'w') if to_file else o

    sys.stderr.write("+ Writing %i Clusters\n" % len(clusters))
    try:
        output_clusters(o_handle, clusters)
    finally:
        if to_file:
            o_handle.close()

    total_elements = len(element_pool)
    clustered = sum(len(c) for c in clusters)
    # Calculate coverage (an empty matrix has no elements to cover)
    if total_elements:
        clust_coverage = clustered * 100 / float(total_elements)
    else:
        clust_coverage = 0.0
    sys.stderr.write("+ Coverage %3.2f%% (%i/%i)\n" % (clust_coverage, clustered, total_elements))
    t_elapsed = time() - t_init
    sys.stderr.write("+ END: %s [%3.2f seconds]\n" % (ctime(), t_elapsed))
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | """ 5 | Script to calculate contact lists on PDB files. 6 | Requires external executable to calculate the contacts! 7 | 8 | Authors: 9 | RODRIGUES Joao 10 | TRELLET Mikael 11 | MELQUIOND Adrien 12 | """ 13 | 14 | from multiprocessing import Process 15 | from subprocess import Popen, PIPE 16 | 17 | def _calculate_contacts(executable, pdbfile, d_cutoff, filter_selection=None, extension='.contacts'): 18 | """ 19 | Outputs a list of contacts based on vector analysis 20 | of the PDB file. 21 | 22 | Arguments: 23 | executable - path to contact calculation program 24 | pdbfile - path to PDB-formatted file (.pdb extension) 25 | d_cutoff - minimal distance in A to consider a contact (float) 26 | filter_selection - list of identifiers to filter contacts (list of strings) 27 | """ 28 | 29 | pdbname = os.path.basename(pdbfile)[:-4] 30 | 31 | p = Popen([executable, pdbfile, d_cutoff], stdout=PIPE) 32 | p_output = p.communicate()[0] 33 | contacts = sorted(list(set([l for l in p_output.split('\n')][:-1]))) 34 | 35 | # Filter contacts 36 | if filter_selection: 37 | contacts = filter(lambda x: x[5] in filter_selection and x[-1] in filter_selection, contacts) 38 | # 39 | 40 | outfile = os.path.join(os.path.dirname(pdbfile), "%s%s" %(pdbname, extension)) 41 | with open(outfile, 'w') as o: 42 | o.write('\n'.join(contacts)) 43 | 44 | return 0 45 | 46 | if __name__ == '__main__': 47 | 48 | import optparse 49 | import os, sys 50 | 51 | USAGE = "%s [-f structures.txt] [-n 4] [-c 5.0] file1.pdb file2.pdb" 52 | 53 | parser = optparse.OptionParser(usage=USAGE) 54 | parser.add_option('-c', '--cutoff', dest="d_cutoff", action='store', type='string', 55 | default="5.0", 56 | help='Distance cutoff to evaluate contacts. 
[default: 5.0A]') 57 | parser.add_option('-f', '--file', dest="input_file", action='store', type='string', 58 | help='Input file (one file path per line)') 59 | parser.add_option('-n', '--nproc', dest="nproc", action='store', type='string', 60 | default=1, 61 | help='Number of simultaneous processes to launch in each round. [default: 1]') 62 | parser.add_option('-e', '--exec', dest="executable", action='store', type='string', 63 | default='%s/../src/contact_fcc' %os.path.dirname(sys.argv[0]), 64 | help='Path to the executable C++ program to calculate contacts [default: ../fcc/src/contact_fcc]') 65 | parser.add_option('-s', '--selection', dest="selection", action='store', type='string', 66 | default=None, 67 | help='Filter contacts based on their segids. [Default: No filtering. All chains] [Example: A,C]') 68 | 69 | (options, args) = parser.parse_args() 70 | 71 | if options.input_file: 72 | args = [name.strip() for name in open(options.input_file) if name.strip()] 73 | 74 | if not args: 75 | print "No files provided. 
Exiting" 76 | print USAGE 77 | sys.exit(1) 78 | 79 | # Convert to full paths 80 | args = map(os.path.abspath, args) 81 | 82 | nproc = int(options.nproc) 83 | cutoff = options.d_cutoff 84 | 85 | executable = options.executable 86 | if not os.path.exists(executable): 87 | print "Path not found: %s" %os.path.abspath(executable) 88 | sys.exit(1) 89 | executable = os.path.abspath(executable) 90 | 91 | if options.selection: 92 | filter_selection = set(options.selection.split(',')) 93 | representative = open(args[0]) 94 | repr_chains = dict([(j,str(i)) for i,j in enumerate(sorted(set([l[72] for l in representative if l.startswith('ATOM')])), start=1)]) 95 | filter_selection = map(repr_chains.get, filter_selection) 96 | representative.close() 97 | oextension = '.contacts-'+''.join(options.selection.split(',')) 98 | else: 99 | filter_selection = None 100 | oextension = '.contacts' 101 | 102 | queue = [] 103 | 104 | while 1: 105 | 106 | arg = args.pop() 107 | # Create Process for arg 108 | p = Process(target=_calculate_contacts, args=(executable, arg, cutoff, filter_selection, oextension)) 109 | queue.append(p) 110 | 111 | if (len(queue) == nproc) or (not args and len(queue)): 112 | 113 | for job in queue: 114 | job.start() 115 | for job in queue: # Waiting for job to finish 116 | job.join() 117 | queue = [] 118 | 119 | if not args and not queue: 120 | break 121 | 122 | print "Finished" 123 | -------------------------------------------------------------------------------- /scripts/pdb_chainxseg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Exchanges segment ID for chain ID in a PDB file. 5 | 6 | usage: python pdb_chainxseg.py 7 | example: python pdb_chainxseg.py 1CTF.pdb 8 | 9 | This program is part of the PDB tools distributed with HADDOCK 10 | or with the HADDOCK tutorial. 
#!/usr/bin/env python

"""
Exchanges segment ID for chain ID in a PDB file.

usage: python pdb_chainxseg.py <pdb file>
example: python pdb_chainxseg.py 1CTF.pdb

This program is part of the PDB tools distributed with HADDOCK
or with the HADDOCK tutorial. The utilities in this package
can be used to quickly manipulate PDB files, with the benefit
of 'piping' several different commands. This is a rewrite of old
FORTRAN77 code that was taking too much effort to compile. RIP.
"""

import os
import re
import sys

__author__ = "Joao Rodrigues"

USAGE = "usage: " + sys.argv[0] + " <pdb file>\n"


def check_input(args):
    """Checks whether to read from stdin/file and validates user input/options.

    Returns an open, readable file handle: stdin when no argument is given
    and input is piped, otherwise the file named by the single argument.
    Exits with status 1 (after printing usage) on any other input.
    """

    if not args:
        # Read from pipe
        if sys.stdin.isatty():
            sys.stderr.write(USAGE)
            sys.exit(1)
        return sys.stdin

    if len(args) == 1:
        # Read from file
        if not os.path.exists(args[0]):
            sys.stderr.write('File not found: ' + args[0] + '\n')
            sys.stderr.write(USAGE)
            sys.exit(1)
        return open(args[0], 'r')

    sys.stderr.write(USAGE)
    sys.exit(1)


def _swap_chainxseg(fhandle):
    """Yield the lines of fhandle with the chain ID copied into the segID field.

    For ATOM/HETATM records the character at index 21 (chain ID) is written,
    left-justified, to indexes 72-75 (segID). Records shorter than the segID
    field are padded with blanks first, so the segID always lands in its
    columns and the line terminator stays at the end of the record. All
    other lines are passed through unchanged.
    """

    coord_re = re.compile('^(ATOM|HETATM)')

    for line in fhandle:
        if not coord_re.match(line):
            yield line
            continue

        # Separate the record from its terminator before padding; otherwise
        # a short line would get the segID appended after the newline.
        record = line.rstrip('\r\n')
        eol = line[len(record):]
        record = record.ljust(76)
        yield record[:72] + record[21].ljust(4) + record[76:] + eol


if __name__ == '__main__':
    # Check Input
    pdbfh = check_input(sys.argv[1:])

    # Do the job
    new_pdb = _swap_chainxseg(pdbfh)

    try:
        sys.stdout.write(''.join(new_pdb))
        sys.stdout.flush()
    except IOError:
        # This is here to catch Broken Pipes
        # for example to use 'head' or 'tail' without
        # the error message showing up
        pass
    finally:
        # We can close it even if it is sys.stdin
        pdbfh.close()

    sys.exit(0)
#!/usr/bin/env python

"""
Outputs the names of the cluster members based on a clustering output file
and the original file listing the PDB files

Authors:
           RODRIGUES Joao
"""

import os
import sys

USAGE = "python %s " %os.path.basename(sys.argv[0])


def read_clusters(path):
    """
    Reads clusters from a FCC output file.

    Each non-blank line looks like 'Cluster 8 -> 193 141 142'; everything
    after the arrow (center first, then members) is returned as a list of
    integers, one list per cluster.
    """

    clusters = []
    with open(path, 'r') as cl_file:
        for line in cl_file:
            fields = line.split()
            if not fields:
                continue
            # Cluster 8 -> 193 141 142 144 151 168 171 172 178
            clusters.append([int(model) for model in fields[3:]])

    return clusters


def read_list(path):
    """
    Reads a list containing one file per line.
    Returns an index of entry number - entry name (extension removed).

    Entries are numbered from 1 counting non-blank lines only, which is how
    the matrix calculation numbers the structures it reads from a list.
    """

    fdata = {}
    with open(path, 'r') as fhandle:
        index = 0
        for line in fhandle:
            name = line.strip()
            if not name:
                continue
            index += 1
            # Remove extension (names without one are kept whole)
            fdata[index] = os.path.splitext(name)[0]

    return fdata


def cross_data(clusters, flist):
    """
    Matches names in flist to the numbers in clusters.

    Raises KeyError when a cluster refers to an index missing from flist.
    """

    return [[flist[s] for s in cl] for cl in clusters]
49 | """ 50 | 51 | named_clusters = [] 52 | for cl in clusters: 53 | ncl = [flist[s] for s in cl] 54 | named_clusters.append(ncl) 55 | 56 | return named_clusters 57 | 58 | if __name__ == '__main__': 59 | 60 | if len(sys.argv[1:]) != 2: 61 | print USAGE 62 | sys.exit(1) 63 | 64 | 65 | cluster_file = os.path.abspath(sys.argv[1]) 66 | pdblist_file = os.path.abspath(sys.argv[2]) 67 | 68 | try: 69 | cl_list = read_clusters(cluster_file) 70 | except IOError: 71 | sys.stderr.write('Error: file not found (%s)\nAborting..\n' %cluster_file) 72 | sys.exit(1) 73 | 74 | try: 75 | pdb_list = read_list(pdblist_file) 76 | except IOError: 77 | sys.stderr.write('Error: file not found (%s)\nAborting..\n' %pdblist_file) 78 | sys.exit(1) 79 | 80 | named_clusters = cross_data(cl_list, pdb_list) 81 | 82 | # Output 83 | for i, nc in enumerate(named_clusters): 84 | print "Cluster %i -> %s" %(i+1, ' '.join(nc)) 85 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # # 3 | # FCC Clustering # 4 | # Makefile # 5 | # # 6 | ##################################### 7 | 8 | CPP=g++ 9 | CPFLAGS=-O2 10 | 11 | EXEC=contact_fcc contact_fcc_lig 12 | 13 | all: 14 | make $(EXEC) 15 | 16 | contact_fcc: contact_fcc.cpp 17 | $(CPP) $(CPFLAGS) -o contact_fcc contact_fcc.cpp 18 | 19 | contact_fcc_lig: contact_fcc_lig.cpp 20 | $(CPP) $(CPFLAGS) -o contact_fcc_lig contact_fcc_lig.cpp 21 | 22 | contact_fcc_intra: contact_fcc_intra.cpp 23 | $(CPP) $(CPFLAGS) -o contact_fcc_intra contact_fcc_intra.cpp 24 | 25 | clean : 26 | /bin/rm $(EXEC) 27 | 28 | -------------------------------------------------------------------------------- /src/contact_fcc.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 
// contact_fcc: prints inter-segment residue contacts of a PDB file.
//
// Usage: contact <pdb file> <distance cutoff>
//
// For every pair of heavy atoms that belong to different segments and lie
// closer than the cutoff, one line "<res1><seg1><res2><seg2>" is printed,
// where res is the residue number + 10000 and seg is a running segment
// index (1, 2, ...) that increases each time the segID character changes.
#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <set>
#include <string>
#include <vector>

using namespace std;

struct Coor3f {
  float x;
  float y;
  float z;
};

struct Residue {
  int nr;                 // residue number + 10000
  vector<Coor3f> coor;    // heavy-atom coordinates
  vector<string> atom;    // heavy-atom names, parallel to coor
  int seg;                // running segment index
};

vector<Residue> res;

// Orders residues by segment index only.
bool seg_sorter(const Residue &res_a, const Residue &res_b) {
  return res_a.seg < res_b.seg;
}

int main(int argc, char *argv[]) {
  char buf[2000];

  if (argc < 3) {
    fprintf(stderr, "ERROR: Too few arguments\n");
    fprintf(stderr, "Usage: contact <pdb file> <distance cutoff>\n");
    return 1;
  }

  char *filename = argv[1];
  float cutoff = atof(argv[2]);

  if (cutoff < 0 || cutoff > 100) {
    fprintf(stderr, "ERROR: Cutoff out of range\n");
    fprintf(stderr, "Usage: contact <pdb file> <distance cutoff>\n");
    return 1;
  }

  FILE *fil = fopen(filename, "r");
  if (fil == NULL) {
    fprintf(stderr, "ERROR: PDB file %s does not exist\n", filename);
    return 1;
  }

  int currnr = -99999;
  char currseg = '\0';  // no segment seen yet
  int segid = 0;

  while (fgets(buf, sizeof(buf), fil)) {
    char code[10] = "";
    char atom[5] = "";
    // Field widths keep the record name and atom name inside their buffers.
    if (sscanf(buf, "%9s %*d %4s", code, atom) != 2) continue;

    // Ignore HETATM and hydrogens
    if (strncmp(code, "ATOM", 4) != 0) continue;
    if (atom[0] == 'H' || (isdigit((unsigned char)atom[0]) && atom[1] == 'H')) continue;

    // Skip records too short to hold coordinates rather than reading
    // stale buffer contents past the end of the line.
    size_t len = strlen(buf);
    if (len <= 46) continue;

    int nr = atoi(buf + 22);
    char seg = (len > 72) ? buf[72] : ' ';
    if (seg == '\n' || seg == '\r') seg = ' ';
    if (seg != currseg) {
      currseg = seg;
      segid++;
    }

    // Start a new residue when the (shifted) residue number or the segment
    // changes. Comparing against the shifted number groups the atoms of one
    // residue together; the printed contacts are the same either way.
    if (res.empty() || nr + 10000 != currnr || res.back().seg != segid) {
      Residue r;
      r.nr = nr + 10000;
      r.seg = segid;
      res.push_back(r);
      currnr = r.nr;
    }

    Residue &rcurr = res.back();
    Coor3f ccoor;
    ccoor.x = atof(buf + 27);
    ccoor.y = atof(buf + 38);
    ccoor.z = atof(buf + 46);
    rcurr.coor.push_back(ccoor);
    rcurr.atom.push_back(string(atom));
  }
  fclose(fil);

  if (res.empty()) {
    fprintf(stderr, "ERROR: PDB file %s contains no residues\n", filename);
    return 1;
  }

  // Sort the residues by segment to avoid random chain ordering problems.
  // A stable sort keeps the file order inside each segment.
  stable_sort(res.begin(), res.end(), seg_sorter);

  double cutoffsq = cutoff * cutoff;

  for (size_t n = 0; n < res.size(); n++) {
    const vector<Coor3f> &c1 = res[n].coor;
    int seg1 = res[n].seg;
    for (size_t nn = n + 1; nn < res.size(); nn++) {
      int seg2 = res[nn].seg;
      if (seg1 == seg2) continue;
      const vector<Coor3f> &c2 = res[nn].coor;
      for (size_t i = 0; i < c1.size(); i++) {
        for (size_t ii = 0; ii < c2.size(); ii++) {
          double dx = c1[i].x - c2[ii].x;
          double dy = c1[i].y - c2[ii].y;
          double dz = c1[i].z - c2[ii].z;
          double currdissq = dx * dx + dy * dy + dz * dz;
          if (currdissq < cutoffsq) {
            printf("%d%d%d%d\n", res[n].nr, res[n].seg, res[nn].nr, res[nn].seg);
          }
        }
      }
    }
  }
  return 0;
}
// contact_fcc_intra: prints intra-segment contacts of a PDB file.
//
// Usage: contact <pdb file> <distance cutoff>
//
// For every pair of heavy atoms that belong to the same segment and lie
// closer than the cutoff, one line "<res1><seg1><res2><seg2>" is printed,
// where res is the residue number + 10000 and seg is a running segment
// index (1, 2, ...) that increases each time the segID character changes.
#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <set>
#include <string>
#include <vector>

using namespace std;

struct Coor3f {
  float x;
  float y;
  float z;
};

struct Residue {
  int nr;                 // residue number + 10000
  vector<Coor3f> coor;    // heavy-atom coordinates
  vector<string> atom;    // heavy-atom names, parallel to coor
  int seg;                // running segment index
};

vector<Residue> res;

int main(int argc, char *argv[]) {
  char buf[2000];

  if (argc < 3) {
    fprintf(stderr, "ERROR: Too few arguments\n");
    fprintf(stderr, "Usage: contact <pdb file> <distance cutoff>\n");
    return 1;
  }

  char *filename = argv[1];
  float cutoff = atof(argv[2]);

  if (cutoff < 0 || cutoff > 100) {
    fprintf(stderr, "ERROR: Cutoff out of range\n");
    fprintf(stderr, "Usage: contact <pdb file> <distance cutoff>\n");
    return 1;
  }

  FILE *fil = fopen(filename, "r");
  if (fil == NULL) {
    fprintf(stderr, "ERROR: PDB file %s does not exist\n", filename);
    return 1;
  }

  int currnr = -99999;
  char currseg = '\0';  // no segment seen yet
  int segid = 0;

  while (fgets(buf, sizeof(buf), fil)) {
    char code[10] = "";
    char atom[5] = "";
    // Field widths keep the record name and atom name inside their buffers.
    if (sscanf(buf, "%9s %*d %4s", code, atom) != 2) continue;

    // Ignore HETATM and hydrogens
    if (strncmp(code, "ATOM", 4) != 0) continue;
    if (atom[0] == 'H' || (isdigit((unsigned char)atom[0]) && atom[1] == 'H')) continue;

    // Skip records too short to hold coordinates rather than reading
    // stale buffer contents past the end of the line.
    size_t len = strlen(buf);
    if (len <= 46) continue;

    int nr = atoi(buf + 22);
    char seg = (len > 72) ? buf[72] : ' ';
    if (seg == '\n' || seg == '\r') seg = ' ';
    if (seg != currseg) {
      currseg = seg;
      segid++;
    }

    // NOTE(review): currnr stores the shifted number (nr + 10000) while the
    // test uses the raw nr, so in practice every heavy atom opens its own
    // entry. This is kept on purpose: grouping atoms per residue would drop
    // the contacts between atoms of the same residue from the output.
    if (nr != currnr) {
      Residue r;
      r.nr = nr + 10000;
      r.seg = segid;
      res.push_back(r);
      currnr = r.nr;
    }
    // A record whose raw number equals the stored shifted number would
    // reach this point with no entry yet; skip it instead of indexing an
    // empty vector.
    if (res.empty()) continue;

    Residue &rcurr = res.back();
    Coor3f ccoor;
    ccoor.x = atof(buf + 27);
    ccoor.y = atof(buf + 38);
    ccoor.z = atof(buf + 46);
    rcurr.coor.push_back(ccoor);
    rcurr.atom.push_back(string(atom));
  }
  fclose(fil);

  if (res.empty()) {
    fprintf(stderr, "ERROR: PDB file %s contains no residues\n", filename);
    return 1;
  }

  double cutoffsq = cutoff * cutoff;

  for (size_t n = 0; n < res.size(); n++) {
    const vector<Coor3f> &c1 = res[n].coor;
    int seg1 = res[n].seg;
    for (size_t nn = n + 1; nn < res.size(); nn++) {
      int seg2 = res[nn].seg;
      if (seg1 != seg2) continue;  // only contacts inside one segment
      const vector<Coor3f> &c2 = res[nn].coor;
      for (size_t i = 0; i < c1.size(); i++) {
        for (size_t ii = 0; ii < c2.size(); ii++) {
          double dx = c1[i].x - c2[ii].x;
          double dy = c1[i].y - c2[ii].y;
          double dz = c1[i].z - c2[ii].z;
          double currdissq = dx * dx + dy * dy + dz * dz;
          if (currdissq < cutoffsq) {
            printf("%d%d%d%d\n", res[n].nr, res[n].seg, res[nn].nr, res[nn].seg);
          }
        }
      }
    }
  }
  return 0;
}
#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <set>
#include <string>
#include <vector>

using namespace std;

// Cartesian coordinates of one atom.
struct Coor3f {
  float x;
  float y;
  float z;
};

// One entry of the contact search.
struct Residue {
  int nr;               // residue number read from the file, offset by 10000
  vector<Coor3f> coor;  // coordinates of the atoms stored in this entry
  vector<string> atom;  // names of those atoms, parallel to coor
  int seg;              // running count of segment-identifier changes seen so far (first segment is 1)
};

// All entries read from the input file; filled by read_residues().
vector<Residue> res;

// Ordering used to group entries by segment index.
// Takes references so that no Residue (and its vectors) is copied per comparison.
bool seg_sorter (const Residue &res_a, const Residue &res_b) {
  return res_a.seg < res_b.seg;
}

// Every input line is padded with blanks up to this width, so that the
// fixed-column reads below (residue number, coordinates, segment identifier)
// never touch bytes that do not belong to the current line.
static const size_t PDB_LINE_WIDTH = 80;

// Returns true when an atom name looks like a hydrogen: "H..." or "<digit>H...".
static bool is_hydrogen (const char *name) {
  return name[0] == 'H' ||
         (isdigit(static_cast<unsigned char>(name[0])) && name[1] == 'H');
}

// Reads every non-hydrogen ATOM record from fil and appends it to the global res.
static void read_residues (FILE *fil) {
  char buf[2000];
  int currnr = -99999;
  // '\0' can never equal a column of a blank-padded line, so the first ATOM
  // record always opens segment 1.
  char currseg = '\0';
  int segid = 0;

  while (fgets(buf, sizeof buf, fil)) {
    // Drop the line terminator and blank-pad short lines.
    size_t len = strcspn(buf, "\r\n");
    if (len < PDB_LINE_WIDTH) {
      memset(buf + len, ' ', PDB_LINE_WIDTH - len);
      len = PDB_LINE_WIDTH;
    }
    buf[len] = '\0';

    // Both buffers start empty so a line that fails to parse is never read
    // uninitialised; the field widths keep long tokens inside the buffers.
    char code[10] = "";
    char atom[5] = "";
    sscanf(buf, "%9s %*d %4s", code, atom);

    // Ignore HETATM and hydrogens
    if (strncmp(code, "ATOM", 4) != 0 || is_hydrogen(atom)) continue;

    int nr = atoi(buf + 22);
    char seg = buf[72];
    if (seg != currseg) {
      currseg = seg;
      segid++;
    }
    if (res.empty() || nr != currnr) {
      Residue r;
      r.nr = nr + 10000;
      r.seg = segid;
      res.push_back(r);
      // NOTE(review): currnr keeps the offset number while the test above uses
      // the raw one, so nearly every atom opens its own entry. Left unchanged
      // because the index printed by print_contacts() depends on it; confirm
      // the intent before changing.
      currnr = r.nr;
    }

    Residue &rcurr = res.back();
    Coor3f ccoor;
    ccoor.x = atof(buf + 27);
    ccoor.y = atof(buf + 38);
    ccoor.z = atof(buf + 46);
    rcurr.coor.push_back(ccoor);
    rcurr.atom.push_back(string(atom));
  }
}

// Prints one line per atom pair closer than cutoff whose entries belong to
// different segments. Each line is the concatenation of: number and segment
// of the first entry, index of the second entry in res, segment of the second.
static void print_contacts (float cutoff) {
  // Squared distances are compared, so no square root is needed.
  double cutoffsq = cutoff * cutoff;

  for (size_t n = 0; n < res.size(); n++) {
    const vector<Coor3f> &c1 = res[n].coor;
    for (size_t nn = n + 1; nn < res.size(); nn++) {
      if (res[n].seg == res[nn].seg) continue;
      const vector<Coor3f> &c2 = res[nn].coor;
      for (size_t i = 0; i < c1.size(); i++) {
        for (size_t ii = 0; ii < c2.size(); ii++) {
          float dx = c1[i].x - c2[ii].x;
          float dy = c1[i].y - c2[ii].y;
          float dz = c1[i].z - c2[ii].z;
          double currdissq = dx * dx + dy * dy + dz * dz;
          if (currdissq < cutoffsq) {
            // NOTE(review): the third field is the position nn in res, not
            // res[nn].nr as in the sibling programs; kept as found.
            printf ("%d%d%d%d\n", res[n].nr, res[n].seg, static_cast<int>(nn), res[nn].seg);
          }
        }
      }
    }
  }
}

// Usage: contact <pdb file> <distance cutoff>
// Returns 1 on too few arguments, a cutoff outside 0..100, an unreadable
// file or a file without usable ATOM records; 0 otherwise.
int main(int argc, char *argv[]) {
  if (argc < 3) {
    fprintf(stderr, "ERROR: Too few arguments\n");
    fprintf(stderr, "Usage: contact <pdb file> <distance cutoff>\n");
    return 1;
  }

  char *filename = argv[1];
  float cutoff = atof(argv[2]);

  // Written as a negated range test so that NaN is rejected as well.
  if (!(cutoff >= 0 && cutoff <= 100)) {
    fprintf(stderr, "ERROR: Cutoff out of range\n");
    fprintf(stderr, "Usage: contact <pdb file> <distance cutoff>\n");
    return 1;
  }

  FILE *fil = fopen(filename, "r");
  if (fil == NULL) {
    fprintf(stderr, "ERROR: PDB file %s does not exist\n", filename);
    return 1;
  }
  read_residues(fil);
  // The file is not needed past this point; closing here also covers the
  // early return below.
  fclose(fil);

  if (res.empty()) {
    fprintf(stderr, "ERROR: PDB file %s contains no residues\n", filename);
    return 1;
  }

  // Sort the residues by segment to avoid random chain ordering problems
  sort (res.begin(), res.end(), seg_sorter);

  print_contacts(cutoff);
  return 0;
}