├── README.md
├── .gitignore
├── SigNetNode.py
├── InSilico DREAM8.py
├── DREAM8Exp.py
├── DREAM8-CellLine-specific.py
├── NamedMatrix.py
├── SteinerTree.py
├── LICENSE
└── PyGibbCAMP.py
/README.md:
--------------------------------------------------------------------------------
1 | PyGibbCAMP
2 | ==========
3 | 
4 | A Python implementation of causal inference of pathways using a Gibbs sampling approach
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 | 
3 | # C extensions
4 | *.so
5 | 
6 | # Packages
7 | *.egg
8 | *.egg-info
9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | 
21 | # Installer logs
22 | pip-log.txt
23 | 
24 | # Unit test / coverage reports
25 | .coverage
26 | .tox
27 | nosetests.xml
28 | 
29 | # Translations
30 | *.mo
31 | 
32 | # Mr Developer
33 | .mr.developer.cfg
34 | .project
35 | .pydevproject
36 | 
--------------------------------------------------------------------------------
/SigNetNode.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | class SigNetNode:
4 |     """An object representation of a node in a signaling network"""
5 | 
6 |     ## constructor
7 |     # @param name  String name of the node
8 |     # @param nodeType  String representation of the type of the node. Possible
9 |     #        values: 'PERTURBATION', 'ACTIVATIONSTATE', 'PHOSPHORYLATIONSTATE', 'FLUORESCENCE'
10 |     # @param bMeasured  Boolean indicating whether the node is directly measured
11 |     def __init__(self, name, nodeType, bMeasured):
12 |         self.name = name
13 |         self.type = nodeType
14 |         self.bMeasured = bMeasured
--------------------------------------------------------------------------------
/InSilico DREAM8.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Sep 4 11:33:32 2013
4 | 
5 | @author: xinghualu
6 | """
7 | 
8 | 
9 | from PyGibbCAMP import PyGibbCAMP
10 | 
11 | import numpy as np
12 | 
13 | dataDir = 'ProcessedData/insilico/'
14 | 
15 | nodeFile = "ProcessedData/insilico/name.matching.csv"
16 | dataMatrix = dataDir + "data.matrix.insilico.csv"
17 | perturbMatrix = dataDir + "perturbation.table.insilico.csv"
18 | missDataMatrix = None
19 | net = PyGibbCAMP(nodeFile = nodeFile, dataMatrixFile = dataMatrix, perturbMatrix = perturbMatrix, missingDataMatrix= missDataMatrix)
20 | 
21 | nParents = 3
22 | for alpha in np.arange(0.1, 1, 0.1, dtype=float):
23 |     for i in range(5):
24 |         pickleFileName = dataMatrix + ".chain" + str(i) + ".nParents" + str(nParents) + ".alpha-" + str(alpha) + ".pickle"
25 |         net.trainGibbsEM(pickleDumpFile = pickleFileName, nChains = 1, alpha=alpha, maxIter = 500, nParents = nParents)
26 | 
--------------------------------------------------------------------------------
/DREAM8Exp.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Sep 4 11:33:32 2013
4 | 
5 | @author: xinghualu
6 | """
7 | import cPickle
8 | import numpy as np
9 | from PyGibbCAMP import PyGibbCAMP
10 | 
11 | dataDir = 'ProcessedData/'
12 | 
13 | nodeFile = dataDir + "name.matching.csv"
14 | dataMatrix = dataDir + "data.matrix.normalized.csv"
15 | missDataMatrix = dataDir + "missDataMatrix.csv"
16 | perturbMatrix = dataDir + "perturbation.table.csv"
17 | 
18 | nParents = 2
19 | for alpha in np.arange(1., .5, -0.1, dtype=float):
20 |     for i in range(5):
21 |         pickleFileName = dataMatrix + ".chain" + str(i) + ".nParents" + str(nParents) + ".alpha-" + str(alpha) + ".pickle"
22 |         model = PyGibbCAMP(nodeFile = nodeFile, dataMatrixFile = dataMatrix, perturbMatrix = perturbMatrix, missingDataMatrix= missDataMatrix)
23 |         model.trainGibbsEM(pickleDumpFile = pickleFileName, nParents = nParents, nChains = 1, alpha= alpha, maxIter = 500)
24 | 
25 | #cPickle.dump(model, open("final-model-09-06-13-alpha.05.nParent.4.pickle", 'wb'))
--------------------------------------------------------------------------------
/DREAM8-CellLine-specific.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Sep 4 11:33:32 2013
4 | 
5 | @author: xinghualu
6 | """
7 | import cPickle
8 | 
9 | from PyGibbCAMP import PyGibbCAMP
10 | import os
11 | import numpy as np
12 | 
13 | dataDir = 'ProcessedData/cellline.specific.tables/UACC812/'
14 | nodeFile = "ProcessedData/name.matching.csv"
15 | 
16 | 
17 | dataMatrix = dataDir + "data.matrix.csv"
18 | perturbMatrix = dataDir + "perturbation.table.csv"
19 | if os.path.exists(dataDir + "missDataMatrix.csv"):
20 |     missDataMatrix = dataDir + "missDataMatrix.csv"
21 | else:
22 |     missDataMatrix = None
23 | 
24 | nParents = 2
25 | for alpha in np.arange(1., .4, -0.1, dtype=float):
26 |     for i in range(5):
27 |         pickleFileName = dataMatrix + ".chain" + str(i) + ".nParents" + str(nParents) + ".alpha-" + str(alpha) + ".pickle"
28 |         net = PyGibbCAMP(nodeFile = nodeFile, dataMatrixFile = dataMatrix, perturbMatrix = perturbMatrix, missingDataMatrix= missDataMatrix)
29 |         net.trainGibbsEM(pickleDumpFile = pickleFileName, nParents = nParents, nChains = 1, alpha=alpha, maxIter = 500)
30 | 
--------------------------------------------------------------------------------
/NamedMatrix.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | A wrapper class enabling access to data matrix elements by column and row names
4 | 
5 | Created on Sun Aug 25 08:40:33 2013
6 | 
7 | 
8 | 
9 | @author: xinghualu
10 | """
11 | 
12 | import numpy as np
13 | from StringIO import StringIO
14 | 
15 | class NamedMatrix:
16 |     ## Constructor
17 |     # @param filename=None  A string path to a text matrix file
18 |     # @param delimiter=','  A string indicating the delimiter separating fields in the text file
19 |     # @param npMatrix=None  A reference to a numpy matrix
20 |     # @param colnames  A string array of column names
21 |     # @param rownames  A string array of row names
22 | 
23 |     def __init__(self, filename = None, delimiter = ',', npMatrix = None, colnames = None, rownames = None):
24 | 
25 |         if filename and npMatrix is not None:
26 |             raise Exception ("Cannot create a NamedMatrix with both 'npMatrix' and 'filename' arguments set")
27 |         if not filename and npMatrix is None:
28 |             raise Exception ("Attempt to create a NamedMatrix without 'filename' or an 'npMatrix'")
29 | 
30 |         if filename:
31 |             print "Extracting matrix file " + filename
32 |             try:
33 |                 f = open(filename, 'r')
34 |                 lines = f.readlines()
35 |             except IOError:
36 |                 print "Failed to read file " + filename
37 |                 return
38 | 
39 |             if len(lines) == 1:  # Mac version csv, with "\r" as the line break
40 |                 lines = lines[0].split("\r")
41 |                 self.colnames = lines.pop(0).rstrip().split(',')  # split header and extract colnames
42 |                 lines = map(lambda x: x.rstrip(), lines)  # remove the trailing "\r"
43 |                 lines = "\n".join(lines)  # use "\n" to join lines
44 |             else:
45 |                 self.colnames = lines.pop(0).rstrip().split(',')
46 |                 lines = "".join(lines)
47 | 
48 |             self.colnames.pop(0)
49 | 
50 |             # extract condition names
51 |             self.rownames = list()
52 |             for l in lines.split("\n"):
53 |                 self.rownames.append(l.split(',')[0])
54 | 
55 |             # read in data and generate a numpy data matrix
56 |             self.data = np.genfromtxt(StringIO(lines), delimiter = ",", usecols=tuple(range(1, len(self.colnames)+1)))
57 | 
58 |         if npMatrix is not None:
59 |             self.data = npMatrix
60 |             nrow, ncol = np.shape(self.data)
61 |             if colnames:
62 |                 if len(colnames) == ncol:
63 |                     self.colnames = colnames
64 |                 else:
65 |                     raise Exception("Dimensions of input colnames and matrix do not agree")
66 |             else:
67 |                 self.colnames = list()
68 |                 for c in range(ncol):
69 |                     self.colnames.append('c' + str(c))
70 |             if rownames:
71 |                 if len(rownames) == nrow:
72 |                     self.rownames = rownames
73 |                 else:
74 |                     raise Exception("Dimensions of input rownames and matrix do not agree")
75 |             else:
76 |                 self.rownames = list()
77 |                 for r in range(nrow):
78 |                     self.rownames.append('r' + str(r))
79 | 
80 |         self.nrows, self.ncols = np.shape(self.data)
81 | 
82 |     def setColnames(self, colnames):
83 |         if len(colnames) == len(self.colnames):
84 |             self.colnames = colnames
85 |         else:
86 |             raise Exception("New colnames vector has a different dimension than the original colnames")
87 | 
88 |     def getColnames(self):
89 |         return self.colnames
90 | 
91 |     def setRownames(self, rownames):
92 |         if len(rownames) == len(self.rownames):
93 |             self.rownames = rownames
94 |         else:
95 |             raise Exception("New rownames vector has a different dimension than the original rownames")
96 | 
97 |     def getRownames(self):
98 |         return self.rownames
99 | 
100 |     def getValuesByCol(self, colnames):
101 |         if isinstance (colnames, list):
102 |             if not set(colnames) <= set(self.colnames):
103 |                 raise Exception("Trying to access nonexistent columns")
104 |             else:
105 |                 colIndx = map(lambda x: self.colnames.index(x), colnames)
106 |                 ixgrid = np.ix_(range(self.nrows), colIndx)
107 |                 return self.data[ixgrid]
108 | 
109 |         if isinstance(colnames, basestring):
110 |             if colnames not in self.colnames:
111 |                 raise Exception ("Trying to access a nonexistent column")
112 |             else:
113 |                 return self.data[:, self.colnames.index(colnames)]
114 | 
115 | 
116 |     def setValuesByColName(self, values, col):
117 |         self.data[:,self.colnames.index(col)] = values
118 | 
119 | 
120 | 
121 |     def shape(self):
122 |         if self.data is not None:
123 |             return np.shape(self.data)
124 | 
125 |         else:
126 |             return None
127 | 
128 |     ## Return the position indices of colnames
129 |     def findColIndices(self, colnames):
130 |         if isinstance (colnames, list):
131 |             if not set(colnames) <= set(self.colnames):
132 |                 raise Exception("Trying to access nonexistent columns")
133 |             else:
134 |                 colIndx = map(lambda x: self.colnames.index(x), colnames)
135 |                 return colIndx
136 | 
137 |         if isinstance(colnames, basestring):
138 |             if colnames not in self.colnames:
139 |                 raise Exception ("Trying to access a nonexistent column")
140 |             else:
141 |                 return self.colnames.index(colnames)
142 | 
143 | 
144 |     ## Return the position indices of rownames
145 |     def findRowIndices(self, rownames):
146 |         if set(rownames) - set(self.rownames):
147 |             raise Exception("Unknown row name is used to query index")
148 | 
149 |         return [self.rownames.index(x) for x in rownames]
150 | 
151 | 
152 |     def setCellValue(self, rowname, colname, value):
153 |         value = float(value)  # force it into a float
154 |         self.data[self.rownames.index(rowname), self.colnames.index(colname)] = value
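155 | 
156 | # Minimal usage sketch (an illustrative addition, not part of the original
157 | # module): build a NamedMatrix from an in-memory numpy array and access
158 | # elements by name. All names below are hypothetical.
159 | if __name__ == '__main__':
160 |     m = NamedMatrix(npMatrix = np.array([[1., 2.], [3., 4.]]),
161 |                     colnames = ['geneA', 'geneB'],
162 |                     rownames = ['case1', 'case2'])
163 |     print m.getValuesByCol('geneA')        # -> [ 1.  3.]
164 |     m.setCellValue('case1', 'geneB', 5.)   # set a single cell by name
165 |     print m.shape()                        # -> (2, 2)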
--------------------------------------------------------------------------------
/SteinerTree.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Sep 7 12:57:57 2013
4 | 
5 | @author: xinghualu
6 | """
7 | 
8 | # This is a generalized implementation of the Kou algorithm for creating Steiner trees. It is not
9 | # tied to GOGrapher and can be used with any networkx weighted graph.
10 | 
11 | from heapq import *
12 | from networkx import *
13 | from networkx import Graph
14 | 
15 | ## Extract a Steiner tree from a weighted graph, given a list of vertices of interest
16 | # @param G A Graph with weighted edges
17 | # @param voi A list of vertices of interest
18 | # @param generator A method to make a new Graph instance (in the case that you've extended Graph)
19 | # \returns a new graph if no errors, None otherwise
20 | def make_steiner_tree(G, voi, generator=None):
21 |     mst = Graph()
22 |     for v in voi:
23 |         if v not in G:
24 |             raise ValueError, "make_steiner_tree(): Some vertices are not in the original graph"
25 |     if len(voi) == 0:
26 |         return mst
27 |     if len(voi) == 1:
28 |         mst.add_node(voi[0])
29 |         return mst
30 | 
31 |     # Initially, use (a version of) Kruskal's algorithm to extract a minimal spanning tree
32 |     # from a weighted graph. This algorithm differs in that only a subset of vertices are
33 |     # going to be present in the final subgraph (which is not truly an MST - we must use
34 |     # Prim's algorithm later).
35 | 
36 |     # extract all shortest paths among the voi
37 |     pathHeap = []
38 |     paths = {}
39 | 
40 |     # load all the paths between the Steiner vertices. Store them in a heap queue
41 |     # and reconstruct the MST of the complete graph using Kruskal's algorithm
42 |     for i in range(len(voi) - 1):
43 |         v1 = voi[i]
44 |         for v2 in voi[i+1:]:
45 |             result = bidirectional_dijkstra(G, v1, v2)
46 |             if result == False:
47 |                 raise RuntimeError, "The two vertices given (%s, %s) don't exist on the same connected graph" % (v1, v2)
48 |             #print "The two vertices given (%s, %s) don't exist on the same connected graph" % (v1, v2)
49 |             distance, vertList = result
50 |             keys = [v1, v2]
51 |             keys.sort()
52 |             key = "%s:%s" % tuple(keys)
53 |             paths[key] = (vertList)
54 |             heappush(pathHeap, (distance, v1, v2))
55 | 
56 | 
57 |     # construct the minimum spanning tree of the complete graph
58 |     while pathHeap:
59 |         w, v1, v2 = heappop(pathHeap)
60 |         # if no path exists yet between v1 and v2, add this one
61 |         if v1 not in mst or v2 not in mst or not has_path(mst, v1, v2):
62 |             mst.add_edge(v1, v2, weight=w)
63 | 
64 |     # check that the graph is a tree and correct
65 |     sTree = set(mst.nodes())
66 |     sSteiner = set(voi)
67 |     if sTree ^ sSteiner:
68 |         raise RuntimeError, 'Failed to construct MST spanning tree'
69 | 
70 |     # reconstruct a subgraph of the original graph using the paths
71 |     if generator is None:
72 |         subgraph = Graph()
73 |     else:
74 |         subgraph = generator()
75 |     for edge in mst.edges_iter(data=True):
76 |         keys = [edge[0], edge[1]]
77 |         keys.sort()
78 |         key = "%s:%s" % tuple(keys)
79 |         vList = paths[key]
80 |         for i in range(len(vList) - 1):
81 |             v1 = vList[i]
82 |             v2 = vList[i+1]
83 |             w = G[v1][v2]
84 |             subgraph.add_edge(v1, v2, w)
85 |     # get rid of possible loops - the result will be a true MST
86 |     subgraph = make_prim_mst(subgraph, generator)
87 | 
88 |     # remove intermediate nodes in paths that are not in the list of voi
89 |     return _trimTree(subgraph, voi)
90 | 
91 | 
92 | ## remove intermediate nodes in paths that are not in the list of voi in the given graph
93 | # @param graph A weighted Graph
94 | # @param voi A list of vertices of interest
95 | # \return graph An updated version of the graph
96 | def _trimTree(graph, voi):
97 |     trimKeepTrack = []
98 |     firstNode = voi[0]
99 |     if len(graph.neighbors(firstNode)) < 2:
100 |         trimKeepTrack.append(firstNode)
101 |         firstNeighbor = graph.neighbors(firstNode)[0]
102 |         trimKeepTrack.append(firstNeighbor)
103 |         graph = _trim(firstNeighbor, graph, trimKeepTrack, voi)
104 |     else:
105 |         trimKeepTrack.append(firstNode)
106 |         graph = _trim(firstNode, graph, trimKeepTrack, voi)
107 |     return graph
108 | 
109 | 
110 | def _trim(node, graph, trimKeepTrack, voi):
111 |     if len(graph.adj[node].keys()) > 1:
112 |         for nodeNeighbor in graph.adj[node].keys():
113 |             if nodeNeighbor not in trimKeepTrack:
114 |                 trimKeepTrack.append(nodeNeighbor)
115 |                 graph = _trim(nodeNeighbor, graph, trimKeepTrack, voi)
116 |     if len(graph.adj[node].keys()) < 2:
117 |         if node not in voi:
118 |             graph.remove_node(node)
119 |     return graph
120 | 
121 | 
122 | """
123 | Prim's algorithm: constructs the minimum spanning tree (MST) from an instance of a weighted Graph
124 | @param G A weighted Graph()
125 | @param generator A method to make a new Graph instance (in the case that you've extended Graph)
126 | \returns An MST version of G
127 | """
128 | ## generate the MST via Prim's algorithm
129 | # @param G A weighted Graph
130 | # @param generator Always set to None
131 | # \return mst Returns the created MST
132 | def make_prim_mst(G, generator=None):
133 |     if generator is None:
134 |         mst = Graph()
135 |     else:
136 |         mst = generator()
137 |     # priorityQ is a list of tuples (the weight in front, followed by the edge tuple)
138 |     priorityQ = []
139 |     firstNode = G.nodes()[0]
140 |     mst.add_node(firstNode)
141 |     for edge in G.edges_iter(firstNode, data=True):
142 |         if len(edge) != 3 or edge[2] is None:
143 |             raise ValueError, "make_prim_mst accepts a weighted graph only (with numerical weights)"
144 |         heappush(priorityQ, (edge[2], edge))
145 | 
146 |     while len(mst.edges()) < (G.order()-1):
147 |         w, minEdge = heappop(priorityQ)
148 |         if len(minEdge) != 3 or minEdge[2] is None:
149 |             raise ValueError, "make_prim_mst accepts a weighted graph only (with numerical weights)"
150 |         v1, v2, w = minEdge
151 |         if v1 not in mst:
152 |             for edge in G.edges_iter(v1, data=True):
153 |                 if edge == minEdge:
154 |                     continue
155 |                 heappush(priorityQ, (edge[2], edge))
156 |         elif v2 not in mst:
157 |             for edge in G.edges_iter(v2, data=True):
158 |                 if edge == minEdge:
159 |                     continue
160 |                 heappush(priorityQ, (edge[2], edge))
161 |         else:
162 |             # non-crossing edge
163 |             continue
164 |         mst.add_edge(minEdge[0], minEdge[1], minEdge[2])
165 |     return mst
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction, and
10 | distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright
13 | owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all other entities
16 | that control, are controlled by, or are under common control with that entity.
17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 
70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 
122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "[]" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright [yyyy] [name of copyright owner] 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 
183 | You may obtain a copy of the License at
184 | 
185 | http://www.apache.org/licenses/LICENSE-2.0
186 | 
187 | Unless required by applicable law or agreed to in writing, software
188 | distributed under the License is distributed on an "AS IS" BASIS,
189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190 | See the License for the specific language governing permissions and
191 | limitations under the License.
192 | 
--------------------------------------------------------------------------------
/PyGibbCAMP.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | PyGibbCAMP: Python Causal Modeling of Pathways, a Python implementation for modeling
4 | causal relationships between cellular signaling proteins, particularly phosphorylated
5 | proteins, based on reverse phase protein array (RPPA) data. The model is designed
6 | to capture signal transduction through a series of protein phosphorylation cascades,
7 | in which phosphorylation of a protein often activates the protein, which in turn
8 | leads to phosphorylation of other proteins. The model represents
9 | the phosphorylation state(s) and the activation state of a protein separately, so that it
10 | is capable of capturing the fact that, at certain times, phosphorylation of a protein
11 | can be decoupled from its activation by drugs and inhibitors.
12 | 
13 | 
14 | Created on Wed Aug 14 19:16:25 2013
15 | 
16 | @author: Xinghua Lu
17 | """
18 | 
19 | import networkx as nx
20 | import numpy as np
21 | from numpy import matlib
22 | from rpy2 import robjects
23 | import math, cPickle, re
24 | from SigNetNode import SigNetNode
25 | from StringIO import StringIO
26 | from NamedMatrix import NamedMatrix
27 | from SteinerTree import *
28 | 
29 | import rpy2.robjects.numpy2ri
30 | rpy2.robjects.numpy2ri.activate()  # enable directly passing numpy arrays or matrices as arguments to rpy2 objects
31 | R = robjects.r  # load R instance
32 | R.library("glmnet")
33 | glmnet = R('glmnet')  # make glmnet from R a callable python object
34 | 
35 | R.library("mixtools")
36 | normalmixEM = R('normalmixEM')
37 | 
38 | 
39 | 
40 | class PyGibbCAMP:
41 |     ## Constructor
42 |     # @param nodeFile A string pathname of a CSV file mapping proteins to antibodies;
43 |     #        each line contains one 'protein,antibody' pair
44 |     # @param dataMatrixFile A string pathname of the observed data matrix file
45 |     # @param perturbMatrix A string pathname of the perturbation matrix file
46 |     # @param missingDataMatrix A string pathname of the missing-data indicator matrix file
47 |     def __init__(self, nodeFile , dataMatrixFile , perturbMatrix = None, missingDataMatrix=None):
48 |         self.network = None
49 |         self.obsData = None
50 |         self.missingDataMatrix = None
51 |         perturbInstances = None
52 |         self.nChains = 1
53 | 
54 |         self.dictPerturbEffect = {'AKT1' : [('GSK690693', 0), \
55 |             ('GSK690693_GSK1120212', 0)], 'MAP2K1' : [('GSK690693_GSK1120212', 0)],\
56 |             'EGFR': [('EGF' , 1), ('FGF1', 1)]}
57 |         # self.stimuli = ['EGF', 'FGF1', 'HGF', 'IGF1', 'Insulin', 'NRG1', 'PBS', 'Serum']
58 | 
59 |         # parse the data matrix by calling the NamedMatrix class
60 |         if not dataMatrixFile:
61 |             raise Exception("Cannot create a PyGibbCAMP object without 'dataMatrixFile'")
62 |             return
63 |         self.obsData = NamedMatrix(dataMatrixFile)
64 |         nCases, nAntibodies = np.shape(self.obsData.data)
65 |         self.obsData.colnames = map(lambda s: s+'F', self.obsData.colnames)
66 |         self.obsDataFileName = dataMatrixFile
67 | 
68 |         if perturbMatrix:
69 |             self.perturbData = NamedMatrix(perturbMatrix)
70 |             perturbInstances = self.perturbData.getColnames()
71 |         self.perturbInstances = perturbInstances
72 | 
73 |         if missingDataMatrix:
74 |             self.missingDataMatrix = NamedMatrix(missingDataMatrix)
75 |             allMissing = np.sum(self.missingDataMatrix.data, 0) == nCases
76 |             if np.any(allMissing):
77 |                 raise Exception ("Data matrix contains data-less columns")
78 |             self.missingDataMatrix.colnames = map(lambda s: s+'F', self.missingDataMatrix.colnames)
79 | 
80 |         if not nodeFile:
81 |             raise Exception("Calling 'initNetwork' with an empty nodeFile name")
82 |             return
83 | 
84 |         try:
85 |             nf = open(nodeFile, "r")
86 |             nodeLines = nf.readlines()
87 |             if len(nodeLines) == 1:  # Mac files end a line with \r instead of \n
88 |                 nodeLines = nodeLines[0].split("\r")
89 |             nf.close()
90 |         except IOError:
91 |             raise Exception( "Failed to open the file containing nodes")
92 |             return
93 | 
94 |         print "Creating network"
95 |         self.network = nx.DiGraph()
96 | 
97 |         self.dictProteinToAntibody = dict()
98 |         self.dictAntibodyToProtein = dict()
99 |         # parse nodes
100 |         for line in nodeLines:
101 |             #print line
102 |             protein, antibody = line.rstrip().split(',')
103 | 
104 |             if protein not in self.dictProteinToAntibody:
105 |                 self.dictProteinToAntibody[protein] = []
106 |             self.dictProteinToAntibody[protein].append(antibody)
107 |             self.dictAntibodyToProtein[antibody] = protein
108 | 
109 |             fluo = antibody + 'F'
110 |             if protein not in self.network:
111 |                 self.network.add_node(protein, nodeObj = SigNetNode(protein, 'ACTIVATIONSTATE', False))
112 |             self.network.add_node(antibody, nodeObj= SigNetNode(antibody, 'PHOSPHORYLATIONSTATE', False))
113 |             self.network.add_node(fluo, nodeObj = SigNetNode(fluo, 'FLUORESCENCE', True))
114 |             self.network.add_edge(antibody, protein)
115 |             self.network.add_edge(antibody, fluo)
116 | 
117 |         for perturb in perturbInstances:
118 |             self.network.add_node(perturb, nodeObj = SigNetNode(perturb, 'PERTURBATION', True))
119 | 
120 |         # Add edges between the PERTURBATION, protein activity, and phosphorylation layers
121 |         for pro in self.dictProteinToAntibody:
122 |             for phos in self.dictAntibodyToProtein:
123 |                 if self.dictAntibodyToProtein[phos] == pro:
124 |                     continue
125 |                 self.network.add_edge(pro, phos)
126 |             for perturb in perturbInstances:
127 |                 self.network.add_edge(perturb, pro)
128 | 
129 | 
130 |     ## Init parameters of the model
131 |     # In a Bayesian network setting, the joint probability is calculated
132 |     # through the product of a series of conditional probabilities. The parameters
133 |     # of the PyGibbCAMP model define p(X | Pa(X)). For an observed fluorescent node,
134 |     # the conditional probability is a mixture of two Gaussian distributions;
135 |     # therefore, the parameters are two pairs of mu and sigma. For
136 |     # the hidden variables representing phosphorylation states and activation
137 |     # states of proteins, the conditional probability is defined by a logistic
138 |     # regression. Therefore, the parameters associated with such a node form a
139 |     # vector of real numbers.
140 |     #
141 |     def _initParams(self):
142 |         print "Initialize parameters associated with each node in each MCMC chain"
143 |         for nodeId in self.network:
144 |             self._initNodeParams(nodeId)
145 | 
146 |     def _initNodeParams(self, nodeId):
147 |         nodeObj = self.network.node[nodeId]['nodeObj']
148 |         if nodeObj.type == 'FLUORESCENCE':
149 |             # Estimate the mean and sd of the fluorescence signal using a mixture model
150 |             if self.missingDataMatrix and nodeId in self.missingDataMatrix.getColnames():
151 |                 nodeData = self.obsData.getValuesByCol( nodeId)
152 |                 nodeData = nodeData[self.missingDataMatrix.getValuesByCol(nodeId) == 0]
153 |             else:
154 |                 nodeData = self.obsData.getValuesByCol(nodeId)
155 |             nodeObj.mus = np.zeros((self.nChains, 2))
156 |             nodeObj.sigmas = np.zeros((self.nChains, 2))
157 |             for c in range(self.nChains):
158 |                 mixGaussians = normalmixEM(robjects.FloatVector(nodeData), k = 2 )
159 |                 # mus and sigmas are represented as nChains x 2 matrices
160 |                 nodeObj.mus[c,:] = np.array(mixGaussians[2])
161 |                 nodeObj.sigmas[c,:] = np.array(mixGaussians[3])
162 |         else:
163 |             preds = self.network.predecessors(nodeId)
164 |             if len(preds) > 0:
165 |                 nodeObj.paramNames = preds
166 |                 nodeObj.params = np.random.randn(self.nChains, len(preds) + 1)
167 |             else:
168 |                 nodeObj.params = None
169 | 
170 | 
171 |     ## Initialize latent variables
172 |     #
173 |     #
174 |     def _initHiddenStates(self):
175 |         hiddenNodes = [n for n in self.network if not self.network.node[n]['nodeObj'].bMeasured]
176 |         phosNodes = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE']
177 |         #print str(phosNodes)
178 |         nCases, nAntibody = self.obsData.shape()
179 |         caseNames = self.obsData.getRownames()
180 | 
181 |         self.nodeStates = list()
182 |         for c in range(self.nChains):
183 |             tmp = np.zeros((nCases, len(hiddenNodes)))
184 |             tmp[np.random.rand(nCases, len(hiddenNodes)) < 0.3] = 1
185 |             tmp = np.column_stack((tmp, self.perturbData.data))
186 |             colnames = hiddenNodes + self.perturbData.colnames
187 |             self.nodeStates.append(NamedMatrix(npMatrix = tmp, colnames = colnames, rownames = caseNames))
188 | 
189 |             # initialize phosphorylation states based on the observed fluorescence
190 |             for node in phosNodes:
191 |                 fluoNode = node + 'F'
192 |                 #print "phosNode:" + node + "; fluoNode: " + fluoNode
193 |                 fluoNodeObj = self.network.node[fluoNode]['nodeObj']
194 |                 fluoData = self.obsData.getValuesByCol(fluoNode)
195 |                 tmp = np.zeros(nCases)
196 |                 phosProbOne = - np.log(fluoNodeObj.sigmas[c, 1])\
197 |                     - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 1]) / np.square(fluoNodeObj.sigmas[c, 1])
198 |                 phosProbZero = - np.log(fluoNodeObj.sigmas[c, 0])\
199 |                     - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 0]) / np.square(fluoNodeObj.sigmas[c, 0])
200 |                 tmp[phosProbOne > phosProbZero] = 1
201 |                 nodeIndx = self.nodeStates[c].findColIndices(node)
202 |                 self.nodeStates[c].data[:,nodeIndx] = tmp
203 | 
204 |                 # take care of missing values by random sampling
205 |                 if self.missingDataMatrix:
206 |                     if node in self.missingDataMatrix.getColnames():
207 |                         #print "processing node with missing values: " + node
208 |                         missingCases = self.missingDataMatrix.getValuesByCol(node) == 1
209 |                         tmp = np.zeros(sum(missingCases))
210 |                         tmp[np.random.rand(len(tmp)) <= 0.3] = 1
211 |                         self.nodeStates[c].data[missingCases, nodeIndx] = tmp
212 | 
213 | 
214 | 
215 |     ## Calculate the marginal probability of observing the measured data by
216 |     # integrating out all possible settings of latent variable states and
217 |     # model parameters.
218 |     def calcEvidenceLikelihood(self):
219 |         phosNodes = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE']
220 |         loglikelihood = 0
221 |         nCases, nAntibodies = np.shape(self.obsData.data)
222 |         for nodeId in phosNodes:
223 |             nodeObj = self.network.node[nodeId]['nodeObj']
224 |             nodeIndx = self.nodeStates[0].findColIndices(nodeId)
225 |             preds = self.network.predecessors(nodeId)
226 |             for c in range(self.nChains):
227 |                 nodeData = self.nodeStates[c].data[:, nodeIndx]
228 |                 predStates = np.column_stack((np.ones(nCases), self.nodeStates[c].getValuesByCol(preds)))
229 |                 pOneCondOnParents = 1 / (1 + np.exp( - np.dot(predStates, nodeObj.params[c,:])))
230 |                 pOneCondOnParents[pOneCondOnParents == 1.] -= np.finfo(float).eps
231 | 
232 |                 loglikelihood += np.sum(nodeData * np.log(pOneCondOnParents) \
233 |                     + (1 - nodeData) * np.log(1 - pOneCondOnParents))
234 | 
235 |         loglikelihood /= self.nChains
236 |         return loglikelihood
237 | 
238 |     ## Perform graph search
239 |     def trainGibbsEM(self, nChains = 10, alpha = 0.1, nParents = 4, nSamples = 5, pickleDumpFile = None, maxIter = 1000):
240 |         self.nChains = nChains
241 |         self.alpha = alpha
242 |         self.likelihood = list()
243 |         self.nSamples = nSamples
244 |         self.nParents = nParents
245 | 
246 |         if pickleDumpFile:
247 |             self.pickleDumpFile = pickleDumpFile
248 |         else:
249 |             self.pickleDumpFile = self.obsDataFileName + "alpha" + str(self.alpha) + ".pickle"
250 | 
251 |         # check that the network and the data agree
252 |         nodeToDelete = list()
253 |         for nodeId in self.network:
254 |             if self.network.node[nodeId]['nodeObj'].type == 'FLUORESCENCE' and nodeId not in self.obsData.getColnames():
255 |                 print "Node " + nodeId + " has no associated data"
256 |                 nodeToDelete.append(nodeId)
257 |                 nodeToDelete.append(self.network.predecessors(nodeId)[0])
258 |         for nodeId in nodeToDelete:
259 |             if self.network.has_node(nodeId):
260 |                 print "removing node " + nodeId
261 |                 self.network.remove_node(nodeId)
262 | 
263 |         # To start EM, set up Markov chains to train a model purely based on prior knowledge
264 |         self._initParams()
265 |         self._initHiddenStates()
266 | 
267 |         # perform the update of latent variables in a layer-wise manner
268 |         self.likelihood = list()
269 | 
270 |         self.expectedStates = list()
271 |         nCases, nAntibodies = np.shape(self.obsData.data)
272 |         for c in range(self.nChains):
273 |             # each chain collects expected statistics of nodes from samples along the chain
274 |             self.expectedStates.append(np.zeros(np.shape(self.nodeStates[c].data)))
275 | 
276 |         print "Starting EM: alpha = " + str(self.alpha) + "; nChains = " + str(self.nChains) + "; nSamples = " + str (self.nSamples) + "; nParents = " + str(self.nParents)
277 |         optLikelihood = float("-inf")
278 |         bConverged = False
279 |         sampleCount = 0
280 | 
281 |         likelihood = self.calcEvidenceLikelihood()
282 |         print "nIter: 0" + "; log likelihood of evidence: " + str(likelihood)
283 |         self.likelihood.append(likelihood)
284 |         for nIter in range(maxIter):
285 | 
286 |             # E-step of EM
287 |             self._updateActivationStates()
288 |             if (nIter+1) % 2 == 0:  # we collect a sample every other iteration
289 |                 sampleCount += 1
290 |                 for c in range(self.nChains):
291 |                     self.expectedStates[c] += self.nodeStates[c].data
292 | 
293 |             # M-step of EM. We only update parameters after collecting a certain number of samples
294 |             if sampleCount >= self.nSamples:
295 |                 sampleCount = 0
296 |                 # take the expectation of sampled states
297 |                 self.expectedStates = map(lambda x: x / self.nSamples, self.expectedStates)
298 |                 self._updateParams(self.alpha, nparents = self.nParents)
299 | 
300 |                 likelihood = self.calcEvidenceLikelihood()
301 |                 self.likelihood.append(likelihood)
302 |                 print "nIter: " + str(nIter + 1) + "; log likelihood of evidence: " + str(likelihood)
303 | 
304 |                 # keep the current best-fit model
305 |                 if likelihood > optLikelihood:
306 |                     optLikelihood = likelihood
307 |                     try:
308 |                         cPickle.dump(self, open(self.pickleDumpFile, 'wb'))
309 |                     except:
310 |                         raise Exception("Cannot create pickle dump file " + self.pickleDumpFile)
311 | 
312 |                 bConverged = self._checkConvergence()
313 |                 if bConverged:
314 |                     print "EM converged!"
315 |                     break
316 | 
317 |                 for c in range(self.nChains):  # clear expectedStates
318 |                     self.expectedStates[c] = np.zeros(np.shape(self.nodeStates[c].data))
319 | 
320 |         # now try to delete edges that do not contribute to the evidence
321 |         #self.trimEdgeByConsensus(.9)
322 |         return self
323 | 
324 |     def _checkConvergence(self):
325 |         # declare convergence when the relative change in the likelihood is small
326 |         if len(self.likelihood) < 20:
327 |             return False
328 | 
329 |         ml = np.mean(self.likelihood[-5:-1])
330 |         ratio = abs(self.likelihood[-1] - ml ) / abs(ml)
331 |         return ratio <= 0.001
332 | 
333 |     def _updateActivationStates(self):
334 |         nCases, antibody = np.shape(self.obsData.data)
335 |         nCases, nHiddenNodes = np.shape(self.nodeStates[0].data)
336 | 
337 |         # iterate through all activation-state nodes
338 |         activationNode = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'ACTIVATIONSTATE']
339 | 
340 |         for nodeId in activationNode:
341 |             for c in range(self.nChains):
342 |                 curNodeMarginal = self.calcNodeCondProb(nodeId, c)
343 | 
344 |                 # sample the state of the current node based on the prob, and update
345 |                 sampleState = np.zeros(nCases)
346 |                 sampleState[curNodeMarginal >= np.random.rand(nCases)] = 1.
347 |                 curNodeIndx = self.nodeStates[c].findColIndices(nodeId)
348 |                 self.nodeStates[c].data[:, curNodeIndx] = sampleState
349 | 
350 |                 # clamp the activation state of perturbed nodes to a fixed value
351 |                 if nodeId in self.dictPerturbEffect:
352 |                     # the dictionary keeps a list of conditions under which the node is perturbed and the state it is clamped to
353 |                     for condition, state in self.dictPerturbEffect[nodeId]:
354 |                         perturbState = self.nodeStates[c].getValuesByCol(condition)
355 |                         indx = self.nodeStates[c].findColIndices(nodeId)
356 |                         self.nodeStates[c].data[perturbState==1, indx] = state
357 | 
358 | 
359 |     def calcNodeCondProb(self, nodeId, c):
360 |         """
361 |         Calculate the marginal probability of a node's state being "1", conditioning
362 |         on all evidence.
363 | 
364 |         args:
365 |             nodeId  A string id of the node of interest
366 |             c       An integer indicating the chain from which the parameter
367 |                     vector is to be used
368 |         """
369 |         nodeObj = self.network.node[nodeId]['nodeObj']
370 |         if nodeObj.bMeasured:
371 |             raise Exception("Called 'calcNodeCondProb' on an observed variable " + nodeId)
372 | 
373 |         nCases, nAntibody = np.shape(self.obsData.data)
374 | 
375 |         # collect the states of the predecessors of the node
376 |         preds = self.network.predecessors(nodeId)
377 |         logProbOneCondOnParents = 0
378 |         logProbZeroCondOnParents = 0
379 |         if len(preds) > 0:  # if the node has parents
380 |             # calculate p(curNode = 1 | parents)
381 |             nodeParams = nodeObj.params[c,:]
382 |             predStates = np.column_stack((np.ones(nCases), self.nodeStates[c].getValuesByCol(preds)))
383 |             pOneCondOnParents = 1 / (1 + np.exp( - np.dot(predStates, nodeParams)))
384 |             pOneCondOnParents[pOneCondOnParents == 1] -= np.finfo(float).eps
385 |             pOneCondOnParents[pOneCondOnParents == 0] += np.finfo(float).eps
386 |             logProbOneCondOnParents = np.log(pOneCondOnParents)
387 |             logProbZeroCondOnParents = np.log(1 - pOneCondOnParents)
388 | 
389 |         # collect evidence from children
390 |         logProbChildCondOne = 0   # the prob of a child conditioning on current node == 1
391 |         logProdOfChildCondZeros = 0
392 | 
393 |         children = self.network.successors(nodeId)
394 |         if len(children) > 0:
395 |             for child in children:
396 |                 childNodeObj = self.network.node[child]['nodeObj']
397 |                 curChildStates = self.nodeStates[c].getValuesByCol(child)
398 | 
399 |                 # Collect states of the predecessors of the child
400 |                 childPreds = self.network.predecessors(child)
401 |                 childNodeParams = childNodeObj.params[c,:]
402 |                 childPredStates = self.nodeStates[c].getValuesByCol(childPreds)
403 |                 childPredStates = np.column_stack((np.ones(nCases), childPredStates))  # pad data with a column of ones as bias
404 | 
405 |                 # Set the state of the current node to ones
406 |                 curNodePosInPredList = childPreds.index(nodeId) + 1  # offset by 1 because of the padding
407 |                 if childNodeParams[curNodePosInPredList] == 0:  # not a real edge
408 |                     continue
409 |                 childPredStates[:, curNodePosInPredList] = np.ones(nCases)
410 |                 pChildCondCurNodeOnes = 1 / (1 + np.exp(-np.dot(childPredStates, childNodeParams)))
411 |                 pChildCondCurNodeOnes[pChildCondCurNodeOnes==1] -= np.finfo(float).eps
412 |                 pChildCondCurNodeOnes[pChildCondCurNodeOnes==0] += np.finfo(float).eps
413 |                 logProbChildCondOne += np.log (curChildStates * pChildCondCurNodeOnes + (1 - curChildStates) * (1 - pChildCondCurNodeOnes))
414 | 
415 |                 # set the state of the current node (nodeId) to zeros
416 |                 childPredStates [:, curNodePosInPredList] = np.zeros(nCases)
417 |                 pChildCondCurNodeZeros = 1 / (1 + np.exp(- np.dot(childPredStates, childNodeParams)))
418 |                 pChildCondCurNodeZeros[pChildCondCurNodeZeros==1] -= np.finfo(float).eps
419 |                 pChildCondCurNodeZeros[pChildCondCurNodeZeros==0] += np.finfo(float).eps
420 |                 logProdOfChildCondZeros += np.log(curChildStates * pChildCondCurNodeZeros + (1 - curChildStates) * (1 - pChildCondCurNodeZeros))
421 | 
422 |         # now we can calculate the marginal probability of the current node
423 |         curNodeMarginal = 1 / (1 + np.exp(logProbZeroCondOnParents + logProdOfChildCondZeros - logProbOneCondOnParents - logProbChildCondOne))
424 |         return curNodeMarginal
425 | 
426 | 
427 |     def parseGlmnetCoef(self, glmnet_res):
428 |         """ Parse the 'beta' matrix returned by calling glmnet through RPy2.
429 |         Return the first column of the 'beta' matrix of the glmnet object
430 |         with 3 or more non-zero values
431 |         """
432 |         # read in the intercept; a vector of length nLambda
433 |         a0 = np.array(glmnet_res.rx('a0'))[0]
434 | 
435 |         # Read in lines of the beta matrix txt, which is nVariables * nLambda.
436 |         # Since we call glmnet by padding x with a column of 1s, we only work
437 |         # with the 'beta' matrix returned by the fit
438 |         betaLines = StringIO(str(glmnet_res.rx('beta'))).readlines()
439 |         dimStr = re.search("\d+\s+x\s+\d+", betaLines[1]).group(0)
440 |         if not dimStr:
441 |             raise Exception("'parseGlmnetCoef' could not determine the dims of beta")
442 |         nVariables , nLambda = map(int, dimStr.split(' x '))
443 |         betaMatrix = np.zeros( (nVariables, nLambda), dtype=float)
444 | 
445 |         # glmnet prints the beta matrix in multiple blocks of
446 |         # nVariables * blockSize
447 |         blockSize = len(betaLines[4].split()) - 1
448 |         curBlockColStart = - blockSize
449 |         for line in betaLines:  # read in blocks
450 |             m = re.search('^V\d+', line)
451 |             if not m:  # only process the lines beginning with 'V\d'
452 |                 continue
453 |             else:
454 |                 rowIndx = int(m.group(0)[1:len(m.group(0))])
455 |                 if rowIndx == 1:
456 |                     curBlockColStart += blockSize
457 | 
458 |                 # make 'rowIndx' start from 0
459 |                 rowIndx -= 1
460 | 
461 |                 fields = line.rstrip().split()
462 |                 fields.pop(0)
463 |                 if len(fields) != blockSize:
464 |                     blockSize = len(fields)
465 |                 for j in range(blockSize):
466 |                     if fields[j] == '.':
467 |                         continue
468 |                     else:
469 |                         betaMatrix[rowIndx, curBlockColStart + j] = float(fields[j])
470 | 
471 |         return a0, betaMatrix
472 | 
473 | 
474 |     def _updateParams(self, alpha = 0.1, nparents=None):
475 |         # Update the parameters associated with each node, p(n | Pa(n)), using logistic regression,
476 |         # with the expected states of predecessors as X and the current node states across samples as y
477 |         nCases, nVariables = np.shape(self.obsData.data)
478 |         if not nparents:
479 |             nparents = self.nParents
480 | 
481 |         for nodeId in self.network:
482 |             nodeObj = self.network.node[nodeId]['nodeObj']
483 |             if nodeObj.type == 'FLUORESCENCE' or nodeObj.type == 'PERTURBATION':
484 |                 continue
485 |             nodeObj.fitRes = list()
486 |             preds = self.network.predecessors(nodeId)
487 |             predIndices = self.nodeStates[0].findColIndices(preds)
488 | 
489 |             for c in range(self.nChains):
490 |                 expectedPredState = self.expectedStates[c][:, predIndices]
491 |                 #x = np.column_stack((np.ones(nCases), expectedPredState))
492 |                 x = np.column_stack((np.ones(nCases), expectedPredState))
493 |                 y = self.nodeStates[c].getValuesByCol(nodeId)
494 | 
495 |                 # check whether all x and y have the same value, which would be a problem for glmnet
496 |                 rIndx = map(lambda z: int(math.floor(z)), np.random.rand(50) * nCases)
497 |                 if sum(y) == nCases:  # if every y == 1
498 |                     y[rIndx] = 0
499 |                 elif sum( map(lambda v: 1 - v, y)) == nCases:  # if every y == 0
500 |                     y[rIndx] = 1
501 |                 y = robjects.vectors.IntVector(y)
502 | 
503 |                 allOnesCols = np.where(np.sum(x, 0) == nCases)[0]
504 |                 for col in allOnesCols:
505 |                     rIndx = map(lambda z: int(math.floor(z)), np.random.rand(3) * nCases)
506 |                     x[rIndx, col] = 0
507 |                 allZeros = np.where(np.sum(np.ones(np.shape(x)) - x, 0) == nCases)
508 |                 for col in allZeros[0]:
509 |                     rIndx = map(lambda z: int(math.floor(z)), np.random.rand(3) * nCases)
510 |                     x[rIndx, col] = 1
511 | 
512 |                 # call logistic regression using glmnet from RPy2
513 |                 fit = glmnet (x, y, alpha = alpha, family = "binomial", intercept = 0)
514 |                 nodeObj.fitRes.append(fit)
515 | 
516 |                 # extract coefficients from glmnet; keep the first set of betas with nParents non-zero values
517 |                 a0, betaMatrix = self.parseGlmnetCoef(fit)
518 |                 for j in range(np.shape(betaMatrix)[1]):
519 |                     if sum(betaMatrix[:, j] != 0.) >= nparents:
520 |                         break
521 |                 if j >= len(a0):
522 |                     j = len(a0) - 1
523 | 
524 |                 myparams = betaMatrix[:, j]
525 |                 if sum( myparams != 0.) > nparents:
526 |                     sortedParams = sorted(np.abs(myparams))
527 |                     myparams[np.abs(myparams) < sortedParams[-self.nParents]] = 0.
528 | 
529 |                 nodeObj.params[c,:] = myparams
530 | 
531 | 
532 |     def getStimuliSpecificNet(self, stimulus):
533 |         self.stimuli = ['EGF', 'FGF1', 'HGF', 'IGF1', 'Insulin', 'NRG1', 'PBS', 'Serum']
534 |         #self.stimuli = ['loLIG1', 'hiLIG1', 'loLIG2', 'hiLIG2']
535 |         # trim unused edges
536 |         if stimulus not in self.nodeStates[0].getColnames():
537 |             raise Exception("Input stimulus '" + stimulus + "' is not in the experiment data")
538 | 
539 |         #self.trimEdgeByConsensus(0.9)
540 |         stimulusCases = self.perturbData.getValuesByCol(stimulus) == 1
541 |         controlCases = np.sum(self.perturbData.getValuesByCol(self.stimuli), 1) == 0
542 | 
543 |         # identify the nodes to keep by determining whether a node responds to the stimulus
544 |         activeNodes = set()
545 |         activeNodes.add(stimulus)
546 |         for nodeId in self.network:
547 |             if self.network.node[nodeId]['nodeObj'].type == 'FLUORESCENCE' \
548 |                 or self.network.node[nodeId]['nodeObj'].type == 'fluorescence':
549 |                 nodeControlValues = self.obsData.getValuesByCol(nodeId)[controlCases]
550 |                 nodeStimulValues = self.obsData.getValuesByCol(nodeId)[stimulusCases]
551 |                 ttestRes = R('t.test')(robjects.FloatVector(nodeControlValues), robjects.FloatVector(nodeStimulValues))
552 |                 pvalue = np.array(ttestRes.rx('p.value')[0])[0]
553 |                 if pvalue < 0.05:
554 |                     activeNodes.add(self.network.predecessors(nodeId)[0])
555 | 
556 |         # copy the network to a tmp graph, redirecting edges from activation-state nodes.
557 |         # An edge indicates the impact
558 |         tmpNet = nx.DiGraph()
559 |         for u, v in self.network.edges():
560 |             # we are only interested in edges from a protein pointing to an antibody
561 |             if (self.network.node[u]['nodeObj'].type == 'ACTIVATIONSTATE'\
562 |                 or self.network.node[u]['nodeObj'].type == 'activeState')\
563 |                 and (self.network.node[v]['nodeObj'].type == 'PHOSPHORYLATIONSTATE'\
564 |                 or self.network.node[v]['nodeObj'].type == 'phosState'):
565 |                 # extract parameters associated with u and v
566 |                 vPreds = self.network.predecessors(v)
567 |                 uIndx = vPreds.index(u)
568 |                 vParams = np.sum(self.network.node[v]['nodeObj'].params, 0)
569 |                 if len(vParams) != (len(vPreds) + 1):
570 |                     raise Exception ("Bug in retrieving parameters of node v " + u)
571 |                 paramZeros = np.sum(self.network.node[v]['nodeObj'].params == 0, 0)
572 |                 if float(paramZeros[uIndx+1]) / float(self.nChains) > .9:
573 |                     continue  # don't add an edge with beta == 0
574 | 
575 |                 for ab in self.dictProteinToAntibody[u]:
576 |                     if ab not in self.network:
577 |                         continue
578 |                     # find the impact of phosphorylation on the activation state
579 |                     uPreds = self.network.predecessors(u)
580 |                     uParams = np.mean(self.network.node[u]['nodeObj'].params, 0)
581 |                     if len(uParams) != (len(uPreds) + 1):
582 |                         raise Exception ("Bug in retrieving parameters of node u " + u)
583 |                     #uAntibodyParam = uParams[uPreds.index(ab) + 1]
584 | 
585 |                     # if vParams[uIndx+1] > 0. and (vParams[uIndx+1] * uAntibodyParam) > 0:
586 |                     #     tmpNet.add_edge(ab, v, effect = "+", betaValue = vParams[uIndx+1])
587 |                     # elif (vParams[uIndx+1] * uAntibodyParam) < 0.:
588 |                     #     tmpNet.add_edge(ab, v, effect = "-", betaValue = vParams[uIndx+1])
589 |                     if vParams[uIndx+1] > 0. :
590 |                         tmpNet.add_edge(ab, v, effect = "+", betaValue = vParams[uIndx+1])
591 |                     elif vParams[uIndx+1] < 0.:
592 |                         tmpNet.add_edge(ab, v, effect = "-", betaValue = vParams[uIndx+1])
593 | 
594 |         # remove leaf nodes that are not in the activeNodes list
595 |         while True:
596 |             leafNodes = []
597 |             for nodeId in tmpNet:
598 |                 if (nodeId not in activeNodes and len(tmpNet.successors(nodeId)) == 0)\
599 |                     or (nodeId not in activeNodes and len(tmpNet.predecessors(nodeId)) == 0):
600 |                     leafNodes.append(nodeId)
601 | 
602 |             if len(leafNodes) == 0:
603 |                 break
604 | 
605 |             for leaf in leafNodes:
606 |                 tmpNet.remove_node(leaf)
607 | 
608 |         # to do: remove cycles and make tmpNet a DAG
609 |         return tmpNet
610 | 
611 | 
612 | 
613 |     def toGraphML(self, filename):
614 |         tmpNet = nx.DiGraph()
615 |         for edge in self.network.edges():
616 |             tmpNet.add_edge(edge[0], edge[1])
617 | 
618 |         nx.write_graphml(tmpNet, filename, encoding='utf-8', prettyprint=True)
619 | 
620 |     # # this function implements a K2-like greedy search (unfinished)
621 |     # def K2LikeGreedySearch (self, tmpNet):
622 |     #     for node in tmpNet:
623 |     #         ancestors = tmpNet.predecessors(node)
624 |     #         preds = []
625 |     #         while True:
626 |     #
--------------------------------------------------------------------------------
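Usage sketch (an illustrative addition, not a file in the repository): a minimal end-to-end run of PyGibbCAMP, mirroring the DREAM8 driver scripts above. The input paths are placeholders assuming the same ProcessedData layout; substitute your own files.

from PyGibbCAMP import PyGibbCAMP

nodeFile = "ProcessedData/name.matching.csv"            # 'protein,antibody' pairs
dataMatrix = "ProcessedData/data.matrix.normalized.csv" # observed RPPA data
perturbMatrix = "ProcessedData/perturbation.table.csv"  # perturbation indicators

# build the three-layer network (perturbation -> activation -> phosphorylation)
model = PyGibbCAMP(nodeFile = nodeFile, dataMatrixFile = dataMatrix,
                   perturbMatrix = perturbMatrix, missingDataMatrix = None)

# Gibbs-sampling EM: one chain, elastic-net mixing alpha = 0.5, at most 2
# parents per node; the best-fit model is pickled whenever it improves
model.trainGibbsEM(nChains = 1, alpha = 0.5, nParents = 2, maxIter = 500,
                   pickleDumpFile = dataMatrix + ".example.pickle")

# extract the EGF-specific subnetwork and export the full network to GraphML
egfNet = model.getStimuliSpecificNet('EGF')
model.toGraphML(dataMatrix + ".graphml")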