├── README.md
├── .gitignore
├── SigNetNode.py
├── InSilico DREAM8.py
├── DREAM8Exp.py
├── DREAM8-CellLine-specific.py
├── NamedMatrix.py
├── SteinerTree.py
├── LICENSE
└── PyGibbCAMP.py
/README.md:
--------------------------------------------------------------------------------
1 | PyGibbCAMP
2 | ==========
3 | 
4 | A Python implementation of causal inference of pathways using a Gibbs sampling approach
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 | 
3 | # C extensions
4 | *.so
5 | 
6 | # Packages
7 | *.egg
8 | *.egg-info
9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | 
21 | # Installer logs
22 | pip-log.txt
23 | 
24 | # Unit test / coverage reports
25 | .coverage
26 | .tox
27 | nosetests.xml
28 | 
29 | # Translations
30 | *.mo
31 | 
32 | # Mr Developer
33 | .mr.developer.cfg
34 | .project
35 | .pydevproject
36 | 
--------------------------------------------------------------------------------
/SigNetNode.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | class SigNetNode:
4 |     """An object representation of a node in a signaling network"""
5 | 
6 |     ## constructor
7 |     # @param name  String name of the node
8 |     # @param nodeType  String representation of the type of the node. Possible
9 |     #        values: 'PERTURBATION', 'ACTIVATIONSTATE', 'PHOSPHORYLATIONSTATE', 'FLUORESCENCE'
10 |     # @param bMeasured  Boolean indicating whether the node is directly measured
11 |     def __init__(self, name, nodeType, bMeasured):
12 |         self.name = name
13 |         self.type = nodeType
14 |         self.bMeasured = bMeasured
--------------------------------------------------------------------------------
/InSilico DREAM8.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Sep 4 11:33:32 2013
4 | 
5 | @author: xinghualu
6 | """
7 | 
8 | 
9 | from PyGibbCAMP import PyGibbCAMP
10 | 
11 | import numpy as np
12 | 
13 | dataDir = 'ProcessedData/insilico/'
14 | 
15 | nodeFile = "ProcessedData/insilico/name.matching.csv"
16 | dataMatrix = dataDir + "data.matrix.insilico.csv"
17 | perturbMatrix = dataDir + "perturbation.table.insilico.csv"
18 | missDataMatrix = None
19 | net = PyGibbCAMP(nodeFile = nodeFile, dataMatrixFile = dataMatrix, perturbMatrix = perturbMatrix, missingDataMatrix= missDataMatrix)
20 | 
21 | nParents = 3
22 | for alpha in np.arange(0.1, 1, 0.1, dtype=float):
23 |     for i in range(5):
24 |         pickleFileName = dataMatrix + ".chain" + str(i) + ".nParents" + str(nParents) + ".alpha-" + str(alpha) + ".pickle"
25 |         net.trainGibbsEM(pickleDumpFile = pickleFileName, nChains = 1, alpha=alpha, maxIter = 500, nParents = nParents)
26 | 
--------------------------------------------------------------------------------
/DREAM8Exp.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Sep 4 11:33:32 2013
4 | 
5 | @author: xinghualu
6 | """
7 | import cPickle
8 | import numpy as np
9 | from PyGibbCAMP import PyGibbCAMP
10 | 
11 | dataDir = 'ProcessedData/'
12 | 
13 | nodeFile = dataDir + "name.matching.csv"
14 | dataMatrix = dataDir + "data.matrix.normalized.csv"
15 | missDataMatrix = dataDir + "missDataMatrix.csv"
16 | perturbMatrix = dataDir + "perturbation.table.csv"
17 | 
18 | nParents = 2
19 | for alpha in np.arange(1., .5, -0.1, dtype=float):
20 |     for i in range(5):
21 |         pickleFileName = dataMatrix + ".chain" + str(i) + ".nParents" + str(nParents) + ".alpha-" + str(alpha) + ".pickle"
22 |         model = PyGibbCAMP(nodeFile = nodeFile, dataMatrixFile = dataMatrix, perturbMatrix = perturbMatrix, missingDataMatrix= missDataMatrix)
23 |         model.trainGibbsEM(pickleDumpFile = pickleFileName, nParents = nParents, nChains = 1, alpha= alpha, maxIter = 500)
24 | 
25 | #cPickle.dump(model, open("final-model-09-06-13-alpha.05.nParent.4.pickle", 'wb'))
--------------------------------------------------------------------------------
/DREAM8-CellLine-specific.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Sep 4 11:33:32 2013
4 | 
5 | @author: xinghualu
6 | """
7 | import cPickle
8 | 
9 | from PyGibbCAMP import PyGibbCAMP
10 | import os
11 | import numpy as np
12 | 
13 | dataDir = 'ProcessedData/cellline.specific.tables/UACC812/'
14 | nodeFile = "ProcessedData/name.matching.csv"
15 | 
16 | 
17 | dataMatrix = dataDir + "data.matrix.csv"
18 | perturbMatrix = dataDir + "perturbation.table.csv"
19 | if os.path.exists(dataDir + "missDataMatrix.csv"):
20 |     missDataMatrix = dataDir + "missDataMatrix.csv"
21 | else:
22 |     missDataMatrix = None
23 | 
24 | nParents = 2
25 | for alpha in np.arange(1., .4, -0.1, dtype=float):
26 |     for i in range(5):
27 |         pickleFileName = dataMatrix + ".chain" + str(i) + ".nParents" + str(nParents) + ".alpha-" + str(alpha) + ".pickle"
28 |         net = PyGibbCAMP(nodeFile = nodeFile, dataMatrixFile = dataMatrix, perturbMatrix = perturbMatrix, missingDataMatrix= missDataMatrix)
29 |         net.trainGibbsEM(pickleDumpFile = pickleFileName, nParents = nParents, nChains = 1, alpha=alpha, maxIter = 500)
30 | 
--------------------------------------------------------------------------------
/NamedMatrix.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | A wrapper class enabling access to data matrix elements by column and row names
4 | 
5 | Created on Sun Aug 25 08:40:33 2013
6 | 
7 | 
8 | 
9 | @author: xinghualu
10 | """
11 | 
12 | import numpy as np
13 | from StringIO import StringIO
14 | 
15 | class NamedMatrix:
16 |     ## Constructor
17 |     # @param filename=None  A string path to a text matrix file
18 |     # @param delimiter=','  A string indicating the delimiter separating fields in the text file
19 |     # @param npMatrix=None  A reference to a numpy matrix
20 |     # @param colnames  A string array of column names
21 |     # @param rownames  A string array of row names
22 | 
23 |     def __init__(self, filename = None, delimiter = ',', npMatrix = None, colnames = None, rownames = None):
24 | 
25 |         if filename and npMatrix is not None:
26 |             raise Exception ("Cannot create a NamedMatrix with both 'npMatrix' and 'filename' arguments set")
27 |         if not filename and npMatrix is None:
28 |             raise Exception ("Attempt to create a NamedMatrix without 'filename' or an 'npMatrix'")
29 | 
30 |         if filename:
31 |             print "Extracting matrix file " + filename
32 |             try:
33 |                 f = open(filename, 'r')
34 |                 lines = f.readlines()
35 |             except IOError:
36 |                 print "Failed to read file " + filename
37 |                 return
38 | 
39 |             if len(lines) == 1:  # Mac version csv, with "\r" as the line break
40 |                 lines = lines[0].split("\r")
41 |                 self.colnames = lines.pop(0).rstrip().split(',')  # split header and extract colnames
42 |                 lines = map(lambda x: x.rstrip(), lines)  # remove the trailing "\r"
43 |                 lines = "\n".join(lines)  # use "\n" to join lines
44 |             else:
45 |                 self.colnames = lines.pop(0).rstrip().split(',')
46 |                 lines = "".join(lines)
47 | 
48 |             self.colnames.pop(0)
49 | 
50 |             # extract condition names
51 |             self.rownames = list()
52 |             for l in lines.split("\n"):
53 |                 self.rownames.append(l.split(',')[0])
54 | 
55 |             # read in data and generate a numpy data matrix
56 |             self.data = np.genfromtxt(StringIO(lines), delimiter = ",", usecols=tuple(range(1, len(self.colnames)+1)))
57 | 
58 |         if npMatrix is not None:
59 |             self.data = npMatrix
60 |             nrow, ncol = np.shape(self.data)
61 |             if colnames:
62 |                 if len(colnames) == ncol:
63 |                     self.colnames = colnames
64 |                 else:
65 |                     raise Exception("Dimensions of input colnames and matrix do not agree")
66 |             else:
67 |                 self.colnames = list()
68 |                 for c in range(ncol):
69 |                     self.colnames.append('c' + str(c))
70 |             if rownames:
71 |                 if len(rownames) == nrow:
72 |                     self.rownames = rownames
73 |                 else:
74 |                     raise Exception("Dimensions of input rownames and matrix do not agree")
75 |             else:
76 |                 self.rownames = list()
77 |                 for r in range(nrow):
78 |                     self.rownames.append('r' + str(r))
79 | 
80 |         self.nrows, self.ncols = np.shape(self.data)
81 | 
82 |     def setColnames(self, colnames):
83 |         if len(colnames) == len(self.colnames):
84 |             self.colnames = colnames
85 |         else:
86 |             raise Exception("New colnames vector has a different dimension than the original colnames")
87 | 
88 |     def getColnames(self):
89 |         return self.colnames
90 | 
91 |     def setRownames(self, rownames):
92 |         if len(rownames) == len(self.rownames):
93 |             self.rownames = rownames
94 |         else:
95 |             raise Exception("New rownames vector has a different dimension than the original rownames")
96 | 
97 |     def getRownames(self):
98 |         return self.rownames
99 | 
100 |     def getValuesByCol(self, colnames):
101 |         if isinstance (colnames, list):
102 |             if not set(colnames) <= set(self.colnames):
103 |                 raise Exception("Trying to access nonexistent columns")
104 |             else:
105 |                 colIndx = map(lambda x: self.colnames.index(x), colnames)
106 |                 ixgrid = np.ix_(range(self.nrows), colIndx)
107 |                 return self.data[ixgrid]
108 | 
109 |         if isinstance(colnames, basestring):
110 |             if colnames not in self.colnames:
111 |                 raise Exception ("Trying to access a nonexistent column")
112 |             else:
113 |                 return self.data[:, self.colnames.index(colnames)]
114 | 
115 | 
116 |     def setValuesByColName(self, values, col):
117 |         self.data[:,self.colnames.index(col)] = values
118 | 
119 | 
120 | 
121 |     def shape(self):
122 |         if self.data is not None:
123 |             return np.shape(self.data)
124 | 
125 |         else:
126 |             return None
127 | 
128 |     ## Return the position indices of colnames
129 |     def findColIndices(self, colnames):
130 |         if isinstance (colnames, list):
131 |             if not set(colnames) <= set(self.colnames):
132 |                 raise Exception("Trying to access nonexistent columns")
133 |             else:
134 |                 colIndx = map(lambda x: self.colnames.index(x), colnames)
135 |                 return colIndx
136 | 
137 |         if isinstance(colnames, basestring):
138 |             if colnames not in self.colnames:
139 |                 raise Exception ("Trying to access a nonexistent column")
140 |             else:
141 |                 return self.colnames.index(colnames)
142 | 
143 | 
144 |     ## Return the position indices of rownames
145 |     def findRowIndices(self, rownames):
146 |         if set(rownames) - set(self.rownames):
147 |             raise Exception("Unknown row name is used to query index")
148 | 
149 |         return [self.rownames.index(x) for x in rownames]
150 | 
151 | 
152 |     def setCellValue(self, rowname, colname, value):
153 |         value = float(value)  # force it into a float
154 |         self.data[self.rownames.index(rowname), self.colnames.index(colname)] = value
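155 | 
156 | # Minimal usage sketch (an illustrative addition, not part of the original
157 | # module): build a NamedMatrix from an in-memory numpy array and access
158 | # elements by name. All names below are hypothetical.
159 | if __name__ == '__main__':
160 |     m = NamedMatrix(npMatrix = np.array([[1., 2.], [3., 4.]]),
161 |                     colnames = ['geneA', 'geneB'],
162 |                     rownames = ['case1', 'case2'])
163 |     print m.getValuesByCol('geneA')        # -> [ 1.  3.]
164 |     m.setCellValue('case1', 'geneB', 5.)   # set a single cell by name
165 |     print m.shape()                        # -> (2, 2)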
--------------------------------------------------------------------------------
/SteinerTree.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Sep 7 12:57:57 2013
4 | 
5 | @author: xinghualu
6 | """
7 | 
8 | # This is a generalized implementation of the Kou algorithm for creating Steiner trees. It is not
9 | # tied to GOGrapher and can be used with any networkx weighted graph.
10 | 
11 | from heapq import *
12 | from networkx import *
13 | from networkx import Graph
14 | 
15 | ## Extract a Steiner tree from a weighted graph, given a list of vertices of interest
16 | # @param G A Graph with weighted edges
17 | # @param voi A list of vertices of interest
18 | # @param generator A method to make a new Graph instance (in the case that you've extended Graph)
19 | # \returns a new graph if no errors, None otherwise
20 | def make_steiner_tree(G, voi, generator=None):
21 |     mst = Graph()
22 |     for v in voi:
23 |         if v not in G:
24 |             raise ValueError, "make_steiner_tree(): Some vertices are not in the original graph"
25 |     if len(voi) == 0:
26 |         return mst
27 |     if len(voi) == 1:
28 |         mst.add_node(voi[0])
29 |         return mst
30 | 
31 |     # Initially, use (a version of) Kruskal's algorithm to extract a minimal spanning tree
32 |     # from a weighted graph. This algorithm differs in that only a subset of vertices are
33 |     # going to be present in the final subgraph (which is not truly an MST - we must use
34 |     # Prim's algorithm later).
35 | 
36 |     # extract all shortest paths among the voi
37 |     pathHeap = []
38 |     paths = {}
39 | 
40 |     # load all the paths between the Steiner vertices. Store them in a heap queue
41 |     # and reconstruct the MST of the complete graph using Kruskal's algorithm
42 |     for i in range(len(voi) - 1):
43 |         v1 = voi[i]
44 |         for v2 in voi[i+1:]:
45 |             result = bidirectional_dijkstra(G, v1, v2)
46 |             if result == False:
47 |                 raise RuntimeError, "The two vertices given (%s, %s) don't exist on the same connected graph" % (v1, v2)
48 |             #print "The two vertices given (%s, %s) don't exist on the same connected graph" % (v1, v2)
49 |             distance, vertList = result
50 |             keys = [v1, v2]
51 |             keys.sort()
52 |             key = "%s:%s" % tuple(keys)
53 |             paths[key] = (vertList)
54 |             heappush(pathHeap, (distance, v1, v2))
55 | 
56 | 
57 |     # construct the minimum spanning tree of the complete graph
58 |     while pathHeap:
59 |         w, v1, v2 = heappop(pathHeap)
60 |         # if no path exists yet between v1 and v2, add this one
61 |         if v1 not in mst or v2 not in mst or not has_path(mst, v1, v2):
62 |             mst.add_edge(v1, v2, weight=w)
63 | 
64 |     # check that the graph is a tree and correct
65 |     sTree = set(mst.nodes())
66 |     sSteiner = set(voi)
67 |     if sTree ^ sSteiner:
68 |         raise RuntimeError, 'Failed to construct MST spanning tree'
69 | 
70 |     # reconstruct a subgraph of the original graph using the paths
71 |     if generator is None:
72 |         subgraph = Graph()
73 |     else:
74 |         subgraph = generator()
75 |     for edge in mst.edges_iter(data=True):
76 |         keys = [edge[0], edge[1]]
77 |         keys.sort()
78 |         key = "%s:%s" % tuple(keys)
79 |         vList = paths[key]
80 |         for i in range(len(vList) - 1):
81 |             v1 = vList[i]
82 |             v2 = vList[i+1]
83 |             w = G[v1][v2]
84 |             subgraph.add_edge(v1, v2, w)
85 |     # get rid of possible loops - the result will be a true MST
86 |     subgraph = make_prim_mst(subgraph, generator)
87 | 
88 |     # remove intermediate nodes in paths that are not in the list of voi
89 |     return _trimTree(subgraph, voi)
90 | 
91 | 
92 | ## remove intermediate nodes in paths that are not in the list of voi in the given graph
93 | # @param graph A weighted Graph
94 | # @param voi A list of vertices of interest
95 | # \return graph An updated version of the graph
96 | def _trimTree(graph, voi):
97 |     trimKeepTrack = []
98 |     firstNode = voi[0]
99 |     if len(graph.neighbors(firstNode)) < 2:
100 |         trimKeepTrack.append(firstNode)
101 |         firstNeighbor = graph.neighbors(firstNode)[0]
102 |         trimKeepTrack.append(firstNeighbor)
103 |         graph = _trim(firstNeighbor, graph, trimKeepTrack, voi)
104 |     else:
105 |         trimKeepTrack.append(firstNode)
106 |         graph = _trim(firstNode, graph, trimKeepTrack, voi)
107 |     return graph
108 | 
109 | 
110 | def _trim(node, graph, trimKeepTrack, voi):
111 |     if len(graph.adj[node].keys()) > 1:
112 |         for nodeNeighbor in graph.adj[node].keys():
113 |             if nodeNeighbor not in trimKeepTrack:
114 |                 trimKeepTrack.append(nodeNeighbor)
115 |                 graph = _trim(nodeNeighbor, graph, trimKeepTrack, voi)
116 |     if len(graph.adj[node].keys()) < 2:
117 |         if node not in voi:
118 |             graph.remove_node(node)
119 |     return graph
120 | 
121 | 
122 | """
123 | Prim's algorithm: constructs the minimum spanning tree (MST) from an instance of a weighted Graph
124 | @param G A weighted Graph()
125 | @param generator A method to make a new Graph instance (in the case that you've extended Graph)
126 | \returns An MST version of G
127 | """
128 | ## generate the MST via Prim's algorithm
129 | # @param G A weighted Graph
130 | # @param generator Always set to None
131 | # \return mst Returns the created MST
132 | def make_prim_mst(G, generator=None):
133 |     if generator is None:
134 |         mst = Graph()
135 |     else:
136 |         mst = generator()
137 |     # priorityQ is a list of tuples (the weight in front, followed by the edge tuple)
138 |     priorityQ = []
139 |     firstNode = G.nodes()[0]
140 |     mst.add_node(firstNode)
141 |     for edge in G.edges_iter(firstNode, data=True):
142 |         if len(edge) != 3 or edge[2] is None:
143 |             raise ValueError, "make_prim_mst accepts a weighted graph only (with numerical weights)"
144 |         heappush(priorityQ, (edge[2], edge))
145 | 
146 |     while len(mst.edges()) < (G.order()-1):
147 |         w, minEdge = heappop(priorityQ)
148 |         if len(minEdge) != 3 or minEdge[2] is None:
149 |             raise ValueError, "make_prim_mst accepts a weighted graph only (with numerical weights)"
150 |         v1, v2, w = minEdge
151 |         if v1 not in mst:
152 |             for edge in G.edges_iter(v1, data=True):
153 |                 if edge == minEdge:
154 |                     continue
155 |                 heappush(priorityQ, (edge[2], edge))
156 |         elif v2 not in mst:
157 |             for edge in G.edges_iter(v2, data=True):
158 |                 if edge == minEdge:
159 |                     continue
160 |                 heappush(priorityQ, (edge[2], edge))
161 |         else:
162 |             # non-crossing edge
163 |             continue
164 |         mst.add_edge(minEdge[0], minEdge[1], minEdge[2])
165 |     return mst
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction, and
10 | distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright
13 | owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all other entities
16 | that control, are controlled by, or are under common control with that entity.
17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 
70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 
122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "[]" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright [yyyy] [name of copyright owner] 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 
183 | You may obtain a copy of the License at
184 | 
185 | http://www.apache.org/licenses/LICENSE-2.0
186 | 
187 | Unless required by applicable law or agreed to in writing, software
188 | distributed under the License is distributed on an "AS IS" BASIS,
189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190 | See the License for the specific language governing permissions and
191 | limitations under the License.
192 | 
--------------------------------------------------------------------------------
/PyGibbCAMP.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | PyGibbCAMP: Python Causal Modeling of Pathways, a Python implementation for modeling
4 | causal relationships between cellular signaling proteins, particularly phosphorylated
5 | proteins, based on reverse phase protein array (RPPA) data. The model is designed
6 | to capture signal transduction through a series of protein phosphorylation cascades,
7 | in which phosphorylation of a protein often activates the protein, which in turn
8 | leads to phosphorylation of other proteins. The model represents
9 | the phosphorylation state(s) and the activation state of a protein separately, so that it
10 | is capable of capturing the fact that, at certain times, phosphorylation of a protein
11 | can be decoupled from its activation by drugs and inhibitors.
12 | 
13 | 
14 | Created on Wed Aug 14 19:16:25 2013
15 | 
16 | @author: Xinghua Lu
17 | """
18 | 
19 | import networkx as nx
20 | import numpy as np
21 | from numpy import matlib
22 | from rpy2 import robjects
23 | import math, cPickle, re
24 | from SigNetNode import SigNetNode
25 | from StringIO import StringIO
26 | from NamedMatrix import NamedMatrix
27 | from SteinerTree import *
28 | 
29 | import rpy2.robjects.numpy2ri
30 | rpy2.robjects.numpy2ri.activate()  # enable directly passing numpy arrays or matrices as arguments to rpy2 objects
31 | R = robjects.r  # load R instance
32 | R.library("glmnet")
33 | glmnet = R('glmnet')  # make glmnet from R a callable python object
34 | 
35 | R.library("mixtools")
36 | normalmixEM = R('normalmixEM')
37 | 
38 | 
39 | 
40 | class PyGibbCAMP:
41 |     ## Constructor
42 |     # @param nodeFile A string pathname of a CSV file mapping proteins to antibodies;
43 |     #        each line contains one 'protein,antibody' pair
44 |     # @param dataMatrixFile A string pathname of the observed data matrix file
45 |     # @param perturbMatrix A string pathname of the perturbation matrix file
46 |     # @param missingDataMatrix A string pathname of the missing-data indicator matrix file
47 |     def __init__(self, nodeFile , dataMatrixFile , perturbMatrix = None, missingDataMatrix=None):
48 |         self.network = None
49 |         self.obsData = None
50 |         self.missingDataMatrix = None
51 |         perturbInstances = None
52 |         self.nChains = 1
53 | 
54 |         self.dictPerturbEffect = {'AKT1' : [('GSK690693', 0), \
55 |             ('GSK690693_GSK1120212', 0)], 'MAP2K1' : [('GSK690693_GSK1120212', 0)],\
56 |             'EGFR': [('EGF' , 1), ('FGF1', 1)]}
57 |         # self.stimuli = ['EGF', 'FGF1', 'HGF', 'IGF1', 'Insulin', 'NRG1', 'PBS', 'Serum']
58 | 
59 |         # parse the data matrix by calling the NamedMatrix class
60 |         if not dataMatrixFile:
61 |             raise Exception("Cannot create a PyGibbCAMP object without 'dataMatrixFile'")
62 |             return
63 |         self.obsData = NamedMatrix(dataMatrixFile)
64 |         nCases, nAntibodies = np.shape(self.obsData.data)
65 |         self.obsData.colnames = map(lambda s: s+'F', self.obsData.colnames)
66 |         self.obsDataFileName = dataMatrixFile
67 | 
68 |         if perturbMatrix:
69 |             self.perturbData = NamedMatrix(perturbMatrix)
70 |             perturbInstances = self.perturbData.getColnames()
71 |         self.perturbInstances = perturbInstances
72 | 
73 |         if missingDataMatrix:
74 |             self.missingDataMatrix = NamedMatrix(missingDataMatrix)
75 |             allMissing = np.sum(self.missingDataMatrix.data, 0) == nCases
76 |             if np.any(allMissing):
77 |                 raise Exception ("Data matrix contains data-less columns")
78 |             self.missingDataMatrix.colnames = map(lambda s: s+'F', self.missingDataMatrix.colnames)
79 | 
80 |         if not nodeFile:
81 |             raise Exception("Calling 'initNetwork' with an empty nodeFile name")
82 |             return
83 | 
84 |         try:
85 |             nf = open(nodeFile, "r")
86 |             nodeLines = nf.readlines()
87 |             if len(nodeLines) == 1:  # Mac files end a line with \r instead of \n
88 |                 nodeLines = nodeLines[0].split("\r")
89 |             nf.close()
90 |         except IOError:
91 |             raise Exception( "Failed to open the file containing nodes")
92 |             return
93 | 
94 |         print "Creating network"
95 |         self.network = nx.DiGraph()
96 | 
97 |         self.dictProteinToAntibody = dict()
98 |         self.dictAntibodyToProtein = dict()
99 |         # parse nodes
100 |         for line in nodeLines:
101 |             #print line
102 |             protein, antibody = line.rstrip().split(',')
103 | 
104 |             if protein not in self.dictProteinToAntibody:
105 |                 self.dictProteinToAntibody[protein] = []
106 |             self.dictProteinToAntibody[protein].append(antibody)
107 |             self.dictAntibodyToProtein[antibody] = protein
108 | 
109 |             fluo = antibody + 'F'
110 |             if protein not in self.network:
111 |                 self.network.add_node(protein, nodeObj = SigNetNode(protein, 'ACTIVATIONSTATE', False))
112 |             self.network.add_node(antibody, nodeObj= SigNetNode(antibody, 'PHOSPHORYLATIONSTATE', False))
113 |             self.network.add_node(fluo, nodeObj = SigNetNode(fluo, 'FLUORESCENCE', True))
114 |             self.network.add_edge(antibody, protein)
115 |             self.network.add_edge(antibody, fluo)
116 | 
117 |         for perturb in perturbInstances:
118 |             self.network.add_node(perturb, nodeObj = SigNetNode(perturb, 'PERTURBATION', True))
119 | 
120 |         # Add edges between the PERTURBATION, protein activity, and phosphorylation layers
121 |         for pro in self.dictProteinToAntibody:
122 |             for phos in self.dictAntibodyToProtein:
123 |                 if self.dictAntibodyToProtein[phos] == pro:
124 |                     continue
125 |                 self.network.add_edge(pro, phos)
126 |             for perturb in perturbInstances:
127 |                 self.network.add_edge(perturb, pro)
128 | 
129 | 
130 |     ## Init parameters of the model
131 |     # In a Bayesian network setting, the joint probability is calculated
132 |     # through the product of a series of conditional probabilities. The parameters
133 |     # of the PyGibbCAMP model define p(X | Pa(X)). For an observed fluorescent node,
134 |     # the conditional probability is a mixture of two Gaussian distributions;
135 |     # therefore, the parameters are two pairs of mu and sigma. For
136 |     # the hidden variables representing phosphorylation states and activation
137 |     # states of proteins, the conditional probability is defined by a logistic
138 |     # regression. Therefore, the parameters associated with such a node form a
139 |     # vector of real numbers.
140 |     #
141 |     def _initParams(self):
142 |         print "Initialize parameters associated with each node in each MCMC chain"
143 |         for nodeId in self.network:
144 |             self._initNodeParams(nodeId)
145 | 
146 |     def _initNodeParams(self, nodeId):
147 |         nodeObj = self.network.node[nodeId]['nodeObj']
148 |         if nodeObj.type == 'FLUORESCENCE':
149 |             # Estimate the mean and sd of the fluorescence signal using a mixture model
150 |             if self.missingDataMatrix and nodeId in self.missingDataMatrix.getColnames():
151 |                 nodeData = self.obsData.getValuesByCol( nodeId)
152 |                 nodeData = nodeData[self.missingDataMatrix.getValuesByCol(nodeId) == 0]
153 |             else:
154 |                 nodeData = self.obsData.getValuesByCol(nodeId)
155 |             nodeObj.mus = np.zeros((self.nChains, 2))
156 |             nodeObj.sigmas = np.zeros((self.nChains, 2))
157 |             for c in range(self.nChains):
158 |                 mixGaussians = normalmixEM(robjects.FloatVector(nodeData), k = 2 )
159 |                 # mus and sigmas are represented as nChains x 2 matrices
160 |                 nodeObj.mus[c,:] = np.array(mixGaussians[2])
161 |                 nodeObj.sigmas[c,:] = np.array(mixGaussians[3])
162 |         else:
163 |             preds = self.network.predecessors(nodeId)
164 |             if len(preds) > 0:
165 |                 nodeObj.paramNames = preds
166 |                 nodeObj.params = np.random.randn(self.nChains, len(preds) + 1)
167 |             else:
168 |                 nodeObj.params = None
169 | 
170 | 
171 |     ## Initialize latent variables
172 |     #
173 |     #
174 |     def _initHiddenStates(self):
175 |         hiddenNodes = [n for n in self.network if not self.network.node[n]['nodeObj'].bMeasured]
176 |         phosNodes = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE']
177 |         #print str(phosNodes)
178 |         nCases, nAntibody = self.obsData.shape()
179 |         caseNames = self.obsData.getRownames()
180 | 
181 |         self.nodeStates = list()
182 |         for c in range(self.nChains):
183 |             tmp = np.zeros((nCases, len(hiddenNodes)))
184 |             tmp[np.random.rand(nCases, len(hiddenNodes)) < 0.3] = 1
185 |             tmp = np.column_stack((tmp, self.perturbData.data))
186 |             colnames = hiddenNodes + self.perturbData.colnames
187 |             self.nodeStates.append(NamedMatrix(npMatrix = tmp, colnames = colnames, rownames = caseNames))
188 | 
189 |             # initialize phosphorylation states based on the observed fluorescence
190 |             for node in phosNodes:
191 |                 fluoNode = node + 'F'
192 |                 #print "phosNode:" + node + "; fluoNode: " + fluoNode
193 |                 fluoNodeObj = self.network.node[fluoNode]['nodeObj']
194 |                 fluoData = self.obsData.getValuesByCol(fluoNode)
195 |                 tmp = np.zeros(nCases)
196 |                 phosProbOne = - np.log(fluoNodeObj.sigmas[c, 1])\
197 |                     - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 1]) / np.square(fluoNodeObj.sigmas[c, 1])
198 |                 phosProbZero = - np.log(fluoNodeObj.sigmas[c, 0])\
199 |                     - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 0]) / np.square(fluoNodeObj.sigmas[c, 0])
200 |                 tmp[phosProbOne > phosProbZero] = 1
201 |                 nodeIndx = self.nodeStates[c].findColIndices(node)
202 |                 self.nodeStates[c].data[:,nodeIndx] = tmp
203 | 
204 |                 # take care of missing values by random sampling
205 |                 if self.missingDataMatrix:
206 |                     if node in self.missingDataMatrix.getColnames():
207 |                         #print "processing node with missing values: " + node
208 |                         missingCases = self.missingDataMatrix.getValuesByCol(node) == 1
209 |                         tmp = np.zeros(sum(missingCases))
210 |                         tmp[np.random.rand(len(tmp)) <= 0.3] = 1
211 |                         self.nodeStates[c].data[missingCases, nodeIndx] = tmp
212 | 
213 | 
214 | 
215 |     ## Calculate the marginal probability of observing the measured data by
216 |     # integrating out all possible settings of latent variable states and
217 |     # model parameters.
218 |     def calcEvidenceLikelihood(self):
219 |         phosNodes = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE']
220 |         loglikelihood = 0
221 |         nCases, nAntibodies = np.shape(self.obsData.data)
222 |         for nodeId in phosNodes:
223 |             nodeObj = self.network.node[nodeId]['nodeObj']
224 |             nodeIndx = self.nodeStates[0].findColIndices(nodeId)
225 |             preds = self.network.predecessors(nodeId)
226 |             for c in range(self.nChains):
227 |                 nodeData = self.nodeStates[c].data[:, nodeIndx]
228 |                 predStates = np.column_stack((np.ones(nCases), self.nodeStates[c].getValuesByCol(preds)))
229 |                 pOneCondOnParents = 1 / (1 + np.exp( - np.dot(predStates, nodeObj.params[c,:])))
230 |                 pOneCondOnParents[pOneCondOnParents == 1.] -= np.finfo(float).eps
231 | 
232 |                 loglikelihood += np.sum(nodeData * np.log(pOneCondOnParents) \
233 |                     + (1 - nodeData) * np.log(1 - pOneCondOnParents))
234 | 
235 |         loglikelihood /= self.nChains
236 |         return loglikelihood
237 | 
238 |     ## Perform graph search
239 |     def trainGibbsEM(self, nChains = 10, alpha = 0.1, nParents = 4, nSamples = 5, pickleDumpFile = None, maxIter = 1000):
240 |         self.nChains = nChains
241 |         self.alpha = alpha
242 |         self.likelihood = list()
243 |         self.nSamples = nSamples
244 |         self.nParents = nParents
245 | 
246 |         if pickleDumpFile:
247 |             self.pickleDumpFile = pickleDumpFile
248 |         else:
249 |             self.pickleDumpFile = self.obsDataFileName + "alpha" + str(self.alpha) + ".pickle"
250 | 
251 |         # check that the network and the data agree
252 |         nodeToDelete = list()
253 |         for nodeId in self.network:
254 |             if self.network.node[nodeId]['nodeObj'].type == 'FLUORESCENCE' and nodeId not in self.obsData.getColnames():
255 |                 print "Node " + nodeId + " has no associated data"
256 |                 nodeToDelete.append(nodeId)
257 |                 nodeToDelete.append(self.network.predecessors(nodeId)[0])
258 |         for nodeId in nodeToDelete:
259 |             if self.network.has_node(nodeId):
260 |                 print "removing node " + nodeId
261 |                 self.network.remove_node(nodeId)
262 | 
263 |         # To start EM, set up Markov chains to train a model purely based on prior knowledge
264 |         self._initParams()
265 |         self._initHiddenStates()
266 | 
267 |         # perform the update of latent variables in a layer-wise manner
268 |         self.likelihood = list()
269 | 
270 |         self.expectedStates = list()
271 |         nCases, nAntibodies = np.shape(self.obsData.data)
272 |         for c in range(self.nChains):
273 |             # each chain collects expected statistics of nodes from samples along the chain
274 |             self.expectedStates.append(np.zeros(np.shape(self.nodeStates[c].data)))
275 | 
276 |         print "Starting EM: alpha = " + str(self.alpha) + "; nChains = " + str(self.nChains) + "; nSamples = " + str (self.nSamples) + "; nParents = " + str(self.nParents)
277 |         optLikelihood = float("-inf")
278 |         bConverged = False
279 |         sampleCount = 0
280 | 
281 |         likelihood = self.calcEvidenceLikelihood()
282 |         print "nIter: 0" + "; log likelihood of evidence: " + str(likelihood)
283 |         self.likelihood.append(likelihood)
284 |         for nIter in range(maxIter):
285 | 
286 |             # E-step of EM
287 |             self._updateActivationStates()
288 |             if (nIter+1) % 2 == 0:  # we collect a sample every other iteration
289 |                 sampleCount += 1
290 |                 for c in range(self.nChains):
291 |                     self.expectedStates[c] += self.nodeStates[c].data
292 | 
293 |             # M-step of EM. We only update parameters after collecting a certain number of samples
294 |             if sampleCount >= self.nSamples:
295 |                 sampleCount = 0
296 |                 # take the expectation of sampled states
297 |                 self.expectedStates = map(lambda x: x / self.nSamples, self.expectedStates)
298 |                 self._updateParams(self.alpha, nparents = self.nParents)
299 | 
300 |                 likelihood = self.calcEvidenceLikelihood()
301 |                 self.likelihood.append(likelihood)
302 |                 print "nIter: " + str(nIter + 1) + "; log likelihood of evidence: " + str(likelihood)
303 | 
304 |                 # keep the current best-fit model
305 |                 if likelihood > optLikelihood:
306 |                     optLikelihood = likelihood
307 |                     try:
308 |                         cPickle.dump(self, open(self.pickleDumpFile, 'wb'))
309 |                     except:
310 |                         raise Exception("Cannot create pickle dump file " + self.pickleDumpFile)
311 | 
312 |                 bConverged = self._checkConvergence()
313 |                 if bConverged:
314 |                     print "EM converged!"
315 |                     break
316 | 
317 |                 for c in range(self.nChains):  # clear expectedStates
318 |                     self.expectedStates[c] = np.zeros(np.shape(self.nodeStates[c].data))
319 | 
320 |         # now try to delete edges that do not contribute to the evidence
321 |         #self.trimEdgeByConsensus(.9)
322 |         return self
323 | 
324 |     def _checkConvergence(self):
325 |         # declare convergence when the relative change in the likelihood is small
326 |         if len(self.likelihood) < 20:
327 |             return False
328 | 
329 |         ml = np.mean(self.likelihood[-5:-1])
330 |         ratio = abs(self.likelihood[-1] - ml ) / abs(ml)
331 |         return ratio <= 0.001
332 | 
333 |     def _updateActivationStates(self):
334 |         nCases, antibody = np.shape(self.obsData.data)
335 |         nCases, nHiddenNodes = np.shape(self.nodeStates[0].data)
336 | 
337 |         # iterate through all activation-state nodes
338 |         activationNode = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'ACTIVATIONSTATE']
339 | 
340 |         for nodeId in activationNode:
341 |             for c in range(self.nChains):
342 |                 curNodeMarginal = self.calcNodeCondProb(nodeId, c)
343 | 
344 |                 # sample the state of the current node based on the prob, and update
345 |                 sampleState = np.zeros(nCases)
346 |                 sampleState[curNodeMarginal >= np.random.rand(nCases)] = 1.
347 |                 curNodeIndx = self.nodeStates[c].findColIndices(nodeId)
348 |                 self.nodeStates[c].data[:, curNodeIndx] = sampleState
349 | 
350 |                 # clamp the activation state of perturbed nodes to a fixed value
351 |                 if nodeId in self.dictPerturbEffect:
352 |                     # the dictionary keeps a list of conditions under which the node is perturbed and the state it is clamped to
353 |                     for condition, state in self.dictPerturbEffect[nodeId]:
354 |                         perturbState = self.nodeStates[c].getValuesByCol(condition)
355 |                         indx = self.nodeStates[c].findColIndices(nodeId)
356 |                         self.nodeStates[c].data[perturbState==1, indx] = state
357 | 
358 | 
359 |     def calcNodeCondProb(self, nodeId, c):
360 |         """
361 |         Calculate the marginal probability of a node's state being "1", conditioning
362 |         on all evidence.
363 | 
364 |         args:
365 |             nodeId  A string id of the node of interest
366 |             c       An integer indicating the chain from which the parameter
367 |                     vector is to be used
368 |         """
369 |         nodeObj = self.network.node[nodeId]['nodeObj']
370 |         if nodeObj.bMeasured:
371 |             raise Exception("Called 'calcNodeCondProb' on an observed variable " + nodeId)
372 | 
373 |         nCases, nAntibody = np.shape(self.obsData.data)
374 | 
375 |         # collect the states of the predecessors of the node
376 |         preds = self.network.predecessors(nodeId)
377 |         logProbOneCondOnParents = 0
378 |         logProbZeroCondOnParents = 0
379 |         if len(preds) > 0:  # if the node has parents
380 |             # calculate p(curNode = 1 | parents)
381 |             nodeParams = nodeObj.params[c,:]
382 |             predStates = np.column_stack((np.ones(nCases), self.nodeStates[c].getValuesByCol(preds)))
383 |             pOneCondOnParents = 1 / (1 + np.exp( - np.dot(predStates, nodeParams)))
384 |             pOneCondOnParents[pOneCondOnParents == 1] -= np.finfo(float).eps
385 |             pOneCondOnParents[pOneCondOnParents == 0] += np.finfo(float).eps
386 |             logProbOneCondOnParents = np.log(pOneCondOnParents)
387 |             logProbZeroCondOnParents = np.log(1 - pOneCondOnParents)
388 | 
389 |         # collect evidence from children
390 |         logProbChildCondOne = 0   # the prob of a child conditioning on current node == 1
391 |         logProdOfChildCondZeros = 0
392 | 
393 |         children = self.network.successors(nodeId)
394 |         if len(children) > 0:
395 |             for child in children:
396 |                 childNodeObj = self.network.node[child]['nodeObj']
397 |                 curChildStates = self.nodeStates[c].getValuesByCol(child)
398 | 
399 |                 # Collect states of the predecessors of the child
400 |                 childPreds = self.network.predecessors(child)
401 |                 childNodeParams = childNodeObj.params[c,:]
402 |                 childPredStates = self.nodeStates[c].getValuesByCol(childPreds)
403 |                 childPredStates = np.column_stack((np.ones(nCases), childPredStates))  # pad data with a column of ones as bias
404 | 
405 |                 # Set the state of the current node to ones
406 |                 curNodePosInPredList = childPreds.index(nodeId) + 1  # offset by 1 because of the padding
407 |                 if childNodeParams[curNodePosInPredList] == 0:  # not a real edge
408 |                     continue
409 |                 childPredStates[:, curNodePosInPredList] = np.ones(nCases)
410 |                 pChildCondCurNodeOnes = 1 / (1 + np.exp(-np.dot(childPredStates, childNodeParams)))
411 |                 pChildCondCurNodeOnes[pChildCondCurNodeOnes==1] -= np.finfo(float).eps
412 |                 pChildCondCurNodeOnes[pChildCondCurNodeOnes==0] += np.finfo(float).eps
413 |                 logProbChildCondOne += np.log (curChildStates * pChildCondCurNodeOnes + (1 - curChildStates) * (1 - pChildCondCurNodeOnes))
414 | 
415 |                 # set the state of the current node (nodeId) to zeros
416 |                 childPredStates [:, curNodePosInPredList] = np.zeros(nCases)
417 |                 pChildCondCurNodeZeros = 1 / (1 + np.exp(- np.dot(childPredStates, childNodeParams)))
418 |                 pChildCondCurNodeZeros[pChildCondCurNodeZeros==1] -= np.finfo(float).eps
419 |                 pChildCondCurNodeZeros[pChildCondCurNodeZeros==0] += np.finfo(float).eps
420 |                 logProdOfChildCondZeros += np.log(curChildStates * pChildCondCurNodeZeros + (1 - curChildStates) * (1 - pChildCondCurNodeZeros))
421 | 
422 |         # now we can calculate the marginal probability of the current node
423 |         curNodeMarginal = 1 / (1 + np.exp(logProbZeroCondOnParents + logProdOfChildCondZeros - logProbOneCondOnParents - logProbChildCondOne))
424 |         return curNodeMarginal
425 | 
426 | 
427 |     def parseGlmnetCoef(self, glmnet_res):
428 |         """ Parse the 'beta' matrix returned by calling glmnet through RPy2.
429 |         Return the first column of the 'beta' matrix of the glmnet object
430 |         with 3 or more non-zero values
431 |         """
432 |         # read in the intercept; a vector of length nLambda
433 |         a0 = np.array(glmnet_res.rx('a0'))[0]
434 | 
435 |         # Read in lines of the beta matrix txt, which is nVariables * nLambda.
436 |         # Since we call glmnet by padding x with a column of 1s, we only work
437 |         # with the 'beta' matrix returned by the fit
438 |         betaLines = StringIO(str(glmnet_res.rx('beta'))).readlines()
439 |         dimStr = re.search("\d+\s+x\s+\d+", betaLines[1]).group(0)
440 |         if not dimStr:
441 |             raise Exception("'parseGlmnetCoef' could not determine the dims of beta")
442 |         nVariables , nLambda = map(int, dimStr.split(' x '))
443 |         betaMatrix = np.zeros( (nVariables, nLambda), dtype=float)
444 | 
445 |         # glmnet prints the beta matrix in multiple blocks of
446 |         # nVariables * blockSize
447 |         blockSize = len(betaLines[4].split()) - 1
448 |         curBlockColStart = - blockSize
449 |         for line in betaLines:  # read in blocks
450 |             m = re.search('^V\d+', line)
451 |             if not m:  # only process the lines beginning with 'V\d'
452 |                 continue
453 |             else:
454 |                 rowIndx = int(m.group(0)[1:len(m.group(0))])
455 |                 if rowIndx == 1:
456 |                     curBlockColStart += blockSize
457 | 
458 |                 # make 'rowIndx' start from 0
459 |                 rowIndx -= 1
460 | 
461 |                 fields = line.rstrip().split()
462 |                 fields.pop(0)
463 |                 if len(fields) != blockSize:
464 |                     blockSize = len(fields)
465 |                 for j in range(blockSize):
466 |                     if fields[j] == '.':
467 |                         continue
468 |                     else:
469 |                         betaMatrix[rowIndx, curBlockColStart + j] = float(fields[j])
470 | 
471 |         return a0, betaMatrix
472 | 
473 | 
474 |     def _updateParams(self, alpha = 0.1, nparents=None):
475 |         # Update the parameters associated with each node, p(n | Pa(n)), using logistic regression,
476 |         # with the expected states of predecessors as X and the current node states across samples as y
477 |         nCases, nVariables = np.shape(self.obsData.data)
478 |         if not nparents:
479 |             nparents = self.nParents
480 | 
481 |         for nodeId in self.network:
482 |             nodeObj = self.network.node[nodeId]['nodeObj']
483 |             if nodeObj.type == 'FLUORESCENCE' or nodeObj.type == 'PERTURBATION':
484 |                 continue
485 |             nodeObj.fitRes = list()
486 |             preds = self.network.predecessors(nodeId)
487 |             predIndices = self.nodeStates[0].findColIndices(preds)
488 | 
489 |             for c in range(self.nChains):
490 |                 expectedPredState = self.expectedStates[c][:, predIndices]
491 |                 #x = np.column_stack((np.ones(nCases), expectedPredState))
492 |                 x = np.column_stack((np.ones(nCases), expectedPredState))
493 |                 y = self.nodeStates[c].getValuesByCol(nodeId)
494 | 
495 |                 # check whether all x and y have the same value, which would be a problem for glmnet
496 |                 rIndx = map(lambda z: int(math.floor(z)), np.random.rand(50) * nCases)
497 |                 if sum(y) == nCases:  # if every y == 1
498 |                     y[rIndx] = 0
499 |                 elif sum( map(lambda v: 1 - v, y)) == nCases:  # if every y == 0
500 |                     y[rIndx] = 1
501 |                 y = robjects.vectors.IntVector(y)
502 | 
503 |                 allOnesCols = np.where(np.sum(x, 0) == nCases)[0]
504 |                 for col in allOnesCols:
505 |                     rIndx = map(lambda z: int(math.floor(z)), np.random.rand(3) * nCases)
506 |                     x[rIndx, col] = 0
507 |                 allZeros = np.where(np.sum(np.ones(np.shape(x)) - x, 0) == nCases)
508 |                 for col in allZeros[0]:
509 |                     rIndx = map(lambda z: int(math.floor(z)), np.random.rand(3) * nCases)
510 |                     x[rIndx, col] = 1
511 | 
512 |                 # call logistic regression using glmnet from RPy2
513 |                 fit = glmnet (x, y, alpha = alpha, family = "binomial", intercept = 0)
514 |                 nodeObj.fitRes.append(fit)
515 | 
516 |                 # extract coefficients from glmnet; keep the first set of betas with nParents non-zero values
517 |                 a0, betaMatrix = self.parseGlmnetCoef(fit)
518 |                 for j in range(np.shape(betaMatrix)[1]):
519 |                     if sum(betaMatrix[:, j] != 0.) >= nparents:
520 |                         break
521 |                 if j >= len(a0):
522 |                     j = len(a0) - 1
523 | 
524 |                 myparams = betaMatrix[:, j]
525 |                 if sum( myparams != 0.) > nparents:
526 |                     sortedParams = sorted(np.abs(myparams))
527 |                     myparams[np.abs(myparams) < sortedParams[-self.nParents]] = 0.
528 | 
529 |                 nodeObj.params[c,:] = myparams
530 | 
531 | 
532 |     def getStimuliSpecificNet(self, stimulus):
533 |         self.stimuli = ['EGF', 'FGF1', 'HGF', 'IGF1', 'Insulin', 'NRG1', 'PBS', 'Serum']
534 |         #self.stimuli = ['loLIG1', 'hiLIG1', 'loLIG2', 'hiLIG2']
535 |         # trim unused edges
536 |         if stimulus not in self.nodeStates[0].getColnames():
537 |             raise Exception("Input stimulus '" + stimulus + "' is not in the experiment data")
538 | 
539 |         #self.trimEdgeByConsensus(0.9)
540 |         stimulusCases = self.perturbData.getValuesByCol(stimulus) == 1
541 |         controlCases = np.sum(self.perturbData.getValuesByCol(self.stimuli), 1) == 0
542 | 
543 |         # identify the nodes to keep by determining whether a node responds to the stimulus
544 |         activeNodes = set()
545 |         activeNodes.add(stimulus)
546 |         for nodeId in self.network:
547 |             if self.network.node[nodeId]['nodeObj'].type == 'FLUORESCENCE' \
548 |                 or self.network.node[nodeId]['nodeObj'].type == 'fluorescence':
549 |                 nodeControlValues = self.obsData.getValuesByCol(nodeId)[controlCases]
550 |                 nodeStimulValues = self.obsData.getValuesByCol(nodeId)[stimulusCases]
551 |                 ttestRes = R('t.test')(robjects.FloatVector(nodeControlValues), robjects.FloatVector(nodeStimulValues))
552 |                 pvalue = np.array(ttestRes.rx('p.value')[0])[0]
553 |                 if pvalue < 0.05:
554 |                     activeNodes.add(self.network.predecessors(nodeId)[0])
555 | 
556 |         # copy the network to a tmp graph, redirecting edges from activation-state nodes.
557 |         # An edge indicates the impact
558 |         tmpNet = nx.DiGraph()
559 |         for u, v in self.network.edges():
560 |             # we are only interested in edges from a protein pointing to an antibody
561 |             if (self.network.node[u]['nodeObj'].type == 'ACTIVATIONSTATE'\
562 |                 or self.network.node[u]['nodeObj'].type == 'activeState')\
563 |                 and (self.network.node[v]['nodeObj'].type == 'PHOSPHORYLATIONSTATE'\
564 |                 or self.network.node[v]['nodeObj'].type == 'phosState'):
565 |                 # extract parameters associated with u and v
566 |                 vPreds = self.network.predecessors(v)
567 |                 uIndx = vPreds.index(u)
568 |                 vParams = np.sum(self.network.node[v]['nodeObj'].params, 0)
569 |                 if len(vParams) != (len(vPreds) + 1):
570 |                     raise Exception ("Bug in retrieving parameters of node v " + u)
571 |                 paramZeros = np.sum(self.network.node[v]['nodeObj'].params == 0, 0)
572 |                 if float(paramZeros[uIndx+1]) / float(self.nChains) > .9:
573 |                     continue  # don't add an edge with beta == 0
574 | 
575 |                 for ab in self.dictProteinToAntibody[u]:
576 |                     if ab not in self.network:
577 |                         continue
578 |                     # find the impact of phosphorylation on the activation state
579 |                     uPreds = self.network.predecessors(u)
580 |                     uParams = np.mean(self.network.node[u]['nodeObj'].params, 0)
581 |                     if len(uParams) != (len(uPreds) + 1):
582 |                         raise Exception ("Bug in retrieving parameters of node u " + u)
583 |                     #uAntibodyParam = uParams[uPreds.index(ab) + 1]
584 | 
585 |                     # if vParams[uIndx+1] > 0. and (vParams[uIndx+1] * uAntibodyParam) > 0:
586 |                     #     tmpNet.add_edge(ab, v, effect = "+", betaValue = vParams[uIndx+1])
587 |                     # elif (vParams[uIndx+1] * uAntibodyParam) < 0.:
588 |                     #     tmpNet.add_edge(ab, v, effect = "-", betaValue = vParams[uIndx+1])
589 |                     if vParams[uIndx+1] > 0. :
590 |                         tmpNet.add_edge(ab, v, effect = "+", betaValue = vParams[uIndx+1])
591 |                     elif vParams[uIndx+1] < 0.:
592 |                         tmpNet.add_edge(ab, v, effect = "-", betaValue = vParams[uIndx+1])
593 | 
594 |         # remove leaf nodes that are not in the activeNodes list
595 |         while True:
596 |             leafNodes = []
597 |             for nodeId in tmpNet:
598 |                 if (nodeId not in activeNodes and len(tmpNet.successors(nodeId)) == 0)\
599 |                     or (nodeId not in activeNodes and len(tmpNet.predecessors(nodeId)) == 0):
600 |                     leafNodes.append(nodeId)
601 | 
602 |             if len(leafNodes) == 0:
603 |                 break
604 | 
605 |             for leaf in leafNodes:
606 |                 tmpNet.remove_node(leaf)
607 | 
608 |         # to do: remove cycles and make tmpNet a DAG
609 |         return tmpNet
610 | 
611 | 
612 | 
613 |     def toGraphML(self, filename):
614 |         tmpNet = nx.DiGraph()
615 |         for edge in self.network.edges():
616 |             tmpNet.add_edge(edge[0], edge[1])
617 | 
618 |         nx.write_graphml(tmpNet, filename, encoding='utf-8', prettyprint=True)
619 | 
620 |     # # this function implements a K2-like greedy search (unfinished)
621 |     # def K2LikeGreedySearch (self, tmpNet):
622 |     #     for node in tmpNet:
623 |     #         ancestors = tmpNet.predecessors(node)
624 |     #         preds = []
625 |     #         while True:
626 |     #
--------------------------------------------------------------------------------
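Usage sketch (an illustrative addition, not a file in the repository): a minimal end-to-end run of PyGibbCAMP, mirroring the DREAM8 driver scripts above. The input paths are placeholders assuming the same ProcessedData layout; substitute your own files.

from PyGibbCAMP import PyGibbCAMP

nodeFile = "ProcessedData/name.matching.csv"            # 'protein,antibody' pairs
dataMatrix = "ProcessedData/data.matrix.normalized.csv" # observed RPPA data
perturbMatrix = "ProcessedData/perturbation.table.csv"  # perturbation indicators

# build the three-layer network (perturbation -> activation -> phosphorylation)
model = PyGibbCAMP(nodeFile = nodeFile, dataMatrixFile = dataMatrix,
                   perturbMatrix = perturbMatrix, missingDataMatrix = None)

# Gibbs-sampling EM: one chain, elastic-net mixing alpha = 0.5, at most 2
# parents per node; the best-fit model is pickled whenever it improves
model.trainGibbsEM(nChains = 1, alpha = 0.5, nParents = 2, maxIter = 500,
                   pickleDumpFile = dataMatrix + ".example.pickle")

# extract the EGF-specific subnetwork and export the full network to GraphML
egfNet = model.getStimuliSpecificNet('EGF')
model.toGraphML(dataMatrix + ".graphml")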