# ├── README.md
# ├── geo_db.py
# └── dcs.py
#
# /README.md:
# --------------------------------------------------------------------------------
# Implementing the Dependency-based compositional semantics (DCS) representation from Percy Liang's
# [PhD Thesis](http://cs.stanford.edu/~pliang/papers/dcs-thesis2011.pdf)
#
# --------------------------------------------------------------------------------
# /geo_db.py:
# --------------------------------------------------------------------------------

# Read in geobase (a file of Prolog-style facts):
# ftp://ftp.cs.utexas.edu/pub/mooney/nl-ilp-data/geosystem/geobase
#
# Every public zero-argument function below returns one relation as a list of
# tuples.  dcs.py star-imports this module and looks these functions up by
# name in order to ground its string predicates.

_state = []        # state abbreviations, in file order
_population = {}   # state abbreviation or city name -> population
_major = []        # major cities
_area = {}         # state abbreviation -> area
_capital = {}      # state abbreviation -> capital
_contains = {}     # city name -> abbreviation of its state
_state_abbr = {}   # state name -> state abbreviation
_city = []         # city names, in file order


def state():
    """Return the unary state relation as a list of ``(abbr,)`` tuples."""
    return [(s,) for s in _state]


def major():
    """Return the unary major-city relation as a list of ``(city,)`` tuples."""
    return [(c,) for c in _major]


def population():
    """Return ``(state abbreviation or city name, population)`` pairs."""
    # list() keeps the result indexable on Python 3, where items() is a view.
    return list(_population.items())


def area():
    """Return ``(state abbreviation, area)`` pairs."""
    return list(_area.items())


def capital():
    """Return ``(state abbreviation, capital)`` pairs."""
    return list(_capital.items())


# The accessor was originally published under this misspelled name; it is kept
# as an alias so existing callers keep working.
captial = capital


def city():
    """Return the unary city relation as a list of ``(city,)`` tuples."""
    return [(c,) for c in _city]


def contains():
    """Return ``(city, state abbreviation)`` pairs."""
    return list(_contains.items())


loc = contains


def add_state(line):
    """Record one ``state(...)`` fact.

    ``line`` is the stripped fact text: ``state(`` followed by ten
    comma-separated fields and a closing ``).``.  Raises ValueError when the
    field count is not ten or when population/area are not numeric.
    """
    fields = line.replace("'", "")[6:-2].split(",")
    # Unpack all ten fields so that a malformed fact still fails loudly.
    (name, abbr, capital_city, state_population, state_area,
     _number, _city1, _city2, _city3, _city4) = fields
    _state.append(abbr)
    _state_abbr[name] = abbr
    # Stored as numbers so they compare correctly with the integer city
    # populations recorded by add_city().  float() rather than int() because
    # the values may be written in exponent notation -- assumption, confirm
    # against the data file.
    _population[abbr] = float(state_population)
    _area[abbr] = float(state_area)
    # Was `= area`, which recorded the area as the state's capital.
    _capital[abbr] = capital_city


def add_city(line):
    """Record one ``city(...)`` fact.

    ``line`` is the stripped fact text: ``city(`` followed by the state name,
    state abbreviation, city name and population, then ``).``.  Cities with a
    population above 500000 are also recorded as major cities.
    """
    fields = line.replace("'", "")[5:-2].split(",")
    _state_name, state_abbr, city_name, city_population = fields
    _city.append(city_name)
    city_population = int(city_population)
    if city_population > 500000:
        _major.append(city_name)
    # NOTE(review): both tables are keyed by the bare city name, so a later
    # city with the same name overwrites an earlier one.
    _population[city_name] = city_population
    _contains[city_name] = state_abbr


def add_border(line):
    """Placeholder: ``border(...)`` facts are recognised but not stored yet."""
    pass


def _load_geobase(path='geobase'):
    """Read the fact file at ``path`` and fill the module-level tables.

    Only lines ending in a period are treated as facts.  The work is done in
    a function so that loop variables do not become module attributes, which
    would otherwise be star-imported into dcs.py.
    """
    with open(path, 'r') as facts:
        for raw_line in facts:
            fact = raw_line.strip()
            # Stripping first also accepts a final line without a newline.
            if not fact.endswith("."):
                continue
            if fact.startswith("state"):
                add_state(fact)
            elif fact.startswith("city"):
                add_city(fact)
            elif fact.startswith("border"):
                add_border(fact)


_load_geobase()

if __name__ == '__main__':
    print("%d %d" % (len(_city), len(_major)))
    if False:
        print(population())
        print(captial())
        print(contains())

# --------------------------------------------------------------------------------
# /dcs.py:
# --------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Dependency-based compositional semantics (DCS)
  - DCS's motivation was "to create a transparent interface between syntax and semantics."

References:
    [1] Percy Liang's 2011 PhD Thesis.

"""

import itertools
import inspect
from collections import defaultdict
# The star import is deliberate: DCSTree.ground() resolves string predicates
# by looking the relation functions up in this module's globals().
from geo_db import *

NULL = ('NULL',)


class Relation(object):
    """ A procedure that is applied to two trees """
    pass


class Join(Relation):
    """ Ensures that value of the parent's denotation at parent_index is equal to
    the value of the child's denotation at child_index """

    def __init__(self, parent_index, child_index):
        # Both indices are 1-based column numbers.
        self.parent_index = parent_index
        self.child_index = child_index

    def __repr__(self):
        return "%i/%i" % (self.parent_index, self.child_index)

    def lambda_formula(self, parent, child):
        """Return the equality constraint between two variable names."""
        return "%s = %s" % (parent, child)

    def __call__(self, parent, child):
        """ Takes a cross product of tuples (the denotations of parent and child)
        and keeps the parent tuple of every pair that satisfies the equality
        constraint.  A parent tuple matching several child tuples is returned
        once per match.

        Returns an empty list when either denotation is None.  Progress is
        traced on stdout: the call itself, the first child tuple, the first
        pair compared and the first match.
        """
        # we dont keep stores in the denotation, so we don't have to remove them.
        parent_rows = parent.denotation
        child_rows = child.denotation

        if child_rows is not None:
            print("\n\ncalled %s %s" % (parent.predicate, len(child_rows)))
        else:
            print("\n\ncalled %s Null child" % (parent.predicate,))

        if parent_rows is None or child_rows is None:
            # Nothing to join against.
            return []
        if len(child_rows) > 0:
            print("first c %s" % (child_rows[0],))

        parent_col = self.parent_index - 1
        child_col = self.child_index - 1
        results = []
        first_pair = True
        first_match = True
        for pair in itertools.product(parent_rows, child_rows):
            parent_row, child_row = pair
            if first_pair:
                print(pair)
                print("%s %s" % (parent_row[parent_col], child_row[child_col]))
                first_pair = False
            if parent_row[parent_col] != child_row[child_col]:
                continue
            if first_match:
                print("MATCH %s" % (parent_row,))
                first_match = False
            # the projection stage -- only the parent's columns are kept
            results.append(parent_row)

        return results


class Aggregate(Relation):
    """ Sets the parent to all acceptable values of the child.

    Takes a subtree and reifies its denotation so that it can be accessed by other nodes.

    The aggregation relation sets the parent node to the denotation of the child node.

    Analogous to lambda abstraction.
    """

    def __repr__(self):
        return r"\sigma"

    def __call__(self, parent, child):
        """Return a 1-tuple whose only element is the child's whole denotation."""
        # enumerate the child
        print("Called aggregate on %s %s" % (parent.predicate, child.predicate))
        return (child.denotation,)


class MarkRelation(Relation):
    """ Takes a denotation, d, a mark relation r in [C,Q,E], and a child denotation c
    and sets the store of d in column 1 to be (r, d, c).

    Mark allows a node that is lower in the tree to be invoked by a parent tree.

    2.4c -- the population node is marked, putting the child argmax in a temporary
    store, and then when city is executed, argmax is removed from the store and
    invoked.

    Denotations are augmented to include information about all marked nodes, since they
    can be accessed by an execute relation.

    NOTE(review): mark relations define no __call__, so DCSTree.ground() cannot
    apply them yet.
    """

    def __init__(self, child_denotation):
        self.child = child_denotation


class Extract(MarkRelation):
    """ Marks the node for extraction """

    def __repr__(self):
        return "E"


class Compare(MarkRelation):
    """ Marks the node for superlative, comparatives """

    def __repr__(self):
        return "C"


class Quantify(MarkRelation):
    """ Marks the node for Quantification, negation """

    def __repr__(self):
        return "Q"


class Execute(Relation):
    """ Processes a marked descendant node and applies it at the desired
    point. """

    def __init__(self, *indices):
        """ Indices is an array of nodes that specifies the order of marked
        nodes that you want to execute.
        """
        self.indices = indices

    def __repr__(self):
        return "x_i"

    def __call__(self, parent, child):
        """Not implemented yet: passes the child's denotation through unchanged."""
        print("called execute but i don't know what to do")
        return child.denotation


class DCSTree(object):
    """A DCS tree node: a predicate plus (relation, child) edges.

    ``predicate`` may be None, the name of a relation function available in
    this module's globals, or a callable.
    """

    def __init__(self, predicate=None):
        self.predicate = predicate

        self.arity = 1  # arity of the predicate
        # __code__ is available on Python 2.6+ and Python 3, unlike func_code.
        if inspect.isfunction(predicate):
            self.arity = predicate.__code__.co_argcount
        elif inspect.ismethod(predicate):
            # remove self
            self.arity = predicate.__code__.co_argcount - 1

        # edges is a list of (Relation, DCSTree) tuples
        self.edges = []
        self.denotation = [NULL]
        self.stores = None  # for marked nodes

    def add_child(self, relation, child):
        """Attach ``child`` to this node through ``relation``."""
        assert isinstance(child, DCSTree)
        assert isinstance(relation, Relation)
        self.edges.append((relation, child,))

    def get_children(self):
        """Return the child trees in edge order."""
        return [child for (_, child) in self.edges]

    def is_leaf(self):
        return len(self.edges) == 0

    def is_grounded(self):
        return self.denotation is not None

    def ground(self, world=None):
        """
        Compute, store and return this node's denotation.

        A denotation consists of n columns, where each column is either the root node
        or a non executed marked node.  Ordered by preorder traversal (self, *children)

        Denotation is a set of arrays, where each is a feasible assignment of values
        to columns.

        2.7, there are two columns: one for root state and size, marked by c:

          state, column 1 = OK
          size, column 2 = (TX, 2.7e5)

        If a node is Marked, its denotation also contains a 'store' with information
        to be retrieved when that marked node is executed

        Stores have:
          - the mark relation
          - the base denotation (the denotation of the node excluding the mark relation)[[size]]
          - the denotation of the child of the mark relation [[argmax]]

        ``world`` is only passed down to the children; it is not otherwise
        used yet.  Raises NotImplementedError if the node ends up with no
        denotation.
        """
        # ground all children
        for child in self.get_children():
            child.ground(world)

        # ground itself
        in_world = self.predicate in globals()
        print("Grounding  %s %s" % (self.predicate, in_world))
        if self.predicate is not None and in_world:
            self.denotation = globals()[self.predicate]()
        elif callable(self.predicate):
            # Each child's denotation is used as one argument tuple, and the
            # function's value is appended to it as an extra column.
            argument_rows = [c.denotation for c in self.get_children()]
            self.denotation = [entry + (self.predicate(*entry),)
                               for entry in argument_rows]
            # NOTE(review): the listing this was restored from had lost its
            # indentation; this print may originally have run for every node.
            print("DENOTATION %s" % (self.denotation,))
        # Any other predicate (None, or a name that is not a world relation)
        # keeps whatever denotation the node already has, so a denotation
        # assigned by hand survives instead of the name being called.
        if not self.is_grounded():
            raise NotImplementedError()

        for relation, child in self.edges:
            self.denotation = relation(self, child)

        return self.denotation

    def __repr__(self):
        child_string = ":".join(["%s:%s" % (r, c) for r, c in self.edges])
        if child_string:
            return "<%s;%s>" % (self.predicate, child_string)
        return "<%s>" % (self.predicate,)

    def lambda_formula(self, used_symbols=None):
        """ Turns the tree into a lambda expression and returns a list of
        its terms (strings).

        ``used_symbols`` is the set of variable names already taken; callers
        leave it as None and the recursion fills it in.
        """
        declarations = []

        if used_symbols is None:
            # first call. top level predicate is a lambda reduction, not an
            # existential quantifier
            used_symbols = set()
            dec_type = r'\lambda'
        else:
            dec_type = 'E'

        # Name the variable after the predicate's first letter.  Callables
        # contribute their function name; a missing or empty predicate
        # falls back to "x".
        label = getattr(self.predicate, "__name__", self.predicate) or "x"
        prefix = str(label)[0].lower()

        # perform the alpha-reduction, getting a unique variable name for each
        # predicate
        for offset in itertools.count(1):
            symbol = "%s%i" % (prefix, offset)
            if symbol not in used_symbols:
                break

        used_symbols.add(symbol)
        self.p = symbol
        declarations.append("%s %s " % (dec_type, symbol))

        for (relation, child) in self.edges:
            declarations += child.lambda_formula(used_symbols)
        # two iterations to keep formulas at the end
        for (relation, child) in self.edges:
            if hasattr(relation, 'lambda_formula'):
                declarations.append(relation.lambda_formula(self.p, child.p))

        return declarations

    @classmethod
    def combine(clz, left_tree, right_tree):
        """ Takes two trees, L and R, and accumulates all combinations of
        (a) L + R with L as root
        (b) L + R with R as root

        All types of relations with relevant arity are considered (e.g. join
        and execute)

        Then trace predicates are considered allowing d-1 additional predicates.

        Not implemented yet; currently returns None.
        """
        pass


def count(a):
    """Return the number of elements in ``a``."""
    return len(a)


def argmax(measure, a):
    """Return the element of ``a`` with the largest ``measure`` value."""
    return max(a, key=measure)


def argmin(measure, a):
    """Return the element of ``a`` with the smallest ``measure`` value."""
    return min(a, key=measure)


# define generalized quantifiers
# a is restrictor and b is a nuclear scope
# (a must be a set or frozenset; these previously called method names that
# sets do not have.)
def some(a, b):
    """True when ``a`` and ``b`` share at least one element."""
    return len(a.intersection(b)) > 0


def every(a, b):
    """True when every element of ``a`` is in ``b``."""
    return a.issubset(b)


def no(a, b):
    """True when ``a`` and ``b`` share no element."""
    return len(a.intersection(b)) == 0


def most(a, b):
    """True when more than half of ``a`` is in ``b``."""
    return len(a.intersection(b)) > (0.5 * len(a))


# superlative and comparative
# measure function
# NOTE(review): measure is applied to the whole collection and its result is
# then reduced with max/min; confirm that this is intended rather than
# applying measure to each element.
def more(measure, a, b):
    return max(measure(a)) > max(measure(b))


def less(measure, a, b):
    return min(measure(a)) < min(measure(b))


#------------------------------------------------------------------------------
# Join function (Figure 2.2 from [1])
#------------------------------------------------------------------------------
# major cities in ca
def test_join():
    d = DCSTree("city")
    d.add_child(Join(1, 1), DCSTree("major"))
    l = DCSTree("loc")
    d.add_child(Join(1, 1), l)
    l.add_child(Join(2, 1), DCSTree("ca"))
    assert d.ground() == [('los angeles',), ('san diego',), ('san francisco',), ('san jose',)]


#------------------------------------------------------------------------------
# Aggregation function (Page 14)
#------------------------------------------------------------------------------
# number of major cities
r = DCSTree()
ct = DCSTree(count)
agg = DCSTree()
c = DCSTree("city")
r.add_child(Join(1, 2), ct)
ct.add_child(Join(1, 1), agg)
agg.add_child(Aggregate(), c)
c.add_child(Join(1, 1), DCSTree("major"))
print(ct.ground())

if False:
    tfb = DCSTree()  # 24 b
    am = DCSTree(argmax)
    tfb.add_child(Join(1, 2), am)


# NOTE(review): the listing this was restored from had lost its indentation;
# this example is assumed to sit outside the disabled block above.
null = DCSTree()
p = DCSTree("population")
c = DCSTree("city")
null.add_child(Aggregate(), p)
p.add_child(Join(1, 1), c)
print(null.ground(1))

#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# state borders state
null = DCSTree()

# Named state1 rather than state: rebinding `state` here would replace the
# state() relation that ground() looks up by name.
state1 = DCSTree("state")
state1.denotation = [("AL",), ("AK",), ("CT",)]

border = DCSTree("border")
border.denotation = [("FL", "AL"),
                     ("GA", "AL"),
                     ("MS", "AL"),
                     ("TN", "AL"),]

state2 = DCSTree("state")
state2.denotation = [("AL",), ("AK",), ("CT",)]

state1.add_child(Join(1, 1), border)
border.add_child(Join(2, 1), state2)
state2.add_child(Execute(), DCSTree())

print(state1.ground(None))
# --------------------------------------------------------------------------------